''' lecture16_linear_regression.py
Demo of using SciPy (scipy.stats.linregress) to compute a linear regression,
R^2, and the p-value of the slope test vs. slope = 0
Using diamond dataset
Oliver W. Layton
CS251: Data Analysis and Visualization
Spring 2019
'''
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns

sns.set_style("darkgrid")

# Load the diamonds dataset through seaborn's dataset loader
data = sns.load_dataset('diamonds')

# Randomly sample 500 points (no seed is set, so each run draws a new sample)
data = data.sample(500)

# Do the linear regression using
#   x: 'carat'
#   y: 'price'
x = data['carat']
y = data['price']

# Plot data sample
plt.scatter(x, y)

# Fit the least-squares line; linregress returns slope, intercept,
# correlation coefficient, p-value of the slope test, and standard error
m, b, r, p, stderr = stats.linregress(x, y)

# Superimpose linear regression line (in red)
plt.plot(x, m*x + b, 'r')

# Plot styling stuff
# The '+' sign flag prints the intercept with its own sign, so a negative
# intercept reads 'x-2256.4' rather than 'x+-2256.4'
plt.title(f'y = {m:.1f}x{b:+.1f}')
plt.xlabel('Diamond carat')
plt.ylabel('Price ($)')

# Annotate with R^2 and the p-value; the (x, y) positions are given in the
# plot's data coordinates, i.e. (carat, price)
plt.text(1, 15, f'$R^2$ = {r**2:.2f}', fontsize=15)
plt.text(2, 15, f'$p$ = {p:.2}', fontsize=15)

# Show us the plot in a pop-up window
plt.show()