Linear Regression#
Intuition#
Practice#
In this section, you will practice to implement linear regression models using Python on the ‘Boston Housing’ dataset. Three different methods will be used to implement the linear regression model:
Simple implementation with
numpy
.Vectorization implementation with
numpy
.Linear regression from
sklearn
framework
import os
import numpy as np
import pandas as pd
import wget
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
from bokeh.plotting import figure, show
from bokeh.io import output_notebook
from bokeh.models import ColumnDataSource, HoverTool
output_notebook()
Dataset#
Suppose we have a dataset giving the living areas and prices of 47 houses from Portland, Oregon as:
Living area (feet2) |
Price (1000$s) |
---|---|
2104 |
400 |
1600 |
330 |
2400 |
369 |
1416 |
232 |
3000 |
540 |
… |
… |
(full data is available at ex1data2.txt
)
if not os.path.exists('ex1data2.txt'):
wget.download("https://raw.githubusercontent.com/girishkuniyal/Predict-housing-prices-in-Portland/master/ex1data2.txt")
with open("./ex1data2.txt") as f:
data = f.read()
house_price = []
for line in data.split("\n"):
if line:
area, n_bedrooms, price = line.split(",")
house_price.append([int(area), int(n_bedrooms), np.round(float(price)/1000, 0).astype(int)])
df = pd.DataFrame(house_price, columns=["area", "n_bedrooms", "price"])
df.head()
area | n_bedrooms | price | |
---|---|---|---|
0 | 2104 | 3 | 400 |
1 | 1600 | 3 | 330 |
2 | 2400 | 3 | 369 |
3 | 1416 | 2 | 232 |
4 | 3000 | 4 | 540 |
Exploratory Data Analysis#
Price#
Show code cell source
def get_bokeh_box_plot(dataframe, x_col, y_col, direction="vertical"):
# let df as the subdataframe with two columns of data: x_col and y_col
if x_col is None:
x_values = [""] * len(dataframe)
y_values = dataframe[y_col]
x_col = "x_col"
df = pd.DataFrame({x_col: x_values, y_col: y_values})
else:
df = dataframe[[x_col, y_col]]
cats = list(df[x_col].unique())
# find the quartiles and IQR for each category
groups = df.groupby(x_col)
q1 = groups.quantile(q=0.25)
q2 = groups.quantile(q=0.5)
q3 = groups.quantile(q=0.75)
mean = groups.mean()
iqr = q3 - q1
upper = q3 + 1.5 * iqr
lower = q1 - 1.5 * iqr
# find the outliers for each category
def outliers(group):
cat = group.name
return group[
(group[y_col] > upper.loc[cat][y_col]) | (group[y_col] < lower.loc[cat][y_col])
][y_col]
out = groups.apply(outliers, include_groups=False) # .dropna()
# prepare outlier data for plotting, we need coordinates for every outlier.
if not out.empty:
outx = list(out.index.get_level_values(0))
outy = list(out.values)
if len(cats) == 1:
outy = outy[0]
outx = outx * len(outy)
# if no outliers, shrink lengths of stems to be no longer than the minimums or maximums
qmin = groups.quantile(q=0.00)
qmax = groups.quantile(q=1.00)
upper[y_col] = [min([x, y]) for (x, y) in zip(list(qmax.loc[:, y_col]), upper[y_col])]
lower[y_col] = [max([x, y]) for (x, y) in zip(list(qmin.loc[:, y_col]), lower[y_col])]
boxes_data = pd.concat(
[
q1.rename(columns={y_col: "q1"}),
q2.rename(columns={y_col: "q2"}),
q3.rename(columns={y_col: "q3"}),
iqr.rename(columns={y_col: "iqr"}),
upper.rename(columns={y_col: "upper"}),
lower.rename(columns={y_col: "lower"}),
mean.rename(columns={y_col: "mean"}),
],
axis=1,
)
source = ColumnDataSource(boxes_data)
cats = [str(x) for x in boxes_data.index]
if direction == "vertical":
# p = figure(background_fill_color="#efefef", x_range=cats, toolbar_location=None)
p = figure(background_fill_color="#efefef", x_range=cats,)
# stems
upper_stem_line = p.segment(x_col, "upper", x_col, "q3", source=source, line_color="black")
lower_stem_line = p.segment(x_col, "lower", x_col, "q1", source=source, line_color="black")
# boxes
top_box = p.vbar(
x_col, 0.7, "q2", "q3", source=source, fill_color="#E08E79", line_color="black"
)
bottom_box = p.vbar(
x_col, 0.7, "q1", "q2", source=source, fill_color="#3B8686", line_color="black"
)
# mean
mean_line = p.rect(
x_col,
"mean",
0.7,
0.0001,
source=source,
line_color="black",
fill_color="black",
line_dash="dashed",
)
# whiskers (almost-0 height rects simpler than segments)
upper_fence = p.rect(x_col, "lower", 0.2, 0.005, source=source, line_color="black")
lower_fence = p.rect(x_col, "upper", 0.2, 0.005, source=source, line_color="black")
# outliers
if not out.empty:
out_points = p.scatter(outx, outy, size=6, color="#F38630", fill_alpha=0.6)
p.add_tools(
HoverTool(
renderers=[out_points],
tooltips=[("x", "@x"), ("y", "@y")],
)
)
p.yaxis.axis_label = y_col
p.xgrid.grid_line_color = None
p.ygrid.grid_line_color = "white"
p.xaxis.major_label_text_font_size = "16px"
elif direction == "horizontal":
cats = sorted(cats, reverse=True)
p = figure(
background_fill_color="#efefef",
y_range=cats,
toolbar_location=None,
width=800,
height=300,
)
upper_stem_line = p.segment("upper", x_col, "q3", x_col, source=source, line_color="black")
lower_stem_line = p.segment("lower", x_col, "q1", x_col, source=source, line_color="black")
top_box = p.hbar(
x_col, 0.7, "q2", "q3", source=source, fill_color="#E08E79", line_color="black"
)
bottom_box = p.hbar(
x_col, 0.7, "q1", "q2", source=source, fill_color="#3B8686", line_color="black"
)
mean_line = p.rect(
"mean",
x_col,
0.0001,
0.7,
source=source,
line_color="black",
fill_color="black",
line_dash="dashed",
)
upper_fence = p.rect("lower", x_col, 0.005, 0.2, source=source, line_color="black")
lower_fence = p.rect("upper", x_col, 0.005, 0.2, source=source, line_color="black")
if not out.empty:
out_points = p.scatter(outy, outx, size=6, color="#F38630", fill_alpha=0.6)
p.add_tools(
HoverTool(
renderers=[out_points],
tooltips=[("x", "@x"), ("y", "@y")],
)
)
p.xaxis.axis_label = y_col
p.ygrid.grid_line_color = None
p.xgrid.grid_line_color = "white"
p.yaxis.major_label_text_font_size = "16px"
else:
raise ValueError("direction must be either 'vertical' or 'horizontal'")
p.grid.grid_line_width = 2
box_hover = HoverTool(
renderers=[
top_box,
bottom_box,
mean_line,
upper_stem_line,
lower_stem_line,
upper_fence,
lower_fence,
],
tooltips=[
("q1", "@q1"),
("q2", "@q2"),
("q3", "@q3"),
("iqr", "@iqr"),
("upper", "@upper"),
("lower", "@lower"),
("mean", "@mean"),
],
)
p.add_tools(box_hover)
return p
# # Plot with plotly
# fig = go.Figure()
# fig.add_trace(go.Box(x=df["price"], name="House Price", marker_color='#3D9970', boxpoints='all', jitter=0.3, pointpos=-1.8, boxmean=True))
# fig.update_layout(title_text='House Price Distribution')
# fig.show()
# Plot with Bokeh
show(get_bokeh_box_plot(df, None, "price", direction="horizontal"))
From the visualization, we observe that the boxplot of price is left-skewed. This means that the median (300) is less than the mean (340). This is because there are a few houses with very high prices, around 250 to 400.
In addition, it is noticable that there is one outlier with a price of 700. This is a very high price compared to the rest of the prices.
Area#
Show code cell source
# fig = make_subplots(rows=1, cols=2)
# fig = px.scatter(df, x="area", y="price", title="House price in Portland, Oregon with respect to area")
# fig.update_traces(marker=dict(color="blue", symbol="x"))
# fig.update_xaxes(title_text="Area (sq ft)")
# fig.update_yaxes(title_text="Price ($1000)")
# fig.show()
# Plot the area vs price
p = figure(title="House price in Portland, Oregon with respect to area", x_axis_label="Area (sq ft)", y_axis_label="Price ($1000)", sizing_mode="stretch_width")
p.scatter(df["area"], df["price"], size=10, color="blue", alpha=0.5)
p.add_tools(HoverTool(tooltips=[("Area", "@x"), ("Price", "@y")]))
show(p)
There is a positive correlation between the living area and the price. This means that the price increases as the living area increases.
Number of Bedrooms#
Show code cell source
# # Plot with plotly
# fig = px.scatter(df, x="n_bedrooms", y="price", title="House price in Portland, Oregon with respect to number of bedrooms")
# fig.update_traces(marker=dict(color="red", symbol="x"))
# fig.update_xaxes(title_text="Number of bedrooms")
# fig.update_yaxes(title_text="Price ($1000)")
# fig.show()
# Plot with Bokeh
p = figure(title="House price in Portland, Oregon with respect to number of bedrooms", x_axis_label="Number of bedrooms", y_axis_label="Price ($1000)", sizing_mode="stretch_width")
p.scatter(df["n_bedrooms"], df["price"], size=10, color="red", alpha=0.5)
p.add_tools(HoverTool(tooltips=[("#Bedrooms", "@x"), ("Price", "@y")]))
show(p)
However, there is a weak correlation between the number of bedrooms and the price. Most of the houses have 3 bedrooms, and the price is not significantly different from the houses with 4 bedrooms.
Data Preprocessing#
In this step, we normalize the features using the mean and standard deviation.
where:
\(x_i\) is the feature
\(\mu_i\) is the mean of the feature
\(\sigma_i\) is the standard deviation of the feature
By normalizing the features, the gradient descent will converge more quickly and avoid overflow during computation.
# scale the features
mean = df.mean()
std = df.std()
normalized_df = (df - mean) / std
normalized_df.head()
area | n_bedrooms | price | |
---|---|---|---|
0 | 0.130010 | -0.223675 | 0.476337 |
1 | -0.504190 | -0.223675 | -0.083559 |
2 | 0.502476 | -0.223675 | 0.228383 |
3 | -0.735723 | -1.537767 | -0.867413 |
4 | 1.257476 | 1.090417 | 1.596128 |
Linear regression#
Closed-form solution#
def closed_form_linear_regression(X, y):
try:
return np.linalg.inv(X.T @ X) @ X.T @ y
except np.linalg.LinAlgError:
return np.linalg.pinv(X.T @ X) @ X.T @ y
X = normalized_df["area"].values.reshape(-1, 1)
X = np.c_[np.ones((len(X), 1)), X]
y = normalized_df["price"].values.reshape(-1, 1)
theta = closed_form_linear_regression(X, y)
b, w = theta[0], theta[1]
print(f'The formula for the regression line is y = {b[0]:.2f} + {w[0]:.2f}x')
The formula for the regression line is y = 0.00 + 0.86x
Show code cell source
# # plot with plotly
# fig = px.scatter(
# df, x="area", y="price", title="House price in Portland, Oregon with respect to area",
# labels={"area": "Area (sq ft)", "price": "Price ($1000)"},
# )
# fig.update_traces(marker=dict(color="blue", symbol="x"))
# fig.update_xaxes(title_text="Area (sq ft)")
# fig.update_yaxes(title_text="Price ($1000)")
# fig.add_scatter(
# x=df["area"],
# y=(w[0] * normalized_df["area"] + b) * std["price"] + mean["price"],
# mode="lines",
# line=dict(color="red"),
# )
# fig.update_layout(showlegend=False)
# fig.show()
Show code cell source
p = figure(title="House price in Portland, Oregon with respect to area", x_axis_label="Area (sq ft)", y_axis_label="Price ($1000)", sizing_mode="stretch_width")
point_plot = p.scatter(df["area"], df["price"], size=10, color="blue", alpha=0.5)
line_plot = p.line(df["area"], (w[0] * normalized_df["area"] + b) * std["price"] + mean["price"], line_width=2, color="red")
p.add_tools(HoverTool(renderers=[point_plot], tooltips=[("Area", "@x"), ("Price", "@y")]))
p.add_tools(HoverTool(renderers=[line_plot], tooltips=[("", f"y = {b[0] * std['price'] + mean['price']:.2f} + {w[0] * std['price']:.2f}x")]))
show(p)
Single variable linear regression with gradient descent#
feature_name = "n_bedrooms"
X = normalized_df[[feature_name]].values
y = normalized_df['price'].values.reshape(-1, 1)
num_epochs = 1000
alpha = 0.01
w, b = np.random.rand(2)
init_w, init_b = w, b
losses = []
for i in range(num_epochs):
y_pred = w * X + b
loss = np.mean(0.5 * (y_pred - y) ** 2)
losses.append(loss)
w -= alpha * np.mean((y_pred - y) * X)
b -= alpha * np.mean(y_pred - y)
if i % 100 == 0:
print(f'Epoch {i}, loss: {loss:.2f}')
print(f'The formula for the regression line is y = {b:.2f} + {w:.2f}x')
Epoch 0, loss: 0.56
Epoch 100, loss: 0.42
Epoch 200, loss: 0.40
Epoch 300, loss: 0.39
Epoch 400, loss: 0.39
Epoch 500, loss: 0.39
Epoch 600, loss: 0.39
Epoch 700, loss: 0.39
Epoch 800, loss: 0.39
Epoch 900, loss: 0.39
The formula for the regression line is y = 0.00 + 0.44x
Show code cell source
p = figure(title=f"House price in Portland, Oregon with respect to {feature_name}", x_axis_label=f"{feature_name}", y_axis_label="Price ($1000)", sizing_mode="stretch_width")
point_plot = p.scatter(df[feature_name], df["price"], size=10, color="red", alpha=0.5)
line_plot = p.line(df[feature_name], (w * normalized_df[feature_name] + b) * std["price"] + mean["price"], line_width=2, color="blue")
p.add_tools(HoverTool(renderers=[point_plot], tooltips=[(feature_name, "@x"), ("Price", "@y")]))
p.add_tools(HoverTool(renderers=[line_plot], tooltips=[("", f"y = {b * std['price'] + mean['price']:.2f} + {w * std['price']:.2f}x")]))
show(p)
Multivariate linear regression with gradient descent and vectorization#
X = normalized_df[['area', 'n_bedrooms']].values
X = np.c_[np.ones((len(X), 1)), X]
y = normalized_df['price'].values.reshape(-1, 1)
m = len(y)
num_features = X.shape[1]
w = np.random.rand(num_features, 1)
num_epochs = 600
alpha = 0.01
losses = []
for i in range(num_epochs):
y_pred = X @ w
cur_loss = 1 / (2 * m) * (y_pred - y).T @ (y_pred - y)
cur_loss = cur_loss[0][0]
losses.append(cur_loss)
grad = 1 / m * X.T @ (y_pred - y)
w -= alpha * grad
if i % 100 == 0:
print(f'Epoch {i+1:03d}, loss = {cur_loss:0.2f}')
Epoch 001, loss = 0.66
Epoch 101, loss = 0.20
Epoch 201, loss = 0.14
Epoch 301, loss = 0.13
Epoch 401, loss = 0.13
Epoch 501, loss = 0.13
Linear regression with scikit-learn#
from sklearn import linear_model
First, we train the linear regression model using the LinearRegression
class from the sklearn
framework. Then, we evaluate the model using the mean squared error (MSE) and the coefficient of determination (\(R^2\)).
# Train the linear regression model
linear_regressor = linear_model.LinearRegression()
X = normalized_df[["area"]]
y = normalized_df["price"]
linear_regressor = linear_regressor.fit(X, y)
# Evaluate the model
y_pred = linear_regressor.predict(X)
mse = np.mean((y_pred - y) ** 2)
r2 = linear_regressor.score(X, y)
print(f'Mean Squared Error: {mse:.2f}')
print(f'R^2: {r2:.2f}')
Mean Squared Error: 0.26
R^2: 0.73
Show code cell source
# # Draw the linear regression line with plotly
# fig = px.scatter(df, x="area", y="price", title="House price in Portland, Oregon with respect to area")
# fig.update_traces(marker=dict(color="blue", symbol="x"))
# fig.update_xaxes(title_text="Area (sq ft)")
# fig.update_yaxes(title_text="Price ($1000)")
# fig.add_scatter(
# x=df["area"],
# y=linear_regressor.predict(X) * std["price"] + mean["price"],
# mode="lines",
# line=dict(color="red"),
# )
# fig.update_layout(showlegend=False)
# fig.show()
Show code cell source
p = figure(title="House price in Portland, Oregon with respect to area", x_axis_label="Area (sq ft)", y_axis_label="Price ($1000)", sizing_mode="stretch_width")
point_plot = p.scatter(df["area"], df["price"], size=10, color="blue", alpha=0.5)
line_plot = p.line(df["area"], linear_regressor.predict(X) * std["price"] + mean["price"], line_width=2, color="red")
p.add_tools(HoverTool(renderers=[point_plot], tooltips=[("Area", "@x"), ("Price", "@y")]))
p.add_tools(HoverTool(renderers=[line_plot], tooltips=[("", f"y = {linear_regressor.intercept_ * std['price'] + mean['price']:.2f} + {linear_regressor.coef_[0] * std['price']:.2f}x")]))
show(p)
Now, we train a linear regression model to inspect the relationship between the number of bedrooms (n_bedrooms
) and the price. Similarly, we also evaluate the model using the mean squared error (MSE) and the coefficient of determination (\(R^2\)).
linear_regressor = linear_model.LinearRegression()
X = normalized_df[["n_bedrooms"]]
y = normalized_df["price"]
linear_regressor = linear_regressor.fit(X, y)
# Evaluate the model
y_pred = linear_regressor.predict(X)
mse = np.mean((y_pred - y) ** 2)
r2 = linear_regressor.score(X, y)
print(f'Mean Squared Error: {mse:.2f}')
print(f'R^2: {r2:.2f}')
Mean Squared Error: 0.79
R^2: 0.20
Show code cell source
p = figure(title="House price in Portland, Oregon with respect to number of bedrooms", x_axis_label="Number of bedrooms", y_axis_label="Price ($1000)", sizing_mode="stretch_width")
point_plot = p.scatter(df["n_bedrooms"], df["price"], size=10, color="red", alpha=0.5)
line_plot = p.line(df["n_bedrooms"], linear_regressor.predict(X) * std["price"] + mean["price"], line_width=2, color="blue")
p.add_tools(HoverTool(renderers=[point_plot], tooltips=[("#Bedrooms", "@x"), ("Price", "@y")]))
p.add_tools(HoverTool(renderers=[line_plot], tooltips=[("", f"y = {linear_regressor.intercept_ * std['price'] + mean['price']:.2f} + {linear_regressor.coef_[0] * std['price']:.2f}x")]))
show(p)