Linear Regression#

Intuition#

Practice#

In this section, you will practice implementing linear regression models in Python on the Portland housing dataset. Three different methods will be used to implement the linear regression model:

  • Simple implementation with numpy.

  • Vectorized implementation with numpy.

  • Linear regression from the sklearn framework.

import os
import numpy as np
import pandas as pd
import wget

import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go

from bokeh.plotting import figure, show
from bokeh.io import output_notebook
from bokeh.models import ColumnDataSource, HoverTool

output_notebook()

Dataset#

Suppose we have a dataset giving the living areas and prices of 47 houses from Portland, Oregon as:

| Living area (feet²) | Price ($1000s) |
| --- | --- |
| 2104 | 400 |
| 1600 | 330 |
| 2400 | 369 |
| 1416 | 232 |
| 3000 | 540 |

(full data is available at ex1data2.txt)

if not os.path.exists('ex1data2.txt'):
    wget.download("https://raw.githubusercontent.com/girishkuniyal/Predict-housing-prices-in-Portland/master/ex1data2.txt")
with open("./ex1data2.txt") as f:
    data = f.read()

house_price = []
for line in data.split("\n"):
    if line:
        area, n_bedrooms, price = line.split(",")
        house_price.append([int(area), int(n_bedrooms), np.round(float(price)/1000, 0).astype(int)])

df = pd.DataFrame(house_price, columns=["area", "n_bedrooms", "price"])
df.head()
area n_bedrooms price
0 2104 3 400
1 1600 3 330
2 2400 3 369
3 1416 2 232
4 3000 4 540

Exploratory Data Analysis#

Price#

def get_bokeh_box_plot(dataframe, x_col, y_col, direction="vertical"):
    # let df be the sub-dataframe containing two columns of data: x_col and y_col
    if x_col is None:
        x_values = [""] * len(dataframe)
        y_values = dataframe[y_col]
        x_col = "x_col"
        df = pd.DataFrame({x_col: x_values, y_col: y_values})
    else:
        df = dataframe[[x_col, y_col]]
    cats = list(df[x_col].unique())

    # find the quartiles and IQR for each category
    groups = df.groupby(x_col)
    q1 = groups.quantile(q=0.25)
    q2 = groups.quantile(q=0.5)
    q3 = groups.quantile(q=0.75)
    mean = groups.mean()
    iqr = q3 - q1
    upper = q3 + 1.5 * iqr
    lower = q1 - 1.5 * iqr

    # find the outliers for each category
    def outliers(group):
        cat = group.name
        return group[
            (group[y_col] > upper.loc[cat][y_col]) | (group[y_col] < lower.loc[cat][y_col])
        ][y_col]

    out = groups.apply(outliers, include_groups=False)  # .dropna()

    # prepare outlier data for plotting, we need coordinates for every outlier.
    if not out.empty:
        outx = list(out.index.get_level_values(0))
        outy = list(out.values)
        if len(cats) == 1:
            outy = outy[0]
            outx = outx * len(outy)

    # if no outliers, shrink lengths of stems to be no longer than the minimums or maximums
    qmin = groups.quantile(q=0.00)
    qmax = groups.quantile(q=1.00)
    upper[y_col] = [min([x, y]) for (x, y) in zip(list(qmax.loc[:, y_col]), upper[y_col])]
    lower[y_col] = [max([x, y]) for (x, y) in zip(list(qmin.loc[:, y_col]), lower[y_col])]

    boxes_data = pd.concat(
        [
            q1.rename(columns={y_col: "q1"}),
            q2.rename(columns={y_col: "q2"}),
            q3.rename(columns={y_col: "q3"}),
            iqr.rename(columns={y_col: "iqr"}),
            upper.rename(columns={y_col: "upper"}),
            lower.rename(columns={y_col: "lower"}),
            mean.rename(columns={y_col: "mean"}),
        ],
        axis=1,
    )
    source = ColumnDataSource(boxes_data)
    cats = [str(x) for x in boxes_data.index]

    if direction == "vertical":
        # p = figure(background_fill_color="#efefef", x_range=cats, toolbar_location=None)
        p = figure(background_fill_color="#efefef", x_range=cats,)

        # stems
        upper_stem_line = p.segment(x_col, "upper", x_col, "q3", source=source, line_color="black")
        lower_stem_line = p.segment(x_col, "lower", x_col, "q1", source=source, line_color="black")

        # boxes
        top_box = p.vbar(
            x_col, 0.7, "q2", "q3", source=source, fill_color="#E08E79", line_color="black"
        )
        bottom_box = p.vbar(
            x_col, 0.7, "q1", "q2", source=source, fill_color="#3B8686", line_color="black"
        )

        # mean
        mean_line = p.rect(
            x_col,
            "mean",
            0.7,
            0.0001,
            source=source,
            line_color="black",
            fill_color="black",
            line_dash="dashed",
        )

        # whiskers (almost-0 height rects simpler than segments)
        lower_fence = p.rect(x_col, "lower", 0.2, 0.005, source=source, line_color="black")
        upper_fence = p.rect(x_col, "upper", 0.2, 0.005, source=source, line_color="black")

        # outliers
        if not out.empty:
            out_points = p.scatter(outx, outy, size=6, color="#F38630", fill_alpha=0.6)
            p.add_tools(
                HoverTool(
                    renderers=[out_points],
                    tooltips=[("x", "@x"), ("y", "@y")],
                )
            )

        p.yaxis.axis_label = y_col
        p.xgrid.grid_line_color = None
        p.ygrid.grid_line_color = "white"
        p.xaxis.major_label_text_font_size = "16px"

    elif direction == "horizontal":
        cats = sorted(cats, reverse=True)
        p = figure(
            background_fill_color="#efefef",
            y_range=cats,
            toolbar_location=None,
            width=800,
            height=300,
        )

        upper_stem_line = p.segment("upper", x_col, "q3", x_col, source=source, line_color="black")
        lower_stem_line = p.segment("lower", x_col, "q1", x_col, source=source, line_color="black")

        top_box = p.hbar(
            x_col, 0.7, "q2", "q3", source=source, fill_color="#E08E79", line_color="black"
        )
        bottom_box = p.hbar(
            x_col, 0.7, "q1", "q2", source=source, fill_color="#3B8686", line_color="black"
        )

        mean_line = p.rect(
            "mean",
            x_col,
            0.0001,
            0.7,
            source=source,
            line_color="black",
            fill_color="black",
            line_dash="dashed",
        )

        lower_fence = p.rect("lower", x_col, 0.005, 0.2, source=source, line_color="black")
        upper_fence = p.rect("upper", x_col, 0.005, 0.2, source=source, line_color="black")

        if not out.empty:
            out_points = p.scatter(outy, outx, size=6, color="#F38630", fill_alpha=0.6)
            p.add_tools(
                HoverTool(
                    renderers=[out_points],
                    tooltips=[("x", "@x"), ("y", "@y")],
                )
            )
        p.xaxis.axis_label = y_col
        p.ygrid.grid_line_color = None
        p.xgrid.grid_line_color = "white"
        p.yaxis.major_label_text_font_size = "16px"
    else:
        raise ValueError("direction must be either 'vertical' or 'horizontal'")

    p.grid.grid_line_width = 2
    box_hover = HoverTool(
        renderers=[
            top_box,
            bottom_box,
            mean_line,
            upper_stem_line,
            lower_stem_line,
            upper_fence,
            lower_fence,
        ],
        tooltips=[
            ("q1", "@q1"),
            ("q2", "@q2"),
            ("q3", "@q3"),
            ("iqr", "@iqr"),
            ("upper", "@upper"),
            ("lower", "@lower"),
            ("mean", "@mean"),
        ],
    )
    p.add_tools(box_hover)

    return p
# # Plot with plotly
# fig = go.Figure()
# fig.add_trace(go.Box(x=df["price"], name="House Price", marker_color='#3D9970', boxpoints='all', jitter=0.3, pointpos=-1.8, boxmean=True))
# fig.update_layout(title_text='House Price Distribution')
# fig.show()

# Plot with Bokeh
show(get_bokeh_box_plot(df, None, "price", direction="horizontal"))

From the visualization, we observe that the price distribution is right-skewed: the median (about 300) is less than the mean (about 340), because a few houses are priced well above the bulk of the data, most of which falls roughly between 250 and 400.

In addition, there is one noticeable outlier with a price of about 700, far above the rest of the prices.
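
These summary statistics can be verified directly from the dataframe (a quick sketch; the printed values depend on the loaded data):

# quick numerical check of the price distribution (values are in $1000s)
print(f"mean:   {df['price'].mean():.1f}")
print(f"median: {df['price'].median():.1f}")
print(f"skew:   {df['price'].skew():.2f}")  # a positive value indicates a right-skewed distribution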

Area#

# fig = make_subplots(rows=1, cols=2)
# fig = px.scatter(df, x="area", y="price", title="House price in Portland, Oregon with respect to area")
# fig.update_traces(marker=dict(color="blue", symbol="x"))
# fig.update_xaxes(title_text="Area (sq ft)")
# fig.update_yaxes(title_text="Price ($1000)")
# fig.show()
# Plot the area vs price
p = figure(title="House price in Portland, Oregon with respect to area", x_axis_label="Area (sq ft)", y_axis_label="Price ($1000)", sizing_mode="stretch_width")
p.scatter(df["area"], df["price"], size=10, color="blue", alpha=0.5)
p.add_tools(HoverTool(tooltips=[("Area", "@x"), ("Price", "@y")]))
show(p)

There is a positive correlation between the living area and the price: the price tends to increase as the living area increases.

Number of Bedrooms#

# # Plot with plotly
# fig = px.scatter(df, x="n_bedrooms", y="price", title="House price in Portland, Oregon with respect to number of bedrooms")
# fig.update_traces(marker=dict(color="red", symbol="x"))
# fig.update_xaxes(title_text="Number of bedrooms")
# fig.update_yaxes(title_text="Price ($1000)")
# fig.show()

# Plot with Bokeh
p = figure(title="House price in Portland, Oregon with respect to number of bedrooms", x_axis_label="Number of bedrooms", y_axis_label="Price ($1000)", sizing_mode="stretch_width")
p.scatter(df["n_bedrooms"], df["price"], size=10, color="red", alpha=0.5)
p.add_tools(HoverTool(tooltips=[("#Bedrooms", "@x"), ("Price", "@y")]))
show(p)

However, there is only a weak correlation between the number of bedrooms and the price. Most of the houses have 3 bedrooms, and their prices are not significantly different from those of houses with 4 bedrooms.
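
The strength of these relationships can be quantified with Pearson correlation coefficients (a quick sketch; the exact values depend on the data):

# Pearson correlation of each feature with the price
print(df[["area", "n_bedrooms", "price"]].corr()["price"])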

Data Preprocessing#

In this step, we normalize the features using the mean and standard deviation.

\[\begin{align*} x_i &= \frac{x_i - \mu_i}{\sigma_i} \end{align*}\]

where:

  • \(x_i\) is the value of feature \(i\)

  • \(\mu_i\) is the mean of feature \(i\)

  • \(\sigma_i\) is the standard deviation of feature \(i\)

Normalizing the features helps gradient descent converge more quickly and avoids numerical overflow during computation.

# scale the features
mean = df.mean()
std = df.std()

normalized_df = (df - mean) / std
normalized_df.head()
area n_bedrooms price
0 0.130010 -0.223675 0.476337
1 -0.504190 -0.223675 -0.083559
2 0.502476 -0.223675 0.228383
3 -0.735723 -1.537767 -0.867413
4 1.257476 1.090417 1.596128

Linear regression#

Closed-form solution#
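
The closed-form solution computes the parameters directly from the normal equation, with the bias absorbed into \(X\) as a leading column of ones; if \(X^T X\) is singular, the implementation below falls back to the Moore-Penrose pseudoinverse:

\[\begin{align*} \theta &= (X^T X)^{-1} X^T y \end{align*}\]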

def closed_form_linear_regression(X, y):
    try:
        return np.linalg.inv(X.T @ X) @ X.T @ y
    except np.linalg.LinAlgError:
        return np.linalg.pinv(X.T @ X) @ X.T @ y

X = normalized_df["area"].values.reshape(-1, 1)
X = np.c_[np.ones((len(X), 1)), X]
y = normalized_df["price"].values.reshape(-1, 1)

theta = closed_form_linear_regression(X, y)
b, w = theta[0], theta[1]
print(f'The formula for the regression line is y = {b[0]:.2f} + {w[0]:.2f}x')
The formula for the regression line is y = 0.00 + 0.86x
# # plot with plotly
# fig = px.scatter(
#     df, x="area", y="price", title="House price in Portland, Oregon with respect to area",
#     labels={"area": "Area (sq ft)", "price": "Price ($1000)"},
# )
# fig.update_traces(marker=dict(color="blue", symbol="x"))
# fig.update_xaxes(title_text="Area (sq ft)")
# fig.update_yaxes(title_text="Price ($1000)")
# fig.add_scatter(
#     x=df["area"],
#     y=(w[0] * normalized_df["area"] + b) * std["price"] + mean["price"],
#     mode="lines", 
#     line=dict(color="red"),
# )
# fig.update_layout(showlegend=False)
# fig.show()
p = figure(title="House price in Portland, Oregon with respect to area", x_axis_label="Area (sq ft)", y_axis_label="Price ($1000)", sizing_mode="stretch_width")

point_plot = p.scatter(df["area"], df["price"], size=10, color="blue", alpha=0.5)
line_plot = p.line(df["area"], (w[0] * normalized_df["area"] + b) * std["price"] + mean["price"], line_width=2, color="red")
p.add_tools(HoverTool(renderers=[point_plot], tooltips=[("Area", "@x"), ("Price", "@y")]))
p.add_tools(HoverTool(renderers=[line_plot], tooltips=[("", f"y = {b[0] * std['price'] + mean['price']:.2f} + {w[0] * std['price']:.2f}x")]))
show(p)

Single variable linear regression with gradient descent#
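
The cell below fits \(y \approx wx + b\) by batch gradient descent on the mean squared error loss. Each epoch evaluates the loss and applies the updates

\[\begin{align*} J(w, b) &= \frac{1}{2m} \sum_{i=1}^{m} \left(w x^{(i)} + b - y^{(i)}\right)^2 \\ w &\leftarrow w - \alpha \cdot \frac{1}{m} \sum_{i=1}^{m} \left(w x^{(i)} + b - y^{(i)}\right) x^{(i)} \\ b &\leftarrow b - \alpha \cdot \frac{1}{m} \sum_{i=1}^{m} \left(w x^{(i)} + b - y^{(i)}\right) \end{align*}\]

where \(\alpha\) is the learning rate and \(m\) is the number of training examples.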

feature_name = "n_bedrooms"
X = normalized_df[[feature_name]].values
y = normalized_df['price'].values.reshape(-1, 1)

num_epochs = 1000
alpha = 0.01

w, b = np.random.rand(2)
init_w, init_b = w, b
losses = []

for i in range(num_epochs):
    y_pred = w * X + b
    loss = np.mean(0.5 * (y_pred - y) ** 2)
    losses.append(loss)
    
    w -= alpha * np.mean((y_pred - y) * X)
    b -= alpha * np.mean(y_pred - y)
    
    if i % 100 == 0:
        print(f'Epoch {i}, loss: {loss:.2f}')
        
print(f'The formula for the regression line is y = {b:.2f} + {w:.2f}x')
Epoch 0, loss: 0.56
Epoch 100, loss: 0.42
Epoch 200, loss: 0.40
Epoch 300, loss: 0.39
Epoch 400, loss: 0.39
Epoch 500, loss: 0.39
Epoch 600, loss: 0.39
Epoch 700, loss: 0.39
Epoch 800, loss: 0.39
Epoch 900, loss: 0.39
The formula for the regression line is y = 0.00 + 0.44x
p = figure(title=f"House price in Portland, Oregon with respect to {feature_name}", x_axis_label=f"{feature_name}", y_axis_label="Price ($1000)", sizing_mode="stretch_width")

point_plot = p.scatter(df[feature_name], df["price"], size=10, color="red", alpha=0.5)
line_plot = p.line(df[feature_name], (w * normalized_df[feature_name] + b) * std["price"] + mean["price"], line_width=2, color="blue")
p.add_tools(HoverTool(renderers=[point_plot], tooltips=[(feature_name, "@x"), ("Price", "@y")]))
p.add_tools(HoverTool(renderers=[line_plot], tooltips=[("", f"y = {b * std['price'] + mean['price']:.2f} + {w * std['price']:.2f}x")]))
show(p)

Multivariate linear regression with gradient descent and vectorization#
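
With the bias folded into \(X\) as a leading column of ones and all parameters collected in a vector \(w\), the loss and its gradient can be written in matrix form, which is exactly what the vectorized loop below computes:

\[\begin{align*} J(w) &= \frac{1}{2m} (Xw - y)^T (Xw - y) \\ \nabla_w J(w) &= \frac{1}{m} X^T (Xw - y) \\ w &\leftarrow w - \alpha \, \nabla_w J(w) \end{align*}\]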

X = normalized_df[['area', 'n_bedrooms']].values
X = np.c_[np.ones((len(X), 1)), X]
y = normalized_df['price'].values.reshape(-1, 1)
m = len(y)
num_features = X.shape[1]

w = np.random.rand(num_features, 1)
num_epochs = 600
alpha = 0.01
losses = []

for i in range(num_epochs):
    y_pred = X @ w
    
    cur_loss = 1 / (2 * m) * (y_pred - y).T @ (y_pred - y)
    cur_loss = cur_loss[0][0]
    losses.append(cur_loss)
    
    grad = 1 / m * X.T @ (y_pred - y)
    w -= alpha * grad
    
    if i % 100 == 0:
        print(f'Epoch {i+1:03d}, loss = {cur_loss:0.2f}')
Epoch 001, loss = 0.66
Epoch 101, loss = 0.20
Epoch 201, loss = 0.14
Epoch 301, loss = 0.13
Epoch 401, loss = 0.13
Epoch 501, loss = 0.13

Linear regression with scikit-learn#

from sklearn import linear_model

First, we train the linear regression model using the LinearRegression class from the sklearn framework. Then, we evaluate the model using the mean squared error (MSE) and the coefficient of determination (\(R^2\)).
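
For reference, the two metrics are defined as

\[\begin{align*} \mathrm{MSE} &= \frac{1}{m} \sum_{i=1}^{m} (\hat{y}_i - y_i)^2 \\ R^2 &= 1 - \frac{\sum_{i=1}^{m} (\hat{y}_i - y_i)^2}{\sum_{i=1}^{m} (y_i - \bar{y})^2} \end{align*}\]

where \(\hat{y}_i\) are the model predictions and \(\bar{y}\) is the mean of the targets.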

# Train the linear regression model
linear_regressor = linear_model.LinearRegression()
X = normalized_df[["area"]]
y = normalized_df["price"]
linear_regressor = linear_regressor.fit(X, y)
# Evaluate the model
y_pred = linear_regressor.predict(X)
mse = np.mean((y_pred - y) ** 2)
r2 = linear_regressor.score(X, y)
print(f'Mean Squared Error: {mse:.2f}')
print(f'R^2: {r2:.2f}')
Mean Squared Error: 0.26
R^2: 0.73
# # Draw the linear regression line with plotly
# fig = px.scatter(df, x="area", y="price", title="House price in Portland, Oregon with respect to area")
# fig.update_traces(marker=dict(color="blue", symbol="x"))
# fig.update_xaxes(title_text="Area (sq ft)")
# fig.update_yaxes(title_text="Price ($1000)")
# fig.add_scatter(
#     x=df["area"],
#     y=linear_regressor.predict(X) * std["price"] + mean["price"],
#     mode="lines", 
#     line=dict(color="red"),
# )
# fig.update_layout(showlegend=False)
# fig.show()
p = figure(title="House price in Portland, Oregon with respect to area", x_axis_label="Area (sq ft)", y_axis_label="Price ($1000)", sizing_mode="stretch_width")

point_plot = p.scatter(df["area"], df["price"], size=10, color="blue", alpha=0.5)
line_plot = p.line(df["area"], linear_regressor.predict(X) * std["price"] + mean["price"], line_width=2, color="red")
p.add_tools(HoverTool(renderers=[point_plot], tooltips=[("Area", "@x"), ("Price", "@y")]))
p.add_tools(HoverTool(renderers=[line_plot], tooltips=[("", f"y = {linear_regressor.intercept_ * std['price'] + mean['price']:.2f} + {linear_regressor.coef_[0] * std['price']:.2f}x")]))
show(p)

Now, we train a linear regression model to inspect the relationship between the number of bedrooms (n_bedrooms) and the price, and again evaluate the model using the mean squared error (MSE) and the coefficient of determination (\(R^2\)).

linear_regressor = linear_model.LinearRegression()
X = normalized_df[["n_bedrooms"]]
y = normalized_df["price"]
linear_regressor = linear_regressor.fit(X, y)

# Evaluate the model
y_pred = linear_regressor.predict(X)
mse = np.mean((y_pred - y) ** 2)
r2 = linear_regressor.score(X, y)
print(f'Mean Squared Error: {mse:.2f}')
print(f'R^2: {r2:.2f}')
Mean Squared Error: 0.79
R^2: 0.20
p = figure(title="House price in Portland, Oregon with respect to number of bedrooms", x_axis_label="Number of bedrooms", y_axis_label="Price ($1000)", sizing_mode="stretch_width")

point_plot = p.scatter(df["n_bedrooms"], df["price"], size=10, color="red", alpha=0.5)
line_plot = p.line(df["n_bedrooms"], linear_regressor.predict(X) * std["price"] + mean["price"], line_width=2, color="blue")
p.add_tools(HoverTool(renderers=[point_plot], tooltips=[("#Bedrooms", "@x"), ("Price", "@y")]))
p.add_tools(HoverTool(renderers=[line_plot], tooltips=[("", f"y = {linear_regressor.intercept_ * std['price'] + mean['price']:.2f} + {linear_regressor.coef_[0] * std['price']:.2f}x")]))
show(p)