Linear Regression#

Intuition#

Practice#

In this section, you will practice implementing linear regression models in Python on the Portland housing dataset. Three different methods will be used to implement the linear regression model:

  • Simple implementation with numpy.

  • Vectorized implementation with numpy.

  • Linear regression from the sklearn framework.

import os
import numpy as np
import pandas as pd
import wget

import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go

from bokeh.plotting import figure, show
from bokeh.io import output_notebook
from bokeh.models import ColumnDataSource, HoverTool

output_notebook()

Dataset#

Suppose we have a dataset giving the living areas and prices of 47 houses from Portland, Oregon as:

| Living area (feet²) | Price ($1000s) |
| --- | --- |
| 2104 | 400 |
| 1600 | 330 |
| 2400 | 369 |
| 1416 | 232 |
| 3000 | 540 |

(full data is available at ex1data2.txt)

if not os.path.exists('ex1data2.txt'):
    wget.download("https://raw.githubusercontent.com/girishkuniyal/Predict-housing-prices-in-Portland/master/ex1data2.txt")
with open("./ex1data2.txt") as f:
    data = f.read()

house_price = []
for line in data.split("\n"):
    if line:
        area, n_bedrooms, price = line.split(",")
        house_price.append([int(area), int(n_bedrooms), np.round(float(price)/1000, 0).astype(int)])

df = pd.DataFrame(house_price, columns=["area", "n_bedrooms", "price"])
df.head()
area n_bedrooms price
0 2104 3 400
1 1600 3 330
2 2400 3 369
3 1416 2 232
4 3000 4 540

Exploratory Data Analysis#

Price#

def get_bokeh_box_plot(dataframe, x_col, y_col, direction="vertical"):
    # let df be the sub-dataframe containing two columns of data: x_col and y_col
    if x_col is None:
        x_values = [""] * len(dataframe)
        y_values = dataframe[y_col]
        x_col = "x_col"
        df = pd.DataFrame({x_col: x_values, y_col: y_values})
    else:
        df = dataframe[[x_col, y_col]]
    cats = list(df[x_col].unique())

    # find the quartiles and IQR for each category
    groups = df.groupby(x_col)
    q1 = groups.quantile(q=0.25)
    q2 = groups.quantile(q=0.5)
    q3 = groups.quantile(q=0.75)
    mean = groups.mean()
    iqr = q3 - q1
    upper = q3 + 1.5 * iqr
    lower = q1 - 1.5 * iqr

    # find the outliers for each category
    def outliers(group):
        cat = group.name
        return group[
            (group[y_col] > upper.loc[cat][y_col]) | (group[y_col] < lower.loc[cat][y_col])
        ][y_col]

    out = groups.apply(outliers, include_groups=False)  # .dropna()

    # prepare outlier data for plotting, we need coordinates for every outlier.
    if not out.empty:
        outx = list(out.index.get_level_values(0))
        outy = list(out.values)
        if len(cats) == 1:
            outy = outy[0]
            outx = outx * len(outy)

    # if no outliers, shrink lengths of stems to be no longer than the minimums or maximums
    qmin = groups.quantile(q=0.00)
    qmax = groups.quantile(q=1.00)
    upper[y_col] = [min([x, y]) for (x, y) in zip(list(qmax.loc[:, y_col]), upper[y_col])]
    lower[y_col] = [max([x, y]) for (x, y) in zip(list(qmin.loc[:, y_col]), lower[y_col])]

    boxes_data = pd.concat(
        [
            q1.rename(columns={y_col: "q1"}),
            q2.rename(columns={y_col: "q2"}),
            q3.rename(columns={y_col: "q3"}),
            iqr.rename(columns={y_col: "iqr"}),
            upper.rename(columns={y_col: "upper"}),
            lower.rename(columns={y_col: "lower"}),
            mean.rename(columns={y_col: "mean"}),
        ],
        axis=1,
    )
    source = ColumnDataSource(boxes_data)
    cats = [str(x) for x in boxes_data.index]

    if direction == "vertical":
        # p = figure(background_fill_color="#efefef", x_range=cats, toolbar_location=None)
        p = figure(background_fill_color="#efefef", x_range=cats,)

        # stems
        upper_stem_line = p.segment(x_col, "upper", x_col, "q3", source=source, line_color="black")
        lower_stem_line = p.segment(x_col, "lower", x_col, "q1", source=source, line_color="black")

        # boxes
        top_box = p.vbar(
            x_col, 0.7, "q2", "q3", source=source, fill_color="#E08E79", line_color="black"
        )
        bottom_box = p.vbar(
            x_col, 0.7, "q1", "q2", source=source, fill_color="#3B8686", line_color="black"
        )

        # mean
        mean_line = p.rect(
            x_col,
            "mean",
            0.7,
            0.0001,
            source=source,
            line_color="black",
            fill_color="black",
            line_dash="dashed",
        )

        # whiskers (almost-0 height rects simpler than segments)
        lower_fence = p.rect(x_col, "lower", 0.2, 0.005, source=source, line_color="black")
        upper_fence = p.rect(x_col, "upper", 0.2, 0.005, source=source, line_color="black")

        # outliers
        if not out.empty:
            out_points = p.scatter(outx, outy, size=6, color="#F38630", fill_alpha=0.6)
            p.add_tools(
                HoverTool(
                    renderers=[out_points],
                    tooltips=[("x", "@x"), ("y", "@y")],
                )
            )

        p.yaxis.axis_label = y_col
        p.xgrid.grid_line_color = None
        p.ygrid.grid_line_color = "white"
        p.xaxis.major_label_text_font_size = "16px"

    elif direction == "horizontal":
        cats = sorted(cats, reverse=True)
        p = figure(
            background_fill_color="#efefef",
            y_range=cats,
            toolbar_location=None,
            width=800,
            height=300,
        )

        upper_stem_line = p.segment("upper", x_col, "q3", x_col, source=source, line_color="black")
        lower_stem_line = p.segment("lower", x_col, "q1", x_col, source=source, line_color="black")

        top_box = p.hbar(
            x_col, 0.7, "q2", "q3", source=source, fill_color="#E08E79", line_color="black"
        )
        bottom_box = p.hbar(
            x_col, 0.7, "q1", "q2", source=source, fill_color="#3B8686", line_color="black"
        )

        mean_line = p.rect(
            "mean",
            x_col,
            0.0001,
            0.7,
            source=source,
            line_color="black",
            fill_color="black",
            line_dash="dashed",
        )

        lower_fence = p.rect("lower", x_col, 0.005, 0.2, source=source, line_color="black")
        upper_fence = p.rect("upper", x_col, 0.005, 0.2, source=source, line_color="black")

        if not out.empty:
            out_points = p.scatter(outy, outx, size=6, color="#F38630", fill_alpha=0.6)
            p.add_tools(
                HoverTool(
                    renderers=[out_points],
                    tooltips=[("x", "@x"), ("y", "@y")],
                )
            )
        p.xaxis.axis_label = y_col
        p.ygrid.grid_line_color = None
        p.xgrid.grid_line_color = "white"
        p.yaxis.major_label_text_font_size = "16px"
    else:
        raise ValueError("direction must be either 'vertical' or 'horizontal'")

    p.grid.grid_line_width = 2
    box_hover = HoverTool(
        renderers=[
            top_box,
            bottom_box,
            mean_line,
            upper_stem_line,
            lower_stem_line,
            upper_fence,
            lower_fence,
        ],
        tooltips=[
            ("q1", "@q1"),
            ("q2", "@q2"),
            ("q3", "@q3"),
            ("iqr", "@iqr"),
            ("upper", "@upper"),
            ("lower", "@lower"),
            ("mean", "@mean"),
        ],
    )
    p.add_tools(box_hover)

    return p
# # Plot with plotly
# fig = go.Figure()
# fig.add_trace(go.Box(x=df["price"], name="House Price", marker_color='#3D9970', boxpoints='all', jitter=0.3, pointpos=-1.8, boxmean=True))
# fig.update_layout(title_text='House Price Distribution')
# fig.show()

# Plot with Bokeh
show(get_bokeh_box_plot(df, None, "price", direction="horizontal"))

From the visualization, we observe that the price distribution is right-skewed: the median (about 300) is less than the mean (about 340), because a few houses are priced well above the bulk of the data, most of which falls roughly between 250 and 400.

In addition, there is one noticeable outlier with a price of about 700, far above the rest of the prices.
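
These summary statistics can be verified directly from the dataframe (a quick sketch; the printed values depend on the loaded data):

# quick numerical check of the price distribution (values are in $1000s)
print(f"mean:   {df['price'].mean():.1f}")
print(f"median: {df['price'].median():.1f}")
print(f"skew:   {df['price'].skew():.2f}")  # a positive value indicates a right-skewed distribution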

Area#

# fig = make_subplots(rows=1, cols=2)
# fig = px.scatter(df, x="area", y="price", title="House price in Portland, Oregon with respect to area")
# fig.update_traces(marker=dict(color="blue", symbol="x"))
# fig.update_xaxes(title_text="Area (sq ft)")
# fig.update_yaxes(title_text="Price ($1000)")
# fig.show()
# Plot the area vs price
p = figure(title="House price in Portland, Oregon with respect to area", x_axis_label="Area (sq ft)", y_axis_label="Price ($1000)", sizing_mode="stretch_width")
p.scatter(df["area"], df["price"], size=10, color="blue", alpha=0.5)
p.add_tools(HoverTool(tooltips=[("Area", "@x"), ("Price", "@y")]))
show(p)

There is a positive correlation between the living area and the price: the price tends to increase as the living area increases.

Number of Bedrooms#

# # Plot with plotly
# fig = px.scatter(df, x="n_bedrooms", y="price", title="House price in Portland, Oregon with respect to number of bedrooms")
# fig.update_traces(marker=dict(color="red", symbol="x"))
# fig.update_xaxes(title_text="Number of bedrooms")
# fig.update_yaxes(title_text="Price ($1000)")
# fig.show()

# Plot with Bokeh
p = figure(title="House price in Portland, Oregon with respect to number of bedrooms", x_axis_label="Number of bedrooms", y_axis_label="Price ($1000)", sizing_mode="stretch_width")
p.scatter(df["n_bedrooms"], df["price"], size=10, color="red", alpha=0.5)
p.add_tools(HoverTool(tooltips=[("#Bedrooms", "@x"), ("Price", "@y")]))
show(p)

However, there is only a weak correlation between the number of bedrooms and the price. Most of the houses have 3 bedrooms, and their prices are not significantly different from those of houses with 4 bedrooms.
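
The strength of these relationships can be quantified with Pearson correlation coefficients (a quick sketch; the exact values depend on the data):

# Pearson correlation of each feature with the price
print(df[["area", "n_bedrooms", "price"]].corr()["price"])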

Data Preprocessing#

In this step, we normalize the features using the mean and standard deviation.

\[\begin{align*} x_i &= \frac{x_i - \mu_i}{\sigma_i} \end{align*}\]

where:

  • \(x_i\) is the value of feature \(i\)

  • \(\mu_i\) is the mean of feature \(i\)

  • \(\sigma_i\) is the standard deviation of feature \(i\)

Normalizing the features helps gradient descent converge more quickly and avoids numerical overflow during computation.

# scale the features
mean = df.mean()
std = df.std()

normalized_df = (df - mean) / std
normalized_df.head()
area n_bedrooms price
0 0.130010 -0.223675 0.476337
1 -0.504190 -0.223675 -0.083559
2 0.502476 -0.223675 0.228383
3 -0.735723 -1.537767 -0.867413
4 1.257476 1.090417 1.596128

Linear regression#

Closed-form solution#
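
The closed-form solution computes the parameters directly from the normal equation, with the bias absorbed into \(X\) as a leading column of ones; if \(X^T X\) is singular, the implementation below falls back to the Moore-Penrose pseudoinverse:

\[\begin{align*} \theta &= (X^T X)^{-1} X^T y \end{align*}\]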

def closed_form_linear_regression(X, y):
    try:
        return np.linalg.inv(X.T @ X) @ X.T @ y
    except np.linalg.LinAlgError:
        return np.linalg.pinv(X.T @ X) @ X.T @ y

X = normalized_df["area"].values.reshape(-1, 1)
X = np.c_[np.ones((len(X), 1)), X]
y = normalized_df["price"].values.reshape(-1, 1)

theta = closed_form_linear_regression(X, y)
b, w = theta[0], theta[1]
print(f'The formula for the regression line is y = {b[0]:.2f} + {w[0]:.2f}x')
The formula for the regression line is y = 0.00 + 0.86x
# # plot with plotly
# fig = px.scatter(
#     df, x="area", y="price", title="House price in Portland, Oregon with respect to area",
#     labels={"area": "Area (sq ft)", "price": "Price ($1000)"},
# )
# fig.update_traces(marker=dict(color="blue", symbol="x"))
# fig.update_xaxes(title_text="Area (sq ft)")
# fig.update_yaxes(title_text="Price ($1000)")
# fig.add_scatter(
#     x=df["area"],
#     y=(w[0] * normalized_df["area"] + b) * std["price"] + mean["price"],
#     mode="lines", 
#     line=dict(color="red"),
# )
# fig.update_layout(showlegend=False)
# fig.show()
p = figure(title="House price in Portland, Oregon with respect to area", x_axis_label="Area (sq ft)", y_axis_label="Price ($1000)", sizing_mode="stretch_width")

point_plot = p.scatter(df["area"], df["price"], size=10, color="blue", alpha=0.5)
line_plot = p.line(df["area"], (w[0] * normalized_df["area"] + b) * std["price"] + mean["price"], line_width=2, color="red")
p.add_tools(HoverTool(renderers=[point_plot], tooltips=[("Area", "@x"), ("Price", "@y")]))
p.add_tools(HoverTool(renderers=[line_plot], tooltips=[("", f"y = {b[0] * std['price'] + mean['price']:.2f} + {w[0] * std['price']:.2f}x")]))
show(p)

Single variable linear regression with gradient descent#
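
The cell below fits \(y \approx wx + b\) by batch gradient descent on the mean squared error loss. Each epoch evaluates the loss and applies the updates

\[\begin{align*} J(w, b) &= \frac{1}{2m} \sum_{i=1}^{m} \left(w x^{(i)} + b - y^{(i)}\right)^2 \\ w &\leftarrow w - \alpha \cdot \frac{1}{m} \sum_{i=1}^{m} \left(w x^{(i)} + b - y^{(i)}\right) x^{(i)} \\ b &\leftarrow b - \alpha \cdot \frac{1}{m} \sum_{i=1}^{m} \left(w x^{(i)} + b - y^{(i)}\right) \end{align*}\]

where \(\alpha\) is the learning rate and \(m\) is the number of training examples.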

feature_name = "n_bedrooms"
X = normalized_df[[feature_name]].values
y = normalized_df['price'].values.reshape(-1, 1)

num_epochs = 1000
alpha = 0.01

w, b = np.random.rand(2)
init_w, init_b = w, b
losses = []

for i in range(num_epochs):
    y_pred = w * X + b
    loss = np.mean(0.5 * (y_pred - y) ** 2)
    losses.append(loss)
    
    w -= alpha * np.mean((y_pred - y) * X)
    b -= alpha * np.mean(y_pred - y)
    
    if i % 100 == 0:
        print(f'Epoch {i}, loss: {loss:.2f}')
        
print(f'The formula for the regression line is y = {b:.2f} + {w:.2f}x')
Epoch 0, loss: 0.56
Epoch 100, loss: 0.42
Epoch 200, loss: 0.40
Epoch 300, loss: 0.39
Epoch 400, loss: 0.39
Epoch 500, loss: 0.39
Epoch 600, loss: 0.39
Epoch 700, loss: 0.39
Epoch 800, loss: 0.39
Epoch 900, loss: 0.39
The formula for the regression line is y = 0.00 + 0.44x
p = figure(title=f"House price in Portland, Oregon with respect to {feature_name}", x_axis_label=f"{feature_name}", y_axis_label="Price ($1000)", sizing_mode="stretch_width")

point_plot = p.scatter(df[feature_name], df["price"], size=10, color="red", alpha=0.5)
line_plot = p.line(df[feature_name], (w * normalized_df[feature_name] + b) * std["price"] + mean["price"], line_width=2, color="blue")
p.add_tools(HoverTool(renderers=[point_plot], tooltips=[(feature_name, "@x"), ("Price", "@y")]))
p.add_tools(HoverTool(renderers=[line_plot], tooltips=[("", f"y = {b * std['price'] + mean['price']:.2f} + {w * std['price']:.2f}x")]))
show(p)

Multivariate linear regression with gradient descent and vectorization#
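
With the bias folded into \(X\) as a leading column of ones and all parameters collected in a vector \(w\), the loss and its gradient can be written in matrix form, which is exactly what the vectorized loop below computes:

\[\begin{align*} J(w) &= \frac{1}{2m} (Xw - y)^T (Xw - y) \\ \nabla_w J(w) &= \frac{1}{m} X^T (Xw - y) \\ w &\leftarrow w - \alpha \, \nabla_w J(w) \end{align*}\]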

X = normalized_df[['area', 'n_bedrooms']].values
X = np.c_[np.ones((len(X), 1)), X]
y = normalized_df['price'].values.reshape(-1, 1)
m = len(y)
num_features = X.shape[1]

w = np.random.rand(num_features, 1)
num_epochs = 600
alpha = 0.01
losses = []

for i in range(num_epochs):
    y_pred = X @ w
    
    cur_loss = 1 / (2 * m) * (y_pred - y).T @ (y_pred - y)
    cur_loss = cur_loss[0][0]
    losses.append(cur_loss)
    
    grad = 1 / m * X.T @ (y_pred - y)
    w -= alpha * grad
    
    if i % 100 == 0:
        print(f'Epoch {i+1:03d}, loss = {cur_loss:0.2f}')
Epoch 001, loss = 0.66
Epoch 101, loss = 0.20
Epoch 201, loss = 0.14
Epoch 301, loss = 0.13
Epoch 401, loss = 0.13
Epoch 501, loss = 0.13

Linear regression with scikit-learn#

from sklearn import linear_model

First, we train the linear regression model using the LinearRegression class from the sklearn framework. Then, we evaluate the model using the mean squared error (MSE) and the coefficient of determination (\(R^2\)).
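
For reference, the two metrics are defined as

\[\begin{align*} \mathrm{MSE} &= \frac{1}{m} \sum_{i=1}^{m} (\hat{y}_i - y_i)^2 \\ R^2 &= 1 - \frac{\sum_{i=1}^{m} (\hat{y}_i - y_i)^2}{\sum_{i=1}^{m} (y_i - \bar{y})^2} \end{align*}\]

where \(\hat{y}_i\) are the model predictions and \(\bar{y}\) is the mean of the targets.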

# Train the linear regression model
linear_regressor = linear_model.LinearRegression()
X = normalized_df[["area"]]
y = normalized_df["price"]
linear_regressor = linear_regressor.fit(X, y)
# Evaluate the model
y_pred = linear_regressor.predict(X)
mse = np.mean((y_pred - y) ** 2)
r2 = linear_regressor.score(X, y)
print(f'Mean Squared Error: {mse:.2f}')
print(f'R^2: {r2:.2f}')
Mean Squared Error: 0.26
R^2: 0.73
# # Draw the linear regression line with plotly
# fig = px.scatter(df, x="area", y="price", title="House price in Portland, Oregon with respect to area")
# fig.update_traces(marker=dict(color="blue", symbol="x"))
# fig.update_xaxes(title_text="Area (sq ft)")
# fig.update_yaxes(title_text="Price ($1000)")
# fig.add_scatter(
#     x=df["area"],
#     y=linear_regressor.predict(X) * std["price"] + mean["price"],
#     mode="lines", 
#     line=dict(color="red"),
# )
# fig.update_layout(showlegend=False)
# fig.show()
p = figure(title="House price in Portland, Oregon with respect to area", x_axis_label="Area (sq ft)", y_axis_label="Price ($1000)", sizing_mode="stretch_width")

point_plot = p.scatter(df["area"], df["price"], size=10, color="blue", alpha=0.5)
line_plot = p.line(df["area"], linear_regressor.predict(X) * std["price"] + mean["price"], line_width=2, color="red")
p.add_tools(HoverTool(renderers=[point_plot], tooltips=[("Area", "@x"), ("Price", "@y")]))
p.add_tools(HoverTool(renderers=[line_plot], tooltips=[("", f"y = {linear_regressor.intercept_ * std['price'] + mean['price']:.2f} + {linear_regressor.coef_[0] * std['price']:.2f}x")]))
show(p)

Now, we train a linear regression model to inspect the relationship between the number of bedrooms (n_bedrooms) and the price, and again evaluate the model using the mean squared error (MSE) and the coefficient of determination (\(R^2\)).

linear_regressor = linear_model.LinearRegression()
X = normalized_df[["n_bedrooms"]]
y = normalized_df["price"]
linear_regressor = linear_regressor.fit(X, y)

# Evaluate the model
y_pred = linear_regressor.predict(X)
mse = np.mean((y_pred - y) ** 2)
r2 = linear_regressor.score(X, y)
print(f'Mean Squared Error: {mse:.2f}')
print(f'R^2: {r2:.2f}')
Mean Squared Error: 0.79
R^2: 0.20
p = figure(title="House price in Portland, Oregon with respect to number of bedrooms", x_axis_label="Number of bedrooms", y_axis_label="Price ($1000)", sizing_mode="stretch_width")

point_plot = p.scatter(df["n_bedrooms"], df["price"], size=10, color="red", alpha=0.5)
line_plot = p.line(df["n_bedrooms"], linear_regressor.predict(X) * std["price"] + mean["price"], line_width=2, color="blue")
p.add_tools(HoverTool(renderers=[point_plot], tooltips=[("#Bedrooms", "@x"), ("Price", "@y")]))
p.add_tools(HoverTool(renderers=[line_plot], tooltips=[("", f"y = {linear_regressor.intercept_ * std['price'] + mean['price']:.2f} + {linear_regressor.coef_[0] * std['price']:.2f}x")]))
show(p)