Logistic Regression


This notebook introduces the Logistic Regression algorithm and shows how to implement it from scratch in Python.

Practice

import os
import pandas as pd
import numpy as np

import plotly.express as px
import plotly.graph_objs as go

from bokeh.plotting import figure, show
from bokeh.io import output_notebook
from bokeh.models import ColumnDataSource, HoverTool
from bokeh.models import Range1d, LinearAxis, Legend

# train_test_split
from sklearn.model_selection import train_test_split

output_notebook()

Dataset

In this implementation, we are going to use the Pima Indians Diabetes Database.

df = pd.read_csv(
    "https://raw.githubusercontent.com/npradaschnor/Pima-Indians-Diabetes-Dataset/master/diabetes.csv"
)
df.head()
   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  DiabetesPedigreeFunction  Age  Outcome
0            6      148             72             35        0  33.6                     0.627   50        1
1            1       85             66             29        0  26.6                     0.351   31        0
2            8      183             64              0        0  23.3                     0.672   32        1
3            1       89             66             23       94  28.1                     0.167   21        0
4            0      137             40             35      168  43.1                     2.288   33        1
# # Plot with Plotly
# fig = px.histogram(df, x="Outcome", labels={"Outcome": "Diabetes Outcome"})
# fig.update_layout(bargap=0.8)
# fig.show()

# Plot with Bokeh
source = ColumnDataSource(df)

p = figure(
    title="Diabetes Outcome",
    x_axis_label="Outcome",
    y_axis_label="Count",
    toolbar_location=None,
    tools=""
)
# use np.unique to get unique values
values, counts = np.unique(df["Outcome"], return_counts=True)
p.vbar(x=values, top=counts, width=0.25, color="navy")
p.add_tools(HoverTool(tooltips=[("Count", "@top")]))
p.xgrid.grid_line_color = None
p.y_range.start = 0
show(p)

There are 268 patients who have diabetes and 500 who do not, so the classes are somewhat imbalanced.

Now, we split the data into training and testing sets, using 80% of the data for training and 20% for testing. The train_test_split function from sklearn.model_selection makes this easy.

To preserve the class ratio in both the training and testing sets, we pass stratify=y to train_test_split.

# Split the data
X = df.drop("Outcome", axis=1)
y = df["Outcome"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
(614, 8) (154, 8) (614,) (154,)
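
With stratify=y, the class proportions in the training and test sets should closely match those of the full dataset (roughly 65% without diabetes and 35% with diabetes). A quick check:

# Class proportions in the full, training, and test sets should be nearly identical
print(y.value_counts(normalize=True).round(3))
print(y_train.value_counts(normalize=True).round(3))
print(y_test.value_counts(normalize=True).round(3))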

Next, we standardize the data using z-score normalization, which rescales each value based on the mean and standard deviation of its feature. The formula is:

\[z = \frac{x - \mu}{\sigma}\]

where \(x\) is the original value, \(\mu\) is the mean of the feature, and \(\sigma\) is its standard deviation.

Data standardization is a common and important step in data preprocessing. It keeps features with large numeric ranges from dominating the others, and many machine learning algorithms, including the gradient descent we use below, perform better or converge faster when features are on a relatively similar scale and roughly normally distributed.

Now, for each feature, we calculate the mean and standard deviation of the training set, and use these values to standardize both the training and testing sets. Computing the statistics on the training set only prevents information from the test set from leaking into the model.

# mean = df.mean()
# std = df.std()

# scaled_df = (df - mean) / std
# scaled_df.head()

# Statistics are computed on the training set only, then applied to both sets
mean = X_train.mean()
std = X_train.std()

X_train_scaled = (X_train - mean) / std
X_test_scaled = (X_test - mean) / std
X_train_scaled.head()
     Pregnancies   Glucose  BloodPressure  SkinThickness   Insulin       BMI  DiabetesPedigreeFunction       Age
353    -0.850662 -0.979332      -0.404454      -0.553521 -0.331049 -0.607183                  0.310541 -0.791524
711     0.356285  0.161313       0.464989       0.392467 -0.525969 -0.301893                 -0.116344  0.560577
373    -0.548925 -0.504063      -0.621815       1.212324  0.142327  0.372290                 -0.764239 -0.707018
46     -0.850662  0.795004      -0.730495      -1.310312 -0.730171 -0.289172                  0.262100 -0.368992
682    -1.152398 -0.820909      -0.295774       1.149258  0.244428  1.606173                 -0.337355 -0.960537
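
A quick check confirms that, after scaling, each training feature has mean approximately 0 and standard deviation 1:

# Each training column should have mean ≈ 0 and std ≈ 1 after scaling
print(X_train_scaled.mean().round(3))
print(X_train_scaled.std().round(3))

We can now implement the model itself. Logistic regression passes a linear combination of the features through the sigmoid function to produce a probability, and is trained by minimizing the binary cross-entropy loss:

\[h = \sigma(Xw) = \frac{1}{1 + e^{-Xw}}\]

\[J(w) = -\frac{1}{m}\sum_{i=1}^{m}\left[y_i \log(h_i) + (1 - y_i)\log(1 - h_i)\right]\]

where \(m\) is the number of training samples, \(X\) is the feature matrix (with a leading column of ones for the bias term), and \(w\) is the weight vector. For simplicity, the model below is trained on only two of the standardized features: Pregnancies and Glucose.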
# Sigmoid activation: maps any real value into the (0, 1) interval
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

# Binary cross-entropy loss, averaged over all samples
def loss_function(y, y_pred):
    return -np.mean(y * np.log(y_pred) + (1 - y) * np.log(1 - y_pred))

# Use two of the standardized features and prepend a column of ones for the bias term
X = X_train_scaled[['Pregnancies', 'Glucose']].values
X = np.c_[np.ones(X.shape[0]), X]
y_train_values = y_train.values.reshape(-1, 1)
m = X.shape[0]

# Random weight initialization, learning rate, and number of gradient descent steps
w = np.random.rand(X.shape[1], 1)
alpha = 0.01
epochs = 2500

losses = []
accuracies = []

for i in range(epochs):
    # Forward pass
    z = X @ w # (m, n) * (n, 1) = (m, 1)
    h = sigmoid(z) # (m, 1) = (m, 1)
    
    # Compute loss and accuracy
    loss = loss_function(y_train_values, h)
    losses.append(loss)
    accuracy = np.mean((h > 0.5) == y_train_values)
    accuracies.append(np.round(accuracy * 100.0, 2))

    # Backward pass
    gradient = 1 / m * X.T @ (h - y_train_values) # scalar * (n, m) * (m, 1) = (n, 1)
    w -= alpha * gradient # (n, 1) - scalar * (n, 1) = (n, 1)
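
The weight update in the loop is plain gradient descent: the gradient of the cross-entropy loss with respect to \(w\) is

\[\nabla_w J = \frac{1}{m} X^\top (h - y)\]

and each epoch moves the weights a small step of size \(\alpha\) in the opposite direction, \(w \leftarrow w - \alpha \nabla_w J\).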

Plot the training loss and accuracy.

# fig = go.Figure()
# fig.add_trace(go.Scatter(x=np.arange(epochs), y=losses, name="Loss", mode="lines"))
# fig.add_trace(go.Scatter(x=np.arange(epochs), y=accuracies, name="Accuracy", yaxis="y2", mode="lines"))
# fig.update_layout(title="Training Loss and Accuracy", xaxis_title="Epochs", yaxis_title="Loss")
# fig.update_layout(yaxis2=dict(title="Accuracy", overlaying="y", side="right"))
# fig.update_layout(legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1))
# fig.show()


# Plot with Bokeh
source = ColumnDataSource(data={"x": np.arange(epochs), "loss": losses, "accuracy": accuracies})

p = figure(
    title="Training Loss and Accuracy",
    x_axis_label="Epochs",
    y_axis_label="Loss",
    sizing_mode="scale_width",
    aspect_ratio=2,
    toolbar_location=None,
    tools=""
)
p.y_range = Range1d(start=min(losses) - 0.05 * max(losses), end=max(losses)*1.05)
p.extra_y_ranges = {"accuracy": Range1d(start=min(accuracies) - 0.05 * max(accuracies), end=max(accuracies)*1.05)}
p.add_layout(LinearAxis(y_range_name="accuracy", axis_label="Accuracy"), "right")

# p.add_layout(Legend(location='right', orientation='horizontal', padding=0, border_line_alpha=0, margin=0), 'above')

loss_plot = p.line(x="x", y="loss", source=source, color="navy", legend_label="Loss")
accuracy_plot = p.line(x="x", y="accuracy", source=source, color="red", legend_label="Accuracy", y_range_name="accuracy")

p.add_tools(HoverTool(renderers=[loss_plot], tooltips=[("Epoch", "@x"), ("Loss", "@loss")]))
p.add_tools(HoverTool(renderers=[accuracy_plot], tooltips=[("Epoch", "@x"), ("Accuracy", "@accuracy")]))

p.legend.location = "center_right"
p.legend.click_policy="mute"

show(p)

Now, we evaluate the model on the testing set. We calculate the accuracy, precision, recall, and F1-score of the model.
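
These metrics are computed from the counts of true positives (TP), false positives (FP), and false negatives (FN):

\[\text{precision} = \frac{TP}{TP + FP}, \qquad \text{recall} = \frac{TP}{TP + FN}, \qquad F_1 = 2 \cdot \frac{\text{precision} \cdot \text{recall}}{\text{precision} + \text{recall}}\]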

X = X_test_scaled[['Pregnancies', 'Glucose']].values
X = np.c_[np.ones(X.shape[0]), X]
y_test_values = y_test.values.reshape(-1, 1)
m = X.shape[0]

z = X @ w
h = sigmoid(z)
accuracy = np.mean((h > 0.5) == y_test_values)
print(f"Test Accuracy: {np.round(accuracy * 100.0, 2)}%")

# Calculate the classification metrics with sklearn
from sklearn.metrics import confusion_matrix, classification_report

report = classification_report(y_test, h > 0.5)
print(report)
Test Accuracy: 70.78%
              precision    recall  f1-score   support

           0       0.74      0.85      0.79       100
           1       0.62      0.44      0.52        54

    accuracy                           0.71       154
   macro avg       0.68      0.65      0.65       154
weighted avg       0.70      0.71      0.69       154
# Row-normalized confusion matrix: each row sums to 1
cm = confusion_matrix(y_test, h > 0.5, normalize='true')
target_names = ['No Diabetes', 'Diabetes']
p = figure(
    title='Confusion matrix',
    width=500,
    height=500,
    x_range=target_names,
    y_range=target_names,
)
source = ColumnDataSource(pd.DataFrame(cm, columns=target_names, index=target_names))
image_plot = p.image(image=[cm], x=0, y=0, dw=len(target_names), dh=len(target_names), palette='Blues256')
p.xaxis.major_label_orientation = 1
p.xaxis.axis_label = 'Prediction'
p.yaxis.axis_label = 'Ground truth'
p.add_tools(HoverTool(tooltips=[('accuracy', '@image')], renderers=[image_plot]))
# Annotate each cell with its percentage
for i in range(len(target_names)):
    for j in range(len(target_names)):
        p.text(
            x=[target_names[j]],
            y=[target_names[i]],
            text=[f'{cm[i, j]*100:.2f}%'],
            text_color='black' if cm[i, j] > 0.5 else 'white',
            text_align='center',
            text_baseline='middle',
        )
p.toolbar_location = None
show(p)
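
For comparison, the same model can be fit with scikit-learn's LogisticRegression on the two standardized features. It should reach a similar test accuracy to the from-scratch implementation above; small differences come from the solver and the L2 regularization that scikit-learn applies by default.

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Fit on the same two standardized features used above
clf = LogisticRegression()
clf.fit(X_train_scaled[['Pregnancies', 'Glucose']], y_train)

y_pred_sklearn = clf.predict(X_test_scaled[['Pregnancies', 'Glucose']])
print(f"sklearn Test Accuracy: {accuracy_score(y_test, y_pred_sklearn) * 100:.2f}%")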