Logistic Regression#
This notebook introduces the logistic regression algorithm and shows how to implement it from scratch in Python.
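As a quick recap, logistic regression models the probability that an input \(x\) belongs to the positive class by passing a linear combination of the features through the sigmoid function:

\[ P(y = 1 \mid x) = \sigma(w^\top x), \qquad \sigma(z) = \frac{1}{1 + e^{-z}} \]

The weights \(w\) are learned by minimizing the binary cross-entropy loss with gradient descent, which is what we implement below.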
Practice#
import os
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objs as go
from bokeh.plotting import figure, show
from bokeh.io import output_notebook
from bokeh.models import ColumnDataSource, HoverTool
from bokeh.models import Range1d, LinearAxis, Legend
# train_test_split
from sklearn.model_selection import train_test_split
output_notebook()
Dataset#
In this implementation, we are going to use the Pima Indians Diabetes Database.
df = pd.read_csv(
"https://raw.githubusercontent.com/npradaschnor/Pima-Indians-Diabetes-Dataset/master/diabetes.csv"
)
df.head()
| | Pregnancies | Glucose | BloodPressure | SkinThickness | Insulin | BMI | DiabetesPedigreeFunction | Age | Outcome |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 6 | 148 | 72 | 35 | 0 | 33.6 | 0.627 | 50 | 1 |
| 1 | 1 | 85 | 66 | 29 | 0 | 26.6 | 0.351 | 31 | 0 |
| 2 | 8 | 183 | 64 | 0 | 0 | 23.3 | 0.672 | 32 | 1 |
| 3 | 1 | 89 | 66 | 23 | 94 | 28.1 | 0.167 | 21 | 0 |
| 4 | 0 | 137 | 40 | 35 | 168 | 43.1 | 2.288 | 33 | 1 |
# # Plot with Plotly
# fig = px.histogram(df, x="Outcome", labels={"Outcome": "Diabetes Outcome"})
# fig.update_layout(bargap=0.8)
# fig.show()
# Plot with Bokeh
source = ColumnDataSource(df)
p = figure(
title="Diabetes Outcome",
x_axis_label="Outcome",
y_axis_label="Count",
toolbar_location=None,
tools=""
)
# use np.unique to get the class labels and their counts
values, counts = np.unique(df["Outcome"], return_counts=True)
p.vbar(x=values, top=counts, width=0.25, color="navy")
p.add_tools(HoverTool(tooltips=[("Count", "@top")]))
p.xgrid.grid_line_color = None
p.y_range.start = 0
show(p)
There are 268 patients who have diabetes and 500 patients who do not.
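These counts can be verified directly from the dataframe:
# Verify the class counts
print(df["Outcome"].value_counts())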
Now, we split the data into training and testing sets, using 80% of the data for training and 20% for testing. The `train_test_split` function from `sklearn.model_selection` makes this easy. To preserve the class ratio in both sets, we pass `stratify=y` to `train_test_split`.
# Split the data
X = df.drop("Outcome", axis=1)
y = df["Outcome"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
(614, 8) (154, 8) (614,) (154,)
Next, we standardize the data using z-score normalization, in which each value is rescaled using the mean and standard deviation of its feature:

\[ z = \frac{x - \mu}{\sigma} \]

where \(x\) is the original feature value, \(\mu\) is the mean of the feature, and \(\sigma\) is its standard deviation.

Standardization is a common and important step in data preprocessing. It ensures that each feature contributes approximately proportionately to distance-based computations, and many machine learning algorithms perform better or converge faster when the features are on a similar scale and roughly normally distributed.
Now, for each feature, we calculate the mean and standard deviation of the training set. Then, we use these values to standardize both the training and testing sets.
mean = X_train.mean()
std = X_train.std()
X_train_scaled = (X_train - mean) / std
X_test_scaled = (X_test - mean) / std
X_train_scaled.head()
| | Pregnancies | Glucose | BloodPressure | SkinThickness | Insulin | BMI | DiabetesPedigreeFunction | Age |
|---|---|---|---|---|---|---|---|---|
| 353 | -0.850662 | -0.979332 | -0.404454 | -0.553521 | -0.331049 | -0.607183 | 0.310541 | -0.791524 |
| 711 | 0.356285 | 0.161313 | 0.464989 | 0.392467 | -0.525969 | -0.301893 | -0.116344 | 0.560577 |
| 373 | -0.548925 | -0.504063 | -0.621815 | 1.212324 | 0.142327 | 0.372290 | -0.764239 | -0.707018 |
| 46 | -0.850662 | 0.795004 | -0.730495 | -1.310312 | -0.730171 | -0.289172 | 0.262100 | -0.368992 |
| 682 | -1.152398 | -0.820909 | -0.295774 | 1.149258 | 0.244428 | 1.606173 | -0.337355 | -0.960537 |
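As a quick sanity check, every scaled training feature should now have mean approximately 0 and standard deviation 1; the test set will be close to, but not exactly, standardized, because it was scaled with the training-set statistics:
# Each scaled training feature should have mean ~0 and std ~1
print(X_train_scaled.mean().round(6))
print(X_train_scaled.std().round(6))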
def sigmoid(x):
    # Logistic (sigmoid) function: maps any real value into (0, 1)
    return 1 / (1 + np.exp(-x))
def loss_function(y, y_pred):
    # Binary cross-entropy loss; clip predictions to avoid log(0)
    eps = 1e-12
    y_pred = np.clip(y_pred, eps, 1 - eps)
    return -np.mean(y * np.log(y_pred) + (1 - y) * np.log(1 - y_pred))
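The loss_function above implements the binary cross-entropy (log) loss over the \(m\) training examples:

\[ J(w) = -\frac{1}{m} \sum_{i=1}^{m} \left[ y^{(i)} \log h^{(i)} + \left(1 - y^{(i)}\right) \log\left(1 - h^{(i)}\right) \right] \]

where \(h^{(i)} = \sigma(w^\top x^{(i)})\) is the predicted probability for example \(i\).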
# Use two features and prepend a column of ones for the bias (intercept) term
X = X_train_scaled[['Pregnancies', 'Glucose']].values
X = np.c_[np.ones(X.shape[0]), X]
y_train_values = y_train.values.reshape(-1, 1)
m = X.shape[0]  # number of training examples
w = np.random.rand(X.shape[1], 1)  # random initial weights
alpha = 0.01  # learning rate
epochs = 2500
losses = []
accuracies = []
for i in range(epochs):
# Forward pass
z = X @ w # (m, n) * (n, 1) = (m, 1)
h = sigmoid(z) # elementwise sigmoid, shape (m, 1)
# Compute loss and accuracy
loss = loss_function(y_train_values, h)
losses.append(loss)
accuracy = np.mean((h > 0.5) == y_train_values)
accuracies.append(np.round(accuracy * 100.0, 2))
# Backward pass
gradient = 1 / m * X.T @ (h - y_train_values) # scalar * (n, m) * (m, 1) = (n, 1)
w -= alpha * gradient # (n, 1) - scalar * (n, 1) = (n, 1)
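Each iteration performs one gradient-descent update. For logistic regression with the cross-entropy loss, the gradient has the compact vectorized form

\[ \nabla_w J = \frac{1}{m} X^\top (h - y) \]

and the update rule is \( w \leftarrow w - \alpha \, \nabla_w J \), where \(\alpha\) is the learning rate.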
Plot the training loss and accuracy.
# fig = go.Figure()
# fig.add_trace(go.Scatter(x=np.arange(epochs), y=losses, name="Loss", mode="lines"))
# fig.add_trace(go.Scatter(x=np.arange(epochs), y=accuracies, name="Accuracy", yaxis="y2", mode="lines"))
# fig.update_layout(title="Training Loss and Accuracy", xaxis_title="Epochs", yaxis_title="Loss")
# fig.update_layout(yaxis2=dict(title="Accuracy", overlaying="y", side="right"))
# fig.update_layout(legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1))
# fig.show()
# Plot with Bokeh
source = ColumnDataSource(data={"x": np.arange(epochs), "loss": losses, "accuracy": accuracies})
p = figure(
title="Training Loss and Accuracy",
x_axis_label="Epochs",
y_axis_label="Loss",
sizing_mode="scale_width",
aspect_ratio=2,
toolbar_location=None,
tools=""
)
p.y_range = Range1d(start=min(losses) - 0.05 * max(losses), end=max(losses)*1.05)
p.extra_y_ranges = {"accuracy": Range1d(start=min(accuracies) - 0.05 * max(accuracies), end=max(accuracies)*1.05)}
p.add_layout(LinearAxis(y_range_name="accuracy", axis_label="Accuracy"), "right")
# p.add_layout(Legend(location='right', orientation='horizontal', padding=0, border_line_alpha=0, margin=0), 'above')
loss_plot = p.line(x="x", y="loss", source=source, color="navy", legend_label="Loss")
accuracy_plot = p.line(x="x", y="accuracy", source=source, color="red", legend_label="Accuracy", y_range_name="accuracy")
p.add_tools(HoverTool(renderers=[loss_plot], tooltips=[("Epoch", "@x"), ("Loss", "@loss")]))
p.add_tools(HoverTool(renderers=[accuracy_plot], tooltips=[("Epoch", "@x"), ("Accuracy", "@accuracy")]))
p.legend.location = "center_right"
p.legend.click_policy = "mute"
show(p)
Now, we evaluate the model on the testing set. We calculate the accuracy, precision, recall, and F1-score of the model.
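These metrics are defined from the counts of true positives (TP), false positives (FP), and false negatives (FN):

\[ \text{precision} = \frac{TP}{TP + FP}, \qquad \text{recall} = \frac{TP}{TP + FN}, \qquad F_1 = 2 \cdot \frac{\text{precision} \cdot \text{recall}}{\text{precision} + \text{recall}} \]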
# Build the test design matrix with the same features and a bias column
X = X_test_scaled[['Pregnancies', 'Glucose']].values
X = np.c_[np.ones(X.shape[0]), X]
y_test_values = y_test.values.reshape(-1, 1)
m = X.shape[0]
z = X @ w
h = sigmoid(z)
accuracy = np.mean((h > 0.5) == y_test_values)
print(f"Test Accuracy: {np.round(accuracy * 100.0, 2)}%")
# Calculate the confusion matrix and classification report with sklearn
from sklearn.metrics import confusion_matrix, classification_report
report = classification_report(y_test, h > 0.5)
print(report)
Test Accuracy: 70.78%
precision recall f1-score support
0 0.74 0.85 0.79 100
1 0.62 0.44 0.52 54
accuracy 0.71 154
macro avg 0.68 0.65 0.65 154
weighted avg 0.70 0.71 0.69 154
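These numbers can be reproduced by hand from the raw prediction counts; here is a minimal cross-check of the positive-class precision and recall, using the h and y_test already computed above:
# Manual cross-check of the positive-class precision and recall
y_pred = (h > 0.5).astype(int).ravel()
y_true = y_test.values
tp = np.sum((y_true == 1) & (y_pred == 1))
fp = np.sum((y_true == 0) & (y_pred == 1))
fn = np.sum((y_true == 1) & (y_pred == 0))
print(f"Precision: {tp / (tp + fp):.2f}, Recall: {tp / (tp + fn):.2f}")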
cm = confusion_matrix(y_test, h > 0.5, normalize='true')
target_names = ['No Diabetes', 'Diabetes']
p = figure(
title='Confusion matrix',
width=500,
height=500,
x_range=target_names,
y_range=target_names,
)
image_plot = p.image(image=[cm], x=0, y=0, dw=len(target_names), dh=len(target_names), palette='Blues256')
p.xaxis.major_label_orientation = 1
p.xaxis.axis_label = 'Prediction'
p.yaxis.axis_label = 'Ground truth'
p.add_tools(HoverTool(tooltips=[('accuracy', '@image')], renderers=[image_plot]))
for i in range(len(target_names)):
for j in range(len(target_names)):
p.text(
x=[target_names[j]],
y=[target_names[i]],
text=[f'{cm[i, j]*100:.2f}%'],
text_color='black' if cm[i, j] > 0.5 else 'white',
text_align='center',
text_baseline='middle',
)
p.toolbar_location = None
show(p)
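As an optional sanity check (not part of the from-scratch implementation above), we can fit scikit-learn's LogisticRegression on the same two scaled features and compare its test accuracy with ours; the two should land in a similar range.
# Optional: compare against scikit-learn's logistic regression on the same features
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression()
clf.fit(X_train_scaled[["Pregnancies", "Glucose"]], y_train)
print(f"sklearn test accuracy: {clf.score(X_test_scaled[['Pregnancies', 'Glucose']], y_test) * 100:.2f}%")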