Supervised Learning → Logistic Regression (Classification)¶

This notebook is part of the ML-Methods project.

The first sections focus on data preparation and are intentionally repeated.

This allows each notebook to be read independently and makes model comparisons clearer.

  1. Project setup and common pipeline
  2. Dataset loading
  3. Train-test split
  4. Feature scaling (why we do it)

  5. What is this model? (Intuition)
  6. Model training
  7. Model behavior and key parameters
  8. Predictions
  9. Model evaluation
  10. When to use it and when not to
  11. Model persistence
  12. Mathematical formulation (deep dive)
  13. Final summary – Code only

How this notebook should be read¶

This notebook is designed to be read top to bottom.

Before every code cell, you will find a short explanation describing:

  • what we are about to do
  • why this step is necessary
  • how it fits into the overall process

The goal is not just to run the code, but to understand what is happening at each step and be able to adapt it to your own data.


What is Logistic Regression?¶

Logistic Regression is a classification model, even though its name contains the word “regression”.

Instead of predicting a continuous value, Logistic Regression predicts the probability that an input belongs to a given class.


Why we start with intuition¶

Logistic Regression looks similar to Linear Regression, but it behaves very differently.

Understanding this difference early helps avoid confusion when reading the code and interpreting the results.


What you should expect from the results¶

With Logistic Regression, you should expect:

  • probabilistic outputs
  • linear decision boundaries
  • strong performance on linearly separable data

It is often used as:

  • a baseline classification model
  • a simple and interpretable solution
  • a reference point for more complex classifiers

1. Project setup and common pipeline¶

In this section we set up the common pipeline used across classification models in this project.

Although the pipeline is similar to regression, the evaluation and interpretation steps will differ.

In [6]:
# Common imports used across all classification models

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import ConfusionMatrixDisplay
from pathlib import Path


from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    classification_report
)

import joblib

____________________________________¶

2. Dataset loading¶

In this section we load the dataset used for the Logistic Regression classification task.

We use a binary classification dataset to clearly illustrate how the model works.

In [2]:
# Load the dataset

data = load_breast_cancer(as_frame=True)

X = data.data
y = data.target

Inputs and target¶

  • X contains the input features
  • y contains the target labels

In this dataset:

  • the task is binary classification
  • the target takes two possible values (0 or 1)
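As an optional sanity check (not part of the original pipeline), we can inspect the class balance of the target. This matters later, because accuracy is only trustworthy when the classes are not heavily imbalanced:

```python
# Quick check of the class distribution in the target variable
from sklearn.datasets import load_breast_cancer

data = load_breast_cancer(as_frame=True)
y = data.target

# The dataset is moderately imbalanced (more class 1 than class 0),
# which is worth keeping in mind when reading accuracy scores
print(y.value_counts())
```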

Why this dataset is suitable¶

This dataset is well suited for Logistic Regression because:

  • it is clean and well-structured
  • the classes are reasonably separable
  • it is commonly used as a reference for classification models

This makes it ideal for understanding both the strengths and limitations of Logistic Regression.

____________________________________¶

3. Train-test split¶

In this section we split the dataset into training and test sets.

This step is essential to evaluate how well the classification model generalizes to unseen data.

In [3]:
# Split data into training and test sets

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42
)

Why this step is important¶

A classification model should be evaluated on data it has never seen before.

By separating the data:

  • the training set is used to learn decision boundaries
  • the test set is used only for evaluation

This prevents overly optimistic results and reflects real-world performance.

Consistency across notebooks¶

We use the same split strategy as in other ML-Methods notebooks.

This keeps the pipeline consistent and makes comparisons across models easier.

Note on train-test split proportions¶

The choice of train-test split proportions depends on the specific problem and dataset.

Common splits include:

  • 80 / 20
  • 70 / 30
  • 90 / 10

In this notebook, we use an 80 / 20 split as a reasonable default.

In practice:

  • smaller datasets may benefit from more training data
  • larger datasets allow for larger test sets
  • cross-validation is often used for more robust evaluation
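As an illustrative sketch (not part of the main pipeline), cross-validation can replace the single split. Note that the scaler goes inside a `Pipeline`, so it is re-fitted on each training fold only, avoiding leakage:

```python
# 5-fold cross-validation as a more robust alternative to one split
from sklearn.datasets import load_breast_cancer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

X, y = load_breast_cancer(return_X_y=True)

# Scaling inside the pipeline is re-fitted per fold (no data leakage)
model = make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000))

scores = cross_val_score(model, X, y, cv=5)
print(f"mean accuracy across folds: {scores.mean():.3f}")
```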

____________________________________¶

4. Feature scaling (why we do it)¶

In this section we apply feature scaling to the input features.

Feature scaling is very important for Logistic Regression and directly affects how the model learns.

Why scaling matters for Logistic Regression¶

Logistic Regression is an optimization-based model.

This means:

  • the model learns parameters by minimizing a loss function
  • gradient-based optimization is used

If features have very different scales:

  • optimization becomes slower
  • some features may dominate others
  • convergence may be unstable

What scaling does¶

Feature scaling:

  • centers features around zero
  • puts them on a comparable scale
  • improves numerical stability

This helps the model:

  • learn faster
  • converge more reliably
  • produce more stable coefficients

Important rule: fit only on training data¶

As with all preprocessing steps:

  • the scaler is fitted on training data only
  • the same scaler is applied to test data

This prevents data leakage and ensures a fair evaluation.

In [4]:
# Feature scaling

scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

What we have after this step¶

  • scaled training data
  • scaled test data
  • a clean and stable input space for Logistic Regression

At this point, the data is ready to be used by the classification model.

____________________________________¶

5. What is this model? (Logistic Regression)¶

Before training the model, it is important to understand what Logistic Regression does conceptually.

Despite its name, Logistic Regression is not a regression model. It is a classification model.

The core idea¶

Logistic Regression predicts probabilities, not class labels.

Given an input sample, the model computes:

  • a score based on a linear combination of the features
  • a probability between 0 and 1, obtained by transforming that score

This probability represents:

  • how likely the sample belongs to a given class

From probability to class¶

To make a final classification decision:

  • a threshold is applied (usually 0.5)
  • probabilities above the threshold are assigned to class 1
  • probabilities below the threshold are assigned to class 0

This simple rule converts probabilities into discrete class predictions.
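The threshold rule above can be sketched in a few lines (the probability values are made up for illustration):

```python
import numpy as np

# Hypothetical predicted probabilities for class 1
proba = np.array([0.10, 0.45, 0.62, 0.80, 0.99])

# Default rule: probability of at least 0.5 -> class 1, otherwise class 0
labels = (proba >= 0.5).astype(int)
print(labels)  # [0 0 1 1 1]
```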

How this differs from Linear Regression¶

Linear Regression:

  • predicts continuous values
  • has no concept of probability
  • is not suitable for classification

Logistic Regression:

  • predicts probabilities
  • is designed for classification
  • produces a decision boundary between classes

Why the sigmoid function matters¶

Logistic Regression uses a sigmoid function to map any real-valued number into a value between 0 and 1.

This ensures that:

  • predictions can be interpreted as probabilities
  • extreme values are smoothly compressed
  • the model remains stable during training
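A minimal sketch of the sigmoid makes these properties concrete: a score of 0 maps to exactly 0.5, and extreme scores are compressed toward 0 or 1:

```python
import numpy as np

def sigmoid(z):
    """Map any real-valued score to a probability in (0, 1)."""
    return 1.0 / (1.0 + np.exp(-z))

# Extreme scores saturate smoothly; z = 0 gives exactly 0.5
print(sigmoid(np.array([-10.0, -1.0, 0.0, 1.0, 10.0])))
```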

Key takeaway¶

Logistic Regression answers the question: "What is the probability that this sample belongs to class 1?"

By combining a linear model with a probability function, it provides a simple, interpretable, and effective approach to binary classification.

____________________________________¶

6. Model training (Logistic Regression)¶

In this section we train the Logistic Regression model.

Training Logistic Regression means learning a linear decision boundary that separates the classes by optimizing a probabilistic loss function.

What does "training" mean for Logistic Regression?¶

During training, the model:

  • computes predicted probabilities
  • compares them with the true class labels
  • updates its parameters to reduce the prediction error

This process is performed using gradient-based optimization.

Important parameters (kept simple)¶

For this first example, we use a basic configuration.

The goal is to understand how the model works, not to optimize its performance.

In [ ]:
# Initialize the Logistic Regression model
log_reg_model = LogisticRegression(
    max_iter=1000,
    random_state=42
)

# Train the model
log_reg_model.fit(X_train_scaled, y_train)

What these parameters mean¶

  • max_iter=1000
    Maximum number of optimization iterations.
    Increased from the default (100) to give the solver enough iterations to converge.

  • random_state=42
    Ensures reproducibility of results.

Note: The max_iter parameter does not affect model complexity. It only ensures that the optimization process has enough iterations to converge.

What we have after training¶

After this step:

  • the model has learned a decision boundary
  • coefficients define how features influence the prediction
  • the model is ready to output probabilities and class labels

____________________________________¶

7. Model behavior and key parameters (Logistic Regression)¶

In this section we analyze how Logistic Regression behaves and which parameters influence its predictions.

Unlike regression models, Logistic Regression produces probabilities and class decisions.

Coefficients and feature influence¶

Logistic Regression learns one coefficient per feature.

Each coefficient represents:

  • the strength of the feature
  • the direction of its influence on the predicted probability

Positive coefficients:

  • increase the probability of class 1

Negative coefficients:

  • decrease the probability of class 1
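This can be checked directly on a trained model. The following sketch re-trains on the full (scaled) dataset for self-containment and lists the most negative and most positive coefficients:

```python
# Inspect learned coefficients: positive values push the predicted
# probability toward class 1, negative values toward class 0
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

data = load_breast_cancer(as_frame=True)
X, y = data.data, data.target

X_scaled = StandardScaler().fit_transform(X)
model = LogisticRegression(max_iter=1000, random_state=42).fit(X_scaled, y)

coefs = pd.Series(model.coef_[0], index=X.columns).sort_values()
print(coefs.head(3))   # strongest push toward class 0
print(coefs.tail(3))   # strongest push toward class 1
```

Because the features were standardized, coefficient magnitudes are roughly comparable across features.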

Decision boundary¶

Logistic Regression creates a linear decision boundary.

This boundary separates:

  • samples predicted as class 0
  • samples predicted as class 1

Even though the output is probabilistic, the boundary itself is linear in the feature space.

Probability vs class prediction¶

The model internally works with probabilities.

To produce class labels:

  • a threshold is applied (usually 0.5)
  • probabilities above the threshold → class 1
  • probabilities below the threshold → class 0

Changing this threshold changes:

  • the balance between false positives and false negatives in the confusion matrix (we will see this shortly)
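A small sketch with made-up probabilities and labels shows the trade-off: lowering the threshold reduces false negatives, raising it reduces false positives:

```python
import numpy as np

# Hypothetical class-1 probabilities and true labels
proba = np.array([0.2, 0.4, 0.6, 0.7, 0.9])
y_true = np.array([0, 1, 1, 0, 1])

for threshold in (0.3, 0.5, 0.8):
    y_pred = (proba >= threshold).astype(int)
    fp = int(np.sum((y_pred == 1) & (y_true == 0)))  # false positives
    fn = int(np.sum((y_pred == 0) & (y_true == 1)))  # false negatives
    print(f"threshold={threshold}: FP={fp}, FN={fn}")
```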

Key takeaway¶

Logistic Regression:

  • models probabilities, not just labels
  • uses a linear decision boundary
  • is highly interpretable

To properly evaluate its performance, we need more than a single metric.

This is why, in the evaluation step, we will use a confusion matrix to understand different types of errors.

____________________________________¶

8. Predictions (Logistic Regression)¶

In this section we use the trained Logistic Regression model to generate predictions on unseen data.

For classification models, it is important to distinguish between:

  • predicted probabilities
  • predicted class labels

Predicting class labels¶

The predict method returns the final class prediction.

These predictions are obtained by:

  • computing probabilities internally
  • applying a decision threshold (default = 0.5)

In [7]:
# Predict class labels on the test set

y_pred = log_reg_model.predict(X_test_scaled)
print(y_pred)
[1 0 0 1 1 0 0 0 1 1 1 0 1 0 1 0 1 1 1 0 1 1 0 1 1 1 1 1 1 0 1 1 1 1 1 1 0
 1 0 1 1 0 1 1 1 1 1 1 1 1 0 0 1 1 1 1 1 0 0 1 1 0 0 1 1 1 0 0 1 1 0 0 1 0
 1 1 1 1 1 1 0 1 0 0 0 0 0 0 1 1 1 1 1 1 1 1 0 0 1 0 0 1 0 0 1 1 1 0 1 1 0
 1 0 0]

Predicting probabilities¶

The predict_proba method returns class probabilities.

For each sample, it outputs:

  • the probability of class 0
  • the probability of class 1

These probabilities provide more information than class labels alone.

In [14]:
# Predict class probabilities on the test set

y_pred_proba = log_reg_model.predict_proba(X_test_scaled)

pd.DataFrame(
    y_pred_proba[:10],
    columns=["P(class 0)", "P(class 1)"]
)
Out[14]:
P(class 0) P(class 1)
0 0.113590 8.864098e-01
1 0.999991 9.009362e-06
2 0.996921 3.079095e-03
3 0.000510 9.994899e-01
4 0.000061 9.999394e-01
5 1.000000 9.480878e-11
6 1.000000 1.536726e-09
7 0.965097 3.490306e-02
8 0.379342 6.206579e-01
9 0.000759 9.992406e-01

How probabilities are used¶

Probabilities allow us to:

  • understand model confidence
  • adjust the classification threshold
  • analyze different types of errors

In this notebook:

  • we use the default threshold (0.5)
  • probabilities are used for interpretation, not for threshold tuning

What you should expect to see¶

With Logistic Regression, you should expect:

  • confident predictions for well-separated samples
  • probabilities closer to 0.5 near the decision boundary
  • stable and interpretable behavior

These predictions will be evaluated in the next section using appropriate classification metrics.

____________________________________¶

9. Model evaluation (Logistic Regression)¶

In this section we evaluate the performance of the Logistic Regression model on unseen test data.

For classification problems, evaluation is not limited to a single metric. We need to understand what type of errors the model makes.

Accuracy¶

Accuracy measures the proportion of correct predictions.

It answers the question: "How often is the model correct?"

While useful, accuracy alone can be misleading, especially when classes are imbalanced.

In [16]:
# Compute accuracy

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")
Accuracy: 0.9737

Confusion matrix¶

The confusion matrix provides a detailed breakdown of prediction results.

It shows:

  • correct predictions
  • false positives
  • false negatives

This allows us to understand how the model is making mistakes, not just how many.

In [17]:
# Compute confusion matrix

cm = confusion_matrix(y_test, y_pred)
cm
Out[17]:
array([[41,  2],
       [ 1, 70]])
In [ ]:
# Visualize confusion matrix
disp = ConfusionMatrixDisplay(
    confusion_matrix=cm,
    display_labels=log_reg_model.classes_
)

disp.plot(cmap="Blues")
plt.title("Confusion Matrix – Logistic Regression")
plt.show()

About the confusion matrix¶

The confusion matrix provides a detailed view of how a classification model makes predictions.

In this notebook, we focus on how to read the results and what they tell us about model behavior.

For a deeper explanation of:

  • each cell of the matrix
  • common pitfalls
  • real-world examples

please refer to the dedicated Confusion Matrix page in the general concepts section of this project.

How to read the confusion matrix¶

For a binary classification problem:

  • True Negatives (top-left): correctly predicted class 0

  • False Positives (top-right): predicted class 1, but true class is 0

  • False Negatives (bottom-left): predicted class 0, but true class is 1

  • True Positives (bottom-right): correctly predicted class 1

Each cell tells us a different story about model behavior.
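The four cells can be unpacked directly from the matrix obtained above, and precision and recall (introduced next) derived by hand:

```python
import numpy as np

# Confusion matrix from the evaluation above
cm = np.array([[41,  2],
               [ 1, 70]])

# sklearn's layout: rows = true class, columns = predicted class
tn, fp, fn, tp = cm.ravel()

precision = tp / (tp + fp)  # of the predicted positives, how many are right
recall = tp / (tp + fn)     # of the actual positives, how many are found

print(f"TN={tn}, FP={fp}, FN={fn}, TP={tp}")
print(f"precision={precision:.2f}, recall={recall:.2f}")
```

These values match the class-1 row of the classification report below.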

Classification report¶

The classification report summarizes key metrics for each class:

  • Precision
  • Recall
  • F1-score

These metrics are especially important when the cost of different errors is not the same.

In [18]:
# Classification report

print(classification_report(y_test, y_pred))
              precision    recall  f1-score   support

           0       0.98      0.95      0.96        43
           1       0.97      0.99      0.98        71

    accuracy                           0.97       114
   macro avg       0.97      0.97      0.97       114
weighted avg       0.97      0.97      0.97       114

What you should focus on¶

When evaluating Logistic Regression:

  • Accuracy gives a high-level overview
  • Confusion matrix shows error types
  • Precision and recall explain trade-offs

There is no single “best” metric. The right focus depends on the problem context.

Key takeaway¶

Logistic Regression evaluation is about understanding errors, not just counting correct predictions.

The confusion matrix is the most informative tool to analyze classification performance and should always be examined.

____________________________________¶

10. When to use it and when not to (Logistic Regression)¶

Logistic Regression is a simple and effective classification model, but it is not suitable for every problem.

Understanding when to use it helps avoid incorrect assumptions and poor performance.

When Logistic Regression is a good choice¶

Logistic Regression works well when:

  • The task is binary classification
  • Classes are approximately linearly separable
  • Interpretability is important
  • You need a fast and reliable baseline model
  • Probabilistic outputs are useful

It is often used as:

  • a first classification model
  • a benchmark for more complex classifiers
  • a transparent solution in regulated environments

When Logistic Regression is NOT a good choice¶

Logistic Regression may struggle when:

  • The relationship between features and classes is highly non-linear
  • Complex interactions between features are important
  • The number of features is very large compared to samples
  • Maximum predictive performance is required regardless of interpretability

Typical warning signs¶

You should be cautious if:

  • Accuracy is high but the confusion matrix shows poor recall for one class
  • Many samples lie close to the decision boundary
  • Performance is significantly worse than non-linear models

These signs indicate that a linear decision boundary may be too restrictive.

Key takeaway¶

Logistic Regression is valued for:

  • simplicity
  • interpretability
  • probabilistic outputs

It is rarely the final model in complex problems, but it is almost always a useful starting point.

____________________________________¶

11. Model persistence (Logistic Regression)¶

In this section we save the trained Logistic Regression model and the preprocessing steps used during training.

Saving the model allows us to reuse it later without retraining and ensures reproducibility.

Why saving the model is important¶

Once a classification model has been trained and evaluated, it is common practice to save it.

This allows the model to be:

  • reused in another notebook
  • integrated into an application
  • deployed in a production environment

Model persistence separates the training phase from the usage phase.

Important rule: save the scaler together with the model¶

Logistic Regression is sensitive to feature scaling.

For this reason, the same scaler that was fitted on the training data must be applied to any new input data.

Saving the scaler together with the model ensures consistent predictions.

In [ ]:
# Define model directory
model_dir = Path("models/supervised_learning/classification/logistic_regression")

# Create directory if it does not exist
model_dir.mkdir(parents=True, exist_ok=True)

# Save model and scaler
joblib.dump(log_reg_model, model_dir / "logistic_regression_model.joblib")
joblib.dump(scaler, model_dir / "scaler.joblib")
Out[ ]:
['models\\supervised_learning\\classification\\logistic_regression\\scaler.joblib']

What we have now¶

  • A trained Logistic Regression classification model
  • A fitted feature scaler
  • Both saved and ready to be reused

At this point, the model can be:

  • loaded without retraining
  • applied to new data
  • compared with other saved classifiers

Loading the model later (conceptual example)¶

To reuse the model:

  • load the scaler
  • scale new input data
  • load the Logistic Regression model
  • generate predictions

This guarantees consistency with the original training pipeline.
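The steps above can be sketched as follows. To keep the sketch self-contained and runnable, it first recreates and saves the artifacts; in practice you would only run the loading half, in a separate script:

```python
from pathlib import Path
import joblib
from sklearn.datasets import load_breast_cancer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

# --- Setup: recreate and save the artifacts (as done earlier) ---
X, y = load_breast_cancer(return_X_y=True)
scaler = StandardScaler().fit(X)
model = LogisticRegression(max_iter=1000, random_state=42).fit(
    scaler.transform(X), y
)

model_dir = Path("models/supervised_learning/classification/logistic_regression")
model_dir.mkdir(parents=True, exist_ok=True)
joblib.dump(model, model_dir / "logistic_regression_model.joblib")
joblib.dump(scaler, model_dir / "scaler.joblib")

# --- Later, elsewhere: load both artifacts and predict ---
loaded_scaler = joblib.load(model_dir / "scaler.joblib")
loaded_model = joblib.load(model_dir / "logistic_regression_model.joblib")

# New data must have the same 30 features, in the same order,
# and must be scaled with the *loaded* scaler
predictions = loaded_model.predict(loaded_scaler.transform(X[:5]))
print(predictions)
```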

____________________________________¶

12. Mathematical formulation (deep dive)¶

This section provides a deeper look at Logistic Regression from a mathematical perspective.

The goal is to understand the mechanism behind the model, not to derive formulas in full detail.

From linear model to classification¶

Logistic Regression starts from a linear model.

Given an input vector x, the model computes:

  • a weighted sum of the features
  • plus an intercept term

This produces a real-valued score, which by itself is not suitable for classification.

The role of the sigmoid function¶

To convert the linear score into a probability, Logistic Regression applies the sigmoid function.

The sigmoid:

  • maps any real number to a value between 0 and 1
  • ensures outputs can be interpreted as probabilities
  • smoothly compresses extreme values
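In symbols, with weight vector $\mathbf{w}$, intercept $b$, and input $\mathbf{x}$, the linear score, the sigmoid, and the resulting probability are:

```latex
z = \mathbf{w}^{\top}\mathbf{x} + b
\qquad
\sigma(z) = \frac{1}{1 + e^{-z}}
\qquad
P(y = 1 \mid \mathbf{x}) = \sigma\!\left(\mathbf{w}^{\top}\mathbf{x} + b\right)
```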

Probabilistic interpretation¶

The output of Logistic Regression represents:

  • the probability that a sample belongs to class 1

This probabilistic formulation allows:

  • uncertainty estimation
  • threshold-based decision making
  • flexible interpretation of predictions

Loss function and optimization¶

Logistic Regression is trained by minimizing the log-loss (also called cross-entropy loss).

This loss:

  • penalizes confident wrong predictions heavily
  • rewards confident correct predictions
  • encourages well-calibrated probabilities

The optimization is performed iteratively using gradient-based methods.
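The asymmetry of the log-loss is easy to verify numerically. A minimal sketch of the per-sample cross-entropy (using made-up probabilities):

```python
import numpy as np

def log_loss_single(y_true, p):
    """Cross-entropy for one sample, where p is the predicted P(class 1)."""
    return -(y_true * np.log(p) + (1 - y_true) * np.log(1 - p))

print(log_loss_single(1, 0.9))  # confident and correct: small loss
print(log_loss_single(1, 0.6))  # unsure but correct: moderate loss
print(log_loss_single(1, 0.1))  # confident and wrong: large loss
```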

Decision boundary¶

The decision boundary of Logistic Regression is linear.

It corresponds to:

  • the set of points where the predicted probability is 0.5

Even though probabilities are non-linear, the boundary itself remains linear in the feature space.

Regularization (conceptual view)¶

Regularization adds a penalty to the loss function to control model complexity.

Its effects include:

  • preventing overly large coefficients
  • improving generalization
  • reducing overfitting

In this notebook, default regularization is used to keep the focus on model understanding.
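In scikit-learn, regularization strength is controlled by the `C` parameter (the inverse of the penalty weight; the default is `C=1.0` with an L2 penalty). The following sketch shows the effect on coefficient magnitudes:

```python
# Smaller C = stronger penalty = smaller coefficients
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

X, y = load_breast_cancer(return_X_y=True)
X_scaled = StandardScaler().fit_transform(X)

mean_abs = {}
for C in (0.01, 1.0, 100.0):
    model = LogisticRegression(C=C, max_iter=10000, random_state=42)
    model.fit(X_scaled, y)
    mean_abs[C] = np.abs(model.coef_).mean()
    print(f"C={C}: mean |coef| = {mean_abs[C]:.3f}")
```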

Final takeaway¶

Logistic Regression combines:

  • a linear model
  • a probabilistic interpretation
  • an optimization-based training process

This makes it:

  • simple yet powerful
  • interpretable
  • a strong baseline for classification problems

____________________________________¶

Final summary – Code only¶

The following cell contains the complete classification pipeline from data loading to model persistence.

No explanations are provided here on purpose.

This section is intended for:

  • quick execution
  • reference
  • reuse in scripts or applications

If you want to understand what each step does and why, read the notebook from top to bottom.

In [ ]:
# ====================================
# Imports
# ====================================

import numpy as np
import pandas as pd

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    classification_report,
    ConfusionMatrixDisplay
)

from pathlib import Path
import joblib
import matplotlib.pyplot as plt


# ====================================
# Dataset loading
# ====================================

data = load_breast_cancer(as_frame=True)

X = data.data
y = data.target


# ====================================
# Train-test split
# ====================================

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42
)


# ====================================
# Feature scaling
# ====================================

scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


# ====================================
# Model initialization
# ====================================

log_reg_model = LogisticRegression(
    max_iter=1000,
    random_state=42
)


# ====================================
# Model training
# ====================================

log_reg_model.fit(X_train_scaled, y_train)


# ====================================
# Predictions
# ====================================

y_pred = log_reg_model.predict(X_test_scaled)
y_pred_proba = log_reg_model.predict_proba(X_test_scaled)


# ====================================
# Model evaluation
# ====================================

accuracy = accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)

print(f"Accuracy: {accuracy:.4f}")
print(cm)

print(classification_report(y_test, y_pred))


# ====================================
# Confusion matrix visualization
# ====================================

disp = ConfusionMatrixDisplay(
    confusion_matrix=cm,
    display_labels=log_reg_model.classes_
)

disp.plot(cmap="Blues")
plt.title("Confusion Matrix – Logistic Regression")
plt.show()


# ====================================
# Model persistence
# ====================================

model_dir = Path("models/supervised_learning/classification/logistic_regression")
model_dir.mkdir(parents=True, exist_ok=True)

joblib.dump(log_reg_model, model_dir / "logistic_regression_model.joblib")
joblib.dump(scaler, model_dir / "scaler.joblib")