- since no single classifier will work best for all the problems, we need to experiment with a handful
- need to effectively compare the models and select the best one for the problem
- over or under fitting can occur if training data is not properly sampled or features are not properly selected
- models can suffer from underfitting (high bias) if the model is too simple
- bias measures how far off the predictions are from the correct values in general if we rebuild the model multiple times on different datasets
- models can suffer from overfitting the training data (high variance) if the model is too complex for the underlying training data
- variance measures the consistency (or variability) of the model prediction for classifying a particular example if we retrain the model multiple times, e.g., on different subsets of the training dataset
- the following figure demonstrates underfitting and overfitting of models
- k-fold cross-validation can help us obtain reliable estimates of the model's performance on unseen data
- stratified k-fold cross-validation can yield better bias and variance estimates, especially in cases of unequal class proportions
- https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.StratifiedKFold.html
- details: https://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+(Diagnostic)
- let's use the binary classification dataset for detecting breast cancer
import pandas as pd
import numpy as np

# UCI Breast Cancer Wisconsin (Diagnostic) data; the file has no header row,
# so columns are addressed by integer position
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data'
df = pd.read_csv(url, header=None)

df
# Note col 0 is ID of the sample and col 1 is the corresponding diagnoses (M = malignant, B = benign)
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
</style>
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
569 rows × 32 columns
# summary statistics of the numeric columns
df.describe()
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
</style>
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
8 rows × 31 columns
# class distribution (column 1 holds the diagnosis label)
df.groupby(1).size()
1
B 357
M 212
dtype: int64
# Let's create X and y numpy ndarrays
X = df.loc[:, 2:].values
y = df.loc[:, 1].valuesX[:5]array([[1.799e+01, 1.038e+01, 1.228e+02, 1.001e+03, 1.184e-01, 2.776e-01,
3.001e-01, 1.471e-01, 2.419e-01, 7.871e-02, 1.095e+00, 9.053e-01,
8.589e+00, 1.534e+02, 6.399e-03, 4.904e-02, 5.373e-02, 1.587e-02,
3.003e-02, 6.193e-03, 2.538e+01, 1.733e+01, 1.846e+02, 2.019e+03,
1.622e-01, 6.656e-01, 7.119e-01, 2.654e-01, 4.601e-01, 1.189e-01],
[2.057e+01, 1.777e+01, 1.329e+02, 1.326e+03, 8.474e-02, 7.864e-02,
8.690e-02, 7.017e-02, 1.812e-01, 5.667e-02, 5.435e-01, 7.339e-01,
3.398e+00, 7.408e+01, 5.225e-03, 1.308e-02, 1.860e-02, 1.340e-02,
1.389e-02, 3.532e-03, 2.499e+01, 2.341e+01, 1.588e+02, 1.956e+03,
1.238e-01, 1.866e-01, 2.416e-01, 1.860e-01, 2.750e-01, 8.902e-02],
[1.969e+01, 2.125e+01, 1.300e+02, 1.203e+03, 1.096e-01, 1.599e-01,
1.974e-01, 1.279e-01, 2.069e-01, 5.999e-02, 7.456e-01, 7.869e-01,
4.585e+00, 9.403e+01, 6.150e-03, 4.006e-02, 3.832e-02, 2.058e-02,
2.250e-02, 4.571e-03, 2.357e+01, 2.553e+01, 1.525e+02, 1.709e+03,
1.444e-01, 4.245e-01, 4.504e-01, 2.430e-01, 3.613e-01, 8.758e-02],
[1.142e+01, 2.038e+01, 7.758e+01, 3.861e+02, 1.425e-01, 2.839e-01,
2.414e-01, 1.052e-01, 2.597e-01, 9.744e-02, 4.956e-01, 1.156e+00,
3.445e+00, 2.723e+01, 9.110e-03, 7.458e-02, 5.661e-02, 1.867e-02,
5.963e-02, 9.208e-03, 1.491e+01, 2.650e+01, 9.887e+01, 5.677e+02,
2.098e-01, 8.663e-01, 6.869e-01, 2.575e-01, 6.638e-01, 1.730e-01],
[2.029e+01, 1.434e+01, 1.351e+02, 1.297e+03, 1.003e-01, 1.328e-01,
1.980e-01, 1.043e-01, 1.809e-01, 5.883e-02, 7.572e-01, 7.813e-01,
5.438e+00, 9.444e+01, 1.149e-02, 2.461e-02, 5.688e-02, 1.885e-02,
1.756e-02, 5.115e-03, 2.254e+01, 1.667e+01, 1.522e+02, 1.575e+03,
1.374e-01, 2.050e-01, 4.000e-01, 1.625e-01, 2.364e-01, 7.678e-02]])
y[:5]
array([1, 1, 1, 1, 1])
# let's encode the labels with LabelEncoder
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
y = le.fit_transform(y)

y[-10:]
array([0, 0, 0, 1, 1, 1, 1, 1, 1, 0])
y.shape
(569,)
X.shape
(569, 30)
le.classes_
# 0 is Benign (Not-Cancer) and 1 is Malignant (Cancer)
array(['B', 'M'], dtype=object)
# let's Scale the data using StandardScaler
from sklearn.preprocessing import StandardScalersc = StandardScaler()
sc.fit(X) # fit the whole data to calculate mean and standard deviation
X_sc = sc.transform(X) # transform training set# let's do the StratifiedKFold cross validation
from sklearn.model_selection import StratifiedKFold
# use logistic regression classifier
from sklearn.linear_model import LogisticRegression
#https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.htmlkfold = StratifiedKFold(n_splits=10)scores = []
for k, (train, test) in enumerate(kfold.split(X_sc, y)): # iterator
lr_model = LogisticRegression(random_state=1, solver='lbfgs')
#print(train.shape, test.shape)
lr_model.fit(X_sc[train], y[train])
score = lr_model.score(X_sc[test], y[test])
scores.append(score)
print(f'Fold:{k+1:2d}, Class dist.:{np.bincount(y[train])}, Acc: {score:.3f}')Fold: 1, Class dist.:[322 190], Acc: 0.982
Fold: 2, Class dist.:[322 190], Acc: 0.982
Fold: 3, Class dist.:[321 191], Acc: 0.982
Fold: 4, Class dist.:[321 191], Acc: 0.965
Fold: 5, Class dist.:[321 191], Acc: 0.982
Fold: 6, Class dist.:[321 191], Acc: 0.982
Fold: 7, Class dist.:[321 191], Acc: 0.947
Fold: 8, Class dist.:[321 191], Acc: 1.000
Fold: 9, Class dist.:[321 191], Acc: 1.000
Fold:10, Class dist.:[322 191], Acc: 0.982
print(f'CV accuracy : {np.mean(scores):.3f}, +/- {np.std(scores):.3f}')CV accuracy : 0.981, +/- 0.015
# better: use scikit learn's cross_val_score
from sklearn.model_selection import cross_val_scorelr_model = LogisticRegression(random_state=1, solver='lbfgs')
scores = cross_val_score(estimator=lr_model, X=X_sc, y=y, cv=10, n_jobs=-1)
# n_jobs = -1 means use all available processors to do computation in parallelprint(f'CV accuracy scores: {scores}')CV accuracy scores: [0.98245614 0.98245614 0.98245614 0.96491228 0.98245614 0.98245614
0.94736842 1. 1. 0.98214286]
print(f'CV accuracy: {np.mean(scores):.3f}, +/- {np.std(scores):.3f}')CV accuracy: 0.981, +/- 0.015
# let's compare a handful of Classifiers
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

# display names, in the same order as the `classifiers` list below
names = ["KNN", "Linear SVM", "RBF SVM", "Gaussian Process",
         "Decision Tree", "Random Forest", "Neural Net", "AdaBoost",
         "Naive Bayes", "QDA", 'Logistic Reg']
scores = []  # store (name, mean, std_dev) for each classifier
classifiers = [
    KNeighborsClassifier(3),
    SVC(kernel="linear", C=0.025),
    SVC(gamma=2, C=1),
    GaussianProcessClassifier(1.0 * RBF(1.0)),
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    MLPClassifier(alpha=1, max_iter=1000),
    AdaBoostClassifier(),
    GaussianNB(),
    QuadraticDiscriminantAnalysis(),
    LogisticRegression(random_state=1, solver='lbfgs')
]
# iterate over classifiers: 10-fold CV accuracy on the standardized features
for name, clf in zip(names, classifiers):
    cvs = cross_val_score(estimator=clf, X=X_sc, y=y, cv=10, n_jobs=-1)
    scores.append((name, np.mean(cvs), np.std(cvs)))
/home/codespace/.local/lib/python3.12/site-packages/sklearn/discriminant_analysis.py:1024: LinAlgWarning: The covariance matrix of class 0 is not full rank. Increasing the value of parameter `reg_param` might help reducing the collinearity.
warnings.warn(
/home/codespace/.local/lib/python3.12/site-packages/sklearn/discriminant_analysis.py:1024: LinAlgWarning: The covariance matrix of class 0 is not full rank. Increasing the value of parameter `reg_param` might help reducing the collinearity.
warnings.warn(
/home/codespace/.local/lib/python3.12/site-packages/sklearn/discriminant_analysis.py:1024: LinAlgWarning: The covariance matrix of class 0 is not full rank. Increasing the value of parameter `reg_param` might help reducing the collinearity.
warnings.warn(
/home/codespace/.local/lib/python3.12/site-packages/sklearn/discriminant_analysis.py:1024: LinAlgWarning: The covariance matrix of class 0 is not full rank. Increasing the value of parameter `reg_param` might help reducing the collinearity.
warnings.warn(
/home/codespace/.local/lib/python3.12/site-packages/sklearn/discriminant_analysis.py:1024: LinAlgWarning: The covariance matrix of class 0 is not full rank. Increasing the value of parameter `reg_param` might help reducing the collinearity.
warnings.warn(
/home/codespace/.local/lib/python3.12/site-packages/sklearn/discriminant_analysis.py:1024: LinAlgWarning: The covariance matrix of class 0 is not full rank. Increasing the value of parameter `reg_param` might help reducing the collinearity.
warnings.warn(
/home/codespace/.local/lib/python3.12/site-packages/sklearn/discriminant_analysis.py:1024: LinAlgWarning: The covariance matrix of class 0 is not full rank. Increasing the value of parameter `reg_param` might help reducing the collinearity.
warnings.warn(
/home/codespace/.local/lib/python3.12/site-packages/sklearn/discriminant_analysis.py:1024: LinAlgWarning: The covariance matrix of class 0 is not full rank. Increasing the value of parameter `reg_param` might help reducing the collinearity.
warnings.warn(
/home/codespace/.local/lib/python3.12/site-packages/sklearn/discriminant_analysis.py:1024: LinAlgWarning: The covariance matrix of class 0 is not full rank. Increasing the value of parameter `reg_param` might help reducing the collinearity.
warnings.warn(
/home/codespace/.local/lib/python3.12/site-packages/sklearn/discriminant_analysis.py:1024: LinAlgWarning: The covariance matrix of class 0 is not full rank. Increasing the value of parameter `reg_param` might help reducing the collinearity.
warnings.warn(
scores[('KNN', np.float64(0.9647869674185465), np.float64(0.02239183921884522)),
('Linear SVM',
np.float64(0.9736215538847116),
np.float64(0.021152440486425998)),
('RBF SVM', np.float64(0.6274122807017544), np.float64(0.006965956216784447)),
('Gaussian Process',
np.float64(0.9789473684210526),
np.float64(0.017189401703741607)),
('Decision Tree',
np.float64(0.9157894736842105),
np.float64(0.04354271454733634)),
('Random Forest',
np.float64(0.938533834586466),
np.float64(0.02379377199566088)),
('Neural Net',
np.float64(0.9771616541353383),
np.float64(0.020824474721646426)),
('AdaBoost',
np.float64(0.9718984962406015),
np.float64(0.025044655973063212)),
('Naive Bayes',
np.float64(0.9315162907268169),
np.float64(0.0327113878182645)),
('QDA', np.float64(0.9560776942355889), np.float64(0.02110040899790857)),
('Logistic Reg',
np.float64(0.9806704260651629),
np.float64(0.01456955548732776))]
# let's sort the scores in descending order of accuracy
scores.sort(key=lambda t: t[1], reverse=True)scores[('Logistic Reg',
np.float64(0.9806704260651629),
np.float64(0.01456955548732776)),
('Gaussian Process',
np.float64(0.9789473684210526),
np.float64(0.017189401703741607)),
('Neural Net',
np.float64(0.9771616541353383),
np.float64(0.020824474721646426)),
('Linear SVM',
np.float64(0.9736215538847116),
np.float64(0.021152440486425998)),
('AdaBoost',
np.float64(0.9718984962406015),
np.float64(0.025044655973063212)),
('KNN', np.float64(0.9647869674185465), np.float64(0.02239183921884522)),
('QDA', np.float64(0.9560776942355889), np.float64(0.02110040899790857)),
('Random Forest',
np.float64(0.938533834586466),
np.float64(0.02379377199566088)),
('Naive Bayes',
np.float64(0.9315162907268169),
np.float64(0.0327113878182645)),
('Decision Tree',
np.float64(0.9157894736842105),
np.float64(0.04354271454733634)),
('RBF SVM', np.float64(0.6274122807017544), np.float64(0.006965956216784447))]
- Good resource - https://developers.google.com/machine-learning/crash-course/classification/roc-and-auc
- Receiver Operating Characteristic (ROC) graphs are used to select models for classification based on the performance with respect to the FPR and TPR
- useful for comparing performances of models as long as the dataset is roughly balanced
- use precision-recall curve for imbalanced datasets
- the diagonal of the ROC curve can be interpreted as random guessing
- classification models that fall below the diagonal are considered as worse than random guessing
- a perfect classifier would fall into the top-left corner of the graph with a TPR of 1 and an FPR of 0
- based on ROC curve, we can compute area under the curve (AUC) to characterize the performance of a classification model
- we can use ROC curve for tuning and choosing model and threshold
- threshold choice depends on which metric is most important to the specific use case
- e.g., if false positives (false alarms) are more costly, it may make sense to choose a threshold that gives a lower FPR even if TPR is reduced (point A in the figure below)
- conversely, if false positives are cheap but false negatives (missed true positives) are costly, the threshold for point C (which maximizes TPR at the cost of higher FPR) may be preferable
- point B offers best compromise between TPR and FPR
from sklearn.metrics import RocCurveDisplay, auc
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import StratifiedKFold

# To generate a more representative ROC graph,
# we'll use just 2 features 4 and 14 making it more challenging for the classifier
X_train = X_sc[:, [4, 14]]

cv = StratifiedKFold(n_splits=5)  # just do 5 folds
classifier = LogisticRegression(random_state=1, solver='lbfgs')
tprs = []
aucs = []
mean_fpr = np.linspace(0, 1, 100)  # shared FPR grid used to average the folds
fig, ax = plt.subplots()
# create and add ROC for each fold
for i, (train, test) in enumerate(cv.split(X_train, y)):  # iterator
    classifier.fit(X_train[train], y[train])
    viz = RocCurveDisplay.from_estimator(classifier, X_train[test], y[test],
                                         name=f'ROC fold {i}',
                                         alpha=0.3, lw=1, ax=ax)
    # resample this fold's curve onto the shared FPR grid so folds can be averaged
    interp_tpr = np.interp(mean_fpr, viz.fpr, viz.tpr)
    interp_tpr[0] = 0.0  # pin the curve's start to (0, 0)
    tprs.append(interp_tpr)
    aucs.append(viz.roc_auc)
# add curve for random guessing
ax.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r',
        label='Random guessing', alpha=.8)
mean_tpr = np.mean(tprs, axis=0)
mean_tpr[-1] = 1.0  # pin the curve's end to (1, 1)
mean_auc = auc(mean_fpr, mean_tpr)
std_auc = np.std(aucs)
# add curve for mean scores
ax.plot(mean_fpr, mean_tpr, color='b',
        label=r'Mean ROC (AUC = %0.2f $\pm$ %0.2f)' % (mean_auc, std_auc),
        lw=2, alpha=.8)
# add curve for a perfect score
ax.plot([0, 0, 1],
        [0, 1, 1], linestyle=':', color='black', label='Perfect performance')
# shade +/- one standard deviation around the mean curve, clipped to [0, 1]
std_tpr = np.std(tprs, axis=0)
tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
ax.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=.2,
                label=r'$\pm$ 1 std. dev.')
ax.set(xlim=[-0.05, 1.05], ylim=[-0.05, 1.05],
       title="ROC Curve Example")
ax.legend(loc="lower right")
plt.show()

import pandas as pd
import numpy as np
from sklearn.metrics import roc_curve, auc, roc_auc_score
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from itertools import cycle
from sklearn.model_selection import train_test_split# let's compare a handful of Classifiers
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysisurl = 'https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data'
df = pd.read_csv(url, header=None)X = df.loc[:, 2:].values
y = df.loc[:, 1].valuesle = LabelEncoder()
y = le.fit_transform(y)sc = StandardScaler()
sc.fit(X) # fit the whole data to calculate mean and standard deviation
X_sc = sc.transform(X) # transform training setnames = ["KNN", "Linear SVM", "RBF SVM", "Gaussian Process",
"Decision Tree", "Random Forest", "Neural Net", "AdaBoost",
"Naive Bayes", "QDA", 'Logistic Reg']
classifiers = [
KNeighborsClassifier(2),
SVC(kernel="linear", C=0.025),
SVC(gamma=2, C=1),
GaussianProcessClassifier(),
DecisionTreeClassifier(),
RandomForestClassifier(),
MLPClassifier(),
AdaBoostClassifier(),
GaussianNB(),
QuadraticDiscriminantAnalysis(),
LogisticRegression(random_state=1, solver='lbfgs')
]
mean_fpr = np.linspace(0, 1, 100)
#cv = StratifiedKFold(n_splits=5) # just to 5 fold# let's plot the ROC Curves for all the classifiers
fig, ax = plt.subplots(figsize=(10, 6))
lw=2
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2,
random_state=0)
for name, classifier in zip(names, classifiers):
classifier.fit(X_train, y_train)
RocCurveDisplay.from_estimator(classifier, X_test, y_test,
name=f'{name}',
alpha=0.3, lw=1, ax=ax)
ax.set(xlim=[-0.05, 1.05], ylim=[-0.05, 1.05], title="ROC Curve Example")
ax.legend(loc="lower right")
plt.title("ROC Curves of Classifiers")
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc="lower right")
plt.show()/home/codespace/.local/lib/python3.12/site-packages/sklearn/discriminant_analysis.py:1024: LinAlgWarning: The covariance matrix of class 0 is not full rank. Increasing the value of parameter `reg_param` might help reducing the collinearity.
warnings.warn(
/home/codespace/.local/lib/python3.12/site-packages/sklearn/discriminant_analysis.py:1024: LinAlgWarning: The covariance matrix of class 1 is not full rank. Increasing the value of parameter `reg_param` might help reducing the collinearity.
warnings.warn(
/home/codespace/.local/lib/python3.12/site-packages/sklearn/linear_model/_logistic.py:465: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result(




