Tuesday, April 5, 2022

Classifying Handwritten Digits via logitboost in Python

In this post, we apply the LogitBoost algorithm to a toy dataset to identify handwritten digits.
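
The code below relies on the third-party logitboost package in addition to scikit-learn, NumPy, matplotlib, and seaborn. Assuming it is the package published on PyPI under that same name, it can be installed into the virtual environment first:

(.env) [boris@fedora34server LOGITBOOST]$ pip install logitboost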

***************

Code 1

***************

(.env) [boris@fedora34server LOGITBOOST]$ cat logitBoost2.py
from itertools import product
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style='darkgrid', palette='colorblind', color_codes=True)

from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import (accuracy_score, classification_report,
                             confusion_matrix)
from logitboost import LogitBoost


# Load the 8x8 handwritten digit images and their labels
digits = load_digits()
X = digits.data
y = digits.target
images = digits.images.astype(np.int_)
n_classes = 10

# Scale the pixel values from [0, 16] down to [0, 1] for numerical stability
X /= 16

# Shuffle the data and split it into training and testing sets
test_size = 1 / 3
X_train, X_test, y_train, y_test, images_train, images_test \
    = train_test_split(X, y, images, test_size=test_size, shuffle=True,
                       stratify=y, random_state=0)

print('Training shape: ', X_train.shape)
print('Test shape:     ', X_test.shape)

# Show the first 64 training images on an 8x8 grid
n_rows = 8
n_cols = 8
fig, ax = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(10, 10))
for i, j in product(range(n_rows), range(n_cols)):
    image = images_train[n_cols * i + j]
    ax[i, j].imshow(image, cmap='binary', interpolation='none')
    ax[i, j].axis('off')

plt.show(block=False)
# plt.close()

# Fit a LogitBoost ensemble of 30 depth-3 regression trees; all other
# hyperparameters keep the package defaults (learning_rate=1.0,
# max_response=4.0, weight_trim_quantile=0.05, bootstrap=False)
lboost = LogitBoost(DecisionTreeRegressor(max_depth=3),
                    n_estimators=30, random_state=0)
lboost.fit(X_train, y_train)

y_pred_train = lboost.predict(X_train)
y_pred_test = lboost.predict(X_test)
accuracy_train = accuracy_score(y_train, y_pred_train)
accuracy_test = accuracy_score(y_test, y_pred_test)
print('Training accuracy: %.4f' % accuracy_train)
print('Test accuracy:     %.4f' % accuracy_test)

# Per-class precision, recall, and F1 reports
report_train = classification_report(y_train, y_pred_train)
report_test = classification_report(y_test, y_pred_test)
print('Training\n%s' % report_train)
print('Test\n%s' % report_test)

# Confusion matrices for the training and test sets
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(12, 6))

sns.heatmap(confusion_matrix(y_train, y_pred_train), ax=ax[0],
            robust=True, annot=True, fmt=',d', cmap=plt.get_cmap('Blues'),
            square=True, cbar=False)
ax[0].set_xlabel('Predicted Class')
ax[0].set_ylabel('Actual Class')
ax[0].set_title('Training', fontsize=14)

sns.heatmap(confusion_matrix(y_test, y_pred_test), ax=ax[1],
            robust=True, annot=True, fmt=',d', cmap=plt.get_cmap('Blues'),
            square=True, cbar=False)
ax[1].set_title('Testing', fontsize=14)
ax[1].set_xlabel('Predicted Class')
ax[1].set_ylabel('Actual Class')

plt.tight_layout()
plt.show(block=False)
# plt.close()

# Ensemble accuracy after each boosting iteration
iterations = np.arange(1, lboost.n_estimators + 1)
staged_accuracy_train = list(lboost.staged_score(X_train, y_train))
staged_accuracy_test = list(lboost.staged_score(X_test, y_test))

plt.figure(figsize=(10, 8))
plt.plot(iterations, staged_accuracy_train, label='Training', marker='.')
plt.plot(iterations, staged_accuracy_test, label='Test', marker='.')
plt.xlabel('Iteration')
plt.ylabel('Accuracy')
plt.title('Ensemble accuracy during each boosting iteration', fontsize=14)
plt.legend(loc='best', shadow=True, frameon=True)
plt.tight_layout()
plt.show()
plt.close()
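
For readers curious about what each boosting iteration above actually does: LogitBoost (Friedman, Hastie, and Tibshirani, 2000) fits every new regression tree to Newton-step "working responses" of the logistic loss. The following minimal sketch of the two-class version illustrates the idea. It is a simplification for exposition, and the helper names here are ours; it is not the logitboost package's actual implementation, which generalizes this scheme to one additive model per class.

import numpy as np
from sklearn.tree import DecisionTreeRegressor

def fit_logitboost_binary(X, y, n_estimators=30, max_depth=3,
                          max_response=4.0):
    """Illustrative two-class LogitBoost; y must hold labels in {0, 1}."""
    n_samples = X.shape[0]
    F = np.zeros(n_samples)      # additive model score F(x)
    p = np.full(n_samples, 0.5)  # current probability estimates
    trees = []
    for _ in range(n_estimators):
        # Newton-step working response and weights for the logistic loss;
        # responses are clipped for numerical stability (cf. max_response)
        w = np.clip(p * (1.0 - p), 1e-10, None)
        z = np.clip((y - p) / w, -max_response, max_response)
        # Weighted least-squares fit of a regression tree to the response
        tree = DecisionTreeRegressor(max_depth=max_depth, random_state=0)
        tree.fit(X, z, sample_weight=w)
        trees.append(tree)
        # Take a half Newton step, then refresh the probabilities
        F += 0.5 * tree.predict(X)
        p = 1.0 / (1.0 + np.exp(-2.0 * F))
    return trees

def predict_logitboost_binary(trees, X):
    F = 0.5 * sum(tree.predict(X) for tree in trees)
    return (F > 0).astype(int)

Run on a two-class subset of the digits data (say, only the 0s and 1s), this sketch follows the same fit-tree-to-working-response loop that produces the staged accuracies plotted above.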

*******************
Complete Code 2
*******************
Let's add the final fragment to the previous code.

(.env) [boris@fedora34server LOGITBOOST]$ cat logitBoost1.py
from itertools import product
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style='darkgrid', palette='colorblind', color_codes=True)

from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import (accuracy_score, classification_report,
                             confusion_matrix)
from logitboost import LogitBoost


digits = load_digits()
X = digits.data
y = digits.target
images = digits.images.astype(np.int_)
n_classes = 10

# Scale the digits for numerical stability
X /= 16

# Shuffle the data and split them into training and testing sets
test_size = 1 / 3
X_train, X_test, y_train, y_test, images_train, images_test \
    = train_test_split(X, y, images, test_size=test_size, shuffle=True,
                       stratify=y, random_state=0)

print('Training shape: ', X_train.shape)
print('Test shape:     ', X_test.shape)
n_rows = 8
n_cols = 8

fig, ax = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(10, 10))
for i, j in product(range(n_rows), range(n_cols)):
    image = images_train[n_cols * i + j]
    ax[i, j].imshow(image, cmap='binary', interpolation='none')
    ax[i, j].axis('off')

plt.show(block=False)
plt.close()

lboost = LogitBoost(DecisionTreeRegressor(max_depth=3),
                    n_estimators=30, random_state=0)
lboost.fit(X_train, y_train)
# (All other hyperparameters keep the package defaults: learning_rate=1.0,
# max_response=4.0, weight_trim_quantile=0.05, bootstrap=False.)

y_pred_train = lboost.predict(X_train)
y_pred_test = lboost.predict(X_test)

accuracy_train = accuracy_score(y_train, y_pred_train)
accuracy_test = accuracy_score(y_test, y_pred_test)

print('Training accuracy: %.4f' % accuracy_train)
print('Test accuracy:     %.4f' % accuracy_test)

report_train = classification_report(y_train, y_pred_train)
report_test = classification_report(y_test, y_pred_test)
print('Training\n%s' % report_train)
print('Test\n%s' % report_test)

fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(12, 6))

sns.heatmap(confusion_matrix(y_train, y_pred_train), ax=ax[0],
            robust=True, annot=True, fmt=',d', cmap=plt.get_cmap('Blues'),
            square=True, cbar=False)
ax[0].set_xlabel('Predicted Class')
ax[0].set_ylabel('Actual Class')
ax[0].set_title('Training', fontsize=14)

sns.heatmap(confusion_matrix(y_test, y_pred_test), ax=ax[1],
            robust=True, annot=True, fmt=',d', cmap=plt.get_cmap('Blues'),
            square=True, cbar=False)
ax[1].set_title('Testing', fontsize=14)
ax[1].set_xlabel('Predicted Class')
ax[1].set_ylabel('Actual Class')

plt.tight_layout()
plt.show(block=False)
plt.close()

iterations = np.arange(1, lboost.n_estimators + 1)
staged_accuracy_train = list(lboost.staged_score(X_train, y_train))
staged_accuracy_test = list(lboost.staged_score(X_test, y_test))

plt.figure(figsize=(10, 8))
plt.plot(iterations, staged_accuracy_train, label='Training', marker='.')
plt.plot(iterations, staged_accuracy_test, label='Test', marker='.')

plt.xlabel('Iteration')
plt.ylabel('Accuracy')
plt.title('Ensemble accuracy during each boosting iteration', fontsize=14)
plt.legend(loc='best', shadow=True, frameon=True)

plt.tight_layout()
plt.show(block=False)
plt.close()

# Maximum number of misclassifications to show
n_misclassifications = 50
# Estimated class probabilities for the test set
prob_test_pred = lboost.predict_proba(X_test)
# Indices of the misclassified test examples
incorrect = np.where(y_test != y_pred_test)[0]

n_incorrect = len(incorrect)
print(f'Misclassified test examples: {n_incorrect} out of {len(X_test)}')

for k in range(min(n_misclassifications, n_incorrect)):
    fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(10, 2),
                           gridspec_kw={'width_ratios': [1, 4]})
    ax = ax.ravel()

    # Index of the kth misclassified test example
    i = incorrect[k]

    # Show the wrongly classified image
    image = images_test[i]
    ax[0].imshow(image, cmap='binary')
    ax[0].axis('off')
    ax[0].set_aspect('equal', 'box')

    palette = ['r'] * n_classes
    palette[y_test[i]] = 'g'
    sns.barplot(x=np.arange(n_classes), y=prob_test_pred[i],
                ax=ax[1], palette=palette)
    ax[1].set(ylabel='Probability', xlabel='Digit')
    ax[1].set(yscale='log', ylim=(1e-3, 1))

    true_label = y_test[i]
    pred_label = y_pred_test[i]
    pred_label_prob = prob_test_pred[i, pred_label]
    color = 'r' if true_label != pred_label else 'g'
    ax[1].get_xticklabels()[true_label].set_color('g')
    ax[1].get_xticklabels()[pred_label].set_color(color)

    ax[0].set_title('True label: %d' % true_label, fontsize=14)
    ax[1].set_title(f'Prediction: {pred_label} (prob: {pred_label_prob:.2f})',
                    color=color, fontsize=14)

    plt.tight_layout()
    plt.show()
    plt.close()

Finally, we look at the digits in the test set that were misclassified, and at what the LogitBoost model thinks they actually represent.


