In [4]:
# 上載手寫辨識數據
# sklearn.datasets.load_digits
from sklearn import datasets
mnist = datasets.load_digits()
In [5]:
mnist.keys()
Out[5]:
dict_keys(['data', 'target', 'target_names', 'images', 'DESCR'])
In [8]:
data = mnist.images
mnist.target
mnist.target.shape
Out[8]:
(1797,)
In [3]:
data = mnist.images
target = mnist.target.reshape(-1,1)
data.shape , target.shape
Out[3]:
((1797, 8, 8), (1797, 1))
In [9]:
%matplotlib inline
index = 3
import matplotlib.pyplot as plt
plt.figure(figsize=(1,1))
plt.imshow(data[index], cmap='gray_r')
plt.axis('off')
plt.show()
In [10]:
# 檢查 X 與 y
index = 0
print(data[index])
print('-'*35)
print(mnist.target[index])
print('-reshape-')
print(data[index].reshape(-1)) # .ravel() , .flatten()
[[ 0.  0.  5. 13.  9.  1.  0.  0.]
 [ 0.  0. 13. 15. 10. 15.  5.  0.]
 [ 0.  3. 15.  2.  0. 11.  8.  0.]
 [ 0.  4. 12.  0.  0.  8.  8.  0.]
 [ 0.  5.  8.  0.  0.  9.  8.  0.]
 [ 0.  4. 11.  0.  1. 12.  7.  0.]
 [ 0.  2. 14.  5. 10. 12.  0.  0.]
 [ 0.  0.  6. 13. 10.  0.  0.  0.]]
-----------------------------------
0
-reshape-
[ 0.  0.  5. 13.  9.  1.  0.  0.  0.  0. 13. 15. 10. 15.  5.  0.  0.  3.
 15.  2.  0. 11.  8.  0.  0.  4. 12.  0.  0.  8.  8.  0.  0.  5.  8.  0.
  0.  9.  8.  0.  0.  4. 11.  0.  1. 12.  7.  0.  0.  2. 14.  5. 10. 12.
  0.  0.  0.  0.  6. 13. 10.  0.  0.  0.]
In [11]:
# 分類 mnist 分十類別  [0....9]
from sklearn.model_selection import train_test_split
X = data.reshape(1797,64)
y = target.flatten()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
In [12]:
# hyper-parameter <-- GridSearchCV 

from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import ExtraTreesClassifier

# clf = LogisticRegression(random_state=0, solver='newton-cg', multi_class='multinomial') # 0.972
# clf = MLPClassifier() # 0.975
clf = ExtraTreesClassifier(n_estimators=2000) # 0.9805
clf.fit(X_train, y_train)
Out[12]:
ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=None, max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=2000, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)
In [13]:
# 評分
print('X_train ',  clf.score(X_train , y_train) )
print('X_test ',   clf.score(X_test , y_test)   )
X_train  1.0
X_test  0.9777777777777777
In [14]:
# 找出那些沒有預測正確
import numpy as np
import matplotlib.pyplot as plt

y_pred = clf.predict(X_test) 
diff = (y_pred != y_test)
print('Ground Truth', y_test[diff])
print('Predictive  ', y_pred[diff])

for i in np.arange(X_test.shape[0])[diff]:
    plt.figure(figsize=(1,1))
    plt.axis('off')    
    plt.imshow(X_test[i].reshape(8,8) ,  cmap='gray')    
    plt.title( str(y_test[i])+ '-->' + str(y_pred[i]) )
    plt.show()   
Ground Truth [7 3 9 8 8 6 9 5]
Predictive   [9 5 7 1 1 5 5 6]
In [15]:
X_test.shape
Out[15]:
(360, 64)