#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Tune the number of neighbors (k) for a KNN classifier on the wine dataset
using 5-fold cross-validation, then fit a final model on the training split
and plot its confusion matrices on the held-out test split."""

from numpy import mean
from scipy.stats import sem
from sklearn.datasets import load_wine
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.neighbors import KNeighborsClassifier
import matplotlib.pyplot as plt
import numpy as np


def evaluate_model(X, y, repeats):
    """Return 5-fold cross-validation accuracy scores for a KNN classifier.

    Parameters
    ----------
    X, y : array-like
        Feature matrix and target vector.
    repeats : int
        Number of neighbors for ``KNeighborsClassifier``.  NOTE: despite the
        historical name (kept for backward compatibility), this is *not* a
        repeat count — it is ``n_neighbors``.

    Returns
    -------
    numpy.ndarray
        Accuracy score for each of the 5 folds.
    """
    cv = KFold(n_splits=5)
    model = KNeighborsClassifier(n_neighbors=repeats)
    # n_jobs=-1 parallelizes fold evaluation across all available cores.
    return cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1)


def main():
    """Run the k-selection sweep, then evaluate the final model."""
    wine = load_wine()
    X = wine.data
    y = wine.target

    # Hold out 30% for testing; the sweep uses only the training split so the
    # test data stays untouched until the final confusion matrices.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=0, test_size=0.3)

    # Sweep k and record the mean cross-validated accuracy for each value.
    neighbors = range(3, 20)
    results = []
    for k in neighbors:
        scores = evaluate_model(X_train, y_train, k)
        print('>%d mean=%.4f se=%.3f' % (k, mean(scores), sem(scores)))
        results.append(mean(scores))

    plt.plot(neighbors, results)
    plt.show()

    # Final model with the chosen k, evaluated on the held-out test split.
    classifier = KNeighborsClassifier(n_neighbors=10).fit(X_train, y_train)
    np.set_printoptions(precision=2)

    titles_options = [("Confusion matrix, without normalization", None),
                      ("Normalized confusion matrix", 'true')]
    for title, normalize in titles_options:
        # plot_confusion_matrix was removed in scikit-learn 1.2;
        # ConfusionMatrixDisplay.from_estimator is its direct replacement.
        disp = ConfusionMatrixDisplay.from_estimator(
            classifier, X_test, y_test, cmap=plt.cm.Blues,
            normalize=normalize)
        disp.ax_.set_title(title)
        print(title)
        print(disp.confusion_matrix)
    plt.show()


if __name__ == "__main__":
    main()