#!/usr/bin/env python
#
# file: knn_demonstration.py
#
# description:
#  This script demonstrates the K-Nearest Neighbors (KNN) algorithm.
#  It splits a synthetic non-linear dataset into training and test sets,
#  trains models for different K values, prints evaluation metrics,
#  and visualizes the decision boundaries along with the train/test splits.
#
# revision history:
#  20260316 (AM): added train/test split, metrics printing, and test set plotting
#  20260316 (AM): initial version
#------------------------------------------------------------------------------

# import system modules
#
import os
import sys

import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, f1_score

#------------------------------------------------------------------------------
#
# global variables are listed here
#
#------------------------------------------------------------------------------

# set the filename using basename
#
__FILE__ = os.path.basename(__file__)

# define default values for the dataset, model, and plotting
#
DEF_N_SAMPLES = 300
DEF_NOISE = 0.30
DEF_TEST_SIZE = 0.30          # 30% of data used for testing
DEF_RANDOM_SEED = 42
DEF_K_VALUES = [1, 15]        # compare K=1 (overfit) to K=15 (smooth)
DEF_MESH_STEP = 0.02          # step size for the decision boundary grid
DEF_OUT_FILE_NAME = "knn_decision_boundaries_with_test.png"
DEF_PLOT_TITLE = "KNN Decision Boundaries (Train vs Test)"

#------------------------------------------------------------------------------
#
# functions are listed here
#
#------------------------------------------------------------------------------

# generate and split a synthetic 2D dataset
#
def generate_and_split_data(n_samples=DEF_N_SAMPLES, noise=DEF_NOISE,
                            test_size=DEF_TEST_SIZE, seed=DEF_RANDOM_SEED):
    """
    method: generate_and_split_data

    arguments:
     n_samples: total number of points to generate
     noise: standard deviation of Gaussian noise added to the data
     test_size: fraction of data to reserve for testing
     seed: random seed for reproducibility

    return:
     X_train, X_test, y_train, y_test: the split dataset arrays

    description:
     Generates a 2-class non-linear dataset (interleaving half circles)
     and splits it into training and testing subsets. The split is
     stratified on the labels so both classes keep their proportions.
    """

    # generate the moons dataset using sklearn
    #
    X, y = make_moons(n_samples=n_samples, noise=noise, random_state=seed)

    # split into train and test sets (stratify keeps the class balance)
    #
    X_tr, X_te, y_tr, y_te = train_test_split(X, y,
                                              test_size=test_size,
                                              random_state=seed,
                                              stratify=y)

    # exit gracefully
    #
    return X_tr, X_te, y_tr, y_te

# plot the decision boundaries and print metrics
#
def evaluate_and_plot_knn(X_train, X_test, y_train, y_test, k_values,
                          outfile=DEF_OUT_FILE_NAME):
    """
    method: evaluate_and_plot_knn

    arguments:
     X_train, y_train: training data
     X_test, y_test: testing data
     k_values: list of K values to train and plot
     outfile: path to save the resulting image

    return:
     status: boolean indicating success

    description:
     Trains a KNN classifier for each K in k_values, prints train/test
     metrics to the console, and plots the boundaries side-by-side.
    """

    # define color maps for the plot
    #  light colors for the background decision regions
    #  dark colors for the actual data points
    #
    cmap_light = ListedColormap(['#FFAAAA', '#AAAAFF'])
    cmap_bold = ListedColormap(['#FF0000', '#0000FF'])

    # determine the min and max limits for the grid based on ALL data
    # so that both the training and the testing points fit in the view
    #
    X_all = np.vstack((X_train, X_test))
    x_min, x_max = X_all[:, 0].min() - 0.5, X_all[:, 0].max() + 0.5
    y_min, y_max = X_all[:, 1].min() - 0.5, X_all[:, 1].max() + 0.5

    # generate a dense grid of points (mesh)
    #
    xx, yy = np.meshgrid(np.arange(x_min, x_max, DEF_MESH_STEP),
                         np.arange(y_min, y_max, DEF_MESH_STEP))

    # flatten the grid for predictions
    #
    grid_points = np.c_[xx.ravel(), yy.ravel()]

    # setup the figure layout based on the number of K values
    #
    n_plots = len(k_values)
    fig, axes = plt.subplots(1, n_plots, figsize=(7 * n_plots, 6))

    # ensure axes is always iterable (even if only 1 plot)
    #
    if n_plots == 1:
        axes = [axes]

    print("\n" + "=" * 40)
    print("       KNN EVALUATION METRICS       ")
    print("=" * 40)

    # iterate over each K value and its corresponding subplot axis
    #
    for k, ax in zip(k_values, axes):

        # initialize and train the KNN classifier
        #
        clf = KNeighborsClassifier(n_neighbors=k)
        clf.fit(X_train, y_train)

        # --- METRICS CALCULATION ---
        # predict on both train and test to show overfitting/generalization
        #
        y_train_pred = clf.predict(X_train)
        y_test_pred = clf.predict(X_test)

        train_acc = accuracy_score(y_train, y_train_pred)
        test_acc = accuracy_score(y_test, y_test_pred)
        test_f1 = f1_score(y_test, y_test_pred, average='macro')

        print("Model: KNN (K = %d)" % k)
        print("  Train Accuracy : %.2f%%" % (train_acc * 100))
        print("  Test Accuracy  : %.2f%%" % (test_acc * 100))
        print("  Test F1-Score  : %.4f" % test_f1)
        print("-" * 40)

        # --- PLOTTING ---
        # predict the class for every point in the grid
        #
        Z = clf.predict(grid_points)
        Z = Z.reshape(xx.shape)

        # plot the decision boundary (background colors)
        #
        ax.contourf(xx, yy, Z, cmap=cmap_light, alpha=0.8)

        # plot the TRAINING points (circles)
        #
        ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train,
                   cmap=cmap_bold, edgecolor='k', s=35,
                   marker='o', label='Train Data')

        # plot the TESTING points (large stars with white edges)
        #
        ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test,
                   cmap=cmap_bold, edgecolor='white', s=150,
                   marker='*', label='Test Data')

        # format the subplot
        #
        ax.set_xlim(xx.min(), xx.max())
        ax.set_ylim(yy.min(), yy.max())
        ax.set_title("KNN (K = %i)\nTest Acc: %.1f%%" %
                     (k, test_acc * 100), fontsize=14)
        ax.set_xlabel("Feature 1")
        ax.set_ylabel("Feature 2")

        # add legend to explain markers
        #
        ax.legend(loc='lower right', framealpha=0.9)

    # add a main title to the figure
    #
    fig.suptitle(DEF_PLOT_TITLE, fontsize=16, y=0.98)

    # adjust layout (leave headroom for the suptitle)
    #
    plt.tight_layout(rect=[0, 0, 1, 0.95])

    # save the plot to disk
    #
    try:
        plt.savefig(outfile, dpi=150)
        print("\nSaved visualization to: %s" % outfile)
    except Exception as e:
        print("**> Error saving plot: %s" % str(e))
        return False

    # display the plot to the user
    #
    plt.show()

    # exit gracefully
    #
    return True

# function: main
#
def main(argv):
    """
    method: main

    arguments:
     argv: command line arguments (currently unused)

    return:
     status: boolean indicating success

    description:
     Drives the demonstration: generates/splits the data, then trains,
     evaluates, and plots the KNN models for each default K value.
    """

    print("--- Starting KNN Metrics Demonstration ---")

    # 1. generate and split the dataset
    #
    print("Generating synthetic non-linear dataset...")
    X_train, X_test, y_train, y_test = generate_and_split_data()

    print("  Total samples : %d" % (len(y_train) + len(y_test)))
    print("  Training set  : %d samples" % len(y_train))
    print("  Testing set   : %d samples" % len(y_test))

    # 2. evaluate and plot
    #
    status = evaluate_and_plot_knn(X_train, X_test, y_train, y_test,
                                   DEF_K_VALUES)
    if not status:
        print("**> Process failed during evaluation/plotting.")
        return False

    print("--- Demonstration Complete ---")

    # exit gracefully
    #
    return True

# begin gracefully
#
if __name__ == '__main__':
    # pass only the arguments (not the program name) and propagate the
    # boolean status as a proper process exit code (0 = success, 1 = failure)
    #
    sys.exit(0 if main(sys.argv[1:]) else 1)

#
# end of file