#!/usr/bin/env python
#
# file: knn_demonstration.py
#
# description:
#  This script demonstrates the K-Nearest Neighbors (KNN) algorithm.
#  It splits a synthetic non-linear dataset into training and test sets,
#  trains models for different K values, prints evaluation metrics,
#  and visualizes the decision boundaries along with the train/test splits.
#
# revision history:
#  20260316 (AM): added train/test split, metrics printing, and test set plotting
#  20260316 (AM): initial version
#------------------------------------------------------------------------------

# import system modules
#
import os
import sys

import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, f1_score

#------------------------------------------------------------------------------
#
# global variables are listed here
#
#------------------------------------------------------------------------------

# set the filename using basename
#
__FILE__ = os.path.basename(__file__)

# define default values for the dataset, model, and plotting
#
DEF_N_SAMPLES = 300
DEF_NOISE = 0.30
DEF_TEST_SIZE = 0.30          # 30% of data used for testing
DEF_RANDOM_SEED = 42
DEF_K_VALUES = [1, 15]        # compare K=1 (overfit) to K=15 (smooth)
DEF_MESH_STEP = 0.02          # step size for the decision boundary grid
DEF_OUT_FILE_NAME = "knn_decision_boundaries_with_test.png"
DEF_PLOT_TITLE = "KNN Decision Boundaries (Train vs Test)"

#------------------------------------------------------------------------------
#
# functions are listed here
#
#------------------------------------------------------------------------------

# generate and split a synthetic 2D dataset
#
def generate_and_split_data(n_samples=DEF_N_SAMPLES, noise=DEF_NOISE,
                            test_size=DEF_TEST_SIZE, seed=DEF_RANDOM_SEED):
    """
    method: generate_and_split_data

    arguments:
     n_samples: total number of points to generate
     noise: standard deviation of Gaussian noise added to the data
     test_size: fraction of data to reserve for testing
     seed: random seed for reproducibility

    return:
     X_train, X_test, y_train, y_test: the split dataset arrays

    description:
     Generates a 2-class non-linear dataset (interleaving half circles)
     and splits it into training and testing subsets. The split is
     stratified on the labels so both classes keep their proportions.
    """

    # generate the moons dataset using sklearn
    #
    X, y = make_moons(n_samples=n_samples, noise=noise, random_state=seed)

    # split into train and test sets (stratify keeps the class balance)
    #
    X_tr, X_te, y_tr, y_te = train_test_split(X, y,
                                              test_size=test_size,
                                              random_state=seed,
                                              stratify=y)

    # exit gracefully
    #
    return X_tr, X_te, y_tr, y_te

# plot the decision boundaries and print metrics
#
def evaluate_and_plot_knn(X_train, X_test, y_train, y_test, k_values,
                          outfile=DEF_OUT_FILE_NAME):
    """
    method: evaluate_and_plot_knn

    arguments:
     X_train, y_train: training data
     X_test, y_test: testing data
     k_values: list of K values to train and plot
     outfile: path to save the resulting image

    return:
     status: boolean indicating success

    description:
     Trains a KNN classifier for each K in k_values, prints train/test
     metrics to the console, and plots the boundaries side-by-side.
    """

    # define color maps for the plot
    #  light colors for the background decision regions
    #  dark colors for the actual data points
    #
    cmap_light = ListedColormap(['#FFAAAA', '#AAAAFF'])
    cmap_bold = ListedColormap(['#FF0000', '#0000FF'])

    # determine the min and max limits for the grid based on ALL data
    # so that both the training and the testing points fit in the view
    #
    X_all = np.vstack((X_train, X_test))
    x_min, x_max = X_all[:, 0].min() - 0.5, X_all[:, 0].max() + 0.5
    y_min, y_max = X_all[:, 1].min() - 0.5, X_all[:, 1].max() + 0.5

    # generate a dense grid of points (mesh)
    #
    xx, yy = np.meshgrid(np.arange(x_min, x_max, DEF_MESH_STEP),
                         np.arange(y_min, y_max, DEF_MESH_STEP))

    # flatten the grid for predictions
    #
    grid_points = np.c_[xx.ravel(), yy.ravel()]

    # setup the figure layout based on the number of K values
    #
    n_plots = len(k_values)
    fig, axes = plt.subplots(1, n_plots, figsize=(7 * n_plots, 6))

    # ensure axes is always iterable (even if only 1 plot)
    #
    if n_plots == 1:
        axes = [axes]

    print("\n" + "=" * 40)
    print("       KNN EVALUATION METRICS       ")
    print("=" * 40)

    # iterate over each K value and its corresponding subplot axis
    #
    for k, ax in zip(k_values, axes):

        # initialize and train the KNN classifier
        #
        clf = KNeighborsClassifier(n_neighbors=k)
        clf.fit(X_train, y_train)

        # --- METRICS CALCULATION ---
        # predict on both train and test to show overfitting/generalization
        #
        y_train_pred = clf.predict(X_train)
        y_test_pred = clf.predict(X_test)

        train_acc = accuracy_score(y_train, y_train_pred)
        test_acc = accuracy_score(y_test, y_test_pred)
        test_f1 = f1_score(y_test, y_test_pred, average='macro')

        print("Model: KNN (K = %d)" % k)
        print("  Train Accuracy : %.2f%%" % (train_acc * 100))
        print("  Test Accuracy  : %.2f%%" % (test_acc * 100))
        print("  Test F1-Score  : %.4f" % test_f1)
        print("-" * 40)

        # --- PLOTTING ---
        # predict the class for every point in the grid
        #
        Z = clf.predict(grid_points)
        Z = Z.reshape(xx.shape)

        # plot the decision boundary (background colors)
        #
        ax.contourf(xx, yy, Z, cmap=cmap_light, alpha=0.8)

        # plot the TRAINING points (circles)
        #
        ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train,
                   cmap=cmap_bold, edgecolor='k', s=35,
                   marker='o', label='Train Data')

        # plot the TESTING points (large stars with white edges)
        #
        ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test,
                   cmap=cmap_bold, edgecolor='white', s=150,
                   marker='*', label='Test Data')

        # format the subplot
        #
        ax.set_xlim(xx.min(), xx.max())
        ax.set_ylim(yy.min(), yy.max())
        ax.set_title("KNN (K = %i)\nTest Acc: %.1f%%" %
                     (k, test_acc * 100), fontsize=14)
        ax.set_xlabel("Feature 1")
        ax.set_ylabel("Feature 2")

        # add legend to explain markers
        #
        ax.legend(loc='lower right', framealpha=0.9)

    # add a main title to the figure
    #
    fig.suptitle(DEF_PLOT_TITLE, fontsize=16, y=0.98)

    # adjust layout (leave headroom for the suptitle)
    #
    plt.tight_layout(rect=[0, 0, 1, 0.95])

    # save the plot to disk
    #
    try:
        plt.savefig(outfile, dpi=150)
        print("\nSaved visualization to: %s" % outfile)
    except Exception as e:
        print("**> Error saving plot: %s" % str(e))
        return False

    # display the plot to the user
    #
    plt.show()

    # exit gracefully
    #
    return True

# function: main
#
def main(argv):
    """
    method: main

    arguments:
     argv: command line arguments (currently unused)

    return:
     status: boolean indicating success

    description:
     Drives the demonstration: generates/splits the data, then trains,
     evaluates, and plots the KNN models for each default K value.
    """

    print("--- Starting KNN Metrics Demonstration ---")

    # 1. generate and split the dataset
    #
    print("Generating synthetic non-linear dataset...")
    X_train, X_test, y_train, y_test = generate_and_split_data()

    print("  Total samples : %d" % (len(y_train) + len(y_test)))
    print("  Training set  : %d samples" % len(y_train))
    print("  Testing set   : %d samples" % len(y_test))

    # 2. evaluate and plot
    #
    status = evaluate_and_plot_knn(X_train, X_test, y_train, y_test,
                                   DEF_K_VALUES)
    if not status:
        print("**> Process failed during evaluation/plotting.")
        return False

    print("--- Demonstration Complete ---")

    # exit gracefully
    #
    return True

# begin gracefully
#
if __name__ == '__main__':
    # pass only the arguments (not the program name) and propagate the
    # boolean status as a proper process exit code (0 = success, 1 = failure)
    #
    sys.exit(0 if main(sys.argv[1:]) else 1)

#
# end of file