#!/usr/bin/env python
#
# file: hierarchical_clustering_demo.py
#
# description:
#  This script provides an educational demonstration of Hierarchical
#  (Agglomerative) Clustering. It generates a synthetic dataset, plots
#  the raw data, the hierarchical tree (dendrogram), and the final cluster
#  assignments, while evaluating performance using the Adjusted Rand Index.
#
# revision history:
#  20260316 (AM): added raw data plot to the visualization
#  20260316 (AM): initial version
#------------------------------------------------------------------------------

# import system modules
#
import os
import sys

import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import adjusted_rand_score, silhouette_score
from scipy.cluster.hierarchy import dendrogram, linkage

#------------------------------------------------------------------------------
#
# global variables are listed here
#
#------------------------------------------------------------------------------

# set the filename using basename
#
__FILE__ = os.path.basename(__file__)

# define default values for the dataset, model, and plotting
#
DEF_N_SAMPLES = 150
DEF_N_CLUSTERS = 3
DEF_RANDOM_SEED = 42
DEF_LINKAGE = 'ward'  # 'ward' minimizes the variance of clusters being merged
DEF_OUT_FILE_NAME = "hierarchical_clustering_3panel.png"
DEF_PLOT_TITLE = "Hierarchical Agglomerative Clustering"

#------------------------------------------------------------------------------
#
# functions are listed here
#
#------------------------------------------------------------------------------

# generate a synthetic 2D dataset
#
def generate_data(n_samples=DEF_N_SAMPLES, centers=DEF_N_CLUSTERS,
                  seed=DEF_RANDOM_SEED):
    """
    method: generate_data

    arguments:
     n_samples: total number of points to generate
     centers: number of distinct blobs (ground truth clusters)
     seed: random seed for reproducibility

    return:
     X: feature matrix of shape (n_samples, 2)
     y_true: ground truth label array of shape (n_samples,)

    description:
     Generates a 2D dataset of distinct blobs to demonstrate clustering.
    """

    # generate the blobs dataset using sklearn
    #
    X, y_true = make_blobs(n_samples=n_samples,
                           centers=centers,
                           cluster_std=1.2,
                           random_state=seed)

    # exit gracefully
    #
    return X, y_true

# evaluate the clustering model and plot the 3-panel visualization
#
def evaluate_and_plot_clustering(X, y_true, n_clusters, linkage_method,
                                 outfile=DEF_OUT_FILE_NAME):
    """
    method: evaluate_and_plot_clustering

    arguments:
     X: feature matrix
     y_true: ground truth labels (used only for evaluation/plotting)
     n_clusters: the number of clusters to form
     linkage_method: the linkage criterion (e.g., 'ward', 'average')
     outfile: path to save the resulting image

    return:
     status: boolean indicating success

    description:
     Applies Agglomerative Clustering, prints evaluation metrics, and
     generates a 3-panel plot (Raw Data + Dendrogram + Final Clusters).
    """

    # --- MODELING & METRICS ---
    #
    # initialize and fit the Agglomerative Clustering model
    #
    model = AgglomerativeClustering(n_clusters=n_clusters,
                                    linkage=linkage_method)
    y_pred = model.fit_predict(X)

    # calculate evaluation metrics
    #
    # note: silhouette_score requires at least 2 clusters, so we fall
    # back to NaN for the degenerate single-cluster case rather than
    # letting sklearn raise a ValueError
    #
    ari_score = adjusted_rand_score(y_true, y_pred)
    if n_clusters > 1:
        sil_score = silhouette_score(X, y_pred)
    else:
        sil_score = float('nan')

    print("\n" + "="*40)
    print("   CLUSTERING EVALUATION METRICS   ")
    print("="*40)
    print("Model: Agglomerative Clustering")
    print(" Linkage Method      : %s" % linkage_method)
    print(" Target Clusters (K) : %d" % n_clusters)
    print(" Adjusted Rand Index : %.4f (1.0 is perfect match)" % ari_score)
    print(" Silhouette Score    : %.4f (-1 to 1, higher is better)" % sil_score)
    print("-" * 40)

    # --- PLOTTING ---
    #
    # setup a 3-panel figure layout
    #
    fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(18, 5.5))

    # Subplot 1: Raw Data (Ground Truth)
    #
    ax1.scatter(X[:, 0], X[:, 1], c=y_true, cmap='viridis',
                edgecolor='k', s=50, alpha=0.9)
    ax1.set_title("Raw Data (Ground Truth)", fontsize=14)
    ax1.set_xlabel("Feature 1")
    ax1.set_ylabel("Feature 2")

    # Subplot 2: Dendrogram
    #  We use scipy's linkage matrix to plot the tree structure
    #
    Z = linkage(X, method=linkage_method)
    dendrogram(Z, ax=ax2, truncate_mode='level', p=5,
               show_leaf_counts=True, no_labels=True)
    ax2.set_title("Hierarchical Dendrogram (%s)" % linkage_method, fontsize=14)
    ax2.set_xlabel("Data Points (Leaves)")
    ax2.set_ylabel("Merge Distance")

    # draw a horizontal line to show where the tree is "cut"
    #
    # the cut height for K clusters lies strictly between the merge that
    # produces K groups (Z[-n_clusters]) and the merge that would reduce
    # them to K-1 (Z[-(n_clusters-1)]); placing the line at the midpoint
    # avoids drawing it exactly on top of a merge bar. the line is only
    # meaningful for K >= 2, so the degenerate K = 1 case is skipped.
    #
    if n_clusters >= 2:
        cut_distance = 0.5 * (Z[-n_clusters, 2] + Z[-(n_clusters - 1), 2])
        ax2.axhline(y=cut_distance, color='r', linestyle='--',
                    label='Cut for K=%d' % n_clusters)
        ax2.legend()

    # Subplot 3: Discovered Clusters
    #  Note: Cluster colors may not align exactly with Ground Truth colors
    #  since unsupervised learning assigns arbitrary integer IDs to groups.
    #
    ax3.scatter(X[:, 0], X[:, 1], c=y_pred, cmap='viridis',
                edgecolor='k', s=50, alpha=0.9)
    ax3.set_title("Discovered Clusters\nARI: %.2f" % ari_score, fontsize=14)
    ax3.set_xlabel("Feature 1")
    ax3.set_ylabel("Feature 2")

    # add a main title to the figure
    #
    fig.suptitle(DEF_PLOT_TITLE, fontsize=16, y=0.98)

    # adjust layout
    #
    plt.tight_layout(rect=[0, 0, 1, 0.95])

    # save the plot to disk
    #
    try:
        plt.savefig(outfile, dpi=150)
        print("\nSaved visualization to: %s" % outfile)
    except Exception as e:
        print("**> Error saving plot: %s" % str(e))
        return False

    # display the plot to the user
    #
    plt.show()

    # exit gracefully
    #
    return True

# function: main
#
def main(argv):
    """
    method: main

    arguments:
     argv: command-line argument list (currently unused; accepted for
           conventional script structure)

    return:
     status: boolean indicating success

    description:
     Drives the demonstration: generates data, then runs the clustering,
     evaluation, and plotting pipeline with the module defaults.
    """

    print("--- Starting Hierarchical Clustering Demonstration ---")

    # 1. Generate the dataset
    #
    print("Generating synthetic clustering dataset...")
    X, y_true = generate_data()
    print(" Total samples : %d" % len(y_true))
    print(" Hidden classes: %d" % DEF_N_CLUSTERS)

    # 2. Evaluate and plot
    #
    print("\nRunning Hierarchical Clustering...")
    status = evaluate_and_plot_clustering(X, y_true,
                                          n_clusters=DEF_N_CLUSTERS,
                                          linkage_method=DEF_LINKAGE)
    if not status:
        print("**> Process failed during evaluation/plotting.")
        return False

    print("--- Demonstration Complete ---")

    # exit gracefully
    #
    return True

# begin gracefully
#
if __name__ == '__main__':
    main(sys.argv)

#
# end of file