#!/usr/bin/env python
#
# file: hierarchical_clustering_demo.py
#
# description:
#  This script provides an educational demonstration of Hierarchical
#  (Agglomerative) Clustering. It generates a synthetic dataset, plots
#  the raw data, the hierarchical tree (dendrogram), and the final cluster
#  assignments, while evaluating performance using the Adjusted Rand Index.
#
# revision history:
#  20260316 (AM): added raw data plot to the visualization
#  20260316 (AM): initial version
#------------------------------------------------------------------------------

# import system modules
#
import os
import sys

import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import adjusted_rand_score, silhouette_score
from scipy.cluster.hierarchy import dendrogram, linkage

#------------------------------------------------------------------------------
#
# global variables are listed here
#
#------------------------------------------------------------------------------

# set the filename using basename
#
__FILE__ = os.path.basename(__file__)

# define default values for the dataset, model, and plotting
#
DEF_N_SAMPLES = 150
DEF_N_CLUSTERS = 3
DEF_RANDOM_SEED = 42
DEF_LINKAGE = 'ward'  # 'ward' minimizes the variance of clusters being merged
DEF_OUT_FILE_NAME = "hierarchical_clustering_3panel.png"
DEF_PLOT_TITLE = "Hierarchical Agglomerative Clustering"

#------------------------------------------------------------------------------
#
# functions are listed here
#
#------------------------------------------------------------------------------

# generate a synthetic 2D dataset
#
def generate_data(n_samples=DEF_N_SAMPLES, centers=DEF_N_CLUSTERS,
                  seed=DEF_RANDOM_SEED):
    """
    method: generate_data

    arguments:
     n_samples: total number of points to generate
     centers: number of distinct blobs (ground truth clusters)
     seed: random seed for reproducibility

    return:
     X: feature matrix of shape (n_samples, 2)
     y_true: ground truth label array of shape (n_samples,)

    description:
     Generates a 2D dataset of distinct blobs to demonstrate clustering.
    """

    # generate the blobs dataset using sklearn
    #
    X, y_true = make_blobs(n_samples=n_samples,
                           centers=centers,
                           cluster_std=1.2,
                           random_state=seed)

    # exit gracefully
    #
    return X, y_true

# evaluate the clustering model and plot the 3-panel visualization
#
def evaluate_and_plot_clustering(X, y_true, n_clusters, linkage_method,
                                 outfile=DEF_OUT_FILE_NAME):
    """
    method: evaluate_and_plot_clustering

    arguments:
     X: feature matrix
     y_true: ground truth labels (used only for evaluation/plotting)
     n_clusters: the number of clusters to form
     linkage_method: the linkage criterion (e.g., 'ward', 'average')
     outfile: path to save the resulting image

    return:
     status: boolean indicating success

    description:
     Applies Agglomerative Clustering, prints evaluation metrics, and
     generates a 3-panel plot (Raw Data + Dendrogram + Final Clusters).
    """

    # --- MODELING & METRICS ---
    #
    # initialize and fit the Agglomerative Clustering model
    #
    model = AgglomerativeClustering(n_clusters=n_clusters,
                                    linkage=linkage_method)
    y_pred = model.fit_predict(X)

    # calculate evaluation metrics
    #
    # note: silhouette_score requires at least 2 clusters, so we fall
    # back to NaN for the degenerate single-cluster case rather than
    # letting sklearn raise a ValueError
    #
    ari_score = adjusted_rand_score(y_true, y_pred)
    if n_clusters > 1:
        sil_score = silhouette_score(X, y_pred)
    else:
        sil_score = float('nan')

    print("\n" + "="*40)
    print("   CLUSTERING EVALUATION METRICS   ")
    print("="*40)
    print("Model: Agglomerative Clustering")
    print(" Linkage Method      : %s" % linkage_method)
    print(" Target Clusters (K) : %d" % n_clusters)
    print(" Adjusted Rand Index : %.4f (1.0 is perfect match)" % ari_score)
    print(" Silhouette Score    : %.4f (-1 to 1, higher is better)" % sil_score)
    print("-" * 40)

    # --- PLOTTING ---
    #
    # setup a 3-panel figure layout
    #
    fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(18, 5.5))

    # Subplot 1: Raw Data (Ground Truth)
    #
    ax1.scatter(X[:, 0], X[:, 1], c=y_true, cmap='viridis',
                edgecolor='k', s=50, alpha=0.9)
    ax1.set_title("Raw Data (Ground Truth)", fontsize=14)
    ax1.set_xlabel("Feature 1")
    ax1.set_ylabel("Feature 2")

    # Subplot 2: Dendrogram
    #  We use scipy's linkage matrix to plot the tree structure
    #
    Z = linkage(X, method=linkage_method)
    dendrogram(Z, ax=ax2, truncate_mode='level', p=5,
               show_leaf_counts=True, no_labels=True)
    ax2.set_title("Hierarchical Dendrogram (%s)" % linkage_method, fontsize=14)
    ax2.set_xlabel("Data Points (Leaves)")
    ax2.set_ylabel("Merge Distance")

    # draw a horizontal line to show where the tree is "cut"
    #
    # the cut height for K clusters lies strictly between the merge that
    # produces K groups (Z[-n_clusters]) and the merge that would reduce
    # them to K-1 (Z[-(n_clusters-1)]); placing the line at the midpoint
    # avoids drawing it exactly on top of a merge bar. the line is only
    # meaningful for K >= 2, so the degenerate K = 1 case is skipped.
    #
    if n_clusters >= 2:
        cut_distance = 0.5 * (Z[-n_clusters, 2] + Z[-(n_clusters - 1), 2])
        ax2.axhline(y=cut_distance, color='r', linestyle='--',
                    label='Cut for K=%d' % n_clusters)
        ax2.legend()

    # Subplot 3: Discovered Clusters
    #  Note: Cluster colors may not align exactly with Ground Truth colors
    #  since unsupervised learning assigns arbitrary integer IDs to groups.
    #
    ax3.scatter(X[:, 0], X[:, 1], c=y_pred, cmap='viridis',
                edgecolor='k', s=50, alpha=0.9)
    ax3.set_title("Discovered Clusters\nARI: %.2f" % ari_score, fontsize=14)
    ax3.set_xlabel("Feature 1")
    ax3.set_ylabel("Feature 2")

    # add a main title to the figure
    #
    fig.suptitle(DEF_PLOT_TITLE, fontsize=16, y=0.98)

    # adjust layout
    #
    plt.tight_layout(rect=[0, 0, 1, 0.95])

    # save the plot to disk
    #
    try:
        plt.savefig(outfile, dpi=150)
        print("\nSaved visualization to: %s" % outfile)
    except Exception as e:
        print("**> Error saving plot: %s" % str(e))
        return False

    # display the plot to the user
    #
    plt.show()

    # exit gracefully
    #
    return True

# function: main
#
def main(argv):
    """
    method: main

    arguments:
     argv: command-line argument list (currently unused; accepted for
           conventional script structure)

    return:
     status: boolean indicating success

    description:
     Drives the demonstration: generates data, then runs the clustering,
     evaluation, and plotting pipeline with the module defaults.
    """

    print("--- Starting Hierarchical Clustering Demonstration ---")

    # 1. Generate the dataset
    #
    print("Generating synthetic clustering dataset...")
    X, y_true = generate_data()
    print(" Total samples : %d" % len(y_true))
    print(" Hidden classes: %d" % DEF_N_CLUSTERS)

    # 2. Evaluate and plot
    #
    print("\nRunning Hierarchical Clustering...")
    status = evaluate_and_plot_clustering(X, y_true,
                                          n_clusters=DEF_N_CLUSTERS,
                                          linkage_method=DEF_LINKAGE)
    if not status:
        print("**> Process failed during evaluation/plotting.")
        return False

    print("--- Demonstration Complete ---")

    # exit gracefully
    #
    return True

# begin gracefully
#
if __name__ == '__main__':
    main(sys.argv)

#
# end of file