#!/usr/bin/env python
#
# file: kmeans.py
#
# revision history:
#
# 20260317 (SP): implement simple k-means clustering 
#------------------------------------------------------------------------------

# import system modules
#
import os
import sys
import random
import math

#------------------------------------------------------------------------------
#
# global variables are listed here
#
#------------------------------------------------------------------------------

# name of this script with any leading directory components removed
#
__FILE__ = os.path.split(__file__)[1]

# default values used when the caller does not supply its own
#
# the sample data is a small 2D set with two natural groupings
# (one towards the bottom-left, one towards the top-right)
#
DEF_DATA_SET = [
    [1.0, 2.0],
    [1.5, 1.8],
    [5.0, 8.0],
    [8.0, 8.0],
    [1.0, 0.6],
    [9.0, 11.0],
]
DEF_K_CLUSTERS = 2
DEF_MAX_ITERATIONS = 100
DEF_RANDOM_SEED = 27

#------------------------------------------------------------------------------
#
# functions are listed here
#
#------------------------------------------------------------------------------

# calculate the euclidean distance between two points
#
def calculate_distance(point1, point2):
    """
    method: calculate_distance
    arguments:
        point1: list of coordinates for the first point
        point2: list of coordinates for the second point
    return:
        distance: the euclidean distance between the two points
    raises:
        ValueError: if the two points do not have the same number
                    of coordinates
    description:
        Calculate the Euclidean distance between two n-dimensional points.
        Two empty points are considered to be at distance 0.0.
    """

    # refuse to compare points of different dimensionality: returning a
    # number here (e.g. 0.0) would be indistinguishable from "the points
    # are identical" and would silently corrupt any nearest-point search
    # built on top of this function
    #
    if len(point1) != len(point2):
        raise ValueError(
            "points must have the same dimensionality (got %d and %d)"
            % (len(point1), len(point2))
        )

    # calculate the sum of squared differences for each dimension
    #
    sq_diffs = [(a - b) ** 2 for a, b in zip(point1, point2)]

    # take the square root to get the euclidean distance
    #
    distance = math.sqrt(sum(sq_diffs))

    # exit gracefully
    #
    return distance

# assign each data point to the nearest centroid
#
def assign_clusters(data, centroids):
    """
    method: assign_clusters
    arguments:
        data: list of data points (each a list of numbers)
        centroids: list of the current cluster centers
    return:
        clusters: one list per centroid, holding the data points for
                  which that centroid is the nearest one
    description:
        Place every data point into the bucket of its nearest centroid.
        When two centroids are equally close, the one with the lower
        index wins. The returned list always has one entry per centroid,
        even if some of those entries are empty.
    """

    # start with one empty bucket per centroid
    #
    buckets = [list() for _ in range(len(centroids))]

    # place every sample into the bucket of its nearest centroid
    #
    for sample in data:

        # measure how far the sample is from each centroid
        #
        dists = [calculate_distance(sample, center) for center in centroids]

        # pick the position of the smallest distance; min() keeps the
        # first of several equal minima, so ties go to the lower index
        #
        nearest = min(range(len(dists)), key=dists.__getitem__)

        buckets[nearest].append(sample)

    # exit gracefully
    #
    return buckets

# update cluster centroids by calculating the mean of assigned points
#
def update_centroids(clusters, old_centroids):
    """
    method: update_centroids
    arguments:
        clusters: list of lists containing the data points of each cluster
        old_centroids: list of the previous cluster centers
    return:
        new_centroids: list of cluster centers, one per cluster
    description:
        Recompute every cluster center as the per-dimension mean of the
        points assigned to that cluster. A cluster with no points keeps
        its previous center (the very same object from old_centroids),
        which also avoids a division by zero. The number of dimensions
        is taken from the first entry of old_centroids.
    """

    # the first previous centroid dictates how many dimensions to average
    #
    width = len(old_centroids[0])

    # average the members of one non-empty cluster, dimension by dimension
    #
    def _mean_point(members):
        return [
            float(sum(member[axis] for member in members)) / len(members)
            for axis in range(width)
        ]

    # build the result cluster by cluster; empty clusters fall back to
    # the centroid they had before
    #
    new_centroids = [
        _mean_point(members) if members else old_centroids[pos]
        for pos, members in enumerate(clusters)
    ]

    # exit gracefully
    #
    return new_centroids

# perform the k-means clustering algorithm
#
def kmeans_clustering(data, k=DEF_K_CLUSTERS, max_iters=DEF_MAX_ITERATIONS, seed=DEF_RANDOM_SEED):
    """
    method: kmeans_clustering
    arguments:
        data: list of data points to cluster
        k: the number of clusters to form (default: DEF_K_CLUSTERS)
        max_iters: maximum number of iterations (default: DEF_MAX_ITERATIONS)
        seed: random seed for reproducibility (default: DEF_RANDOM_SEED)
    return:
        centroids: the final cluster centers
        clusters: the final assignment of data points to clusters
    description:
        Run the K-Means loop. The module-level random generator is seeded
        with the given seed and k points of the dataset are drawn as the
        starting centers. Each pass assigns every point to its nearest
        center and recomputes the centers; the loop stops as soon as a
        pass leaves the centers unchanged (a message is printed) or once
        max_iters passes have been made. If no pass is made at all, the
        returned cluster list is empty.
    """

    # seed the shared generator so the initial draw is repeatable
    #
    random.seed(int(seed))

    # draw k distinct data points to act as the starting centers
    #
    centers = random.sample(data, k)

    # holds the most recent assignment of points to centers
    #
    groups = []

    # make at most max_iters passes, counting them from one
    #
    for step in range(1, max_iters + 1):

        # assignment step followed by the update step
        #
        groups = assign_clusters(data, centers)
        updated = update_centroids(groups, centers)

        # an unchanged set of centers means the algorithm has settled
        #
        if updated == centers:
            print("   Converged after %d iterations." % step)
            break

        # carry the refreshed centers into the next pass
        #
        centers = updated

    # exit gracefully
    #
    return centers, groups

# function: main
#
def main(argv):
    """
    method: main
    arguments:
        argv: command line arguments (currently not used)
    return:
        True: indicates successful execution
    description:
        Demonstration driver: clusters the built-in 2-dimensional sample
        dataset into DEF_K_CLUSTERS groups and prints the dataset, the
        progress messages, and each resulting cluster with its centroid.
    """

    # horizontal rule used to separate the sections of the report
    #
    divider = "-" * 60

    # the demonstration always runs on the built-in sample data
    #
    data = DEF_DATA_SET

    # report what is about to be clustered
    #
    print("Starting K-Means clustering demonstration...")
    print("Dataset: %s" % data)
    print("K (Number of clusters): %d" % DEF_K_CLUSTERS)
    print(divider)

    # cluster the data
    #
    print("Executing K-Means algorithm...")
    final_centroids, final_clusters = kmeans_clustering(data, k=DEF_K_CLUSTERS)
    print(divider)

    # report each cluster, numbered from one, together with its centroid
    #
    print("Final Clustering Results:")
    for number in range(1, DEF_K_CLUSTERS + 1):

        # show the centroid coordinates with two decimal places
        #
        coords = ["%.2f" % value for value in final_centroids[number - 1]]

        print("   Cluster %d (Centroid: %s):" % (number, coords))
        print("     Points: %s\n" % final_clusters[number - 1])

    print(divider)

    # exit gracefully
    #
    return True

# begin gracefully
#
if __name__ == "__main__":

    # hand main its own copy of the full argument vector, including
    # the program name in position zero
    #
    main(list(sys.argv))

#
# end of file