#!/usr/bin/env python
#
# file: kmeans.py
#
# revision history:
#
# 20260317 (SP): implement simple k-means clustering
#------------------------------------------------------------------------------

# import system modules
#
import os
import sys
import random
import math

#------------------------------------------------------------------------------
#
# global variables are listed here
#
#------------------------------------------------------------------------------

# set the filename using basename
#
__FILE__ = os.path.basename(__file__)

# define default values for arguments
#
# a simple 2D dataset with two natural groupings (bottom-left and top-right)
#
DEF_DATA_SET = [[1.0, 2.0], [1.5, 1.8], [5.0, 8.0],
                [8.0, 8.0], [1.0, 0.6], [9.0, 11.0]]
DEF_K_CLUSTERS = int(2)
DEF_MAX_ITERATIONS = int(100)
DEF_RANDOM_SEED = 27

#------------------------------------------------------------------------------
#
# functions are listed here
#
#------------------------------------------------------------------------------

# calculate the euclidean distance between two points
#
def calculate_distance(point1, point2):
    """
    method: calculate_distance

    arguments:
     point1: list of coordinates for the first point
     point2: list of coordinates for the second point

    return:
     distance: the euclidean distance between the two points

    raises:
     ValueError: if the two points do not have the same number of
                 coordinates

    description:
     Calculate the Euclidean distance between two n-dimensional points.
    """

    # verify that both points have the same number of dimensions:
    # a mismatch is reported as an error rather than as a distance of 0.0,
    # because 0.0 would make the malformed point look like a perfect match
    # to any nearest-neighbor search built on top of this function
    #
    if len(point1) != len(point2):
        raise ValueError("points must have the same dimensionality "
                         "(got %d and %d)" % (len(point1), len(point2)))

    # calculate the sum of squared differences for each dimension
    #
    sq_diffs = [(a - b) ** 2 for a, b in zip(point1, point2)]

    # take the square root to get the euclidean distance
    #
    distance = math.sqrt(sum(sq_diffs))

    # exit gracefully
    #
    return distance

# assign each data point to the nearest centroid
#
def assign_clusters(data, centroids):
    """
    method: assign_clusters

    arguments:
     data: list of data points (lists of numbers)
     centroids: list of current cluster centers

    return:
     clusters: list of lists containing the data points assigned to
               each cluster (clusters[i] belongs to centroids[i])

    raises:
     ValueError: if there are data points but no centroids, or if a
                 point and a centroid differ in dimensionality

    description:
     Iterate through all data points and assign each one to the cluster
     with the nearest centroid. When two centroids are equally close,
     the one with the lower index wins.
    """

    # initialize empty lists for each cluster
    #
    k = len(centroids)
    clusters = [[] for _ in range(k)]

    # points cannot be assigned anywhere if there are no centroids
    #
    if k == 0 and len(data) > 0:
        raise ValueError("cannot assign points: no centroids were given")

    # loop through each point in the dataset
    #
    for point in data:

        # calculate distances from the point to all centroids
        #
        distances = [calculate_distance(point, c) for c in centroids]

        # find the index of the closest centroid in a single pass
        # (min returns the first minimum, so ties go to the lower index)
        #
        closest_index = min(range(k), key=distances.__getitem__)

        # assign the point to the corresponding cluster
        #
        clusters[closest_index].append(point)

    # exit gracefully
    #
    return clusters

# update cluster centroids by calculating the mean of assigned points
#
def update_centroids(clusters, old_centroids):
    """
    method: update_centroids

    arguments:
     clusters: list of lists containing data points for each cluster
     old_centroids: list of the previous cluster centers

    return:
     new_centroids: list of newly calculated cluster centers

    description:
     Calculate new centroids by taking the mean of all data points
     currently assigned to each cluster. An empty cluster keeps (a copy
     of) its previous centroid.
    """

    # store the newly calculated centroids
    #
    new_centroids = []

    # get the dimensionality of the data from the first old centroid
    # (guarded so that empty input yields an empty result, not an error)
    #
    num_features = len(old_centroids[0]) if old_centroids else 0

    # iterate through each cluster and its corresponding old centroid
    #
    for idx, cluster in enumerate(clusters):

        # handle empty clusters by keeping the old centroid
        # to avoid division by zero; copy it so the old and new
        # centroid lists never share the same object
        #
        if not cluster:
            new_centroids.append(list(old_centroids[idx]))
            continue

        # calculate the mean for each dimension
        #
        new_centroid = [
            float(sum(point[dim] for point in cluster)) / len(cluster)
            for dim in range(num_features)
        ]

        # add the calculated mean point as the new centroid
        #
        new_centroids.append(new_centroid)

    # exit gracefully
    #
    return new_centroids

# perform the k-means clustering algorithm
#
def kmeans_clustering(data, k=DEF_K_CLUSTERS, max_iters=DEF_MAX_ITERATIONS,
                      seed=DEF_RANDOM_SEED):
    """
    method: kmeans_clustering

    arguments:
     data: list of data points to cluster
     k: the number of clusters to form (default: DEF_K_CLUSTERS)
     max_iters: maximum number of iterations (default: DEF_MAX_ITERATIONS)
     seed: random seed for reproducibility (default: DEF_RANDOM_SEED)

    return:
     centroids: the final cluster centers
     clusters: the final assignment of data points to clusters
               (clusters[i] holds the points nearest to centroids[i])

    raises:
     ValueError: if k is smaller than 1 or larger than the number of
                 data points

    description:
     Main K-Means algorithm loop. Initializes centroids randomly from
     the dataset, then iteratively assigns points to the nearest centroid
     and updates centroids until convergence or max iterations is reached.
    """

    # validate the number of clusters before doing any work
    #
    if k < 1 or k > len(data):
        raise ValueError("k must be between 1 and the number of data "
                         "points (%d), got %s" % (len(data), k))

    # use a private generator seeded for repeatable centroid
    # initialization: it draws the same sample as seeding the module-level
    # generator would, but leaves the caller's global random state alone
    #
    rng = random.Random(int(seed))

    # randomly select k initial centroids from the dataset; copy them so
    # the returned centroids never alias the caller's data points
    #
    centroids = [list(point) for point in rng.sample(data, k)]

    # store the clusters
    #
    clusters = []
    converged = False

    # loop up to the maximum number of iterations
    #
    for i in range(max_iters):

        # assign points to the nearest centroid
        #
        clusters = assign_clusters(data, centroids)

        # calculate new centroids based on cluster assignments
        #
        new_centroids = update_centroids(clusters, centroids)

        # check for convergence (if centroids do not change)
        #
        if centroids == new_centroids:
            print("   Converged after %d iterations." % (i + 1))
            converged = True
            break

        # update centroids for the next iteration
        #
        centroids = new_centroids

    # without convergence the centroids were moved after the last
    # assignment (or no assignment ran at all), so assign once more to
    # keep the returned clusters consistent with the returned centroids
    #
    if not converged:
        clusters = assign_clusters(data, centroids)

    # exit gracefully
    #
    return centroids, clusters

# function: main
#
def main(argv):
    """
    method: main

    arguments:
     argv: command line arguments (not used by this demonstration)

    return:
     True: indicates successful execution

    description:
     Main entry point that demonstrates the k-means clustering process
     on a simple 2-dimensional dataset.
    """

    # define the initial dataset
    #
    data = DEF_DATA_SET

    # print header
    #
    print("Starting K-Means clustering demonstration...")
    print("Dataset: %s" % (data,))
    print("K (Number of clusters): %d" % DEF_K_CLUSTERS)
    print("-" * 60)

    # run the k-means clustering algorithm
    #
    print("Executing K-Means algorithm...")
    final_centroids, final_clusters = kmeans_clustering(data,
                                                        k=DEF_K_CLUSTERS)
    print("-" * 60)

    # display the final results
    #
    print("Final Clustering Results:")

    # print each cluster and its corresponding centroid
    #
    for i, (centroid, cluster) in enumerate(zip(final_centroids,
                                                final_clusters)):

        # format the centroid coordinates for cleaner output
        #
        centroid_str = ["%.2f" % coord for coord in centroid]
        print("   Cluster %d (Centroid: %s):" % (i + 1, centroid_str))
        print("     Points: %s\n" % (cluster,))

    print("-" * 60)

    # exit gracefully
    #
    return True

# begin gracefully
#
if __name__ == '__main__':
    main(sys.argv[0:])

#
# end of file