#!/usr/bin/env python
#
# file: Principal Component Analysis (PCA) Demonstration on 2D Gaussian Data
#
# revision history:
#
# 20260116 (SP): demonstration of PCA algorithm on 2D gaussian data
#------------------------------------------------------------------------------

# import system modules
#
import os
import sys
import numpy as np
import matplotlib.pyplot as plt

#------------------------------------------------------------------------------
#
# global variables are listed here
#
#------------------------------------------------------------------------------

# set the filename using basename
#
__FILE__ = os.path.basename(__file__)

# define default values for arguments
#
DEF_COV_0              = [[1.0, 0.0],
                          [0.0, 1.0]]
DEF_COV_1              = [[1.0, 0.0],
                          [0.0, 1.0]]
DEF_MEAN_0             = [0.0, 0.0]
DEF_MEAN_1             = [0.0, 0.0]
DEF_NSAMPLES           = int(1000)
DEF_PERCENT_MULTIPLIER = 100
DEF_PCA_NCOMP          = 2
DEF_RANDOM_SEED        = 27
DEF_SYMMETRY_TOL       = 1e-12

#------------------------------------------------------------------------------
#
# functions are listed here
#
#------------------------------------------------------------------------------

# generate samples from a 2D multivariate Gaussian distribution
#
def generate_gaussian(mu, cov, nsamples, seed=DEF_RANDOM_SEED):
    """
    method: generate_gaussian
    arguments:
        mu: mean vector of length 2
        cov: 2x2 covariance matrix
        nsamples: number of samples to draw
        seed: random seed for repeatability
    return:
        x: (nsamples, 2) numpy array of samples from N(mu, cov)
    description:
        Generate samples from a 2D multivariate Gaussian distribution
        with specified mean and covariance.
    """

    # set the random seed for repeatability
    #
    np.random.seed(int(seed))

    # check basic input dimensions
    #
    mu = np.asarray(mu, dtype=float).reshape(-1)
    cov = np.asarray(cov, dtype=float)
    
    # ensure mu is length 2 and cov is 2x2
    #
    if (mu.size != 2) or (cov.shape != (2, 2)):
        print("**> Error: mu must be length 2 and cov must be 2x2")
        return None

    # ensure the covariance matrix is symmetric
    #
    if not np.allclose(cov, cov.T, atol=DEF_SYMMETRY_TOL):
        print("**> Error: covariance matrix must be symmetric")
        return None

    # draw nsamples from N(mu, cov)
    #
    x = np.random.multivariate_normal(mean=mu, cov=cov, size=int(nsamples))

    # return samples as an (N,2) numpy array
    #
    return x

def run_pca(X):
    """
    method: run_pca
    
    arguments:
        X: data matrix of shape (N, D)
        N = number of samples
        D = number of dimensions (features)
    
    return:
        mu: mean vector of shape (D,)
        eigvals: eigenvalues sorted in descending order
        eigvecs: corresponding eigenvectors as columns

    description:
        Perform Principal Component Analysis (PCA) on the data matrix X.
        Computes the mean, covariance matrix, eigenvalues, and eigenvectors.
    """
    
    # make sure X is a numpy array
    #
    X = np.asarray(X, dtype=float)
    
    # get the mean of the data (D,)
    #
    mu = X.mean(axis=0)
    
    # center the data (N, D)
    #
    Xc = X - mu

    # calculate the covariance matrix (D x D)
    #
    C = (Xc.T @ Xc) / (Xc.shape[0] - 1)

    # calculating real eigenvalues and eigenvectors using np.linalg.eigh 
    #
    eigvals, eigvecs = np.linalg.eigh(C)

    # sort descending
    #
    idx = np.argsort(eigvals)[::-1]
    
    # get sorted eigenvalues and eigenvectors
    #
    eigvals = eigvals[idx]
    eigvecs = eigvecs[:, idx]
    
    # return the mean, eigenvalues, and eigenvectors
    #
    return mu, eigvals, eigvecs

def project_pca(X, mu, eigvecs, ncomp=DEF_PCA_NCOMP):
    """
    method: project_pca 
    arguments:
        X: data matrix of shape (N, D)
        mu: mean vector of shape (D,)
        eigvecs: eigenvectors as columns (D x D)
        ncomp: number of principal components to project onto
    return:
        Z: projected data of shape (N, ncomp)
    description:
        Project the data X onto the first ncomp principal components.
    """
    
    # center the data
    #
    Xc = np.asarray(X, dtype=float) - mu
    
    # get the projection matrix
    #
    W = eigvecs[:, :int(ncomp)]
    
    # project the data by matrix multiplication
    #
    Z = Xc @ W
    
    # return projected data
    #
    return Z

def classify_nearest_centroid(Z, y):
    """
    method: classify_nearest_centroid
    arguments:
        Z: data matrix in PCA space of shape (N, ncomp)
        y: true labels of shape (N,)
    return:
        yhat: predicted labels of shape (N,)
    description:
        Classify each sample in Z by nearest centroid based on true labels y.
    """
    # ensure inputs are numpy arrays
    #
    Z = np.asarray(Z, dtype=float)
    
    # ensure y is a 1D array of integers by reshaping
    #
    y = np.asarray(y, dtype=int).reshape(-1)
     
    # get unique classes and their centroids
    #
    classes = np.unique(y)
    
    # declare list to hold centroids
    #
    centroids = []
    
    # compute centroids for each class
    #
    for c in classes:
        # compute centroid for class c
        #
        centroids.append(Z[y == c].mean(axis=0))
        
    # stack centroids into a (K, ncomp) array
    #
    centroids = np.vstack(centroids)  

    # calculate distances to each centroid: (N, K)
    #
    d2 = ((Z[:, None, :] - centroids[None, :, :]) ** 2).sum(axis=2)
    
    # assign class based on nearest centroid
    #
    pred_idx = np.argmin(d2, axis=1) 
    
    # map indices back to class labels
    #
    yhat = classes[pred_idx]
    
    # return predicted labels
    #
    return yhat

def confusion_matrix_2class(y_true, y_pred):
    """
    method: confusion_matrix_2class
    arguments:
        y_true: true labels of shape (N,)
        y_pred: predicted labels of shape (N,)
    return:
        cm: 2x2 confusion matrix, CM where
            CM[i,j] = number of samples with true label i predicted as j 
            The final confusion matrix has the form:
                [[TP, FP],
                 [FN, TN]]
    description:
        Compute the confusion matrix for binary classification (labels 0 and 1).
    """
    
    # ensure inputs are 1D arrays of integers
    #
    y_true = np.asarray(y_true, dtype=int).reshape(-1)
    y_pred = np.asarray(y_pred, dtype=int).reshape(-1)
    
    # initialize confusion matrix
    #
    cm = np.zeros((2, 2), dtype=int)
    
    # populate confusion matrix
    #
    for t, p in zip(y_true, y_pred):
        # for true label t and predicted label p
        # increment the appropriate cell in cm
        #
        if t in (0, 1) and p in (0, 1):
            # if true and predicted labels are valid
            #
            cm[t, p] += 1
        else:
            raise ValueError("Labels must be 0 or 1 for this confusion_matrix_2class().")
        
    # return confusion matrix
    #
    return cm

def error_rate_from_cm(cm):
    """
    method: error_rate_from_cm
    arguments:
        cm: confusion matrix of shape (2, 2)
    return:
        err: error rate (float)
    description:
        Calculate the error rate from the confusion matrix.
    """
    # get total and correct predictions
    #
    total = cm.sum()
    
    # np.trace the sum of diagonal elements
    #
    correct = np.trace(cm)
    
    # calculate error rate
    #
    return 1.0 - (correct / total if total > 0 else 0.0)

# function: main
#
def main(argv):

    # samples per class
    #
    nsamples = DEF_NSAMPLES
    
    # if one command line argument provided, use it as number of samples
    #
    if len(argv) > 1:
        try:
            nsamples = int(argv[1])
        except:
            print("**> Using default value of %d" % nsamples)
            nsamples = DEF_NSAMPLES
    
    # define number of PCA components to project onto
    #
    ncomp = DEF_PCA_NCOMP
    
    print("\nUsing nsamples=%d per class, ncomp=%d\n" % (nsamples, ncomp))
    
    # generate 2D gaussian data for classe 0
    #
    x0 = generate_gaussian(DEF_MEAN_0, DEF_COV_0, nsamples, seed=DEF_RANDOM_SEED)
    
    # generate 2D gaussian data for classe 1
    #
    x1 = generate_gaussian(DEF_MEAN_1, DEF_COV_1, nsamples, seed=DEF_RANDOM_SEED + 1)
    
    # check for errors
    #
    if x0 is None or x1 is None:
        return False
    
    # combine data and labels
    # np.vstack: vertical stack (along rows)
    # np.hstack: horizontal stack (along columns)
    #
    X = np.vstack([x0, x1])                 
    y = np.hstack([np.zeros(nsamples, int),  
                   np.ones(nsamples, int)])

    # PCA on all data
    #
    mu, eigvals, eigvecs = run_pca(X)

    # project onto first ncomp components
    #
    Z = project_pca(X, mu, eigvecs, ncomp=ncomp)

    # classify in PCA space
    #
    yhat = classify_nearest_centroid(Z, y)

    # confusion matrix + error rate
    #
    cm = confusion_matrix_2class(y, yhat)
    
    # calculate error rate
    #
    err = error_rate_from_cm(cm)

    print("\nPCA eigenvalues (descending): %s" % np.array2string(eigvals, precision=4))
    print("\nConfusion matrix:")
    print("            Pred 0  Pred 1")
    print("True 0      %5d  %5d" % (cm[0, 0], cm[0, 1]))
    print("True 1      %5d  %5d" % (cm[1, 0], cm[1, 1]))
    print("\nError rate: %.2f%%  (accuracy: %.2f%%)\n" % (err * DEF_PERCENT_MULTIPLIER, DEF_PERCENT_MULTIPLIER - err * DEF_PERCENT_MULTIPLIER))
    
    # exit gracefully
    #
    return True


# begin gracefully
#
if __name__ == '__main__':
    main(sys.argv[0:])

#
# end of file