#!/usr/bin/env python
#
# file: Principal Component Analysis (PCA) Demonstration on 2D Gaussian Data
#
# revision history:
#
# 20260116 (SP): demonstration of PCA algorithm on 2D gaussian data
#------------------------------------------------------------------------------

# import system modules
#
import os
import sys
import numpy as np
import matplotlib.pyplot as plt

#------------------------------------------------------------------------------
#
# global variables are listed here
#
#------------------------------------------------------------------------------

# set the filename using basename
#
__FILE__ = os.path.basename(__file__)

# define default values for arguments
#
DEF_COV_0 = [[1.0, 0.0], [0.0, 1.0]]
DEF_COV_1 = [[1.0, 0.0], [0.0, 1.0]]
DEF_MEAN_0 = [0.0, 0.0]
DEF_MEAN_1 = [0.0, 0.0]
DEF_NSAMPLES = int(1000)
DEF_PERCENT_MULTIPLIER = 100
DEF_PCA_NCOMP = 2
DEF_RANDOM_SEED = 27
DEF_SYMMETRY_TOL = 1e-12

#------------------------------------------------------------------------------
#
# functions are listed here
#
#------------------------------------------------------------------------------

# generate samples from a 2D multivariate Gaussian distribution
#
def generate_gaussian(mu, cov, nsamples, seed=DEF_RANDOM_SEED):
    """
    method: generate_gaussian

    arguments:
     mu: mean vector of length 2
     cov: 2x2 covariance matrix
     nsamples: number of samples to draw
     seed: random seed for repeatability

    return:
     x: (nsamples, 2) numpy array of samples from N(mu, cov)

    description:
     Generate samples from a 2D multivariate Gaussian distribution with
     specified mean and covariance.
    """

    # set the random seed for repeatability
    #
    np.random.seed(int(seed))

    # check basic input dimensions
    #
    mu = np.asarray(mu, dtype=float).reshape(-1)
    cov = np.asarray(cov, dtype=float)

    # ensure mu is length 2 and cov is 2x2
    #
    if (mu.size != 2) or (cov.shape != (2, 2)):
        print("**> Error: mu must be length 2 and cov must be 2x2")
        return None

    # ensure the covariance matrix is symmetric
    #
    if not np.allclose(cov, cov.T, atol=DEF_SYMMETRY_TOL):
        print("**> Error: covariance matrix must be symmetric")
        return None

    # draw nsamples from N(mu, cov)
    #
    x = np.random.multivariate_normal(mean=mu, cov=cov, size=int(nsamples))

    # return samples as an (N, 2) numpy array
    #
    return x
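# plot the two gaussian classes as a scatter plot
#
# note: this helper is an illustrative sketch, not part of the original
# demonstration; the name plot_classes and its arguments are assumed here.
# it uses the matplotlib import above, which is otherwise unused, to give
# a quick visual check of the generated samples.
#
def plot_classes(x0, x1):
    """
    method: plot_classes

    arguments:
     x0: (N, 2) numpy array of samples for class 0
     x1: (N, 2) numpy array of samples for class 1

    return:
     a boolean value indicating status

    description:
     Scatter-plot both classes in the original 2D feature space. This is an
     assumed helper added for illustration only.
    """

    # scatter each class with a label
    #
    plt.scatter(x0[:, 0], x0[:, 1], s=8, alpha=0.5, label="class 0")
    plt.scatter(x1[:, 0], x1[:, 1], s=8, alpha=0.5, label="class 1")

    # label the axes and display the plot
    #
    plt.xlabel("x1")
    plt.ylabel("x2")
    plt.legend()
    plt.show()

    # exit gracefully
    #
    return True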
""" # make sure X is a numpy array # X = np.asarray(X, dtype=float) # get the mean of the data (D,) # mu = X.mean(axis=0) # center the data (N, D) # Xc = X - mu # calculate the covariance matrix (D x D) # C = (Xc.T @ Xc) / (Xc.shape[0] - 1) # calculating real eigenvalues and eigenvectors using np.linalg.eigh # eigvals, eigvecs = np.linalg.eigh(C) # sort descending # idx = np.argsort(eigvals)[::-1] # get sorted eigenvalues and eigenvectors # eigvals = eigvals[idx] eigvecs = eigvecs[:, idx] # return the mean, eigenvalues, and eigenvectors # return mu, eigvals, eigvecs def project_pca(X, mu, eigvecs, ncomp=DEF_PCA_NCOMP): """ method: project_pca arguments: X: data matrix of shape (N, D) mu: mean vector of shape (D,) eigvecs: eigenvectors as columns (D x D) ncomp: number of principal components to project onto return: Z: projected data of shape (N, ncomp) description: Project the data X onto the first ncomp principal components. """ # center the data # Xc = np.asarray(X, dtype=float) - mu # get the projection matrix # W = eigvecs[:, :int(ncomp)] # project the data by matrix multiplication # Z = Xc @ W # return projected data # return Z def classify_nearest_centroid(Z, y): """ method: classify_nearest_centroid arguments: Z: data matrix in PCA space of shape (N, ncomp) y: true labels of shape (N,) return: yhat: predicted labels of shape (N,) description: Classify each sample in Z by nearest centroid based on true labels y. """ # ensure inputs are numpy arrays # Z = np.asarray(Z, dtype=float) # ensure y is a 1D array of integers by reshaping # y = np.asarray(y, dtype=int).reshape(-1) # get unique classes and their centroids # classes = np.unique(y) # declare list to hold centroids # centroids = [] # compute centroids for each class # for c in classes: # compute centroid for class c # centroids.append(Z[y == c].mean(axis=0)) # stack centroids into a (K, ncomp) array # centroids = np.vstack(centroids) # calculate distances to each centroid: (N, K) # d2 = ((Z[:, None, :] - centroids[None, :, :]) ** 2).sum(axis=2) # assign class based on nearest centroid # pred_idx = np.argmin(d2, axis=1) # map indices back to class labels # yhat = classes[pred_idx] # return predicted labels # return yhat def confusion_matrix_2class(y_true, y_pred): """ method: confusion_matrix_2class arguments: y_true: true labels of shape (N,) y_pred: predicted labels of shape (N,) return: cm: 2x2 confusion matrix, CM where CM[i,j] = number of samples with true label i predicted as j The final confusion matrix has the form: [[TP, FP], [FN, TN]] description: Compute the confusion matrix for binary classification (labels 0 and 1). """ # ensure inputs are 1D arrays of integers # y_true = np.asarray(y_true, dtype=int).reshape(-1) y_pred = np.asarray(y_pred, dtype=int).reshape(-1) # initialize confusion matrix # cm = np.zeros((2, 2), dtype=int) # populate confusion matrix # for t, p in zip(y_true, y_pred): # for true label t and predicted label p # increment the appropriate cell in cm # if t in (0, 1) and p in (0, 1): # if true and predicted labels are valid # cm[t, p] += 1 else: raise ValueError("Labels must be 0 or 1 for this confusion_matrix_2class().") # return confusion matrix # return cm def error_rate_from_cm(cm): """ method: error_rate_from_cm arguments: cm: confusion matrix of shape (2, 2) return: err: error rate (float) description: Calculate the error rate from the confusion matrix. 
""" # get total and correct predictions # total = cm.sum() # np.trace the sum of diagonal elements # correct = np.trace(cm) # calculate error rate # return 1.0 - (correct / total if total > 0 else 0.0) # function: main # def main(argv): # samples per class # nsamples = DEF_NSAMPLES # if one command line argument provided, use it as number of samples # if len(argv) > 1: try: nsamples = int(argv[1]) except: print("**> Using default value of %d" % nsamples) nsamples = DEF_NSAMPLES # define number of PCA components to project onto # ncomp = DEF_PCA_NCOMP print("\nUsing nsamples=%d per class, ncomp=%d\n" % (nsamples, ncomp)) # generate 2D gaussian data for classe 0 # x0 = generate_gaussian(DEF_MEAN_0, DEF_COV_0, nsamples, seed=DEF_RANDOM_SEED) # generate 2D gaussian data for classe 1 # x1 = generate_gaussian(DEF_MEAN_1, DEF_COV_1, nsamples, seed=DEF_RANDOM_SEED + 1) # check for errors # if x0 is None or x1 is None: return False # combine data and labels # np.vstack: vertical stack (along rows) # np.hstack: horizontal stack (along columns) # X = np.vstack([x0, x1]) y = np.hstack([np.zeros(nsamples, int), np.ones(nsamples, int)]) # PCA on all data # mu, eigvals, eigvecs = run_pca(X) # project onto first ncomp components # Z = project_pca(X, mu, eigvecs, ncomp=ncomp) # classify in PCA space # yhat = classify_nearest_centroid(Z, y) # confusion matrix + error rate # cm = confusion_matrix_2class(y, yhat) # calculate error rate # err = error_rate_from_cm(cm) print("\nPCA eigenvalues (descending): %s" % np.array2string(eigvals, precision=4)) print("\nConfusion matrix:") print(" Pred 0 Pred 1") print("True 0 %5d %5d" % (cm[0, 0], cm[0, 1])) print("True 1 %5d %5d" % (cm[1, 0], cm[1, 1])) print("\nError rate: %.2f%% (accuracy: %.2f%%)\n" % (err * DEF_PERCENT_MULTIPLIER, DEF_PERCENT_MULTIPLIER - err * DEF_PERCENT_MULTIPLIER)) # exit gracefully # return True # begin gracefully # if __name__ == '__main__': main(sys.argv[0:]) # # end of file