#!/usr/bin/env python # # file: mlp.py # # revision history: # # 20260403 (SP): implement deep multi-layer perceptron with gaussian dataset #------------------------------------------------------------------------------ # import system modules # import os import sys import random import math # import third-party visualization and numerical libraries # import matplotlib.pyplot as plt import numpy as np #------------------------------------------------------------------------------ # # global variables are listed here # #------------------------------------------------------------------------------ # set the filename using basename # __FILE__ = os.path.basename(__file__) # define default hyperparameters # DEF_EPOCHS = int(2000) DEF_NUM_SAMPLES = 300 DEF_LEARNING_RATE = 0.05 DEF_LAMBDA = 0.001 DEF_GAUSSIAN_MEAN_0 = 2.0 DEF_GAUSSIAN_MEAN_1 = 4.0 DEF_GAUSSIAN_STD = 1.2 DEF_RANDOM_SEED = 27 DEF_OUTPUT_FILE = 'mlp_boundary.png' # define network architecture (number of neurons per layer) # DEF_INPUT_DIM = 2 DEF_HIDDEN_1_DIM = 8 DEF_HIDDEN_2_DIM = 8 DEF_OUTPUT_DIM = 1 # set the numpy random seed for reproducible data generation # np.random.seed(DEF_RANDOM_SEED) # initialize empty lists for our dataset # def_data_x = [] def_data_y = [] # generate a 2D overlapping gaussian dataset to evaluate the deep MLP # for _ in range(DEF_NUM_SAMPLES // 2): # generate class 0 (gaussian centered at [2.0, 2.0] with std deviation of 1.2) # g0_x = np.random.normal(DEF_GAUSSIAN_MEAN_0, DEF_GAUSSIAN_STD) g0_y = np.random.normal(DEF_GAUSSIAN_MEAN_0, DEF_GAUSSIAN_STD) def_data_x.append([g0_x, g0_y]) def_data_y.append(0) # generate class 1 (gaussian centered at [4.0, 4.0] with std deviation of 1.2) # g1_x = np.random.normal(DEF_GAUSSIAN_MEAN_1, DEF_GAUSSIAN_STD) g1_y = np.random.normal(DEF_GAUSSIAN_MEAN_1, DEF_GAUSSIAN_STD) def_data_x.append([g1_x, g1_y]) def_data_y.append(1) # expose the combined and fully generated standard python lists # DEF_DATA_X = def_data_x DEF_DATA_Y = def_data_y 
#------------------------------------------------------------------------------
#
# functions are listed here
#
#------------------------------------------------------------------------------

# apply the ReLU activation function
#
def relu(x):
    """
    method: relu

    arguments:
     x: float input value

    return:
     result: the non-linear relu activation

    description:
     Calculates the Rectified Linear Unit (ReLU) activation function,
     which returns the maximum of 0 and the input.
    """

    # exit gracefully
    #
    return max(0.0, x)

# calculate the derivative of the ReLU activation
#
def relu_deriv(x):
    """
    method: relu_deriv

    arguments:
     x: float input value representing the pre-activation value

    return:
     result: the gradient of the relu function (1.0 if x > 0, else 0.0)

    description:
     Calculates the derivative of the ReLU function for backpropagation.
    """

    # exit gracefully: positive pre-activation has unit gradient,
    # non-positive pre-activation has zero gradient
    #
    return 1.0 if x > 0.0 else 0.0

# apply the Sigmoid activation function
#
def sigmoid(x):
    """
    method: sigmoid

    arguments:
     x: float input value

    return:
     result: the sigmoid activation bounded between 0 and 1

    description:
     Calculates the Sigmoid activation function, commonly used for binary
     classification outputs. It includes a cap to prevent math domain
     overflows.
    """

    # cap the input to prevent math overflow errors in math.exp
    #
    x = max(min(x, 500.0), -500.0)

    # apply sigmoid equation
    #
    val = 1.0 / (1.0 + math.exp(-x))

    # exit gracefully
    #
    return val

# initialize a weight matrix with random values
#
def init_weights(rows, cols, seed=DEF_RANDOM_SEED):
    """
    method: init_weights

    arguments:
     rows: integer representing the number of neurons in the current layer
     cols: integer representing the number of neurons in the previous layer
     seed: random seed for reproducibility

    return:
     matrix: a 2D list of randomly initialized weights

    description:
     Creates and populates a 2D python list with small random floats to
     serve as initial weights for a neural network layer. Note that this
     reseeds the module-level random generator as a side effect.
    """

    # set the seed so each layer's initialization is reproducible
    #
    random.seed(seed)

    # generate a matrix with random values centered around 0
    #
    matrix = [[random.uniform(-0.5, 0.5) for _ in range(cols)]
              for _ in range(rows)]

    # exit gracefully
    #
    return matrix

# compute a single affine layer transform
#
def _affine(W, b, v):
    """
    method: _affine (private helper)

    arguments:
     W: 2D list of weights with shape [len(b) x len(v)]
     b: list of bias terms, one per output neuron
     v: input vector for this layer

    return:
     out: the pre-activation vector W*v + b

    description:
     Computes the weighted sum plus bias for every neuron in one layer.
     Factored out so the three layers of the forward pass share one
     implementation.
    """

    # accumulate the dot product of each weight row with the input,
    # then add the corresponding bias term
    #
    out = []
    for i in range(len(W)):
        acc = 0.0
        for j in range(len(v)):
            acc += W[i][j] * v[j]
        out.append(acc + b[i])

    # exit gracefully
    #
    return out

# perform a single forward pass through the network
#
def forward_pass(x, W1, b1, W2, b2, W3, b3):
    """
    method: forward_pass

    arguments:
     x: list representing a single input feature vector
     W1, b1: weights and biases for the first hidden layer
     W2, b2: weights and biases for the second hidden layer
     W3, b3: weights and biases for the output layer

    return:
     z1, a1, z2, a2, z3, y_hat: intermediate pre-activations and activations
     (z3 is a one-element list holding the output logit, y_hat is a float
      probability in [0, 1])

    description:
     Pushes a single input vector forward through the deep MLP architecture:
     two ReLU hidden layers followed by a single sigmoid output neuron.
    """

    # first hidden layer: z1 = W1*x + b1, a1 = relu(z1)
    #
    z1 = _affine(W1, b1, x)
    a1 = [relu(z) for z in z1]

    # second hidden layer: z2 = W2*a1 + b2, a2 = relu(z2)
    #
    z2 = _affine(W2, b2, a1)
    a2 = [relu(z) for z in z2]

    # output layer: z3 = W3*a2 + b3, y_hat = sigmoid(z3)
    # y_hat represents the model's predicted probability for class 1
    #
    z3 = _affine(W3, b3, a2)
    y_hat = sigmoid(z3[0])

    # exit gracefully
    #
    return z1, a1, z2, a2, z3, y_hat

# train the deep multi-layer perceptron
#
def train_mlp(X, y, epochs=DEF_EPOCHS, lr=DEF_LEARNING_RATE,
              lambda_param=DEF_LAMBDA, seed=DEF_RANDOM_SEED,
              hidden1_dim=DEF_HIDDEN_1_DIM, hidden2_dim=DEF_HIDDEN_2_DIM):
    """
    method: train_mlp

    arguments:
     X: list of lists representing the feature vectors
     y: list of class labels (0 or 1)
     epochs: number of times to iterate over the dataset
     lr: learning rate for gradient descent
     lambda_param: L2 regularization strength parameter
     seed: random seed for reproducibility
     hidden1_dim: number of neurons in the first hidden layer
     hidden2_dim: number of neurons in the second hidden layer

    return:
     W1, b1, W2, b2, W3, b3: optimized weights and biases

    description:
     Trains a deep neural network (2 hidden layers) using Stochastic
     Gradient Descent (SGD) and backpropagation with pure python
     structures. Minimizes Binary Cross Entropy (BCE) loss combined with
     L2 regularization (weights only; biases are not regularized). The
     input dimensionality is inferred from the data, and the hidden layer
     widths default to the module-level architecture constants.
    """

    # set the random seed (governs the per-epoch shuffles)
    #
    random.seed(int(seed))

    # infer the input dimensionality from the data; fall back to the
    # module default when the dataset is empty
    #
    input_dim = len(X[0]) if X else DEF_INPUT_DIM

    # initialize weight matrices and bias vectors (distinct seeds per
    # layer so the layers do not start with identical rows)
    #
    W1 = init_weights(hidden1_dim, input_dim, seed)
    b1 = [0.0 for _ in range(hidden1_dim)]
    W2 = init_weights(hidden2_dim, hidden1_dim, seed + 1)
    b2 = [0.0 for _ in range(hidden2_dim)]
    W3 = init_weights(DEF_OUTPUT_DIM, hidden2_dim, seed + 2)
    b3 = [0.0 for _ in range(DEF_OUTPUT_DIM)]

    # loop up to the maximum number of epochs
    #
    for epoch in range(epochs):

        # pair features and labels so we can shuffle them together;
        # rebuilt each epoch so every shuffle starts from the original
        # order (keeps the seeded trajectory reproducible)
        #
        dataset = list(zip(X, y))
        random.shuffle(dataset)

        # iterate through each individual sample (Stochastic Gradient
        # Descent)
        #
        for x_val, y_val in dataset:

            # forward pass
            #
            z1, a1, z2, a2, z3, y_hat = forward_pass(x_val, W1, b1,
                                                     W2, b2, W3, b3)

            # backward pass (backpropagation):
            # gradient of Binary Cross-Entropy loss w.r.t. z3 simplifies
            # to (y_hat - y) for a sigmoid output
            #
            dz3 = y_hat - y_val

            # gradients for the output layer (layer 3)
            #
            dW3 = [[dz3 * a2[j] for j in range(hidden2_dim)]]
            db3 = [dz3]

            # backpropagate error to hidden layer 2 through W3 and the
            # ReLU derivative
            #
            dz2 = [W3[0][i] * dz3 * relu_deriv(z2[i])
                   for i in range(hidden2_dim)]

            # gradients for hidden layer 2
            #
            dW2 = [[dz2[i] * a1[j] for j in range(hidden1_dim)]
                   for i in range(hidden2_dim)]
            db2 = dz2

            # backpropagate error to hidden layer 1
            #
            da1 = []
            for j in range(hidden1_dim):
                da1_sum = 0.0
                for i in range(hidden2_dim):
                    da1_sum += W2[i][j] * dz2[i]
                da1.append(da1_sum)
            dz1 = [da1[j] * relu_deriv(z1[j]) for j in range(hidden1_dim)]

            # gradients for hidden layer 1
            #
            dW1 = [[dz1[i] * x_val[j] for j in range(input_dim)]
                   for i in range(hidden1_dim)]
            db1 = dz1

            # update weights and biases (with L2 regularization applied
            # to weights only)
            #
            # update layer 3
            #
            for j in range(hidden2_dim):
                W3[0][j] -= lr * (dW3[0][j] + lambda_param * W3[0][j])
            b3[0] -= lr * db3[0]

            # update layer 2
            #
            for i in range(hidden2_dim):
                for j in range(hidden1_dim):
                    W2[i][j] -= lr * (dW2[i][j] + lambda_param * W2[i][j])
                b2[i] -= lr * db2[i]

            # update layer 1
            #
            for i in range(hidden1_dim):
                for j in range(input_dim):
                    W1[i][j] -= lr * (dW1[i][j] + lambda_param * W1[i][j])
                b1[i] -= lr * db1[i]

    # exit gracefully
    #
    return W1, b1, W2, b2, W3, b3

# calculate the classification error rate of the trained MLP
#
def calculate_error_rate(X, y, W1, b1, W2, b2, W3, b3):
    """
    method: calculate_error_rate

    arguments:
     X: list of lists representing the feature vectors
     y: list of true class labels (0 or 1)
     W1, b1, W2, b2, W3, b3: the optimized network parameters

    return:
     error_rate: float representing the fraction of incorrect predictions
     error_count: integer count of misclassified points

    description:
     Evaluates the model's accuracy by running a forward pass for each
     point and comparing the thresholded output against the true label.
     An empty dataset yields (0.0, 0) rather than a division error.
    """

    # count the points and guard against an empty dataset, which would
    # otherwise raise a ZeroDivisionError below
    #
    total_points = len(y)
    if total_points == 0:
        return 0.0, 0

    # initialize a counter for misclassified points
    #
    error_count = 0

    # iterate through every point in the dataset
    #
    for i in range(total_points):

        # calculate the network output
        #
        _, _, _, _, _, y_hat = forward_pass(X[i], W1, b1, W2, b2, W3, b3)

        # determine the predicted class using a 0.5 probability threshold
        #
        prediction = 1 if y_hat >= 0.5 else 0

        # check if the prediction does not match the true label
        #
        if prediction != y[i]:
            error_count += 1

    # calculate the error rate as a float
    #
    error_rate = float(error_count) / total_points

    # exit gracefully
    #
    return error_rate, error_count

# render a plot of the data points and non-linear decision boundary
#
def plot_decision_boundary(X, y, W1, b1, W2, b2, W3, b3):
    """
    method: plot_decision_boundary

    arguments:
     X: list of lists representing the feature vectors
     y: list of class labels (0 or 1)
     W1, b1, W2, b2, W3, b3: the optimized network parameters

    return:
     True: indicates successful execution

    description:
     Generates a graphical plot using matplotlib to display the dataset
     and the complex decision boundary formed by the deep MLP, then saves
     it to DEF_OUTPUT_FILE. Uses an explicit figure that is closed after
     saving so repeated calls do not draw into (or leak) stale figures.
    """

    # globally set the base font size for all matplotlib elements
    #
    plt.rcParams.update({'font.size': 12})

    # create a dedicated figure so we never draw on a stale canvas
    #
    plt.figure()

    # determine the ranges for the x and y axes to build a mesh grid
    #
    x_min = min(p[0] for p in X) - 1.5
    x_max = max(p[0] for p in X) + 1.5
    y_min = min(p[1] for p in X) - 1.5
    y_max = max(p[1] for p in X) + 1.5

    # construct a mesh grid
    #
    xx, yy = np.meshgrid(np.linspace(x_min, x_max, 150),
                         np.linspace(y_min, y_max, 150))

    # predict on every point in the mesh grid using the pure python
    # forward pass; each grid point is thresholded at 0.5 to a hard label
    #
    Z = []
    for i in range(xx.shape[0]):
        row_preds = []
        for j in range(xx.shape[1]):
            grid_pt = [xx[i, j], yy[i, j]]
            _, _, _, _, _, y_hat = forward_pass(grid_pt, W1, b1,
                                                W2, b2, W3, b3)
            row_preds.append(1 if y_hat >= 0.5 else 0)
        Z.append(row_preds)
    Z = np.array(Z)

    # plot the filled contour to represent the decision boundary
    #
    plt.contourf(xx, yy, Z, cmap=plt.cm.RdBu, alpha=0.4)

    # separate the dataset into class 1 and class 0 for colored plotting
    #
    class_1_x = [X[i][0] for i in range(len(X)) if y[i] == 1]
    class_1_y = [X[i][1] for i in range(len(X)) if y[i] == 1]
    class_0_x = [X[i][0] for i in range(len(X)) if y[i] == 0]
    class_0_y = [X[i][1] for i in range(len(X)) if y[i] == 0]

    # plot the actual data points
    #
    plt.scatter(class_1_x, class_1_y, color='blue', marker='o',
                edgecolors='k', label='Class 1', s=50)
    plt.scatter(class_0_x, class_0_y, color='red', marker='s',
                edgecolors='k', label='Class 0', s=50)

    # configure the plot aesthetics and labels
    #
    plt.xlabel('Feature 1')
    plt.ylabel('Feature 2')
    plt.title('MLP - Decision Boundary')
    plt.legend(loc='upper right', fontsize=11)

    # ensure the axes are scaled equally
    #
    plt.axis('equal')

    # save the final plot and release the figure to avoid leaking
    # matplotlib state across repeated calls
    #
    plt.savefig(DEF_OUTPUT_FILE)
    plt.close()

    # exit gracefully
    #
    return True

# function: main
#
def main(argv):
    """
    method: main

    arguments:
     argv: command line arguments (currently unused)

    return:
     True: indicates successful execution

    description:
     Main entry point that demonstrates training a Deep MLP on a 2D
     overlapping Gaussian dataset and visualizing the boundary.
    """

    # define the initial dataset and parameters
    #
    X = DEF_DATA_X
    y = DEF_DATA_Y

    # print header
    #
    print("Starting Deep Multi-Layer Perceptron (MLP) demonstration...")
    print("Architecture: %d Inputs -> %d Hidden -> %d Hidden -> %d Output" %
          (DEF_INPUT_DIM, DEF_HIDDEN_1_DIM, DEF_HIDDEN_2_DIM,
           DEF_OUTPUT_DIM))
    print("Number of samples: %d" % len(X))
    print("-" * 65)

    # run the mlp training loop using gradient descent
    #
    print("Executing MLP Stochastic Gradient Descent")
    W1, b1, W2, b2, W3, b3 = train_mlp(X, y)

    # calculate the error rate
    #
    error_rate, error_count = calculate_error_rate(X, y, W1, b1,
                                                   W2, b2, W3, b3)
    accuracy = (1.0 - error_rate) * 100

    # print the metrics
    #
    print("Training Complete.")
    print("-" * 65)
    print("Misclassified Points : %d out of %d" % (error_count, len(X)))
    print("Model Accuracy : %.2f%%" % accuracy)
    print("-" * 65)

    # render the matplotlib plot
    #
    print("Launching Matplotlib Visualization...")
    plot_decision_boundary(X, y, W1, b1, W2, b2, W3, b3)
    print("Plot Saved at %s. Execution finished." % DEF_OUTPUT_FILE)
    print("-" * 65)

    # exit gracefully
    #
    return True

# begin gracefully
#
if __name__ == '__main__':
    main(sys.argv)

#
# end of file