#!/usr/bin/env python # # file: mlp.py # # revision history: # # 20260403 (SP): implement deep multi-layer perceptron with gaussian dataset #------------------------------------------------------------------------------ # import system modules # import os import sys import random import math # import third-party visualization and numerical libraries # import matplotlib.pyplot as plt import numpy as np #------------------------------------------------------------------------------ # # global variables are listed here # #------------------------------------------------------------------------------ # set the filename using basename # __FILE__ = os.path.basename(__file__) # define default hyperparameters # DEF_EPOCHS = int(2000) DEF_NUM_SAMPLES = 300 DEF_LEARNING_RATE = 0.05 DEF_LAMBDA = 0.001 DEF_GAUSSIAN_MEAN_0 = 2.0 DEF_GAUSSIAN_MEAN_1 = 4.0 DEF_GAUSSIAN_STD = 1.2 DEF_RANDOM_SEED = 27 DEF_OUTPUT_FILE = 'mlp_boundary.png' # define network architecture (number of neurons per layer) # DEF_INPUT_DIM = 2 DEF_HIDDEN_1_DIM = 8 DEF_HIDDEN_2_DIM = 8 DEF_OUTPUT_DIM = 1 # set the numpy random seed for reproducible data generation # np.random.seed(DEF_RANDOM_SEED) # initialize empty lists for our dataset # def_data_x = [] def_data_y = [] # generate a 2D overlapping gaussian dataset to evaluate the deep MLP # for _ in range(DEF_NUM_SAMPLES // 2): # generate class 0 (gaussian centered at [2.0, 2.0] with std deviation of 1.2) # g0_x = np.random.normal(DEF_GAUSSIAN_MEAN_0, DEF_GAUSSIAN_STD) g0_y = np.random.normal(DEF_GAUSSIAN_MEAN_0, DEF_GAUSSIAN_STD) def_data_x.append([g0_x, g0_y]) def_data_y.append(0) # generate class 1 (gaussian centered at [4.0, 4.0] with std deviation of 1.2) # g1_x = np.random.normal(DEF_GAUSSIAN_MEAN_1, DEF_GAUSSIAN_STD) g1_y = np.random.normal(DEF_GAUSSIAN_MEAN_1, DEF_GAUSSIAN_STD) def_data_x.append([g1_x, g1_y]) def_data_y.append(1) # expose the combined and fully generated standard python lists # DEF_DATA_X = def_data_x DEF_DATA_Y = def_data_y 
#------------------------------------------------------------------------------
#
# functions are listed here
#
#------------------------------------------------------------------------------

# apply the ReLU activation function
#
def relu(x):
    """
    method: relu

    arguments:
     x: float input value

    return:
     result: the non-linear relu activation

    description:
     Calculates the Rectified Linear Unit (ReLU) activation function,
     which returns the maximum of 0 and the input.
    """

    # exit gracefully
    #
    return max(0.0, x)

# calculate the derivative of the ReLU activation
#
def relu_deriv(x):
    """
    method: relu_deriv

    arguments:
     x: float input value representing the pre-activation value

    return:
     result: the gradient of the relu function (1.0 if x > 0, else 0.0)

    description:
     Calculates the derivative of the ReLU function for backpropagation.
    """

    # exit gracefully: positive pre-activation has unit gradient,
    # non-positive pre-activation has zero gradient
    #
    return 1.0 if x > 0.0 else 0.0

# apply the Sigmoid activation function
#
def sigmoid(x):
    """
    method: sigmoid

    arguments:
     x: float input value

    return:
     result: the sigmoid activation bounded between 0 and 1

    description:
     Calculates the Sigmoid activation function, commonly used for binary
     classification outputs. It includes a cap to prevent math domain
     overflows.
    """

    # cap the input to prevent math overflow errors in math.exp
    #
    x = max(min(x, 500.0), -500.0)

    # apply sigmoid equation
    #
    val = 1.0 / (1.0 + math.exp(-x))

    # exit gracefully
    #
    return val

# initialize a weight matrix with random values
#
def init_weights(rows, cols, seed=DEF_RANDOM_SEED):
    """
    method: init_weights

    arguments:
     rows: integer representing the number of neurons in the current layer
     cols: integer representing the number of neurons in the previous layer
     seed: random seed for reproducibility

    return:
     matrix: a 2D list of randomly initialized weights

    description:
     Creates and populates a 2D python list with small random floats to
     serve as initial weights for a neural network layer. Note that this
     reseeds the module-level random generator as a side effect.
    """

    # set the seed so each layer's initialization is reproducible
    #
    random.seed(seed)

    # generate a matrix with random values centered around 0
    #
    matrix = [[random.uniform(-0.5, 0.5) for _ in range(cols)]
              for _ in range(rows)]

    # exit gracefully
    #
    return matrix

# compute a single affine layer transform
#
def _affine(W, b, v):
    """
    method: _affine (private helper)

    arguments:
     W: 2D list of weights with shape [len(b) x len(v)]
     b: list of bias terms, one per output neuron
     v: input vector for this layer

    return:
     out: the pre-activation vector W*v + b

    description:
     Computes the weighted sum plus bias for every neuron in one layer.
     Factored out so the three layers of the forward pass share one
     implementation.
    """

    # accumulate the dot product of each weight row with the input,
    # then add the corresponding bias term
    #
    out = []
    for i in range(len(W)):
        acc = 0.0
        for j in range(len(v)):
            acc += W[i][j] * v[j]
        out.append(acc + b[i])

    # exit gracefully
    #
    return out

# perform a single forward pass through the network
#
def forward_pass(x, W1, b1, W2, b2, W3, b3):
    """
    method: forward_pass

    arguments:
     x: list representing a single input feature vector
     W1, b1: weights and biases for the first hidden layer
     W2, b2: weights and biases for the second hidden layer
     W3, b3: weights and biases for the output layer

    return:
     z1, a1, z2, a2, z3, y_hat: intermediate pre-activations and activations
     (z3 is a one-element list holding the output logit, y_hat is a float
      probability in [0, 1])

    description:
     Pushes a single input vector forward through the deep MLP architecture:
     two ReLU hidden layers followed by a single sigmoid output neuron.
    """

    # first hidden layer: z1 = W1*x + b1, a1 = relu(z1)
    #
    z1 = _affine(W1, b1, x)
    a1 = [relu(z) for z in z1]

    # second hidden layer: z2 = W2*a1 + b2, a2 = relu(z2)
    #
    z2 = _affine(W2, b2, a1)
    a2 = [relu(z) for z in z2]

    # output layer: z3 = W3*a2 + b3, y_hat = sigmoid(z3)
    # y_hat represents the model's predicted probability for class 1
    #
    z3 = _affine(W3, b3, a2)
    y_hat = sigmoid(z3[0])

    # exit gracefully
    #
    return z1, a1, z2, a2, z3, y_hat

# train the deep multi-layer perceptron
#
def train_mlp(X, y, epochs=DEF_EPOCHS, lr=DEF_LEARNING_RATE,
              lambda_param=DEF_LAMBDA, seed=DEF_RANDOM_SEED,
              hidden1_dim=DEF_HIDDEN_1_DIM, hidden2_dim=DEF_HIDDEN_2_DIM):
    """
    method: train_mlp

    arguments:
     X: list of lists representing the feature vectors
     y: list of class labels (0 or 1)
     epochs: number of times to iterate over the dataset
     lr: learning rate for gradient descent
     lambda_param: L2 regularization strength parameter
     seed: random seed for reproducibility
     hidden1_dim: number of neurons in the first hidden layer
     hidden2_dim: number of neurons in the second hidden layer

    return:
     W1, b1, W2, b2, W3, b3: optimized weights and biases

    description:
     Trains a deep neural network (2 hidden layers) using Stochastic
     Gradient Descent (SGD) and backpropagation with pure python
     structures. Minimizes Binary Cross Entropy (BCE) loss combined with
     L2 regularization (weights only; biases are not regularized). The
     input dimensionality is inferred from the data, and the hidden layer
     widths default to the module-level architecture constants.
    """

    # set the random seed (governs the per-epoch shuffles)
    #
    random.seed(int(seed))

    # infer the input dimensionality from the data; fall back to the
    # module default when the dataset is empty
    #
    input_dim = len(X[0]) if X else DEF_INPUT_DIM

    # initialize weight matrices and bias vectors (distinct seeds per
    # layer so the layers do not start with identical rows)
    #
    W1 = init_weights(hidden1_dim, input_dim, seed)
    b1 = [0.0 for _ in range(hidden1_dim)]
    W2 = init_weights(hidden2_dim, hidden1_dim, seed + 1)
    b2 = [0.0 for _ in range(hidden2_dim)]
    W3 = init_weights(DEF_OUTPUT_DIM, hidden2_dim, seed + 2)
    b3 = [0.0 for _ in range(DEF_OUTPUT_DIM)]

    # loop up to the maximum number of epochs
    #
    for epoch in range(epochs):

        # pair features and labels so we can shuffle them together;
        # rebuilt each epoch so every shuffle starts from the original
        # order (keeps the seeded trajectory reproducible)
        #
        dataset = list(zip(X, y))
        random.shuffle(dataset)

        # iterate through each individual sample (Stochastic Gradient
        # Descent)
        #
        for x_val, y_val in dataset:

            # forward pass
            #
            z1, a1, z2, a2, z3, y_hat = forward_pass(x_val, W1, b1,
                                                     W2, b2, W3, b3)

            # backward pass (backpropagation):
            # gradient of Binary Cross-Entropy loss w.r.t. z3 simplifies
            # to (y_hat - y) for a sigmoid output
            #
            dz3 = y_hat - y_val

            # gradients for the output layer (layer 3)
            #
            dW3 = [[dz3 * a2[j] for j in range(hidden2_dim)]]
            db3 = [dz3]

            # backpropagate error to hidden layer 2 through W3 and the
            # ReLU derivative
            #
            dz2 = [W3[0][i] * dz3 * relu_deriv(z2[i])
                   for i in range(hidden2_dim)]

            # gradients for hidden layer 2
            #
            dW2 = [[dz2[i] * a1[j] for j in range(hidden1_dim)]
                   for i in range(hidden2_dim)]
            db2 = dz2

            # backpropagate error to hidden layer 1
            #
            da1 = []
            for j in range(hidden1_dim):
                da1_sum = 0.0
                for i in range(hidden2_dim):
                    da1_sum += W2[i][j] * dz2[i]
                da1.append(da1_sum)
            dz1 = [da1[j] * relu_deriv(z1[j]) for j in range(hidden1_dim)]

            # gradients for hidden layer 1
            #
            dW1 = [[dz1[i] * x_val[j] for j in range(input_dim)]
                   for i in range(hidden1_dim)]
            db1 = dz1

            # update weights and biases (with L2 regularization applied
            # to weights only)
            #
            # update layer 3
            #
            for j in range(hidden2_dim):
                W3[0][j] -= lr * (dW3[0][j] + lambda_param * W3[0][j])
            b3[0] -= lr * db3[0]

            # update layer 2
            #
            for i in range(hidden2_dim):
                for j in range(hidden1_dim):
                    W2[i][j] -= lr * (dW2[i][j] + lambda_param * W2[i][j])
                b2[i] -= lr * db2[i]

            # update layer 1
            #
            for i in range(hidden1_dim):
                for j in range(input_dim):
                    W1[i][j] -= lr * (dW1[i][j] + lambda_param * W1[i][j])
                b1[i] -= lr * db1[i]

    # exit gracefully
    #
    return W1, b1, W2, b2, W3, b3

# calculate the classification error rate of the trained MLP
#
def calculate_error_rate(X, y, W1, b1, W2, b2, W3, b3):
    """
    method: calculate_error_rate

    arguments:
     X: list of lists representing the feature vectors
     y: list of true class labels (0 or 1)
     W1, b1, W2, b2, W3, b3: the optimized network parameters

    return:
     error_rate: float representing the fraction of incorrect predictions
     error_count: integer count of misclassified points

    description:
     Evaluates the model's accuracy by running a forward pass for each
     point and comparing the thresholded output against the true label.
     An empty dataset yields (0.0, 0) rather than a division error.
    """

    # count the points and guard against an empty dataset, which would
    # otherwise raise a ZeroDivisionError below
    #
    total_points = len(y)
    if total_points == 0:
        return 0.0, 0

    # initialize a counter for misclassified points
    #
    error_count = 0

    # iterate through every point in the dataset
    #
    for i in range(total_points):

        # calculate the network output
        #
        _, _, _, _, _, y_hat = forward_pass(X[i], W1, b1, W2, b2, W3, b3)

        # determine the predicted class using a 0.5 probability threshold
        #
        prediction = 1 if y_hat >= 0.5 else 0

        # check if the prediction does not match the true label
        #
        if prediction != y[i]:
            error_count += 1

    # calculate the error rate as a float
    #
    error_rate = float(error_count) / total_points

    # exit gracefully
    #
    return error_rate, error_count

# render a plot of the data points and non-linear decision boundary
#
def plot_decision_boundary(X, y, W1, b1, W2, b2, W3, b3):
    """
    method: plot_decision_boundary

    arguments:
     X: list of lists representing the feature vectors
     y: list of class labels (0 or 1)
     W1, b1, W2, b2, W3, b3: the optimized network parameters

    return:
     True: indicates successful execution

    description:
     Generates a graphical plot using matplotlib to display the dataset
     and the complex decision boundary formed by the deep MLP, then saves
     it to DEF_OUTPUT_FILE. Uses an explicit figure that is closed after
     saving so repeated calls do not draw into (or leak) stale figures.
    """

    # globally set the base font size for all matplotlib elements
    #
    plt.rcParams.update({'font.size': 12})

    # create a dedicated figure so we never draw on a stale canvas
    #
    plt.figure()

    # determine the ranges for the x and y axes to build a mesh grid
    #
    x_min = min(p[0] for p in X) - 1.5
    x_max = max(p[0] for p in X) + 1.5
    y_min = min(p[1] for p in X) - 1.5
    y_max = max(p[1] for p in X) + 1.5

    # construct a mesh grid
    #
    xx, yy = np.meshgrid(np.linspace(x_min, x_max, 150),
                         np.linspace(y_min, y_max, 150))

    # predict on every point in the mesh grid using the pure python
    # forward pass; each grid point is thresholded at 0.5 to a hard label
    #
    Z = []
    for i in range(xx.shape[0]):
        row_preds = []
        for j in range(xx.shape[1]):
            grid_pt = [xx[i, j], yy[i, j]]
            _, _, _, _, _, y_hat = forward_pass(grid_pt, W1, b1,
                                                W2, b2, W3, b3)
            row_preds.append(1 if y_hat >= 0.5 else 0)
        Z.append(row_preds)
    Z = np.array(Z)

    # plot the filled contour to represent the decision boundary
    #
    plt.contourf(xx, yy, Z, cmap=plt.cm.RdBu, alpha=0.4)

    # separate the dataset into class 1 and class 0 for colored plotting
    #
    class_1_x = [X[i][0] for i in range(len(X)) if y[i] == 1]
    class_1_y = [X[i][1] for i in range(len(X)) if y[i] == 1]
    class_0_x = [X[i][0] for i in range(len(X)) if y[i] == 0]
    class_0_y = [X[i][1] for i in range(len(X)) if y[i] == 0]

    # plot the actual data points
    #
    plt.scatter(class_1_x, class_1_y, color='blue', marker='o',
                edgecolors='k', label='Class 1', s=50)
    plt.scatter(class_0_x, class_0_y, color='red', marker='s',
                edgecolors='k', label='Class 0', s=50)

    # configure the plot aesthetics and labels
    #
    plt.xlabel('Feature 1')
    plt.ylabel('Feature 2')
    plt.title('MLP - Decision Boundary')
    plt.legend(loc='upper right', fontsize=11)

    # ensure the axes are scaled equally
    #
    plt.axis('equal')

    # save the final plot and release the figure to avoid leaking
    # matplotlib state across repeated calls
    #
    plt.savefig(DEF_OUTPUT_FILE)
    plt.close()

    # exit gracefully
    #
    return True

# function: main
#
def main(argv):
    """
    method: main

    arguments:
     argv: command line arguments (currently unused)

    return:
     True: indicates successful execution

    description:
     Main entry point that demonstrates training a Deep MLP on a 2D
     overlapping Gaussian dataset and visualizing the boundary.
    """

    # define the initial dataset and parameters
    #
    X = DEF_DATA_X
    y = DEF_DATA_Y

    # print header
    #
    print("Starting Deep Multi-Layer Perceptron (MLP) demonstration...")
    print("Architecture: %d Inputs -> %d Hidden -> %d Hidden -> %d Output" %
          (DEF_INPUT_DIM, DEF_HIDDEN_1_DIM, DEF_HIDDEN_2_DIM,
           DEF_OUTPUT_DIM))
    print("Number of samples: %d" % len(X))
    print("-" * 65)

    # run the mlp training loop using gradient descent
    #
    print("Executing MLP Stochastic Gradient Descent")
    W1, b1, W2, b2, W3, b3 = train_mlp(X, y)

    # calculate the error rate
    #
    error_rate, error_count = calculate_error_rate(X, y, W1, b1,
                                                   W2, b2, W3, b3)
    accuracy = (1.0 - error_rate) * 100

    # print the metrics
    #
    print("Training Complete.")
    print("-" * 65)
    print("Misclassified Points : %d out of %d" % (error_count, len(X)))
    print("Model Accuracy : %.2f%%" % accuracy)
    print("-" * 65)

    # render the matplotlib plot
    #
    print("Launching Matplotlib Visualization...")
    plot_decision_boundary(X, y, W1, b1, W2, b2, W3, b3)
    print("Plot Saved at %s. Execution finished." % DEF_OUTPUT_FILE)
    print("-" * 65)

    # exit gracefully
    #
    return True

# begin gracefully
#
if __name__ == '__main__':
    main(sys.argv)

#
# end of file