#!/usr/bin/env python
#
# file: xgboost.py
#
# NOTE(review): this module name shadows the third-party "xgboost" package;
# rename the file if that library is ever imported alongside this demo.
#
# revision history:
#
# 20260309 (SP): implement gradient boosting with mse tracking
#------------------------------------------------------------------------------
# import system modules
#
import os
import sys

#------------------------------------------------------------------------------
#
# global variables are listed here
#
#------------------------------------------------------------------------------

# set the filename using basename
#
__FILE__ = os.path.basename(__file__)

# define default values for arguments
#
# simple 1d dataset: x is feature, y is target
#
DEF_X_DATA = [1.00, 2.00, 3.00, 4.00, 5.00, 6.00]
DEF_Y_DATA = [5.55, 7.27, 8.11, 15.32, 17.59, 19.01]
DEF_LEARNING_RATE = 0.97
DEF_NUM_TREES = 10

#------------------------------------------------------------------------------
#
# functions are listed here
#
#------------------------------------------------------------------------------

# calculate the mean of a list of numbers
#
def calculate_mean(data):
    """
    method: calculate_mean

    arguments:
     data: list of numbers

    return:
     mean: the arithmetic mean of the numbers (0.0 for an empty list)

    description:
     Calculate the mean of a list of numbers. An empty list yields 0.0
     rather than raising ZeroDivisionError, since callers use this on
     possibly-empty partitions.
    """

    # check for empty list
    #
    if not data:
        return 0.0

    # compute sum divided by count; float() guards against integer
    # division if callers ever pass ints on Python 2-style code
    #
    mean = float(sum(data)) / len(data)

    # exit gracefully
    #
    return mean

# calculate the mean squared error (mse)
#
def calculate_mse(actual_y, predicted_y):
    """
    method: calculate_mse

    arguments:
     actual_y: list of actual target values
     predicted_y: list of predicted values

    return:
     mse: the mean squared error between actual and predicted
          (0.0 when either list is empty)

    description:
     Calculate the mean squared error (MSE) between actual and predicted
     values. Only pairs present in both lists are compared; the mean is
     taken over the number of compared pairs, so a length mismatch does
     not skew the denominator.
    """

    # check for empty lists
    #
    if not actual_y or not predicted_y:
        return 0.0

    # squared differences over the paired elements
    #
    sq_diffs = [(a - p) ** 2 for a, p in zip(actual_y, predicted_y)]

    # divide by the number of pairs actually compared (bug fix: the
    # previous version divided by len(actual_y) even when predicted_y
    # was shorter)
    #
    mse = sum(sq_diffs) / len(sq_diffs)

    # exit gracefully
    #
    return mse

# calculate the residual errors (actual - predicted)
#
def calculate_residuals(actual_y, predicted_y):
    """
    method: calculate_residuals

    arguments:
     actual_y: list of actual target values
     predicted_y: list of predicted values

    return:
     res: list of residual errors (actual - predicted)

    description:
     Calculate the residual errors by subtracting predicted from actual
     values, element-wise. For squared-error loss these residuals are the
     negative gradient that each boosting stage fits.
    """

    # element-wise difference over paired values
    #
    res = [a - p for a, p in zip(actual_y, predicted_y)]

    # exit gracefully
    #
    return res

# find the best split for a simple decision (1-level tree)
#
def fit_decision(x_data, y_residuals):
    """
    method: fit_decision

    arguments:
     x_data: list of feature values (assumed sorted ascending so that
             midpoints between neighbors are valid thresholds -- TODO
             confirm with callers)
     y_residuals: list of residual values to fit

    return:
     best_split: optimal threshold value for splitting (None when fewer
                 than two data points are supplied)
     best_left_mean: mean prediction for left partition (x <= split)
     best_right_mean: mean prediction for right partition (x > split)

    description:
     Find the best split point for a decision stump by minimizing the sum
     of squared errors (SSE) across all candidate thresholds, which are
     the midpoints between consecutive x values.
    """

    # guard: with fewer than two points there is no candidate split;
    # return the documented (None, 0.0, 0.0) contract explicitly instead
    # of falling through (a None split would crash predict())
    #
    if len(x_data) < 2:
        return None, 0.0, 0.0

    # initialize best split tracking variables
    #
    best_sse = float('inf')
    best_split = None
    best_left_mean = 0.0
    best_right_mean = 0.0

    # try splitting at the midpoint between each consecutive x value
    #
    for i in range(len(x_data) - 1):

        # calculate the candidate split threshold
        #
        split_val = (x_data[i] + x_data[i + 1]) / 2.0

        # partition the residuals based on the split
        #
        left_res = [r for j, r in enumerate(y_residuals) if x_data[j] <= split_val]
        right_res = [r for j, r in enumerate(y_residuals) if x_data[j] > split_val]

        # each partition predicts the mean of its residuals (the SSE
        # minimizer for a constant prediction)
        #
        left_mean = calculate_mean(left_res)
        right_mean = calculate_mean(right_res)

        # calculate sum of squared errors (sse) for this split
        #
        left_sse = sum((r - left_mean) ** 2 for r in left_res)
        right_sse = sum((r - right_mean) ** 2 for r in right_res)
        total_sse = left_sse + right_sse

        # update best split if current sse is lower
        #
        if total_sse < best_sse:
            best_sse = total_sse
            best_split = split_val
            best_left_mean = left_mean
            best_right_mean = right_mean

    # exit gracefully
    #
    return best_split, best_left_mean, best_right_mean

# predict using a trained decision stump
#
def predict(x_data, split_val, left_val, right_val):
    """
    method: predict

    arguments:
     x_data: list of feature values to predict
     split_val: threshold value for the decision split
     left_val: prediction value for x <= split_val
     right_val: prediction value for x > split_val

    return:
     predictions: list of predicted values, one per input

    description:
     Generate predictions using a trained decision stump by comparing each
     feature value against the split threshold.
    """

    # left value if at or below the split, right value otherwise
    #
    predictions = [left_val if x <= split_val else right_val for x in x_data]

    # exit gracefully
    #
    return predictions

# function: main
#
def main(argv):
    """
    method: main

    arguments:
     argv: command line arguments (currently unused)

    return:
     True: indicates successful execution

    description:
     Main entry point that demonstrates gradient boosting by iteratively
     fitting decision stumps to residuals and tracking MSE improvement.
    """

    # initialize dataset and hyperparameters
    #
    x = DEF_X_DATA
    y = DEF_Y_DATA
    lr = DEF_LEARNING_RATE
    num_trees = DEF_NUM_TREES

    print("Starting conceptual XGBoost with MSE Tracking...")
    print("Feature (X):", x)
    print("Target (Y):", y)
    print("-" * 50)

    # initialize predictions with the mean of the target (the best
    # constant model under squared-error loss)
    #
    initial_pred = calculate_mean(y)
    current_preds = [initial_pred] * len(y)

    # calculate and display initial mse
    #
    initial_mse = calculate_mse(y, current_preds)
    print("Initial Base Prediction (Mean): %.2f" % initial_pred)
    print("Initial MSE: %.2f" % initial_mse)
    print("-" * 50)

    # sequentially build trees to correct residuals
    #
    for step in range(num_trees):

        # calculate current residuals (the gradient to fit)
        #
        residuals = calculate_residuals(y, current_preds)

        # fit a new decision stump to the residuals
        #
        split, left_m, right_m = fit_decision(x, residuals)

        # generate predictions
        #
        stump_preds = predict(x, split, left_m, right_m)

        # update our overall predictions, damped by the learning rate
        #
        for i in range(len(current_preds)):
            current_preds[i] = current_preds[i] + (lr * stump_preds[i])

        # calculate the new mse after updating predictions
        #
        current_mse = calculate_mse(y, current_preds)

        # output the progress for this iteration
        #
        print("Tree %d:" % (step + 1))
        print(" Targeting Residuals: [%s]" % ", ".join(["%.2f" % r for r in residuals]))
        print(" Split X at: %.2f (Left update: %.2f, Right update: %.2f)" % (split, left_m, right_m))
        print(" Updated Predictions: [%s]" % ", ".join(["%.2f" % p for p in current_preds]))
        print(" Current MSE: %.2f" % current_mse)
        print("-" * 50)

    # exit gracefully
    #
    return True

# begin gracefully
#
if __name__ == '__main__':
    main(sys.argv[1:])

#
# end of file