#!/usr/bin/env python
#
# file: xgboost.py
#
# NOTE(review): this module name shadows the third-party "xgboost" package;
# rename the file if that library is ever imported alongside this demo.
#
# revision history:
#
# 20260309 (SP): implement gradient boosting with mse tracking
#------------------------------------------------------------------------------
# import system modules
#
import os
import sys

#------------------------------------------------------------------------------
#
# global variables are listed here
#
#------------------------------------------------------------------------------

# set the filename using basename
#
__FILE__ = os.path.basename(__file__)

# define default values for arguments
#
# simple 1d dataset: x is feature, y is target
#
DEF_X_DATA = [1.00, 2.00, 3.00, 4.00, 5.00, 6.00]
DEF_Y_DATA = [5.55, 7.27, 8.11, 15.32, 17.59, 19.01]
DEF_LEARNING_RATE = 0.97
DEF_NUM_TREES = 10

#------------------------------------------------------------------------------
#
# functions are listed here
#
#------------------------------------------------------------------------------

# calculate the mean of a list of numbers
#
def calculate_mean(data):
    """
    method: calculate_mean

    arguments:
     data: list of numbers

    return:
     mean: the arithmetic mean of the numbers (0.0 for an empty list)

    description:
     Calculate the mean of a list of numbers. An empty list yields 0.0
     rather than raising ZeroDivisionError, since callers use this on
     possibly-empty partitions.
    """

    # check for empty list
    #
    if not data:
        return 0.0

    # compute sum divided by count; float() guards against integer
    # division if callers ever pass ints on Python 2-style code
    #
    mean = float(sum(data)) / len(data)

    # exit gracefully
    #
    return mean

# calculate the mean squared error (mse)
#
def calculate_mse(actual_y, predicted_y):
    """
    method: calculate_mse

    arguments:
     actual_y: list of actual target values
     predicted_y: list of predicted values

    return:
     mse: the mean squared error between actual and predicted
          (0.0 when either list is empty)

    description:
     Calculate the mean squared error (MSE) between actual and predicted
     values. Only pairs present in both lists are compared; the mean is
     taken over the number of compared pairs, so a length mismatch does
     not skew the denominator.
    """

    # check for empty lists
    #
    if not actual_y or not predicted_y:
        return 0.0

    # squared differences over the paired elements
    #
    sq_diffs = [(a - p) ** 2 for a, p in zip(actual_y, predicted_y)]

    # divide by the number of pairs actually compared (bug fix: the
    # previous version divided by len(actual_y) even when predicted_y
    # was shorter)
    #
    mse = sum(sq_diffs) / len(sq_diffs)

    # exit gracefully
    #
    return mse

# calculate the residual errors (actual - predicted)
#
def calculate_residuals(actual_y, predicted_y):
    """
    method: calculate_residuals

    arguments:
     actual_y: list of actual target values
     predicted_y: list of predicted values

    return:
     res: list of residual errors (actual - predicted)

    description:
     Calculate the residual errors by subtracting predicted from actual
     values, element-wise. For squared-error loss these residuals are the
     negative gradient that each boosting stage fits.
    """

    # element-wise difference over paired values
    #
    res = [a - p for a, p in zip(actual_y, predicted_y)]

    # exit gracefully
    #
    return res

# find the best split for a simple decision (1-level tree)
#
def fit_decision(x_data, y_residuals):
    """
    method: fit_decision

    arguments:
     x_data: list of feature values (assumed sorted ascending so that
             midpoints between neighbors are valid thresholds -- TODO
             confirm with callers)
     y_residuals: list of residual values to fit

    return:
     best_split: optimal threshold value for splitting (None when fewer
                 than two data points are supplied)
     best_left_mean: mean prediction for left partition (x <= split)
     best_right_mean: mean prediction for right partition (x > split)

    description:
     Find the best split point for a decision stump by minimizing the sum
     of squared errors (SSE) across all candidate thresholds, which are
     the midpoints between consecutive x values.
    """

    # guard: with fewer than two points there is no candidate split;
    # return the documented (None, 0.0, 0.0) contract explicitly instead
    # of falling through (a None split would crash predict())
    #
    if len(x_data) < 2:
        return None, 0.0, 0.0

    # initialize best split tracking variables
    #
    best_sse = float('inf')
    best_split = None
    best_left_mean = 0.0
    best_right_mean = 0.0

    # try splitting at the midpoint between each consecutive x value
    #
    for i in range(len(x_data) - 1):

        # calculate the candidate split threshold
        #
        split_val = (x_data[i] + x_data[i + 1]) / 2.0

        # partition the residuals based on the split
        #
        left_res = [r for j, r in enumerate(y_residuals) if x_data[j] <= split_val]
        right_res = [r for j, r in enumerate(y_residuals) if x_data[j] > split_val]

        # each partition predicts the mean of its residuals (the SSE
        # minimizer for a constant prediction)
        #
        left_mean = calculate_mean(left_res)
        right_mean = calculate_mean(right_res)

        # calculate sum of squared errors (sse) for this split
        #
        left_sse = sum((r - left_mean) ** 2 for r in left_res)
        right_sse = sum((r - right_mean) ** 2 for r in right_res)
        total_sse = left_sse + right_sse

        # update best split if current sse is lower
        #
        if total_sse < best_sse:
            best_sse = total_sse
            best_split = split_val
            best_left_mean = left_mean
            best_right_mean = right_mean

    # exit gracefully
    #
    return best_split, best_left_mean, best_right_mean

# predict using a trained decision stump
#
def predict(x_data, split_val, left_val, right_val):
    """
    method: predict

    arguments:
     x_data: list of feature values to predict
     split_val: threshold value for the decision split
     left_val: prediction value for x <= split_val
     right_val: prediction value for x > split_val

    return:
     predictions: list of predicted values, one per input

    description:
     Generate predictions using a trained decision stump by comparing each
     feature value against the split threshold.
    """

    # left value if at or below the split, right value otherwise
    #
    predictions = [left_val if x <= split_val else right_val for x in x_data]

    # exit gracefully
    #
    return predictions

# function: main
#
def main(argv):
    """
    method: main

    arguments:
     argv: command line arguments (currently unused)

    return:
     True: indicates successful execution

    description:
     Main entry point that demonstrates gradient boosting by iteratively
     fitting decision stumps to residuals and tracking MSE improvement.
    """

    # initialize dataset and hyperparameters
    #
    x = DEF_X_DATA
    y = DEF_Y_DATA
    lr = DEF_LEARNING_RATE
    num_trees = DEF_NUM_TREES

    print("Starting conceptual XGBoost with MSE Tracking...")
    print("Feature (X):", x)
    print("Target (Y):", y)
    print("-" * 50)

    # initialize predictions with the mean of the target (the best
    # constant model under squared-error loss)
    #
    initial_pred = calculate_mean(y)
    current_preds = [initial_pred] * len(y)

    # calculate and display initial mse
    #
    initial_mse = calculate_mse(y, current_preds)
    print("Initial Base Prediction (Mean): %.2f" % initial_pred)
    print("Initial MSE: %.2f" % initial_mse)
    print("-" * 50)

    # sequentially build trees to correct residuals
    #
    for step in range(num_trees):

        # calculate current residuals (the gradient to fit)
        #
        residuals = calculate_residuals(y, current_preds)

        # fit a new decision stump to the residuals
        #
        split, left_m, right_m = fit_decision(x, residuals)

        # generate predictions
        #
        stump_preds = predict(x, split, left_m, right_m)

        # update our overall predictions, damped by the learning rate
        #
        for i in range(len(current_preds)):
            current_preds[i] = current_preds[i] + (lr * stump_preds[i])

        # calculate the new mse after updating predictions
        #
        current_mse = calculate_mse(y, current_preds)

        # output the progress for this iteration
        #
        print("Tree %d:" % (step + 1))
        print(" Targeting Residuals: [%s]" % ", ".join(["%.2f" % r for r in residuals]))
        print(" Split X at: %.2f (Left update: %.2f, Right update: %.2f)" % (split, left_m, right_m))
        print(" Updated Predictions: [%s]" % ", ".join(["%.2f" % p for p in current_preds]))
        print(" Current MSE: %.2f" % current_mse)
        print("-" * 50)

    # exit gracefully
    #
    return True

# begin gracefully
#
if __name__ == '__main__':
    main(sys.argv[1:])

#
# end of file