#!/usr/bin/env python
#
# file: ensemble_resampling_demo.py
#
# revision history:
#
# 20260309 (SP): implement jackknife, bootstrap, and classifier combination
#------------------------------------------------------------------------------

# import system modules
#
import os
import sys
import random

#------------------------------------------------------------------------------
#
# global variables are listed here
#
#------------------------------------------------------------------------------

# set the filename using basename
#
__FILE__ = os.path.basename(__file__)

# define default values for arguments
#
DEF_C1_PREDS = ['cat', 'dog', 'cat', 'cat', 'cat']
DEF_C2_PREDS = ['cat', 'dog', 'dog', 'dog', 'cat']
DEF_C3_PREDS = ['dog', 'dog', 'dog', 'cat', 'cat']
DEF_DATA_SET = [10.00, 20.00, 30.00, 40.00, 50.00]
DEF_NUM_BOOTSTRAPS = int(100)
DEF_RANDOM_SEED = 27

#------------------------------------------------------------------------------
#
# functions are listed here
#
#------------------------------------------------------------------------------

# calculate the mean of a list of numbers
#
def calculate_mean(data):
    """
    method: calculate_mean

    arguments:
     data: list of numbers

    return:
     mean: the mean of the numbers

    description:
     Calculate the arithmetic mean of a list of numbers. An empty list
     yields 0.0 rather than raising a division error.
    """

    # check for empty list
    #
    if not data:
        return 0.0

    # compute the mean: sum divided by the number of elements
    #
    mean = float(sum(data)) / len(data)

    # exit gracefully
    #
    return mean

# calculate the variance of a list of numbers given a mean
#
def calculate_variance(data, mean_val):
    """
    method: calculate_variance

    arguments:
     data: list of numbers
     mean_val: the mean of the numbers

    return:
     variance: the sample variance of the numbers

    description:
     Calculate the sample variance (n - 1 denominator) of a list of
     numbers given their precomputed mean. Lists with fewer than two
     elements yield 0.0 since sample variance is undefined for them.
    """

    # check for empty or single item list
    #
    n = len(data)
    if n < 2:
        return 0.0

    # compute the sum of squared differences from the mean
    #
    sq_diffs = [(x - mean_val) ** 2 for x in data]

    # calculate the sample variance using the (n - 1) denominator
    #
    sample_var = sum(sq_diffs) / (n - 1)

    # exit gracefully
    #
    return sample_var

# perform jackknife estimation for the mean and variance
#
def jackknife_estimate(data):
    """
    method: jackknife_estimate

    arguments:
     data: list of numbers

    return:
     jack_mean: the jackknife estimate of the mean
     jack_var: the jackknife estimate of the variance

    description:
     Perform jackknife estimation for the mean and variance of a dataset.
     The jackknife is a resampling technique that estimates the mean and
     variance of a population by systematically leaving out one
     observation at a time and computing statistics on the remaining
     data. This method is useful for bias reduction and variance
     estimation, particularly when the underlying distribution is
     unknown.
    """

    # get the number of samples
    #
    n = len(data)

    # guard against an empty dataset: the variance scaling factor below
    # divides by n, which would raise ZeroDivisionError
    #
    if n == 0:
        return 0.0, 0.0

    # store the leave-one-out means
    #
    loo_means = []

    # iterate through each data point to leave it out
    #
    for i in range(n):

        # create a subset excluding the current index
        #
        subset = data[:i] + data[i+1:]

        # calculate and store the mean of the subset
        #
        subset_mean = calculate_mean(subset)
        loo_means.append(subset_mean)

    # calculate the overall jackknife mean
    #
    jack_mean = calculate_mean(loo_means)

    # calculate the jackknife variance
    #
    # standard formula for jackknife variance involves multiplying
    # the sum of squared differences by (n-1)/n
    #
    sq_diffs = [(m - jack_mean) ** 2 for m in loo_means]
    jack_var = ((n - 1) / float(n)) * sum(sq_diffs)

    # exit gracefully
    #
    return jack_mean, jack_var

# perform bootstrap estimation for the mean and variance
#
def bootstrap_estimate(data, num_bootstraps, seed=DEF_RANDOM_SEED):
    """
    method: bootstrap_estimate

    arguments:
     data: list of numbers to estimate
     num_bootstraps: number of bootstrap samples to generate
     seed: random seed for reproducibility (default: DEF_RANDOM_SEED)

    return:
     boot_mean: bootstrap estimate of the mean
     boot_var: bootstrap estimate of the variance

    description:
     Perform bootstrap resampling to estimate the mean and variance by
     drawing samples with replacement from the original data. An empty
     dataset yields (0.0, 0.0) rather than raising an IndexError.
    """

    # set the random seed for repeatability
    #
    random.seed(int(seed))

    # get the number of samples
    #
    n = len(data)

    # guard against an empty dataset: random.choice raises IndexError
    # on an empty sequence
    #
    if n == 0:
        return 0.0, 0.0

    # store the bootstrap sample means
    #
    boot_means = []

    # perform resampling for the specified number of bootstraps
    #
    for _ in range(num_bootstraps):

        # draw n samples with replacement
        #
        sample = [random.choice(data) for _ in range(n)]

        # calculate and store the mean of the sample
        #
        sample_mean = calculate_mean(sample)
        boot_means.append(sample_mean)

    # calculate the overall bootstrap mean
    #
    boot_mean = calculate_mean(boot_means)

    # calculate the bootstrap variance
    #
    boot_var = calculate_variance(boot_means, boot_mean)

    # exit gracefully
    #
    return boot_mean, boot_var

# combine three classifiers using majority voting
#
def combine_classifiers(preds1, preds2, preds3):
    """
    method: combine_classifiers

    arguments:
     preds1: list of predictions from classifier 1
     preds2: list of predictions from classifier 2
     preds3: list of predictions from classifier 3

    return:
     combined_preds: list of combined predictions using majority vote,
     or None if the input lists differ in length

    description:
     Combine predictions from three classifiers using majority voting to
     determine the final prediction for each instance. Three-way ties
     are resolved deterministically in favor of classifier 1's vote.
    """

    # verify that all prediction lists are the same length
    #
    if not (len(preds1) == len(preds2) == len(preds3)):
        print("Error: prediction lists must be equal in length")
        return None

    # store the final combined predictions
    #
    combined_preds = []

    # iterate through the predictions element by element
    #
    for p1, p2, p3 in zip(preds1, preds2, preds3):

        # collect votes for the current instance
        #
        votes = [p1, p2, p3]

        # determine the majority vote: iterating over the ordered vote
        # list (rather than a set) makes three-way ties resolve
        # deterministically to the first classifier's prediction
        #
        majority_vote = max(votes, key=votes.count)
        combined_preds.append(majority_vote)

    # exit gracefully
    #
    return combined_preds

# function: main
#
def main(argv):
    """
    method: main

    arguments:
     argv: command line arguments

    return:
     True: indicates successful execution

    description:
     Main entry point that demonstrates jackknife estimation, bootstrap
     estimation, and classifier combination using majority voting.
    """

    # define the initial dataset
    #
    data = DEF_DATA_SET

    # print header
    #
    print("Starting ensemble and resampling demonstrations...")
    print("Original data:", data)
    print("Original mean:", calculate_mean(data))
    print("-" * 50)

    # demonstrate jackknifing
    #
    print("1. Jackknife Estimation")
    jack_mean, jack_var = jackknife_estimate(data)
    print("  Jackknife mean estimate: %.2f" % jack_mean)
    print("  Jackknife variance estimate: %.2f" % jack_var)
    print("-" * 50)

    # demonstrate bootstrapping
    #
    print("2. Bootstrap Estimation")
    boot_mean, boot_var = bootstrap_estimate(data, DEF_NUM_BOOTSTRAPS)
    print("  Bootstrap mean estimate (%d reps): %.2f" %
          (DEF_NUM_BOOTSTRAPS, boot_mean))
    print("  Bootstrap variance estimate: %.2f" % boot_var)
    print("-" * 50)

    # demonstrate classifier combination
    #
    print("3. Combining Classifiers (Majority Vote)")

    # dummy predictions for 5 instances from 3 classifiers
    #
    c1_preds = DEF_C1_PREDS
    c2_preds = DEF_C2_PREDS
    c3_preds = DEF_C3_PREDS

    print("  Classifier 1 predictions: %s" % c1_preds)
    print("  Classifier 2 predictions: %s" % c2_preds)
    print("  Classifier 3 predictions: %s\n" % c3_preds)

    # combine the predictions
    #
    final_preds = combine_classifiers(c1_preds, c2_preds, c3_preds)
    print("  Combined voting result: %s" % final_preds)
    print("-" * 50)

    # exit gracefully
    #
    return True

# begin gracefully
#
if __name__ == '__main__':
    main(sys.argv[0:])

#
# end of file