#!/usr/bin/env python
#
# file: ensemble_resampling_demo.py
#
# revision history:
#
# 20260309 (SP): implement jackknife, bootstrap, and classifier combination
#------------------------------------------------------------------------------

# import system modules
#
import os
import sys
import random

#------------------------------------------------------------------------------
#
# global variables are listed here
#
#------------------------------------------------------------------------------

# set the filename using basename
#
__FILE__ = os.path.basename(__file__)

# define default values for arguments
#
DEF_C1_PREDS = ['cat', 'dog', 'cat', 'cat', 'cat']
DEF_C2_PREDS = ['cat', 'dog', 'dog', 'dog', 'cat']
DEF_C3_PREDS = ['dog', 'dog', 'dog', 'cat', 'cat']
DEF_DATA_SET = [10.00, 20.00, 30.00, 40.00, 50.00]
DEF_NUM_BOOTSTRAPS = int(100)
DEF_RANDOM_SEED = 27

#------------------------------------------------------------------------------
#
# functions are listed here
#
#------------------------------------------------------------------------------

# calculate the mean of a list of numbers
#
def calculate_mean(data):
    """
    method: calculate_mean

    arguments:
     data: list of numbers

    return:
     mean: the mean of the numbers

    description:
     Calculate the arithmetic mean of a list of numbers. An empty list
     yields 0.0 rather than raising a division error.
    """

    # check for empty list
    #
    if not data:
        return 0.0

    # compute the mean: sum divided by the number of elements
    #
    mean = float(sum(data)) / len(data)

    # exit gracefully
    #
    return mean

# calculate the variance of a list of numbers given a mean
#
def calculate_variance(data, mean_val):
    """
    method: calculate_variance

    arguments:
     data: list of numbers
     mean_val: the mean of the numbers

    return:
     variance: the sample variance of the numbers

    description:
     Calculate the sample variance (n - 1 denominator) of a list of
     numbers given their precomputed mean. Lists with fewer than two
     elements yield 0.0 since sample variance is undefined for them.
    """

    # check for empty or single item list
    #
    n = len(data)
    if n < 2:
        return 0.0

    # compute the sum of squared differences from the mean
    #
    sq_diffs = [(x - mean_val) ** 2 for x in data]

    # calculate the sample variance using the (n - 1) denominator
    #
    sample_var = sum(sq_diffs) / (n - 1)

    # exit gracefully
    #
    return sample_var

# perform jackknife estimation for the mean and variance
#
def jackknife_estimate(data):
    """
    method: jackknife_estimate

    arguments:
     data: list of numbers

    return:
     jack_mean: the jackknife estimate of the mean
     jack_var: the jackknife estimate of the variance

    description:
     Perform jackknife estimation for the mean and variance of a dataset.
     The jackknife is a resampling technique that estimates the mean and
     variance of a population by systematically leaving out one
     observation at a time and computing statistics on the remaining
     data. This method is useful for bias reduction and variance
     estimation, particularly when the underlying distribution is
     unknown.
    """

    # get the number of samples
    #
    n = len(data)

    # guard against an empty dataset: the variance scaling factor below
    # divides by n, which would raise ZeroDivisionError
    #
    if n == 0:
        return 0.0, 0.0

    # store the leave-one-out means
    #
    loo_means = []

    # iterate through each data point to leave it out
    #
    for i in range(n):

        # create a subset excluding the current index
        #
        subset = data[:i] + data[i+1:]

        # calculate and store the mean of the subset
        #
        subset_mean = calculate_mean(subset)
        loo_means.append(subset_mean)

    # calculate the overall jackknife mean
    #
    jack_mean = calculate_mean(loo_means)

    # calculate the jackknife variance
    #
    # standard formula for jackknife variance involves multiplying
    # the sum of squared differences by (n-1)/n
    #
    sq_diffs = [(m - jack_mean) ** 2 for m in loo_means]
    jack_var = ((n - 1) / float(n)) * sum(sq_diffs)

    # exit gracefully
    #
    return jack_mean, jack_var

# perform bootstrap estimation for the mean and variance
#
def bootstrap_estimate(data, num_bootstraps, seed=DEF_RANDOM_SEED):
    """
    method: bootstrap_estimate

    arguments:
     data: list of numbers to estimate
     num_bootstraps: number of bootstrap samples to generate
     seed: random seed for reproducibility (default: DEF_RANDOM_SEED)

    return:
     boot_mean: bootstrap estimate of the mean
     boot_var: bootstrap estimate of the variance

    description:
     Perform bootstrap resampling to estimate the mean and variance by
     drawing samples with replacement from the original data. An empty
     dataset yields (0.0, 0.0) rather than raising an IndexError.
    """

    # set the random seed for repeatability
    #
    random.seed(int(seed))

    # get the number of samples
    #
    n = len(data)

    # guard against an empty dataset: random.choice raises IndexError
    # on an empty sequence
    #
    if n == 0:
        return 0.0, 0.0

    # store the bootstrap sample means
    #
    boot_means = []

    # perform resampling for the specified number of bootstraps
    #
    for _ in range(num_bootstraps):

        # draw n samples with replacement
        #
        sample = [random.choice(data) for _ in range(n)]

        # calculate and store the mean of the sample
        #
        sample_mean = calculate_mean(sample)
        boot_means.append(sample_mean)

    # calculate the overall bootstrap mean
    #
    boot_mean = calculate_mean(boot_means)

    # calculate the bootstrap variance
    #
    boot_var = calculate_variance(boot_means, boot_mean)

    # exit gracefully
    #
    return boot_mean, boot_var

# combine three classifiers using majority voting
#
def combine_classifiers(preds1, preds2, preds3):
    """
    method: combine_classifiers

    arguments:
     preds1: list of predictions from classifier 1
     preds2: list of predictions from classifier 2
     preds3: list of predictions from classifier 3

    return:
     combined_preds: list of combined predictions using majority vote,
     or None if the input lists differ in length

    description:
     Combine predictions from three classifiers using majority voting to
     determine the final prediction for each instance. Three-way ties
     are resolved deterministically in favor of classifier 1's vote.
    """

    # verify that all prediction lists are the same length
    #
    if not (len(preds1) == len(preds2) == len(preds3)):
        print("Error: prediction lists must be equal in length")
        return None

    # store the final combined predictions
    #
    combined_preds = []

    # iterate through the predictions element by element
    #
    for p1, p2, p3 in zip(preds1, preds2, preds3):

        # collect votes for the current instance
        #
        votes = [p1, p2, p3]

        # determine the majority vote: iterating over the ordered vote
        # list (rather than a set) makes three-way ties resolve
        # deterministically to the first classifier's prediction
        #
        majority_vote = max(votes, key=votes.count)
        combined_preds.append(majority_vote)

    # exit gracefully
    #
    return combined_preds

# function: main
#
def main(argv):
    """
    method: main

    arguments:
     argv: command line arguments

    return:
     True: indicates successful execution

    description:
     Main entry point that demonstrates jackknife estimation, bootstrap
     estimation, and classifier combination using majority voting.
    """

    # define the initial dataset
    #
    data = DEF_DATA_SET

    # print header
    #
    print("Starting ensemble and resampling demonstrations...")
    print("Original data:", data)
    print("Original mean:", calculate_mean(data))
    print("-" * 50)

    # demonstrate jackknifing
    #
    print("1. Jackknife Estimation")
    jack_mean, jack_var = jackknife_estimate(data)
    print("  Jackknife mean estimate: %.2f" % jack_mean)
    print("  Jackknife variance estimate: %.2f" % jack_var)
    print("-" * 50)

    # demonstrate bootstrapping
    #
    print("2. Bootstrap Estimation")
    boot_mean, boot_var = bootstrap_estimate(data, DEF_NUM_BOOTSTRAPS)
    print("  Bootstrap mean estimate (%d reps): %.2f" %
          (DEF_NUM_BOOTSTRAPS, boot_mean))
    print("  Bootstrap variance estimate: %.2f" % boot_var)
    print("-" * 50)

    # demonstrate classifier combination
    #
    print("3. Combining Classifiers (Majority Vote)")

    # dummy predictions for 5 instances from 3 classifiers
    #
    c1_preds = DEF_C1_PREDS
    c2_preds = DEF_C2_PREDS
    c3_preds = DEF_C3_PREDS

    print("  Classifier 1 predictions: %s" % c1_preds)
    print("  Classifier 2 predictions: %s" % c2_preds)
    print("  Classifier 3 predictions: %s\n" % c3_preds)

    # combine the predictions
    #
    final_preds = combine_classifiers(c1_preds, c2_preds, c3_preds)
    print("  Combined voting result: %s" % final_preds)
    print("-" * 50)

    # exit gracefully
    #
    return True

# begin gracefully
#
if __name__ == '__main__':
    main(sys.argv[0:])

#
# end of file