#!/usr/bin/env python
#
# file: rudimentary_probabilities.py
#
# description:
#  This script provides rudimentary probability calculations for information
#  theory concepts such as Entropy, Joint Entropy, Conditional Entropy,
#  and Mutual Information as outlined in Lecture 16.
#
# revision history:
#  20260225 (AA): initial version
#------------------------------------------------------------------------------

# import system modules
#
import os
import sys

import numpy as np

#------------------------------------------------------------------------------
#
# global variables are listed here
#
#------------------------------------------------------------------------------

# set the filename using basename
#
__FILE__ = os.path.basename(__file__)

# define numerical stability constant to avoid log2(0)
#
DEF_EPS = 1.0e-12

#------------------------------------------------------------------------------
#
# functions are listed here
#
#------------------------------------------------------------------------------

# compute the entropy of a 1D probability distribution
#
def compute_entropy(p_x):
    """
    method: compute_entropy

    arguments:
     p_x: a 1D numpy array representing a marginal probability
          distribution P(X)

    return:
     entropy: the entropy H(X) in bits (float)

    description:
     Calculates the Shannon entropy H(X) = - sum( p(x) * log2(p(x)) ).
     Zero-probability entries contribute 0 bits: the log argument is
     clamped to DEF_EPS, but the multiplier p(x) = 0 zeroes the term.
    """

    # clamp probabilities to avoid log(0)
    #
    p_safe = np.maximum(p_x, DEF_EPS)

    # calculate entropy: multiply by the *unclamped* p_x so that
    # zero-probability outcomes contribute exactly zero
    #
    entropy = -np.sum(p_x * np.log2(p_safe))

    # exit gracefully
    #
    return float(entropy)

# compute the joint entropy of a 2D probability distribution
#
def compute_joint_entropy(p_xy):
    """
    method: compute_joint_entropy

    arguments:
     p_xy: a 2D numpy array representing a joint probability
           distribution P(X,Y)

    return:
     j_entropy: the joint entropy H(X,Y) in bits (float)

    description:
     Calculates the joint entropy
     H(X,Y) = - sum(sum( p(x,y) * log2(p(x,y)) )).
    """

    # clamp probabilities
    #
    p_safe = np.maximum(p_xy, DEF_EPS)

    # calculate joint entropy over the flattened 2D array
    #
    j_entropy = -np.sum(p_xy * np.log2(p_safe))

    # exit gracefully
    #
    return float(j_entropy)

# compute the conditional entropy H(Y|X)
#
def compute_conditional_entropy_y_given_x(p_xy):
    """
    method: compute_conditional_entropy_y_given_x

    arguments:
     p_xy: a 2D numpy array representing a joint probability
           distribution P(X,Y) where X is rows (axis 0) and Y is
           columns (axis 1)

    return:
     cond_entropy: the conditional entropy H(Y|X) in bits (float)

    description:
     Calculates H(Y|X) = - sum(sum( p(x,y) * log2(p(y|x)) )).
     We find p(y|x) by dividing the joint p(x,y) by the marginal p(x).
    """

    # get the marginal distribution P(X) by summing over Y (columns)
    #
    p_x = np.sum(p_xy, axis=1)

    # reshape P(X) to a column vector to allow broadcasting division;
    # clamp so rows with P(x) = 0 do not divide by zero
    #
    p_x_col = np.maximum(p_x.reshape(-1, 1), DEF_EPS)

    # compute P(Y|X) = P(X,Y) / P(X)
    #
    p_y_given_x = p_xy / p_x_col

    # clamp conditional probabilities
    #
    p_y_given_x_safe = np.maximum(p_y_given_x, DEF_EPS)

    # calculate conditional entropy: weight by the joint p(x,y)
    #
    cond_entropy = -np.sum(p_xy * np.log2(p_y_given_x_safe))

    # exit gracefully
    #
    return float(cond_entropy)

# compute the mutual information I(X;Y)
#
def compute_mutual_information(p_xy):
    """
    method: compute_mutual_information

    arguments:
     p_xy: a 2D numpy array representing a joint probability
           distribution P(X,Y)

    return:
     mi: the mutual information I(X;Y) in bits (float)

    description:
     Calculates
     I(X;Y) = sum(sum( p(x,y) * log2( p(x,y) / (p(x)*p(y)) ) )).
    """

    # calculate marginals P(X) and P(Y)
    #
    p_x = np.sum(p_xy, axis=1)
    p_y = np.sum(p_xy, axis=0)

    # compute the independent joint distribution P(X)*P(Y) using
    # the outer product
    #
    p_x_times_p_y = np.outer(p_x, p_y)

    # clamp arrays to prevent log(0) and division by zero
    #
    p_xy_safe = np.maximum(p_xy, DEF_EPS)
    p_x_times_p_y_safe = np.maximum(p_x_times_p_y, DEF_EPS)

    # calculate mutual information: weight by the unclamped joint so
    # zero-probability cells contribute exactly zero
    #
    mi = np.sum(p_xy * np.log2(p_xy_safe / p_x_times_p_y_safe))

    # exit gracefully
    #
    return float(mi)

# function: main
#
def main(argv):
    """
    method: main

    arguments:
     argv: command-line arguments after the program name (unused)

    return:
     a boolean value indicating the status of the demonstration

    description:
     Demonstrates the information theory calculations on a synthetic
     joint distribution and verifies the standard identities relating
     entropy, conditional entropy, joint entropy and mutual information.
    """

    print("--- Information Theory Calculations (Lecture 16) ---")

    # 1. Data Setup: Define a synthetic Joint Probability Matrix P(X,Y)
    #    Rows = X (e.g., 2 classes), Cols = Y (e.g., 3 features)
    #    Make sure it sums to 1.0
    #
    p_xy = np.array([[0.1, 0.2, 0.1],
                     [0.3, 0.1, 0.2]])

    print("\nJoint Probability Matrix P(X,Y):")
    print(p_xy)

    # verify it is a valid probability distribution
    #
    if not np.isclose(np.sum(p_xy), 1.0):
        print("**> Error: Joint probability matrix does not sum to 1.")
        return False

    # 2. Extract Marginal Probabilities
    #
    p_x = np.sum(p_xy, axis=1)
    p_y = np.sum(p_xy, axis=0)

    print("\nMarginal Probabilities:")
    print(" P(X):", p_x)
    print(" P(Y):", p_y)

    # 3. Compute Base Entropies
    #
    h_x = compute_entropy(p_x)
    h_y = compute_entropy(p_y)

    print("\nEntropies:")
    print(" H(X) = %.4f bits" % h_x)
    print(" H(Y) = %.4f bits" % h_y)

    # 4. Compute Joint and Conditional Entropies
    #
    h_xy = compute_joint_entropy(p_xy)
    h_y_given_x = compute_conditional_entropy_y_given_x(p_xy)

    # for H(X|Y), we can just transpose the matrix and reuse the function
    #
    h_x_given_y = compute_conditional_entropy_y_given_x(p_xy.T)

    print("\nJoint & Conditional Entropies:")
    print(" H(X,Y) = %.4f bits" % h_xy)
    print(" H(Y|X) = %.4f bits" % h_y_given_x)
    print(" H(X|Y) = %.4f bits" % h_x_given_y)

    # 5. Compute Mutual Information
    #
    i_xy = compute_mutual_information(p_xy)

    print("\nMutual Information:")
    print(" I(X;Y) = %.4f bits" % i_xy)

    # 6. Verification / Sanity Checks
    #    We prove the mathematical relationships outlined in Lecture 16
    #
    print("\n" + "="*50)
    print("   VERIFICATION OF IDENTITIES (Lecture 16)")
    print("="*50)

    # Identity 1: I(X;Y) = H(X) - H(X|Y)
    #
    id1_val = h_x - h_x_given_y
    print("Identity 1: I(X;Y) == H(X) - H(X|Y)")
    print("   %.4f == %.4f - %.4f (Result: %.4f)" %
          (i_xy, h_x, h_x_given_y, id1_val))

    # Identity 2: I(X;Y) = H(Y) - H(Y|X)
    #
    id2_val = h_y - h_y_given_x
    print("\nIdentity 2: I(X;Y) == H(Y) - H(Y|X)")
    print("   %.4f == %.4f - %.4f (Result: %.4f)" %
          (i_xy, h_y, h_y_given_x, id2_val))

    # Identity 3: I(X;Y) = H(X) + H(Y) - H(X,Y)
    #
    id3_val = h_x + h_y - h_xy
    print("\nIdentity 3: I(X;Y) == H(X) + H(Y) - H(X,Y)")
    print("   %.4f == %.4f + %.4f - %.4f (Result: %.4f)" %
          (i_xy, h_x, h_y, h_xy, id3_val))

    # final checks to confirm the math programmatically: explicit tests
    # are used instead of assert statements, which are silently stripped
    # when Python runs with optimizations enabled (-O)
    #
    for label, val in (("Identity 1", id1_val),
                       ("Identity 2", id2_val),
                       ("Identity 3", id3_val)):
        if not np.isclose(i_xy, val):
            print("**> Error: %s failed!" % label)
            return False

    print("\n>> All identities hold true! Math is correct.")

    # exit gracefully
    #
    return True

# begin gracefully: pass only the arguments after the program name and
# reflect success/failure in the process exit status
#
if __name__ == '__main__':
    sys.exit(0 if main(sys.argv[1:]) else 1)

#
# end of file