#!/usr/bin/env python # # file: em_demo.py # # description: # Demonstrates EM parameter estimation with overlapping Gaussian data. # Includes warning suppression for cleaner output. # #------------------------------------------------------------------------------ import numpy as np import matplotlib.pyplot as plt from sklearn.mixture import GaussianMixture from scipy.stats import norm import warnings from sklearn.exceptions import ConvergenceWarning def main(): # ------------------------------------------------------------------------- # Suppress "did not converge" warnings because we are INTENTIONALLY # stopping early (max_iter=1) to visualize the steps. # ------------------------------------------------------------------------- warnings.filterwarnings("ignore", category=ConvergenceWarning) print("--- EM Parameter Estimation Demo (Robust & Clean) ---") # 1. Generate Overlapping Data # True Means: 0 and 3. True Std: 1. # np.random.seed(42) n_samples = 1000 # Cluster 1 (Mean 0) blob1 = np.random.normal(loc=0.0, scale=1.0, size=(n_samples // 2, 1)) # Cluster 2 (Mean 3) blob2 = np.random.normal(loc=3.0, scale=1.0, size=(n_samples // 2, 1)) X = np.vstack([blob1, blob2]) print(f"Generated {len(X)} points. True Means: [0.0, 3.0]") # 2. Setup GMM with Manual Initialization # Start means at -2 and 8 to force a long convergence path. # bad_means_init = np.array([[-2.0], [8.0]]) gmm = GaussianMixture(n_components=2, means_init=bad_means_init, tol=1e-6, warm_start=True, max_iter=1, verbose=0) # 3. Run EM Step-by-Step # print("\nRunning EM Step-by-Step...") history_log_lik = [] max_steps = 50 converged = False prev_score = -np.inf for i in range(max_steps): # Taking one EM step gmm.fit(X) # Capture current log likelihood current_score = gmm.score(X) * len(X) history_log_lik.append(current_score) print(f" Iter {i+1}: Log-Likelihood = {current_score:.2f}, Means = {gmm.means_.flatten()}") # Check convergence manually change = current_score - prev_score if i > 0 and change < 1e-3: print(f" -> Converged at iteration {i+1}") converged = True break prev_score = current_score # 4. Visualization plot_results(X, gmm, history_log_lik) def plot_results(X, gmm, history): """ Visualizes the histogram, the final fit, and the convergence curve. """ fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(10, 10)) plt.subplots_adjust(hspace=0.4) # --- Plot 1: Histogram vs Final PDF --- ax1.hist(X, bins=50, density=True, alpha=0.4, color='gray', label='Raw Data') # Generate PDF points x_axis = np.linspace(np.min(X)-1, np.max(X)+1, 1000) # Extract learned parameters means = gmm.means_.flatten() stds = np.sqrt(gmm.covariances_.flatten()) weights = gmm.weights_.flatten() # Create individual component PDFs pdf_1 = norm.pdf(x_axis, means[0], stds[0]) pdf_2 = norm.pdf(x_axis, means[1], stds[1]) # Weighted Sum (The GMM Fit) pdf_total = (weights[0] * pdf_1) + (weights[1] * pdf_2) # Plot ax1.plot(x_axis, pdf_1, 'g--', linewidth=1, label=f'Comp 1 (Mean={means[0]:.2f})') ax1.plot(x_axis, pdf_2, 'b--', linewidth=1, label=f'Comp 2 (Mean={means[1]:.2f})') ax1.plot(x_axis, pdf_total, 'r-', linewidth=2, label='Final GMM Fit') ax1.set_title(f'EM Result: Means converged to {means[0]:.2f} and {means[1]:.2f}') ax1.set_xlabel('Data Value') ax1.set_ylabel('Density') ax1.legend() ax1.grid(True, alpha=0.3) # --- Plot 2: Learning Curve --- ax2.plot(range(1, len(history) + 1), history, 'o-', linewidth=2) ax2.set_title('EM Convergence History (Log-Likelihood Maximization)') ax2.set_xlabel('Iteration') ax2.set_ylabel('Log Likelihood') ax2.grid(True) # Force integer ticks on x-axis ax2.locator_params(axis='x', integer=True) print(f"Saving plot to 'em_demo_visualization.png'...") plt.savefig('em_demo_visualization.png') plt.show() if __name__ == "__main__": main()