#!/usr/bin/env python
#
# file: 09_main.py
#
# revision history:
#
# 20260207 (SP): initial version for Gaussian HMM demo on gold_price_usd.csv
#------------------------------------------------------------------------------

# import system modules
#
import os
import sys
import urllib.request
import datetime as dt

# import third-party modules
#
import numpy as np
import matplotlib.pyplot as plt

#------------------------------------------------------------------------------
#
# global variables are listed here
#
#------------------------------------------------------------------------------

# set the filename using basename
#
__FILE__ = os.path.basename(__file__)

# dataset location (gold prices)
#
DEF_DATA_URL = "https://raw.githubusercontent.com/natsunoyuki/Data_Science/refs/heads/master/gold/gold/gold_price_usd.csv"
DEF_DATA_FILE = "gold_price_usd.csv"

# local cache directory
#
DEF_DATA_DIR = "./gold_data"

# numerical stability
#
DEF_EPS = 1.0e-8

# HMM settings
#
DEF_NUM_STATES = 3
DEF_MAX_ITERS = 60
DEF_TOL = 1.0e-4

# train/eval split fraction (time-ordered)
#
DEF_TRAIN_FRAC = 0.80

# output plot
#
DEF_OUT_PLOT = "gold_hmm_demo.png"

#------------------------------------------------------------------------------
# model names (alphabetical)
#------------------------------------------------------------------------------
MDL_HMM = "HMM"

#------------------------------------------------------------------------------
# model dictionary key constants (alphabetical)
#------------------------------------------------------------------------------
MDL_KEY_A = "A"
MDL_KEY_LL = "log_lik"
MDL_KEY_MU = "mu"
MDL_KEY_PI = "pi"
MDL_KEY_SIG = "sigma"
MDL_KEY_STATES = "states"

#------------------------------------------------------------------------------
#
# functions are listed here
#
#------------------------------------------------------------------------------

def download_file(url, outpath):
    """method: download_file

    arguments:
     url: remote URL (str)
     outpath: local file path (str)

    return:
     status: True on success (bool)

    description:
     Download url -> outpath if outpath does not already exist.
    """

    # create the destination directory if needed
    # (the `or "."` guard keeps makedirs happy when outpath has no directory part)
    #
    os.makedirs(os.path.dirname(outpath) or ".", exist_ok=True)

    # skip the download if the file exists and is non-empty
    #
    if os.path.exists(outpath) and os.path.getsize(outpath) > 0:
        return True

    # fetch the file, reporting any failure
    #
    try:
        print("Downloading: %s" % url)
        urllib.request.urlretrieve(url, outpath)
    except Exception as e:
        print("Error: download failed (%s): %s" % (url, str(e)))
        return False

    # exit gracefully
    #
    return True

def load_gold_csv(csv_path):
    """method: load_gold_csv

    arguments:
     csv_path: path to csv file (str)

    return:
     dates: list of datetime.date (len=N)
     prices: gold prices (N,) float

    description:
     Loads a CSV of the form:
      datetime,gold_price_usd
      YYYY-MM-DD,123.45
    """

    dates = []
    prices = []

    # read the file line-by-line
    #
    with open(csv_path, "r", encoding="utf-8") as fp:

        # read the header
        #
        header = fp.readline().strip().split(",")

        # if there are fewer than 2 columns, raise an error
        #
        if len(header) < 2:
            raise ValueError("Unexpected header in %s" % csv_path)

        # parse the data rows
        #
        for line in fp:

            # strip surrounding whitespace
            #
            line = line.strip()

            # if the line is empty, continue to the next line
            #
            if len(line) == 0:
                continue

            # otherwise, split the line on commas
            #
            parts = line.split(",")

            # if there are fewer than 2 columns, continue to the next line
            #
            if len(parts) < 2:
                continue

            # parse the date and price
            #
            y, m, d = parts[0].split("-")
            dates.append(dt.date(int(y), int(m), int(d)))
            prices.append(float(parts[1]))

    # convert prices to a numpy array
    #
    prices = np.asarray(prices, dtype=float)

    # exit gracefully
    #
    return dates, prices

def compute_log_returns(prices, eps=DEF_EPS):
    """method: compute_log_returns

    arguments:
     prices: price series (N,)
     eps: stability constant

    return:
     r: log returns (N-1,)

    description:
     r[t] = log(p[t+1] / p[t])
    """

    # prices[:-1] takes all elements except the last:
    #  [p0, p1, p2, ..., p_{N-2}]
    # np.maximum(..., eps) ensures no value is <= 0 by clamping to eps,
    # which prevents divide-by-zero and log(0) problems.
    #
    p0 = np.maximum(prices[:-1], eps)

    # prices[1:] takes all elements except the first:
    #  [p1, p2, p3, ..., p_{N-1}]
    # again clamp to eps for numerical safety (no zeros/negatives).
    #
    p1 = np.maximum(prices[1:], eps)

    # compute the element-wise ratio p[t+1] / p[t] for each t,
    # then take the natural log to get log returns:
    #  r[t] = log(p1[t] / p0[t]) = log(p[t+1] / p[t])
    #
    r = np.log(p1 / p0)

    # exit gracefully
    #
    return r

def logsumexp(a, axis=None):
    """method: logsumexp

    arguments:
     a: array
     axis: axis to reduce (int or None)

    return:
     lse: log(sum(exp(a))) computed stably
    """

    # np.logaddexp.reduce performs stable log-domain summation
    #
    lse = np.logaddexp.reduce(a, axis=axis)

    # exit gracefully
    #
    return lse

def gaussian_logpdf(x, mu, var):
    """method: gaussian_logpdf

    arguments:
     x: observations (T,)
     mu: means (K,)
     var: variances (K,)

    return:
     logB: log emission probs (T,K)

    description:
     1D Gaussian emissions, computed in log domain.
    """

    # reshape x into a column vector of shape (T, 1),
    # where T is the number of time steps/observations.
    # this makes broadcasting work cleanly against (1, K) arrays.
    #
    x = x.reshape(-1, 1)

    # reshape mu into a row vector of shape (1, K),
    # where K is the number of hidden states (one mean per state).
    # now (T,1) - (1,K) broadcasts to (T,K).
    #
    mu = mu.reshape(1, -1)

    # reshape var into a row vector of shape (1, K) (one variance per state),
    # and clamp each variance to be at least DEF_EPS to prevent divide-by-zero
    # and log(0) in the Gaussian log-pdf computation.
    #
    var = np.maximum(var.reshape(1, -1), DEF_EPS)

    # log N(x | mu, var) = -0.5 * (log(2*pi*var) + (x - mu)^2 / var)
    #
    logB = -0.5 * (np.log(2.0 * np.pi * var) + ((x - mu) ** 2) / var)

    # exit gracefully
    #
    return logB
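
# side note (illustrative): a quick spot check of gaussian_logpdf(). for a
# standard normal (mu = 0, var = 1) evaluated at x = 0, the log-density is
# -0.5 * log(2*pi) ~= -0.9189:
#
#  gaussian_logpdf(np.zeros(1), np.zeros(1), np.ones(1))
#  -> array([[-0.91893853]])
#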

def hmm_init_params(x, K):
    """method: hmm_init_params

    arguments:
     x: observations (T,)
     K: number of states

    return:
     pi: initial state distribution (K,)
     A: transition matrix (K,K)
     mu: means (K,)
     var: variances (K,)

    description:
     Simple quantile-based initialization (no external libraries).
    """

    # T is the number of observations (time steps) in the 1D sequence x
    #
    T = x.size

    # create an initial-state probability vector pi of length K filled with 1s,
    # then divide by K so every state is equally likely at time t=0
    #
    pi = np.ones(K, dtype=float) / float(K)

    # create a KxK matrix A initialized to zeros; A will become the transition matrix
    #
    A = np.full((K, K), 0.0, dtype=float)

    # fill the transition matrix:
    #  - high probability (0.90) to stay in the same state (diagonal entries)
    #  - remaining probability (0.10) spread uniformly across transitions to other states
    #
    for i in range(K):
        for j in range(K):
            if i == j:

                # probability of staying in the same state i -> i
                #
                A[i, j] = 0.90
            else:

                # probability of switching from state i to a different state j:
                # distribute 0.10 equally among the (K-1) other states
                #
                A[i, j] = 0.10 / float(K - 1)

    # compute K+1 quantile boundaries of x using evenly spaced probabilities:
    #  np.linspace(0,1,K+1) gives [0, 1/K, 2/K, ..., 1]
    # qs will contain values that split x into K roughly equal-sized groups
    #
    qs = np.quantile(x, np.linspace(0.0, 1.0, K + 1))

    # allocate an integer array of length T to store an initial state label
    # for each sample
    #
    labels = np.zeros(T, dtype=int)

    # assign each observation x[t] to a quantile bin/state k based on the
    # boundaries qs
    #
    for k in range(K):

        # lower boundary for bin k
        #
        lo = qs[k]

        # upper boundary for bin k
        #
        hi = qs[k + 1]

        # for the last bin, include the right endpoint (<= hi) so the maximum
        # values get included
        #
        if k == K - 1:
            idx = (x >= lo) & (x <= hi)
        else:

            # for other bins, use < hi to avoid double-counting boundary points
            #
            idx = (x >= lo) & (x < hi)

        # set label k for all points that fall inside this bin
        #
        labels[idx] = k

    # allocate arrays to hold the per-state Gaussian parameters:
    #  mu[k] = mean for state k
    #  var[k] = variance for state k
    #
    mu = np.zeros(K, dtype=float)
    var = np.zeros(K, dtype=float)

    # compute the overall mean of all observations (used as a fallback if a bin is empty)
    #
    overall_mu = float(np.mean(x))

    # compute the overall variance of all observations with ddof=1 (unbiased estimate),
    # then add DEF_EPS for numerical stability (avoid zero variance)
    #
    overall_var = float(np.var(x, ddof=1)) + DEF_EPS

    # for each state k, estimate the initial mean/variance using the observations
    # assigned to it
    #
    for k in range(K):

        # select the observations whose assigned label is k
        #
        xk = x[labels == k]

        # if we have at least 2 points, compute the mean and variance normally
        #
        if xk.size >= 2:

            # mean of the points in bin k
            #
            mu[k] = float(np.mean(xk))

            # unbiased variance of the points in bin k + stability epsilon
            #
            var[k] = float(np.var(xk, ddof=1)) + DEF_EPS

        # if we have exactly 1 point, the mean is that point, but the variance
        # cannot be estimated, so use the overall variance as a fallback
        #
        elif xk.size == 1:

            # the mean equals the only sample
            #
            mu[k] = float(xk[0])

            # fallback variance
            #
            var[k] = overall_var

        # if a bin has no points (can happen with repeated values / degenerate data),
        # create a reasonable fallback mean near the overall mean and use the
        # overall variance
        #
        else:

            # place the mean slightly offset from overall_mu so different states
            # start distinct; the offset step size is 0.01 * sqrt(overall_var)
            #
            mu[k] = overall_mu + 0.01 * (k - (K - 1) / 2.0) * np.sqrt(overall_var)

            # fallback variance
            #
            var[k] = overall_var

    # exit gracefully
    #
    return pi, A, mu, var

def hmm_forward_backward(x, pi, A, mu, var):
    """method: hmm_forward_backward

    arguments:
     x: observations (T,)
     pi: initial state distribution (K,)
     A: transition matrix (K,K)
     mu: means (K,)
     var: variances (K,)

    return:
     gamma: state posteriors (T,K)
     xi_sum: expected transitions summed over time (K,K)
     log_lik: log-likelihood of x under the model (float)

    description:
     Standard forward-backward in log domain for numerical stability.
    """

    # T is the number of time steps / observations in the sequence
    #
    T = x.size

    # K is the number of hidden states in the HMM
    #
    K = pi.size

    # clamp pi so no entry is 0 (prevents log(0))
    #
    pi = np.maximum(pi, DEF_EPS)

    # renormalize pi so it sums to 1 (valid probability distribution)
    #
    pi = pi / np.sum(pi)

    # clamp A so no transition probability is 0 (prevents log(0))
    #
    A = np.maximum(A, DEF_EPS)

    # renormalize each row of A so it sums to 1:
    #  A[i,:] is a distribution over next states given current state i
    #
    A = A / np.sum(A, axis=1, keepdims=True)

    # convert the initial probabilities into log domain for stability:
    #  log_pi[k] = log(pi[k])
    #
    log_pi = np.log(pi)

    # convert the transitions into log domain:
    #  log_A[i,j] = log(A[i,j])
    #
    log_A = np.log(A)

    # compute the log emission probabilities for every time t and state k:
    #  log_B[t,k] = log p(x[t] | state=k) for a 1D Gaussian(mu[k], var[k])
    # gaussian_logpdf returns a matrix of shape (T, K)
    #
    log_B = gaussian_logpdf(x, mu, var)

    # allocate the forward log-probability table:
    #  log_alpha[t,k] = log p(x[0..t], z_t = k)
    #
    log_alpha = np.zeros((T, K), dtype=float)

    # initialize the forward values at t=0:
    #  log_alpha[0,k] = log(pi[k]) + log p(x[0] | z_0=k)
    #
    log_alpha[0, :] = log_pi + log_B[0, :]

    # forward recursion for t = 1 .. T-1
    #
    for t in range(1, T):

        # for each next state j, compute:
        #  log_alpha[t,j] = log_B[t,j] + logsumexp_i( log_alpha[t-1,i] + log_A[i,j] )
        #
        for j in range(K):

            # compute the vector over i:
            #  a[i] = log_alpha[t-1,i] + log_A[i,j]
            #
            a = log_alpha[t - 1, :] + log_A[:, j]

            # logsumexp(a) computes log(sum_i exp(a[i])) stably
            #
            log_alpha[t, j] = log_B[t, j] + logsumexp(a=a)

    # log-likelihood of the full observation sequence:
    #  log p(x[0..T-1]) = logsumexp_k( log_alpha[T-1,k] )
    #
    log_lik = logsumexp(log_alpha[T - 1, :])

    # allocate the backward log-probability table:
    #  log_beta[t,k] = log p(x[t+1..T-1] | z_t = k)
    #
    log_beta = np.zeros((T, K), dtype=float)

    # base case at the end: after the last observation the probability is 1,
    # and log(1) = 0 for all states
    #
    log_beta[T - 1, :] = 0.0

    # backward recursion for t = T-2 .. 0
    #
    for t in range(T - 2, -1, -1):

        # for each current state i, compute:
        #  log_beta[t,i] = logsumexp_j( log_A[i,j] + log_B[t+1,j] + log_beta[t+1,j] )
        #
        for i in range(K):

            # build the vector over j:
            #  a[j] = log_A[i,j] + log_B[t+1,j] + log_beta[t+1,j]
            #
            a = log_A[i, :] + log_B[t + 1, :] + log_beta[t + 1, :]

            # logsumexp(a) computes log(sum_j exp(a[j])) stably
            #
            log_beta[t, i] = logsumexp(a=a)

    # compute the posterior over states (gamma):
    #  gamma[t,k] = p(z_t=k | x) = exp(log_alpha[t,k] + log_beta[t,k] - log_lik)
    #
    log_gamma = log_alpha + log_beta - log_lik

    # convert from log domain back to the normal probability domain
    #
    gamma = np.exp(log_gamma)

    # xi_sum will hold the expected transition counts summed over all time steps:
    #  xi_sum[i,j] = sum_{t=0}^{T-2} p(z_t=i, z_{t+1}=j | x)
    #
    xi_sum = np.zeros((K, K), dtype=float)

    # compute the expected transitions for each time step t
    #
    for t in range(T - 1):

        # compute log xi for all (i,j) at time t:
        #  log_xi[i,j] = log_alpha[t,i] + log_A[i,j] + log_B[t+1,j]
        #                + log_beta[t+1,j] - log_lik
        # this is the log of the normalized joint posterior for the transition
        # i -> j at time t:
        #  (K,1) adds alpha[t,i]
        #  (K,K) adds the transition log prob
        #  (1,K) adds the emission at t+1 for state j
        #  (1,K) adds beta at t+1 for state j
        # subtracting log p(x) normalizes
        #
        log_xi = (log_alpha[t, :].reshape(K, 1)
                  + log_A
                  + log_B[t + 1, :].reshape(1, K)
                  + log_beta[t + 1, :].reshape(1, K)
                  - log_lik)

        # exponentiate to convert log_xi to xi, then sum into xi_sum
        #
        xi_sum += np.exp(log_xi)

    # exit gracefully
    #
    return gamma, xi_sum, float(log_lik)
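
# the helper below is an illustrative sanity check (the name and tolerance are
# arbitrary; it is not called by main): regardless of the parameter values, the
# posteriors returned by hmm_forward_backward() should satisfy two identities --
# each row of gamma sums to 1, and the entries of xi_sum sum to T-1 (one
# expected transition per adjacent pair of observations).
#
def _check_forward_backward(x, pi, A, mu, var, tol=1.0e-6):

    # run the forward-backward pass
    #
    gamma, xi_sum, _ = hmm_forward_backward(x, pi, A, mu, var)

    # each row of gamma is a distribution over states and should sum to 1
    #
    ok_gamma = bool(np.allclose(np.sum(gamma, axis=1), 1.0, atol=tol))

    # the expected transition counts should sum to T-1
    #
    ok_xi = bool(np.isclose(np.sum(xi_sum), float(x.size - 1), atol=tol * x.size))

    # exit gracefully
    #
    return ok_gamma and ok_xi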

def hmm_em_train(x, K=DEF_NUM_STATES, max_iters=DEF_MAX_ITERS, tol=DEF_TOL):
    """method: hmm_em_train

    arguments:
     x: observations (T,)
     K: number of states
     max_iters: maximum EM iterations
     tol: convergence threshold on log_lik change

    return:
     model: dict with learned parameters

    description:
     Baum-Welch (EM) for a 1D Gaussian HMM.
    """

    # initialize the HMM parameters (pi, A, mu, var) using a simple heuristic
    #
    pi, A, mu, var = hmm_init_params(x, K)

    # prev_ll stores the log-likelihood from the previous iteration
    # (used for the convergence check)
    #
    prev_ll = None

    # last_ll stores the most recent log-likelihood (for reporting in the model)
    #
    last_ll = None

    # print header info so the user sees what is being trained
    #
    print("\nTraining %s (Gaussian emissions)..." % MDL_HMM)
    print("  states: %d" % K)
    print("  iters : %d (tol=%.1e)\n" % (max_iters, tol))

    # EM loop: repeat the E-step and M-step up to max_iters times
    #
    for it in range(1, max_iters + 1):

        # E-step: run forward-backward to compute:
        #  - gamma[t,k]  = p(z_t=k | x)                    (state posteriors)
        #  - xi_sum[i,j] = sum_t p(z_t=i, z_{t+1}=j | x)   (expected transitions)
        #  - log_lik     = log p(x | current parameters)
        #
        gamma, xi_sum, log_lik = hmm_forward_backward(x, pi, A, mu, var)

        # M-step updates begin here
        #
        # update the initial state distribution:
        #  pi_new[k] = p(z_0=k | x) = gamma at time 0
        #
        pi_new = gamma[0, :].copy()

        # update the transition matrix A:
        #  denominator gamma_sum[i] = expected number of times we are in state i
        #  (summing gamma over t=0..T-2 because transitions go to t+1)
        #
        gamma_sum = np.sum(gamma[:-1, :], axis=0) + DEF_EPS

        # A_new[i,j] = expected transitions i->j / expected times in state i
        # reshape(-1,1) turns (K,) into (K,1) so broadcasting divides each row correctly
        #
        A_new = xi_sum / gamma_sum.reshape(-1, 1)

        # update the emission means/variances (Gaussian parameters) using gamma weights
        #
        # gtot[k] = total responsibility mass for state k across all times
        #  (the expected number of samples generated by state k)
        #
        gtot = np.sum(gamma, axis=0) + DEF_EPS

        # weighted mean for each state:
        #  mu_new[k] = sum_t gamma[t,k] * x[t] / sum_t gamma[t,k]
        # gamma.T @ x gives a (K,) vector where element k is sum_t gamma[t,k]*x[t]
        #
        mu_new = (gamma.T @ x) / gtot

        # allocate an array for the new variances
        #
        var_new = np.zeros(K, dtype=float)

        # compute the weighted variance for each state k
        #
        for k in range(K):

            # diff[t] = x[t] - mu_new[k]
            #
            diff = x - mu_new[k]

            # var_new[k] = sum_t gamma[t,k] * (x[t] - mu_k)^2 / sum_t gamma[t,k]
            # add DEF_EPS to keep the variance strictly positive (numerical stability)
            #
            var_new[k] = float((gamma[:, k] @ (diff ** 2)) / gtot[k]) + DEF_EPS

        # normalize the updated parameters to ensure they are valid probabilities
        #
        # clamp pi_new so entries are not zero (prevents log(0))
        #
        pi_new = np.maximum(pi_new, DEF_EPS)

        # renormalize pi_new so it sums to 1
        #
        pi_new = pi_new / np.sum(pi_new)

        # clamp A_new so entries are not zero (prevents log(0))
        #
        A_new = np.maximum(A_new, DEF_EPS)

        # renormalize the rows of A_new so each row sums to 1
        #
        A_new = A_new / np.sum(A_new, axis=1, keepdims=True)

        # compute and print progress information
        #
        # if this is the first iteration, we cannot compute a delta yet
        #
        if prev_ll is None:
            d_ll = 0.0
        else:

            # delta log-likelihood: how much the objective improved this iteration
            #
            d_ll = float(log_lik - prev_ll)

        # print the iteration number, current log-likelihood, and improvement
        #
        print("  iter %03d: log_lik = %.2f  delta = %+10.4f" % (it, log_lik, d_ll))

        # check convergence:
        #  if the improvement is smaller than the tolerance, stop early
        #
        if prev_ll is not None and abs(d_ll) < tol:

            # accept the newly estimated parameters before exiting the loop
            #
            pi, A, mu, var = pi_new, A_new, mu_new, var_new

            # store the final log-likelihood
            #
            last_ll = log_lik
            break

        # otherwise, update the parameters and continue to the next EM iteration
        #
        pi, A, mu, var = pi_new, A_new, mu_new, var_new

        # save the current log-likelihood to compute the delta next time
        #
        prev_ll = log_lik

        # keep last_ll current for final reporting even if we do not break
        #
        last_ll = log_lik

    # pack the learned parameters into a model dictionary
    #
    model = {

        # learned initial distribution pi
        #
        MDL_KEY_PI: pi,

        # learned transition matrix A
        #
        MDL_KEY_A: A,

        # learned Gaussian means per state
        #
        MDL_KEY_MU: mu,

        # store the standard deviation (sigma) instead of the variance for readability:
        #  sigma = sqrt(var)
        #
        MDL_KEY_SIG: np.sqrt(np.maximum(var, DEF_EPS)),

        # store the final log-likelihood value
        #
        MDL_KEY_LL: float(last_ll)
    }

    # exit gracefully
    #
    return model
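
# the function below is an illustrative sanity check of the EM code (it is not
# called by main; the regime means, block length, and seed are arbitrary demo
# choices): it draws a synthetic sequence from two well-separated Gaussian
# regimes and checks that the two learned means land near the true means of
# -1.0 and +1.0.
#
def _demo_em_on_synthetic_data(T=400, seed=0):

    # generate a sequence with two long blocks, one per regime
    #
    rng = np.random.default_rng(seed)
    half = T // 2
    x = np.concatenate([rng.normal(-1.0, 0.3, size=half),
                        rng.normal(+1.0, 0.3, size=T - half)])

    # train a 2-state model and inspect the sorted means
    #
    model = hmm_em_train(x, K=2, max_iters=30, tol=1.0e-4)
    print("recovered means: %s (true means: [-1.0, +1.0])"
          % str(np.sort(model[MDL_KEY_MU])))

    # exit gracefully
    #
    return model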

def hmm_loglik(x, model):
    """method: hmm_loglik

    arguments:
     x: observations (T,)
     model: hmm model dict

    return:
     log_lik: float

    description:
     Compute the log-likelihood via the forward pass
     (uses forward_backward for simplicity).
    """

    # extract the initial state probabilities pi from the model dictionary
    #
    pi = model[MDL_KEY_PI]

    # extract the transition matrix A from the model dictionary
    #
    A = model[MDL_KEY_A]

    # extract the Gaussian means (one mean per state) from the model dictionary
    #
    mu = model[MDL_KEY_MU]

    # extract the Gaussian standard deviations (sigma) from the model dictionary
    #
    sig = model[MDL_KEY_SIG]

    # convert the standard deviation to a variance (var = sigma^2),
    # and clamp to DEF_EPS to avoid zero-variance issues
    #
    var = np.maximum(sig ** 2, DEF_EPS)

    # run forward-backward using the model parameters; we ignore gamma and
    # xi_sum and keep only the returned log-likelihood (ll)
    #
    _, _, ll = hmm_forward_backward(x, pi, A, mu, var)

    # exit gracefully
    #
    return float(ll)
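
# side note (illustrative): hmm_loglik() returns a total log-likelihood, which
# grows in magnitude with the number of observations. when comparing splits of
# different lengths (e.g., the train/eval split used in main), dividing by the
# sequence length gives a per-observation score that is easier to compare:
#
#  ll_per_obs = hmm_loglik(r_ev, model) / float(r_ev.size)
#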

def hmm_viterbi(x, model):
    """method: hmm_viterbi

    arguments:
     x: observations (T,)
     model: hmm model dict

    return:
     states: most likely state sequence (T,) int

    description:
     Viterbi decoding in log domain.
    """

    # extract pi and clamp to avoid zeros (prevents log(0))
    #
    pi = np.maximum(model[MDL_KEY_PI], DEF_EPS)

    # extract A and clamp to avoid zeros (prevents log(0))
    #
    A = np.maximum(model[MDL_KEY_A], DEF_EPS)

    # extract the Gaussian means
    #
    mu = model[MDL_KEY_MU]

    # extract sigma and clamp to at least sqrt(DEF_EPS) for stability
    #
    sig = np.maximum(model[MDL_KEY_SIG], np.sqrt(DEF_EPS))

    # convert sigma to a variance and clamp
    #
    var = np.maximum(sig ** 2, DEF_EPS)

    # renormalize pi so it sums to 1
    #
    pi = pi / np.sum(pi)

    # renormalize each row of A so it sums to 1
    #
    A = A / np.sum(A, axis=1, keepdims=True)

    # move the initial probabilities to log domain for stable Viterbi computations
    #
    log_pi = np.log(pi)

    # move the transitions to log domain
    #
    log_A = np.log(A)

    # compute the log emission probabilities:
    #  log_B[t,k] = log p(x[t] | state=k)
    #
    log_B = gaussian_logpdf(x, mu, var)

    # get the sequence length T and number of states K from the shape of log_B
    #
    T, K = log_B.shape

    # delta[t,k] will store the best (max) log-probability of any path
    # that ends in state k at time t
    #
    delta = np.zeros((T, K), dtype=float)

    # psi[t,k] will store the argmax (best previous state index)
    # used to reconstruct the best path
    #
    psi = np.zeros((T, K), dtype=int)

    # initialization at t=0:
    #  the best path to state k starts in k with probability pi[k]
    #  and emits x[0] from state k
    #
    delta[0, :] = log_pi + log_B[0, :]

    # psi at t=0 is unused; set it to 0 for completeness
    #
    psi[0, :] = 0

    # dynamic programming recursion for t = 1 .. T-1
    #
    for t in range(1, T):

        # compute delta[t,j] for each current state j
        #
        for j in range(K):

            # scores[i] = best log prob of reaching previous state i at t-1
            #             plus the log transition prob from i -> j
            #
            scores = delta[t - 1, :] + log_A[:, j]

            # psi[t,j] stores which previous state i gives the best score
            #
            psi[t, j] = int(np.argmax(scores))

            # delta[t,j] is the best previous score plus the log emission
            # at time t in state j
            #
            delta[t, j] = float(np.max(scores)) + log_B[t, j]

    # allocate an array for the decoded best state sequence
    #
    states = np.zeros(T, dtype=int)

    # termination: the best final state is the argmax over delta[T-1,k]
    #
    states[T - 1] = int(np.argmax(delta[T - 1, :]))

    # backtrace: recover the best states backward using the psi pointers
    #
    for t in range(T - 2, -1, -1):

        # the state at time t is the best previous state leading to the
        # state at time t+1
        #
        states[t] = psi[t + 1, states[t + 1]]

    # exit gracefully
    #
    return states
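
# side note (illustrative): Viterbi returns the single most likely state path.
# an alternative is posterior ("gamma") decoding, which picks the most likely
# state at each time step independently. a minimal sketch using the existing
# forward-backward routine (not used by main):
#
#  gamma, _, _ = hmm_forward_backward(x, model[MDL_KEY_PI], model[MDL_KEY_A],
#                                     model[MDL_KEY_MU],
#                                     np.maximum(model[MDL_KEY_SIG] ** 2, DEF_EPS))
#  states_map = np.argmax(gamma, axis=1)
#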

def reorder_states_by_mean(model, states):
    """method: reorder_states_by_mean

    arguments:
     model: hmm model dict
     states: decoded states (T,)

    return:
     model2: reordered model
     states2: remapped state sequence

    description:
     Reorder states so that mu is ascending (makes plots/prints easier to read).
    """

    # pull out the per-state means from the model
    #
    mu = model[MDL_KEY_MU]

    # order gives the indices of states sorted by mean (ascending):
    #  e.g., order = [2,0,1] means old state 2 becomes new state 0, etc.
    #
    order = np.argsort(mu)

    # inv will map old_state -> new_state
    #
    inv = np.zeros_like(order)

    # fill the inverse mapping:
    #  for each new index new_k, find the old state old_k = order[new_k],
    #  then set inv[old_k] = new_k
    #
    for new_k, old_k in enumerate(order):
        inv[old_k] = new_k

    # reorder pi according to the new state ordering
    #
    pi2 = model[MDL_KEY_PI][order]

    # reorder A by applying the ordering to both rows and columns:
    #  first reorder the rows, then reorder the columns
    #
    A2 = model[MDL_KEY_A][order, :][:, order]

    # reorder the means
    #
    mu2 = model[MDL_KEY_MU][order]

    # reorder the sigmas
    #
    sig2 = model[MDL_KEY_SIG][order]

    # remap the decoded state sequence using inv (old -> new)
    #
    states2 = np.asarray([inv[s] for s in states], dtype=int)

    # pack a new model dictionary with the reordered parameters
    #
    model2 = {

        # renormalize pi2 to sum to 1
        #
        MDL_KEY_PI: pi2 / np.sum(pi2),

        # renormalize the rows of A2 to sum to 1
        #
        MDL_KEY_A: A2 / np.sum(A2, axis=1, keepdims=True),

        # store the reordered means
        #
        MDL_KEY_MU: mu2,

        # store the reordered sigmas
        #
        MDL_KEY_SIG: sig2,

        # keep the same final log-likelihood value
        #
        MDL_KEY_LL: float(model[MDL_KEY_LL])
    }

    # exit gracefully
    #
    return model2, states2

def plot_hmm(dates, prices, r_dates, r, states, model, outfile):
    """method: plot_hmm

    arguments:
     dates: date list for prices (N,)
     prices: price series (N,)
     r_dates: date list aligned to returns (N-1,)
     r: returns series (N-1,)
     states: decoded states for returns (N-1,)
     model: hmm model dict
     outfile: output filename

    return:
     status: True on success

    description:
     Plot (1) the price time series colored by inferred regime
          (2) the returns time series colored by inferred regime
    """

    # use simple integer time indices on the x-axis
    # (dates and r_dates are passed in for reference but are not plotted directly)
    #
    x_price = np.arange(len(prices), dtype=float)
    x_ret = np.arange(len(r), dtype=float)

    # number of states (for the color scale and legend)
    #
    K = model[MDL_KEY_PI].size

    # create a two-panel figure with a shared x-axis
    #
    fig, axes = plt.subplots(2, 1, figsize=(14, 7), sharex=True)

    # top: price (points colored by state, aligned to r_dates -> prices[1:])
    #
    axes[0].plot(x_price, prices, linewidth=0.8)
    axes[0].scatter(x_price[1:], prices[1:], c=states, s=6, vmin=0, vmax=K - 1)
    axes[0].set_title("Gold Price (USD) with HMM Regimes")
    axes[0].set_ylabel("Price (USD)")
    axes[0].grid(True, alpha=0.25)

    # bottom: returns
    #
    axes[1].plot(x_ret, r, linewidth=0.6)
    axes[1].scatter(x_ret, r, c=states, s=6, vmin=0, vmax=K - 1)
    axes[1].set_title("Log Returns with HMM Regimes")
    axes[1].set_ylabel("log return")
    axes[1].set_xlabel("Time index (daily)")
    axes[1].grid(True, alpha=0.25)

    # add a state legend (proxy artists)
    #
    handles = []
    labels = []
    for k in range(K):
        h = axes[1].scatter([np.nan], [np.nan], c=[k], s=25, vmin=0, vmax=K - 1)
        handles.append(h)
        labels.append("State %d" % k)
    axes[1].legend(handles, labels, loc="upper right", frameon=True)

    # save the figure to disk
    #
    plt.tight_layout()
    plt.savefig(outfile, dpi=200, bbox_inches="tight")
    print("\nVisualization saved to: %s" % outfile)

    # exit gracefully
    #
    return True
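
# side note (illustrative): plot_hmm() uses integer time indices on the x-axis.
# if calendar dates are preferred, matplotlib can plot datetime.date values
# directly; a minimal sketch of the change inside plot_hmm (not used here):
#
#  axes[0].plot(dates, prices, linewidth=0.8)
#  fig.autofmt_xdate()
#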

# function: main
#
def main():
    """method: main

    arguments:
     none

    return:
     status: True on success (bool)

    description:
     (1) Download gold_price_usd.csv.
     (2) Load prices; compute log returns.
     (3) Train a Gaussian HMM using Baum-Welch.
     (4) Decode regimes using Viterbi.
     (5) Plot price/returns colored by regime.
    """

    print("Running: %s" % __FILE__)

    # download the data
    #
    data_path = os.path.join(DEF_DATA_DIR, DEF_DATA_FILE)
    if not download_file(DEF_DATA_URL, data_path):
        return False

    # load the data
    #
    dates, prices = load_gold_csv(data_path)
    if len(prices) < 10:
        print("Error: not enough data points.")
        return False

    # compute the returns
    #
    r = compute_log_returns(prices, eps=DEF_EPS)
    r_dates = dates[1:]

    # time-ordered train/eval split
    #
    T = r.size
    Ttr = int(np.floor(DEF_TRAIN_FRAC * T))
    Ttr = max(10, min(T - 5, Ttr))
    r_tr = r[:Ttr]
    r_ev = r[Ttr:]

    # report what was loaded
    #
    print("\nLoaded:")
    print("  prices : N=%d (%s -> %s)" % (len(prices), str(dates[0]), str(dates[-1])))
    print("  returns: T=%d (%s -> %s)" % (T, str(r_dates[0]), str(r_dates[-1])))
    print("  train  : T=%d" % r_tr.size)
    print("  eval   : T=%d" % r_ev.size)

    # train the HMM on the training returns
    #
    model = hmm_em_train(r_tr, K=DEF_NUM_STATES, max_iters=DEF_MAX_ITERS,
                         tol=DEF_TOL)

    # evaluate the log-likelihood on the train/eval splits
    #
    ll_tr = hmm_loglik(r_tr, model)
    ll_ev = hmm_loglik(r_ev, model)

    # decode the states on the full series (using the trained model)
    #
    states_full = hmm_viterbi(r, model)

    # reorder the states by mean return for readability
    #
    model, states_full = reorder_states_by_mean(model, states_full)

    # print a model summary (simple, consistent formatting)
    #
    K = model[MDL_KEY_PI].size
    pi = model[MDL_KEY_PI]
    A = model[MDL_KEY_A]
    mu = model[MDL_KEY_MU]
    sig = model[MDL_KEY_SIG]

    print("\n%s:" % MDL_HMM)
    print("  num states: %d" % K)
    print("  train log_lik: %.2f" % ll_tr)
    print("  eval  log_lik: %.2f" % ll_ev)

    print("\n pi:")
    for k in range(K):
        print("  state %d: %.2f" % (k, pi[k]))

    print("\n A (rows sum to 1):")
    for i in range(K):
        row = " ".join(["%.4f" % A[i, j] for j in range(K)])
        print("  row %d: %s" % (i, row))

    print("\n emissions (Gaussian on returns):")
    for k in range(K):
        print("  state %d: mu = %10.4f  sigma = %10.4f" % (k, mu[k], sig[k]))

    print(f"\n{'-' * 40}")

    # state statistics on the full decoded path
    #
    print("\nState Information:")
    for k in range(K):
        idx = (states_full == k)
        cnt = int(np.sum(idx))
        if cnt > 1:
            m = float(np.mean(r[idx]))
            s = float(np.std(r[idx], ddof=1))
        elif cnt == 1:
            m = float(r[idx][0])
            s = 0.0
        else:
            m = 0.0
            s = 0.0
        print("  state %d: count = %4d  mean = %.4f  std = %.4f" % (k, cnt, m, s))

    print(f"\n{'-' * 40}")

    # plot the results
    #
    plot_hmm(dates, prices, r_dates, r, states_full, model, DEF_OUT_PLOT)

    # exit gracefully
    #
    return True

if __name__ == '__main__':
    main()