#!/usr/bin/env python
#
# file: 09_main.py
#
# revision history:
#
# 20260207 (SP): initial version for Gaussian HMM demo on gold_price_usd.csv
#------------------------------------------------------------------------------

# import system modules
#
import os
import sys
import urllib.request
import datetime as dt

# import third-party modules
#
import numpy as np
import matplotlib.pyplot as plt

#------------------------------------------------------------------------------
#
# global variables are listed here
#
#------------------------------------------------------------------------------

# set the filename using basename
#
__FILE__ = os.path.basename(__file__)

# dataset location (gold prices)
#
DEF_DATA_URL = "https://raw.githubusercontent.com/natsunoyuki/Data_Science/refs/heads/master/gold/gold/gold_price_usd.csv"
DEF_DATA_FILE = "gold_price_usd.csv"

# local cache directory
#
DEF_DATA_DIR = "./gold_data"

# numerical stability
#
DEF_EPS = 1.0e-8

# HMM settings
#
DEF_NUM_STATES = 3
DEF_MAX_ITERS = 60
DEF_TOL = 1.0e-4

# train/eval split fraction (time-ordered)
#
DEF_TRAIN_FRAC = 0.80

# output plot
#
DEF_OUT_PLOT = "gold_hmm_demo.png"

#------------------------------------------------------------------------------
# model names (alphabetical)
#------------------------------------------------------------------------------
MDL_HMM = "HMM"

#------------------------------------------------------------------------------
# model dictionary key constants (alphabetical)
#------------------------------------------------------------------------------
MDL_KEY_A = "A"
MDL_KEY_LL = "log_lik"
MDL_KEY_MU = "mu"
MDL_KEY_PI = "pi"
MDL_KEY_SIG = "sigma"
MDL_KEY_STATES = "states"

#------------------------------------------------------------------------------
#
# functions are listed here
#
#------------------------------------------------------------------------------

def download_file(url, outpath):
    """method: download_file

    arguments:
     url: remote URL (str)
     outpath: local file path (str)

    return:
     status: True on success (bool)

    description:
     Download url -> outpath if outpath does not already exist.
    """

    # create the destination directory if needed
    # (the `or "."` guard keeps makedirs happy when outpath has no directory part)
    #
    os.makedirs(os.path.dirname(outpath) or ".", exist_ok=True)

    # skip the download if the file exists and is non-empty
    #
    if os.path.exists(outpath) and os.path.getsize(outpath) > 0:
        return True

    # fetch the file, reporting any failure
    #
    try:
        print("Downloading: %s" % url)
        urllib.request.urlretrieve(url, outpath)
    except Exception as e:
        print("Error: download failed (%s): %s" % (url, str(e)))
        return False

    # exit gracefully
    #
    return True

def load_gold_csv(csv_path):
    """method: load_gold_csv

    arguments:
     csv_path: path to csv file (str)

    return:
     dates: list of datetime.date (len=N)
     prices: gold prices (N,) float

    description:
     Loads a CSV of the form:
      datetime,gold_price_usd
      YYYY-MM-DD,123.45
    """

    dates = []
    prices = []

    # read the file line-by-line
    #
    with open(csv_path, "r", encoding="utf-8") as fp:

        # read the header
        #
        header = fp.readline().strip().split(",")

        # if there are fewer than 2 columns, raise an error
        #
        if len(header) < 2:
            raise ValueError("Unexpected header in %s" % csv_path)

        # parse the data rows
        #
        for line in fp:

            # strip surrounding whitespace
            #
            line = line.strip()

            # if the line is empty, continue to the next line
            #
            if len(line) == 0:
                continue

            # otherwise, split the line on commas
            #
            parts = line.split(",")

            # if there are fewer than 2 columns, continue to the next line
            #
            if len(parts) < 2:
                continue

            # parse the date and price
            #
            y, m, d = parts[0].split("-")
            dates.append(dt.date(int(y), int(m), int(d)))
            prices.append(float(parts[1]))

    # convert prices to a numpy array
    #
    prices = np.asarray(prices, dtype=float)

    # exit gracefully
    #
    return dates, prices

def compute_log_returns(prices, eps=DEF_EPS):
    """method: compute_log_returns

    arguments:
     prices: price series (N,)
     eps: stability constant

    return:
     r: log returns (N-1,)

    description:
     r[t] = log(p[t+1] / p[t])
    """

    # prices[:-1] takes all elements except the last:
    #  [p0, p1, p2, ..., p_{N-2}]
    # np.maximum(..., eps) ensures no value is <= 0 by clamping to eps,
    # which prevents divide-by-zero and log(0) problems.
    #
    p0 = np.maximum(prices[:-1], eps)

    # prices[1:] takes all elements except the first:
    #  [p1, p2, p3, ..., p_{N-1}]
    # again clamp to eps for numerical safety (no zeros/negatives).
    #
    p1 = np.maximum(prices[1:], eps)

    # compute the element-wise ratio p[t+1] / p[t] for each t,
    # then take the natural log to get log returns:
    #  r[t] = log(p1[t] / p0[t]) = log(p[t+1] / p[t])
    #
    r = np.log(p1 / p0)

    # exit gracefully
    #
    return r

def logsumexp(a, axis=None):
    """method: logsumexp

    arguments:
     a: array
     axis: axis to reduce (int or None)

    return:
     lse: log(sum(exp(a))) computed stably
    """

    # np.logaddexp.reduce performs stable log-domain summation
    #
    lse = np.logaddexp.reduce(a, axis=axis)

    # exit gracefully
    #
    return lse

def gaussian_logpdf(x, mu, var):
    """method: gaussian_logpdf

    arguments:
     x: observations (T,)
     mu: means (K,)
     var: variances (K,)

    return:
     logB: log emission probs (T,K)

    description:
     1D Gaussian emissions, computed in log domain.
    """

    # reshape x into a column vector of shape (T, 1),
    # where T is the number of time steps/observations.
    # this makes broadcasting work cleanly against (1, K) arrays.
    #
    x = x.reshape(-1, 1)

    # reshape mu into a row vector of shape (1, K),
    # where K is the number of hidden states (one mean per state).
    # now (T,1) - (1,K) broadcasts to (T,K).
    #
    mu = mu.reshape(1, -1)

    # reshape var into a row vector of shape (1, K) (one variance per state),
    # and clamp each variance to be at least DEF_EPS to prevent divide-by-zero
    # and log(0) in the Gaussian log-pdf computation.
    #
    var = np.maximum(var.reshape(1, -1), DEF_EPS)

    # log N(x | mu, var) = -0.5 * (log(2*pi*var) + (x - mu)^2 / var)
    #
    logB = -0.5 * (np.log(2.0 * np.pi * var) + ((x - mu) ** 2) / var)

    # exit gracefully
    #
    return logB
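
# side note (illustrative): a quick spot check of gaussian_logpdf(). for a
# standard normal (mu = 0, var = 1) evaluated at x = 0, the log-density is
# -0.5 * log(2*pi) ~= -0.9189:
#
#  gaussian_logpdf(np.zeros(1), np.zeros(1), np.ones(1))
#  -> array([[-0.91893853]])
#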

def hmm_init_params(x, K):
    """method: hmm_init_params

    arguments:
     x: observations (T,)
     K: number of states

    return:
     pi: initial state distribution (K,)
     A: transition matrix (K,K)
     mu: means (K,)
     var: variances (K,)

    description:
     Simple quantile-based initialization (no external libraries).
    """

    # T is the number of observations (time steps) in the 1D sequence x
    #
    T = x.size

    # create an initial-state probability vector pi of length K filled with 1s,
    # then divide by K so every state is equally likely at time t=0
    #
    pi = np.ones(K, dtype=float) / float(K)

    # create a KxK matrix A initialized to zeros; A will become the transition matrix
    #
    A = np.full((K, K), 0.0, dtype=float)

    # fill the transition matrix:
    #  - high probability (0.90) to stay in the same state (diagonal entries)
    #  - remaining probability (0.10) spread uniformly across transitions to other states
    #
    for i in range(K):
        for j in range(K):
            if i == j:

                # probability of staying in the same state i -> i
                #
                A[i, j] = 0.90
            else:

                # probability of switching from state i to a different state j:
                # distribute 0.10 equally among the (K-1) other states
                #
                A[i, j] = 0.10 / float(K - 1)

    # compute K+1 quantile boundaries of x using evenly spaced probabilities:
    #  np.linspace(0,1,K+1) gives [0, 1/K, 2/K, ..., 1]
    # qs will contain values that split x into K roughly equal-sized groups
    #
    qs = np.quantile(x, np.linspace(0.0, 1.0, K + 1))

    # allocate an integer array of length T to store an initial state label
    # for each sample
    #
    labels = np.zeros(T, dtype=int)

    # assign each observation x[t] to a quantile bin/state k based on the
    # boundaries qs
    #
    for k in range(K):

        # lower boundary for bin k
        #
        lo = qs[k]

        # upper boundary for bin k
        #
        hi = qs[k + 1]

        # for the last bin, include the right endpoint (<= hi) so the maximum
        # values get included
        #
        if k == K - 1:
            idx = (x >= lo) & (x <= hi)
        else:

            # for other bins, use < hi to avoid double-counting boundary points
            #
            idx = (x >= lo) & (x < hi)

        # set label k for all points that fall inside this bin
        #
        labels[idx] = k

    # allocate arrays to hold the per-state Gaussian parameters:
    #  mu[k] = mean for state k
    #  var[k] = variance for state k
    #
    mu = np.zeros(K, dtype=float)
    var = np.zeros(K, dtype=float)

    # compute the overall mean of all observations (used as a fallback if a bin is empty)
    #
    overall_mu = float(np.mean(x))

    # compute the overall variance of all observations with ddof=1 (unbiased estimate),
    # then add DEF_EPS for numerical stability (avoid zero variance)
    #
    overall_var = float(np.var(x, ddof=1)) + DEF_EPS

    # for each state k, estimate the initial mean/variance using the observations
    # assigned to it
    #
    for k in range(K):

        # select the observations whose assigned label is k
        #
        xk = x[labels == k]

        # if we have at least 2 points, compute the mean and variance normally
        #
        if xk.size >= 2:

            # mean of the points in bin k
            #
            mu[k] = float(np.mean(xk))

            # unbiased variance of the points in bin k + stability epsilon
            #
            var[k] = float(np.var(xk, ddof=1)) + DEF_EPS

        # if we have exactly 1 point, the mean is that point, but the variance
        # cannot be estimated, so use the overall variance as a fallback
        #
        elif xk.size == 1:

            # the mean equals the only sample
            #
            mu[k] = float(xk[0])

            # fallback variance
            #
            var[k] = overall_var

        # if a bin has no points (can happen with repeated values / degenerate data),
        # create a reasonable fallback mean near the overall mean and use the
        # overall variance
        #
        else:

            # place the mean slightly offset from overall_mu so different states
            # start distinct; the offset step size is 0.01 * sqrt(overall_var)
            #
            mu[k] = overall_mu + 0.01 * (k - (K - 1) / 2.0) * np.sqrt(overall_var)

            # fallback variance
            #
            var[k] = overall_var

    # exit gracefully
    #
    return pi, A, mu, var

def hmm_forward_backward(x, pi, A, mu, var):
    """method: hmm_forward_backward

    arguments:
     x: observations (T,)
     pi: initial state distribution (K,)
     A: transition matrix (K,K)
     mu: means (K,)
     var: variances (K,)

    return:
     gamma: state posteriors (T,K)
     xi_sum: expected transitions summed over time (K,K)
     log_lik: log-likelihood of x under the model (float)

    description:
     Standard forward-backward in log domain for numerical stability.
    """

    # T is the number of time steps / observations in the sequence
    #
    T = x.size

    # K is the number of hidden states in the HMM
    #
    K = pi.size

    # clamp pi so no entry is 0 (prevents log(0))
    #
    pi = np.maximum(pi, DEF_EPS)

    # renormalize pi so it sums to 1 (valid probability distribution)
    #
    pi = pi / np.sum(pi)

    # clamp A so no transition probability is 0 (prevents log(0))
    #
    A = np.maximum(A, DEF_EPS)

    # renormalize each row of A so it sums to 1:
    #  A[i,:] is a distribution over next states given current state i
    #
    A = A / np.sum(A, axis=1, keepdims=True)

    # convert the initial probabilities into log domain for stability:
    #  log_pi[k] = log(pi[k])
    #
    log_pi = np.log(pi)

    # convert the transitions into log domain:
    #  log_A[i,j] = log(A[i,j])
    #
    log_A = np.log(A)

    # compute the log emission probabilities for every time t and state k:
    #  log_B[t,k] = log p(x[t] | state=k) for a 1D Gaussian(mu[k], var[k])
    # gaussian_logpdf returns a matrix of shape (T, K)
    #
    log_B = gaussian_logpdf(x, mu, var)

    # allocate the forward log-probability table:
    #  log_alpha[t,k] = log p(x[0..t], z_t = k)
    #
    log_alpha = np.zeros((T, K), dtype=float)

    # initialize the forward values at t=0:
    #  log_alpha[0,k] = log(pi[k]) + log p(x[0] | z_0=k)
    #
    log_alpha[0, :] = log_pi + log_B[0, :]

    # forward recursion for t = 1 .. T-1
    #
    for t in range(1, T):

        # for each next state j, compute:
        #  log_alpha[t,j] = log_B[t,j] + logsumexp_i( log_alpha[t-1,i] + log_A[i,j] )
        #
        for j in range(K):

            # compute the vector over i:
            #  a[i] = log_alpha[t-1,i] + log_A[i,j]
            #
            a = log_alpha[t - 1, :] + log_A[:, j]

            # logsumexp(a) computes log(sum_i exp(a[i])) stably
            #
            log_alpha[t, j] = log_B[t, j] + logsumexp(a=a)

    # log-likelihood of the full observation sequence:
    #  log p(x[0..T-1]) = logsumexp_k( log_alpha[T-1,k] )
    #
    log_lik = logsumexp(log_alpha[T - 1, :])

    # allocate the backward log-probability table:
    #  log_beta[t,k] = log p(x[t+1..T-1] | z_t = k)
    #
    log_beta = np.zeros((T, K), dtype=float)

    # base case at the end: after the last observation the probability is 1,
    # and log(1) = 0 for all states
    #
    log_beta[T - 1, :] = 0.0

    # backward recursion for t = T-2 .. 0
    #
    for t in range(T - 2, -1, -1):

        # for each current state i, compute:
        #  log_beta[t,i] = logsumexp_j( log_A[i,j] + log_B[t+1,j] + log_beta[t+1,j] )
        #
        for i in range(K):

            # build the vector over j:
            #  a[j] = log_A[i,j] + log_B[t+1,j] + log_beta[t+1,j]
            #
            a = log_A[i, :] + log_B[t + 1, :] + log_beta[t + 1, :]

            # logsumexp(a) computes log(sum_j exp(a[j])) stably
            #
            log_beta[t, i] = logsumexp(a=a)

    # compute the posterior over states (gamma):
    #  gamma[t,k] = p(z_t=k | x) = exp(log_alpha[t,k] + log_beta[t,k] - log_lik)
    #
    log_gamma = log_alpha + log_beta - log_lik

    # convert from log domain back to the normal probability domain
    #
    gamma = np.exp(log_gamma)

    # xi_sum will hold the expected transition counts summed over all time steps:
    #  xi_sum[i,j] = sum_{t=0}^{T-2} p(z_t=i, z_{t+1}=j | x)
    #
    xi_sum = np.zeros((K, K), dtype=float)

    # compute the expected transitions for each time step t
    #
    for t in range(T - 1):

        # compute log xi for all (i,j) at time t:
        #  log_xi[i,j] = log_alpha[t,i] + log_A[i,j] + log_B[t+1,j]
        #                + log_beta[t+1,j] - log_lik
        # this is the log of the normalized joint posterior for the transition
        # i -> j at time t:
        #  (K,1) adds alpha[t,i]
        #  (K,K) adds the transition log prob
        #  (1,K) adds the emission at t+1 for state j
        #  (1,K) adds beta at t+1 for state j
        # subtracting log p(x) normalizes
        #
        log_xi = (log_alpha[t, :].reshape(K, 1)
                  + log_A
                  + log_B[t + 1, :].reshape(1, K)
                  + log_beta[t + 1, :].reshape(1, K)
                  - log_lik)

        # exponentiate to convert log_xi to xi, then sum into xi_sum
        #
        xi_sum += np.exp(log_xi)

    # exit gracefully
    #
    return gamma, xi_sum, float(log_lik)
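
# the helper below is an illustrative sanity check (the name and tolerance are
# arbitrary; it is not called by main): regardless of the parameter values, the
# posteriors returned by hmm_forward_backward() should satisfy two identities --
# each row of gamma sums to 1, and the entries of xi_sum sum to T-1 (one
# expected transition per adjacent pair of observations).
#
def _check_forward_backward(x, pi, A, mu, var, tol=1.0e-6):

    # run the forward-backward pass
    #
    gamma, xi_sum, _ = hmm_forward_backward(x, pi, A, mu, var)

    # each row of gamma is a distribution over states and should sum to 1
    #
    ok_gamma = bool(np.allclose(np.sum(gamma, axis=1), 1.0, atol=tol))

    # the expected transition counts should sum to T-1
    #
    ok_xi = bool(np.isclose(np.sum(xi_sum), float(x.size - 1), atol=tol * x.size))

    # exit gracefully
    #
    return ok_gamma and ok_xi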

def hmm_em_train(x, K=DEF_NUM_STATES, max_iters=DEF_MAX_ITERS, tol=DEF_TOL):
    """method: hmm_em_train

    arguments:
     x: observations (T,)
     K: number of states
     max_iters: maximum EM iterations
     tol: convergence threshold on log_lik change

    return:
     model: dict with learned parameters

    description:
     Baum-Welch (EM) for a 1D Gaussian HMM.
    """

    # initialize the HMM parameters (pi, A, mu, var) using a simple heuristic
    #
    pi, A, mu, var = hmm_init_params(x, K)

    # prev_ll stores the log-likelihood from the previous iteration
    # (used for the convergence check)
    #
    prev_ll = None

    # last_ll stores the most recent log-likelihood (for reporting in the model)
    #
    last_ll = None

    # print header info so the user sees what is being trained
    #
    print("\nTraining %s (Gaussian emissions)..." % MDL_HMM)
    print("  states: %d" % K)
    print("  iters : %d (tol=%.1e)\n" % (max_iters, tol))

    # EM loop: repeat the E-step and M-step up to max_iters times
    #
    for it in range(1, max_iters + 1):

        # E-step: run forward-backward to compute:
        #  - gamma[t,k]  = p(z_t=k | x)                    (state posteriors)
        #  - xi_sum[i,j] = sum_t p(z_t=i, z_{t+1}=j | x)   (expected transitions)
        #  - log_lik     = log p(x | current parameters)
        #
        gamma, xi_sum, log_lik = hmm_forward_backward(x, pi, A, mu, var)

        # M-step updates begin here
        #
        # update the initial state distribution:
        #  pi_new[k] = p(z_0=k | x) = gamma at time 0
        #
        pi_new = gamma[0, :].copy()

        # update the transition matrix A:
        #  denominator gamma_sum[i] = expected number of times we are in state i
        #  (summing gamma over t=0..T-2 because transitions go to t+1)
        #
        gamma_sum = np.sum(gamma[:-1, :], axis=0) + DEF_EPS

        # A_new[i,j] = expected transitions i->j / expected times in state i
        # reshape(-1,1) turns (K,) into (K,1) so broadcasting divides each row correctly
        #
        A_new = xi_sum / gamma_sum.reshape(-1, 1)

        # update the emission means/variances (Gaussian parameters) using gamma weights
        #
        # gtot[k] = total responsibility mass for state k across all times
        #  (the expected number of samples generated by state k)
        #
        gtot = np.sum(gamma, axis=0) + DEF_EPS

        # weighted mean for each state:
        #  mu_new[k] = sum_t gamma[t,k] * x[t] / sum_t gamma[t,k]
        # gamma.T @ x gives a (K,) vector where element k is sum_t gamma[t,k]*x[t]
        #
        mu_new = (gamma.T @ x) / gtot

        # allocate an array for the new variances
        #
        var_new = np.zeros(K, dtype=float)

        # compute the weighted variance for each state k
        #
        for k in range(K):

            # diff[t] = x[t] - mu_new[k]
            #
            diff = x - mu_new[k]

            # var_new[k] = sum_t gamma[t,k] * (x[t] - mu_k)^2 / sum_t gamma[t,k]
            # add DEF_EPS to keep the variance strictly positive (numerical stability)
            #
            var_new[k] = float((gamma[:, k] @ (diff ** 2)) / gtot[k]) + DEF_EPS

        # normalize the updated parameters to ensure they are valid probabilities
        #
        # clamp pi_new so entries are not zero (prevents log(0))
        #
        pi_new = np.maximum(pi_new, DEF_EPS)

        # renormalize pi_new so it sums to 1
        #
        pi_new = pi_new / np.sum(pi_new)

        # clamp A_new so entries are not zero (prevents log(0))
        #
        A_new = np.maximum(A_new, DEF_EPS)

        # renormalize the rows of A_new so each row sums to 1
        #
        A_new = A_new / np.sum(A_new, axis=1, keepdims=True)

        # compute and print progress information
        #
        # if this is the first iteration, we cannot compute a delta yet
        #
        if prev_ll is None:
            d_ll = 0.0
        else:

            # delta log-likelihood: how much the objective improved this iteration
            #
            d_ll = float(log_lik - prev_ll)

        # print the iteration number, current log-likelihood, and improvement
        #
        print("  iter %03d: log_lik = %.2f  delta = %+10.4f" % (it, log_lik, d_ll))

        # check convergence:
        #  if the improvement is smaller than the tolerance, stop early
        #
        if prev_ll is not None and abs(d_ll) < tol:

            # accept the newly estimated parameters before exiting the loop
            #
            pi, A, mu, var = pi_new, A_new, mu_new, var_new

            # store the final log-likelihood
            #
            last_ll = log_lik
            break

        # otherwise, update the parameters and continue to the next EM iteration
        #
        pi, A, mu, var = pi_new, A_new, mu_new, var_new

        # save the current log-likelihood to compute the delta next time
        #
        prev_ll = log_lik

        # keep last_ll current for final reporting even if we do not break
        #
        last_ll = log_lik

    # pack the learned parameters into a model dictionary
    #
    model = {

        # learned initial distribution pi
        #
        MDL_KEY_PI: pi,

        # learned transition matrix A
        #
        MDL_KEY_A: A,

        # learned Gaussian means per state
        #
        MDL_KEY_MU: mu,

        # store the standard deviation (sigma) instead of the variance for readability:
        #  sigma = sqrt(var)
        #
        MDL_KEY_SIG: np.sqrt(np.maximum(var, DEF_EPS)),

        # store the final log-likelihood value
        #
        MDL_KEY_LL: float(last_ll)
    }

    # exit gracefully
    #
    return model
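
# the function below is an illustrative sanity check of the EM code (it is not
# called by main; the regime means, block length, and seed are arbitrary demo
# choices): it draws a synthetic sequence from two well-separated Gaussian
# regimes and checks that the two learned means land near the true means of
# -1.0 and +1.0.
#
def _demo_em_on_synthetic_data(T=400, seed=0):

    # generate a sequence with two long blocks, one per regime
    #
    rng = np.random.default_rng(seed)
    half = T // 2
    x = np.concatenate([rng.normal(-1.0, 0.3, size=half),
                        rng.normal(+1.0, 0.3, size=T - half)])

    # train a 2-state model and inspect the sorted means
    #
    model = hmm_em_train(x, K=2, max_iters=30, tol=1.0e-4)
    print("recovered means: %s (true means: [-1.0, +1.0])"
          % str(np.sort(model[MDL_KEY_MU])))

    # exit gracefully
    #
    return model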

def hmm_loglik(x, model):
    """method: hmm_loglik

    arguments:
     x: observations (T,)
     model: hmm model dict

    return:
     log_lik: float

    description:
     Compute the log-likelihood via the forward pass
     (uses forward_backward for simplicity).
    """

    # extract the initial state probabilities pi from the model dictionary
    #
    pi = model[MDL_KEY_PI]

    # extract the transition matrix A from the model dictionary
    #
    A = model[MDL_KEY_A]

    # extract the Gaussian means (one mean per state) from the model dictionary
    #
    mu = model[MDL_KEY_MU]

    # extract the Gaussian standard deviations (sigma) from the model dictionary
    #
    sig = model[MDL_KEY_SIG]

    # convert the standard deviation to a variance (var = sigma^2),
    # and clamp to DEF_EPS to avoid zero-variance issues
    #
    var = np.maximum(sig ** 2, DEF_EPS)

    # run forward-backward using the model parameters; we ignore gamma and
    # xi_sum and keep only the returned log-likelihood (ll)
    #
    _, _, ll = hmm_forward_backward(x, pi, A, mu, var)

    # exit gracefully
    #
    return float(ll)
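
# side note (illustrative): hmm_loglik() returns a total log-likelihood, which
# grows in magnitude with the number of observations. when comparing splits of
# different lengths (e.g., the train/eval split used in main), dividing by the
# sequence length gives a per-observation score that is easier to compare:
#
#  ll_per_obs = hmm_loglik(r_ev, model) / float(r_ev.size)
#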

def hmm_viterbi(x, model):
    """method: hmm_viterbi

    arguments:
     x: observations (T,)
     model: hmm model dict

    return:
     states: most likely state sequence (T,) int

    description:
     Viterbi decoding in log domain.
    """

    # extract pi and clamp to avoid zeros (prevents log(0))
    #
    pi = np.maximum(model[MDL_KEY_PI], DEF_EPS)

    # extract A and clamp to avoid zeros (prevents log(0))
    #
    A = np.maximum(model[MDL_KEY_A], DEF_EPS)

    # extract the Gaussian means
    #
    mu = model[MDL_KEY_MU]

    # extract sigma and clamp to at least sqrt(DEF_EPS) for stability
    #
    sig = np.maximum(model[MDL_KEY_SIG], np.sqrt(DEF_EPS))

    # convert sigma to a variance and clamp
    #
    var = np.maximum(sig ** 2, DEF_EPS)

    # renormalize pi so it sums to 1
    #
    pi = pi / np.sum(pi)

    # renormalize each row of A so it sums to 1
    #
    A = A / np.sum(A, axis=1, keepdims=True)

    # move the initial probabilities to log domain for stable Viterbi computations
    #
    log_pi = np.log(pi)

    # move the transitions to log domain
    #
    log_A = np.log(A)

    # compute the log emission probabilities:
    #  log_B[t,k] = log p(x[t] | state=k)
    #
    log_B = gaussian_logpdf(x, mu, var)

    # get the sequence length T and number of states K from the shape of log_B
    #
    T, K = log_B.shape

    # delta[t,k] will store the best (max) log-probability of any path
    # that ends in state k at time t
    #
    delta = np.zeros((T, K), dtype=float)

    # psi[t,k] will store the argmax (best previous state index)
    # used to reconstruct the best path
    #
    psi = np.zeros((T, K), dtype=int)

    # initialization at t=0:
    #  the best path to state k starts in k with probability pi[k]
    #  and emits x[0] from state k
    #
    delta[0, :] = log_pi + log_B[0, :]

    # psi at t=0 is unused; set it to 0 for completeness
    #
    psi[0, :] = 0

    # dynamic programming recursion for t = 1 .. T-1
    #
    for t in range(1, T):

        # compute delta[t,j] for each current state j
        #
        for j in range(K):

            # scores[i] = best log prob of reaching previous state i at t-1
            #             plus the log transition prob from i -> j
            #
            scores = delta[t - 1, :] + log_A[:, j]

            # psi[t,j] stores which previous state i gives the best score
            #
            psi[t, j] = int(np.argmax(scores))

            # delta[t,j] is the best previous score plus the log emission
            # at time t in state j
            #
            delta[t, j] = float(np.max(scores)) + log_B[t, j]

    # allocate an array for the decoded best state sequence
    #
    states = np.zeros(T, dtype=int)

    # termination: the best final state is the argmax over delta[T-1,k]
    #
    states[T - 1] = int(np.argmax(delta[T - 1, :]))

    # backtrace: recover the best states backward using the psi pointers
    #
    for t in range(T - 2, -1, -1):

        # the state at time t is the best previous state leading to the
        # state at time t+1
        #
        states[t] = psi[t + 1, states[t + 1]]

    # exit gracefully
    #
    return states
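
# side note (illustrative): Viterbi returns the single most likely state path.
# an alternative is posterior ("gamma") decoding, which picks the most likely
# state at each time step independently. a minimal sketch using the existing
# forward-backward routine (not used by main):
#
#  gamma, _, _ = hmm_forward_backward(x, model[MDL_KEY_PI], model[MDL_KEY_A],
#                                     model[MDL_KEY_MU],
#                                     np.maximum(model[MDL_KEY_SIG] ** 2, DEF_EPS))
#  states_map = np.argmax(gamma, axis=1)
#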

def reorder_states_by_mean(model, states):
    """method: reorder_states_by_mean

    arguments:
     model: hmm model dict
     states: decoded states (T,)

    return:
     model2: reordered model
     states2: remapped state sequence

    description:
     Reorder states so that mu is ascending (makes plots/prints easier to read).
    """

    # pull out the per-state means from the model
    #
    mu = model[MDL_KEY_MU]

    # order gives the indices of states sorted by mean (ascending):
    #  e.g., order = [2,0,1] means old state 2 becomes new state 0, etc.
    #
    order = np.argsort(mu)

    # inv will map old_state -> new_state
    #
    inv = np.zeros_like(order)

    # fill the inverse mapping:
    #  for each new index new_k, find the old state old_k = order[new_k],
    #  then set inv[old_k] = new_k
    #
    for new_k, old_k in enumerate(order):
        inv[old_k] = new_k

    # reorder pi according to the new state ordering
    #
    pi2 = model[MDL_KEY_PI][order]

    # reorder A by applying the ordering to both rows and columns:
    #  first reorder the rows, then reorder the columns
    #
    A2 = model[MDL_KEY_A][order, :][:, order]

    # reorder the means
    #
    mu2 = model[MDL_KEY_MU][order]

    # reorder the sigmas
    #
    sig2 = model[MDL_KEY_SIG][order]

    # remap the decoded state sequence using inv (old -> new)
    #
    states2 = np.asarray([inv[s] for s in states], dtype=int)

    # pack a new model dictionary with the reordered parameters
    #
    model2 = {

        # renormalize pi2 to sum to 1
        #
        MDL_KEY_PI: pi2 / np.sum(pi2),

        # renormalize the rows of A2 to sum to 1
        #
        MDL_KEY_A: A2 / np.sum(A2, axis=1, keepdims=True),

        # store the reordered means
        #
        MDL_KEY_MU: mu2,

        # store the reordered sigmas
        #
        MDL_KEY_SIG: sig2,

        # keep the same final log-likelihood value
        #
        MDL_KEY_LL: float(model[MDL_KEY_LL])
    }

    # exit gracefully
    #
    return model2, states2

def plot_hmm(dates, prices, r_dates, r, states, model, outfile):
    """method: plot_hmm

    arguments:
     dates: date list for prices (N,)
     prices: price series (N,)
     r_dates: date list aligned to returns (N-1,)
     r: returns series (N-1,)
     states: decoded states for returns (N-1,)
     model: hmm model dict
     outfile: output filename

    return:
     status: True on success

    description:
     Plot (1) the price time series colored by inferred regime
          (2) the returns time series colored by inferred regime
    """

    # use simple integer time indices on the x-axis
    # (dates and r_dates are passed in for reference but are not plotted directly)
    #
    x_price = np.arange(len(prices), dtype=float)
    x_ret = np.arange(len(r), dtype=float)

    # number of states (for the color scale and legend)
    #
    K = model[MDL_KEY_PI].size

    # create a two-panel figure with a shared x-axis
    #
    fig, axes = plt.subplots(2, 1, figsize=(14, 7), sharex=True)

    # top: price (points colored by state, aligned to r_dates -> prices[1:])
    #
    axes[0].plot(x_price, prices, linewidth=0.8)
    axes[0].scatter(x_price[1:], prices[1:], c=states, s=6, vmin=0, vmax=K - 1)
    axes[0].set_title("Gold Price (USD) with HMM Regimes")
    axes[0].set_ylabel("Price (USD)")
    axes[0].grid(True, alpha=0.25)

    # bottom: returns
    #
    axes[1].plot(x_ret, r, linewidth=0.6)
    axes[1].scatter(x_ret, r, c=states, s=6, vmin=0, vmax=K - 1)
    axes[1].set_title("Log Returns with HMM Regimes")
    axes[1].set_ylabel("log return")
    axes[1].set_xlabel("Time index (daily)")
    axes[1].grid(True, alpha=0.25)

    # add a state legend (proxy artists)
    #
    handles = []
    labels = []
    for k in range(K):
        h = axes[1].scatter([np.nan], [np.nan], c=[k], s=25, vmin=0, vmax=K - 1)
        handles.append(h)
        labels.append("State %d" % k)
    axes[1].legend(handles, labels, loc="upper right", frameon=True)

    # save the figure to disk
    #
    plt.tight_layout()
    plt.savefig(outfile, dpi=200, bbox_inches="tight")
    print("\nVisualization saved to: %s" % outfile)

    # exit gracefully
    #
    return True
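
# side note (illustrative): plot_hmm() uses integer time indices on the x-axis.
# if calendar dates are preferred, matplotlib can plot datetime.date values
# directly; a minimal sketch of the change inside plot_hmm (not used here):
#
#  axes[0].plot(dates, prices, linewidth=0.8)
#  fig.autofmt_xdate()
#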

# function: main
#
def main():
    """method: main

    arguments:
     none

    return:
     status: True on success (bool)

    description:
     (1) Download gold_price_usd.csv.
     (2) Load prices; compute log returns.
     (3) Train a Gaussian HMM using Baum-Welch.
     (4) Decode regimes using Viterbi.
     (5) Plot price/returns colored by regime.
    """

    print("Running: %s" % __FILE__)

    # download the data
    #
    data_path = os.path.join(DEF_DATA_DIR, DEF_DATA_FILE)
    if not download_file(DEF_DATA_URL, data_path):
        return False

    # load the data
    #
    dates, prices = load_gold_csv(data_path)
    if len(prices) < 10:
        print("Error: not enough data points.")
        return False

    # compute the returns
    #
    r = compute_log_returns(prices, eps=DEF_EPS)
    r_dates = dates[1:]

    # time-ordered train/eval split
    #
    T = r.size
    Ttr = int(np.floor(DEF_TRAIN_FRAC * T))
    Ttr = max(10, min(T - 5, Ttr))
    r_tr = r[:Ttr]
    r_ev = r[Ttr:]

    # report what was loaded
    #
    print("\nLoaded:")
    print("  prices : N=%d (%s -> %s)" % (len(prices), str(dates[0]), str(dates[-1])))
    print("  returns: T=%d (%s -> %s)" % (T, str(r_dates[0]), str(r_dates[-1])))
    print("  train  : T=%d" % r_tr.size)
    print("  eval   : T=%d" % r_ev.size)

    # train the HMM on the training returns
    #
    model = hmm_em_train(r_tr, K=DEF_NUM_STATES, max_iters=DEF_MAX_ITERS,
                         tol=DEF_TOL)

    # evaluate the log-likelihood on the train/eval splits
    #
    ll_tr = hmm_loglik(r_tr, model)
    ll_ev = hmm_loglik(r_ev, model)

    # decode the states on the full series (using the trained model)
    #
    states_full = hmm_viterbi(r, model)

    # reorder the states by mean return for readability
    #
    model, states_full = reorder_states_by_mean(model, states_full)

    # print a model summary (simple, consistent formatting)
    #
    K = model[MDL_KEY_PI].size
    pi = model[MDL_KEY_PI]
    A = model[MDL_KEY_A]
    mu = model[MDL_KEY_MU]
    sig = model[MDL_KEY_SIG]

    print("\n%s:" % MDL_HMM)
    print("  num states: %d" % K)
    print("  train log_lik: %.2f" % ll_tr)
    print("  eval  log_lik: %.2f" % ll_ev)

    print("\n pi:")
    for k in range(K):
        print("  state %d: %.2f" % (k, pi[k]))

    print("\n A (rows sum to 1):")
    for i in range(K):
        row = " ".join(["%.4f" % A[i, j] for j in range(K)])
        print("  row %d: %s" % (i, row))

    print("\n emissions (Gaussian on returns):")
    for k in range(K):
        print("  state %d: mu = %10.4f  sigma = %10.4f" % (k, mu[k], sig[k]))

    print(f"\n{'-' * 40}")

    # state statistics on the full decoded path
    #
    print("\nState Information:")
    for k in range(K):
        idx = (states_full == k)
        cnt = int(np.sum(idx))
        if cnt > 1:
            m = float(np.mean(r[idx]))
            s = float(np.std(r[idx], ddof=1))
        elif cnt == 1:
            m = float(r[idx][0])
            s = 0.0
        else:
            m = 0.0
            s = 0.0
        print("  state %d: count = %4d  mean = %.4f  std = %.4f" % (k, cnt, m, s))

    print(f"\n{'-' * 40}")

    # plot the results
    #
    plot_hmm(dates, prices, r_dates, r, states_full, model, DEF_OUT_PLOT)

    # exit gracefully
    #
    return True

if __name__ == '__main__':
    main()