#!/usr/bin/env python3
# -*- coding: utf-8 -*-


import sys
from pathlib import Path

import numpy as np
import torch
import torch.nn.functional as F
import csv

from wsi_stream import stream_windows


WINDOW_SIZE = (256, 256)
FRAME_SIZE  = (128, 128)     # stride  128x128
BATCH       = 256          
NUM_BINS    = 16
HIST_RANGE  = (0.0, 10.0)    #  histogram range

# full run → None
MAX_WINDOWS = None


# ---------- GPU  ----------

def get_device() -> torch.device:
    
    if torch.cuda.is_available():
        n_dev = torch.cuda.device_count()
        print(f"[INFO] cuda device count: {n_dev}")
        dev = torch.device("cuda") 
        print(f"[INFO] using GPU device: {dev}, name: {torch.cuda.get_device_name(dev)}")
    else:
        dev = torch.device("cpu")
        print("[WARN] CUDA not available – falling back to CPU. "
              "This is OK for debugging but NOT for the final timed GPU run.")
    return dev


# ---------- Gaussian kernel ----------

def gaussian_kernel_np(size: int = 7, sigma: float = 1.5) -> np.ndarray:

    assert size % 2 == 1, "Gaussian  size must be odd"
    ax = np.arange(-(size // 2), size // 2 + 1)
    xx, yy = np.meshgrid(ax, ax)
    kernel = np.exp(-(xx**2 + yy**2) / (2.0 * sigma**2))
    kernel /= np.sum(kernel)
    return kernel.astype(np.float32)


def make_gaussian_kernel_torch(size: int, sigma: float, device: torch.device) -> torch.Tensor:

    k_np = gaussian_kernel_np(size=size, sigma=sigma)
    k_t = torch.from_numpy(k_np).to(device=device, dtype=torch.float32)
    k_t = k_t.unsqueeze(0).unsqueeze(0)  # [1, 1, kH, kW]
    return k_t


# ---------- one image GPU （include GPU grayscale） ----------

def process_one_image_gpu(image_path: str,
                          gkernel: torch.Tensor,
                          device: torch.device) -> np.ndarray:
    """
      - stream_windows read windows (list of np.ndarray)
      - CPU：np.stack → (N, H, W, C) or (N, C, H, W)
      - GPU：RGB → gray + conv2d + fft2 + log + histogram
      - every batch  histogram accumulate 
      -  normalize become sum=1，return numpy array
    """
    hist_img = torch.zeros(NUM_BINS, device=device, dtype=torch.float64)
    total_windows = 0

    for windows in stream_windows(
        image_file=image_path,
        window_size=WINDOW_SIZE,
        frame_size=FRAME_SIZE,
        batch=BATCH,
        color_mode="RGB"
    ):
        if len(windows) == 0:
            continue

        # max window （for debug ）
        if MAX_WINDOWS is not None:
            remaining = MAX_WINDOWS - total_windows
            if remaining <= 0:
                break
            if len(windows) > remaining:
                windows = windows[:remaining]

        # ---------- 1) CPU  windows stack  ----------
        # windows may be (H,W,3) or (3,H,W)
        batch_np = np.stack(windows, axis=0)  # shape ~ (N, H, W, 3) or (N, 3, H, W) or (N, H, W)

        # ---------- 2) put it in GPU ----------
        batch_t = torch.from_numpy(batch_np).to(device=device, dtype=torch.float32)

        # ---------- 3)on  GPU use glayscale ----------
        # (N, C, H, W)
        if batch_t.ndim == 4:
            # (N, H, W, 3) → (N, 3, H, W)
            if batch_t.shape[-1] == 3:
                batch_t = batch_t.permute(0, 3, 1, 2)
            # (N, 3, H, W) → aleady is  CHW，do not move
            elif batch_t.shape[1] == 3:
                pass
            # (N, H, W, 1) or (N, 1, H, W) → already grayscale
            elif batch_t.shape[-1] == 1:
                batch_t = batch_t.permute(0, 3, 1, 2)  # (N,1,H,W)
            elif batch_t.shape[1] == 1:
                pass
            else:
                raise ValueError(f"Unexpected 4D batch shape: {batch_t.shape}")

            # if is  3 channel：RGB → gray
            if batch_t.shape[1] == 3:
                r = batch_t[:, 0, :, :]
                g = batch_t[:, 1, :, :]
                b = batch_t[:, 2, :, :]  
                gray_t = 0.2989 * r + 0.5870 * g + 0.1140 * b  # (N, H, W)
            elif batch_t.shape[1] == 1:
                gray_t = batch_t[:, 0, :, :]  # (N, H, W)
            else:
                raise ValueError(f"Unexpected channel count in batch_t: {batch_t.shape}")
        elif batch_t.ndim == 3:
            # (N, H, W) → already grayscale
            gray_t = batch_t
        else:
            raise ValueError(f"Unexpected batch ndim: {batch_t.ndim}, shape={batch_t.shape}")

        #  channel  conv2d → (N, 1, H, W)
        batch_t_gray = gray_t.unsqueeze(1)

        with torch.no_grad():
            # ---------- 4) Gaussian blur: conv2d with padding="same" ----------
            kH, kW = gkernel.shape[-2], gkernel.shape[-1]
            pad_h, pad_w = kH // 2, kW // 2
            blur = F.conv2d(batch_t_gray, gkernel, padding=(pad_h, pad_w))  # (N, 1, H, W)

            # ---------- 5) 2D FFT → magnitude → log ----------
            blur_2d = blur.squeeze(1)                  # (N, H, W)
            fft2 = torch.fft.fft2(blur_2d)             # (N, H, W), complex
            mag = torch.abs(fft2)                      # (N, H, W)
            log_mag = torch.log(mag + 1e-8)            # (N, H, W)

            # ---------- 6)  histogram ----------
            vals = log_mag.reshape(-1)                 # (N*H*W,)
            hist_batch = torch.histc(
                vals,
                bins=NUM_BINS,
                min=HIST_RANGE[0],
                max=HIST_RANGE[1]
            ).to(torch.float64)

            hist_img += hist_batch
            total_windows += batch_np.shape[0]

        # if MAX_WINDOWS is None and total_windows % 10000 == 0:
        #     print(f"    [DEBUG][GPU] {image_path} windows processed: {total_windows}",
        #           flush=True)

        if MAX_WINDOWS is not None and total_windows >= MAX_WINDOWS:
            break

    if total_windows == 0:
        return np.zeros(NUM_BINS, dtype=np.float64)

    # ---------- 7) Normalize become probability distribution (sum = 1) ----------
    total_count = torch.sum(hist_img)
    if total_count > 0:
        hist_img = hist_img / total_count

    # return to CPU / numpy for later mean & std calculation
    return hist_img.detach().cpu().numpy()


def main():
    if len(sys.argv) != 2:
        print("Usage: myaverager_gpu.py files.list")
        sys.exit(1)

    list_file = Path(sys.argv[1])
    if not list_file.exists():
        print(f"[ERROR] cannot find list: {list_file}")
        sys.exit(2)

    device = get_device()
    import os
    print("CPU logical cores:", os.cpu_count())
    print("Torch intra-op threads:", torch.get_num_threads())
    print("Torch inter-op threads:", torch.get_num_interop_threads())
    print("===================================") 
    # on GPU build Gaussian kernel
    gkernel = make_gaussian_kernel_torch(size=7, sigma=1.5, device=device)

    hist_sums    = np.zeros(NUM_BINS, dtype=np.float64)
    hist_sq_sums = np.zeros(NUM_BINS, dtype=np.float64)
    n_images     = 0

    with list_file.open("r") as f:
        for line in f:
            image_path = line.strip()
            if not image_path:
                continue

            n_images += 1
            print(f"[INFO] processing image {n_images}: {image_path}", flush=True)

            hist_img = process_one_image_gpu(image_path, gkernel, device)

            hist_sums    += hist_img
            hist_sq_sums += hist_img ** 2

    if n_images == 0:
        print("[ERROR] no images found in list file")
        sys.exit(3)

    # mean & std
    mean_hist = hist_sums / n_images
    var_hist = hist_sq_sums / n_images - mean_hist ** 2
    var_hist = np.maximum(var_hist, 0.0)
    std_hist = np.sqrt(var_hist)

    np.set_printoptions(precision=6, suppress=True)
    print("\n==== Mean Histogram ({} bins) ====".format(NUM_BINS))
    print(mean_hist)
    print("\n==== Std Histogram ({} bins) ====".format(NUM_BINS))
    print(std_hist)

    # -----  CSV -----
    csv_path = "results_gpu.csv"
    with open(csv_path, "w", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(["bin", "mean", "std"])
        for i in range(NUM_BINS):
            writer.writerow([i, mean_hist[i], std_hist[i]])

    print(f"\n[INFO] Results saved to {csv_path}")


if __name__ == "__main__":
    main()