#!/usr/bin/env python3

import sys
import struct
import subprocess
import numpy as np
import nedc_image_tools
import multiprocessing as mp
import time

# Processing constants
TILE_SIZE = 2048
WINDOW_SIZE = 256
STRIDE_SIZE = 128
HISTOGRAM_BINS = 16

"Detect number of available CUDA GPUs"
def detect_gpu_count():
    try:
        result = subprocess.run(
            ['nvidia-smi', '--query-gpu=count', '--format=csv,noheader'],
            capture_output=True, 
            text=True,
            timeout=5
        )
        return len(result.stdout.strip().split('\n'))
    except:
        return 1

"Process a single image on specified GPU using compiled C++ worker"
def process_image_on_gpu(image_file, gpu_id):
    try:
        # Launch worker subprocess for this GPU
        worker = subprocess.Popen(
            ['./myaverager', str(gpu_id)],
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE
        )
        
        # Tell worker we're sending 1 image
        worker.stdin.write(struct.pack('i', 1))
        worker.stdin.flush()
        
        # Open image and get dimensions
        reader = nedc_image_tools.Nil()
        reader.open(image_file)
        width, height = reader.get_dimension()
        
        # Send image dimensions
        worker.stdin.write(struct.pack('ii', width, height))
        worker.stdin.flush()
        
        window_count = 0
        
        # Process image in overlapping tiles
        tile_stride = TILE_SIZE - WINDOW_SIZE + STRIDE_SIZE
        
        for tile_y in range(0, height - WINDOW_SIZE + 1, tile_stride):
            for tile_x in range(0, width - WINDOW_SIZE + 1, tile_stride):
                # Calculate tile dimensions
                tile_w = min(TILE_SIZE, width - tile_x)
                tile_h = min(TILE_SIZE, height - tile_y)
                
                # Skip tiles too small for even one window
                if tile_w < WINDOW_SIZE or tile_h < WINDOW_SIZE:
                    continue
                
                # Read tile from SVS file
                tile = reader.read_data(
                    coordinate=(tile_x, tile_y),
                    npixx=tile_w,
                    npixy=tile_h,
                    color_mode="RGB"
                )
                
                # Ensure contiguous memory layout for efficient transfer
                tile = np.ascontiguousarray(tile, dtype=np.uint8)
                
                # Send tile dimensions and data
                worker.stdin.write(struct.pack('ii', tile_w, tile_h))
                worker.stdin.write(tile.tobytes())
                worker.stdin.flush()
                
                # Count windows in this tile
                num_windows_x = (tile_w - WINDOW_SIZE) // STRIDE_SIZE + 1
                num_windows_y = (tile_h - WINDOW_SIZE) // STRIDE_SIZE + 1
                window_count += num_windows_x * num_windows_y
        
        # Send terminator (0, 0)
        worker.stdin.write(struct.pack('ii', 0, 0))
        worker.stdin.flush()
        
        # Close reader and input pipe
        reader.close()
        worker.stdin.close()
        
        # Read histogram result
        histogram_bytes = worker.stdout.read(HISTOGRAM_BINS * 8)
        if len(histogram_bytes) != HISTOGRAM_BINS * 8:
            raise RuntimeError(f"Expected {HISTOGRAM_BINS * 8} bytes, got {len(histogram_bytes)}")
        
        histogram = np.frombuffer(histogram_bytes, dtype=np.uint64)
        
        # Wait for worker to complete
        worker.wait()
        
        return histogram, window_count
        
    except Exception as e:
        sys.stderr.write(f"Error processing {image_file}: {e}\n")
        return np.zeros(HISTOGRAM_BINS, dtype=np.uint64), 0

"Wrapper function for multiprocessing pool"
def worker_wrapper(args):
    image_file, gpu_id = args
    return process_image_on_gpu(image_file, gpu_id)

"Calculate mean and standard deviation from histogram"
def calculate_statistics(histogram, num_bins=HISTOGRAM_BINS):
    total_count = np.sum(histogram)
    
    if total_count == 0:
        return 0.0, 0.0
    
    # Bin parameters (from FFT magnitude range)
    max_magnitude = 20.0  # 256 * 256 * 256
    bin_width = max_magnitude / num_bins
    bin_centers = np.arange(num_bins) * bin_width + bin_width / 2
    
    # Calculate mean
    mean = np.sum(histogram * bin_centers) / total_count
    
    # Calculate variance and standard deviation
    variance = np.sum(histogram * (bin_centers - mean) ** 2) / total_count
    stdev = np.sqrt(variance)
    
    return mean, stdev

def main():
    if len(sys.argv) < 2:
        sys.stderr.write("Usage: python3 myaverager.py images.list\n")
        sys.exit(1)
    
    files_list = sys.argv[1]
    
    # Detect GPUs
    num_gpus = detect_gpu_count()
    
    # Read image file list
    with open(files_list, 'r') as f:
        image_files = [line.strip() for line in f if line.strip()]
    
    if not image_files:
        sys.stderr.write("Error: No image files found in list\n")
        sys.exit(1)
    
    print(f"Processing {len(image_files)} images using {num_gpus} GPU(s)\n")
    
    start_time = time.time()
    
    # Distribute images across GPUs (round-robin)
    tasks = [(image_file, i % num_gpus) for i, image_file in enumerate(image_files)]
    
    # Process images in parallel (one process per GPU)
    with mp.Pool(processes=num_gpus) as pool:
        results = pool.map(worker_wrapper, tasks)
    
    # Aggregate results from all workers
    global_histogram = np.zeros(HISTOGRAM_BINS, dtype=np.uint64)
    total_windows = 0
    
    for histogram, window_count in results:
        global_histogram += histogram
        total_windows += window_count
    
    elapsed_time = time.time() - start_time
    
    # Calculate statistics
    mean, stdev = calculate_statistics(global_histogram)
    
    # Display results
    print("\n" + "=" * 60)
    print("RESULTS")
    print("=" * 60)
    print(f"Images processed:      {len(image_files)}")
    print(f"Total windows:         {total_windows:,}")
    print(f"Average magnitude:     {mean:.6f}")
    print(f"Standard deviation:    {stdev:.6f}")
    print(f"\nTime spent:            {elapsed_time:.2f} seconds")
    print(f"Throughput:            {len(image_files)/elapsed_time:.2f} images/second")
    print(f"                       {total_windows/elapsed_time:.1f} windows/second")
    print(f"Per-image average:     {elapsed_time/len(image_files)*1000:.2f} ms")
    print("=" * 60)

if __name__ == "__main__":
    main()