#include <stdlib.h>
#include <stdio.h>
#include <cuda_runtime.h>
#include <math.h>

// Capacity of the constant-memory filter array below; create_gaussian_filter()
// rejects sizes larger than this.
#define MAX_FILTER_SIZE 7
// Number of bins in the global log-magnitude histogram (g_d_hist).
#define HISTOGRAM_BINS 16
// Lower and upper edge of the histogram range, in log10(magnitude) units.
// Values outside [LOG_MIN, LOG_MAX) are clamped into the first/last bin by
// logmag_merge_hist.
#define LOG_MIN -5
#define LOG_MAX 10
// Small offset added to the magnitude before log10f is taken, so the logarithm
// is never evaluated at exactly zero.
#define EXTRA_MAG 1e-12f


// 1-D filter weights: written by create_gaussian_filter(), read by both
// gaussian_horizontal and gaussian_vertical.
__constant__ float gaussian_filter[MAX_FILTER_SIZE];
// Histogram incremented with atomicAdd by logmag_merge_hist; it accumulates
// across kernel launches until it is explicitly zeroed.
__device__ unsigned int g_d_hist[HISTOGRAM_BINS];
// Outputs of compute_hist_mean_std_kernel, read back by get_histogram_mean_std().
__device__ float g_d_mean = 0.0f;
__device__ float g_d_std  = 0.0f;

// #################################

// Returns the number of CUDA devices reported by the runtime.
// Returns 0 when the query itself fails (for example no driver or no device);
// the pending runtime error is cleared so it is not misattributed to a later
// CUDA call.
extern "C" int get_gpu_count(){
    int count = 0;
    const cudaError_t err = cudaGetDeviceCount(&count);
    if (err != cudaSuccess) {
        (void)cudaGetLastError();  // reset the recorded error state
        return 0;
    }
    return count;
}

// Returns the total global memory, in bytes, of the device `device_id`.
// Returns 0 and prints an error if the properties query fails (e.g. invalid
// device id); previously an uninitialised struct field was returned in that case.
extern "C" size_t get_gpu_memory(int device_id){
    cudaDeviceProp prop;
    const cudaError_t err = cudaGetDeviceProperties(&prop, device_id);
    if (err != cudaSuccess) {
        printf("ERROR: cudaGetDeviceProperties failed for device %d: %s\n",
               device_id, cudaGetErrorString(err));
        (void)cudaGetLastError();  // reset the recorded error state
        return 0;
    }
    return prop.totalGlobalMem;
}

// Makes `device_id` the current CUDA device for the calling host thread.
// The function has no return value, so a failure (e.g. invalid device id) is
// reported on stdout in the same "ERROR: ..." style used elsewhere in this file.
extern "C" void set_device(int device_id){
    const cudaError_t err = cudaSetDevice(device_id);
    if (err != cudaSuccess) {
        printf("ERROR: cudaSetDevice(%d) failed: %s\n", device_id, cudaGetErrorString(err));
        (void)cudaGetLastError();  // reset the recorded error state
    }
}

// #################################

// Horizontal pass of a separable filter over N interleaved 3-channel images.
//
// Launch layout: one thread per pixel; x/y come from the 2-D block/grid
// indices and blockIdx.z selects the image. Threads outside W x H x N exit.
//
// img_out / img_in : N images of H*W*3 floats each, channels interleaved per
//                    pixel (indexing is (y*W + x)*3 + channel).
// K                : number of taps read from gaussian_filter; values above
//                    MAX_FILTER_SIZE are clamped so the constant array is never
//                    read out of bounds.
//
// Taps that fall outside the image row are skipped (no renormalisation), so
// border pixels receive a partial weighted sum.
__global__ void gaussian_horizontal(float* img_out, const float* img_in, int N, int H, int W, int K){
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    int n = blockIdx.z;
    if(x >= W || y >= H || n >= N) return;

    // gaussian_filter only holds MAX_FILTER_SIZE weights.
    if (K > MAX_FILTER_SIZE) K = MAX_FILTER_SIZE;

    // 64-bit offsets: n*H*W*3 overflows a 32-bit int for large batches/images.
    const size_t row_stride = (size_t)W * 3;
    const size_t row_off    = ((size_t)n * H + (size_t)y) * row_stride;
    const float* in_row     = img_in + row_off;
    float* out_px           = img_out + row_off + (size_t)x * 3;

    const int K2 = K/2;

    float r=0.0f, g=0.0f, b=0.0f;
    for(int k=0; k<K; k++){
        const int xx = x + k - K2;
        if(xx>=0 && xx<W){
            const float* px = in_row + (size_t)xx * 3;
            const float w = gaussian_filter[k];
            r += px[0]*w;
            g += px[1]*w;
            b += px[2]*w;
        }
    }
    out_px[0] = r;
    out_px[1] = g;
    out_px[2] = b;
}

// Vertical pass of a separable filter over N interleaved 3-channel images.
//
// Launch layout: one thread per pixel; x/y come from the 2-D block/grid
// indices and blockIdx.z selects the image. Threads outside W x H x N exit.
//
// img_out / img_in : N images of H*W*3 floats each, channels interleaved per
//                    pixel (indexing is (y*W + x)*3 + channel).
// K                : number of taps read from gaussian_filter; values above
//                    MAX_FILTER_SIZE are clamped so the constant array is never
//                    read out of bounds.
//
// Taps that fall outside the image column are skipped (no renormalisation),
// so border pixels receive a partial weighted sum.
__global__ void gaussian_vertical(float* img_out, const float* img_in, int N, int H, int W, int K){
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    int n = blockIdx.z;
    if(x >= W || y >= H || n >= N) return;

    // gaussian_filter only holds MAX_FILTER_SIZE weights.
    if (K > MAX_FILTER_SIZE) K = MAX_FILTER_SIZE;

    // 64-bit offsets: n*H*W*3 overflows a 32-bit int for large batches/images.
    const size_t img_off = (size_t)n * H * W * 3;
    const float* in_n    = img_in + img_off;
    float* out_px        = img_out + img_off + ((size_t)y * W + (size_t)x) * 3;

    const int K2 = K/2;

    float r=0.0f, g=0.0f, b=0.0f;
    for(int k=0; k<K; k++){
        const int yy = y + k - K2;
        if(yy>=0 && yy<H){
            const float* px = in_n + ((size_t)yy * W + (size_t)x) * 3;
            const float w = gaussian_filter[k];
            r += px[0]*w;
            g += px[1]*w;
            b += px[2]*w;
        }
    }
    out_px[0] = r;
    out_px[1] = g;
    out_px[2] = b;
}

// Adds the log10 magnitude of every complex sample to the global histogram.
//
// Launch layout: one thread per sample; x/y come from the 2-D block/grid
// indices and blockIdx.z selects the image. Threads outside W x H x N exit.
//
// fft : N*H*W complex values (float2: .x real part, .y imaginary part),
//       indexed as n*H*W + y*W + x.
//
// Each sample's log10(|v| + EXTRA_MAG) is mapped linearly from
// [LOG_MIN, LOG_MAX) onto HISTOGRAM_BINS bins; out-of-range values are clamped
// into the first or last bin. g_d_hist is only ever incremented here, so
// results accumulate across launches until the histogram is zeroed.
__global__ void logmag_merge_hist(const float2* fft, int N, int H, int W){
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    int n = blockIdx.z;
    if(x >= W || y >= H || n >= N) return;

    // 64-bit flat index: n*H*W overflows a 32-bit int for large inputs.
    const size_t idx = ((size_t)n * H + (size_t)y) * W + (size_t)x;
    const float2 v = fft[idx];
    const float real = v.x;
    const float imag = v.y;
    // EXTRA_MAG keeps the argument of log10f strictly positive.
    const float mag = sqrtf(real * real + imag * imag) + EXTRA_MAG;

    const float logv = log10f(mag);

    // Normalise to [0, 1) across the histogram range, then pick the bin.
    const float t = (logv - LOG_MIN) / (LOG_MAX - LOG_MIN);
    int bin = (int)floorf(t * HISTOGRAM_BINS);
    if (bin < 0) bin = 0;
    if (bin >= HISTOGRAM_BINS) bin = HISTOGRAM_BINS - 1;

    atomicAdd(&g_d_hist[bin], 1u);
}

// #################################

// Builds a normalised 1-D Gaussian of `size` taps and uploads it to the
// constant-memory array gaussian_filter (unused trailing entries are zero).
//
// size  : number of taps; must be odd, >= 1 and <= MAX_FILTER_SIZE.
// sigma : standard deviation in pixels. Only sigma*sigma is used, so the sign
//         is irrelevant, but a zero/NaN sigma (or one so small that
//         2*sigma^2 underflows to zero) is rejected because it would turn
//         every weight into NaN.
//
// On invalid arguments or a failed upload an "ERROR: ..." line is printed and
// the previously uploaded filter is left untouched.
extern "C" void create_gaussian_filter(int size=7, float sigma=1.0f){

    if (size > MAX_FILTER_SIZE) {
        printf("ERROR: Gaussian Filter Size is Too Large (Max %d, Got %d)\n", MAX_FILTER_SIZE, size);
        return;
    }

    if (size < 1) {
        printf("ERROR: Gaussian Filter Size must be Positive (Got %d)\n", size);
        return;
    }

    if (size % 2 != 1){
        printf("ERROR: Gaussian Filter isn't Odd\n");
        return;
    }

    const float two_sigma_sq = 2.0f * sigma * sigma;
    // Written as a negated comparison so that NaN is rejected as well.
    if (!(two_sigma_sq > 0.0f)) {
        printf("ERROR: Gaussian Filter Sigma is Invalid (Got %f)\n", sigma);
        return;
    }

    float filter[MAX_FILTER_SIZE];
    for (int i = 0; i < MAX_FILTER_SIZE; ++i) filter[i] = 0.0f;

    const int center = size / 2;
    float total = 0.0f;

    for (int i = 0; i < size; i++) {
        const int dx = i - center;
        const float val = expf(-(float)(dx * dx) / two_sigma_sq);
        filter[i] = val;
        total += val;
    }

    // total >= 1 because the centre tap is always exp(0) == 1.
    for (int i = 0; i < size; i++)
        filter[i] /= total;

    const cudaError_t err = cudaMemcpyToSymbol(gaussian_filter, filter, sizeof(filter));
    if (err != cudaSuccess) {
        printf("ERROR: Gaussian Filter upload failed: %s\n", cudaGetErrorString(err));
    }
}

// Applies the separable filter stored in gaussian_filter to a batch of images,
// in place.
//
// d_images : device buffer of N*H*W*3 floats; input, overwritten with the result.
// d_temp   : device scratch buffer of the same size (holds the horizontal pass).
// N, H, W  : batch size, image height, image width. N is used directly as
//            gridDim.z, so it is subject to the device's grid-z limit; a launch
//            that exceeds it is reported as an error.
// K        : number of filter taps, 1..MAX_FILTER_SIZE.
//
// Blocks until both passes have finished. Invalid arguments and CUDA errors
// are reported with an "ERROR: ..." line; an empty batch/image is a no-op.
extern "C" void gaussian_smoother(float* d_images, float* d_temp, int N, int H, int W, int K){
    if (N <= 0 || H <= 0 || W <= 0) return;  // nothing to process; a zero-sized grid is an invalid launch

    if (d_images == NULL || d_temp == NULL) {
        printf("ERROR: gaussian_smoother received a NULL buffer\n");
        return;
    }

    if (K < 1 || K > MAX_FILTER_SIZE) {
        printf("ERROR: Gaussian Filter Size is Out of Range (Max %d, Got %d)\n", MAX_FILTER_SIZE, K);
        return;
    }

    const dim3 block(16,16,1);
    const dim3 grid((W+15)/16, (H+15)/16, N);

    // Horizontal pass: img_in -> temp
    gaussian_horizontal<<<grid,block>>>(d_temp, d_images, N, H, W, K);
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        printf("ERROR: gaussian_horizontal launch failed: %s\n", cudaGetErrorString(err));
        return;
    }

    // Vertical pass: temp -> img_in (overwrite original).
    // Both kernels are issued to the same stream, so the vertical pass cannot
    // start before the horizontal pass has completed; no sync is needed between them.
    gaussian_vertical<<<grid,block>>>(d_images, d_temp, N, H, W, K);
    err = cudaGetLastError();
    if (err != cudaSuccess) {
        printf("ERROR: gaussian_vertical launch failed: %s\n", cudaGetErrorString(err));
        return;
    }

    err = cudaDeviceSynchronize();
    if (err != cudaSuccess) {
        printf("ERROR: gaussian_smoother execution failed: %s\n", cudaGetErrorString(err));
    }
}

// Accumulates the log-magnitude histogram of a batch of complex spectra into
// the global device histogram.
//
// input_fft : device buffer of N*H*W complex values, reinterpreted as float2
//             (real part, imaginary part).
// N, H, W   : batch size, height, width. N is used directly as gridDim.z.
//
// Counts are added to whatever the histogram already holds. Blocks until the
// kernel has finished; CUDA errors are reported with an "ERROR: ..." line and
// an empty input is a no-op.
extern "C" void log_mag_merge_histogram(const void* input_fft, int N, int H, int W){
    if (N <= 0 || H <= 0 || W <= 0) return;  // nothing to process; a zero-sized grid is an invalid launch

    if (input_fft == NULL) {
        printf("ERROR: log_mag_merge_histogram received a NULL buffer\n");
        return;
    }

    const float2* fft = reinterpret_cast<const float2*>(input_fft);

    const dim3 block(16, 16, 1);
    const dim3 grid((W + block.x - 1) / block.x, (H + block.y - 1) / block.y, N);
    logmag_merge_hist<<<grid, block>>>(fft, N, H, W);

    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        printf("ERROR: logmag_merge_hist launch failed: %s\n", cudaGetErrorString(err));
        return;
    }

    err = cudaDeviceSynchronize();
    if (err != cudaSuccess) {
        printf("ERROR: logmag_merge_hist execution failed: %s\n", cudaGetErrorString(err));
    }
}

// Zeroes every bin of the global device histogram g_d_hist.
//
// g_d_hist is a __device__ symbol: in host code its name refers to a host-side
// shadow, not to a device pointer, so it must not be handed to cudaMemset
// directly. The symbol-aware copy API is used instead.
extern "C" void reset_global_histogram() {
    static const unsigned int zeros[HISTOGRAM_BINS] = {0};

    const cudaError_t err = cudaMemcpyToSymbol(g_d_hist, zeros, sizeof(zeros));
    if (err != cudaSuccess) {
        printf("ERROR: histogram reset failed: %s\n", cudaGetErrorString(err));
    }
}

// Copies the global device histogram to the host and optionally clears it.
//
// h_hist          : host array with room for at least HISTOGRAM_BINS unsigned ints.
// zero_after_copy : non-zero to reset the device histogram once it has been
//                   copied out successfully.
//
// If the copy fails the device histogram is left untouched, so no counts are
// lost. Errors are reported with an "ERROR: ..." line.
extern "C" void get_global_histogram(unsigned int* h_hist, int zero_after_copy) {
    if (h_hist == NULL) {
        printf("ERROR: get_global_histogram received a NULL buffer\n");
        return;
    }

    cudaError_t err = cudaMemcpyFromSymbol(h_hist, g_d_hist, HISTOGRAM_BINS * sizeof(unsigned int));
    if (err != cudaSuccess) {
        printf("ERROR: histogram copy failed: %s\n", cudaGetErrorString(err));
        return;
    }

    if (zero_after_copy) {
        // g_d_hist is a __device__ symbol; its host-side name is not a device
        // pointer, so cudaMemset cannot be applied to it directly.
        static const unsigned int zeros[HISTOGRAM_BINS] = {0};
        err = cudaMemcpyToSymbol(g_d_hist, zeros, sizeof(zeros));
        if (err != cudaSuccess) {
            printf("ERROR: histogram reset failed: %s\n", cudaGetErrorString(err));
        }
    }
}


// ##############################

// Computes the mean and (population) standard deviation of the binned data in
// g_d_hist, treating every count as sitting at its bin centre, and stores the
// results in g_d_mean / g_d_std.
//
// Launch layout: intended for <<<1,1>>>; any additional threads exit immediately
// so the globals are written exactly once.
//
// num_bins         : number of histogram bins to read. Values outside
//                    1..HISTOGRAM_BINS fall back to HISTOGRAM_BINS so g_d_hist
//                    is never indexed out of bounds.
// log_min, log_max : lower and upper edge of the histogram range. These should
//                    describe the same binning that was used to fill g_d_hist.
//
// An empty histogram yields mean = std = 0 instead of dividing by zero.
__global__ void compute_hist_mean_std_kernel(int num_bins, float log_min, float log_max){
    if ((blockIdx.x | blockIdx.y | blockIdx.z | threadIdx.x | threadIdx.y | threadIdx.z) != 0) return;

    const int bins = (num_bins >= 1 && num_bins <= HISTOGRAM_BINS) ? num_bins : HISTOGRAM_BINS;

    // Floating-point division: the integer expression (LOG_MAX - LOG_MIN) / HISTOGRAM_BINS
    // evaluates to 15 / 16 == 0, which collapses every bin centre onto log_min.
    // Double accumulators are used because counts can exceed float's 24-bit
    // integer precision; this runs on a single thread over a handful of bins.
    const double bin_width = ((double)log_max - (double)log_min) / (double)bins;

    unsigned long long total_count = 0ull;
    double weighted_sum = 0.0;

    for (int i = 0; i < bins; i++){
        const unsigned int count = g_d_hist[i];
        const double center = (double)log_min + (i + 0.5) * bin_width;
        weighted_sum += center * (double)count;
        total_count += count;
    }

    if (total_count == 0ull) {
        g_d_mean = 0.0f;
        g_d_std  = 0.0f;
        return;
    }

    const double mean = weighted_sum / (double)total_count;

    double var = 0.0;
    for (int i = 0; i < bins; i++){
        const double center = (double)log_min + (i + 0.5) * bin_width;
        const double diff = center - mean;
        var += (double)g_d_hist[i] * diff * diff;
    }
    var /= (double)total_count;

    g_d_mean = (float)mean;
    g_d_std  = (float)sqrt(var);
}

// Host-callable functions
// Launches the single-thread statistics kernel over the global histogram and
// waits for it to finish. The results land in g_d_mean / g_d_std and can be
// fetched with get_histogram_mean_std().
// Launch and execution errors are reported with an "ERROR: ..." line.
extern "C" void compute_histogram_mean_std(){
    compute_hist_mean_std_kernel<<<1,1>>>(HISTOGRAM_BINS, LOG_MIN, LOG_MAX);

    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        printf("ERROR: compute_hist_mean_std_kernel launch failed: %s\n", cudaGetErrorString(err));
        return;
    }

    err = cudaDeviceSynchronize();
    if (err != cudaSuccess) {
        printf("ERROR: compute_hist_mean_std_kernel execution failed: %s\n", cudaGetErrorString(err));
    }
}

// Copies the most recently computed histogram statistics from the device
// globals g_d_mean / g_d_std to the host.
//
// mean, std : host destinations for one float each. Either may be NULL, in
//             which case that value is simply not fetched.
//
// Copy failures are reported with an "ERROR: ..." line; the corresponding
// output is left unmodified in that case.
extern "C" void get_histogram_mean_std(float* mean, float* std){
    if (mean != NULL) {
        const cudaError_t err = cudaMemcpyFromSymbol(mean, g_d_mean, sizeof(float));
        if (err != cudaSuccess) {
            printf("ERROR: histogram mean copy failed: %s\n", cudaGetErrorString(err));
        }
    }

    if (std != NULL) {
        const cudaError_t err = cudaMemcpyFromSymbol(std, g_d_std, sizeof(float));
        if (err != cudaSuccess) {
            printf("ERROR: histogram std copy failed: %s\n", cudaGetErrorString(err));
        }
    }
}