// GPU helpers: device queries, separable Gaussian smoothing of interleaved
// 3-channel float images, and a global log-magnitude histogram (with mean/std)
// over complex (float2) spectra.
//
// NOTE(review): the source this file was restored from had every "<...>" span
// stripped (include names, kernel launch configurations, template arguments
// and parts of three kernels). Those spans were reconstructed; each
// reconstructed region is marked with NOTE(review) and should be confirmed
// against the original.

// NOTE(review): the four original include targets were lost; these are the
// headers the code below actually needs.
#include <cmath>
#include <cstddef>
#include <cstdio>

#include <cuda_runtime.h>

#define MAX_FILTER_SIZE 7
#define HISTOGRAM_BINS 16
#define LOG_MIN (-5)   // parenthesised so the macro is safe inside expressions
#define LOG_MAX 10
#define EXTRA_MAG 1e-12f

// Filter taps, stored from index 0; entries past the configured size are 0.
__constant__ float gaussian_filter[MAX_FILTER_SIZE];
// Histogram accumulated by logmag_merge_hist across calls until it is reset.
__device__ unsigned int g_d_hist[HISTOGRAM_BINS];
__device__ float g_d_mean = 0.0f;
__device__ float g_d_std = 0.0f;

// #################################
// Internal helpers

// Logs a failed CUDA call to stderr and reports whether it succeeded.
// Deliberately does not abort: the extern "C" entry points return void and are
// expected to hand control back to the caller.
static bool cuda_ok(cudaError_t status, const char* what){
    if (status == cudaSuccess) return true;
    fprintf(stderr, "CUDA ERROR in %s: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Zeroes g_d_hist. A __device__ symbol's host-side name is not a device
// pointer, so its device address has to be looked up before cudaMemset.
static bool clear_device_histogram(){
    void* d_hist = nullptr;
    if (!cuda_ok(cudaGetSymbolAddress(&d_hist, g_d_hist), "cudaGetSymbolAddress(g_d_hist)"))
        return false;
    return cuda_ok(cudaMemset(d_hist, 0, HISTOGRAM_BINS * sizeof(unsigned int)),
                   "cudaMemset(g_d_hist)");
}

// #################################
// Device queries

// Returns the number of CUDA devices, or 0 if the query fails.
extern "C" int get_gpu_count(){
    int count = 0;
    if (cudaGetDeviceCount(&count) != cudaSuccess) return 0;
    return count;
}

// Returns the total global memory of `device_id` in bytes, or 0 if the
// properties cannot be queried (e.g. invalid device id).
extern "C" size_t get_gpu_memory(int device_id){
    cudaDeviceProp prop;
    if (!cuda_ok(cudaGetDeviceProperties(&prop, device_id), "cudaGetDeviceProperties"))
        return 0;
    return prop.totalGlobalMem;
}

// Makes `device_id` the current device for the calling host thread.
extern "C" void set_device(int device_id){
    cuda_ok(cudaSetDevice(device_id), "cudaSetDevice");
}

// #################################
// Kernels

// Horizontal pass of the separable filter.
// Launch layout: 2D blocks over (x, y), blockIdx.z selects the image.
// Images are N contiguous H*W*3 float arrays with interleaved channels.
// NOTE(review): the tap loop body and the final stores were lost in the source
// and are reconstructed here. Out-of-range taps are skipped (zero padding),
// as implied by the surviving "xx >= 0 && xx < W" test - confirm.
__global__ void gaussian_horizontal(float* img_out, const float* img_in, int N, int H, int W, int K){
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;
    const int n = blockIdx.z;
    if (x >= W || y >= H || n >= N) return;

    // size_t arithmetic: n*H*W*3 overflows int for large batches.
    const size_t image_stride = (size_t)H * (size_t)W * 3;
    const float* in_n = img_in + (size_t)n * image_stride;
    float* out_n = img_out + (size_t)n * image_stride;

    const int K2 = K / 2;
    const size_t idx = ((size_t)y * W + x) * 3;

    float r = 0.0f, g = 0.0f, b = 0.0f;
    for (int k = 0; k < K; k++){
        const int xx = x + k - K2;
        if (xx >= 0 && xx < W){
            const size_t src = ((size_t)y * W + xx) * 3;
            const float w = gaussian_filter[k];
            r += w * in_n[src + 0];
            g += w * in_n[src + 1];
            b += w * in_n[src + 2];
        }
    }

    out_n[idx + 0] = r;
    out_n[idx + 1] = g;
    out_n[idx + 2] = b;
}

// Vertical pass of the separable filter; same layout as gaussian_horizontal.
// NOTE(review): the kernel signature, prologue, tap loop body and stores were
// lost in the source and are reconstructed to mirror the horizontal pass,
// using the surviving "yy >= 0 && yy < H" test - confirm.
__global__ void gaussian_vertical(float* img_out, const float* img_in, int N, int H, int W, int K){
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;
    const int n = blockIdx.z;
    if (x >= W || y >= H || n >= N) return;

    const size_t image_stride = (size_t)H * (size_t)W * 3;
    const float* in_n = img_in + (size_t)n * image_stride;
    float* out_n = img_out + (size_t)n * image_stride;

    const int K2 = K / 2;
    const size_t idx = ((size_t)y * W + x) * 3;

    float r = 0.0f, g = 0.0f, b = 0.0f;
    for (int k = 0; k < K; k++){
        const int yy = y + k - K2;
        if (yy >= 0 && yy < H){
            const size_t src = ((size_t)yy * W + x) * 3;
            const float w = gaussian_filter[k];
            r += w * in_n[src + 0];
            g += w * in_n[src + 1];
            b += w * in_n[src + 2];
        }
    }

    out_n[idx + 0] = r;
    out_n[idx + 1] = g;
    out_n[idx + 2] = b;
}

// Adds one count per complex sample to g_d_hist, binned by log10(|v|) over
// [LOG_MIN, LOG_MAX); values outside the range land in the first/last bin.
// Launch layout: 2D blocks over (x, y), blockIdx.z selects the image.
// NOTE(review): the signature and index prologue were lost in the source and
// are reconstructed from the launch call and the surviving bounds check.
__global__ void logmag_merge_hist(const float2* fft, int N, int H, int W){
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;
    const int n = blockIdx.z;
    if (x >= W || y >= H || n >= N) return;

    const size_t idx = (size_t)n * H * W + ((size_t)y * W + x);
    const float2 v = fft[idx];
    const float real = v.x;
    const float imag = v.y;

    // EXTRA_MAG keeps log10f finite for exact zeros.
    const float mag = sqrtf(real * real + imag * imag) + EXTRA_MAG;
    const float logv = log10f(mag);
    const float t = (logv - LOG_MIN) / (LOG_MAX - LOG_MIN);

    int bin = (int)floorf(t * HISTOGRAM_BINS);
    if (bin < 0) bin = 0;
    if (bin >= HISTOGRAM_BINS) bin = HISTOGRAM_BINS - 1;
    atomicAdd(&g_d_hist[bin], 1u);
}

// #################################
// Host entry points: smoothing

// Builds a normalised 1D Gaussian of `size` taps (odd, <= MAX_FILTER_SIZE) and
// uploads it to constant memory. Unused trailing taps are zero.
// The K later passed to gaussian_smoother should equal `size`: the kernels
// centre the window at K/2, so a different K misaligns the weights.
extern "C" void create_gaussian_filter(int size=7, float sigma=1.0f){
    if (size > MAX_FILTER_SIZE) {
        printf("ERROR: Gaussian Filter Size is Too Large (Max %d, Got %d)\n", MAX_FILTER_SIZE, size);
        return;
    }
    // Also rejects zero and negative sizes (their remainder is 0 or -1).
    if (size % 2 != 1){
        printf("ERROR: Gaussian Filter isn't Odd\n");
        return;
    }
    // sigma <= 0 (or NaN) would produce NaN weights via division by zero.
    if (!(sigma > 0.0f)){
        printf("ERROR: Gaussian Filter sigma must be positive (Got %f)\n", sigma);
        return;
    }

    float filter[MAX_FILTER_SIZE];
    for (int i = 0; i < MAX_FILTER_SIZE; ++i) filter[i] = 0.0f;

    const int center = size / 2;
    float total = 0.0f;
    for (int i = 0; i < size; i++) {
        const int dx = i - center;
        const float val = expf(-(dx * dx) / (2.0f * sigma * sigma));
        filter[i] = val;
        total += val;
    }
    for (int i = 0; i < size; i++) filter[i] /= total;

    cuda_ok(cudaMemcpyToSymbol(gaussian_filter, filter, sizeof(filter)),
            "cudaMemcpyToSymbol(gaussian_filter)");
}

// Smooths N images in place: horizontal pass d_images -> d_temp, then vertical
// pass d_temp -> d_images. Both buffers are device pointers holding
// N*H*W*3 floats. K is the number of filter taps (1..MAX_FILTER_SIZE).
// N is used as gridDim.z, so it is subject to the device's grid-z limit; an
// oversized N is reported as a launch error.
extern "C" void gaussian_smoother(float* d_images, float* d_temp, int N, int H, int W, int K){
    if (d_images == nullptr || d_temp == nullptr){
        printf("ERROR: gaussian_smoother received a null buffer\n");
        return;
    }
    // K beyond MAX_FILTER_SIZE would read past the constant filter array.
    if (K < 1 || K > MAX_FILTER_SIZE){
        printf("ERROR: Gaussian Filter Size is Out of Range (Max %d, Got %d)\n", MAX_FILTER_SIZE, K);
        return;
    }
    if (N <= 0 || H <= 0 || W <= 0) return;   // nothing to do; a zero-sized grid is invalid

    dim3 block(16, 16, 1);
    dim3 grid((W + 15) / 16, (H + 15) / 16, N);

    // Horizontal pass: img_in -> temp
    gaussian_horizontal<<<grid, block>>>(d_temp, d_images, N, H, W, K);
    if (!cuda_ok(cudaGetLastError(), "gaussian_horizontal launch")) return;

    // Vertical pass: temp -> img_in (overwrite original).
    // Both launches go to the same stream, so they already run in order.
    gaussian_vertical<<<grid, block>>>(d_images, d_temp, N, H, W, K);
    if (!cuda_ok(cudaGetLastError(), "gaussian_vertical launch")) return;

    cuda_ok(cudaDeviceSynchronize(), "gaussian_smoother");
}

// #################################
// Host entry points: histogram

// Accumulates the log-magnitude histogram of N*H*W complex samples into the
// global histogram. `input_fft` is a device pointer to float2 values.
extern "C" void log_mag_merge_histogram(const void* input_fft, int N, int H, int W){
    if (input_fft == nullptr){
        printf("ERROR: log_mag_merge_histogram received a null buffer\n");
        return;
    }
    if (N <= 0 || H <= 0 || W <= 0) return;

    const float2* fft = reinterpret_cast<const float2*>(input_fft);
    dim3 block(16, 16, 1);
    dim3 grid((W + block.x - 1) / block.x, (H + block.y - 1) / block.y, N);

    logmag_merge_hist<<<grid, block>>>(fft, N, H, W);
    if (!cuda_ok(cudaGetLastError(), "logmag_merge_hist launch")) return;
    cuda_ok(cudaDeviceSynchronize(), "log_mag_merge_histogram");
}

// Sets every bin of the global histogram to zero.
extern "C" void reset_global_histogram() {
    clear_device_histogram();
}

// Copies the HISTOGRAM_BINS counters into host array `h_hist`; when
// `zero_after_copy` is non-zero the device histogram is cleared afterwards.
extern "C" void get_global_histogram(unsigned int* h_hist, int zero_after_copy) {
    if (h_hist == nullptr){
        printf("ERROR: get_global_histogram received a null buffer\n");
        return;
    }
    if (!cuda_ok(cudaMemcpyFromSymbol(h_hist, g_d_hist, HISTOGRAM_BINS * sizeof(unsigned int)),
                 "cudaMemcpyFromSymbol(g_d_hist)"))
        return;   // keep the device data if the copy failed
    if (zero_after_copy) clear_device_histogram();
}

// ##############################
// Histogram statistics

// Computes the count-weighted mean and standard deviation of the bin centres
// of g_d_hist and stores them in g_d_mean / g_d_std. Intended for a <<<1,1>>>
// launch. `num_bins` is clamped to HISTOGRAM_BINS; an empty histogram yields
// mean = std = 0 instead of NaN.
__global__ void compute_hist_mean_std_kernel(int num_bins, float log_min, float log_max){
    if (num_bins > HISTOGRAM_BINS) num_bins = HISTOGRAM_BINS;
    if (num_bins <= 0){
        g_d_mean = 0.0f;
        g_d_std = 0.0f;
        return;
    }

    // Float division: the integer macros alone would give 15 / 16 == 0.
    const float bin_width = (log_max - log_min) / (float)num_bins;

    // 64-bit / double accumulators: counts can be large and float would lose
    // precision when summing count * centre.
    unsigned long long total_count = 0;
    double weighted_sum = 0.0;
    for (int i = 0; i < num_bins; i++){
        const double center = log_min + (i + 0.5f) * bin_width;
        weighted_sum += center * (double)g_d_hist[i];
        total_count += g_d_hist[i];
    }

    if (total_count == 0){
        g_d_mean = 0.0f;
        g_d_std = 0.0f;
        return;
    }
    const double mean = weighted_sum / (double)total_count;

    double var = 0.0;
    for (int i = 0; i < num_bins; i++){
        const double center = log_min + (i + 0.5f) * bin_width;
        const double diff = center - mean;
        var += (double)g_d_hist[i] * diff * diff;
    }
    var /= (double)total_count;

    g_d_mean = (float)mean;
    g_d_std = (float)sqrt(var);
}

// Host-callable functions

// Runs the statistics kernel over the current global histogram and waits for
// it to finish.
extern "C" void compute_histogram_mean_std(){
    compute_hist_mean_std_kernel<<<1,1>>>(HISTOGRAM_BINS, (float)LOG_MIN, (float)LOG_MAX);
    if (!cuda_ok(cudaGetLastError(), "compute_hist_mean_std_kernel launch")) return;
    cuda_ok(cudaDeviceSynchronize(), "compute_histogram_mean_std");
}

// Copies the most recently computed mean and standard deviation to the host.
extern "C" void get_histogram_mean_std(float* mean, float* std){
    if (mean == nullptr || std == nullptr){
        printf("ERROR: get_histogram_mean_std received a null pointer\n");
        return;
    }
    if (!cuda_ok(cudaMemcpyFromSymbol(mean, g_d_mean, sizeof(float)), "cudaMemcpyFromSymbol(g_d_mean)"))
        return;
    cuda_ok(cudaMemcpyFromSymbol(std, g_d_std, sizeof(float)), "cudaMemcpyFromSymbol(g_d_std)");
}