#include #include #include #include #include #include #include #include #include #include // Processing constants #define WINDOW_SIZE 256 #define STRIDE_SIZE 128 #define GAUSSIAN_SIZE 7 #define HISTOGRAM_BINS 16 #define MAX_MAGNITUDE 20.0f // Error checking macros #define CUDA_CHECK(call) \ do { \ cudaError_t error = call; \ if (error != cudaSuccess) { \ fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__, __LINE__, \ cudaGetErrorString(error)); \ exit(EXIT_FAILURE); \ } \ } while(0) #define CUFFT_CHECK(call) \ do { \ cufftResult result = call; \ if (result != CUFFT_SUCCESS) { \ fprintf(stderr, "cuFFT error at %s:%d: %d\n", __FILE__, __LINE__, result); \ exit(EXIT_FAILURE); \ } \ } while(0) // GPU state structure - persistent across all tiles typedef struct { float* d_kernel; // Gaussian kernel unsigned long long* d_global_histogram; // Global histogram accumulator cufftHandle fft_plan; // FFT plan (created once) void* d_fft_work; // FFT workspace // Dynamically sized buffers uint8_t* d_tile; // Tile buffer uint8_t* d_windows; // Extracted windows float* d_gray; // Grayscale images float* d_filtered; // Gaussian filtered cufftComplex* d_fft; // FFT output float* d_magnitude; // FFT magnitude // Buffer size tracking int max_tile_size; int max_windows; // Kernel launch parameters int block_size_1d; dim3 block_size_2d; } GPUState; // Timing utility double get_time() { struct timeval tv; gettimeofday(&tv, NULL); return tv.tv_sec + tv.tv_usec * 1e-6; } // Calculate optimal block size for kernel template int get_optimal_block_size(KernelFunc kernel) { int min_grid_size; int block_size; CUDA_CHECK(cudaOccupancyMaxPotentialBlockSize(&min_grid_size, &block_size, kernel, 0, 0)); return block_size; } // Generate 2D Gaussian kernel void gaussian_kernel(float* kernel, int size, float sigma) { int center = size / 2; float sum = 0.0f; for (int i = 0; i < size; i++) { for (int j = 0; j < size; j++) { float dx = j - center; float dy = i - center; float value = expf(-(dx * dx + dy * dy) / (2.0f * sigma * sigma)); kernel[i * size + j] = value; sum += value; } } // Normalize for (int i = 0; i < size * size; i++) { kernel[i] /= sum; } } // Extract windows from tile with stride __global__ void extract_windows( const uint8_t* __restrict__ tile, uint8_t* __restrict__ windows, int tile_width, int tile_height, int num_windows_x, int num_windows_y ) { int window_idx = blockIdx.x; if (window_idx >= num_windows_x * num_windows_y) return; int win_x = window_idx % num_windows_x; int win_y = window_idx / num_windows_x; int start_x = win_x * STRIDE_SIZE; int start_y = win_y * STRIDE_SIZE; int pixel_idx = threadIdx.x; int pixels_per_window = WINDOW_SIZE * WINDOW_SIZE; // Each thread copies multiple pixels for (int p = pixel_idx; p < pixels_per_window; p += blockDim.x) { int local_y = p / WINDOW_SIZE; int local_x = p % WINDOW_SIZE; int src_y = start_y + local_y; int src_x = start_x + local_x; // Copy all 3 color channels for (int c = 0; c < 3; c++) { int src_idx = (src_y * tile_width + src_x) * 3 + c; int dst_idx = (window_idx * pixels_per_window + p) * 3 + c; windows[dst_idx] = tile[src_idx]; } } } // RGB to grayscale conversion __global__ void color_conversion( const uint8_t* __restrict__ rgb, float* __restrict__ gray, int width, int height ) { int idx = blockIdx.x * blockDim.x + threadIdx.x; int total_pixels = width * height; if (idx < total_pixels) { int rgb_idx = idx * 3; float r = (float)rgb[rgb_idx + 0]; float g = (float)rgb[rgb_idx + 1]; float b = (float)rgb[rgb_idx + 2]; // Standard luminance conversion gray[idx] = 0.299f * r + 0.587f * g + 0.114f * b; } } // 2D Gaussian filter with shared memory __global__ void gaussian_filter( const float* __restrict__ input, float* __restrict__ output, const float* __restrict__ kernel, int width, int height, int kernel_size ) { __shared__ float tile[16][16]; int x = blockIdx.x * blockDim.x + threadIdx.x; int y = blockIdx.y * blockDim.y + threadIdx.y; int tx = threadIdx.x; int ty = threadIdx.y; // Load tile into shared memory if (x < width && y < height) { tile[ty][tx] = input[y * width + x]; } __syncthreads(); if (x >= width || y >= height) return; int half_kernel = kernel_size / 2; float sum = 0.0f; // Apply convolution with mirror boundary conditions for (int i = 0; i < kernel_size; i++) { for (int j = 0; j < kernel_size; j++) { int ix = x + j - half_kernel; int iy = y + i - half_kernel; // Mirror boundary if (ix < 0) ix = -ix; if (ix >= width) ix = 2 * width - ix - 2; if (iy < 0) iy = -iy; if (iy >= height) iy = 2 * height - iy - 2; float pixel_value = input[iy * width + ix]; float kernel_weight = kernel[i * kernel_size + j]; sum += pixel_value * kernel_weight; } } output[y * width + x] = sum; } // Compute magnitude from complex FFT output __global__ void compute_magnitude( const cufftComplex* __restrict__ fft_data, float* __restrict__ magnitude, int size ) { int idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx < size) { float real = fft_data[idx].x; float imag = fft_data[idx].y; float mag = sqrtf(real * real + imag * imag); magnitude[idx] = logf(mag + 1.0f); } } // Compute histogram with shared memory reduction __global__ void compute_histogram( const float* __restrict__ data, unsigned long long* __restrict__ histogram, int data_size, int num_bins, float max_val ) { __shared__ unsigned int local_hist[HISTOGRAM_BINS]; // Initialize shared histogram if (threadIdx.x < num_bins) { local_hist[threadIdx.x] = 0; } __syncthreads(); int idx = blockIdx.x * blockDim.x + threadIdx.x; // Accumulate into shared histogram if (idx < data_size) { float value = data[idx]; int bin = (int)((value / max_val) * num_bins); // Clamp to valid range if (bin < 0) bin = 0; if (bin >= num_bins) bin = num_bins - 1; atomicAdd(&local_hist[bin], 1); } __syncthreads(); // Write to global histogram if (threadIdx.x < num_bins) { if (local_hist[threadIdx.x] > 0) { atomicAdd(&histogram[threadIdx.x], (unsigned long long)local_hist[threadIdx.x]); } } } // Initialize GPU state GPUState* gpu_initialize() { GPUState* state = (GPUState*)malloc(sizeof(GPUState)); state->d_tile = NULL; state->d_windows = NULL; state->d_gray = NULL; state->d_filtered = NULL; state->d_fft = NULL; state->d_magnitude = NULL; state->d_fft_work = NULL; state->max_tile_size = 0; state->max_windows = 0; // Calculate optimal block sizes state->block_size_1d = get_optimal_block_size(color_conversion); int block_dim_2d = 16; state->block_size_2d = dim3(block_dim_2d, block_dim_2d); // Generate and upload Gaussian kernel float h_kernel[GAUSSIAN_SIZE * GAUSSIAN_SIZE]; gaussian_kernel(h_kernel, GAUSSIAN_SIZE, 1.0f); CUDA_CHECK(cudaMalloc(&state->d_kernel, GAUSSIAN_SIZE * GAUSSIAN_SIZE * sizeof(float))); CUDA_CHECK(cudaMemcpy(state->d_kernel, h_kernel, GAUSSIAN_SIZE * GAUSSIAN_SIZE * sizeof(float), cudaMemcpyHostToDevice)); // Create FFT plan and workspace CUFFT_CHECK(cufftPlan2d(&state->fft_plan, WINDOW_SIZE, WINDOW_SIZE, CUFFT_R2C)); size_t worksize; CUFFT_CHECK(cufftEstimate2d(WINDOW_SIZE, WINDOW_SIZE, CUFFT_R2C, &worksize)); CUDA_CHECK(cudaMalloc(&state->d_fft_work, worksize)); CUFFT_CHECK(cufftSetWorkArea(state->fft_plan, state->d_fft_work)); // Allocate global histogram CUDA_CHECK(cudaMalloc(&state->d_global_histogram, HISTOGRAM_BINS * sizeof(unsigned long long))); CUDA_CHECK(cudaMemset(state->d_global_histogram, 0, HISTOGRAM_BINS * sizeof(unsigned long long))); return state; } // Process one tile void process_tile(GPUState* state, uint8_t* h_tile, int tile_w, int tile_h) { int tile_pixels = tile_w * tile_h; int tile_bytes = tile_pixels * 3; int num_windows_x = (tile_w - WINDOW_SIZE) / STRIDE_SIZE + 1; int num_windows_y = (tile_h - WINDOW_SIZE) / STRIDE_SIZE + 1; int total_windows = num_windows_x * num_windows_y; if (total_windows == 0) return; int window_pixels = WINDOW_SIZE * WINDOW_SIZE; // Allocate/reallocate tile buffer if needed if (tile_bytes > state->max_tile_size) { if (state->max_tile_size > 0) CUDA_CHECK(cudaFree(state->d_tile)); CUDA_CHECK(cudaMalloc(&state->d_tile, tile_bytes)); state->max_tile_size = tile_bytes; } // Allocate/reallocate window buffers if needed if (total_windows > state->max_windows) { if (state->max_windows > 0) { CUDA_CHECK(cudaFree(state->d_windows)); CUDA_CHECK(cudaFree(state->d_gray)); CUDA_CHECK(cudaFree(state->d_filtered)); CUDA_CHECK(cudaFree(state->d_fft)); CUDA_CHECK(cudaFree(state->d_magnitude)); } CUDA_CHECK(cudaMalloc(&state->d_windows, total_windows * window_pixels * 3)); CUDA_CHECK(cudaMalloc(&state->d_gray, total_windows * window_pixels * sizeof(float))); CUDA_CHECK(cudaMalloc(&state->d_filtered, total_windows * window_pixels * sizeof(float))); CUDA_CHECK(cudaMalloc(&state->d_fft, total_windows * window_pixels * sizeof(cufftComplex))); CUDA_CHECK(cudaMalloc(&state->d_magnitude, total_windows * window_pixels * sizeof(float))); state->max_windows = total_windows; } // Upload tile CUDA_CHECK(cudaMemcpy(state->d_tile, h_tile, tile_bytes, cudaMemcpyHostToDevice)); // Extract all windows from tile extract_windows<<block_size_1d>>>( state->d_tile, state->d_windows, tile_w, tile_h, num_windows_x, num_windows_y ); // Process ALL windows in parallel int total_pixels = total_windows * window_pixels; int blocks_1d = (total_pixels + state->block_size_1d - 1) / state->block_size_1d; // RGB to grayscale for ALL windows color_conversion<<block_size_1d>>>( state->d_windows, state->d_gray, total_windows * WINDOW_SIZE, WINDOW_SIZE ); // Gaussian filter for ALL windows dim3 filter_grid( (total_windows * WINDOW_SIZE + state->block_size_2d.x - 1) / state->block_size_2d.x, (WINDOW_SIZE + state->block_size_2d.y - 1) / state->block_size_2d.y ); gaussian_filter<<block_size_2d>>>( state->d_gray, state->d_filtered, state->d_kernel, total_windows * WINDOW_SIZE, WINDOW_SIZE, GAUSSIAN_SIZE ); // FFT for each window (still need loop due to cuFFT API) for (int i = 0; i < total_windows; i++) { float* d_filtered_win = state->d_filtered + i * window_pixels; cufftComplex* d_fft_win = state->d_fft + i * window_pixels; CUFFT_CHECK(cufftExecR2C(state->fft_plan, d_filtered_win, d_fft_win)); } // Magnitude for ALL windows compute_magnitude<<block_size_1d>>>( state->d_fft, state->d_magnitude, total_pixels ); // Histogram for ALL windows compute_histogram<<block_size_1d>>>( state->d_magnitude, state->d_global_histogram, total_pixels, HISTOGRAM_BINS, MAX_MAGNITUDE ); CUDA_CHECK(cudaDeviceSynchronize()); } // Read exact number of bytes from stdin int read_exact(void* buf, size_t n) { size_t total = 0; while (total < n) { ssize_t r = read(STDIN_FILENO, (char*)buf + total, n - total); if (r <= 0) return -1; total += r; } return 0; } int main(int argc, char** argv) { // Get GPU ID from command line int gpu_id = 0; if (argc > 1) { gpu_id = atoi(argv[1]); } CUDA_CHECK(cudaSetDevice(gpu_id)); double init_start = get_time(); GPUState* state = gpu_initialize(); fprintf(stderr, "GPU %d initialized in %.3f seconds\n\n", gpu_id, get_time() - init_start); // Read number of images int num_images; if (read_exact(&num_images, sizeof(int)) < 0) { fprintf(stderr, "Error reading number of images\n"); return 1; } double total_start = get_time(); // Process each image for (int img_idx = 0; img_idx < num_images; img_idx++) { int width, height; if (read_exact(&width, sizeof(int)) < 0 || read_exact(&height, sizeof(int)) < 0) { fprintf(stderr, "Error reading image dimensions\n"); return 1; } fprintf(stderr, " Image %d/%d: %dx%d\n", img_idx + 1, num_images, width, height); // Read and process tiles while (1) { int tile_w, tile_h; if (read_exact(&tile_w, sizeof(int)) < 0 || read_exact(&tile_h, sizeof(int)) < 0) { fprintf(stderr, "Error reading tile dimensions\n"); return 1; } // Terminator if (tile_w == 0 && tile_h == 0) break; int tile_bytes = tile_w * tile_h * 3; uint8_t* tile = (uint8_t*)malloc(tile_bytes); if (read_exact(tile, tile_bytes) < 0) { fprintf(stderr, "Error reading tile data\n"); free(tile); return 1; } process_tile(state, tile, tile_w, tile_h); free(tile); } } double total_time = get_time() - total_start; // Download histogram and write to stdout unsigned long long h_histogram[HISTOGRAM_BINS]; CUDA_CHECK(cudaMemcpy(h_histogram, state->d_global_histogram, HISTOGRAM_BINS * sizeof(unsigned long long), cudaMemcpyDeviceToHost)); fwrite(h_histogram, sizeof(unsigned long long), HISTOGRAM_BINS, stdout); fflush(stdout); fprintf(stderr, "GPU %d completed in %.2f seconds\n", gpu_id, total_time); return 0; }