#include <stdio.h>
#include <cuda_runtime.h>

// Device kernel: every launched thread prints one greeting line tagged with
// its flat global index. Only the x components of blockIdx, blockDim and
// threadIdx are used to build that index.
// Device-side printf is buffered; the host sees the output once it
// synchronizes with the device.
__global__ void helloFromGPU() {
    // Offset inside the block plus the start offset of the block itself.
    const int globalIdx = threadIdx.x + blockDim.x * blockIdx.x;

    printf("Hello World from GPU thread %d!\n", globalIdx);
}

// Host entry point: prints a greeting from the CPU, launches helloFromGPU
// with 1 block of 10 threads, waits for it to finish and reports any CUDA
// error on stderr.
// Returns 0 on success, -1 if the launch or the kernel execution failed.
int main() {
    printf("Hello World from CPU!\n");

    // Launch the kernel with 1 block of 10 threads
    helloFromGPU<<<1, 10>>>();

    // A kernel launch yields no status value; launch failures (such as an
    // invalid configuration) are reported through cudaGetLastError(), so
    // query it right after the launch.
    cudaError_t error = cudaGetLastError();
    if (error != cudaSuccess) {
        fprintf(stderr, "CUDA error (kernel launch): %s\n",
                cudaGetErrorString(error));
        return -1;
    }

    // Block until the GPU is done. The returned status carries errors that
    // occurred while the kernel was running, so it must not be discarded.
    // Synchronizing is also what makes the device-side printf output appear.
    error = cudaDeviceSynchronize();
    if (error != cudaSuccess) {
        fprintf(stderr, "CUDA error (kernel execution): %s\n",
                cudaGetErrorString(error));
        return -1;
    }

    printf("GPU kernel execution completed!\n");

    return 0;
}