// file: lecture_14/01_single_call/example.cu
//
// This file implements a vector add on a GPU using a single call.
//
// To compile this program: make.
//

// include files
//
#include "example.h"

// function: check_cuda
//
// arguments:
//  cudaError_t status: the value returned by a CUDA runtime call
//  const char* what: short description of the step that was attempted
//  const char* prog: program name used as a prefix for the message
//
// return: true if status is cudaSuccess, false otherwise
//
// This helper reports a failed CUDA runtime call on stderr so that
// the caller can stop instead of continuing with invalid device state.
//
static bool check_cuda(cudaError_t status, const char* what,
                       const char* prog) {
  if (status == cudaSuccess) {
    return true;
  }
  fprintf(stderr, "%s: CUDA error while %s: %s\n", prog, what,
          cudaGetErrorString(status));
  return false;
}

// function: main
//
// arguments:
//  int argc: number of command line arguments
//  char** argv: argv[1] is the vector dimension N (a positive integer)
//
// return: 0 on success, 1 on any error (bad usage, vector
//         initialization failure, or a failed CUDA call)
//
// This program adds two vectors on the CPU (via vadd), repeats the
// addition on the GPU with a single-block, single-thread launch of
// add, and prints both results along with their difference.
//
int main(int argc, char** argv) {

  // get the dimension from the command line:
  //  atoi returns 0 for non-numeric input, so N <= 0 covers both a
  //  missing/garbled argument and a meaningless vector length
  //
  if (argc < 2) {
    fprintf(stderr, "usage: %s N\n", argv[0]);
    return 1;
  }
  int N = atoi(argv[1]);
  if (N <= 0) {
    fprintf(stderr, "%s: N must be a positive integer (got \"%s\")\n",
            argv[0], argv[1]);
    return 1;
  }

  // create pointers to use to reference the vectors:
  //  device pointers start out as NULL so the cleanup code below is
  //  safe no matter where a failure occurs
  //
  int *a = NULL, *b = NULL, *c_cpu = NULL, *c = NULL;  // in CPU memory
  int *d_a = NULL, *d_b = NULL, *d_c = NULL;           // in GPU memory

  // no. of bytes: computed in size_t so large N cannot overflow an int
  //
  size_t size = (size_t)N * sizeof(int);

  // create the data in CPU memory
  //
  a = new int[N];
  b = new int[N];
  c_cpu = new int[N];
  c = new int[N];

  // assume failure until every step has completed
  //
  int status = 1;

  // the do/while(0) block gives every error path a single exit
  // (break) that falls through to the shared cleanup code
  //
  do {

    // initialize the vectors
    //
    fprintf(stdout, "main (CPU): initializing vectors ...\n");
    if (vinit(a, N, (char*)"a") == false) {
      fprintf(stderr, "%s: error initializing vectors\n", argv[0]);
      break;
    }
    if (vinit(b, N, (char*)"b") == false) {
      fprintf(stderr, "%s: error initializing vectors\n", argv[0]);
      break;
    }

    // add the vectors on the cpu
    //
    vadd(c_cpu, a, b, N);

    // display the CPU result (c_cpu, not c) and clear c so that the
    // GPU result printed later cannot be stale host data
    //
    fprintf(stdout, "main (CPU): displaying the results ...\n");
    for (int i = 0; i < N; i++) {
      fprintf(stdout, "a[%d] (%d) + b[%d] (%d) = c_cpu[%d] (%d)\n",
              i, a[i], i, b[i], i, c_cpu[i]);
      c[i] = 0;
    }

    // allocate space for device copies of a, b, c
    //
    fprintf(stdout, "main (GPU): allocating space ...\n");
    if (!check_cuda(cudaMalloc((void**)&d_a, size),
                    "allocating d_a", argv[0])) {
      break;
    }
    if (!check_cuda(cudaMalloc((void**)&d_b, size),
                    "allocating d_b", argv[0])) {
      break;
    }
    if (!check_cuda(cudaMalloc((void**)&d_c, size),
                    "allocating d_c", argv[0])) {
      break;
    }

    // copy inputs to device:
    //  the kernel below is issued to the same (default) stream, so
    //  no extra device synchronization is needed after these copies
    //
    fprintf(stdout, "main (GPU): copying data to the gpu ...\n");
    if (!check_cuda(cudaMemcpy(d_a, a, size, cudaMemcpyHostToDevice),
                    "copying a to the gpu", argv[0])) {
      break;
    }
    if (!check_cuda(cudaMemcpy(d_b, b, size, cudaMemcpyHostToDevice),
                    "copying b to the gpu", argv[0])) {
      break;
    }

    // launch add() kernel on GPU:
    //  one block with one thread is intentional here ("single call");
    //  NOTE(review): add is declared outside this file - presumably it
    //  loops over all N elements itself; confirm against example.h
    //
    fprintf(stdout, "main (GPU): adding two vectors on the gpu ...\n");
    add<<<1,1>>>(d_c, d_a, d_b, N);

    // a launch does not return an error code: launch failures are
    // reported by cudaGetLastError and execution failures by the
    // next synchronizing call
    //
    if (!check_cuda(cudaGetLastError(), "launching add", argv[0])) {
      break;
    }
    if (!check_cuda(cudaDeviceSynchronize(), "running add", argv[0])) {
      break;
    }

    // copy result back to host
    //
    fprintf(stdout, "main (GPU): copying the data back to the cpu ...\n");
    if (!check_cuda(cudaMemcpy(c, d_c, size, cudaMemcpyDeviceToHost),
                    "copying c back to the cpu", argv[0])) {
      break;
    }

    // display the result:
    //  the bracketed fields are the CPU reference value and the
    //  GPU-minus-CPU difference (0 when the two results agree)
    //
    fprintf(stdout, "main (GPU): displaying the results of add ...\n");
    for (int i = 0; i < N; i++) {
      fprintf(stdout, "a[%d] (%d) + b[%d] (%d) = c[%d] (%d) [(%d: %d)]\n",
              i, a[i], i, b[i], i, c[i], c_cpu[i], c[i] - c_cpu[i]);
    }

    // every step completed
    //
    status = 0;

  } while (0);

  // cleanup:
  //  cudaFree and delete[] both accept NULL, so this is safe after
  //  a partial failure
  //
  fprintf(stdout, "main (GPU): freeing space on the gpu ...\n");
  if (!check_cuda(cudaFree(d_a), "freeing d_a", argv[0])) {
    status = 1;
  }
  if (!check_cuda(cudaFree(d_b), "freeing d_b", argv[0])) {
    status = 1;
  }
  if (!check_cuda(cudaFree(d_c), "freeing d_c", argv[0])) {
    status = 1;
  }

  delete [] a;
  delete [] b;
  delete [] c_cpu;
  delete [] c;

  // exit gracefully
  //
  return status;
}