// file: /courses/temple/ece_4822/lectures/current/lecture_13/example.cu
//
// This file implements a simple parallel vector add program.
//
// To compile this program:
//
//  nvcc -o example.exe example.cu
//
// To run this program:
//
//  example.exe N
//
// where N is the (positive) number of elements in each vector.
//

// include files
//
#include "example.h"

// function: example_cuda_ok
//
// arguments:
//  cudaError_t err: the status returned by a CUDA runtime call
//  const char* what: a short description of the operation that was attempted
//
// return: true if err is cudaSuccess, false otherwise
//
// This helper reports a failed CUDA runtime call on stderr so that
// the caller can stop instead of continuing with a sticky error.
//
static bool example_cuda_ok(cudaError_t err, const char* what) {
  if (err == cudaSuccess) {
    return true;
  }
  fprintf(stderr, "cuda error (%s): %s\n", what, cudaGetErrorString(err));
  return false;
}

// function: main
//
// arguments:
//  int argc: the number of command line arguments
//  char** argv: the command line arguments; argv[1] is the vector dimension
//
// return: 0 on success, 1 on a usage, initialization or CUDA error
//
// This program adds two vectors on the cpu, displays the result, then
// copies the vectors to the gpu, launches the kernels declared in
// example.h and displays the result of the gpu computation.
//
int main(int argc, char** argv) {

  // name used in diagnostics (argv[0] is not guaranteed to exist)
  //
  const char* prog = (argc > 0 && argv[0] != NULL) ? argv[0] : "example.exe";

  // get the dimension: refuse to run without it rather than
  // dereferencing a missing argv[1]
  //
  if (argc < 2) {
    fprintf(stderr, "usage: %s N\n", prog);
    return 1;
  }
  int N = atoi(argv[1]);
  if (N <= 0) {
    fprintf(stderr, "%s: the dimension must be a positive integer\n", prog);
    return 1;
  }

  // case no. 3: add two vectors
  //
  // all pointers start out null so the cleanup code at the bottom can
  // run unconditionally, no matter where a failure occurred
  //
  int *a = NULL, *b = NULL, *c = NULL;
  int *d_a = NULL, *d_b = NULL, *d_c = NULL, *d_N = NULL;

  // compute the byte count in size_t so that it cannot overflow an int
  //
  size_t size = (size_t)N * sizeof(int);

  // assume failure until every step has completed
  //
  int status = 1;

  // create the data
  //
  a = new int[N];
  b = new int[N];
  c = new int[N];

  // do all the work inside a single-pass loop: any failure breaks out
  // to the shared cleanup code
  //
  do {

    // initialize the vectors
    //
    fprintf(stdout, "... initializing vectors on the CPU ...\n");
    if (vinit(a, N) == false) {
      fprintf(stdout, "%s: error initializing vectors\n", prog);
      break;
    }
    if (vinit(b, N) == false) {
      fprintf(stdout, "%s: error initializing vectors\n", prog);
      break;
    }

    // add the vectors on the cpu
    //
    vadd(c, a, b, N);

    // display the result and clear c so that the gpu result
    // cannot be confused with the cpu result
    //
    fprintf(stdout, "... displaying the results of vadd on the cpu ...\n");
    for (int i = 0; i < N; i++) {
      fprintf(stdout, "a[%d] (%d) + b[%d] (%d) = c[%d] (%d)\n",
              i, a[i], i, b[i], i, c[i]);
      c[i] = (int)0;
    }

    // allocate space for device copies of a, b, c
    //
    fprintf(stdout, "... allocating space on the gpu ...\n");
    if (!example_cuda_ok(cudaMalloc((void **)&d_a, size), "cudaMalloc d_a") ||
        !example_cuda_ok(cudaMalloc((void **)&d_b, size), "cudaMalloc d_b") ||
        !example_cuda_ok(cudaMalloc((void **)&d_c, size), "cudaMalloc d_c") ||
        !example_cuda_ok(cudaMalloc((void **)&d_N, sizeof(int)),
                         "cudaMalloc d_N")) {
      break;
    }

    // copy inputs to device: the source is the host buffer itself
    // (a, b), not the address of the pointer variable (&a, &b)
    //
    fprintf(stdout, "... copying data to the gpu ...\n");
    if (!example_cuda_ok(cudaMemcpy(d_a, a, size, cudaMemcpyHostToDevice),
                         "cudaMemcpy a -> d_a") ||
        !example_cuda_ok(cudaMemcpy(d_b, b, size, cudaMemcpyHostToDevice),
                         "cudaMemcpy b -> d_b") ||
        !example_cuda_ok(cudaMemcpy(d_N, &N, sizeof(int),
                                    cudaMemcpyHostToDevice),
                         "cudaMemcpy N -> d_N")) {
      break;
    }

    // launch add() kernel on GPU
    //
    fprintf(stdout, "... adding two vectors on the gpu ...\n");
    mykernel<<<1,1>>>();
    if (!example_cuda_ok(cudaGetLastError(), "mykernel launch")) {
      break;
    }
    myadd<<<1,1>>>(d_c, d_a, d_b);
    if (!example_cuda_ok(cudaGetLastError(), "myadd launch")) {
      break;
    }
    //  vadd_gpu<<<1,1>>>(d_c, d_a, d_b, d_N);

    // wait for the kernels so that execution errors surface here
    //
    if (!example_cuda_ok(cudaDeviceSynchronize(), "cudaDeviceSynchronize")) {
      break;
    }

    // copy result back to host: the destination is the host buffer c,
    // not the address of the pointer variable
    //
    if (!example_cuda_ok(cudaMemcpy(c, d_c, size, cudaMemcpyDeviceToHost),
                         "cudaMemcpy d_c -> c")) {
      break;
    }

    // display the result
    //
    // NOTE(review): myadd is launched with a single thread and is
    // presumed to add only the first element of each vector, so the
    // first elements are printed - confirm against example.h.
    //
    fprintf(stdout, "... displaying the results of vadd_gpu ...\n");
    fprintf(stdout, "myadd: %d + %d = %d\n", a[0], b[0], c[0]);
    //  for (int i = 0; i < N; i++) {
    //    fprintf(stdout, "a[%d] (%d) + b[%d] (%d) = c[%d] (%d)\n",
    //            i, a[i], i, b[i], i, c[i]);
    //  }

    // everything worked
    //
    status = 0;

  } while (false);

  // cleanup: cudaFree and delete [] both accept null pointers, so this
  // is safe after a partial failure
  //
  fprintf(stdout, "... freeing gpu space ...\n");
  cudaFree(d_a);
  cudaFree(d_b);
  cudaFree(d_c);
  cudaFree(d_N);
  delete [] a;
  delete [] b;
  delete [] c;

  // exit gracefully
  //
  return status;
}