// file: lecture_14/01_single_call/example.cu
//
// This file implements a vector add on a GPU using a single call.
//
// To compile this program: make.
//

// include files
//
#include "example.h"

// function: cuda_ok
//
// arguments:
//  err: the status returned by a CUDA runtime call
//  what: a short label for the call, used in the error message
//
// return: true if err is cudaSuccess, false otherwise
//
// Reports a failed CUDA runtime call on stderr so that errors are
// never silently dropped.
//
static bool cuda_ok(cudaError_t err, const char* what) {
  if (err != cudaSuccess) {
    fprintf(stderr, "main (GPU): %s failed: %s\n",
            what, cudaGetErrorString(err));
    return false;
  }
  return true;
}

// function: main
//
// arguments:
//  argc: the number of command line arguments
//  argv: the command line arguments; argv[1] is the vector length N
//
// return: 0 on success, 1 on a usage error or any CPU/GPU failure
//
// Adds two N-element integer vectors on the CPU (vadd) and on the GPU
// (the add kernel, launched once with a single block of a single
// thread), and prints both results with their element-wise difference.
//
int main(int argc, char** argv) {

  // validate the command line before touching argv[1]
  //
  const char* prog = (argc > 0) ? argv[0] : "example";
  if (argc < 2) {
    fprintf(stderr, "usage: %s N\n", prog);
    return 1;
  }

  // get the dimension from the command line: atoi returns 0 for
  // non-numeric input, so this check also rejects garbage
  //
  int N = atoi(argv[1]);
  if (N <= 0) {
    fprintf(stderr, "%s: N must be a positive integer (got '%s')\n",
            prog, argv[1]);
    return 1;
  }

  // no. of bytes per vector: computed in size_t so that a large N
  // cannot overflow a signed int
  //
  size_t size = (size_t)N * sizeof(int);

  // create the data in CPU memory
  //
  int* a = new int[N];
  int* b = new int[N];
  int* c_cpu = new int[N];
  int* c = new int[N];

  // pointers to the vectors in GPU memory: initialized to NULL so the
  // cleanup code can free them unconditionally
  //
  int* d_a = NULL;
  int* d_b = NULL;
  int* d_c = NULL;

  // tracks whether every step so far has succeeded: once it goes
  // false the remaining stages are skipped and we fall to cleanup
  //
  bool ok = true;

  // initialize the vectors
  //
  fprintf(stdout, "main (CPU): initializing vectors ...\n");
  if (vinit(a, N, (char*)"a") == false) {
    fprintf(stderr, "%s: error initializing vectors\n", prog);
    ok = false;
  }
  if (vinit(b, N, (char*)"b") == false) {
    fprintf(stderr, "%s: error initializing vectors\n", prog);
    ok = false;
  }

  if (ok) {

    // add the vectors on the cpu
    //
    vadd(c_cpu, a, b, N);

    // display the cpu result, and clear the buffer that will receive
    // the gpu result so stale data cannot be mistaken for output
    //
    fprintf(stdout, "main (CPU): displaying the results ...\n");
    for (int i = 0; i < N; i++) {
      fprintf(stdout, "a[%d] (%d) + b[%d] (%d) = c_cpu[%d] (%d)\n",
              i, a[i], i, b[i], i, c_cpu[i]);
      c[i] = 0;
    }
  }

  // allocate space for device copies of a, b, c
  //
  if (ok) {
    fprintf(stdout, "main (GPU): allocating space ...\n");
    ok = cuda_ok(cudaMalloc((void**)&d_a, size), "cudaMalloc (d_a)")
      && cuda_ok(cudaMalloc((void**)&d_b, size), "cudaMalloc (d_b)")
      && cuda_ok(cudaMalloc((void**)&d_c, size), "cudaMalloc (d_c)");
  }

  // copy inputs to device
  //
  if (ok) {
    fprintf(stdout, "main (GPU): copying data to the gpu ...\n");
    ok = cuda_ok(cudaMemcpy(d_a, a, size, cudaMemcpyHostToDevice),
                 "cudaMemcpy (a -> d_a)")
      && cuda_ok(cudaMemcpy(d_b, b, size, cudaMemcpyHostToDevice),
                 "cudaMemcpy (b -> d_b)")
      && cuda_ok(cudaDeviceSynchronize(), "synchronize after upload");
  }

  // launch add() kernel on GPU: a launch does not return a status, so
  // cudaGetLastError() catches launch failures and the synchronize
  // catches failures during execution
  //
  if (ok) {
    fprintf(stdout,
            "main (GPU): adding two vectors on the gpu ...\n");
    add<<<1,1>>>(d_c, d_a, d_b, N);
    ok = cuda_ok(cudaGetLastError(), "add kernel launch")
      && cuda_ok(cudaDeviceSynchronize(), "add kernel execution");
  }

  // copy result back to host
  //
  if (ok) {
    fprintf(stdout, "main (GPU): copying the data back to the cpu ...\n");
    ok = cuda_ok(cudaMemcpy(c, d_c, size, cudaMemcpyDeviceToHost),
                 "cudaMemcpy (d_c -> c)")
      && cuda_ok(cudaDeviceSynchronize(), "synchronize after download");
  }

  // display the result: the bracketed pair is the cpu value and the
  // gpu-minus-cpu difference
  //
  if (ok) {
    fprintf(stdout, "main (GPU): displaying the results of add ...\n");
    for (int i = 0; i < N; i++) {
      fprintf(stdout, "a[%d] (%d) + b[%d] (%d) = c[%d] (%d) [(%d: %d)]\n",
              i, a[i], i, b[i], i, c[i], c_cpu[i], c[i] - c_cpu[i]);
    }
  }

  // cleanup the gpu: runs on every path, any pointer that was never
  // allocated is still NULL here
  //
  fprintf(stdout, "main (GPU): freeing space on the gpu ...\n");
  if (!cuda_ok(cudaFree(d_a), "cudaFree (d_a)")) ok = false;
  if (!cuda_ok(cudaFree(d_b), "cudaFree (d_b)")) ok = false;
  if (!cuda_ok(cudaFree(d_c), "cudaFree (d_c)")) ok = false;

  // cleanup the cpu
  //
  delete [] a;
  delete [] b;
  delete [] c_cpu;
  delete [] c;

  // exit gracefully
  //
  return ok ? 0 : 1;
}