// file: /courses/temple/ece_4822/lectures/current/lecture_13/example.cu
//
// This file implements a simple parallel vector add program.
//
// To compile this program:
//
//  nvcc -o example.exe example.cu
//
//

// include files
//
#include "example.h"

// report the outcome of a CUDA runtime call: returns true when err is
// cudaSuccess; otherwise prints which step failed together with the
// runtime's description of the error and returns false
//
static bool cuda_ok(cudaError_t err, const char* prog, const char* what) {
  if (err == cudaSuccess) {
    return true;
  }
  fprintf(stdout, "%s: error %s (%s)\n", prog, what, cudaGetErrorString(err));
  return false;
}

// main: adds two N-element integer vectors on the CPU and prints the
// result, then copies the vectors to the GPU, launches the kernels
// declared in example.h, and prints what comes back
//
// usage: example.exe N   (N is the vector dimension, N >= 1)
//
// returns 0 on success and 1 on a usage, initialization or CUDA error
//
int main(int argc, char** argv) {

  // get the dimension: argv[1] must exist and must be a positive count,
  // since element 0 of each vector is accessed below
  //
  if (argc < 2) {
    fprintf(stdout, "usage: %s N\n", argv[0]);
    return 1;
  }
  int N = atoi(argv[1]);
  if (N < 1) {
    fprintf(stdout, "%s: error: dimension must be a positive integer\n",
	    argv[0]);
    return 1;
  }

  // case no. 3: add two vectors
  //
  // device pointers start out as NULL so that the cleanup code can call
  // cudaFree on them unconditionally; the byte count is computed in
  // size_t so it cannot overflow an int for large N
  //
  int *a, *b, *c;
  int *d_a = NULL, *d_b = NULL, *d_c = NULL, *d_N = NULL;
  size_t size = (size_t)N * sizeof(int);

  // create the data
  //
  a = new int[N];
  b = new int[N];
  c = new int[N];

  // everything below runs inside a single-pass loop so that any failure
  // can "break" straight to the shared cleanup code
  //
  int status = 1;
  do {

    // initialize the vectors
    //
    fprintf(stdout, "... initializing vectors on the CPU ...\n");
    if (vinit(a, N) == false) {
      fprintf(stdout, "%s: error initializing vectors\n", argv[0]);
      break;
    }
    if (vinit(b, N) == false) {
      fprintf(stdout, "%s: error initializing vectors\n", argv[0]);
      break;
    }

    // add the vectors on the cpu
    //
    vadd(c, a, b, N);

    // display the result, and clear c so that the values printed after
    // the gpu run cannot be left over from the cpu run
    //
    fprintf(stdout, "... displaying the results of vadd on the cpu ...\n");
    for (int i = 0; i < N; i++) {
      fprintf(stdout, "a[%d] (%d) + b[%d] (%d) = c[%d] (%d)\n",
	      i, a[i], i, b[i], i, c[i]);
      c[i] = (int)0;
    }

    // allocate space for device copies of a, b, c
    //
    fprintf(stdout, "... allocating space on the gpu ...\n");
    if (!cuda_ok(cudaMalloc((void **)&d_a, size), argv[0], "allocating d_a") ||
	!cuda_ok(cudaMalloc((void **)&d_b, size), argv[0], "allocating d_b") ||
	!cuda_ok(cudaMalloc((void **)&d_c, size), argv[0], "allocating d_c") ||
	!cuda_ok(cudaMalloc((void **)&d_N, sizeof(int)), argv[0],
		 "allocating d_N")) {
      break;
    }

    // copy inputs to device: the source is the host buffer itself (a, b),
    // not the address of the pointer variable that refers to it
    //
    fprintf(stdout, "... copying data to the gpu ...\n");
    if (!cuda_ok(cudaMemcpy(d_a, a, size, cudaMemcpyHostToDevice),
		 argv[0], "copying a to the gpu") ||
	!cuda_ok(cudaMemcpy(d_b, b, size, cudaMemcpyHostToDevice),
		 argv[0], "copying b to the gpu") ||
	!cuda_ok(cudaMemcpy(d_N, &N, sizeof(int), cudaMemcpyHostToDevice),
		 argv[0], "copying N to the gpu")) {
      break;
    }

    // launch add() kernel on GPU
    //
    // a kernel launch returns no status: cudaGetLastError reports a bad
    // launch and cudaDeviceSynchronize reports a fault during execution
    //
    fprintf(stdout, "... adding two vectors on the gpu ...\n");
    mykernel<<<1,1>>>();
    myadd<<<1,1>>>(d_c, d_a, d_b);
    //  vadd_gpu<<<1,1>>>(d_c, d_a, d_b, d_N);
    if (!cuda_ok(cudaGetLastError(), argv[0], "launching the kernels") ||
	!cuda_ok(cudaDeviceSynchronize(), argv[0], "running the kernels")) {
      break;
    }

    // copy result back to host: the destination is the host buffer c
    //
    if (!cuda_ok(cudaMemcpy(c, d_c, size, cudaMemcpyDeviceToHost),
		 argv[0], "copying c from the gpu")) {
      break;
    }

    // display the result
    //
    // NOTE(review): myadd is declared in example.h; this assumes it adds
    // only the first element of each vector - confirm against the kernel
    //
    fprintf(stdout, "... displaying the results of vadd_gpu ...\n");
    fprintf(stdout, "myadd: %d + %d = %d\n", a[0], b[0], c[0]);
    //  for (int i = 0; i < N; i++) {
    //    fprintf(stdout, "a[%d] (%d) + b[%d] (%d) = c[%d] (%d)\n", i, a[i], i, b[i], i, c[i]);
    //  }

    status = 0;
  } while (false);

  // cleanup: release every device and host allocation on both the
  // success and the failure paths (cudaFree accepts a NULL pointer)
  //
  fprintf(stdout, "... freeing gpu space ...\n");
  cudaFree(d_a);
  cudaFree(d_b);
  cudaFree(d_c);
  cudaFree(d_N);
  delete [] a;
  delete [] b;
  delete [] c;

  // exit gracefully
  //
  return status;
}