// file: /courses/temple/ece_4822/lectures/current/lecture_12/example.cu
//
// This file contains my first CUDA program. Note that the file extension
// is ".cu".
//
// To compile this program:
//
//  nvcc -o example.exe example.cu
//
// This program launches a "hello world" kernel and an integer-add kernel on
// a GPU and prints the results to the terminal.
//

// include files
//
#include "example.h"

// function: check_cuda
//
// arguments:
//  cudaError_t status: the value returned by a CUDA runtime call
//  const char* what: a short label for the call, used in the error message
//
// return: true if status is cudaSuccess, false otherwise
//
// This helper reports a failed CUDA runtime call on stderr so that
// failures are not silently ignored.
//
static bool check_cuda(cudaError_t status, const char* what) {
  if (status != cudaSuccess) {
    fprintf(stderr, "main program: %s failed: %s\n", what,
            cudaGetErrorString(status));
    return false;
  }
  return true;
}

// function: main
//
// arguments: none
//
// return: 0 if every CUDA call succeeded, 1 otherwise
//
// This program launches mykernel, then adds two integers with the
// myadd kernel and prints the sum. Every CUDA runtime call is checked;
// device memory is released on both the success and failure paths.
//
int main(void) {

  // case no. 1: hello world
  //
  // a kernel launch does not return a status: launch errors are read
  // with cudaGetLastError() and execution errors are reported by the
  // synchronize call that follows.
  //
  mykernel<<<1,1>>>();
  if (!check_cuda(cudaGetLastError(), "mykernel launch") ||
      !check_cuda(cudaDeviceSynchronize(), "mykernel execution")) {
    return 1;
  }
  fprintf(stdout, "main program: hello world from the CPU / main program!\n");

  // case no. 2: add
  //
  // c is initialized so it never holds an indeterminate value, and the
  // device pointers start as NULL so the cleanup below is safe even when
  // an allocation failed (cudaFree(NULL) is a no-op).
  //
  int a = 1;
  int b = 27;
  int c = 0;
  int *d_a = NULL, *d_b = NULL, *d_c = NULL;
  const size_t size = sizeof(int);

  // allocate space for device copies of a, b, c
  //
  bool ok =
    check_cuda(cudaMalloc((void **)&d_a, size), "cudaMalloc(d_a)") &&
    check_cuda(cudaMalloc((void **)&d_b, size), "cudaMalloc(d_b)") &&
    check_cuda(cudaMalloc((void **)&d_c, size), "cudaMalloc(d_c)");

  // copy inputs to device
  //
  ok = ok &&
    check_cuda(cudaMemcpy(d_a, &a, size, cudaMemcpyHostToDevice),
               "cudaMemcpy(d_a)") &&
    check_cuda(cudaMemcpy(d_b, &b, size, cudaMemcpyHostToDevice),
               "cudaMemcpy(d_b)");

  // launch add() kernel on GPU
  //
  if (ok) {
    myadd<<<1,1>>>(d_c, d_a, d_b);
    ok = check_cuda(cudaGetLastError(), "myadd launch");
  }

  // copy result back to host and wait for the device to finish
  //
  ok = ok &&
    check_cuda(cudaMemcpy(&c, d_c, size, cudaMemcpyDeviceToHost),
               "cudaMemcpy(c)") &&
    check_cuda(cudaDeviceSynchronize(), "myadd execution");

  // display the result (only when it is known to be valid)
  //
  if (ok) {
    fprintf(stdout, "myadd: %d + %d = %d\n", a, b, c);
  }

  // cleanup
  //
  // each free is evaluated before "&& ok" so that all three buffers are
  // released even if an earlier step failed.
  //
  ok = check_cuda(cudaFree(d_a), "cudaFree(d_a)") && ok;
  ok = check_cuda(cudaFree(d_b), "cudaFree(d_b)") && ok;
  ok = check_cuda(cudaFree(d_c), "cudaFree(d_c)") && ok;

  // exit gracefully
  //
  return ok ? 0 : 1;
}