#include <stdio.h>
#include <stdlib.h>
// Kernel: adds the first element of b into the first element of a, in place.
//
// Only index 0 of each pointer is touched and no thread/block index is used,
// so every launched thread performs the same unguarded read-modify-write on
// a[0]. Both pointers are dereferenced on the device.
__global__ void add(int *a, int *b) {
    const int addend = *b;
    *a = *a + addend;
}

// Reports a failed CUDA runtime call on stderr.
// `what` is a short description of the step that was attempted.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaStepOk(cudaError_t status, const char *what) {
    if (status == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "CUDA error %s: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Adds two integers on the GPU with a single-thread launch of add() and
// prints the sum.
//
// Every CUDA runtime call is checked. On the first failure the remaining
// steps are skipped, no result is printed, device buffers are still released,
// and the process exits with EXIT_FAILURE.
int main() {
    // Host copies of variables a, b (input values)
    int a = 5;
    int b = 100;

    // Device copies of variables a, b; NULL so cleanup is safe on early failure
    int *d_a = NULL;
    int *d_b = NULL;

    // Size in bytes of each buffer
    const size_t size = sizeof(int);

    // Allocate space for device copies of a, b, then copy inputs to device.
    // The && chain stops at the first failing step.
    bool ok = cudaStepOk(cudaMalloc(&d_a, size), "allocating d_a")
           && cudaStepOk(cudaMalloc(&d_b, size), "allocating d_b")
           && cudaStepOk(cudaMemcpy(d_a, &a, size, cudaMemcpyHostToDevice),
                         "copying a to Device")
           && cudaStepOk(cudaMemcpy(d_b, &b, size, cudaMemcpyHostToDevice),
                         "copying b to Device");

    if (ok) {
        // Launch add() kernel on GPU
        add<<<1, 1>>>(d_a, d_b);

        // A kernel launch returns no status itself; launch errors are
        // retrieved with cudaGetLastError(). The blocking copy back to the
        // host then reports any error raised while the kernel executed.
        ok = cudaStepOk(cudaGetLastError(), "launching add kernel")
          && cudaStepOk(cudaMemcpy(&a, d_a, size, cudaMemcpyDeviceToHost),
                        "copying to Host");
    }

    // Only print the result when it was actually computed and copied back
    if (ok) {
        printf("result is %d\n", a);
    }

    // Cleanup (cudaFree on a NULL pointer performs no operation)
    cudaFree(d_a);
    cudaFree(d_b);

    return ok ? EXIT_SUCCESS : EXIT_FAILURE;
}