/*
 * Element-wise addition of two float arrays on the GPU.
 *
 * add_arrays_gpu(in1, in2, out, Ntot)
 *   Kernel. Each thread computes idx = blockIdx.x*blockDim.x + threadIdx.x
 *   and, only when idx < Ntot, stores in1[idx] + in2[idx] into out[idx].
 *   Threads whose idx is Ntot or larger write nothing. Only the x component
 *   of the block and thread indices is used.
 *
 * main()
 *   Allocates three host arrays (a, b, c) with malloc and three device
 *   arrays (a_d, b_d, c_d) with cudaMalloc, each holding N = 100000000
 *   floats. The return values of these allocation calls are not checked.
 *   What remains of the kernel launch passes (a_d, b_d, c_d, N).
 *
 * NOTE(review): this text looks damaged and should be restored from the
 *   original file before use:
 *   - Everything sits on one physical line, so the code follows the
 *     #include directive on the directive's own line, and the trailing
 *     line comment swallows the rest of the line.
 *   - The span between the header of the initialization loop
 *     ("for (i=0; i") and the kernel arguments (">>(a_d, ...") is missing.
 *     Presumably the loop body, the host-to-device copies and the launch
 *     configuration were lost -- TODO confirm against the original.
 *   - The end of main (the print loop and whatever follows it) is cut off.
 * NOTE(review): the device-to-host cudaMemcpy of c is commented out, so the
 *   visible code never fills c from c_d.
 * NOTE(review): malloc is called but only stdio.h is included in the
 *   visible text; <stdlib.h> is presumably needed -- confirm.
 */
#include "stdio.h" __global__ void add_arrays_gpu( float *in1, float *in2, float *out, int Ntot) { int idx=blockIdx.x*blockDim.x+threadIdx.x; if (idx < Ntot) out[idx]=in1[idx]+in2[idx]; } int main(void) { /* pointers to host memory */ float *a, *b, *c; /* pointers to device memory */ float *a_d, *b_d, *c_d; int N=100000000; int i; /* Allocate arrays a, b and c on host*/ a = (float*) malloc(N*sizeof(float)); b = (float*) malloc(N*sizeof(float)); c = (float*) malloc(N*sizeof(float)); /* Allocate arrays a_d, b_d and c_d on device*/ cudaMalloc ((void **) &a_d, sizeof(float)*N); cudaMalloc ((void **) &b_d, sizeof(float)*N); cudaMalloc ((void **) &c_d, sizeof(float)*N); /* Initialize arrays a and b */ for (i=0; i>>(a_d, b_d, c_d, N); /* Copy data from device memory to host memory */ //cudaMemcpy(c, c_d, sizeof(float)*N, cudaMemcpyDeviceToHost); /* Print c */ /*for(i=0; i