1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
|
#include "stdio.h"
/*
 * Element-wise sum of two float arrays: out[k] = in1[k] + in2[k].
 *
 * Each thread handles at most one element. The element index is built
 * from the x components of blockIdx, blockDim and threadIdx only, so the
 * kernel is meant for a 1D launch. Threads whose index is >= Ntot do
 * nothing, which allows the grid to be rounded up past the array length.
 *
 *   in1, in2 : input arrays, read at indices [0, Ntot)
 *   out      : output array, written at indices [0, Ntot)
 *   Ntot     : number of elements to process
 */
__global__ void add_arrays_gpu( float *in1, float *in2, float *out, int Ntot)
{
    const int gid = blockIdx.x * blockDim.x + threadIdx.x;

    /* Surplus threads in the last block have nothing to do. */
    if (gid >= Ntot) {
        return;
    }

    out[gid] = in1[gid] + in2[gid];
}
/*
 * Report a failed CUDA runtime call on stderr.
 * Returns 1 when err is cudaSuccess, 0 otherwise; `what` names the call
 * for the diagnostic message.
 */
static int cuda_ok(cudaError_t err, const char *what)
{
    if (err == cudaSuccess)
        return 1;
    fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(err));
    return 0;
}

/*
 * Driver: fills two host arrays with a[i] = i and b[i] = -i, copies them
 * to the device, launches add_arrays_gpu over all N elements and releases
 * every buffer.
 *
 * Returns 0 on success and 1 if any host allocation, CUDA runtime call or
 * the kernel itself fails (a diagnostic is written to stderr).
 */
int main(void)
{
    /* pointers to host memory */
    float *a = NULL, *b = NULL, *c = NULL;
    /* pointers to device memory */
    float *a_d = NULL, *b_d = NULL, *c_d = NULL;
    const int N = 100000000;
    /* size of one array in bytes, computed once in size_t */
    const size_t bytes = (size_t)N * sizeof(float);
    /* execution configuration: 1D blocks, grid rounded up to cover N */
    const int block_size = 256;
    dim3 dimBlock(block_size);
    dim3 dimGrid((N + block_size - 1) / block_size);
    int status = 1;   /* pessimistic until every step has succeeded */
    int i;

    /* Allocate arrays a, b and c on host */
    a = (float*) malloc(bytes);
    b = (float*) malloc(bytes);
    c = (float*) malloc(bytes);
    if (a == NULL || b == NULL || c == NULL) {
        fprintf(stderr, "host allocation of %zu bytes failed\n", bytes);
        goto cleanup;
    }

    /* Allocate arrays a_d, b_d and c_d on device */
    if (!cuda_ok(cudaMalloc((void **) &a_d, bytes), "cudaMalloc(a_d)") ||
        !cuda_ok(cudaMalloc((void **) &b_d, bytes), "cudaMalloc(b_d)") ||
        !cuda_ok(cudaMalloc((void **) &c_d, bytes), "cudaMalloc(c_d)"))
        goto cleanup;

    /* Initialize arrays a and b */
    for (i = 0; i < N; i++) {
        a[i] = (float) i;
        b[i] = (float) -i;
    }

    /* Copy data from host memory to device memory */
    if (!cuda_ok(cudaMemcpy(a_d, a, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(a_d)") ||
        !cuda_ok(cudaMemcpy(b_d, b, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(b_d)"))
        goto cleanup;

    /* Add arrays a and b, store result in c_d */
    add_arrays_gpu<<<dimGrid,dimBlock>>>(a_d, b_d, c_d, N);
    /* A launch returns no status: query it explicitly, then wait for the
       kernel so that execution errors are reported here as well. */
    if (!cuda_ok(cudaGetLastError(), "add_arrays_gpu launch") ||
        !cuda_ok(cudaDeviceSynchronize(), "add_arrays_gpu execution"))
        goto cleanup;

    /* Copy data from device memory to host memory (currently disabled) */
    //cudaMemcpy(c, c_d, sizeof(float)*N, cudaMemcpyDeviceToHost);
    /* Print c (currently disabled) */
    /*for(i=0; i<N; i++)
    printf(" c[%d]=%f\n",i,c[i]);*/

    status = 0;

cleanup:
    /* Free the memory; free() and cudaFree() both accept null pointers. */
    free(a); free(b); free(c);
    cudaFree(a_d); cudaFree(b_d); cudaFree(c_d);
    return status;
}
|