#include "stdio.h"
#include <stdlib.h>

/*
 * Evaluate a CUDA runtime call and, if it does not return cudaSuccess,
 * print the failing file/line and the runtime's error string to stderr,
 * then terminate the process.
 */
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(err_));                             \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)

/*
 * Element-wise sum: out[idx] = in1[idx] + in2[idx] for every idx < Ntot.
 *
 * Each thread handles the single element at
 * blockIdx.x * blockDim.x + threadIdx.x, so only the x dimension of the
 * launch is used and the launch must supply at least Ntot threads in total.
 * Threads whose index is >= Ntot do nothing.
 *
 * in1, in2 : input arrays, read at idx
 * out      : output array, written at idx
 * Ntot     : number of elements to process
 */
__global__ void add_arrays_gpu(float *in1, float *in2, float *out, int Ntot)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < Ntot)
        out[idx] = in1[idx] + in2[idx];
}

/*
 * Builds N host rows of M floats (x[i][j] = j), allocates one device buffer
 * per row and copies each row host-to-device.
 *
 * With N = 10000 and M = 100000 this requests 4e9 bytes on the host and the
 * same amount on the device, so allocation failure is a realistic outcome;
 * every allocation and copy is therefore checked.
 *
 * NOTE(review): the source text is cut off inside the final commented-out
 * print loop, so the remainder of this function is not visible here.
 */
int main(void)
{
    int N = 10000, M = 100000;
    size_t row_bytes = (size_t)M * sizeof(float);

    /*
     * x and x_d are tables of N row pointers, so each slot is sized as a
     * float*, not a float.  x_d itself is host memory; its entries are
     * filled in by cudaMalloc below.
     */
    float **x = (float **)malloc(N * sizeof(float *));
    float **x_d = (float **)malloc(N * sizeof(float *));
    if (x == NULL || x_d == NULL) {
        fprintf(stderr, "host allocation of row-pointer tables failed\n");
        return EXIT_FAILURE;
    }

    int i, j;
    for (i = 0; i < N; i++) {
        x[i] = (float *)malloc(row_bytes);
        if (x[i] == NULL) {
            fprintf(stderr, "host allocation of row %d failed\n", i);
            return EXIT_FAILURE;
        }
        CUDA_CHECK(cudaMalloc((void **)&x_d[i], row_bytes));

        for (j = 0; j < M; j++) {
            x[i][j] = (float)j;
        }
        CUDA_CHECK(cudaMemcpy(x_d[i], x[i], row_bytes, cudaMemcpyHostToDevice));
    }

    /* Compute the execution configuration */
    // int block_size=256;
    // dim3 dimBlock(block_size);
    //dim3 dimGrid ( (N/dimBlock.x) + (!(N%dimBlock.x)?0:1) );

    /** Add arrays a and b, store result in c */
    // add_arrays_gpu<<<dimGrid, dimBlock>>>(a_d, b_d, c_d, N);

    /* Copy data from device memory to host memory */
    //cudaMemcpy(c, c_d, sizeof(float)*N, cudaMemcpyDeviceToHost);

    /* Print c */
    /*for(i=0; i