// nvcc demoCuda.cu
// time ./a.out
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

// Abort with file/line context if a CUDA runtime call fails.
// Kernel launches themselves return no status; check cudaGetLastError()
// right after the launch to catch bad configurations.
#define CUDA_CHECK(call)                                                    \
    do {                                                                    \
        cudaError_t err_ = (call);                                          \
        if (err_ != cudaSuccess) {                                          \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,   \
                    cudaGetErrorString(err_));                              \
            exit(EXIT_FAILURE);                                             \
        }                                                                   \
    } while (0)

// Element-wise vector addition: out[idx] = in1[idx] + in2[idx].
// Expects a flat 1-D grid of 1-D blocks; `n` guards the tail so the
// grid size never has to divide the element count exactly.
__global__ void addArraysGpu(const float* in1, const float* in2,
                             float* out, int n)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;  // flat global index
    if (idx < n) {
        out[idx] = in1[idx] + in2[idx];
    }
}

int main(void)
{
    // pointers to host memory
    float *a, *b, *c;
    // pointers to device memory
    float *a_d, *b_d, *c_d;

    int i, N = 18;
    size_t len = (size_t)N * sizeof(float);

    // allocate arrays a, b and c on host (checked: malloc may return NULL)
    a = (float*)malloc(len);
    b = (float*)malloc(len);
    c = (float*)malloc(len);
    if (a == NULL || b == NULL || c == NULL) {
        fprintf(stderr, "host allocation failed\n");
        return EXIT_FAILURE;
    }

    // allocate arrays a_d, b_d and c_d on device
    CUDA_CHECK(cudaMalloc((void**)&a_d, len));
    CUDA_CHECK(cudaMalloc((void**)&b_d, len));
    CUDA_CHECK(cudaMalloc((void**)&c_d, len));

    // initialize arrays a and b: a[i] = i^2, b[i] = -i/2
    for (i = 0; i < N; i++) {
        a[i] = (float)i * i;
        b[i] = (float)i / -2.0f;
    }

    // copy input from host memory to device memory
    CUDA_CHECK(cudaMemcpy(a_d, a, len, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(b_d, b, len, cudaMemcpyHostToDevice));

    // execution configuration: flat 1-D layout; ceil-divide so any N works
    // (the original <<<1,N>>> silently fails once N exceeds the per-block
    // thread limit)
    int threads = 256;
    int blocks  = (N + threads - 1) / threads;
    addArraysGpu<<<blocks, threads>>>(a_d, b_d, c_d, N);
    CUDA_CHECK(cudaGetLastError());  // catch launch-configuration errors

    // copy result from device memory to host memory
    // (cudaMemcpy is blocking, so it also synchronizes with the kernel)
    CUDA_CHECK(cudaMemcpy(c, c_d, len, cudaMemcpyDeviceToHost));

    for (i = 0; i < N; i++) {
        printf("c[%d]=%f\n", i, c[i]);
    }

    free(a);
    free(b);
    free(c);
    CUDA_CHECK(cudaFree(a_d));
    CUDA_CHECK(cudaFree(b_d));
    CUDA_CHECK(cudaFree(c_d));

    return 0;
}
// end of file