-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcuda_example.cu
More file actions
55 lines (45 loc) · 1.71 KB
/
cuda_example.cu
File metadata and controls
55 lines (45 loc) · 1.71 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
#include "cuda_example.h"

#include <cstdio>
#include <cstdlib>
// Device helper: fetch one element of a row-major nrows x ncols matrix
// stored in `data` and return it incremented by one.
__device__ float device_function(const float *data, const int row, const int col, const int ncols){
    const int offset = row * ncols + col;  // row-major linear offset
    return 1.0f + data[offset];
}
// Elementwise kernel over a row-major nrows x ncols matrix:
//   result[r][c] = (data[r][c] + 1) * (params[0] + params[1])
// Expects a 2D launch whose grid covers at least ncols x nrows threads
// (x -> column, y -> row); out-of-range threads exit at the guard.
// Precondition: numparams >= 2 — params[0] and params[1] are read
// unconditionally (numparams itself is not consulted here).
__global__ void kernel(
    float *result,
    const float *data,
    const float *params,
    const int nrows,
    const int ncols,
    const int numparams)
{
    const int row = blockIdx.y * blockDim.y + threadIdx.y;
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    // Guard the grid tail only: indices built from the unsigned builtins
    // can never be negative, so the original < 0 checks were dead code.
    if (row >= nrows || col >= ncols) return;
    const float scale = params[0] + params[1];
    const int idx = col + row * ncols;
    // Compute the full value once and do a single coalesced store, instead
    // of the original store followed by a global read-modify-write.
    result[idx] = device_function(data, row, col, ncols) * scale;
}
// Report-and-abort check for CUDA runtime calls: the runtime's errors are
// sticky, so ignoring one makes every later call fail mysteriously.
static void check_cuda(cudaError_t err, const char *what)
{
    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
}

// Host wrapper: copies `data` (nrows x ncols, row-major) and `params`
// (numparams floats, at least 2 — see kernel precondition) to the device,
// runs `kernel` with one thread per element, and copies the result back
// into `result` (nrows x ncols, caller-allocated). Aborts on any CUDA error.
void run_cuda_kernel(
    float *result,
    const float *data,
    const float *params,
    const int nrows,
    const int ncols,
    const int numparams)
{
    // size_t arithmetic: int products overflow for large matrices.
    const size_t matrix_bytes = sizeof(float) * (size_t)nrows * (size_t)ncols;
    const size_t param_bytes = sizeof(float) * (size_t)numparams;

    float *d_result = nullptr, *d_data = nullptr, *d_params = nullptr;

    // Allocate GPU memory
    check_cuda(cudaMalloc((void**)&d_result, matrix_bytes), "cudaMalloc d_result");
    check_cuda(cudaMalloc((void**)&d_data, matrix_bytes), "cudaMalloc d_data");
    check_cuda(cudaMalloc((void**)&d_params, param_bytes), "cudaMalloc d_params");

    // Transfer inputs from host to device
    check_cuda(cudaMemcpy(d_data, data, matrix_bytes, cudaMemcpyHostToDevice),
               "cudaMemcpy data");
    check_cuda(cudaMemcpy(d_params, params, param_bytes, cudaMemcpyHostToDevice),
               "cudaMemcpy params");

    // One thread per element. Ceil-division covers the partial tiles at the
    // edges without launching a whole extra block when the dimensions divide
    // evenly (the original n/32+1 always over-allocated in that case).
    constexpr int tile = 32;
    dim3 block_size(tile, tile);
    dim3 grid_size((ncols + tile - 1) / tile, (nrows + tile - 1) / tile);
    kernel<<<grid_size, block_size>>>(d_result, d_data, d_params, nrows, ncols, numparams);
    // Launch-configuration errors only surface via cudaGetLastError.
    check_cuda(cudaGetLastError(), "kernel launch");

    // Blocking copy back to host; this also synchronizes with the kernel,
    // so any asynchronous execution error is reported here.
    check_cuda(cudaMemcpy(result, d_result, matrix_bytes, cudaMemcpyDeviceToHost),
               "cudaMemcpy result");

    // Deallocate device memory
    check_cuda(cudaFree(d_result), "cudaFree d_result");
    check_cuda(cudaFree(d_data), "cudaFree d_data");
    check_cuda(cudaFree(d_params), "cudaFree d_params");
}