-
Notifications
You must be signed in to change notification settings - Fork 1
/
main_mix_8_floats.cpp
113 lines (95 loc) · 3.84 KB
/
main_mix_8_floats.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
constexpr auto kernel_file = "vector_mix_floats.cl";
/*
This is the cpu version of the gpu kernel, to test that our program works as intended.
*/
auto cpu_func(std::vector<float> const & input) {
std::vector<float> output;
output.resize(input.size());
for(int i=0;i<input.size(); i+=2) {
output[i] = input[i+1];
output[i+1] = input[i];
}
return output;
}
int main(void) {
// Create the input vector
std::vector<float> input{2.5f, 9.f, -3.f, 5.f, 10.f, 4.f, 1.f, 7.f};
// Get platform and device information
cl_platform_id platform_id = NULL;
cl_device_id device_id = NULL;
cl_uint ret_num_devices;
cl_uint ret_num_platforms;
cl_int ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
CHECK_CL_ERROR(ret);
ret = clGetDeviceIDs( platform_id, CL_DEVICE_TYPE_DEFAULT, 1,
&device_id, &ret_num_devices);
CHECK_CL_ERROR(ret);
// Create an OpenCL context
cl_context context = clCreateContext( NULL, 1, &device_id, NULL, NULL, &ret);
CHECK_CL_ERROR(ret);
// Create a command queue
cl_command_queue command_queue = clCreateCommandQueue(context, device_id, 0, &ret);
CHECK_CL_ERROR(ret);
auto bufSz = input.size() * sizeof(decltype(input[0]));
// Create memory buffers on the device for each vector
cl_mem input_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY,
bufSz, NULL, &ret);
CHECK_CL_ERROR(ret);
cl_mem output_mem_obj = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
bufSz, NULL, &ret);
CHECK_CL_ERROR(ret);
// Copy the lists A and B to their respective memory buffers
ret = clEnqueueWriteBuffer(command_queue, input_mem_obj, CL_TRUE, 0,
bufSz, input.data(), 0, NULL, NULL);
CHECK_CL_ERROR(ret);
// Create a program from the kernel source
auto kernel_src = read_kernel(kernel_file);
auto kernel_c_src = kernel_src.c_str();
auto source_size = kernel_src.size();
cl_program program = clCreateProgramWithSource(context, 1,
(const char **)&kernel_c_src, (const size_t *)&source_size, &ret);
CHECK_CL_ERROR(ret);
// Build the program
ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
CHECK_CL_ERROR(ret);
// Create the OpenCL kernel
cl_kernel kernel = clCreateKernel(program, "kernel_func", &ret);
CHECK_CL_ERROR(ret);
// Set the arguments of the kernel
ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&input_mem_obj);
CHECK_CL_ERROR(ret);
ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&output_mem_obj);
CHECK_CL_ERROR(ret);
// Execute the OpenCL kernel on the list
size_t global_item_size = input.size(); // Process the entire lists
size_t local_item_size = input.size();
ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL,
&global_item_size, &local_item_size, 0, NULL, NULL);
CHECK_CL_ERROR(ret);
// Read the memory buffer output_mem_obj on the device to the local variable output
decltype(input) output;
output.resize(input.size());
ret = clEnqueueReadBuffer(command_queue, output_mem_obj, CL_TRUE, 0,
bufSz, output.data(), 0, NULL, NULL);
CHECK_CL_ERROR(ret);
// The output produced by the gpu is the same as the output produced by the cpu:
verifyVectorsAreEqual(output, cpu_func(input));
// Clean up
ret = clFlush(command_queue);
CHECK_CL_ERROR(ret);
ret = clFinish(command_queue);
CHECK_CL_ERROR(ret);
ret = clReleaseKernel(kernel);
CHECK_CL_ERROR(ret);
ret = clReleaseProgram(program);
CHECK_CL_ERROR(ret);
ret = clReleaseMemObject(input_mem_obj);
CHECK_CL_ERROR(ret);
ret = clReleaseMemObject(output_mem_obj);
CHECK_CL_ERROR(ret);
ret = clReleaseCommandQueue(command_queue);
CHECK_CL_ERROR(ret);
ret = clReleaseContext(context);
CHECK_CL_ERROR(ret);
return 0;
}