xref: /aosp_15_r20/external/OpenCL-CTS/test_conformance/workgroups/test_wg_broadcast.cpp (revision 6467f958c7de8070b317fc65bcb0f6472e388d82)
1 //
2 // Copyright (c) 2017 The Khronos Group Inc.
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 //    http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 //
16 #include "harness/compat.h"
17 
18 #include <stdio.h>
19 #include <string.h>
20 #include <sys/types.h>
21 #include <sys/stat.h>
22 
23 #include <algorithm>
24 
25 #include "procs.h"
26 
27 
28 const char *wg_broadcast_1D_kernel_code =
29 "__kernel void test_wg_broadcast_1D(global float *input, global float *output)\n"
30 "{\n"
31 "    int  tid = get_global_id(0);\n"
32 "\n"
33 "    float result = work_group_broadcast(input[tid], get_group_id(0) % get_local_size(0));\n"
34 "    output[tid] = result;\n"
35 "}\n";
36 
37 const char *wg_broadcast_2D_kernel_code =
38 "__kernel void test_wg_broadcast_2D(global float *input, global float *output)\n"
39 "{\n"
40 "    size_t tid_x = get_global_id(0);\n"
41 "    size_t tid_y = get_global_id(1);\n"
42 "    size_t x = get_group_id(0) % get_local_size(0);\n"
43 "    size_t y = get_group_id(1) % get_local_size(1);\n"
44 "\n"
45 "    size_t indx = (tid_y * get_global_size(0)) + tid_x;\n"
46 "    float result = work_group_broadcast(input[indx], x, y);\n"
47 "    output[indx] = result;\n"
48 "}\n";
49 
50 const char *wg_broadcast_3D_kernel_code =
51 "__kernel void test_wg_broadcast_3D(global float *input, global float *output)\n"
52 "{\n"
53 "    size_t tid_x = get_global_id(0);\n"
54 "    size_t tid_y = get_global_id(1);\n"
55 "    size_t tid_z = get_global_id(2);\n"
56 "    size_t x = get_group_id(0) % get_local_size(0);\n"
57 "    size_t y = get_group_id(1) % get_local_size(1);\n"
58 "    size_t z = get_group_id(2) % get_local_size(2);\n"
59 "\n"
60 "    size_t indx = (tid_z * get_global_size(1) * get_global_size(0)) + (tid_y * get_global_size(0)) + tid_x;\n"
61 "    float result = work_group_broadcast(input[indx], x, y, z);\n"
62 "    output[indx] = result;\n"
63 "}\n";
64 
65 static int
verify_wg_broadcast_1D(float * inptr,float * outptr,size_t n,size_t wg_size)66 verify_wg_broadcast_1D(float *inptr, float *outptr, size_t n, size_t wg_size)
67 {
68     size_t     i, j;
69     size_t     group_id;
70 
71     for (i=0,group_id=0; i<n; i+=wg_size,group_id++)
72     {
73         size_t local_size = (n - i) > wg_size ? wg_size : (n - i);
74         float broadcast_result = inptr[i + (group_id % local_size)];
75         for (j=0; j<local_size; j++)
76         {
77             if ( broadcast_result != outptr[i+j] )
78             {
79                 log_info("work_group_broadcast: Error at %u: expected = %f, got = %f\n", i+j, broadcast_result, outptr[i+j]);
80                 return -1;
81             }
82         }
83     }
84 
85     return 0;
86 }
87 
88 static int
verify_wg_broadcast_2D(float * inptr,float * outptr,size_t nx,size_t ny,size_t wg_size_x,size_t wg_size_y)89 verify_wg_broadcast_2D(float *inptr, float *outptr, size_t nx, size_t ny, size_t wg_size_x, size_t wg_size_y)
90 {
91     size_t i, j, _i, _j;
92     size_t group_id_x, group_id_y;
93 
94     for (i=0,group_id_y=0; i<ny; i+=wg_size_y,group_id_y++)
95     {
96         size_t y = group_id_y % wg_size_y;
97         size_t local_size_y = (ny-i) > wg_size_y ? wg_size_y : (ny-i);
98         for (_i=0; _i < local_size_y; _i++)
99         {
100             for (j=0,group_id_x=0; j<nx; j+=wg_size_x,group_id_x++)
101             {
102                 size_t x = group_id_x % wg_size_x;
103                 size_t local_size_x = (nx-j) > wg_size_x ? wg_size_x : (nx-j);
104                 float  broadcast_result = inptr[(i + y) * nx + (j + x)];
105                 for (_j=0; _j < local_size_x; _j++)
106                 {
107                     size_t indx = (i + _i) * nx + (j + _j);
108                     if ( broadcast_result != outptr[indx] )
109                     {
110                         log_info("work_group_broadcast: Error at (%u, %u): expected = %f, got = %f\n", j+_j, i+_i, broadcast_result, outptr[indx]);
111                         return -1;
112                     }
113                 }
114             }
115         }
116     }
117 
118     return 0;
119 }
120 
121 static int
verify_wg_broadcast_3D(float * inptr,float * outptr,size_t nx,size_t ny,size_t nz,size_t wg_size_x,size_t wg_size_y,size_t wg_size_z)122 verify_wg_broadcast_3D(float *inptr, float *outptr, size_t nx, size_t ny, size_t nz, size_t wg_size_x, size_t wg_size_y, size_t wg_size_z)
123 {
124     size_t i, j, k, _i, _j, _k;
125     size_t group_id_x, group_id_y, group_id_z;
126 
127     for (i=0,group_id_z=0; i<nz; i+=wg_size_z,group_id_z++)
128     {
129         size_t z = group_id_z % wg_size_z;
130         size_t local_size_z = (nz-i) > wg_size_z ? wg_size_z : (nz-i);
131         for (_i=0; _i < local_size_z; _i++)
132         {
133             for (j=0,group_id_y=0; j<ny; j+=wg_size_y,group_id_y++)
134             {
135                 size_t y = group_id_y % wg_size_y;
136                 size_t local_size_y = (ny-j) > wg_size_y ? wg_size_y : (ny-j);
137                 for (_j=0; _j < local_size_y; _j++)
138                 {
139                     for (k=0,group_id_x=0; k<nx; k+=wg_size_x,group_id_x++)
140                     {
141                         size_t x = group_id_x % wg_size_x;
142                         size_t local_size_x = (nx-k) > wg_size_x ? wg_size_x : (nx-k);
143                         float  broadcast_result = inptr[(i + z) * ny * nz + (j + y) * nx + (k + x)];
144                         for (_k=0; _k < local_size_x; _k++)
145                         {
146                             size_t indx = (i + _i) * ny * nx + (j + _j) * nx + (k + _k);
147                             if ( broadcast_result != outptr[indx] )
148                             {
149                                 log_info("work_group_broadcast: Error at (%u, %u, %u): expected = %f, got = %f\n", k+_k, j+_j, i+_i, broadcast_result, outptr[indx]);
150                                 return -1;
151                             }
152                         }
153                     }
154                 }
155             }
156         }
157     }
158 
159     return 0;
160 }
161 
162 
163 int
test_work_group_broadcast_1D(cl_device_id device,cl_context context,cl_command_queue queue,int n_elems)164 test_work_group_broadcast_1D(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems)
165 {
166     cl_mem       streams[2];
167     cl_float     *input_ptr[1], *p;
168     cl_float     *output_ptr;
169     cl_program   program;
170     cl_kernel    kernel;
171     size_t       globalsize[1];
172     size_t       wg_size[1];
173     size_t       num_elements;
174     int          err;
175     MTdata       d;
176 
177     err = create_single_kernel_helper(context, &program, &kernel, 1,
178                                       &wg_broadcast_1D_kernel_code,
179                                       "test_wg_broadcast_1D");
180     if (err)
181         return -1;
182 
183     // "wg_size" is limited to that of the first dimension as only a 1DRange is executed.
184     err = get_max_allowed_1d_work_group_size_on_device(device, kernel, wg_size);
185     test_error(err, "get_max_allowed_1d_work_group_size_on_device failed");
186 
187     num_elements = n_elems;
188 
189     input_ptr[0] = (cl_float*)malloc(sizeof(cl_float) * num_elements);
190     output_ptr = (cl_float*)malloc(sizeof(cl_float) * num_elements);
191     streams[0] = clCreateBuffer(context, CL_MEM_READ_WRITE,
192                                 sizeof(cl_float) * num_elements, NULL, NULL);
193     if (!streams[0])
194     {
195         log_error("clCreateBuffer failed\n");
196         return -1;
197     }
198 
199     streams[1] = clCreateBuffer(context, CL_MEM_READ_WRITE,
200                                 sizeof(cl_float) * num_elements, NULL, NULL);
201     if (!streams[1])
202     {
203         log_error("clCreateBuffer failed\n");
204         return -1;
205     }
206 
207     p = input_ptr[0];
208     d = init_genrand( gRandomSeed );
209     for (size_t i = 0; i < num_elements; i++)
210     {
211         p[i] = get_random_float((float)(-100000.f * M_PI), (float)(100000.f * M_PI) ,d);
212     }
213     free_mtdata(d); d = NULL;
214 
215     err = clEnqueueWriteBuffer( queue, streams[0], true, 0, sizeof(cl_float)*num_elements, (void *)input_ptr[0], 0, NULL, NULL );
216     if (err != CL_SUCCESS)
217     {
218         log_error("clWriteArray failed\n");
219         return -1;
220     }
221 
222     err = clSetKernelArg(kernel, 0, sizeof streams[0], &streams[0] );
223     err |= clSetKernelArg(kernel, 1, sizeof streams[1], &streams[1] );
224     if (err != CL_SUCCESS)
225     {
226         log_error("clSetKernelArgs failed\n");
227         return -1;
228     }
229 
230     // Line below is troublesome...
231     globalsize[0] = (size_t)n_elems;
232     err = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, globalsize, wg_size, 0, NULL, NULL );
233     if (err != CL_SUCCESS)
234     {
235         log_error("clEnqueueNDRangeKernel failed\n");
236         return -1;
237     }
238 
239     cl_uint dead = 0xdeaddead;
240     memset_pattern4(output_ptr, &dead, sizeof(cl_float)*num_elements);
241     err = clEnqueueReadBuffer( queue, streams[1], true, 0, sizeof(cl_float)*num_elements, (void *)output_ptr, 0, NULL, NULL );
242     if (err != CL_SUCCESS)
243     {
244         log_error("clEnqueueReadBuffer failed\n");
245         return -1;
246     }
247 
248     if (verify_wg_broadcast_1D(input_ptr[0], output_ptr, num_elements, wg_size[0]))
249     {
250         log_error("work_group_broadcast_1D test failed\n");
251         return -1;
252     }
253     log_info("work_group_broadcast_1D test passed\n");
254 
255     clReleaseMemObject(streams[0]);
256     clReleaseMemObject(streams[1]);
257     clReleaseKernel(kernel);
258     clReleaseProgram(program);
259     free(input_ptr[0]);
260     free(output_ptr);
261 
262     return err;
263 }
264 
265 
266 int
test_work_group_broadcast_2D(cl_device_id device,cl_context context,cl_command_queue queue,int n_elems)267 test_work_group_broadcast_2D(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems)
268 {
269     cl_mem       streams[2];
270     cl_float     *input_ptr[1], *p;
271     cl_float     *output_ptr;
272     cl_program   program;
273     cl_kernel    kernel;
274     size_t       globalsize[2];
275     size_t       localsize[2];
276     size_t       wg_size[1];
277     size_t       num_workgroups;
278     size_t       num_elements;
279     int          err;
280     MTdata       d;
281 
282     err = create_single_kernel_helper(context, &program, &kernel, 1,
283                                       &wg_broadcast_2D_kernel_code,
284                                       "test_wg_broadcast_2D");
285     if (err)
286         return -1;
287 
288     err = clGetKernelWorkGroupInfo( kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), wg_size, NULL);
289     if (err)
290         return -1;
291 
292     if (wg_size[0] >= 256)
293     {
294         localsize[0] = localsize[1] = 16;
295     }
296     else if (wg_size[0] >=64)
297     {
298         localsize[0] = localsize[1] = 8;
299     }
300     else if (wg_size[0] >= 16)
301     {
302         localsize[0] = localsize[1] = 4;
303     }
304     else
305     {
306         localsize[0] = localsize[1] = 1;
307     }
308 
309     num_workgroups = std::max(n_elems / wg_size[0], (size_t)16);
310     globalsize[0] = num_workgroups * localsize[0];
311     globalsize[1] = num_workgroups * localsize[1];
312     num_elements = globalsize[0] * globalsize[1];
313 
314     input_ptr[0] = (cl_float*)malloc(sizeof(cl_float) * num_elements);
315     output_ptr = (cl_float*)malloc(sizeof(cl_float) * num_elements);
316     streams[0] = clCreateBuffer(context, CL_MEM_READ_WRITE,
317                                 sizeof(cl_float) * num_elements, NULL, NULL);
318     if (!streams[0])
319     {
320         log_error("clCreateBuffer failed\n");
321         return -1;
322     }
323 
324     streams[1] = clCreateBuffer(context, CL_MEM_READ_WRITE,
325                                 sizeof(cl_float) * num_elements, NULL, NULL);
326     if (!streams[1])
327     {
328         log_error("clCreateBuffer failed\n");
329         return -1;
330     }
331 
332     p = input_ptr[0];
333     d = init_genrand( gRandomSeed );
334     for (size_t i = 0; i < num_elements; i++)
335     {
336         p[i] = get_random_float((float)(-100000.f * M_PI), (float)(100000.f * M_PI) ,d);
337     }
338     free_mtdata(d); d = NULL;
339 
340     err = clEnqueueWriteBuffer( queue, streams[0], true, 0, sizeof(cl_float)*num_elements, (void *)input_ptr[0], 0, NULL, NULL );
341     if (err != CL_SUCCESS)
342     {
343         log_error("clWriteArray failed\n");
344         return -1;
345     }
346 
347     err = clSetKernelArg(kernel, 0, sizeof streams[0], &streams[0] );
348     err |= clSetKernelArg(kernel, 1, sizeof streams[1], &streams[1] );
349     if (err != CL_SUCCESS)
350     {
351         log_error("clSetKernelArgs failed\n");
352         return -1;
353     }
354 
355     err = clEnqueueNDRangeKernel( queue, kernel, 2, NULL, globalsize, localsize, 0, NULL, NULL );
356     if (err != CL_SUCCESS)
357     {
358         log_error("clEnqueueNDRangeKernel failed\n");
359         return -1;
360     }
361 
362     cl_uint dead = 0xdeaddead;
363     memset_pattern4(output_ptr, &dead, sizeof(cl_float)*num_elements);
364     err = clEnqueueReadBuffer( queue, streams[1], true, 0, sizeof(cl_float)*num_elements, (void *)output_ptr, 0, NULL, NULL );
365     if (err != CL_SUCCESS)
366     {
367         log_error("clEnqueueReadBuffer failed\n");
368         return -1;
369     }
370 
371     if (verify_wg_broadcast_2D(input_ptr[0], output_ptr, globalsize[0], globalsize[1], localsize[0], localsize[1]))
372     {
373         log_error("work_group_broadcast_2D test failed\n");
374         return -1;
375     }
376     log_info("work_group_broadcast_2D test passed\n");
377 
378     clReleaseMemObject(streams[0]);
379     clReleaseMemObject(streams[1]);
380     clReleaseKernel(kernel);
381     clReleaseProgram(program);
382     free(input_ptr[0]);
383     free(output_ptr);
384 
385     return err;
386 }
387 
388 
389 int
test_work_group_broadcast_3D(cl_device_id device,cl_context context,cl_command_queue queue,int n_elems)390 test_work_group_broadcast_3D(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems)
391 {
392     cl_mem       streams[2];
393     cl_float     *input_ptr[1], *p;
394     cl_float     *output_ptr;
395     cl_program   program;
396     cl_kernel    kernel;
397     size_t       globalsize[3];
398     size_t       localsize[3];
399     size_t       wg_size[1];
400     size_t       num_workgroups;
401     size_t       num_elements;
402     int          err;
403     MTdata       d;
404 
405     err = create_single_kernel_helper(context, &program, &kernel, 1,
406                                       &wg_broadcast_3D_kernel_code,
407                                       "test_wg_broadcast_3D");
408     if (err)
409         return -1;
410 
411     err = clGetKernelWorkGroupInfo( kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), wg_size, NULL);
412     if (err)
413         return -1;
414 
415     if (wg_size[0] >=512)
416     {
417         localsize[0] = localsize[1] = localsize[2] = 8;
418     }
419     else if (wg_size[0] >= 64)
420     {
421         localsize[0] = localsize[1] = localsize[2] = 4;
422     }
423     else if (wg_size[0] >= 8)
424     {
425         localsize[0] = localsize[1] = localsize[2] = 2;
426     }
427     else
428     {
429         localsize[0] = localsize[1] = localsize[2] = 1;
430     }
431 
432     num_workgroups = std::max(n_elems / wg_size[0], (size_t)8);
433     globalsize[0] = num_workgroups * localsize[0];
434     globalsize[1] = num_workgroups * localsize[1];
435     globalsize[2] = num_workgroups * localsize[2];
436     num_elements = globalsize[0] * globalsize[1] * globalsize[2];
437 
438     input_ptr[0] = (cl_float*)malloc(sizeof(cl_float) * num_elements);
439     output_ptr = (cl_float*)malloc(sizeof(cl_float) * num_elements);
440     streams[0] = clCreateBuffer(context, CL_MEM_READ_WRITE,
441                                 sizeof(cl_float) * num_elements, NULL, NULL);
442     if (!streams[0])
443     {
444         log_error("clCreateBuffer failed\n");
445         return -1;
446     }
447 
448     streams[1] = clCreateBuffer(context, CL_MEM_READ_WRITE,
449                                 sizeof(cl_float) * num_elements, NULL, NULL);
450     if (!streams[1])
451     {
452         log_error("clCreateBuffer failed\n");
453         return -1;
454     }
455 
456     p = input_ptr[0];
457     d = init_genrand( gRandomSeed );
458     for (size_t i = 0; i < num_elements; i++)
459     {
460         p[i] = get_random_float((float)(-100000.f * M_PI), (float)(100000.f * M_PI) ,d);
461     }
462     free_mtdata(d); d = NULL;
463 
464     err = clEnqueueWriteBuffer( queue, streams[0], true, 0, sizeof(cl_float)*num_elements, (void *)input_ptr[0], 0, NULL, NULL );
465     if (err != CL_SUCCESS)
466     {
467         log_error("clWriteArray failed\n");
468         return -1;
469     }
470 
471     err = clSetKernelArg(kernel, 0, sizeof streams[0], &streams[0] );
472     err |= clSetKernelArg(kernel, 1, sizeof streams[1], &streams[1] );
473     if (err != CL_SUCCESS)
474     {
475         log_error("clSetKernelArgs failed\n");
476         return -1;
477     }
478 
479     err = clEnqueueNDRangeKernel( queue, kernel, 3, NULL, globalsize, localsize, 0, NULL, NULL );
480     if (err != CL_SUCCESS)
481     {
482         log_error("clEnqueueNDRangeKernel failed\n");
483         return -1;
484     }
485 
486     cl_uint dead = 0xdeaddead;
487     memset_pattern4(output_ptr, &dead, sizeof(cl_float)*num_elements);
488     err = clEnqueueReadBuffer( queue, streams[1], true, 0, sizeof(cl_float)*num_elements, (void *)output_ptr, 0, NULL, NULL );
489     if (err != CL_SUCCESS)
490     {
491         log_error("clEnqueueReadBuffer failed\n");
492         return -1;
493     }
494 
495     if (verify_wg_broadcast_3D(input_ptr[0], output_ptr, globalsize[0], globalsize[1], globalsize[2], localsize[0], localsize[1], localsize[2]))
496     {
497         log_error("work_group_broadcast_3D test failed\n");
498         return -1;
499     }
500     log_info("work_group_broadcast_3D test passed\n");
501 
502     clReleaseMemObject(streams[0]);
503     clReleaseMemObject(streams[1]);
504     clReleaseKernel(kernel);
505     clReleaseProgram(program);
506     free(input_ptr[0]);
507     free(output_ptr);
508 
509     return err;
510 }
511 
512 
513 int
test_work_group_broadcast(cl_device_id device,cl_context context,cl_command_queue queue,int n_elems)514 test_work_group_broadcast(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems)
515 {
516     int err;
517 
518     err = test_work_group_broadcast_1D(device, context, queue, n_elems);
519     if (err) return err;
520     err = test_work_group_broadcast_2D(device, context, queue, n_elems);
521     if (err) return err;
522     return err;
523 }
524 
525 
526