xref: /aosp_15_r20/external/OpenCL-CTS/test_conformance/math_brute_force/macro_unary_float.cpp (revision 6467f958c7de8070b317fc65bcb0f6472e388d82)
1 //
2 // Copyright (c) 2017 The Khronos Group Inc.
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 //    http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 //
16 
17 #include "common.h"
18 #include "function_list.h"
19 #include "test_functions.h"
20 #include "utility.h"
21 
22 #include <cstring>
23 
24 namespace {
25 
BuildKernelFn(cl_uint job_id,cl_uint thread_id UNUSED,void * p)26 cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
27 {
28     BuildKernelInfo &info = *(BuildKernelInfo *)p;
29     auto generator = [](const std::string &kernel_name, const char *builtin,
30                         cl_uint vector_size_index) {
31         return GetUnaryKernel(kernel_name, builtin, ParameterType::Int,
32                               ParameterType::Float, vector_size_index);
33     };
34     return BuildKernels(info, job_id, generator);
35 }
36 
37 // Thread specific data for a worker thread
38 struct ThreadInfo
39 {
40     // Input and output buffers for the thread
41     clMemWrapper inBuf;
42     Buffers outBuf;
43 
44     // Per thread command queue to improve performance
45     clCommandQueueWrapper tQueue;
46 };
47 
48 struct TestInfo
49 {
50     size_t subBufferSize; // Size of the sub-buffer in elements
51     const Func *f; // A pointer to the function info
52 
53     // Programs for various vector sizes.
54     Programs programs;
55 
56     // Thread-specific kernels for each vector size:
57     // k[vector_size][thread_id]
58     KernelMatrix k;
59 
60     // Array of thread specific information
61     std::vector<ThreadInfo> tinfo;
62 
63     cl_uint threadCount; // Number of worker threads
64     cl_uint jobCount; // Number of jobs
65     cl_uint step; // step between each chunk and the next.
66     cl_uint scale; // stride between individual test values
67     int ftz; // non-zero if running in flush to zero mode
68     bool relaxedMode; // True if test is running in relaxed mode, false
69                       // otherwise.
70 };
71 
Test(cl_uint job_id,cl_uint thread_id,void * data)72 cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
73 {
74     TestInfo *job = (TestInfo *)data;
75     size_t buffer_elements = job->subBufferSize;
76     size_t buffer_size = buffer_elements * sizeof(cl_float);
77     cl_uint scale = job->scale;
78     cl_uint base = job_id * (cl_uint)job->step;
79     ThreadInfo *tinfo = &(job->tinfo[thread_id]);
80     fptr func = job->f->func;
81     int ftz = job->ftz;
82     bool relaxedMode = job->relaxedMode;
83     cl_int error = CL_SUCCESS;
84     const char *name = job->f->name;
85 
86     int signbit_test = 0;
87     if (!strcmp(name, "signbit")) signbit_test = 1;
88 
89 #define ref_func(s) (signbit_test ? func.i_f_f(s) : func.i_f(s))
90 
91     cl_event e[VECTOR_SIZE_COUNT];
92     cl_int *out[VECTOR_SIZE_COUNT];
93     if (gHostFill)
94     {
95         // start the map of the output arrays
96         for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
97         {
98             out[j] = (cl_int *)clEnqueueMapBuffer(
99                 tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0,
100                 buffer_size, 0, NULL, e + j, &error);
101             if (error || NULL == out[j])
102             {
103                 vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
104                            error);
105                 return error;
106             }
107         }
108 
109         // Get that moving
110         if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n");
111     }
112 
113     // Init input array
114     cl_uint *p = (cl_uint *)gIn + thread_id * buffer_elements;
115     for (size_t j = 0; j < buffer_elements; j++) p[j] = base + j * scale;
116 
117     if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0,
118                                       buffer_size, p, 0, NULL, NULL)))
119     {
120         vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
121         return error;
122     }
123 
124     for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
125     {
126         if (gHostFill)
127         {
128             // Wait for the map to finish
129             if ((error = clWaitForEvents(1, e + j)))
130             {
131                 vlog_error("Error: clWaitForEvents failed! err: %d\n", error);
132                 return error;
133             }
134             if ((error = clReleaseEvent(e[j])))
135             {
136                 vlog_error("Error: clReleaseEvent failed! err: %d\n", error);
137                 return error;
138             }
139         }
140 
141         // Fill the result buffer with garbage, so that old results don't carry
142         // over
143         uint32_t pattern = 0xffffdead;
144         if (gHostFill)
145         {
146             memset_pattern4(out[j], &pattern, buffer_size);
147             if ((error = clEnqueueUnmapMemObject(
148                      tinfo->tQueue, tinfo->outBuf[j], out[j], 0, NULL, NULL)))
149             {
150                 vlog_error("Error: clEnqueueUnmapMemObject failed! err: %d\n",
151                            error);
152                 return error;
153             }
154         }
155         else
156         {
157             if ((error = clEnqueueFillBuffer(tinfo->tQueue, tinfo->outBuf[j],
158                                              &pattern, sizeof(pattern), 0,
159                                              buffer_size, 0, NULL, NULL)))
160             {
161                 vlog_error("Error: clEnqueueFillBuffer failed! err: %d\n",
162                            error);
163                 return error;
164             }
165         }
166 
167         // Run the kernel
168         size_t vectorCount =
169             (buffer_elements + sizeValues[j] - 1) / sizeValues[j];
170         cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its
171                                                  // own copy of the cl_kernel
172         cl_program program = job->programs[j];
173 
174         if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]),
175                                     &tinfo->outBuf[j])))
176         {
177             LogBuildError(program);
178             return error;
179         }
180         if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf),
181                                     &tinfo->inBuf)))
182         {
183             LogBuildError(program);
184             return error;
185         }
186 
187         if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL,
188                                             &vectorCount, NULL, 0, NULL, NULL)))
189         {
190             vlog_error("FAILED -- could not execute kernel\n");
191             return error;
192         }
193     }
194 
195     // Get that moving
196     if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 2 failed\n");
197 
198     if (gSkipCorrectnessTesting) return CL_SUCCESS;
199 
200     // Calculate the correctly rounded reference result
201     cl_int *r = (cl_int *)gOut_Ref + thread_id * buffer_elements;
202     float *s = (float *)p;
203     for (size_t j = 0; j < buffer_elements; j++) r[j] = ref_func(s[j]);
204 
205     // Read the data back -- no need to wait for the first N-1 buffers but wait
206     // for the last buffer. This is an in order queue.
207     for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
208     {
209         cl_bool blocking = (j + 1 < gMaxVectorSizeIndex) ? CL_FALSE : CL_TRUE;
210         out[j] = (cl_int *)clEnqueueMapBuffer(
211             tinfo->tQueue, tinfo->outBuf[j], blocking, CL_MAP_READ, 0,
212             buffer_size, 0, NULL, NULL, &error);
213         if (error || NULL == out[j])
214         {
215             vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
216                        error);
217             return error;
218         }
219     }
220 
221     // Verify data
222     cl_int *t = (cl_int *)r;
223     for (size_t j = 0; j < buffer_elements; j++)
224     {
225         for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
226         {
227             cl_int *q = out[0];
228 
229             // If we aren't getting the correctly rounded result
230             if (gMinVectorSizeIndex == 0 && t[j] != q[j])
231             {
232                 // If we aren't getting the correctly rounded result
233                 if (ftz || relaxedMode)
234                 {
235                     if (IsFloatSubnormal(s[j]))
236                     {
237                         int correct = ref_func(+0.0f);
238                         int correct2 = ref_func(-0.0f);
239                         if (correct == q[j] || correct2 == q[j]) continue;
240                     }
241                 }
242 
243                 uint32_t err = t[j] - q[j];
244                 if (q[j] > t[j]) err = q[j] - t[j];
245                 vlog_error("\nERROR: %s: %d ulp error at %a: *%d vs. %d\n",
246                            name, err, ((float *)s)[j], t[j], q[j]);
247                 return -1;
248             }
249 
250 
251             for (auto k = std::max(1U, gMinVectorSizeIndex);
252                  k < gMaxVectorSizeIndex; k++)
253             {
254                 q = out[k];
255                 // If we aren't getting the correctly rounded result
256                 if (-t[j] != q[j])
257                 {
258                     if (ftz || relaxedMode)
259                     {
260                         if (IsFloatSubnormal(s[j]))
261                         {
262                             int correct = -ref_func(+0.0f);
263                             int correct2 = -ref_func(-0.0f);
264                             if (correct == q[j] || correct2 == q[j]) continue;
265                         }
266                     }
267 
268                     uint32_t err = -t[j] - q[j];
269                     if (q[j] > -t[j]) err = q[j] + t[j];
270                     vlog_error(
271                         "\nERROR: %s%s: %d ulp error at %a: *%d vs. %d\n", name,
272                         sizeNames[k], err, ((float *)s)[j], -t[j], q[j]);
273                     return -1;
274                 }
275             }
276         }
277     }
278 
279     for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
280     {
281         if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
282                                              out[j], 0, NULL, NULL)))
283         {
284             vlog_error("Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n",
285                        j, error);
286             return error;
287         }
288     }
289 
290     if ((error = clFlush(tinfo->tQueue)))
291     {
292         vlog("clFlush 3 failed\n");
293         return error;
294     }
295 
296 
297     if (0 == (base & 0x0fffffff))
298     {
299         if (gVerboseBruteForce)
300         {
301             vlog("base:%14u step:%10u scale:%10u buf_elements:%10zd "
302                  "ThreadCount:%2u\n",
303                  base, job->step, job->scale, buffer_elements,
304                  job->threadCount);
305         }
306         else
307         {
308             vlog(".");
309         }
310         fflush(stdout);
311     }
312 
313     return CL_SUCCESS;
314 }
315 
316 } // anonymous namespace
317 
TestMacro_Int_Float(const Func * f,MTdata d,bool relaxedMode)318 int TestMacro_Int_Float(const Func *f, MTdata d, bool relaxedMode)
319 {
320     TestInfo test_info{};
321     cl_int error;
322 
323     logFunctionInfo(f->name, sizeof(cl_float), relaxedMode);
324 
325     // Init test_info
326     test_info.threadCount = GetThreadCount();
327     test_info.subBufferSize = BUFFER_SIZE
328         / (sizeof(cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount));
329     test_info.scale = getTestScale(sizeof(cl_float));
330 
331     test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
332     if (test_info.step / test_info.subBufferSize != test_info.scale)
333     {
334         // there was overflow
335         test_info.jobCount = 1;
336     }
337     else
338     {
339         test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step);
340     }
341 
342     test_info.f = f;
343     test_info.ftz =
344         f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities);
345     test_info.relaxedMode = relaxedMode;
346 
347     test_info.tinfo.resize(test_info.threadCount);
348     for (cl_uint i = 0; i < test_info.threadCount; i++)
349     {
350         cl_buffer_region region = {
351             i * test_info.subBufferSize * sizeof(cl_float),
352             test_info.subBufferSize * sizeof(cl_float)
353         };
354         test_info.tinfo[i].inBuf =
355             clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
356                               CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
357         if (error || NULL == test_info.tinfo[i].inBuf)
358         {
359             vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
360                        "region {%zd, %zd}\n",
361                        region.origin, region.size);
362             return error;
363         }
364 
365         for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
366         {
367             test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
368                 gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
369                 &region, &error);
370             if (error || NULL == test_info.tinfo[i].outBuf[j])
371             {
372                 vlog_error("Error: Unable to create sub-buffer of "
373                            "gOutBuffer[%d] for region {%zd, %zd}\n",
374                            (int)j, region.origin, region.size);
375                 return error;
376             }
377         }
378         test_info.tinfo[i].tQueue =
379             clCreateCommandQueue(gContext, gDevice, 0, &error);
380         if (NULL == test_info.tinfo[i].tQueue || error)
381         {
382             vlog_error("clCreateCommandQueue failed. (%d)\n", error);
383             return error;
384         }
385     }
386 
387     // Init the kernels
388     BuildKernelInfo build_info{ test_info.threadCount, test_info.k,
389                                 test_info.programs, f->nameInCode,
390                                 relaxedMode };
391     if ((error = ThreadPool_Do(BuildKernelFn,
392                                gMaxVectorSizeIndex - gMinVectorSizeIndex,
393                                &build_info)))
394         return error;
395 
396     // Run the kernels
397     if (!gSkipCorrectnessTesting)
398     {
399         error = ThreadPool_Do(Test, test_info.jobCount, &test_info);
400         if (error) return error;
401 
402         if (gWimpyMode)
403             vlog("Wimp pass");
404         else
405             vlog("passed");
406     }
407 
408     vlog("\n");
409 
410     return CL_SUCCESS;
411 }
412