xref: /aosp_15_r20/external/OpenCL-CTS/test_conformance/math_brute_force/unary_float.cpp (revision 6467f958c7de8070b317fc65bcb0f6472e388d82)
1 //
2 // Copyright (c) 2017 The Khronos Group Inc.
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 //    http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 //
16 
17 #include "common.h"
18 #include "function_list.h"
19 #include "test_functions.h"
20 #include "utility.h"
21 
22 #include <cstring>
23 
24 namespace {
25 
BuildKernelFn(cl_uint job_id,cl_uint thread_id UNUSED,void * p)26 cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
27 {
28     BuildKernelInfo &info = *(BuildKernelInfo *)p;
29     auto generator = [](const std::string &kernel_name, const char *builtin,
30                         cl_uint vector_size_index) {
31         return GetUnaryKernel(kernel_name, builtin, ParameterType::Float,
32                               ParameterType::Float, vector_size_index);
33     };
34     return BuildKernels(info, job_id, generator);
35 }
36 
37 // Thread specific data for a worker thread
38 struct ThreadInfo
39 {
40     // Input and output buffers for the thread
41     clMemWrapper inBuf;
42     Buffers outBuf;
43 
44     float maxError; // max error value. Init to 0.
45     double maxErrorValue; // position of the max error value.  Init to 0.
46 
47     // Per thread command queue to improve performance
48     clCommandQueueWrapper tQueue;
49 };
50 
51 struct TestInfo
52 {
53     size_t subBufferSize; // Size of the sub-buffer in elements
54     const Func *f; // A pointer to the function info
55 
56     // Programs for various vector sizes.
57     Programs programs;
58 
59     // Thread-specific kernels for each vector size:
60     // k[vector_size][thread_id]
61     KernelMatrix k;
62 
63     // Array of thread specific information
64     std::vector<ThreadInfo> tinfo;
65 
66     cl_uint threadCount; // Number of worker threads
67     cl_uint jobCount; // Number of jobs
68     cl_uint step; // step between each chunk and the next.
69     cl_uint scale; // stride between individual test values
70     float ulps; // max_allowed ulps
71     int ftz; // non-zero if running in flush to zero mode
72 
73     int isRangeLimited; // 1 if the function is only to be evaluated over a
74                         // range
75     float half_sin_cos_tan_limit;
76     bool relaxedMode; // True if test is running in relaxed mode, false
77                       // otherwise.
78 };
79 
Test(cl_uint job_id,cl_uint thread_id,void * data)80 cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
81 {
82     TestInfo *job = (TestInfo *)data;
83     size_t buffer_elements = job->subBufferSize;
84     size_t buffer_size = buffer_elements * sizeof(cl_float);
85     cl_uint scale = job->scale;
86     cl_uint base = job_id * (cl_uint)job->step;
87     ThreadInfo *tinfo = &(job->tinfo[thread_id]);
88     fptr func = job->f->func;
89     const char *fname = job->f->name;
90     bool relaxedMode = job->relaxedMode;
91     float ulps = getAllowedUlpError(job->f, relaxedMode);
92     if (relaxedMode)
93     {
94         func = job->f->rfunc;
95     }
96 
97     cl_int error;
98 
99     int isRangeLimited = job->isRangeLimited;
100     float half_sin_cos_tan_limit = job->half_sin_cos_tan_limit;
101     int ftz = job->ftz;
102 
103     cl_event e[VECTOR_SIZE_COUNT];
104     cl_uint *out[VECTOR_SIZE_COUNT];
105     if (gHostFill)
106     {
107         // start the map of the output arrays
108         for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
109         {
110             out[j] = (cl_uint *)clEnqueueMapBuffer(
111                 tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0,
112                 buffer_size, 0, NULL, e + j, &error);
113             if (error || NULL == out[j])
114             {
115                 vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
116                            error);
117                 return error;
118             }
119         }
120 
121         // Get that moving
122         if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n");
123     }
124 
125     // Write the new values to the input array
126     cl_uint *p = (cl_uint *)gIn + thread_id * buffer_elements;
127     for (size_t j = 0; j < buffer_elements; j++)
128     {
129         p[j] = base + j * scale;
130         if (relaxedMode)
131         {
132             float p_j = *(float *)&p[j];
133             if (strcmp(fname, "sin") == 0
134                 || strcmp(fname, "cos")
135                     == 0) // the domain of the function is [-pi,pi]
136             {
137                 if (fabs(p_j) > M_PI) ((float *)p)[j] = NAN;
138             }
139 
140             if (strcmp(fname, "reciprocal") == 0)
141             {
142                 const float l_limit = HEX_FLT(+, 1, 0, -, 126);
143                 const float u_limit = HEX_FLT(+, 1, 0, +, 126);
144 
145                 if (fabs(p_j) < l_limit
146                     || fabs(p_j) > u_limit) // the domain of the function is
147                                             // [2^-126,2^126]
148                     ((float *)p)[j] = NAN;
149             }
150         }
151     }
152 
153     if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0,
154                                       buffer_size, p, 0, NULL, NULL)))
155     {
156         vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
157         return error;
158     }
159 
160     for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
161     {
162         if (gHostFill)
163         {
164             // Wait for the map to finish
165             if ((error = clWaitForEvents(1, e + j)))
166             {
167                 vlog_error("Error: clWaitForEvents failed! err: %d\n", error);
168                 return error;
169             }
170             if ((error = clReleaseEvent(e[j])))
171             {
172                 vlog_error("Error: clReleaseEvent failed! err: %d\n", error);
173                 return error;
174             }
175         }
176 
177         // Fill the result buffer with garbage, so that old results don't carry
178         // over
179         uint32_t pattern = 0xffffdead;
180         if (gHostFill)
181         {
182             memset_pattern4(out[j], &pattern, buffer_size);
183             if ((error = clEnqueueUnmapMemObject(
184                      tinfo->tQueue, tinfo->outBuf[j], out[j], 0, NULL, NULL)))
185             {
186                 vlog_error("Error: clEnqueueUnmapMemObject failed! err: %d\n",
187                            error);
188                 return error;
189             }
190         }
191         else
192         {
193             if ((error = clEnqueueFillBuffer(tinfo->tQueue, tinfo->outBuf[j],
194                                              &pattern, sizeof(pattern), 0,
195                                              buffer_size, 0, NULL, NULL)))
196             {
197                 vlog_error("Error: clEnqueueFillBuffer failed! err: %d\n",
198                            error);
199                 return error;
200             }
201         }
202 
203         // Run the kernel
204         size_t vectorCount =
205             (buffer_elements + sizeValues[j] - 1) / sizeValues[j];
206         cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its
207                                                  // own copy of the cl_kernel
208         cl_program program = job->programs[j];
209 
210         if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]),
211                                     &tinfo->outBuf[j])))
212         {
213             LogBuildError(program);
214             return error;
215         }
216         if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf),
217                                     &tinfo->inBuf)))
218         {
219             LogBuildError(program);
220             return error;
221         }
222 
223         if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL,
224                                             &vectorCount, NULL, 0, NULL, NULL)))
225         {
226             vlog_error("FAILED -- could not execute kernel\n");
227             return error;
228         }
229     }
230 
231     // Get that moving
232     if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 2 failed\n");
233 
234     if (gSkipCorrectnessTesting) return CL_SUCCESS;
235 
236     // Calculate the correctly rounded reference result
237     float *r = (float *)gOut_Ref + thread_id * buffer_elements;
238     float *s = (float *)p;
239     for (size_t j = 0; j < buffer_elements; j++) r[j] = (float)func.f_f(s[j]);
240 
241     // Read the data back -- no need to wait for the first N-1 buffers but wait
242     // for the last buffer. This is an in order queue.
243     for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
244     {
245         cl_bool blocking = (j + 1 < gMaxVectorSizeIndex) ? CL_FALSE : CL_TRUE;
246         out[j] = (cl_uint *)clEnqueueMapBuffer(
247             tinfo->tQueue, tinfo->outBuf[j], blocking, CL_MAP_READ, 0,
248             buffer_size, 0, NULL, NULL, &error);
249         if (error || NULL == out[j])
250         {
251             vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
252                        error);
253             return error;
254         }
255     }
256 
257     // Verify data
258     uint32_t *t = (uint32_t *)r;
259     for (size_t j = 0; j < buffer_elements; j++)
260     {
261         for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
262         {
263             uint32_t *q = out[k];
264 
265             // If we aren't getting the correctly rounded result
266             if (t[j] != q[j])
267             {
268                 float test = ((float *)q)[j];
269                 double correct = func.f_f(s[j]);
270                 float err = Ulp_Error(test, correct);
271                 float abs_error = Abs_Error(test, correct);
272                 int fail = 0;
273                 int use_abs_error = 0;
274 
275                 // it is possible for the output to not match the reference
276                 // result but for Ulp_Error to be zero, for example -1.#QNAN
277                 // vs. 1.#QNAN. In such cases there is no failure
278                 if (err == 0.0f)
279                 {
280                     fail = 0;
281                 }
282                 else if (relaxedMode)
283                 {
284                     if (strcmp(fname, "sin") == 0 || strcmp(fname, "cos") == 0)
285                     {
286                         fail = !(fabsf(abs_error) <= ulps);
287                         use_abs_error = 1;
288                     }
289                     if (strcmp(fname, "sinpi") == 0
290                         || strcmp(fname, "cospi") == 0)
291                     {
292                         if (s[j] >= -1.0 && s[j] <= 1.0)
293                         {
294                             fail = !(fabsf(abs_error) <= ulps);
295                             use_abs_error = 1;
296                         }
297                     }
298 
299                     if (strcmp(fname, "reciprocal") == 0)
300                     {
301                         fail = !(fabsf(err) <= ulps);
302                     }
303 
304                     if (strcmp(fname, "exp") == 0 || strcmp(fname, "exp2") == 0)
305                     {
306                         float exp_error = ulps;
307 
308                         if (!gIsEmbedded)
309                         {
310                             exp_error += floor(fabs(2 * s[j]));
311                         }
312 
313                         fail = !(fabsf(err) <= exp_error);
314                         ulps = exp_error;
315                     }
316                     if (strcmp(fname, "tan") == 0)
317                     {
318 
319                         if (!gFastRelaxedDerived)
320                         {
321                             fail = !(fabsf(err) <= ulps);
322                         }
323                         // Else fast math derived implementation does not
324                         // require ULP verification
325                     }
326                     if (strcmp(fname, "exp10") == 0)
327                     {
328                         if (!gFastRelaxedDerived)
329                         {
330                             fail = !(fabsf(err) <= ulps);
331                         }
332                         // Else fast math derived implementation does not
333                         // require ULP verification
334                     }
335                     if (strcmp(fname, "log") == 0 || strcmp(fname, "log2") == 0
336                         || strcmp(fname, "log10") == 0)
337                     {
338                         if (s[j] >= 0.5 && s[j] <= 2)
339                         {
340                             fail = !(fabsf(abs_error) <= ulps);
341                         }
342                         else
343                         {
344                             ulps = gIsEmbedded ? job->f->float_embedded_ulps
345                                                : job->f->float_ulps;
346                             fail = !(fabsf(err) <= ulps);
347                         }
348                     }
349 
350 
351                     // fast-relaxed implies finite-only
352                     if (IsFloatInfinity(correct) || IsFloatNaN(correct)
353                         || IsFloatInfinity(s[j]) || IsFloatNaN(s[j]))
354                     {
355                         fail = 0;
356                         err = 0;
357                     }
358                 }
359                 else
360                 {
361                     fail = !(fabsf(err) <= ulps);
362                 }
363 
364                 // half_sin/cos/tan are only valid between +-2**16, Inf, NaN
365                 if (isRangeLimited
366                     && fabsf(s[j]) > MAKE_HEX_FLOAT(0x1.0p16f, 0x1L, 16)
367                     && fabsf(s[j]) < INFINITY)
368                 {
369                     if (fabsf(test) <= half_sin_cos_tan_limit)
370                     {
371                         err = 0;
372                         fail = 0;
373                     }
374                 }
375 
376                 if (fail)
377                 {
378                     if (ftz || relaxedMode)
379                     {
380                         typedef int (*CheckForSubnormal)(
381                             double, float); // If we are in fast relaxed math,
382                                             // we have a different calculation
383                                             // for the subnormal threshold.
384                         CheckForSubnormal isFloatResultSubnormalPtr;
385 
386                         if (relaxedMode)
387                         {
388                             isFloatResultSubnormalPtr =
389                                 &IsFloatResultSubnormalAbsError;
390                         }
391                         else
392                         {
393                             isFloatResultSubnormalPtr = &IsFloatResultSubnormal;
394                         }
395                         // retry per section 6.5.3.2
396                         if ((*isFloatResultSubnormalPtr)(correct, ulps))
397                         {
398                             fail = fail && (test != 0.0f);
399                             if (!fail) err = 0.0f;
400                         }
401 
402                         // retry per section 6.5.3.3
403                         if (IsFloatSubnormal(s[j]))
404                         {
405                             double correct2 = func.f_f(0.0);
406                             double correct3 = func.f_f(-0.0);
407                             float err2;
408                             float err3;
409                             if (use_abs_error)
410                             {
411                                 err2 = Abs_Error(test, correct2);
412                                 err3 = Abs_Error(test, correct3);
413                             }
414                             else
415                             {
416                                 err2 = Ulp_Error(test, correct2);
417                                 err3 = Ulp_Error(test, correct3);
418                             }
419                             fail = fail
420                                 && ((!(fabsf(err2) <= ulps))
421                                     && (!(fabsf(err3) <= ulps)));
422                             if (fabsf(err2) < fabsf(err)) err = err2;
423                             if (fabsf(err3) < fabsf(err)) err = err3;
424 
425                             // retry per section 6.5.3.4
426                             if ((*isFloatResultSubnormalPtr)(correct2, ulps)
427                                 || (*isFloatResultSubnormalPtr)(correct3, ulps))
428                             {
429                                 fail = fail && (test != 0.0f);
430                                 if (!fail) err = 0.0f;
431                             }
432                         }
433                     }
434                 }
435                 if (fabsf(err) > tinfo->maxError)
436                 {
437                     tinfo->maxError = fabsf(err);
438                     tinfo->maxErrorValue = s[j];
439                 }
440                 if (fail)
441                 {
442                     vlog_error("\nERROR: %s%s: %f ulp error at %a (0x%8.8x): "
443                                "*%a vs. %a\n",
444                                job->f->name, sizeNames[k], err, ((float *)s)[j],
445                                ((uint32_t *)s)[j], ((float *)t)[j], test);
446                     return -1;
447                 }
448             }
449         }
450     }
451 
452     for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
453     {
454         if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
455                                              out[j], 0, NULL, NULL)))
456         {
457             vlog_error("Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n",
458                        j, error);
459             return error;
460         }
461     }
462 
463     if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 3 failed\n");
464 
465 
466     if (0 == (base & 0x0fffffff))
467     {
468         if (gVerboseBruteForce)
469         {
470             vlog("base:%14u step:%10u scale:%10u buf_elements:%10zd ulps:%5.3f "
471                  "ThreadCount:%2u\n",
472                  base, job->step, job->scale, buffer_elements, job->ulps,
473                  job->threadCount);
474         }
475         else
476         {
477             vlog(".");
478         }
479         fflush(stdout);
480     }
481 
482     return CL_SUCCESS;
483 }
484 
485 } // anonymous namespace
486 
TestFunc_Float_Float(const Func * f,MTdata d,bool relaxedMode)487 int TestFunc_Float_Float(const Func *f, MTdata d, bool relaxedMode)
488 {
489     TestInfo test_info{};
490     cl_int error;
491     float maxError = 0.0f;
492     double maxErrorVal = 0.0;
493     int skipTestingRelaxed = (relaxedMode && strcmp(f->name, "tan") == 0);
494 
495     logFunctionInfo(f->name, sizeof(cl_float), relaxedMode);
496 
497     // Init test_info
498     test_info.threadCount = GetThreadCount();
499     test_info.subBufferSize = BUFFER_SIZE
500         / (sizeof(cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount));
501     test_info.scale = getTestScale(sizeof(cl_float));
502 
503     test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
504     if (test_info.step / test_info.subBufferSize != test_info.scale)
505     {
506         // there was overflow
507         test_info.jobCount = 1;
508     }
509     else
510     {
511         test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step);
512     }
513 
514     test_info.f = f;
515     test_info.ulps = gIsEmbedded ? f->float_embedded_ulps : f->float_ulps;
516     test_info.ftz =
517         f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities);
518     test_info.relaxedMode = relaxedMode;
519     test_info.tinfo.resize(test_info.threadCount);
520     for (cl_uint i = 0; i < test_info.threadCount; i++)
521     {
522         cl_buffer_region region = {
523             i * test_info.subBufferSize * sizeof(cl_float),
524             test_info.subBufferSize * sizeof(cl_float)
525         };
526         test_info.tinfo[i].inBuf =
527             clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
528                               CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
529         if (error || NULL == test_info.tinfo[i].inBuf)
530         {
531             vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
532                        "region {%zd, %zd}\n",
533                        region.origin, region.size);
534             return error;
535         }
536 
537         for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
538         {
539             test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
540                 gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
541                 &region, &error);
542             if (error || NULL == test_info.tinfo[i].outBuf[j])
543             {
544                 vlog_error("Error: Unable to create sub-buffer of "
545                            "gOutBuffer[%d] for region {%zd, %zd}\n",
546                            (int)j, region.origin, region.size);
547                 return error;
548             }
549         }
550         test_info.tinfo[i].tQueue =
551             clCreateCommandQueue(gContext, gDevice, 0, &error);
552         if (NULL == test_info.tinfo[i].tQueue || error)
553         {
554             vlog_error("clCreateCommandQueue failed. (%d)\n", error);
555             return error;
556         }
557     }
558 
559     // Check for special cases for unary float
560     test_info.isRangeLimited = 0;
561     test_info.half_sin_cos_tan_limit = 0;
562     if (0 == strcmp(f->name, "half_sin") || 0 == strcmp(f->name, "half_cos"))
563     {
564         test_info.isRangeLimited = 1;
565         test_info.half_sin_cos_tan_limit = 1.0f
566             + test_info.ulps
567                 * (FLT_EPSILON / 2.0f); // out of range results from finite
568                                         // inputs must be in [-1,1]
569     }
570     else if (0 == strcmp(f->name, "half_tan"))
571     {
572         test_info.isRangeLimited = 1;
573         test_info.half_sin_cos_tan_limit =
574             INFINITY; // out of range resut from finite inputs must be numeric
575     }
576 
577     // Init the kernels
578     BuildKernelInfo build_info{ test_info.threadCount, test_info.k,
579                                 test_info.programs, f->nameInCode,
580                                 relaxedMode };
581     if ((error = ThreadPool_Do(BuildKernelFn,
582                                gMaxVectorSizeIndex - gMinVectorSizeIndex,
583                                &build_info)))
584         return error;
585 
586     // Run the kernels
587     if (!gSkipCorrectnessTesting || skipTestingRelaxed)
588     {
589         error = ThreadPool_Do(Test, test_info.jobCount, &test_info);
590         if (error) return error;
591 
592         // Accumulate the arithmetic errors
593         for (cl_uint i = 0; i < test_info.threadCount; i++)
594         {
595             if (test_info.tinfo[i].maxError > maxError)
596             {
597                 maxError = test_info.tinfo[i].maxError;
598                 maxErrorVal = test_info.tinfo[i].maxErrorValue;
599             }
600         }
601 
602         if (gWimpyMode)
603             vlog("Wimp pass");
604         else
605             vlog("passed");
606 
607         if (skipTestingRelaxed)
608         {
609             vlog(" (rlx skip correctness testing)\n");
610             return error;
611         }
612 
613         vlog("\t%8.2f @ %a", maxError, maxErrorVal);
614     }
615 
616     vlog("\n");
617 
618     return CL_SUCCESS;
619 }
620