xref: /aosp_15_r20/external/OpenCL-CTS/test_conformance/math_brute_force/binary_float.cpp (revision 6467f958c7de8070b317fc65bcb0f6472e388d82)
1 //
2 // Copyright (c) 2017 The Khronos Group Inc.
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 //    http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 //
16 
17 #include "common.h"
18 #include "function_list.h"
19 #include "test_functions.h"
20 #include "utility.h"
21 
22 #include <cstring>
23 
24 namespace {
25 
26 const float twoToMinus126 = MAKE_HEX_FLOAT(0x1p-126f, 1, -126);
27 
BuildKernelFn(cl_uint job_id,cl_uint thread_id UNUSED,void * p)28 cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
29 {
30     BuildKernelInfo &info = *(BuildKernelInfo *)p;
31     auto generator = [](const std::string &kernel_name, const char *builtin,
32                         cl_uint vector_size_index) {
33         return GetBinaryKernel(kernel_name, builtin, ParameterType::Float,
34                                ParameterType::Float, ParameterType::Float,
35                                vector_size_index);
36     };
37     return BuildKernels(info, job_id, generator);
38 }
39 
40 // Thread specific data for a worker thread
41 struct ThreadInfo
42 {
43     // Input and output buffers for the thread
44     clMemWrapper inBuf;
45     clMemWrapper inBuf2;
46     Buffers outBuf;
47 
48     float maxError; // max error value. Init to 0.
49     double
50         maxErrorValue; // position of the max error value (param 1).  Init to 0.
51     double maxErrorValue2; // position of the max error value (param 2).  Init
52                            // to 0.
53     MTdataHolder d;
54 
55     // Per thread command queue to improve performance
56     clCommandQueueWrapper tQueue;
57 };
58 
59 struct TestInfo
60 {
61     size_t subBufferSize; // Size of the sub-buffer in elements
62     const Func *f; // A pointer to the function info
63 
64     // Programs for various vector sizes.
65     Programs programs;
66 
67     // Thread-specific kernels for each vector size:
68     // k[vector_size][thread_id]
69     KernelMatrix k;
70 
71     // Array of thread specific information
72     std::vector<ThreadInfo> tinfo;
73 
74     cl_uint threadCount; // Number of worker threads
75     cl_uint jobCount; // Number of jobs
76     cl_uint step; // step between each chunk and the next.
77     cl_uint scale; // stride between individual test values
78     float ulps; // max_allowed ulps
79     int ftz; // non-zero if running in flush to zero mode
80 
81     int isFDim;
82     int skipNanInf;
83     int isNextafter;
84     bool relaxedMode; // True if test is running in relaxed mode, false
85                       // otherwise.
86 };
87 
88 // A table of more difficult cases to get right
89 const float specialValues[] = {
90     -NAN,
91     -INFINITY,
92     -FLT_MAX,
93     MAKE_HEX_FLOAT(-0x1.000002p64f, -0x1000002L, 40),
94     MAKE_HEX_FLOAT(-0x1.0p64f, -0x1L, 64),
95     MAKE_HEX_FLOAT(-0x1.fffffep63f, -0x1fffffeL, 39),
96     MAKE_HEX_FLOAT(-0x1.000002p63f, -0x1000002L, 39),
97     MAKE_HEX_FLOAT(-0x1.0p63f, -0x1L, 63),
98     MAKE_HEX_FLOAT(-0x1.fffffep62f, -0x1fffffeL, 38),
99     MAKE_HEX_FLOAT(-0x1.000002p32f, -0x1000002L, 8),
100     MAKE_HEX_FLOAT(-0x1.0p32f, -0x1L, 32),
101     MAKE_HEX_FLOAT(-0x1.fffffep31f, -0x1fffffeL, 7),
102     MAKE_HEX_FLOAT(-0x1.000002p31f, -0x1000002L, 7),
103     MAKE_HEX_FLOAT(-0x1.0p31f, -0x1L, 31),
104     MAKE_HEX_FLOAT(-0x1.fffffep30f, -0x1fffffeL, 6),
105     -1000.f,
106     -100.f,
107     -4.0f,
108     -3.5f,
109     -3.0f,
110     MAKE_HEX_FLOAT(-0x1.800002p1f, -0x1800002L, -23),
111     -2.5f,
112     MAKE_HEX_FLOAT(-0x1.7ffffep1f, -0x17ffffeL, -23),
113     -2.0f,
114     MAKE_HEX_FLOAT(-0x1.800002p0f, -0x1800002L, -24),
115     -1.5f,
116     MAKE_HEX_FLOAT(-0x1.7ffffep0f, -0x17ffffeL, -24),
117     MAKE_HEX_FLOAT(-0x1.000002p0f, -0x1000002L, -24),
118     -1.0f,
119     MAKE_HEX_FLOAT(-0x1.fffffep-1f, -0x1fffffeL, -25),
120     MAKE_HEX_FLOAT(-0x1.000002p-1f, -0x1000002L, -25),
121     -0.5f,
122     MAKE_HEX_FLOAT(-0x1.fffffep-2f, -0x1fffffeL, -26),
123     MAKE_HEX_FLOAT(-0x1.000002p-2f, -0x1000002L, -26),
124     -0.25f,
125     MAKE_HEX_FLOAT(-0x1.fffffep-3f, -0x1fffffeL, -27),
126     MAKE_HEX_FLOAT(-0x1.000002p-126f, -0x1000002L, -150),
127     -FLT_MIN,
128     MAKE_HEX_FLOAT(-0x0.fffffep-126f, -0x0fffffeL, -150),
129     MAKE_HEX_FLOAT(-0x0.000ffep-126f, -0x0000ffeL, -150),
130     MAKE_HEX_FLOAT(-0x0.0000fep-126f, -0x00000feL, -150),
131     MAKE_HEX_FLOAT(-0x0.00000ep-126f, -0x000000eL, -150),
132     MAKE_HEX_FLOAT(-0x0.00000cp-126f, -0x000000cL, -150),
133     MAKE_HEX_FLOAT(-0x0.00000ap-126f, -0x000000aL, -150),
134     MAKE_HEX_FLOAT(-0x0.000008p-126f, -0x0000008L, -150),
135     MAKE_HEX_FLOAT(-0x0.000006p-126f, -0x0000006L, -150),
136     MAKE_HEX_FLOAT(-0x0.000004p-126f, -0x0000004L, -150),
137     MAKE_HEX_FLOAT(-0x0.000002p-126f, -0x0000002L, -150),
138     -0.0f,
139 
140     +NAN,
141     +INFINITY,
142     +FLT_MAX,
143     MAKE_HEX_FLOAT(+0x1.000002p64f, +0x1000002L, 40),
144     MAKE_HEX_FLOAT(+0x1.0p64f, +0x1L, 64),
145     MAKE_HEX_FLOAT(+0x1.fffffep63f, +0x1fffffeL, 39),
146     MAKE_HEX_FLOAT(+0x1.000002p63f, +0x1000002L, 39),
147     MAKE_HEX_FLOAT(+0x1.0p63f, +0x1L, 63),
148     MAKE_HEX_FLOAT(+0x1.fffffep62f, +0x1fffffeL, 38),
149     MAKE_HEX_FLOAT(+0x1.000002p32f, +0x1000002L, 8),
150     MAKE_HEX_FLOAT(+0x1.0p32f, +0x1L, 32),
151     MAKE_HEX_FLOAT(+0x1.fffffep31f, +0x1fffffeL, 7),
152     MAKE_HEX_FLOAT(+0x1.000002p31f, +0x1000002L, 7),
153     MAKE_HEX_FLOAT(+0x1.0p31f, +0x1L, 31),
154     MAKE_HEX_FLOAT(+0x1.fffffep30f, +0x1fffffeL, 6),
155     +1000.f,
156     +100.f,
157     +4.0f,
158     +3.5f,
159     +3.0f,
160     MAKE_HEX_FLOAT(+0x1.800002p1f, +0x1800002L, -23),
161     2.5f,
162     MAKE_HEX_FLOAT(+0x1.7ffffep1f, +0x17ffffeL, -23),
163     +2.0f,
164     MAKE_HEX_FLOAT(+0x1.800002p0f, +0x1800002L, -24),
165     1.5f,
166     MAKE_HEX_FLOAT(+0x1.7ffffep0f, +0x17ffffeL, -24),
167     MAKE_HEX_FLOAT(+0x1.000002p0f, +0x1000002L, -24),
168     +1.0f,
169     MAKE_HEX_FLOAT(+0x1.fffffep-1f, +0x1fffffeL, -25),
170     MAKE_HEX_FLOAT(+0x1.000002p-1f, +0x1000002L, -25),
171     +0.5f,
172     MAKE_HEX_FLOAT(+0x1.fffffep-2f, +0x1fffffeL, -26),
173     MAKE_HEX_FLOAT(+0x1.000002p-2f, +0x1000002L, -26),
174     +0.25f,
175     MAKE_HEX_FLOAT(+0x1.fffffep-3f, +0x1fffffeL, -27),
176     MAKE_HEX_FLOAT(0x1.000002p-126f, 0x1000002L, -150),
177     +FLT_MIN,
178     MAKE_HEX_FLOAT(+0x0.fffffep-126f, +0x0fffffeL, -150),
179     MAKE_HEX_FLOAT(+0x0.000ffep-126f, +0x0000ffeL, -150),
180     MAKE_HEX_FLOAT(+0x0.0000fep-126f, +0x00000feL, -150),
181     MAKE_HEX_FLOAT(+0x0.00000ep-126f, +0x000000eL, -150),
182     MAKE_HEX_FLOAT(+0x0.00000cp-126f, +0x000000cL, -150),
183     MAKE_HEX_FLOAT(+0x0.00000ap-126f, +0x000000aL, -150),
184     MAKE_HEX_FLOAT(+0x0.000008p-126f, +0x0000008L, -150),
185     MAKE_HEX_FLOAT(+0x0.000006p-126f, +0x0000006L, -150),
186     MAKE_HEX_FLOAT(+0x0.000004p-126f, +0x0000004L, -150),
187     MAKE_HEX_FLOAT(+0x0.000002p-126f, +0x0000002L, -150),
188     +0.0f,
189 };
190 
191 constexpr size_t specialValuesCount =
192     sizeof(specialValues) / sizeof(specialValues[0]);
193 
Test(cl_uint job_id,cl_uint thread_id,void * data)194 cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
195 {
196     TestInfo *job = (TestInfo *)data;
197     size_t buffer_elements = job->subBufferSize;
198     size_t buffer_size = buffer_elements * sizeof(cl_float);
199     cl_uint base = job_id * (cl_uint)job->step;
200     ThreadInfo *tinfo = &(job->tinfo[thread_id]);
201     fptr func = job->f->func;
202     int ftz = job->ftz;
203     bool relaxedMode = job->relaxedMode;
204     float ulps = getAllowedUlpError(job->f, relaxedMode);
205     MTdata d = tinfo->d;
206     cl_int error;
207     std::vector<bool> overflow(buffer_elements, false);
208     const char *name = job->f->name;
209     int isFDim = job->isFDim;
210     int skipNanInf = job->skipNanInf;
211     int isNextafter = job->isNextafter;
212     cl_uint *t = 0;
213     cl_float *r = 0;
214     cl_float *s = 0;
215     cl_float *s2 = 0;
216     cl_int copysign_test = 0;
217     RoundingMode oldRoundMode;
218     int skipVerification = 0;
219 
220     if (relaxedMode)
221     {
222         func = job->f->rfunc;
223         if (strcmp(name, "pow") == 0 && gFastRelaxedDerived)
224         {
225             ulps = INFINITY;
226             skipVerification = 1;
227         }
228     }
229 
230     cl_event e[VECTOR_SIZE_COUNT];
231     cl_uint *out[VECTOR_SIZE_COUNT];
232     if (gHostFill)
233     {
234         // start the map of the output arrays
235         for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
236         {
237             out[j] = (cl_uint *)clEnqueueMapBuffer(
238                 tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0,
239                 buffer_size, 0, NULL, e + j, &error);
240             if (error || NULL == out[j])
241             {
242                 vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
243                            error);
244                 return error;
245             }
246         }
247 
248         // Get that moving
249         if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n");
250     }
251 
252     // Init input array
253     cl_uint *p = (cl_uint *)gIn + thread_id * buffer_elements;
254     cl_uint *p2 = (cl_uint *)gIn2 + thread_id * buffer_elements;
255     cl_uint idx = 0;
256     int totalSpecialValueCount = specialValuesCount * specialValuesCount;
257     int lastSpecialJobIndex = (totalSpecialValueCount - 1) / buffer_elements;
258 
259     // Test edge cases
260     if (job_id <= (cl_uint)lastSpecialJobIndex)
261     {
262         float *fp = (float *)p;
263         float *fp2 = (float *)p2;
264         uint32_t x, y;
265 
266         x = (job_id * buffer_elements) % specialValuesCount;
267         y = (job_id * buffer_elements) / specialValuesCount;
268 
269         for (; idx < buffer_elements; idx++)
270         {
271             fp[idx] = specialValues[x];
272             fp2[idx] = specialValues[y];
273             ++x;
274             if (x >= specialValuesCount)
275             {
276                 x = 0;
277                 y++;
278                 if (y >= specialValuesCount) break;
279             }
280         }
281     }
282 
283     // Init any remaining values
284     for (; idx < buffer_elements; idx++)
285     {
286         p[idx] = genrand_int32(d);
287         p2[idx] = genrand_int32(d);
288     }
289 
290     if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0,
291                                       buffer_size, p, 0, NULL, NULL)))
292     {
293         vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
294         return error;
295     }
296 
297     if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf2, CL_FALSE, 0,
298                                       buffer_size, p2, 0, NULL, NULL)))
299     {
300         vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
301         return error;
302     }
303 
304     for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
305     {
306         if (gHostFill)
307         {
308             // Wait for the map to finish
309             if ((error = clWaitForEvents(1, e + j)))
310             {
311                 vlog_error("Error: clWaitForEvents failed! err: %d\n", error);
312                 return error;
313             }
314             if ((error = clReleaseEvent(e[j])))
315             {
316                 vlog_error("Error: clReleaseEvent failed! err: %d\n", error);
317                 return error;
318             }
319         }
320 
321         // Fill the result buffer with garbage, so that old results don't carry
322         // over
323         uint32_t pattern = 0xffffdead;
324         if (gHostFill)
325         {
326             memset_pattern4(out[j], &pattern, buffer_size);
327             if ((error = clEnqueueUnmapMemObject(
328                      tinfo->tQueue, tinfo->outBuf[j], out[j], 0, NULL, NULL)))
329             {
330                 vlog_error("Error: clEnqueueUnmapMemObject failed! err: %d\n",
331                            error);
332                 return error;
333             }
334         }
335         else
336         {
337             if ((error = clEnqueueFillBuffer(tinfo->tQueue, tinfo->outBuf[j],
338                                              &pattern, sizeof(pattern), 0,
339                                              buffer_size, 0, NULL, NULL)))
340             {
341                 vlog_error("Error: clEnqueueFillBuffer failed! err: %d\n",
342                            error);
343                 return error;
344             }
345         }
346 
347         // Run the kernel
348         size_t vectorCount =
349             (buffer_elements + sizeValues[j] - 1) / sizeValues[j];
350         cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its
351                                                  // own copy of the cl_kernel
352         cl_program program = job->programs[j];
353 
354         if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]),
355                                     &tinfo->outBuf[j])))
356         {
357             LogBuildError(program);
358             return error;
359         }
360         if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf),
361                                     &tinfo->inBuf)))
362         {
363             LogBuildError(program);
364             return error;
365         }
366         if ((error = clSetKernelArg(kernel, 2, sizeof(tinfo->inBuf2),
367                                     &tinfo->inBuf2)))
368         {
369             LogBuildError(program);
370             return error;
371         }
372 
373         if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL,
374                                             &vectorCount, NULL, 0, NULL, NULL)))
375         {
376             vlog_error("FAILED -- could not execute kernel\n");
377             return error;
378         }
379     }
380 
381     // Get that moving
382     if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 2 failed\n");
383 
384     if (gSkipCorrectnessTesting)
385     {
386         if ((error = clFinish(tinfo->tQueue)))
387         {
388             vlog_error("Error: clFinish failed! err: %d\n", error);
389             return error;
390         }
391         return CL_SUCCESS;
392     }
393 
394     FPU_mode_type oldMode;
395     oldRoundMode = kRoundToNearestEven;
396     if (isFDim)
397     {
398         // Calculate the correctly rounded reference result
399         memset(&oldMode, 0, sizeof(oldMode));
400         if (ftz || relaxedMode) ForceFTZ(&oldMode);
401 
402         // Set the rounding mode to match the device
403         if (gIsInRTZMode) oldRoundMode = set_round(kRoundTowardZero, kfloat);
404     }
405 
406     if (!strcmp(name, "copysign")) copysign_test = 1;
407 
408 #define ref_func(s, s2) (copysign_test ? func.f_ff_f(s, s2) : func.f_ff(s, s2))
409 
410     // Calculate the correctly rounded reference result
411     r = (float *)gOut_Ref + thread_id * buffer_elements;
412     s = (float *)gIn + thread_id * buffer_elements;
413     s2 = (float *)gIn2 + thread_id * buffer_elements;
414     if (skipNanInf)
415     {
416         for (size_t j = 0; j < buffer_elements; j++)
417         {
418             feclearexcept(FE_OVERFLOW);
419             r[j] = (float)ref_func(s[j], s2[j]);
420             overflow[j] =
421                 FE_OVERFLOW == (FE_OVERFLOW & fetestexcept(FE_OVERFLOW));
422         }
423     }
424     else
425     {
426         for (size_t j = 0; j < buffer_elements; j++)
427             r[j] = (float)ref_func(s[j], s2[j]);
428     }
429 
430     if (isFDim && ftz) RestoreFPState(&oldMode);
431 
432     // Read the data back -- no need to wait for the first N-1 buffers but wait
433     // for the last buffer. This is an in order queue.
434     for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
435     {
436         cl_bool blocking = (j + 1 < gMaxVectorSizeIndex) ? CL_FALSE : CL_TRUE;
437         out[j] = (cl_uint *)clEnqueueMapBuffer(
438             tinfo->tQueue, tinfo->outBuf[j], blocking, CL_MAP_READ, 0,
439             buffer_size, 0, NULL, NULL, &error);
440         if (error || NULL == out[j])
441         {
442             vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
443                        error);
444             return error;
445         }
446     }
447 
448     if (!skipVerification)
449     {
450         // Verify data
451         t = (cl_uint *)r;
452         for (size_t j = 0; j < buffer_elements; j++)
453         {
454             for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
455             {
456                 cl_uint *q = out[k];
457 
458                 // If we aren't getting the correctly rounded result
459                 if (t[j] != q[j])
460                 {
461                     float test = ((float *)q)[j];
462                     double correct = ref_func(s[j], s2[j]);
463 
464                     // Per section 10 paragraph 6, accept any result if an input
465                     // or output is a infinity or NaN or overflow As per
466                     // OpenCL 2.0 spec, section 5.8.4.3, enabling
467                     // fast-relaxed-math mode also enables -cl-finite-math-only
468                     // optimization. This optimization allows to assume that
469                     // arguments and results are not NaNs or +/-INFs. Hence,
470                     // accept any result if inputs or results are NaNs or INFs.
471                     if (relaxedMode || skipNanInf)
472                     {
473                         if (skipNanInf && overflow[j]) continue;
474                         // Note: no double rounding here.  Reference functions
475                         // calculate in single precision.
476                         if (IsFloatInfinity(correct) || IsFloatNaN(correct)
477                             || IsFloatInfinity(s2[j]) || IsFloatNaN(s2[j])
478                             || IsFloatInfinity(s[j]) || IsFloatNaN(s[j]))
479                             continue;
480                     }
481 
482                     float err = Ulp_Error(test, correct);
483                     int fail = !(fabsf(err) <= ulps);
484 
485                     if (fail && (ftz || relaxedMode))
486                     {
487                         // retry per section 6.5.3.2
488                         if (IsFloatResultSubnormal(correct, ulps))
489                         {
490                             fail = fail && (test != 0.0f);
491                             if (!fail) err = 0.0f;
492                         }
493 
494                         // nextafter on FTZ platforms may return the smallest
495                         // normal float (2^-126) given a denormal or a zero
496                         // as the first argument. The rationale here is that
497                         // nextafter flushes the argument to zero and then
498                         // returns the next representable number in the
499                         // direction of the second argument, and since
500                         // denorms are considered as zero, the smallest
501                         // normal number is the next representable number.
502                         // In which case, it should have the same sign as the
503                         // second argument.
504                         if (isNextafter)
505                         {
506                             if (IsFloatSubnormal(s[j]) || s[j] == 0.0f)
507                             {
508                                 float value = copysignf(twoToMinus126, s2[j]);
509                                 fail = fail && (test != value);
510                                 if (!fail) err = 0.0f;
511                             }
512                         }
513                         else
514                         {
515                             // retry per section 6.5.3.3
516                             if (IsFloatSubnormal(s[j]))
517                             {
518                                 double correct2, correct3;
519                                 float err2, err3;
520 
521                                 if (skipNanInf) feclearexcept(FE_OVERFLOW);
522 
523                                 correct2 = ref_func(0.0, s2[j]);
524                                 correct3 = ref_func(-0.0, s2[j]);
525 
526                                 // Per section 10 paragraph 6, accept any result
527                                 // if an input or output is a infinity or NaN or
528                                 // overflow As per OpenCL 2.0 spec,
529                                 // section 5.8.4.3, enabling fast-relaxed-math
530                                 // mode also enables -cl-finite-math-only
531                                 // optimization. This optimization allows to
532                                 // assume that arguments and results are not
533                                 // NaNs or +/-INFs. Hence, accept any result if
534                                 // inputs or results are NaNs or INFs.
535                                 if (relaxedMode || skipNanInf)
536                                 {
537                                     if (fetestexcept(FE_OVERFLOW) && skipNanInf)
538                                         continue;
539 
540                                     // Note: no double rounding here.  Reference
541                                     // functions calculate in single precision.
542                                     if (IsFloatInfinity(correct2)
543                                         || IsFloatNaN(correct2)
544                                         || IsFloatInfinity(correct3)
545                                         || IsFloatNaN(correct3))
546                                         continue;
547                                 }
548 
549                                 err2 = Ulp_Error(test, correct2);
550                                 err3 = Ulp_Error(test, correct3);
551                                 fail = fail
552                                     && ((!(fabsf(err2) <= ulps))
553                                         && (!(fabsf(err3) <= ulps)));
554                                 if (fabsf(err2) < fabsf(err)) err = err2;
555                                 if (fabsf(err3) < fabsf(err)) err = err3;
556 
557                                 // retry per section 6.5.3.4
558                                 if (IsFloatResultSubnormal(correct2, ulps)
559                                     || IsFloatResultSubnormal(correct3, ulps))
560                                 {
561                                     fail = fail && (test != 0.0f);
562                                     if (!fail) err = 0.0f;
563                                 }
564 
565                                 // try with both args as zero
566                                 if (IsFloatSubnormal(s2[j]))
567                                 {
568                                     double correct4, correct5;
569                                     float err4, err5;
570 
571                                     if (skipNanInf) feclearexcept(FE_OVERFLOW);
572 
573                                     correct2 = ref_func(0.0, 0.0);
574                                     correct3 = ref_func(-0.0, 0.0);
575                                     correct4 = ref_func(0.0, -0.0);
576                                     correct5 = ref_func(-0.0, -0.0);
577 
578                                     // Per section 10 paragraph 6, accept any
579                                     // result if an input or output is a
580                                     // infinity or NaN or overflow As per
581                                     // OpenCL 2.0 spec, section 5.8.4.3,
582                                     // enabling fast-relaxed-math mode also
583                                     // enables -cl-finite-math-only
584                                     // optimization. This optimization allows to
585                                     // assume that arguments and results are not
586                                     // NaNs or +/-INFs. Hence, accept any result
587                                     // if inputs or results are NaNs or INFs.
588                                     if (relaxedMode || skipNanInf)
589                                     {
590                                         if (fetestexcept(FE_OVERFLOW)
591                                             && skipNanInf)
592                                             continue;
593 
594                                         // Note: no double rounding here.
595                                         // Reference functions calculate in
596                                         // single precision.
597                                         if (IsFloatInfinity(correct2)
598                                             || IsFloatNaN(correct2)
599                                             || IsFloatInfinity(correct3)
600                                             || IsFloatNaN(correct3)
601                                             || IsFloatInfinity(correct4)
602                                             || IsFloatNaN(correct4)
603                                             || IsFloatInfinity(correct5)
604                                             || IsFloatNaN(correct5))
605                                             continue;
606                                     }
607 
608                                     err2 = Ulp_Error(test, correct2);
609                                     err3 = Ulp_Error(test, correct3);
610                                     err4 = Ulp_Error(test, correct4);
611                                     err5 = Ulp_Error(test, correct5);
612                                     fail = fail
613                                         && ((!(fabsf(err2) <= ulps))
614                                             && (!(fabsf(err3) <= ulps))
615                                             && (!(fabsf(err4) <= ulps))
616                                             && (!(fabsf(err5) <= ulps)));
617                                     if (fabsf(err2) < fabsf(err)) err = err2;
618                                     if (fabsf(err3) < fabsf(err)) err = err3;
619                                     if (fabsf(err4) < fabsf(err)) err = err4;
620                                     if (fabsf(err5) < fabsf(err)) err = err5;
621 
622                                     // retry per section 6.5.3.4
623                                     if (IsFloatResultSubnormal(correct2, ulps)
624                                         || IsFloatResultSubnormal(correct3,
625                                                                   ulps)
626                                         || IsFloatResultSubnormal(correct4,
627                                                                   ulps)
628                                         || IsFloatResultSubnormal(correct5,
629                                                                   ulps))
630                                     {
631                                         fail = fail && (test != 0.0f);
632                                         if (!fail) err = 0.0f;
633                                     }
634                                 }
635                             }
636                             else if (IsFloatSubnormal(s2[j]))
637                             {
638                                 double correct2, correct3;
639                                 float err2, err3;
640 
641                                 if (skipNanInf) feclearexcept(FE_OVERFLOW);
642 
643                                 correct2 = ref_func(s[j], 0.0);
644                                 correct3 = ref_func(s[j], -0.0);
645 
646                                 // Per section 10 paragraph 6, accept any result
647                                 // if an input or output is a infinity or NaN or
648                                 // overflow As per OpenCL 2.0 spec,
649                                 // section 5.8.4.3, enabling fast-relaxed-math
650                                 // mode also enables -cl-finite-math-only
651                                 // optimization. This optimization allows to
652                                 // assume that arguments and results are not
653                                 // NaNs or +/-INFs. Hence, accept any result if
654                                 // inputs or results are NaNs or INFs.
655                                 if (relaxedMode || skipNanInf)
656                                 {
657                                     // Note: no double rounding here.  Reference
658                                     // functions calculate in single precision.
659                                     if (overflow[j] && skipNanInf) continue;
660 
661                                     if (IsFloatInfinity(correct2)
662                                         || IsFloatNaN(correct2)
663                                         || IsFloatInfinity(correct3)
664                                         || IsFloatNaN(correct3))
665                                         continue;
666                                 }
667 
668                                 err2 = Ulp_Error(test, correct2);
669                                 err3 = Ulp_Error(test, correct3);
670                                 fail = fail
671                                     && ((!(fabsf(err2) <= ulps))
672                                         && (!(fabsf(err3) <= ulps)));
673                                 if (fabsf(err2) < fabsf(err)) err = err2;
674                                 if (fabsf(err3) < fabsf(err)) err = err3;
675 
676                                 // retry per section 6.5.3.4
677                                 if (IsFloatResultSubnormal(correct2, ulps)
678                                     || IsFloatResultSubnormal(correct3, ulps))
679                                 {
680                                     fail = fail && (test != 0.0f);
681                                     if (!fail) err = 0.0f;
682                                 }
683                             }
684                         }
685                     }
686 
687                     if (fabsf(err) > tinfo->maxError)
688                     {
689                         tinfo->maxError = fabsf(err);
690                         tinfo->maxErrorValue = s[j];
691                         tinfo->maxErrorValue2 = s2[j];
692                     }
693                     if (fail)
694                     {
695                         vlog_error(
696                             "\nERROR: %s%s: %f ulp error at {%a (0x%x), %a "
697                             "(0x%x)}: *%a vs. %a (0x%8.8x) at index: %zu\n",
698                             name, sizeNames[k], err, s[j], ((cl_uint *)s)[j],
699                             s2[j], ((cl_uint *)s2)[j], r[j], test,
700                             ((cl_uint *)&test)[0], j);
701                         return -1;
702                     }
703                 }
704             }
705         }
706     }
707 
708     if (isFDim && gIsInRTZMode) (void)set_round(oldRoundMode, kfloat);
709 
710     for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
711     {
712         if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
713                                              out[j], 0, NULL, NULL)))
714         {
715             vlog_error("Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n",
716                        j, error);
717             return error;
718         }
719     }
720 
721     if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 3 failed\n");
722 
723 
724     if (0 == (base & 0x0fffffff))
725     {
726         if (gVerboseBruteForce)
727         {
728             vlog("base:%14u step:%10u scale:%10u buf_elements:%10zu ulps:%5.3f "
729                  "ThreadCount:%2u\n",
730                  base, job->step, job->scale, buffer_elements, job->ulps,
731                  job->threadCount);
732         }
733         else
734         {
735             vlog(".");
736         }
737         fflush(stdout);
738     }
739 
740     return CL_SUCCESS;
741 }
742 
743 } // anonymous namespace
744 
TestFunc_Float_Float_Float(const Func * f,MTdata d,bool relaxedMode)745 int TestFunc_Float_Float_Float(const Func *f, MTdata d, bool relaxedMode)
746 {
747     TestInfo test_info{};
748     cl_int error;
749     float maxError = 0.0f;
750     double maxErrorVal = 0.0;
751     double maxErrorVal2 = 0.0;
752 
753     logFunctionInfo(f->name, sizeof(cl_float), relaxedMode);
754 
755     // Init test_info
756     test_info.threadCount = GetThreadCount();
757     test_info.subBufferSize = BUFFER_SIZE
758         / (sizeof(cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount));
759     test_info.scale = getTestScale(sizeof(cl_float));
760 
761     test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
762     if (test_info.step / test_info.subBufferSize != test_info.scale)
763     {
764         // there was overflow
765         test_info.jobCount = 1;
766     }
767     else
768     {
769         test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step);
770     }
771 
772     test_info.f = f;
773     test_info.ulps = gIsEmbedded ? f->float_embedded_ulps : f->float_ulps;
774     test_info.ftz =
775         f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities);
776     test_info.relaxedMode = relaxedMode;
777     test_info.isFDim = 0 == strcmp("fdim", f->nameInCode);
778     test_info.skipNanInf = test_info.isFDim && !gInfNanSupport;
779     test_info.isNextafter = 0 == strcmp("nextafter", f->nameInCode);
780 
781     test_info.tinfo.resize(test_info.threadCount);
782     for (cl_uint i = 0; i < test_info.threadCount; i++)
783     {
784         cl_buffer_region region = {
785             i * test_info.subBufferSize * sizeof(cl_float),
786             test_info.subBufferSize * sizeof(cl_float)
787         };
788         test_info.tinfo[i].inBuf =
789             clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
790                               CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
791         if (error || NULL == test_info.tinfo[i].inBuf)
792         {
793             vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
794                        "region {%zd, %zd}\n",
795                        region.origin, region.size);
796             return error;
797         }
798         test_info.tinfo[i].inBuf2 =
799             clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY,
800                               CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
801         if (error || NULL == test_info.tinfo[i].inBuf2)
802         {
803             vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for "
804                        "region {%zd, %zd}\n",
805                        region.origin, region.size);
806             return error;
807         }
808 
809         for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
810         {
811             test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
812                 gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
813                 &region, &error);
814             if (error || NULL == test_info.tinfo[i].outBuf[j])
815             {
816                 vlog_error("Error: Unable to create sub-buffer of "
817                            "gOutBuffer[%d] for region {%zd, %zd}\n",
818                            (int)j, region.origin, region.size);
819                 return error;
820             }
821         }
822         test_info.tinfo[i].tQueue =
823             clCreateCommandQueue(gContext, gDevice, 0, &error);
824         if (NULL == test_info.tinfo[i].tQueue || error)
825         {
826             vlog_error("clCreateCommandQueue failed. (%d)\n", error);
827             return error;
828         }
829 
830         test_info.tinfo[i].d = MTdataHolder(genrand_int32(d));
831     }
832 
833     // Init the kernels
834     BuildKernelInfo build_info{ test_info.threadCount, test_info.k,
835                                 test_info.programs, f->nameInCode,
836                                 relaxedMode };
837     if ((error = ThreadPool_Do(BuildKernelFn,
838                                gMaxVectorSizeIndex - gMinVectorSizeIndex,
839                                &build_info)))
840         return error;
841 
842     // Run the kernels
843     if (!gSkipCorrectnessTesting)
844     {
845         error = ThreadPool_Do(Test, test_info.jobCount, &test_info);
846         if (error) return error;
847 
848         // Accumulate the arithmetic errors
849         for (cl_uint i = 0; i < test_info.threadCount; i++)
850         {
851             if (test_info.tinfo[i].maxError > maxError)
852             {
853                 maxError = test_info.tinfo[i].maxError;
854                 maxErrorVal = test_info.tinfo[i].maxErrorValue;
855                 maxErrorVal2 = test_info.tinfo[i].maxErrorValue2;
856             }
857         }
858 
859         if (gWimpyMode)
860             vlog("Wimp pass");
861         else
862             vlog("passed");
863 
864         vlog("\t%8.2f @ {%a, %a}", maxError, maxErrorVal, maxErrorVal2);
865     }
866 
867     vlog("\n");
868 
869     return CL_SUCCESS;
870 }
871