xref: /aosp_15_r20/external/OpenCL-CTS/test_conformance/math_brute_force/binary_i_double.cpp (revision 6467f958c7de8070b317fc65bcb0f6472e388d82)
1 //
2 // Copyright (c) 2017 The Khronos Group Inc.
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 //    http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 //
16 
17 #include "common.h"
18 #include "function_list.h"
19 #include "test_functions.h"
20 #include "utility.h"
21 
22 #include <climits>
23 #include <cstring>
24 
25 namespace {
26 
BuildKernelFn(cl_uint job_id,cl_uint thread_id UNUSED,void * p)27 cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
28 {
29     BuildKernelInfo &info = *(BuildKernelInfo *)p;
30     auto generator = [](const std::string &kernel_name, const char *builtin,
31                         cl_uint vector_size_index) {
32         return GetBinaryKernel(kernel_name, builtin, ParameterType::Double,
33                                ParameterType::Double, ParameterType::Int,
34                                vector_size_index);
35     };
36     return BuildKernels(info, job_id, generator);
37 }
38 
39 // Thread specific data for a worker thread
40 struct ThreadInfo
41 {
42     // Input and output buffers for the thread
43     clMemWrapper inBuf;
44     clMemWrapper inBuf2;
45     Buffers outBuf;
46 
47     float maxError; // max error value. Init to 0.
48     double
49         maxErrorValue; // position of the max error value (param 1).  Init to 0.
50     cl_int maxErrorValue2; // position of the max error value (param 2).  Init
51                            // to 0.
52     MTdataHolder d;
53 
54     // Per thread command queue to improve performance
55     clCommandQueueWrapper tQueue;
56 };
57 
58 struct TestInfo
59 {
60     size_t subBufferSize; // Size of the sub-buffer in elements
61     const Func *f; // A pointer to the function info
62 
63     // Programs for various vector sizes.
64     Programs programs;
65 
66     // Thread-specific kernels for each vector size:
67     // k[vector_size][thread_id]
68     KernelMatrix k;
69 
70     // Array of thread specific information
71     std::vector<ThreadInfo> tinfo;
72 
73     cl_uint threadCount; // Number of worker threads
74     cl_uint jobCount; // Number of jobs
75     cl_uint step; // step between each chunk and the next.
76     cl_uint scale; // stride between individual test values
77     float ulps; // max_allowed ulps
78     int ftz; // non-zero if running in flush to zero mode
79     bool relaxedMode; // True if test is running in relaxed mode, false
80                       // otherwise.
81 
82     // no special values
83 };
84 
85 // A table of more difficult cases to get right
86 const double specialValues[] = {
87     -NAN,
88     -INFINITY,
89     -DBL_MAX,
90     MAKE_HEX_DOUBLE(-0x1.0000000000001p64, -0x10000000000001LL, 12),
91     MAKE_HEX_DOUBLE(-0x1.0p64, -0x1LL, 64),
92     MAKE_HEX_DOUBLE(-0x1.fffffffffffffp63, -0x1fffffffffffffLL, 11),
93     MAKE_HEX_DOUBLE(-0x1.0000000000001p63, -0x10000000000001LL, 11),
94     MAKE_HEX_DOUBLE(-0x1.0p63, -0x1LL, 63),
95     MAKE_HEX_DOUBLE(-0x1.fffffffffffffp62, -0x1fffffffffffffLL, 10),
96     MAKE_HEX_DOUBLE(-0x1.000002p32, -0x1000002LL, 8),
97     MAKE_HEX_DOUBLE(-0x1.0p32, -0x1LL, 32),
98     MAKE_HEX_DOUBLE(-0x1.fffffffffffffp31, -0x1fffffffffffffLL, -21),
99     MAKE_HEX_DOUBLE(-0x1.0000000000001p31, -0x10000000000001LL, -21),
100     MAKE_HEX_DOUBLE(-0x1.0p31, -0x1LL, 31),
101     MAKE_HEX_DOUBLE(-0x1.fffffffffffffp30, -0x1fffffffffffffLL, -22),
102     -1000.0,
103     -100.0,
104     -4.0,
105     -3.5,
106     -3.0,
107     MAKE_HEX_DOUBLE(-0x1.8000000000001p1, -0x18000000000001LL, -51),
108     -2.5,
109     MAKE_HEX_DOUBLE(-0x1.7ffffffffffffp1, -0x17ffffffffffffLL, -51),
110     -2.0,
111     MAKE_HEX_DOUBLE(-0x1.8000000000001p0, -0x18000000000001LL, -52),
112     -1.5,
113     MAKE_HEX_DOUBLE(-0x1.7ffffffffffffp0, -0x17ffffffffffffLL, -52),
114     MAKE_HEX_DOUBLE(-0x1.0000000000001p0, -0x10000000000001LL, -52),
115     -1.0,
116     MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-1, -0x1fffffffffffffLL, -53),
117     MAKE_HEX_DOUBLE(-0x1.0000000000001p-1, -0x10000000000001LL, -53),
118     -0.5,
119     MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-2, -0x1fffffffffffffLL, -54),
120     MAKE_HEX_DOUBLE(-0x1.0000000000001p-2, -0x10000000000001LL, -54),
121     -0.25,
122     MAKE_HEX_DOUBLE(-0x1.fffffffffffffp-3, -0x1fffffffffffffLL, -55),
123     MAKE_HEX_DOUBLE(-0x1.0000000000001p-1022, -0x10000000000001LL, -1074),
124     -DBL_MIN,
125     MAKE_HEX_DOUBLE(-0x0.fffffffffffffp-1022, -0x0fffffffffffffLL, -1074),
126     MAKE_HEX_DOUBLE(-0x0.0000000000fffp-1022, -0x00000000000fffLL, -1074),
127     MAKE_HEX_DOUBLE(-0x0.00000000000fep-1022, -0x000000000000feLL, -1074),
128     MAKE_HEX_DOUBLE(-0x0.000000000000ep-1022, -0x0000000000000eLL, -1074),
129     MAKE_HEX_DOUBLE(-0x0.000000000000cp-1022, -0x0000000000000cLL, -1074),
130     MAKE_HEX_DOUBLE(-0x0.000000000000ap-1022, -0x0000000000000aLL, -1074),
131     MAKE_HEX_DOUBLE(-0x0.0000000000008p-1022, -0x00000000000008LL, -1074),
132     MAKE_HEX_DOUBLE(-0x0.0000000000007p-1022, -0x00000000000007LL, -1074),
133     MAKE_HEX_DOUBLE(-0x0.0000000000006p-1022, -0x00000000000006LL, -1074),
134     MAKE_HEX_DOUBLE(-0x0.0000000000005p-1022, -0x00000000000005LL, -1074),
135     MAKE_HEX_DOUBLE(-0x0.0000000000004p-1022, -0x00000000000004LL, -1074),
136     MAKE_HEX_DOUBLE(-0x0.0000000000003p-1022, -0x00000000000003LL, -1074),
137     MAKE_HEX_DOUBLE(-0x0.0000000000002p-1022, -0x00000000000002LL, -1074),
138     MAKE_HEX_DOUBLE(-0x0.0000000000001p-1022, -0x00000000000001LL, -1074),
139     -0.0,
140 
141     +NAN,
142     +INFINITY,
143     +DBL_MAX,
144     MAKE_HEX_DOUBLE(+0x1.0000000000001p64, +0x10000000000001LL, 12),
145     MAKE_HEX_DOUBLE(+0x1.0p64, +0x1LL, 64),
146     MAKE_HEX_DOUBLE(+0x1.fffffffffffffp63, +0x1fffffffffffffLL, 11),
147     MAKE_HEX_DOUBLE(+0x1.0000000000001p63, +0x10000000000001LL, 11),
148     MAKE_HEX_DOUBLE(+0x1.0p63, +0x1LL, 63),
149     MAKE_HEX_DOUBLE(+0x1.fffffffffffffp62, +0x1fffffffffffffLL, 10),
150     MAKE_HEX_DOUBLE(+0x1.000002p32, +0x1000002LL, 8),
151     MAKE_HEX_DOUBLE(+0x1.0p32, +0x1LL, 32),
152     MAKE_HEX_DOUBLE(+0x1.fffffffffffffp31, +0x1fffffffffffffLL, -21),
153     MAKE_HEX_DOUBLE(+0x1.0000000000001p31, +0x10000000000001LL, -21),
154     MAKE_HEX_DOUBLE(+0x1.0p31, +0x1LL, 31),
155     MAKE_HEX_DOUBLE(+0x1.fffffffffffffp30, +0x1fffffffffffffLL, -22),
156     +1000.0,
157     +100.0,
158     +4.0,
159     +3.5,
160     +3.0,
161     MAKE_HEX_DOUBLE(+0x1.8000000000001p1, +0x18000000000001LL, -51),
162     +2.5,
163     MAKE_HEX_DOUBLE(+0x1.7ffffffffffffp1, +0x17ffffffffffffLL, -51),
164     +2.0,
165     MAKE_HEX_DOUBLE(+0x1.8000000000001p0, +0x18000000000001LL, -52),
166     +1.5,
167     MAKE_HEX_DOUBLE(+0x1.7ffffffffffffp0, +0x17ffffffffffffLL, -52),
168     MAKE_HEX_DOUBLE(-0x1.0000000000001p0, -0x10000000000001LL, -52),
169     +1.0,
170     MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-1, +0x1fffffffffffffLL, -53),
171     MAKE_HEX_DOUBLE(+0x1.0000000000001p-1, +0x10000000000001LL, -53),
172     +0.5,
173     MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-2, +0x1fffffffffffffLL, -54),
174     MAKE_HEX_DOUBLE(+0x1.0000000000001p-2, +0x10000000000001LL, -54),
175     +0.25,
176     MAKE_HEX_DOUBLE(+0x1.fffffffffffffp-3, +0x1fffffffffffffLL, -55),
177     MAKE_HEX_DOUBLE(+0x1.0000000000001p-1022, +0x10000000000001LL, -1074),
178     +DBL_MIN,
179     MAKE_HEX_DOUBLE(+0x0.fffffffffffffp-1022, +0x0fffffffffffffLL, -1074),
180     MAKE_HEX_DOUBLE(+0x0.0000000000fffp-1022, +0x00000000000fffLL, -1074),
181     MAKE_HEX_DOUBLE(+0x0.00000000000fep-1022, +0x000000000000feLL, -1074),
182     MAKE_HEX_DOUBLE(+0x0.000000000000ep-1022, +0x0000000000000eLL, -1074),
183     MAKE_HEX_DOUBLE(+0x0.000000000000cp-1022, +0x0000000000000cLL, -1074),
184     MAKE_HEX_DOUBLE(+0x0.000000000000ap-1022, +0x0000000000000aLL, -1074),
185     MAKE_HEX_DOUBLE(+0x0.0000000000008p-1022, +0x00000000000008LL, -1074),
186     MAKE_HEX_DOUBLE(+0x0.0000000000007p-1022, +0x00000000000007LL, -1074),
187     MAKE_HEX_DOUBLE(+0x0.0000000000006p-1022, +0x00000000000006LL, -1074),
188     MAKE_HEX_DOUBLE(+0x0.0000000000005p-1022, +0x00000000000005LL, -1074),
189     MAKE_HEX_DOUBLE(+0x0.0000000000004p-1022, +0x00000000000004LL, -1074),
190     MAKE_HEX_DOUBLE(+0x0.0000000000003p-1022, +0x00000000000003LL, -1074),
191     MAKE_HEX_DOUBLE(+0x0.0000000000002p-1022, +0x00000000000002LL, -1074),
192     MAKE_HEX_DOUBLE(+0x0.0000000000001p-1022, +0x00000000000001LL, -1074),
193     +0.0,
194 };
195 
196 constexpr size_t specialValuesCount =
197     sizeof(specialValues) / sizeof(specialValues[0]);
198 
199 const int specialValuesInt[] = {
200     0,       1,  2,  3,  1022,  1023,  1024,   INT_MIN,
201     INT_MAX, -1, -2, -3, -1022, -1023, -11024, -INT_MAX,
202 };
203 
204 constexpr size_t specialValuesIntCount =
205     sizeof(specialValuesInt) / sizeof(specialValuesInt[0]);
206 
Test(cl_uint job_id,cl_uint thread_id,void * data)207 cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
208 {
209     TestInfo *job = (TestInfo *)data;
210     size_t buffer_elements = job->subBufferSize;
211     size_t buffer_size = buffer_elements * sizeof(cl_double);
212     cl_uint base = job_id * (cl_uint)job->step;
213     ThreadInfo *tinfo = &(job->tinfo[thread_id]);
214     float ulps = job->ulps;
215     dptr func = job->f->dfunc;
216     int ftz = job->ftz;
217     bool relaxedMode = job->relaxedMode;
218     MTdata d = tinfo->d;
219     cl_int error;
220     const char *name = job->f->name;
221     cl_ulong *t;
222     cl_double *r;
223     cl_double *s;
224     cl_int *s2;
225 
226     Force64BitFPUPrecision();
227 
228     cl_event e[VECTOR_SIZE_COUNT];
229     cl_ulong *out[VECTOR_SIZE_COUNT];
230     if (gHostFill)
231     {
232         // Start the map of the output arrays
233         for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
234         {
235             out[j] = (cl_ulong *)clEnqueueMapBuffer(
236                 tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0,
237                 buffer_size, 0, NULL, e + j, &error);
238             if (error || NULL == out[j])
239             {
240                 vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
241                            error);
242                 return error;
243             }
244         }
245 
246         // Get that moving
247         if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n");
248     }
249 
250     // Init input array
251     cl_ulong *p = (cl_ulong *)gIn + thread_id * buffer_elements;
252     cl_int *p2 = (cl_int *)gIn2 + thread_id * buffer_elements;
253     size_t idx = 0;
254     int totalSpecialValueCount = specialValuesCount * specialValuesIntCount;
255     int lastSpecialJobIndex = (totalSpecialValueCount - 1) / buffer_elements;
256 
257     // Test edge cases
258     if (job_id <= (cl_uint)lastSpecialJobIndex)
259     {
260         cl_double *fp = (cl_double *)p;
261         cl_int *ip2 = (cl_int *)p2;
262         uint32_t x, y;
263 
264         x = (job_id * buffer_elements) % specialValuesCount;
265         y = (job_id * buffer_elements) / specialValuesCount;
266 
267         for (; idx < buffer_elements; idx++)
268         {
269             fp[idx] = specialValues[x];
270             ip2[idx] = specialValuesInt[y];
271             if (++x >= specialValuesCount)
272             {
273                 x = 0;
274                 y++;
275                 if (y >= specialValuesIntCount) break;
276             }
277         }
278     }
279 
280     // Init any remaining values.
281     for (; idx < buffer_elements; idx++)
282     {
283         p[idx] = DoubleFromUInt32(genrand_int32(d));
284         p2[idx] = genrand_int32(d);
285     }
286 
287     if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0,
288                                       buffer_size, p, 0, NULL, NULL)))
289     {
290         vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
291         return error;
292     }
293 
294     if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf2, CL_FALSE, 0,
295                                       buffer_size / 2, p2, 0, NULL, NULL)))
296     {
297         vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
298         return error;
299     }
300 
301     for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
302     {
303         if (gHostFill)
304         {
305             // Wait for the map to finish
306             if ((error = clWaitForEvents(1, e + j)))
307             {
308                 vlog_error("Error: clWaitForEvents failed! err: %d\n", error);
309                 return error;
310             }
311             if ((error = clReleaseEvent(e[j])))
312             {
313                 vlog_error("Error: clReleaseEvent failed! err: %d\n", error);
314                 return error;
315             }
316         }
317 
318         // Fill the result buffer with garbage, so that old results don't carry
319         // over
320         uint32_t pattern = 0xffffdead;
321         if (gHostFill)
322         {
323             memset_pattern4(out[j], &pattern, buffer_size);
324             if ((error = clEnqueueUnmapMemObject(
325                      tinfo->tQueue, tinfo->outBuf[j], out[j], 0, NULL, NULL)))
326             {
327                 vlog_error("Error: clEnqueueUnmapMemObject failed! err: %d\n",
328                            error);
329                 return error;
330             }
331         }
332         else
333         {
334             if ((error = clEnqueueFillBuffer(tinfo->tQueue, tinfo->outBuf[j],
335                                              &pattern, sizeof(pattern), 0,
336                                              buffer_size, 0, NULL, NULL)))
337             {
338                 vlog_error("Error: clEnqueueFillBuffer failed! err: %d\n",
339                            error);
340                 return error;
341             }
342         }
343 
344         // Run the kernel
345         size_t vectorCount =
346             (buffer_elements + sizeValues[j] - 1) / sizeValues[j];
347         cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its
348                                                  // own copy of the cl_kernel
349         cl_program program = job->programs[j];
350 
351         if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]),
352                                     &tinfo->outBuf[j])))
353         {
354             LogBuildError(program);
355             return error;
356         }
357         if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf),
358                                     &tinfo->inBuf)))
359         {
360             LogBuildError(program);
361             return error;
362         }
363         if ((error = clSetKernelArg(kernel, 2, sizeof(tinfo->inBuf2),
364                                     &tinfo->inBuf2)))
365         {
366             LogBuildError(program);
367             return error;
368         }
369 
370         if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL,
371                                             &vectorCount, NULL, 0, NULL, NULL)))
372         {
373             vlog_error("FAILED -- could not execute kernel\n");
374             return error;
375         }
376     }
377 
378     // Get that moving
379     if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 2 failed\n");
380 
381     if (gSkipCorrectnessTesting) return CL_SUCCESS;
382 
383     // Calculate the correctly rounded reference result
384     r = (cl_double *)gOut_Ref + thread_id * buffer_elements;
385     s = (cl_double *)gIn + thread_id * buffer_elements;
386     s2 = (cl_int *)gIn2 + thread_id * buffer_elements;
387     for (size_t j = 0; j < buffer_elements; j++)
388         r[j] = (cl_double)func.f_fi(s[j], s2[j]);
389 
390     // Read the data back -- no need to wait for the first N-1 buffers but wait
391     // for the last buffer. This is an in order queue.
392     for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
393     {
394         cl_bool blocking = (j + 1 < gMaxVectorSizeIndex) ? CL_FALSE : CL_TRUE;
395         out[j] = (cl_ulong *)clEnqueueMapBuffer(
396             tinfo->tQueue, tinfo->outBuf[j], blocking, CL_MAP_READ, 0,
397             buffer_size, 0, NULL, NULL, &error);
398         if (error || NULL == out[j])
399         {
400             vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
401                        error);
402             return error;
403         }
404     }
405 
406     // Verify data
407     t = (cl_ulong *)r;
408     for (size_t j = 0; j < buffer_elements; j++)
409     {
410         for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
411         {
412             cl_ulong *q = out[k];
413 
414             // If we aren't getting the correctly rounded result
415             if (t[j] != q[j])
416             {
417                 cl_double test = ((cl_double *)q)[j];
418                 long double correct = func.f_fi(s[j], s2[j]);
419                 float err = Bruteforce_Ulp_Error_Double(test, correct);
420                 int fail = !(fabsf(err) <= ulps);
421 
422                 if (fail && (ftz || relaxedMode))
423                 {
424                     // retry per section 6.5.3.2
425                     if (IsDoubleResultSubnormal(correct, ulps))
426                     {
427                         fail = fail && (test != 0.0f);
428                         if (!fail) err = 0.0f;
429                     }
430 
431                     // retry per section 6.5.3.3
432                     if (IsDoubleSubnormal(s[j]))
433                     {
434                         long double correct2 = func.f_fi(0.0, s2[j]);
435                         long double correct3 = func.f_fi(-0.0, s2[j]);
436                         float err2 =
437                             Bruteforce_Ulp_Error_Double(test, correct2);
438                         float err3 =
439                             Bruteforce_Ulp_Error_Double(test, correct3);
440                         fail = fail
441                             && ((!(fabsf(err2) <= ulps))
442                                 && (!(fabsf(err3) <= ulps)));
443                         if (fabsf(err2) < fabsf(err)) err = err2;
444                         if (fabsf(err3) < fabsf(err)) err = err3;
445 
446                         // retry per section 6.5.3.4
447                         if (IsDoubleResultSubnormal(correct2, ulps)
448                             || IsDoubleResultSubnormal(correct3, ulps))
449                         {
450                             fail = fail && (test != 0.0f);
451                             if (!fail) err = 0.0f;
452                         }
453                     }
454                 }
455 
456                 if (fabsf(err) > tinfo->maxError)
457                 {
458                     tinfo->maxError = fabsf(err);
459                     tinfo->maxErrorValue = s[j];
460                     tinfo->maxErrorValue2 = s2[j];
461                 }
462                 if (fail)
463                 {
464                     vlog_error("\nERROR: %s%s: %f ulp error at {%.13la, %d}: "
465                                "*%.13la vs. %.13la\n",
466                                name, sizeNames[k], err, s[j], s2[j], r[j],
467                                test);
468                     return -1;
469                 }
470             }
471         }
472     }
473 
474     for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
475     {
476         if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
477                                              out[j], 0, NULL, NULL)))
478         {
479             vlog_error("Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n",
480                        j, error);
481             return error;
482         }
483     }
484 
485     if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 3 failed\n");
486 
487     if (0 == (base & 0x0fffffff))
488     {
489         if (gVerboseBruteForce)
490         {
491             vlog("base:%14u step:%10u scale:%10u buf_elements:%10zd ulps:%5.3f "
492                  "ThreadCount:%2u\n",
493                  base, job->step, job->scale, buffer_elements, job->ulps,
494                  job->threadCount);
495         }
496         else
497         {
498             vlog(".");
499         }
500         fflush(stdout);
501     }
502 
503     return CL_SUCCESS;
504 }
505 
506 } // anonymous namespace
507 
TestFunc_Double_Double_Int(const Func * f,MTdata d,bool relaxedMode)508 int TestFunc_Double_Double_Int(const Func *f, MTdata d, bool relaxedMode)
509 {
510     TestInfo test_info{};
511     cl_int error;
512     float maxError = 0.0f;
513     double maxErrorVal = 0.0;
514     cl_int maxErrorVal2 = 0;
515 
516     logFunctionInfo(f->name, sizeof(cl_double), relaxedMode);
517 
518     // Init test_info
519     test_info.threadCount = GetThreadCount();
520     test_info.subBufferSize = BUFFER_SIZE
521         / (sizeof(cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount));
522     test_info.scale = getTestScale(sizeof(cl_double));
523 
524     test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
525     if (test_info.step / test_info.subBufferSize != test_info.scale)
526     {
527         // there was overflow
528         test_info.jobCount = 1;
529     }
530     else
531     {
532         test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step);
533     }
534 
535     test_info.f = f;
536     test_info.ulps = f->double_ulps;
537     test_info.ftz = f->ftz || gForceFTZ;
538     test_info.relaxedMode = relaxedMode;
539 
540     test_info.tinfo.resize(test_info.threadCount);
541     for (cl_uint i = 0; i < test_info.threadCount; i++)
542     {
543         cl_buffer_region region = {
544             i * test_info.subBufferSize * sizeof(cl_double),
545             test_info.subBufferSize * sizeof(cl_double)
546         };
547         test_info.tinfo[i].inBuf =
548             clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
549                               CL_BUFFER_CREATE_TYPE_REGION, &region, &error);
550         if (error || NULL == test_info.tinfo[i].inBuf)
551         {
552             vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
553                        "region {%zd, %zd}\n",
554                        region.origin, region.size);
555             return error;
556         }
557         cl_buffer_region region2 = { i * test_info.subBufferSize
558                                          * sizeof(cl_int),
559                                      test_info.subBufferSize * sizeof(cl_int) };
560         test_info.tinfo[i].inBuf2 =
561             clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY,
562                               CL_BUFFER_CREATE_TYPE_REGION, &region2, &error);
563         if (error || NULL == test_info.tinfo[i].inBuf2)
564         {
565             vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for "
566                        "region {%zd, %zd}\n",
567                        region.origin, region.size);
568             return error;
569         }
570 
571         for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
572         {
573             test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
574                 gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
575                 &region, &error);
576             if (error || NULL == test_info.tinfo[i].outBuf[j])
577             {
578                 vlog_error("Error: Unable to create sub-buffer of "
579                            "gOutBuffer[%d] for region {%zd, %zd}\n",
580                            (int)j, region.origin, region.size);
581                 return error;
582             }
583         }
584         test_info.tinfo[i].tQueue =
585             clCreateCommandQueue(gContext, gDevice, 0, &error);
586         if (NULL == test_info.tinfo[i].tQueue || error)
587         {
588             vlog_error("clCreateCommandQueue failed. (%d)\n", error);
589             return error;
590         }
591 
592         test_info.tinfo[i].d = MTdataHolder(genrand_int32(d));
593     }
594 
595     // Init the kernels
596     BuildKernelInfo build_info{ test_info.threadCount, test_info.k,
597                                 test_info.programs, f->nameInCode,
598                                 relaxedMode };
599     if ((error = ThreadPool_Do(BuildKernelFn,
600                                gMaxVectorSizeIndex - gMinVectorSizeIndex,
601                                &build_info)))
602         return error;
603 
604     // Run the kernels
605     if (!gSkipCorrectnessTesting)
606     {
607         error = ThreadPool_Do(Test, test_info.jobCount, &test_info);
608         if (error) return error;
609 
610         // Accumulate the arithmetic errors
611         for (cl_uint i = 0; i < test_info.threadCount; i++)
612         {
613             if (test_info.tinfo[i].maxError > maxError)
614             {
615                 maxError = test_info.tinfo[i].maxError;
616                 maxErrorVal = test_info.tinfo[i].maxErrorValue;
617                 maxErrorVal2 = test_info.tinfo[i].maxErrorValue2;
618             }
619         }
620 
621         if (gWimpyMode)
622             vlog("Wimp pass");
623         else
624             vlog("passed");
625 
626         vlog("\t%8.2f @ {%a, %d}", maxError, maxErrorVal, maxErrorVal2);
627     }
628 
629     vlog("\n");
630 
631     return CL_SUCCESS;
632 }
633