1 //
2 // Copyright (c) 2017 The Khronos Group Inc.
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 // http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 //
16
17 #include "common.h"
18 #include "function_list.h"
19 #include "test_functions.h"
20 #include "utility.h"
21
22 #include <cstring>
23
24 namespace {
25
BuildKernelFn(cl_uint job_id,cl_uint thread_id UNUSED,void * p)26 cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
27 {
28 BuildKernelInfo &info = *(BuildKernelInfo *)p;
29 auto generator = [](const std::string &kernel_name, const char *builtin,
30 cl_uint vector_size_index) {
31 return GetBinaryKernel(kernel_name, builtin, ParameterType::Float,
32 ParameterType::Float, ParameterType::Float,
33 vector_size_index);
34 };
35 return BuildKernels(info, job_id, generator);
36 }
37
38 // Thread specific data for a worker thread
39 struct ThreadInfo
40 {
41 // Input and output buffers for the thread
42 clMemWrapper inBuf;
43 clMemWrapper inBuf2;
44 Buffers outBuf;
45
46 float maxError; // max error value. Init to 0.
47 double
48 maxErrorValue; // position of the max error value (param 1). Init to 0.
49 double maxErrorValue2; // position of the max error value (param 2). Init
50 // to 0.
51 MTdataHolder d;
52
53 // Per thread command queue to improve performance
54 clCommandQueueWrapper tQueue;
55 };
56
57 struct TestInfo
58 {
59 size_t subBufferSize; // Size of the sub-buffer in elements
60 const Func *f; // A pointer to the function info
61
62 // Programs for various vector sizes.
63 Programs programs;
64
65 // Thread-specific kernels for each vector size:
66 // k[vector_size][thread_id]
67 KernelMatrix k;
68
69 // Array of thread specific information
70 std::vector<ThreadInfo> tinfo;
71
72 cl_uint threadCount; // Number of worker threads
73 cl_uint jobCount; // Number of jobs
74 cl_uint step; // step between each chunk and the next.
75 cl_uint scale; // stride between individual test values
76 float ulps; // max_allowed ulps
77 int ftz; // non-zero if running in flush to zero mode
78 bool relaxedMode; // True if the test is being run in relaxed mode, false
79 // otherwise.
80
81 // no special fields
82 };
83
84 // A table of more difficult cases to get right
85 const float specialValues[] = {
86 -NAN,
87 -INFINITY,
88 -FLT_MAX,
89 MAKE_HEX_FLOAT(-0x1.000002p64f, -0x1000002L, 40),
90 MAKE_HEX_FLOAT(-0x1.0p64f, -0x1L, 64),
91 MAKE_HEX_FLOAT(-0x1.fffffep63f, -0x1fffffeL, 39),
92 MAKE_HEX_FLOAT(-0x1.000002p63f, -0x1000002L, 39),
93 MAKE_HEX_FLOAT(-0x1.0p63f, -0x1L, 63),
94 MAKE_HEX_FLOAT(-0x1.fffffep62f, -0x1fffffeL, 38),
95 MAKE_HEX_FLOAT(-0x1.000002p32f, -0x1000002L, 8),
96 MAKE_HEX_FLOAT(-0x1.0p32f, -0x1L, 32),
97 MAKE_HEX_FLOAT(-0x1.fffffep31f, -0x1fffffeL, 7),
98 MAKE_HEX_FLOAT(-0x1.000002p31f, -0x1000002L, 7),
99 MAKE_HEX_FLOAT(-0x1.0p31f, -0x1L, 31),
100 MAKE_HEX_FLOAT(-0x1.fffffep30f, -0x1fffffeL, 6),
101 -1000.f,
102 -100.f,
103 -4.0f,
104 -3.5f,
105 -3.0f,
106 MAKE_HEX_FLOAT(-0x1.800002p1f, -0x1800002L, -23),
107 -2.5f,
108 MAKE_HEX_FLOAT(-0x1.7ffffep1f, -0x17ffffeL, -23),
109 -2.0f,
110 MAKE_HEX_FLOAT(-0x1.800002p0f, -0x1800002L, -24),
111 -1.5f,
112 MAKE_HEX_FLOAT(-0x1.7ffffep0f, -0x17ffffeL, -24),
113 MAKE_HEX_FLOAT(-0x1.000002p0f, -0x1000002L, -24),
114 -1.0f,
115 MAKE_HEX_FLOAT(-0x1.fffffep-1f, -0x1fffffeL, -25),
116 MAKE_HEX_FLOAT(-0x1.000002p-1f, -0x1000002L, -25),
117 -0.5f,
118 MAKE_HEX_FLOAT(-0x1.fffffep-2f, -0x1fffffeL, -26),
119 MAKE_HEX_FLOAT(-0x1.000002p-2f, -0x1000002L, -26),
120 -0.25f,
121 MAKE_HEX_FLOAT(-0x1.fffffep-3f, -0x1fffffeL, -27),
122 MAKE_HEX_FLOAT(-0x1.000002p-126f, -0x1000002L, -150),
123 -FLT_MIN,
124 MAKE_HEX_FLOAT(-0x0.fffffep-126f, -0x0fffffeL, -150),
125 MAKE_HEX_FLOAT(-0x0.000ffep-126f, -0x0000ffeL, -150),
126 MAKE_HEX_FLOAT(-0x0.0000fep-126f, -0x00000feL, -150),
127 MAKE_HEX_FLOAT(-0x0.00000ep-126f, -0x000000eL, -150),
128 MAKE_HEX_FLOAT(-0x0.00000cp-126f, -0x000000cL, -150),
129 MAKE_HEX_FLOAT(-0x0.00000ap-126f, -0x000000aL, -150),
130 MAKE_HEX_FLOAT(-0x0.000008p-126f, -0x0000008L, -150),
131 MAKE_HEX_FLOAT(-0x0.000006p-126f, -0x0000006L, -150),
132 MAKE_HEX_FLOAT(-0x0.000004p-126f, -0x0000004L, -150),
133 MAKE_HEX_FLOAT(-0x0.000002p-126f, -0x0000002L, -150),
134 -0.0f,
135
136 +NAN,
137 +INFINITY,
138 +FLT_MAX,
139 MAKE_HEX_FLOAT(+0x1.000002p64f, +0x1000002L, 40),
140 MAKE_HEX_FLOAT(+0x1.0p64f, +0x1L, 64),
141 MAKE_HEX_FLOAT(+0x1.fffffep63f, +0x1fffffeL, 39),
142 MAKE_HEX_FLOAT(+0x1.000002p63f, +0x1000002L, 39),
143 MAKE_HEX_FLOAT(+0x1.0p63f, +0x1L, 63),
144 MAKE_HEX_FLOAT(+0x1.fffffep62f, +0x1fffffeL, 38),
145 MAKE_HEX_FLOAT(+0x1.000002p32f, +0x1000002L, 8),
146 MAKE_HEX_FLOAT(+0x1.0p32f, +0x1L, 32),
147 MAKE_HEX_FLOAT(+0x1.fffffep31f, +0x1fffffeL, 7),
148 MAKE_HEX_FLOAT(+0x1.000002p31f, +0x1000002L, 7),
149 MAKE_HEX_FLOAT(+0x1.0p31f, +0x1L, 31),
150 MAKE_HEX_FLOAT(+0x1.fffffep30f, +0x1fffffeL, 6),
151 +1000.f,
152 +100.f,
153 +4.0f,
154 +3.5f,
155 +3.0f,
156 MAKE_HEX_FLOAT(+0x1.800002p1f, +0x1800002L, -23),
157 2.5f,
158 MAKE_HEX_FLOAT(+0x1.7ffffep1f, +0x17ffffeL, -23),
159 +2.0f,
160 MAKE_HEX_FLOAT(+0x1.800002p0f, +0x1800002L, -24),
161 1.5f,
162 MAKE_HEX_FLOAT(+0x1.7ffffep0f, +0x17ffffeL, -24),
163 MAKE_HEX_FLOAT(+0x1.000002p0f, +0x1000002L, -24),
164 +1.0f,
165 MAKE_HEX_FLOAT(+0x1.fffffep-1f, +0x1fffffeL, -25),
166 MAKE_HEX_FLOAT(+0x1.000002p-1f, +0x1000002L, -25),
167 +0.5f,
168 MAKE_HEX_FLOAT(+0x1.fffffep-2f, +0x1fffffeL, -26),
169 MAKE_HEX_FLOAT(+0x1.000002p-2f, +0x1000002L, -26),
170 +0.25f,
171 MAKE_HEX_FLOAT(+0x1.fffffep-3f, +0x1fffffeL, -27),
172 MAKE_HEX_FLOAT(0x1.000002p-126f, 0x1000002L, -150),
173 +FLT_MIN,
174 MAKE_HEX_FLOAT(+0x0.fffffep-126f, +0x0fffffeL, -150),
175 MAKE_HEX_FLOAT(+0x0.000ffep-126f, +0x0000ffeL, -150),
176 MAKE_HEX_FLOAT(+0x0.0000fep-126f, +0x00000feL, -150),
177 MAKE_HEX_FLOAT(+0x0.00000ep-126f, +0x000000eL, -150),
178 MAKE_HEX_FLOAT(+0x0.00000cp-126f, +0x000000cL, -150),
179 MAKE_HEX_FLOAT(+0x0.00000ap-126f, +0x000000aL, -150),
180 MAKE_HEX_FLOAT(+0x0.000008p-126f, +0x0000008L, -150),
181 MAKE_HEX_FLOAT(+0x0.000006p-126f, +0x0000006L, -150),
182 MAKE_HEX_FLOAT(+0x0.000004p-126f, +0x0000004L, -150),
183 MAKE_HEX_FLOAT(+0x0.000002p-126f, +0x0000002L, -150),
184 +0.0f,
185 };
186
187 constexpr size_t specialValuesCount =
188 sizeof(specialValues) / sizeof(specialValues[0]);
189
Test(cl_uint job_id,cl_uint thread_id,void * data)190 cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
191 {
192 TestInfo *job = (TestInfo *)data;
193 size_t buffer_elements = job->subBufferSize;
194 size_t buffer_size = buffer_elements * sizeof(cl_float);
195 cl_uint base = job_id * (cl_uint)job->step;
196 ThreadInfo *tinfo = &(job->tinfo[thread_id]);
197 fptr func = job->f->func;
198 int ftz = job->ftz;
199 bool relaxedMode = job->relaxedMode;
200 float ulps = getAllowedUlpError(job->f, relaxedMode);
201 MTdata d = tinfo->d;
202 cl_int error;
203 std::vector<bool> overflow(buffer_elements, false);
204 const char *name = job->f->name;
205 cl_uint *t = 0;
206 cl_float *r = 0;
207 cl_float *s = 0;
208 cl_float *s2 = 0;
209 RoundingMode oldRoundMode;
210
211 if (relaxedMode)
212 {
213 func = job->f->rfunc;
214 }
215
216 cl_event e[VECTOR_SIZE_COUNT];
217 cl_uint *out[VECTOR_SIZE_COUNT];
218 if (gHostFill)
219 {
220 // start the map of the output arrays
221 for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
222 {
223 out[j] = (cl_uint *)clEnqueueMapBuffer(
224 tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0,
225 buffer_size, 0, NULL, e + j, &error);
226 if (error || NULL == out[j])
227 {
228 vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
229 error);
230 return error;
231 }
232 }
233
234 // Get that moving
235 if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n");
236 }
237
238 // Init input array
239 cl_uint *p = (cl_uint *)gIn + thread_id * buffer_elements;
240 cl_uint *p2 = (cl_uint *)gIn2 + thread_id * buffer_elements;
241 cl_uint idx = 0;
242 int totalSpecialValueCount = specialValuesCount * specialValuesCount;
243 int lastSpecialJobIndex = (totalSpecialValueCount - 1) / buffer_elements;
244
245 if (job_id <= (cl_uint)lastSpecialJobIndex)
246 {
247 // Insert special values
248 uint32_t x, y;
249
250 x = (job_id * buffer_elements) % specialValuesCount;
251 y = (job_id * buffer_elements) / specialValuesCount;
252
253 for (; idx < buffer_elements; idx++)
254 {
255 p[idx] = ((cl_uint *)specialValues)[x];
256 p2[idx] = ((cl_uint *)specialValues)[y];
257 ++x;
258 if (x >= specialValuesCount)
259 {
260 x = 0;
261 y++;
262 if (y >= specialValuesCount) break;
263 }
264 if (relaxedMode && strcmp(name, "divide") == 0)
265 {
266 cl_uint pj = p[idx] & 0x7fffffff;
267 cl_uint p2j = p2[idx] & 0x7fffffff;
268 // Replace values outside [2^-62, 2^62] with QNaN
269 if (pj < 0x20800000 || pj > 0x5e800000) p[idx] = 0x7fc00000;
270 if (p2j < 0x20800000 || p2j > 0x5e800000) p2[idx] = 0x7fc00000;
271 }
272 }
273 }
274
275 // Init any remaining values
276 for (; idx < buffer_elements; idx++)
277 {
278 p[idx] = genrand_int32(d);
279 p2[idx] = genrand_int32(d);
280
281 if (relaxedMode && strcmp(name, "divide") == 0)
282 {
283 cl_uint pj = p[idx] & 0x7fffffff;
284 cl_uint p2j = p2[idx] & 0x7fffffff;
285 // Replace values outside [2^-62, 2^62] with QNaN
286 if (pj < 0x20800000 || pj > 0x5e800000) p[idx] = 0x7fc00000;
287 if (p2j < 0x20800000 || p2j > 0x5e800000) p2[idx] = 0x7fc00000;
288 }
289 }
290
291 if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0,
292 buffer_size, p, 0, NULL, NULL)))
293 {
294 vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
295 return error;
296 }
297
298 if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf2, CL_FALSE, 0,
299 buffer_size, p2, 0, NULL, NULL)))
300 {
301 vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
302 return error;
303 }
304
305 for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
306 {
307 if (gHostFill)
308 {
309 // Wait for the map to finish
310 if ((error = clWaitForEvents(1, e + j)))
311 {
312 vlog_error("Error: clWaitForEvents failed! err: %d\n", error);
313 return error;
314 }
315 if ((error = clReleaseEvent(e[j])))
316 {
317 vlog_error("Error: clReleaseEvent failed! err: %d\n", error);
318 return error;
319 }
320 }
321
322 // Fill the result buffer with garbage, so that old results don't carry
323 // over
324 uint32_t pattern = 0xffffdead;
325 if (gHostFill)
326 {
327 memset_pattern4(out[j], &pattern, buffer_size);
328 if ((error = clEnqueueUnmapMemObject(
329 tinfo->tQueue, tinfo->outBuf[j], out[j], 0, NULL, NULL)))
330 {
331 vlog_error("Error: clEnqueueUnmapMemObject failed! err: %d\n",
332 error);
333 return error;
334 }
335 }
336 else
337 {
338 if ((error = clEnqueueFillBuffer(tinfo->tQueue, tinfo->outBuf[j],
339 &pattern, sizeof(pattern), 0,
340 buffer_size, 0, NULL, NULL)))
341 {
342 vlog_error("Error: clEnqueueFillBuffer failed! err: %d\n",
343 error);
344 return error;
345 }
346 }
347
348 // Run the kernel
349 size_t vectorCount =
350 (buffer_elements + sizeValues[j] - 1) / sizeValues[j];
351 cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its
352 // own copy of the cl_kernel
353 cl_program program = job->programs[j];
354
355 if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]),
356 &tinfo->outBuf[j])))
357 {
358 LogBuildError(program);
359 return error;
360 }
361 if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf),
362 &tinfo->inBuf)))
363 {
364 LogBuildError(program);
365 return error;
366 }
367 if ((error = clSetKernelArg(kernel, 2, sizeof(tinfo->inBuf2),
368 &tinfo->inBuf2)))
369 {
370 LogBuildError(program);
371 return error;
372 }
373
374 if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL,
375 &vectorCount, NULL, 0, NULL, NULL)))
376 {
377 vlog_error("FAILED -- could not execute kernel\n");
378 return error;
379 }
380 }
381
382 // Get that moving
383 if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 2 failed\n");
384
385 if (gSkipCorrectnessTesting)
386 {
387 return CL_SUCCESS;
388 }
389
390 // Calculate the correctly rounded reference result
391 FPU_mode_type oldMode;
392 memset(&oldMode, 0, sizeof(oldMode));
393 if (ftz || relaxedMode) ForceFTZ(&oldMode);
394
395 // Set the rounding mode to match the device
396 oldRoundMode = kRoundToNearestEven;
397 if (gIsInRTZMode) oldRoundMode = set_round(kRoundTowardZero, kfloat);
398
399 // Calculate the correctly rounded reference result
400 r = (float *)gOut_Ref + thread_id * buffer_elements;
401 s = (float *)gIn + thread_id * buffer_elements;
402 s2 = (float *)gIn2 + thread_id * buffer_elements;
403 if (gInfNanSupport)
404 {
405 for (size_t j = 0; j < buffer_elements; j++)
406 r[j] = (float)func.f_ff(s[j], s2[j]);
407 }
408 else
409 {
410 for (size_t j = 0; j < buffer_elements; j++)
411 {
412 feclearexcept(FE_OVERFLOW);
413 r[j] = (float)func.f_ff(s[j], s2[j]);
414 overflow[j] =
415 FE_OVERFLOW == (FE_OVERFLOW & fetestexcept(FE_OVERFLOW));
416 }
417 }
418
419 if (gIsInRTZMode) (void)set_round(oldRoundMode, kfloat);
420
421 if (ftz || relaxedMode) RestoreFPState(&oldMode);
422
423 // Read the data back -- no need to wait for the first N-1 buffers but wait
424 // for the last buffer. This is an in order queue.
425 for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
426 {
427 cl_bool blocking = (j + 1 < gMaxVectorSizeIndex) ? CL_FALSE : CL_TRUE;
428 out[j] = (cl_uint *)clEnqueueMapBuffer(
429 tinfo->tQueue, tinfo->outBuf[j], blocking, CL_MAP_READ, 0,
430 buffer_size, 0, NULL, NULL, &error);
431 if (error || NULL == out[j])
432 {
433 vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
434 error);
435 return error;
436 }
437 }
438
439 // Verify data
440 t = (cl_uint *)r;
441 for (size_t j = 0; j < buffer_elements; j++)
442 {
443 for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
444 {
445 cl_uint *q = out[k];
446
447 // If we aren't getting the correctly rounded result
448 if (t[j] != q[j])
449 {
450 float test = ((float *)q)[j];
451 double correct = func.f_ff(s[j], s2[j]);
452
453 // Per section 10 paragraph 6, accept any result if an input or
454 // output is a infinity or NaN or overflow
455 if (!gInfNanSupport)
456 {
457 // Note: no double rounding here. Reference functions
458 // calculate in single precision.
459 if (overflow[j] || IsFloatInfinity(correct)
460 || IsFloatNaN(correct) || IsFloatInfinity(s2[j])
461 || IsFloatNaN(s2[j]) || IsFloatInfinity(s[j])
462 || IsFloatNaN(s[j]))
463 continue;
464 }
465
466 // Per section 10 paragraph 6, accept embedded devices always
467 // returning positive 0.0.
468 if (gIsEmbedded && (t[j] == 0x80000000) && (q[j] == 0x00000000))
469 continue;
470
471 float err = Ulp_Error(test, correct);
472 float errB = Ulp_Error(test, (float)correct);
473
474 int fail =
475 ((!(fabsf(err) <= ulps)) && (!(fabsf(errB) <= ulps)));
476 if (fabsf(errB) < fabsf(err)) err = errB;
477
478 if (fail && (ftz || relaxedMode))
479 {
480 // retry per section 6.5.3.2
481 if (IsFloatResultSubnormal(correct, ulps))
482 {
483 fail = fail && (test != 0.0f);
484 if (!fail) err = 0.0f;
485 }
486
487 // retry per section 6.5.3.3
488 if (IsFloatSubnormal(s[j]))
489 {
490 double correct2, correct3;
491 float err2, err3;
492
493 if (!gInfNanSupport) feclearexcept(FE_OVERFLOW);
494
495 correct2 = func.f_ff(0.0, s2[j]);
496 correct3 = func.f_ff(-0.0, s2[j]);
497
498 // Per section 10 paragraph 6, accept any result if an
499 // input or output is a infinity or NaN or overflow
500 if (!gInfNanSupport)
501 {
502 if (fetestexcept(FE_OVERFLOW)) continue;
503
504 // Note: no double rounding here. Reference
505 // functions calculate in single precision.
506 if (IsFloatInfinity(correct2)
507 || IsFloatNaN(correct2)
508 || IsFloatInfinity(correct3)
509 || IsFloatNaN(correct3))
510 continue;
511 }
512
513 err2 = Ulp_Error(test, correct2);
514 err3 = Ulp_Error(test, correct3);
515 fail = fail
516 && ((!(fabsf(err2) <= ulps))
517 && (!(fabsf(err3) <= ulps)));
518 if (fabsf(err2) < fabsf(err)) err = err2;
519 if (fabsf(err3) < fabsf(err)) err = err3;
520
521 // retry per section 6.5.3.4
522 if (IsFloatResultSubnormal(correct2, ulps)
523 || IsFloatResultSubnormal(correct3, ulps))
524 {
525 fail = fail && (test != 0.0f);
526 if (!fail) err = 0.0f;
527 }
528
529 // try with both args as zero
530 if (IsFloatSubnormal(s2[j]))
531 {
532 double correct4, correct5;
533 float err4, err5;
534
535 if (!gInfNanSupport) feclearexcept(FE_OVERFLOW);
536
537 correct2 = func.f_ff(0.0, 0.0);
538 correct3 = func.f_ff(-0.0, 0.0);
539 correct4 = func.f_ff(0.0, -0.0);
540 correct5 = func.f_ff(-0.0, -0.0);
541
542 // Per section 10 paragraph 6, accept any result if
543 // an input or output is a infinity or NaN or
544 // overflow
545 if (!gInfNanSupport)
546 {
547 if (fetestexcept(FE_OVERFLOW)) continue;
548
549 // Note: no double rounding here. Reference
550 // functions calculate in single precision.
551 if (IsFloatInfinity(correct2)
552 || IsFloatNaN(correct2)
553 || IsFloatInfinity(correct3)
554 || IsFloatNaN(correct3)
555 || IsFloatInfinity(correct4)
556 || IsFloatNaN(correct4)
557 || IsFloatInfinity(correct5)
558 || IsFloatNaN(correct5))
559 continue;
560 }
561
562 err2 = Ulp_Error(test, correct2);
563 err3 = Ulp_Error(test, correct3);
564 err4 = Ulp_Error(test, correct4);
565 err5 = Ulp_Error(test, correct5);
566 fail = fail
567 && ((!(fabsf(err2) <= ulps))
568 && (!(fabsf(err3) <= ulps))
569 && (!(fabsf(err4) <= ulps))
570 && (!(fabsf(err5) <= ulps)));
571 if (fabsf(err2) < fabsf(err)) err = err2;
572 if (fabsf(err3) < fabsf(err)) err = err3;
573 if (fabsf(err4) < fabsf(err)) err = err4;
574 if (fabsf(err5) < fabsf(err)) err = err5;
575
576 // retry per section 6.5.3.4
577 if (IsFloatResultSubnormal(correct2, ulps)
578 || IsFloatResultSubnormal(correct3, ulps)
579 || IsFloatResultSubnormal(correct4, ulps)
580 || IsFloatResultSubnormal(correct5, ulps))
581 {
582 fail = fail && (test != 0.0f);
583 if (!fail) err = 0.0f;
584 }
585 }
586 }
587 else if (IsFloatSubnormal(s2[j]))
588 {
589 double correct2, correct3;
590 float err2, err3;
591
592 if (!gInfNanSupport) feclearexcept(FE_OVERFLOW);
593
594 correct2 = func.f_ff(s[j], 0.0);
595 correct3 = func.f_ff(s[j], -0.0);
596
597 // Per section 10 paragraph 6, accept any result if an
598 // input or output is a infinity or NaN or overflow
599 if (!gInfNanSupport)
600 {
601 // Note: no double rounding here. Reference
602 // functions calculate in single precision.
603 if (overflow[j] || IsFloatInfinity(correct)
604 || IsFloatNaN(correct)
605 || IsFloatInfinity(correct2)
606 || IsFloatNaN(correct2))
607 continue;
608 }
609
610 err2 = Ulp_Error(test, correct2);
611 err3 = Ulp_Error(test, correct3);
612 fail = fail
613 && ((!(fabsf(err2) <= ulps))
614 && (!(fabsf(err3) <= ulps)));
615 if (fabsf(err2) < fabsf(err)) err = err2;
616 if (fabsf(err3) < fabsf(err)) err = err3;
617
618 // retry per section 6.5.3.4
619 if (IsFloatResultSubnormal(correct2, ulps)
620 || IsFloatResultSubnormal(correct3, ulps))
621 {
622 fail = fail && (test != 0.0f);
623 if (!fail) err = 0.0f;
624 }
625 }
626 }
627
628
629 if (fabsf(err) > tinfo->maxError)
630 {
631 tinfo->maxError = fabsf(err);
632 tinfo->maxErrorValue = s[j];
633 tinfo->maxErrorValue2 = s2[j];
634 }
635 if (fail)
636 {
637 vlog_error("\nERROR: %s%s: %f ulp error at {%a, %a}: *%a "
638 "vs. %a (0x%8.8x) at index: %zu\n",
639 name, sizeNames[k], err, s[j], s2[j], r[j], test,
640 ((cl_uint *)&test)[0], j);
641 return -1;
642 }
643 }
644 }
645 }
646
647 for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
648 {
649 if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
650 out[j], 0, NULL, NULL)))
651 {
652 vlog_error("Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n",
653 j, error);
654 return error;
655 }
656 }
657
658 if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 3 failed\n");
659
660
661 if (0 == (base & 0x0fffffff))
662 {
663 if (gVerboseBruteForce)
664 {
665 vlog("base:%14u step:%10u scale:%10u buf_elements:%10zu ulps:%5.3f "
666 "ThreadCount:%2u\n",
667 base, job->step, job->scale, buffer_elements, job->ulps,
668 job->threadCount);
669 }
670 else
671 {
672 vlog(".");
673 }
674 fflush(stdout);
675 }
676
677 return CL_SUCCESS;
678 }
679
680 } // anonymous namespace
681
TestFunc_Float_Float_Float_Operator(const Func * f,MTdata d,bool relaxedMode)682 int TestFunc_Float_Float_Float_Operator(const Func *f, MTdata d,
683 bool relaxedMode)
684 {
685 TestInfo test_info{};
686 cl_int error;
687 float maxError = 0.0f;
688 double maxErrorVal = 0.0;
689 double maxErrorVal2 = 0.0;
690
691 logFunctionInfo(f->name, sizeof(cl_float), relaxedMode);
692
693 // Init test_info
694 test_info.threadCount = GetThreadCount();
695 test_info.subBufferSize = BUFFER_SIZE
696 / (sizeof(cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount));
697 test_info.scale = getTestScale(sizeof(cl_float));
698
699 test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
700 if (test_info.step / test_info.subBufferSize != test_info.scale)
701 {
702 // there was overflow
703 test_info.jobCount = 1;
704 }
705 else
706 {
707 test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step);
708 }
709
710 test_info.f = f;
711 test_info.ulps = gIsEmbedded ? f->float_embedded_ulps : f->float_ulps;
712 test_info.ftz =
713 f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities);
714 test_info.relaxedMode = relaxedMode;
715
716 test_info.tinfo.resize(test_info.threadCount);
717 for (cl_uint i = 0; i < test_info.threadCount; i++)
718 {
719 cl_buffer_region region = {
720 i * test_info.subBufferSize * sizeof(cl_float),
721 test_info.subBufferSize * sizeof(cl_float)
722 };
723 test_info.tinfo[i].inBuf =
724 clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
725 CL_BUFFER_CREATE_TYPE_REGION, ®ion, &error);
726 if (error || NULL == test_info.tinfo[i].inBuf)
727 {
728 vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
729 "region {%zd, %zd}\n",
730 region.origin, region.size);
731 return error;
732 }
733 test_info.tinfo[i].inBuf2 =
734 clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY,
735 CL_BUFFER_CREATE_TYPE_REGION, ®ion, &error);
736 if (error || NULL == test_info.tinfo[i].inBuf2)
737 {
738 vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for "
739 "region {%zd, %zd}\n",
740 region.origin, region.size);
741 return error;
742 }
743
744 for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
745 {
746 test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
747 gOutBuffer[j], CL_MEM_READ_WRITE, CL_BUFFER_CREATE_TYPE_REGION,
748 ®ion, &error);
749 if (error || NULL == test_info.tinfo[i].outBuf[j])
750 {
751 vlog_error("Error: Unable to create sub-buffer of "
752 "gOutBuffer[%d] for region {%zd, %zd}\n",
753 (int)j, region.origin, region.size);
754 return error;
755 }
756 }
757 test_info.tinfo[i].tQueue =
758 clCreateCommandQueue(gContext, gDevice, 0, &error);
759 if (NULL == test_info.tinfo[i].tQueue || error)
760 {
761 vlog_error("clCreateCommandQueue failed. (%d)\n", error);
762 return error;
763 }
764
765 test_info.tinfo[i].d = MTdataHolder(genrand_int32(d));
766 }
767
768 // Init the kernels
769 BuildKernelInfo build_info{ test_info.threadCount, test_info.k,
770 test_info.programs, f->nameInCode,
771 relaxedMode };
772 if ((error = ThreadPool_Do(BuildKernelFn,
773 gMaxVectorSizeIndex - gMinVectorSizeIndex,
774 &build_info)))
775 return error;
776
777 // Run the kernels
778 if (!gSkipCorrectnessTesting)
779 {
780 error = ThreadPool_Do(Test, test_info.jobCount, &test_info);
781 if (error) return error;
782
783 // Accumulate the arithmetic errors
784 for (cl_uint i = 0; i < test_info.threadCount; i++)
785 {
786 if (test_info.tinfo[i].maxError > maxError)
787 {
788 maxError = test_info.tinfo[i].maxError;
789 maxErrorVal = test_info.tinfo[i].maxErrorValue;
790 maxErrorVal2 = test_info.tinfo[i].maxErrorValue2;
791 }
792 }
793
794 if (gWimpyMode)
795 vlog("Wimp pass");
796 else
797 vlog("passed");
798
799 vlog("\t%8.2f @ {%a, %a}", maxError, maxErrorVal, maxErrorVal2);
800 }
801
802 vlog("\n");
803
804 return CL_SUCCESS;
805 }
806