1 //
2 // Copyright (c) 2017 The Khronos Group Inc.
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 // http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 //
16
17 #include "common.h"
18 #include "function_list.h"
19 #include "test_functions.h"
20 #include "utility.h"
21
22 #include <cstring>
23
24 namespace {
25
26 const float twoToMinus126 = MAKE_HEX_FLOAT(0x1p-126f, 1, -126);
27
BuildKernelFn(cl_uint job_id,cl_uint thread_id UNUSED,void * p)28 cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
29 {
30 BuildKernelInfo &info = *(BuildKernelInfo *)p;
31 auto generator = [](const std::string &kernel_name, const char *builtin,
32 cl_uint vector_size_index) {
33 return GetBinaryKernel(kernel_name, builtin, ParameterType::Float,
34 ParameterType::Float, ParameterType::Float,
35 vector_size_index);
36 };
37 return BuildKernels(info, job_id, generator);
38 }
39
40 // Thread specific data for a worker thread
41 struct ThreadInfo
42 {
43 // Input and output buffers for the thread
44 clMemWrapper inBuf;
45 clMemWrapper inBuf2;
46 Buffers outBuf;
47
48 float maxError; // max error value. Init to 0.
49 double
50 maxErrorValue; // position of the max error value (param 1). Init to 0.
51 double maxErrorValue2; // position of the max error value (param 2). Init
52 // to 0.
53 MTdataHolder d;
54
55 // Per thread command queue to improve performance
56 clCommandQueueWrapper tQueue;
57 };
58
59 struct TestInfo
60 {
61 size_t subBufferSize; // Size of the sub-buffer in elements
62 const Func *f; // A pointer to the function info
63
64 // Programs for various vector sizes.
65 Programs programs;
66
67 // Thread-specific kernels for each vector size:
68 // k[vector_size][thread_id]
69 KernelMatrix k;
70
71 // Array of thread specific information
72 std::vector<ThreadInfo> tinfo;
73
74 cl_uint threadCount; // Number of worker threads
75 cl_uint jobCount; // Number of jobs
76 cl_uint step; // step between each chunk and the next.
77 cl_uint scale; // stride between individual test values
78 float ulps; // max_allowed ulps
79 int ftz; // non-zero if running in flush to zero mode
80
81 int isFDim;
82 int skipNanInf;
83 int isNextafter;
84 bool relaxedMode; // True if test is running in relaxed mode, false
85 // otherwise.
86 };
87
88 // A table of more difficult cases to get right
89 const float specialValues[] = {
90 -NAN,
91 -INFINITY,
92 -FLT_MAX,
93 MAKE_HEX_FLOAT(-0x1.000002p64f, -0x1000002L, 40),
94 MAKE_HEX_FLOAT(-0x1.0p64f, -0x1L, 64),
95 MAKE_HEX_FLOAT(-0x1.fffffep63f, -0x1fffffeL, 39),
96 MAKE_HEX_FLOAT(-0x1.000002p63f, -0x1000002L, 39),
97 MAKE_HEX_FLOAT(-0x1.0p63f, -0x1L, 63),
98 MAKE_HEX_FLOAT(-0x1.fffffep62f, -0x1fffffeL, 38),
99 MAKE_HEX_FLOAT(-0x1.000002p32f, -0x1000002L, 8),
100 MAKE_HEX_FLOAT(-0x1.0p32f, -0x1L, 32),
101 MAKE_HEX_FLOAT(-0x1.fffffep31f, -0x1fffffeL, 7),
102 MAKE_HEX_FLOAT(-0x1.000002p31f, -0x1000002L, 7),
103 MAKE_HEX_FLOAT(-0x1.0p31f, -0x1L, 31),
104 MAKE_HEX_FLOAT(-0x1.fffffep30f, -0x1fffffeL, 6),
105 -1000.f,
106 -100.f,
107 -4.0f,
108 -3.5f,
109 -3.0f,
110 MAKE_HEX_FLOAT(-0x1.800002p1f, -0x1800002L, -23),
111 -2.5f,
112 MAKE_HEX_FLOAT(-0x1.7ffffep1f, -0x17ffffeL, -23),
113 -2.0f,
114 MAKE_HEX_FLOAT(-0x1.800002p0f, -0x1800002L, -24),
115 -1.5f,
116 MAKE_HEX_FLOAT(-0x1.7ffffep0f, -0x17ffffeL, -24),
117 MAKE_HEX_FLOAT(-0x1.000002p0f, -0x1000002L, -24),
118 -1.0f,
119 MAKE_HEX_FLOAT(-0x1.fffffep-1f, -0x1fffffeL, -25),
120 MAKE_HEX_FLOAT(-0x1.000002p-1f, -0x1000002L, -25),
121 -0.5f,
122 MAKE_HEX_FLOAT(-0x1.fffffep-2f, -0x1fffffeL, -26),
123 MAKE_HEX_FLOAT(-0x1.000002p-2f, -0x1000002L, -26),
124 -0.25f,
125 MAKE_HEX_FLOAT(-0x1.fffffep-3f, -0x1fffffeL, -27),
126 MAKE_HEX_FLOAT(-0x1.000002p-126f, -0x1000002L, -150),
127 -FLT_MIN,
128 MAKE_HEX_FLOAT(-0x0.fffffep-126f, -0x0fffffeL, -150),
129 MAKE_HEX_FLOAT(-0x0.000ffep-126f, -0x0000ffeL, -150),
130 MAKE_HEX_FLOAT(-0x0.0000fep-126f, -0x00000feL, -150),
131 MAKE_HEX_FLOAT(-0x0.00000ep-126f, -0x000000eL, -150),
132 MAKE_HEX_FLOAT(-0x0.00000cp-126f, -0x000000cL, -150),
133 MAKE_HEX_FLOAT(-0x0.00000ap-126f, -0x000000aL, -150),
134 MAKE_HEX_FLOAT(-0x0.000008p-126f, -0x0000008L, -150),
135 MAKE_HEX_FLOAT(-0x0.000006p-126f, -0x0000006L, -150),
136 MAKE_HEX_FLOAT(-0x0.000004p-126f, -0x0000004L, -150),
137 MAKE_HEX_FLOAT(-0x0.000002p-126f, -0x0000002L, -150),
138 -0.0f,
139
140 +NAN,
141 +INFINITY,
142 +FLT_MAX,
143 MAKE_HEX_FLOAT(+0x1.000002p64f, +0x1000002L, 40),
144 MAKE_HEX_FLOAT(+0x1.0p64f, +0x1L, 64),
145 MAKE_HEX_FLOAT(+0x1.fffffep63f, +0x1fffffeL, 39),
146 MAKE_HEX_FLOAT(+0x1.000002p63f, +0x1000002L, 39),
147 MAKE_HEX_FLOAT(+0x1.0p63f, +0x1L, 63),
148 MAKE_HEX_FLOAT(+0x1.fffffep62f, +0x1fffffeL, 38),
149 MAKE_HEX_FLOAT(+0x1.000002p32f, +0x1000002L, 8),
150 MAKE_HEX_FLOAT(+0x1.0p32f, +0x1L, 32),
151 MAKE_HEX_FLOAT(+0x1.fffffep31f, +0x1fffffeL, 7),
152 MAKE_HEX_FLOAT(+0x1.000002p31f, +0x1000002L, 7),
153 MAKE_HEX_FLOAT(+0x1.0p31f, +0x1L, 31),
154 MAKE_HEX_FLOAT(+0x1.fffffep30f, +0x1fffffeL, 6),
155 +1000.f,
156 +100.f,
157 +4.0f,
158 +3.5f,
159 +3.0f,
160 MAKE_HEX_FLOAT(+0x1.800002p1f, +0x1800002L, -23),
161 2.5f,
162 MAKE_HEX_FLOAT(+0x1.7ffffep1f, +0x17ffffeL, -23),
163 +2.0f,
164 MAKE_HEX_FLOAT(+0x1.800002p0f, +0x1800002L, -24),
165 1.5f,
166 MAKE_HEX_FLOAT(+0x1.7ffffep0f, +0x17ffffeL, -24),
167 MAKE_HEX_FLOAT(+0x1.000002p0f, +0x1000002L, -24),
168 +1.0f,
169 MAKE_HEX_FLOAT(+0x1.fffffep-1f, +0x1fffffeL, -25),
170 MAKE_HEX_FLOAT(+0x1.000002p-1f, +0x1000002L, -25),
171 +0.5f,
172 MAKE_HEX_FLOAT(+0x1.fffffep-2f, +0x1fffffeL, -26),
173 MAKE_HEX_FLOAT(+0x1.000002p-2f, +0x1000002L, -26),
174 +0.25f,
175 MAKE_HEX_FLOAT(+0x1.fffffep-3f, +0x1fffffeL, -27),
176 MAKE_HEX_FLOAT(0x1.000002p-126f, 0x1000002L, -150),
177 +FLT_MIN,
178 MAKE_HEX_FLOAT(+0x0.fffffep-126f, +0x0fffffeL, -150),
179 MAKE_HEX_FLOAT(+0x0.000ffep-126f, +0x0000ffeL, -150),
180 MAKE_HEX_FLOAT(+0x0.0000fep-126f, +0x00000feL, -150),
181 MAKE_HEX_FLOAT(+0x0.00000ep-126f, +0x000000eL, -150),
182 MAKE_HEX_FLOAT(+0x0.00000cp-126f, +0x000000cL, -150),
183 MAKE_HEX_FLOAT(+0x0.00000ap-126f, +0x000000aL, -150),
184 MAKE_HEX_FLOAT(+0x0.000008p-126f, +0x0000008L, -150),
185 MAKE_HEX_FLOAT(+0x0.000006p-126f, +0x0000006L, -150),
186 MAKE_HEX_FLOAT(+0x0.000004p-126f, +0x0000004L, -150),
187 MAKE_HEX_FLOAT(+0x0.000002p-126f, +0x0000002L, -150),
188 +0.0f,
189 };
190
191 constexpr size_t specialValuesCount =
192 sizeof(specialValues) / sizeof(specialValues[0]);
193
Test(cl_uint job_id,cl_uint thread_id,void * data)194 cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
195 {
196 TestInfo *job = (TestInfo *)data;
197 size_t buffer_elements = job->subBufferSize;
198 size_t buffer_size = buffer_elements * sizeof(cl_float);
199 cl_uint base = job_id * (cl_uint)job->step;
200 ThreadInfo *tinfo = &(job->tinfo[thread_id]);
201 fptr func = job->f->func;
202 int ftz = job->ftz;
203 bool relaxedMode = job->relaxedMode;
204 float ulps = getAllowedUlpError(job->f, relaxedMode);
205 MTdata d = tinfo->d;
206 cl_int error;
207 std::vector<bool> overflow(buffer_elements, false);
208 const char *name = job->f->name;
209 int isFDim = job->isFDim;
210 int skipNanInf = job->skipNanInf;
211 int isNextafter = job->isNextafter;
212 cl_uint *t = 0;
213 cl_float *r = 0;
214 cl_float *s = 0;
215 cl_float *s2 = 0;
216 cl_int copysign_test = 0;
217 RoundingMode oldRoundMode;
218 int skipVerification = 0;
219
220 if (relaxedMode)
221 {
222 func = job->f->rfunc;
223 if (strcmp(name, "pow") == 0 && gFastRelaxedDerived)
224 {
225 ulps = INFINITY;
226 skipVerification = 1;
227 }
228 }
229
230 cl_event e[VECTOR_SIZE_COUNT];
231 cl_uint *out[VECTOR_SIZE_COUNT];
232 if (gHostFill)
233 {
234 // start the map of the output arrays
235 for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
236 {
237 out[j] = (cl_uint *)clEnqueueMapBuffer(
238 tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0,
239 buffer_size, 0, NULL, e + j, &error);
240 if (error || NULL == out[j])
241 {
242 vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
243 error);
244 return error;
245 }
246 }
247
248 // Get that moving
249 if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n");
250 }
251
252 // Init input array
253 cl_uint *p = (cl_uint *)gIn + thread_id * buffer_elements;
254 cl_uint *p2 = (cl_uint *)gIn2 + thread_id * buffer_elements;
255 cl_uint idx = 0;
256 int totalSpecialValueCount = specialValuesCount * specialValuesCount;
257 int lastSpecialJobIndex = (totalSpecialValueCount - 1) / buffer_elements;
258
259 // Test edge cases
260 if (job_id <= (cl_uint)lastSpecialJobIndex)
261 {
262 float *fp = (float *)p;
263 float *fp2 = (float *)p2;
264 uint32_t x, y;
265
266 x = (job_id * buffer_elements) % specialValuesCount;
267 y = (job_id * buffer_elements) / specialValuesCount;
268
269 for (; idx < buffer_elements; idx++)
270 {
271 fp[idx] = specialValues[x];
272 fp2[idx] = specialValues[y];
273 ++x;
274 if (x >= specialValuesCount)
275 {
276 x = 0;
277 y++;
278 if (y >= specialValuesCount) break;
279 }
280 }
281 }
282
283 // Init any remaining values
284 for (; idx < buffer_elements; idx++)
285 {
286 p[idx] = genrand_int32(d);
287 p2[idx] = genrand_int32(d);
288 }
289
290 if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0,
291 buffer_size, p, 0, NULL, NULL)))
292 {
293 vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
294 return error;
295 }
296
297 if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf2, CL_FALSE, 0,
298 buffer_size, p2, 0, NULL, NULL)))
299 {
300 vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
301 return error;
302 }
303
304 for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
305 {
306 if (gHostFill)
307 {
308 // Wait for the map to finish
309 if ((error = clWaitForEvents(1, e + j)))
310 {
311 vlog_error("Error: clWaitForEvents failed! err: %d\n", error);
312 return error;
313 }
314 if ((error = clReleaseEvent(e[j])))
315 {
316 vlog_error("Error: clReleaseEvent failed! err: %d\n", error);
317 return error;
318 }
319 }
320
321 // Fill the result buffer with garbage, so that old results don't carry
322 // over
323 uint32_t pattern = 0xffffdead;
324 if (gHostFill)
325 {
326 memset_pattern4(out[j], &pattern, buffer_size);
327 if ((error = clEnqueueUnmapMemObject(
328 tinfo->tQueue, tinfo->outBuf[j], out[j], 0, NULL, NULL)))
329 {
330 vlog_error("Error: clEnqueueUnmapMemObject failed! err: %d\n",
331 error);
332 return error;
333 }
334 }
335 else
336 {
337 if ((error = clEnqueueFillBuffer(tinfo->tQueue, tinfo->outBuf[j],
338 &pattern, sizeof(pattern), 0,
339 buffer_size, 0, NULL, NULL)))
340 {
341 vlog_error("Error: clEnqueueFillBuffer failed! err: %d\n",
342 error);
343 return error;
344 }
345 }
346
347 // Run the kernel
348 size_t vectorCount =
349 (buffer_elements + sizeValues[j] - 1) / sizeValues[j];
350 cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its
351 // own copy of the cl_kernel
352 cl_program program = job->programs[j];
353
354 if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]),
355 &tinfo->outBuf[j])))
356 {
357 LogBuildError(program);
358 return error;
359 }
360 if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf),
361 &tinfo->inBuf)))
362 {
363 LogBuildError(program);
364 return error;
365 }
366 if ((error = clSetKernelArg(kernel, 2, sizeof(tinfo->inBuf2),
367 &tinfo->inBuf2)))
368 {
369 LogBuildError(program);
370 return error;
371 }
372
373 if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL,
374 &vectorCount, NULL, 0, NULL, NULL)))
375 {
376 vlog_error("FAILED -- could not execute kernel\n");
377 return error;
378 }
379 }
380
381 // Get that moving
382 if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 2 failed\n");
383
384 if (gSkipCorrectnessTesting)
385 {
386 if ((error = clFinish(tinfo->tQueue)))
387 {
388 vlog_error("Error: clFinish failed! err: %d\n", error);
389 return error;
390 }
391 return CL_SUCCESS;
392 }
393
394 FPU_mode_type oldMode;
395 oldRoundMode = kRoundToNearestEven;
396 if (isFDim)
397 {
398 // Calculate the correctly rounded reference result
399 memset(&oldMode, 0, sizeof(oldMode));
400 if (ftz || relaxedMode) ForceFTZ(&oldMode);
401
402 // Set the rounding mode to match the device
403 if (gIsInRTZMode) oldRoundMode = set_round(kRoundTowardZero, kfloat);
404 }
405
406 if (!strcmp(name, "copysign")) copysign_test = 1;
407
408 #define ref_func(s, s2) (copysign_test ? func.f_ff_f(s, s2) : func.f_ff(s, s2))
409
410 // Calculate the correctly rounded reference result
411 r = (float *)gOut_Ref + thread_id * buffer_elements;
412 s = (float *)gIn + thread_id * buffer_elements;
413 s2 = (float *)gIn2 + thread_id * buffer_elements;
414 if (skipNanInf)
415 {
416 for (size_t j = 0; j < buffer_elements; j++)
417 {
418 feclearexcept(FE_OVERFLOW);
419 r[j] = (float)ref_func(s[j], s2[j]);
420 overflow[j] =
421 FE_OVERFLOW == (FE_OVERFLOW & fetestexcept(FE_OVERFLOW));
422 }
423 }
424 else
425 {
426 for (size_t j = 0; j < buffer_elements; j++)
427 r[j] = (float)ref_func(s[j], s2[j]);
428 }
429
430 if (isFDim && ftz) RestoreFPState(&oldMode);
431
432 // Read the data back -- no need to wait for the first N-1 buffers but wait
433 // for the last buffer. This is an in order queue.
434 for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
435 {
436 cl_bool blocking = (j + 1 < gMaxVectorSizeIndex) ? CL_FALSE : CL_TRUE;
437 out[j] = (cl_uint *)clEnqueueMapBuffer(
438 tinfo->tQueue, tinfo->outBuf[j], blocking, CL_MAP_READ, 0,
439 buffer_size, 0, NULL, NULL, &error);
440 if (error || NULL == out[j])
441 {
442 vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
443 error);
444 return error;
445 }
446 }
447
448 if (!skipVerification)
449 {
450 // Verify data
451 t = (cl_uint *)r;
452 for (size_t j = 0; j < buffer_elements; j++)
453 {
454 for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
455 {
456 cl_uint *q = out[k];
457
458 // If we aren't getting the correctly rounded result
459 if (t[j] != q[j])
460 {
461 float test = ((float *)q)[j];
462 double correct = ref_func(s[j], s2[j]);
463
464 // Per section 10 paragraph 6, accept any result if an input
465 // or output is a infinity or NaN or overflow As per
466 // OpenCL 2.0 spec, section 5.8.4.3, enabling
467 // fast-relaxed-math mode also enables -cl-finite-math-only
468 // optimization. This optimization allows to assume that
469 // arguments and results are not NaNs or +/-INFs. Hence,
470 // accept any result if inputs or results are NaNs or INFs.
471 if (relaxedMode || skipNanInf)
472 {
473 if (skipNanInf && overflow[j]) continue;
474 // Note: no double rounding here. Reference functions
475 // calculate in single precision.
476 if (IsFloatInfinity(correct) || IsFloatNaN(correct)
477 || IsFloatInfinity(s2[j]) || IsFloatNaN(s2[j])
478 || IsFloatInfinity(s[j]) || IsFloatNaN(s[j]))
479 continue;
480 }
481
482 float err = Ulp_Error(test, correct);
483 int fail = !(fabsf(err) <= ulps);
484
485 if (fail && (ftz || relaxedMode))
486 {
487 // retry per section 6.5.3.2
488 if (IsFloatResultSubnormal(correct, ulps))
489 {
490 fail = fail && (test != 0.0f);
491 if (!fail) err = 0.0f;
492 }
493
494 // nextafter on FTZ platforms may return the smallest
495 // normal float (2^-126) given a denormal or a zero
496 // as the first argument. The rationale here is that
497 // nextafter flushes the argument to zero and then
498 // returns the next representable number in the
499 // direction of the second argument, and since
500 // denorms are considered as zero, the smallest
501 // normal number is the next representable number.
502 // In which case, it should have the same sign as the
503 // second argument.
504 if (isNextafter)
505 {
506 if (IsFloatSubnormal(s[j]) || s[j] == 0.0f)
507 {
508 float value = copysignf(twoToMinus126, s2[j]);
509 fail = fail && (test != value);
510 if (!fail) err = 0.0f;
511 }
512 }
513 else
514 {
515 // retry per section 6.5.3.3
516 if (IsFloatSubnormal(s[j]))
517 {
518 double correct2, correct3;
519 float err2, err3;
520
521 if (skipNanInf) feclearexcept(FE_OVERFLOW);
522
523 correct2 = ref_func(0.0, s2[j]);
524 correct3 = ref_func(-0.0, s2[j]);
525
526 // Per section 10 paragraph 6, accept any result
527 // if an input or output is a infinity or NaN or
528 // overflow As per OpenCL 2.0 spec,
529 // section 5.8.4.3, enabling fast-relaxed-math
530 // mode also enables -cl-finite-math-only
531 // optimization. This optimization allows to
532 // assume that arguments and results are not
533 // NaNs or +/-INFs. Hence, accept any result if
534 // inputs or results are NaNs or INFs.
535 if (relaxedMode || skipNanInf)
536 {
537 if (fetestexcept(FE_OVERFLOW) && skipNanInf)
538 continue;
539
540 // Note: no double rounding here. Reference
541 // functions calculate in single precision.
542 if (IsFloatInfinity(correct2)
543 || IsFloatNaN(correct2)
544 || IsFloatInfinity(correct3)
545 || IsFloatNaN(correct3))
546 continue;
547 }
548
549 err2 = Ulp_Error(test, correct2);
550 err3 = Ulp_Error(test, correct3);
551 fail = fail
552 && ((!(fabsf(err2) <= ulps))
553 && (!(fabsf(err3) <= ulps)));
554 if (fabsf(err2) < fabsf(err)) err = err2;
555 if (fabsf(err3) < fabsf(err)) err = err3;
556
557 // retry per section 6.5.3.4
558 if (IsFloatResultSubnormal(correct2, ulps)
559 || IsFloatResultSubnormal(correct3, ulps))
560 {
561 fail = fail && (test != 0.0f);
562 if (!fail) err = 0.0f;
563 }
564
565 // try with both args as zero
566 if (IsFloatSubnormal(s2[j]))
567 {
568 double correct4, correct5;
569 float err4, err5;
570
571 if (skipNanInf) feclearexcept(FE_OVERFLOW);
572
573 correct2 = ref_func(0.0, 0.0);
574 correct3 = ref_func(-0.0, 0.0);
575 correct4 = ref_func(0.0, -0.0);
576 correct5 = ref_func(-0.0, -0.0);
577
578 // Per section 10 paragraph 6, accept any
579 // result if an input or output is a
580 // infinity or NaN or overflow As per
581 // OpenCL 2.0 spec, section 5.8.4.3,
582 // enabling fast-relaxed-math mode also
583 // enables -cl-finite-math-only
584 // optimization. This optimization allows to
585 // assume that arguments and results are not
586 // NaNs or +/-INFs. Hence, accept any result
587 // if inputs or results are NaNs or INFs.
588 if (relaxedMode || skipNanInf)
589 {
590 if (fetestexcept(FE_OVERFLOW)
591 && skipNanInf)
592 continue;
593
594 // Note: no double rounding here.
595 // Reference functions calculate in
596 // single precision.
597 if (IsFloatInfinity(correct2)
598 || IsFloatNaN(correct2)
599 || IsFloatInfinity(correct3)
600 || IsFloatNaN(correct3)
601 || IsFloatInfinity(correct4)
602 || IsFloatNaN(correct4)
603 || IsFloatInfinity(correct5)
604 || IsFloatNaN(correct5))
605 continue;
606 }
607
608 err2 = Ulp_Error(test, correct2);
609 err3 = Ulp_Error(test, correct3);
610 err4 = Ulp_Error(test, correct4);
611 err5 = Ulp_Error(test, correct5);
612 fail = fail
613 && ((!(fabsf(err2) <= ulps))
614 && (!(fabsf(err3) <= ulps))
615 && (!(fabsf(err4) <= ulps))
616 && (!(fabsf(err5) <= ulps)));
617 if (fabsf(err2) < fabsf(err)) err = err2;
618 if (fabsf(err3) < fabsf(err)) err = err3;
619 if (fabsf(err4) < fabsf(err)) err = err4;
620 if (fabsf(err5) < fabsf(err)) err = err5;
621
622 // retry per section 6.5.3.4
623 if (IsFloatResultSubnormal(correct2, ulps)
624 || IsFloatResultSubnormal(correct3,
625 ulps)
626 || IsFloatResultSubnormal(correct4,
627 ulps)
628 || IsFloatResultSubnormal(correct5,
629 ulps))
630 {
631 fail = fail && (test != 0.0f);
632 if (!fail) err = 0.0f;
633 }
634 }
635 }
636 else if (IsFloatSubnormal(s2[j]))
637 {
638 double correct2, correct3;
639 float err2, err3;
640
641 if (skipNanInf) feclearexcept(FE_OVERFLOW);
642
643 correct2 = ref_func(s[j], 0.0);
644 correct3 = ref_func(s[j], -0.0);
645
646 // Per section 10 paragraph 6, accept any result
647 // if an input or output is a infinity or NaN or
648 // overflow As per OpenCL 2.0 spec,
649 // section 5.8.4.3, enabling fast-relaxed-math
650 // mode also enables -cl-finite-math-only
651 // optimization. This optimization allows to
652 // assume that arguments and results are not
653 // NaNs or +/-INFs. Hence, accept any result if
654 // inputs or results are NaNs or INFs.
655 if (relaxedMode || skipNanInf)
656 {
657 // Note: no double rounding here. Reference
658 // functions calculate in single precision.
659 if (overflow[j] && skipNanInf) continue;
660
661 if (IsFloatInfinity(correct2)
662 || IsFloatNaN(correct2)
663 || IsFloatInfinity(correct3)
664 || IsFloatNaN(correct3))
665 continue;
666 }
667
668 err2 = Ulp_Error(test, correct2);
669 err3 = Ulp_Error(test, correct3);
670 fail = fail
671 && ((!(fabsf(err2) <= ulps))
672 && (!(fabsf(err3) <= ulps)));
673 if (fabsf(err2) < fabsf(err)) err = err2;
674 if (fabsf(err3) < fabsf(err)) err = err3;
675
676 // retry per section 6.5.3.4
677 if (IsFloatResultSubnormal(correct2, ulps)
678 || IsFloatResultSubnormal(correct3, ulps))
679 {
680 fail = fail && (test != 0.0f);
681 if (!fail) err = 0.0f;
682 }
683 }
684 }
685 }
686
687 if (fabsf(err) > tinfo->maxError)
688 {
689 tinfo->maxError = fabsf(err);
690 tinfo->maxErrorValue = s[j];
691 tinfo->maxErrorValue2 = s2[j];
692 }
693 if (fail)
694 {
695 vlog_error(
696 "\nERROR: %s%s: %f ulp error at {%a (0x%x), %a "
697 "(0x%x)}: *%a vs. %a (0x%8.8x) at index: %zu\n",
698 name, sizeNames[k], err, s[j], ((cl_uint *)s)[j],
699 s2[j], ((cl_uint *)s2)[j], r[j], test,
700 ((cl_uint *)&test)[0], j);
701 return -1;
702 }
703 }
704 }
705 }
706 }
707
708 if (isFDim && gIsInRTZMode) (void)set_round(oldRoundMode, kfloat);
709
710 for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
711 {
712 if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
713 out[j], 0, NULL, NULL)))
714 {
715 vlog_error("Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n",
716 j, error);
717 return error;
718 }
719 }
720
721 if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 3 failed\n");
722
723
724 if (0 == (base & 0x0fffffff))
725 {
726 if (gVerboseBruteForce)
727 {
728 vlog("base:%14u step:%10u scale:%10u buf_elements:%10zu ulps:%5.3f "
729 "ThreadCount:%2u\n",
730 base, job->step, job->scale, buffer_elements, job->ulps,
731 job->threadCount);
732 }
733 else
734 {
735 vlog(".");
736 }
737 fflush(stdout);
738 }
739
740 return CL_SUCCESS;
741 }
742
743 } // anonymous namespace
744
TestFunc_Float_Float_Float(const Func * f,MTdata d,bool relaxedMode)745 int TestFunc_Float_Float_Float(const Func *f, MTdata d, bool relaxedMode)
746 {
747 TestInfo test_info{};
748 cl_int error;
749 float maxError = 0.0f;
750 double maxErrorVal = 0.0;
751 double maxErrorVal2 = 0.0;
752
753 logFunctionInfo(f->name, sizeof(cl_float), relaxedMode);
754
755 // Init test_info
756 test_info.threadCount = GetThreadCount();
757 test_info.subBufferSize = BUFFER_SIZE
758 / (sizeof(cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount));
759 test_info.scale = getTestScale(sizeof(cl_float));
760
761 test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
762 if (test_info.step / test_info.subBufferSize != test_info.scale)
763 {
764 // there was overflow
765 test_info.jobCount = 1;
766 }
767 else
768 {
769 test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step);
770 }
771
772 test_info.f = f;
773 test_info.ulps = gIsEmbedded ? f->float_embedded_ulps : f->float_ulps;
774 test_info.ftz =
775 f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities);
776 test_info.relaxedMode = relaxedMode;
777 test_info.isFDim = 0 == strcmp("fdim", f->nameInCode);
778 test_info.skipNanInf = test_info.isFDim && !gInfNanSupport;
779 test_info.isNextafter = 0 == strcmp("nextafter", f->nameInCode);
780
781 test_info.tinfo.resize(test_info.threadCount);
782 for (cl_uint i = 0; i < test_info.threadCount; i++)
783 {
784 cl_buffer_region region = {
785 i * test_info.subBufferSize * sizeof(cl_float),
786 test_info.subBufferSize * sizeof(cl_float)
787 };
788 test_info.tinfo[i].inBuf =
789 clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
790 CL_BUFFER_CREATE_TYPE_REGION, ®ion, &error);
791 if (error || NULL == test_info.tinfo[i].inBuf)
792 {
793 vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
794 "region {%zd, %zd}\n",
795 region.origin, region.size);
796 return error;
797 }
798 test_info.tinfo[i].inBuf2 =
799 clCreateSubBuffer(gInBuffer2, CL_MEM_READ_ONLY,
800 CL_BUFFER_CREATE_TYPE_REGION, ®ion, &error);
801 if (error || NULL == test_info.tinfo[i].inBuf2)
802 {
803 vlog_error("Error: Unable to create sub-buffer of gInBuffer2 for "
804 "region {%zd, %zd}\n",
805 region.origin, region.size);
806 return error;
807 }
808
809 for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
810 {
811 test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
812 gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
813 ®ion, &error);
814 if (error || NULL == test_info.tinfo[i].outBuf[j])
815 {
816 vlog_error("Error: Unable to create sub-buffer of "
817 "gOutBuffer[%d] for region {%zd, %zd}\n",
818 (int)j, region.origin, region.size);
819 return error;
820 }
821 }
822 test_info.tinfo[i].tQueue =
823 clCreateCommandQueue(gContext, gDevice, 0, &error);
824 if (NULL == test_info.tinfo[i].tQueue || error)
825 {
826 vlog_error("clCreateCommandQueue failed. (%d)\n", error);
827 return error;
828 }
829
830 test_info.tinfo[i].d = MTdataHolder(genrand_int32(d));
831 }
832
833 // Init the kernels
834 BuildKernelInfo build_info{ test_info.threadCount, test_info.k,
835 test_info.programs, f->nameInCode,
836 relaxedMode };
837 if ((error = ThreadPool_Do(BuildKernelFn,
838 gMaxVectorSizeIndex - gMinVectorSizeIndex,
839 &build_info)))
840 return error;
841
842 // Run the kernels
843 if (!gSkipCorrectnessTesting)
844 {
845 error = ThreadPool_Do(Test, test_info.jobCount, &test_info);
846 if (error) return error;
847
848 // Accumulate the arithmetic errors
849 for (cl_uint i = 0; i < test_info.threadCount; i++)
850 {
851 if (test_info.tinfo[i].maxError > maxError)
852 {
853 maxError = test_info.tinfo[i].maxError;
854 maxErrorVal = test_info.tinfo[i].maxErrorValue;
855 maxErrorVal2 = test_info.tinfo[i].maxErrorValue2;
856 }
857 }
858
859 if (gWimpyMode)
860 vlog("Wimp pass");
861 else
862 vlog("passed");
863
864 vlog("\t%8.2f @ {%a, %a}", maxError, maxErrorVal, maxErrorVal2);
865 }
866
867 vlog("\n");
868
869 return CL_SUCCESS;
870 }
871