1 //
2 // Copyright (c) 2017 The Khronos Group Inc.
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 // http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 //
16
17 #include "common.h"
18 #include "function_list.h"
19 #include "test_functions.h"
20 #include "utility.h"
21
22 #include <cstring>
23
24 namespace {
25
BuildKernelFn(cl_uint job_id,cl_uint thread_id UNUSED,void * p)26 cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
27 {
28 BuildKernelInfo &info = *(BuildKernelInfo *)p;
29 auto generator = [](const std::string &kernel_name, const char *builtin,
30 cl_uint vector_size_index) {
31 return GetUnaryKernel(kernel_name, builtin, ParameterType::Float,
32 ParameterType::Float, vector_size_index);
33 };
34 return BuildKernels(info, job_id, generator);
35 }
36
// Thread specific data for a worker thread
struct ThreadInfo
{
    // Input and output buffers for the thread. inBuf is this thread's
    // sub-buffer of gInBuffer; outBuf[i] is its sub-buffer of gOutBuffer[i],
    // one per vector size under test.
    clMemWrapper inBuf;
    Buffers outBuf;

    float maxError; // max error value. Init to 0.
    double maxErrorValue; // position of the max error value. Init to 0.

    // Per thread command queue to improve performance
    clCommandQueueWrapper tQueue;
};
50
// Global state shared by all worker threads for one function under test.
struct TestInfo
{
    size_t subBufferSize; // Size of the sub-buffer in elements
    const Func *f; // A pointer to the function info

    // Programs for various vector sizes.
    Programs programs;

    // Thread-specific kernels for each vector size:
    // k[vector_size][thread_id]
    KernelMatrix k;

    // Array of thread specific information
    std::vector<ThreadInfo> tinfo;

    cl_uint threadCount; // Number of worker threads
    cl_uint jobCount; // Number of jobs
    cl_uint step; // step between each chunk and the next.
    cl_uint scale; // stride between individual test values
    float ulps; // max_allowed ulps
    int ftz; // non-zero if running in flush to zero mode

    int isRangeLimited; // 1 if the function is only to be evaluated over a
                        // range
    // Largest |result| accepted for out-of-range inputs when isRangeLimited
    // is set (used for the half_* functions; see Test()).
    float half_sin_cos_tan_limit;
    bool relaxedMode; // True if test is running in relaxed mode, false
                      // otherwise.
};
79
// Worker-thread job: test one chunk of the 32-bit input space.
//
// job_id selects which chunk of input bit patterns to test (starting at
// job_id * step, striding by scale); thread_id selects the per-thread
// sub-buffers, command queue and kernel copies in the TestInfo passed via
// data. Returns CL_SUCCESS on success, an OpenCL error code on API failure,
// or -1 on a correctness failure.
cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
{
    TestInfo *job = (TestInfo *)data;
    size_t buffer_elements = job->subBufferSize;
    size_t buffer_size = buffer_elements * sizeof(cl_float);
    cl_uint scale = job->scale;
    cl_uint base = job_id * (cl_uint)job->step;
    ThreadInfo *tinfo = &(job->tinfo[thread_id]);
    fptr func = job->f->func;
    const char *fname = job->f->name;
    bool relaxedMode = job->relaxedMode;
    float ulps = getAllowedUlpError(job->f, relaxedMode);
    if (relaxedMode)
    {
        // Relaxed mode verifies against the relaxed reference function.
        func = job->f->rfunc;
    }

    cl_int error;

    int isRangeLimited = job->isRangeLimited;
    float half_sin_cos_tan_limit = job->half_sin_cos_tan_limit;
    int ftz = job->ftz;

    cl_event e[VECTOR_SIZE_COUNT];
    cl_uint *out[VECTOR_SIZE_COUNT];
    if (gHostFill)
    {
        // start the map of the output arrays
        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
        {
            out[j] = (cl_uint *)clEnqueueMapBuffer(
                tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0,
                buffer_size, 0, NULL, e + j, &error);
            if (error || NULL == out[j])
            {
                vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
                           error);
                return error;
            }
        }

        // Get that moving
        if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n");
    }

    // Write the new values to the input array
    cl_uint *p = (cl_uint *)gIn + thread_id * buffer_elements;
    for (size_t j = 0; j < buffer_elements; j++)
    {
        // Inputs are consecutive bit patterns: base, base + scale, ...
        p[j] = base + j * scale;
        if (relaxedMode)
        {
            // In relaxed mode, out-of-domain inputs are replaced with NaN so
            // the verification below effectively skips them.
            float p_j = *(float *)&p[j];
            if (strcmp(fname, "sin") == 0
                || strcmp(fname, "cos")
                    == 0) // the domain of the function is [-pi,pi]
            {
                if (fabs(p_j) > M_PI) ((float *)p)[j] = NAN;
            }

            if (strcmp(fname, "reciprocal") == 0)
            {
                const float l_limit = HEX_FLT(+, 1, 0, -, 126);
                const float u_limit = HEX_FLT(+, 1, 0, +, 126);

                if (fabs(p_j) < l_limit
                    || fabs(p_j) > u_limit) // the domain of the function is
                                            // [2^-126,2^126]
                    ((float *)p)[j] = NAN;
            }
        }
    }

    if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0,
                                      buffer_size, p, 0, NULL, NULL)))
    {
        vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
        return error;
    }

    for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
    {
        if (gHostFill)
        {
            // Wait for the map to finish
            if ((error = clWaitForEvents(1, e + j)))
            {
                vlog_error("Error: clWaitForEvents failed! err: %d\n", error);
                return error;
            }
            if ((error = clReleaseEvent(e[j])))
            {
                vlog_error("Error: clReleaseEvent failed! err: %d\n", error);
                return error;
            }
        }

        // Fill the result buffer with garbage, so that old results don't carry
        // over
        uint32_t pattern = 0xffffdead;
        if (gHostFill)
        {
            // Host-side fill through the mapped pointer, then unmap.
            memset_pattern4(out[j], &pattern, buffer_size);
            if ((error = clEnqueueUnmapMemObject(
                     tinfo->tQueue, tinfo->outBuf[j], out[j], 0, NULL, NULL)))
            {
                vlog_error("Error: clEnqueueUnmapMemObject failed! err: %d\n",
                           error);
                return error;
            }
        }
        else
        {
            // Device-side fill.
            if ((error = clEnqueueFillBuffer(tinfo->tQueue, tinfo->outBuf[j],
                                             &pattern, sizeof(pattern), 0,
                                             buffer_size, 0, NULL, NULL)))
            {
                vlog_error("Error: clEnqueueFillBuffer failed! err: %d\n",
                           error);
                return error;
            }
        }

        // Run the kernel
        size_t vectorCount =
            (buffer_elements + sizeValues[j] - 1) / sizeValues[j];
        cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its
                                                 // own copy of the cl_kernel
        cl_program program = job->programs[j];

        if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]),
                                    &tinfo->outBuf[j])))
        {
            LogBuildError(program);
            return error;
        }
        if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf),
                                    &tinfo->inBuf)))
        {
            LogBuildError(program);
            return error;
        }

        if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL,
                                            &vectorCount, NULL, 0, NULL, NULL)))
        {
            vlog_error("FAILED -- could not execute kernel\n");
            return error;
        }
    }

    // Get that moving
    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 2 failed\n");

    if (gSkipCorrectnessTesting) return CL_SUCCESS;

    // Calculate the correctly rounded reference result
    float *r = (float *)gOut_Ref + thread_id * buffer_elements;
    float *s = (float *)p;
    for (size_t j = 0; j < buffer_elements; j++) r[j] = (float)func.f_f(s[j]);

    // Read the data back -- no need to wait for the first N-1 buffers but wait
    // for the last buffer. This is an in order queue.
    for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
    {
        cl_bool blocking = (j + 1 < gMaxVectorSizeIndex) ? CL_FALSE : CL_TRUE;
        out[j] = (cl_uint *)clEnqueueMapBuffer(
            tinfo->tQueue, tinfo->outBuf[j], blocking, CL_MAP_READ, 0,
            buffer_size, 0, NULL, NULL, &error);
        if (error || NULL == out[j])
        {
            vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
                       error);
            return error;
        }
    }

    // Verify data
    uint32_t *t = (uint32_t *)r;
    for (size_t j = 0; j < buffer_elements; j++)
    {
        for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
        {
            uint32_t *q = out[k];

            // If we aren't getting the correctly rounded result
            if (t[j] != q[j])
            {
                float test = ((float *)q)[j];
                double correct = func.f_f(s[j]);
                float err = Ulp_Error(test, correct);
                float abs_error = Abs_Error(test, correct);
                int fail = 0;
                int use_abs_error = 0;

                // it is possible for the output to not match the reference
                // result but for Ulp_Error to be zero, for example -1.#QNAN
                // vs. 1.#QNAN. In such cases there is no failure
                if (err == 0.0f)
                {
                    fail = 0;
                }
                else if (relaxedMode)
                {
                    // Relaxed mode has per-function tolerances; some use an
                    // absolute-error bound instead of ULPs.
                    if (strcmp(fname, "sin") == 0 || strcmp(fname, "cos") == 0)
                    {
                        fail = !(fabsf(abs_error) <= ulps);
                        use_abs_error = 1;
                    }
                    if (strcmp(fname, "sinpi") == 0
                        || strcmp(fname, "cospi") == 0)
                    {
                        if (s[j] >= -1.0 && s[j] <= 1.0)
                        {
                            fail = !(fabsf(abs_error) <= ulps);
                            use_abs_error = 1;
                        }
                    }

                    if (strcmp(fname, "reciprocal") == 0)
                    {
                        fail = !(fabsf(err) <= ulps);
                    }

                    if (strcmp(fname, "exp") == 0 || strcmp(fname, "exp2") == 0)
                    {
                        // Tolerance grows with |x| on full-profile devices.
                        float exp_error = ulps;

                        if (!gIsEmbedded)
                        {
                            exp_error += floor(fabs(2 * s[j]));
                        }

                        fail = !(fabsf(err) <= exp_error);
                        ulps = exp_error;
                    }
                    if (strcmp(fname, "tan") == 0)
                    {

                        if (!gFastRelaxedDerived)
                        {
                            fail = !(fabsf(err) <= ulps);
                        }
                        // Else fast math derived implementation does not
                        // require ULP verification
                    }
                    if (strcmp(fname, "exp10") == 0)
                    {
                        if (!gFastRelaxedDerived)
                        {
                            fail = !(fabsf(err) <= ulps);
                        }
                        // Else fast math derived implementation does not
                        // require ULP verification
                    }
                    if (strcmp(fname, "log") == 0 || strcmp(fname, "log2") == 0
                        || strcmp(fname, "log10") == 0)
                    {
                        if (s[j] >= 0.5 && s[j] <= 2)
                        {
                            fail = !(fabsf(abs_error) <= ulps);
                        }
                        else
                        {
                            // Outside [0.5, 2], fall back to the non-relaxed
                            // ULP limit.
                            ulps = gIsEmbedded ? job->f->float_embedded_ulps
                                               : job->f->float_ulps;
                            fail = !(fabsf(err) <= ulps);
                        }
                    }


                    // fast-relaxed implies finite-only
                    if (IsFloatInfinity(correct) || IsFloatNaN(correct)
                        || IsFloatInfinity(s[j]) || IsFloatNaN(s[j]))
                    {
                        fail = 0;
                        err = 0;
                    }
                }
                else
                {
                    fail = !(fabsf(err) <= ulps);
                }

                // half_sin/cos/tan are only valid between +-2**16, Inf, NaN
                if (isRangeLimited
                    && fabsf(s[j]) > MAKE_HEX_FLOAT(0x1.0p16f, 0x1L, 16)
                    && fabsf(s[j]) < INFINITY)
                {
                    if (fabsf(test) <= half_sin_cos_tan_limit)
                    {
                        err = 0;
                        fail = 0;
                    }
                }

                if (fail)
                {
                    if (ftz || relaxedMode)
                    {
                        typedef int (*CheckForSubnormal)(
                            double, float); // If we are in fast relaxed math,
                                            // we have a different calculation
                                            // for the subnormal threshold.
                        CheckForSubnormal isFloatResultSubnormalPtr;

                        if (relaxedMode)
                        {
                            isFloatResultSubnormalPtr =
                                &IsFloatResultSubnormalAbsError;
                        }
                        else
                        {
                            isFloatResultSubnormalPtr = &IsFloatResultSubnormal;
                        }
                        // retry per section 6.5.3.2
                        if ((*isFloatResultSubnormalPtr)(correct, ulps))
                        {
                            fail = fail && (test != 0.0f);
                            if (!fail) err = 0.0f;
                        }

                        // retry per section 6.5.3.3
                        if (IsFloatSubnormal(s[j]))
                        {
                            // A subnormal input may have been flushed to
                            // +0.0 or -0.0; accept either reference result.
                            double correct2 = func.f_f(0.0);
                            double correct3 = func.f_f(-0.0);
                            float err2;
                            float err3;
                            if (use_abs_error)
                            {
                                err2 = Abs_Error(test, correct2);
                                err3 = Abs_Error(test, correct3);
                            }
                            else
                            {
                                err2 = Ulp_Error(test, correct2);
                                err3 = Ulp_Error(test, correct3);
                            }
                            fail = fail
                                && ((!(fabsf(err2) <= ulps))
                                    && (!(fabsf(err3) <= ulps)));
                            if (fabsf(err2) < fabsf(err)) err = err2;
                            if (fabsf(err3) < fabsf(err)) err = err3;

                            // retry per section 6.5.3.4
                            if ((*isFloatResultSubnormalPtr)(correct2, ulps)
                                || (*isFloatResultSubnormalPtr)(correct3, ulps))
                            {
                                fail = fail && (test != 0.0f);
                                if (!fail) err = 0.0f;
                            }
                        }
                    }
                }
                if (fabsf(err) > tinfo->maxError)
                {
                    tinfo->maxError = fabsf(err);
                    tinfo->maxErrorValue = s[j];
                }
                if (fail)
                {
                    vlog_error("\nERROR: %s%s: %f ulp error at %a (0x%8.8x): "
                               "*%a vs. %a\n",
                               job->f->name, sizeNames[k], err, ((float *)s)[j],
                               ((uint32_t *)s)[j], ((float *)t)[j], test);
                    return -1;
                }
            }
        }
    }

    for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
    {
        if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
                                             out[j], 0, NULL, NULL)))
        {
            vlog_error("Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n",
                       j, error);
            return error;
        }
    }

    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 3 failed\n");


    // Periodic progress output.
    if (0 == (base & 0x0fffffff))
    {
        if (gVerboseBruteForce)
        {
            vlog("base:%14u step:%10u scale:%10u buf_elements:%10zd ulps:%5.3f "
                 "ThreadCount:%2u\n",
                 base, job->step, job->scale, buffer_elements, job->ulps,
                 job->threadCount);
        }
        else
        {
            vlog(".");
        }
        fflush(stdout);
    }

    return CL_SUCCESS;
}
484
485 } // anonymous namespace
486
TestFunc_Float_Float(const Func * f,MTdata d,bool relaxedMode)487 int TestFunc_Float_Float(const Func *f, MTdata d, bool relaxedMode)
488 {
489 TestInfo test_info{};
490 cl_int error;
491 float maxError = 0.0f;
492 double maxErrorVal = 0.0;
493 int skipTestingRelaxed = (relaxedMode && strcmp(f->name, "tan") == 0);
494
495 logFunctionInfo(f->name, sizeof(cl_float), relaxedMode);
496
497 // Init test_info
498 test_info.threadCount = GetThreadCount();
499 test_info.subBufferSize = BUFFER_SIZE
500 / (sizeof(cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount));
501 test_info.scale = getTestScale(sizeof(cl_float));
502
503 test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
504 if (test_info.step / test_info.subBufferSize != test_info.scale)
505 {
506 // there was overflow
507 test_info.jobCount = 1;
508 }
509 else
510 {
511 test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step);
512 }
513
514 test_info.f = f;
515 test_info.ulps = gIsEmbedded ? f->float_embedded_ulps : f->float_ulps;
516 test_info.ftz =
517 f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities);
518 test_info.relaxedMode = relaxedMode;
519 test_info.tinfo.resize(test_info.threadCount);
520 for (cl_uint i = 0; i < test_info.threadCount; i++)
521 {
522 cl_buffer_region region = {
523 i * test_info.subBufferSize * sizeof(cl_float),
524 test_info.subBufferSize * sizeof(cl_float)
525 };
526 test_info.tinfo[i].inBuf =
527 clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
528 CL_BUFFER_CREATE_TYPE_REGION, ®ion, &error);
529 if (error || NULL == test_info.tinfo[i].inBuf)
530 {
531 vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
532 "region {%zd, %zd}\n",
533 region.origin, region.size);
534 return error;
535 }
536
537 for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
538 {
539 test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
540 gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
541 ®ion, &error);
542 if (error || NULL == test_info.tinfo[i].outBuf[j])
543 {
544 vlog_error("Error: Unable to create sub-buffer of "
545 "gOutBuffer[%d] for region {%zd, %zd}\n",
546 (int)j, region.origin, region.size);
547 return error;
548 }
549 }
550 test_info.tinfo[i].tQueue =
551 clCreateCommandQueue(gContext, gDevice, 0, &error);
552 if (NULL == test_info.tinfo[i].tQueue || error)
553 {
554 vlog_error("clCreateCommandQueue failed. (%d)\n", error);
555 return error;
556 }
557 }
558
559 // Check for special cases for unary float
560 test_info.isRangeLimited = 0;
561 test_info.half_sin_cos_tan_limit = 0;
562 if (0 == strcmp(f->name, "half_sin") || 0 == strcmp(f->name, "half_cos"))
563 {
564 test_info.isRangeLimited = 1;
565 test_info.half_sin_cos_tan_limit = 1.0f
566 + test_info.ulps
567 * (FLT_EPSILON / 2.0f); // out of range results from finite
568 // inputs must be in [-1,1]
569 }
570 else if (0 == strcmp(f->name, "half_tan"))
571 {
572 test_info.isRangeLimited = 1;
573 test_info.half_sin_cos_tan_limit =
574 INFINITY; // out of range resut from finite inputs must be numeric
575 }
576
577 // Init the kernels
578 BuildKernelInfo build_info{ test_info.threadCount, test_info.k,
579 test_info.programs, f->nameInCode,
580 relaxedMode };
581 if ((error = ThreadPool_Do(BuildKernelFn,
582 gMaxVectorSizeIndex - gMinVectorSizeIndex,
583 &build_info)))
584 return error;
585
586 // Run the kernels
587 if (!gSkipCorrectnessTesting || skipTestingRelaxed)
588 {
589 error = ThreadPool_Do(Test, test_info.jobCount, &test_info);
590 if (error) return error;
591
592 // Accumulate the arithmetic errors
593 for (cl_uint i = 0; i < test_info.threadCount; i++)
594 {
595 if (test_info.tinfo[i].maxError > maxError)
596 {
597 maxError = test_info.tinfo[i].maxError;
598 maxErrorVal = test_info.tinfo[i].maxErrorValue;
599 }
600 }
601
602 if (gWimpyMode)
603 vlog("Wimp pass");
604 else
605 vlog("passed");
606
607 if (skipTestingRelaxed)
608 {
609 vlog(" (rlx skip correctness testing)\n");
610 return error;
611 }
612
613 vlog("\t%8.2f @ %a", maxError, maxErrorVal);
614 }
615
616 vlog("\n");
617
618 return CL_SUCCESS;
619 }
620