//
// Copyright (c) 2017 The Khronos Group Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//

#include "common.h"
#include "function_list.h"
#include "test_functions.h"
#include "utility.h"

#include <cinttypes>
#include <cstring>

namespace {

cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
{
    BuildKernelInfo &info = *(BuildKernelInfo *)p;
    auto generator = [](const std::string &kernel_name, const char *builtin,
                        cl_uint vector_size_index) {
        return GetUnaryKernel(kernel_name, builtin, ParameterType::Float,
                              ParameterType::Float, ParameterType::Float,
                              vector_size_index);
    };
    return BuildKernels(info, job_id, generator);
}

} // anonymous namespace

int TestFunc_Float2_Float(const Func *f, MTdata d, bool relaxedMode)
{
    int error;
    Programs programs;
    const unsigned thread_id = 0; // Test is currently not multithreaded.
    KernelMatrix kernels;
    float maxError0 = 0.0f;
    float maxError1 = 0.0f;
    int ftz = f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities);
    float maxErrorVal0 = 0.0f;
    float maxErrorVal1 = 0.0f;
    uint64_t step = getTestStep(sizeof(float), BUFFER_SIZE);
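    // In wimpy mode, inputs are strided by 'scale' so each pass samples bit
    // patterns from across the full 2^32 range instead of a contiguous block.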
    int scale = (int)((1ULL << 32) / (16 * BUFFER_SIZE / sizeof(float)) + 1);
    cl_uchar overflow[BUFFER_SIZE / sizeof(float)];
    int isFract = 0 == strcmp("fract", f->nameInCode);
    int skipNanInf = isFract && !gInfNanSupport;

    logFunctionInfo(f->name, sizeof(cl_float), relaxedMode);

    float float_ulps = getAllowedUlpError(f, relaxedMode);
    // Init the kernels
    BuildKernelInfo build_info{ 1, kernels, programs, f->nameInCode,
                                relaxedMode };
    if ((error = ThreadPool_Do(BuildKernelFn,
                               gMaxVectorSizeIndex - gMinVectorSizeIndex,
                               &build_info)))
        return error;

    for (uint64_t i = 0; i < (1ULL << 32); i += step)
    {
        // Init input array
        uint32_t *p = (uint32_t *)gIn;
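        // In relaxed mode the sincos tolerance is only specified for inputs
        // in [-pi, pi], so out-of-range inputs are replaced with NaN below
        // and effectively skipped.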
        if (gWimpyMode)
        {
            for (size_t j = 0; j < BUFFER_SIZE / sizeof(float); j++)
            {
                p[j] = (uint32_t)i + j * scale;
                if (relaxedMode && strcmp(f->name, "sincos") == 0)
                {
                    float pj = *(float *)&p[j];
                    if (fabs(pj) > M_PI) ((float *)p)[j] = NAN;
                }
            }
        }
        else
        {
            for (size_t j = 0; j < BUFFER_SIZE / sizeof(float); j++)
            {
                p[j] = (uint32_t)i + j;
                if (relaxedMode && strcmp(f->name, "sincos") == 0)
                {
                    float pj = *(float *)&p[j];
                    if (fabs(pj) > M_PI) ((float *)p)[j] = NAN;
                }
            }
        }

        if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer, CL_FALSE, 0,
                                          BUFFER_SIZE, gIn, 0, NULL, NULL)))
        {
            vlog_error("\n*** Error %d in clEnqueueWriteBuffer ***\n", error);
            return error;
        }

        // Write garbage into output arrays
        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
        {
            uint32_t pattern = 0xffffdead;
            if (gHostFill)
            {
                memset_pattern4(gOut[j], &pattern, BUFFER_SIZE);
                if ((error = clEnqueueWriteBuffer(gQueue, gOutBuffer[j],
                                                  CL_FALSE, 0, BUFFER_SIZE,
                                                  gOut[j], 0, NULL, NULL)))
                {
                    vlog_error(
                        "\n*** Error %d in clEnqueueWriteBuffer2(%d) ***\n",
                        error, j);
                    return error;
                }

                memset_pattern4(gOut2[j], &pattern, BUFFER_SIZE);
                if ((error = clEnqueueWriteBuffer(gQueue, gOutBuffer2[j],
                                                  CL_FALSE, 0, BUFFER_SIZE,
                                                  gOut2[j], 0, NULL, NULL)))
                {
                    vlog_error(
                        "\n*** Error %d in clEnqueueWriteBuffer2b(%d) ***\n",
                        error, j);
                    return error;
                }
            }
            else
            {
                if ((error = clEnqueueFillBuffer(gQueue, gOutBuffer[j],
                                                 &pattern, sizeof(pattern), 0,
                                                 BUFFER_SIZE, 0, NULL, NULL)))
                {
                    vlog_error("Error: clEnqueueFillBuffer 1 failed! err: %d\n",
                               error);
                    return error;
                }

                if ((error = clEnqueueFillBuffer(gQueue, gOutBuffer2[j],
                                                 &pattern, sizeof(pattern), 0,
                                                 BUFFER_SIZE, 0, NULL, NULL)))
                {
                    vlog_error("Error: clEnqueueFillBuffer 2 failed! err: %d\n",
                               error);
                    return error;
                }
            }
        }

        // Run the kernels
        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
        {
            size_t vectorSize = sizeValues[j] * sizeof(cl_float);
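            // Round up so the ND-range covers the whole buffer even when
            // BUFFER_SIZE is not a multiple of the vector width in bytes.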
            size_t localCount = (BUFFER_SIZE + vectorSize - 1) / vectorSize;
            if ((error = clSetKernelArg(kernels[j][thread_id], 0,
                                        sizeof(gOutBuffer[j]), &gOutBuffer[j])))
            {
                LogBuildError(programs[j]);
                return error;
            }
            if ((error =
                     clSetKernelArg(kernels[j][thread_id], 1,
                                    sizeof(gOutBuffer2[j]), &gOutBuffer2[j])))
            {
                LogBuildError(programs[j]);
                return error;
            }
            if ((error = clSetKernelArg(kernels[j][thread_id], 2,
                                        sizeof(gInBuffer), &gInBuffer)))
            {
                LogBuildError(programs[j]);
                return error;
            }

            if ((error = clEnqueueNDRangeKernel(gQueue, kernels[j][thread_id],
                                                1, NULL, &localCount, NULL, 0,
                                                NULL, NULL)))
            {
                vlog_error("FAILED -- could not execute kernel\n");
                return error;
            }
        }

        // Get that moving
        if ((error = clFlush(gQueue))) vlog("clFlush failed\n");

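        // For fract, compute the reference with the same FTZ and rounding
        // state as the device so host and device results are comparable.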
        FPU_mode_type oldMode = 0;
        RoundingMode oldRoundMode = kRoundToNearestEven;
        if (isFract)
        {
            // Calculate the correctly rounded reference result
            if (ftz || relaxedMode) ForceFTZ(&oldMode);

            // Set the rounding mode to match the device
            if (gIsInRTZMode)
                oldRoundMode = set_round(kRoundTowardZero, kfloat);
        }

        // Calculate the correctly rounded reference result
        float *r = (float *)gOut_Ref;
        float *r2 = (float *)gOut_Ref2;
        float *s = (float *)gIn;

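        // When Inf/NaN are not supported (fract without gInfNanSupport), note
        // where the reference computation overflows so those results can be
        // accepted later per section 10 paragraph 6.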
        if (skipNanInf)
        {
            for (size_t j = 0; j < BUFFER_SIZE / sizeof(float); j++)
            {
                double dd;
                feclearexcept(FE_OVERFLOW);

                if (relaxedMode)
                    r[j] = (float)f->rfunc.f_fpf(s[j], &dd);
                else
                    r[j] = (float)f->func.f_fpf(s[j], &dd);

                r2[j] = (float)dd;
                overflow[j] =
                    FE_OVERFLOW == (FE_OVERFLOW & fetestexcept(FE_OVERFLOW));
            }
        }
        else
        {
            for (size_t j = 0; j < BUFFER_SIZE / sizeof(float); j++)
            {
                double dd;
                if (relaxedMode)
                    r[j] = (float)f->rfunc.f_fpf(s[j], &dd);
                else
                    r[j] = (float)f->func.f_fpf(s[j], &dd);

                r2[j] = (float)dd;
            }
        }

        if (isFract && ftz) RestoreFPState(&oldMode);

        // Read the data back
        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
        {
            if ((error =
                     clEnqueueReadBuffer(gQueue, gOutBuffer[j], CL_TRUE, 0,
                                         BUFFER_SIZE, gOut[j], 0, NULL, NULL)))
            {
                vlog_error("ReadArray failed %d\n", error);
                return error;
            }
            if ((error =
                     clEnqueueReadBuffer(gQueue, gOutBuffer2[j], CL_TRUE, 0,
                                         BUFFER_SIZE, gOut2[j], 0, NULL, NULL)))
            {
                vlog_error("ReadArray2 failed %d\n", error);
                return error;
            }
        }

        if (gSkipCorrectnessTesting)
        {
            if (isFract && gIsInRTZMode) (void)set_round(oldRoundMode, kfloat);
            break;
        }

        // Verify data
        uint32_t *t = (uint32_t *)gOut_Ref;
        uint32_t *t2 = (uint32_t *)gOut_Ref2;
        for (size_t j = 0; j < BUFFER_SIZE / sizeof(float); j++)
        {
            for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
            {
                uint32_t *q = (uint32_t *)gOut[k];
                uint32_t *q2 = (uint32_t *)gOut2[k];

                // If we aren't getting the correctly rounded result
                if (t[j] != q[j] || t2[j] != q2[j])
                {
                    double correct, correct2;
                    float err, err2;
                    float test = ((float *)q)[j];
                    float test2 = ((float *)q2)[j];

                    if (relaxedMode)
                        correct = f->rfunc.f_fpf(s[j], &correct2);
                    else
                        correct = f->func.f_fpf(s[j], &correct2);

                    // Per section 10 paragraph 6, accept any result if an
                    // input or output is an infinity or NaN, or overflows.
                    if (relaxedMode || skipNanInf)
                    {
                        if (skipNanInf && overflow[j]) continue;
                        // Note: no double rounding here. Reference functions
                        // calculate in single precision.
                        if (IsFloatInfinity(correct) || IsFloatNaN(correct)
                            || IsFloatInfinity(correct2) || IsFloatNaN(correct2)
                            || IsFloatInfinity(s[j]) || IsFloatNaN(s[j]))
                            continue;
                    }

                    typedef int (*CheckForSubnormal)(
                        double, float); // If we are in fast relaxed math, we
                                        // have a different calculation for the
                                        // subnormal threshold.
                    CheckForSubnormal isFloatResultSubnormalPtr;
                    if (relaxedMode)
                    {
                        err = Abs_Error(test, correct);
                        err2 = Abs_Error(test2, correct2);
                        isFloatResultSubnormalPtr =
                            &IsFloatResultSubnormalAbsError;
                    }
                    else
                    {
                        err = Ulp_Error(test, correct);
                        err2 = Ulp_Error(test2, correct2);
                        isFloatResultSubnormalPtr = &IsFloatResultSubnormal;
                    }
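                    // Both results must be within the allowed ulp; the
                    // negated form also flags NaN errors as failures.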
                    int fail = !(fabsf(err) <= float_ulps
                                 && fabsf(err2) <= float_ulps);

                    if (ftz || relaxedMode)
                    {
                        // retry per section 6.5.3.2
                        if ((*isFloatResultSubnormalPtr)(correct, float_ulps))
                        {
                            if ((*isFloatResultSubnormalPtr)(correct2,
                                                             float_ulps))
                            {
                                fail = fail && !(test == 0.0f && test2 == 0.0f);
                                if (!fail)
                                {
                                    err = 0.0f;
                                    err2 = 0.0f;
                                }
                            }
                            else
                            {
                                fail = fail
                                    && !(test == 0.0f
                                         && fabsf(err2) <= float_ulps);
                                if (!fail) err = 0.0f;
                            }
                        }
                        else if ((*isFloatResultSubnormalPtr)(correct2,
                                                              float_ulps))
                        {
                            fail = fail
                                && !(test2 == 0.0f && fabsf(err) <= float_ulps);
                            if (!fail) err2 = 0.0f;
                        }


                        // retry per section 6.5.3.3
                        if (IsFloatSubnormal(s[j]))
                        {
                            double correctp, correctn;
                            double correct2p, correct2n;
                            float errp, err2p, errn, err2n;

                            if (skipNanInf) feclearexcept(FE_OVERFLOW);
                            if (relaxedMode)
                            {
                                correctp = f->rfunc.f_fpf(0.0, &correct2p);
                                correctn = f->rfunc.f_fpf(-0.0, &correct2n);
                            }
                            else
                            {
                                correctp = f->func.f_fpf(0.0, &correct2p);
                                correctn = f->func.f_fpf(-0.0, &correct2n);
                            }

                            // Per section 10 paragraph 6, accept any result if
                            // an input or output is an infinity or NaN, or
                            // overflows.
                            if (skipNanInf)
                            {
                                if (fetestexcept(FE_OVERFLOW)) continue;

                                // Note: no double rounding here. Reference
                                // functions calculate in single precision.
                                if (IsFloatInfinity(correctp)
                                    || IsFloatNaN(correctp)
                                    || IsFloatInfinity(correctn)
                                    || IsFloatNaN(correctn)
                                    || IsFloatInfinity(correct2p)
                                    || IsFloatNaN(correct2p)
                                    || IsFloatInfinity(correct2n)
                                    || IsFloatNaN(correct2n))
                                    continue;
                            }

                            if (relaxedMode)
                            {
                                errp = Abs_Error(test, correctp);
                                err2p = Abs_Error(test, correct2p);
                                errn = Abs_Error(test, correctn);
                                err2n = Abs_Error(test, correct2n);
                            }
                            else
                            {
                                errp = Ulp_Error(test, correctp);
                                err2p = Ulp_Error(test, correct2p);
                                errn = Ulp_Error(test, correctn);
                                err2n = Ulp_Error(test, correct2n);
                            }

                            fail = fail
                                && ((!(fabsf(errp) <= float_ulps))
                                    && (!(fabsf(err2p) <= float_ulps))
                                    && ((!(fabsf(errn) <= float_ulps))
                                        && (!(fabsf(err2n) <= float_ulps))));
                            if (fabsf(errp) < fabsf(err)) err = errp;
                            if (fabsf(errn) < fabsf(err)) err = errn;
                            if (fabsf(err2p) < fabsf(err2)) err2 = err2p;
                            if (fabsf(err2n) < fabsf(err2)) err2 = err2n;

                            // retry per section 6.5.3.4
                            if ((*isFloatResultSubnormalPtr)(correctp,
                                                             float_ulps)
                                || (*isFloatResultSubnormalPtr)(correctn,
                                                                float_ulps))
                            {
                                if ((*isFloatResultSubnormalPtr)(correct2p,
                                                                 float_ulps)
                                    || (*isFloatResultSubnormalPtr)(correct2n,
                                                                    float_ulps))
                                {
                                    fail = fail
                                        && !(test == 0.0f && test2 == 0.0f);
                                    if (!fail) err = err2 = 0.0f;
                                }
                                else
                                {
                                    fail = fail
                                        && !(test == 0.0f
                                             && fabsf(err2) <= float_ulps);
                                    if (!fail) err = 0.0f;
                                }
                            }
                            else if ((*isFloatResultSubnormalPtr)(correct2p,
                                                                  float_ulps)
                                     || (*isFloatResultSubnormalPtr)(
                                         correct2n, float_ulps))
                            {
                                fail = fail
                                    && !(test2 == 0.0f
                                         && (fabsf(err) <= float_ulps));
                                if (!fail) err2 = 0.0f;
                            }
                        }
                    }
                    if (fabsf(err) > maxError0)
                    {
                        maxError0 = fabsf(err);
                        maxErrorVal0 = s[j];
                    }
                    if (fabsf(err2) > maxError1)
                    {
                        maxError1 = fabsf(err2);
                        maxErrorVal1 = s[j];
                    }
                    if (fail)
                    {
                        vlog_error("\nERROR: %s%s: {%f, %f} ulp error at %a: "
                                   "*{%a, %a} vs. {%a, %a}\n",
                                   f->name, sizeNames[k], err, err2,
                                   ((float *)gIn)[j], ((float *)gOut_Ref)[j],
                                   ((float *)gOut_Ref2)[j], test, test2);
                        return -1;
                    }
                }
            }
        }

        if (isFract && gIsInRTZMode) (void)set_round(oldRoundMode, kfloat);

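        // Emit a progress mark roughly every 2^28 inputs.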
        if (0 == (i & 0x0fffffff))
        {
            if (gVerboseBruteForce)
            {
                vlog("base:%14" PRIu64 " step:%10" PRIu64
                     " bufferSize:%10d \n",
                     i, step, BUFFER_SIZE);
            }
            else
            {
                vlog(".");
            }
            fflush(stdout);
        }
    }

    if (!gSkipCorrectnessTesting)
    {
        if (gWimpyMode)
            vlog("Wimp pass");
        else
            vlog("passed");

        vlog("\t{%8.2f, %8.2f} @ {%a, %a}", maxError0, maxError1, maxErrorVal0,
             maxErrorVal1);
    }

    vlog("\n");

    return CL_SUCCESS;
}