//
// Copyright (c) 2017 The Khronos Group Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//    http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//

#include "common.h"
#include "function_list.h"
#include "test_functions.h"
#include "utility.h"

#include <cinttypes>
#include <cstring>

namespace {

BuildKernelFn(cl_uint job_id,cl_uint thread_id UNUSED,void * p)27 cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
28 {
29 BuildKernelInfo &info = *(BuildKernelInfo *)p;
30 auto generator = [](const std::string &kernel_name, const char *builtin,
31 cl_uint vector_size_index) {
32 return GetUnaryKernel(kernel_name, builtin, ParameterType::Long,
33 ParameterType::Double, vector_size_index);
34 };
35 return BuildKernels(info, job_id, generator);
36 }
37
// Thread specific data for a worker thread
struct ThreadInfo
{
    // Input and output buffers for the thread.
    // inBuf is the thread's read-only input sub-buffer; outBuf holds one
    // write-only output sub-buffer per tested vector size.
    clMemWrapper inBuf;
    Buffers outBuf;

    // Per thread command queue to improve performance
    clCommandQueueWrapper tQueue;
};
48
// Shared state for one invocation of TestMacro_Int_Double, handed to every
// worker thread through the thread pool's void* argument.
struct TestInfo
{
    size_t subBufferSize; // Size of the sub-buffer in elements
    const Func *f; // A pointer to the function info

    // Programs for various vector sizes.
    Programs programs;

    // Thread-specific kernels for each vector size:
    // k[vector_size][thread_id]
    KernelMatrix k;

    // Array of thread specific information
    std::vector<ThreadInfo> tinfo;

    cl_uint threadCount; // Number of worker threads
    cl_uint jobCount; // Number of jobs
    cl_uint step; // step between each chunk and the next.
    cl_uint scale; // stride between individual test values
    int ftz; // non-zero if running in flush to zero mode
    bool relaxedMode; // True if test is running in relaxed mode, false
                      // otherwise.
};
72
// Worker-thread job: brute-force test one chunk of the 32-bit input pattern
// space. job_id selects the chunk (starting pattern = job_id * step);
// thread_id selects this thread's sub-buffers and queue. Returns CL_SUCCESS,
// an OpenCL error code, or -1 on a miscompare.
cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
{
    TestInfo *job = (TestInfo *)data;
    size_t buffer_elements = job->subBufferSize;
    size_t buffer_size = buffer_elements * sizeof(cl_double);
    cl_uint scale = job->scale;
    cl_uint base = job_id * (cl_uint)job->step;
    ThreadInfo *tinfo = &(job->tinfo[thread_id]);
    dptr dfunc = job->f->dfunc; // host reference implementation
    int ftz = job->ftz;
    bool relaxedMode = job->relaxedMode;
    cl_int error;
    const char *name = job->f->name;

    // Make sure the host FPU computes the reference at full double precision
    // (relevant on x87).
    Force64BitFPUPrecision();

    cl_event e[VECTOR_SIZE_COUNT];
    cl_long *out[VECTOR_SIZE_COUNT];
    if (gHostFill)
    {
        // start the map of the output arrays
        for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
        {
            // Non-blocking map; e[j] signals completion.
            out[j] = (cl_long *)clEnqueueMapBuffer(
                tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0,
                buffer_size, 0, NULL, e + j, &error);
            if (error || NULL == out[j])
            {
                vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
                           error);
                return error;
            }
        }

        // Get that moving
        if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n");
    }

    // Write the new values to the input array
    // Inputs are consecutive 32-bit patterns (stride = scale) reinterpreted
    // as doubles via DoubleFromUInt32.
    cl_double *p = (cl_double *)gIn + thread_id * buffer_elements;
    for (size_t j = 0; j < buffer_elements; j++)
        p[j] = DoubleFromUInt32(base + j * scale);

    if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0,
                                      buffer_size, p, 0, NULL, NULL)))
    {
        vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
        return error;
    }

    for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
    {
        if (gHostFill)
        {
            // Wait for the map to finish
            if ((error = clWaitForEvents(1, e + j)))
            {
                vlog_error("Error: clWaitForEvents failed! err: %d\n", error);
                return error;
            }
            if ((error = clReleaseEvent(e[j])))
            {
                vlog_error("Error: clReleaseEvent failed! err: %d\n", error);
                return error;
            }
        }

        // Fill the result buffer with garbage, so that old results don't carry
        // over
        uint32_t pattern = 0xffffdead;
        if (gHostFill)
        {
            // Host-side fill through the mapped pointer, then unmap so the
            // device sees the pattern.
            memset_pattern4(out[j], &pattern, buffer_size);
            if ((error = clEnqueueUnmapMemObject(
                     tinfo->tQueue, tinfo->outBuf[j], out[j], 0, NULL, NULL)))
            {
                vlog_error("Error: clEnqueueUnmapMemObject failed! err: %d\n",
                           error);
                return error;
            }
        }
        else
        {
            // Device-side fill.
            if ((error = clEnqueueFillBuffer(tinfo->tQueue, tinfo->outBuf[j],
                                             &pattern, sizeof(pattern), 0,
                                             buffer_size, 0, NULL, NULL)))
            {
                vlog_error("Error: clEnqueueFillBuffer failed! err: %d\n",
                           error);
                return error;
            }
        }

        // Run the kernel
        size_t vectorCount =
            (buffer_elements + sizeValues[j] - 1) / sizeValues[j];
        cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its
                                                 // own copy of the cl_kernel
        cl_program program = job->programs[j];

        if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]),
                                    &tinfo->outBuf[j])))
        {
            LogBuildError(program);
            return error;
        }
        if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf),
                                    &tinfo->inBuf)))
        {
            LogBuildError(program);
            return error;
        }

        if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL,
                                            &vectorCount, NULL, 0, NULL, NULL)))
        {
            vlog_error("FAILED -- could not execute kernel\n");
            return error;
        }
    }

    // Get that moving
    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 2 failed\n");

    if (gSkipCorrectnessTesting) return CL_SUCCESS;

    // Calculate the correctly rounded reference result
    cl_long *r = (cl_long *)gOut_Ref + thread_id * buffer_elements;
    cl_double *s = (cl_double *)p;
    for (size_t j = 0; j < buffer_elements; j++) r[j] = dfunc.i_f(s[j]);

    // Read the data back -- no need to wait for the first N-1 buffers but wait
    // for the last buffer. This is an in order queue.
    for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
    {
        cl_bool blocking = (j + 1 < gMaxVectorSizeIndex) ? CL_FALSE : CL_TRUE;
        out[j] = (cl_long *)clEnqueueMapBuffer(
            tinfo->tQueue, tinfo->outBuf[j], blocking, CL_MAP_READ, 0,
            buffer_size, 0, NULL, NULL, &error);
        if (error || NULL == out[j])
        {
            vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
                       error);
            return error;
        }
    }

    // Verify data
    cl_long *t = (cl_long *)r;
    for (size_t j = 0; j < buffer_elements; j++)
    {
        cl_long *q = out[0];

        // Scalar (vector size 1) result must equal the reference exactly.
        if (gMinVectorSizeIndex == 0 && t[j] != q[j])
        {
            // If we aren't getting the correctly rounded result
            if (ftz || relaxedMode)
            {
                // Under FTZ/relaxed, a subnormal input may have been flushed
                // to +/-0 before evaluation; accept either flushed result.
                if (IsDoubleSubnormal(s[j]))
                {
                    cl_long correct = dfunc.i_f(+0.0f);
                    cl_long correct2 = dfunc.i_f(-0.0f);
                    if (correct == q[j] || correct2 == q[j]) continue;
                }
            }

            cl_ulong err = t[j] - q[j];
            if (q[j] > t[j]) err = q[j] - t[j];
            vlog_error("\nERROR: %sD: %" PRId64
                       " ulp error at %.13la: *%" PRId64 " vs. %" PRId64 "\n",
                       name, err, ((double *)gIn)[j], t[j], q[j]);
            return -1;
        }


        for (auto k = std::max(1U, gMinVectorSizeIndex);
             k < gMaxVectorSizeIndex; k++)
        {
            q = out[k];
            // Vector forms are expected to produce the negation of the scalar
            // reference result (comparison below checks q[j] == -t[j]).
            if (-t[j] != q[j])
            {
                if (ftz || relaxedMode)
                {
                    // Same subnormal-flush allowance as the scalar case.
                    if (IsDoubleSubnormal(s[j]))
                    {
                        int64_t correct = -dfunc.i_f(+0.0f);
                        int64_t correct2 = -dfunc.i_f(-0.0f);
                        if (correct == q[j] || correct2 == q[j]) continue;
                    }
                }

                cl_ulong err = -t[j] - q[j];
                if (q[j] > -t[j]) err = q[j] + t[j];
                vlog_error(
                    "\nERROR: %sD%s: %" PRId64 " ulp error at %.13la: *%" PRId64
                    " vs. %" PRId64 "\n",
                    name, sizeNames[k], err, ((double *)gIn)[j], -t[j], q[j]);
                return -1;
            }
        }
    }

    // Release the result mappings so the buffers can be reused by the next
    // job on this queue.
    for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
    {
        if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
                                             out[j], 0, NULL, NULL)))
        {
            vlog_error("Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n",
                       j, error);
            return error;
        }
    }

    if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 3 failed\n");


    // Periodic progress output (only when base crosses a 2^28 boundary).
    if (0 == (base & 0x0fffffff))
    {
        if (gVerboseBruteForce)
        {
            vlog("base:%14u step:%10u scale:%10u buf_elements:%10zd "
                 "ThreadCount:%2u\n",
                 base, job->step, job->scale, buffer_elements,
                 job->threadCount);
        }
        else
        {
            vlog(".");
        }
        fflush(stdout);
    }

    return CL_SUCCESS;
}

} // anonymous namespace

TestMacro_Int_Double(const Func * f,MTdata d,bool relaxedMode)312 int TestMacro_Int_Double(const Func *f, MTdata d, bool relaxedMode)
313 {
314 TestInfo test_info{};
315 cl_int error;
316
317 logFunctionInfo(f->name, sizeof(cl_double), relaxedMode);
318
319 // Init test_info
320 test_info.threadCount = GetThreadCount();
321 test_info.subBufferSize = BUFFER_SIZE
322 / (sizeof(cl_double) * RoundUpToNextPowerOfTwo(test_info.threadCount));
323 test_info.scale = getTestScale(sizeof(cl_double));
324
325 test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
326 if (test_info.step / test_info.subBufferSize != test_info.scale)
327 {
328 // there was overflow
329 test_info.jobCount = 1;
330 }
331 else
332 {
333 test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step);
334 }
335
336 test_info.f = f;
337 test_info.ftz = f->ftz || gForceFTZ;
338 test_info.relaxedMode = relaxedMode;
339
340 test_info.tinfo.resize(test_info.threadCount);
341 for (cl_uint i = 0; i < test_info.threadCount; i++)
342 {
343 cl_buffer_region region = {
344 i * test_info.subBufferSize * sizeof(cl_double),
345 test_info.subBufferSize * sizeof(cl_double)
346 };
347 test_info.tinfo[i].inBuf =
348 clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
349 CL_BUFFER_CREATE_TYPE_REGION, ®ion, &error);
350 if (error || NULL == test_info.tinfo[i].inBuf)
351 {
352 vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
353 "region {%zd, %zd}\n",
354 region.origin, region.size);
355 return error;
356 }
357
358 for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
359 {
360 test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
361 gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
362 ®ion, &error);
363 if (error || NULL == test_info.tinfo[i].outBuf[j])
364 {
365 vlog_error("Error: Unable to create sub-buffer of "
366 "gOutBuffer[%d] for region {%zd, %zd}\n",
367 (int)j, region.origin, region.size);
368 return error;
369 }
370 }
371 test_info.tinfo[i].tQueue =
372 clCreateCommandQueue(gContext, gDevice, 0, &error);
373 if (NULL == test_info.tinfo[i].tQueue || error)
374 {
375 vlog_error("clCreateCommandQueue failed. (%d)\n", error);
376 return error;
377 }
378 }
379
380 // Init the kernels
381 BuildKernelInfo build_info{ test_info.threadCount, test_info.k,
382 test_info.programs, f->nameInCode,
383 relaxedMode };
384 if ((error = ThreadPool_Do(BuildKernelFn,
385 gMaxVectorSizeIndex - gMinVectorSizeIndex,
386 &build_info)))
387 return error;
388
389 // Run the kernels
390 if (!gSkipCorrectnessTesting)
391 {
392 error = ThreadPool_Do(Test, test_info.jobCount, &test_info);
393 if (error) return error;
394
395 if (gWimpyMode)
396 vlog("Wimp pass");
397 else
398 vlog("passed");
399 }
400
401 vlog("\n");
402
403 return CL_SUCCESS;
404 }
405