1 //
2 // Copyright (c) 2017 The Khronos Group Inc.
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 // http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 //
16
17 #include "common.h"
18 #include "function_list.h"
19 #include "test_functions.h"
20 #include "utility.h"
21
22 #include <cstring>
23
24 namespace {
25
BuildKernelFn(cl_uint job_id,cl_uint thread_id UNUSED,void * p)26 cl_int BuildKernelFn(cl_uint job_id, cl_uint thread_id UNUSED, void *p)
27 {
28 BuildKernelInfo &info = *(BuildKernelInfo *)p;
29 auto generator = [](const std::string &kernel_name, const char *builtin,
30 cl_uint vector_size_index) {
31 return GetUnaryKernel(kernel_name, builtin, ParameterType::Int,
32 ParameterType::Float, vector_size_index);
33 };
34 return BuildKernels(info, job_id, generator);
35 }
36
// Thread specific data for a worker thread
struct ThreadInfo
{
    // Input and output buffers for the thread. These are sub-buffers of the
    // global gInBuffer / gOutBuffer[] covering only this thread's region
    // (see the clCreateSubBuffer calls in TestMacro_Int_Float).
    clMemWrapper inBuf;
    Buffers outBuf; // one output sub-buffer per vector size

    // Per thread command queue to improve performance
    clCommandQueueWrapper tQueue;
};
47
// Test configuration shared (read-only) by all worker threads of one run.
struct TestInfo
{
    size_t subBufferSize; // Size of the sub-buffer in elements
    const Func *f; // A pointer to the function info

    // Programs for various vector sizes.
    Programs programs;

    // Thread-specific kernels for each vector size:
    // k[vector_size][thread_id]
    KernelMatrix k;

    // Array of thread specific information
    std::vector<ThreadInfo> tinfo;

    cl_uint threadCount; // Number of worker threads
    cl_uint jobCount; // Number of jobs
    cl_uint step; // step between each chunk and the next.
    cl_uint scale; // stride between individual test values
    int ftz; // non-zero if running in flush to zero mode
    bool relaxedMode; // True if test is running in relaxed mode, false
                      // otherwise.
};
71
Test(cl_uint job_id,cl_uint thread_id,void * data)72 cl_int Test(cl_uint job_id, cl_uint thread_id, void *data)
73 {
74 TestInfo *job = (TestInfo *)data;
75 size_t buffer_elements = job->subBufferSize;
76 size_t buffer_size = buffer_elements * sizeof(cl_float);
77 cl_uint scale = job->scale;
78 cl_uint base = job_id * (cl_uint)job->step;
79 ThreadInfo *tinfo = &(job->tinfo[thread_id]);
80 fptr func = job->f->func;
81 int ftz = job->ftz;
82 bool relaxedMode = job->relaxedMode;
83 cl_int error = CL_SUCCESS;
84 const char *name = job->f->name;
85
86 int signbit_test = 0;
87 if (!strcmp(name, "signbit")) signbit_test = 1;
88
89 #define ref_func(s) (signbit_test ? func.i_f_f(s) : func.i_f(s))
90
91 cl_event e[VECTOR_SIZE_COUNT];
92 cl_int *out[VECTOR_SIZE_COUNT];
93 if (gHostFill)
94 {
95 // start the map of the output arrays
96 for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
97 {
98 out[j] = (cl_int *)clEnqueueMapBuffer(
99 tinfo->tQueue, tinfo->outBuf[j], CL_FALSE, CL_MAP_WRITE, 0,
100 buffer_size, 0, NULL, e + j, &error);
101 if (error || NULL == out[j])
102 {
103 vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
104 error);
105 return error;
106 }
107 }
108
109 // Get that moving
110 if ((error = clFlush(tinfo->tQueue))) vlog("clFlush failed\n");
111 }
112
113 // Init input array
114 cl_uint *p = (cl_uint *)gIn + thread_id * buffer_elements;
115 for (size_t j = 0; j < buffer_elements; j++) p[j] = base + j * scale;
116
117 if ((error = clEnqueueWriteBuffer(tinfo->tQueue, tinfo->inBuf, CL_FALSE, 0,
118 buffer_size, p, 0, NULL, NULL)))
119 {
120 vlog_error("Error: clEnqueueWriteBuffer failed! err: %d\n", error);
121 return error;
122 }
123
124 for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
125 {
126 if (gHostFill)
127 {
128 // Wait for the map to finish
129 if ((error = clWaitForEvents(1, e + j)))
130 {
131 vlog_error("Error: clWaitForEvents failed! err: %d\n", error);
132 return error;
133 }
134 if ((error = clReleaseEvent(e[j])))
135 {
136 vlog_error("Error: clReleaseEvent failed! err: %d\n", error);
137 return error;
138 }
139 }
140
141 // Fill the result buffer with garbage, so that old results don't carry
142 // over
143 uint32_t pattern = 0xffffdead;
144 if (gHostFill)
145 {
146 memset_pattern4(out[j], &pattern, buffer_size);
147 if ((error = clEnqueueUnmapMemObject(
148 tinfo->tQueue, tinfo->outBuf[j], out[j], 0, NULL, NULL)))
149 {
150 vlog_error("Error: clEnqueueUnmapMemObject failed! err: %d\n",
151 error);
152 return error;
153 }
154 }
155 else
156 {
157 if ((error = clEnqueueFillBuffer(tinfo->tQueue, tinfo->outBuf[j],
158 &pattern, sizeof(pattern), 0,
159 buffer_size, 0, NULL, NULL)))
160 {
161 vlog_error("Error: clEnqueueFillBuffer failed! err: %d\n",
162 error);
163 return error;
164 }
165 }
166
167 // Run the kernel
168 size_t vectorCount =
169 (buffer_elements + sizeValues[j] - 1) / sizeValues[j];
170 cl_kernel kernel = job->k[j][thread_id]; // each worker thread has its
171 // own copy of the cl_kernel
172 cl_program program = job->programs[j];
173
174 if ((error = clSetKernelArg(kernel, 0, sizeof(tinfo->outBuf[j]),
175 &tinfo->outBuf[j])))
176 {
177 LogBuildError(program);
178 return error;
179 }
180 if ((error = clSetKernelArg(kernel, 1, sizeof(tinfo->inBuf),
181 &tinfo->inBuf)))
182 {
183 LogBuildError(program);
184 return error;
185 }
186
187 if ((error = clEnqueueNDRangeKernel(tinfo->tQueue, kernel, 1, NULL,
188 &vectorCount, NULL, 0, NULL, NULL)))
189 {
190 vlog_error("FAILED -- could not execute kernel\n");
191 return error;
192 }
193 }
194
195 // Get that moving
196 if ((error = clFlush(tinfo->tQueue))) vlog("clFlush 2 failed\n");
197
198 if (gSkipCorrectnessTesting) return CL_SUCCESS;
199
200 // Calculate the correctly rounded reference result
201 cl_int *r = (cl_int *)gOut_Ref + thread_id * buffer_elements;
202 float *s = (float *)p;
203 for (size_t j = 0; j < buffer_elements; j++) r[j] = ref_func(s[j]);
204
205 // Read the data back -- no need to wait for the first N-1 buffers but wait
206 // for the last buffer. This is an in order queue.
207 for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
208 {
209 cl_bool blocking = (j + 1 < gMaxVectorSizeIndex) ? CL_FALSE : CL_TRUE;
210 out[j] = (cl_int *)clEnqueueMapBuffer(
211 tinfo->tQueue, tinfo->outBuf[j], blocking, CL_MAP_READ, 0,
212 buffer_size, 0, NULL, NULL, &error);
213 if (error || NULL == out[j])
214 {
215 vlog_error("Error: clEnqueueMapBuffer %d failed! err: %d\n", j,
216 error);
217 return error;
218 }
219 }
220
221 // Verify data
222 cl_int *t = (cl_int *)r;
223 for (size_t j = 0; j < buffer_elements; j++)
224 {
225 for (auto k = gMinVectorSizeIndex; k < gMaxVectorSizeIndex; k++)
226 {
227 cl_int *q = out[0];
228
229 // If we aren't getting the correctly rounded result
230 if (gMinVectorSizeIndex == 0 && t[j] != q[j])
231 {
232 // If we aren't getting the correctly rounded result
233 if (ftz || relaxedMode)
234 {
235 if (IsFloatSubnormal(s[j]))
236 {
237 int correct = ref_func(+0.0f);
238 int correct2 = ref_func(-0.0f);
239 if (correct == q[j] || correct2 == q[j]) continue;
240 }
241 }
242
243 uint32_t err = t[j] - q[j];
244 if (q[j] > t[j]) err = q[j] - t[j];
245 vlog_error("\nERROR: %s: %d ulp error at %a: *%d vs. %d\n",
246 name, err, ((float *)s)[j], t[j], q[j]);
247 return -1;
248 }
249
250
251 for (auto k = std::max(1U, gMinVectorSizeIndex);
252 k < gMaxVectorSizeIndex; k++)
253 {
254 q = out[k];
255 // If we aren't getting the correctly rounded result
256 if (-t[j] != q[j])
257 {
258 if (ftz || relaxedMode)
259 {
260 if (IsFloatSubnormal(s[j]))
261 {
262 int correct = -ref_func(+0.0f);
263 int correct2 = -ref_func(-0.0f);
264 if (correct == q[j] || correct2 == q[j]) continue;
265 }
266 }
267
268 uint32_t err = -t[j] - q[j];
269 if (q[j] > -t[j]) err = q[j] + t[j];
270 vlog_error(
271 "\nERROR: %s%s: %d ulp error at %a: *%d vs. %d\n", name,
272 sizeNames[k], err, ((float *)s)[j], -t[j], q[j]);
273 return -1;
274 }
275 }
276 }
277 }
278
279 for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
280 {
281 if ((error = clEnqueueUnmapMemObject(tinfo->tQueue, tinfo->outBuf[j],
282 out[j], 0, NULL, NULL)))
283 {
284 vlog_error("Error: clEnqueueUnmapMemObject %d failed 2! err: %d\n",
285 j, error);
286 return error;
287 }
288 }
289
290 if ((error = clFlush(tinfo->tQueue)))
291 {
292 vlog("clFlush 3 failed\n");
293 return error;
294 }
295
296
297 if (0 == (base & 0x0fffffff))
298 {
299 if (gVerboseBruteForce)
300 {
301 vlog("base:%14u step:%10u scale:%10u buf_elements:%10zd "
302 "ThreadCount:%2u\n",
303 base, job->step, job->scale, buffer_elements,
304 job->threadCount);
305 }
306 else
307 {
308 vlog(".");
309 }
310 fflush(stdout);
311 }
312
313 return CL_SUCCESS;
314 }
315
316 } // anonymous namespace
317
TestMacro_Int_Float(const Func * f,MTdata d,bool relaxedMode)318 int TestMacro_Int_Float(const Func *f, MTdata d, bool relaxedMode)
319 {
320 TestInfo test_info{};
321 cl_int error;
322
323 logFunctionInfo(f->name, sizeof(cl_float), relaxedMode);
324
325 // Init test_info
326 test_info.threadCount = GetThreadCount();
327 test_info.subBufferSize = BUFFER_SIZE
328 / (sizeof(cl_float) * RoundUpToNextPowerOfTwo(test_info.threadCount));
329 test_info.scale = getTestScale(sizeof(cl_float));
330
331 test_info.step = (cl_uint)test_info.subBufferSize * test_info.scale;
332 if (test_info.step / test_info.subBufferSize != test_info.scale)
333 {
334 // there was overflow
335 test_info.jobCount = 1;
336 }
337 else
338 {
339 test_info.jobCount = (cl_uint)((1ULL << 32) / test_info.step);
340 }
341
342 test_info.f = f;
343 test_info.ftz =
344 f->ftz || gForceFTZ || 0 == (CL_FP_DENORM & gFloatCapabilities);
345 test_info.relaxedMode = relaxedMode;
346
347 test_info.tinfo.resize(test_info.threadCount);
348 for (cl_uint i = 0; i < test_info.threadCount; i++)
349 {
350 cl_buffer_region region = {
351 i * test_info.subBufferSize * sizeof(cl_float),
352 test_info.subBufferSize * sizeof(cl_float)
353 };
354 test_info.tinfo[i].inBuf =
355 clCreateSubBuffer(gInBuffer, CL_MEM_READ_ONLY,
356 CL_BUFFER_CREATE_TYPE_REGION, ®ion, &error);
357 if (error || NULL == test_info.tinfo[i].inBuf)
358 {
359 vlog_error("Error: Unable to create sub-buffer of gInBuffer for "
360 "region {%zd, %zd}\n",
361 region.origin, region.size);
362 return error;
363 }
364
365 for (auto j = gMinVectorSizeIndex; j < gMaxVectorSizeIndex; j++)
366 {
367 test_info.tinfo[i].outBuf[j] = clCreateSubBuffer(
368 gOutBuffer[j], CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION,
369 ®ion, &error);
370 if (error || NULL == test_info.tinfo[i].outBuf[j])
371 {
372 vlog_error("Error: Unable to create sub-buffer of "
373 "gOutBuffer[%d] for region {%zd, %zd}\n",
374 (int)j, region.origin, region.size);
375 return error;
376 }
377 }
378 test_info.tinfo[i].tQueue =
379 clCreateCommandQueue(gContext, gDevice, 0, &error);
380 if (NULL == test_info.tinfo[i].tQueue || error)
381 {
382 vlog_error("clCreateCommandQueue failed. (%d)\n", error);
383 return error;
384 }
385 }
386
387 // Init the kernels
388 BuildKernelInfo build_info{ test_info.threadCount, test_info.k,
389 test_info.programs, f->nameInCode,
390 relaxedMode };
391 if ((error = ThreadPool_Do(BuildKernelFn,
392 gMaxVectorSizeIndex - gMinVectorSizeIndex,
393 &build_info)))
394 return error;
395
396 // Run the kernels
397 if (!gSkipCorrectnessTesting)
398 {
399 error = ThreadPool_Do(Test, test_info.jobCount, &test_info);
400 if (error) return error;
401
402 if (gWimpyMode)
403 vlog("Wimp pass");
404 else
405 vlog("passed");
406 }
407
408 vlog("\n");
409
410 return CL_SUCCESS;
411 }
412