1 //
2 // Copyright (c) 2017 The Khronos Group Inc.
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 // http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 //
16 #include "harness/compat.h"
17 #include "harness/kernelHelpers.h"
18 #include "harness/testHarness.h"
19
20 #include <string.h>
21
22 #include <algorithm>
23
24 #include "cl_utils.h"
25 #include "tests.h"
26
27 #include <CL/cl_half.h>
28
29 typedef struct ComputeReferenceInfoF_
30 {
31 float *x;
32 cl_ushort *r;
33 f2h f;
34 cl_ulong i;
35 cl_uint lim;
36 cl_uint count;
37 } ComputeReferenceInfoF;
38
39 typedef struct ComputeReferenceInfoD_
40 {
41 double *x;
42 cl_ushort *r;
43 d2h f;
44 cl_ulong i;
45 cl_uint lim;
46 cl_uint count;
47 } ComputeReferenceInfoD;
48
49 typedef struct CheckResultInfoF_
50 {
51 const float *x;
52 const cl_ushort *r;
53 const cl_ushort *s;
54 f2h f;
55 const char *aspace;
56 cl_uint lim;
57 cl_uint count;
58 int vsz;
59 } CheckResultInfoF;
60
61 typedef struct CheckResultInfoD_
62 {
63 const double *x;
64 const cl_ushort *r;
65 const cl_ushort *s;
66 d2h f;
67 const char *aspace;
68 cl_uint lim;
69 cl_uint count;
70 int vsz;
71 } CheckResultInfoD;
72
ReferenceF(cl_uint jid,cl_uint tid,void * userInfo)73 static cl_int ReferenceF(cl_uint jid, cl_uint tid, void *userInfo)
74 {
75 ComputeReferenceInfoF *cri = (ComputeReferenceInfoF *)userInfo;
76 cl_uint lim = cri->lim;
77 cl_uint count = cri->count;
78 cl_uint off = jid * count;
79 float *x = cri->x + off;
80 cl_ushort *r = cri->r + off;
81 f2h f = cri->f;
82 cl_ulong i = cri->i + off;
83 cl_uint j;
84
85 if (off + count > lim) count = lim - off;
86
87 for (j = 0; j < count; ++j)
88 {
89 x[j] = as_float((cl_uint)(i + j));
90 r[j] = f(x[j]);
91 }
92
93 return 0;
94 }
95
CheckF(cl_uint jid,cl_uint tid,void * userInfo)96 static cl_int CheckF(cl_uint jid, cl_uint tid, void *userInfo)
97 {
98 CheckResultInfoF *cri = (CheckResultInfoF *)userInfo;
99 cl_uint lim = cri->lim;
100 cl_uint count = cri->count;
101 cl_uint off = jid * count;
102 const float *x = cri->x + off;
103 const cl_ushort *r = cri->r + off;
104 const cl_ushort *s = cri->s + off;
105 f2h f = cri->f;
106 cl_uint j;
107 cl_ushort correct2 = f(0.0f);
108 cl_ushort correct3 = f(-0.0f);
109 cl_int ret = 0;
110
111 if (off + count > lim) count = lim - off;
112
113 if (!memcmp(r, s, count * sizeof(cl_ushort))) return 0;
114
115 for (j = 0; j < count; j++)
116 {
117 if (s[j] == r[j]) continue;
118
119 // Pass any NaNs
120 if ((s[j] & 0x7fff) > 0x7c00 && (r[j] & 0x7fff) > 0x7c00) continue;
121
122 // retry per section 6.5.3.3
123 if (IsFloatSubnormal(x[j]) && (s[j] == correct2 || s[j] == correct3))
124 continue;
125
126 // if reference result is subnormal, pass any zero
127 if (gIsEmbedded && IsHalfSubnormal(r[j])
128 && (s[j] == 0x0000 || s[j] == 0x8000))
129 continue;
130
131 vlog_error("\nFailure at [%u] with %.6a: *0x%04x vs 0x%04x, "
132 "vector_size = %d, address_space = %s\n",
133 j + off, x[j], r[j], s[j], cri->vsz, cri->aspace);
134
135 ret = 1;
136 break;
137 }
138
139 return ret;
140 }
141
ReferenceD(cl_uint jid,cl_uint tid,void * userInfo)142 static cl_int ReferenceD(cl_uint jid, cl_uint tid, void *userInfo)
143 {
144 ComputeReferenceInfoD *cri = (ComputeReferenceInfoD *)userInfo;
145 cl_uint lim = cri->lim;
146 cl_uint count = cri->count;
147 cl_uint off = jid * count;
148 double *x = cri->x + off;
149 cl_ushort *r = cri->r + off;
150 d2h f = cri->f;
151 cl_uint j;
152 cl_ulong i = cri->i + off;
153
154 if (off + count > lim) count = lim - off;
155
156 for (j = 0; j < count; ++j)
157 {
158 x[j] = as_double(DoubleFromUInt((cl_uint)(i + j)));
159 r[j] = f(x[j]);
160 }
161
162 return 0;
163 }
164
CheckD(cl_uint jid,cl_uint tid,void * userInfo)165 static cl_int CheckD(cl_uint jid, cl_uint tid, void *userInfo)
166 {
167 CheckResultInfoD *cri = (CheckResultInfoD *)userInfo;
168 cl_uint lim = cri->lim;
169 cl_uint count = cri->count;
170 cl_uint off = jid * count;
171 const double *x = cri->x + off;
172 const cl_ushort *r = cri->r + off;
173 const cl_ushort *s = cri->s + off;
174 d2h f = cri->f;
175 cl_uint j;
176 cl_ushort correct2 = f(0.0);
177 cl_ushort correct3 = f(-0.0);
178 cl_int ret = 0;
179
180 if (off + count > lim) count = lim - off;
181
182 if (!memcmp(r, s, count * sizeof(cl_ushort))) return 0;
183
184 for (j = 0; j < count; j++)
185 {
186 if (s[j] == r[j]) continue;
187
188 // Pass any NaNs
189 if ((s[j] & 0x7fff) > 0x7c00 && (r[j] & 0x7fff) > 0x7c00) continue;
190
191 if (IsDoubleSubnormal(x[j]) && (s[j] == correct2 || s[j] == correct3))
192 continue;
193
194 // if reference result is subnormal, pass any zero result
195 if (gIsEmbedded && IsHalfSubnormal(r[j])
196 && (s[j] == 0x0000 || s[j] == 0x8000))
197 continue;
198
199 vlog_error("\nFailure at [%u] with %.13la: *0x%04x vs 0x%04x, "
200 "vector_size = %d, address space = %s (double precision)\n",
201 j + off, x[j], r[j], s[j], cri->vsz, cri->aspace);
202
203 ret = 1;
204 break;
205 }
206
207 return ret;
208 }
209
float2half_rte(float f)210 static cl_half float2half_rte(float f)
211 {
212 return cl_half_from_float(f, CL_HALF_RTE);
213 }
214
float2half_rtz(float f)215 static cl_half float2half_rtz(float f)
216 {
217 return cl_half_from_float(f, CL_HALF_RTZ);
218 }
219
float2half_rtp(float f)220 static cl_half float2half_rtp(float f)
221 {
222 return cl_half_from_float(f, CL_HALF_RTP);
223 }
224
float2half_rtn(float f)225 static cl_half float2half_rtn(float f)
226 {
227 return cl_half_from_float(f, CL_HALF_RTN);
228 }
229
double2half_rte(double f)230 static cl_half double2half_rte(double f)
231 {
232 return cl_half_from_double(f, CL_HALF_RTE);
233 }
234
double2half_rtz(double f)235 static cl_half double2half_rtz(double f)
236 {
237 return cl_half_from_double(f, CL_HALF_RTZ);
238 }
239
double2half_rtp(double f)240 static cl_half double2half_rtp(double f)
241 {
242 return cl_half_from_double(f, CL_HALF_RTP);
243 }
244
double2half_rtn(double f)245 static cl_half double2half_rtn(double f)
246 {
247 return cl_half_from_double(f, CL_HALF_RTN);
248 }
249
test_vstore_half(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)250 int test_vstore_half(cl_device_id deviceID, cl_context context,
251 cl_command_queue queue, int num_elements)
252 {
253 switch (get_default_rounding_mode(deviceID))
254 {
255 case CL_FP_ROUND_TO_ZERO:
256 return Test_vStoreHalf_private(deviceID, float2half_rtz,
257 double2half_rte, "");
258 case 0: return -1;
259 default:
260 return Test_vStoreHalf_private(deviceID, float2half_rte,
261 double2half_rte, "");
262 }
263 }
264
test_vstore_half_rte(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)265 int test_vstore_half_rte(cl_device_id deviceID, cl_context context,
266 cl_command_queue queue, int num_elements)
267 {
268 return Test_vStoreHalf_private(deviceID, float2half_rte, double2half_rte,
269 "_rte");
270 }
271
test_vstore_half_rtz(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)272 int test_vstore_half_rtz(cl_device_id deviceID, cl_context context,
273 cl_command_queue queue, int num_elements)
274 {
275 return Test_vStoreHalf_private(deviceID, float2half_rtz, double2half_rtz,
276 "_rtz");
277 }
278
test_vstore_half_rtp(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)279 int test_vstore_half_rtp(cl_device_id deviceID, cl_context context,
280 cl_command_queue queue, int num_elements)
281 {
282 return Test_vStoreHalf_private(deviceID, float2half_rtp, double2half_rtp,
283 "_rtp");
284 }
285
test_vstore_half_rtn(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)286 int test_vstore_half_rtn(cl_device_id deviceID, cl_context context,
287 cl_command_queue queue, int num_elements)
288 {
289 return Test_vStoreHalf_private(deviceID, float2half_rtn, double2half_rtn,
290 "_rtn");
291 }
292
test_vstorea_half(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)293 int test_vstorea_half(cl_device_id deviceID, cl_context context,
294 cl_command_queue queue, int num_elements)
295 {
296 switch (get_default_rounding_mode(deviceID))
297 {
298 case CL_FP_ROUND_TO_ZERO:
299 return Test_vStoreaHalf_private(deviceID, float2half_rtz,
300 double2half_rte, "");
301 case 0: return -1;
302 default:
303 return Test_vStoreaHalf_private(deviceID, float2half_rte,
304 double2half_rte, "");
305 }
306 }
307
test_vstorea_half_rte(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)308 int test_vstorea_half_rte(cl_device_id deviceID, cl_context context,
309 cl_command_queue queue, int num_elements)
310 {
311 return Test_vStoreaHalf_private(deviceID, float2half_rte, double2half_rte,
312 "_rte");
313 }
314
test_vstorea_half_rtz(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)315 int test_vstorea_half_rtz(cl_device_id deviceID, cl_context context,
316 cl_command_queue queue, int num_elements)
317 {
318 return Test_vStoreaHalf_private(deviceID, float2half_rtz, double2half_rtz,
319 "_rtz");
320 }
321
test_vstorea_half_rtp(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)322 int test_vstorea_half_rtp(cl_device_id deviceID, cl_context context,
323 cl_command_queue queue, int num_elements)
324 {
325 return Test_vStoreaHalf_private(deviceID, float2half_rtp, double2half_rtp,
326 "_rtp");
327 }
328
test_vstorea_half_rtn(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)329 int test_vstorea_half_rtn(cl_device_id deviceID, cl_context context,
330 cl_command_queue queue, int num_elements)
331 {
332 return Test_vStoreaHalf_private(deviceID, float2half_rtn, double2half_rtn,
333 "_rtn");
334 }
335
336 #pragma mark -
337
Test_vStoreHalf_private(cl_device_id device,f2h referenceFunc,d2h doubleReferenceFunc,const char * roundName)338 int Test_vStoreHalf_private(cl_device_id device, f2h referenceFunc,
339 d2h doubleReferenceFunc, const char *roundName)
340 {
341 int vectorSize, error;
342 cl_program programs[kVectorSizeCount + kStrangeVectorSizeCount][3];
343 cl_kernel kernels[kVectorSizeCount + kStrangeVectorSizeCount][3];
344 cl_program resetProgram;
345 cl_kernel resetKernel;
346
347 uint64_t time[kVectorSizeCount + kStrangeVectorSizeCount] = { 0 };
348 uint64_t min_time[kVectorSizeCount + kStrangeVectorSizeCount] = { 0 };
349 memset(min_time, -1, sizeof(min_time));
350 cl_program doublePrograms[kVectorSizeCount + kStrangeVectorSizeCount][3];
351 cl_kernel doubleKernels[kVectorSizeCount + kStrangeVectorSizeCount][3];
352 uint64_t doubleTime[kVectorSizeCount + kStrangeVectorSizeCount] = { 0 };
353 uint64_t min_double_time[kVectorSizeCount + kStrangeVectorSizeCount] = {
354 0
355 };
356 memset(min_double_time, -1, sizeof(min_double_time));
357
358 bool aligned = false;
359
360 for (vectorSize = kMinVectorSize; vectorSize < kLastVectorSizeToTest;
361 vectorSize++)
362 {
363 const char *source[] = { "__kernel void test( __global float",
364 vector_size_name_extensions[vectorSize],
365 " *p, __global half *f )\n"
366 "{\n"
367 " size_t i = get_global_id(0);\n"
368 " vstore_half",
369 vector_size_name_extensions[vectorSize],
370 roundName,
371 "( p[i], i, f );\n"
372 "}\n" };
373
374 const char *source_v3[] = {
375 "__kernel void test( __global float *p, __global half *f,\n"
376 " uint extra_last_thread)\n"
377 "{\n"
378 " size_t i = get_global_id(0);\n"
379 " size_t last_i = get_global_size(0)-1;\n"
380 " size_t adjust = 0;\n"
381 " if(last_i == i && extra_last_thread != 0) {\n"
382 " adjust = 3-extra_last_thread;\n"
383 " } "
384 " vstore_half3",
385 roundName,
386 "( vload3(i, p-adjust), i, f-adjust );\n"
387 "}\n"
388 };
389
390 const char *source_private_store[] = {
391 "__kernel void test( __global float",
392 vector_size_name_extensions[vectorSize],
393 " *p, __global half *f )\n"
394 "{\n"
395 " __private ushort data[16];\n"
396 " size_t i = get_global_id(0);\n"
397 " size_t offset = 0;\n"
398 " size_t vecsize = vec_step(p[i]);\n"
399 " vstore_half",
400 vector_size_name_extensions[vectorSize],
401 roundName,
402 "( p[i], 0, (__private half *)(&data[0]) );\n"
403 " for(offset = 0; offset < vecsize; offset++)\n"
404 " {\n"
405 " vstore_half(vload_half(offset, (__private half *)data), 0, "
406 "&f[vecsize*i+offset]);\n"
407 " }\n"
408 "}\n"
409 };
410
411
412 const char *source_private_store_v3[] = {
413 "__kernel void test( __global float *p, __global half *f,\n"
414 " uint extra_last_thread )\n"
415 "{\n"
416 " __private ushort data[4];\n"
417 " size_t i = get_global_id(0);\n"
418 " size_t last_i = get_global_size(0)-1;\n"
419 " size_t adjust = 0;\n"
420 " size_t offset = 0;\n"
421 " if(last_i == i && extra_last_thread != 0) {\n"
422 " adjust = 3-extra_last_thread;\n"
423 " } "
424 " vstore_half3",
425 roundName,
426 "( vload3(i, p-adjust), 0, (__private half *)(&data[0]) );\n"
427 " for(offset = 0; offset < 3; offset++)\n"
428 " {\n"
429 " vstore_half(vload_half(offset, (__private half *) data), "
430 "0, &f[3*i+offset-adjust]);\n"
431 " }\n"
432 "}\n"
433 };
434
435 char local_buf_size[10];
436 sprintf(local_buf_size, "%lld", (uint64_t)gWorkGroupSize);
437
438
439 const char *source_local_store[] = {
440 "__kernel void test( __global float",
441 vector_size_name_extensions[vectorSize],
442 " *p, __global half *f )\n"
443 "{\n"
444 " __local ushort data[16*",
445 local_buf_size,
446 "];\n"
447 " size_t i = get_global_id(0);\n"
448 " size_t lid = get_local_id(0);\n"
449 " size_t lsize = get_local_size(0);\n"
450 " size_t vecsize = vec_step(p[0]);\n"
451 " event_t async_event;\n"
452 " vstore_half",
453 vector_size_name_extensions[vectorSize],
454 roundName,
455 "( p[i], lid, (__local half *)(&data[0]) );\n"
456 " barrier( CLK_LOCAL_MEM_FENCE ); \n"
457 " async_event = async_work_group_copy((__global ushort "
458 "*)f+vecsize*(i-lid), (__local ushort *)(&data[0]), vecsize*lsize, "
459 "0);\n" // investigate later
460 " wait_group_events(1, &async_event);\n"
461 "}\n"
462 };
463
464 const char *source_local_store_v3[] = {
465 "__kernel void test( __global float *p, __global half *f,\n"
466 " uint extra_last_thread )\n"
467 "{\n"
468 " __local ushort data[3*(",
469 local_buf_size,
470 "+1)];\n"
471 " size_t i = get_global_id(0);\n"
472 " size_t lid = get_local_id(0);\n"
473 " size_t last_i = get_global_size(0)-1;\n"
474 " size_t adjust = 0;\n"
475 " size_t lsize = get_local_size(0);\n"
476 " event_t async_event;\n"
477 " if(last_i == i && extra_last_thread != 0) {\n"
478 " adjust = 3-extra_last_thread;\n"
479 " } "
480 " vstore_half3",
481 roundName,
482 "( vload3(i,p-adjust), lid, (__local half *)(&data[0]) );\n"
483 " barrier( CLK_LOCAL_MEM_FENCE ); \n"
484 " if (get_group_id(0) == (get_num_groups(0) - 1) &&\n"
485 " extra_last_thread != 0) {\n"
486 " adjust = 3-extra_last_thread;\n"
487 " }\n"
488 " async_event = async_work_group_copy(\n"
489 " (__global ushort*)(f+3*(i-lid)),\n"
490 " (__local ushort *)(&data[adjust]),\n"
491 " lsize*3-adjust, 0);\n" // investigate later
492 " wait_group_events(1, &async_event);\n"
493 "}\n"
494 };
495
496 const char *double_source[] = {
497 "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
498 "__kernel void test( __global double",
499 vector_size_name_extensions[vectorSize],
500 " *p, __global half *f )\n"
501 "{\n"
502 " size_t i = get_global_id(0);\n"
503 " vstore_half",
504 vector_size_name_extensions[vectorSize],
505 roundName,
506 "( p[i], i, f );\n"
507 "}\n"
508 };
509
510 const char *double_source_private_store[] = {
511 "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
512 "__kernel void test( __global double",
513 vector_size_name_extensions[vectorSize],
514 " *p, __global half *f )\n"
515 "{\n"
516 " __private ushort data[16];\n"
517 " size_t i = get_global_id(0);\n"
518 " size_t offset = 0;\n"
519 " size_t vecsize = vec_step(p[i]);\n"
520 " vstore_half",
521 vector_size_name_extensions[vectorSize],
522 roundName,
523 "( p[i], 0, (__private half *)(&data[0]) );\n"
524 " for(offset = 0; offset < vecsize; offset++)\n"
525 " {\n"
526 " vstore_half(vload_half(offset, (__private half *)data), 0, "
527 "&f[vecsize*i+offset]);\n"
528 " }\n"
529 "}\n"
530 };
531
532
533 const char *double_source_local_store[] = {
534 "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
535 "__kernel void test( __global double",
536 vector_size_name_extensions[vectorSize],
537 " *p, __global half *f )\n"
538 "{\n"
539 " __local ushort data[16*",
540 local_buf_size,
541 "];\n"
542 " size_t i = get_global_id(0);\n"
543 " size_t lid = get_local_id(0);\n"
544 " size_t vecsize = vec_step(p[0]);\n"
545 " size_t lsize = get_local_size(0);\n"
546 " event_t async_event;\n"
547 " vstore_half",
548 vector_size_name_extensions[vectorSize],
549 roundName,
550 "( p[i], lid, (__local half *)(&data[0]) );\n"
551 " barrier( CLK_LOCAL_MEM_FENCE ); \n"
552 " async_event = async_work_group_copy((__global ushort "
553 "*)(f+vecsize*(i-lid)), (__local ushort *)(&data[0]), "
554 "vecsize*lsize, 0);\n" // investigate later
555 " wait_group_events(1, &async_event);\n"
556 "}\n"
557 };
558
559
560 const char *double_source_v3[] = {
561 "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
562 "__kernel void test( __global double *p, __global half *f ,\n"
563 " uint extra_last_thread)\n"
564 "{\n"
565 " size_t i = get_global_id(0);\n"
566 " size_t last_i = get_global_size(0)-1;\n"
567 " size_t adjust = 0;\n"
568 " if(last_i == i && extra_last_thread != 0) {\n"
569 " adjust = 3-extra_last_thread;\n"
570 " } "
571 " vstore_half3",
572 roundName,
573 "( vload3(i,p-adjust), i, f -adjust);\n"
574 "}\n"
575 };
576
577 const char *double_source_private_store_v3[] = {
578 "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
579 "__kernel void test( __global double *p, __global half *f,\n"
580 " uint extra_last_thread )\n"
581 "{\n"
582 " __private ushort data[4];\n"
583 " size_t i = get_global_id(0);\n"
584 " size_t last_i = get_global_size(0)-1;\n"
585 " size_t adjust = 0;\n"
586 " size_t offset = 0;\n"
587 " if(last_i == i && extra_last_thread != 0) {\n"
588 " adjust = 3-extra_last_thread;\n"
589 " } "
590 " vstore_half3",
591 roundName,
592 "( vload3(i, p-adjust), 0, (__private half *)(&data[0]) );\n"
593 " for(offset = 0; offset < 3; offset++)\n"
594 " {\n"
595 " vstore_half(vload_half(offset, (__private half *)data), 0, "
596 "&f[3*i+offset-adjust]);\n"
597 " }\n"
598 "}\n"
599 };
600
601 const char *double_source_local_store_v3[] = {
602 "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
603 "__kernel void test( __global double *p, __global half *f,\n"
604 " uint extra_last_thread )\n"
605 "{\n"
606 " __local ushort data[3*(",
607 local_buf_size,
608 "+1)];\n"
609 " size_t i = get_global_id(0);\n"
610 " size_t lid = get_local_id(0);\n"
611 " size_t last_i = get_global_size(0)-1;\n"
612 " size_t adjust = 0;\n"
613 " size_t lsize = get_local_size(0);\n"
614 " event_t async_event;\n"
615 " if(last_i == i && extra_last_thread != 0) {\n"
616 " adjust = 3-extra_last_thread;\n"
617 " }\n "
618 " vstore_half3",
619 roundName,
620 "( vload3(i,p-adjust), lid, (__local half *)(&data[0]) );\n"
621 " barrier( CLK_LOCAL_MEM_FENCE ); \n"
622 " if (get_group_id(0) == (get_num_groups(0) - 1) &&\n"
623 " extra_last_thread != 0) {\n"
624 " adjust = 3-extra_last_thread;\n"
625 " }\n"
626 " async_event = async_work_group_copy(\n"
627 " (__global ushort *)(f+3*(i-lid)),\n"
628 " (__local ushort *)(&data[adjust]),\n"
629 " lsize*3-adjust, 0);\n" // investigate later
630 " wait_group_events(1, &async_event);\n"
631 "}\n"
632 };
633
634
635 if (g_arrVecSizes[vectorSize] == 3)
636 {
637 programs[vectorSize][0] = MakeProgram(
638 device, source_v3, sizeof(source_v3) / sizeof(source_v3[0]));
639 }
640 else
641 {
642 programs[vectorSize][0] =
643 MakeProgram(device, source, sizeof(source) / sizeof(source[0]));
644 }
645 if (NULL == programs[vectorSize][0])
646 {
647 gFailCount++;
648 return -1;
649 }
650
651 kernels[vectorSize][0] =
652 clCreateKernel(programs[vectorSize][0], "test", &error);
653 if (NULL == kernels[vectorSize][0])
654 {
655 gFailCount++;
656 vlog_error("\t\tFAILED -- Failed to create kernel. (%d)\n", error);
657 return error;
658 }
659
660 if (g_arrVecSizes[vectorSize] == 3)
661 {
662 programs[vectorSize][1] =
663 MakeProgram(device, source_private_store_v3,
664 sizeof(source_private_store_v3)
665 / sizeof(source_private_store_v3[0]));
666 }
667 else
668 {
669 programs[vectorSize][1] = MakeProgram(
670 device, source_private_store,
671 sizeof(source_private_store) / sizeof(source_private_store[0]));
672 }
673 if (NULL == programs[vectorSize][1])
674 {
675 gFailCount++;
676 return -1;
677 }
678
679 kernels[vectorSize][1] =
680 clCreateKernel(programs[vectorSize][1], "test", &error);
681 if (NULL == kernels[vectorSize][1])
682 {
683 gFailCount++;
684 vlog_error("\t\tFAILED -- Failed to create private kernel. (%d)\n",
685 error);
686 return error;
687 }
688
689 if (g_arrVecSizes[vectorSize] == 3)
690 {
691 programs[vectorSize][2] =
692 MakeProgram(device, source_local_store_v3,
693 sizeof(source_local_store_v3)
694 / sizeof(source_local_store_v3[0]));
695 if (NULL == programs[vectorSize][2])
696 {
697 unsigned q;
698 for (q = 0; q < sizeof(source_local_store_v3)
699 / sizeof(source_local_store_v3[0]);
700 q++)
701 vlog_error("%s", source_local_store_v3[q]);
702
703 gFailCount++;
704 return -1;
705 }
706 }
707 else
708 {
709 programs[vectorSize][2] = MakeProgram(
710 device, source_local_store,
711 sizeof(source_local_store) / sizeof(source_local_store[0]));
712 if (NULL == programs[vectorSize][2])
713 {
714 unsigned q;
715 for (q = 0; q < sizeof(source_local_store)
716 / sizeof(source_local_store[0]);
717 q++)
718 vlog_error("%s", source_local_store[q]);
719
720 gFailCount++;
721 return -1;
722 }
723 }
724
725 kernels[vectorSize][2] =
726 clCreateKernel(programs[vectorSize][2], "test", &error);
727 if (NULL == kernels[vectorSize][2])
728 {
729 gFailCount++;
730 vlog_error("\t\tFAILED -- Failed to create local kernel. (%d)\n",
731 error);
732 return error;
733 }
734
735 if (gTestDouble)
736 {
737 if (g_arrVecSizes[vectorSize] == 3)
738 {
739 doublePrograms[vectorSize][0] = MakeProgram(
740 device, double_source_v3,
741 sizeof(double_source_v3) / sizeof(double_source_v3[0]));
742 }
743 else
744 {
745 doublePrograms[vectorSize][0] = MakeProgram(
746 device, double_source,
747 sizeof(double_source) / sizeof(double_source[0]));
748 }
749 if (NULL == doublePrograms[vectorSize][0])
750 {
751 gFailCount++;
752 return -1;
753 }
754
755 doubleKernels[vectorSize][0] =
756 clCreateKernel(doublePrograms[vectorSize][0], "test", &error);
757 if (NULL == kernels[vectorSize][0])
758 {
759 gFailCount++;
760 vlog_error(
761 "\t\tFAILED -- Failed to create double kernel. (%d)\n",
762 error);
763 return error;
764 }
765
766 if (g_arrVecSizes[vectorSize] == 3)
767 doublePrograms[vectorSize][1] = MakeProgram(
768 device, double_source_private_store_v3,
769 sizeof(double_source_private_store_v3)
770 / sizeof(double_source_private_store_v3[0]));
771 else
772 doublePrograms[vectorSize][1] =
773 MakeProgram(device, double_source_private_store,
774 sizeof(double_source_private_store)
775 / sizeof(double_source_private_store[0]));
776
777 if (NULL == doublePrograms[vectorSize][1])
778 {
779 gFailCount++;
780 return -1;
781 }
782
783 doubleKernels[vectorSize][1] =
784 clCreateKernel(doublePrograms[vectorSize][1], "test", &error);
785 if (NULL == kernels[vectorSize][1])
786 {
787 gFailCount++;
788 vlog_error("\t\tFAILED -- Failed to create double private "
789 "kernel. (%d)\n",
790 error);
791 return error;
792 }
793
794 if (g_arrVecSizes[vectorSize] == 3)
795 {
796 doublePrograms[vectorSize][2] =
797 MakeProgram(device, double_source_local_store_v3,
798 sizeof(double_source_local_store_v3)
799 / sizeof(double_source_local_store_v3[0]));
800 }
801 else
802 {
803 doublePrograms[vectorSize][2] =
804 MakeProgram(device, double_source_local_store,
805 sizeof(double_source_local_store)
806 / sizeof(double_source_local_store[0]));
807 }
808 if (NULL == doublePrograms[vectorSize][2])
809 {
810 gFailCount++;
811 return -1;
812 }
813
814 doubleKernels[vectorSize][2] =
815 clCreateKernel(doublePrograms[vectorSize][2], "test", &error);
816 if (NULL == kernels[vectorSize][2])
817 {
818 gFailCount++;
819 vlog_error("\t\tFAILED -- Failed to create double local "
820 "kernel. (%d)\n",
821 error);
822 return error;
823 }
824 }
825 } // end for vector size
826
827 const char *reset[] = {
828 "__kernel void reset( __global float *p, __global ushort *f,\n"
829 " uint extra_last_thread)\n"
830 "{\n"
831 " size_t i = get_global_id(0);\n"
832 " *(f + i) = 0xdead;"
833 "}\n"
834 };
835
836 if (!gHostReset)
837 {
838 resetProgram =
839 MakeProgram(device, reset, sizeof(reset) / sizeof(reset[0]));
840 if (NULL == resetProgram)
841 {
842 gFailCount++;
843 return -1;
844 }
845 resetKernel = clCreateKernel(resetProgram, "reset", &error);
846 if (NULL == resetKernel)
847 {
848 gFailCount++;
849 return -1;
850 }
851 }
852
853 // Figure out how many elements are in a work block
854 size_t elementSize = std::max(sizeof(cl_ushort), sizeof(float));
855 size_t blockCount = BUFFER_SIZE / elementSize; // elementSize is power of 2
856 uint64_t lastCase = 1ULL << (8 * sizeof(float)); // number of floats.
857 size_t stride = blockCount;
858
859 if (gWimpyMode)
860 stride = (uint64_t)blockCount * (uint64_t)gWimpyReductionFactor;
861
862 // we handle 64-bit types a bit differently.
863 if (lastCase == 0) lastCase = 0x100000000ULL;
864
865 uint64_t i, j;
866 error = 0;
867 uint64_t printMask = (lastCase >> 4) - 1;
868 cl_uint count = 0;
869 int addressSpace;
870 size_t loopCount;
871 cl_uint threadCount = GetThreadCount();
872
873 ComputeReferenceInfoF fref;
874 fref.x = (float *)gIn_single;
875 fref.r = (cl_half *)gOut_half_reference;
876 fref.f = referenceFunc;
877 fref.lim = blockCount;
878 fref.count = (blockCount + threadCount - 1) / threadCount;
879
880 CheckResultInfoF fchk;
881 fchk.x = (const float *)gIn_single;
882 fchk.r = (const cl_half *)gOut_half_reference;
883 fchk.s = (const cl_half *)gOut_half;
884 fchk.f = referenceFunc;
885 fchk.lim = blockCount;
886 fchk.count = (blockCount + threadCount - 1) / threadCount;
887
888 ComputeReferenceInfoD dref;
889 dref.x = (double *)gIn_double;
890 dref.r = (cl_half *)gOut_half_reference_double;
891 dref.f = doubleReferenceFunc;
892 dref.lim = blockCount;
893 dref.count = (blockCount + threadCount - 1) / threadCount;
894
895 CheckResultInfoD dchk;
896 dchk.x = (const double *)gIn_double;
897 dchk.r = (const cl_half *)gOut_half_reference_double;
898 dchk.s = (const cl_half *)gOut_half;
899 dchk.f = doubleReferenceFunc;
900 dchk.lim = blockCount;
901 dchk.count = (blockCount + threadCount - 1) / threadCount;
902
903 for (i = 0; i < lastCase; i += stride)
904 {
905 count = (cl_uint)std::min((uint64_t)blockCount, lastCase - i);
906 fref.i = i;
907 dref.i = i;
908
909 // Compute the input and reference
910 ThreadPool_Do(ReferenceF, threadCount, &fref);
911
912 error = clEnqueueWriteBuffer(gQueue, gInBuffer_single, CL_FALSE, 0,
913 count * sizeof(float), gIn_single, 0, NULL,
914 NULL);
915 if (error)
916 {
917 vlog_error("Failure in clWriteBuffer\n");
918 gFailCount++;
919 goto exit;
920 }
921
922 if (gTestDouble)
923 {
924 ThreadPool_Do(ReferenceD, threadCount, &dref);
925
926 error = clEnqueueWriteBuffer(gQueue, gInBuffer_double, CL_FALSE, 0,
927 count * sizeof(double), gIn_double, 0,
928 NULL, NULL);
929 if (error)
930 {
931 vlog_error("Failure in clWriteBuffer\n");
932 gFailCount++;
933 goto exit;
934 }
935 }
936
937 for (vectorSize = kMinVectorSize; vectorSize < kLastVectorSizeToTest;
938 vectorSize++)
939 {
940 // Loop through vector sizes
941 fchk.vsz = g_arrVecSizes[vectorSize];
942 dchk.vsz = g_arrVecSizes[vectorSize];
943
944 for (addressSpace = 0; addressSpace < 3; addressSpace++)
945 {
946 // Loop over address spaces
947 fchk.aspace = addressSpaceNames[addressSpace];
948 dchk.aspace = addressSpaceNames[addressSpace];
949
950 if (!gHostReset)
951 {
952 error = RunKernel(device, resetKernel, gInBuffer_single,
953 gOutBuffer_half, count, 0);
954 }
955 else
956 {
957 cl_uint pattern = 0xdeaddead;
958 memset_pattern4(gOut_half, &pattern, BUFFER_SIZE / 2);
959
960 error = clEnqueueWriteBuffer(
961 gQueue, gOutBuffer_half, CL_FALSE, 0,
962 count * sizeof(cl_half), gOut_half, 0, NULL, NULL);
963 }
964 if (error)
965 {
966 vlog_error("Failure in clWriteArray\n");
967 gFailCount++;
968 goto exit;
969 }
970
971 error = RunKernel(device, kernels[vectorSize][addressSpace],
972 gInBuffer_single, gOutBuffer_half,
973 numVecs(count, vectorSize, aligned),
974 runsOverBy(count, vectorSize, aligned));
975 if (error)
976 {
977 gFailCount++;
978 goto exit;
979 }
980
981 error = clEnqueueReadBuffer(gQueue, gOutBuffer_half, CL_TRUE, 0,
982 count * sizeof(cl_half), gOut_half,
983 0, NULL, NULL);
984 if (error)
985 {
986 vlog_error("Failure in clReadArray\n");
987 gFailCount++;
988 goto exit;
989 }
990
991 error = ThreadPool_Do(CheckF, threadCount, &fchk);
992 if (error)
993 {
994 gFailCount++;
995 goto exit;
996 }
997
998 if (gTestDouble)
999 {
1000
1001 if (!gHostReset)
1002 {
1003 error = RunKernel(device, resetKernel, gInBuffer_double,
1004 gOutBuffer_half, count, 0);
1005 }
1006 else
1007 {
1008 cl_uint pattern = 0xdeaddead;
1009 memset_pattern4(gOut_half, &pattern, BUFFER_SIZE / 2);
1010
1011 error = clEnqueueWriteBuffer(
1012 gQueue, gOutBuffer_half, CL_FALSE, 0,
1013 count * sizeof(cl_half), gOut_half, 0, NULL, NULL);
1014 }
1015 if (error)
1016 {
1017 vlog_error("Failure in clWriteArray\n");
1018 gFailCount++;
1019 goto exit;
1020 }
1021
1022 error = RunKernel(device,
1023 doubleKernels[vectorSize][addressSpace],
1024 gInBuffer_double, gOutBuffer_half,
1025 numVecs(count, vectorSize, aligned),
1026 runsOverBy(count, vectorSize, aligned));
1027 if (error)
1028 {
1029 gFailCount++;
1030 goto exit;
1031 }
1032
1033 error = clEnqueueReadBuffer(
1034 gQueue, gOutBuffer_half, CL_TRUE, 0,
1035 count * sizeof(cl_half), gOut_half, 0, NULL, NULL);
1036 if (error)
1037 {
1038 vlog_error("Failure in clReadArray\n");
1039 gFailCount++;
1040 goto exit;
1041 }
1042
1043
1044 error = ThreadPool_Do(CheckD, threadCount, &dchk);
1045 if (error)
1046 {
1047 gFailCount++;
1048 goto exit;
1049 }
1050 }
1051 }
1052 }
1053
1054 if (((i + blockCount) & ~printMask) == (i + blockCount))
1055 {
1056 vlog(".");
1057 fflush(stdout);
1058 }
1059 } // end last case
1060
1061 loopCount = count == blockCount ? 1 : 100;
1062 if (gReportTimes)
1063 {
1064 // Init the input stream
1065 cl_float *p = (cl_float *)gIn_single;
1066 for (j = 0; j < count; j++)
1067 p[j] = (float)((double)(rand() - RAND_MAX / 2) / (RAND_MAX / 2));
1068
1069 if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer_single, CL_TRUE, 0,
1070 count * sizeof(float), gIn_single, 0,
1071 NULL, NULL)))
1072 {
1073 vlog_error("Failure in clWriteArray\n");
1074 gFailCount++;
1075 goto exit;
1076 }
1077
1078 if (gTestDouble)
1079 {
1080 // Init the input stream
1081 cl_double *q = (cl_double *)gIn_double;
1082 for (j = 0; j < count; j++)
1083 q[j] = ((double)(rand() - RAND_MAX / 2) / (RAND_MAX / 2));
1084
1085 if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer_double, CL_TRUE,
1086 0, count * sizeof(double),
1087 gIn_double, 0, NULL, NULL)))
1088 {
1089 vlog_error("Failure in clWriteArray\n");
1090 gFailCount++;
1091 goto exit;
1092 }
1093 }
1094
1095 // Run again for timing
1096 for (vectorSize = kMinVectorSize; vectorSize < kLastVectorSizeToTest;
1097 vectorSize++)
1098 {
1099 uint64_t bestTime = -1ULL;
1100 for (j = 0; j < loopCount; j++)
1101 {
1102 uint64_t startTime = ReadTime();
1103
1104
1105 if ((error = RunKernel(device, kernels[vectorSize][0],
1106 gInBuffer_single, gOutBuffer_half,
1107 numVecs(count, vectorSize, aligned),
1108 runsOverBy(count, vectorSize, aligned))))
1109 {
1110 gFailCount++;
1111 goto exit;
1112 }
1113
1114 if ((error = clFinish(gQueue)))
1115 {
1116 vlog_error("Failure in clFinish\n");
1117 gFailCount++;
1118 goto exit;
1119 }
1120 uint64_t currentTime = ReadTime() - startTime;
1121 if (currentTime < bestTime) bestTime = currentTime;
1122 time[vectorSize] += currentTime;
1123 }
1124 if (bestTime < min_time[vectorSize])
1125 min_time[vectorSize] = bestTime;
1126
1127 if (gTestDouble)
1128 {
1129 bestTime = -1ULL;
1130 for (j = 0; j < loopCount; j++)
1131 {
1132 uint64_t startTime = ReadTime();
1133 if ((error =
1134 RunKernel(device, doubleKernels[vectorSize][0],
1135 gInBuffer_double, gOutBuffer_half,
1136 numVecs(count, vectorSize, aligned),
1137 runsOverBy(count, vectorSize, aligned))))
1138 {
1139 gFailCount++;
1140 goto exit;
1141 }
1142
1143 if ((error = clFinish(gQueue)))
1144 {
1145 vlog_error("Failure in clFinish\n");
1146 gFailCount++;
1147 goto exit;
1148 }
1149 uint64_t currentTime = ReadTime() - startTime;
1150 if (currentTime < bestTime) bestTime = currentTime;
1151 doubleTime[vectorSize] += currentTime;
1152 }
1153 if (bestTime < min_double_time[vectorSize])
1154 min_double_time[vectorSize] = bestTime;
1155 }
1156 }
1157 }
1158
1159 if (gReportTimes)
1160 {
1161 for (vectorSize = kMinVectorSize; vectorSize < kLastVectorSizeToTest;
1162 vectorSize++)
1163 vlog_perf(SubtractTime(time[vectorSize], 0) * 1e6 * gDeviceFrequency
1164 * gComputeDevices / (double)(count * loopCount),
1165 0, "average us/elem",
1166 "vStoreHalf%s avg. (%s vector size: %d)", roundName,
1167 addressSpaceNames[0], (g_arrVecSizes[vectorSize]));
1168 for (vectorSize = kMinVectorSize; vectorSize < kLastVectorSizeToTest;
1169 vectorSize++)
1170 vlog_perf(SubtractTime(min_time[vectorSize], 0) * 1e6
1171 * gDeviceFrequency * gComputeDevices / (double)count,
1172 0, "best us/elem",
1173 "vStoreHalf%s best (%s vector size: %d)", roundName,
1174 addressSpaceNames[0], (g_arrVecSizes[vectorSize]));
1175 if (gTestDouble)
1176 {
1177 for (vectorSize = kMinVectorSize;
1178 vectorSize < kLastVectorSizeToTest; vectorSize++)
1179 vlog_perf(SubtractTime(doubleTime[vectorSize], 0) * 1e6
1180 * gDeviceFrequency * gComputeDevices
1181 / (double)(count * loopCount),
1182 0, "average us/elem (double)",
1183 "vStoreHalf%s avg. d (%s vector size: %d)", roundName,
1184 addressSpaceNames[0], (g_arrVecSizes[vectorSize]));
1185 for (vectorSize = kMinVectorSize;
1186 vectorSize < kLastVectorSizeToTest; vectorSize++)
1187 vlog_perf(SubtractTime(min_double_time[vectorSize], 0) * 1e6
1188 * gDeviceFrequency * gComputeDevices
1189 / (double)count,
1190 0, "best us/elem (double)",
1191 "vStoreHalf%s best d (%s vector size: %d)", roundName,
1192 addressSpaceNames[0], (g_arrVecSizes[vectorSize]));
1193 }
1194 }
1195
1196 exit:
1197 // clean up
1198 if (!gHostReset)
1199 {
1200 clReleaseKernel(resetKernel);
1201 clReleaseProgram(resetProgram);
1202 }
1203
1204 for (vectorSize = kMinVectorSize; vectorSize < kLastVectorSizeToTest;
1205 vectorSize++)
1206 {
1207 for (addressSpace = 0; addressSpace < 3; addressSpace++)
1208 {
1209 clReleaseKernel(kernels[vectorSize][addressSpace]);
1210 clReleaseProgram(programs[vectorSize][addressSpace]);
1211 if (gTestDouble)
1212 {
1213 clReleaseKernel(doubleKernels[vectorSize][addressSpace]);
1214 clReleaseProgram(doublePrograms[vectorSize][addressSpace]);
1215 }
1216 }
1217 }
1218
1219 return error;
1220 }
1221
Test_vStoreaHalf_private(cl_device_id device,f2h referenceFunc,d2h doubleReferenceFunc,const char * roundName)1222 int Test_vStoreaHalf_private(cl_device_id device, f2h referenceFunc,
1223 d2h doubleReferenceFunc, const char *roundName)
1224 {
1225 int vectorSize, error;
1226 cl_program programs[kVectorSizeCount + kStrangeVectorSizeCount][3];
1227 cl_kernel kernels[kVectorSizeCount + kStrangeVectorSizeCount][3];
1228 cl_program resetProgram;
1229 cl_kernel resetKernel;
1230
1231 uint64_t time[kVectorSizeCount + kStrangeVectorSizeCount] = { 0 };
1232 uint64_t min_time[kVectorSizeCount + kStrangeVectorSizeCount] = { 0 };
1233 memset(min_time, -1, sizeof(min_time));
1234 cl_program doublePrograms[kVectorSizeCount + kStrangeVectorSizeCount][3];
1235 cl_kernel doubleKernels[kVectorSizeCount + kStrangeVectorSizeCount][3];
1236 uint64_t doubleTime[kVectorSizeCount + kStrangeVectorSizeCount] = { 0 };
1237 uint64_t min_double_time[kVectorSizeCount + kStrangeVectorSizeCount] = {
1238 0
1239 };
1240 memset(min_double_time, -1, sizeof(min_double_time));
1241
1242 bool aligned = true;
1243
1244 int minVectorSize = kMinVectorSize;
1245 // There is no aligned scalar vstorea_half
1246 if (0 == minVectorSize) minVectorSize = 1;
1247
1248 // Loop over vector sizes
1249 for (vectorSize = minVectorSize; vectorSize < kLastVectorSizeToTest;
1250 vectorSize++)
1251 {
1252 const char *source[] = { "__kernel void test( __global float",
1253 vector_size_name_extensions[vectorSize],
1254 " *p, __global half *f )\n"
1255 "{\n"
1256 " size_t i = get_global_id(0);\n"
1257 " vstorea_half",
1258 vector_size_name_extensions[vectorSize],
1259 roundName,
1260 "( p[i], i, f );\n"
1261 "}\n" };
1262
1263 const char *source_v3[] = {
1264 "__kernel void test( __global float3 *p, __global half *f )\n"
1265 "{\n"
1266 " size_t i = get_global_id(0);\n"
1267 " vstorea_half3",
1268 roundName,
1269 "( p[i], i, f );\n"
1270 " vstore_half",
1271 roundName,
1272 "( ((__global float *)p)[4*i+3], 4*i+3, f);\n"
1273 "}\n"
1274 };
1275
1276 const char *source_private[] = {
1277 "__kernel void test( __global float",
1278 vector_size_name_extensions[vectorSize],
1279 " *p, __global half *f )\n"
1280 "{\n"
1281 " __private float",
1282 vector_size_name_extensions[vectorSize],
1283 " data;\n"
1284 " size_t i = get_global_id(0);\n"
1285 " data = p[i];\n"
1286 " vstorea_half",
1287 vector_size_name_extensions[vectorSize],
1288 roundName,
1289 "( data, i, f );\n"
1290 "}\n"
1291 };
1292
1293 const char *source_private_v3[] = {
1294 "__kernel void test( __global float3 *p, __global half *f )\n"
1295 "{\n"
1296 " __private float",
1297 vector_size_name_extensions[vectorSize],
1298 " data;\n"
1299 " size_t i = get_global_id(0);\n"
1300 " data = p[i];\n"
1301 " vstorea_half3",
1302 roundName,
1303 "( data, i, f );\n"
1304 " vstore_half",
1305 roundName,
1306 "( ((__global float *)p)[4*i+3], 4*i+3, f);\n"
1307 "}\n"
1308 };
1309
1310 char local_buf_size[10];
1311 sprintf(local_buf_size, "%lld", (uint64_t)gWorkGroupSize);
1312 const char *source_local[] = { "__kernel void test( __global float",
1313 vector_size_name_extensions[vectorSize],
1314 " *p, __global half *f )\n"
1315 "{\n"
1316 " __local float",
1317 vector_size_name_extensions[vectorSize],
1318 " data[",
1319 local_buf_size,
1320 "];\n"
1321 " size_t i = get_global_id(0);\n"
1322 " size_t lid = get_local_id(0);\n"
1323 " data[lid] = p[i];\n"
1324 " vstorea_half",
1325 vector_size_name_extensions[vectorSize],
1326 roundName,
1327 "( data[lid], i, f );\n"
1328 "}\n" };
1329
1330 const char *source_local_v3[] = {
1331 "__kernel void test( __global float",
1332 vector_size_name_extensions[vectorSize],
1333 " *p, __global half *f )\n"
1334 "{\n"
1335 " __local float",
1336 vector_size_name_extensions[vectorSize],
1337 " data[",
1338 local_buf_size,
1339 "];\n"
1340 " size_t i = get_global_id(0);\n"
1341 " size_t lid = get_local_id(0);\n"
1342 " data[lid] = p[i];\n"
1343 " vstorea_half",
1344 vector_size_name_extensions[vectorSize],
1345 roundName,
1346 "( data[lid], i, f );\n"
1347 " vstore_half",
1348 roundName,
1349 "( ((__global float *)p)[4*i+3], 4*i+3, f);\n"
1350 "}\n"
1351 };
1352
1353 const char *double_source[] = {
1354 "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
1355 "__kernel void test( __global double",
1356 vector_size_name_extensions[vectorSize],
1357 " *p, __global half *f )\n"
1358 "{\n"
1359 " size_t i = get_global_id(0);\n"
1360 " vstorea_half",
1361 vector_size_name_extensions[vectorSize],
1362 roundName,
1363 "( p[i], i, f );\n"
1364 "}\n"
1365 };
1366
1367 const char *double_source_v3[] = {
1368 "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
1369 "__kernel void test( __global double",
1370 vector_size_name_extensions[vectorSize],
1371 " *p, __global half *f )\n"
1372 "{\n"
1373 " size_t i = get_global_id(0);\n"
1374 " vstorea_half",
1375 vector_size_name_extensions[vectorSize],
1376 roundName,
1377 "( p[i], i, f );\n"
1378 " vstore_half",
1379 roundName,
1380 "( ((__global double *)p)[4*i+3], 4*i+3, f);\n"
1381 "}\n"
1382 };
1383
1384 const char *double_source_private[] = {
1385 "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
1386 "__kernel void test( __global double",
1387 vector_size_name_extensions[vectorSize],
1388 " *p, __global half *f )\n"
1389 "{\n"
1390 " __private double",
1391 vector_size_name_extensions[vectorSize],
1392 " data;\n"
1393 " size_t i = get_global_id(0);\n"
1394 " data = p[i];\n"
1395 " vstorea_half",
1396 vector_size_name_extensions[vectorSize],
1397 roundName,
1398 "( data, i, f );\n"
1399 "}\n"
1400 };
1401
1402 const char *double_source_private_v3[] = {
1403 "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
1404 "__kernel void test( __global double",
1405 vector_size_name_extensions[vectorSize],
1406 " *p, __global half *f )\n"
1407 "{\n"
1408 " __private double",
1409 vector_size_name_extensions[vectorSize],
1410 " data;\n"
1411 " size_t i = get_global_id(0);\n"
1412 " data = p[i];\n"
1413 " vstorea_half",
1414 vector_size_name_extensions[vectorSize],
1415 roundName,
1416 "( data, i, f );\n"
1417 " vstore_half",
1418 roundName,
1419 "( ((__global double *)p)[4*i+3], 4*i+3, f);\n"
1420 "}\n"
1421 };
1422
1423 const char *double_source_local[] = {
1424 "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
1425 "__kernel void test( __global double",
1426 vector_size_name_extensions[vectorSize],
1427 " *p, __global half *f )\n"
1428 "{\n"
1429 " __local double",
1430 vector_size_name_extensions[vectorSize],
1431 " data[",
1432 local_buf_size,
1433 "];\n"
1434 " size_t i = get_global_id(0);\n"
1435 " size_t lid = get_local_id(0);\n"
1436 " data[lid] = p[i];\n"
1437 " vstorea_half",
1438 vector_size_name_extensions[vectorSize],
1439 roundName,
1440 "( data[lid], i, f );\n"
1441 "}\n"
1442 };
1443
1444 const char *double_source_local_v3[] = {
1445 "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
1446 "__kernel void test( __global double",
1447 vector_size_name_extensions[vectorSize],
1448 " *p, __global half *f )\n"
1449 "{\n"
1450 " __local double",
1451 vector_size_name_extensions[vectorSize],
1452 " data[",
1453 local_buf_size,
1454 "];\n"
1455 " size_t i = get_global_id(0);\n"
1456 " size_t lid = get_local_id(0);\n"
1457 " data[lid] = p[i];\n"
1458 " vstorea_half",
1459 vector_size_name_extensions[vectorSize],
1460 roundName,
1461 "( data[lid], i, f );\n"
1462 " vstore_half",
1463 roundName,
1464 "( ((__global double *)p)[4*i+3], 4*i+3, f);\n"
1465 "}\n"
1466 };
1467
1468 if (g_arrVecSizes[vectorSize] == 3)
1469 {
1470 programs[vectorSize][0] = MakeProgram(
1471 device, source_v3, sizeof(source_v3) / sizeof(source_v3[0]));
1472 if (NULL == programs[vectorSize][0])
1473 {
1474 gFailCount++;
1475 return -1;
1476 }
1477 }
1478 else
1479 {
1480 programs[vectorSize][0] =
1481 MakeProgram(device, source, sizeof(source) / sizeof(source[0]));
1482 if (NULL == programs[vectorSize][0])
1483 {
1484 gFailCount++;
1485 return -1;
1486 }
1487 }
1488
1489 kernels[vectorSize][0] =
1490 clCreateKernel(programs[vectorSize][0], "test", &error);
1491 if (NULL == kernels[vectorSize][0])
1492 {
1493 gFailCount++;
1494 vlog_error("\t\tFAILED -- Failed to create kernel. (%d)\n", error);
1495 return error;
1496 }
1497
1498 if (g_arrVecSizes[vectorSize] == 3)
1499 {
1500 programs[vectorSize][1] = MakeProgram(
1501 device, source_private_v3,
1502 sizeof(source_private_v3) / sizeof(source_private_v3[0]));
1503 if (NULL == programs[vectorSize][1])
1504 {
1505 gFailCount++;
1506 return -1;
1507 }
1508 }
1509 else
1510 {
1511 programs[vectorSize][1] =
1512 MakeProgram(device, source_private,
1513 sizeof(source_private) / sizeof(source_private[0]));
1514 if (NULL == programs[vectorSize][1])
1515 {
1516 gFailCount++;
1517 return -1;
1518 }
1519 }
1520
1521 kernels[vectorSize][1] =
1522 clCreateKernel(programs[vectorSize][1], "test", &error);
1523 if (NULL == kernels[vectorSize][1])
1524 {
1525 gFailCount++;
1526 vlog_error("\t\tFAILED -- Failed to create private kernel. (%d)\n",
1527 error);
1528 return error;
1529 }
1530
1531 if (g_arrVecSizes[vectorSize] == 3)
1532 {
1533 programs[vectorSize][2] = MakeProgram(
1534 device, source_local_v3,
1535 sizeof(source_local_v3) / sizeof(source_local_v3[0]));
1536 if (NULL == programs[vectorSize][2])
1537 {
1538 gFailCount++;
1539 return -1;
1540 }
1541 }
1542 else
1543 {
1544 programs[vectorSize][2] =
1545 MakeProgram(device, source_local,
1546 sizeof(source_local) / sizeof(source_local[0]));
1547 if (NULL == programs[vectorSize][2])
1548 {
1549 gFailCount++;
1550 return -1;
1551 }
1552 }
1553
1554 kernels[vectorSize][2] =
1555 clCreateKernel(programs[vectorSize][2], "test", &error);
1556 if (NULL == kernels[vectorSize][2])
1557 {
1558 gFailCount++;
1559 vlog_error("\t\tFAILED -- Failed to create local kernel. (%d)\n",
1560 error);
1561 return error;
1562 }
1563
1564 if (gTestDouble)
1565 {
1566 if (g_arrVecSizes[vectorSize] == 3)
1567 {
1568 doublePrograms[vectorSize][0] = MakeProgram(
1569 device, double_source_v3,
1570 sizeof(double_source_v3) / sizeof(double_source_v3[0]));
1571 if (NULL == doublePrograms[vectorSize][0])
1572 {
1573 gFailCount++;
1574 return -1;
1575 }
1576 }
1577 else
1578 {
1579 doublePrograms[vectorSize][0] = MakeProgram(
1580 device, double_source,
1581 sizeof(double_source) / sizeof(double_source[0]));
1582 if (NULL == doublePrograms[vectorSize][0])
1583 {
1584 gFailCount++;
1585 return -1;
1586 }
1587 }
1588
1589 doubleKernels[vectorSize][0] =
1590 clCreateKernel(doublePrograms[vectorSize][0], "test", &error);
1591 if (NULL == kernels[vectorSize][0])
1592 {
1593 gFailCount++;
1594 vlog_error(
1595 "\t\tFAILED -- Failed to create double kernel. (%d)\n",
1596 error);
1597 return error;
1598 }
1599
1600 if (g_arrVecSizes[vectorSize] == 3)
1601 {
1602 doublePrograms[vectorSize][1] =
1603 MakeProgram(device, double_source_private_v3,
1604 sizeof(double_source_private_v3)
1605 / sizeof(double_source_private_v3[0]));
1606 if (NULL == doublePrograms[vectorSize][1])
1607 {
1608 gFailCount++;
1609 return -1;
1610 }
1611 }
1612 else
1613 {
1614 doublePrograms[vectorSize][1] =
1615 MakeProgram(device, double_source_private,
1616 sizeof(double_source_private)
1617 / sizeof(double_source_private[0]));
1618 if (NULL == doublePrograms[vectorSize][1])
1619 {
1620 gFailCount++;
1621 return -1;
1622 }
1623 }
1624
1625 doubleKernels[vectorSize][1] =
1626 clCreateKernel(doublePrograms[vectorSize][1], "test", &error);
1627 if (NULL == kernels[vectorSize][1])
1628 {
1629 gFailCount++;
1630 vlog_error("\t\tFAILED -- Failed to create double private "
1631 "kernel. (%d)\n",
1632 error);
1633 return error;
1634 }
1635
1636 if (g_arrVecSizes[vectorSize] == 3)
1637 {
1638 doublePrograms[vectorSize][2] =
1639 MakeProgram(device, double_source_local_v3,
1640 sizeof(double_source_local_v3)
1641 / sizeof(double_source_local_v3[0]));
1642 if (NULL == doublePrograms[vectorSize][2])
1643 {
1644 gFailCount++;
1645 return -1;
1646 }
1647 }
1648 else
1649 {
1650 doublePrograms[vectorSize][2] =
1651 MakeProgram(device, double_source_local,
1652 sizeof(double_source_local)
1653 / sizeof(double_source_local[0]));
1654 if (NULL == doublePrograms[vectorSize][2])
1655 {
1656 gFailCount++;
1657 return -1;
1658 }
1659 }
1660
1661 doubleKernels[vectorSize][2] =
1662 clCreateKernel(doublePrograms[vectorSize][2], "test", &error);
1663 if (NULL == kernels[vectorSize][2])
1664 {
1665 gFailCount++;
1666 vlog_error("\t\tFAILED -- Failed to create double local "
1667 "kernel. (%d)\n",
1668 error);
1669 return error;
1670 }
1671 }
1672 }
1673
1674 const char *reset[] = {
1675 "__kernel void reset( __global float *p, __global ushort *f,\n"
1676 " uint extra_last_thread)\n"
1677 "{\n"
1678 " size_t i = get_global_id(0);\n"
1679 " *(f + i) = 0xdead;"
1680 "}\n"
1681 };
1682
1683 if (!gHostReset)
1684 {
1685 resetProgram =
1686 MakeProgram(device, reset, sizeof(reset) / sizeof(reset[0]));
1687 if (NULL == resetProgram)
1688 {
1689 gFailCount++;
1690 return -1;
1691 }
1692 resetKernel = clCreateKernel(resetProgram, "reset", &error);
1693 if (NULL == resetKernel)
1694 {
1695 gFailCount++;
1696 return -1;
1697 }
1698 }
1699
1700 // Figure out how many elements are in a work block
1701 size_t elementSize = std::max(sizeof(cl_ushort), sizeof(float));
1702 size_t blockCount = BUFFER_SIZE / elementSize;
1703 uint64_t lastCase = 1ULL << (8 * sizeof(float));
1704 size_t stride = blockCount;
1705
1706 if (gWimpyMode)
1707 stride = (uint64_t)blockCount * (uint64_t)gWimpyReductionFactor;
1708
1709 // we handle 64-bit types a bit differently.
1710 if (lastCase == 0) lastCase = 0x100000000ULL;
1711 uint64_t i, j;
1712 error = 0;
1713 uint64_t printMask = (lastCase >> 4) - 1;
1714 cl_uint count = 0;
1715 int addressSpace;
1716 size_t loopCount;
1717 cl_uint threadCount = GetThreadCount();
1718
1719 ComputeReferenceInfoF fref;
1720 fref.x = (float *)gIn_single;
1721 fref.r = (cl_half *)gOut_half_reference;
1722 fref.f = referenceFunc;
1723 fref.lim = blockCount;
1724 fref.count = (blockCount + threadCount - 1) / threadCount;
1725
1726 CheckResultInfoF fchk;
1727 fchk.x = (const float *)gIn_single;
1728 fchk.r = (const cl_half *)gOut_half_reference;
1729 fchk.s = (const cl_half *)gOut_half;
1730 fchk.f = referenceFunc;
1731 fchk.lim = blockCount;
1732 fchk.count = (blockCount + threadCount - 1) / threadCount;
1733
1734 ComputeReferenceInfoD dref;
1735 dref.x = (double *)gIn_double;
1736 dref.r = (cl_half *)gOut_half_reference_double;
1737 dref.f = doubleReferenceFunc;
1738 dref.lim = blockCount;
1739 dref.count = (blockCount + threadCount - 1) / threadCount;
1740
1741 CheckResultInfoD dchk;
1742 dchk.x = (const double *)gIn_double;
1743 dchk.r = (const cl_half *)gOut_half_reference_double;
1744 dchk.s = (const cl_half *)gOut_half;
1745 dchk.f = doubleReferenceFunc;
1746 dchk.lim = blockCount;
1747 dchk.count = (blockCount + threadCount - 1) / threadCount;
1748
1749 for (i = 0; i < (uint64_t)lastCase; i += stride)
1750 {
1751 count = (cl_uint)std::min((uint64_t)blockCount, lastCase - i);
1752 fref.i = i;
1753 dref.i = i;
1754
1755 // Create the input and reference
1756 ThreadPool_Do(ReferenceF, threadCount, &fref);
1757
1758 error = clEnqueueWriteBuffer(gQueue, gInBuffer_single, CL_FALSE, 0,
1759 count * sizeof(float), gIn_single, 0, NULL,
1760 NULL);
1761 if (error)
1762 {
1763 vlog_error("Failure in clWriteArray\n");
1764 gFailCount++;
1765 goto exit;
1766 }
1767
1768 if (gTestDouble)
1769 {
1770 ThreadPool_Do(ReferenceD, threadCount, &dref);
1771
1772 error = clEnqueueWriteBuffer(gQueue, gInBuffer_double, CL_FALSE, 0,
1773 count * sizeof(double), gIn_double, 0,
1774 NULL, NULL);
1775 if (error)
1776 {
1777 vlog_error("Failure in clWriteArray\n");
1778 gFailCount++;
1779 goto exit;
1780 }
1781 }
1782
1783 for (vectorSize = minVectorSize; vectorSize < kLastVectorSizeToTest;
1784 vectorSize++)
1785 {
1786 // Loop over vector legths
1787 fchk.vsz = g_arrVecSizes[vectorSize];
1788 dchk.vsz = g_arrVecSizes[vectorSize];
1789
1790 for (addressSpace = 0; addressSpace < 3; addressSpace++)
1791 {
1792 // Loop over address spaces
1793 fchk.aspace = addressSpaceNames[addressSpace];
1794 dchk.aspace = addressSpaceNames[addressSpace];
1795
1796 if (!gHostReset)
1797 {
1798 error = RunKernel(device, resetKernel, gInBuffer_single,
1799 gOutBuffer_half, count, 0);
1800 }
1801 else
1802 {
1803 cl_uint pattern = 0xdeaddead;
1804 memset_pattern4(gOut_half, &pattern, BUFFER_SIZE / 2);
1805
1806 error = clEnqueueWriteBuffer(
1807 gQueue, gOutBuffer_half, CL_FALSE, 0,
1808 count * sizeof(cl_half), gOut_half, 0, NULL, NULL);
1809 }
1810 if (error)
1811 {
1812 vlog_error("Failure in clWriteArray\n");
1813 gFailCount++;
1814 goto exit;
1815 }
1816
1817 error = RunKernel(device, kernels[vectorSize][addressSpace],
1818 gInBuffer_single, gOutBuffer_half,
1819 numVecs(count, vectorSize, aligned),
1820 runsOverBy(count, vectorSize, aligned));
1821 if (error)
1822 {
1823 gFailCount++;
1824 goto exit;
1825 }
1826
1827 error = clEnqueueReadBuffer(gQueue, gOutBuffer_half, CL_TRUE, 0,
1828 count * sizeof(cl_half), gOut_half,
1829 0, NULL, NULL);
1830 if (error)
1831 {
1832 vlog_error("Failure in clReadArray\n");
1833 gFailCount++;
1834 goto exit;
1835 }
1836
1837 error = ThreadPool_Do(CheckF, threadCount, &fchk);
1838 if (error)
1839 {
1840 gFailCount++;
1841 goto exit;
1842 }
1843
1844 if (gTestDouble)
1845 {
1846
1847 if (!gHostReset)
1848 {
1849 error = RunKernel(device, resetKernel, gInBuffer_single,
1850 gOutBuffer_half, count, 0);
1851 }
1852 else
1853 {
1854 cl_uint pattern = 0xdeaddead;
1855 memset_pattern4(gOut_half, &pattern, BUFFER_SIZE / 2);
1856
1857 error = clEnqueueWriteBuffer(
1858 gQueue, gOutBuffer_half, CL_FALSE, 0,
1859 count * sizeof(cl_half), gOut_half, 0, NULL, NULL);
1860 }
1861 if (error)
1862 {
1863 vlog_error("Failure in clWriteArray\n");
1864 gFailCount++;
1865 goto exit;
1866 }
1867
1868 error = RunKernel(device,
1869 doubleKernels[vectorSize][addressSpace],
1870 gInBuffer_double, gOutBuffer_half,
1871 numVecs(count, vectorSize, aligned),
1872 runsOverBy(count, vectorSize, aligned));
1873 if (error)
1874 {
1875 gFailCount++;
1876 goto exit;
1877 }
1878
1879 error = clEnqueueReadBuffer(
1880 gQueue, gOutBuffer_half, CL_TRUE, 0,
1881 count * sizeof(cl_half), gOut_half, 0, NULL, NULL);
1882 if (error)
1883 {
1884 vlog_error("Failure in clReadArray\n");
1885 gFailCount++;
1886 goto exit;
1887 }
1888
1889 error = ThreadPool_Do(CheckD, threadCount, &dchk);
1890 if (error)
1891 {
1892 gFailCount++;
1893 goto exit;
1894 }
1895 }
1896 }
1897 } // end for vector size
1898
1899 if (((i + blockCount) & ~printMask) == (i + blockCount))
1900 {
1901 vlog(".");
1902 fflush(stdout);
1903 }
1904 } // for end lastcase
1905
1906 loopCount = count == blockCount ? 1 : 100;
1907 if (gReportTimes)
1908 {
1909 // Init the input stream
1910 cl_float *p = (cl_float *)gIn_single;
1911 for (j = 0; j < count; j++)
1912 p[j] = (float)((double)(rand() - RAND_MAX / 2) / (RAND_MAX / 2));
1913
1914 if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer_single, CL_TRUE, 0,
1915 count * sizeof(float), gIn_single, 0,
1916 NULL, NULL)))
1917 {
1918 vlog_error("Failure in clWriteArray\n");
1919 gFailCount++;
1920 goto exit;
1921 }
1922
1923 if (gTestDouble)
1924 {
1925 // Init the input stream
1926 cl_double *q = (cl_double *)gIn_double;
1927 for (j = 0; j < count; j++)
1928 q[j] = ((double)(rand() - RAND_MAX / 2) / (RAND_MAX / 2));
1929
1930 if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer_double, CL_TRUE,
1931 0, count * sizeof(double),
1932 gIn_double, 0, NULL, NULL)))
1933 {
1934 vlog_error("Failure in clWriteArray\n");
1935 gFailCount++;
1936 goto exit;
1937 }
1938 }
1939
1940 // Run again for timing
1941 for (vectorSize = minVectorSize; vectorSize < kLastVectorSizeToTest;
1942 vectorSize++)
1943 {
1944 uint64_t bestTime = -1ULL;
1945 for (j = 0; j < loopCount; j++)
1946 {
1947 uint64_t startTime = ReadTime();
1948 if ((error = RunKernel(device, kernels[vectorSize][0],
1949 gInBuffer_single, gOutBuffer_half,
1950 numVecs(count, vectorSize, aligned),
1951 runsOverBy(count, vectorSize, aligned))))
1952 {
1953 gFailCount++;
1954 goto exit;
1955 }
1956
1957 if ((error = clFinish(gQueue)))
1958 {
1959 vlog_error("Failure in clFinish\n");
1960 gFailCount++;
1961 goto exit;
1962 }
1963 uint64_t currentTime = ReadTime() - startTime;
1964 if (currentTime < bestTime) bestTime = currentTime;
1965 time[vectorSize] += currentTime;
1966 }
1967 if (bestTime < min_time[vectorSize])
1968 min_time[vectorSize] = bestTime;
1969
1970 if (gTestDouble)
1971 {
1972 bestTime = -1ULL;
1973 for (j = 0; j < loopCount; j++)
1974 {
1975 uint64_t startTime = ReadTime();
1976 if ((error =
1977 RunKernel(device, doubleKernels[vectorSize][0],
1978 gInBuffer_double, gOutBuffer_half,
1979 numVecs(count, vectorSize, aligned),
1980 runsOverBy(count, vectorSize, aligned))))
1981 {
1982 gFailCount++;
1983 goto exit;
1984 }
1985
1986 if ((error = clFinish(gQueue)))
1987 {
1988 vlog_error("Failure in clFinish\n");
1989 gFailCount++;
1990 goto exit;
1991 }
1992 uint64_t currentTime = ReadTime() - startTime;
1993 if (currentTime < bestTime) bestTime = currentTime;
1994 doubleTime[vectorSize] += currentTime;
1995 }
1996 if (bestTime < min_double_time[vectorSize])
1997 min_double_time[vectorSize] = bestTime;
1998 }
1999 }
2000 }
2001
2002 if (gReportTimes)
2003 {
2004 for (vectorSize = minVectorSize; vectorSize < kLastVectorSizeToTest;
2005 vectorSize++)
2006 vlog_perf(SubtractTime(time[vectorSize], 0) * 1e6 * gDeviceFrequency
2007 * gComputeDevices / (double)(count * loopCount),
2008 0, "average us/elem",
2009 "vStoreaHalf%s avg. (%s vector size: %d)", roundName,
2010 addressSpaceNames[0], (g_arrVecSizes[vectorSize]));
2011 for (vectorSize = minVectorSize; vectorSize < kLastVectorSizeToTest;
2012 vectorSize++)
2013 vlog_perf(SubtractTime(min_time[vectorSize], 0) * 1e6
2014 * gDeviceFrequency * gComputeDevices / (double)count,
2015 0, "best us/elem",
2016 "vStoreaHalf%s best (%s vector size: %d)", roundName,
2017 addressSpaceNames[0], (g_arrVecSizes[vectorSize]));
2018 if (gTestDouble)
2019 {
2020 for (vectorSize = minVectorSize; vectorSize < kLastVectorSizeToTest;
2021 vectorSize++)
2022 vlog_perf(SubtractTime(doubleTime[vectorSize], 0) * 1e6
2023 * gDeviceFrequency * gComputeDevices
2024 / (double)(count * loopCount),
2025 0, "average us/elem (double)",
2026 "vStoreaHalf%s avg. d (%s vector size: %d)",
2027 roundName, addressSpaceNames[0],
2028 (g_arrVecSizes[vectorSize]));
2029 for (vectorSize = minVectorSize; vectorSize < kLastVectorSizeToTest;
2030 vectorSize++)
2031 vlog_perf(
2032 SubtractTime(min_double_time[vectorSize], 0) * 1e6
2033 * gDeviceFrequency * gComputeDevices / (double)count,
2034 0, "best us/elem (double)",
2035 "vStoreaHalf%s best d (%s vector size: %d)", roundName,
2036 addressSpaceNames[0], (g_arrVecSizes[vectorSize]));
2037 }
2038 }
2039
2040 exit:
2041 // clean up
2042 if (!gHostReset)
2043 {
2044 clReleaseKernel(resetKernel);
2045 clReleaseProgram(resetProgram);
2046 }
2047
2048 for (vectorSize = minVectorSize; vectorSize < kLastVectorSizeToTest;
2049 vectorSize++)
2050 {
2051 for (addressSpace = 0; addressSpace < 3; addressSpace++)
2052 {
2053 clReleaseKernel(kernels[vectorSize][addressSpace]);
2054 clReleaseProgram(programs[vectorSize][addressSpace]);
2055 if (gTestDouble)
2056 {
2057 clReleaseKernel(doubleKernels[vectorSize][addressSpace]);
2058 clReleaseProgram(doublePrograms[vectorSize][addressSpace]);
2059 }
2060 }
2061 }
2062
2063 return error;
2064 }
2065