xref: /aosp_15_r20/external/OpenCL-CTS/test_conformance/half/Test_vStoreHalf.cpp (revision 6467f958c7de8070b317fc65bcb0f6472e388d82)
1 //
2 // Copyright (c) 2017 The Khronos Group Inc.
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 //    http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 //
16 #include "harness/compat.h"
17 #include "harness/kernelHelpers.h"
18 #include "harness/testHarness.h"
19 
20 #include <string.h>
21 
22 #include <algorithm>
23 
24 #include "cl_utils.h"
25 #include "tests.h"
26 
27 #include <CL/cl_half.h>
28 
29 typedef struct ComputeReferenceInfoF_
30 {
31     float *x;
32     cl_ushort *r;
33     f2h f;
34     cl_ulong i;
35     cl_uint lim;
36     cl_uint count;
37 } ComputeReferenceInfoF;
38 
39 typedef struct ComputeReferenceInfoD_
40 {
41     double *x;
42     cl_ushort *r;
43     d2h f;
44     cl_ulong i;
45     cl_uint lim;
46     cl_uint count;
47 } ComputeReferenceInfoD;
48 
49 typedef struct CheckResultInfoF_
50 {
51     const float *x;
52     const cl_ushort *r;
53     const cl_ushort *s;
54     f2h f;
55     const char *aspace;
56     cl_uint lim;
57     cl_uint count;
58     int vsz;
59 } CheckResultInfoF;
60 
61 typedef struct CheckResultInfoD_
62 {
63     const double *x;
64     const cl_ushort *r;
65     const cl_ushort *s;
66     d2h f;
67     const char *aspace;
68     cl_uint lim;
69     cl_uint count;
70     int vsz;
71 } CheckResultInfoD;
72 
ReferenceF(cl_uint jid,cl_uint tid,void * userInfo)73 static cl_int ReferenceF(cl_uint jid, cl_uint tid, void *userInfo)
74 {
75     ComputeReferenceInfoF *cri = (ComputeReferenceInfoF *)userInfo;
76     cl_uint lim = cri->lim;
77     cl_uint count = cri->count;
78     cl_uint off = jid * count;
79     float *x = cri->x + off;
80     cl_ushort *r = cri->r + off;
81     f2h f = cri->f;
82     cl_ulong i = cri->i + off;
83     cl_uint j;
84 
85     if (off + count > lim) count = lim - off;
86 
87     for (j = 0; j < count; ++j)
88     {
89         x[j] = as_float((cl_uint)(i + j));
90         r[j] = f(x[j]);
91     }
92 
93     return 0;
94 }
95 
CheckF(cl_uint jid,cl_uint tid,void * userInfo)96 static cl_int CheckF(cl_uint jid, cl_uint tid, void *userInfo)
97 {
98     CheckResultInfoF *cri = (CheckResultInfoF *)userInfo;
99     cl_uint lim = cri->lim;
100     cl_uint count = cri->count;
101     cl_uint off = jid * count;
102     const float *x = cri->x + off;
103     const cl_ushort *r = cri->r + off;
104     const cl_ushort *s = cri->s + off;
105     f2h f = cri->f;
106     cl_uint j;
107     cl_ushort correct2 = f(0.0f);
108     cl_ushort correct3 = f(-0.0f);
109     cl_int ret = 0;
110 
111     if (off + count > lim) count = lim - off;
112 
113     if (!memcmp(r, s, count * sizeof(cl_ushort))) return 0;
114 
115     for (j = 0; j < count; j++)
116     {
117         if (s[j] == r[j]) continue;
118 
119         // Pass any NaNs
120         if ((s[j] & 0x7fff) > 0x7c00 && (r[j] & 0x7fff) > 0x7c00) continue;
121 
122         // retry per section 6.5.3.3
123         if (IsFloatSubnormal(x[j]) && (s[j] == correct2 || s[j] == correct3))
124             continue;
125 
126         // if reference result is subnormal, pass any zero
127         if (gIsEmbedded && IsHalfSubnormal(r[j])
128             && (s[j] == 0x0000 || s[j] == 0x8000))
129             continue;
130 
131         vlog_error("\nFailure at [%u] with %.6a: *0x%04x vs 0x%04x,  "
132                    "vector_size = %d, address_space = %s\n",
133                    j + off, x[j], r[j], s[j], cri->vsz, cri->aspace);
134 
135         ret = 1;
136         break;
137     }
138 
139     return ret;
140 }
141 
ReferenceD(cl_uint jid,cl_uint tid,void * userInfo)142 static cl_int ReferenceD(cl_uint jid, cl_uint tid, void *userInfo)
143 {
144     ComputeReferenceInfoD *cri = (ComputeReferenceInfoD *)userInfo;
145     cl_uint lim = cri->lim;
146     cl_uint count = cri->count;
147     cl_uint off = jid * count;
148     double *x = cri->x + off;
149     cl_ushort *r = cri->r + off;
150     d2h f = cri->f;
151     cl_uint j;
152     cl_ulong i = cri->i + off;
153 
154     if (off + count > lim) count = lim - off;
155 
156     for (j = 0; j < count; ++j)
157     {
158         x[j] = as_double(DoubleFromUInt((cl_uint)(i + j)));
159         r[j] = f(x[j]);
160     }
161 
162     return 0;
163 }
164 
CheckD(cl_uint jid,cl_uint tid,void * userInfo)165 static cl_int CheckD(cl_uint jid, cl_uint tid, void *userInfo)
166 {
167     CheckResultInfoD *cri = (CheckResultInfoD *)userInfo;
168     cl_uint lim = cri->lim;
169     cl_uint count = cri->count;
170     cl_uint off = jid * count;
171     const double *x = cri->x + off;
172     const cl_ushort *r = cri->r + off;
173     const cl_ushort *s = cri->s + off;
174     d2h f = cri->f;
175     cl_uint j;
176     cl_ushort correct2 = f(0.0);
177     cl_ushort correct3 = f(-0.0);
178     cl_int ret = 0;
179 
180     if (off + count > lim) count = lim - off;
181 
182     if (!memcmp(r, s, count * sizeof(cl_ushort))) return 0;
183 
184     for (j = 0; j < count; j++)
185     {
186         if (s[j] == r[j]) continue;
187 
188         // Pass any NaNs
189         if ((s[j] & 0x7fff) > 0x7c00 && (r[j] & 0x7fff) > 0x7c00) continue;
190 
191         if (IsDoubleSubnormal(x[j]) && (s[j] == correct2 || s[j] == correct3))
192             continue;
193 
194         // if reference result is subnormal, pass any zero result
195         if (gIsEmbedded && IsHalfSubnormal(r[j])
196             && (s[j] == 0x0000 || s[j] == 0x8000))
197             continue;
198 
199         vlog_error("\nFailure at [%u] with %.13la: *0x%04x vs 0x%04x, "
200                    "vector_size = %d, address space = %s (double precision)\n",
201                    j + off, x[j], r[j], s[j], cri->vsz, cri->aspace);
202 
203         ret = 1;
204         break;
205     }
206 
207     return ret;
208 }
209 
float2half_rte(float f)210 static cl_half float2half_rte(float f)
211 {
212     return cl_half_from_float(f, CL_HALF_RTE);
213 }
214 
float2half_rtz(float f)215 static cl_half float2half_rtz(float f)
216 {
217     return cl_half_from_float(f, CL_HALF_RTZ);
218 }
219 
float2half_rtp(float f)220 static cl_half float2half_rtp(float f)
221 {
222     return cl_half_from_float(f, CL_HALF_RTP);
223 }
224 
float2half_rtn(float f)225 static cl_half float2half_rtn(float f)
226 {
227     return cl_half_from_float(f, CL_HALF_RTN);
228 }
229 
double2half_rte(double f)230 static cl_half double2half_rte(double f)
231 {
232     return cl_half_from_double(f, CL_HALF_RTE);
233 }
234 
double2half_rtz(double f)235 static cl_half double2half_rtz(double f)
236 {
237     return cl_half_from_double(f, CL_HALF_RTZ);
238 }
239 
double2half_rtp(double f)240 static cl_half double2half_rtp(double f)
241 {
242     return cl_half_from_double(f, CL_HALF_RTP);
243 }
244 
double2half_rtn(double f)245 static cl_half double2half_rtn(double f)
246 {
247     return cl_half_from_double(f, CL_HALF_RTN);
248 }
249 
test_vstore_half(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)250 int test_vstore_half(cl_device_id deviceID, cl_context context,
251                      cl_command_queue queue, int num_elements)
252 {
253     switch (get_default_rounding_mode(deviceID))
254     {
255         case CL_FP_ROUND_TO_ZERO:
256             return Test_vStoreHalf_private(deviceID, float2half_rtz,
257                                            double2half_rte, "");
258         case 0: return -1;
259         default:
260             return Test_vStoreHalf_private(deviceID, float2half_rte,
261                                            double2half_rte, "");
262     }
263 }
264 
test_vstore_half_rte(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)265 int test_vstore_half_rte(cl_device_id deviceID, cl_context context,
266                          cl_command_queue queue, int num_elements)
267 {
268     return Test_vStoreHalf_private(deviceID, float2half_rte, double2half_rte,
269                                    "_rte");
270 }
271 
test_vstore_half_rtz(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)272 int test_vstore_half_rtz(cl_device_id deviceID, cl_context context,
273                          cl_command_queue queue, int num_elements)
274 {
275     return Test_vStoreHalf_private(deviceID, float2half_rtz, double2half_rtz,
276                                    "_rtz");
277 }
278 
test_vstore_half_rtp(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)279 int test_vstore_half_rtp(cl_device_id deviceID, cl_context context,
280                          cl_command_queue queue, int num_elements)
281 {
282     return Test_vStoreHalf_private(deviceID, float2half_rtp, double2half_rtp,
283                                    "_rtp");
284 }
285 
test_vstore_half_rtn(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)286 int test_vstore_half_rtn(cl_device_id deviceID, cl_context context,
287                          cl_command_queue queue, int num_elements)
288 {
289     return Test_vStoreHalf_private(deviceID, float2half_rtn, double2half_rtn,
290                                    "_rtn");
291 }
292 
test_vstorea_half(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)293 int test_vstorea_half(cl_device_id deviceID, cl_context context,
294                       cl_command_queue queue, int num_elements)
295 {
296     switch (get_default_rounding_mode(deviceID))
297     {
298         case CL_FP_ROUND_TO_ZERO:
299             return Test_vStoreaHalf_private(deviceID, float2half_rtz,
300                                             double2half_rte, "");
301         case 0: return -1;
302         default:
303             return Test_vStoreaHalf_private(deviceID, float2half_rte,
304                                             double2half_rte, "");
305     }
306 }
307 
test_vstorea_half_rte(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)308 int test_vstorea_half_rte(cl_device_id deviceID, cl_context context,
309                           cl_command_queue queue, int num_elements)
310 {
311     return Test_vStoreaHalf_private(deviceID, float2half_rte, double2half_rte,
312                                     "_rte");
313 }
314 
test_vstorea_half_rtz(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)315 int test_vstorea_half_rtz(cl_device_id deviceID, cl_context context,
316                           cl_command_queue queue, int num_elements)
317 {
318     return Test_vStoreaHalf_private(deviceID, float2half_rtz, double2half_rtz,
319                                     "_rtz");
320 }
321 
test_vstorea_half_rtp(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)322 int test_vstorea_half_rtp(cl_device_id deviceID, cl_context context,
323                           cl_command_queue queue, int num_elements)
324 {
325     return Test_vStoreaHalf_private(deviceID, float2half_rtp, double2half_rtp,
326                                     "_rtp");
327 }
328 
test_vstorea_half_rtn(cl_device_id deviceID,cl_context context,cl_command_queue queue,int num_elements)329 int test_vstorea_half_rtn(cl_device_id deviceID, cl_context context,
330                           cl_command_queue queue, int num_elements)
331 {
332     return Test_vStoreaHalf_private(deviceID, float2half_rtn, double2half_rtn,
333                                     "_rtn");
334 }
335 
336 #pragma mark -
337 
Test_vStoreHalf_private(cl_device_id device,f2h referenceFunc,d2h doubleReferenceFunc,const char * roundName)338 int Test_vStoreHalf_private(cl_device_id device, f2h referenceFunc,
339                             d2h doubleReferenceFunc, const char *roundName)
340 {
341     int vectorSize, error;
342     cl_program programs[kVectorSizeCount + kStrangeVectorSizeCount][3];
343     cl_kernel kernels[kVectorSizeCount + kStrangeVectorSizeCount][3];
344     cl_program resetProgram;
345     cl_kernel resetKernel;
346 
347     uint64_t time[kVectorSizeCount + kStrangeVectorSizeCount] = { 0 };
348     uint64_t min_time[kVectorSizeCount + kStrangeVectorSizeCount] = { 0 };
349     memset(min_time, -1, sizeof(min_time));
350     cl_program doublePrograms[kVectorSizeCount + kStrangeVectorSizeCount][3];
351     cl_kernel doubleKernels[kVectorSizeCount + kStrangeVectorSizeCount][3];
352     uint64_t doubleTime[kVectorSizeCount + kStrangeVectorSizeCount] = { 0 };
353     uint64_t min_double_time[kVectorSizeCount + kStrangeVectorSizeCount] = {
354         0
355     };
356     memset(min_double_time, -1, sizeof(min_double_time));
357 
358     bool aligned = false;
359 
360     for (vectorSize = kMinVectorSize; vectorSize < kLastVectorSizeToTest;
361          vectorSize++)
362     {
363         const char *source[] = { "__kernel void test( __global float",
364                                  vector_size_name_extensions[vectorSize],
365                                  " *p, __global half *f )\n"
366                                  "{\n"
367                                  "   size_t i = get_global_id(0);\n"
368                                  "   vstore_half",
369                                  vector_size_name_extensions[vectorSize],
370                                  roundName,
371                                  "( p[i], i, f );\n"
372                                  "}\n" };
373 
374         const char *source_v3[] = {
375             "__kernel void test( __global float *p, __global half *f,\n"
376             "                   uint extra_last_thread)\n"
377             "{\n"
378             "   size_t i = get_global_id(0);\n"
379             "   size_t last_i = get_global_size(0)-1;\n"
380             "   size_t adjust = 0;\n"
381             "   if(last_i == i && extra_last_thread != 0) {\n"
382             "     adjust = 3-extra_last_thread;\n"
383             "   } "
384             "   vstore_half3",
385             roundName,
386             "( vload3(i, p-adjust), i, f-adjust );\n"
387             "}\n"
388         };
389 
390         const char *source_private_store[] = {
391             "__kernel void test( __global float",
392             vector_size_name_extensions[vectorSize],
393             " *p, __global half *f )\n"
394             "{\n"
395             "   __private ushort data[16];\n"
396             "   size_t i = get_global_id(0);\n"
397             "   size_t offset = 0;\n"
398             "   size_t vecsize = vec_step(p[i]);\n"
399             "   vstore_half",
400             vector_size_name_extensions[vectorSize],
401             roundName,
402             "( p[i], 0, (__private half *)(&data[0]) );\n"
403             "   for(offset = 0; offset < vecsize; offset++)\n"
404             "   {\n"
405             "       vstore_half(vload_half(offset, (__private half *)data), 0, "
406             "&f[vecsize*i+offset]);\n"
407             "   }\n"
408             "}\n"
409         };
410 
411 
412         const char *source_private_store_v3[] = {
413             "__kernel void test( __global float *p, __global half *f,\n"
414             "                   uint extra_last_thread )\n"
415             "{\n"
416             "   __private ushort data[4];\n"
417             "   size_t i = get_global_id(0);\n"
418             "   size_t last_i = get_global_size(0)-1;\n"
419             "   size_t adjust = 0;\n"
420             "   size_t offset = 0;\n"
421             "   if(last_i == i && extra_last_thread != 0) {\n"
422             "     adjust = 3-extra_last_thread;\n"
423             "   } "
424             "   vstore_half3",
425             roundName,
426             "( vload3(i, p-adjust), 0, (__private half *)(&data[0]) );\n"
427             "   for(offset = 0; offset < 3; offset++)\n"
428             "   {\n"
429             "       vstore_half(vload_half(offset, (__private half *) data), "
430             "0, &f[3*i+offset-adjust]);\n"
431             "   }\n"
432             "}\n"
433         };
434 
435         char local_buf_size[10];
436         sprintf(local_buf_size, "%lld", (uint64_t)gWorkGroupSize);
437 
438 
439         const char *source_local_store[] = {
440             "__kernel void test( __global float",
441             vector_size_name_extensions[vectorSize],
442             " *p, __global half *f )\n"
443             "{\n"
444             "   __local ushort data[16*",
445             local_buf_size,
446             "];\n"
447             "   size_t i = get_global_id(0);\n"
448             "   size_t lid = get_local_id(0);\n"
449             "   size_t lsize = get_local_size(0);\n"
450             "   size_t vecsize = vec_step(p[0]);\n"
451             "   event_t async_event;\n"
452             "   vstore_half",
453             vector_size_name_extensions[vectorSize],
454             roundName,
455             "( p[i], lid, (__local half *)(&data[0]) );\n"
456             "   barrier( CLK_LOCAL_MEM_FENCE ); \n"
457             "   async_event = async_work_group_copy((__global ushort "
458             "*)f+vecsize*(i-lid), (__local ushort *)(&data[0]), vecsize*lsize, "
459             "0);\n" // investigate later
460             "   wait_group_events(1, &async_event);\n"
461             "}\n"
462         };
463 
464         const char *source_local_store_v3[] = {
465             "__kernel void test( __global float *p, __global half *f,\n"
466             "                   uint extra_last_thread )\n"
467             "{\n"
468             "   __local ushort data[3*(",
469             local_buf_size,
470             "+1)];\n"
471             "   size_t i = get_global_id(0);\n"
472             "   size_t lid = get_local_id(0);\n"
473             "   size_t last_i = get_global_size(0)-1;\n"
474             "   size_t adjust = 0;\n"
475             "   size_t lsize = get_local_size(0);\n"
476             "   event_t async_event;\n"
477             "   if(last_i == i && extra_last_thread != 0) {\n"
478             "     adjust = 3-extra_last_thread;\n"
479             "   } "
480             "   vstore_half3",
481             roundName,
482             "( vload3(i,p-adjust), lid, (__local half *)(&data[0]) );\n"
483             "   barrier( CLK_LOCAL_MEM_FENCE ); \n"
484             "   if (get_group_id(0) == (get_num_groups(0) - 1) &&\n"
485             "       extra_last_thread != 0) {\n"
486             "     adjust = 3-extra_last_thread;\n"
487             "   }\n"
488             "   async_event = async_work_group_copy(\n"
489             "       (__global ushort*)(f+3*(i-lid)),\n"
490             "       (__local ushort *)(&data[adjust]),\n"
491             "       lsize*3-adjust, 0);\n" // investigate later
492             "   wait_group_events(1, &async_event);\n"
493             "}\n"
494         };
495 
496         const char *double_source[] = {
497             "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
498             "__kernel void test( __global double",
499             vector_size_name_extensions[vectorSize],
500             " *p, __global half *f )\n"
501             "{\n"
502             "   size_t i = get_global_id(0);\n"
503             "   vstore_half",
504             vector_size_name_extensions[vectorSize],
505             roundName,
506             "( p[i], i, f );\n"
507             "}\n"
508         };
509 
510         const char *double_source_private_store[] = {
511             "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
512             "__kernel void test( __global double",
513             vector_size_name_extensions[vectorSize],
514             " *p, __global half *f )\n"
515             "{\n"
516             "   __private ushort data[16];\n"
517             "   size_t i = get_global_id(0);\n"
518             "   size_t offset = 0;\n"
519             "   size_t vecsize = vec_step(p[i]);\n"
520             "   vstore_half",
521             vector_size_name_extensions[vectorSize],
522             roundName,
523             "( p[i], 0, (__private half *)(&data[0]) );\n"
524             "   for(offset = 0; offset < vecsize; offset++)\n"
525             "   {\n"
526             "       vstore_half(vload_half(offset, (__private half *)data), 0, "
527             "&f[vecsize*i+offset]);\n"
528             "   }\n"
529             "}\n"
530         };
531 
532 
533         const char *double_source_local_store[] = {
534             "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
535             "__kernel void test( __global double",
536             vector_size_name_extensions[vectorSize],
537             " *p, __global half *f )\n"
538             "{\n"
539             "   __local ushort data[16*",
540             local_buf_size,
541             "];\n"
542             "   size_t i = get_global_id(0);\n"
543             "   size_t lid = get_local_id(0);\n"
544             "   size_t vecsize = vec_step(p[0]);\n"
545             "   size_t lsize = get_local_size(0);\n"
546             "   event_t async_event;\n"
547             "   vstore_half",
548             vector_size_name_extensions[vectorSize],
549             roundName,
550             "( p[i], lid, (__local half *)(&data[0]) );\n"
551             "   barrier( CLK_LOCAL_MEM_FENCE ); \n"
552             "   async_event = async_work_group_copy((__global ushort "
553             "*)(f+vecsize*(i-lid)), (__local ushort *)(&data[0]), "
554             "vecsize*lsize, 0);\n" // investigate later
555             "   wait_group_events(1, &async_event);\n"
556             "}\n"
557         };
558 
559 
560         const char *double_source_v3[] = {
561             "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
562             "__kernel void test( __global double *p, __global half *f ,\n"
563             "                   uint extra_last_thread)\n"
564             "{\n"
565             "   size_t i = get_global_id(0);\n"
566             "   size_t last_i = get_global_size(0)-1;\n"
567             "   size_t adjust = 0;\n"
568             "   if(last_i == i && extra_last_thread != 0) {\n"
569             "     adjust = 3-extra_last_thread;\n"
570             "   } "
571             "   vstore_half3",
572             roundName,
573             "( vload3(i,p-adjust), i, f -adjust);\n"
574             "}\n"
575         };
576 
577         const char *double_source_private_store_v3[] = {
578             "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
579             "__kernel void test( __global double *p, __global half *f,\n"
580             "                   uint extra_last_thread )\n"
581             "{\n"
582             "   __private ushort data[4];\n"
583             "   size_t i = get_global_id(0);\n"
584             "   size_t last_i = get_global_size(0)-1;\n"
585             "   size_t adjust = 0;\n"
586             "   size_t offset = 0;\n"
587             "   if(last_i == i && extra_last_thread != 0) {\n"
588             "     adjust = 3-extra_last_thread;\n"
589             "   } "
590             "   vstore_half3",
591             roundName,
592             "( vload3(i, p-adjust), 0, (__private half *)(&data[0]) );\n"
593             "   for(offset = 0; offset < 3; offset++)\n"
594             "   {\n"
595             "       vstore_half(vload_half(offset, (__private half *)data), 0, "
596             "&f[3*i+offset-adjust]);\n"
597             "   }\n"
598             "}\n"
599         };
600 
601         const char *double_source_local_store_v3[] = {
602             "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
603             "__kernel void test( __global double *p, __global half *f,\n"
604             "                   uint extra_last_thread )\n"
605             "{\n"
606             "   __local ushort data[3*(",
607             local_buf_size,
608             "+1)];\n"
609             "   size_t i = get_global_id(0);\n"
610             "   size_t lid = get_local_id(0);\n"
611             "   size_t last_i = get_global_size(0)-1;\n"
612             "   size_t adjust = 0;\n"
613             "   size_t lsize = get_local_size(0);\n"
614             "   event_t async_event;\n"
615             "   if(last_i == i && extra_last_thread != 0) {\n"
616             "     adjust = 3-extra_last_thread;\n"
617             "   }\n "
618             "   vstore_half3",
619             roundName,
620             "( vload3(i,p-adjust), lid, (__local half *)(&data[0]) );\n"
621             "   barrier( CLK_LOCAL_MEM_FENCE ); \n"
622             "   if (get_group_id(0) == (get_num_groups(0) - 1) &&\n"
623             "       extra_last_thread != 0) {\n"
624             "     adjust = 3-extra_last_thread;\n"
625             "   }\n"
626             "   async_event = async_work_group_copy(\n"
627             "       (__global ushort *)(f+3*(i-lid)),\n"
628             "       (__local ushort *)(&data[adjust]),\n"
629             "       lsize*3-adjust, 0);\n" // investigate later
630             "   wait_group_events(1, &async_event);\n"
631             "}\n"
632         };
633 
634 
635         if (g_arrVecSizes[vectorSize] == 3)
636         {
637             programs[vectorSize][0] = MakeProgram(
638                 device, source_v3, sizeof(source_v3) / sizeof(source_v3[0]));
639         }
640         else
641         {
642             programs[vectorSize][0] =
643                 MakeProgram(device, source, sizeof(source) / sizeof(source[0]));
644         }
645         if (NULL == programs[vectorSize][0])
646         {
647             gFailCount++;
648             return -1;
649         }
650 
651         kernels[vectorSize][0] =
652             clCreateKernel(programs[vectorSize][0], "test", &error);
653         if (NULL == kernels[vectorSize][0])
654         {
655             gFailCount++;
656             vlog_error("\t\tFAILED -- Failed to create kernel. (%d)\n", error);
657             return error;
658         }
659 
660         if (g_arrVecSizes[vectorSize] == 3)
661         {
662             programs[vectorSize][1] =
663                 MakeProgram(device, source_private_store_v3,
664                             sizeof(source_private_store_v3)
665                                 / sizeof(source_private_store_v3[0]));
666         }
667         else
668         {
669             programs[vectorSize][1] = MakeProgram(
670                 device, source_private_store,
671                 sizeof(source_private_store) / sizeof(source_private_store[0]));
672         }
673         if (NULL == programs[vectorSize][1])
674         {
675             gFailCount++;
676             return -1;
677         }
678 
679         kernels[vectorSize][1] =
680             clCreateKernel(programs[vectorSize][1], "test", &error);
681         if (NULL == kernels[vectorSize][1])
682         {
683             gFailCount++;
684             vlog_error("\t\tFAILED -- Failed to create private kernel. (%d)\n",
685                        error);
686             return error;
687         }
688 
689         if (g_arrVecSizes[vectorSize] == 3)
690         {
691             programs[vectorSize][2] =
692                 MakeProgram(device, source_local_store_v3,
693                             sizeof(source_local_store_v3)
694                                 / sizeof(source_local_store_v3[0]));
695             if (NULL == programs[vectorSize][2])
696             {
697                 unsigned q;
698                 for (q = 0; q < sizeof(source_local_store_v3)
699                          / sizeof(source_local_store_v3[0]);
700                      q++)
701                     vlog_error("%s", source_local_store_v3[q]);
702 
703                 gFailCount++;
704                 return -1;
705             }
706         }
707         else
708         {
709             programs[vectorSize][2] = MakeProgram(
710                 device, source_local_store,
711                 sizeof(source_local_store) / sizeof(source_local_store[0]));
712             if (NULL == programs[vectorSize][2])
713             {
714                 unsigned q;
715                 for (q = 0; q < sizeof(source_local_store)
716                          / sizeof(source_local_store[0]);
717                      q++)
718                     vlog_error("%s", source_local_store[q]);
719 
720                 gFailCount++;
721                 return -1;
722             }
723         }
724 
725         kernels[vectorSize][2] =
726             clCreateKernel(programs[vectorSize][2], "test", &error);
727         if (NULL == kernels[vectorSize][2])
728         {
729             gFailCount++;
730             vlog_error("\t\tFAILED -- Failed to create local kernel. (%d)\n",
731                        error);
732             return error;
733         }
734 
735         if (gTestDouble)
736         {
737             if (g_arrVecSizes[vectorSize] == 3)
738             {
739                 doublePrograms[vectorSize][0] = MakeProgram(
740                     device, double_source_v3,
741                     sizeof(double_source_v3) / sizeof(double_source_v3[0]));
742             }
743             else
744             {
745                 doublePrograms[vectorSize][0] = MakeProgram(
746                     device, double_source,
747                     sizeof(double_source) / sizeof(double_source[0]));
748             }
749             if (NULL == doublePrograms[vectorSize][0])
750             {
751                 gFailCount++;
752                 return -1;
753             }
754 
755             doubleKernels[vectorSize][0] =
756                 clCreateKernel(doublePrograms[vectorSize][0], "test", &error);
757             if (NULL == kernels[vectorSize][0])
758             {
759                 gFailCount++;
760                 vlog_error(
761                     "\t\tFAILED -- Failed to create double kernel. (%d)\n",
762                     error);
763                 return error;
764             }
765 
766             if (g_arrVecSizes[vectorSize] == 3)
767                 doublePrograms[vectorSize][1] = MakeProgram(
768                     device, double_source_private_store_v3,
769                     sizeof(double_source_private_store_v3)
770                         / sizeof(double_source_private_store_v3[0]));
771             else
772                 doublePrograms[vectorSize][1] =
773                     MakeProgram(device, double_source_private_store,
774                                 sizeof(double_source_private_store)
775                                     / sizeof(double_source_private_store[0]));
776 
777             if (NULL == doublePrograms[vectorSize][1])
778             {
779                 gFailCount++;
780                 return -1;
781             }
782 
783             doubleKernels[vectorSize][1] =
784                 clCreateKernel(doublePrograms[vectorSize][1], "test", &error);
785             if (NULL == kernels[vectorSize][1])
786             {
787                 gFailCount++;
788                 vlog_error("\t\tFAILED -- Failed to create double private "
789                            "kernel. (%d)\n",
790                            error);
791                 return error;
792             }
793 
794             if (g_arrVecSizes[vectorSize] == 3)
795             {
796                 doublePrograms[vectorSize][2] =
797                     MakeProgram(device, double_source_local_store_v3,
798                                 sizeof(double_source_local_store_v3)
799                                     / sizeof(double_source_local_store_v3[0]));
800             }
801             else
802             {
803                 doublePrograms[vectorSize][2] =
804                     MakeProgram(device, double_source_local_store,
805                                 sizeof(double_source_local_store)
806                                     / sizeof(double_source_local_store[0]));
807             }
808             if (NULL == doublePrograms[vectorSize][2])
809             {
810                 gFailCount++;
811                 return -1;
812             }
813 
814             doubleKernels[vectorSize][2] =
815                 clCreateKernel(doublePrograms[vectorSize][2], "test", &error);
816             if (NULL == kernels[vectorSize][2])
817             {
818                 gFailCount++;
819                 vlog_error("\t\tFAILED -- Failed to create double local "
820                            "kernel. (%d)\n",
821                            error);
822                 return error;
823             }
824         }
825     } // end for vector size
826 
827     const char *reset[] = {
828         "__kernel void reset( __global float *p, __global ushort *f,\n"
829         "                   uint extra_last_thread)\n"
830         "{\n"
831         "   size_t i = get_global_id(0);\n"
832         "   *(f + i) = 0xdead;"
833         "}\n"
834     };
835 
836     if (!gHostReset)
837     {
838         resetProgram =
839             MakeProgram(device, reset, sizeof(reset) / sizeof(reset[0]));
840         if (NULL == resetProgram)
841         {
842             gFailCount++;
843             return -1;
844         }
845         resetKernel = clCreateKernel(resetProgram, "reset", &error);
846         if (NULL == resetKernel)
847         {
848             gFailCount++;
849             return -1;
850         }
851     }
852 
853     // Figure out how many elements are in a work block
854     size_t elementSize = std::max(sizeof(cl_ushort), sizeof(float));
855     size_t blockCount = BUFFER_SIZE / elementSize; // elementSize is power of 2
856     uint64_t lastCase = 1ULL << (8 * sizeof(float)); // number of floats.
857     size_t stride = blockCount;
858 
859     if (gWimpyMode)
860         stride = (uint64_t)blockCount * (uint64_t)gWimpyReductionFactor;
861 
862     // we handle 64-bit types a bit differently.
863     if (lastCase == 0) lastCase = 0x100000000ULL;
864 
865     uint64_t i, j;
866     error = 0;
867     uint64_t printMask = (lastCase >> 4) - 1;
868     cl_uint count = 0;
869     int addressSpace;
870     size_t loopCount;
871     cl_uint threadCount = GetThreadCount();
872 
873     ComputeReferenceInfoF fref;
874     fref.x = (float *)gIn_single;
875     fref.r = (cl_half *)gOut_half_reference;
876     fref.f = referenceFunc;
877     fref.lim = blockCount;
878     fref.count = (blockCount + threadCount - 1) / threadCount;
879 
880     CheckResultInfoF fchk;
881     fchk.x = (const float *)gIn_single;
882     fchk.r = (const cl_half *)gOut_half_reference;
883     fchk.s = (const cl_half *)gOut_half;
884     fchk.f = referenceFunc;
885     fchk.lim = blockCount;
886     fchk.count = (blockCount + threadCount - 1) / threadCount;
887 
888     ComputeReferenceInfoD dref;
889     dref.x = (double *)gIn_double;
890     dref.r = (cl_half *)gOut_half_reference_double;
891     dref.f = doubleReferenceFunc;
892     dref.lim = blockCount;
893     dref.count = (blockCount + threadCount - 1) / threadCount;
894 
895     CheckResultInfoD dchk;
896     dchk.x = (const double *)gIn_double;
897     dchk.r = (const cl_half *)gOut_half_reference_double;
898     dchk.s = (const cl_half *)gOut_half;
899     dchk.f = doubleReferenceFunc;
900     dchk.lim = blockCount;
901     dchk.count = (blockCount + threadCount - 1) / threadCount;
902 
903     for (i = 0; i < lastCase; i += stride)
904     {
905         count = (cl_uint)std::min((uint64_t)blockCount, lastCase - i);
906         fref.i = i;
907         dref.i = i;
908 
909         // Compute the input and reference
910         ThreadPool_Do(ReferenceF, threadCount, &fref);
911 
912         error = clEnqueueWriteBuffer(gQueue, gInBuffer_single, CL_FALSE, 0,
913                                      count * sizeof(float), gIn_single, 0, NULL,
914                                      NULL);
915         if (error)
916         {
917             vlog_error("Failure in clWriteBuffer\n");
918             gFailCount++;
919             goto exit;
920         }
921 
922         if (gTestDouble)
923         {
924             ThreadPool_Do(ReferenceD, threadCount, &dref);
925 
926             error = clEnqueueWriteBuffer(gQueue, gInBuffer_double, CL_FALSE, 0,
927                                          count * sizeof(double), gIn_double, 0,
928                                          NULL, NULL);
929             if (error)
930             {
931                 vlog_error("Failure in clWriteBuffer\n");
932                 gFailCount++;
933                 goto exit;
934             }
935         }
936 
937         for (vectorSize = kMinVectorSize; vectorSize < kLastVectorSizeToTest;
938              vectorSize++)
939         {
940             // Loop through vector sizes
941             fchk.vsz = g_arrVecSizes[vectorSize];
942             dchk.vsz = g_arrVecSizes[vectorSize];
943 
944             for (addressSpace = 0; addressSpace < 3; addressSpace++)
945             {
946                 // Loop over address spaces
947                 fchk.aspace = addressSpaceNames[addressSpace];
948                 dchk.aspace = addressSpaceNames[addressSpace];
949 
950                 if (!gHostReset)
951                 {
952                     error = RunKernel(device, resetKernel, gInBuffer_single,
953                                       gOutBuffer_half, count, 0);
954                 }
955                 else
956                 {
957                     cl_uint pattern = 0xdeaddead;
958                     memset_pattern4(gOut_half, &pattern, BUFFER_SIZE / 2);
959 
960                     error = clEnqueueWriteBuffer(
961                         gQueue, gOutBuffer_half, CL_FALSE, 0,
962                         count * sizeof(cl_half), gOut_half, 0, NULL, NULL);
963                 }
964                 if (error)
965                 {
966                     vlog_error("Failure in clWriteArray\n");
967                     gFailCount++;
968                     goto exit;
969                 }
970 
971                 error = RunKernel(device, kernels[vectorSize][addressSpace],
972                                   gInBuffer_single, gOutBuffer_half,
973                                   numVecs(count, vectorSize, aligned),
974                                   runsOverBy(count, vectorSize, aligned));
975                 if (error)
976                 {
977                     gFailCount++;
978                     goto exit;
979                 }
980 
981                 error = clEnqueueReadBuffer(gQueue, gOutBuffer_half, CL_TRUE, 0,
982                                             count * sizeof(cl_half), gOut_half,
983                                             0, NULL, NULL);
984                 if (error)
985                 {
986                     vlog_error("Failure in clReadArray\n");
987                     gFailCount++;
988                     goto exit;
989                 }
990 
991                 error = ThreadPool_Do(CheckF, threadCount, &fchk);
992                 if (error)
993                 {
994                     gFailCount++;
995                     goto exit;
996                 }
997 
998                 if (gTestDouble)
999                 {
1000 
1001                     if (!gHostReset)
1002                     {
1003                         error = RunKernel(device, resetKernel, gInBuffer_double,
1004                                           gOutBuffer_half, count, 0);
1005                     }
1006                     else
1007                     {
1008                         cl_uint pattern = 0xdeaddead;
1009                         memset_pattern4(gOut_half, &pattern, BUFFER_SIZE / 2);
1010 
1011                         error = clEnqueueWriteBuffer(
1012                             gQueue, gOutBuffer_half, CL_FALSE, 0,
1013                             count * sizeof(cl_half), gOut_half, 0, NULL, NULL);
1014                     }
1015                     if (error)
1016                     {
1017                         vlog_error("Failure in clWriteArray\n");
1018                         gFailCount++;
1019                         goto exit;
1020                     }
1021 
1022                     error = RunKernel(device,
1023                                       doubleKernels[vectorSize][addressSpace],
1024                                       gInBuffer_double, gOutBuffer_half,
1025                                       numVecs(count, vectorSize, aligned),
1026                                       runsOverBy(count, vectorSize, aligned));
1027                     if (error)
1028                     {
1029                         gFailCount++;
1030                         goto exit;
1031                     }
1032 
1033                     error = clEnqueueReadBuffer(
1034                         gQueue, gOutBuffer_half, CL_TRUE, 0,
1035                         count * sizeof(cl_half), gOut_half, 0, NULL, NULL);
1036                     if (error)
1037                     {
1038                         vlog_error("Failure in clReadArray\n");
1039                         gFailCount++;
1040                         goto exit;
1041                     }
1042 
1043 
1044                     error = ThreadPool_Do(CheckD, threadCount, &dchk);
1045                     if (error)
1046                     {
1047                         gFailCount++;
1048                         goto exit;
1049                     }
1050                 }
1051             }
1052         }
1053 
1054         if (((i + blockCount) & ~printMask) == (i + blockCount))
1055         {
1056             vlog(".");
1057             fflush(stdout);
1058         }
1059     } // end last case
1060 
1061     loopCount = count == blockCount ? 1 : 100;
1062     if (gReportTimes)
1063     {
1064         // Init the input stream
1065         cl_float *p = (cl_float *)gIn_single;
1066         for (j = 0; j < count; j++)
1067             p[j] = (float)((double)(rand() - RAND_MAX / 2) / (RAND_MAX / 2));
1068 
1069         if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer_single, CL_TRUE, 0,
1070                                           count * sizeof(float), gIn_single, 0,
1071                                           NULL, NULL)))
1072         {
1073             vlog_error("Failure in clWriteArray\n");
1074             gFailCount++;
1075             goto exit;
1076         }
1077 
1078         if (gTestDouble)
1079         {
1080             // Init the input stream
1081             cl_double *q = (cl_double *)gIn_double;
1082             for (j = 0; j < count; j++)
1083                 q[j] = ((double)(rand() - RAND_MAX / 2) / (RAND_MAX / 2));
1084 
1085             if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer_double, CL_TRUE,
1086                                               0, count * sizeof(double),
1087                                               gIn_double, 0, NULL, NULL)))
1088             {
1089                 vlog_error("Failure in clWriteArray\n");
1090                 gFailCount++;
1091                 goto exit;
1092             }
1093         }
1094 
1095         // Run again for timing
1096         for (vectorSize = kMinVectorSize; vectorSize < kLastVectorSizeToTest;
1097              vectorSize++)
1098         {
1099             uint64_t bestTime = -1ULL;
1100             for (j = 0; j < loopCount; j++)
1101             {
1102                 uint64_t startTime = ReadTime();
1103 
1104 
1105                 if ((error = RunKernel(device, kernels[vectorSize][0],
1106                                        gInBuffer_single, gOutBuffer_half,
1107                                        numVecs(count, vectorSize, aligned),
1108                                        runsOverBy(count, vectorSize, aligned))))
1109                 {
1110                     gFailCount++;
1111                     goto exit;
1112                 }
1113 
1114                 if ((error = clFinish(gQueue)))
1115                 {
1116                     vlog_error("Failure in clFinish\n");
1117                     gFailCount++;
1118                     goto exit;
1119                 }
1120                 uint64_t currentTime = ReadTime() - startTime;
1121                 if (currentTime < bestTime) bestTime = currentTime;
1122                 time[vectorSize] += currentTime;
1123             }
1124             if (bestTime < min_time[vectorSize])
1125                 min_time[vectorSize] = bestTime;
1126 
1127             if (gTestDouble)
1128             {
1129                 bestTime = -1ULL;
1130                 for (j = 0; j < loopCount; j++)
1131                 {
1132                     uint64_t startTime = ReadTime();
1133                     if ((error =
1134                              RunKernel(device, doubleKernels[vectorSize][0],
1135                                        gInBuffer_double, gOutBuffer_half,
1136                                        numVecs(count, vectorSize, aligned),
1137                                        runsOverBy(count, vectorSize, aligned))))
1138                     {
1139                         gFailCount++;
1140                         goto exit;
1141                     }
1142 
1143                     if ((error = clFinish(gQueue)))
1144                     {
1145                         vlog_error("Failure in clFinish\n");
1146                         gFailCount++;
1147                         goto exit;
1148                     }
1149                     uint64_t currentTime = ReadTime() - startTime;
1150                     if (currentTime < bestTime) bestTime = currentTime;
1151                     doubleTime[vectorSize] += currentTime;
1152                 }
1153                 if (bestTime < min_double_time[vectorSize])
1154                     min_double_time[vectorSize] = bestTime;
1155             }
1156         }
1157     }
1158 
1159     if (gReportTimes)
1160     {
1161         for (vectorSize = kMinVectorSize; vectorSize < kLastVectorSizeToTest;
1162              vectorSize++)
1163             vlog_perf(SubtractTime(time[vectorSize], 0) * 1e6 * gDeviceFrequency
1164                           * gComputeDevices / (double)(count * loopCount),
1165                       0, "average us/elem",
1166                       "vStoreHalf%s avg. (%s vector size: %d)", roundName,
1167                       addressSpaceNames[0], (g_arrVecSizes[vectorSize]));
1168         for (vectorSize = kMinVectorSize; vectorSize < kLastVectorSizeToTest;
1169              vectorSize++)
1170             vlog_perf(SubtractTime(min_time[vectorSize], 0) * 1e6
1171                           * gDeviceFrequency * gComputeDevices / (double)count,
1172                       0, "best us/elem",
1173                       "vStoreHalf%s best (%s vector size: %d)", roundName,
1174                       addressSpaceNames[0], (g_arrVecSizes[vectorSize]));
1175         if (gTestDouble)
1176         {
1177             for (vectorSize = kMinVectorSize;
1178                  vectorSize < kLastVectorSizeToTest; vectorSize++)
1179                 vlog_perf(SubtractTime(doubleTime[vectorSize], 0) * 1e6
1180                               * gDeviceFrequency * gComputeDevices
1181                               / (double)(count * loopCount),
1182                           0, "average us/elem (double)",
1183                           "vStoreHalf%s avg. d (%s vector size: %d)", roundName,
1184                           addressSpaceNames[0], (g_arrVecSizes[vectorSize]));
1185             for (vectorSize = kMinVectorSize;
1186                  vectorSize < kLastVectorSizeToTest; vectorSize++)
1187                 vlog_perf(SubtractTime(min_double_time[vectorSize], 0) * 1e6
1188                               * gDeviceFrequency * gComputeDevices
1189                               / (double)count,
1190                           0, "best us/elem (double)",
1191                           "vStoreHalf%s best d (%s vector size: %d)", roundName,
1192                           addressSpaceNames[0], (g_arrVecSizes[vectorSize]));
1193         }
1194     }
1195 
1196 exit:
1197     // clean up
1198     if (!gHostReset)
1199     {
1200         clReleaseKernel(resetKernel);
1201         clReleaseProgram(resetProgram);
1202     }
1203 
1204     for (vectorSize = kMinVectorSize; vectorSize < kLastVectorSizeToTest;
1205          vectorSize++)
1206     {
1207         for (addressSpace = 0; addressSpace < 3; addressSpace++)
1208         {
1209             clReleaseKernel(kernels[vectorSize][addressSpace]);
1210             clReleaseProgram(programs[vectorSize][addressSpace]);
1211             if (gTestDouble)
1212             {
1213                 clReleaseKernel(doubleKernels[vectorSize][addressSpace]);
1214                 clReleaseProgram(doublePrograms[vectorSize][addressSpace]);
1215             }
1216         }
1217     }
1218 
1219     return error;
1220 }
1221 
Test_vStoreaHalf_private(cl_device_id device,f2h referenceFunc,d2h doubleReferenceFunc,const char * roundName)1222 int Test_vStoreaHalf_private(cl_device_id device, f2h referenceFunc,
1223                              d2h doubleReferenceFunc, const char *roundName)
1224 {
1225     int vectorSize, error;
1226     cl_program programs[kVectorSizeCount + kStrangeVectorSizeCount][3];
1227     cl_kernel kernels[kVectorSizeCount + kStrangeVectorSizeCount][3];
1228     cl_program resetProgram;
1229     cl_kernel resetKernel;
1230 
1231     uint64_t time[kVectorSizeCount + kStrangeVectorSizeCount] = { 0 };
1232     uint64_t min_time[kVectorSizeCount + kStrangeVectorSizeCount] = { 0 };
1233     memset(min_time, -1, sizeof(min_time));
1234     cl_program doublePrograms[kVectorSizeCount + kStrangeVectorSizeCount][3];
1235     cl_kernel doubleKernels[kVectorSizeCount + kStrangeVectorSizeCount][3];
1236     uint64_t doubleTime[kVectorSizeCount + kStrangeVectorSizeCount] = { 0 };
1237     uint64_t min_double_time[kVectorSizeCount + kStrangeVectorSizeCount] = {
1238         0
1239     };
1240     memset(min_double_time, -1, sizeof(min_double_time));
1241 
1242     bool aligned = true;
1243 
1244     int minVectorSize = kMinVectorSize;
1245     // There is no aligned scalar vstorea_half
1246     if (0 == minVectorSize) minVectorSize = 1;
1247 
1248     // Loop over vector sizes
1249     for (vectorSize = minVectorSize; vectorSize < kLastVectorSizeToTest;
1250          vectorSize++)
1251     {
1252         const char *source[] = { "__kernel void test( __global float",
1253                                  vector_size_name_extensions[vectorSize],
1254                                  " *p, __global half *f )\n"
1255                                  "{\n"
1256                                  "   size_t i = get_global_id(0);\n"
1257                                  "   vstorea_half",
1258                                  vector_size_name_extensions[vectorSize],
1259                                  roundName,
1260                                  "( p[i], i, f );\n"
1261                                  "}\n" };
1262 
1263         const char *source_v3[] = {
1264             "__kernel void test( __global float3 *p, __global half *f )\n"
1265             "{\n"
1266             "   size_t i = get_global_id(0);\n"
1267             "   vstorea_half3",
1268             roundName,
1269             "( p[i], i, f );\n"
1270             "   vstore_half",
1271             roundName,
1272             "( ((__global  float *)p)[4*i+3], 4*i+3, f);\n"
1273             "}\n"
1274         };
1275 
1276         const char *source_private[] = {
1277             "__kernel void test( __global float",
1278             vector_size_name_extensions[vectorSize],
1279             " *p, __global half *f )\n"
1280             "{\n"
1281             "   __private float",
1282             vector_size_name_extensions[vectorSize],
1283             " data;\n"
1284             "   size_t i = get_global_id(0);\n"
1285             "   data = p[i];\n"
1286             "   vstorea_half",
1287             vector_size_name_extensions[vectorSize],
1288             roundName,
1289             "( data, i, f );\n"
1290             "}\n"
1291         };
1292 
1293         const char *source_private_v3[] = {
1294             "__kernel void test( __global float3 *p, __global half *f )\n"
1295             "{\n"
1296             "   __private float",
1297             vector_size_name_extensions[vectorSize],
1298             " data;\n"
1299             "   size_t i = get_global_id(0);\n"
1300             "   data = p[i];\n"
1301             "   vstorea_half3",
1302             roundName,
1303             "( data, i, f );\n"
1304             "   vstore_half",
1305             roundName,
1306             "( ((__global  float *)p)[4*i+3], 4*i+3, f);\n"
1307             "}\n"
1308         };
1309 
1310         char local_buf_size[10];
1311         sprintf(local_buf_size, "%lld", (uint64_t)gWorkGroupSize);
1312         const char *source_local[] = { "__kernel void test( __global float",
1313                                        vector_size_name_extensions[vectorSize],
1314                                        " *p, __global half *f )\n"
1315                                        "{\n"
1316                                        "   __local float",
1317                                        vector_size_name_extensions[vectorSize],
1318                                        " data[",
1319                                        local_buf_size,
1320                                        "];\n"
1321                                        "   size_t i = get_global_id(0);\n"
1322                                        "   size_t lid = get_local_id(0);\n"
1323                                        "   data[lid] = p[i];\n"
1324                                        "   vstorea_half",
1325                                        vector_size_name_extensions[vectorSize],
1326                                        roundName,
1327                                        "( data[lid], i, f );\n"
1328                                        "}\n" };
1329 
1330         const char *source_local_v3[] = {
1331             "__kernel void test( __global float",
1332             vector_size_name_extensions[vectorSize],
1333             " *p, __global half *f )\n"
1334             "{\n"
1335             "   __local float",
1336             vector_size_name_extensions[vectorSize],
1337             " data[",
1338             local_buf_size,
1339             "];\n"
1340             "   size_t i = get_global_id(0);\n"
1341             "   size_t lid = get_local_id(0);\n"
1342             "   data[lid] = p[i];\n"
1343             "   vstorea_half",
1344             vector_size_name_extensions[vectorSize],
1345             roundName,
1346             "( data[lid], i, f );\n"
1347             "   vstore_half",
1348             roundName,
1349             "( ((__global float *)p)[4*i+3], 4*i+3, f);\n"
1350             "}\n"
1351         };
1352 
1353         const char *double_source[] = {
1354             "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
1355             "__kernel void test( __global double",
1356             vector_size_name_extensions[vectorSize],
1357             " *p, __global half *f )\n"
1358             "{\n"
1359             "   size_t i = get_global_id(0);\n"
1360             "   vstorea_half",
1361             vector_size_name_extensions[vectorSize],
1362             roundName,
1363             "( p[i], i, f );\n"
1364             "}\n"
1365         };
1366 
1367         const char *double_source_v3[] = {
1368             "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
1369             "__kernel void test( __global double",
1370             vector_size_name_extensions[vectorSize],
1371             " *p, __global half *f )\n"
1372             "{\n"
1373             "   size_t i = get_global_id(0);\n"
1374             "   vstorea_half",
1375             vector_size_name_extensions[vectorSize],
1376             roundName,
1377             "( p[i], i, f );\n"
1378             "   vstore_half",
1379             roundName,
1380             "( ((__global double *)p)[4*i+3], 4*i+3, f);\n"
1381             "}\n"
1382         };
1383 
1384         const char *double_source_private[] = {
1385             "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
1386             "__kernel void test( __global double",
1387             vector_size_name_extensions[vectorSize],
1388             " *p, __global half *f )\n"
1389             "{\n"
1390             "   __private double",
1391             vector_size_name_extensions[vectorSize],
1392             " data;\n"
1393             "   size_t i = get_global_id(0);\n"
1394             "   data = p[i];\n"
1395             "   vstorea_half",
1396             vector_size_name_extensions[vectorSize],
1397             roundName,
1398             "( data, i, f );\n"
1399             "}\n"
1400         };
1401 
1402         const char *double_source_private_v3[] = {
1403             "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
1404             "__kernel void test( __global double",
1405             vector_size_name_extensions[vectorSize],
1406             " *p, __global half *f )\n"
1407             "{\n"
1408             "   __private double",
1409             vector_size_name_extensions[vectorSize],
1410             " data;\n"
1411             "   size_t i = get_global_id(0);\n"
1412             "   data = p[i];\n"
1413             "   vstorea_half",
1414             vector_size_name_extensions[vectorSize],
1415             roundName,
1416             "( data, i, f );\n"
1417             "   vstore_half",
1418             roundName,
1419             "( ((__global  double *)p)[4*i+3], 4*i+3, f);\n"
1420             "}\n"
1421         };
1422 
1423         const char *double_source_local[] = {
1424             "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
1425             "__kernel void test( __global double",
1426             vector_size_name_extensions[vectorSize],
1427             " *p, __global half *f )\n"
1428             "{\n"
1429             "   __local double",
1430             vector_size_name_extensions[vectorSize],
1431             " data[",
1432             local_buf_size,
1433             "];\n"
1434             "   size_t i = get_global_id(0);\n"
1435             "   size_t lid = get_local_id(0);\n"
1436             "   data[lid] = p[i];\n"
1437             "   vstorea_half",
1438             vector_size_name_extensions[vectorSize],
1439             roundName,
1440             "( data[lid], i, f );\n"
1441             "}\n"
1442         };
1443 
1444         const char *double_source_local_v3[] = {
1445             "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
1446             "__kernel void test( __global double",
1447             vector_size_name_extensions[vectorSize],
1448             " *p, __global half *f )\n"
1449             "{\n"
1450             "   __local double",
1451             vector_size_name_extensions[vectorSize],
1452             " data[",
1453             local_buf_size,
1454             "];\n"
1455             "   size_t i = get_global_id(0);\n"
1456             "   size_t lid = get_local_id(0);\n"
1457             "   data[lid] = p[i];\n"
1458             "   vstorea_half",
1459             vector_size_name_extensions[vectorSize],
1460             roundName,
1461             "( data[lid], i, f );\n"
1462             "   vstore_half",
1463             roundName,
1464             "( ((__global double *)p)[4*i+3], 4*i+3, f);\n"
1465             "}\n"
1466         };
1467 
1468         if (g_arrVecSizes[vectorSize] == 3)
1469         {
1470             programs[vectorSize][0] = MakeProgram(
1471                 device, source_v3, sizeof(source_v3) / sizeof(source_v3[0]));
1472             if (NULL == programs[vectorSize][0])
1473             {
1474                 gFailCount++;
1475                 return -1;
1476             }
1477         }
1478         else
1479         {
1480             programs[vectorSize][0] =
1481                 MakeProgram(device, source, sizeof(source) / sizeof(source[0]));
1482             if (NULL == programs[vectorSize][0])
1483             {
1484                 gFailCount++;
1485                 return -1;
1486             }
1487         }
1488 
1489         kernels[vectorSize][0] =
1490             clCreateKernel(programs[vectorSize][0], "test", &error);
1491         if (NULL == kernels[vectorSize][0])
1492         {
1493             gFailCount++;
1494             vlog_error("\t\tFAILED -- Failed to create kernel. (%d)\n", error);
1495             return error;
1496         }
1497 
1498         if (g_arrVecSizes[vectorSize] == 3)
1499         {
1500             programs[vectorSize][1] = MakeProgram(
1501                 device, source_private_v3,
1502                 sizeof(source_private_v3) / sizeof(source_private_v3[0]));
1503             if (NULL == programs[vectorSize][1])
1504             {
1505                 gFailCount++;
1506                 return -1;
1507             }
1508         }
1509         else
1510         {
1511             programs[vectorSize][1] =
1512                 MakeProgram(device, source_private,
1513                             sizeof(source_private) / sizeof(source_private[0]));
1514             if (NULL == programs[vectorSize][1])
1515             {
1516                 gFailCount++;
1517                 return -1;
1518             }
1519         }
1520 
1521         kernels[vectorSize][1] =
1522             clCreateKernel(programs[vectorSize][1], "test", &error);
1523         if (NULL == kernels[vectorSize][1])
1524         {
1525             gFailCount++;
1526             vlog_error("\t\tFAILED -- Failed to create private kernel. (%d)\n",
1527                        error);
1528             return error;
1529         }
1530 
1531         if (g_arrVecSizes[vectorSize] == 3)
1532         {
1533             programs[vectorSize][2] = MakeProgram(
1534                 device, source_local_v3,
1535                 sizeof(source_local_v3) / sizeof(source_local_v3[0]));
1536             if (NULL == programs[vectorSize][2])
1537             {
1538                 gFailCount++;
1539                 return -1;
1540             }
1541         }
1542         else
1543         {
1544             programs[vectorSize][2] =
1545                 MakeProgram(device, source_local,
1546                             sizeof(source_local) / sizeof(source_local[0]));
1547             if (NULL == programs[vectorSize][2])
1548             {
1549                 gFailCount++;
1550                 return -1;
1551             }
1552         }
1553 
1554         kernels[vectorSize][2] =
1555             clCreateKernel(programs[vectorSize][2], "test", &error);
1556         if (NULL == kernels[vectorSize][2])
1557         {
1558             gFailCount++;
1559             vlog_error("\t\tFAILED -- Failed to create local kernel. (%d)\n",
1560                        error);
1561             return error;
1562         }
1563 
1564         if (gTestDouble)
1565         {
1566             if (g_arrVecSizes[vectorSize] == 3)
1567             {
1568                 doublePrograms[vectorSize][0] = MakeProgram(
1569                     device, double_source_v3,
1570                     sizeof(double_source_v3) / sizeof(double_source_v3[0]));
1571                 if (NULL == doublePrograms[vectorSize][0])
1572                 {
1573                     gFailCount++;
1574                     return -1;
1575                 }
1576             }
1577             else
1578             {
1579                 doublePrograms[vectorSize][0] = MakeProgram(
1580                     device, double_source,
1581                     sizeof(double_source) / sizeof(double_source[0]));
1582                 if (NULL == doublePrograms[vectorSize][0])
1583                 {
1584                     gFailCount++;
1585                     return -1;
1586                 }
1587             }
1588 
1589             doubleKernels[vectorSize][0] =
1590                 clCreateKernel(doublePrograms[vectorSize][0], "test", &error);
1591             if (NULL == kernels[vectorSize][0])
1592             {
1593                 gFailCount++;
1594                 vlog_error(
1595                     "\t\tFAILED -- Failed to create double kernel. (%d)\n",
1596                     error);
1597                 return error;
1598             }
1599 
1600             if (g_arrVecSizes[vectorSize] == 3)
1601             {
1602                 doublePrograms[vectorSize][1] =
1603                     MakeProgram(device, double_source_private_v3,
1604                                 sizeof(double_source_private_v3)
1605                                     / sizeof(double_source_private_v3[0]));
1606                 if (NULL == doublePrograms[vectorSize][1])
1607                 {
1608                     gFailCount++;
1609                     return -1;
1610                 }
1611             }
1612             else
1613             {
1614                 doublePrograms[vectorSize][1] =
1615                     MakeProgram(device, double_source_private,
1616                                 sizeof(double_source_private)
1617                                     / sizeof(double_source_private[0]));
1618                 if (NULL == doublePrograms[vectorSize][1])
1619                 {
1620                     gFailCount++;
1621                     return -1;
1622                 }
1623             }
1624 
1625             doubleKernels[vectorSize][1] =
1626                 clCreateKernel(doublePrograms[vectorSize][1], "test", &error);
1627             if (NULL == kernels[vectorSize][1])
1628             {
1629                 gFailCount++;
1630                 vlog_error("\t\tFAILED -- Failed to create double private "
1631                            "kernel. (%d)\n",
1632                            error);
1633                 return error;
1634             }
1635 
1636             if (g_arrVecSizes[vectorSize] == 3)
1637             {
1638                 doublePrograms[vectorSize][2] =
1639                     MakeProgram(device, double_source_local_v3,
1640                                 sizeof(double_source_local_v3)
1641                                     / sizeof(double_source_local_v3[0]));
1642                 if (NULL == doublePrograms[vectorSize][2])
1643                 {
1644                     gFailCount++;
1645                     return -1;
1646                 }
1647             }
1648             else
1649             {
1650                 doublePrograms[vectorSize][2] =
1651                     MakeProgram(device, double_source_local,
1652                                 sizeof(double_source_local)
1653                                     / sizeof(double_source_local[0]));
1654                 if (NULL == doublePrograms[vectorSize][2])
1655                 {
1656                     gFailCount++;
1657                     return -1;
1658                 }
1659             }
1660 
1661             doubleKernels[vectorSize][2] =
1662                 clCreateKernel(doublePrograms[vectorSize][2], "test", &error);
1663             if (NULL == kernels[vectorSize][2])
1664             {
1665                 gFailCount++;
1666                 vlog_error("\t\tFAILED -- Failed to create double local "
1667                            "kernel. (%d)\n",
1668                            error);
1669                 return error;
1670             }
1671         }
1672     }
1673 
1674     const char *reset[] = {
1675         "__kernel void reset( __global float *p, __global ushort *f,\n"
1676         "                   uint extra_last_thread)\n"
1677         "{\n"
1678         "   size_t i = get_global_id(0);\n"
1679         "   *(f + i) = 0xdead;"
1680         "}\n"
1681     };
1682 
1683     if (!gHostReset)
1684     {
1685         resetProgram =
1686             MakeProgram(device, reset, sizeof(reset) / sizeof(reset[0]));
1687         if (NULL == resetProgram)
1688         {
1689             gFailCount++;
1690             return -1;
1691         }
1692         resetKernel = clCreateKernel(resetProgram, "reset", &error);
1693         if (NULL == resetKernel)
1694         {
1695             gFailCount++;
1696             return -1;
1697         }
1698     }
1699 
1700     // Figure out how many elements are in a work block
1701     size_t elementSize = std::max(sizeof(cl_ushort), sizeof(float));
1702     size_t blockCount = BUFFER_SIZE / elementSize;
1703     uint64_t lastCase = 1ULL << (8 * sizeof(float));
1704     size_t stride = blockCount;
1705 
1706     if (gWimpyMode)
1707         stride = (uint64_t)blockCount * (uint64_t)gWimpyReductionFactor;
1708 
1709     // we handle 64-bit types a bit differently.
1710     if (lastCase == 0) lastCase = 0x100000000ULL;
1711     uint64_t i, j;
1712     error = 0;
1713     uint64_t printMask = (lastCase >> 4) - 1;
1714     cl_uint count = 0;
1715     int addressSpace;
1716     size_t loopCount;
1717     cl_uint threadCount = GetThreadCount();
1718 
1719     ComputeReferenceInfoF fref;
1720     fref.x = (float *)gIn_single;
1721     fref.r = (cl_half *)gOut_half_reference;
1722     fref.f = referenceFunc;
1723     fref.lim = blockCount;
1724     fref.count = (blockCount + threadCount - 1) / threadCount;
1725 
1726     CheckResultInfoF fchk;
1727     fchk.x = (const float *)gIn_single;
1728     fchk.r = (const cl_half *)gOut_half_reference;
1729     fchk.s = (const cl_half *)gOut_half;
1730     fchk.f = referenceFunc;
1731     fchk.lim = blockCount;
1732     fchk.count = (blockCount + threadCount - 1) / threadCount;
1733 
1734     ComputeReferenceInfoD dref;
1735     dref.x = (double *)gIn_double;
1736     dref.r = (cl_half *)gOut_half_reference_double;
1737     dref.f = doubleReferenceFunc;
1738     dref.lim = blockCount;
1739     dref.count = (blockCount + threadCount - 1) / threadCount;
1740 
1741     CheckResultInfoD dchk;
1742     dchk.x = (const double *)gIn_double;
1743     dchk.r = (const cl_half *)gOut_half_reference_double;
1744     dchk.s = (const cl_half *)gOut_half;
1745     dchk.f = doubleReferenceFunc;
1746     dchk.lim = blockCount;
1747     dchk.count = (blockCount + threadCount - 1) / threadCount;
1748 
1749     for (i = 0; i < (uint64_t)lastCase; i += stride)
1750     {
1751         count = (cl_uint)std::min((uint64_t)blockCount, lastCase - i);
1752         fref.i = i;
1753         dref.i = i;
1754 
1755         // Create the input and reference
1756         ThreadPool_Do(ReferenceF, threadCount, &fref);
1757 
1758         error = clEnqueueWriteBuffer(gQueue, gInBuffer_single, CL_FALSE, 0,
1759                                      count * sizeof(float), gIn_single, 0, NULL,
1760                                      NULL);
1761         if (error)
1762         {
1763             vlog_error("Failure in clWriteArray\n");
1764             gFailCount++;
1765             goto exit;
1766         }
1767 
1768         if (gTestDouble)
1769         {
1770             ThreadPool_Do(ReferenceD, threadCount, &dref);
1771 
1772             error = clEnqueueWriteBuffer(gQueue, gInBuffer_double, CL_FALSE, 0,
1773                                          count * sizeof(double), gIn_double, 0,
1774                                          NULL, NULL);
1775             if (error)
1776             {
1777                 vlog_error("Failure in clWriteArray\n");
1778                 gFailCount++;
1779                 goto exit;
1780             }
1781         }
1782 
1783         for (vectorSize = minVectorSize; vectorSize < kLastVectorSizeToTest;
1784              vectorSize++)
1785         {
1786             // Loop over vector legths
1787             fchk.vsz = g_arrVecSizes[vectorSize];
1788             dchk.vsz = g_arrVecSizes[vectorSize];
1789 
1790             for (addressSpace = 0; addressSpace < 3; addressSpace++)
1791             {
1792                 // Loop over address spaces
1793                 fchk.aspace = addressSpaceNames[addressSpace];
1794                 dchk.aspace = addressSpaceNames[addressSpace];
1795 
1796                 if (!gHostReset)
1797                 {
1798                     error = RunKernel(device, resetKernel, gInBuffer_single,
1799                                       gOutBuffer_half, count, 0);
1800                 }
1801                 else
1802                 {
1803                     cl_uint pattern = 0xdeaddead;
1804                     memset_pattern4(gOut_half, &pattern, BUFFER_SIZE / 2);
1805 
1806                     error = clEnqueueWriteBuffer(
1807                         gQueue, gOutBuffer_half, CL_FALSE, 0,
1808                         count * sizeof(cl_half), gOut_half, 0, NULL, NULL);
1809                 }
1810                 if (error)
1811                 {
1812                     vlog_error("Failure in clWriteArray\n");
1813                     gFailCount++;
1814                     goto exit;
1815                 }
1816 
1817                 error = RunKernel(device, kernels[vectorSize][addressSpace],
1818                                   gInBuffer_single, gOutBuffer_half,
1819                                   numVecs(count, vectorSize, aligned),
1820                                   runsOverBy(count, vectorSize, aligned));
1821                 if (error)
1822                 {
1823                     gFailCount++;
1824                     goto exit;
1825                 }
1826 
1827                 error = clEnqueueReadBuffer(gQueue, gOutBuffer_half, CL_TRUE, 0,
1828                                             count * sizeof(cl_half), gOut_half,
1829                                             0, NULL, NULL);
1830                 if (error)
1831                 {
1832                     vlog_error("Failure in clReadArray\n");
1833                     gFailCount++;
1834                     goto exit;
1835                 }
1836 
1837                 error = ThreadPool_Do(CheckF, threadCount, &fchk);
1838                 if (error)
1839                 {
1840                     gFailCount++;
1841                     goto exit;
1842                 }
1843 
1844                 if (gTestDouble)
1845                 {
1846 
1847                     if (!gHostReset)
1848                     {
1849                         error = RunKernel(device, resetKernel, gInBuffer_single,
1850                                           gOutBuffer_half, count, 0);
1851                     }
1852                     else
1853                     {
1854                         cl_uint pattern = 0xdeaddead;
1855                         memset_pattern4(gOut_half, &pattern, BUFFER_SIZE / 2);
1856 
1857                         error = clEnqueueWriteBuffer(
1858                             gQueue, gOutBuffer_half, CL_FALSE, 0,
1859                             count * sizeof(cl_half), gOut_half, 0, NULL, NULL);
1860                     }
1861                     if (error)
1862                     {
1863                         vlog_error("Failure in clWriteArray\n");
1864                         gFailCount++;
1865                         goto exit;
1866                     }
1867 
1868                     error = RunKernel(device,
1869                                       doubleKernels[vectorSize][addressSpace],
1870                                       gInBuffer_double, gOutBuffer_half,
1871                                       numVecs(count, vectorSize, aligned),
1872                                       runsOverBy(count, vectorSize, aligned));
1873                     if (error)
1874                     {
1875                         gFailCount++;
1876                         goto exit;
1877                     }
1878 
1879                     error = clEnqueueReadBuffer(
1880                         gQueue, gOutBuffer_half, CL_TRUE, 0,
1881                         count * sizeof(cl_half), gOut_half, 0, NULL, NULL);
1882                     if (error)
1883                     {
1884                         vlog_error("Failure in clReadArray\n");
1885                         gFailCount++;
1886                         goto exit;
1887                     }
1888 
1889                     error = ThreadPool_Do(CheckD, threadCount, &dchk);
1890                     if (error)
1891                     {
1892                         gFailCount++;
1893                         goto exit;
1894                     }
1895                 }
1896             }
1897         } // end for vector size
1898 
1899         if (((i + blockCount) & ~printMask) == (i + blockCount))
1900         {
1901             vlog(".");
1902             fflush(stdout);
1903         }
1904     } // for end lastcase
1905 
1906     loopCount = count == blockCount ? 1 : 100;
1907     if (gReportTimes)
1908     {
1909         // Init the input stream
1910         cl_float *p = (cl_float *)gIn_single;
1911         for (j = 0; j < count; j++)
1912             p[j] = (float)((double)(rand() - RAND_MAX / 2) / (RAND_MAX / 2));
1913 
1914         if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer_single, CL_TRUE, 0,
1915                                           count * sizeof(float), gIn_single, 0,
1916                                           NULL, NULL)))
1917         {
1918             vlog_error("Failure in clWriteArray\n");
1919             gFailCount++;
1920             goto exit;
1921         }
1922 
1923         if (gTestDouble)
1924         {
1925             // Init the input stream
1926             cl_double *q = (cl_double *)gIn_double;
1927             for (j = 0; j < count; j++)
1928                 q[j] = ((double)(rand() - RAND_MAX / 2) / (RAND_MAX / 2));
1929 
1930             if ((error = clEnqueueWriteBuffer(gQueue, gInBuffer_double, CL_TRUE,
1931                                               0, count * sizeof(double),
1932                                               gIn_double, 0, NULL, NULL)))
1933             {
1934                 vlog_error("Failure in clWriteArray\n");
1935                 gFailCount++;
1936                 goto exit;
1937             }
1938         }
1939 
1940         // Run again for timing
1941         for (vectorSize = minVectorSize; vectorSize < kLastVectorSizeToTest;
1942              vectorSize++)
1943         {
1944             uint64_t bestTime = -1ULL;
1945             for (j = 0; j < loopCount; j++)
1946             {
1947                 uint64_t startTime = ReadTime();
1948                 if ((error = RunKernel(device, kernels[vectorSize][0],
1949                                        gInBuffer_single, gOutBuffer_half,
1950                                        numVecs(count, vectorSize, aligned),
1951                                        runsOverBy(count, vectorSize, aligned))))
1952                 {
1953                     gFailCount++;
1954                     goto exit;
1955                 }
1956 
1957                 if ((error = clFinish(gQueue)))
1958                 {
1959                     vlog_error("Failure in clFinish\n");
1960                     gFailCount++;
1961                     goto exit;
1962                 }
1963                 uint64_t currentTime = ReadTime() - startTime;
1964                 if (currentTime < bestTime) bestTime = currentTime;
1965                 time[vectorSize] += currentTime;
1966             }
1967             if (bestTime < min_time[vectorSize])
1968                 min_time[vectorSize] = bestTime;
1969 
1970             if (gTestDouble)
1971             {
1972                 bestTime = -1ULL;
1973                 for (j = 0; j < loopCount; j++)
1974                 {
1975                     uint64_t startTime = ReadTime();
1976                     if ((error =
1977                              RunKernel(device, doubleKernels[vectorSize][0],
1978                                        gInBuffer_double, gOutBuffer_half,
1979                                        numVecs(count, vectorSize, aligned),
1980                                        runsOverBy(count, vectorSize, aligned))))
1981                     {
1982                         gFailCount++;
1983                         goto exit;
1984                     }
1985 
1986                     if ((error = clFinish(gQueue)))
1987                     {
1988                         vlog_error("Failure in clFinish\n");
1989                         gFailCount++;
1990                         goto exit;
1991                     }
1992                     uint64_t currentTime = ReadTime() - startTime;
1993                     if (currentTime < bestTime) bestTime = currentTime;
1994                     doubleTime[vectorSize] += currentTime;
1995                 }
1996                 if (bestTime < min_double_time[vectorSize])
1997                     min_double_time[vectorSize] = bestTime;
1998             }
1999         }
2000     }
2001 
2002     if (gReportTimes)
2003     {
2004         for (vectorSize = minVectorSize; vectorSize < kLastVectorSizeToTest;
2005              vectorSize++)
2006             vlog_perf(SubtractTime(time[vectorSize], 0) * 1e6 * gDeviceFrequency
2007                           * gComputeDevices / (double)(count * loopCount),
2008                       0, "average us/elem",
2009                       "vStoreaHalf%s avg. (%s vector size: %d)", roundName,
2010                       addressSpaceNames[0], (g_arrVecSizes[vectorSize]));
2011         for (vectorSize = minVectorSize; vectorSize < kLastVectorSizeToTest;
2012              vectorSize++)
2013             vlog_perf(SubtractTime(min_time[vectorSize], 0) * 1e6
2014                           * gDeviceFrequency * gComputeDevices / (double)count,
2015                       0, "best us/elem",
2016                       "vStoreaHalf%s best (%s vector size: %d)", roundName,
2017                       addressSpaceNames[0], (g_arrVecSizes[vectorSize]));
2018         if (gTestDouble)
2019         {
2020             for (vectorSize = minVectorSize; vectorSize < kLastVectorSizeToTest;
2021                  vectorSize++)
2022                 vlog_perf(SubtractTime(doubleTime[vectorSize], 0) * 1e6
2023                               * gDeviceFrequency * gComputeDevices
2024                               / (double)(count * loopCount),
2025                           0, "average us/elem (double)",
2026                           "vStoreaHalf%s avg. d (%s vector size: %d)",
2027                           roundName, addressSpaceNames[0],
2028                           (g_arrVecSizes[vectorSize]));
2029             for (vectorSize = minVectorSize; vectorSize < kLastVectorSizeToTest;
2030                  vectorSize++)
2031                 vlog_perf(
2032                     SubtractTime(min_double_time[vectorSize], 0) * 1e6
2033                         * gDeviceFrequency * gComputeDevices / (double)count,
2034                     0, "best us/elem (double)",
2035                     "vStoreaHalf%s best d (%s vector size: %d)", roundName,
2036                     addressSpaceNames[0], (g_arrVecSizes[vectorSize]));
2037         }
2038     }
2039 
2040 exit:
2041     // clean up
2042     if (!gHostReset)
2043     {
2044         clReleaseKernel(resetKernel);
2045         clReleaseProgram(resetProgram);
2046     }
2047 
2048     for (vectorSize = minVectorSize; vectorSize < kLastVectorSizeToTest;
2049          vectorSize++)
2050     {
2051         for (addressSpace = 0; addressSpace < 3; addressSpace++)
2052         {
2053             clReleaseKernel(kernels[vectorSize][addressSpace]);
2054             clReleaseProgram(programs[vectorSize][addressSpace]);
2055             if (gTestDouble)
2056             {
2057                 clReleaseKernel(doubleKernels[vectorSize][addressSpace]);
2058                 clReleaseProgram(doublePrograms[vectorSize][addressSpace]);
2059             }
2060         }
2061     }
2062 
2063     return error;
2064 }
2065