1 //
2 // Copyright (c) 2022 The Khronos Group Inc.
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 // http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 //
16
17 #include "common.h"
18
19 #include "utility.h" // for sizeNames and sizeValues.
20
21 #include <sstream>
22 #include <string>
23
24 namespace {
25
GetTypeName(ParameterType type)26 const char *GetTypeName(ParameterType type)
27 {
28 switch (type)
29 {
30 case ParameterType::Float: return "float";
31 case ParameterType::Double: return "double";
32 case ParameterType::Int: return "int";
33 case ParameterType::UInt: return "uint";
34 case ParameterType::Long: return "long";
35 case ParameterType::ULong: return "ulong";
36 }
37 return nullptr;
38 }
39
GetUndefValue(ParameterType type)40 const char *GetUndefValue(ParameterType type)
41 {
42 switch (type)
43 {
44 case ParameterType::Float:
45 case ParameterType::Double: return "NAN";
46
47 case ParameterType::Int:
48 case ParameterType::UInt: return "0x12345678";
49
50 case ParameterType::Long:
51 case ParameterType::ULong: return "0x0ddf00dbadc0ffee";
52 }
53 return nullptr;
54 }
55
EmitDefineType(std::ostringstream & kernel,const char * name,ParameterType type,int vector_size_index)56 void EmitDefineType(std::ostringstream &kernel, const char *name,
57 ParameterType type, int vector_size_index)
58 {
59 kernel << "#define " << name << " " << GetTypeName(type)
60 << sizeNames[vector_size_index] << '\n';
61 kernel << "#define " << name << "_SCALAR " << GetTypeName(type) << '\n';
62 }
63
EmitDefineUndef(std::ostringstream & kernel,const char * name,ParameterType type)64 void EmitDefineUndef(std::ostringstream &kernel, const char *name,
65 ParameterType type)
66 {
67 kernel << "#define " << name << " " << GetUndefValue(type) << '\n';
68 }
69
EmitEnableExtension(std::ostringstream & kernel,const std::initializer_list<ParameterType> & types)70 void EmitEnableExtension(std::ostringstream &kernel,
71 const std::initializer_list<ParameterType> &types)
72 {
73 bool needsFp64 = false;
74
75 for (const auto &type : types)
76 {
77 switch (type)
78 {
79 case ParameterType::Double: needsFp64 = true; break;
80
81 case ParameterType::Float:
82 case ParameterType::Int:
83 case ParameterType::UInt:
84 case ParameterType::Long:
85 case ParameterType::ULong:
86 // No extension required.
87 break;
88 }
89 }
90
91 if (needsFp64) kernel << "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n";
92 }
93
GetBuildOptions(bool relaxed_mode)94 std::string GetBuildOptions(bool relaxed_mode)
95 {
96 std::ostringstream options;
97
98 if (gForceFTZ)
99 {
100 options << " -cl-denorms-are-zero";
101 }
102
103 if (gFloatCapabilities & CL_FP_CORRECTLY_ROUNDED_DIVIDE_SQRT)
104 {
105 options << " -cl-fp32-correctly-rounded-divide-sqrt";
106 }
107
108 if (relaxed_mode)
109 {
110 options << " -cl-fast-relaxed-math";
111 }
112
113 return options.str();
114 }
115
116 } // anonymous namespace
117
GetKernelName(int vector_size_index)118 std::string GetKernelName(int vector_size_index)
119 {
120 return std::string("math_kernel") + sizeNames[vector_size_index];
121 }
122
GetUnaryKernel(const std::string & kernel_name,const char * builtin,ParameterType retType,ParameterType type1,int vector_size_index)123 std::string GetUnaryKernel(const std::string &kernel_name, const char *builtin,
124 ParameterType retType, ParameterType type1,
125 int vector_size_index)
126 {
127 // To keep the kernel code readable, use macros for types and undef values.
128 std::ostringstream kernel;
129 EmitDefineType(kernel, "RETTYPE", retType, vector_size_index);
130 EmitDefineType(kernel, "TYPE1", type1, vector_size_index);
131 EmitDefineUndef(kernel, "UNDEF1", type1);
132 EmitEnableExtension(kernel, { retType, type1 });
133
134 // clang-format off
135 const char *kernel_nonvec3[] = { R"(
136 __kernel void )", kernel_name.c_str(), R"((__global RETTYPE* out,
137 __global TYPE1* in1)
138 {
139 size_t i = get_global_id(0);
140 out[i] = )", builtin, R"((in1[i]);
141 }
142 )" };
143
144 const char *kernel_vec3[] = { R"(
145 __kernel void )", kernel_name.c_str(), R"((__global RETTYPE_SCALAR* out,
146 __global TYPE1_SCALAR* in1)
147 {
148 size_t i = get_global_id(0);
149
150 if (i + 1 < get_global_size(0))
151 {
152 TYPE1 a = vload3(0, in1 + 3 * i);
153 RETTYPE res = )", builtin, R"((a);
154 vstore3(res, 0, out + 3 * i);
155 }
156 else
157 {
158 // Figure out how many elements are left over after
159 // BUFFER_SIZE % (3 * sizeof(type)).
160 // Assume power of two buffer size.
161 size_t parity = i & 1;
162 TYPE1 a = (TYPE1)(UNDEF1, UNDEF1, UNDEF1);
163 switch (parity)
164 {
165 case 0:
166 a.y = in1[3 * i + 1];
167 // fall through
168 case 1:
169 a.x = in1[3 * i];
170 break;
171 }
172
173 RETTYPE res = )", builtin, R"((a);
174
175 switch (parity)
176 {
177 case 0:
178 out[3 * i + 1] = res.y;
179 // fall through
180 case 1:
181 out[3 * i] = res.x;
182 break;
183 }
184 }
185 }
186 )" };
187 // clang-format on
188
189 if (sizeValues[vector_size_index] != 3)
190 for (const auto &chunk : kernel_nonvec3) kernel << chunk;
191 else
192 for (const auto &chunk : kernel_vec3) kernel << chunk;
193
194 return kernel.str();
195 }
196
GetUnaryKernel(const std::string & kernel_name,const char * builtin,ParameterType retType1,ParameterType retType2,ParameterType type1,int vector_size_index)197 std::string GetUnaryKernel(const std::string &kernel_name, const char *builtin,
198 ParameterType retType1, ParameterType retType2,
199 ParameterType type1, int vector_size_index)
200 {
201 // To keep the kernel code readable, use macros for types and undef values.
202 std::ostringstream kernel;
203 EmitDefineType(kernel, "RETTYPE1", retType1, vector_size_index);
204 EmitDefineType(kernel, "RETTYPE2", retType2, vector_size_index);
205 EmitDefineType(kernel, "TYPE1", type1, vector_size_index);
206 EmitDefineUndef(kernel, "UNDEF1", type1);
207 EmitDefineUndef(kernel, "UNDEFR2", retType2);
208 EmitEnableExtension(kernel, { retType1, retType2, type1 });
209
210 // clang-format off
211 const char *kernel_nonvec3[] = { R"(
212 __kernel void )", kernel_name.c_str(), R"((__global RETTYPE1* out1,
213 __global RETTYPE2* out2,
214 __global TYPE1* in1)
215 {
216 size_t i = get_global_id(0);
217 out1[i] = )", builtin, R"((in1[i], out2 + i);
218 }
219 )" };
220
221 const char *kernel_vec3[] = { R"(
222 __kernel void )", kernel_name.c_str(), R"((__global RETTYPE1_SCALAR* out1,
223 __global RETTYPE2_SCALAR* out2,
224 __global TYPE1_SCALAR* in1)
225 {
226 size_t i = get_global_id(0);
227
228 if (i + 1 < get_global_size(0))
229 {
230 TYPE1 a = vload3(0, in1 + 3 * i);
231 RETTYPE2 res2 = UNDEFR2;
232 RETTYPE1 res1 = )", builtin, R"((a, &res2);
233 vstore3(res1, 0, out1 + 3 * i);
234 vstore3(res2, 0, out2 + 3 * i);
235 }
236 else
237 {
238 // Figure out how many elements are left over after
239 // BUFFER_SIZE % (3 * sizeof(type)).
240 // Assume power of two buffer size.
241 size_t parity = i & 1;
242 TYPE1 a = (TYPE1)(UNDEF1, UNDEF1, UNDEF1);
243 switch (parity)
244 {
245 case 0:
246 a.y = in1[3 * i + 1];
247 // fall through
248 case 1:
249 a.x = in1[3 * i];
250 break;
251 }
252
253 RETTYPE2 res2 = UNDEFR2;
254 RETTYPE1 res1 = )", builtin, R"((a, &res2);
255
256 switch (parity)
257 {
258 case 0:
259 out1[3 * i + 1] = res1.y;
260 out2[3 * i + 1] = res2.y;
261 // fall through
262 case 1:
263 out1[3 * i] = res1.x;
264 out2[3 * i] = res2.x;
265 break;
266 }
267 }
268 }
269 )" };
270 // clang-format on
271
272 if (sizeValues[vector_size_index] != 3)
273 for (const auto &chunk : kernel_nonvec3) kernel << chunk;
274 else
275 for (const auto &chunk : kernel_vec3) kernel << chunk;
276
277 return kernel.str();
278 }
279
GetBinaryKernel(const std::string & kernel_name,const char * builtin,ParameterType retType,ParameterType type1,ParameterType type2,int vector_size_index)280 std::string GetBinaryKernel(const std::string &kernel_name, const char *builtin,
281 ParameterType retType, ParameterType type1,
282 ParameterType type2, int vector_size_index)
283 {
284 // To keep the kernel code readable, use macros for types and undef values.
285 std::ostringstream kernel;
286 EmitDefineType(kernel, "RETTYPE", retType, vector_size_index);
287 EmitDefineType(kernel, "TYPE1", type1, vector_size_index);
288 EmitDefineType(kernel, "TYPE2", type2, vector_size_index);
289 EmitDefineUndef(kernel, "UNDEF1", type1);
290 EmitDefineUndef(kernel, "UNDEF2", type2);
291 EmitEnableExtension(kernel, { retType, type1, type2 });
292
293 const bool is_vec3 = sizeValues[vector_size_index] == 3;
294
295 std::string invocation;
296 if (strlen(builtin) == 1)
297 {
298 // Assume a single-character builtin is an operator (e.g., +, *, ...).
299 invocation = is_vec3 ? "a" : "in1[i] ";
300 invocation += builtin;
301 invocation += is_vec3 ? "b" : " in2[i]";
302 }
303 else
304 {
305 // Otherwise call the builtin as a function with two arguments.
306 invocation = builtin;
307 invocation += is_vec3 ? "(a, b)" : "(in1[i], in2[i])";
308 }
309
310 // clang-format off
311 const char *kernel_nonvec3[] = { R"(
312 __kernel void )", kernel_name.c_str(), R"((__global RETTYPE* out,
313 __global TYPE1* in1,
314 __global TYPE2* in2)
315 {
316 size_t i = get_global_id(0);
317 out[i] = )", invocation.c_str(), R"(;
318 }
319 )" };
320
321 const char *kernel_vec3[] = { R"(
322 __kernel void )", kernel_name.c_str(), R"((__global RETTYPE_SCALAR* out,
323 __global TYPE1_SCALAR* in1,
324 __global TYPE2_SCALAR* in2)
325 {
326 size_t i = get_global_id(0);
327
328 if (i + 1 < get_global_size(0))
329 {
330 TYPE1 a = vload3(0, in1 + 3 * i);
331 TYPE2 b = vload3(0, in2 + 3 * i);
332 RETTYPE res = )", invocation.c_str(), R"(;
333 vstore3(res, 0, out + 3 * i);
334 }
335 else
336 {
337 // Figure out how many elements are left over after
338 // BUFFER_SIZE % (3 * sizeof(type)).
339 // Assume power of two buffer size.
340 size_t parity = i & 1;
341 TYPE1 a = (TYPE1)(UNDEF1, UNDEF1, UNDEF1);
342 TYPE2 b = (TYPE2)(UNDEF2, UNDEF2, UNDEF2);
343 switch (parity)
344 {
345 case 0:
346 a.y = in1[3 * i + 1];
347 b.y = in2[3 * i + 1];
348 // fall through
349 case 1:
350 a.x = in1[3 * i];
351 b.x = in2[3 * i];
352 break;
353 }
354
355 RETTYPE res = )", invocation.c_str(), R"(;
356
357 switch (parity)
358 {
359 case 0:
360 out[3 * i + 1] = res.y;
361 // fall through
362 case 1:
363 out[3 * i] = res.x;
364 break;
365 }
366 }
367 }
368 )" };
369 // clang-format on
370
371 if (!is_vec3)
372 for (const auto &chunk : kernel_nonvec3) kernel << chunk;
373 else
374 for (const auto &chunk : kernel_vec3) kernel << chunk;
375
376 return kernel.str();
377 }
378
GetBinaryKernel(const std::string & kernel_name,const char * builtin,ParameterType retType1,ParameterType retType2,ParameterType type1,ParameterType type2,int vector_size_index)379 std::string GetBinaryKernel(const std::string &kernel_name, const char *builtin,
380 ParameterType retType1, ParameterType retType2,
381 ParameterType type1, ParameterType type2,
382 int vector_size_index)
383 {
384 // To keep the kernel code readable, use macros for types and undef values.
385 std::ostringstream kernel;
386 EmitDefineType(kernel, "RETTYPE1", retType1, vector_size_index);
387 EmitDefineType(kernel, "RETTYPE2", retType2, vector_size_index);
388 EmitDefineType(kernel, "TYPE1", type1, vector_size_index);
389 EmitDefineType(kernel, "TYPE2", type2, vector_size_index);
390 EmitDefineUndef(kernel, "UNDEF1", type1);
391 EmitDefineUndef(kernel, "UNDEF2", type2);
392 EmitDefineUndef(kernel, "UNDEFR2", retType2);
393 EmitEnableExtension(kernel, { retType1, retType2, type1, type2 });
394
395 // clang-format off
396 const char *kernel_nonvec3[] = { R"(
397 __kernel void )", kernel_name.c_str(), R"((__global RETTYPE1* out1,
398 __global RETTYPE2* out2,
399 __global TYPE1* in1,
400 __global TYPE2* in2)
401 {
402 size_t i = get_global_id(0);
403 out1[i] = )", builtin, R"((in1[i], in2[i], out2 + i);
404 }
405 )" };
406
407 const char *kernel_vec3[] = { R"(
408 __kernel void )", kernel_name.c_str(), R"((__global RETTYPE1_SCALAR* out1,
409 __global RETTYPE2_SCALAR* out2,
410 __global TYPE1_SCALAR* in1,
411 __global TYPE2_SCALAR* in2)
412 {
413 size_t i = get_global_id(0);
414
415 if (i + 1 < get_global_size(0))
416 {
417 TYPE1 a = vload3(0, in1 + 3 * i);
418 TYPE2 b = vload3(0, in2 + 3 * i);
419 RETTYPE2 res2 = UNDEFR2;
420 RETTYPE1 res1 = )", builtin, R"((a, b, &res2);
421 vstore3(res1, 0, out1 + 3 * i);
422 vstore3(res2, 0, out2 + 3 * i);
423 }
424 else
425 {
426 // Figure out how many elements are left over after
427 // BUFFER_SIZE % (3 * sizeof(type)).
428 // Assume power of two buffer size.
429 size_t parity = i & 1;
430 TYPE1 a = (TYPE1)(UNDEF1, UNDEF1, UNDEF1);
431 TYPE2 b = (TYPE2)(UNDEF2, UNDEF2, UNDEF2);
432 switch (parity)
433 {
434 case 0:
435 a.y = in1[3 * i + 1];
436 b.y = in2[3 * i + 1];
437 // fall through
438 case 1:
439 a.x = in1[3 * i];
440 b.x = in2[3 * i];
441 break;
442 }
443
444 RETTYPE2 res2 = UNDEFR2;
445 RETTYPE1 res1 = )", builtin, R"((a, b, &res2);
446
447 switch (parity)
448 {
449 case 0:
450 out1[3 * i + 1] = res1.y;
451 out2[3 * i + 1] = res2.y;
452 // fall through
453 case 1:
454 out1[3 * i] = res1.x;
455 out2[3 * i] = res2.x;
456 break;
457 }
458 }
459 }
460 )" };
461 // clang-format on
462
463 if (sizeValues[vector_size_index] != 3)
464 for (const auto &chunk : kernel_nonvec3) kernel << chunk;
465 else
466 for (const auto &chunk : kernel_vec3) kernel << chunk;
467
468 return kernel.str();
469 }
470
GetTernaryKernel(const std::string & kernel_name,const char * builtin,ParameterType retType,ParameterType type1,ParameterType type2,ParameterType type3,int vector_size_index)471 std::string GetTernaryKernel(const std::string &kernel_name,
472 const char *builtin, ParameterType retType,
473 ParameterType type1, ParameterType type2,
474 ParameterType type3, int vector_size_index)
475 {
476 // To keep the kernel code readable, use macros for types and undef values.
477 std::ostringstream kernel;
478 EmitDefineType(kernel, "RETTYPE", retType, vector_size_index);
479 EmitDefineType(kernel, "TYPE1", type1, vector_size_index);
480 EmitDefineType(kernel, "TYPE2", type2, vector_size_index);
481 EmitDefineType(kernel, "TYPE3", type3, vector_size_index);
482 EmitDefineUndef(kernel, "UNDEF1", type1);
483 EmitDefineUndef(kernel, "UNDEF2", type2);
484 EmitDefineUndef(kernel, "UNDEF3", type3);
485 EmitEnableExtension(kernel, { retType, type1, type2, type3 });
486
487 // clang-format off
488 const char *kernel_nonvec3[] = { R"(
489 __kernel void )", kernel_name.c_str(), R"((__global RETTYPE* out,
490 __global TYPE1* in1,
491 __global TYPE2* in2,
492 __global TYPE3* in3)
493 {
494 size_t i = get_global_id(0);
495 out[i] = )", builtin, R"((in1[i], in2[i], in3[i]);
496 }
497 )" };
498
499 const char *kernel_vec3[] = { R"(
500 __kernel void )", kernel_name.c_str(), R"((__global RETTYPE_SCALAR* out,
501 __global TYPE1_SCALAR* in1,
502 __global TYPE2_SCALAR* in2,
503 __global TYPE3_SCALAR* in3)
504 {
505 size_t i = get_global_id(0);
506
507 if (i + 1 < get_global_size(0))
508 {
509 TYPE1 a = vload3(0, in1 + 3 * i);
510 TYPE2 b = vload3(0, in2 + 3 * i);
511 TYPE3 c = vload3(0, in3 + 3 * i);
512 RETTYPE res = )", builtin, R"((a, b, c);
513 vstore3(res, 0, out + 3 * i);
514 }
515 else
516 {
517 // Figure out how many elements are left over after
518 // BUFFER_SIZE % (3 * sizeof(type)).
519 // Assume power of two buffer size.
520 size_t parity = i & 1;
521 TYPE1 a = (TYPE1)(UNDEF1, UNDEF1, UNDEF1);
522 TYPE2 b = (TYPE2)(UNDEF2, UNDEF2, UNDEF2);
523 TYPE3 c = (TYPE3)(UNDEF3, UNDEF3, UNDEF3);
524 switch (parity)
525 {
526 case 0:
527 a.y = in1[3 * i + 1];
528 b.y = in2[3 * i + 1];
529 c.y = in3[3 * i + 1];
530 // fall through
531 case 1:
532 a.x = in1[3 * i];
533 b.x = in2[3 * i];
534 c.x = in3[3 * i];
535 break;
536 }
537
538 RETTYPE res = )", builtin, R"((a, b, c);
539
540 switch (parity)
541 {
542 case 0:
543 out[3 * i + 1] = res.y;
544 // fall through
545 case 1:
546 out[3 * i] = res.x;
547 break;
548 }
549 }
550 }
551 )" };
552 // clang-format on
553
554 if (sizeValues[vector_size_index] != 3)
555 for (const auto &chunk : kernel_nonvec3) kernel << chunk;
556 else
557 for (const auto &chunk : kernel_vec3) kernel << chunk;
558
559 return kernel.str();
560 }
561
BuildKernels(BuildKernelInfo & info,cl_uint job_id,SourceGenerator generator)562 cl_int BuildKernels(BuildKernelInfo &info, cl_uint job_id,
563 SourceGenerator generator)
564 {
565 // Generate the kernel code.
566 cl_uint vector_size_index = gMinVectorSizeIndex + job_id;
567 auto kernel_name = GetKernelName(vector_size_index);
568 auto source = generator(kernel_name, info.nameInCode, vector_size_index);
569 std::array<const char *, 1> sources{ source.c_str() };
570
571 // Create the program.
572 clProgramWrapper &program = info.programs[vector_size_index];
573 auto options = GetBuildOptions(info.relaxedMode);
574 int error =
575 create_single_kernel_helper(gContext, &program, nullptr, sources.size(),
576 sources.data(), nullptr, options.c_str());
577 if (error != CL_SUCCESS)
578 {
579 vlog_error("\t\tFAILED -- Failed to create program. (%d)\n", error);
580 return error;
581 }
582
583 // Create a kernel for each thread. cl_kernels aren't thread safe, so make
584 // one for every thread
585 auto &kernels = info.kernels[vector_size_index];
586 assert(kernels.empty() && "Dirty BuildKernelInfo");
587 kernels.resize(info.threadCount);
588 for (auto &kernel : kernels)
589 {
590 kernel = clCreateKernel(program, kernel_name.c_str(), &error);
591 if (!kernel || error != CL_SUCCESS)
592 {
593 vlog_error("\t\tFAILED -- clCreateKernel() failed: (%d)\n", error);
594 return error;
595 }
596 }
597
598 return CL_SUCCESS;
599 }
600