// // Copyright (C) 2009-2021 Intel Corporation // // SPDX-License-Identifier: MIT // // #pragma once // TODO: AABB_work_group_reduce is super slow, remove !!! #pragma cl_intel_subgroups : enable #pragma cl_khr_fp16 : enable #pragma OPENCL EXTENSION cl_khr_fp16 : enable uint intel_sub_group_ballot(bool valid); // atom_min float __attribute__((overloadable)) atom_min(volatile __global float *p, float val); float __attribute__((overloadable)) atom_min(volatile __local float *p, float val); float __attribute__((overloadable)) atomic_min(volatile __global float *p, float val); float __attribute__((overloadable)) atomic_min(volatile __local float *p, float val); // atom_max float __attribute__((overloadable)) atom_max(volatile __global float *p, float val); float __attribute__((overloadable)) atom_max(volatile __local float *p, float val); float __attribute__((overloadable)) atomic_max(volatile __global float *p, float val); float __attribute__((overloadable)) atomic_max(volatile __local float *p, float val); // atom_cmpxchg float __attribute__((overloadable)) atom_cmpxchg(volatile __global float *p, float cmp, float val); float __attribute__((overloadable)) atom_cmpxchg(volatile __local float *p, float cmp, float val); float __attribute__((overloadable)) atomic_cmpxchg(volatile __global float *p, float cmp, float val); float __attribute__((overloadable)) atomic_cmpxchg(volatile __local float *p, float cmp, float val); inline uint subgroup_single_atomic_add(global uint *p, uint val) { const uint subgroupLocalID = get_sub_group_local_id(); const int v = subgroupLocalID == 0 ? atomic_add(p, val) : 0; return sub_group_broadcast(v, 0); } inline float halfarea(const float3 d) { return fma(d.x, (d.y + d.z), d.y * d.z); } inline float area(const float3 d) { return halfarea(d) * 2.0f; } inline uint maxDim(const float3 a) { const float3 b = fabs(a); const bool b_x_y = b.x > b.y; const float cur_max = b_x_y ? b.x : b.y; const uint cur_idx = b_x_y ? 
0 : 1; const bool b_x_y_z = b.z > cur_max; return b_x_y_z ? 2 : cur_idx; } inline uint3 sortByMaxDim(const float3 a) { const uint kz = maxDim(a); const uint _kx = (kz + 1) % 3; const uint _ky = (_kx + 1) % 3; const bool kz_pos = a[kz] >= 0.0f; const uint kx = kz_pos ? _ky : _kx; const uint ky = kz_pos ? _kx : _ky; return (uint3)(kx, ky, kz); } inline uint4 sort4_ascending(const uint4 dist) { const uint a0 = dist.s0; const uint a1 = dist.s1; const uint a2 = dist.s2; const uint a3 = dist.s3; const uint b0 = min(a0, a2); const uint b1 = min(a1, a3); const uint b2 = max(a0, a2); const uint b3 = max(a1, a3); const uint c0 = min(b0, b1); const uint c1 = max(b0, b1); const uint c2 = min(b2, b3); const uint c3 = max(b2, b3); const uint d0 = c0; const uint d1 = min(c1, c2); const uint d2 = max(c1, c2); const uint d3 = c3; return (uint4)(d0, d1, d2, d3); } __constant const uint shuffleA[8] = {1, 0, 3, 2, 5, 4, 7, 6}; __constant const uint shuffleB[8] = {2, 3, 0, 1, 7, 6, 5, 4}; __constant const uint shuffleC[8] = {1, 0, 3, 2, 5, 4, 7, 6}; __constant const uint shuffleD[8] = {7, 6, 5, 4, 3, 2, 1, 0}; __constant const uint shuffleE[8] = {2, 3, 0, 1, 6, 7, 4, 5}; __constant const uint shuffleF[8] = {1, 0, 3, 2, 5, 4, 7, 6}; __constant const uint shuffleG[8] = {0, 2, 1, 3, 5, 4, 7, 6}; __constant const uint selAA[8] = {0, 1, 0, 1, 0, 1, 0, 1}; __constant const uint selCC[8] = {0, 0, 1, 1, 0, 0, 1, 1}; __constant const uint selF0[8] = {0, 0, 0, 0, 1, 1, 1, 1}; __constant const uint selGG[8] = {0, 0, 1, 0, 1, 1, 1, 1}; inline uint compare_exchange_descending(const uint a0, const uint shuffleMask, const uint selectMask) { const uint a1 = intel_sub_group_shuffle(a0, shuffleMask); const uint a_min = min(a0, a1); const uint a_max = max(a0, a1); return select(a_max, a_min, selectMask); } inline uint compare_exchange_ascending(const uint a0, const uint shuffleMask, const uint selectMask) { const uint a1 = intel_sub_group_shuffle(a0, shuffleMask); const uint a_min = min(a0, a1); const 
uint a_max = max(a0, a1); return select(a_min, a_max, selectMask); } inline uint sort8_descending(const uint aa) { const unsigned int slotID = get_sub_group_local_id() % 8; const uint bb = compare_exchange_descending(aa, shuffleA[slotID], selAA[slotID]); const uint cc = compare_exchange_descending(bb, shuffleB[slotID], selCC[slotID]); const uint dd = compare_exchange_descending(cc, shuffleC[slotID], selAA[slotID]); const uint ee = compare_exchange_descending(dd, shuffleD[slotID], selF0[slotID]); const uint ff = compare_exchange_descending(ee, shuffleE[slotID], selCC[slotID]); const uint gg = compare_exchange_descending(ff, shuffleF[slotID], selAA[slotID]); return gg; } inline uint sort8_ascending(const uint aa) { const unsigned int slotID = get_sub_group_local_id() % 8; const uint bb = compare_exchange_ascending(aa, shuffleA[slotID], selAA[slotID]); const uint cc = compare_exchange_ascending(bb, shuffleB[slotID], selCC[slotID]); const uint dd = compare_exchange_ascending(cc, shuffleC[slotID], selAA[slotID]); const uint ee = compare_exchange_ascending(dd, shuffleD[slotID], selF0[slotID]); const uint ff = compare_exchange_ascending(ee, shuffleE[slotID], selCC[slotID]); const uint gg = compare_exchange_ascending(ff, shuffleF[slotID], selAA[slotID]); return gg; } inline uint sort4_descending(const uint aa) { const unsigned int slotID = get_sub_group_local_id() % 8; const uint bb = compare_exchange_descending(aa, shuffleA[slotID], selAA[slotID]); const uint cc = compare_exchange_descending(bb, shuffleB[slotID], selCC[slotID]); const uint dd = compare_exchange_descending(cc, shuffleG[slotID], selGG[slotID]); return dd; } inline ulong compare_exchange_descending_ulong(const ulong a0, const uint shuffleMask, const uint selectMask) { const ulong a1 = intel_sub_group_shuffle(a0, shuffleMask); const ulong a_min = min(a0, a1); const ulong a_max = max(a0, a1); return select(a_max, a_min, (ulong)selectMask); } inline ulong compare_exchange_ascending_ulong(const ulong a0, const 
uint shuffleMask, const uint selectMask) { const ulong a1 = intel_sub_group_shuffle(a0, shuffleMask); const ulong a_min = min(a0, a1); const ulong a_max = max(a0, a1); return select(a_min, a_max, (ulong)selectMask); } inline ulong sort8_ascending_ulong(const ulong aa) { const unsigned int slotID = get_sub_group_local_id() % 8; const ulong bb = compare_exchange_ascending_ulong(aa, shuffleA[slotID], selAA[slotID]); const ulong cc = compare_exchange_ascending_ulong(bb, shuffleB[slotID], selCC[slotID]); const ulong dd = compare_exchange_ascending_ulong(cc, shuffleC[slotID], selAA[slotID]); const ulong ee = compare_exchange_ascending_ulong(dd, shuffleD[slotID], selF0[slotID]); const ulong ff = compare_exchange_ascending_ulong(ee, shuffleE[slotID], selCC[slotID]); const ulong gg = compare_exchange_ascending_ulong(ff, shuffleF[slotID], selAA[slotID]); return gg; } inline uint bitInterleave3D(const uint4 in) { uint x = in.x, y = in.y, z = in.z; x = (x | (x << 16)) & 0x030000FF; x = (x | (x << 8)) & 0x0300F00F; x = (x | (x << 4)) & 0x030C30C3; x = (x | (x << 2)) & 0x09249249; y = (y | (y << 16)) & 0x030000FF; y = (y | (y << 8)) & 0x0300F00F; y = (y | (y << 4)) & 0x030C30C3; y = (y | (y << 2)) & 0x09249249; z = (z | (z << 16)) & 0x030000FF; z = (z | (z << 8)) & 0x0300F00F; z = (z | (z << 4)) & 0x030C30C3; z = (z | (z << 2)) & 0x09249249; return x | (y << 1) | (z << 2); } inline uint bitInterleave4D(const uint4 in) { uint x = in.x, y = in.y, z = in.z, w = in.w; x = x & 0x000000ff; x = (x ^ (x << 16)) & 0x00c0003f; x = (x ^ (x << 8)) & 0x00c03807; x = (x ^ (x << 4)) & 0x08530853; x = (x ^ (x << 2)) & 0x09090909; x = (x ^ (x << 1)) & 0x11111111; y = y & 0x000000ff; y = (y ^ (y << 16)) & 0x00c0003f; y = (y ^ (y << 8)) & 0x00c03807; y = (y ^ (y << 4)) & 0x08530853; y = (y ^ (y << 2)) & 0x09090909; y = (y ^ (y << 1)) & 0x11111111; z = z & 0x000000ff; z = (z ^ (z << 16)) & 0x00c0003f; z = (z ^ (z << 8)) & 0x00c03807; z = (z ^ (z << 4)) & 0x08530853; z = (z ^ (z << 2)) & 0x09090909; 
z = (z ^ (z << 1)) & 0x11111111; w = w & 0x000000ff; w = (w ^ (w << 16)) & 0x00c0003f; w = (w ^ (w << 8)) & 0x00c03807; w = (w ^ (w << 4)) & 0x08530853; w = (w ^ (w << 2)) & 0x09090909; w = (w ^ (w << 1)) & 0x11111111; return (x | (y << 1) | (z << 2) | (w << 3)); } inline ulong ulong_bitInterleave4D(const uint4 in) { ulong x = in.x, y = in.y, z = in.z, w = in.w; x = x & 0x0000ffff; x = (x ^ (x << 32)) & 0x0000f800000007ff; x = (x ^ (x << 16)) & 0x0000f80007c0003f; x = (x ^ (x << 8)) & 0x00c0380700c03807; x = (x ^ (x << 4)) & 0x0843084308430843; x = (x ^ (x << 2)) & 0x0909090909090909; x = (x ^ (x << 1)) & 0x1111111111111111; y = y & 0x0000ffff; y = (y ^ (y << 32)) & 0x0000f800000007ff; y = (y ^ (y << 16)) & 0x0000f80007c0003f; y = (y ^ (y << 8)) & 0x00c0380700c03807; y = (y ^ (y << 4)) & 0x0843084308430843; y = (y ^ (y << 2)) & 0x0909090909090909; y = (y ^ (y << 1)) & 0x1111111111111111; z = z & 0x0000ffff; z = (z ^ (z << 32)) & 0x0000f800000007ff; z = (z ^ (z << 16)) & 0x0000f80007c0003f; z = (z ^ (z << 8)) & 0x00c0380700c03807; z = (z ^ (z << 4)) & 0x0843084308430843; z = (z ^ (z << 2)) & 0x0909090909090909; z = (z ^ (z << 1)) & 0x1111111111111111; w = w & 0x0000ffff; w = (w ^ (w << 32)) & 0x0000f800000007ff; w = (w ^ (w << 16)) & 0x0000f80007c0003f; w = (w ^ (w << 8)) & 0x00c0380700c03807; w = (w ^ (w << 4)) & 0x0843084308430843; w = (w ^ (w << 2)) & 0x0909090909090909; w = (w ^ (w << 1)) & 0x1111111111111111; return (x | (y << 1) | (z << 2) | (w << 3)); } inline uint bitCompact(uint x) { x &= 0x09249249; x = (x ^ (x >> 2)) & 0x030c30c3; x = (x ^ (x >> 4)) & 0x0300f00f; x = (x ^ (x >> 8)) & 0xff0000ff; x = (x ^ (x >> 16)) & 0x000003ff; return x; } inline uint3 bitCompact3D(const uint in) { const uint x = bitCompact(x >> 0); const uint y = bitCompact(y >> 1); const uint z = bitCompact(z >> 2); return (uint3)(x, y, z); } inline uint convertToPushIndices8(uint ID) { const unsigned int slotID = get_sub_group_local_id(); uint index = 0; for (uint i = 0; i < 8; i++) { 
const uint mask = intel_sub_group_ballot(ID == i); const uint new_index = ctz(mask); index = i == slotID ? new_index : index; } return index; } inline uint convertToPushIndices16(uint ID) { const unsigned int slotID = get_sub_group_local_id(); uint index = 0; for (uint i = 0; i < 16; i++) { const uint mask = intel_sub_group_ballot(ID == i); const uint new_index = ctz(mask); index = i == slotID ? new_index : index; } return index; } #define FLOAT_EXPONENT_MASK (0x7F800000) // used to be EXPONENT_MASK #define FLOAT_MANTISSA_MASK (0x007FFFFF) // used to be MANTISSA_MASK #define FLOAT_NEG_ONE_EXP_MASK (0x3F000000) #define FLOAT_BIAS (127) #define FLOAT_MANTISSA_BITS (23) inline float3 frexp_vec3(float3 len, int3* exp) { float3 mant = as_float3((int3)((as_int3(len) & (int3)FLOAT_MANTISSA_MASK) + (int3)FLOAT_NEG_ONE_EXP_MASK)); mant = select(mant, (float3)(0.5f), (int3)(mant == (float3)(1.0f))); mant = copysign(mant, len); *exp = ((as_int3(len) & (int3)FLOAT_EXPONENT_MASK) >> (int3)FLOAT_MANTISSA_BITS) - ((int3)FLOAT_BIAS - (int3)(1)); return mant; } #ifndef uniform #define uniform #endif #ifndef varying #define varying #endif uint get_sub_group_global_id() { return get_sub_group_id() + get_num_sub_groups() * get_group_id( 0 ); } // each lane contains the number of 1 bits below the corresponding position in 'mask' uint subgroup_bit_prefix_exclusive(uniform uint mask) { varying ushort lane = get_sub_group_local_id(); varying uint lane_mask = (1 << lane) - 1; varying uint m = mask & lane_mask; return popcount(m); } uint bit_prefix_exclusive(uniform uint mask, varying uint lane_idx ) { varying uint lane_mask = (1 << lane_idx) - 1; varying uint m = mask & lane_mask; return popcount(m); } uint3 sub_group_broadcast_uint3(uint3 v, uniform ushort idx) { return (uint3)(sub_group_broadcast(v.x,idx), sub_group_broadcast(v.y,idx), sub_group_broadcast(v.z,idx)); } float3 sub_group_broadcast_float3(float3 v, uniform ushort idx) { return (float3)(sub_group_broadcast(v.x, idx), 
sub_group_broadcast(v.y, idx), sub_group_broadcast(v.z, idx)); } float3 sub_group_reduce_min_float3(float3 v) { return (float3)(sub_group_reduce_min(v.x), sub_group_reduce_min(v.y), sub_group_reduce_min(v.z) ); } float3 sub_group_reduce_max_float3(float3 v) { return (float3)(sub_group_reduce_max(v.x), sub_group_reduce_max(v.y), sub_group_reduce_max(v.z)); } float3 sub_group_shuffle_float3(float3 v, uniform ushort idx) { return (float3)(intel_sub_group_shuffle(v.x, idx), intel_sub_group_shuffle(v.y, idx), intel_sub_group_shuffle(v.z, idx)); } uint3 sub_group_shuffle_uint3(uint3 v, uniform ushort idx) { return (uint3)( intel_sub_group_shuffle(v.x, idx), intel_sub_group_shuffle(v.y, idx), intel_sub_group_shuffle(v.z, idx)); } inline uchar sub_group_reduce_or_N6(uchar val) { val = val | intel_sub_group_shuffle_down(val, val, 4); val = val | intel_sub_group_shuffle_down(val, val, 2); val = val | intel_sub_group_shuffle_down(val, val, 1); return sub_group_broadcast(val, 0); } inline uchar sub_group_reduce_or_N6_2xSIMD8_in_SIMD16(uchar val) { uint SIMD8_id = get_sub_group_local_id() / 8; val = val | intel_sub_group_shuffle_down(val, val, 4); val = val | intel_sub_group_shuffle_down(val, val, 2); val = val | intel_sub_group_shuffle_down(val, val, 1); return intel_sub_group_shuffle(val, SIMD8_id * 8); } inline __attribute__((overloadable)) uint atomic_inc_local( local uint* p ) { return atomic_fetch_add_explicit( (volatile local atomic_uint*) p, (uint)1, memory_order_relaxed, memory_scope_work_group ); } inline __attribute__((overloadable)) int atomic_inc_local(local int* p) { return atomic_fetch_add_explicit( (volatile local atomic_int*) p, (int)1, memory_order_relaxed, memory_scope_work_group); } inline __attribute__((overloadable)) uint atomic_dec_local(local uint* p) { return atomic_fetch_sub_explicit((volatile local atomic_uint*) p, (uint)1, memory_order_relaxed, memory_scope_work_group); } inline __attribute__((overloadable)) int atomic_dec_local(local int* p) { 
return atomic_fetch_sub_explicit((volatile local atomic_int*) p, (int)1, memory_order_relaxed, memory_scope_work_group); } inline __attribute__((overloadable)) uint atomic_sub_local(local uint* p, uint n) { return atomic_fetch_sub_explicit((volatile local atomic_uint*) p, n, memory_order_relaxed, memory_scope_work_group); } inline __attribute__((overloadable)) int atomic_sub_local(local int* p, int n ) { return atomic_fetch_sub_explicit( (volatile local atomic_int*) p, n, memory_order_relaxed, memory_scope_work_group); } inline uint atomic_add_local( local uint* p, uint n ) { return atomic_fetch_add_explicit((volatile local atomic_uint*) p, n, memory_order_relaxed, memory_scope_work_group); } inline uint atomic_xor_local(local uint* p, uint n) { return atomic_fetch_xor_explicit((volatile local atomic_uint*) p, n, memory_order_relaxed, memory_scope_work_group); } inline uint atomic_or_local(local uint* p, uint n) { return atomic_fetch_or_explicit((volatile local atomic_uint*) p, n, memory_order_relaxed, memory_scope_work_group); } inline uint atomic_min_local(local uint* p, uint n) { return atomic_fetch_min_explicit((volatile local atomic_uint*) p, n, memory_order_relaxed, memory_scope_work_group); } inline uint atomic_max_local(local uint* p, uint n) { return atomic_fetch_max_explicit((volatile local atomic_uint*) p, n, memory_order_relaxed, memory_scope_work_group); } inline uint atomic_inc_global( global uint* p ) { return atomic_fetch_add_explicit((volatile global atomic_uint*) p, (uint)1, memory_order_relaxed, memory_scope_device); } inline uint atomic_dec_global(global uint* p) { return atomic_fetch_sub_explicit( (volatile global atomic_uint*) p, (uint)1, memory_order_relaxed, memory_scope_device); } inline bool atomic_compare_exchange_global(global uint* p, uint* expected, uint desired) { return atomic_compare_exchange_strong_explicit((volatile global atomic_uint*) p, expected, desired, memory_order_relaxed, memory_order_relaxed, memory_scope_device); } 
// Relaxed, device-scope atomic add on global memory; returns the previous value.
inline uint atomic_add_global( global uint* p, uint n )
{
    return atomic_fetch_add_explicit( (volatile global atomic_uint*) p, n, memory_order_relaxed, memory_scope_device);
}

// Relaxed, device-scope atomic subtract on global memory; returns the previous value.
inline uint atomic_sub_global(global uint* p, uint n)
{
    return atomic_fetch_sub_explicit((volatile global atomic_uint*) p, n, memory_order_relaxed, memory_scope_device);
}

// Relaxed, device-scope atomic OR on global memory; returns the previous value.
inline uint atomic_or_global(global uint* p, uint n)
{
    return atomic_fetch_or_explicit((volatile global atomic_uint*) p, n, memory_order_relaxed, memory_scope_device);
}

// Device-scope atomic increment with memory_order_acquire; returns the previous value.
inline uint atomic_inc_global_acquire(global uint* p)
{
    return atomic_fetch_add_explicit((volatile global atomic_uint*) p, (uint)1, memory_order_acquire, memory_scope_device);
}

// Device-scope atomic increment with memory_order_release; returns the previous value.
inline uint atomic_inc_global_release(global uint* p)
{
    return atomic_fetch_add_explicit((volatile global atomic_uint*) p, (uint)1, memory_order_release, memory_scope_device);
}

// Device-scope atomic decrement with memory_order_release; returns the previous value.
inline uint atomic_dec_global_release(global uint* p)
{
    return atomic_fetch_sub_explicit((volatile global atomic_uint*) p, (uint)1, memory_order_release, memory_scope_device);
}

// Atomic add through a generic-address-space pointer. Dispatches to the
// global or local helper depending on which address space 'p' resolves to;
// returns 0 without touching memory if it is neither.
inline uint generic_atomic_add(uint* p, uint val)
{
    if (to_global(p) != NULL)
        return atomic_add_global(to_global(p), val);
    if (to_local(p) != NULL)
        return atomic_add_local(to_local(p), val);
    return 0;
}

// Max-reduces values across lanes with shuffle-down steps of 4, 2 and 1,
// then broadcasts lane 0's result (which combines lanes 0..7) to all lanes.
// NOTE(review): the "N6" suffix presumably means only 6 lanes carry
// meaningful data - confirm with callers.
inline __attribute__((overloadable)) uint sub_group_reduce_max_N6( uint n )
{
    n = max( n, intel_sub_group_shuffle_down( n, n, 4 ) );
    n = max( n, intel_sub_group_shuffle_down( n, n, 2 ) );
    n = max( n, intel_sub_group_shuffle_down( n, n, 1 ) );
    return sub_group_broadcast( n, 0 );
}

// float overload of sub_group_reduce_max_N6.
inline __attribute__((overloadable)) float sub_group_reduce_max_N6( float n )
{
    n = max( n, intel_sub_group_shuffle_down( n, n, 4 ) );
    n = max( n, intel_sub_group_shuffle_down( n, n, 2 ) );
    n = max( n, intel_sub_group_shuffle_down( n, n, 1 ) );
    return sub_group_broadcast( n, 0 );
}

// Same reduction as sub_group_reduce_max_N6, but each group of 8 lanes reads
// the result from its own first lane (lane 0 or lane 8) rather than using
// sub_group_broadcast(n, 0).
inline __attribute__((overloadable)) float sub_group_reduce_max_N6_2xSIMD8_in_SIMD16(float n)
{
    n = max(n, intel_sub_group_shuffle_down(n, n, 4));
    n = max(n, intel_sub_group_shuffle_down(n, n, 2));
    n = max(n, intel_sub_group_shuffle_down(n, n, 1));
    return intel_sub_group_shuffle(n, (get_sub_group_local_id() / 8) * 8);
}

// Atomic increment through a generic-address-space pointer. Dispatches to the
// global or local helper depending on which address space 'p' resolves to;
// returns 0 without touching memory if it is neither.
inline uint generic_atomic_inc(uint* p)
{
    if (to_global(p) != NULL)
        return atomic_inc_global(to_global(p));
    // Use this file's local wrapper, matching generic_atomic_add, instead of
    // the legacy atomic_inc builtin.
    if (to_local(p) != NULL)
        return atomic_inc_local(to_local(p));
    return 0;
}

// Built-in GRL function which, if called in a kernel body, will force the kernel
// to be compiled to the minimum SIMD width supported by the platform
void GRL_UseMinimumSIMDWidth();