1 //
2 // Copyright (C) 2009-2021 Intel Corporation
3 //
4 // SPDX-License-Identifier: MIT
5 //
6 //
7
8 #include "shared.h"
9
10 /// Write cache line to global memory
11 /// Assumes subgroup_size is 16
12 ///
13 /// @param dst 64 bytes aligned output pointer
14 /// @param val value to write
CacheLineSubgroupWrite(global char * dst,uint val)15 GRL_INLINE void CacheLineSubgroupWrite(global char* dst, uint val)
16 {
17 global uint* addrAligned = (global uint*)(global uint16*)dst;
18 intel_sub_group_block_write(addrAligned, val);
19 }
20
21 /// Read cache line from global memory
22 /// Assumes subgroup_size is 16
23 ///
24 /// @param src 64 bytes aligned input pointer
25 /// @return uint read from memory
CacheLineSubgroupRead(const global char * src)26 GRL_INLINE uint CacheLineSubgroupRead(const global char* src)
27 {
28 const global uint* addrAligned = (const global uint*)(global uint16*)src;
29 return intel_sub_group_block_read(addrAligned);
30 }
31
32 /// Copy cache line
33 /// Assumes subgroup_size is 16
34 ///
35 /// @param dst 64 bytes aligned output pointer
36 /// @param src input pointer
CopyCacheLine(global char * dst,const global char * src)37 GRL_INLINE void CopyCacheLine(global char* dst, const global char* src)
38 {
39 global const uint* usrc = (global const uint*) (src);
40
41 uint data = intel_sub_group_block_read(usrc);
42 CacheLineSubgroupWrite(dst, data);
43 }
44
45 /// Fast memory copy
46 ///
47 /// @param dst output pointer
48 /// @param src input pointer
49 /// @param size number of bytes to copy
50 /// @param numGroups number of groups that execute this function
CopyMemory(global char * dst,const global char * src,uint size,uint numGroups)51 GRL_INLINE void CopyMemory(global char* dst, const global char* src, uint size, uint numGroups)
52 {
53 const uint CACHELINE_SIZE = 64;
54
55 uint globalID = get_local_size(0) * get_group_id(0) + get_local_id(0);
56
57 // this part copies cacheline per physical thread one write. starting from dst aligned up to cacheline.
58 // it copies laso reminder
59 {
60 uint alignAdd = ((uint)(uint64_t)dst) & (CACHELINE_SIZE - 1);
61 alignAdd = (CACHELINE_SIZE - alignAdd) & (CACHELINE_SIZE - 1);
62
63 if (size > alignAdd)
64 {
65 uint alignedBytesCount = size - alignAdd;
66 uint alignedDWsCount = alignedBytesCount >> 2;
67 global uint* dstAlignedPart = (global uint*)(dst + alignAdd);
68 global uint* srcAlignedPart = (global uint*)(src + alignAdd);
69
70 for (uint id = globalID; id < alignedDWsCount; id += get_local_size(0) * numGroups)
71 {
72 dstAlignedPart[id] = srcAlignedPart[id];
73 }
74
75 if (globalID < alignedBytesCount - (alignedDWsCount << 2))
76 {
77 global uint8_t* dstByteRem = (global uint8_t*)(dstAlignedPart + alignedDWsCount);
78 global uint8_t* srcByteRem = (global uint8_t*)(srcAlignedPart + alignedDWsCount);
79 dstByteRem[globalID] = srcByteRem[globalID];
80 }
81 }
82 }
83
84 // copy to dst below aligned up to chacheline
85 {
86 uint misalignmentBytesSize = (4 - (((uint)dst) & /*bytes in DW*/3)) & 3;
87 if (misalignmentBytesSize)
88 {
89 if (globalID < misalignmentBytesSize)
90 {
91 dst[globalID] = src[globalID];
92 }
93 dst += misalignmentBytesSize;
94 src += misalignmentBytesSize;
95 }
96
97 uint misalignmentDWSize = (CACHELINE_SIZE - (((uint)dst) & (CACHELINE_SIZE - 1))) & (CACHELINE_SIZE - 1);
98 if (misalignmentDWSize)
99 {
100 if (globalID < (misalignmentDWSize >> 2))
101 {
102 ((global uint*)dst)[globalID] = ((global uint*)src)[globalID];
103 }
104 }
105 }
106 }
107
108 #define CACHELINE_SIZE 64
109 #define CACHELINE_PER_BLOCK 4
110 #define BLOCK_SIZE 256 // = CACHELINE_SIZE * CACHELINE_PER_BLOCK;
111
112 GRL_INLINE
getInstanceDataToCopy(global const char * array,global const uint64_t * arrayOfPtrs,const uint byteOffset)113 global const char *getInstanceDataToCopy(global const char *array, global const uint64_t *arrayOfPtrs, const uint byteOffset)
114 {
115 if (array != NULL)
116 {
117 return array + byteOffset;
118 }
119 else
120 {
121 return (global char *)arrayOfPtrs[byteOffset >> 6];
122 }
123 }
124
125 // assummed:
126 // dst is always 64 bytes alligned
127 // size is always multiply of 64 bytes (size of InstanceDesc is always 64 bytes)
128 GRL_INLINE
copyInstances(global char * dst,global const char * array,global const uint64_t * arrayOfPtrs,const uint64_t size,const uint numGroups)129 void copyInstances(global char *dst, global const char *array, global const uint64_t *arrayOfPtrs, const uint64_t size, const uint numGroups)
130 {
131 uint taskId = get_group_id(0);
132
133 uint blockedSize = (size) & (~(BLOCK_SIZE - 1));
134
135 uint cachelinedTailOffset = blockedSize;
136 uint cachelinedTailSize = (size - cachelinedTailOffset) & (~(CACHELINE_SIZE - 1));
137
138 uint tailCacheLines = cachelinedTailSize >> 6; // divide by CACHELINE_SIZE
139 uint reversedTaskId = (uint)(-(((int)taskId) - ((int)numGroups - 1)));
140 if (reversedTaskId < tailCacheLines)
141 {
142 uint byteOffset = cachelinedTailOffset + (reversedTaskId * CACHELINE_SIZE);
143 global const char *src = getInstanceDataToCopy(array, arrayOfPtrs, byteOffset);
144 CopyCacheLine(dst + byteOffset, src);
145 }
146
147 uint numBlocks = blockedSize >> 8;
148 while (taskId < numBlocks)
149 {
150 uint byteOffset = (taskId * BLOCK_SIZE);
151
152 for (uint cl = 0; cl < CACHELINE_PER_BLOCK; cl++)
153 {
154 global const char *src = getInstanceDataToCopy(array, arrayOfPtrs, byteOffset);
155 CopyCacheLine(dst + byteOffset, src);
156 byteOffset += CACHELINE_SIZE;
157 }
158
159 taskId += numGroups;
160 }
161 }