xref: /aosp_15_r20/external/mesa3d/src/intel/vulkan/grl/gpu/mem_utils.h (revision 6104692788411f58d303aa86923a9ff6ecaded22)
1 //
2 // Copyright (C) 2009-2021 Intel Corporation
3 //
4 // SPDX-License-Identifier: MIT
5 //
6 //
7 
8 #include "shared.h"
9 
10 /// Write cache line to global memory
11 /// Assumes subgroup_size is 16
12 ///
13 /// @param dst 64 bytes aligned output pointer
14 /// @param val value to write
CacheLineSubgroupWrite(global char * dst,uint val)15 GRL_INLINE void CacheLineSubgroupWrite(global char* dst, uint val)
16 {
17     global uint* addrAligned = (global uint*)(global uint16*)dst;
18     intel_sub_group_block_write(addrAligned, val);
19 }
20 
21 /// Read cache line from global memory
22 /// Assumes subgroup_size is 16
23 ///
24 /// @param src 64 bytes aligned input pointer
25 /// @return uint read from memory
CacheLineSubgroupRead(const global char * src)26 GRL_INLINE uint CacheLineSubgroupRead(const global char* src)
27 {
28     const global uint* addrAligned = (const global uint*)(global uint16*)src;
29     return intel_sub_group_block_read(addrAligned);
30 }
31 
32 /// Copy cache line
33 /// Assumes subgroup_size is 16
34 ///
35 /// @param dst 64 bytes aligned output pointer
36 /// @param src input pointer
CopyCacheLine(global char * dst,const global char * src)37 GRL_INLINE void CopyCacheLine(global char* dst, const global char* src)
38 {
39     global const uint* usrc = (global const uint*) (src);
40 
41     uint data = intel_sub_group_block_read(usrc);
42     CacheLineSubgroupWrite(dst, data);
43 }
44 
45 /// Fast memory copy
46 ///
47 /// @param dst output pointer
48 /// @param src input pointer
49 /// @param size number of bytes to copy
50 /// @param numGroups number of groups that execute this function
CopyMemory(global char * dst,const global char * src,uint size,uint numGroups)51 GRL_INLINE void CopyMemory(global char* dst, const global char* src, uint size, uint numGroups)
52 {
53     const uint CACHELINE_SIZE = 64;
54 
55     uint globalID = get_local_size(0) * get_group_id(0) + get_local_id(0);
56 
57     // this part copies cacheline per physical thread one write. starting from dst aligned up to cacheline.
58     // it copies laso reminder
59     {
60         uint alignAdd = ((uint)(uint64_t)dst) & (CACHELINE_SIZE - 1);
61         alignAdd = (CACHELINE_SIZE - alignAdd) & (CACHELINE_SIZE - 1);
62 
63         if (size > alignAdd)
64         {
65             uint alignedBytesCount = size - alignAdd;
66             uint alignedDWsCount = alignedBytesCount >> 2;
67             global uint* dstAlignedPart = (global uint*)(dst + alignAdd);
68             global uint* srcAlignedPart = (global uint*)(src + alignAdd);
69 
70             for (uint id = globalID; id < alignedDWsCount; id += get_local_size(0) * numGroups)
71             {
72                 dstAlignedPart[id] = srcAlignedPart[id];
73             }
74 
75             if (globalID < alignedBytesCount - (alignedDWsCount << 2))
76             {
77                 global uint8_t* dstByteRem = (global uint8_t*)(dstAlignedPart + alignedDWsCount);
78                 global uint8_t* srcByteRem = (global uint8_t*)(srcAlignedPart + alignedDWsCount);
79                 dstByteRem[globalID] = srcByteRem[globalID];
80             }
81         }
82     }
83 
84     // copy to dst below aligned up to chacheline
85     {
86         uint misalignmentBytesSize = (4 - (((uint)dst) & /*bytes in DW*/3)) & 3;
87         if (misalignmentBytesSize)
88         {
89             if (globalID < misalignmentBytesSize)
90             {
91                 dst[globalID] = src[globalID];
92             }
93             dst += misalignmentBytesSize;
94             src += misalignmentBytesSize;
95         }
96 
97         uint misalignmentDWSize = (CACHELINE_SIZE - (((uint)dst) & (CACHELINE_SIZE - 1))) & (CACHELINE_SIZE - 1);
98         if (misalignmentDWSize)
99         {
100             if (globalID < (misalignmentDWSize >> 2))
101             {
102                 ((global uint*)dst)[globalID] = ((global uint*)src)[globalID];
103             }
104         }
105     }
106 }
107 
108 #define CACHELINE_SIZE 64
109 #define CACHELINE_PER_BLOCK 4
110 #define BLOCK_SIZE 256 // = CACHELINE_SIZE * CACHELINE_PER_BLOCK;
111 
112 GRL_INLINE
getInstanceDataToCopy(global const char * array,global const uint64_t * arrayOfPtrs,const uint byteOffset)113 global const char *getInstanceDataToCopy(global const char *array, global const uint64_t *arrayOfPtrs, const uint byteOffset)
114 {
115     if (array != NULL)
116     {
117         return array + byteOffset;
118     }
119     else
120     {
121         return (global char *)arrayOfPtrs[byteOffset >> 6];
122     }
123 }
124 
125 // assummed:
126 // dst is always 64 bytes alligned
127 // size is always multiply of 64 bytes (size of InstanceDesc is always 64 bytes)
128 GRL_INLINE
copyInstances(global char * dst,global const char * array,global const uint64_t * arrayOfPtrs,const uint64_t size,const uint numGroups)129 void copyInstances(global char *dst, global const char *array, global const uint64_t *arrayOfPtrs, const uint64_t size, const uint numGroups)
130 {
131     uint taskId = get_group_id(0);
132 
133     uint blockedSize = (size) & (~(BLOCK_SIZE - 1));
134 
135     uint cachelinedTailOffset = blockedSize;
136     uint cachelinedTailSize = (size - cachelinedTailOffset) & (~(CACHELINE_SIZE - 1));
137 
138     uint tailCacheLines = cachelinedTailSize >> 6; // divide by CACHELINE_SIZE
139     uint reversedTaskId = (uint)(-(((int)taskId) - ((int)numGroups - 1)));
140     if (reversedTaskId < tailCacheLines)
141     {
142         uint byteOffset = cachelinedTailOffset + (reversedTaskId * CACHELINE_SIZE);
143         global const char *src = getInstanceDataToCopy(array, arrayOfPtrs, byteOffset);
144         CopyCacheLine(dst + byteOffset, src);
145     }
146 
147     uint numBlocks = blockedSize >> 8;
148     while (taskId < numBlocks)
149     {
150         uint byteOffset = (taskId * BLOCK_SIZE);
151 
152         for (uint cl = 0; cl < CACHELINE_PER_BLOCK; cl++)
153         {
154             global const char *src = getInstanceDataToCopy(array, arrayOfPtrs, byteOffset);
155             CopyCacheLine(dst + byteOffset, src);
156             byteOffset += CACHELINE_SIZE;
157         }
158 
159         taskId += numGroups;
160     }
161 }