/*
 * Copyright 2024 Advanced Micro Devices, Inc.
 *
 * SPDX-License-Identifier: MIT
 */

#include "ac_nir_meta.h"
#include "ac_nir_helpers.h"
#include "nir_builder.h"
#include "util/helpers.h"

/* This is regular load_ssbo with special handling for sparse buffers. Normally, sparse buffer
 * loads return 0 for all components if a sparse load starts on a non-resident page, crosses
 * the page boundary, and ends on a resident page. For copy_buffer, we want it to return 0 only
 * for the portion of the load that's non-resident, and load values for the portion that's
 * resident. The workaround is to scalarize such loads and disallow vectorization.
 */
static nir_def *
load_ssbo_sparse(nir_builder *b, unsigned num_components, unsigned bit_size, nir_def *buf,
                 nir_def *offset, struct _nir_load_ssbo_indices params, bool sparse)
{
   if (sparse && num_components > 1) {
      nir_def *vec[NIR_MAX_VEC_COMPONENTS];

      /* Split the vector load into scalar loads. */
      for (unsigned i = 0; i < num_components; i++) {
         unsigned elem_offset = i * bit_size / 8;
         unsigned align_offset = (params.align_offset + elem_offset) % params.align_mul;

         vec[i] = nir_load_ssbo(b, 1, bit_size, buf,
                                nir_iadd_imm(b, offset, elem_offset),
                                .access = params.access | ACCESS_KEEP_SCALAR,
                                .align_mul = params.align_mul,
                                .align_offset = align_offset);
      }
      return nir_vec(b, vec, num_components);
   } else {
      return nir_load_ssbo(b, num_components, bit_size, buf, offset,
                           .access = params.access,
                           .align_mul = params.align_mul,
                           .align_offset = params.align_offset);
   }
}

/* Create a compute shader implementing clear_buffer or copy_buffer. */
nir_shader *
ac_create_clear_copy_buffer_cs(struct ac_cs_clear_copy_buffer_options *options,
                               union ac_cs_clear_copy_buffer_key *key)
{
   if (options->print_key) {
      fprintf(stderr, "Internal shader: dma\n");
      fprintf(stderr, "   key.is_clear = %u\n", key->is_clear);
      fprintf(stderr, "   key.dwords_per_thread = %u\n", key->dwords_per_thread);
      fprintf(stderr, "   key.clear_value_size_is_12 = %u\n", key->clear_value_size_is_12);
      fprintf(stderr, "   key.src_is_sparse = %u\n", key->src_is_sparse);
      fprintf(stderr, "   key.src_align_offset = %u\n", key->src_align_offset);
      fprintf(stderr, "   key.dst_align_offset = %u\n", key->dst_align_offset);
      fprintf(stderr, "   key.dst_last_thread_bytes = %u\n", key->dst_last_thread_bytes);
      fprintf(stderr, "   key.dst_single_thread_unaligned = %u\n", key->dst_single_thread_unaligned);
      fprintf(stderr, "\n");
   }

   assert(key->dwords_per_thread && key->dwords_per_thread <= 4);

   nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, options->nir_options,
                                                  "clear_copy_buffer_cs");
   b.shader->info.workgroup_size[0] = 64;
   b.shader->info.workgroup_size[1] = 1;
   b.shader->info.workgroup_size[2] = 1;
   b.shader->info.num_ssbos = key->is_clear ? 1 : 2;
   b.shader->info.cs.user_data_components_amd = 0;

   if (key->is_clear) {
      b.shader->info.cs.user_data_components_amd +=
         key->clear_value_size_is_12 ? 3 : key->dwords_per_thread;
   }

   /* Add the last thread ID value. */
   unsigned last_thread_user_data_index = b.shader->info.cs.user_data_components_amd;
   if (key->dst_last_thread_bytes)
      b.shader->info.cs.user_data_components_amd++;

   unsigned start_thread_user_data_index = b.shader->info.cs.user_data_components_amd;
   if (key->has_start_thread)
      b.shader->info.cs.user_data_components_amd++;

   nir_def *thread_id = ac_get_global_ids(&b, 1, 32);

   /* If the clear/copy area is unaligned, we launched extra threads at the beginning to make it
    * aligned. Skip those threads here.
    */
   nir_if *if_positive = NULL;
   if (key->has_start_thread) {
      nir_def *start_thread =
         nir_channel(&b, nir_load_user_data_amd(&b), start_thread_user_data_index);
      thread_id = nir_isub(&b, thread_id, start_thread);
      if_positive = nir_push_if(&b, nir_ige_imm(&b, thread_id, 0));
   }

   /* Convert the global thread ID into bytes. */
   nir_def *offset = nir_imul_imm(&b, thread_id, 4 * key->dwords_per_thread);
   nir_def *value;

   if (key->is_clear) {
      value = nir_trim_vector(&b, nir_load_user_data_amd(&b), key->dwords_per_thread);

      /* We store 4 dwords per thread, but the clear value has 3 dwords. Swizzle it to 4 dwords.
       * Storing 4 dwords per thread is faster even when the ALU cost is worse.
       */
      if (key->clear_value_size_is_12 && key->dwords_per_thread == 4) {
         nir_def *dw_offset = nir_imul_imm(&b, thread_id, key->dwords_per_thread);
         nir_def *vec[3];

         /* Swizzle a 3-component clear value to get a 4-component clear value. Example:
          * 0 1 2 3 | 4 5 6 7 | 8 9 10 11  // dw_offset
          *              |
          *              V
          * 0 1 2 0 | 1 2 0 1 | 2 0 1 2    // clear value component indices
          */
         for (unsigned i = 0; i < 3; i++) {
            vec[i] = nir_vector_extract(&b, value,
                                        nir_umod_imm(&b, nir_iadd_imm(&b, dw_offset, i), 3));
         }
         value = nir_vec4(&b, vec[0], vec[1], vec[2], vec[0]);
      }
   } else {
      /* The hw doesn't support unaligned 32-bit loads, and only supports single-component
       * unaligned 1-byte and 2-byte loads. Luckily, we don't have to use single-component loads
       * because ac_nir_lower_subdword_load converts 1-byte and 2-byte vector loads with unaligned
       * offsets into aligned 32-bit loads by loading an extra dword and then bit-shifting all bits
       * to get the expected result. We only have to set bit_size to 8 or 16 and align_offset to
       * 1..3 to indicate that this is an unaligned load. align_offset is the amount of
       * unalignment.
       *
       * Since the buffer binding offsets are rounded down to the clear/copy size of the thread
       * (i.e. dst_align_offset is subtracted from dst_offset, and src_align_offset is subtracted
       * from src_offset), the stores expect the loaded value to be byte-shifted accordingly.
       * realign_offset is the amount of byte-shifting we have to do.
       */
      assert(util_is_power_of_two_nonzero(key->dwords_per_thread));
      int realign_offset = key->src_align_offset - key->dst_align_offset;
      unsigned alignment = (unsigned)realign_offset % 4 == 0 ? 4 :
                           (unsigned)realign_offset % 2 == 0 ? 2 : 1;
      unsigned bit_size = alignment * 8;
      unsigned num_comps = key->dwords_per_thread * 4 / alignment;
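      /* Example: dwords_per_thread = 4, src_align_offset = 3, dst_align_offset = 1 gives
       * realign_offset = 2, alignment = 2, bit_size = 16, num_comps = 8, i.e. the 16 bytes per
       * thread are loaded as eight 16-bit components.
       */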
      nir_if *if_first_thread = NULL;
      nir_def *value0 = NULL;

      if (realign_offset < 0) {
         /* If src_align_offset is less than dst_align_offset, realign_offset is
          * negative, which causes the first thread to use a negative buffer offset, which goes
          * entirely out of bounds because the offset is treated as unsigned. Instead of that,
          * the first thread should load from offset 0 by not loading the bytes before
          * the beginning of the buffer.
          */
         if_first_thread = nir_push_if(&b, nir_ieq_imm(&b, thread_id, 0));
         {
            unsigned num_removed_comps = -realign_offset / alignment;
            unsigned num_inbounds_comps = num_comps - num_removed_comps;

            /* Vectors with more than 5 components are only valid in NIR if they have 8 or 16. */
            while (!nir_num_components_valid(num_inbounds_comps))
               num_inbounds_comps = util_next_power_of_two(num_inbounds_comps);
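            /* E.g. dwords_per_thread = 4, alignment = 2, realign_offset = -2: num_comps = 8,
             * num_removed_comps = 1, so num_inbounds_comps = 7 is rounded up to 8; the extra
             * loaded channel is simply unused below.
             */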

            value0 = load_ssbo_sparse(&b, num_inbounds_comps, bit_size, nir_imm_int(&b, 0), offset,
                                      (struct _nir_load_ssbo_indices){
                                         .access = ACCESS_RESTRICT,
                                         .align_mul = 4,
                                         .align_offset = 0
                                      }, key->src_is_sparse);

            /* Add the components that we didn't load as undef. */
            nir_def *comps[16];
            assert(num_comps <= ARRAY_SIZE(comps));
            for (unsigned i = 0; i < num_comps; i++) {
               if (i < num_removed_comps)
                  comps[i] = nir_undef(&b, 1, bit_size);
               else
                  comps[i] = nir_channel(&b, value0, i - num_removed_comps);
            }
            value0 = nir_vec(&b, comps, num_comps);
         }
         nir_push_else(&b, if_first_thread);
      }

      value = load_ssbo_sparse(&b, num_comps, bit_size, nir_imm_int(&b, 0),
                               nir_iadd_imm(&b, offset, realign_offset),
                               (struct _nir_load_ssbo_indices){
                                  .access = ACCESS_RESTRICT,
                                  .align_mul = 4,
                                  .align_offset = (unsigned)realign_offset % 4
                               }, key->src_is_sparse);


      if (if_first_thread) {
         nir_pop_if(&b, if_first_thread);
         value = nir_if_phi(&b, value0, value);
      }

      /* Bitcast the vector to 32 bits. */
      if (value->bit_size != 32)
         value = nir_extract_bits(&b, &value, 1, 0, key->dwords_per_thread, 32);
   }

   nir_def *dst_buf = nir_imm_int(&b, !key->is_clear);
   nir_if *if_first_thread = NULL, *if_last_thread = NULL;

   if (!key->dst_single_thread_unaligned) {
      /* dst_align_offset means how many bytes the first thread should skip because the offset of
       * the buffer binding is rounded down to the clear/copy size of the thread, which makes the
       * bytes before dst_align_offset part of the writable binding. Above we used realign_offset
       * to byte-shift the value to compensate for the rounded-down offset, so that all stores are
       * dword stores regardless of the offset/size alignment, except that the first thread
       * shouldn't store the first dst_align_offset bytes, and the last thread should only store
       * the first dst_last_thread_bytes. In both cases, there is a dword that must be only
       * partially written by splitting it into 8-bit and 16-bit stores.
       */
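      /* For example, with dst_align_offset = 1 and dwords_per_thread = 4, the first thread stores
       * 1 byte at offset 1, 2 bytes at offset 2, and then 3 whole dwords starting at offset 4.
       */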
      if (key->dst_align_offset) {
         if_first_thread = nir_push_if(&b, nir_ieq_imm(&b, thread_id, 0));
         {
            unsigned local_offset = key->dst_align_offset;
            nir_def *first_dword = nir_channel(&b, value, local_offset / 4);

            if (local_offset % 2 == 1) {
               nir_store_ssbo(&b, nir_channel(&b, nir_unpack_32_4x8(&b, first_dword), local_offset % 4),
                              dst_buf, nir_iadd_imm_nuw(&b, offset, local_offset),
                              .access = ACCESS_RESTRICT);
               local_offset++;
            }

            if (local_offset % 4 == 2) {
               nir_store_ssbo(&b, nir_unpack_32_2x16_split_y(&b, first_dword), dst_buf,
                              nir_iadd_imm_nuw(&b, offset, local_offset),
                              .access = ACCESS_RESTRICT);
               local_offset += 2;
            }

            assert(local_offset % 4 == 0);
            unsigned num_dw_remaining = key->dwords_per_thread - local_offset / 4;

            if (num_dw_remaining) {
               nir_def *dwords =
                  nir_channels(&b, value, BITFIELD_RANGE(local_offset / 4, num_dw_remaining));

               nir_store_ssbo(&b, dwords, dst_buf, nir_iadd_imm_nuw(&b, offset, local_offset),
                              .access = ACCESS_RESTRICT);
            }
         }
         nir_push_else(&b, if_first_thread);
      }

      if (key->dst_last_thread_bytes) {
         nir_def *last_thread_id =
            nir_channel(&b, nir_load_user_data_amd(&b), last_thread_user_data_index);

         if_last_thread = nir_push_if(&b, nir_ieq(&b, thread_id, last_thread_id));
         {
            unsigned num_dwords = key->dst_last_thread_bytes / 4;
            bool write_short = (key->dst_last_thread_bytes - num_dwords * 4) / 2;
            bool write_byte = key->dst_last_thread_bytes % 2;
            nir_def *last_dword = nir_channel(&b, value, num_dwords);
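            /* For example, dst_last_thread_bytes = 7 gives num_dwords = 1, write_short = true and
             * write_byte = true: 1 dword is stored at offset 0, 2 bytes at offset 4 and 1 byte at
             * offset 6, all relative to this thread's offset.
             */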

            if (num_dwords) {
               nir_def *dwords = nir_channels(&b, value, BITFIELD_MASK(num_dwords));
               nir_store_ssbo(&b, dwords, dst_buf, offset, .access = ACCESS_RESTRICT);
            }

            if (write_short) {
               nir_store_ssbo(&b, nir_u2u16(&b, last_dword), dst_buf,
                              nir_iadd_imm_nuw(&b, offset, num_dwords * 4),
                              .access = ACCESS_RESTRICT);
            }

            if (write_byte) {
               nir_store_ssbo(&b, nir_channel(&b, nir_unpack_32_4x8(&b, last_dword), write_short * 2),
                              dst_buf, nir_iadd_imm_nuw(&b, offset, num_dwords * 4 + write_short * 2),
                              .access = ACCESS_RESTRICT);
            }
         }
         nir_push_else(&b, if_last_thread);
      }

      nir_store_ssbo(&b, value, dst_buf, offset, .access = ACCESS_RESTRICT);

      if (if_last_thread)
         nir_pop_if(&b, if_last_thread);
      if (if_first_thread)
         nir_pop_if(&b, if_first_thread);
   } else {
      /* This shader only executes a single thread (tiny copy or clear) and it's unaligned at both
       * the beginning and the end. Walk the individual dwords/words/bytes that should be written
       * to split the store accordingly.
       */
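      /* For example, dst_align_offset = 1 and dst_last_thread_bytes = 11 produce a 1-byte store
       * at offset 1, a 2-byte store at offset 2, a dword store at offset 4, a 2-byte store at
       * offset 8 and a 1-byte store at offset 10.
       */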
      for (unsigned local_offset = key->dst_align_offset;
           local_offset < key->dst_last_thread_bytes;) {
         unsigned remaining = key->dst_last_thread_bytes - local_offset;
         nir_def *src_dword = nir_channel(&b, value, local_offset / 4);

         if (local_offset % 2 == 1 || remaining == 1) {
            /* 1-byte store. */
            nir_def *src_dword4x8 = nir_unpack_32_4x8(&b, src_dword);
            nir_store_ssbo(&b, nir_channel(&b, src_dword4x8, local_offset % 4), dst_buf,
                           nir_iadd_imm_nuw(&b, offset, local_offset), .access = ACCESS_RESTRICT);
            local_offset++;
         } else if (local_offset % 4 == 2 || remaining == 2 || remaining == 3) {
            /* 2-byte store. */
            nir_def *src_dword2x16 = nir_unpack_32_2x16(&b, src_dword);
            nir_store_ssbo(&b, nir_channel(&b, src_dword2x16, (local_offset / 2) % 2), dst_buf,
                           nir_iadd_imm_nuw(&b, offset, local_offset), .access = ACCESS_RESTRICT);
            local_offset += 2;
         } else {
            /* 1-N dwords. */
            unsigned dw_size = remaining / 4;
            assert(dw_size);
            assert(local_offset % 4 == 0);

            nir_store_ssbo(&b, nir_channels(&b, value, BITFIELD_RANGE(local_offset / 4, dw_size)),
                           dst_buf, nir_iadd_imm_nuw(&b, offset, local_offset),
                           .access = ACCESS_RESTRICT);
            local_offset += dw_size * 4;
         }
      }
   }

   if (key->has_start_thread)
      nir_pop_if(&b, if_positive);

   return b.shader;
}

bool
ac_prepare_cs_clear_copy_buffer(const struct ac_cs_clear_copy_buffer_options *options,
                                const struct ac_cs_clear_copy_buffer_info *info,
                                struct ac_cs_clear_copy_buffer_dispatch *out)
{
   bool is_copy = info->clear_value_size == 0;

   memset(out, 0, sizeof(*out));

   /* Expand 1-byte and 2-byte clear values to a dword. */
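   /* (e.g. a 1-byte clear value 0xAB is replicated to the dword 0xABABABAB) */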
   int clear_value_size = info->clear_value_size;
   const uint32_t *clear_value = info->clear_value;
   uint32_t tmp_clear_value;

   if (!is_copy) {
      if (util_lower_clearsize_to_dword(clear_value, &clear_value_size, &tmp_clear_value))
         clear_value = &tmp_clear_value;

      assert(clear_value_size % 4 == 0);
   }

   /* This doesn't fail very often because the only possible fallback is CP DMA, which doesn't
    * support the render condition.
    */
   if (options->fail_if_slow && !info->render_condition_enabled && options->info->has_cp_dma &&
       !options->info->cp_sdma_ge_use_system_memory_scope) {
      switch (options->info->gfx_level) {
      /* GFX6-8: CP DMA clears are so slow that we risk getting a GPU timeout. CP DMA copies
       * are also slow, but less so.
       */
      case GFX6:
         /* Optimal for Tahiti. */
         if (is_copy) {
            if (!info->dst_is_vram || !info->src_is_vram ||
                info->size <= (info->dst_offset % 4 ||
                               (info->dst_offset == 4 && info->src_offset % 4) ? 32 * 1024 : 16 * 1024))
               return false;
         } else {
            /* CP DMA only supports dword-aligned clears and small clear values. */
            if (clear_value_size <= 4 && info->dst_offset % 4 == 0 && info->size % 4 == 0 &&
                info->dst_is_vram && info->size <= 1024)
               return false;
         }
         break;

      case GFX7:
         /* Optimal for Hawaii. */
         if (is_copy && info->dst_is_vram && info->src_is_vram && info->size <= 512)
            return false;
         break;

      case GFX8:
         /* Optimal for Tonga. */
         break;

      case GFX9:
         /* Optimal for Vega10. */
         if (is_copy) {
            if (info->src_is_vram) {
               if (info->dst_is_vram) {
                  if (info->size < 4096)
                     return false;
               } else {
                  if (info->size < (info->dst_offset % 64 ? 8192 : 2048))
                     return false;
               }
            } else {
               /* GTT->VRAM and GTT->GTT. */
               return false;
            }
         } else {
            /* CP DMA only supports dword-aligned clears and small clear values. */
            if (clear_value_size <= 4 && info->dst_offset % 4 == 0 && info->size % 4 == 0 &&
                !info->dst_is_vram && (info->size < 2048 || info->size >= 8 << 20 /* 8 MB */))
               return false;
         }
         break;

      case GFX10:
      case GFX10_3:
         /* Optimal for Navi21, Navi10. */
         break;

      case GFX11:
      default:
         /* Optimal for Navi31. */
         if (is_copy && info->size < 1024 && info->dst_offset % 256 && info->dst_is_vram && info->src_is_vram)
            return false;
         break;

      case GFX12:
         unreachable("cp_sdma_ge_use_system_memory_scope should be true, so we should never get here");
      }
   }

   unsigned dwords_per_thread = info->dwords_per_thread;

   /* Determine optimal dwords_per_thread for performance. */
   if (!info->dwords_per_thread) {
      /* This is a good initial value to start with. */
      dwords_per_thread = info->size <= 64 * 1024 ? 2 : 4;

      /* Clearing 4 dwords per thread with a 3-dword clear value is faster with big sizes. */
      if (!is_copy && clear_value_size == 12)
         dwords_per_thread = info->size <= 4096 ? 3 : 4;

      switch (options->info->gfx_level) {
      case GFX6:
         /* Optimal for Tahiti. */
         if (is_copy) {
            if (info->dst_is_vram && info->src_is_vram)
               dwords_per_thread = 2;
         } else {
            if (info->dst_is_vram && clear_value_size != 12)
               dwords_per_thread = info->size <= 128 * 1024 || info->size >= 4 << 20 /* 4MB */ ? 2 : 4;

            if (clear_value_size == 12)
               dwords_per_thread = info->size <= (info->dst_is_vram ? 256 : 128) * 1024 ? 3 : 4;
         }
         break;

      case GFX7:
         /* Optimal for Hawaii. */
         if (is_copy) {
            if (info->dst_is_vram && info->src_is_vram && info->dst_offset % 4 == 0 &&
                info->size >= 8 << 20 /* 8MB */)
               dwords_per_thread = 2;
         } else {
            if (info->dst_is_vram && clear_value_size != 12)
               dwords_per_thread = info->size <= 32 * 1024 ? 2 : 4;

            if (clear_value_size == 12)
               dwords_per_thread = info->size <= 256 * 1024 ? 3 : 4;
         }
         break;

      case GFX8:
         /* Optimal for Tonga. */
         if (is_copy) {
            dwords_per_thread = 2;
         } else {
            if (clear_value_size == 12 && info->size < (2 << 20) /* 2MB */)
               dwords_per_thread = 3;
         }
         break;

      case GFX9:
         /* Optimal for Vega10. */
         if (is_copy && info->src_is_vram && info->dst_is_vram && info->size >= 8 << 20 /* 8 MB */)
            dwords_per_thread = 2;

         if (!info->dst_is_vram)
            dwords_per_thread = 2;
         break;

      case GFX10:
      case GFX10_3:
      case GFX11:
      case GFX12:
         /* Optimal for Gfx12xx, Navi31, Navi21, Navi10. */
         break;

      default:
         break;
      }
   }

   /* dwords_per_thread must be at least the size of the clear value. */
   if (!is_copy)
      dwords_per_thread = MAX2(dwords_per_thread, clear_value_size / 4);

   /* Validate dwords_per_thread. */
   if (dwords_per_thread > 4) {
      assert(!"dwords_per_thread must be <= 4");
      return false; /* invalid value */
   }

   if (clear_value_size > dwords_per_thread * 4) {
      assert(!"clear_value_size must be <= dwords_per_thread * 4");
      return false; /* invalid value */
   }

   if (clear_value_size == 12 && info->dst_offset % 4) {
      assert(!"if clear_value_size == 12, dst_offset must be aligned to 4");
      return false; /* invalid value */
   }

   unsigned dst_align_offset = info->dst_offset % (dwords_per_thread * 4);
   unsigned dst_offset_bound = info->dst_offset - dst_align_offset;
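   /* E.g. dst_offset = 19 and dwords_per_thread = 2 give dst_align_offset = 3 and
    * dst_offset_bound = 16: the dst binding starts at 16 and the first thread skips its
    * first 3 bytes.
    */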
   unsigned src_align_offset = is_copy ? info->src_offset % 4 : 0;
   unsigned num_user_data_terms = 0;

   /* Set the clear value in user data SGPRs. */
   if (!is_copy) {
      assert(clear_value_size >= 4 && clear_value_size <= 16 &&
             (clear_value_size == 12 || util_is_power_of_two_or_zero(clear_value_size)));

      /* Since the clear value may start on an unaligned offset and we just pass user SGPRs
       * to dword stores as-is, we need to byte-shift the clear value to that offset and
       * replicate it because 1 invocation stores up to 4 dwords from user SGPRs regardless of
       * the clear value size.
       */
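      /* For example, a 4-byte clear value with dst_align_offset = 6 puts the last 2 bytes of the
       * value at the start of user_data and then repeats the full value, so the dword stores at
       * the rounded-down dst offset still produce the requested byte pattern.
       */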
      num_user_data_terms = clear_value_size == 12 ? 3 : dwords_per_thread;
      unsigned user_data_size = num_user_data_terms * 4;

      memcpy(out->user_data,
             (uint8_t*)clear_value + clear_value_size - dst_align_offset % clear_value_size,
             dst_align_offset % clear_value_size);
      unsigned offset = dst_align_offset % clear_value_size;

      while (offset + clear_value_size <= user_data_size) {
         memcpy((uint8_t*)out->user_data + offset, clear_value, clear_value_size);
         offset += clear_value_size;
      }

      if (offset < user_data_size)
         memcpy((uint8_t*)out->user_data + offset, clear_value, user_data_size - offset);
   }

   out->shader_key.key = 0;

   out->shader_key.is_clear = !is_copy;
   assert(dwords_per_thread && dwords_per_thread <= 4);
   out->shader_key.dwords_per_thread = dwords_per_thread;
   out->shader_key.clear_value_size_is_12 = !is_copy && clear_value_size == 12;
   out->shader_key.src_is_sparse = info->src_is_sparse;
   out->shader_key.src_align_offset = src_align_offset;
   out->shader_key.dst_align_offset = dst_align_offset;

   if ((dst_align_offset + info->size) % 4)
      out->shader_key.dst_last_thread_bytes = (dst_align_offset + info->size) % (dwords_per_thread * 4);

   unsigned num_threads = DIV_ROUND_UP(dst_align_offset + info->size, dwords_per_thread * 4);
   out->shader_key.dst_single_thread_unaligned = num_threads == 1 && dst_align_offset &&
                                                 out->shader_key.dst_last_thread_bytes;
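   /* E.g. dst_align_offset = 3, info->size = 10, dwords_per_thread = 2: dst_last_thread_bytes =
    * 13 % 8 = 5, num_threads = 2 and dst_single_thread_unaligned stays false.
    */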

   /* start_thread offsets threads to make sure all non-zero waves start clearing/copying from
    * the beginning of a 256B block and clear/copy whole 256B blocks. Clearing/copying a 256B
    * block partially for each wave is inefficient, which happens when dst_offset isn't aligned
    * to 256. Clearing/copying whole 256B blocks per wave isn't possible if dwords_per_thread
    * isn't 2^n.
    */
   unsigned start_thread =
      dst_offset_bound % 256 && util_is_power_of_two_nonzero(dwords_per_thread) ?
            DIV_ROUND_UP(256 - dst_offset_bound % 256, dwords_per_thread * 4) : 0;
   out->shader_key.has_start_thread = start_thread != 0;

   /* Set the value of the last thread ID, so that the shader knows which thread is the last one. */
   if (out->shader_key.dst_last_thread_bytes)
      out->user_data[num_user_data_terms++] = num_threads - 1;
   if (out->shader_key.has_start_thread)
      out->user_data[num_user_data_terms++] = start_thread;

   /* We need to bind whole dwords because of how we compute voffset. The bytes that shouldn't
    * be written are not written by the shader.
    */
   out->ssbo[is_copy].offset = dst_offset_bound;
   out->ssbo[is_copy].size = align(dst_align_offset + info->size, 4);

   if (is_copy) {
      /* Since unaligned copies use 32-bit loads, any dword that's partially covered by the copy
       * range must be fully covered, so that the 32-bit loads succeed.
       */
      out->ssbo[0].offset = info->src_offset - src_align_offset;
      out->ssbo[0].size = align(src_align_offset + info->size, 4);
      assert(out->ssbo[0].offset % 4 == 0 && out->ssbo[0].size % 4 == 0);
   }

   out->num_ssbos = is_copy ? 2 : 1;
   out->workgroup_size = 64;
   out->num_threads = start_thread + num_threads;
   return true;
}