/*
 * Copyright 2024 Advanced Micro Devices, Inc.
 *
 * SPDX-License-Identifier: MIT
 */

#include "ac_nir_meta.h"
#include "ac_nir_helpers.h"
#include "nir_builder.h"
#include "util/helpers.h"

/* This is regular load_ssbo with special handling for sparse buffers. Normally, sparse buffer
 * loads return 0 for all components if a sparse load starts on a non-resident page, crosses
 * the page boundary, and ends on a resident page. For copy_buffer, we want it to return 0 only
 * for the portion of the load that's non-resident, and load values for the portion that's
 * resident. The workaround is to scalarize such loads and disallow vectorization.
 */
static nir_def *
load_ssbo_sparse(nir_builder *b, unsigned num_components, unsigned bit_size, nir_def *buf,
                 nir_def *offset, struct _nir_load_ssbo_indices params, bool sparse)
{
   if (sparse && num_components > 1) {
      nir_def *vec[NIR_MAX_VEC_COMPONENTS];

      /* Split the vector load into scalar loads. */
      for (unsigned i = 0; i < num_components; i++) {
         unsigned elem_offset = i * bit_size / 8;
         unsigned align_offset = (params.align_offset + elem_offset) % params.align_mul;

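         /* ACCESS_KEEP_SCALAR keeps this load scalar so that later passes don't vectorize it
          * again (see the comment above this function).
          */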
         vec[i] = nir_load_ssbo(b, 1, bit_size, buf,
                                nir_iadd_imm(b, offset, elem_offset),
                                .access = params.access | ACCESS_KEEP_SCALAR,
                                .align_mul = params.align_mul,
                                .align_offset = align_offset);
      }
      return nir_vec(b, vec, num_components);
   } else {
      return nir_load_ssbo(b, num_components, bit_size, buf, offset,
                           .access = params.access,
                           .align_mul = params.align_mul,
                           .align_offset = params.align_offset);
   }
}

/* Create a compute shader implementing clear_buffer or copy_buffer. */
nir_shader *
ac_create_clear_copy_buffer_cs(struct ac_cs_clear_copy_buffer_options *options,
                               union ac_cs_clear_copy_buffer_key *key)
{
   if (options->print_key) {
      fprintf(stderr, "Internal shader: dma\n");
      fprintf(stderr, " key.is_clear = %u\n", key->is_clear);
      fprintf(stderr, " key.dwords_per_thread = %u\n", key->dwords_per_thread);
      fprintf(stderr, " key.clear_value_size_is_12 = %u\n", key->clear_value_size_is_12);
      fprintf(stderr, " key.src_is_sparse = %u\n", key->src_is_sparse);
      fprintf(stderr, " key.src_align_offset = %u\n", key->src_align_offset);
      fprintf(stderr, " key.dst_align_offset = %u\n", key->dst_align_offset);
      fprintf(stderr, " key.dst_last_thread_bytes = %u\n", key->dst_last_thread_bytes);
      fprintf(stderr, " key.dst_single_thread_unaligned = %u\n", key->dst_single_thread_unaligned);
      fprintf(stderr, "\n");
   }

   assert(key->dwords_per_thread && key->dwords_per_thread <= 4);

   nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, options->nir_options,
                                                  "clear_copy_buffer_cs");
   b.shader->info.workgroup_size[0] = 64;
   b.shader->info.workgroup_size[1] = 1;
   b.shader->info.workgroup_size[2] = 1;
   b.shader->info.num_ssbos = key->is_clear ? 1 : 2;
   b.shader->info.cs.user_data_components_amd = 0;

   if (key->is_clear) {
      b.shader->info.cs.user_data_components_amd +=
         key->clear_value_size_is_12 ? 3 : key->dwords_per_thread;
   }

   /* Add the last thread ID value. */
   unsigned last_thread_user_data_index = b.shader->info.cs.user_data_components_amd;
   if (key->dst_last_thread_bytes)
      b.shader->info.cs.user_data_components_amd++;

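   /* Add the start thread ID value. */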
   unsigned start_thread_user_data_index = b.shader->info.cs.user_data_components_amd;
   if (key->has_start_thread)
      b.shader->info.cs.user_data_components_amd++;

   nir_def *thread_id = ac_get_global_ids(&b, 1, 32);

   /* If the clear/copy area is unaligned, we launched extra threads at the beginning to make it
    * aligned. Skip those threads here.
    */
   nir_if *if_positive = NULL;
   if (key->has_start_thread) {
      nir_def *start_thread =
         nir_channel(&b, nir_load_user_data_amd(&b), start_thread_user_data_index);
      thread_id = nir_isub(&b, thread_id, start_thread);
      if_positive = nir_push_if(&b, nir_ige_imm(&b, thread_id, 0));
   }

   /* Convert the global thread ID into bytes. */
   nir_def *offset = nir_imul_imm(&b, thread_id, 4 * key->dwords_per_thread);
   nir_def *value;

   if (key->is_clear) {
      value = nir_trim_vector(&b, nir_load_user_data_amd(&b), key->dwords_per_thread);

      /* We store 4 dwords per thread, but the clear value has 3 dwords. Swizzle it to 4 dwords.
       * Storing 4 dwords per thread is faster even when the ALU cost is worse.
       */
      if (key->clear_value_size_is_12 && key->dwords_per_thread == 4) {
         nir_def *dw_offset = nir_imul_imm(&b, thread_id, key->dwords_per_thread);
         nir_def *vec[3];

         /* Swizzle a 3-component clear value to get a 4-component clear value. Example:
          *    0 1 2 3 | 4 5 6 7 | 8 9 10 11     // dw_offset
          *                  |
          *                  V
          *    0 1 2 0 | 1 2 0 1 | 2 0 1 2       // clear value component indices
          */
         for (unsigned i = 0; i < 3; i++) {
            vec[i] = nir_vector_extract(&b, value,
                                        nir_umod_imm(&b, nir_iadd_imm(&b, dw_offset, i), 3));
         }
         value = nir_vec4(&b, vec[0], vec[1], vec[2], vec[0]);
      }
   } else {
      /* The hw doesn't support unaligned 32-bit loads, and only supports single-component
       * unaligned 1-byte and 2-byte loads. Luckily, we don't have to use single-component loads
       * because ac_nir_lower_subdword_load converts 1-byte and 2-byte vector loads with unaligned
       * offsets into aligned 32-bit loads by loading an extra dword and then bit-shifting all bits
       * to get the expected result. We only have to set bit_size to 8 or 16 and align_offset to
       * 1..3 to indicate that this is an unaligned load. align_offset is the amount of
       * unalignment.
       *
       * Since the buffer binding offsets are rounded down to the clear/copy size of the thread
       * (i.e. dst_align_offset is subtracted from dst_offset, and src_align_offset is subtracted
       * from src_offset), the stores expect the loaded value to be byte-shifted accordingly.
       * realign_offset is the amount of byte-shifting we have to do.
       */
      assert(util_is_power_of_two_nonzero(key->dwords_per_thread));
      int realign_offset = key->src_align_offset - key->dst_align_offset;
      unsigned alignment = (unsigned)realign_offset % 4 == 0 ? 4 :
                           (unsigned)realign_offset % 2 == 0 ? 2 : 1;
      unsigned bit_size = alignment * 8;
      unsigned num_comps = key->dwords_per_thread * 4 / alignment;
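      /* E.g. src_align_offset = 1 and dst_align_offset = 3 give realign_offset = -2, so
       * alignment = 2, bit_size = 16 and num_comps = dwords_per_thread * 2.
       */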
      nir_if *if_first_thread = NULL;
      nir_def *value0 = NULL;

      if (realign_offset < 0) {
         /* If src_align_offset is less than dst_align_offset, realign_offset is negative, so
          * the first thread would use a negative buffer offset, which goes entirely out of
          * bounds because the offset is treated as unsigned. Instead, the first thread should
          * load from offset 0 by not loading the bytes before the beginning of the buffer.
          */
         if_first_thread = nir_push_if(&b, nir_ieq_imm(&b, thread_id, 0));
         {
            unsigned num_removed_comps = -realign_offset / alignment;
            unsigned num_inbounds_comps = num_comps - num_removed_comps;
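            /* E.g. realign_offset = -2 with alignment = 2 removes the one 16-bit component
             * that would land before the start of the source buffer.
             */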

            /* Above 5 components, only 8- and 16-component vectors are valid in NIR. */
            while (!nir_num_components_valid(num_inbounds_comps))
               num_inbounds_comps = util_next_power_of_two(num_inbounds_comps);

            value0 = load_ssbo_sparse(&b, num_inbounds_comps, bit_size, nir_imm_int(&b, 0), offset,
                                      (struct _nir_load_ssbo_indices){
                                         .access = ACCESS_RESTRICT,
                                         .align_mul = 4,
                                         .align_offset = 0
                                      }, key->src_is_sparse);

            /* Add the components that we didn't load as undef. */
            nir_def *comps[16];
            assert(num_comps <= ARRAY_SIZE(comps));
            for (unsigned i = 0; i < num_comps; i++) {
               if (i < num_removed_comps)
                  comps[i] = nir_undef(&b, 1, bit_size);
               else
                  comps[i] = nir_channel(&b, value0, i - num_removed_comps);
            }
            value0 = nir_vec(&b, comps, num_comps);
         }
         nir_push_else(&b, if_first_thread);
      }

      value = load_ssbo_sparse(&b, num_comps, bit_size, nir_imm_int(&b, 0),
                               nir_iadd_imm(&b, offset, realign_offset),
                               (struct _nir_load_ssbo_indices){
                                  .access = ACCESS_RESTRICT,
                                  .align_mul = 4,
                                  .align_offset = (unsigned)realign_offset % 4
                               }, key->src_is_sparse);

      if (if_first_thread) {
         nir_pop_if(&b, if_first_thread);
         value = nir_if_phi(&b, value0, value);
      }

      /* Bitcast the vector to 32 bits. */
      if (value->bit_size != 32)
         value = nir_extract_bits(&b, &value, 1, 0, key->dwords_per_thread, 32);
   }

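   /* SSBO slot 0 is the destination for clears; for copies, slot 0 is the source and slot 1 is
    * the destination.
    */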
   nir_def *dst_buf = nir_imm_int(&b, !key->is_clear);
   nir_if *if_first_thread = NULL, *if_last_thread = NULL;

   if (!key->dst_single_thread_unaligned) {
      /* dst_align_offset means how many bytes the first thread should skip because the offset of
       * the buffer binding is rounded down to the clear/copy size of the thread, causing the
       * bytes before dst_align_offset to be writable. Above we used realign_offset to byte-shift
       * the value to compensate for the rounded-down offset, so that all stores are dword stores
       * regardless of the offset/size alignment except that the first thread shouldn't store
       * the first dst_align_offset bytes, and the last thread should only store the first
       * dst_last_thread_bytes. In both cases, there is a dword that must be only partially
       * written by splitting it into 8-bit and 16-bit stores.
       */
      if (key->dst_align_offset) {
         if_first_thread = nir_push_if(&b, nir_ieq_imm(&b, thread_id, 0));
         {
            unsigned local_offset = key->dst_align_offset;
            nir_def *first_dword = nir_channel(&b, value, local_offset / 4);

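            /* Store a leading byte and/or a leading 16-bit word until local_offset is
             * dword-aligned, then store the remaining whole dwords.
             */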
            if (local_offset % 2 == 1) {
               nir_store_ssbo(&b, nir_channel(&b, nir_unpack_32_4x8(&b, first_dword), local_offset % 4),
                              dst_buf, nir_iadd_imm_nuw(&b, offset, local_offset),
                              .access = ACCESS_RESTRICT);
               local_offset++;
            }

            if (local_offset % 4 == 2) {
               nir_store_ssbo(&b, nir_unpack_32_2x16_split_y(&b, first_dword), dst_buf,
                              nir_iadd_imm_nuw(&b, offset, local_offset),
                              .access = ACCESS_RESTRICT);
               local_offset += 2;
            }

            assert(local_offset % 4 == 0);
            unsigned num_dw_remaining = key->dwords_per_thread - local_offset / 4;

            if (num_dw_remaining) {
               nir_def *dwords =
                  nir_channels(&b, value, BITFIELD_RANGE(local_offset / 4, num_dw_remaining));

               nir_store_ssbo(&b, dwords, dst_buf, nir_iadd_imm_nuw(&b, offset, local_offset),
                              .access = ACCESS_RESTRICT);
            }
         }
         nir_push_else(&b, if_first_thread);
      }

      if (key->dst_last_thread_bytes) {
         nir_def *last_thread_id =
            nir_channel(&b, nir_load_user_data_amd(&b), last_thread_user_data_index);

         if_last_thread = nir_push_if(&b, nir_ieq(&b, thread_id, last_thread_id));
         {
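            /* Split dst_last_thread_bytes into whole dwords plus an optional 16-bit and 8-bit
             * tail: dst_last_thread_bytes = num_dwords * 4 + write_short * 2 + write_byte.
             */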
            unsigned num_dwords = key->dst_last_thread_bytes / 4;
            bool write_short = (key->dst_last_thread_bytes - num_dwords * 4) / 2;
            bool write_byte = key->dst_last_thread_bytes % 2;
            nir_def *last_dword = nir_channel(&b, value, num_dwords);

            if (num_dwords) {
               nir_def *dwords = nir_channels(&b, value, BITFIELD_MASK(num_dwords));
               nir_store_ssbo(&b, dwords, dst_buf, offset, .access = ACCESS_RESTRICT);
            }

            if (write_short) {
               nir_store_ssbo(&b, nir_u2u16(&b, last_dword), dst_buf,
                              nir_iadd_imm_nuw(&b, offset, num_dwords * 4),
                              .access = ACCESS_RESTRICT);
            }

            if (write_byte) {
               nir_store_ssbo(&b, nir_channel(&b, nir_unpack_32_4x8(&b, last_dword), write_short * 2),
                              dst_buf, nir_iadd_imm_nuw(&b, offset, num_dwords * 4 + write_short * 2),
                              .access = ACCESS_RESTRICT);
            }
         }
         nir_push_else(&b, if_last_thread);
      }

      nir_store_ssbo(&b, value, dst_buf, offset, .access = ACCESS_RESTRICT);

      if (if_last_thread)
         nir_pop_if(&b, if_last_thread);
      if (if_first_thread)
         nir_pop_if(&b, if_first_thread);
   } else {
      /* This shader only executes a single thread (tiny copy or clear) and it's unaligned at both
       * the beginning and the end. Walk the individual dwords/words/bytes that should be written
       * to split the store accordingly.
       */
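      /* E.g. dst_align_offset = 1 and dst_last_thread_bytes = 11 produce a byte store at
       * offset 1, a 16-bit store at 2, a dword store at 4, a 16-bit store at 8 and a byte
       * store at 10.
       */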
      for (unsigned local_offset = key->dst_align_offset;
           local_offset < key->dst_last_thread_bytes;) {
         unsigned remaining = key->dst_last_thread_bytes - local_offset;
         nir_def *src_dword = nir_channel(&b, value, local_offset / 4);

         if (local_offset % 2 == 1 || remaining == 1) {
            /* 1-byte store. */
            nir_def *src_dword4x8 = nir_unpack_32_4x8(&b, src_dword);
            nir_store_ssbo(&b, nir_channel(&b, src_dword4x8, local_offset % 4), dst_buf,
                           nir_iadd_imm_nuw(&b, offset, local_offset), .access = ACCESS_RESTRICT);
            local_offset++;
         } else if (local_offset % 4 == 2 || remaining == 2 || remaining == 3) {
            /* 2-byte store. */
            nir_def *src_dword2x16 = nir_unpack_32_2x16(&b, src_dword);
            nir_store_ssbo(&b, nir_channel(&b, src_dword2x16, (local_offset / 2) % 2), dst_buf,
                           nir_iadd_imm_nuw(&b, offset, local_offset), .access = ACCESS_RESTRICT);
            local_offset += 2;
         } else {
            /* 1-N dwords. */
            unsigned dw_size = remaining / 4;
            assert(dw_size);
            assert(local_offset % 4 == 0);

            nir_store_ssbo(&b, nir_channels(&b, value, BITFIELD_RANGE(local_offset / 4, dw_size)),
                           dst_buf, nir_iadd_imm_nuw(&b, offset, local_offset),
                           .access = ACCESS_RESTRICT);
            local_offset += dw_size * 4;
         }
      }
   }

   if (key->has_start_thread)
      nir_pop_if(&b, if_positive);

   return b.shader;
}

bool
ac_prepare_cs_clear_copy_buffer(const struct ac_cs_clear_copy_buffer_options *options,
                                const struct ac_cs_clear_copy_buffer_info *info,
                                struct ac_cs_clear_copy_buffer_dispatch *out)
{
   bool is_copy = info->clear_value_size == 0;

   memset(out, 0, sizeof(*out));

   /* Expand 1-byte and 2-byte clear values to a dword. */
   int clear_value_size = info->clear_value_size;
   const uint32_t *clear_value = info->clear_value;
   uint32_t tmp_clear_value;

   if (!is_copy) {
      if (util_lower_clearsize_to_dword(clear_value, &clear_value_size, &tmp_clear_value))
         clear_value = &tmp_clear_value;

      assert(clear_value_size % 4 == 0);
   }

   /* This doesn't fail very often because the only possible fallback is CP DMA, which doesn't
    * support the render condition.
    */
   if (options->fail_if_slow && !info->render_condition_enabled && options->info->has_cp_dma &&
       !options->info->cp_sdma_ge_use_system_memory_scope) {
      switch (options->info->gfx_level) {
      /* GFX6-8: CP DMA clears are so slow that we risk getting a GPU timeout. CP DMA copies
       * are also slow, but less so.
       */
      case GFX6:
         /* Optimal for Tahiti. */
         if (is_copy) {
            if (!info->dst_is_vram || !info->src_is_vram ||
                info->size <= (info->dst_offset % 4 ||
                               (info->dst_offset == 4 && info->src_offset % 4) ? 32 * 1024 : 16 * 1024))
               return false;
         } else {
            /* CP DMA only supports dword-aligned clears and small clear values. */
            if (clear_value_size <= 4 && info->dst_offset % 4 == 0 && info->size % 4 == 0 &&
                info->dst_is_vram && info->size <= 1024)
               return false;
         }
         break;

      case GFX7:
         /* Optimal for Hawaii. */
         if (is_copy && info->dst_is_vram && info->src_is_vram && info->size <= 512)
            return false;
         break;

      case GFX8:
         /* Optimal for Tonga. */
         break;

      case GFX9:
         /* Optimal for Vega10. */
         if (is_copy) {
            if (info->src_is_vram) {
               if (info->dst_is_vram) {
                  if (info->size < 4096)
                     return false;
               } else {
                  if (info->size < (info->dst_offset % 64 ? 8192 : 2048))
                     return false;
               }
            } else {
               /* GTT->VRAM and GTT->GTT. */
               return false;
            }
         } else {
            /* CP DMA only supports dword-aligned clears and small clear values. */
            if (clear_value_size <= 4 && info->dst_offset % 4 == 0 && info->size % 4 == 0 &&
                !info->dst_is_vram && (info->size < 2048 || info->size >= 8 << 20 /* 8 MB */))
               return false;
         }
         break;

      case GFX10:
      case GFX10_3:
         /* Optimal for Navi21, Navi10. */
         break;

      case GFX11:
      default:
         /* Optimal for Navi31. */
         if (is_copy && info->size < 1024 && info->dst_offset % 256 && info->dst_is_vram &&
             info->src_is_vram)
            return false;
         break;

      case GFX12:
         unreachable("cp_sdma_ge_use_system_memory_scope should be true, so we should never get here");
      }
   }

   unsigned dwords_per_thread = info->dwords_per_thread;

   /* Determine optimal dwords_per_thread for performance. */
   if (!info->dwords_per_thread) {
      /* This is a good initial value to start with. */
      dwords_per_thread = info->size <= 64 * 1024 ? 2 : 4;

      /* Clearing 4 dwords per thread with a 3-dword clear value is faster with big sizes. */
      if (!is_copy && clear_value_size == 12)
         dwords_per_thread = info->size <= 4096 ? 3 : 4;

      switch (options->info->gfx_level) {
      case GFX6:
         /* Optimal for Tahiti. */
         if (is_copy) {
            if (info->dst_is_vram && info->src_is_vram)
               dwords_per_thread = 2;
         } else {
            if (info->dst_is_vram && clear_value_size != 12)
               dwords_per_thread = info->size <= 128 * 1024 || info->size >= 4 << 20 /* 4MB */ ? 2 : 4;

            if (clear_value_size == 12)
               dwords_per_thread = info->size <= (info->dst_is_vram ? 256 : 128) * 1024 ? 3 : 4;
         }
         break;

      case GFX7:
         /* Optimal for Hawaii. */
         if (is_copy) {
            if (info->dst_is_vram && info->src_is_vram && info->dst_offset % 4 == 0 &&
                info->size >= 8 << 20 /* 8MB */)
               dwords_per_thread = 2;
         } else {
            if (info->dst_is_vram && clear_value_size != 12)
               dwords_per_thread = info->size <= 32 * 1024 ? 2 : 4;

            if (clear_value_size == 12)
               dwords_per_thread = info->size <= 256 * 1024 ? 3 : 4;
         }
         break;

      case GFX8:
         /* Optimal for Tonga. */
         if (is_copy) {
            dwords_per_thread = 2;
         } else {
            if (clear_value_size == 12 && info->size < (2 << 20) /* 2MB */)
               dwords_per_thread = 3;
         }
         break;

      case GFX9:
         /* Optimal for Vega10. */
         if (is_copy && info->src_is_vram && info->dst_is_vram && info->size >= 8 << 20 /* 8 MB */)
            dwords_per_thread = 2;

         if (!info->dst_is_vram)
            dwords_per_thread = 2;
         break;

      case GFX10:
      case GFX10_3:
      case GFX11:
      case GFX12:
         /* Optimal for Gfx12xx, Navi31, Navi21, Navi10. */
         break;

      default:
         break;
      }
   }

   /* dwords_per_thread must be at least the size of the clear value. */
   if (!is_copy)
      dwords_per_thread = MAX2(dwords_per_thread, clear_value_size / 4);

   /* Validate dwords_per_thread. */
   if (dwords_per_thread > 4) {
      assert(!"dwords_per_thread must be <= 4");
      return false; /* invalid value */
   }

   if (clear_value_size > dwords_per_thread * 4) {
      assert(!"clear_value_size must be <= dwords_per_thread * 4");
      return false; /* invalid value */
   }

   if (clear_value_size == 12 && info->dst_offset % 4) {
      assert(!"if clear_value_size == 12, dst_offset must be aligned to 4");
      return false; /* invalid value */
   }

   unsigned dst_align_offset = info->dst_offset % (dwords_per_thread * 4);
   unsigned dst_offset_bound = info->dst_offset - dst_align_offset;
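   /* E.g. dst_offset = 11 with dwords_per_thread = 2 gives dst_align_offset = 3 and
    * dst_offset_bound = 8: the binding starts at byte 8 and the first thread skips 3 bytes.
    */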
   unsigned src_align_offset = is_copy ? info->src_offset % 4 : 0;
   unsigned num_user_data_terms = 0;

   /* Set the clear value in user data SGPRs. */
   if (!is_copy) {
      assert(clear_value_size >= 4 && clear_value_size <= 16 &&
             (clear_value_size == 12 || util_is_power_of_two_or_zero(clear_value_size)));

      /* Since the clear value may start on an unaligned offset and we just pass user SGPRs
       * to dword stores as-is, we need to byte-shift the clear value to that offset and
       * replicate it because 1 invocation stores up to 4 dwords from user SGPRs regardless of
       * the clear value size.
       */
      num_user_data_terms = clear_value_size == 12 ? 3 : dwords_per_thread;
      unsigned user_data_size = num_user_data_terms * 4;

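      /* E.g. clear_value_size = 4 and dst_align_offset = 3: the last 3 bytes of the clear value
       * go to user_data bytes 0..2, the whole clear value is then replicated at byte offsets
       * 3, 7, 11, ..., and the remaining tail bytes get the leading bytes of the clear value.
       */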
      memcpy(out->user_data,
             (uint8_t*)clear_value + clear_value_size - dst_align_offset % clear_value_size,
             dst_align_offset % clear_value_size);
      unsigned offset = dst_align_offset % clear_value_size;

      while (offset + clear_value_size <= user_data_size) {
         memcpy((uint8_t*)out->user_data + offset, clear_value, clear_value_size);
         offset += clear_value_size;
      }

      if (offset < user_data_size)
         memcpy((uint8_t*)out->user_data + offset, clear_value, user_data_size - offset);
   }

   out->shader_key.key = 0;

   out->shader_key.is_clear = !is_copy;
   assert(dwords_per_thread && dwords_per_thread <= 4);
   out->shader_key.dwords_per_thread = dwords_per_thread;
   out->shader_key.clear_value_size_is_12 = !is_copy && clear_value_size == 12;
   out->shader_key.src_is_sparse = info->src_is_sparse;
   out->shader_key.src_align_offset = src_align_offset;
   out->shader_key.dst_align_offset = dst_align_offset;

   if ((dst_align_offset + info->size) % 4)
      out->shader_key.dst_last_thread_bytes = (dst_align_offset + info->size) % (dwords_per_thread * 4);

   unsigned num_threads = DIV_ROUND_UP(dst_align_offset + info->size, dwords_per_thread * 4);
   out->shader_key.dst_single_thread_unaligned = num_threads == 1 && dst_align_offset &&
                                                 out->shader_key.dst_last_thread_bytes;

   /* start_thread offsets threads to make sure all non-zero waves start clearing/copying from
    * the beginning of a 256B block and clear/copy whole 256B blocks. Clearing/copying a 256B
    * block partially for each wave is inefficient, which happens when dst_offset isn't aligned
    * to 256. Clearing/copying whole 256B blocks per wave isn't possible if dwords_per_thread
    * isn't 2^n.
    */
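   /* E.g. dst_offset_bound % 256 == 64 and dwords_per_thread == 4 give
    * start_thread = DIV_ROUND_UP(192, 16) = 12.
    */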
   unsigned start_thread =
      dst_offset_bound % 256 && util_is_power_of_two_nonzero(dwords_per_thread) ?
         DIV_ROUND_UP(256 - dst_offset_bound % 256, dwords_per_thread * 4) : 0;
   out->shader_key.has_start_thread = start_thread != 0;

   /* Set the value of the last thread ID, so that the shader knows which thread is the last one. */
   if (out->shader_key.dst_last_thread_bytes)
      out->user_data[num_user_data_terms++] = num_threads - 1;
   if (out->shader_key.has_start_thread)
      out->user_data[num_user_data_terms++] = start_thread;

   /* We need to bind whole dwords because of how we compute voffset. The bytes that shouldn't
    * be written are not written by the shader.
    */
   out->ssbo[is_copy].offset = dst_offset_bound;
   out->ssbo[is_copy].size = align(dst_align_offset + info->size, 4);

   if (is_copy) {
      /* Since unaligned copies use 32-bit loads, any dword that's partially covered by the copy
       * range must be fully covered, so that the 32-bit loads succeed.
       */
      out->ssbo[0].offset = info->src_offset - src_align_offset;
      out->ssbo[0].size = align(src_align_offset + info->size, 4);
      assert(out->ssbo[0].offset % 4 == 0 && out->ssbo[0].size % 4 == 0);
   }

   out->num_ssbos = is_copy ? 2 : 1;
   out->workgroup_size = 64;
   out->num_threads = start_thread + num_threads;
   return true;
}