/*
 * Copyright 2010 Jerome Glisse <[email protected]>
 * Copyright 2015-2021 Advanced Micro Devices, Inc.
 *
 * SPDX-License-Identifier: MIT
 */

#include "si_build_pm4.h"
#include "sid.h"
#include "util/u_memory.h"
#include "ac_formats.h"

static bool si_prepare_for_sdma_copy(struct si_context *sctx, struct si_texture *dst,
                                     struct si_texture *src)
{
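   /* SDMA can only copy between surfaces with the same bytes per element,
    * without MSAA, and with a single mip level.
    */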
   if (dst->surface.bpe != src->surface.bpe)
      return false;

   /* MSAA: Blits don't exist in the real world. */
   if (src->buffer.b.b.nr_samples > 1 || dst->buffer.b.b.nr_samples > 1)
      return false;

   if (dst->buffer.b.b.last_level != 0 || src->buffer.b.b.last_level != 0)
      return false;

   return true;
}

static unsigned minify_as_blocks(unsigned width, unsigned level, unsigned blk_w)
{
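   /* Return the width of the given mip level in units of format blocks. */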
   width = u_minify(width, level);
   return DIV_ROUND_UP(width, blk_w);
}

static unsigned encode_legacy_tile_info(struct si_context *sctx, struct si_texture *tex)
{
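   /* Pack the GFX6-8 tiling parameters into the tiling-info dword of the SDMA
    * tiled sub-window copy packet.
    */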
   struct radeon_info *info = &sctx->screen->info;
   unsigned tile_index = tex->surface.u.legacy.tiling_index[0];
   unsigned macro_tile_index = tex->surface.u.legacy.macro_tile_index;
   unsigned tile_mode = info->si_tile_mode_array[tile_index];
   unsigned macro_tile_mode = info->cik_macrotile_mode_array[macro_tile_index];

   return util_logbase2(tex->surface.bpe) |
          (G_009910_ARRAY_MODE(tile_mode) << 3) |
          (G_009910_MICRO_TILE_MODE_NEW(tile_mode) << 8) |
          /* Non-depth modes don't have TILE_SPLIT set. */
          (util_logbase2(tex->surface.u.legacy.tile_split >> 6) << 11) |
          (G_009990_BANK_WIDTH(macro_tile_mode) << 15) |
          (G_009990_BANK_HEIGHT(macro_tile_mode) << 18) |
          (G_009990_NUM_BANKS(macro_tile_mode) << 21) |
          (G_009990_MACRO_TILE_ASPECT(macro_tile_mode) << 24) |
          (G_009910_PIPE_CONFIG(tile_mode) << 26);
}

static bool si_sdma_v4_v5_copy_texture(struct si_context *sctx, struct si_texture *sdst,
                                       struct si_texture *ssrc)
{
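   /* GFX9+ (SDMA v4 and newer) texture copy: a chunked linear copy when both
    * surfaces are linear, or a tiled <-> linear sub-window copy otherwise.
    * Returns false if the copy can't be expressed with these packets.
    */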
   bool is_v5 = sctx->gfx_level >= GFX10;
   bool is_v5_2 = sctx->gfx_level >= GFX10_3;
   bool is_v7 = sctx->gfx_level >= GFX12;
   unsigned bpp = sdst->surface.bpe;
   uint64_t dst_address = sdst->buffer.gpu_address + sdst->surface.u.gfx9.surf_offset;
   uint64_t src_address = ssrc->buffer.gpu_address + ssrc->surface.u.gfx9.surf_offset;
   unsigned dst_pitch = sdst->surface.u.gfx9.surf_pitch;
   unsigned src_pitch = ssrc->surface.u.gfx9.surf_pitch;
   unsigned copy_width = DIV_ROUND_UP(ssrc->buffer.b.b.width0, ssrc->surface.blk_w);
   unsigned copy_height = DIV_ROUND_UP(ssrc->buffer.b.b.height0, ssrc->surface.blk_h);

   bool tmz = (ssrc->buffer.flags & RADEON_FLAG_ENCRYPTED);
   assert(!tmz || (sdst->buffer.flags & RADEON_FLAG_ENCRYPTED));

   /* Linear -> linear sub-window copy. */
   if (ssrc->surface.is_linear && sdst->surface.is_linear) {
      struct radeon_cmdbuf *cs = sctx->sdma_cs;

      uint64_t bytes = (uint64_t)src_pitch * copy_height * bpp;
      uint32_t chunk_size = 1u << (is_v5_2 ? 30 : 22);
      uint32_t chunk_count = DIV_ROUND_UP(bytes, chunk_size);
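      /* A single LINEAR copy packet can move at most chunk_size bytes
       * (2^22, or 2^30 on SDMA v5.2+), so emit one packet per chunk.
       */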
      src_address += ssrc->surface.u.gfx9.offset[0];
      dst_address += sdst->surface.u.gfx9.offset[0];

      radeon_begin(cs);
      for (unsigned i = 0; i < chunk_count; i++) {
         uint32_t size = MIN2(chunk_size, bytes);
         radeon_emit(SDMA_PACKET(SDMA_OPCODE_COPY,
                                 SDMA_COPY_SUB_OPCODE_LINEAR,
                                 (tmz ? 4 : 0)));
         radeon_emit(size - 1);
         radeon_emit(0);
         radeon_emit((uint32_t)src_address);
         radeon_emit((uint32_t)(src_address >> 32));
         radeon_emit((uint32_t)dst_address);
         radeon_emit((uint32_t)(dst_address >> 32));

         src_address += size;
         dst_address += size;
         bytes -= size;
      }
      radeon_end();
      return true;
   }

   /* Linear <-> tiled sub-window copy. */
   if (ssrc->surface.is_linear != sdst->surface.is_linear) {
      struct si_texture *tiled = ssrc->surface.is_linear ? sdst : ssrc;
      struct si_texture *linear = tiled == ssrc ? sdst : ssrc;
      unsigned tiled_width = DIV_ROUND_UP(tiled->buffer.b.b.width0, tiled->surface.blk_w);
      unsigned tiled_height = DIV_ROUND_UP(tiled->buffer.b.b.height0, tiled->surface.blk_h);
      unsigned linear_pitch = linear == ssrc ? src_pitch : dst_pitch;
      uint64_t linear_slice_pitch = linear->surface.u.gfx9.surf_slice_size / bpp;
      uint64_t tiled_address = tiled == ssrc ? src_address : dst_address;
      uint64_t linear_address = linear == ssrc ? src_address : dst_address;
      struct radeon_cmdbuf *cs = sctx->sdma_cs;
      bool dcc;

      assert(tiled->buffer.b.b.depth0 == 1);

      if (is_v7) {
         /* Compress only when the dst has DCC. If the src has DCC, it decompresses
          * automatically according to PTE.D (a page table bit) even if we don't
          * enable DCC in the packet.
          */
         dcc = tiled == sdst &&
               tiled->buffer.flags & RADEON_FLAG_GFX12_ALLOW_DCC;

         /* Check if everything fits into the bitfields. */
         if (!(tiled_width <= (1 << 16) && tiled_height <= (1 << 16) &&
               linear_pitch <= (1 << 16) && linear_slice_pitch <= (1ull << 32) &&
               copy_width <= (1 << 16) && copy_height <= (1 << 16)))
            return false;
      } else {
         /* DCC is only supported with SDMA v5 (GFX10+). */
         dcc = is_v5 && vi_dcc_enabled(tiled, 0);

         /* Check if everything fits into the bitfields. */
         if (!(tiled_width <= (1 << 14) && tiled_height <= (1 << 14) &&
               linear_pitch <= (1 << 14) && linear_slice_pitch <= (1 << 28) &&
               copy_width <= (1 << 14) && copy_height <= (1 << 14)))
            return false;
      }

      linear_address += linear->surface.u.gfx9.offset[0];
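      /* Emit the tiled sub-window copy packet: the tiled surface description
       * comes first, then the linear surface, then the copy extent.
       */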
      radeon_begin(cs);
      radeon_emit(SDMA_PACKET(SDMA_OPCODE_COPY,
                              SDMA_COPY_SUB_OPCODE_TILED_SUB_WINDOW,
                              (tmz ? 4 : 0)) |
                  dcc << 19 |
                  (is_v5 ? 0 : tiled->buffer.b.b.last_level) << 20 |
                  (linear == sdst ? 1u : 0) << 31); /* 1 = write to the linear surface */
      radeon_emit((uint32_t)tiled_address | (tiled->surface.tile_swizzle << 8));
      radeon_emit((uint32_t)(tiled_address >> 32));
      radeon_emit(0);
      radeon_emit((tiled_width - 1) << 16);
      radeon_emit(tiled_height - 1);
      radeon_emit(util_logbase2(bpp) |
                  tiled->surface.u.gfx9.swizzle_mode << 3 |
                  (is_v7 ? 0 : tiled->surface.u.gfx9.resource_type << 9) |
                  (is_v5 ? tiled->buffer.b.b.last_level : tiled->surface.u.gfx9.epitch) << 16);
      radeon_emit((uint32_t)linear_address);
      radeon_emit((uint32_t)(linear_address >> 32));
      radeon_emit(0);
      radeon_emit((linear_pitch - 1) << 16);
      radeon_emit(linear_slice_pitch - 1);
      radeon_emit((copy_width - 1) | ((copy_height - 1) << 16));
      radeon_emit(0);

      if (dcc) {
         unsigned data_format = ac_get_cb_format(sctx->gfx_level, tiled->buffer.b.b.format);
         unsigned number_type = ac_get_cb_number_type(tiled->buffer.b.b.format);
         uint64_t md_address = tiled_address + tiled->surface.meta_offset;

         if (is_v7) {
            radeon_emit(data_format |
                        number_type << 9 |
                        (2 << 16) | /* 0: bypass DCC, 2: decompress reads if PTE.D */
                        (1 << 18) | /* 0: bypass DCC, 1: write compressed if PTE.D, 2: write uncompressed if PTE.D */
                        (tiled->surface.u.gfx9.color.dcc.max_compressed_block_size << 24) |
                        (1 << 26)); /* max uncompressed block size: 256B */
         } else {
            /* Add metadata. */
            radeon_emit((uint32_t)md_address);
            radeon_emit((uint32_t)(md_address >> 32));
            radeon_emit(data_format |
                        ac_alpha_is_on_msb(&sctx->screen->info, tiled->buffer.b.b.format) << 8 |
                        number_type << 9 |
                        tiled->surface.u.gfx9.color.dcc.max_compressed_block_size << 24 |
                        V_028C78_MAX_BLOCK_SIZE_256B << 26 |
                        tmz << 29 |
                        tiled->surface.u.gfx9.color.dcc.pipe_aligned << 31);
         }
      }
      radeon_end();
      return true;
   }

   return false;
}

static bool cik_sdma_copy_texture(struct si_context *sctx, struct si_texture *sdst,
                                  struct si_texture *ssrc)
{
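   /* GFX7-8 texture copy using the legacy (GFX6-8) tiling layout. Handles
    * linear -> linear and tiled <-> linear sub-window copies.
    */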
   struct radeon_info *info = &sctx->screen->info;
   unsigned bpp = sdst->surface.bpe;
   uint64_t dst_address = sdst->buffer.gpu_address + sdst->surface.u.legacy.level[0].offset_256B * 256;
   uint64_t src_address = ssrc->buffer.gpu_address + ssrc->surface.u.legacy.level[0].offset_256B * 256;
   unsigned dst_mode = sdst->surface.u.legacy.level[0].mode;
   unsigned src_mode = ssrc->surface.u.legacy.level[0].mode;
   unsigned dst_tile_index = sdst->surface.u.legacy.tiling_index[0];
   unsigned src_tile_index = ssrc->surface.u.legacy.tiling_index[0];
   unsigned dst_tile_mode = info->si_tile_mode_array[dst_tile_index];
   unsigned src_tile_mode = info->si_tile_mode_array[src_tile_index];
   unsigned dst_micro_mode = G_009910_MICRO_TILE_MODE_NEW(dst_tile_mode);
   unsigned src_micro_mode = G_009910_MICRO_TILE_MODE_NEW(src_tile_mode);
   unsigned dst_tile_swizzle = dst_mode == RADEON_SURF_MODE_2D ? sdst->surface.tile_swizzle : 0;
   unsigned src_tile_swizzle = src_mode == RADEON_SURF_MODE_2D ? ssrc->surface.tile_swizzle : 0;
   unsigned dst_pitch = sdst->surface.u.legacy.level[0].nblk_x;
   unsigned src_pitch = ssrc->surface.u.legacy.level[0].nblk_x;
   uint64_t dst_slice_pitch = ((uint64_t)sdst->surface.u.legacy.level[0].slice_size_dw * 4) / bpp;
   uint64_t src_slice_pitch = ((uint64_t)ssrc->surface.u.legacy.level[0].slice_size_dw * 4) / bpp;
   unsigned dst_width = minify_as_blocks(sdst->buffer.b.b.width0, 0, sdst->surface.blk_w);
   unsigned src_width = minify_as_blocks(ssrc->buffer.b.b.width0, 0, ssrc->surface.blk_w);
   unsigned copy_width = DIV_ROUND_UP(ssrc->buffer.b.b.width0, ssrc->surface.blk_w);
   unsigned copy_height = DIV_ROUND_UP(ssrc->buffer.b.b.height0, ssrc->surface.blk_h);

   /* The tile swizzle is carried in bits [15:8] of the tiled address. */
   dst_address |= dst_tile_swizzle << 8;
   src_address |= src_tile_swizzle << 8;

   /* Linear -> linear sub-window copy. */
   if (dst_mode == RADEON_SURF_MODE_LINEAR_ALIGNED && src_mode == RADEON_SURF_MODE_LINEAR_ALIGNED &&
       /* Check if everything fits into the bitfields. */
       src_pitch <= (1 << 14) && dst_pitch <= (1 << 14) && src_slice_pitch <= (1 << 28) &&
       dst_slice_pitch <= (1 << 28) && copy_width <= (1 << 14) && copy_height <= (1 << 14) &&
       /* HW limitation - GFX7: */
       (sctx->gfx_level != GFX7 ||
        (copy_width < (1 << 14) && copy_height < (1 << 14))) &&
       /* HW limitation - some GFX7 parts: */
       ((sctx->family != CHIP_BONAIRE && sctx->family != CHIP_KAVERI) ||
        (copy_width != (1 << 14) && copy_height != (1 << 14)))) {
      struct radeon_cmdbuf *cs = sctx->sdma_cs;

      radeon_begin(cs);
      radeon_emit(SDMA_PACKET(SDMA_OPCODE_COPY, SDMA_COPY_SUB_OPCODE_LINEAR_SUB_WINDOW, 0) |
                  (util_logbase2(bpp) << 29));
      radeon_emit((uint32_t)src_address);
      radeon_emit((uint32_t)(src_address >> 32));
      radeon_emit(0);
      radeon_emit((src_pitch - 1) << 16);
      radeon_emit(src_slice_pitch - 1);
      radeon_emit((uint32_t)dst_address);
      radeon_emit((uint32_t)(dst_address >> 32));
      radeon_emit(0);
      radeon_emit((dst_pitch - 1) << 16);
      radeon_emit(dst_slice_pitch - 1);
      if (sctx->gfx_level == GFX7) {
         /* GFX7 programs the copy size directly; GFX8+ program size minus one. */
         radeon_emit(copy_width | (copy_height << 16));
         radeon_emit(0);
      } else {
         radeon_emit((copy_width - 1) | ((copy_height - 1) << 16));
         radeon_emit(0);
      }
      radeon_end();
      return true;
   }

   /* Tiled <-> linear sub-window copy. */
   if ((src_mode >= RADEON_SURF_MODE_1D) != (dst_mode >= RADEON_SURF_MODE_1D)) {
      struct si_texture *tiled = src_mode >= RADEON_SURF_MODE_1D ? ssrc : sdst;
      struct si_texture *linear = tiled == ssrc ? sdst : ssrc;
      unsigned tiled_width = tiled == ssrc ? src_width : dst_width;
      unsigned linear_width = linear == ssrc ? src_width : dst_width;
      unsigned tiled_pitch = tiled == ssrc ? src_pitch : dst_pitch;
      unsigned linear_pitch = linear == ssrc ? src_pitch : dst_pitch;
      unsigned tiled_slice_pitch = tiled == ssrc ? src_slice_pitch : dst_slice_pitch;
      unsigned linear_slice_pitch = linear == ssrc ? src_slice_pitch : dst_slice_pitch;
      uint64_t tiled_address = tiled == ssrc ? src_address : dst_address;
      uint64_t linear_address = linear == ssrc ? src_address : dst_address;
      unsigned tiled_micro_mode = tiled == ssrc ? src_micro_mode : dst_micro_mode;

      assert(tiled_pitch % 8 == 0);
      assert(tiled_slice_pitch % 64 == 0);
      unsigned pitch_tile_max = tiled_pitch / 8 - 1;
      unsigned slice_tile_max = tiled_slice_pitch / 64 - 1;
      unsigned xalign = MAX2(1, 4 / bpp);
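      /* The linear address, pitch, and copy width must be 4-byte aligned;
       * xalign is that requirement expressed in texels.
       */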
      unsigned copy_width_aligned = copy_width;

      /* If the region ends at the last pixel and is unaligned, we can copy the
       * remainder of the line that is not visible to make it aligned.
       */
      if (copy_width % xalign != 0 && copy_width == linear_width &&
          copy_width == tiled_width &&
          align(copy_width, xalign) <= linear_pitch &&
          align(copy_width, xalign) <= tiled_pitch)
         copy_width_aligned = align(copy_width, xalign);

      /* HW limitations. */
      if ((sctx->family == CHIP_BONAIRE || sctx->family == CHIP_KAVERI) &&
          linear_pitch - 1 == 0x3fff && bpp == 16)
         return false;

      if ((sctx->family == CHIP_BONAIRE || sctx->family == CHIP_KAVERI ||
           sctx->family == CHIP_KABINI) &&
          (copy_width == (1 << 14) || copy_height == (1 << 14)))
         return false;

      /* The hw can read outside of the given linear buffer bounds, or access
       * those pages but not touch the memory in case of writes (which still
       * causes a VM fault).
       *
       * Out-of-bounds memory access or page directory access must be prevented.
       */
      int64_t start_linear_address, end_linear_address;
      unsigned granularity;

      /* Deduce the size of reads from the linear surface. */
      switch (tiled_micro_mode) {
      case V_009910_ADDR_SURF_DISPLAY_MICRO_TILING:
         granularity = bpp == 1 ? 64 / (8 * bpp) : 128 / (8 * bpp);
         break;
      case V_009910_ADDR_SURF_THIN_MICRO_TILING:
      case V_009910_ADDR_SURF_DEPTH_MICRO_TILING:
         if (0 /* TODO: THICK microtiling */)
            granularity =
               bpp == 1 ? 32 / (8 * bpp)
                        : bpp == 2 ? 64 / (8 * bpp) : bpp <= 8 ? 128 / (8 * bpp) : 256 / (8 * bpp);
         else
            granularity = bpp <= 2 ? 64 / (8 * bpp) : bpp <= 8 ? 128 / (8 * bpp) : 256 / (8 * bpp);
         break;
      default:
         return false;
      }

      /* The linear reads start at tiled_x & ~(granularity - 1).
       * If linear_x == 0 && tiled_x % granularity != 0, the hw
       * starts reading from an address preceding linear_address!
       */
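      /* For example, DISPLAY micro tiling with bpp = 4 reads in groups of
       * 128 / (8 * 4) = 4 elements.
       */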
      start_linear_address =
         (uint64_t)linear->surface.u.legacy.level[0].offset_256B * 256;

      end_linear_address =
         (uint64_t)linear->surface.u.legacy.level[0].offset_256B * 256 +
         bpp * ((copy_height - 1) * (uint64_t)linear_pitch + copy_width);

      if (copy_width % granularity)
         end_linear_address += granularity - copy_width % granularity;

      if (start_linear_address < 0 || end_linear_address > linear->surface.surf_size)
         return false;

      /* Check requirements. */
      if (tiled_address % 256 == 0 && linear_address % 4 == 0 && linear_pitch % xalign == 0 &&
          copy_width_aligned % xalign == 0 &&
          tiled_micro_mode != V_009910_ADDR_SURF_ROTATED_MICRO_TILING &&
          /* check if everything fits into the bitfields */
          tiled->surface.u.legacy.tile_split <= 4096 && pitch_tile_max < (1 << 11) &&
          slice_tile_max < (1 << 22) && linear_pitch <= (1 << 14) &&
          linear_slice_pitch <= (1 << 28) && copy_width_aligned <= (1 << 14) &&
          copy_height <= (1 << 14)) {
         struct radeon_cmdbuf *cs = sctx->sdma_cs;
         uint32_t direction = linear == sdst ? 1u << 31 : 0; /* 1 = write to the linear surface */

         radeon_begin(cs);
         radeon_emit(SDMA_PACKET(SDMA_OPCODE_COPY,
                                 SDMA_COPY_SUB_OPCODE_TILED_SUB_WINDOW, 0) |
                     direction);
         radeon_emit((uint32_t)tiled_address);
         radeon_emit((uint32_t)(tiled_address >> 32));
         radeon_emit(0);
         radeon_emit(pitch_tile_max << 16);
         radeon_emit(slice_tile_max);
         radeon_emit(encode_legacy_tile_info(sctx, tiled));
         radeon_emit((uint32_t)linear_address);
         radeon_emit((uint32_t)(linear_address >> 32));
         radeon_emit(0);
         radeon_emit((linear_pitch - 1) << 16);
         radeon_emit(linear_slice_pitch - 1);
         if (sctx->gfx_level == GFX7) {
            /* GFX7 programs the copy size directly; GFX8+ program size minus one. */
            radeon_emit(copy_width_aligned | (copy_height << 16));
            radeon_emit(1);
         } else {
            radeon_emit((copy_width_aligned - 1) | ((copy_height - 1) << 16));
            radeon_emit(0);
         }
         radeon_end();
         return true;
      }
   }

   return false;
}

bool si_sdma_copy_image(struct si_context *sctx, struct si_texture *dst, struct si_texture *src)
{
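   /* Copy a whole texture (a single mip level) on the SDMA queue. Returns false
    * when SDMA can't handle the copy, in which case the caller is expected to
    * fall back to another copy path.
    */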
   struct radeon_winsys *ws = sctx->ws;

   if (!sctx->sdma_cs) {
      if (sctx->screen->debug_flags & DBG(NO_DMA) || sctx->gfx_level < GFX7)
         return false;

      sctx->sdma_cs = CALLOC_STRUCT(radeon_cmdbuf);
      if (!sctx->sdma_cs ||
          !ws->cs_create(sctx->sdma_cs, sctx->ctx, AMD_IP_SDMA, NULL, NULL)) {
         FREE(sctx->sdma_cs);
         sctx->sdma_cs = NULL;
         return false;
      }
   }

   if (!si_prepare_for_sdma_copy(sctx, dst, src))
      return false;

   /* TODO: DCC compression is possible on GFX10+. See si_set_mutable_tex_desc_fields for
    * additional constraints.
    * For now, the only use case of SDMA is the DRI_PRIME tiled -> linear copy, and a linear
    * dst never has DCC.
    */
   if (vi_dcc_enabled(dst, 0))
      return false;

   /* Decompress DCC on older chips where SDMA can't read it. */
   if (vi_dcc_enabled(src, 0) && sctx->gfx_level < GFX10)
      si_decompress_dcc(sctx, src);

   /* Always flush the gfx queue to get the winsys to handle the dependencies for us. */
   si_flush_gfx_cs(sctx, 0, NULL);

   switch (sctx->gfx_level) {
   case GFX7:
   case GFX8:
      if (!cik_sdma_copy_texture(sctx, dst, src))
         return false;
      break;
   case GFX9:
   case GFX10:
   case GFX10_3:
   case GFX11:
   case GFX11_5:
   case GFX12:
      if (!si_sdma_v4_v5_copy_texture(sctx, dst, src))
         return false;
      break;
   default:
      return false;
   }
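   /* Register both buffers with the SDMA CS so the winsys tracks residency and
    * synchronization for this submission.
    */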
   radeon_add_to_buffer_list(sctx, sctx->sdma_cs, &src->buffer,
                             RADEON_USAGE_READ | RADEON_PRIO_SAMPLER_TEXTURE);
   radeon_add_to_buffer_list(sctx, sctx->sdma_cs, &dst->buffer,
                             RADEON_USAGE_WRITE | RADEON_PRIO_SAMPLER_TEXTURE);

   unsigned flags = RADEON_FLUSH_START_NEXT_GFX_IB_NOW;
   if (unlikely(radeon_uses_secure_bos(sctx->ws))) {
      if ((bool)(src->buffer.flags & RADEON_FLAG_ENCRYPTED) !=
          sctx->ws->cs_is_secure(sctx->sdma_cs)) {
         flags = RADEON_FLUSH_TOGGLE_SECURE_SUBMISSION;
      }
   }

   return ws->cs_flush(sctx->sdma_cs, flags, NULL) == 0;
}