1 /*
2 * Copyright © 2016 Red Hat.
3 * Copyright © 2016 Bas Nieuwenhuizen
4 *
5 * based in part on anv driver which is:
6 * Copyright © 2015 Intel Corporation
7 *
8 * SPDX-License-Identifier: MIT
9 */
10
11 #include "radv_cmd_buffer.h"
12 #include "meta/radv_meta.h"
13 #include "radv_cp_dma.h"
14 #include "radv_cs.h"
15 #include "radv_debug.h"
16 #include "radv_device_generated_commands.h"
17 #include "radv_event.h"
18 #include "radv_pipeline_rt.h"
19 #include "radv_radeon_winsys.h"
20 #include "radv_rmv.h"
21 #include "radv_rra.h"
22 #include "radv_shader.h"
23 #include "radv_shader_object.h"
24 #include "radv_sqtt.h"
25 #include "sid.h"
26 #include "vk_command_pool.h"
27 #include "vk_common_entrypoints.h"
28 #include "vk_enum_defines.h"
29 #include "vk_format.h"
30 #include "vk_framebuffer.h"
31 #include "vk_render_pass.h"
32 #include "vk_synchronization.h"
33 #include "vk_util.h"
34
35 #include "ac_debug.h"
36 #include "ac_descriptors.h"
37 #include "ac_nir.h"
38 #include "ac_shader_args.h"
39
40 #include "aco_interface.h"
41
42 #include "util/fast_idiv_by_const.h"
43
44 enum {
45 RADV_PREFETCH_VBO_DESCRIPTORS = (1 << 0),
46 RADV_PREFETCH_VS = (1 << 1),
47 RADV_PREFETCH_TCS = (1 << 2),
48 RADV_PREFETCH_TES = (1 << 3),
49 RADV_PREFETCH_GS = (1 << 4),
50 RADV_PREFETCH_PS = (1 << 5),
51 RADV_PREFETCH_MS = (1 << 6),
52 RADV_PREFETCH_SHADERS = (RADV_PREFETCH_VS | RADV_PREFETCH_TCS | RADV_PREFETCH_TES | RADV_PREFETCH_GS |
53 RADV_PREFETCH_PS | RADV_PREFETCH_MS)
54 };
55
56 static void radv_handle_image_transition(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
57 VkImageLayout src_layout, VkImageLayout dst_layout, uint32_t src_family_index,
58 uint32_t dst_family_index, const VkImageSubresourceRange *range,
59 struct radv_sample_locations_state *sample_locs);
60
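/* Copy the dynamic state from the bound pipeline/shader objects into the command buffer and
 * accumulate a dirty mask containing only the states whose values actually changed, so that
 * unchanged state is not re-emitted.
 */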
61 static void
62 radv_bind_dynamic_state(struct radv_cmd_buffer *cmd_buffer, const struct radv_dynamic_state *src)
63 {
64 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
65 const struct radv_physical_device *pdev = radv_device_physical(device);
66 struct radv_dynamic_state *dest = &cmd_buffer->state.dynamic;
67 uint64_t copy_mask = src->mask;
68 uint64_t dest_mask = 0;
69
70 dest->vk.dr.rectangle_count = src->vk.dr.rectangle_count;
71 dest->sample_location.count = src->sample_location.count;
72
73 if (copy_mask & RADV_DYNAMIC_VIEWPORT) {
74 if (dest->vk.vp.viewport_count != src->vk.vp.viewport_count) {
75 dest->vk.vp.viewport_count = src->vk.vp.viewport_count;
76 dest_mask |= RADV_DYNAMIC_VIEWPORT;
77 }
78
79 if (memcmp(&dest->vk.vp.viewports, &src->vk.vp.viewports, src->vk.vp.viewport_count * sizeof(VkViewport))) {
80 typed_memcpy(dest->vk.vp.viewports, src->vk.vp.viewports, src->vk.vp.viewport_count);
81 typed_memcpy(dest->hw_vp.xform, src->hw_vp.xform, src->vk.vp.viewport_count);
82 dest_mask |= RADV_DYNAMIC_VIEWPORT;
83 }
84 }
85
86 if (copy_mask & RADV_DYNAMIC_SCISSOR) {
87 if (dest->vk.vp.scissor_count != src->vk.vp.scissor_count) {
88 dest->vk.vp.scissor_count = src->vk.vp.scissor_count;
89 dest_mask |= RADV_DYNAMIC_SCISSOR;
90 }
91
92 if (memcmp(&dest->vk.vp.scissors, &src->vk.vp.scissors, src->vk.vp.scissor_count * sizeof(VkRect2D))) {
93 typed_memcpy(dest->vk.vp.scissors, src->vk.vp.scissors, src->vk.vp.scissor_count);
94 dest_mask |= RADV_DYNAMIC_SCISSOR;
95 }
96 }
97
98 if (copy_mask & RADV_DYNAMIC_BLEND_CONSTANTS) {
99 if (memcmp(&dest->vk.cb.blend_constants, &src->vk.cb.blend_constants, sizeof(src->vk.cb.blend_constants))) {
100 typed_memcpy(dest->vk.cb.blend_constants, src->vk.cb.blend_constants, 4);
101 dest_mask |= RADV_DYNAMIC_BLEND_CONSTANTS;
102 }
103 }
104
105 if (copy_mask & RADV_DYNAMIC_DISCARD_RECTANGLE) {
106 if (memcmp(&dest->vk.dr.rectangles, &src->vk.dr.rectangles, src->vk.dr.rectangle_count * sizeof(VkRect2D))) {
107 typed_memcpy(dest->vk.dr.rectangles, src->vk.dr.rectangles, src->vk.dr.rectangle_count);
108 dest_mask |= RADV_DYNAMIC_DISCARD_RECTANGLE;
109 }
110 }
111
112 if (copy_mask & RADV_DYNAMIC_SAMPLE_LOCATIONS) {
113 if (dest->sample_location.per_pixel != src->sample_location.per_pixel ||
114 dest->sample_location.grid_size.width != src->sample_location.grid_size.width ||
115 dest->sample_location.grid_size.height != src->sample_location.grid_size.height ||
116 memcmp(&dest->sample_location.locations, &src->sample_location.locations,
117 src->sample_location.count * sizeof(VkSampleLocationEXT))) {
118 dest->sample_location.per_pixel = src->sample_location.per_pixel;
119 dest->sample_location.grid_size = src->sample_location.grid_size;
120 typed_memcpy(dest->sample_location.locations, src->sample_location.locations, src->sample_location.count);
121 dest_mask |= RADV_DYNAMIC_SAMPLE_LOCATIONS;
122 }
123 }
124
125 if (copy_mask & RADV_DYNAMIC_COLOR_WRITE_MASK) {
126 for (uint32_t i = 0; i < MAX_RTS; i++) {
127 if (dest->vk.cb.attachments[i].write_mask != src->vk.cb.attachments[i].write_mask) {
128 dest->vk.cb.attachments[i].write_mask = src->vk.cb.attachments[i].write_mask;
129 dest_mask |= RADV_DYNAMIC_COLOR_WRITE_MASK;
130 }
131 }
132 }
133
134 if (copy_mask & RADV_DYNAMIC_COLOR_BLEND_ENABLE) {
135 for (uint32_t i = 0; i < MAX_RTS; i++) {
136 if (dest->vk.cb.attachments[i].blend_enable != src->vk.cb.attachments[i].blend_enable) {
137 dest->vk.cb.attachments[i].blend_enable = src->vk.cb.attachments[i].blend_enable;
138 dest_mask |= RADV_DYNAMIC_COLOR_BLEND_ENABLE;
139 }
140 }
141 }
142
143 if (copy_mask & RADV_DYNAMIC_COLOR_BLEND_EQUATION) {
144 for (uint32_t i = 0; i < MAX_RTS; i++) {
145 if (dest->vk.cb.attachments[i].src_color_blend_factor != src->vk.cb.attachments[i].src_color_blend_factor ||
146 dest->vk.cb.attachments[i].dst_color_blend_factor != src->vk.cb.attachments[i].dst_color_blend_factor ||
147 dest->vk.cb.attachments[i].color_blend_op != src->vk.cb.attachments[i].color_blend_op ||
148 dest->vk.cb.attachments[i].src_alpha_blend_factor != src->vk.cb.attachments[i].src_alpha_blend_factor ||
149 dest->vk.cb.attachments[i].dst_alpha_blend_factor != src->vk.cb.attachments[i].dst_alpha_blend_factor ||
150 dest->vk.cb.attachments[i].alpha_blend_op != src->vk.cb.attachments[i].alpha_blend_op) {
151 dest->vk.cb.attachments[i].src_color_blend_factor = src->vk.cb.attachments[i].src_color_blend_factor;
152 dest->vk.cb.attachments[i].dst_color_blend_factor = src->vk.cb.attachments[i].dst_color_blend_factor;
153 dest->vk.cb.attachments[i].color_blend_op = src->vk.cb.attachments[i].color_blend_op;
154 dest->vk.cb.attachments[i].src_alpha_blend_factor = src->vk.cb.attachments[i].src_alpha_blend_factor;
155 dest->vk.cb.attachments[i].dst_alpha_blend_factor = src->vk.cb.attachments[i].dst_alpha_blend_factor;
156 dest->vk.cb.attachments[i].alpha_blend_op = src->vk.cb.attachments[i].alpha_blend_op;
157 dest_mask |= RADV_DYNAMIC_COLOR_BLEND_EQUATION;
158 }
159 }
160 }
161
162 if (memcmp(&dest->vk.cal.color_map, &src->vk.cal.color_map, sizeof(src->vk.cal.color_map))) {
163 typed_memcpy(dest->vk.cal.color_map, src->vk.cal.color_map, MAX_RTS);
164 dest_mask |= RADV_DYNAMIC_COLOR_ATTACHMENT_MAP;
165 }
166
167 if (memcmp(&dest->vk.ial, &src->vk.ial, sizeof(src->vk.ial))) {
168 typed_memcpy(dest->vk.ial.color_map, src->vk.ial.color_map, MAX_RTS);
169 dest->vk.ial.depth_att = src->vk.ial.depth_att;
170 dest->vk.ial.stencil_att = src->vk.ial.stencil_att;
171 dest_mask |= RADV_DYNAMIC_INPUT_ATTACHMENT_MAP;
172 }
173
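/* Copy a single scalar dynamic state field and set its dirty flag only when the state is
 * requested by copy_mask and the value actually differs.
 */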
174 #define RADV_CMP_COPY(field, flag) \
175 if (copy_mask & flag) { \
176 if (dest->field != src->field) { \
177 dest->field = src->field; \
178 dest_mask |= flag; \
179 } \
180 }
181
182 RADV_CMP_COPY(vk.ia.primitive_topology, RADV_DYNAMIC_PRIMITIVE_TOPOLOGY);
183 RADV_CMP_COPY(vk.ia.primitive_restart_enable, RADV_DYNAMIC_PRIMITIVE_RESTART_ENABLE);
184
185 RADV_CMP_COPY(vk.vp.depth_clip_negative_one_to_one, RADV_DYNAMIC_DEPTH_CLIP_NEGATIVE_ONE_TO_ONE);
186
187 RADV_CMP_COPY(vk.ts.patch_control_points, RADV_DYNAMIC_PATCH_CONTROL_POINTS);
188 RADV_CMP_COPY(vk.ts.domain_origin, RADV_DYNAMIC_TESS_DOMAIN_ORIGIN);
189
190 RADV_CMP_COPY(vk.rs.line.width, RADV_DYNAMIC_LINE_WIDTH);
191 RADV_CMP_COPY(vk.rs.depth_bias.constant, RADV_DYNAMIC_DEPTH_BIAS);
192 RADV_CMP_COPY(vk.rs.depth_bias.clamp, RADV_DYNAMIC_DEPTH_BIAS);
193 RADV_CMP_COPY(vk.rs.depth_bias.slope, RADV_DYNAMIC_DEPTH_BIAS);
194 RADV_CMP_COPY(vk.rs.depth_bias.representation, RADV_DYNAMIC_DEPTH_BIAS);
195 RADV_CMP_COPY(vk.rs.line.stipple.factor, RADV_DYNAMIC_LINE_STIPPLE);
196 RADV_CMP_COPY(vk.rs.line.stipple.pattern, RADV_DYNAMIC_LINE_STIPPLE);
197 RADV_CMP_COPY(vk.rs.cull_mode, RADV_DYNAMIC_CULL_MODE);
198 RADV_CMP_COPY(vk.rs.front_face, RADV_DYNAMIC_FRONT_FACE);
199 RADV_CMP_COPY(vk.rs.depth_bias.enable, RADV_DYNAMIC_DEPTH_BIAS_ENABLE);
200 RADV_CMP_COPY(vk.rs.rasterizer_discard_enable, RADV_DYNAMIC_RASTERIZER_DISCARD_ENABLE);
201 RADV_CMP_COPY(vk.rs.polygon_mode, RADV_DYNAMIC_POLYGON_MODE);
202 RADV_CMP_COPY(vk.rs.line.stipple.enable, RADV_DYNAMIC_LINE_STIPPLE_ENABLE);
203 RADV_CMP_COPY(vk.rs.depth_clip_enable, RADV_DYNAMIC_DEPTH_CLIP_ENABLE);
204 RADV_CMP_COPY(vk.rs.conservative_mode, RADV_DYNAMIC_CONSERVATIVE_RAST_MODE);
205 RADV_CMP_COPY(vk.rs.provoking_vertex, RADV_DYNAMIC_PROVOKING_VERTEX_MODE);
206 RADV_CMP_COPY(vk.rs.depth_clamp_enable, RADV_DYNAMIC_DEPTH_CLAMP_ENABLE);
207 RADV_CMP_COPY(vk.rs.line.mode, RADV_DYNAMIC_LINE_RASTERIZATION_MODE);
208
209 RADV_CMP_COPY(vk.ms.alpha_to_coverage_enable, RADV_DYNAMIC_ALPHA_TO_COVERAGE_ENABLE);
210 RADV_CMP_COPY(vk.ms.alpha_to_one_enable, RADV_DYNAMIC_ALPHA_TO_ONE_ENABLE);
211 RADV_CMP_COPY(vk.ms.sample_mask, RADV_DYNAMIC_SAMPLE_MASK);
212 RADV_CMP_COPY(vk.ms.rasterization_samples, RADV_DYNAMIC_RASTERIZATION_SAMPLES);
213 RADV_CMP_COPY(vk.ms.sample_locations_enable, RADV_DYNAMIC_SAMPLE_LOCATIONS_ENABLE);
214
215 RADV_CMP_COPY(vk.ds.depth.bounds_test.min, RADV_DYNAMIC_DEPTH_BOUNDS);
216 RADV_CMP_COPY(vk.ds.depth.bounds_test.max, RADV_DYNAMIC_DEPTH_BOUNDS);
217 RADV_CMP_COPY(vk.ds.stencil.front.compare_mask, RADV_DYNAMIC_STENCIL_COMPARE_MASK);
218 RADV_CMP_COPY(vk.ds.stencil.back.compare_mask, RADV_DYNAMIC_STENCIL_COMPARE_MASK);
219 RADV_CMP_COPY(vk.ds.stencil.front.write_mask, RADV_DYNAMIC_STENCIL_WRITE_MASK);
220 RADV_CMP_COPY(vk.ds.stencil.back.write_mask, RADV_DYNAMIC_STENCIL_WRITE_MASK);
221 RADV_CMP_COPY(vk.ds.stencil.front.reference, RADV_DYNAMIC_STENCIL_REFERENCE);
222 RADV_CMP_COPY(vk.ds.stencil.back.reference, RADV_DYNAMIC_STENCIL_REFERENCE);
223 RADV_CMP_COPY(vk.ds.depth.test_enable, RADV_DYNAMIC_DEPTH_TEST_ENABLE);
224 RADV_CMP_COPY(vk.ds.depth.write_enable, RADV_DYNAMIC_DEPTH_WRITE_ENABLE);
225 RADV_CMP_COPY(vk.ds.depth.compare_op, RADV_DYNAMIC_DEPTH_COMPARE_OP);
226 RADV_CMP_COPY(vk.ds.depth.bounds_test.enable, RADV_DYNAMIC_DEPTH_BOUNDS_TEST_ENABLE);
227 RADV_CMP_COPY(vk.ds.stencil.test_enable, RADV_DYNAMIC_STENCIL_TEST_ENABLE);
228 RADV_CMP_COPY(vk.ds.stencil.front.op.fail, RADV_DYNAMIC_STENCIL_OP);
229 RADV_CMP_COPY(vk.ds.stencil.front.op.pass, RADV_DYNAMIC_STENCIL_OP);
230 RADV_CMP_COPY(vk.ds.stencil.front.op.depth_fail, RADV_DYNAMIC_STENCIL_OP);
231 RADV_CMP_COPY(vk.ds.stencil.front.op.compare, RADV_DYNAMIC_STENCIL_OP);
232 RADV_CMP_COPY(vk.ds.stencil.back.op.fail, RADV_DYNAMIC_STENCIL_OP);
233 RADV_CMP_COPY(vk.ds.stencil.back.op.pass, RADV_DYNAMIC_STENCIL_OP);
234 RADV_CMP_COPY(vk.ds.stencil.back.op.depth_fail, RADV_DYNAMIC_STENCIL_OP);
235 RADV_CMP_COPY(vk.ds.stencil.back.op.compare, RADV_DYNAMIC_STENCIL_OP);
236
237 RADV_CMP_COPY(vk.cb.logic_op, RADV_DYNAMIC_LOGIC_OP);
238 RADV_CMP_COPY(vk.cb.color_write_enables, RADV_DYNAMIC_COLOR_WRITE_ENABLE);
239 RADV_CMP_COPY(vk.cb.logic_op_enable, RADV_DYNAMIC_LOGIC_OP_ENABLE);
240
241 RADV_CMP_COPY(vk.fsr.fragment_size.width, RADV_DYNAMIC_FRAGMENT_SHADING_RATE);
242 RADV_CMP_COPY(vk.fsr.fragment_size.height, RADV_DYNAMIC_FRAGMENT_SHADING_RATE);
243 RADV_CMP_COPY(vk.fsr.combiner_ops[0], RADV_DYNAMIC_FRAGMENT_SHADING_RATE);
244 RADV_CMP_COPY(vk.fsr.combiner_ops[1], RADV_DYNAMIC_FRAGMENT_SHADING_RATE);
245
246 RADV_CMP_COPY(vk.dr.enable, RADV_DYNAMIC_DISCARD_RECTANGLE_ENABLE);
247 RADV_CMP_COPY(vk.dr.mode, RADV_DYNAMIC_DISCARD_RECTANGLE_MODE);
248
249 RADV_CMP_COPY(feedback_loop_aspects, RADV_DYNAMIC_ATTACHMENT_FEEDBACK_LOOP_ENABLE);
250
251 #undef RADV_CMP_COPY
252
253 cmd_buffer->state.dirty_dynamic |= dest_mask;
254
255    /* Handle driver-specific states that need to be re-emitted when PSOs are bound. */
256 if (dest_mask & (RADV_DYNAMIC_VIEWPORT | RADV_DYNAMIC_POLYGON_MODE | RADV_DYNAMIC_LINE_WIDTH |
257 RADV_DYNAMIC_PRIMITIVE_TOPOLOGY)) {
258 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_GUARDBAND;
259 }
260
261 if (pdev->info.rbplus_allowed && (dest_mask & RADV_DYNAMIC_COLOR_WRITE_MASK)) {
262 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_RBPLUS;
263 }
264
265 if (dest_mask & (RADV_DYNAMIC_COLOR_ATTACHMENT_MAP | RADV_DYNAMIC_INPUT_ATTACHMENT_MAP)) {
266 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_FBFETCH_OUTPUT;
267 }
268 }
269
270 bool
271 radv_cmd_buffer_uses_mec(struct radv_cmd_buffer *cmd_buffer)
272 {
273 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
274 const struct radv_physical_device *pdev = radv_device_physical(device);
275 return cmd_buffer->qf == RADV_QUEUE_COMPUTE && pdev->info.gfx_level >= GFX7;
276 }
277
278 static void
279 radv_write_data(struct radv_cmd_buffer *cmd_buffer, const unsigned engine_sel, const uint64_t va, const unsigned count,
280 const uint32_t *data, const bool predicating)
281 {
282 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
283
284 radv_cs_write_data(device, cmd_buffer->cs, cmd_buffer->qf, engine_sel, va, count, data, predicating);
285 }
286
287 static void
288 radv_emit_clear_data(struct radv_cmd_buffer *cmd_buffer, unsigned engine_sel, uint64_t va, unsigned size)
289 {
290 uint32_t *zeroes = alloca(size);
291 memset(zeroes, 0, size);
292 radv_write_data(cmd_buffer, engine_sel, va, size / 4, zeroes, false);
293 }
294
295 static void
296 radv_cmd_buffer_finish_shader_part_cache(struct radv_cmd_buffer *cmd_buffer)
297 {
298 ralloc_free(cmd_buffer->vs_prologs.table);
299 ralloc_free(cmd_buffer->ps_epilogs.table);
300 }
301
302 static bool
303 radv_cmd_buffer_init_shader_part_cache(struct radv_device *device, struct radv_cmd_buffer *cmd_buffer)
304 {
305 if (device->vs_prologs.ops) {
306 if (!_mesa_set_init(&cmd_buffer->vs_prologs, NULL, device->vs_prologs.ops->hash, device->vs_prologs.ops->equals))
307 return false;
308 }
309 if (device->ps_epilogs.ops) {
310 if (!_mesa_set_init(&cmd_buffer->ps_epilogs, NULL, device->ps_epilogs.ops->hash, device->ps_epilogs.ops->equals))
311 return false;
312 }
313 return true;
314 }
315
316 static void
317 radv_destroy_cmd_buffer(struct vk_command_buffer *vk_cmd_buffer)
318 {
319 struct radv_cmd_buffer *cmd_buffer = container_of(vk_cmd_buffer, struct radv_cmd_buffer, vk);
320 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
321
322 if (cmd_buffer->qf != RADV_QUEUE_SPARSE) {
323 util_dynarray_fini(&cmd_buffer->ray_history);
324
325 radv_rra_accel_struct_buffers_unref(device, cmd_buffer->accel_struct_buffers);
326 _mesa_set_destroy(cmd_buffer->accel_struct_buffers, NULL);
327
328 list_for_each_entry_safe (struct radv_cmd_buffer_upload, up, &cmd_buffer->upload.list, list) {
329 radv_rmv_log_command_buffer_bo_destroy(device, up->upload_bo);
330 radv_bo_destroy(device, &cmd_buffer->vk.base, up->upload_bo);
331 list_del(&up->list);
332 free(up);
333 }
334
335 if (cmd_buffer->upload.upload_bo) {
336 radv_rmv_log_command_buffer_bo_destroy(device, cmd_buffer->upload.upload_bo);
337 radv_bo_destroy(device, &cmd_buffer->vk.base, cmd_buffer->upload.upload_bo);
338 }
339
340 if (cmd_buffer->cs)
341 device->ws->cs_destroy(cmd_buffer->cs);
342 if (cmd_buffer->gang.cs)
343 device->ws->cs_destroy(cmd_buffer->gang.cs);
344 if (cmd_buffer->transfer.copy_temp)
345 radv_bo_destroy(device, &cmd_buffer->vk.base, cmd_buffer->transfer.copy_temp);
346
347 radv_cmd_buffer_finish_shader_part_cache(cmd_buffer);
348
349 for (unsigned i = 0; i < MAX_BIND_POINTS; i++) {
350 struct radv_descriptor_set_header *set = &cmd_buffer->descriptors[i].push_set.set;
351 free(set->mapped_ptr);
352 if (set->layout)
353 vk_descriptor_set_layout_unref(&device->vk, &set->layout->vk);
354 vk_object_base_finish(&set->base);
355 }
356
357 vk_object_base_finish(&cmd_buffer->meta_push_descriptors.base);
358 }
359
360 vk_command_buffer_finish(&cmd_buffer->vk);
361 vk_free(&cmd_buffer->vk.pool->alloc, cmd_buffer);
362 }
363
364 static VkResult
365 radv_create_cmd_buffer(struct vk_command_pool *pool, VkCommandBufferLevel level,
366 struct vk_command_buffer **cmd_buffer_out)
367 {
368 struct radv_device *device = container_of(pool->base.device, struct radv_device, vk);
369 const struct radv_physical_device *pdev = radv_device_physical(device);
370 struct radv_cmd_buffer *cmd_buffer;
371 unsigned ring;
372 cmd_buffer = vk_zalloc(&pool->alloc, sizeof(*cmd_buffer), 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
373 if (cmd_buffer == NULL)
374 return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
375
376 VkResult result = vk_command_buffer_init(pool, &cmd_buffer->vk, &radv_cmd_buffer_ops, level);
377 if (result != VK_SUCCESS) {
378 vk_free(&cmd_buffer->vk.pool->alloc, cmd_buffer);
379 return result;
380 }
381
382 cmd_buffer->qf = vk_queue_to_radv(pdev, pool->queue_family_index);
383
384 if (cmd_buffer->qf != RADV_QUEUE_SPARSE) {
385 list_inithead(&cmd_buffer->upload.list);
386
387 if (!radv_cmd_buffer_init_shader_part_cache(device, cmd_buffer)) {
388 radv_destroy_cmd_buffer(&cmd_buffer->vk);
389 return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
390 }
391
392 ring = radv_queue_family_to_ring(pdev, cmd_buffer->qf);
393
394 cmd_buffer->cs =
395 device->ws->cs_create(device->ws, ring, cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY);
396 if (!cmd_buffer->cs) {
397 radv_destroy_cmd_buffer(&cmd_buffer->vk);
398 return vk_error(device, VK_ERROR_OUT_OF_DEVICE_MEMORY);
399 }
400
401 vk_object_base_init(&device->vk, &cmd_buffer->meta_push_descriptors.base, VK_OBJECT_TYPE_DESCRIPTOR_SET);
402
403 for (unsigned i = 0; i < MAX_BIND_POINTS; i++)
404 vk_object_base_init(&device->vk, &cmd_buffer->descriptors[i].push_set.set.base, VK_OBJECT_TYPE_DESCRIPTOR_SET);
405
406 cmd_buffer->accel_struct_buffers = _mesa_pointer_set_create(NULL);
407 util_dynarray_init(&cmd_buffer->ray_history, NULL);
408 }
409
410 *cmd_buffer_out = &cmd_buffer->vk;
411
412 return VK_SUCCESS;
413 }
414
415 void
416 radv_cmd_buffer_reset_rendering(struct radv_cmd_buffer *cmd_buffer)
417 {
418 memset(&cmd_buffer->state.render, 0, sizeof(cmd_buffer->state.render));
419 }
420
421 static void
422 radv_reset_tracked_regs(struct radv_cmd_buffer *cmd_buffer)
423 {
424 struct radv_tracked_regs *tracked_regs = &cmd_buffer->tracked_regs;
425
426 /* Mark all registers as unknown. */
427 memset(tracked_regs->reg_value, 0, RADV_NUM_ALL_TRACKED_REGS * sizeof(uint32_t));
428 BITSET_ZERO(tracked_regs->reg_saved_mask);
429
430 /* 0xffffffff is an impossible value for SPI_PS_INPUT_CNTL_n registers */
431 memset(tracked_regs->spi_ps_input_cntl, 0xff, sizeof(uint32_t) * 32);
432 }
433
434 static void
435 radv_reset_cmd_buffer(struct vk_command_buffer *vk_cmd_buffer, UNUSED VkCommandBufferResetFlags flags)
436 {
437 struct radv_cmd_buffer *cmd_buffer = container_of(vk_cmd_buffer, struct radv_cmd_buffer, vk);
438 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
439
440 vk_command_buffer_reset(&cmd_buffer->vk);
441
442 if (cmd_buffer->qf == RADV_QUEUE_SPARSE)
443 return;
444
445 device->ws->cs_reset(cmd_buffer->cs);
446 if (cmd_buffer->gang.cs)
447 device->ws->cs_reset(cmd_buffer->gang.cs);
448
449 list_for_each_entry_safe (struct radv_cmd_buffer_upload, up, &cmd_buffer->upload.list, list) {
450 radv_rmv_log_command_buffer_bo_destroy(device, up->upload_bo);
451 radv_bo_destroy(device, &cmd_buffer->vk.base, up->upload_bo);
452 list_del(&up->list);
453 free(up);
454 }
455
456 util_dynarray_clear(&cmd_buffer->ray_history);
457
458 radv_rra_accel_struct_buffers_unref(device, cmd_buffer->accel_struct_buffers);
459
460 cmd_buffer->push_constant_stages = 0;
461 cmd_buffer->scratch_size_per_wave_needed = 0;
462 cmd_buffer->scratch_waves_wanted = 0;
463 cmd_buffer->compute_scratch_size_per_wave_needed = 0;
464 cmd_buffer->compute_scratch_waves_wanted = 0;
465 cmd_buffer->esgs_ring_size_needed = 0;
466 cmd_buffer->gsvs_ring_size_needed = 0;
467 cmd_buffer->tess_rings_needed = false;
468 cmd_buffer->task_rings_needed = false;
469 cmd_buffer->mesh_scratch_ring_needed = false;
470 cmd_buffer->gds_needed = false;
471 cmd_buffer->gds_oa_needed = false;
472 cmd_buffer->sample_positions_needed = false;
473 cmd_buffer->gang.sem.leader_value = 0;
474 cmd_buffer->gang.sem.emitted_leader_value = 0;
475 cmd_buffer->gang.sem.va = 0;
476 cmd_buffer->shader_upload_seq = 0;
477 cmd_buffer->has_indirect_pipeline_binds = false;
478
479 if (cmd_buffer->upload.upload_bo)
480 radv_cs_add_buffer(device->ws, cmd_buffer->cs, cmd_buffer->upload.upload_bo);
481 cmd_buffer->upload.offset = 0;
482
483 memset(cmd_buffer->vertex_binding_buffers, 0, sizeof(struct radv_buffer *) * cmd_buffer->used_vertex_bindings);
484 cmd_buffer->used_vertex_bindings = 0;
485
486 for (unsigned i = 0; i < MAX_BIND_POINTS; i++) {
487 cmd_buffer->descriptors[i].dirty = 0;
488 cmd_buffer->descriptors[i].valid = 0;
489 }
490
491 radv_cmd_buffer_reset_rendering(cmd_buffer);
492 }
493
494 const struct vk_command_buffer_ops radv_cmd_buffer_ops = {
495 .create = radv_create_cmd_buffer,
496 .reset = radv_reset_cmd_buffer,
497 .destroy = radv_destroy_cmd_buffer,
498 };
499
500 static bool
501 radv_cmd_buffer_resize_upload_buf(struct radv_cmd_buffer *cmd_buffer, uint64_t min_needed)
502 {
503 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
504 uint64_t new_size;
505 struct radeon_winsys_bo *bo = NULL;
506 struct radv_cmd_buffer_upload *upload;
507
508 new_size = MAX2(min_needed, 16 * 1024);
509 new_size = MAX2(new_size, 2 * cmd_buffer->upload.size);
510
511 VkResult result = radv_bo_create(
512 device, &cmd_buffer->vk.base, new_size, 4096, device->ws->cs_domain(device->ws),
513 RADEON_FLAG_CPU_ACCESS | RADEON_FLAG_NO_INTERPROCESS_SHARING | RADEON_FLAG_32BIT | RADEON_FLAG_GTT_WC,
514 RADV_BO_PRIORITY_UPLOAD_BUFFER, 0, true, &bo);
515
516 if (result != VK_SUCCESS) {
517 vk_command_buffer_set_error(&cmd_buffer->vk, result);
518 return false;
519 }
520
521 radv_cs_add_buffer(device->ws, cmd_buffer->cs, bo);
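   /* Keep the previous upload BO on the upload list instead of freeing it immediately: the GPU
    * may still reference data in it, so it is only destroyed on command buffer reset/destroy.
    */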
522 if (cmd_buffer->upload.upload_bo) {
523 upload = malloc(sizeof(*upload));
524
525 if (!upload) {
526 vk_command_buffer_set_error(&cmd_buffer->vk, VK_ERROR_OUT_OF_HOST_MEMORY);
527 radv_bo_destroy(device, &cmd_buffer->vk.base, bo);
528 return false;
529 }
530
531 memcpy(upload, &cmd_buffer->upload, sizeof(*upload));
532 list_add(&upload->list, &cmd_buffer->upload.list);
533 }
534
535 cmd_buffer->upload.upload_bo = bo;
536 cmd_buffer->upload.size = new_size;
537 cmd_buffer->upload.offset = 0;
538 cmd_buffer->upload.map = radv_buffer_map(device->ws, cmd_buffer->upload.upload_bo);
539
540 if (!cmd_buffer->upload.map) {
541 vk_command_buffer_set_error(&cmd_buffer->vk, VK_ERROR_OUT_OF_DEVICE_MEMORY);
542 return false;
543 }
544
545 radv_rmv_log_command_buffer_bo_create(device, cmd_buffer->upload.upload_bo, 0, cmd_buffer->upload.size, 0);
546
547 return true;
548 }
549
550 bool
551 radv_cmd_buffer_upload_alloc_aligned(struct radv_cmd_buffer *cmd_buffer, unsigned size, unsigned alignment,
552 unsigned *out_offset, void **ptr)
553 {
554 assert(size % 4 == 0);
555
556 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
557 const struct radv_physical_device *pdev = radv_device_physical(device);
558 const struct radeon_info *gpu_info = &pdev->info;
559
560 /* Align to the scalar cache line size if it results in this allocation
561 * being placed in less of them.
562 */
563 unsigned offset = cmd_buffer->upload.offset;
564 unsigned line_size = gpu_info->gfx_level >= GFX10 ? 64 : 32;
565 unsigned gap = align(offset, line_size) - offset;
566 if ((size & (line_size - 1)) > gap)
567 offset = align(offset, line_size);
568
569 if (alignment)
570 offset = align(offset, alignment);
571 if (offset + size > cmd_buffer->upload.size) {
572 if (!radv_cmd_buffer_resize_upload_buf(cmd_buffer, size))
573 return false;
574 offset = 0;
575 }
576
577 *out_offset = offset;
578 *ptr = cmd_buffer->upload.map + offset;
579
580 cmd_buffer->upload.offset = offset + size;
581 return true;
582 }
583
584 bool
585 radv_cmd_buffer_upload_alloc(struct radv_cmd_buffer *cmd_buffer, unsigned size, unsigned *out_offset, void **ptr)
586 {
587 return radv_cmd_buffer_upload_alloc_aligned(cmd_buffer, size, 0, out_offset, ptr);
588 }
589
590 bool
591 radv_cmd_buffer_upload_data(struct radv_cmd_buffer *cmd_buffer, unsigned size, const void *data, unsigned *out_offset)
592 {
593 uint8_t *ptr;
594
595 if (!radv_cmd_buffer_upload_alloc(cmd_buffer, size, out_offset, (void **)&ptr))
596 return false;
597 assert(ptr);
598
599 memcpy(ptr, data, size);
600 return true;
601 }
602
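/* Write an incrementing trace ID into the trace BO and emit a matching NOP-encoded marker in the
 * CS, so a GPU hang can be narrowed down to the last command that executed.
 */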
603 void
604 radv_cmd_buffer_trace_emit(struct radv_cmd_buffer *cmd_buffer)
605 {
606 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
607 struct radeon_cmdbuf *cs = cmd_buffer->cs;
608 uint64_t va;
609
610 if (cmd_buffer->qf != RADV_QUEUE_GENERAL && cmd_buffer->qf != RADV_QUEUE_COMPUTE)
611 return;
612
613 va = radv_buffer_get_va(device->trace_bo);
614 if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY)
615 va += offsetof(struct radv_trace_data, primary_id);
616 else
617 va += offsetof(struct radv_trace_data, secondary_id);
618
619 ++cmd_buffer->state.trace_id;
620 radv_write_data(cmd_buffer, V_370_ME, va, 1, &cmd_buffer->state.trace_id, false);
621
622 radeon_check_space(device->ws, cs, 2);
623
624 radeon_emit(cs, PKT3(PKT3_NOP, 0, 0));
625 radeon_emit(cs, AC_ENCODE_TRACE_POINT(cmd_buffer->state.trace_id));
626 }
627
628 void
629 radv_cmd_buffer_annotate(struct radv_cmd_buffer *cmd_buffer, const char *annotation)
630 {
631 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
632
633 device->ws->cs_annotate(cmd_buffer->cs, annotation);
634 }
635
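/* Gang submission: the graphics queue acts as the gang leader and an internal ACE compute
 * command stream (cmd_buffer->gang.cs) acts as the follower, which is used to run task shaders.
 * The helpers below keep the two streams in sync with cache flushes and a pair of 32-bit
 * semaphores allocated in the upload BO.
 */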
636 static void
637 radv_gang_barrier(struct radv_cmd_buffer *cmd_buffer, VkPipelineStageFlags2 src_stage_mask,
638 VkPipelineStageFlags2 dst_stage_mask)
639 {
640 /* Update flush bits from the main cmdbuf, except the stage flush. */
641 cmd_buffer->gang.flush_bits |=
642 cmd_buffer->state.flush_bits & RADV_CMD_FLUSH_ALL_COMPUTE & ~RADV_CMD_FLAG_CS_PARTIAL_FLUSH;
643
644 /* Add stage flush only when necessary. */
645 if (src_stage_mask & (VK_PIPELINE_STAGE_2_TASK_SHADER_BIT_EXT | VK_PIPELINE_STAGE_2_ALL_TRANSFER_BIT |
646 VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT | VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT |
647 VK_PIPELINE_STAGE_2_COMMAND_PREPROCESS_BIT_NV))
648 cmd_buffer->gang.flush_bits |= RADV_CMD_FLAG_CS_PARTIAL_FLUSH;
649
650 /* Block task shaders when we have to wait for CP DMA on the GFX cmdbuf. */
651 if (src_stage_mask &
652 (VK_PIPELINE_STAGE_2_COPY_BIT | VK_PIPELINE_STAGE_2_CLEAR_BIT | VK_PIPELINE_STAGE_2_ALL_TRANSFER_BIT |
653 VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT | VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT))
654 dst_stage_mask |= cmd_buffer->state.dma_is_busy ? VK_PIPELINE_STAGE_2_TASK_SHADER_BIT_EXT : 0;
655
656 /* Increment the GFX/ACE semaphore when task shaders are blocked. */
657 if (dst_stage_mask & (VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT | VK_PIPELINE_STAGE_2_DRAW_INDIRECT_BIT |
658 VK_PIPELINE_STAGE_2_TASK_SHADER_BIT_EXT))
659 cmd_buffer->gang.sem.leader_value++;
660 }
661
662 void
663 radv_gang_cache_flush(struct radv_cmd_buffer *cmd_buffer)
664 {
665 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
666 const struct radv_physical_device *pdev = radv_device_physical(device);
667 struct radeon_cmdbuf *ace_cs = cmd_buffer->gang.cs;
668 const uint32_t flush_bits = cmd_buffer->gang.flush_bits;
669 enum rgp_flush_bits sqtt_flush_bits = 0;
670
671 radv_cs_emit_cache_flush(device->ws, ace_cs, pdev->info.gfx_level, NULL, 0, RADV_QUEUE_COMPUTE, flush_bits,
672 &sqtt_flush_bits, 0);
673
674 cmd_buffer->gang.flush_bits = 0;
675 }
676
677 static bool
678 radv_gang_sem_init(struct radv_cmd_buffer *cmd_buffer)
679 {
680 if (cmd_buffer->gang.sem.va)
681 return true;
682
683    /* DWORD 0: GFX->ACE semaphore (GFX blocks ACE, i.e. ACE waits for GFX)
684 * DWORD 1: ACE->GFX semaphore
685 */
686 uint64_t sem_init = 0;
687 uint32_t va_off = 0;
688 if (!radv_cmd_buffer_upload_data(cmd_buffer, sizeof(uint64_t), &sem_init, &va_off)) {
689 vk_command_buffer_set_error(&cmd_buffer->vk, VK_ERROR_OUT_OF_HOST_MEMORY);
690 return false;
691 }
692
693 cmd_buffer->gang.sem.va = radv_buffer_get_va(cmd_buffer->upload.upload_bo) + va_off;
694 return true;
695 }
696
697 static bool
698 radv_gang_leader_sem_dirty(const struct radv_cmd_buffer *cmd_buffer)
699 {
700 return cmd_buffer->gang.sem.leader_value != cmd_buffer->gang.sem.emitted_leader_value;
701 }
702
703 static bool
704 radv_gang_follower_sem_dirty(const struct radv_cmd_buffer *cmd_buffer)
705 {
706 return cmd_buffer->gang.sem.follower_value != cmd_buffer->gang.sem.emitted_follower_value;
707 }
708
709 ALWAYS_INLINE static bool
710 radv_flush_gang_semaphore(struct radv_cmd_buffer *cmd_buffer, struct radeon_cmdbuf *cs, const enum radv_queue_family qf,
711 const uint32_t va_off, const uint32_t value)
712 {
713 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
714 const struct radv_physical_device *pdev = radv_device_physical(device);
715
716 if (!radv_gang_sem_init(cmd_buffer))
717 return false;
718
719 ASSERTED unsigned cdw_max = radeon_check_space(device->ws, cs, 12);
720
721 radv_cs_emit_write_event_eop(cs, pdev->info.gfx_level, qf, V_028A90_BOTTOM_OF_PIPE_TS, 0, EOP_DST_SEL_MEM,
722 EOP_DATA_SEL_VALUE_32BIT, cmd_buffer->gang.sem.va + va_off, value,
723 cmd_buffer->gfx9_eop_bug_va);
724
725 assert(cmd_buffer->cs->cdw <= cdw_max);
726 return true;
727 }
728
729 ALWAYS_INLINE static bool
730 radv_flush_gang_leader_semaphore(struct radv_cmd_buffer *cmd_buffer)
731 {
732 if (!radv_gang_leader_sem_dirty(cmd_buffer))
733 return false;
734
735 /* Gang leader writes a value to the semaphore which the follower can wait for. */
736 cmd_buffer->gang.sem.emitted_leader_value = cmd_buffer->gang.sem.leader_value;
737 return radv_flush_gang_semaphore(cmd_buffer, cmd_buffer->cs, cmd_buffer->qf, 0, cmd_buffer->gang.sem.leader_value);
738 }
739
740 ALWAYS_INLINE static bool
741 radv_flush_gang_follower_semaphore(struct radv_cmd_buffer *cmd_buffer)
742 {
743 if (!radv_gang_follower_sem_dirty(cmd_buffer))
744 return false;
745
746 /* Follower writes a value to the semaphore which the gang leader can wait for. */
747 cmd_buffer->gang.sem.emitted_follower_value = cmd_buffer->gang.sem.follower_value;
748 return radv_flush_gang_semaphore(cmd_buffer, cmd_buffer->gang.cs, RADV_QUEUE_COMPUTE, 4,
749 cmd_buffer->gang.sem.follower_value);
750 }
751
752 ALWAYS_INLINE static void
753 radv_wait_gang_semaphore(struct radv_cmd_buffer *cmd_buffer, struct radeon_cmdbuf *cs, const enum radv_queue_family qf,
754 const uint32_t va_off, const uint32_t value)
755 {
756 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
757
758 assert(cmd_buffer->gang.sem.va);
759 radeon_check_space(device->ws, cs, 7);
760 radv_cp_wait_mem(cs, qf, WAIT_REG_MEM_GREATER_OR_EQUAL, cmd_buffer->gang.sem.va + va_off, value, 0xffffffff);
761 }
762
763 ALWAYS_INLINE static void
764 radv_wait_gang_leader(struct radv_cmd_buffer *cmd_buffer)
765 {
766 /* Follower waits for the semaphore which the gang leader wrote. */
767 radv_wait_gang_semaphore(cmd_buffer, cmd_buffer->gang.cs, RADV_QUEUE_COMPUTE, 0, cmd_buffer->gang.sem.leader_value);
768 }
769
770 ALWAYS_INLINE static void
771 radv_wait_gang_follower(struct radv_cmd_buffer *cmd_buffer)
772 {
773 /* Gang leader waits for the semaphore which the follower wrote. */
774 radv_wait_gang_semaphore(cmd_buffer, cmd_buffer->cs, cmd_buffer->qf, 4, cmd_buffer->gang.sem.follower_value);
775 }
776
777 bool
778 radv_gang_init(struct radv_cmd_buffer *cmd_buffer)
779 {
780 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
781
782 if (cmd_buffer->gang.cs)
783 return true;
784
785 struct radeon_cmdbuf *ace_cs =
786 device->ws->cs_create(device->ws, AMD_IP_COMPUTE, cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY);
787
788 if (!ace_cs) {
789 vk_command_buffer_set_error(&cmd_buffer->vk, VK_ERROR_OUT_OF_DEVICE_MEMORY);
790 return false;
791 }
792
793 cmd_buffer->gang.cs = ace_cs;
794 return true;
795 }
796
797 static VkResult
798 radv_gang_finalize(struct radv_cmd_buffer *cmd_buffer)
799 {
800 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
801
802 assert(cmd_buffer->gang.cs);
803 struct radeon_cmdbuf *ace_cs = cmd_buffer->gang.cs;
804
805 /* Emit pending cache flush. */
806 radv_gang_cache_flush(cmd_buffer);
807
808 /* Clear the leader<->follower semaphores if they exist.
809 * This is necessary in case the same cmd buffer is submitted again in the future.
810 */
811 if (cmd_buffer->gang.sem.va) {
812 uint64_t leader2follower_va = cmd_buffer->gang.sem.va;
813 uint64_t follower2leader_va = cmd_buffer->gang.sem.va + 4;
814 const uint32_t zero = 0;
815
816 /* Follower: write 0 to the leader->follower semaphore. */
817 radv_cs_write_data(device, ace_cs, RADV_QUEUE_COMPUTE, V_370_ME, leader2follower_va, 1, &zero, false);
818
819 /* Leader: write 0 to the follower->leader semaphore. */
820 radv_write_data(cmd_buffer, V_370_ME, follower2leader_va, 1, &zero, false);
821 }
822
823 return device->ws->cs_finalize(ace_cs);
824 }
825
826 static void
827 radv_cmd_buffer_after_draw(struct radv_cmd_buffer *cmd_buffer, enum radv_cmd_flush_bits flags, bool dgc)
828 {
829 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
830 const struct radv_physical_device *pdev = radv_device_physical(device);
831 const struct radv_instance *instance = radv_physical_device_instance(pdev);
832
833 if (unlikely(device->sqtt.bo) && !dgc) {
834 radeon_check_space(device->ws, cmd_buffer->cs, 2);
835
836 radeon_emit(cmd_buffer->cs, PKT3(PKT3_EVENT_WRITE, 0, cmd_buffer->state.predicating));
837 radeon_emit(cmd_buffer->cs, EVENT_TYPE(V_028A90_THREAD_TRACE_MARKER) | EVENT_INDEX(0));
838 }
839
840 if (instance->debug_flags & RADV_DEBUG_SYNC_SHADERS) {
841 enum rgp_flush_bits sqtt_flush_bits = 0;
842 assert(flags & (RADV_CMD_FLAG_PS_PARTIAL_FLUSH | RADV_CMD_FLAG_CS_PARTIAL_FLUSH));
843
844 /* Force wait for graphics or compute engines to be idle. */
845 radv_cs_emit_cache_flush(device->ws, cmd_buffer->cs, pdev->info.gfx_level, &cmd_buffer->gfx9_fence_idx,
846 cmd_buffer->gfx9_fence_va, cmd_buffer->qf, flags, &sqtt_flush_bits,
847 cmd_buffer->gfx9_eop_bug_va);
848
849 if ((flags & RADV_CMD_FLAG_PS_PARTIAL_FLUSH) && radv_cmdbuf_has_stage(cmd_buffer, MESA_SHADER_TASK)) {
850 /* Force wait for compute engines to be idle on the internal cmdbuf. */
851 radv_cs_emit_cache_flush(device->ws, cmd_buffer->gang.cs, pdev->info.gfx_level, NULL, 0, RADV_QUEUE_COMPUTE,
852 RADV_CMD_FLAG_CS_PARTIAL_FLUSH, &sqtt_flush_bits, 0);
853 }
854 }
855
856 if (radv_device_fault_detection_enabled(device))
857 radv_cmd_buffer_trace_emit(cmd_buffer);
858 }
859
860 static void
861 radv_save_pipeline(struct radv_cmd_buffer *cmd_buffer, struct radv_pipeline *pipeline)
862 {
863 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
864 const struct radv_physical_device *pdev = radv_device_physical(device);
865 enum amd_ip_type ring;
866 uint32_t data[2];
867 uint64_t va;
868
869 va = radv_buffer_get_va(device->trace_bo);
870
871 ring = radv_queue_family_to_ring(pdev, cmd_buffer->qf);
872
873 switch (ring) {
874 case AMD_IP_GFX:
875 va += offsetof(struct radv_trace_data, gfx_ring_pipeline);
876 break;
877 case AMD_IP_COMPUTE:
878 va += offsetof(struct radv_trace_data, comp_ring_pipeline);
879 break;
880 default:
881 assert(!"invalid IP type");
882 }
883
884 uint64_t pipeline_address = (uintptr_t)pipeline;
885 data[0] = pipeline_address;
886 data[1] = pipeline_address >> 32;
887
888 radv_write_data(cmd_buffer, V_370_ME, va, 2, data, false);
889 }
890
891 static void
892 radv_save_vertex_descriptors(struct radv_cmd_buffer *cmd_buffer, uint64_t vb_ptr)
893 {
894 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
895 uint32_t data[2];
896 uint64_t va;
897
898 va = radv_buffer_get_va(device->trace_bo) + offsetof(struct radv_trace_data, vertex_descriptors);
899
900 data[0] = vb_ptr;
901 data[1] = vb_ptr >> 32;
902
903 radv_write_data(cmd_buffer, V_370_ME, va, 2, data, false);
904 }
905
906 static void
907 radv_save_vs_prolog(struct radv_cmd_buffer *cmd_buffer, const struct radv_shader_part *prolog)
908 {
909 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
910 uint32_t data[2];
911 uint64_t va;
912
913 va = radv_buffer_get_va(device->trace_bo) + offsetof(struct radv_trace_data, vertex_prolog);
914
915 uint64_t prolog_address = (uintptr_t)prolog;
916 data[0] = prolog_address;
917 data[1] = prolog_address >> 32;
918
919 radv_write_data(cmd_buffer, V_370_ME, va, 2, data, false);
920 }
921
922 void
923 radv_set_descriptor_set(struct radv_cmd_buffer *cmd_buffer, VkPipelineBindPoint bind_point,
924 struct radv_descriptor_set *set, unsigned idx)
925 {
926 struct radv_descriptor_state *descriptors_state = radv_get_descriptors_state(cmd_buffer, bind_point);
927
928 descriptors_state->sets[idx] = set;
929
930 descriptors_state->valid |= (1u << idx); /* active descriptors */
931 descriptors_state->dirty |= (1u << idx);
932 }
933
934 static void
935 radv_save_descriptors(struct radv_cmd_buffer *cmd_buffer, VkPipelineBindPoint bind_point)
936 {
937 struct radv_descriptor_state *descriptors_state = radv_get_descriptors_state(cmd_buffer, bind_point);
938 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
939 uint32_t data[MAX_SETS * 2] = {0};
940 uint64_t va;
941 va = radv_buffer_get_va(device->trace_bo) + offsetof(struct radv_trace_data, descriptor_sets);
942
943 u_foreach_bit (i, descriptors_state->valid) {
944 struct radv_descriptor_set *set = descriptors_state->sets[i];
945 data[i * 2] = (uint64_t)(uintptr_t)set;
946 data[i * 2 + 1] = (uint64_t)(uintptr_t)set >> 32;
947 }
948
949 radv_write_data(cmd_buffer, V_370_ME, va, MAX_SETS * 2, data, false);
950 }
951
952 static void
953 radv_emit_userdata_address(const struct radv_device *device, struct radeon_cmdbuf *cs, const struct radv_shader *shader,
954 int idx, uint64_t va)
955 {
956 const uint32_t offset = radv_get_user_sgpr_loc(shader, idx);
957
958 if (!offset)
959 return;
960
961 radv_emit_shader_pointer(device, cs, offset, va, false);
962 }
963
964 uint64_t
965 radv_descriptor_get_va(const struct radv_descriptor_state *descriptors_state, unsigned set_idx)
966 {
967 struct radv_descriptor_set *set = descriptors_state->sets[set_idx];
968 uint64_t va;
969
970 if (set) {
971 va = set->header.va;
972 } else {
973 va = descriptors_state->descriptor_buffers[set_idx];
974 }
975
976 return va;
977 }
978
979 static void
980 radv_emit_descriptors_per_stage(const struct radv_device *device, struct radeon_cmdbuf *cs,
981 const struct radv_shader *shader, const struct radv_descriptor_state *descriptors_state)
982 {
983 const uint32_t indirect_descriptor_sets_offset = radv_get_user_sgpr_loc(shader, AC_UD_INDIRECT_DESCRIPTOR_SETS);
984
985 if (indirect_descriptor_sets_offset) {
986 radv_emit_shader_pointer(device, cs, indirect_descriptor_sets_offset,
987 descriptors_state->indirect_descriptor_sets_va, false);
988 } else {
989 const struct radv_userdata_locations *locs = &shader->info.user_sgprs_locs;
990 const uint32_t sh_base = shader->info.user_data_0;
991 unsigned mask = locs->descriptor_sets_enabled;
992
993 mask &= descriptors_state->dirty & descriptors_state->valid;
994
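      /* Emit consecutive ranges of dirty, valid descriptor set pointers with one packet per range. */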
995 while (mask) {
996 int start, count;
997
998 u_bit_scan_consecutive_range(&mask, &start, &count);
999
1000 const struct radv_userdata_info *loc = &locs->descriptor_sets[start];
1001 const unsigned sh_offset = sh_base + loc->sgpr_idx * 4;
1002
1003 radv_emit_shader_pointer_head(cs, sh_offset, count, true);
1004 for (int i = 0; i < count; i++) {
1005 uint64_t va = radv_descriptor_get_va(descriptors_state, start + i);
1006
1007 radv_emit_shader_pointer_body(device, cs, va, true);
1008 }
1009 }
1010 }
1011 }
1012
1013 static unsigned
1014 radv_get_rasterization_prim(const struct radv_cmd_buffer *cmd_buffer)
1015 {
1016 const struct radv_shader *last_vgt_shader = cmd_buffer->state.last_vgt_shader;
1017 const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
1018
1019 if (cmd_buffer->state.active_stages &
1020 (VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT | VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT |
1021 VK_SHADER_STAGE_GEOMETRY_BIT | VK_SHADER_STAGE_MESH_BIT_EXT)) {
1022 /* Ignore dynamic primitive topology for TES/GS/MS stages. */
1023 return cmd_buffer->state.rast_prim;
1024 }
1025
1026 return radv_conv_prim_to_gs_out(d->vk.ia.primitive_topology, last_vgt_shader->info.is_ngg);
1027 }
1028
1029 static ALWAYS_INLINE VkLineRasterizationModeEXT
1030 radv_get_line_mode(const struct radv_cmd_buffer *cmd_buffer)
1031 {
1032 const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
1033
1034 const unsigned rast_prim = radv_get_rasterization_prim(cmd_buffer);
1035
1036 bool draw_lines = radv_rast_prim_is_line(rast_prim) || radv_polygon_mode_is_line(d->vk.rs.polygon_mode);
1037 draw_lines &= !radv_rast_prim_is_point(rast_prim);
1038 draw_lines &= !radv_polygon_mode_is_point(d->vk.rs.polygon_mode);
1039 if (draw_lines)
1040 return d->vk.rs.line.mode;
1041
1042 return VK_LINE_RASTERIZATION_MODE_DEFAULT_EXT;
1043 }
1044
1045 static ALWAYS_INLINE unsigned
1046 radv_get_rasterization_samples(struct radv_cmd_buffer *cmd_buffer)
1047 {
1048 const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
1049
1050 VkLineRasterizationModeEXT line_mode = radv_get_line_mode(cmd_buffer);
1051
1052 if (line_mode == VK_LINE_RASTERIZATION_MODE_BRESENHAM_KHR) {
1053 /* From the Vulkan spec 1.3.221:
1054 *
1055 * "When Bresenham lines are being rasterized, sample locations may all be treated as being at
1056 * the pixel center (this may affect attribute and depth interpolation)."
1057 *
1058 * "One consequence of this is that Bresenham lines cover the same pixels regardless of the
1059 * number of rasterization samples, and cover all samples in those pixels (unless masked out
1060 * or killed)."
1061 */
1062 return 1;
1063 }
1064
1065 if (line_mode == VK_LINE_RASTERIZATION_MODE_RECTANGULAR_SMOOTH_KHR) {
1066 return RADV_NUM_SMOOTH_AA_SAMPLES;
1067 }
1068
1069 return MAX2(1, d->vk.ms.rasterization_samples);
1070 }
1071
1072 static ALWAYS_INLINE unsigned
1073 radv_get_ps_iter_samples(struct radv_cmd_buffer *cmd_buffer)
1074 {
1075 const struct radv_rendering_state *render = &cmd_buffer->state.render;
1076 unsigned ps_iter_samples = 1;
1077
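   /* With sample shading enabled, the PS runs ceil(minSampleShading * samples) times per pixel,
    * rounded up to the next power of two.
    */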
1078 if (cmd_buffer->state.ms.sample_shading_enable) {
1079 unsigned rasterization_samples = radv_get_rasterization_samples(cmd_buffer);
1080 unsigned color_samples = MAX2(render->color_samples, rasterization_samples);
1081
1082 ps_iter_samples = ceilf(cmd_buffer->state.ms.min_sample_shading * color_samples);
1083 ps_iter_samples = util_next_power_of_two(ps_iter_samples);
1084 }
1085
1086 return ps_iter_samples;
1087 }
1088
1089 /**
1090 * Convert the user sample locations to hardware sample locations (the values
1091 * that will be emitted by PA_SC_AA_SAMPLE_LOCS_PIXEL_*).
1092 */
1093 static void
1094 radv_convert_user_sample_locs(const struct radv_sample_locations_state *state, uint32_t x, uint32_t y,
1095 VkOffset2D *sample_locs)
1096 {
1097 uint32_t x_offset = x % state->grid_size.width;
1098 uint32_t y_offset = y % state->grid_size.height;
1099 uint32_t num_samples = (uint32_t)state->per_pixel;
1100 uint32_t pixel_offset;
1101
1102 pixel_offset = (x_offset + y_offset * state->grid_size.width) * num_samples;
1103
1104 assert(pixel_offset <= MAX_SAMPLE_LOCATIONS);
1105 const VkSampleLocationEXT *user_locs = &state->locations[pixel_offset];
1106
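   /* Convert each location from the [0,1) pixel-relative range to the signed 4-bit 1/16th-pixel
    * fixed-point encoding used by the hardware, clamped to [-8, 7].
    */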
1107 for (uint32_t i = 0; i < num_samples; i++) {
1108 float shifted_pos_x = user_locs[i].x - 0.5;
1109 float shifted_pos_y = user_locs[i].y - 0.5;
1110
1111 int32_t scaled_pos_x = floorf(shifted_pos_x * 16);
1112 int32_t scaled_pos_y = floorf(shifted_pos_y * 16);
1113
1114 sample_locs[i].x = CLAMP(scaled_pos_x, -8, 7);
1115 sample_locs[i].y = CLAMP(scaled_pos_y, -8, 7);
1116 }
1117 }
1118
1119 /**
1120 * Compute the PA_SC_AA_SAMPLE_LOCS_PIXEL_* mask based on hardware sample
1121 * locations.
1122 */
1123 static void
1124 radv_compute_sample_locs_pixel(uint32_t num_samples, VkOffset2D *sample_locs, uint32_t *sample_locs_pixel)
1125 {
1126 for (uint32_t i = 0; i < num_samples; i++) {
1127 uint32_t sample_reg_idx = i / 4;
1128 uint32_t sample_loc_idx = i % 4;
1129 int32_t pos_x = sample_locs[i].x;
1130 int32_t pos_y = sample_locs[i].y;
1131
1132 uint32_t shift_x = 8 * sample_loc_idx;
1133 uint32_t shift_y = shift_x + 4;
1134
1135 sample_locs_pixel[sample_reg_idx] |= (pos_x & 0xf) << shift_x;
1136 sample_locs_pixel[sample_reg_idx] |= (pos_y & 0xf) << shift_y;
1137 }
1138 }
1139
1140 /**
1141 * Compute the PA_SC_CENTROID_PRIORITY_* mask based on the top left hardware
1142 * sample locations.
1143 */
1144 static uint64_t
1145 radv_compute_centroid_priority(struct radv_cmd_buffer *cmd_buffer, VkOffset2D *sample_locs, uint32_t num_samples)
1146 {
1147 uint32_t *centroid_priorities = alloca(num_samples * sizeof(*centroid_priorities));
1148 uint32_t sample_mask = num_samples - 1;
1149 uint32_t *distances = alloca(num_samples * sizeof(*distances));
1150 uint64_t centroid_priority = 0;
1151
1152 /* Compute the distances from center for each sample. */
1153 for (int i = 0; i < num_samples; i++) {
1154 distances[i] = (sample_locs[i].x * sample_locs[i].x) + (sample_locs[i].y * sample_locs[i].y);
1155 }
1156
1157 /* Compute the centroid priorities by looking at the distances array. */
1158 for (int i = 0; i < num_samples; i++) {
1159 uint32_t min_idx = 0;
1160
1161 for (int j = 1; j < num_samples; j++) {
1162 if (distances[j] < distances[min_idx])
1163 min_idx = j;
1164 }
1165
1166 centroid_priorities[i] = min_idx;
1167 distances[min_idx] = 0xffffffff;
1168 }
1169
1170 /* Compute the final centroid priority. */
1171 for (int i = 0; i < 8; i++) {
1172 centroid_priority |= centroid_priorities[i & sample_mask] << (i * 4);
1173 }
1174
1175 return centroid_priority << 32 | centroid_priority;
1176 }
1177
1178 /**
1179 * Emit the sample locations that are specified with VK_EXT_sample_locations.
1180 */
1181 static void
1182 radv_emit_sample_locations(struct radv_cmd_buffer *cmd_buffer)
1183 {
1184 const struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
1185 const struct radv_physical_device *pdev = radv_device_physical(device);
1186 const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
1187 uint32_t num_samples = (uint32_t)d->sample_location.per_pixel;
1188 struct radeon_cmdbuf *cs = cmd_buffer->cs;
1189 uint32_t sample_locs_pixel[4][2] = {0};
1190 VkOffset2D sample_locs[4][8]; /* 8 is the max. sample count supported */
1191 uint64_t centroid_priority;
1192
1193 if (!d->sample_location.count || !d->vk.ms.sample_locations_enable)
1194 return;
1195
1196 /* Convert the user sample locations to hardware sample locations. */
1197 radv_convert_user_sample_locs(&d->sample_location, 0, 0, sample_locs[0]);
1198 radv_convert_user_sample_locs(&d->sample_location, 1, 0, sample_locs[1]);
1199 radv_convert_user_sample_locs(&d->sample_location, 0, 1, sample_locs[2]);
1200 radv_convert_user_sample_locs(&d->sample_location, 1, 1, sample_locs[3]);
1201
1202 /* Compute the PA_SC_AA_SAMPLE_LOCS_PIXEL_* mask. */
1203 for (uint32_t i = 0; i < 4; i++) {
1204 radv_compute_sample_locs_pixel(num_samples, sample_locs[i], sample_locs_pixel[i]);
1205 }
1206
1207 /* Compute the PA_SC_CENTROID_PRIORITY_* mask. */
1208 centroid_priority = radv_compute_centroid_priority(cmd_buffer, sample_locs[0], num_samples);
1209
1210 /* Emit the specified user sample locations. */
1211 switch (num_samples) {
1212 case 2:
1213 case 4:
1214 radeon_set_context_reg(cs, R_028BF8_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0, sample_locs_pixel[0][0]);
1215 radeon_set_context_reg(cs, R_028C08_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_0, sample_locs_pixel[1][0]);
1216 radeon_set_context_reg(cs, R_028C18_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_0, sample_locs_pixel[2][0]);
1217 radeon_set_context_reg(cs, R_028C28_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_0, sample_locs_pixel[3][0]);
1218 break;
1219 case 8:
1220 radeon_set_context_reg(cs, R_028BF8_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0, sample_locs_pixel[0][0]);
1221 radeon_set_context_reg(cs, R_028C08_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_0, sample_locs_pixel[1][0]);
1222 radeon_set_context_reg(cs, R_028C18_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_0, sample_locs_pixel[2][0]);
1223 radeon_set_context_reg(cs, R_028C28_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_0, sample_locs_pixel[3][0]);
1224 radeon_set_context_reg(cs, R_028BFC_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_1, sample_locs_pixel[0][1]);
1225 radeon_set_context_reg(cs, R_028C0C_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_1, sample_locs_pixel[1][1]);
1226 radeon_set_context_reg(cs, R_028C1C_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_1, sample_locs_pixel[2][1]);
1227 radeon_set_context_reg(cs, R_028C2C_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_1, sample_locs_pixel[3][1]);
1228 break;
1229 default:
1230 unreachable("invalid number of samples");
1231 }
1232
1233 if (pdev->info.gfx_level >= GFX12) {
1234 radeon_set_context_reg_seq(cs, R_028BF0_PA_SC_CENTROID_PRIORITY_0, 2);
1235 } else {
1236 radeon_set_context_reg_seq(cs, R_028BD4_PA_SC_CENTROID_PRIORITY_0, 2);
1237 }
1238 radeon_emit(cs, centroid_priority);
1239 radeon_emit(cs, centroid_priority >> 32);
1240 }
1241
1242 static void
1243 radv_emit_inline_push_consts(const struct radv_device *device, struct radeon_cmdbuf *cs,
1244 const struct radv_shader *shader, int idx, const uint32_t *values)
1245 {
1246 const struct radv_userdata_info *loc = &shader->info.user_sgprs_locs.shader_data[idx];
1247 const uint32_t base_reg = shader->info.user_data_0;
1248
1249 if (loc->sgpr_idx == -1)
1250 return;
1251
1252 radeon_check_space(device->ws, cs, 2 + loc->num_sgprs);
1253
1254 radeon_set_sh_reg_seq(cs, base_reg + loc->sgpr_idx * 4, loc->num_sgprs);
1255 radeon_emit_array(cs, values, loc->num_sgprs);
1256 }
1257
1258 struct radv_bin_size_entry {
1259 unsigned bpp;
1260 VkExtent2D extent;
1261 };
1262
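/* Compute the primitive binning bin size on GFX10+ from the color, FMASK and depth/stencil cache
 * tag budgets; whichever per-target extent covers the smallest area is chosen, then clamped to the
 * minimum supported bin size.
 */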
1263 static VkExtent2D
1264 radv_gfx10_compute_bin_size(struct radv_cmd_buffer *cmd_buffer)
1265 {
1266 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
1267 const struct radv_physical_device *pdev = radv_device_physical(device);
1268 const struct radv_rendering_state *render = &cmd_buffer->state.render;
1269 const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
1270 VkExtent2D extent = {512, 512};
1271
1272 const unsigned db_tag_size = 64;
1273 const unsigned db_tag_count = 312;
1274 const unsigned color_tag_size = 1024;
1275 const unsigned color_tag_count = 31;
1276 const unsigned fmask_tag_size = 256;
1277 const unsigned fmask_tag_count = 44;
1278
1279 const unsigned rb_count = pdev->info.max_render_backends;
1280 const unsigned pipe_count = MAX2(rb_count, pdev->info.num_tcc_blocks);
1281
1282 const unsigned db_tag_part = (db_tag_count * rb_count / pipe_count) * db_tag_size * pipe_count;
1283 const unsigned color_tag_part = (color_tag_count * rb_count / pipe_count) * color_tag_size * pipe_count;
1284 const unsigned fmask_tag_part = (fmask_tag_count * rb_count / pipe_count) * fmask_tag_size * pipe_count;
1285
1286 const unsigned total_samples = radv_get_rasterization_samples(cmd_buffer);
1287 const unsigned samples_log = util_logbase2_ceil(total_samples);
1288
1289 unsigned color_bytes_per_pixel = 0;
1290 unsigned fmask_bytes_per_pixel = 0;
1291
1292 for (unsigned i = 0; i < render->color_att_count; ++i) {
1293 struct radv_image_view *iview = render->color_att[i].iview;
1294
1295 if (!iview)
1296 continue;
1297
1298 if (!d->vk.cb.attachments[i].write_mask)
1299 continue;
1300
1301 color_bytes_per_pixel += vk_format_get_blocksize(render->color_att[i].format);
1302
1303 if (total_samples > 1) {
1304 assert(samples_log <= 3);
1305 const unsigned fmask_array[] = {0, 1, 1, 4};
1306 fmask_bytes_per_pixel += fmask_array[samples_log];
1307 }
1308 }
1309
1310 color_bytes_per_pixel *= total_samples;
1311 color_bytes_per_pixel = MAX2(color_bytes_per_pixel, 1);
1312
1313 const unsigned color_pixel_count_log = util_logbase2(color_tag_part / color_bytes_per_pixel);
1314 extent.width = 1ull << ((color_pixel_count_log + 1) / 2);
1315 extent.height = 1ull << (color_pixel_count_log / 2);
1316
1317 if (fmask_bytes_per_pixel) {
1318 const unsigned fmask_pixel_count_log = util_logbase2(fmask_tag_part / fmask_bytes_per_pixel);
1319
1320 const VkExtent2D fmask_extent = (VkExtent2D){.width = 1ull << ((fmask_pixel_count_log + 1) / 2),
1321 .height = 1ull << (color_pixel_count_log / 2)};
1322
1323 if (fmask_extent.width * fmask_extent.height < extent.width * extent.height)
1324 extent = fmask_extent;
1325 }
1326
1327 if (render->ds_att.iview) {
1328 /* Coefficients taken from AMDVLK */
1329 unsigned depth_coeff = vk_format_has_depth(render->ds_att.format) ? 5 : 0;
1330 unsigned stencil_coeff = vk_format_has_stencil(render->ds_att.format) ? 1 : 0;
1331 unsigned db_bytes_per_pixel = (depth_coeff + stencil_coeff) * total_samples;
1332
1333 const unsigned db_pixel_count_log = util_logbase2(db_tag_part / db_bytes_per_pixel);
1334
1335 const VkExtent2D db_extent =
1336 (VkExtent2D){.width = 1ull << ((db_pixel_count_log + 1) / 2), .height = 1ull << (color_pixel_count_log / 2)};
1337
1338 if (db_extent.width * db_extent.height < extent.width * extent.height)
1339 extent = db_extent;
1340 }
1341
1342 extent.width = MAX2(extent.width, 128);
1343 extent.height = MAX2(extent.width, pdev->info.gfx_level >= GFX12 ? 128 : 64);
1344
1345 return extent;
1346 }
1347
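/* GFX9: pick the bin size from fixed lookup tables indexed by the number of RBs per shader
 * engine and the number of shader engines, keyed on the per-pixel color and depth/stencil
 * footprint.
 */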
1348 static VkExtent2D
1349 radv_gfx9_compute_bin_size(struct radv_cmd_buffer *cmd_buffer)
1350 {
1351 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
1352 const struct radv_physical_device *pdev = radv_device_physical(device);
1353 const struct radv_rendering_state *render = &cmd_buffer->state.render;
1354 const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
1355 static const struct radv_bin_size_entry color_size_table[][3][9] = {
1356 {
1357 /* One RB / SE */
1358 {
1359 /* One shader engine */
1360 {0, {128, 128}},
1361 {1, {64, 128}},
1362 {2, {32, 128}},
1363 {3, {16, 128}},
1364 {17, {0, 0}},
1365 {UINT_MAX, {0, 0}},
1366 },
1367 {
1368 /* Two shader engines */
1369 {0, {128, 128}},
1370 {2, {64, 128}},
1371 {3, {32, 128}},
1372 {5, {16, 128}},
1373 {17, {0, 0}},
1374 {UINT_MAX, {0, 0}},
1375 },
1376 {
1377 /* Four shader engines */
1378 {0, {128, 128}},
1379 {3, {64, 128}},
1380 {5, {16, 128}},
1381 {17, {0, 0}},
1382 {UINT_MAX, {0, 0}},
1383 },
1384 },
1385 {
1386 /* Two RB / SE */
1387 {
1388 /* One shader engine */
1389 {0, {128, 128}},
1390 {2, {64, 128}},
1391 {3, {32, 128}},
1392 {5, {16, 128}},
1393 {33, {0, 0}},
1394 {UINT_MAX, {0, 0}},
1395 },
1396 {
1397 /* Two shader engines */
1398 {0, {128, 128}},
1399 {3, {64, 128}},
1400 {5, {32, 128}},
1401 {9, {16, 128}},
1402 {33, {0, 0}},
1403 {UINT_MAX, {0, 0}},
1404 },
1405 {
1406 /* Four shader engines */
1407 {0, {256, 256}},
1408 {2, {128, 256}},
1409 {3, {128, 128}},
1410 {5, {64, 128}},
1411 {9, {16, 128}},
1412 {33, {0, 0}},
1413 {UINT_MAX, {0, 0}},
1414 },
1415 },
1416 {
1417 /* Four RB / SE */
1418 {
1419 /* One shader engine */
1420 {0, {128, 256}},
1421 {2, {128, 128}},
1422 {3, {64, 128}},
1423 {5, {32, 128}},
1424 {9, {16, 128}},
1425 {33, {0, 0}},
1426 {UINT_MAX, {0, 0}},
1427 },
1428 {
1429 /* Two shader engines */
1430 {0, {256, 256}},
1431 {2, {128, 256}},
1432 {3, {128, 128}},
1433 {5, {64, 128}},
1434 {9, {32, 128}},
1435 {17, {16, 128}},
1436 {33, {0, 0}},
1437 {UINT_MAX, {0, 0}},
1438 },
1439 {
1440 /* Four shader engines */
1441 {0, {256, 512}},
1442 {2, {256, 256}},
1443 {3, {128, 256}},
1444 {5, {128, 128}},
1445 {9, {64, 128}},
1446 {17, {16, 128}},
1447 {33, {0, 0}},
1448 {UINT_MAX, {0, 0}},
1449 },
1450 },
1451 };
1452 static const struct radv_bin_size_entry ds_size_table[][3][9] = {
1453 {
1454 // One RB / SE
1455 {
1456 // One shader engine
1457 {0, {128, 256}},
1458 {2, {128, 128}},
1459 {4, {64, 128}},
1460 {7, {32, 128}},
1461 {13, {16, 128}},
1462 {49, {0, 0}},
1463 {UINT_MAX, {0, 0}},
1464 },
1465 {
1466 // Two shader engines
1467 {0, {256, 256}},
1468 {2, {128, 256}},
1469 {4, {128, 128}},
1470 {7, {64, 128}},
1471 {13, {32, 128}},
1472 {25, {16, 128}},
1473 {49, {0, 0}},
1474 {UINT_MAX, {0, 0}},
1475 },
1476 {
1477 // Four shader engines
1478 {0, {256, 512}},
1479 {2, {256, 256}},
1480 {4, {128, 256}},
1481 {7, {128, 128}},
1482 {13, {64, 128}},
1483 {25, {16, 128}},
1484 {49, {0, 0}},
1485 {UINT_MAX, {0, 0}},
1486 },
1487 },
1488 {
1489 // Two RB / SE
1490 {
1491 // One shader engine
1492 {0, {256, 256}},
1493 {2, {128, 256}},
1494 {4, {128, 128}},
1495 {7, {64, 128}},
1496 {13, {32, 128}},
1497 {25, {16, 128}},
1498 {97, {0, 0}},
1499 {UINT_MAX, {0, 0}},
1500 },
1501 {
1502 // Two shader engines
1503 {0, {256, 512}},
1504 {2, {256, 256}},
1505 {4, {128, 256}},
1506 {7, {128, 128}},
1507 {13, {64, 128}},
1508 {25, {32, 128}},
1509 {49, {16, 128}},
1510 {97, {0, 0}},
1511 {UINT_MAX, {0, 0}},
1512 },
1513 {
1514 // Four shader engines
1515 {0, {512, 512}},
1516 {2, {256, 512}},
1517 {4, {256, 256}},
1518 {7, {128, 256}},
1519 {13, {128, 128}},
1520 {25, {64, 128}},
1521 {49, {16, 128}},
1522 {97, {0, 0}},
1523 {UINT_MAX, {0, 0}},
1524 },
1525 },
1526 {
1527 // Four RB / SE
1528 {
1529 // One shader engine
1530 {0, {256, 512}},
1531 {2, {256, 256}},
1532 {4, {128, 256}},
1533 {7, {128, 128}},
1534 {13, {64, 128}},
1535 {25, {32, 128}},
1536 {49, {16, 128}},
1537 {UINT_MAX, {0, 0}},
1538 },
1539 {
1540 // Two shader engines
1541 {0, {512, 512}},
1542 {2, {256, 512}},
1543 {4, {256, 256}},
1544 {7, {128, 256}},
1545 {13, {128, 128}},
1546 {25, {64, 128}},
1547 {49, {32, 128}},
1548 {97, {16, 128}},
1549 {UINT_MAX, {0, 0}},
1550 },
1551 {
1552 // Four shader engines
1553 {0, {512, 512}},
1554 {4, {256, 512}},
1555 {7, {256, 256}},
1556 {13, {128, 256}},
1557 {25, {128, 128}},
1558 {49, {64, 128}},
1559 {97, {16, 128}},
1560 {UINT_MAX, {0, 0}},
1561 },
1562 },
1563 };
1564
1565 VkExtent2D extent = {512, 512};
1566
1567 unsigned log_num_rb_per_se = util_logbase2_ceil(pdev->info.max_render_backends / pdev->info.max_se);
1568 unsigned log_num_se = util_logbase2_ceil(pdev->info.max_se);
1569
1570 unsigned total_samples = radv_get_rasterization_samples(cmd_buffer);
1571 unsigned ps_iter_samples = radv_get_ps_iter_samples(cmd_buffer);
1572 unsigned effective_samples = total_samples;
1573 unsigned color_bytes_per_pixel = 0;
1574
1575 for (unsigned i = 0; i < render->color_att_count; ++i) {
1576 struct radv_image_view *iview = render->color_att[i].iview;
1577
1578 if (!iview)
1579 continue;
1580
1581 if (!d->vk.cb.attachments[i].write_mask)
1582 continue;
1583
1584 color_bytes_per_pixel += vk_format_get_blocksize(render->color_att[i].format);
1585 }
1586
1587 /* MSAA images typically don't use all samples all the time. */
1588 if (effective_samples >= 2 && ps_iter_samples <= 1)
1589 effective_samples = 2;
1590 color_bytes_per_pixel *= effective_samples;
1591
1592 const struct radv_bin_size_entry *color_entry = color_size_table[log_num_rb_per_se][log_num_se];
1593 while (color_entry[1].bpp <= color_bytes_per_pixel)
1594 ++color_entry;
1595
1596 extent = color_entry->extent;
1597
1598 if (render->ds_att.iview) {
1599 /* Coefficients taken from AMDVLK */
1600 unsigned depth_coeff = vk_format_has_depth(render->ds_att.format) ? 5 : 0;
1601 unsigned stencil_coeff = vk_format_has_stencil(render->ds_att.format) ? 1 : 0;
1602 unsigned ds_bytes_per_pixel = 4 * (depth_coeff + stencil_coeff) * total_samples;
1603
1604 const struct radv_bin_size_entry *ds_entry = ds_size_table[log_num_rb_per_se][log_num_se];
1605 while (ds_entry[1].bpp <= ds_bytes_per_pixel)
1606 ++ds_entry;
1607
1608 if (ds_entry->extent.width * ds_entry->extent.height < extent.width * extent.height)
1609 extent = ds_entry->extent;
1610 }
1611
1612 return extent;
1613 }
1614
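/* Binning disabled: GFX10+ still programs a bin size here. The GFX10/GFX11 path derives it from
 * the smallest per-pixel color footprint of the bound attachments, while GFX12 uses a fixed
 * 128x128 bin.
 */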
1615 static unsigned
1616 radv_get_disabled_binning_state(struct radv_cmd_buffer *cmd_buffer)
1617 {
1618 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
1619 const struct radv_physical_device *pdev = radv_device_physical(device);
1620 const struct radv_rendering_state *render = &cmd_buffer->state.render;
1621 const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
1622 uint32_t pa_sc_binner_cntl_0;
1623
1624 if (pdev->info.gfx_level >= GFX12) {
1625 const uint32_t bin_size_x = 128, bin_size_y = 128;
1626
1627 pa_sc_binner_cntl_0 =
1628 S_028C44_BINNING_MODE(V_028C44_BINNING_DISABLED) | S_028C44_BIN_SIZE_X_EXTEND(util_logbase2(bin_size_x) - 5) |
1629 S_028C44_BIN_SIZE_Y_EXTEND(util_logbase2(bin_size_y) - 5) | S_028C44_DISABLE_START_OF_PRIM(1) |
1630 S_028C44_FPOVS_PER_BATCH(63) | S_028C44_OPTIMAL_BIN_SELECTION(1) | S_028C44_FLUSH_ON_BINNING_TRANSITION(1);
1631 } else if (pdev->info.gfx_level >= GFX10) {
1632 const unsigned binning_disabled =
1633 pdev->info.gfx_level >= GFX11_5 ? V_028C44_BINNING_DISABLED : V_028C44_DISABLE_BINNING_USE_NEW_SC;
1634 unsigned min_bytes_per_pixel = 0;
1635
1636 for (unsigned i = 0; i < render->color_att_count; ++i) {
1637 struct radv_image_view *iview = render->color_att[i].iview;
1638
1639 if (!iview)
1640 continue;
1641
1642 if (!d->vk.cb.attachments[i].write_mask)
1643 continue;
1644
1645 unsigned bytes = vk_format_get_blocksize(render->color_att[i].format);
1646 if (!min_bytes_per_pixel || bytes < min_bytes_per_pixel)
1647 min_bytes_per_pixel = bytes;
1648 }
1649
1650 pa_sc_binner_cntl_0 = S_028C44_BINNING_MODE(binning_disabled) | S_028C44_BIN_SIZE_X(0) | S_028C44_BIN_SIZE_Y(0) |
1651 S_028C44_BIN_SIZE_X_EXTEND(2) | /* 128 */
1652 S_028C44_BIN_SIZE_Y_EXTEND(min_bytes_per_pixel <= 4 ? 2 : 1) | /* 128 or 64 */
1653 S_028C44_DISABLE_START_OF_PRIM(1) | S_028C44_FLUSH_ON_BINNING_TRANSITION(1);
1654 } else {
1655 pa_sc_binner_cntl_0 =
1656 S_028C44_BINNING_MODE(V_028C44_DISABLE_BINNING_USE_LEGACY_SC) | S_028C44_DISABLE_START_OF_PRIM(1) |
1657 S_028C44_FLUSH_ON_BINNING_TRANSITION(pdev->info.family == CHIP_VEGA12 || pdev->info.family == CHIP_VEGA20 ||
1658 pdev->info.family >= CHIP_RAVEN2);
1659 }
1660
1661 return pa_sc_binner_cntl_0;
1662 }
1663
1664 static unsigned
1665 radv_get_binning_state(struct radv_cmd_buffer *cmd_buffer)
1666 {
1667 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
1668 const struct radv_physical_device *pdev = radv_device_physical(device);
1669 unsigned pa_sc_binner_cntl_0;
1670 VkExtent2D bin_size;
1671
1672 if (pdev->info.gfx_level >= GFX10) {
1673 bin_size = radv_gfx10_compute_bin_size(cmd_buffer);
1674 } else {
1675 assert(pdev->info.gfx_level == GFX9);
1676 bin_size = radv_gfx9_compute_bin_size(cmd_buffer);
1677 }
1678
1679 if (device->pbb_allowed && bin_size.width && bin_size.height) {
1680 const struct radv_binning_settings *settings = &pdev->binning_settings;
1681
1682 pa_sc_binner_cntl_0 =
1683 S_028C44_BINNING_MODE(V_028C44_BINNING_ALLOWED) | S_028C44_BIN_SIZE_X(bin_size.width == 16) |
1684 S_028C44_BIN_SIZE_Y(bin_size.height == 16) |
1685 S_028C44_BIN_SIZE_X_EXTEND(util_logbase2(MAX2(bin_size.width, 32)) - 5) |
1686 S_028C44_BIN_SIZE_Y_EXTEND(util_logbase2(MAX2(bin_size.height, 32)) - 5) |
1687 S_028C44_CONTEXT_STATES_PER_BIN(settings->context_states_per_bin - 1) |
1688 S_028C44_PERSISTENT_STATES_PER_BIN(settings->persistent_states_per_bin - 1) |
1689 S_028C44_DISABLE_START_OF_PRIM(1) | S_028C44_FPOVS_PER_BATCH(settings->fpovs_per_batch) |
1690 S_028C44_OPTIMAL_BIN_SELECTION(1) |
1691 S_028C44_FLUSH_ON_BINNING_TRANSITION(pdev->info.family == CHIP_VEGA12 || pdev->info.family == CHIP_VEGA20 ||
1692 pdev->info.family >= CHIP_RAVEN2);
1693 } else {
1694 pa_sc_binner_cntl_0 = radv_get_disabled_binning_state(cmd_buffer);
1695 }
1696
1697 return pa_sc_binner_cntl_0;
1698 }
1699
1700 static void
1701 radv_emit_binning_state(struct radv_cmd_buffer *cmd_buffer)
1702 {
1703 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
1704 const struct radv_physical_device *pdev = radv_device_physical(device);
1705 unsigned pa_sc_binner_cntl_0;
1706
1707 if (pdev->info.gfx_level < GFX9)
1708 return;
1709
1710 pa_sc_binner_cntl_0 = radv_get_binning_state(cmd_buffer);
1711
1712 radeon_opt_set_context_reg(cmd_buffer, R_028C44_PA_SC_BINNER_CNTL_0, RADV_TRACKED_PA_SC_BINNER_CNTL_0,
1713 pa_sc_binner_cntl_0);
1714 }
1715
1716 static void
1717 radv_emit_shader_prefetch(struct radv_cmd_buffer *cmd_buffer, struct radv_shader *shader)
1718 {
1719 uint64_t va;
1720
1721 if (!shader)
1722 return;
1723
1724 va = radv_shader_get_va(shader);
1725
1726 radv_cp_dma_prefetch(cmd_buffer, va, shader->code_size);
1727 }
1728
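/* Prefetch shader code and, optionally, the vertex buffer descriptors into L2 with CP DMA.
 * Bits that have been prefetched are cleared from prefetch_L2_mask so they are not prefetched
 * again.
 */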
1729 ALWAYS_INLINE static void
1730 radv_emit_prefetch_L2(struct radv_cmd_buffer *cmd_buffer, bool first_stage_only)
1731 {
1732 struct radv_cmd_state *state = &cmd_buffer->state;
1733 uint32_t mask = state->prefetch_L2_mask;
1734
1735 /* Fast prefetch path for starting draws as soon as possible. */
1736 if (first_stage_only)
1737 mask &= RADV_PREFETCH_VS | RADV_PREFETCH_VBO_DESCRIPTORS | RADV_PREFETCH_MS;
1738
1739 if (mask & RADV_PREFETCH_VS)
1740 radv_emit_shader_prefetch(cmd_buffer, cmd_buffer->state.shaders[MESA_SHADER_VERTEX]);
1741
1742 if (mask & RADV_PREFETCH_MS)
1743 radv_emit_shader_prefetch(cmd_buffer, cmd_buffer->state.shaders[MESA_SHADER_MESH]);
1744
1745 if (mask & RADV_PREFETCH_VBO_DESCRIPTORS)
1746 radv_cp_dma_prefetch(cmd_buffer, state->vb_va, state->vb_size);
1747
1748 if (mask & RADV_PREFETCH_TCS)
1749 radv_emit_shader_prefetch(cmd_buffer, cmd_buffer->state.shaders[MESA_SHADER_TESS_CTRL]);
1750
1751 if (mask & RADV_PREFETCH_TES)
1752 radv_emit_shader_prefetch(cmd_buffer, cmd_buffer->state.shaders[MESA_SHADER_TESS_EVAL]);
1753
1754 if (mask & RADV_PREFETCH_GS) {
1755 radv_emit_shader_prefetch(cmd_buffer, cmd_buffer->state.shaders[MESA_SHADER_GEOMETRY]);
1756 if (cmd_buffer->state.gs_copy_shader)
1757 radv_emit_shader_prefetch(cmd_buffer, cmd_buffer->state.gs_copy_shader);
1758 }
1759
1760 if (mask & RADV_PREFETCH_PS) {
1761 radv_emit_shader_prefetch(cmd_buffer, cmd_buffer->state.shaders[MESA_SHADER_FRAGMENT]);
1762 }
1763
1764 state->prefetch_L2_mask &= ~mask;
1765 }
1766
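/* RB+ state: derive SX_PS_DOWNCONVERT, SX_BLEND_OPT_EPSILON and SX_BLEND_OPT_CONTROL from the
 * bound color attachments, their SPI export formats and the color write masks, so the hardware
 * can down-convert exports and apply per-channel blend optimizations.
 */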
1767 static void
1768 radv_emit_rbplus_state(struct radv_cmd_buffer *cmd_buffer)
1769 {
1770 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
1771 const struct radv_physical_device *pdev = radv_device_physical(device);
1772
1773 assert(pdev->info.rbplus_allowed);
1774
1775 const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
1776 struct radv_rendering_state *render = &cmd_buffer->state.render;
1777
1778 unsigned sx_ps_downconvert = 0;
1779 unsigned sx_blend_opt_epsilon = 0;
1780 unsigned sx_blend_opt_control = 0;
1781
1782 for (unsigned i = 0; i < render->color_att_count; i++) {
1783 unsigned format, swap;
1784 bool has_alpha, has_rgb;
1785 if (render->color_att[i].iview == NULL) {
1786 /* We don't set the DISABLE bits, because the HW can't have holes,
1787 * so the SPI color format is set to 32-bit 1-component. */
1788 sx_ps_downconvert |= V_028754_SX_RT_EXPORT_32_R << (i * 4);
1789 continue;
1790 }
1791
1792 struct radv_color_buffer_info *cb = &render->color_att[i].cb;
1793
1794 format = pdev->info.gfx_level >= GFX11 ? G_028C70_FORMAT_GFX11(cb->ac.cb_color_info)
1795 : G_028C70_FORMAT_GFX6(cb->ac.cb_color_info);
1796 swap = G_028C70_COMP_SWAP(cb->ac.cb_color_info);
1797 has_alpha = pdev->info.gfx_level >= GFX11 ? !G_028C74_FORCE_DST_ALPHA_1_GFX11(cb->ac.cb_color_attrib)
1798 : !G_028C74_FORCE_DST_ALPHA_1_GFX6(cb->ac.cb_color_attrib);
1799
1800 uint32_t spi_format = (cmd_buffer->state.spi_shader_col_format >> (i * 4)) & 0xf;
1801 uint32_t colormask = d->vk.cb.attachments[i].write_mask;
1802
1803 if (format == V_028C70_COLOR_8 || format == V_028C70_COLOR_16 || format == V_028C70_COLOR_32)
1804 has_rgb = !has_alpha;
1805 else
1806 has_rgb = true;
1807
1808 /* Check the colormask and export format. */
1809 if (!(colormask & 0x7))
1810 has_rgb = false;
1811 if (!(colormask & 0x8))
1812 has_alpha = false;
1813
1814 if (spi_format == V_028714_SPI_SHADER_ZERO) {
1815 has_rgb = false;
1816 has_alpha = false;
1817 }
1818
1819 /* The HW doesn't quite blend correctly with rgb9e5 if we disable the alpha
1820 * optimization, even though it has no alpha. */
1821 if (has_rgb && format == V_028C70_COLOR_5_9_9_9)
1822 has_alpha = true;
1823
1824 /* Disable value checking for disabled channels. */
1825 if (!has_rgb)
1826 sx_blend_opt_control |= S_02875C_MRT0_COLOR_OPT_DISABLE(1) << (i * 4);
1827 if (!has_alpha)
1828 sx_blend_opt_control |= S_02875C_MRT0_ALPHA_OPT_DISABLE(1) << (i * 4);
1829
1830 /* Enable down-conversion for 32bpp and smaller formats. */
1831 switch (format) {
1832 case V_028C70_COLOR_8:
1833 case V_028C70_COLOR_8_8:
1834 case V_028C70_COLOR_8_8_8_8:
1835 /* For 1 and 2-channel formats, use the superset thereof. */
1836 if (spi_format == V_028714_SPI_SHADER_FP16_ABGR || spi_format == V_028714_SPI_SHADER_UINT16_ABGR ||
1837 spi_format == V_028714_SPI_SHADER_SINT16_ABGR) {
1838 sx_ps_downconvert |= V_028754_SX_RT_EXPORT_8_8_8_8 << (i * 4);
1839
1840 if (G_028C70_NUMBER_TYPE(cb->ac.cb_color_info) != V_028C70_NUMBER_SRGB)
1841 sx_blend_opt_epsilon |= V_028758_8BIT_FORMAT_0_5 << (i * 4);
1842 }
1843 break;
1844
1845 case V_028C70_COLOR_5_6_5:
1846 if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) {
1847 sx_ps_downconvert |= V_028754_SX_RT_EXPORT_5_6_5 << (i * 4);
1848 sx_blend_opt_epsilon |= V_028758_6BIT_FORMAT_0_5 << (i * 4);
1849 }
1850 break;
1851
1852 case V_028C70_COLOR_1_5_5_5:
1853 if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) {
1854 sx_ps_downconvert |= V_028754_SX_RT_EXPORT_1_5_5_5 << (i * 4);
1855 sx_blend_opt_epsilon |= V_028758_5BIT_FORMAT_0_5 << (i * 4);
1856 }
1857 break;
1858
1859 case V_028C70_COLOR_4_4_4_4:
1860 if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) {
1861 sx_ps_downconvert |= V_028754_SX_RT_EXPORT_4_4_4_4 << (i * 4);
1862 sx_blend_opt_epsilon |= V_028758_4BIT_FORMAT_0_5 << (i * 4);
1863 }
1864 break;
1865
1866 case V_028C70_COLOR_32:
1867 if (swap == V_028C70_SWAP_STD && spi_format == V_028714_SPI_SHADER_32_R)
1868 sx_ps_downconvert |= V_028754_SX_RT_EXPORT_32_R << (i * 4);
1869 else if (swap == V_028C70_SWAP_ALT_REV && spi_format == V_028714_SPI_SHADER_32_AR)
1870 sx_ps_downconvert |= V_028754_SX_RT_EXPORT_32_A << (i * 4);
1871 break;
1872
1873 case V_028C70_COLOR_16:
1874 case V_028C70_COLOR_16_16:
1875 /* For 1-channel formats, use the superset thereof. */
1876 if (spi_format == V_028714_SPI_SHADER_UNORM16_ABGR || spi_format == V_028714_SPI_SHADER_SNORM16_ABGR ||
1877 spi_format == V_028714_SPI_SHADER_UINT16_ABGR || spi_format == V_028714_SPI_SHADER_SINT16_ABGR) {
1878 if (swap == V_028C70_SWAP_STD || swap == V_028C70_SWAP_STD_REV)
1879 sx_ps_downconvert |= V_028754_SX_RT_EXPORT_16_16_GR << (i * 4);
1880 else
1881 sx_ps_downconvert |= V_028754_SX_RT_EXPORT_16_16_AR << (i * 4);
1882 }
1883 break;
1884
1885 case V_028C70_COLOR_10_11_11:
1886 if (spi_format == V_028714_SPI_SHADER_FP16_ABGR)
1887 sx_ps_downconvert |= V_028754_SX_RT_EXPORT_10_11_11 << (i * 4);
1888 break;
1889
1890 case V_028C70_COLOR_2_10_10_10:
1891 if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) {
1892 sx_ps_downconvert |= V_028754_SX_RT_EXPORT_2_10_10_10 << (i * 4);
1893 sx_blend_opt_epsilon |= V_028758_10BIT_FORMAT_0_5 << (i * 4);
1894 }
1895 break;
1896 case V_028C70_COLOR_5_9_9_9:
1897 if (spi_format == V_028714_SPI_SHADER_FP16_ABGR)
1898 sx_ps_downconvert |= V_028754_SX_RT_EXPORT_9_9_9_E5 << (i * 4);
1899 break;
1900 }
1901 }
1902
1903 /* Do not set the DISABLE bits for the unused attachments, as that
1904 * breaks dual source blending in SkQP and does not seem to improve
1905 * performance. */
1906
1907 radeon_opt_set_context_reg3(cmd_buffer, R_028754_SX_PS_DOWNCONVERT, RADV_TRACKED_SX_PS_DOWNCONVERT,
1908 sx_ps_downconvert, sx_blend_opt_epsilon, sx_blend_opt_control);
1909
1910 cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_RBPLUS;
1911 }
1912
1913 static void
1914 radv_emit_ps_epilog_state(struct radv_cmd_buffer *cmd_buffer, struct radv_shader_part *ps_epilog)
1915 {
1916 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
1917 const struct radv_physical_device *pdev = radv_device_physical(device);
1918 struct radv_shader *ps_shader = cmd_buffer->state.shaders[MESA_SHADER_FRAGMENT];
1919
1920 if (cmd_buffer->state.emitted_ps_epilog == ps_epilog)
1921 return;
1922
1923 if (ps_epilog->spi_shader_z_format) {
1924 if (pdev->info.gfx_level >= GFX12) {
1925 radeon_set_context_reg(cmd_buffer->cs, R_028650_SPI_SHADER_Z_FORMAT, ps_epilog->spi_shader_z_format);
1926 } else {
1927 radeon_set_context_reg(cmd_buffer->cs, R_028710_SPI_SHADER_Z_FORMAT, ps_epilog->spi_shader_z_format);
1928 }
1929 }
1930
1931 assert(ps_shader->config.num_shared_vgprs == 0);
1932 if (G_00B848_VGPRS(ps_epilog->rsrc1) > G_00B848_VGPRS(ps_shader->config.rsrc1)) {
1933 uint32_t rsrc1 = ps_shader->config.rsrc1;
1934 rsrc1 = (rsrc1 & C_00B848_VGPRS) | (ps_epilog->rsrc1 & ~C_00B848_VGPRS);
1935 radeon_set_sh_reg(cmd_buffer->cs, ps_shader->info.regs.pgm_rsrc1, rsrc1);
1936 }
1937
1938 radv_cs_add_buffer(device->ws, cmd_buffer->cs, ps_epilog->bo);
1939
1940 assert((ps_epilog->va >> 32) == pdev->info.address32_hi);
1941
1942 const uint32_t epilog_pc_offset = radv_get_user_sgpr_loc(ps_shader, AC_UD_EPILOG_PC);
1943 radv_emit_shader_pointer(device, cmd_buffer->cs, epilog_pc_offset, ps_epilog->va, false);
1944
1945 cmd_buffer->shader_upload_seq = MAX2(cmd_buffer->shader_upload_seq, ps_epilog->upload_seq);
1946
1947 cmd_buffer->state.emitted_ps_epilog = ps_epilog;
1948 }
1949
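/* Emit the dispatch-independent registers for a compute shader: program address, RSRC1/RSRC2
 * (and RSRC3 on GFX10+), resource limits and the fixed workgroup size.
 */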
1950 void
1951 radv_emit_compute_shader(const struct radv_physical_device *pdev, struct radeon_cmdbuf *cs,
1952 const struct radv_shader *shader)
1953 {
1954 uint64_t va = radv_shader_get_va(shader);
1955
1956 radeon_set_sh_reg(cs, shader->info.regs.pgm_lo, va >> 8);
1957
1958 radeon_set_sh_reg_seq(cs, shader->info.regs.pgm_rsrc1, 2);
1959 radeon_emit(cs, shader->config.rsrc1);
1960 radeon_emit(cs, shader->config.rsrc2);
1961 if (pdev->info.gfx_level >= GFX10) {
1962 radeon_set_sh_reg(cs, shader->info.regs.pgm_rsrc3, shader->config.rsrc3);
1963 }
1964
1965 radeon_set_sh_reg(cs, R_00B854_COMPUTE_RESOURCE_LIMITS, shader->info.regs.cs.compute_resource_limits);
1966 radeon_set_sh_reg_seq(cs, R_00B81C_COMPUTE_NUM_THREAD_X, 3);
1967 radeon_emit(cs, shader->info.regs.cs.compute_num_thread_x);
1968 radeon_emit(cs, shader->info.regs.cs.compute_num_thread_y);
1969 radeon_emit(cs, shader->info.regs.cs.compute_num_thread_z);
1970 }
1971
1972 static void
1973 radv_emit_vgt_gs_mode(struct radv_cmd_buffer *cmd_buffer)
1974 {
1975 const struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
1976 const struct radv_physical_device *pdev = radv_device_physical(device);
1977 const struct radv_shader_info *info = &cmd_buffer->state.last_vgt_shader->info;
1978 unsigned vgt_primitiveid_en = 0;
1979 uint32_t vgt_gs_mode = 0;
1980
1981 if (info->is_ngg)
1982 return;
1983
1984 if (info->stage == MESA_SHADER_GEOMETRY) {
1985 vgt_gs_mode = ac_vgt_gs_mode(info->gs.vertices_out, pdev->info.gfx_level);
1986 } else if (info->outinfo.export_prim_id || info->uses_prim_id) {
1987 vgt_gs_mode = S_028A40_MODE(V_028A40_GS_SCENARIO_A);
1988 vgt_primitiveid_en |= S_028A84_PRIMITIVEID_EN(1);
1989 }
1990
1991 radeon_opt_set_context_reg(cmd_buffer, R_028A84_VGT_PRIMITIVEID_EN, RADV_TRACKED_VGT_PRIMITIVEID_EN,
1992 vgt_primitiveid_en);
1993 radeon_opt_set_context_reg(cmd_buffer, R_028A40_VGT_GS_MODE, RADV_TRACKED_VGT_GS_MODE, vgt_gs_mode);
1994 }
1995
1996 static void
1997 radv_emit_hw_vs(struct radv_cmd_buffer *cmd_buffer, const struct radv_shader *shader)
1998 {
1999 const struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
2000 const struct radv_physical_device *pdev = radv_device_physical(device);
2001 const uint64_t va = radv_shader_get_va(shader);
2002
2003 radeon_set_sh_reg_seq(cmd_buffer->cs, shader->info.regs.pgm_lo, 4);
2004 radeon_emit(cmd_buffer->cs, va >> 8);
2005 radeon_emit(cmd_buffer->cs, S_00B124_MEM_BASE(va >> 40));
2006 radeon_emit(cmd_buffer->cs, shader->config.rsrc1);
2007 radeon_emit(cmd_buffer->cs, shader->config.rsrc2);
2008
2009 radeon_opt_set_context_reg(cmd_buffer, R_0286C4_SPI_VS_OUT_CONFIG, RADV_TRACKED_SPI_VS_OUT_CONFIG,
2010 shader->info.regs.spi_vs_out_config);
2011 radeon_opt_set_context_reg(cmd_buffer, R_02870C_SPI_SHADER_POS_FORMAT, RADV_TRACKED_SPI_SHADER_POS_FORMAT,
2012 shader->info.regs.spi_shader_pos_format);
2013 radeon_opt_set_context_reg(cmd_buffer, R_02881C_PA_CL_VS_OUT_CNTL, RADV_TRACKED_PA_CL_VS_OUT_CNTL,
2014 shader->info.regs.pa_cl_vs_out_cntl);
2015
2016 if (pdev->info.gfx_level <= GFX8)
2017 radeon_opt_set_context_reg(cmd_buffer, R_028AB4_VGT_REUSE_OFF, RADV_TRACKED_VGT_REUSE_OFF,
2018 shader->info.regs.vs.vgt_reuse_off);
2019
2020 if (pdev->info.gfx_level >= GFX7) {
2021 radeon_set_sh_reg_idx(&pdev->info, cmd_buffer->cs, R_00B118_SPI_SHADER_PGM_RSRC3_VS, 3,
2022 shader->info.regs.vs.spi_shader_pgm_rsrc3_vs);
2023 radeon_set_sh_reg(cmd_buffer->cs, R_00B11C_SPI_SHADER_LATE_ALLOC_VS,
2024 shader->info.regs.vs.spi_shader_late_alloc_vs);
2025
2026 if (pdev->info.gfx_level >= GFX10) {
2027 radeon_set_uconfig_reg(cmd_buffer->cs, R_030980_GE_PC_ALLOC, shader->info.regs.ge_pc_alloc);
2028
2029 if (shader->info.stage == MESA_SHADER_TESS_EVAL) {
2030 radeon_opt_set_context_reg(cmd_buffer, R_028A44_VGT_GS_ONCHIP_CNTL, RADV_TRACKED_VGT_GS_ONCHIP_CNTL,
2031 shader->info.regs.vgt_gs_onchip_cntl);
2032 }
2033 }
2034 }
2035 }
2036
2037 static void
2038 radv_emit_hw_es(struct radv_cmd_buffer *cmd_buffer, const struct radv_shader *shader)
2039 {
2040 const uint64_t va = radv_shader_get_va(shader);
2041
2042 radeon_set_sh_reg_seq(cmd_buffer->cs, shader->info.regs.pgm_lo, 4);
2043 radeon_emit(cmd_buffer->cs, va >> 8);
2044 radeon_emit(cmd_buffer->cs, S_00B324_MEM_BASE(va >> 40));
2045 radeon_emit(cmd_buffer->cs, shader->config.rsrc1);
2046 radeon_emit(cmd_buffer->cs, shader->config.rsrc2);
2047 }
2048
2049 static void
2050 radv_emit_hw_ls(struct radv_cmd_buffer *cmd_buffer, const struct radv_shader *shader)
2051 {
2052 const uint64_t va = radv_shader_get_va(shader);
2053
2054 radeon_set_sh_reg(cmd_buffer->cs, shader->info.regs.pgm_lo, va >> 8);
2055
2056 radeon_set_sh_reg(cmd_buffer->cs, shader->info.regs.pgm_rsrc1, shader->config.rsrc1);
2057 }
2058
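/* Program the hardware GS stage for NGG: this path is used for NGG VS/TES, mesh shaders and
 * geometry shaders running as primitive shaders.
 */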
2059 static void
2060 radv_emit_hw_ngg(struct radv_cmd_buffer *cmd_buffer, const struct radv_shader *es, const struct radv_shader *shader)
2061 {
2062 const struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
2063 const struct radv_physical_device *pdev = radv_device_physical(device);
2064 const uint64_t va = radv_shader_get_va(shader);
2065 gl_shader_stage es_type;
2066 const struct gfx10_ngg_info *ngg_state = &shader->info.ngg_info;
2067
2068 if (shader->info.stage == MESA_SHADER_GEOMETRY) {
2069 if (shader->info.merged_shader_compiled_separately) {
2070 es_type = es->info.stage;
2071 } else {
2072 es_type = shader->info.gs.es_type;
2073 }
2074 } else {
2075 es_type = shader->info.stage;
2076 }
2077
2078 if (!shader->info.merged_shader_compiled_separately) {
2079 radeon_set_sh_reg(cmd_buffer->cs, shader->info.regs.pgm_lo, va >> 8);
2080
2081 radeon_set_sh_reg_seq(cmd_buffer->cs, shader->info.regs.pgm_rsrc1, 2);
2082 radeon_emit(cmd_buffer->cs, shader->config.rsrc1);
2083 radeon_emit(cmd_buffer->cs, shader->config.rsrc2);
2084 }
2085
2086 const struct radv_vs_output_info *outinfo = &shader->info.outinfo;
2087
2088 const bool es_enable_prim_id = outinfo->export_prim_id || (es && es->info.uses_prim_id);
2089 bool break_wave_at_eoi = false;
2090
2091 if (es_type == MESA_SHADER_TESS_EVAL) {
2092 if (es_enable_prim_id || (shader->info.uses_prim_id))
2093 break_wave_at_eoi = true;
2094 }
2095
2096 if (pdev->info.gfx_level >= GFX12) {
2097 radeon_opt_set_context_reg(cmd_buffer, R_028818_PA_CL_VS_OUT_CNTL, RADV_TRACKED_PA_CL_VS_OUT_CNTL,
2098 shader->info.regs.pa_cl_vs_out_cntl);
2099
2100 radeon_opt_set_context_reg(cmd_buffer, R_028B3C_VGT_GS_INSTANCE_CNT, RADV_TRACKED_VGT_GS_INSTANCE_CNT,
2101 shader->info.regs.vgt_gs_instance_cnt);
2102
2103 radeon_set_uconfig_reg(cmd_buffer->cs, R_030988_VGT_PRIMITIVEID_EN, shader->info.regs.ngg.vgt_primitiveid_en);
2104
2105 radeon_opt_set_context_reg2(cmd_buffer, R_028648_SPI_SHADER_IDX_FORMAT, RADV_TRACKED_SPI_SHADER_IDX_FORMAT,
2106 shader->info.regs.ngg.spi_shader_idx_format, shader->info.regs.spi_shader_pos_format);
2107 } else {
2108 radeon_opt_set_context_reg(cmd_buffer, R_02881C_PA_CL_VS_OUT_CNTL, RADV_TRACKED_PA_CL_VS_OUT_CNTL,
2109 shader->info.regs.pa_cl_vs_out_cntl);
2110
2111 radeon_opt_set_context_reg(cmd_buffer, R_028B90_VGT_GS_INSTANCE_CNT, RADV_TRACKED_VGT_GS_INSTANCE_CNT,
2112 shader->info.regs.vgt_gs_instance_cnt);
2113
2114 radeon_opt_set_context_reg(cmd_buffer, R_028A84_VGT_PRIMITIVEID_EN, RADV_TRACKED_VGT_PRIMITIVEID_EN,
2115 shader->info.regs.ngg.vgt_primitiveid_en | S_028A84_PRIMITIVEID_EN(es_enable_prim_id));
2116
2117 radeon_opt_set_context_reg2(cmd_buffer, R_028708_SPI_SHADER_IDX_FORMAT, RADV_TRACKED_SPI_SHADER_IDX_FORMAT,
2118 shader->info.regs.ngg.spi_shader_idx_format, shader->info.regs.spi_shader_pos_format);
2119
2120 radeon_opt_set_context_reg(cmd_buffer, R_0286C4_SPI_VS_OUT_CONFIG, RADV_TRACKED_SPI_VS_OUT_CONFIG,
2121 shader->info.regs.spi_vs_out_config);
2122 }
2123
2124 radeon_opt_set_context_reg(cmd_buffer, R_0287FC_GE_MAX_OUTPUT_PER_SUBGROUP, RADV_TRACKED_GE_MAX_OUTPUT_PER_SUBGROUP,
2125 shader->info.regs.ngg.ge_max_output_per_subgroup);
2126
2127 radeon_opt_set_context_reg(cmd_buffer, R_028B4C_GE_NGG_SUBGRP_CNTL, RADV_TRACKED_GE_NGG_SUBGRP_CNTL,
2128 shader->info.regs.ngg.ge_ngg_subgrp_cntl);
2129
2130 uint32_t ge_cntl = shader->info.regs.ngg.ge_cntl;
2131 if (pdev->info.gfx_level >= GFX11) {
2132 ge_cntl |= S_03096C_BREAK_PRIMGRP_AT_EOI(break_wave_at_eoi);
2133 } else {
2134 ge_cntl |= S_03096C_BREAK_WAVE_AT_EOI(break_wave_at_eoi);
2135
/* Bug workaround for a possible hang in non-tessellation cases.
2137 * Tessellation always sets GE_CNTL.VERT_GRP_SIZE = 0
2138 *
2139 * Requirement: GE_CNTL.VERT_GRP_SIZE = VGT_GS_ONCHIP_CNTL.ES_VERTS_PER_SUBGRP - 5
2140 */
2141 if (pdev->info.gfx_level == GFX10 && es_type != MESA_SHADER_TESS_EVAL && ngg_state->hw_max_esverts != 256) {
2142 ge_cntl &= C_03096C_VERT_GRP_SIZE;
2143
2144 if (ngg_state->hw_max_esverts > 5) {
2145 ge_cntl |= S_03096C_VERT_GRP_SIZE(ngg_state->hw_max_esverts - 5);
2146 }
2147 }
2148
2149 radeon_opt_set_context_reg(cmd_buffer, R_028A44_VGT_GS_ONCHIP_CNTL, RADV_TRACKED_VGT_GS_ONCHIP_CNTL,
2150 shader->info.regs.vgt_gs_onchip_cntl);
2151 }
2152
2153 radeon_set_uconfig_reg(cmd_buffer->cs, R_03096C_GE_CNTL, ge_cntl);
2154
2155 if (pdev->info.gfx_level >= GFX12) {
2156 radeon_set_sh_reg(cmd_buffer->cs, R_00B220_SPI_SHADER_PGM_RSRC4_GS, shader->info.regs.spi_shader_pgm_rsrc4_gs);
2157 } else {
2158 if (pdev->info.gfx_level >= GFX7) {
2159 radeon_set_sh_reg_idx(&pdev->info, cmd_buffer->cs, R_00B21C_SPI_SHADER_PGM_RSRC3_GS, 3,
2160 shader->info.regs.spi_shader_pgm_rsrc3_gs);
2161 }
2162
2163 radeon_set_sh_reg_idx(&pdev->info, cmd_buffer->cs, R_00B204_SPI_SHADER_PGM_RSRC4_GS, 3,
2164 shader->info.regs.spi_shader_pgm_rsrc4_gs);
2165
2166 radeon_set_uconfig_reg(cmd_buffer->cs, R_030980_GE_PC_ALLOC, shader->info.regs.ge_pc_alloc);
2167 }
2168 }
2169
2170 static void
2171 radv_emit_hw_hs(struct radv_cmd_buffer *cmd_buffer, const struct radv_shader *shader)
2172 {
2173 const struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
2174 const struct radv_physical_device *pdev = radv_device_physical(device);
2175 const uint64_t va = radv_shader_get_va(shader);
2176
2177 if (pdev->info.gfx_level >= GFX9) {
2178 radeon_set_sh_reg(cmd_buffer->cs, shader->info.regs.pgm_lo, va >> 8);
2179 radeon_set_sh_reg(cmd_buffer->cs, shader->info.regs.pgm_rsrc1, shader->config.rsrc1);
2180 } else {
2181 radeon_set_sh_reg_seq(cmd_buffer->cs, shader->info.regs.pgm_lo, 4);
2182 radeon_emit(cmd_buffer->cs, va >> 8);
2183 radeon_emit(cmd_buffer->cs, S_00B424_MEM_BASE(va >> 40));
2184 radeon_emit(cmd_buffer->cs, shader->config.rsrc1);
2185 radeon_emit(cmd_buffer->cs, shader->config.rsrc2);
2186 }
2187 }
2188
2189 static void
2190 radv_emit_vertex_shader(struct radv_cmd_buffer *cmd_buffer)
2191 {
2192 const struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
2193 const struct radv_physical_device *pdev = radv_device_physical(device);
2194 const struct radv_shader *vs = cmd_buffer->state.shaders[MESA_SHADER_VERTEX];
2195
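/* When the VS is compiled separately from its merged next stage (TCS or GS on GFX9+), emit the
 * combined RSRC registers here (unless a VS prolog handles them) and pass the next stage's
 * entry point through a user SGPR.
 */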
2196 if (vs->info.merged_shader_compiled_separately) {
2197 assert(vs->info.next_stage == MESA_SHADER_TESS_CTRL || vs->info.next_stage == MESA_SHADER_GEOMETRY);
2198
2199 const struct radv_shader *next_stage = cmd_buffer->state.shaders[vs->info.next_stage];
2200
2201 if (!vs->info.vs.has_prolog) {
2202 uint32_t rsrc1, rsrc2;
2203
2204 radeon_set_sh_reg(cmd_buffer->cs, vs->info.regs.pgm_lo, vs->va >> 8);
2205
2206 if (vs->info.next_stage == MESA_SHADER_TESS_CTRL) {
2207 radv_shader_combine_cfg_vs_tcs(vs, next_stage, &rsrc1, NULL);
2208
2209 radeon_set_sh_reg(cmd_buffer->cs, vs->info.regs.pgm_rsrc1, rsrc1);
2210 } else {
2211 radv_shader_combine_cfg_vs_gs(vs, next_stage, &rsrc1, &rsrc2);
2212
2213 unsigned lds_size;
2214 if (next_stage->info.is_ngg) {
2215 lds_size = DIV_ROUND_UP(next_stage->info.ngg_info.lds_size, pdev->info.lds_encode_granularity);
2216 } else {
2217 lds_size = next_stage->info.gs_ring_info.lds_size;
2218 }
2219
2220 radeon_set_sh_reg_seq(cmd_buffer->cs, vs->info.regs.pgm_rsrc1, 2);
2221 radeon_emit(cmd_buffer->cs, rsrc1);
2222 radeon_emit(cmd_buffer->cs, rsrc2 | S_00B22C_LDS_SIZE(lds_size));
2223 }
2224 }
2225
2226 const uint32_t next_stage_pc_offset = radv_get_user_sgpr_loc(vs, AC_UD_NEXT_STAGE_PC);
2227 radv_emit_shader_pointer(device, cmd_buffer->cs, next_stage_pc_offset, next_stage->va, false);
2228 return;
2229 }
2230
2231 if (vs->info.vs.as_ls)
2232 radv_emit_hw_ls(cmd_buffer, vs);
2233 else if (vs->info.vs.as_es)
2234 radv_emit_hw_es(cmd_buffer, vs);
2235 else if (vs->info.is_ngg)
2236 radv_emit_hw_ngg(cmd_buffer, NULL, vs);
2237 else
2238 radv_emit_hw_vs(cmd_buffer, vs);
2239 }
2240
2241 static void
2242 radv_emit_tess_ctrl_shader(struct radv_cmd_buffer *cmd_buffer)
2243 {
2244 const struct radv_shader *tcs = cmd_buffer->state.shaders[MESA_SHADER_TESS_CTRL];
2245
2246 if (tcs->info.merged_shader_compiled_separately) {
2247 /* When VS+TCS are compiled separately on GFX9+, the VS will jump to the TCS and everything is
2248 * emitted as part of the VS.
2249 */
2250 return;
2251 }
2252
2253 radv_emit_hw_hs(cmd_buffer, tcs);
2254 }
2255
2256 static void
2257 radv_emit_tess_eval_shader(struct radv_cmd_buffer *cmd_buffer)
2258 {
2259 const struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
2260 const struct radv_physical_device *pdev = radv_device_physical(device);
2261 const struct radv_shader *tes = cmd_buffer->state.shaders[MESA_SHADER_TESS_EVAL];
2262
2263 if (tes->info.merged_shader_compiled_separately) {
2264 assert(tes->info.next_stage == MESA_SHADER_GEOMETRY);
2265
2266 const struct radv_shader *gs = cmd_buffer->state.shaders[MESA_SHADER_GEOMETRY];
2267 uint32_t rsrc1, rsrc2;
2268
2269 radv_shader_combine_cfg_tes_gs(tes, gs, &rsrc1, &rsrc2);
2270
2271 radeon_set_sh_reg(cmd_buffer->cs, tes->info.regs.pgm_lo, tes->va >> 8);
2272
2273 unsigned lds_size;
2274 if (gs->info.is_ngg) {
2275 lds_size = DIV_ROUND_UP(gs->info.ngg_info.lds_size, pdev->info.lds_encode_granularity);
2276 } else {
2277 lds_size = gs->info.gs_ring_info.lds_size;
2278 }
2279
2280 radeon_set_sh_reg_seq(cmd_buffer->cs, tes->info.regs.pgm_rsrc1, 2);
2281 radeon_emit(cmd_buffer->cs, rsrc1);
2282 radeon_emit(cmd_buffer->cs, rsrc2 | S_00B22C_LDS_SIZE(lds_size));
2283
2284 const uint32_t next_stage_pc_offset = radv_get_user_sgpr_loc(tes, AC_UD_NEXT_STAGE_PC);
2285 radv_emit_shader_pointer(device, cmd_buffer->cs, next_stage_pc_offset, gs->va, false);
2286 return;
2287 }
2288
2289 if (tes->info.is_ngg) {
2290 radv_emit_hw_ngg(cmd_buffer, NULL, tes);
2291 } else if (tes->info.tes.as_es) {
2292 radv_emit_hw_es(cmd_buffer, tes);
2293 } else {
2294 radv_emit_hw_vs(cmd_buffer, tes);
2295 }
2296 }
2297
2298 static void
2299 radv_emit_hw_gs(struct radv_cmd_buffer *cmd_buffer, const struct radv_shader *gs)
2300 {
2301 const struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
2302 const struct radv_physical_device *pdev = radv_device_physical(device);
2303 const struct radv_legacy_gs_info *gs_state = &gs->info.gs_ring_info;
2304 const uint64_t va = radv_shader_get_va(gs);
2305
2306 radeon_opt_set_context_reg3(cmd_buffer, R_028A60_VGT_GSVS_RING_OFFSET_1, RADV_TRACKED_VGT_GSVS_RING_OFFSET_1,
2307 gs->info.regs.gs.vgt_gsvs_ring_offset[0], gs->info.regs.gs.vgt_gsvs_ring_offset[1],
2308 gs->info.regs.gs.vgt_gsvs_ring_offset[2]);
2309
2310 radeon_opt_set_context_reg(cmd_buffer, R_028AB0_VGT_GSVS_RING_ITEMSIZE, RADV_TRACKED_VGT_GSVS_RING_ITEMSIZE,
2311 gs->info.regs.gs.vgt_gsvs_ring_itemsize);
2312
2313 radeon_opt_set_context_reg4(cmd_buffer, R_028B5C_VGT_GS_VERT_ITEMSIZE, RADV_TRACKED_VGT_GS_VERT_ITEMSIZE,
2314 gs->info.regs.gs.vgt_gs_vert_itemsize[0], gs->info.regs.gs.vgt_gs_vert_itemsize[1],
2315 gs->info.regs.gs.vgt_gs_vert_itemsize[2], gs->info.regs.gs.vgt_gs_vert_itemsize[3]);
2316
2317 radeon_opt_set_context_reg(cmd_buffer, R_028B90_VGT_GS_INSTANCE_CNT, RADV_TRACKED_VGT_GS_INSTANCE_CNT,
2318 gs->info.regs.gs.vgt_gs_instance_cnt);
2319
2320 if (pdev->info.gfx_level >= GFX9) {
2321 if (!gs->info.merged_shader_compiled_separately) {
2322 radeon_set_sh_reg(cmd_buffer->cs, gs->info.regs.pgm_lo, va >> 8);
2323
2324 radeon_set_sh_reg_seq(cmd_buffer->cs, gs->info.regs.pgm_rsrc1, 2);
2325 radeon_emit(cmd_buffer->cs, gs->config.rsrc1);
2326 radeon_emit(cmd_buffer->cs, gs->config.rsrc2 | S_00B22C_LDS_SIZE(gs_state->lds_size));
2327 }
2328
2329 radeon_opt_set_context_reg(cmd_buffer, R_028A44_VGT_GS_ONCHIP_CNTL, RADV_TRACKED_VGT_GS_ONCHIP_CNTL,
2330 gs->info.regs.vgt_gs_onchip_cntl);
2331
2332 if (pdev->info.gfx_level == GFX9) {
2333 radeon_opt_set_context_reg(cmd_buffer, R_028A94_VGT_GS_MAX_PRIMS_PER_SUBGROUP,
2334 RADV_TRACKED_VGT_GS_MAX_PRIMS_PER_SUBGROUP,
2335 gs->info.regs.gs.vgt_gs_max_prims_per_subgroup);
2336 }
2337 } else {
2338 radeon_set_sh_reg_seq(cmd_buffer->cs, gs->info.regs.pgm_lo, 4);
2339 radeon_emit(cmd_buffer->cs, va >> 8);
2340 radeon_emit(cmd_buffer->cs, S_00B224_MEM_BASE(va >> 40));
2341 radeon_emit(cmd_buffer->cs, gs->config.rsrc1);
2342 radeon_emit(cmd_buffer->cs, gs->config.rsrc2);
2343
2344 /* GFX6-8: ESGS offchip ring buffer is allocated according to VGT_ESGS_RING_ITEMSIZE.
2345 * GFX9+: Only used to set the GS input VGPRs, emulated in shaders.
2346 */
2347 radeon_opt_set_context_reg(cmd_buffer, R_028AAC_VGT_ESGS_RING_ITEMSIZE, RADV_TRACKED_VGT_ESGS_RING_ITEMSIZE,
2348 gs->info.regs.gs.vgt_esgs_ring_itemsize);
2349 }
2350
2351 if (pdev->info.gfx_level >= GFX7) {
2352 radeon_set_sh_reg_idx(&pdev->info, cmd_buffer->cs, R_00B21C_SPI_SHADER_PGM_RSRC3_GS, 3,
2353 gs->info.regs.spi_shader_pgm_rsrc3_gs);
2354 }
2355
2356 if (pdev->info.gfx_level >= GFX10) {
2357 radeon_set_sh_reg_idx(&pdev->info, cmd_buffer->cs, R_00B204_SPI_SHADER_PGM_RSRC4_GS, 3,
2358 gs->info.regs.spi_shader_pgm_rsrc4_gs);
2359 }
2360 }
2361
2362 static void
2363 radv_emit_geometry_shader(struct radv_cmd_buffer *cmd_buffer)
2364 {
2365 const struct radv_shader *gs = cmd_buffer->state.shaders[MESA_SHADER_GEOMETRY];
2366 const struct radv_shader *es = cmd_buffer->state.shaders[MESA_SHADER_TESS_EVAL]
2367 ? cmd_buffer->state.shaders[MESA_SHADER_TESS_EVAL]
2368 : cmd_buffer->state.shaders[MESA_SHADER_VERTEX];
2369 if (gs->info.is_ngg) {
2370 radv_emit_hw_ngg(cmd_buffer, es, gs);
2371 } else {
2372 radv_emit_hw_gs(cmd_buffer, gs);
2373 radv_emit_hw_vs(cmd_buffer, cmd_buffer->state.gs_copy_shader);
2374 }
2375
2376 radeon_opt_set_context_reg(cmd_buffer, R_028B38_VGT_GS_MAX_VERT_OUT, RADV_TRACKED_VGT_GS_MAX_VERT_OUT,
2377 gs->info.regs.vgt_gs_max_vert_out);
2378
2379 if (gs->info.merged_shader_compiled_separately) {
2380 const uint32_t vgt_esgs_ring_itemsize_offset = radv_get_user_sgpr_loc(gs, AC_UD_VGT_ESGS_RING_ITEMSIZE);
2381
2382 assert(vgt_esgs_ring_itemsize_offset);
2383
2384 radeon_set_sh_reg(cmd_buffer->cs, vgt_esgs_ring_itemsize_offset, es->info.esgs_itemsize / 4);
2385
2386 if (gs->info.is_ngg) {
2387 const uint32_t ngg_lds_layout_offset = radv_get_user_sgpr_loc(gs, AC_UD_NGG_LDS_LAYOUT);
2388
2389 assert(ngg_lds_layout_offset);
2390 assert(!(gs->info.ngg_info.esgs_ring_size & 0xffff0000) && !(gs->info.ngg_info.scratch_lds_base & 0xffff0000));
2391
2392 radeon_set_sh_reg(cmd_buffer->cs, ngg_lds_layout_offset,
2393 SET_SGPR_FIELD(NGG_LDS_LAYOUT_GS_OUT_VERTEX_BASE, gs->info.ngg_info.esgs_ring_size) |
2394 SET_SGPR_FIELD(NGG_LDS_LAYOUT_SCRATCH_BASE, gs->info.ngg_info.scratch_lds_base));
2395 }
2396 }
2397 }
2398
2399 static void
2400 radv_emit_vgt_gs_out(struct radv_cmd_buffer *cmd_buffer, uint32_t vgt_gs_out_prim_type)
2401 {
2402 const struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
2403 const struct radv_physical_device *pdev = radv_device_physical(device);
2404
2405 if (pdev->info.gfx_level >= GFX11) {
2406 radeon_set_uconfig_reg(cmd_buffer->cs, R_030998_VGT_GS_OUT_PRIM_TYPE, vgt_gs_out_prim_type);
2407 } else {
2408 radeon_opt_set_context_reg(cmd_buffer, R_028A6C_VGT_GS_OUT_PRIM_TYPE, RADV_TRACKED_VGT_GS_OUT_PRIM_TYPE,
2409 vgt_gs_out_prim_type);
2410 }
2411 }
2412
2413 static void
2414 radv_emit_mesh_shader(struct radv_cmd_buffer *cmd_buffer)
2415 {
2416 const struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
2417 const struct radv_physical_device *pdev = radv_device_physical(device);
2418 const struct radv_shader *ms = cmd_buffer->state.shaders[MESA_SHADER_MESH];
2419 const uint32_t gs_out = radv_conv_gl_prim_to_gs_out(ms->info.ms.output_prim);
2420
2421 radv_emit_hw_ngg(cmd_buffer, NULL, ms);
2422 radeon_opt_set_context_reg(cmd_buffer, R_028B38_VGT_GS_MAX_VERT_OUT, RADV_TRACKED_VGT_GS_MAX_VERT_OUT,
2423 ms->info.regs.vgt_gs_max_vert_out);
2424 radeon_set_uconfig_reg_idx(&pdev->info, cmd_buffer->cs, R_030908_VGT_PRIMITIVE_TYPE, 1, V_008958_DI_PT_POINTLIST);
2425
2426 if (pdev->mesh_fast_launch_2) {
2427 radeon_set_sh_reg_seq(cmd_buffer->cs, R_00B2B0_SPI_SHADER_GS_MESHLET_DIM, 2);
2428 radeon_emit(cmd_buffer->cs, ms->info.regs.ms.spi_shader_gs_meshlet_dim);
2429 radeon_emit(cmd_buffer->cs, ms->info.regs.ms.spi_shader_gs_meshlet_exp_alloc);
2430 }
2431
2432 radv_emit_vgt_gs_out(cmd_buffer, gs_out);
2433 }
2434
2435 enum radv_ps_in_type {
2436 radv_ps_in_interpolated,
2437 radv_ps_in_flat,
2438 radv_ps_in_explicit,
2439 radv_ps_in_explicit_strict,
2440 radv_ps_in_interpolated_fp16,
2441 radv_ps_in_interpolated_fp16_hi,
2442 radv_ps_in_per_prim_gfx103,
2443 radv_ps_in_per_prim_gfx11,
2444 };
2445
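/* Translate a parameter-cache offset (or a DEFAULT_VAL constant) plus the interpolation type
 * into an SPI_PS_INPUT_CNTL_* value.
 */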
2446 static uint32_t
2447 offset_to_ps_input(const uint32_t offset, const enum radv_ps_in_type type)
2448 {
2449 assert(offset != AC_EXP_PARAM_UNDEFINED);
2450
2451 if (offset >= AC_EXP_PARAM_DEFAULT_VAL_0000 && offset <= AC_EXP_PARAM_DEFAULT_VAL_1111) {
2452 /* The input is a DEFAULT_VAL constant. */
2453 return S_028644_OFFSET(0x20) | S_028644_DEFAULT_VAL(offset - AC_EXP_PARAM_DEFAULT_VAL_0000);
2454 }
2455
2456 assert(offset <= AC_EXP_PARAM_OFFSET_31);
2457 uint32_t ps_input_cntl = S_028644_OFFSET(offset);
2458
2459 switch (type) {
2460 case radv_ps_in_explicit_strict:
2461 /* Rotate parameter cache contents to strict vertex order. */
2462 ps_input_cntl |= S_028644_ROTATE_PC_PTR(1);
2463 FALLTHROUGH;
2464 case radv_ps_in_explicit:
2465 /* Force parameter cache to be read in passthrough mode. */
2466 ps_input_cntl |= S_028644_OFFSET(1 << 5);
2467 FALLTHROUGH;
2468 case radv_ps_in_flat:
2469 ps_input_cntl |= S_028644_FLAT_SHADE(1);
2470 break;
2471 case radv_ps_in_interpolated_fp16_hi:
2472 ps_input_cntl |= S_028644_ATTR1_VALID(1);
2473 FALLTHROUGH;
2474 case radv_ps_in_interpolated_fp16:
2475 /* These must be set even if only the high 16 bits are used. */
2476 ps_input_cntl |= S_028644_FP16_INTERP_MODE(1) | S_028644_ATTR0_VALID(1);
2477 break;
2478 case radv_ps_in_per_prim_gfx11:
2479 ps_input_cntl |= S_028644_PRIM_ATTR(1);
2480 break;
2481 case radv_ps_in_interpolated:
2482 case radv_ps_in_per_prim_gfx103:
2483 break;
2484 }
2485
2486 return ps_input_cntl;
2487 }
2488
2489 static void
2490 slot_to_ps_input(const struct radv_vs_output_info *outinfo, unsigned slot, uint32_t *ps_input_cntl, unsigned *ps_offset,
2491 const bool use_default_0, const enum radv_ps_in_type type)
2492 {
2493 unsigned vs_offset = outinfo->vs_output_param_offset[slot];
2494
2495 if (vs_offset == AC_EXP_PARAM_UNDEFINED) {
2496 if (use_default_0)
2497 vs_offset = AC_EXP_PARAM_DEFAULT_VAL_0000;
2498 else
2499 return;
2500 }
2501
2502 ps_input_cntl[*ps_offset] = offset_to_ps_input(vs_offset, type);
2503 ++(*ps_offset);
2504 }
2505
2506 static void
2507 input_mask_to_ps_inputs(const struct radv_vs_output_info *outinfo, const struct radv_shader *ps, uint32_t input_mask,
2508 uint32_t *ps_input_cntl, unsigned *ps_offset, const enum radv_ps_in_type default_type)
2509 {
2510 u_foreach_bit (i, input_mask) {
2511 unsigned vs_offset = outinfo->vs_output_param_offset[VARYING_SLOT_VAR0 + i];
2512 if (vs_offset == AC_EXP_PARAM_UNDEFINED) {
2513 ps_input_cntl[*ps_offset] = S_028644_OFFSET(0x20);
2514 ++(*ps_offset);
2515 continue;
2516 }
2517
2518 enum radv_ps_in_type type = default_type;
2519
2520 if (ps->info.ps.explicit_shaded_mask & BITFIELD_BIT(*ps_offset))
2521 type = radv_ps_in_explicit;
2522 else if (ps->info.ps.explicit_strict_shaded_mask & BITFIELD_BIT(*ps_offset))
2523 type = radv_ps_in_explicit_strict;
2524 else if (ps->info.ps.float16_hi_shaded_mask & BITFIELD_BIT(*ps_offset))
2525 type = radv_ps_in_interpolated_fp16_hi;
2526 else if (ps->info.ps.float16_shaded_mask & BITFIELD_BIT(*ps_offset))
2527 type = radv_ps_in_interpolated_fp16;
2528 else if (ps->info.ps.float32_shaded_mask & BITFIELD_BIT(*ps_offset))
2529 type = radv_ps_in_interpolated;
2530
2531 ps_input_cntl[*ps_offset] = offset_to_ps_input(vs_offset, type);
2532 ++(*ps_offset);
2533 }
2534 }
2535
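/* Build and emit the SPI_PS_INPUT_CNTL_* words that map each PS input to the parameter exports
 * of the last pre-rasterization shader; per-primitive inputs are appended last, as the hardware
 * requires.
 */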
2536 static void
2537 radv_emit_ps_inputs(struct radv_cmd_buffer *cmd_buffer)
2538 {
2539 const struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
2540 const struct radv_physical_device *pdev = radv_device_physical(device);
2541 const struct radv_shader *ps = cmd_buffer->state.shaders[MESA_SHADER_FRAGMENT];
2542 const struct radv_shader *last_vgt_shader = cmd_buffer->state.last_vgt_shader;
2543 const struct radv_vs_output_info *outinfo = &last_vgt_shader->info.outinfo;
2544 const bool mesh = last_vgt_shader->info.stage == MESA_SHADER_MESH;
2545 const bool gfx11plus = pdev->info.gfx_level >= GFX11;
2546 const enum radv_ps_in_type per_prim = gfx11plus ? radv_ps_in_per_prim_gfx11 : radv_ps_in_per_prim_gfx103;
2547
2548 uint32_t ps_input_cntl[32];
2549 unsigned ps_offset = 0;
2550
2551 if (!mesh) {
2552 if (ps->info.ps.prim_id_input)
2553 slot_to_ps_input(outinfo, VARYING_SLOT_PRIMITIVE_ID, ps_input_cntl, &ps_offset, false, radv_ps_in_flat);
2554
2555 if (ps->info.ps.layer_input)
2556 slot_to_ps_input(outinfo, VARYING_SLOT_LAYER, ps_input_cntl, &ps_offset, true, radv_ps_in_flat);
2557
2558 if (ps->info.ps.viewport_index_input)
2559 slot_to_ps_input(outinfo, VARYING_SLOT_VIEWPORT, ps_input_cntl, &ps_offset, true, radv_ps_in_flat);
2560 }
2561
2562 if (ps->info.ps.has_pcoord)
2563 ps_input_cntl[ps_offset++] = S_028644_PT_SPRITE_TEX(1) | S_028644_OFFSET(0x20);
2564
2565 if (ps->info.ps.input_clips_culls_mask & 0x0f)
2566 slot_to_ps_input(outinfo, VARYING_SLOT_CLIP_DIST0, ps_input_cntl, &ps_offset, false, radv_ps_in_interpolated);
2567
2568 if (ps->info.ps.input_clips_culls_mask & 0xf0)
2569 slot_to_ps_input(outinfo, VARYING_SLOT_CLIP_DIST1, ps_input_cntl, &ps_offset, false, radv_ps_in_interpolated);
2570
2571 input_mask_to_ps_inputs(outinfo, ps, ps->info.ps.input_mask, ps_input_cntl, &ps_offset, radv_ps_in_flat);
2572
2573 /* Per-primitive PS inputs: the HW needs these to be last. */
2574 if (mesh) {
2575 if (ps->info.ps.prim_id_input)
2576 slot_to_ps_input(outinfo, VARYING_SLOT_PRIMITIVE_ID, ps_input_cntl, &ps_offset, false, per_prim);
2577
2578 if (ps->info.ps.layer_input)
2579 slot_to_ps_input(outinfo, VARYING_SLOT_LAYER, ps_input_cntl, &ps_offset, true, per_prim);
2580
2581 if (ps->info.ps.viewport_index_input)
2582 slot_to_ps_input(outinfo, VARYING_SLOT_VIEWPORT, ps_input_cntl, &ps_offset, true, per_prim);
2583 }
2584
2585 input_mask_to_ps_inputs(outinfo, ps, ps->info.ps.input_per_primitive_mask, ps_input_cntl, &ps_offset, per_prim);
2586
2587 if (pdev->info.gfx_level >= GFX12) {
2588 radeon_set_sh_reg(cmd_buffer->cs, R_00B0C4_SPI_SHADER_GS_OUT_CONFIG_PS,
2589 last_vgt_shader->info.regs.spi_vs_out_config | ps->info.regs.ps.spi_gs_out_config_ps);
2590
2591 radeon_opt_set_context_regn(cmd_buffer, R_028664_SPI_PS_INPUT_CNTL_0, ps_input_cntl,
2592 cmd_buffer->tracked_regs.spi_ps_input_cntl, ps_offset);
2593 } else {
2594 radeon_opt_set_context_regn(cmd_buffer, R_028644_SPI_PS_INPUT_CNTL_0, ps_input_cntl,
2595 cmd_buffer->tracked_regs.spi_ps_input_cntl, ps_offset);
2596 }
2597 }
2598
2599 static void
2600 radv_emit_fragment_shader(struct radv_cmd_buffer *cmd_buffer)
2601 {
2602 const struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
2603 const struct radv_physical_device *pdev = radv_device_physical(device);
2604 const struct radv_shader *ps = cmd_buffer->state.shaders[MESA_SHADER_FRAGMENT];
2605 const uint64_t va = radv_shader_get_va(ps);
2606
2607 radeon_set_sh_reg_seq(cmd_buffer->cs, ps->info.regs.pgm_lo, 4);
2608 radeon_emit(cmd_buffer->cs, va >> 8);
2609 radeon_emit(cmd_buffer->cs, S_00B024_MEM_BASE(va >> 40));
2610 radeon_emit(cmd_buffer->cs, ps->config.rsrc1);
2611 radeon_emit(cmd_buffer->cs, ps->config.rsrc2);
2612
2613 if (pdev->info.gfx_level >= GFX12) {
2614 radeon_opt_set_context_reg2(cmd_buffer, R_02865C_SPI_PS_INPUT_ENA, RADV_TRACKED_SPI_PS_INPUT_ENA,
2615 ps->config.spi_ps_input_ena, ps->config.spi_ps_input_addr);
2616
2617 radeon_opt_set_context_reg(cmd_buffer, R_028640_SPI_PS_IN_CONTROL, RADV_TRACKED_SPI_PS_IN_CONTROL,
2618 ps->info.regs.ps.spi_ps_in_control);
2619
2620 radeon_set_context_reg(cmd_buffer->cs, R_028650_SPI_SHADER_Z_FORMAT, ps->info.regs.ps.spi_shader_z_format);
2621
2622 radeon_set_context_reg(cmd_buffer->cs, R_028BBC_PA_SC_HISZ_CONTROL, ps->info.regs.ps.pa_sc_hisz_control);
2623 } else {
2624 radeon_opt_set_context_reg2(cmd_buffer, R_0286CC_SPI_PS_INPUT_ENA, RADV_TRACKED_SPI_PS_INPUT_ENA,
2625 ps->config.spi_ps_input_ena, ps->config.spi_ps_input_addr);
2626
2627 radeon_opt_set_context_reg(cmd_buffer, R_0286D8_SPI_PS_IN_CONTROL, RADV_TRACKED_SPI_PS_IN_CONTROL,
2628 ps->info.regs.ps.spi_ps_in_control);
2629
2630 radeon_opt_set_context_reg(cmd_buffer, R_028710_SPI_SHADER_Z_FORMAT, RADV_TRACKED_SPI_SHADER_Z_FORMAT,
2631 ps->info.regs.ps.spi_shader_z_format);
2632
2633 if (pdev->info.gfx_level >= GFX9 && pdev->info.gfx_level < GFX11)
2634 radeon_opt_set_context_reg(cmd_buffer, R_028C40_PA_SC_SHADER_CONTROL, RADV_TRACKED_PA_SC_SHADER_CONTROL,
2635 ps->info.regs.ps.pa_sc_shader_control);
2636 }
2637 }
2638
2639 static void
2640 radv_emit_vgt_reuse(struct radv_cmd_buffer *cmd_buffer, const struct radv_vgt_shader_key *key)
2641 {
2642 const struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
2643 const struct radv_physical_device *pdev = radv_device_physical(device);
2644 const struct radv_shader *tes = radv_get_shader(cmd_buffer->state.shaders, MESA_SHADER_TESS_EVAL);
2645
2646 if (pdev->info.gfx_level == GFX10_3) {
2647 /* Legacy Tess+GS should disable reuse to prevent hangs on GFX10.3. */
2648 const bool has_legacy_tess_gs = key->tess && key->gs && !key->ngg;
2649
2650 radeon_opt_set_context_reg(cmd_buffer, R_028AB4_VGT_REUSE_OFF, RADV_TRACKED_VGT_REUSE_OFF,
2651 S_028AB4_REUSE_OFF(has_legacy_tess_gs));
2652 }
2653
2654 if (pdev->info.family >= CHIP_POLARIS10 && pdev->info.gfx_level < GFX10) {
2655 unsigned vtx_reuse_depth = 30;
2656 if (tes && tes->info.tes.spacing == TESS_SPACING_FRACTIONAL_ODD) {
2657 vtx_reuse_depth = 14;
2658 }
2659 radeon_opt_set_context_reg(cmd_buffer, R_028C58_VGT_VERTEX_REUSE_BLOCK_CNTL,
2660 RADV_TRACKED_VGT_VERTEX_REUSE_BLOCK_CNTL, S_028C58_VTX_REUSE_DEPTH(vtx_reuse_depth));
2661 }
2662 }
2663
2664 static void
2665 radv_emit_vgt_shader_config_gfx12(struct radv_cmd_buffer *cmd_buffer, const struct radv_vgt_shader_key *key)
2666 {
2667 const bool ngg_wave_id_en = key->ngg_streamout || (key->mesh && key->mesh_scratch_ring);
2668 uint32_t stages = 0;
2669
2670 stages |= S_028A98_GS_EN(key->gs) | S_028A98_GS_FAST_LAUNCH(key->mesh) | S_028A98_GS_W32_EN(key->gs_wave32) |
2671 S_028A98_NGG_WAVE_ID_EN(ngg_wave_id_en) | S_028A98_PRIMGEN_PASSTHRU_NO_MSG(key->ngg_passthrough);
2672
2673 if (key->tess)
2674 stages |= S_028A98_HS_EN(1) | S_028A98_HS_W32_EN(key->hs_wave32);
2675
2676 radeon_opt_set_context_reg(cmd_buffer, R_028A98_VGT_SHADER_STAGES_EN, RADV_TRACKED_VGT_SHADER_STAGES_EN, stages);
2677 }
2678
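/* Pre-GFX12 VGT_SHADER_STAGES_EN: select which hardware stages (LS/HS/ES/GS/VS) are active for
 * the bound shaders and whether the NGG path is used.
 */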
2679 static void
2680 radv_emit_vgt_shader_config_gfx6(struct radv_cmd_buffer *cmd_buffer, const struct radv_vgt_shader_key *key)
2681 {
2682 const struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
2683 const struct radv_physical_device *pdev = radv_device_physical(device);
2684 uint32_t stages = 0;
2685
2686 if (key->tess) {
2687 stages |=
2688 S_028B54_LS_EN(V_028B54_LS_STAGE_ON) | S_028B54_HS_EN(1) | S_028B54_DYNAMIC_HS(pdev->info.gfx_level != GFX9);
2689
2690 if (key->gs)
2691 stages |= S_028B54_ES_EN(V_028B54_ES_STAGE_DS) | S_028B54_GS_EN(1);
2692 else if (key->ngg)
2693 stages |= S_028B54_ES_EN(V_028B54_ES_STAGE_DS);
2694 else
2695 stages |= S_028B54_VS_EN(V_028B54_VS_STAGE_DS);
2696 } else if (key->gs) {
2697 stages |= S_028B54_ES_EN(V_028B54_ES_STAGE_REAL) | S_028B54_GS_EN(1);
2698 } else if (key->mesh) {
2699 assert(!key->ngg_passthrough);
2700 unsigned gs_fast_launch = pdev->mesh_fast_launch_2 ? 2 : 1;
2701 stages |=
2702 S_028B54_GS_EN(1) | S_028B54_GS_FAST_LAUNCH(gs_fast_launch) | S_028B54_NGG_WAVE_ID_EN(key->mesh_scratch_ring);
2703 } else if (key->ngg) {
2704 stages |= S_028B54_ES_EN(V_028B54_ES_STAGE_REAL);
2705 }
2706
2707 if (key->ngg) {
2708 stages |= S_028B54_PRIMGEN_EN(1) | S_028B54_NGG_WAVE_ID_EN(key->ngg_streamout) |
2709 S_028B54_PRIMGEN_PASSTHRU_EN(key->ngg_passthrough) |
2710 S_028B54_PRIMGEN_PASSTHRU_NO_MSG(key->ngg_passthrough && pdev->info.family >= CHIP_NAVI23);
2711 } else if (key->gs) {
2712 stages |= S_028B54_VS_EN(V_028B54_VS_STAGE_COPY_SHADER);
2713 }
2714
2715 if (pdev->info.gfx_level >= GFX9)
2716 stages |= S_028B54_MAX_PRIMGRP_IN_WAVE(2);
2717
2718 if (pdev->info.gfx_level >= GFX10) {
2719 stages |= S_028B54_HS_W32_EN(key->hs_wave32) | S_028B54_GS_W32_EN(key->gs_wave32) |
2720 S_028B54_VS_W32_EN(pdev->info.gfx_level < GFX11 && key->vs_wave32);
2721       /* Legacy GS only supports Wave64; the assert below reads as an implication: legacy GS (GS without NGG) implies Wave64. */
2722 assert(!(key->gs && !key->ngg) || !key->gs_wave32);
2723 }
2724
2725 radeon_opt_set_context_reg(cmd_buffer, R_028B54_VGT_SHADER_STAGES_EN, RADV_TRACKED_VGT_SHADER_STAGES_EN, stages);
2726 }
2727
2728 static void
2729 radv_emit_vgt_shader_config(struct radv_cmd_buffer *cmd_buffer, const struct radv_vgt_shader_key *key)
2730 {
2731 const struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
2732 const struct radv_physical_device *pdev = radv_device_physical(device);
2733
2734 if (pdev->info.gfx_level >= GFX12) {
2735 radv_emit_vgt_shader_config_gfx12(cmd_buffer, key);
2736 } else {
2737 radv_emit_vgt_shader_config_gfx6(cmd_buffer, key);
2738 }
2739 }
2740
2741 static void
2742 gfx103_emit_vgt_draw_payload_cntl(struct radv_cmd_buffer *cmd_buffer)
2743 {
2744 const struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
2745 const struct radv_physical_device *pdev = radv_device_physical(device);
2746 const struct radv_shader *mesh_shader = cmd_buffer->state.shaders[MESA_SHADER_MESH];
2747 const bool enable_vrs = cmd_buffer->state.uses_vrs;
2748 bool enable_prim_payload = false;
2749
2750 /* Enables the second channel of the primitive export instruction.
2751 * This channel contains: VRS rate x, y, viewport and layer.
2752 */
2753 if (mesh_shader) {
2754 const struct radv_vs_output_info *outinfo = &mesh_shader->info.outinfo;
2755
2756 enable_prim_payload = (outinfo->writes_viewport_index_per_primitive || outinfo->writes_layer_per_primitive ||
2757 outinfo->writes_primitive_shading_rate_per_primitive);
2758 }
2759
2760 const uint32_t vgt_draw_payload_cntl =
2761 S_028A98_EN_VRS_RATE(enable_vrs) | S_028A98_EN_PRIM_PAYLOAD(enable_prim_payload);
2762
2763 if (pdev->info.gfx_level >= GFX12) {
2764 radeon_opt_set_context_reg(cmd_buffer, R_028AA0_VGT_DRAW_PAYLOAD_CNTL, RADV_TRACKED_VGT_DRAW_PAYLOAD_CNTL,
2765 vgt_draw_payload_cntl);
2766 } else {
2767 radeon_opt_set_context_reg(cmd_buffer, R_028A98_VGT_DRAW_PAYLOAD_CNTL, RADV_TRACKED_VGT_DRAW_PAYLOAD_CNTL,
2768 vgt_draw_payload_cntl);
2769 }
2770 }
2771
2772 static void
2773 gfx103_emit_vrs_state(struct radv_cmd_buffer *cmd_buffer)
2774 {
2775 const struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
2776 const struct radv_physical_device *pdev = radv_device_physical(device);
2777 const struct radv_shader *ps = cmd_buffer->state.shaders[MESA_SHADER_FRAGMENT];
2778 const bool force_vrs_per_vertex = cmd_buffer->state.last_vgt_shader->info.force_vrs_per_vertex;
2779 const bool enable_vrs_coarse_shading = cmd_buffer->state.uses_vrs_coarse_shading;
2780 uint32_t mode = V_028064_SC_VRS_COMB_MODE_PASSTHRU;
2781 uint8_t rate_x = 0, rate_y = 0;
2782
2783 if (enable_vrs_coarse_shading) {
2784       /* When per-draw VRS is not enabled at all, try enabling 2x2 VRS coarse shading if the driver
2785        * has determined that it is safe to do so.
2786        */
2787 mode = V_028064_SC_VRS_COMB_MODE_OVERRIDE;
2788 rate_x = rate_y = 1;
2789 } else if (force_vrs_per_vertex) {
2790 /* Otherwise, if per-draw VRS is not enabled statically, try forcing per-vertex VRS if
2791 * requested by the user. Note that vkd3d-proton always has to declare VRS as dynamic because
2792 * in DX12 it's fully dynamic.
2793 */
2794 radeon_opt_set_context_reg(cmd_buffer, R_028848_PA_CL_VRS_CNTL, RADV_TRACKED_PA_CL_VRS_CNTL,
2795 S_028848_SAMPLE_ITER_COMBINER_MODE(V_028848_SC_VRS_COMB_MODE_OVERRIDE) |
2796 S_028848_VERTEX_RATE_COMBINER_MODE(V_028848_SC_VRS_COMB_MODE_OVERRIDE));
2797
2798 /* If the shader is using discard, turn off coarse shading because discard at 2x2 pixel
2799 * granularity degrades quality too much. MIN allows sample shading but not coarse shading.
2800 */
2801 mode = ps->info.ps.can_discard ? V_028064_SC_VRS_COMB_MODE_MIN : V_028064_SC_VRS_COMB_MODE_PASSTHRU;
2802 }
2803
2804 if (pdev->info.gfx_level < GFX11) {
2805 radeon_opt_set_context_reg(cmd_buffer, R_028064_DB_VRS_OVERRIDE_CNTL, RADV_TRACKED_DB_VRS_OVERRIDE_CNTL,
2806 S_028064_VRS_OVERRIDE_RATE_COMBINER_MODE(mode) | S_028064_VRS_OVERRIDE_RATE_X(rate_x) |
2807 S_028064_VRS_OVERRIDE_RATE_Y(rate_y));
2808 }
2809 }
2810
2811 static void
2812 radv_emit_graphics_shaders(struct radv_cmd_buffer *cmd_buffer)
2813 {
2814 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
2815 const struct radv_physical_device *pdev = radv_device_physical(device);
2816
2817 radv_foreach_stage(s, cmd_buffer->state.active_stages & RADV_GRAPHICS_STAGE_BITS)
2818 {
2819 switch (s) {
2820 case MESA_SHADER_VERTEX:
2821 radv_emit_vertex_shader(cmd_buffer);
2822 break;
2823 case MESA_SHADER_TESS_CTRL:
2824 radv_emit_tess_ctrl_shader(cmd_buffer);
2825 break;
2826 case MESA_SHADER_TESS_EVAL:
2827 radv_emit_tess_eval_shader(cmd_buffer);
2828 break;
2829 case MESA_SHADER_GEOMETRY:
2830 radv_emit_geometry_shader(cmd_buffer);
2831 break;
2832 case MESA_SHADER_FRAGMENT:
2833 radv_emit_fragment_shader(cmd_buffer);
2834 radv_emit_ps_inputs(cmd_buffer);
2835 break;
2836 case MESA_SHADER_MESH:
2837 radv_emit_mesh_shader(cmd_buffer);
2838 break;
2839 case MESA_SHADER_TASK:
2840 radv_emit_compute_shader(pdev, cmd_buffer->gang.cs, cmd_buffer->state.shaders[MESA_SHADER_TASK]);
2841 break;
2842 default:
2843 unreachable("invalid bind stage");
2844 }
2845 }
2846
2847 const struct radv_vgt_shader_key vgt_shader_cfg_key =
2848 radv_get_vgt_shader_key(device, cmd_buffer->state.shaders, cmd_buffer->state.gs_copy_shader);
2849
2850 radv_emit_vgt_gs_mode(cmd_buffer);
2851 radv_emit_vgt_reuse(cmd_buffer, &vgt_shader_cfg_key);
2852 radv_emit_vgt_shader_config(cmd_buffer, &vgt_shader_cfg_key);
2853
2854 if (pdev->info.gfx_level >= GFX10_3) {
2855 gfx103_emit_vgt_draw_payload_cntl(cmd_buffer);
2856 gfx103_emit_vrs_state(cmd_buffer);
2857 }
2858
2859 cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_GRAPHICS_SHADERS;
2860 }
2861
2862 static void
2863 radv_emit_graphics_pipeline(struct radv_cmd_buffer *cmd_buffer)
2864 {
2865 struct radv_graphics_pipeline *pipeline = cmd_buffer->state.graphics_pipeline;
2866 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
2867 const struct radv_physical_device *pdev = radv_device_physical(device);
2868
2869 if (cmd_buffer->state.emitted_graphics_pipeline == pipeline)
2870 return;
2871
2872 if (cmd_buffer->state.emitted_graphics_pipeline) {
2873 if (radv_rast_prim_is_points_or_lines(cmd_buffer->state.emitted_graphics_pipeline->rast_prim) !=
2874 radv_rast_prim_is_points_or_lines(pipeline->rast_prim))
2875 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_GUARDBAND;
2876
2877 if (cmd_buffer->state.emitted_graphics_pipeline->rast_prim != pipeline->rast_prim)
2878 cmd_buffer->state.dirty_dynamic |= RADV_DYNAMIC_PRIMITIVE_TOPOLOGY | RADV_DYNAMIC_RASTERIZATION_SAMPLES;
2879
2880 if (cmd_buffer->state.emitted_graphics_pipeline->ms.min_sample_shading != pipeline->ms.min_sample_shading ||
2881 cmd_buffer->state.emitted_graphics_pipeline->uses_out_of_order_rast != pipeline->uses_out_of_order_rast ||
2882 cmd_buffer->state.emitted_graphics_pipeline->uses_vrs_attachment != pipeline->uses_vrs_attachment)
2883 cmd_buffer->state.dirty_dynamic |= RADV_DYNAMIC_RASTERIZATION_SAMPLES;
2884
2885 if (cmd_buffer->state.emitted_graphics_pipeline->ms.sample_shading_enable != pipeline->ms.sample_shading_enable) {
2886 cmd_buffer->state.dirty_dynamic |= RADV_DYNAMIC_RASTERIZATION_SAMPLES;
2887 if (pdev->info.gfx_level >= GFX10_3)
2888 cmd_buffer->state.dirty_dynamic |= RADV_DYNAMIC_FRAGMENT_SHADING_RATE;
2889 }
2890
2891 if (cmd_buffer->state.emitted_graphics_pipeline->db_render_control != pipeline->db_render_control)
2892 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_FRAMEBUFFER;
2893 }
2894
2895 radv_emit_graphics_shaders(cmd_buffer);
2896
2897 if (device->pbb_allowed) {
2898 const struct radv_binning_settings *settings = &pdev->binning_settings;
2899
2900 if ((!cmd_buffer->state.emitted_graphics_pipeline ||
2901 cmd_buffer->state.emitted_graphics_pipeline->base.shaders[MESA_SHADER_FRAGMENT] !=
2902 cmd_buffer->state.graphics_pipeline->base.shaders[MESA_SHADER_FRAGMENT]) &&
2903 (settings->context_states_per_bin > 1 || settings->persistent_states_per_bin > 1)) {
2904 /* Break the batch on PS changes. */
2905 radeon_emit(cmd_buffer->cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
2906 radeon_emit(cmd_buffer->cs, EVENT_TYPE(V_028A90_BREAK_BATCH) | EVENT_INDEX(0));
2907 }
2908 }
2909
2910 if (pipeline->sqtt_shaders_reloc) {
2911 /* Emit shaders relocation because RGP requires them to be contiguous in memory. */
2912 radv_sqtt_emit_relocated_shaders(cmd_buffer, pipeline);
2913
2914 struct radv_shader *task_shader = cmd_buffer->state.shaders[MESA_SHADER_TASK];
2915 if (task_shader) {
2916 const struct radv_sqtt_shaders_reloc *reloc = pipeline->sqtt_shaders_reloc;
2917 const uint64_t va = reloc->va[MESA_SHADER_TASK];
2918
2919 radeon_set_sh_reg(cmd_buffer->gang.cs, task_shader->info.regs.pgm_lo, va >> 8);
2920 }
2921 }
2922
2923 if (radv_device_fault_detection_enabled(device))
2924 radv_save_pipeline(cmd_buffer, &pipeline->base);
2925
2926 cmd_buffer->state.emitted_graphics_pipeline = pipeline;
2927
2928 cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_PIPELINE;
2929 }
2930
2931 static bool
2932 radv_get_depth_clip_enable(struct radv_cmd_buffer *cmd_buffer)
2933 {
2934 const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
2935
2936 return d->vk.rs.depth_clip_enable == VK_MESA_DEPTH_CLIP_ENABLE_TRUE ||
2937 (d->vk.rs.depth_clip_enable == VK_MESA_DEPTH_CLIP_ENABLE_NOT_CLAMP && !d->vk.rs.depth_clamp_enable);
2938 }
2939
2940 enum radv_depth_clamp_mode {
2941 RADV_DEPTH_CLAMP_MODE_VIEWPORT = 0, /* Clamp to the viewport min/max depth bounds */
2942 RADV_DEPTH_CLAMP_MODE_ZERO_TO_ONE = 1, /* Clamp between 0.0f and 1.0f */
2943 RADV_DEPTH_CLAMP_MODE_DISABLED = 2, /* Disable depth clamping */
2944 };
2945
2946 static enum radv_depth_clamp_mode
2947 radv_get_depth_clamp_mode(struct radv_cmd_buffer *cmd_buffer)
2948 {
2949 const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
2950 bool depth_clip_enable = radv_get_depth_clip_enable(cmd_buffer);
2951 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
2952 enum radv_depth_clamp_mode mode;
2953
2954 mode = RADV_DEPTH_CLAMP_MODE_VIEWPORT;
2955 if (!d->vk.rs.depth_clamp_enable) {
2956 /* For optimal performance, depth clamping should always be enabled except if the application
2957 * disables clamping explicitly or uses depth values outside of the [0.0, 1.0] range.
2958 */
2959 if (!depth_clip_enable || device->vk.enabled_extensions.EXT_depth_range_unrestricted) {
2960 mode = RADV_DEPTH_CLAMP_MODE_DISABLED;
2961 } else {
2962 mode = RADV_DEPTH_CLAMP_MODE_ZERO_TO_ONE;
2963 }
2964 }
2965
2966 return mode;
2967 }
2968
2969 static void
2970 radv_get_viewport_zscale_ztranslate(struct radv_cmd_buffer *cmd_buffer, uint32_t vp_idx, float *zscale,
2971 float *ztranslate)
2972 {
2973 const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
2974
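   /* With VK_EXT_depth_clip_control ([-1, 1] clip space), the window transform
    * is z_w = z_ndc * (maxDepth - minDepth) / 2 + (minDepth + maxDepth) / 2.
    * The cached xform was computed for [0, 1] (scale = maxDepth - minDepth,
    * translate = minDepth), so halving the scale and averaging translate with
    * maxDepth produces exactly those coefficients.
    */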
2975 if (d->vk.vp.depth_clip_negative_one_to_one) {
2976 *zscale = d->hw_vp.xform[vp_idx].scale[2] * 0.5f;
2977 *ztranslate = (d->hw_vp.xform[vp_idx].translate[2] + d->vk.vp.viewports[vp_idx].maxDepth) * 0.5f;
2978 } else {
2979 *zscale = d->hw_vp.xform[vp_idx].scale[2];
2980 *ztranslate = d->hw_vp.xform[vp_idx].translate[2];
2981 }
2982 }
2983
2984 static void
2985 radv_get_viewport_zmin_zmax(struct radv_cmd_buffer *cmd_buffer, const VkViewport *viewport, float *zmin, float *zmax)
2986 {
2987 const enum radv_depth_clamp_mode depth_clamp_mode = radv_get_depth_clamp_mode(cmd_buffer);
2988
2989 if (depth_clamp_mode == RADV_DEPTH_CLAMP_MODE_ZERO_TO_ONE) {
2990 *zmin = 0.0f;
2991 *zmax = 1.0f;
2992 } else {
2993 *zmin = MIN2(viewport->minDepth, viewport->maxDepth);
2994 *zmax = MAX2(viewport->minDepth, viewport->maxDepth);
2995 }
2996 }
2997
2998 static void
2999 radv_emit_viewport(struct radv_cmd_buffer *cmd_buffer)
3000 {
3001 const struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
3002 const struct radv_physical_device *pdev = radv_device_physical(device);
3003 const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
3004
3005 assert(d->vk.vp.viewport_count);
3006
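   /* GFX12 programs zmin/zmax as part of a single 8-dword per-viewport block
    * starting at PA_CL_VPORT_XSCALE; older chips use a 6-dword block plus a
    * separate PA_SC_VPORT_ZMIN/ZMAX pair per viewport.
    */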
3007 if (pdev->info.gfx_level >= GFX12) {
3008 radeon_set_context_reg_seq(cmd_buffer->cs, R_02843C_PA_CL_VPORT_XSCALE, d->vk.vp.viewport_count * 8);
3009
3010 for (unsigned i = 0; i < d->vk.vp.viewport_count; i++) {
3011 float zscale, ztranslate, zmin, zmax;
3012
3013 radv_get_viewport_zscale_ztranslate(cmd_buffer, i, &zscale, &ztranslate);
3014 radv_get_viewport_zmin_zmax(cmd_buffer, &d->vk.vp.viewports[i], &zmin, &zmax);
3015
3016 radeon_emit(cmd_buffer->cs, fui(d->hw_vp.xform[i].scale[0]));
3017 radeon_emit(cmd_buffer->cs, fui(d->hw_vp.xform[i].translate[0]));
3018 radeon_emit(cmd_buffer->cs, fui(d->hw_vp.xform[i].scale[1]));
3019 radeon_emit(cmd_buffer->cs, fui(d->hw_vp.xform[i].translate[1]));
3020 radeon_emit(cmd_buffer->cs, fui(zscale));
3021 radeon_emit(cmd_buffer->cs, fui(ztranslate));
3022 radeon_emit(cmd_buffer->cs, fui(zmin));
3023 radeon_emit(cmd_buffer->cs, fui(zmax));
3024 }
3025 } else {
3026 radeon_set_context_reg_seq(cmd_buffer->cs, R_02843C_PA_CL_VPORT_XSCALE, d->vk.vp.viewport_count * 6);
3027
3028 for (unsigned i = 0; i < d->vk.vp.viewport_count; i++) {
3029 float zscale, ztranslate;
3030
3031 radv_get_viewport_zscale_ztranslate(cmd_buffer, i, &zscale, &ztranslate);
3032
3033 radeon_emit(cmd_buffer->cs, fui(d->hw_vp.xform[i].scale[0]));
3034 radeon_emit(cmd_buffer->cs, fui(d->hw_vp.xform[i].translate[0]));
3035 radeon_emit(cmd_buffer->cs, fui(d->hw_vp.xform[i].scale[1]));
3036 radeon_emit(cmd_buffer->cs, fui(d->hw_vp.xform[i].translate[1]));
3037 radeon_emit(cmd_buffer->cs, fui(zscale));
3038 radeon_emit(cmd_buffer->cs, fui(ztranslate));
3039 }
3040
3041 radeon_set_context_reg_seq(cmd_buffer->cs, R_0282D0_PA_SC_VPORT_ZMIN_0, d->vk.vp.viewport_count * 2);
3042 for (unsigned i = 0; i < d->vk.vp.viewport_count; i++) {
3043 float zmin, zmax;
3044
3045 radv_get_viewport_zmin_zmax(cmd_buffer, &d->vk.vp.viewports[i], &zmin, &zmax);
3046
3047 radeon_emit(cmd_buffer->cs, fui(zmin));
3048 radeon_emit(cmd_buffer->cs, fui(zmax));
3049 }
3050 }
3051 }
3052
3053 static VkRect2D
3054 radv_scissor_from_viewport(const VkViewport *viewport)
3055 {
3056 float scale[3], translate[3];
3057 VkRect2D rect;
3058
3059 radv_get_viewport_xform(viewport, scale, translate);
3060
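   /* The viewport xform stores half-extents in scale[] and the center in
    * translate[], so the screen-space bounding box is translate +/- |scale|;
    * ceilf keeps the right/bottom edges conservative for non-integer bounds.
    */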
3061 rect.offset.x = translate[0] - fabsf(scale[0]);
3062 rect.offset.y = translate[1] - fabsf(scale[1]);
3063 rect.extent.width = ceilf(translate[0] + fabsf(scale[0])) - rect.offset.x;
3064 rect.extent.height = ceilf(translate[1] + fabsf(scale[1])) - rect.offset.y;
3065
3066 return rect;
3067 }
3068
3069 static VkRect2D
3070 radv_intersect_scissor(const VkRect2D *a, const VkRect2D *b)
3071 {
3072 VkRect2D ret;
3073 ret.offset.x = MAX2(a->offset.x, b->offset.x);
3074 ret.offset.y = MAX2(a->offset.y, b->offset.y);
3075 ret.extent.width = MIN2(a->offset.x + a->extent.width, b->offset.x + b->extent.width) - ret.offset.x;
3076 ret.extent.height = MIN2(a->offset.y + a->extent.height, b->offset.y + b->extent.height) - ret.offset.y;
3077 return ret;
3078 }
3079
3080 static void
3081 radv_emit_scissor(struct radv_cmd_buffer *cmd_buffer)
3082 {
3083 const struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
3084 const struct radv_physical_device *pdev = radv_device_physical(device);
3085 const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
3086 struct radeon_cmdbuf *cs = cmd_buffer->cs;
3087
3088 if (!d->vk.vp.scissor_count)
3089 return;
3090
3091 radeon_set_context_reg_seq(cs, R_028250_PA_SC_VPORT_SCISSOR_0_TL, d->vk.vp.scissor_count * 2);
3092 for (unsigned i = 0; i < d->vk.vp.scissor_count; i++) {
3093 VkRect2D viewport_scissor = radv_scissor_from_viewport(d->vk.vp.viewports + i);
3094 VkRect2D scissor = radv_intersect_scissor(&d->vk.vp.scissors[i], &viewport_scissor);
3095
3096 uint32_t minx = scissor.offset.x;
3097 uint32_t miny = scissor.offset.y;
3098 uint32_t maxx = minx + scissor.extent.width;
3099 uint32_t maxy = miny + scissor.extent.height;
3100
3101 if (pdev->info.gfx_level >= GFX12) {
3102 /* On GFX12, an empty scissor must be done like this because the bottom-right bounds are inclusive. */
3103 if (maxx == 0 || maxy == 0) {
3104 minx = miny = maxx = maxy = 1;
3105 }
3106
3107 radeon_emit(cs, S_028250_TL_X(minx) | S_028250_TL_Y_GFX12(miny));
3108 radeon_emit(cs, S_028254_BR_X(maxx - 1) | S_028254_BR_Y(maxy - 1));
3109 } else {
3110 radeon_emit(cs, S_028250_TL_X(minx) | S_028250_TL_Y_GFX6(miny) | S_028250_WINDOW_OFFSET_DISABLE(1));
3111 radeon_emit(cs, S_028254_BR_X(maxx) | S_028254_BR_Y(maxy));
3112 }
3113 }
3114 }
3115
3116 static void
3117 radv_emit_discard_rectangle(struct radv_cmd_buffer *cmd_buffer)
3118 {
3119 const struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
3120 const struct radv_physical_device *pdev = radv_device_physical(device);
3121 const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
3122 uint32_t cliprect_rule = 0;
3123
3124 if (!d->vk.dr.enable) {
3125 cliprect_rule = 0xffff;
3126 } else {
3127 for (unsigned i = 0; i < (1u << MAX_DISCARD_RECTANGLES); ++i) {
3128 /* Interpret i as a bitmask, and then set the bit in
3129 * the mask if that combination of rectangles in which
3130 * the pixel is contained should pass the cliprect
3131 * test.
3132 */
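         /* For example, in INCLUSIVE mode only the combinations where the
          * pixel lies outside every declared rectangle are skipped, so a
          * pixel passes when it is inside at least one rectangle; EXCLUSIVE
          * mode keeps only those "outside all rectangles" combinations.
          */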
3133 unsigned relevant_subset = i & ((1u << d->vk.dr.rectangle_count) - 1);
3134
3135 if (d->vk.dr.mode == VK_DISCARD_RECTANGLE_MODE_INCLUSIVE_EXT && !relevant_subset)
3136 continue;
3137
3138 if (d->vk.dr.mode == VK_DISCARD_RECTANGLE_MODE_EXCLUSIVE_EXT && relevant_subset)
3139 continue;
3140
3141 cliprect_rule |= 1u << i;
3142 }
3143
3144 radeon_set_context_reg_seq(cmd_buffer->cs, R_028210_PA_SC_CLIPRECT_0_TL, d->vk.dr.rectangle_count * 2);
3145 for (unsigned i = 0; i < d->vk.dr.rectangle_count; ++i) {
3146 VkRect2D rect = d->vk.dr.rectangles[i];
3147 radeon_emit(cmd_buffer->cs, S_028210_TL_X(rect.offset.x) | S_028210_TL_Y(rect.offset.y));
3148 radeon_emit(cmd_buffer->cs, S_028214_BR_X(rect.offset.x + rect.extent.width) |
3149 S_028214_BR_Y(rect.offset.y + rect.extent.height));
3150 }
3151
3152 if (pdev->info.gfx_level >= GFX12) {
3153 radeon_set_context_reg_seq(cmd_buffer->cs, R_028374_PA_SC_CLIPRECT_0_EXT, d->vk.dr.rectangle_count);
3154 for (unsigned i = 0; i < d->vk.dr.rectangle_count; ++i) {
3155 VkRect2D rect = d->vk.dr.rectangles[i];
3156 radeon_emit(cmd_buffer->cs, S_028374_TL_X_EXT(rect.offset.x >> 15) |
3157 S_028374_TL_Y_EXT(rect.offset.y >> 15) |
3158 S_028374_BR_X_EXT((rect.offset.x + rect.extent.width) >> 15) |
3159 S_028374_BR_Y_EXT((rect.offset.y + rect.extent.height) >> 15));
3160 }
3161 }
3162 }
3163
3164 radeon_set_context_reg(cmd_buffer->cs, R_02820C_PA_SC_CLIPRECT_RULE, cliprect_rule);
3165 }
3166
3167 static void
3168 radv_emit_line_width(struct radv_cmd_buffer *cmd_buffer)
3169 {
3170 const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
3171
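   /* PA_SU_LINE_CNTL.WIDTH holds the half line width in 12.4 fixed point,
    * so width * 8 == (width / 2) * 16; the value is clamped to the 16-bit field.
    */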
3172 radeon_set_context_reg(cmd_buffer->cs, R_028A08_PA_SU_LINE_CNTL,
3173 S_028A08_WIDTH(CLAMP(d->vk.rs.line.width * 8, 0, 0xFFFF)));
3174 }
3175
3176 static void
3177 radv_emit_blend_constants(struct radv_cmd_buffer *cmd_buffer)
3178 {
3179 const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
3180
3181 radeon_set_context_reg_seq(cmd_buffer->cs, R_028414_CB_BLEND_RED, 4);
3182 radeon_emit_array(cmd_buffer->cs, (uint32_t *)d->vk.cb.blend_constants, 4);
3183 }
3184
3185 static void
3186 radv_emit_stencil(struct radv_cmd_buffer *cmd_buffer)
3187 {
3188 const struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
3189 const struct radv_physical_device *pdev = radv_device_physical(device);
3190 const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
3191
3192 if (pdev->info.gfx_level >= GFX12) {
3193 radeon_set_context_reg(
3194 cmd_buffer->cs, R_028088_DB_STENCIL_REF,
3195 S_028088_TESTVAL(d->vk.ds.stencil.front.reference) | S_028088_TESTVAL_BF(d->vk.ds.stencil.back.reference));
3196
3197 radeon_set_context_reg(cmd_buffer->cs, R_028090_DB_STENCIL_READ_MASK,
3198 S_028090_TESTMASK(d->vk.ds.stencil.front.compare_mask) |
3199 S_028090_TESTMASK_BF(d->vk.ds.stencil.back.compare_mask));
3200
3201 radeon_set_context_reg(cmd_buffer->cs, R_028094_DB_STENCIL_WRITE_MASK,
3202 S_028094_WRITEMASK(d->vk.ds.stencil.front.write_mask) |
3203 S_028094_WRITEMASK_BF(d->vk.ds.stencil.back.write_mask));
3204 } else {
3205 radeon_set_context_reg_seq(cmd_buffer->cs, R_028430_DB_STENCILREFMASK, 2);
3206 radeon_emit(cmd_buffer->cs, S_028430_STENCILTESTVAL(d->vk.ds.stencil.front.reference) |
3207 S_028430_STENCILMASK(d->vk.ds.stencil.front.compare_mask) |
3208 S_028430_STENCILWRITEMASK(d->vk.ds.stencil.front.write_mask) |
3209 S_028430_STENCILOPVAL(1));
3210 radeon_emit(cmd_buffer->cs, S_028434_STENCILTESTVAL_BF(d->vk.ds.stencil.back.reference) |
3211 S_028434_STENCILMASK_BF(d->vk.ds.stencil.back.compare_mask) |
3212 S_028434_STENCILWRITEMASK_BF(d->vk.ds.stencil.back.write_mask) |
3213 S_028434_STENCILOPVAL_BF(1));
3214 }
3215 }
3216
3217 static void
3218 radv_emit_depth_bounds(struct radv_cmd_buffer *cmd_buffer)
3219 {
3220 const struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
3221 const struct radv_physical_device *pdev = radv_device_physical(device);
3222 const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
3223
3224 if (pdev->info.gfx_level >= GFX12) {
3225 radeon_set_context_reg_seq(cmd_buffer->cs, R_028050_DB_DEPTH_BOUNDS_MIN, 2);
3226 } else {
3227 radeon_set_context_reg_seq(cmd_buffer->cs, R_028020_DB_DEPTH_BOUNDS_MIN, 2);
3228 }
3229
3230 radeon_emit(cmd_buffer->cs, fui(d->vk.ds.depth.bounds_test.min));
3231 radeon_emit(cmd_buffer->cs, fui(d->vk.ds.depth.bounds_test.max));
3232 }
3233
3234 static void
3235 radv_emit_depth_bias(struct radv_cmd_buffer *cmd_buffer)
3236 {
3237 const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
3238 struct radv_rendering_state *render = &cmd_buffer->state.render;
3239 unsigned slope = fui(d->vk.rs.depth_bias.slope * 16.0f);
3240 unsigned pa_su_poly_offset_db_fmt_cntl = 0;
3241
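   /* PA_SU_POLY_OFFSET_DB_FMT_CNTL tells the hardware how to scale the
    * constant depth bias: for UNORM depth the bias unit is 2^-N of the format
    * (N = 16 or 24), while for 32-bit float depth the unit is derived from the
    * 23-bit mantissa and DB_IS_FLOAT_FMT selects the float rule, matching the
    * Vulkan depthBiasConstantFactor semantics.
    */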
3242 if (vk_format_has_depth(render->ds_att.format) &&
3243 d->vk.rs.depth_bias.representation != VK_DEPTH_BIAS_REPRESENTATION_FLOAT_EXT) {
3244 VkFormat format = vk_format_depth_only(render->ds_att.format);
3245
3246 if (format == VK_FORMAT_D16_UNORM) {
3247 pa_su_poly_offset_db_fmt_cntl = S_028B78_POLY_OFFSET_NEG_NUM_DB_BITS(-16);
3248 } else {
3249 assert(format == VK_FORMAT_D32_SFLOAT);
3250 if (d->vk.rs.depth_bias.representation ==
3251 VK_DEPTH_BIAS_REPRESENTATION_LEAST_REPRESENTABLE_VALUE_FORCE_UNORM_EXT) {
3252 pa_su_poly_offset_db_fmt_cntl = S_028B78_POLY_OFFSET_NEG_NUM_DB_BITS(-24);
3253 } else {
3254 pa_su_poly_offset_db_fmt_cntl =
3255 S_028B78_POLY_OFFSET_NEG_NUM_DB_BITS(-23) | S_028B78_POLY_OFFSET_DB_IS_FLOAT_FMT(1);
3256 }
3257 }
3258 }
3259
3260 radeon_set_context_reg_seq(cmd_buffer->cs, R_028B7C_PA_SU_POLY_OFFSET_CLAMP, 5);
3261 radeon_emit(cmd_buffer->cs, fui(d->vk.rs.depth_bias.clamp)); /* CLAMP */
3262 radeon_emit(cmd_buffer->cs, slope); /* FRONT SCALE */
3263 radeon_emit(cmd_buffer->cs, fui(d->vk.rs.depth_bias.constant)); /* FRONT OFFSET */
3264 radeon_emit(cmd_buffer->cs, slope); /* BACK SCALE */
3265 radeon_emit(cmd_buffer->cs, fui(d->vk.rs.depth_bias.constant)); /* BACK OFFSET */
3266
3267 radeon_set_context_reg(cmd_buffer->cs, R_028B78_PA_SU_POLY_OFFSET_DB_FMT_CNTL, pa_su_poly_offset_db_fmt_cntl);
3268 }
3269
3270 static void
3271 radv_emit_line_stipple(struct radv_cmd_buffer *cmd_buffer)
3272 {
3273 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
3274 const struct radv_physical_device *pdev = radv_device_physical(device);
3275 const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
3276 enum amd_gfx_level gfx_level = pdev->info.gfx_level;
3277 /* GFX9 chips fail linestrip CTS tests unless this is set to 0 = no reset */
3278 uint32_t auto_reset_cntl = (gfx_level == GFX9) ? 0 : 2;
3279
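   /* AUTO_RESET_CNTL selects when the stipple pattern restarts: 1 restarts it
    * for every line (needed for line lists), 2 restarts it once per packet
    * (used for strips), and 0 never restarts (the GFX9 workaround above).
    */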
3280 if (radv_primitive_topology_is_line_list(d->vk.ia.primitive_topology))
3281 auto_reset_cntl = 1;
3282
3283 radeon_set_context_reg(cmd_buffer->cs, R_028A0C_PA_SC_LINE_STIPPLE,
3284 S_028A0C_LINE_PATTERN(d->vk.rs.line.stipple.pattern) |
3285 S_028A0C_REPEAT_COUNT(d->vk.rs.line.stipple.factor - 1) |
3286 S_028A0C_AUTO_RESET_CNTL(pdev->info.gfx_level < GFX12 ? auto_reset_cntl : 0));
3287
3288 if (pdev->info.gfx_level >= GFX12) {
3289 radeon_set_context_reg(cmd_buffer->cs, R_028A44_PA_SC_LINE_STIPPLE_RESET,
3290 S_028A44_AUTO_RESET_CNTL(auto_reset_cntl));
3291 }
3292 }
3293
3294 static void
3295 radv_emit_culling(struct radv_cmd_buffer *cmd_buffer)
3296 {
3297 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
3298 const struct radv_physical_device *pdev = radv_device_physical(device);
3299 enum amd_gfx_level gfx_level = pdev->info.gfx_level;
3300 const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
3301 unsigned pa_su_sc_mode_cntl;
3302
3303 pa_su_sc_mode_cntl =
3304 S_028814_CULL_FRONT(!!(d->vk.rs.cull_mode & VK_CULL_MODE_FRONT_BIT)) |
3305 S_028814_CULL_BACK(!!(d->vk.rs.cull_mode & VK_CULL_MODE_BACK_BIT)) | S_028814_FACE(d->vk.rs.front_face) |
3306 S_028814_POLY_OFFSET_FRONT_ENABLE(d->vk.rs.depth_bias.enable) |
3307 S_028814_POLY_OFFSET_BACK_ENABLE(d->vk.rs.depth_bias.enable) |
3308 S_028814_POLY_OFFSET_PARA_ENABLE(d->vk.rs.depth_bias.enable) |
3309 S_028814_POLY_MODE(d->vk.rs.polygon_mode != V_028814_X_DRAW_TRIANGLES) |
3310 S_028814_POLYMODE_FRONT_PTYPE(d->vk.rs.polygon_mode) | S_028814_POLYMODE_BACK_PTYPE(d->vk.rs.polygon_mode) |
3311 S_028814_PROVOKING_VTX_LAST(d->vk.rs.provoking_vertex == VK_PROVOKING_VERTEX_MODE_LAST_VERTEX_EXT);
3312
3313 if (gfx_level >= GFX10 && gfx_level < GFX12) {
3314 /* Ensure that SC processes the primitive group in the same order as PA produced them. Needed
3315 * when either POLY_MODE or PERPENDICULAR_ENDCAP_ENA is set.
3316 */
3317 pa_su_sc_mode_cntl |=
3318 S_028814_KEEP_TOGETHER_ENABLE(d->vk.rs.polygon_mode != V_028814_X_DRAW_TRIANGLES ||
3319 radv_get_line_mode(cmd_buffer) == VK_LINE_RASTERIZATION_MODE_RECTANGULAR_KHR);
3320 }
3321
3322 if (pdev->info.gfx_level >= GFX12) {
3323 radeon_set_context_reg(cmd_buffer->cs, R_02881C_PA_SU_SC_MODE_CNTL, pa_su_sc_mode_cntl);
3324 } else {
3325 radeon_set_context_reg(cmd_buffer->cs, R_028814_PA_SU_SC_MODE_CNTL, pa_su_sc_mode_cntl);
3326 }
3327 }
3328
3329 static void
3330 radv_emit_provoking_vertex_mode(struct radv_cmd_buffer *cmd_buffer)
3331 {
3332 const struct radv_shader *last_vgt_shader = cmd_buffer->state.last_vgt_shader;
3333 const unsigned stage = last_vgt_shader->info.stage;
3334 const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
3335 const uint32_t ngg_provoking_vtx_offset = radv_get_user_sgpr_loc(last_vgt_shader, AC_UD_NGG_PROVOKING_VTX);
3336 unsigned provoking_vtx = 0;
3337
3338 if (!ngg_provoking_vtx_offset)
3339 return;
3340
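   /* For LAST_VERTEX the user SGPR holds the index of the provoking vertex
    * within the output primitive; radv_conv_prim_to_gs_out() returns 0/1/2 for
    * points/lines/triangles, i.e. vertices-per-primitive minus one.
    */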
3341 if (d->vk.rs.provoking_vertex == VK_PROVOKING_VERTEX_MODE_LAST_VERTEX_EXT) {
3342 if (stage == MESA_SHADER_VERTEX) {
3343 provoking_vtx = radv_conv_prim_to_gs_out(d->vk.ia.primitive_topology, last_vgt_shader->info.is_ngg);
3344 } else {
3345 assert(stage == MESA_SHADER_GEOMETRY);
3346 provoking_vtx = last_vgt_shader->info.gs.vertices_in - 1;
3347 }
3348 }
3349
3350 radeon_set_sh_reg(cmd_buffer->cs, ngg_provoking_vtx_offset, provoking_vtx);
3351 }
3352
3353 static void
3354 radv_emit_primitive_topology(struct radv_cmd_buffer *cmd_buffer)
3355 {
3356 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
3357 const struct radv_physical_device *pdev = radv_device_physical(device);
3358 const struct radv_shader *last_vgt_shader = cmd_buffer->state.last_vgt_shader;
3359 const uint32_t verts_per_prim_offset = radv_get_user_sgpr_loc(last_vgt_shader, AC_UD_NUM_VERTS_PER_PRIM);
3360 const uint32_t vgt_gs_out_prim_type = radv_get_rasterization_prim(cmd_buffer);
3361 const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
3362
3363 assert(!cmd_buffer->state.mesh_shading);
3364
3365 if (pdev->info.gfx_level >= GFX7) {
3366 uint32_t vgt_prim = d->vk.ia.primitive_topology;
3367
3368 if (pdev->info.gfx_level >= GFX12)
3369 vgt_prim |= S_030908_NUM_INPUT_CP(d->vk.ts.patch_control_points);
3370
3371 radeon_set_uconfig_reg_idx(&pdev->info, cmd_buffer->cs, R_030908_VGT_PRIMITIVE_TYPE, 1, vgt_prim);
3372 } else {
3373 radeon_set_config_reg(cmd_buffer->cs, R_008958_VGT_PRIMITIVE_TYPE, d->vk.ia.primitive_topology);
3374 }
3375
3376 radv_emit_vgt_gs_out(cmd_buffer, vgt_gs_out_prim_type);
3377
3378 if (!verts_per_prim_offset)
3379 return;
3380
3381 radeon_set_sh_reg(cmd_buffer->cs, verts_per_prim_offset,
3382 radv_conv_prim_to_gs_out(d->vk.ia.primitive_topology, last_vgt_shader->info.is_ngg) + 1);
3383 }
3384
3385 static void
3386 radv_emit_depth_control(struct radv_cmd_buffer *cmd_buffer)
3387 {
3388 const struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
3389 const struct radv_physical_device *pdev = radv_device_physical(device);
3390 const struct radv_rendering_state *render = &cmd_buffer->state.render;
3391 const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
3392 const bool stencil_test_enable =
3393 d->vk.ds.stencil.test_enable && (render->ds_att_aspects & VK_IMAGE_ASPECT_STENCIL_BIT);
3394 const uint32_t db_depth_control =
3395 S_028800_Z_ENABLE(d->vk.ds.depth.test_enable ? 1 : 0) |
3396 S_028800_Z_WRITE_ENABLE(d->vk.ds.depth.write_enable ? 1 : 0) | S_028800_ZFUNC(d->vk.ds.depth.compare_op) |
3397 S_028800_DEPTH_BOUNDS_ENABLE(d->vk.ds.depth.bounds_test.enable ? 1 : 0) |
3398 S_028800_STENCIL_ENABLE(stencil_test_enable) | S_028800_BACKFACE_ENABLE(stencil_test_enable) |
3399 S_028800_STENCILFUNC(d->vk.ds.stencil.front.op.compare) |
3400 S_028800_STENCILFUNC_BF(d->vk.ds.stencil.back.op.compare);
3401
3402 if (pdev->info.gfx_level >= GFX12) {
3403 radeon_set_context_reg(cmd_buffer->cs, R_028070_DB_DEPTH_CONTROL, db_depth_control);
3404 } else {
3405 radeon_set_context_reg(cmd_buffer->cs, R_028800_DB_DEPTH_CONTROL, db_depth_control);
3406 }
3407 }
3408
3409 static void
3410 radv_emit_stencil_control(struct radv_cmd_buffer *cmd_buffer)
3411 {
3412 const struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
3413 const struct radv_physical_device *pdev = radv_device_physical(device);
3414 const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
3415 const uint32_t db_stencil_control =
3416 S_02842C_STENCILFAIL(radv_translate_stencil_op(d->vk.ds.stencil.front.op.fail)) |
3417 S_02842C_STENCILZPASS(radv_translate_stencil_op(d->vk.ds.stencil.front.op.pass)) |
3418 S_02842C_STENCILZFAIL(radv_translate_stencil_op(d->vk.ds.stencil.front.op.depth_fail)) |
3419 S_02842C_STENCILFAIL_BF(radv_translate_stencil_op(d->vk.ds.stencil.back.op.fail)) |
3420 S_02842C_STENCILZPASS_BF(radv_translate_stencil_op(d->vk.ds.stencil.back.op.pass)) |
3421 S_02842C_STENCILZFAIL_BF(radv_translate_stencil_op(d->vk.ds.stencil.back.op.depth_fail));
3422
3423 if (pdev->info.gfx_level >= GFX12) {
3424 radeon_set_context_reg(cmd_buffer->cs, R_028074_DB_STENCIL_CONTROL, db_stencil_control);
3425 } else {
3426 radeon_set_context_reg(cmd_buffer->cs, R_02842C_DB_STENCIL_CONTROL, db_stencil_control);
3427 }
3428 }
3429
3430 static bool
3431 radv_should_force_vrs1x1(struct radv_cmd_buffer *cmd_buffer)
3432 {
3433 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
3434 const struct radv_physical_device *pdev = radv_device_physical(device);
3435 const struct radv_shader *ps = cmd_buffer->state.shaders[MESA_SHADER_FRAGMENT];
3436
3437 return pdev->info.gfx_level >= GFX10_3 &&
3438 (cmd_buffer->state.ms.sample_shading_enable || (ps && ps->info.ps.force_sample_iter_shading_rate));
3439 }
3440
3441 static void
3442 radv_emit_fragment_shading_rate(struct radv_cmd_buffer *cmd_buffer)
3443 {
3444 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
3445 const struct radv_physical_device *pdev = radv_device_physical(device);
3446 const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
3447
3448 /* When per-vertex VRS is forced and the dynamic fragment shading rate is a no-op, ignore
3449 * it. This is needed for vkd3d-proton because it always declares per-draw VRS as dynamic.
3450 */
3451 if (device->force_vrs != RADV_FORCE_VRS_1x1 && d->vk.fsr.fragment_size.width == 1 &&
3452 d->vk.fsr.fragment_size.height == 1 &&
3453 d->vk.fsr.combiner_ops[0] == VK_FRAGMENT_SHADING_RATE_COMBINER_OP_KEEP_KHR &&
3454 d->vk.fsr.combiner_ops[1] == VK_FRAGMENT_SHADING_RATE_COMBINER_OP_KEEP_KHR)
3455 return;
3456
3457 uint32_t rate_x = MIN2(2, d->vk.fsr.fragment_size.width) - 1;
3458 uint32_t rate_y = MIN2(2, d->vk.fsr.fragment_size.height) - 1;
3459 uint32_t pipeline_comb_mode = d->vk.fsr.combiner_ops[0];
3460 uint32_t htile_comb_mode = d->vk.fsr.combiner_ops[1];
3461 uint32_t pa_cl_vrs_cntl = 0;
3462
3463 assert(pdev->info.gfx_level >= GFX10_3);
3464
3465 if (!cmd_buffer->state.render.vrs_att.iview) {
3466 /* When the current subpass has no VRS attachment, the VRS rates are expected to be 1x1, so we
3467 * can cheat by tweaking the different combiner modes.
3468 */
3469 switch (htile_comb_mode) {
3470 case VK_FRAGMENT_SHADING_RATE_COMBINER_OP_MIN_KHR:
3471 /* The result of min(A, 1x1) is always 1x1. */
3472 FALLTHROUGH;
3473 case VK_FRAGMENT_SHADING_RATE_COMBINER_OP_REPLACE_KHR:
3474 /* Force the per-draw VRS rate to 1x1. */
3475 rate_x = rate_y = 0;
3476
3477 /* As the result of min(A, 1x1) or replace(A, 1x1) are always 1x1, set the vertex rate
3478 * combiner mode as passthrough.
3479 */
3480 pipeline_comb_mode = V_028848_SC_VRS_COMB_MODE_PASSTHRU;
3481 break;
3482 case VK_FRAGMENT_SHADING_RATE_COMBINER_OP_MAX_KHR:
3483 /* The result of max(A, 1x1) is always A. */
3484 FALLTHROUGH;
3485 case VK_FRAGMENT_SHADING_RATE_COMBINER_OP_KEEP_KHR:
3486 /* Nothing to do here because the SAMPLE_ITER combiner mode should already be passthrough. */
3487 break;
3488 default:
3489 break;
3490 }
3491 }
3492
3493 /* Emit per-draw VRS rate which is the first combiner. */
3494 radeon_set_uconfig_reg(cmd_buffer->cs, R_03098C_GE_VRS_RATE, S_03098C_RATE_X(rate_x) | S_03098C_RATE_Y(rate_y));
3495
3496 /* Disable VRS and use the rates from PS_ITER_SAMPLES if:
3497 *
3498 * 1) sample shading is enabled or per-sample interpolation is used by the fragment shader
3499 * 2) the fragment shader requires 1x1 shading rate for some other reason
3500 */
3501 if (radv_should_force_vrs1x1(cmd_buffer)) {
3502 pa_cl_vrs_cntl |= S_028848_SAMPLE_ITER_COMBINER_MODE(V_028848_SC_VRS_COMB_MODE_OVERRIDE);
3503 }
3504
3505 /* VERTEX_RATE_COMBINER_MODE controls the combiner mode between the
3506 * draw rate and the vertex rate.
3507 */
3508 if (cmd_buffer->state.mesh_shading) {
3509 pa_cl_vrs_cntl |= S_028848_VERTEX_RATE_COMBINER_MODE(V_028848_SC_VRS_COMB_MODE_PASSTHRU) |
3510 S_028848_PRIMITIVE_RATE_COMBINER_MODE(pipeline_comb_mode);
3511 } else {
3512 pa_cl_vrs_cntl |= S_028848_VERTEX_RATE_COMBINER_MODE(pipeline_comb_mode) |
3513 S_028848_PRIMITIVE_RATE_COMBINER_MODE(V_028848_SC_VRS_COMB_MODE_PASSTHRU);
3514 }
3515
3516 /* HTILE_RATE_COMBINER_MODE controls the combiner mode between the primitive rate and the HTILE
3517 * rate.
3518 */
3519 pa_cl_vrs_cntl |= S_028848_HTILE_RATE_COMBINER_MODE(htile_comb_mode);
3520
3521 radeon_set_context_reg(cmd_buffer->cs, R_028848_PA_CL_VRS_CNTL, pa_cl_vrs_cntl);
3522 }
3523
3524 static uint32_t
3525 radv_get_primitive_reset_index(const struct radv_cmd_buffer *cmd_buffer)
3526 {
3527 const uint32_t index_type = G_028A7C_INDEX_TYPE(cmd_buffer->state.index_type);
3528 switch (index_type) {
3529 case V_028A7C_VGT_INDEX_8:
3530 return 0xffu;
3531 case V_028A7C_VGT_INDEX_16:
3532 return 0xffffu;
3533 case V_028A7C_VGT_INDEX_32:
3534 return 0xffffffffu;
3535 default:
3536 unreachable("invalid index type");
3537 }
3538 }
3539
3540 static void
3541 radv_emit_primitive_restart_enable(struct radv_cmd_buffer *cmd_buffer)
3542 {
3543 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
3544 const struct radv_physical_device *pdev = radv_device_physical(device);
3545 const enum amd_gfx_level gfx_level = pdev->info.gfx_level;
3546 const struct radv_dynamic_state *const d = &cmd_buffer->state.dynamic;
3547 struct radeon_cmdbuf *cs = cmd_buffer->cs;
3548 const bool en = d->vk.ia.primitive_restart_enable;
3549
3550 if (gfx_level >= GFX11) {
3551 radeon_set_uconfig_reg(cs, R_03092C_GE_MULTI_PRIM_IB_RESET_EN,
3552 S_03092C_RESET_EN(en) |
3553 /* This disables primitive restart for non-indexed draws.
3554 * By keeping this set, we don't have to unset RESET_EN
3555 * for non-indexed draws. */
3556 S_03092C_DISABLE_FOR_AUTO_INDEX(1));
3557 } else if (gfx_level >= GFX9) {
3558 radeon_set_uconfig_reg(cs, R_03092C_VGT_MULTI_PRIM_IB_RESET_EN, en);
3559 } else {
3560 radeon_set_context_reg(cs, R_028A94_VGT_MULTI_PRIM_IB_RESET_EN, en);
3561
3562 /* GFX6-7: All 32 bits are compared.
3563 * GFX8: Only index type bits are compared.
3564 * GFX9+: Default is same as GFX8, MATCH_ALL_BITS=1 selects GFX6-7 behavior
3565 */
3566 if (en && gfx_level <= GFX7) {
3567 const uint32_t primitive_reset_index = radv_get_primitive_reset_index(cmd_buffer);
3568
3569 radeon_opt_set_context_reg(cmd_buffer, R_02840C_VGT_MULTI_PRIM_IB_RESET_INDX,
3570 RADV_TRACKED_VGT_MULTI_PRIM_IB_RESET_INDX, primitive_reset_index);
3571 }
3572 }
3573 }
3574
3575 static void
3576 radv_emit_clipping(struct radv_cmd_buffer *cmd_buffer)
3577 {
3578 const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
3579 bool depth_clip_enable = radv_get_depth_clip_enable(cmd_buffer);
3580
3581 radeon_set_context_reg(
3582 cmd_buffer->cs, R_028810_PA_CL_CLIP_CNTL,
3583 S_028810_DX_RASTERIZATION_KILL(d->vk.rs.rasterizer_discard_enable) |
3584 S_028810_ZCLIP_NEAR_DISABLE(!depth_clip_enable) | S_028810_ZCLIP_FAR_DISABLE(!depth_clip_enable) |
3585 S_028810_DX_CLIP_SPACE_DEF(!d->vk.vp.depth_clip_negative_one_to_one) | S_028810_DX_LINEAR_ATTR_CLIP_ENA(1));
3586 }
3587
3588 static bool
3589 radv_is_mrt0_dual_src(struct radv_cmd_buffer *cmd_buffer)
3590 {
3591 const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
3592
3593 if (!d->vk.cb.attachments[0].write_mask || !d->vk.cb.attachments[0].blend_enable)
3594 return false;
3595
3596 return radv_can_enable_dual_src(&d->vk.cb.attachments[0]);
3597 }
3598
3599 static void
3600 radv_emit_logic_op(struct radv_cmd_buffer *cmd_buffer)
3601 {
3602 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
3603 const struct radv_physical_device *pdev = radv_device_physical(device);
3604 const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
3605 unsigned cb_color_control = 0;
3606
3607 if (d->vk.cb.logic_op_enable) {
3608 cb_color_control |= S_028808_ROP3(d->vk.cb.logic_op);
3609 } else {
3610 cb_color_control |= S_028808_ROP3(V_028808_ROP3_COPY);
3611 }
3612
3613 if (pdev->info.has_rbplus) {
3614 /* RB+ doesn't work with dual source blending, logic op and CB_RESOLVE. */
3615 bool mrt0_is_dual_src = radv_is_mrt0_dual_src(cmd_buffer);
3616
3617 cb_color_control |= S_028808_DISABLE_DUAL_QUAD(mrt0_is_dual_src || d->vk.cb.logic_op_enable ||
3618 cmd_buffer->state.custom_blend_mode == V_028808_CB_RESOLVE);
3619 }
3620
3621 if (cmd_buffer->state.custom_blend_mode) {
3622 cb_color_control |= S_028808_MODE(cmd_buffer->state.custom_blend_mode);
3623 } else {
3624 bool color_write_enabled = false;
3625
3626 for (unsigned i = 0; i < MAX_RTS; i++) {
3627 if (d->vk.cb.attachments[i].write_mask) {
3628 color_write_enabled = true;
3629 break;
3630 }
3631 }
3632
3633 if (color_write_enabled) {
3634 cb_color_control |= S_028808_MODE(V_028808_CB_NORMAL);
3635 } else {
3636 cb_color_control |= S_028808_MODE(V_028808_CB_DISABLE);
3637 }
3638 }
3639
3640 if (pdev->info.gfx_level >= GFX12) {
3641 radeon_set_context_reg(cmd_buffer->cs, R_028858_CB_COLOR_CONTROL, cb_color_control);
3642 } else {
3643 radeon_set_context_reg(cmd_buffer->cs, R_028808_CB_COLOR_CONTROL, cb_color_control);
3644 }
3645 }
3646
3647 static void
3648 radv_emit_color_write(struct radv_cmd_buffer *cmd_buffer)
3649 {
3650 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
3651 const struct radv_physical_device *pdev = radv_device_physical(device);
3652 const struct radv_binning_settings *settings = &pdev->binning_settings;
3653 const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
3654 uint32_t color_write_enable = 0, color_write_mask = 0;
3655
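   /* VK_EXT_color_write_enable is an all-or-nothing switch per attachment:
    * expand each enabled attachment to a full 0xf channel mask and AND it with
    * the per-channel write masks to form CB_TARGET_MASK.
    */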
3656 u_foreach_bit (i, d->vk.cb.color_write_enables) {
3657 color_write_enable |= 0xfu << (i * 4);
3658 }
3659
3660 for (unsigned i = 0; i < MAX_RTS; i++) {
3661 color_write_mask |= d->vk.cb.attachments[i].write_mask << (4 * i);
3662 }
3663
3664 if (device->pbb_allowed && settings->context_states_per_bin > 1) {
3665 /* Flush DFSM on CB_TARGET_MASK changes. */
3666 radeon_emit(cmd_buffer->cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
3667 radeon_emit(cmd_buffer->cs, EVENT_TYPE(V_028A90_BREAK_BATCH) | EVENT_INDEX(0));
3668 }
3669
3670 if (pdev->info.gfx_level >= GFX12) {
3671 radeon_set_context_reg(cmd_buffer->cs, R_028850_CB_TARGET_MASK, color_write_mask & color_write_enable);
3672 } else {
3673 radeon_set_context_reg(cmd_buffer->cs, R_028238_CB_TARGET_MASK, color_write_mask & color_write_enable);
3674 }
3675 }
3676
3677 static void
3678 radv_emit_patch_control_points(struct radv_cmd_buffer *cmd_buffer)
3679 {
3680 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
3681 const struct radv_physical_device *pdev = radv_device_physical(device);
3682 const struct radv_shader *vs = radv_get_shader(cmd_buffer->state.shaders, MESA_SHADER_VERTEX);
3683 const struct radv_shader *tcs = cmd_buffer->state.shaders[MESA_SHADER_TESS_CTRL];
3684 const struct radv_shader *tes = radv_get_shader(cmd_buffer->state.shaders, MESA_SHADER_TESS_EVAL);
3685 const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
3686 unsigned ls_hs_config;
3687
3688 /* Compute tessellation info that depends on the number of patch control points when this state
3689 * is dynamic.
3690 */
3691 if (cmd_buffer->state.uses_dynamic_patch_control_points) {
3692 /* Compute the number of patches. */
3693 cmd_buffer->state.tess_num_patches = radv_get_tcs_num_patches(
3694 pdev, d->vk.ts.patch_control_points, tcs->info.tcs.tcs_vertices_out, vs->info.vs.num_linked_outputs,
3695 tcs->info.tcs.num_lds_per_vertex_outputs, tcs->info.tcs.num_lds_per_patch_outputs,
3696 tcs->info.tcs.num_linked_outputs, tcs->info.tcs.num_linked_patch_outputs);
3697
3698 /* Compute the LDS size. */
3699 cmd_buffer->state.tess_lds_size =
3700 radv_get_tess_lds_size(pdev, d->vk.ts.patch_control_points, tcs->info.tcs.tcs_vertices_out,
3701 vs->info.vs.num_linked_outputs, cmd_buffer->state.tess_num_patches,
3702 tcs->info.tcs.num_lds_per_vertex_outputs, tcs->info.tcs.num_lds_per_patch_outputs);
3703 }
3704
3705 ls_hs_config = S_028B58_NUM_PATCHES(cmd_buffer->state.tess_num_patches) |
3706 /* GFX12 programs patch_vertices in VGT_PRIMITIVE_TYPE.NUM_INPUT_CP. */
3707 S_028B58_HS_NUM_INPUT_CP(pdev->info.gfx_level < GFX12 ? d->vk.ts.patch_control_points : 0) |
3708 S_028B58_HS_NUM_OUTPUT_CP(tcs->info.tcs.tcs_vertices_out);
3709
3710 if (pdev->info.gfx_level >= GFX7) {
3711 radeon_set_context_reg_idx(cmd_buffer->cs, R_028B58_VGT_LS_HS_CONFIG, 2, ls_hs_config);
3712 } else {
3713 radeon_set_context_reg(cmd_buffer->cs, R_028B58_VGT_LS_HS_CONFIG, ls_hs_config);
3714 }
3715
3716 if (pdev->info.gfx_level >= GFX9) {
3717 unsigned hs_rsrc2;
3718
3719 if (tcs->info.merged_shader_compiled_separately) {
3720 radv_shader_combine_cfg_vs_tcs(cmd_buffer->state.shaders[MESA_SHADER_VERTEX], tcs, NULL, &hs_rsrc2);
3721 } else {
3722 hs_rsrc2 = tcs->config.rsrc2;
3723 }
3724
3725 if (pdev->info.gfx_level >= GFX10) {
3726 hs_rsrc2 |= S_00B42C_LDS_SIZE_GFX10(cmd_buffer->state.tess_lds_size);
3727 } else {
3728 hs_rsrc2 |= S_00B42C_LDS_SIZE_GFX9(cmd_buffer->state.tess_lds_size);
3729 }
3730
3731 radeon_set_sh_reg(cmd_buffer->cs, tcs->info.regs.pgm_rsrc2, hs_rsrc2);
3732 } else {
3733 unsigned ls_rsrc2 = vs->config.rsrc2 | S_00B52C_LDS_SIZE(cmd_buffer->state.tess_lds_size);
3734
3735 radeon_set_sh_reg(cmd_buffer->cs, vs->info.regs.pgm_rsrc2, ls_rsrc2);
3736 }
3737
3738 /* Emit user SGPRs for dynamic patch control points. */
3739 uint32_t tcs_offchip_layout_offset = radv_get_user_sgpr_loc(tcs, AC_UD_TCS_OFFCHIP_LAYOUT);
3740 if (!tcs_offchip_layout_offset)
3741 return;
3742
3743 unsigned tcs_offchip_layout =
3744 SET_SGPR_FIELD(TCS_OFFCHIP_LAYOUT_PATCH_CONTROL_POINTS, d->vk.ts.patch_control_points - 1) |
3745 SET_SGPR_FIELD(TCS_OFFCHIP_LAYOUT_OUT_PATCH_CP, tcs->info.tcs.tcs_vertices_out - 1) |
3746 SET_SGPR_FIELD(TCS_OFFCHIP_LAYOUT_NUM_PATCHES, cmd_buffer->state.tess_num_patches - 1) |
3747 SET_SGPR_FIELD(TCS_OFFCHIP_LAYOUT_NUM_LS_OUTPUTS, vs->info.vs.num_linked_outputs) |
3748 SET_SGPR_FIELD(TCS_OFFCHIP_LAYOUT_NUM_HS_OUTPUTS, tcs->info.tcs.num_linked_outputs) |
3749 SET_SGPR_FIELD(TCS_OFFCHIP_LAYOUT_TES_READS_TF, tes->info.tes.reads_tess_factors) |
3750 SET_SGPR_FIELD(TCS_OFFCHIP_LAYOUT_PRIMITIVE_MODE, tes->info.tes._primitive_mode);
3751
3752 radeon_set_sh_reg(cmd_buffer->cs, tcs_offchip_layout_offset, tcs_offchip_layout);
3753
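   /* The TES receives the same packed layout so both tessellation stages agree
    * on the patch count and off-chip/LDS addressing.
    */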
3754 tcs_offchip_layout_offset = radv_get_user_sgpr_loc(tes, AC_UD_TCS_OFFCHIP_LAYOUT);
3755 assert(tcs_offchip_layout_offset);
3756
3757 radeon_set_sh_reg(cmd_buffer->cs, tcs_offchip_layout_offset, tcs_offchip_layout);
3758 }
3759
3760 static void
3761 radv_emit_conservative_rast_mode(struct radv_cmd_buffer *cmd_buffer)
3762 {
3763 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
3764 const struct radv_physical_device *pdev = radv_device_physical(device);
3765 const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
3766
3767 if (pdev->info.gfx_level >= GFX9) {
3768 uint32_t pa_sc_conservative_rast;
3769
3770 if (d->vk.rs.conservative_mode != VK_CONSERVATIVE_RASTERIZATION_MODE_DISABLED_EXT) {
3771 const struct radv_shader *ps = cmd_buffer->state.shaders[MESA_SHADER_FRAGMENT];
3772 const bool uses_inner_coverage = ps && ps->info.ps.reads_fully_covered;
3773
3774 pa_sc_conservative_rast =
3775 S_028C4C_PREZ_AA_MASK_ENABLE(1) | S_028C4C_POSTZ_AA_MASK_ENABLE(1) | S_028C4C_CENTROID_SAMPLE_OVERRIDE(1);
3776
3777 /* Inner coverage requires underestimate conservative rasterization. */
3778 if (d->vk.rs.conservative_mode == VK_CONSERVATIVE_RASTERIZATION_MODE_OVERESTIMATE_EXT &&
3779 !uses_inner_coverage) {
3780 pa_sc_conservative_rast |= S_028C4C_OVER_RAST_ENABLE(1) | S_028C4C_UNDER_RAST_SAMPLE_SELECT(1) |
3781 S_028C4C_PBB_UNCERTAINTY_REGION_ENABLE(1);
3782 } else {
3783 pa_sc_conservative_rast |= S_028C4C_OVER_RAST_SAMPLE_SELECT(1) | S_028C4C_UNDER_RAST_ENABLE(1);
3784 }
3785 } else {
3786 pa_sc_conservative_rast = S_028C4C_NULL_SQUAD_AA_MASK_ENABLE(1);
3787 }
3788
3789 if (pdev->info.gfx_level >= GFX12) {
3790 radeon_set_context_reg(cmd_buffer->cs, R_028C54_PA_SC_CONSERVATIVE_RASTERIZATION_CNTL,
3791 pa_sc_conservative_rast);
3792 } else {
3793 radeon_set_context_reg(cmd_buffer->cs, R_028C4C_PA_SC_CONSERVATIVE_RASTERIZATION_CNTL,
3794 pa_sc_conservative_rast);
3795 }
3796 }
3797 }
3798
3799 static void
3800 radv_emit_depth_clamp_enable(struct radv_cmd_buffer *cmd_buffer)
3801 {
3802 const struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
3803 const struct radv_physical_device *pdev = radv_device_physical(device);
3804
3805 enum radv_depth_clamp_mode mode = radv_get_depth_clamp_mode(cmd_buffer);
3806
3807 radeon_set_context_reg(
3808 cmd_buffer->cs, R_02800C_DB_RENDER_OVERRIDE,
3809 S_02800C_FORCE_HIS_ENABLE0(V_02800C_FORCE_DISABLE) | S_02800C_FORCE_HIS_ENABLE1(V_02800C_FORCE_DISABLE) |
3810 S_02800C_DISABLE_VIEWPORT_CLAMP(pdev->info.gfx_level < GFX12 && mode == RADV_DEPTH_CLAMP_MODE_DISABLED));
3811
3812 if (pdev->info.gfx_level >= GFX12) {
3813 radeon_set_context_reg(cmd_buffer->cs, R_028064_DB_VIEWPORT_CONTROL,
3814 S_028064_DISABLE_VIEWPORT_CLAMP(mode == RADV_DEPTH_CLAMP_MODE_DISABLED));
3815 }
3816 }
3817
3818 static void
3819 radv_emit_rasterization_samples(struct radv_cmd_buffer *cmd_buffer)
3820 {
3821 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
3822 const struct radv_physical_device *pdev = radv_device_physical(device);
3823 unsigned rasterization_samples = radv_get_rasterization_samples(cmd_buffer);
3824 unsigned ps_iter_samples = radv_get_ps_iter_samples(cmd_buffer);
3825 const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
3826 unsigned spi_baryc_cntl = S_0286E0_FRONT_FACE_ALL_BITS(1);
3827 unsigned pa_sc_mode_cntl_1;
3828 bool has_hiz_his = false;
3829
3830 if (pdev->info.gfx_level >= GFX12) {
3831 const struct radv_rendering_state *render = &cmd_buffer->state.render;
3832
3833 if (render->ds_att.iview) {
3834 const struct radeon_surf *surf = &render->ds_att.iview->image->planes[0].surface;
3835 has_hiz_his = surf->u.gfx9.zs.hiz.offset || surf->u.gfx9.zs.his.offset;
3836 }
3837 }
3838
3839 pa_sc_mode_cntl_1 =
3840 S_028A4C_WALK_FENCE_ENABLE(1) | // TODO linear dst fixes
3841 S_028A4C_WALK_FENCE_SIZE(pdev->info.num_tile_pipes == 2 ? 2 : 3) |
3842 S_028A4C_OUT_OF_ORDER_PRIMITIVE_ENABLE(cmd_buffer->state.uses_out_of_order_rast) |
3843 S_028A4C_OUT_OF_ORDER_WATER_MARK(pdev->info.gfx_level >= GFX12 ? 0 : 0x7) |
3844 /* always 1: */
3845 S_028A4C_SUPERTILE_WALK_ORDER_ENABLE(1) | S_028A4C_TILE_WALK_ORDER_ENABLE(1) |
3846 S_028A4C_MULTI_SHADER_ENGINE_PRIM_DISCARD_ENABLE(1) | S_028A4C_FORCE_EOV_CNTDWN_ENABLE(1) |
3847 S_028A4C_FORCE_EOV_REZ_ENABLE(1) |
3848 /* This should only be set when VRS surfaces aren't enabled on GFX11, otherwise the GPU might
3849 * hang.
3850 */
3851 S_028A4C_WALK_ALIGN8_PRIM_FITS_ST(pdev->info.gfx_level < GFX11 || !cmd_buffer->state.uses_vrs_attachment ||
3852 (pdev->info.gfx_level >= GFX12 && !has_hiz_his));
3853
3854 if (!d->sample_location.count)
3855 radv_emit_default_sample_locations(pdev, cmd_buffer->cs, rasterization_samples);
3856
3857 if (ps_iter_samples > 1) {
3858 spi_baryc_cntl |= S_0286E0_POS_FLOAT_LOCATION(2);
3859 pa_sc_mode_cntl_1 |= S_028A4C_PS_ITER_SAMPLE(1);
3860 }
3861
3862 if (radv_should_force_vrs1x1(cmd_buffer)) {
3863 /* Make sure sample shading is enabled even if only MSAA1x is used because the SAMPLE_ITER
3864 * combiner is in passthrough mode if PS_ITER_SAMPLE is 0, and it uses the per-draw rate. The
3865 * default VRS rate when sample shading is enabled is 1x1.
3866 */
3867 if (!G_028A4C_PS_ITER_SAMPLE(pa_sc_mode_cntl_1))
3868 pa_sc_mode_cntl_1 |= S_028A4C_PS_ITER_SAMPLE(1);
3869 }
3870
3871 if (pdev->info.gfx_level >= GFX12) {
3872 radeon_set_context_reg(cmd_buffer->cs, R_028658_SPI_BARYC_CNTL, spi_baryc_cntl);
3873 } else {
3874 radeon_set_context_reg(cmd_buffer->cs, R_0286E0_SPI_BARYC_CNTL, spi_baryc_cntl);
3875 }
3876
3877 radeon_set_context_reg(cmd_buffer->cs, R_028A4C_PA_SC_MODE_CNTL_1, pa_sc_mode_cntl_1);
3878 }
3879
3880 static void
3881 radv_emit_fb_color_state(struct radv_cmd_buffer *cmd_buffer, int index, struct radv_color_buffer_info *cb,
3882 struct radv_image_view *iview, VkImageLayout layout)
3883 {
3884 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
3885 const struct radv_physical_device *pdev = radv_device_physical(device);
3886 bool is_vi = pdev->info.gfx_level >= GFX8;
3887 uint32_t cb_fdcc_control = cb->ac.cb_dcc_control;
3888 uint32_t cb_color_info = cb->ac.cb_color_info;
3889 struct radv_image *image = iview->image;
3890
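   /* Drop DCC (FDCC on GFX11+) when the current layout does not allow
    * compressed access for this queue family, so the CB does not interpret
    * stale or inaccessible metadata.
    */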
3891 if (!radv_layout_dcc_compressed(device, image, iview->vk.base_mip_level, layout,
3892 radv_image_queue_family_mask(image, cmd_buffer->qf, cmd_buffer->qf))) {
3893 if (pdev->info.gfx_level >= GFX11) {
3894 cb_fdcc_control &= C_028C78_FDCC_ENABLE;
3895 } else {
3896 cb_color_info &= C_028C70_DCC_ENABLE;
3897 }
3898 }
3899
3900 const enum radv_fmask_compression fmask_comp = radv_layout_fmask_compression(
3901 device, image, layout, radv_image_queue_family_mask(image, cmd_buffer->qf, cmd_buffer->qf));
3902 if (fmask_comp == RADV_FMASK_COMPRESSION_NONE) {
3903 cb_color_info &= C_028C70_COMPRESSION;
3904 }
3905
3906 if (pdev->info.gfx_level >= GFX12) {
3907 radeon_set_context_reg(cmd_buffer->cs, R_028C60_CB_COLOR0_BASE + index * 0x24, cb->ac.cb_color_base);
3908 radeon_set_context_reg(cmd_buffer->cs, R_028C64_CB_COLOR0_VIEW + index * 0x24, cb->ac.cb_color_view);
3909 radeon_set_context_reg(cmd_buffer->cs, R_028C68_CB_COLOR0_VIEW2 + index * 0x24, cb->ac.cb_color_view2);
3910 radeon_set_context_reg(cmd_buffer->cs, R_028C6C_CB_COLOR0_ATTRIB + index * 0x24, cb->ac.cb_color_attrib);
3911 radeon_set_context_reg(cmd_buffer->cs, R_028C70_CB_COLOR0_FDCC_CONTROL + index * 0x24, cb_fdcc_control);
3912 radeon_set_context_reg(cmd_buffer->cs, R_028C78_CB_COLOR0_ATTRIB2 + index * 0x24, cb->ac.cb_color_attrib2);
3913 radeon_set_context_reg(cmd_buffer->cs, R_028C7C_CB_COLOR0_ATTRIB3 + index * 0x24, cb->ac.cb_color_attrib3);
3914 radeon_set_context_reg(cmd_buffer->cs, R_028E40_CB_COLOR0_BASE_EXT + index * 4,
3915 S_028E40_BASE_256B(cb->ac.cb_color_base >> 32));
3916 radeon_set_context_reg(cmd_buffer->cs, R_028EC0_CB_COLOR0_INFO + index * 4, cb->ac.cb_color_info);
3917 } else if (pdev->info.gfx_level >= GFX11) {
3918 radeon_set_context_reg_seq(cmd_buffer->cs, R_028C6C_CB_COLOR0_VIEW + index * 0x3c, 4);
3919 radeon_emit(cmd_buffer->cs, cb->ac.cb_color_view); /* CB_COLOR0_VIEW */
3920 radeon_emit(cmd_buffer->cs, cb->ac.cb_color_info); /* CB_COLOR0_INFO */
3921 radeon_emit(cmd_buffer->cs, cb->ac.cb_color_attrib); /* CB_COLOR0_ATTRIB */
3922 radeon_emit(cmd_buffer->cs, cb_fdcc_control); /* CB_COLOR0_FDCC_CONTROL */
3923
3924 radeon_set_context_reg(cmd_buffer->cs, R_028C60_CB_COLOR0_BASE + index * 0x3c, cb->ac.cb_color_base);
3925 radeon_set_context_reg(cmd_buffer->cs, R_028E40_CB_COLOR0_BASE_EXT + index * 4,
3926 S_028E40_BASE_256B(cb->ac.cb_color_base >> 32));
3927 radeon_set_context_reg(cmd_buffer->cs, R_028C94_CB_COLOR0_DCC_BASE + index * 0x3c, cb->ac.cb_dcc_base);
3928 radeon_set_context_reg(cmd_buffer->cs, R_028EA0_CB_COLOR0_DCC_BASE_EXT + index * 4,
3929 S_028EA0_BASE_256B(cb->ac.cb_dcc_base >> 32));
3930 radeon_set_context_reg(cmd_buffer->cs, R_028EC0_CB_COLOR0_ATTRIB2 + index * 4, cb->ac.cb_color_attrib2);
3931 radeon_set_context_reg(cmd_buffer->cs, R_028EE0_CB_COLOR0_ATTRIB3 + index * 4, cb->ac.cb_color_attrib3);
3932 } else if (pdev->info.gfx_level >= GFX10) {
3933 radeon_set_context_reg_seq(cmd_buffer->cs, R_028C60_CB_COLOR0_BASE + index * 0x3c, 11);
3934 radeon_emit(cmd_buffer->cs, cb->ac.cb_color_base);
3935 radeon_emit(cmd_buffer->cs, 0);
3936 radeon_emit(cmd_buffer->cs, 0);
3937 radeon_emit(cmd_buffer->cs, cb->ac.cb_color_view);
3938 radeon_emit(cmd_buffer->cs, cb_color_info);
3939 radeon_emit(cmd_buffer->cs, cb->ac.cb_color_attrib);
3940 radeon_emit(cmd_buffer->cs, cb->ac.cb_dcc_control);
3941 radeon_emit(cmd_buffer->cs, cb->ac.cb_color_cmask);
3942 radeon_emit(cmd_buffer->cs, 0);
3943 radeon_emit(cmd_buffer->cs, cb->ac.cb_color_fmask);
3944 radeon_emit(cmd_buffer->cs, 0);
3945
3946 radeon_set_context_reg(cmd_buffer->cs, R_028C94_CB_COLOR0_DCC_BASE + index * 0x3c, cb->ac.cb_dcc_base);
3947
3948 radeon_set_context_reg(cmd_buffer->cs, R_028E40_CB_COLOR0_BASE_EXT + index * 4,
3949 S_028E40_BASE_256B(cb->ac.cb_color_base >> 32));
3950 radeon_set_context_reg(cmd_buffer->cs, R_028E60_CB_COLOR0_CMASK_BASE_EXT + index * 4,
3951 S_028E60_BASE_256B(cb->ac.cb_color_cmask >> 32));
3952 radeon_set_context_reg(cmd_buffer->cs, R_028E80_CB_COLOR0_FMASK_BASE_EXT + index * 4,
3953 S_028E80_BASE_256B(cb->ac.cb_color_fmask >> 32));
3954 radeon_set_context_reg(cmd_buffer->cs, R_028EA0_CB_COLOR0_DCC_BASE_EXT + index * 4,
3955 S_028EA0_BASE_256B(cb->ac.cb_dcc_base >> 32));
3956 radeon_set_context_reg(cmd_buffer->cs, R_028EC0_CB_COLOR0_ATTRIB2 + index * 4, cb->ac.cb_color_attrib2);
3957 radeon_set_context_reg(cmd_buffer->cs, R_028EE0_CB_COLOR0_ATTRIB3 + index * 4, cb->ac.cb_color_attrib3);
3958 } else if (pdev->info.gfx_level == GFX9) {
3959 radeon_set_context_reg_seq(cmd_buffer->cs, R_028C60_CB_COLOR0_BASE + index * 0x3c, 11);
3960 radeon_emit(cmd_buffer->cs, cb->ac.cb_color_base);
3961 radeon_emit(cmd_buffer->cs, S_028C64_BASE_256B(cb->ac.cb_color_base >> 32));
3962 radeon_emit(cmd_buffer->cs, cb->ac.cb_color_attrib2);
3963 radeon_emit(cmd_buffer->cs, cb->ac.cb_color_view);
3964 radeon_emit(cmd_buffer->cs, cb_color_info);
3965 radeon_emit(cmd_buffer->cs, cb->ac.cb_color_attrib);
3966 radeon_emit(cmd_buffer->cs, cb->ac.cb_dcc_control);
3967 radeon_emit(cmd_buffer->cs, cb->ac.cb_color_cmask);
3968 radeon_emit(cmd_buffer->cs, S_028C80_BASE_256B(cb->ac.cb_color_cmask >> 32));
3969 radeon_emit(cmd_buffer->cs, cb->ac.cb_color_fmask);
3970 radeon_emit(cmd_buffer->cs, S_028C88_BASE_256B(cb->ac.cb_color_fmask >> 32));
3971
3972 radeon_set_context_reg_seq(cmd_buffer->cs, R_028C94_CB_COLOR0_DCC_BASE + index * 0x3c, 2);
3973 radeon_emit(cmd_buffer->cs, cb->ac.cb_dcc_base);
3974 radeon_emit(cmd_buffer->cs, S_028C98_BASE_256B(cb->ac.cb_dcc_base >> 32));
3975
3976 radeon_set_context_reg(cmd_buffer->cs, R_0287A0_CB_MRT0_EPITCH + index * 4, cb->ac.cb_mrt_epitch);
3977 } else {
3978 radeon_set_context_reg_seq(cmd_buffer->cs, R_028C60_CB_COLOR0_BASE + index * 0x3c, 6);
3979 radeon_emit(cmd_buffer->cs, cb->ac.cb_color_base);
3980 radeon_emit(cmd_buffer->cs, cb->ac.cb_color_pitch);
3981 radeon_emit(cmd_buffer->cs, cb->ac.cb_color_slice);
3982 radeon_emit(cmd_buffer->cs, cb->ac.cb_color_view);
3983 radeon_emit(cmd_buffer->cs, cb_color_info);
3984 radeon_emit(cmd_buffer->cs, cb->ac.cb_color_attrib);
3985
3986 if (pdev->info.gfx_level == GFX8)
3987 radeon_set_context_reg(cmd_buffer->cs, R_028C78_CB_COLOR0_DCC_CONTROL + index * 0x3c, cb->ac.cb_dcc_control);
3988
3989 radeon_set_context_reg_seq(cmd_buffer->cs, R_028C7C_CB_COLOR0_CMASK + index * 0x3c, 4);
3990 radeon_emit(cmd_buffer->cs, cb->ac.cb_color_cmask);
3991 radeon_emit(cmd_buffer->cs, cb->ac.cb_color_cmask_slice);
3992 radeon_emit(cmd_buffer->cs, cb->ac.cb_color_fmask);
3993 radeon_emit(cmd_buffer->cs, cb->ac.cb_color_fmask_slice);
3994
3995 if (is_vi) { /* DCC BASE */
3996 radeon_set_context_reg(cmd_buffer->cs, R_028C94_CB_COLOR0_DCC_BASE + index * 0x3c, cb->ac.cb_dcc_base);
3997 }
3998 }
3999
4000 if (pdev->info.gfx_level >= GFX11 ? G_028C78_FDCC_ENABLE(cb_fdcc_control) : G_028C70_DCC_ENABLE(cb_color_info)) {
4001 /* Drawing with DCC enabled also compresses colorbuffers. */
4002 VkImageSubresourceRange range = {
4003 .aspectMask = iview->vk.aspects,
4004 .baseMipLevel = iview->vk.base_mip_level,
4005 .levelCount = iview->vk.level_count,
4006 .baseArrayLayer = iview->vk.base_array_layer,
4007 .layerCount = iview->vk.layer_count,
4008 };
4009
4010 radv_update_dcc_metadata(cmd_buffer, image, &range, true);
4011 }
4012 }
4013
4014 static void
4015 radv_update_zrange_precision(struct radv_cmd_buffer *cmd_buffer, struct radv_ds_buffer_info *ds,
4016 const struct radv_image_view *iview, bool requires_cond_exec)
4017 {
4018 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
4019 const struct radv_physical_device *pdev = radv_device_physical(device);
4020 const struct radv_image *image = iview->image;
4021 uint32_t db_z_info = ds->ac.db_z_info;
4022 uint32_t db_z_info_reg;
4023
4024 if (!pdev->info.has_tc_compat_zrange_bug || !radv_image_is_tc_compat_htile(image))
4025 return;
4026
4027 db_z_info &= C_028040_ZRANGE_PRECISION;
4028
4029 if (pdev->info.gfx_level == GFX9) {
4030 db_z_info_reg = R_028038_DB_Z_INFO;
4031 } else {
4032 db_z_info_reg = R_028040_DB_Z_INFO;
4033 }
4034
4035 /* When we don't know the last fast clear value we need to emit a
4036 * conditional packet that will eventually skip the following
4037 * SET_CONTEXT_REG packet.
4038 */
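/* The value at this VA is the predicate written by radv_set_tc_compat_zrange_metadata(): UINT_MAX
 * when the last fast depth clear was 0.0f and 0 otherwise. The COND_EXEC skips the following
 * SET_CONTEXT_REG when that value is 0, so the write with ZRANGE_PRECISION cleared only lands when
 * the workaround actually applies.
 */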
4039 if (requires_cond_exec) {
4040 uint64_t va = radv_get_tc_compat_zrange_va(image, iview->vk.base_mip_level);
4041
4042 radv_emit_cond_exec(device, cmd_buffer->cs, va, 3 /* SET_CONTEXT_REG size */);
4043 }
4044
4045 radeon_set_context_reg(cmd_buffer->cs, db_z_info_reg, db_z_info);
4046 }
4047
4048 static struct radv_image *
4049 radv_cmd_buffer_get_vrs_image(struct radv_cmd_buffer *cmd_buffer)
4050 {
4051 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
4052
4053 if (!device->vrs.image) {
4054 VkResult result;
4055
4056 /* The global VRS state is initialized on-demand to avoid wasting VRAM. */
4057 result = radv_device_init_vrs_state(device);
4058 if (result != VK_SUCCESS) {
4059 vk_command_buffer_set_error(&cmd_buffer->vk, result);
4060 return NULL;
4061 }
4062 }
4063
4064 return device->vrs.image;
4065 }
4066
4067 static void
4068 radv_emit_fb_ds_state(struct radv_cmd_buffer *cmd_buffer, struct radv_ds_buffer_info *ds, struct radv_image_view *iview,
4069 bool depth_compressed, bool stencil_compressed)
4070 {
4071 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
4072 const struct radv_physical_device *pdev = radv_device_physical(device);
4073 uint64_t db_htile_data_base = ds->ac.u.gfx6.db_htile_data_base;
4074 uint32_t db_htile_surface = ds->ac.u.gfx6.db_htile_surface;
4075 uint32_t db_render_control = ds->db_render_control | cmd_buffer->state.db_render_control;
4076 uint32_t db_z_info = ds->ac.db_z_info;
4077
4078 if (!depth_compressed)
4079 db_render_control |= S_028000_DEPTH_COMPRESS_DISABLE(1);
4080 if (!stencil_compressed)
4081 db_render_control |= S_028000_STENCIL_COMPRESS_DISABLE(1);
4082
4083 if (pdev->info.gfx_level == GFX10_3) {
4084 if (!cmd_buffer->state.render.vrs_att.iview) {
4085 db_htile_surface &= C_028ABC_VRS_HTILE_ENCODING;
4086 } else {
4087 /* On GFX10.3, when a subpass uses a VRS attachment but HTILE can't be enabled, we fall back to
4088 * our internal HTILE buffer.
4089 */
4090 if (!radv_htile_enabled(iview->image, iview->vk.base_mip_level) && radv_cmd_buffer_get_vrs_image(cmd_buffer)) {
4091 struct radv_buffer *htile_buffer = device->vrs.buffer;
4092
4093 assert(!G_028038_TILE_SURFACE_ENABLE(db_z_info) && !db_htile_data_base && !db_htile_surface);
4094 db_z_info |= S_028038_TILE_SURFACE_ENABLE(1);
4095 db_htile_data_base = radv_buffer_get_va(htile_buffer->bo) >> 8;
4096 db_htile_surface = S_028ABC_FULL_CACHE(1) | S_028ABC_PIPE_ALIGNED(1) |
4097 S_028ABC_VRS_HTILE_ENCODING(V_028ABC_VRS_HTILE_4BIT_ENCODING);
4098 }
4099 }
4100 }
4101
4102 if (pdev->info.gfx_level < GFX12) {
4103 radeon_set_context_reg(cmd_buffer->cs, R_028000_DB_RENDER_CONTROL, db_render_control);
4104 radeon_set_context_reg(cmd_buffer->cs, R_028008_DB_DEPTH_VIEW, ds->ac.db_depth_view);
4105 radeon_set_context_reg(cmd_buffer->cs, R_028ABC_DB_HTILE_SURFACE, db_htile_surface);
4106 }
4107
4108 radeon_set_context_reg(cmd_buffer->cs, R_028010_DB_RENDER_OVERRIDE2, ds->db_render_override2);
4109
4110 if (pdev->info.gfx_level >= GFX12) {
4111 radeon_set_context_reg(cmd_buffer->cs, R_028004_DB_DEPTH_VIEW, ds->ac.db_depth_view);
4112 radeon_set_context_reg(cmd_buffer->cs, R_028008_DB_DEPTH_VIEW1, ds->ac.u.gfx12.db_depth_view1);
4113 radeon_set_context_reg(cmd_buffer->cs, R_028014_DB_DEPTH_SIZE_XY, ds->ac.db_depth_size);
4114 radeon_set_context_reg(cmd_buffer->cs, R_028018_DB_Z_INFO, ds->ac.db_z_info);
4115 radeon_set_context_reg(cmd_buffer->cs, R_02801C_DB_STENCIL_INFO, ds->ac.db_stencil_info);
4116 radeon_set_context_reg(cmd_buffer->cs, R_028020_DB_Z_READ_BASE, ds->ac.db_depth_base);
4117 radeon_set_context_reg(cmd_buffer->cs, R_028024_DB_Z_READ_BASE_HI, S_028024_BASE_HI(ds->ac.db_depth_base >> 32));
4118 radeon_set_context_reg(cmd_buffer->cs, R_028028_DB_Z_WRITE_BASE, ds->ac.db_depth_base);
4119 radeon_set_context_reg(cmd_buffer->cs, R_02802C_DB_Z_WRITE_BASE_HI, S_02802C_BASE_HI(ds->ac.db_depth_base >> 32));
4120 radeon_set_context_reg(cmd_buffer->cs, R_028030_DB_STENCIL_READ_BASE, ds->ac.db_stencil_base);
4121 radeon_set_context_reg(cmd_buffer->cs, R_028034_DB_STENCIL_READ_BASE_HI,
4122 S_028034_BASE_HI(ds->ac.db_stencil_base >> 32));
4123 radeon_set_context_reg(cmd_buffer->cs, R_028038_DB_STENCIL_WRITE_BASE, ds->ac.db_stencil_base);
4124 radeon_set_context_reg(cmd_buffer->cs, R_02803C_DB_STENCIL_WRITE_BASE_HI,
4125 S_02803C_BASE_HI(ds->ac.db_stencil_base >> 32));
4126 radeon_set_context_reg(cmd_buffer->cs, R_028B94_PA_SC_HIZ_INFO, ds->ac.u.gfx12.hiz_info);
4127 radeon_set_context_reg(cmd_buffer->cs, R_028B98_PA_SC_HIS_INFO, ds->ac.u.gfx12.his_info);
4128
4129 if (ds->ac.u.gfx12.hiz_info) {
4130 radeon_set_context_reg(cmd_buffer->cs, R_028B9C_PA_SC_HIZ_BASE, ds->ac.u.gfx12.hiz_base);
4131 radeon_set_context_reg(cmd_buffer->cs, R_028BA0_PA_SC_HIZ_BASE_EXT,
4132 S_028BA0_BASE_256B(ds->ac.u.gfx12.hiz_base >> 32));
4133 radeon_set_context_reg(cmd_buffer->cs, R_028BA4_PA_SC_HIZ_SIZE_XY, ds->ac.u.gfx12.hiz_size_xy);
4134 }
4135 if (ds->ac.u.gfx12.his_info) {
4136 radeon_set_context_reg(cmd_buffer->cs, R_028BA8_PA_SC_HIS_BASE, ds->ac.u.gfx12.his_base);
4137 radeon_set_context_reg(cmd_buffer->cs, R_028BAC_PA_SC_HIS_BASE_EXT,
4138 S_028BAC_BASE_256B(ds->ac.u.gfx12.his_base >> 32));
4139 radeon_set_context_reg(cmd_buffer->cs, R_028BB0_PA_SC_HIS_SIZE_XY, ds->ac.u.gfx12.his_size_xy);
4140 }
4141 } else if (pdev->info.gfx_level >= GFX10) {
4142 radeon_set_context_reg(cmd_buffer->cs, R_028014_DB_HTILE_DATA_BASE, db_htile_data_base);
4143 radeon_set_context_reg(cmd_buffer->cs, R_02801C_DB_DEPTH_SIZE_XY, ds->ac.db_depth_size);
4144
4145 if (pdev->info.gfx_level >= GFX11) {
4146 radeon_set_context_reg_seq(cmd_buffer->cs, R_028040_DB_Z_INFO, 6);
4147 } else {
4148 radeon_set_context_reg_seq(cmd_buffer->cs, R_02803C_DB_DEPTH_INFO, 7);
4149 radeon_emit(cmd_buffer->cs, S_02803C_RESOURCE_LEVEL(1));
4150 }
4151 radeon_emit(cmd_buffer->cs, db_z_info);
4152 radeon_emit(cmd_buffer->cs, ds->ac.db_stencil_info);
4153 radeon_emit(cmd_buffer->cs, ds->ac.db_depth_base);
4154 radeon_emit(cmd_buffer->cs, ds->ac.db_stencil_base);
4155 radeon_emit(cmd_buffer->cs, ds->ac.db_depth_base);
4156 radeon_emit(cmd_buffer->cs, ds->ac.db_stencil_base);
4157
4158 radeon_set_context_reg_seq(cmd_buffer->cs, R_028068_DB_Z_READ_BASE_HI, 5);
4159 radeon_emit(cmd_buffer->cs, S_028068_BASE_HI(ds->ac.db_depth_base >> 32));
4160 radeon_emit(cmd_buffer->cs, S_02806C_BASE_HI(ds->ac.db_stencil_base >> 32));
4161 radeon_emit(cmd_buffer->cs, S_028070_BASE_HI(ds->ac.db_depth_base >> 32));
4162 radeon_emit(cmd_buffer->cs, S_028074_BASE_HI(ds->ac.db_stencil_base >> 32));
4163 radeon_emit(cmd_buffer->cs, S_028078_BASE_HI(db_htile_data_base >> 32));
4164 } else if (pdev->info.gfx_level == GFX9) {
4165 radeon_set_context_reg_seq(cmd_buffer->cs, R_028014_DB_HTILE_DATA_BASE, 3);
4166 radeon_emit(cmd_buffer->cs, db_htile_data_base);
4167 radeon_emit(cmd_buffer->cs, S_028018_BASE_HI(db_htile_data_base >> 32));
4168 radeon_emit(cmd_buffer->cs, ds->ac.db_depth_size);
4169
4170 radeon_set_context_reg_seq(cmd_buffer->cs, R_028038_DB_Z_INFO, 10);
4171 radeon_emit(cmd_buffer->cs, db_z_info); /* DB_Z_INFO */
4172 radeon_emit(cmd_buffer->cs, ds->ac.db_stencil_info); /* DB_STENCIL_INFO */
4173 radeon_emit(cmd_buffer->cs, ds->ac.db_depth_base); /* DB_Z_READ_BASE */
4174 radeon_emit(cmd_buffer->cs, S_028044_BASE_HI(ds->ac.db_depth_base >> 32)); /* DB_Z_READ_BASE_HI */
4175 radeon_emit(cmd_buffer->cs, ds->ac.db_stencil_base); /* DB_STENCIL_READ_BASE */
4176 radeon_emit(cmd_buffer->cs, S_02804C_BASE_HI(ds->ac.db_stencil_base >> 32)); /* DB_STENCIL_READ_BASE_HI */
4177 radeon_emit(cmd_buffer->cs, ds->ac.db_depth_base); /* DB_Z_WRITE_BASE */
4178 radeon_emit(cmd_buffer->cs, S_028054_BASE_HI(ds->ac.db_depth_base >> 32)); /* DB_Z_WRITE_BASE_HI */
4179 radeon_emit(cmd_buffer->cs, ds->ac.db_stencil_base); /* DB_STENCIL_WRITE_BASE */
4180 radeon_emit(cmd_buffer->cs, S_02805C_BASE_HI(ds->ac.db_stencil_base >> 32)); /* DB_STENCIL_WRITE_BASE_HI */
4181
4182 radeon_set_context_reg_seq(cmd_buffer->cs, R_028068_DB_Z_INFO2, 2);
4183 radeon_emit(cmd_buffer->cs, ds->ac.u.gfx6.db_z_info2);
4184 radeon_emit(cmd_buffer->cs, ds->ac.u.gfx6.db_stencil_info2);
4185 } else {
4186 radeon_set_context_reg(cmd_buffer->cs, R_028014_DB_HTILE_DATA_BASE, db_htile_data_base);
4187
4188 radeon_set_context_reg_seq(cmd_buffer->cs, R_02803C_DB_DEPTH_INFO, 9);
4189 radeon_emit(cmd_buffer->cs, ds->ac.u.gfx6.db_depth_info); /* R_02803C_DB_DEPTH_INFO */
4190 radeon_emit(cmd_buffer->cs, db_z_info); /* R_028040_DB_Z_INFO */
4191 radeon_emit(cmd_buffer->cs, ds->ac.db_stencil_info); /* R_028044_DB_STENCIL_INFO */
4192 radeon_emit(cmd_buffer->cs, ds->ac.db_depth_base); /* R_028048_DB_Z_READ_BASE */
4193 radeon_emit(cmd_buffer->cs, ds->ac.db_stencil_base); /* R_02804C_DB_STENCIL_READ_BASE */
4194 radeon_emit(cmd_buffer->cs, ds->ac.db_depth_base); /* R_028050_DB_Z_WRITE_BASE */
4195 radeon_emit(cmd_buffer->cs, ds->ac.db_stencil_base); /* R_028054_DB_STENCIL_WRITE_BASE */
4196 radeon_emit(cmd_buffer->cs, ds->ac.db_depth_size); /* R_028058_DB_DEPTH_SIZE */
4197 radeon_emit(cmd_buffer->cs, ds->ac.u.gfx6.db_depth_slice); /* R_02805C_DB_DEPTH_SLICE */
4198 }
4199
4200 /* Update the ZRANGE_PRECISION value for the TC-compat bug. */
4201 radv_update_zrange_precision(cmd_buffer, ds, iview, true);
4202 }
4203
4204 static void
4205 radv_emit_null_ds_state(struct radv_cmd_buffer *cmd_buffer)
4206 {
4207 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
4208 const struct radv_physical_device *pdev = radv_device_physical(device);
4209 const enum amd_gfx_level gfx_level = pdev->info.gfx_level;
4210
4211 if (pdev->info.gfx_level >= GFX12) {
4212 radeon_set_context_reg_seq(cmd_buffer->cs, R_028018_DB_Z_INFO, 2);
4213 radeon_emit(cmd_buffer->cs, S_028018_FORMAT(V_028018_Z_INVALID) | S_028018_NUM_SAMPLES(3));
4214 radeon_emit(cmd_buffer->cs, S_02801C_FORMAT(V_02801C_STENCIL_INVALID) | S_02801C_TILE_STENCIL_DISABLE(1));
4215
4216 radeon_set_context_reg(cmd_buffer->cs, R_028B94_PA_SC_HIZ_INFO, S_028B94_SURFACE_ENABLE(0));
4217 radeon_set_context_reg(cmd_buffer->cs, R_028B98_PA_SC_HIS_INFO, S_028B98_SURFACE_ENABLE(0));
4218 } else {
4219 if (gfx_level == GFX9) {
4220 radeon_set_context_reg_seq(cmd_buffer->cs, R_028038_DB_Z_INFO, 2);
4221 } else {
4222 radeon_set_context_reg_seq(cmd_buffer->cs, R_028040_DB_Z_INFO, 2);
4223 }
4224
4225 /* On GFX11+, the hw intentionally looks at DB_Z_INFO.NUM_SAMPLES when there is no bound
4226 * depth/stencil buffer and it clamps the number of samples like MIN2(DB_Z_INFO.NUM_SAMPLES,
4227 * PA_SC_AA_CONFIG.MSAA_EXPOSED_SAMPLES). Use 8x for DB_Z_INFO.NUM_SAMPLES to make sure it's not
4228 * the constraining factor. This affects VRS, occlusion queries and POPS.
4229 */
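/* Note: NUM_SAMPLES is log2-encoded, so the value 3 below selects 8x. */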
4230 radeon_emit(cmd_buffer->cs,
4231 S_028040_FORMAT(V_028040_Z_INVALID) | S_028040_NUM_SAMPLES(pdev->info.gfx_level >= GFX11 ? 3 : 0));
4232 radeon_emit(cmd_buffer->cs, S_028044_FORMAT(V_028044_STENCIL_INVALID));
4233 uint32_t db_render_control = 0;
4234
4235 if (gfx_level == GFX11 || gfx_level == GFX11_5)
4236 radv_gfx11_set_db_render_control(device, 1, &db_render_control);
4237
4238 radeon_set_context_reg(cmd_buffer->cs, R_028000_DB_RENDER_CONTROL, db_render_control);
4239 }
4240
4241 radeon_set_context_reg(cmd_buffer->cs, R_028010_DB_RENDER_OVERRIDE2,
4242 S_028010_CENTROID_COMPUTATION_MODE(gfx_level >= GFX10_3));
4243 }
4244 /**
4245 * Update the fast clear depth/stencil values if the image is bound as a
4246 * depth/stencil buffer.
4247 */
4248 static void
4249 radv_update_bound_fast_clear_ds(struct radv_cmd_buffer *cmd_buffer, const struct radv_image_view *iview,
4250 VkClearDepthStencilValue ds_clear_value, VkImageAspectFlags aspects)
4251 {
4252 const struct radv_image *image = iview->image;
4253 struct radeon_cmdbuf *cs = cmd_buffer->cs;
4254
4255 if (cmd_buffer->state.render.ds_att.iview == NULL || cmd_buffer->state.render.ds_att.iview->image != image)
4256 return;
4257
4258 if (aspects == (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)) {
4259 radeon_set_context_reg_seq(cs, R_028028_DB_STENCIL_CLEAR, 2);
4260 radeon_emit(cs, ds_clear_value.stencil);
4261 radeon_emit(cs, fui(ds_clear_value.depth));
4262 } else if (aspects == VK_IMAGE_ASPECT_DEPTH_BIT) {
4263 radeon_set_context_reg(cs, R_02802C_DB_DEPTH_CLEAR, fui(ds_clear_value.depth));
4264 } else {
4265 assert(aspects == VK_IMAGE_ASPECT_STENCIL_BIT);
4266 radeon_set_context_reg(cs, R_028028_DB_STENCIL_CLEAR, ds_clear_value.stencil);
4267 }
4268
4269 /* Update the ZRANGE_PRECISION value for the TC-compat bug. This is
4270 * only needed when clearing Z to 0.0.
4271 */
4272 if ((aspects & VK_IMAGE_ASPECT_DEPTH_BIT) && ds_clear_value.depth == 0.0) {
4273 radv_update_zrange_precision(cmd_buffer, &cmd_buffer->state.render.ds_att.ds, iview, false);
4274 }
4275
4276 cmd_buffer->state.context_roll_without_scissor_emitted = true;
4277 }
4278
4279 /**
4280 * Set the clear depth/stencil values to the image's metadata.
4281 */
4282 static void
4283 radv_set_ds_clear_metadata(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
4284 const VkImageSubresourceRange *range, VkClearDepthStencilValue ds_clear_value,
4285 VkImageAspectFlags aspects)
4286 {
4287 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
4288 struct radeon_cmdbuf *cs = cmd_buffer->cs;
4289 uint32_t level_count = vk_image_subresource_level_count(&image->vk, range);
4290
4291 if (aspects == (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)) {
4292 uint64_t va = radv_get_ds_clear_value_va(image, range->baseMipLevel);
4293
4294 /* Use the fastest way when both aspects are used. */
4295 ASSERTED unsigned cdw_end = radv_cs_write_data_head(device, cmd_buffer->cs, cmd_buffer->qf, V_370_PFP, va,
4296 2 * level_count, cmd_buffer->state.predicating);
4297
4298 for (uint32_t l = 0; l < level_count; l++) {
4299 radeon_emit(cs, ds_clear_value.stencil);
4300 radeon_emit(cs, fui(ds_clear_value.depth));
4301 }
4302
4303 assert(cmd_buffer->cs->cdw == cdw_end);
4304 } else {
4305 /* Otherwise we need one WRITE_DATA packet per level. */
4306 for (uint32_t l = 0; l < level_count; l++) {
4307 uint64_t va = radv_get_ds_clear_value_va(image, range->baseMipLevel + l);
4308 unsigned value;
4309
4310 if (aspects == VK_IMAGE_ASPECT_DEPTH_BIT) {
4311 value = fui(ds_clear_value.depth);
4312 va += 4;
4313 } else {
4314 assert(aspects == VK_IMAGE_ASPECT_STENCIL_BIT);
4315 value = ds_clear_value.stencil;
4316 }
4317
4318 radv_write_data(cmd_buffer, V_370_PFP, va, 1, &value, cmd_buffer->state.predicating);
4319 }
4320 }
4321 }
4322
4323 /**
4324 * Update the TC-compat metadata value for this image.
4325 */
4326 static void
4327 radv_set_tc_compat_zrange_metadata(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
4328 const VkImageSubresourceRange *range, uint32_t value)
4329 {
4330 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
4331 const struct radv_physical_device *pdev = radv_device_physical(device);
4332 struct radeon_cmdbuf *cs = cmd_buffer->cs;
4333
4334 if (!pdev->info.has_tc_compat_zrange_bug)
4335 return;
4336
4337 uint64_t va = radv_get_tc_compat_zrange_va(image, range->baseMipLevel);
4338 uint32_t level_count = vk_image_subresource_level_count(&image->vk, range);
4339
4340 ASSERTED unsigned cdw_end = radv_cs_write_data_head(device, cmd_buffer->cs, cmd_buffer->qf, V_370_PFP, va,
4341 level_count, cmd_buffer->state.predicating);
4342
4343 for (uint32_t l = 0; l < level_count; l++)
4344 radeon_emit(cs, value);
4345
4346 assert(cmd_buffer->cs->cdw == cdw_end);
4347 }
4348
4349 static void
4350 radv_update_tc_compat_zrange_metadata(struct radv_cmd_buffer *cmd_buffer, const struct radv_image_view *iview,
4351 VkClearDepthStencilValue ds_clear_value)
4352 {
4353 VkImageSubresourceRange range = {
4354 .aspectMask = iview->vk.aspects,
4355 .baseMipLevel = iview->vk.base_mip_level,
4356 .levelCount = iview->vk.level_count,
4357 .baseArrayLayer = iview->vk.base_array_layer,
4358 .layerCount = iview->vk.layer_count,
4359 };
4360 uint32_t cond_val;
4361
4362 /* Conditionally set DB_Z_INFO.ZRANGE_PRECISION to 0 when the last
4363 * depth clear value is 0.0f.
4364 */
4365 cond_val = ds_clear_value.depth == 0.0f ? UINT_MAX : 0;
4366
4367 radv_set_tc_compat_zrange_metadata(cmd_buffer, iview->image, &range, cond_val);
4368 }
4369
4370 /**
4371 * Update the clear depth/stencil values for this image.
4372 */
4373 void
4374 radv_update_ds_clear_metadata(struct radv_cmd_buffer *cmd_buffer, const struct radv_image_view *iview,
4375 VkClearDepthStencilValue ds_clear_value, VkImageAspectFlags aspects)
4376 {
4377 VkImageSubresourceRange range = {
4378 .aspectMask = iview->vk.aspects,
4379 .baseMipLevel = iview->vk.base_mip_level,
4380 .levelCount = iview->vk.level_count,
4381 .baseArrayLayer = iview->vk.base_array_layer,
4382 .layerCount = iview->vk.layer_count,
4383 };
4384 struct radv_image *image = iview->image;
4385
4386 assert(radv_htile_enabled(image, range.baseMipLevel));
4387
4388 radv_set_ds_clear_metadata(cmd_buffer, iview->image, &range, ds_clear_value, aspects);
4389
4390 if (radv_image_is_tc_compat_htile(image) && (aspects & VK_IMAGE_ASPECT_DEPTH_BIT)) {
4391 radv_update_tc_compat_zrange_metadata(cmd_buffer, iview, ds_clear_value);
4392 }
4393
4394 radv_update_bound_fast_clear_ds(cmd_buffer, iview, ds_clear_value, aspects);
4395 }
4396
4397 /**
4398 * Load the clear depth/stencil values from the image's metadata.
4399 */
4400 static void
4401 radv_load_ds_clear_metadata(struct radv_cmd_buffer *cmd_buffer, const struct radv_image_view *iview)
4402 {
4403 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
4404 const struct radv_physical_device *pdev = radv_device_physical(device);
4405 struct radeon_cmdbuf *cs = cmd_buffer->cs;
4406 const struct radv_image *image = iview->image;
4407 VkImageAspectFlags aspects = vk_format_aspects(image->vk.format);
4408 uint64_t va = radv_get_ds_clear_value_va(image, iview->vk.base_mip_level);
4409 unsigned reg_offset = 0, reg_count = 0;
4410
4411 assert(radv_image_has_htile(image));
4412
4413 if (aspects & VK_IMAGE_ASPECT_STENCIL_BIT) {
4414 ++reg_count;
4415 } else {
4416 ++reg_offset;
4417 va += 4;
4418 }
4419 if (aspects & VK_IMAGE_ASPECT_DEPTH_BIT)
4420 ++reg_count;
4421
4422 uint32_t reg = R_028028_DB_STENCIL_CLEAR + 4 * reg_offset;
4423
4424 if (pdev->info.has_load_ctx_reg_pkt) {
4425 radeon_emit(cs, PKT3(PKT3_LOAD_CONTEXT_REG_INDEX, 3, 0));
4426 radeon_emit(cs, va);
4427 radeon_emit(cs, va >> 32);
4428 radeon_emit(cs, (reg - SI_CONTEXT_REG_OFFSET) >> 2);
4429 radeon_emit(cs, reg_count);
4430 } else {
4431 radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
4432 radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) | COPY_DATA_DST_SEL(COPY_DATA_REG) |
4433 (reg_count == 2 ? COPY_DATA_COUNT_SEL : 0));
4434 radeon_emit(cs, va);
4435 radeon_emit(cs, va >> 32);
4436 radeon_emit(cs, reg >> 2);
4437 radeon_emit(cs, 0);
4438
4439 radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
4440 radeon_emit(cs, 0);
4441 }
4442 }
4443
4444 /*
4445 * With DCC, some colors don't require CMASK elimination before being
4446 * used as a texture. This sets a predicate value that determines whether
4447 * the CMASK eliminate pass is required.
4448 */
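/* A sketch of how this is consumed (inferred from the 64-bit writes below): one 64-bit value is
 * written per mip level so it can later serve as a GPU predicate, letting the eliminate pass be
 * skipped when the image was not fast cleared.
 */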
4449 void
4450 radv_update_fce_metadata(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
4451 const VkImageSubresourceRange *range, bool value)
4452 {
4453 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
4454
4455 if (!image->fce_pred_offset)
4456 return;
4457
4458 uint64_t pred_val = value;
4459 uint64_t va = radv_image_get_fce_pred_va(image, range->baseMipLevel);
4460 uint32_t level_count = vk_image_subresource_level_count(&image->vk, range);
4461
4462 ASSERTED unsigned cdw_end =
4463 radv_cs_write_data_head(device, cmd_buffer->cs, cmd_buffer->qf, V_370_PFP, va, 2 * level_count, false);
4464
4465 for (uint32_t l = 0; l < level_count; l++) {
4466 radeon_emit(cmd_buffer->cs, pred_val);
4467 radeon_emit(cmd_buffer->cs, pred_val >> 32);
4468 }
4469
4470 assert(cmd_buffer->cs->cdw == cdw_end);
4471 }
4472
4473 /**
4474 * Update the DCC predicate to reflect the compression state.
4475 */
4476 void
4477 radv_update_dcc_metadata(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
4478 const VkImageSubresourceRange *range, bool value)
4479 {
4480 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
4481
4482 if (image->dcc_pred_offset == 0)
4483 return;
4484
4485 uint64_t pred_val = value;
4486 uint64_t va = radv_image_get_dcc_pred_va(image, range->baseMipLevel);
4487 uint32_t level_count = vk_image_subresource_level_count(&image->vk, range);
4488
4489 assert(radv_dcc_enabled(image, range->baseMipLevel));
4490
4491 ASSERTED unsigned cdw_end =
4492 radv_cs_write_data_head(device, cmd_buffer->cs, cmd_buffer->qf, V_370_PFP, va, 2 * level_count, false);
4493
4494 for (uint32_t l = 0; l < level_count; l++) {
4495 radeon_emit(cmd_buffer->cs, pred_val);
4496 radeon_emit(cmd_buffer->cs, pred_val >> 32);
4497 }
4498
4499 assert(cmd_buffer->cs->cdw == cdw_end);
4500 }
4501
4502 /**
4503 * Update the fast clear color values if the image is bound as a color buffer.
4504 */
4505 static void
4506 radv_update_bound_fast_clear_color(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image, int cb_idx,
4507 uint32_t color_values[2])
4508 {
4509 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
4510 struct radeon_cmdbuf *cs = cmd_buffer->cs;
4511
4512 if (cb_idx >= cmd_buffer->state.render.color_att_count || cmd_buffer->state.render.color_att[cb_idx].iview == NULL ||
4513 cmd_buffer->state.render.color_att[cb_idx].iview->image != image)
4514 return;
4515
4516 ASSERTED unsigned cdw_max = radeon_check_space(device->ws, cmd_buffer->cs, 4);
4517
4518 radeon_set_context_reg_seq(cs, R_028C8C_CB_COLOR0_CLEAR_WORD0 + cb_idx * 0x3c, 2);
4519 radeon_emit(cs, color_values[0]);
4520 radeon_emit(cs, color_values[1]);
4521
4522 assert(cmd_buffer->cs->cdw <= cdw_max);
4523
4524 cmd_buffer->state.context_roll_without_scissor_emitted = true;
4525 }
4526
4527 /**
4528 * Set the clear color values to the image's metadata.
4529 */
4530 static void
4531 radv_set_color_clear_metadata(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
4532 const VkImageSubresourceRange *range, uint32_t color_values[2])
4533 {
4534 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
4535 struct radeon_cmdbuf *cs = cmd_buffer->cs;
4536 uint32_t level_count = vk_image_subresource_level_count(&image->vk, range);
4537
4538 assert(radv_image_has_cmask(image) || radv_dcc_enabled(image, range->baseMipLevel));
4539
4540 if (radv_image_has_clear_value(image)) {
4541 uint64_t va = radv_image_get_fast_clear_va(image, range->baseMipLevel);
4542
4543 ASSERTED unsigned cdw_end = radv_cs_write_data_head(device, cmd_buffer->cs, cmd_buffer->qf, V_370_PFP, va,
4544 2 * level_count, cmd_buffer->state.predicating);
4545
4546 for (uint32_t l = 0; l < level_count; l++) {
4547 radeon_emit(cs, color_values[0]);
4548 radeon_emit(cs, color_values[1]);
4549 }
4550
4551 assert(cmd_buffer->cs->cdw == cdw_end);
4552 } else {
4553 /* No clear value metadata; only the default value (0) is allowed here. */
4554 assert(color_values[0] == 0 && color_values[1] == 0);
4555 }
4556 }
4557
4558 /**
4559 * Update the clear color values for this image.
4560 */
4561 void
4562 radv_update_color_clear_metadata(struct radv_cmd_buffer *cmd_buffer, const struct radv_image_view *iview, int cb_idx,
4563 uint32_t color_values[2])
4564 {
4565 struct radv_image *image = iview->image;
4566 VkImageSubresourceRange range = {
4567 .aspectMask = iview->vk.aspects,
4568 .baseMipLevel = iview->vk.base_mip_level,
4569 .levelCount = iview->vk.level_count,
4570 .baseArrayLayer = iview->vk.base_array_layer,
4571 .layerCount = iview->vk.layer_count,
4572 };
4573
4574 assert(radv_image_has_cmask(image) || radv_dcc_enabled(image, iview->vk.base_mip_level));
4575
4576 /* There is no need to update the clear value for images that are fast cleared with the comp-to-single
4577 * mode because the hardware gets the value from the image directly.
4578 */
4579 if (iview->image->support_comp_to_single)
4580 return;
4581
4582 radv_set_color_clear_metadata(cmd_buffer, image, &range, color_values);
4583
4584 radv_update_bound_fast_clear_color(cmd_buffer, image, cb_idx, color_values);
4585 }
4586
4587 /**
4588 * Load the clear color values from the image's metadata.
4589 */
4590 static void
4591 radv_load_color_clear_metadata(struct radv_cmd_buffer *cmd_buffer, struct radv_image_view *iview, int cb_idx)
4592 {
4593 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
4594 const struct radv_physical_device *pdev = radv_device_physical(device);
4595 struct radeon_cmdbuf *cs = cmd_buffer->cs;
4596 struct radv_image *image = iview->image;
4597
4598 if (!radv_image_has_cmask(image) && !radv_dcc_enabled(image, iview->vk.base_mip_level))
4599 return;
4600
4601 if (iview->image->support_comp_to_single)
4602 return;
4603
4604 if (!radv_image_has_clear_value(image)) {
4605 uint32_t color_values[2] = {0, 0};
4606 radv_update_bound_fast_clear_color(cmd_buffer, image, cb_idx, color_values);
4607 return;
4608 }
4609
4610 uint64_t va = radv_image_get_fast_clear_va(image, iview->vk.base_mip_level);
4611 uint32_t reg = R_028C8C_CB_COLOR0_CLEAR_WORD0 + cb_idx * 0x3c;
4612
4613 if (pdev->info.has_load_ctx_reg_pkt) {
4614 radeon_emit(cs, PKT3(PKT3_LOAD_CONTEXT_REG_INDEX, 3, cmd_buffer->state.predicating));
4615 radeon_emit(cs, va);
4616 radeon_emit(cs, va >> 32);
4617 radeon_emit(cs, (reg - SI_CONTEXT_REG_OFFSET) >> 2);
4618 radeon_emit(cs, 2);
4619 } else {
4620 radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, cmd_buffer->state.predicating));
4621 radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) | COPY_DATA_DST_SEL(COPY_DATA_REG) | COPY_DATA_COUNT_SEL);
4622 radeon_emit(cs, va);
4623 radeon_emit(cs, va >> 32);
4624 radeon_emit(cs, reg >> 2);
4625 radeon_emit(cs, 0);
4626
4627 radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, cmd_buffer->state.predicating));
4628 radeon_emit(cs, 0);
4629 }
4630 }
4631
4632 /* GFX9+ metadata cache flushing workaround. Metadata cache coherency is
4633 * broken if the CB caches data of multiple mips of the same image at the
4634 * same time.
4635 *
4636 * Insert some flushes to avoid this.
4637 */
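/* Concretely, the checks below flush and invalidate CB metadata whenever the bound mip level of a
 * color attachment using CMASK/DCC changes, and DB metadata whenever the depth attachment's HTILE
 * mip changes.
 */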
4638 static void
4639 radv_emit_fb_mip_change_flush(struct radv_cmd_buffer *cmd_buffer)
4640 {
4641 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
4642 const struct radv_physical_device *pdev = radv_device_physical(device);
4643 struct radv_rendering_state *render = &cmd_buffer->state.render;
4644 bool color_mip_changed = false;
4645
4646 /* Entire workaround is not applicable before GFX9 */
4647 if (pdev->info.gfx_level < GFX9)
4648 return;
4649
4650 for (int i = 0; i < render->color_att_count; ++i) {
4651 struct radv_image_view *iview = render->color_att[i].iview;
4652 if (!iview)
4653 continue;
4654
4655 if ((radv_image_has_cmask(iview->image) || radv_dcc_enabled(iview->image, iview->vk.base_mip_level) ||
4656 radv_dcc_enabled(iview->image, cmd_buffer->state.cb_mip[i])) &&
4657 cmd_buffer->state.cb_mip[i] != iview->vk.base_mip_level)
4658 color_mip_changed = true;
4659
4660 cmd_buffer->state.cb_mip[i] = iview->vk.base_mip_level;
4661 }
4662
4663 if (color_mip_changed) {
4664 cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB | RADV_CMD_FLAG_FLUSH_AND_INV_CB_META;
4665 }
4666
4667 const struct radv_image_view *iview = render->ds_att.iview;
4668 if (iview) {
4669 if ((radv_htile_enabled(iview->image, iview->vk.base_mip_level) ||
4670 radv_htile_enabled(iview->image, cmd_buffer->state.ds_mip)) &&
4671 cmd_buffer->state.ds_mip != iview->vk.base_mip_level) {
4672 cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB | RADV_CMD_FLAG_FLUSH_AND_INV_DB_META;
4673 }
4674
4675 cmd_buffer->state.ds_mip = iview->vk.base_mip_level;
4676 }
4677 }
4678
4679 /* This function does the flushes for mip changes if the levels are not zero for
4680 * all render targets. This way we can assume at the start of the next cmd_buffer
4681 * that rendering to mip 0 doesn't need any flushes. Since that is the most common
4682 * case, this saves some flushes. */
4683 static void
4684 radv_emit_mip_change_flush_default(struct radv_cmd_buffer *cmd_buffer)
4685 {
4686 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
4687 const struct radv_physical_device *pdev = radv_device_physical(device);
4688
4689 /* Entire workaround is not applicable before GFX9 */
4690 if (pdev->info.gfx_level < GFX9)
4691 return;
4692
4693 bool need_color_mip_flush = false;
4694 for (unsigned i = 0; i < 8; ++i) {
4695 if (cmd_buffer->state.cb_mip[i]) {
4696 need_color_mip_flush = true;
4697 break;
4698 }
4699 }
4700
4701 if (need_color_mip_flush) {
4702 cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB | RADV_CMD_FLAG_FLUSH_AND_INV_CB_META;
4703 }
4704
4705 if (cmd_buffer->state.ds_mip) {
4706 cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB | RADV_CMD_FLAG_FLUSH_AND_INV_DB_META;
4707 }
4708
4709 memset(cmd_buffer->state.cb_mip, 0, sizeof(cmd_buffer->state.cb_mip));
4710 cmd_buffer->state.ds_mip = 0;
4711 }
4712
4713 static void
4714 radv_emit_framebuffer_state(struct radv_cmd_buffer *cmd_buffer)
4715 {
4716 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
4717 const struct radv_physical_device *pdev = radv_device_physical(device);
4718 struct radv_rendering_state *render = &cmd_buffer->state.render;
4719 int i;
4720 bool disable_constant_encode_ac01 = false;
4721 unsigned color_invalid = pdev->info.gfx_level >= GFX12 ? S_028EC0_FORMAT(V_028EC0_COLOR_INVALID)
4722 : pdev->info.gfx_level >= GFX11 ? S_028C70_FORMAT_GFX11(V_028C70_COLOR_INVALID)
4723 : S_028C70_FORMAT_GFX6(V_028C70_COLOR_INVALID);
4724 VkExtent2D extent = {MAX_FRAMEBUFFER_WIDTH, MAX_FRAMEBUFFER_HEIGHT};
4725
4726 ASSERTED unsigned cdw_max = radeon_check_space(device->ws, cmd_buffer->cs, 51 + MAX_RTS * 70);
4727
4728 for (i = 0; i < render->color_att_count; ++i) {
4729 struct radv_image_view *iview = render->color_att[i].iview;
4730 if (!iview) {
4731 if (pdev->info.gfx_level >= GFX12) {
4732 radeon_set_context_reg(cmd_buffer->cs, R_028EC0_CB_COLOR0_INFO + i * 4, color_invalid);
4733 } else {
4734 radeon_set_context_reg(cmd_buffer->cs, R_028C70_CB_COLOR0_INFO + i * 0x3C, color_invalid);
4735 }
4736 continue;
4737 }
4738
4739 VkImageLayout layout = render->color_att[i].layout;
4740
4741 radv_cs_add_buffer(device->ws, cmd_buffer->cs, iview->image->bindings[0].bo);
4742
4743 assert(iview->vk.aspects & (VK_IMAGE_ASPECT_COLOR_BIT | VK_IMAGE_ASPECT_PLANE_0_BIT |
4744 VK_IMAGE_ASPECT_PLANE_1_BIT | VK_IMAGE_ASPECT_PLANE_2_BIT));
4745
4746 if (iview->image->disjoint && iview->vk.aspects == VK_IMAGE_ASPECT_COLOR_BIT) {
4747 for (uint32_t plane_id = 0; plane_id < iview->image->plane_count; plane_id++) {
4748 radv_cs_add_buffer(device->ws, cmd_buffer->cs, iview->image->bindings[plane_id].bo);
4749 }
4750 } else {
4751 uint32_t plane_id = iview->image->disjoint ? iview->plane_id : 0;
4752 radv_cs_add_buffer(device->ws, cmd_buffer->cs, iview->image->bindings[plane_id].bo);
4753 }
4754
4755 radv_emit_fb_color_state(cmd_buffer, i, &render->color_att[i].cb, iview, layout);
4756
4757 radv_load_color_clear_metadata(cmd_buffer, iview, i);
4758
4759 if (pdev->info.gfx_level >= GFX9 && iview->image->dcc_sign_reinterpret) {
4760 /* Disable constant encoding for the clear value of "1" when the DCC signedness differs,
4761 * because the hardware would fill "1" instead of the clear value.
4762 */
4763 disable_constant_encode_ac01 = true;
4764 }
4765
4766 extent.width = MIN2(extent.width, iview->vk.extent.width);
4767 extent.height = MIN2(extent.height, iview->vk.extent.height);
4768 }
4769 for (; i < cmd_buffer->state.last_subpass_color_count; i++) {
4770 if (pdev->info.gfx_level >= GFX12) {
4771 radeon_set_context_reg(cmd_buffer->cs, R_028EC0_CB_COLOR0_INFO + i * 4, color_invalid);
4772 } else {
4773 radeon_set_context_reg(cmd_buffer->cs, R_028C70_CB_COLOR0_INFO + i * 0x3C, color_invalid);
4774 }
4775 }
4776 cmd_buffer->state.last_subpass_color_count = render->color_att_count;
4777
4778 if (render->ds_att.iview) {
4779 struct radv_image_view *iview = render->ds_att.iview;
4780 const struct radv_image *image = iview->image;
4781 radv_cs_add_buffer(device->ws, cmd_buffer->cs, image->bindings[0].bo);
4782
4783 uint32_t qf_mask = radv_image_queue_family_mask(image, cmd_buffer->qf, cmd_buffer->qf);
4784 bool depth_compressed = radv_layout_is_htile_compressed(device, image, render->ds_att.layout, qf_mask);
4785 bool stencil_compressed = radv_layout_is_htile_compressed(device, image, render->ds_att.stencil_layout, qf_mask);
4786
4787 radv_emit_fb_ds_state(cmd_buffer, &render->ds_att.ds, iview, depth_compressed, stencil_compressed);
4788
4789 if (depth_compressed || stencil_compressed) {
4790 /* Only load the depth/stencil fast clear values when
4791 * compressed rendering is enabled.
4792 */
4793 radv_load_ds_clear_metadata(cmd_buffer, iview);
4794 }
4795
4796 extent.width = MIN2(extent.width, iview->vk.extent.width);
4797 extent.height = MIN2(extent.height, iview->vk.extent.height);
4798 } else if (pdev->info.gfx_level == GFX10_3 && render->vrs_att.iview && radv_cmd_buffer_get_vrs_image(cmd_buffer)) {
4799 /* When a subpass uses a VRS attachment without binding a depth/stencil attachment, we have to
4800 * bind our internal depth buffer that contains the VRS data as part of HTILE.
4801 */
4802 VkImageLayout layout = VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL;
4803 struct radv_buffer *htile_buffer = device->vrs.buffer;
4804 struct radv_image *image = device->vrs.image;
4805 struct radv_ds_buffer_info ds;
4806 struct radv_image_view iview;
4807
4808 radv_image_view_init(&iview, device,
4809 &(VkImageViewCreateInfo){
4810 .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO,
4811 .image = radv_image_to_handle(image),
4812 .viewType = radv_meta_get_view_type(image),
4813 .format = image->vk.format,
4814 .subresourceRange =
4815 {
4816 .aspectMask = VK_IMAGE_ASPECT_DEPTH_BIT,
4817 .baseMipLevel = 0,
4818 .levelCount = 1,
4819 .baseArrayLayer = 0,
4820 .layerCount = 1,
4821 },
4822 },
4823 0, NULL);
4824
4825 radv_initialise_vrs_surface(image, htile_buffer, &ds);
4826
4827 radv_cs_add_buffer(device->ws, cmd_buffer->cs, htile_buffer->bo);
4828
4829 bool depth_compressed = radv_layout_is_htile_compressed(
4830 device, image, layout, radv_image_queue_family_mask(image, cmd_buffer->qf, cmd_buffer->qf));
4831 radv_emit_fb_ds_state(cmd_buffer, &ds, &iview, depth_compressed, false);
4832
4833 radv_image_view_finish(&iview);
4834 } else {
4835 radv_emit_null_ds_state(cmd_buffer);
4836 }
4837
4838 if (pdev->info.gfx_level >= GFX11) {
4839 bool vrs_surface_enable = render->vrs_att.iview != NULL;
4840 unsigned xmax = 0, ymax = 0;
4841 uint64_t va = 0;
4842
4843 if (vrs_surface_enable) {
4844 const struct radv_image_view *vrs_iview = render->vrs_att.iview;
4845 struct radv_image *vrs_image = vrs_iview->image;
4846
4847 va = radv_image_get_va(vrs_image, 0);
4848 va |= vrs_image->planes[0].surface.tile_swizzle << 8;
4849
4850 xmax = vrs_iview->vk.extent.width - 1;
4851 ymax = vrs_iview->vk.extent.height - 1;
4852 }
4853
4854 radeon_set_context_reg_seq(cmd_buffer->cs, R_0283F0_PA_SC_VRS_RATE_BASE, 3);
4855 radeon_emit(cmd_buffer->cs, va >> 8);
4856 radeon_emit(cmd_buffer->cs, S_0283F4_BASE_256B(va >> 40));
4857 radeon_emit(cmd_buffer->cs, S_0283F8_X_MAX(xmax) | S_0283F8_Y_MAX(ymax));
4858
4859 radeon_set_context_reg(cmd_buffer->cs, R_0283D0_PA_SC_VRS_OVERRIDE_CNTL,
4860 S_0283D0_VRS_SURFACE_ENABLE(vrs_surface_enable));
4861 }
4862
4863 if (pdev->info.gfx_level >= GFX8 && pdev->info.gfx_level < GFX12) {
4864 bool disable_constant_encode = pdev->info.has_dcc_constant_encode;
4865 enum amd_gfx_level gfx_level = pdev->info.gfx_level;
4866
4867 if (pdev->info.gfx_level >= GFX11) {
4868 const bool has_dedicated_vram = pdev->info.has_dedicated_vram;
4869
4870 radeon_set_context_reg(cmd_buffer->cs, R_028424_CB_FDCC_CONTROL,
4871 S_028424_SAMPLE_MASK_TRACKER_WATERMARK(has_dedicated_vram ? 0 : 15));
4872 } else {
4873 uint8_t watermark = gfx_level >= GFX10 ? 6 : 4;
4874
4875 radeon_set_context_reg(cmd_buffer->cs, R_028424_CB_DCC_CONTROL,
4876 S_028424_OVERWRITE_COMBINER_MRT_SHARING_DISABLE(gfx_level <= GFX9) |
4877 S_028424_OVERWRITE_COMBINER_WATERMARK(watermark) |
4878 S_028424_DISABLE_CONSTANT_ENCODE_AC01(disable_constant_encode_ac01) |
4879 S_028424_DISABLE_CONSTANT_ENCODE_REG(disable_constant_encode));
4880 }
4881 }
4882
4883 if (pdev->info.gfx_level >= GFX12) {
4884 radeon_set_context_reg(cmd_buffer->cs, R_028184_PA_SC_SCREEN_SCISSOR_BR,
4885 S_028034_BR_X(extent.width) | S_028034_BR_Y(extent.height));
4886 } else {
4887 radeon_set_context_reg(cmd_buffer->cs, R_028034_PA_SC_SCREEN_SCISSOR_BR,
4888 S_028034_BR_X(extent.width) | S_028034_BR_Y(extent.height));
4889 }
4890
4891 assert(cmd_buffer->cs->cdw <= cdw_max);
4892
4893 cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_FRAMEBUFFER;
4894 }
4895
4896 static void
4897 radv_emit_guardband_state(struct radv_cmd_buffer *cmd_buffer)
4898 {
4899 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
4900 const struct radv_physical_device *pdev = radv_device_physical(device);
4901 const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
4902 unsigned rast_prim = radv_get_rasterization_prim(cmd_buffer);
4903 const bool draw_points = radv_rast_prim_is_point(rast_prim) || radv_polygon_mode_is_point(d->vk.rs.polygon_mode);
4904 const bool draw_lines = radv_rast_prim_is_line(rast_prim) || radv_polygon_mode_is_line(d->vk.rs.polygon_mode);
4905 struct radeon_cmdbuf *cs = cmd_buffer->cs;
4906 int i;
4907 float scale[3], translate[3], guardband_x = INFINITY, guardband_y = INFINITY;
4908 float discard_x = 1.0f, discard_y = 1.0f;
4909 const float max_range = 32767.0f;
4910
4911 if (!d->vk.vp.viewport_count)
4912 return;
4913
4914 for (i = 0; i < d->vk.vp.viewport_count; i++) {
4915 radv_get_viewport_xform(d->vk.vp.viewports + i, scale, translate);
4916 scale[0] = fabsf(scale[0]);
4917 scale[1] = fabsf(scale[1]);
4918
4919 if (scale[0] < 0.5)
4920 scale[0] = 0.5;
4921 if (scale[1] < 0.5)
4922 scale[1] = 0.5;
4923
4924 guardband_x = MIN2(guardband_x, (max_range - fabsf(translate[0])) / scale[0]);
4925 guardband_y = MIN2(guardband_y, (max_range - fabsf(translate[1])) / scale[1]);
4926
4927 if (draw_points || draw_lines) {
4928 /* When rendering wide points or lines, we need to be more conservative about when to
4929 * discard them entirely. */
4930 float pixels;
4931
4932 if (draw_points) {
4933 pixels = 8191.875f;
4934 } else {
4935 pixels = d->vk.rs.line.width;
4936 }
4937
4938 /* Add half the point size / line width. */
4939 discard_x += pixels / (2.0 * scale[0]);
4940 discard_y += pixels / (2.0 * scale[1]);
4941
4942 /* Discard primitives that would lie entirely outside the clip region. */
4943 discard_x = MIN2(discard_x, guardband_x);
4944 discard_y = MIN2(discard_y, guardband_y);
4945 }
4946 }
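/* Worked example (illustrative numbers, assuming a single 1920x1080 viewport at the origin):
 * scale = (960, 540) and translate = (960, 540), so guardband_x = (32767 - 960) / 960 ~= 33.1 and
 * guardband_y = (32767 - 540) / 540 ~= 59.7, i.e. geometry may extend to roughly +/-33 (X) and
 * +/-60 (Y) in NDC before the guard band is exceeded and real clipping has to kick in.
 */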
4947
4948 if (pdev->info.gfx_level >= GFX12) {
4949 radeon_set_context_reg_seq(cs, R_02842C_PA_CL_GB_VERT_CLIP_ADJ, 4);
4950 } else {
4951 radeon_set_context_reg_seq(cs, R_028BE8_PA_CL_GB_VERT_CLIP_ADJ, 4);
4952 }
4953 radeon_emit(cs, fui(guardband_y));
4954 radeon_emit(cs, fui(discard_y));
4955 radeon_emit(cs, fui(guardband_x));
4956 radeon_emit(cs, fui(discard_x));
4957
4958 cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_GUARDBAND;
4959 }
4960
4961 /* Bind an internal index buffer for GPUs that hang with 0-sized index buffers, to handle robustness2
4962 * which requires out-of-bounds accesses to return 0.
4963 */
4964 static void
4965 radv_handle_zero_index_buffer_bug(struct radv_cmd_buffer *cmd_buffer, uint64_t *index_va, uint32_t *remaining_indexes)
4966 {
4967 const uint32_t zero = 0;
4968 uint32_t offset;
4969
4970 if (!radv_cmd_buffer_upload_data(cmd_buffer, sizeof(uint32_t), &zero, &offset)) {
4971 vk_command_buffer_set_error(&cmd_buffer->vk, VK_ERROR_OUT_OF_HOST_MEMORY);
4972 return;
4973 }
4974
4975 *index_va = radv_buffer_get_va(cmd_buffer->upload.upload_bo) + offset;
4976 *remaining_indexes = 1;
4977 }
4978
4979 static void
4980 radv_emit_index_buffer(struct radv_cmd_buffer *cmd_buffer)
4981 {
4982 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
4983 const struct radv_physical_device *pdev = radv_device_physical(device);
4984 struct radeon_cmdbuf *cs = cmd_buffer->cs;
4985 struct radv_cmd_state *state = &cmd_buffer->state;
4986 uint32_t max_index_count = state->max_index_count;
4987 uint64_t index_va = state->index_va;
4988
4989 /* With indirect generated commands the index buffer bind may be part of the
4990 * indirect command buffer, in which case the app may not have bound any yet. */
4991 if (state->index_type < 0)
4992 return;
4993
4994 /* Handle indirect draw calls with NULL index buffer if the GPU doesn't support them. */
4995 if (!max_index_count && pdev->info.has_zero_index_buffer_bug) {
4996 radv_handle_zero_index_buffer_bug(cmd_buffer, &index_va, &max_index_count);
4997 }
4998
4999 radeon_emit(cs, PKT3(PKT3_INDEX_BASE, 1, 0));
5000 radeon_emit(cs, index_va);
5001 radeon_emit(cs, index_va >> 32);
5002
5003 radeon_emit(cs, PKT3(PKT3_INDEX_BUFFER_SIZE, 0, 0));
5004 radeon_emit(cs, max_index_count);
5005
5006 cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_INDEX_BUFFER;
5007 }
5008
5009 static void
5010 radv_flush_occlusion_query_state(struct radv_cmd_buffer *cmd_buffer)
5011 {
5012 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
5013 const struct radv_physical_device *pdev = radv_device_physical(device);
5014 const enum amd_gfx_level gfx_level = pdev->info.gfx_level;
5015 const bool enable_occlusion_queries =
5016 cmd_buffer->state.active_occlusion_queries || cmd_buffer->state.inherited_occlusion_queries;
5017 uint32_t db_count_control;
5018
5019 if (!enable_occlusion_queries) {
5020 db_count_control = S_028004_ZPASS_INCREMENT_DISABLE(gfx_level < GFX11);
5021 } else {
5022 bool gfx10_perfect =
5023 gfx_level >= GFX10 && (cmd_buffer->state.perfect_occlusion_queries_enabled ||
5024 cmd_buffer->state.inherited_query_control_flags & VK_QUERY_CONTROL_PRECISE_BIT);
5025
5026 if (gfx_level >= GFX7) {
5027 /* Always enable PERFECT_ZPASS_COUNTS due to issues with partially
5028 * covered tiles, discards, and early depth testing. For more details,
5029 * see https://gitlab.freedesktop.org/mesa/mesa/-/issues/3218 */
5030 db_count_control = S_028004_PERFECT_ZPASS_COUNTS(1) |
5031 S_028004_DISABLE_CONSERVATIVE_ZPASS_COUNTS(gfx10_perfect) | S_028004_ZPASS_ENABLE(1) |
5032 S_028004_SLICE_EVEN_ENABLE(1) | S_028004_SLICE_ODD_ENABLE(1);
5033 } else {
5034 db_count_control = S_028004_PERFECT_ZPASS_COUNTS(1);
5035 }
5036
5037 if (gfx_level < GFX12) {
5038 const uint32_t rasterization_samples = radv_get_rasterization_samples(cmd_buffer);
5039 const uint32_t sample_rate = util_logbase2(rasterization_samples);
5040
5041 db_count_control |= S_028004_SAMPLE_RATE(sample_rate);
5042 }
5043 }
5044
5045 if (pdev->info.gfx_level >= GFX12) {
5046 radeon_opt_set_context_reg(cmd_buffer, R_028060_DB_COUNT_CONTROL, RADV_TRACKED_DB_COUNT_CONTROL,
5047 db_count_control);
5048 } else {
5049 radeon_opt_set_context_reg(cmd_buffer, R_028004_DB_COUNT_CONTROL, RADV_TRACKED_DB_COUNT_CONTROL,
5050 db_count_control);
5051 }
5052
5053 cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_OCCLUSION_QUERY;
5054 }
5055
5056 unsigned
5057 radv_instance_rate_prolog_index(unsigned num_attributes, uint32_t instance_rate_inputs)
5058 {
5059 /* instance_rate_vs_prologs is a flattened array of arrays of arrays of different sizes; equivalently,
5060 * a single array sorted in ascending order by:
5061 * - total number of attributes
5062 * - number of instanced attributes
5063 * - index of first instanced attribute
5064 */
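/* Worked example: num_attributes = 4 and instance_rate_inputs = 0b0110 (attributes 1 and 2
 * instanced). start_index = total_to_offset[3] = 10, count = 2, so offset_from_start_index =
 * count_to_offset_total16[1] - (16 - 4) * (2 - 1) = 16 - 12 = 4, and first = 1, giving index
 * 10 + 4 + 1 = 15.
 */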
5065
5066 /* From total number of attributes to offset. */
5067 static const uint16_t total_to_offset[16] = {0, 1, 4, 10, 20, 35, 56, 84, 120, 165, 220, 286, 364, 455, 560, 680};
5068 unsigned start_index = total_to_offset[num_attributes - 1];
5069
5070 /* From number of instanced attributes to offset. This would require a different LUT depending on
5071 * the total number of attributes, but we can exploit a pattern to use just the LUT for 16 total
5072 * attributes.
5073 */
5074 static const uint8_t count_to_offset_total16[16] = {0, 16, 31, 45, 58, 70, 81, 91,
5075 100, 108, 115, 121, 126, 130, 133, 135};
5076 unsigned count = util_bitcount(instance_rate_inputs);
5077 unsigned offset_from_start_index = count_to_offset_total16[count - 1] - ((16 - num_attributes) * (count - 1));
5078
5079 unsigned first = ffs(instance_rate_inputs) - 1;
5080 return start_index + offset_from_start_index + first;
5081 }
5082
5083 static struct radv_shader_part *
5084 lookup_vs_prolog(struct radv_cmd_buffer *cmd_buffer, const struct radv_shader *vs_shader, uint32_t *nontrivial_divisors)
5085 {
5086 assert(vs_shader->info.vs.dynamic_inputs);
5087
5088 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
5089 const struct radv_physical_device *pdev = radv_device_physical(device);
5090 const struct radv_vertex_input_state *vi_state = &cmd_buffer->state.vertex_input;
5091
5092 unsigned num_attributes = util_last_bit(vs_shader->info.vs.vb_desc_usage_mask);
5093 uint32_t attribute_mask = BITFIELD_MASK(num_attributes);
5094
5095 uint32_t instance_rate_inputs = vi_state->instance_rate_inputs & attribute_mask;
5096 uint32_t zero_divisors = vi_state->zero_divisors & attribute_mask;
5097 *nontrivial_divisors = vi_state->nontrivial_divisors & attribute_mask;
5098 uint32_t misaligned_mask = cmd_buffer->state.vbo_misaligned_mask;
5099 uint32_t unaligned_mask = cmd_buffer->state.vbo_unaligned_mask;
5100 if (cmd_buffer->state.vbo_misaligned_mask_invalid) {
5101 bool misalignment_possible = pdev->info.gfx_level == GFX6 || pdev->info.gfx_level >= GFX10;
5102 u_foreach_bit (index, cmd_buffer->state.vbo_misaligned_mask_invalid & attribute_mask) {
5103 uint8_t binding = vi_state->bindings[index];
5104 if (!(cmd_buffer->state.vbo_bound_mask & BITFIELD_BIT(binding)))
5105 continue;
5106
5107 uint8_t format_req = vi_state->format_align_req_minus_1[index];
5108 uint8_t component_req = vi_state->component_align_req_minus_1[index];
5109 uint64_t vb_offset = cmd_buffer->vertex_bindings[binding].offset;
5110 uint64_t vb_stride = cmd_buffer->vertex_bindings[binding].stride;
5111
5112 VkDeviceSize offset = vb_offset + vi_state->offsets[index];
5113
5114 if (misalignment_possible && ((offset | vb_stride) & format_req))
5115 misaligned_mask |= BITFIELD_BIT(index);
5116 if ((offset | vb_stride) & component_req)
5117 unaligned_mask |= BITFIELD_BIT(index);
5118 }
5119 cmd_buffer->state.vbo_misaligned_mask = misaligned_mask;
5120 cmd_buffer->state.vbo_unaligned_mask = unaligned_mask;
5121 cmd_buffer->state.vbo_misaligned_mask_invalid &= ~attribute_mask;
5122 }
5123 misaligned_mask |= vi_state->nontrivial_formats | unaligned_mask;
5124 misaligned_mask &= attribute_mask;
5125 unaligned_mask &= attribute_mask;
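/* At this point misaligned_mask holds the attributes that cannot be fetched with a plain typed
 * buffer load (nontrivial format, or an offset/stride that breaks the format's alignment requirement
 * on chips where that matters), and unaligned_mask those that are not even component-aligned; any
 * such attribute forces a specialized prolog from the cache below instead of a pre-compiled one.
 */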
5126
5127 const bool can_use_simple_input =
5128 cmd_buffer->state.shaders[MESA_SHADER_VERTEX] &&
5129 !cmd_buffer->state.shaders[MESA_SHADER_VERTEX]->info.merged_shader_compiled_separately &&
5130 cmd_buffer->state.shaders[MESA_SHADER_VERTEX]->info.is_ngg == pdev->use_ngg &&
5131 cmd_buffer->state.shaders[MESA_SHADER_VERTEX]->info.wave_size == pdev->ge_wave_size;
5132
5133 /* The instance ID input VGPR is placed differently when as_ls=true. as_ls is also needed to
5134 * work around the LS VGPR initialization bug.
5135 */
5136 bool as_ls = vs_shader->info.vs.as_ls && (instance_rate_inputs || pdev->info.has_ls_vgpr_init_bug);
5137
5138 /* try to use a pre-compiled prolog first */
5139 struct radv_shader_part *prolog = NULL;
5140 if (can_use_simple_input && !as_ls && !misaligned_mask && !vi_state->alpha_adjust_lo && !vi_state->alpha_adjust_hi) {
5141 if (!instance_rate_inputs) {
5142 prolog = device->simple_vs_prologs[num_attributes - 1];
5143 } else if (num_attributes <= 16 && !*nontrivial_divisors && !zero_divisors &&
5144 util_bitcount(instance_rate_inputs) ==
5145 (util_last_bit(instance_rate_inputs) - ffs(instance_rate_inputs) + 1)) {
5146 unsigned index = radv_instance_rate_prolog_index(num_attributes, instance_rate_inputs);
5147 prolog = device->instance_rate_vs_prologs[index];
5148 }
5149 }
5150 if (prolog)
5151 return prolog;
5152
5153 struct radv_vs_prolog_key key;
5154 memset(&key, 0, sizeof(key));
5155 key.instance_rate_inputs = instance_rate_inputs;
5156 key.nontrivial_divisors = *nontrivial_divisors;
5157 key.zero_divisors = zero_divisors;
5158 /* If the attribute is aligned, post shuffle is implemented using DST_SEL instead. */
5159 key.post_shuffle = vi_state->post_shuffle & misaligned_mask;
5160 key.alpha_adjust_hi = vi_state->alpha_adjust_hi & attribute_mask & ~unaligned_mask;
5161 key.alpha_adjust_lo = vi_state->alpha_adjust_lo & attribute_mask & ~unaligned_mask;
5162 u_foreach_bit (index, misaligned_mask)
5163 key.formats[index] = vi_state->formats[index];
5164 key.num_attributes = num_attributes;
5165 key.misaligned_mask = misaligned_mask;
5166 key.unaligned_mask = unaligned_mask;
5167 key.as_ls = as_ls;
5168 key.is_ngg = vs_shader->info.is_ngg;
5169 key.wave32 = vs_shader->info.wave_size == 32;
5170
5171 if (vs_shader->info.merged_shader_compiled_separately) {
5172 assert(vs_shader->info.next_stage == MESA_SHADER_TESS_CTRL || vs_shader->info.next_stage == MESA_SHADER_GEOMETRY);
5173 key.next_stage = vs_shader->info.next_stage;
5174 } else {
5175 key.next_stage = vs_shader->info.stage;
5176 }
5177
5178 return radv_shader_part_cache_get(device, &device->vs_prologs, &cmd_buffer->vs_prologs, &key);
5179 }
5180
5181 static void
5182 emit_prolog_regs(struct radv_cmd_buffer *cmd_buffer, const struct radv_shader *vs_shader,
5183 const struct radv_shader_part *prolog)
5184 {
5185 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
5186 const struct radv_physical_device *pdev = radv_device_physical(device);
5187 uint32_t rsrc1, rsrc2;
5188
5189 /* no need to re-emit anything in this case */
5190 if (cmd_buffer->state.emitted_vs_prolog == prolog)
5191 return;
5192
5193 enum amd_gfx_level chip = pdev->info.gfx_level;
5194
5195 assert(cmd_buffer->state.emitted_graphics_pipeline == cmd_buffer->state.graphics_pipeline);
5196
5197 if (vs_shader->info.merged_shader_compiled_separately) {
5198 if (vs_shader->info.next_stage == MESA_SHADER_GEOMETRY) {
5199 radv_shader_combine_cfg_vs_gs(vs_shader, cmd_buffer->state.shaders[MESA_SHADER_GEOMETRY], &rsrc1, &rsrc2);
5200 } else {
5201 assert(vs_shader->info.next_stage == MESA_SHADER_TESS_CTRL);
5202
5203 radv_shader_combine_cfg_vs_tcs(vs_shader, cmd_buffer->state.shaders[MESA_SHADER_TESS_CTRL], &rsrc1, &rsrc2);
5204 }
5205 } else {
5206 rsrc1 = vs_shader->config.rsrc1;
5207 }
5208
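/* The prolog may need more registers than the main shader: take the maximum of both VGPR
 * counts, and of the SGPR counts on chips where RSRC1 still encodes them (pre-GFX10).
 */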
5209 if (chip < GFX10 && G_00B228_SGPRS(prolog->rsrc1) > G_00B228_SGPRS(rsrc1))
5210 rsrc1 = (rsrc1 & C_00B228_SGPRS) | (prolog->rsrc1 & ~C_00B228_SGPRS);
5211
5212 if (G_00B848_VGPRS(prolog->rsrc1) > G_00B848_VGPRS(rsrc1))
5213 rsrc1 = (rsrc1 & C_00B848_VGPRS) | (prolog->rsrc1 & ~C_00B848_VGPRS);
5214
5215 radeon_set_sh_reg(cmd_buffer->cs, vs_shader->info.regs.pgm_lo, prolog->va >> 8);
5216 radeon_set_sh_reg(cmd_buffer->cs, vs_shader->info.regs.pgm_rsrc1, rsrc1);
5217
5218 if (vs_shader->info.merged_shader_compiled_separately) {
5219 if (vs_shader->info.next_stage == MESA_SHADER_GEOMETRY) {
5220 const struct radv_shader *gs = cmd_buffer->state.shaders[MESA_SHADER_GEOMETRY];
5221 unsigned lds_size;
5222
5223 if (gs->info.is_ngg) {
5224 lds_size = DIV_ROUND_UP(gs->info.ngg_info.lds_size, pdev->info.lds_encode_granularity);
5225 } else {
5226 lds_size = gs->info.gs_ring_info.lds_size;
5227 }
5228
5229 radeon_set_sh_reg(cmd_buffer->cs, vs_shader->info.regs.pgm_rsrc2, rsrc2 | S_00B22C_LDS_SIZE(lds_size));
5230 } else {
5231 radeon_set_sh_reg(cmd_buffer->cs, vs_shader->info.regs.pgm_rsrc2, rsrc2);
5232 }
5233 }
5234
5235 radv_cs_add_buffer(device->ws, cmd_buffer->cs, prolog->bo);
5236 }
5237
5238 static void
5239 emit_prolog_inputs(struct radv_cmd_buffer *cmd_buffer, const struct radv_shader *vs_shader,
5240 uint32_t nontrivial_divisors)
5241 {
5242 /* no need to re-emit anything in this case */
5243 if (!nontrivial_divisors && cmd_buffer->state.emitted_vs_prolog &&
5244 !cmd_buffer->state.emitted_vs_prolog->nontrivial_divisors)
5245 return;
5246
5247 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
5248 const struct radv_vertex_input_state *vi_state = &cmd_buffer->state.vertex_input;
5249 uint64_t input_va = radv_shader_get_va(vs_shader);
5250
5251 if (nontrivial_divisors) {
5252 unsigned inputs_offset;
5253 uint32_t *inputs;
5254 unsigned size = 8 + util_bitcount(nontrivial_divisors) * 8;
5255 if (!radv_cmd_buffer_upload_alloc(cmd_buffer, size, &inputs_offset, (void **)&inputs))
5256 return;
5257
5258 *(inputs++) = input_va;
5259 *(inputs++) = input_va >> 32;
5260
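/* Each nontrivial divisor is uploaded as two dwords: a packed word with the pre-shift,
 * increment and post-shift, and the multiplier, which the prolog uses for fast integer
 * division (divisor 0 and powers of two are special-cased below).
 */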
5261 u_foreach_bit (index, nontrivial_divisors) {
5262 uint32_t div = vi_state->divisors[index];
5263 if (div == 0) {
5264 *(inputs++) = 0;
5265 *(inputs++) = 1;
5266 } else if (util_is_power_of_two_or_zero(div)) {
5267 *(inputs++) = util_logbase2(div) | (1 << 8);
5268 *(inputs++) = 0xffffffffu;
5269 } else {
5270 struct util_fast_udiv_info info = util_compute_fast_udiv_info(div, 32, 32);
5271 *(inputs++) = info.pre_shift | (info.increment << 8) | (info.post_shift << 16);
5272 *(inputs++) = info.multiplier;
5273 }
5274 }
5275
5276 input_va = radv_buffer_get_va(cmd_buffer->upload.upload_bo) + inputs_offset;
5277 }
5278
5279 const uint32_t vs_prolog_inputs_offset = radv_get_user_sgpr_loc(vs_shader, AC_UD_VS_PROLOG_INPUTS);
5280 radv_emit_shader_pointer(device, cmd_buffer->cs, vs_prolog_inputs_offset, input_va, true);
5281 }
5282
5283 static void
5284 radv_emit_vertex_input(struct radv_cmd_buffer *cmd_buffer)
5285 {
5286 const struct radv_shader *vs_shader = radv_get_shader(cmd_buffer->state.shaders, MESA_SHADER_VERTEX);
5287 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
5288
5289 assert(!cmd_buffer->state.mesh_shading);
5290
5291 if (!vs_shader->info.vs.has_prolog)
5292 return;
5293
5294 uint32_t nontrivial_divisors;
5295 struct radv_shader_part *prolog = lookup_vs_prolog(cmd_buffer, vs_shader, &nontrivial_divisors);
5296 if (!prolog) {
5297 vk_command_buffer_set_error(&cmd_buffer->vk, VK_ERROR_OUT_OF_HOST_MEMORY);
5298 return;
5299 }
5300 emit_prolog_regs(cmd_buffer, vs_shader, prolog);
5301 emit_prolog_inputs(cmd_buffer, vs_shader, nontrivial_divisors);
5302
5303 cmd_buffer->shader_upload_seq = MAX2(cmd_buffer->shader_upload_seq, prolog->upload_seq);
5304
5305 cmd_buffer->state.emitted_vs_prolog = prolog;
5306
5307 if (radv_device_fault_detection_enabled(device))
5308 radv_save_vs_prolog(cmd_buffer, prolog);
5309 }
5310
5311 static void
5312 radv_emit_tess_domain_origin(struct radv_cmd_buffer *cmd_buffer)
5313 {
5314 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
5315 const struct radv_physical_device *pdev = radv_device_physical(device);
5316 const struct radv_shader *tes = radv_get_shader(cmd_buffer->state.shaders, MESA_SHADER_TESS_EVAL);
5317 const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
5318 unsigned type = 0, partitioning = 0;
5319 unsigned topology;
5320
5321 switch (tes->info.tes._primitive_mode) {
5322 case TESS_PRIMITIVE_TRIANGLES:
5323 type = V_028B6C_TESS_TRIANGLE;
5324 break;
5325 case TESS_PRIMITIVE_QUADS:
5326 type = V_028B6C_TESS_QUAD;
5327 break;
5328 case TESS_PRIMITIVE_ISOLINES:
5329 type = V_028B6C_TESS_ISOLINE;
5330 break;
5331 default:
5332 unreachable("Invalid tess primitive type");
5333 }
5334
5335 switch (tes->info.tes.spacing) {
5336 case TESS_SPACING_EQUAL:
5337 partitioning = V_028B6C_PART_INTEGER;
5338 break;
5339 case TESS_SPACING_FRACTIONAL_ODD:
5340 partitioning = V_028B6C_PART_FRAC_ODD;
5341 break;
5342 case TESS_SPACING_FRACTIONAL_EVEN:
5343 partitioning = V_028B6C_PART_FRAC_EVEN;
5344 break;
5345 default:
5346 unreachable("Invalid tess spacing type");
5347 }
5348
5349 if (tes->info.tes.point_mode) {
5350 topology = V_028B6C_OUTPUT_POINT;
5351 } else if (tes->info.tes._primitive_mode == TESS_PRIMITIVE_ISOLINES) {
5352 topology = V_028B6C_OUTPUT_LINE;
5353 } else {
5354 bool ccw = tes->info.tes.ccw;
5355
5356 if (d->vk.ts.domain_origin != VK_TESSELLATION_DOMAIN_ORIGIN_UPPER_LEFT) {
5357 ccw = !ccw;
5358 }
5359
5360 topology = ccw ? V_028B6C_OUTPUT_TRIANGLE_CCW : V_028B6C_OUTPUT_TRIANGLE_CW;
5361 }
5362
5363 uint32_t vgt_tf_param = S_028B6C_TYPE(type) | S_028B6C_PARTITIONING(partitioning) | S_028B6C_TOPOLOGY(topology) |
5364 S_028B6C_DISTRIBUTION_MODE(pdev->tess_distribution_mode);
5365
5366 if (pdev->info.gfx_level >= GFX12) {
5367 vgt_tf_param |= S_028AA4_TEMPORAL(gfx12_load_last_use_discard);
5368
5369 radeon_set_context_reg(cmd_buffer->cs, R_028AA4_VGT_TF_PARAM, vgt_tf_param);
5370 } else {
5371 radeon_set_context_reg(cmd_buffer->cs, R_028B6C_VGT_TF_PARAM, vgt_tf_param);
5372 }
5373 }
5374
5375 static void
5376 radv_emit_alpha_to_coverage_enable(struct radv_cmd_buffer *cmd_buffer)
5377 {
5378 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
5379 const struct radv_physical_device *pdev = radv_device_physical(device);
5380 const struct radv_instance *instance = radv_physical_device_instance(pdev);
5381 const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
5382 unsigned db_alpha_to_mask = 0;
5383
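/* RADV_DEBUG_NO_ATOC_DITHERING selects uniform alpha-to-mask offsets (no dithering);
 * the default path uses per-sample dithered offsets.
 */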
5384 if (instance->debug_flags & RADV_DEBUG_NO_ATOC_DITHERING) {
5385 db_alpha_to_mask = S_028B70_ALPHA_TO_MASK_OFFSET0(2) | S_028B70_ALPHA_TO_MASK_OFFSET1(2) |
5386 S_028B70_ALPHA_TO_MASK_OFFSET2(2) | S_028B70_ALPHA_TO_MASK_OFFSET3(2) |
5387 S_028B70_OFFSET_ROUND(0);
5388 } else {
5389 db_alpha_to_mask = S_028B70_ALPHA_TO_MASK_OFFSET0(3) | S_028B70_ALPHA_TO_MASK_OFFSET1(1) |
5390 S_028B70_ALPHA_TO_MASK_OFFSET2(0) | S_028B70_ALPHA_TO_MASK_OFFSET3(2) |
5391 S_028B70_OFFSET_ROUND(1);
5392 }
5393
5394 db_alpha_to_mask |= S_028B70_ALPHA_TO_MASK_ENABLE(d->vk.ms.alpha_to_coverage_enable);
5395
5396 if (pdev->info.gfx_level >= GFX12) {
5397 radeon_set_context_reg(cmd_buffer->cs, R_02807C_DB_ALPHA_TO_MASK, db_alpha_to_mask);
5398 } else {
5399 radeon_set_context_reg(cmd_buffer->cs, R_028B70_DB_ALPHA_TO_MASK, db_alpha_to_mask);
5400 }
5401 }
5402
5403 static void
5404 radv_emit_sample_mask(struct radv_cmd_buffer *cmd_buffer)
5405 {
5406 const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
5407
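/* The 16-bit sample mask is replicated into both halves of each PA_SC_AA_MASK register
 * so that every pixel of the 2x2 quad uses the same mask.
 */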
5408 radeon_set_context_reg_seq(cmd_buffer->cs, R_028C38_PA_SC_AA_MASK_X0Y0_X1Y0, 2);
5409 radeon_emit(cmd_buffer->cs, d->vk.ms.sample_mask | ((uint32_t)d->vk.ms.sample_mask << 16));
5410 radeon_emit(cmd_buffer->cs, d->vk.ms.sample_mask | ((uint32_t)d->vk.ms.sample_mask << 16));
5411 }
5412
5413 static void
5414 radv_emit_color_blend(struct radv_cmd_buffer *cmd_buffer)
5415 {
5416 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
5417 const struct radv_physical_device *pdev = radv_device_physical(device);
5418 const enum amd_gfx_level gfx_level = pdev->info.gfx_level;
5419 const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
5420 unsigned cb_blend_control[MAX_RTS], sx_mrt_blend_opt[MAX_RTS];
5421 bool mrt0_is_dual_src = radv_is_mrt0_dual_src(cmd_buffer);
5422
5423 for (unsigned i = 0; i < MAX_RTS; i++) {
5424 VkBlendOp eqRGB = d->vk.cb.attachments[i].color_blend_op;
5425 VkBlendFactor srcRGB = d->vk.cb.attachments[i].src_color_blend_factor;
5426 VkBlendFactor dstRGB = d->vk.cb.attachments[i].dst_color_blend_factor;
5427 VkBlendOp eqA = d->vk.cb.attachments[i].alpha_blend_op;
5428 VkBlendFactor srcA = d->vk.cb.attachments[i].src_alpha_blend_factor;
5429 VkBlendFactor dstA = d->vk.cb.attachments[i].dst_alpha_blend_factor;
5430 unsigned srcRGB_opt, dstRGB_opt, srcA_opt, dstA_opt;
5431 unsigned blend_cntl = 0;
5432
5433 cb_blend_control[i] = sx_mrt_blend_opt[i] = 0;
5434
5435 /* Ignore other blend targets if dual-source blending is enabled to prevent wrong behaviour.
5436 */
5437 if (i > 0 && mrt0_is_dual_src)
5438 continue;
5439
5440 if (!d->vk.cb.attachments[i].blend_enable) {
5441 sx_mrt_blend_opt[i] |= S_028760_COLOR_COMB_FCN(V_028760_OPT_COMB_BLEND_DISABLED) |
5442 S_028760_ALPHA_COMB_FCN(V_028760_OPT_COMB_BLEND_DISABLED);
5443 continue;
5444 }
5445
5446 radv_normalize_blend_factor(eqRGB, &srcRGB, &dstRGB);
5447 radv_normalize_blend_factor(eqA, &srcA, &dstA);
5448
5449 /* Blending optimizations for RB+.
5450 * These transformations don't change the behavior.
5451 *
5452 * First, get rid of DST in the blend factors:
5453 * func(src * DST, dst * 0) ---> func(src * 0, dst * SRC)
5454 */
5455 radv_blend_remove_dst(&eqRGB, &srcRGB, &dstRGB, VK_BLEND_FACTOR_DST_COLOR, VK_BLEND_FACTOR_SRC_COLOR);
5456
5457 radv_blend_remove_dst(&eqA, &srcA, &dstA, VK_BLEND_FACTOR_DST_COLOR, VK_BLEND_FACTOR_SRC_COLOR);
5458
5459 radv_blend_remove_dst(&eqA, &srcA, &dstA, VK_BLEND_FACTOR_DST_ALPHA, VK_BLEND_FACTOR_SRC_ALPHA);
5460
5461 /* Look up the ideal settings from tables. */
5462 srcRGB_opt = radv_translate_blend_opt_factor(srcRGB, false);
5463 dstRGB_opt = radv_translate_blend_opt_factor(dstRGB, false);
5464 srcA_opt = radv_translate_blend_opt_factor(srcA, true);
5465 dstA_opt = radv_translate_blend_opt_factor(dstA, true);
5466
5467 /* Handle interdependencies. */
5468 if (radv_blend_factor_uses_dst(srcRGB))
5469 dstRGB_opt = V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_NONE;
5470 if (radv_blend_factor_uses_dst(srcA))
5471 dstA_opt = V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_NONE;
5472
5473 if (srcRGB == VK_BLEND_FACTOR_SRC_ALPHA_SATURATE &&
5474 (dstRGB == VK_BLEND_FACTOR_ZERO || dstRGB == VK_BLEND_FACTOR_SRC_ALPHA ||
5475 dstRGB == VK_BLEND_FACTOR_SRC_ALPHA_SATURATE))
5476 dstRGB_opt = V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_A0;
5477
5478 /* Set the final value. */
5479 sx_mrt_blend_opt[i] = S_028760_COLOR_SRC_OPT(srcRGB_opt) | S_028760_COLOR_DST_OPT(dstRGB_opt) |
5480 S_028760_COLOR_COMB_FCN(radv_translate_blend_opt_function(eqRGB)) |
5481 S_028760_ALPHA_SRC_OPT(srcA_opt) | S_028760_ALPHA_DST_OPT(dstA_opt) |
5482 S_028760_ALPHA_COMB_FCN(radv_translate_blend_opt_function(eqA));
5483
5484 blend_cntl |= S_028780_ENABLE(1);
5485 blend_cntl |= S_028780_COLOR_COMB_FCN(radv_translate_blend_function(eqRGB));
5486 blend_cntl |= S_028780_COLOR_SRCBLEND(radv_translate_blend_factor(gfx_level, srcRGB));
5487 blend_cntl |= S_028780_COLOR_DESTBLEND(radv_translate_blend_factor(gfx_level, dstRGB));
5488 if (srcA != srcRGB || dstA != dstRGB || eqA != eqRGB) {
5489 blend_cntl |= S_028780_SEPARATE_ALPHA_BLEND(1);
5490 blend_cntl |= S_028780_ALPHA_COMB_FCN(radv_translate_blend_function(eqA));
5491 blend_cntl |= S_028780_ALPHA_SRCBLEND(radv_translate_blend_factor(gfx_level, srcA));
5492 blend_cntl |= S_028780_ALPHA_DESTBLEND(radv_translate_blend_factor(gfx_level, dstA));
5493 }
5494 cb_blend_control[i] = blend_cntl;
5495 }
5496
5497 if (pdev->info.has_rbplus) {
5498 /* Disable RB+ blend optimizations for dual source blending. */
5499 if (mrt0_is_dual_src) {
5500 for (unsigned i = 0; i < MAX_RTS; i++) {
5501 sx_mrt_blend_opt[i] =
5502 S_028760_COLOR_COMB_FCN(V_028760_OPT_COMB_NONE) | S_028760_ALPHA_COMB_FCN(V_028760_OPT_COMB_NONE);
5503 }
5504 }
5505
5506 /* Disable RB+ blend optimizations on GFX11 when alpha-to-coverage is enabled. */
5507 if (gfx_level >= GFX11 && d->vk.ms.alpha_to_coverage_enable) {
5508 sx_mrt_blend_opt[0] =
5509 S_028760_COLOR_COMB_FCN(V_028760_OPT_COMB_NONE) | S_028760_ALPHA_COMB_FCN(V_028760_OPT_COMB_NONE);
5510 }
5511 }
5512
5513 radeon_set_context_reg_seq(cmd_buffer->cs, R_028780_CB_BLEND0_CONTROL, MAX_RTS);
5514 radeon_emit_array(cmd_buffer->cs, cb_blend_control, MAX_RTS);
5515
5516 if (pdev->info.has_rbplus) {
5517 radeon_set_context_reg_seq(cmd_buffer->cs, R_028760_SX_MRT0_BLEND_OPT, MAX_RTS);
5518 radeon_emit_array(cmd_buffer->cs, sx_mrt_blend_opt, MAX_RTS);
5519 }
5520 }
5521
5522 static struct radv_shader_part *
5523 lookup_ps_epilog(struct radv_cmd_buffer *cmd_buffer)
5524 {
5525 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
5526 const struct radv_shader *ps = cmd_buffer->state.shaders[MESA_SHADER_FRAGMENT];
5527 const struct radv_rendering_state *render = &cmd_buffer->state.render;
5528 const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
5529 const struct radv_physical_device *pdev = radv_device_physical(device);
5530 struct radv_ps_epilog_state state = {0};
5531 uint8_t color_remap[MAX_RTS];
5532
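/* color_remap inverts the color attachment mapping so that colors_written can be
 * recomputed in terms of the remapped render target slots below.
 */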
5533 memset(color_remap, MESA_VK_ATTACHMENT_UNUSED, sizeof(color_remap));
5534
5535 state.color_attachment_count = render->color_att_count;
5536 for (unsigned i = 0; i < render->color_att_count; ++i) {
5537 state.color_attachment_formats[i] = render->color_att[i].format;
5538 }
5539
5540 for (unsigned i = 0; i < MAX_RTS; i++) {
5541 VkBlendOp eqRGB = d->vk.cb.attachments[i].color_blend_op;
5542 VkBlendFactor srcRGB = d->vk.cb.attachments[i].src_color_blend_factor;
5543 VkBlendFactor dstRGB = d->vk.cb.attachments[i].dst_color_blend_factor;
5544
5545 state.color_write_mask |= d->vk.cb.attachments[i].write_mask << (4 * i);
5546 state.color_blend_enable |= d->vk.cb.attachments[i].blend_enable << (4 * i);
5547
5548 radv_normalize_blend_factor(eqRGB, &srcRGB, &dstRGB);
5549
5550 if (srcRGB == VK_BLEND_FACTOR_SRC_ALPHA || dstRGB == VK_BLEND_FACTOR_SRC_ALPHA ||
5551 srcRGB == VK_BLEND_FACTOR_SRC_ALPHA_SATURATE || dstRGB == VK_BLEND_FACTOR_SRC_ALPHA_SATURATE ||
5552 srcRGB == VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA || dstRGB == VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA)
5553 state.need_src_alpha |= 1 << i;
5554
5555 state.color_attachment_mappings[i] = d->vk.cal.color_map[i];
5556 if (state.color_attachment_mappings[i] != MESA_VK_ATTACHMENT_UNUSED)
5557 color_remap[state.color_attachment_mappings[i]] = i;
5558 }
5559
5560 state.mrt0_is_dual_src = radv_is_mrt0_dual_src(cmd_buffer);
5561
5562 if (d->vk.ms.alpha_to_coverage_enable) {
5563 /* Select a color export format with alpha when alpha to coverage is enabled. */
5564 state.need_src_alpha |= 0x1;
5565 }
5566
5567 state.alpha_to_one = d->vk.ms.alpha_to_one_enable;
5568
5569 if (ps) {
5570 state.colors_written = ps->info.ps.colors_written;
5571
5572 if (ps->info.ps.exports_mrtz_via_epilog) {
5573 assert(pdev->info.gfx_level >= GFX11);
5574 state.export_depth = ps->info.ps.writes_z;
5575 state.export_stencil = ps->info.ps.writes_stencil;
5576 state.export_sample_mask = ps->info.ps.writes_sample_mask;
5577 state.alpha_to_coverage_via_mrtz = d->vk.ms.alpha_to_coverage_enable;
5578 }
5579 }
5580
5581 struct radv_ps_epilog_key key = radv_generate_ps_epilog_key(device, &state);
5582
5583 /* Determine the actual colors written if outputs are remapped. */
5584 uint32_t colors_written = 0;
5585 for (uint32_t i = 0; i < MAX_RTS; i++) {
5586 if (!((ps->info.ps.colors_written >> (i * 4)) & 0xf))
5587 continue;
5588
5589 if (color_remap[i] == MESA_VK_ATTACHMENT_UNUSED)
5590 continue;
5591
5592 colors_written |= 0xfu << (4 * color_remap[i]);
5593 }
5594
5595 /* Clear color attachments that aren't exported by the FS to match IO shader arguments. */
5596 key.spi_shader_col_format &= colors_written;
5597
5598 return radv_shader_part_cache_get(device, &device->ps_epilogs, &cmd_buffer->ps_epilogs, &key);
5599 }
5600
5601 static void
5602 radv_emit_msaa_state(struct radv_cmd_buffer *cmd_buffer)
5603 {
5604 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
5605 const struct radv_physical_device *pdev = radv_device_physical(device);
5606 const struct radv_shader *ps = cmd_buffer->state.shaders[MESA_SHADER_FRAGMENT];
5607 unsigned rasterization_samples = radv_get_rasterization_samples(cmd_buffer);
5608 const struct radv_rendering_state *render = &cmd_buffer->state.render;
5609 const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
5610 unsigned log_samples = util_logbase2(rasterization_samples);
5611 unsigned pa_sc_aa_config = 0;
5612 unsigned max_sample_dist = 0;
5613 unsigned db_eqaa;
5614
5615 db_eqaa = S_028804_HIGH_QUALITY_INTERSECTIONS(1) | S_028804_INCOHERENT_EQAA_READS(pdev->info.gfx_level < GFX12) |
5616 S_028804_STATIC_ANCHOR_ASSOCIATIONS(1);
5617
5618 if (pdev->info.gfx_level >= GFX9 && d->vk.rs.conservative_mode != VK_CONSERVATIVE_RASTERIZATION_MODE_DISABLED_EXT) {
5619 /* Adjust MSAA state if conservative rasterization is enabled. */
5620 db_eqaa |= S_028804_OVERRASTERIZATION_AMOUNT(4);
5621 pa_sc_aa_config |= S_028BE0_AA_MASK_CENTROID_DTMN(1);
5622 }
5623
5624 if (!d->sample_location.count) {
5625 max_sample_dist = radv_get_default_max_sample_dist(log_samples);
5626 } else {
5627 uint32_t num_samples = (uint32_t)d->sample_location.per_pixel;
5628 VkOffset2D sample_locs[4][8]; /* 8 is the max. sample count supported */
5629
5630 /* Convert the user sample locations to hardware sample locations. */
5631 radv_convert_user_sample_locs(&d->sample_location, 0, 0, sample_locs[0]);
5632 radv_convert_user_sample_locs(&d->sample_location, 1, 0, sample_locs[1]);
5633 radv_convert_user_sample_locs(&d->sample_location, 0, 1, sample_locs[2]);
5634 radv_convert_user_sample_locs(&d->sample_location, 1, 1, sample_locs[3]);
5635
5636 /* Compute the maximum sample distance from the specified locations. */
5637 for (unsigned i = 0; i < 4; ++i) {
5638 for (uint32_t j = 0; j < num_samples; j++) {
5639 VkOffset2D offset = sample_locs[i][j];
5640 max_sample_dist = MAX2(max_sample_dist, MAX2(abs(offset.x), abs(offset.y)));
5641 }
5642 }
5643 }
5644
5645 if (rasterization_samples > 1) {
5646 unsigned z_samples = MAX2(render->ds_samples, rasterization_samples);
5647 unsigned ps_iter_samples = radv_get_ps_iter_samples(cmd_buffer);
5648 unsigned log_z_samples = util_logbase2(z_samples);
5649 unsigned log_ps_iter_samples = util_logbase2(ps_iter_samples);
5650 bool uses_underestimate = d->vk.rs.conservative_mode == VK_CONSERVATIVE_RASTERIZATION_MODE_UNDERESTIMATE_EXT;
5651
5652 pa_sc_aa_config |=
5653 S_028BE0_MSAA_NUM_SAMPLES(uses_underestimate ? 0 : log_samples) | S_028BE0_MSAA_EXPOSED_SAMPLES(log_samples);
5654
5655 if (pdev->info.gfx_level >= GFX12) {
5656 pa_sc_aa_config |= S_028BE0_PS_ITER_SAMPLES(log_ps_iter_samples);
5657
5658 db_eqaa |= S_028078_MASK_EXPORT_NUM_SAMPLES(log_samples) | S_028078_ALPHA_TO_MASK_NUM_SAMPLES(log_samples);
5659 } else {
5660 pa_sc_aa_config |= S_028BE0_MAX_SAMPLE_DIST(max_sample_dist) |
5661 S_028BE0_COVERED_CENTROID_IS_CENTER(pdev->info.gfx_level >= GFX10_3);
5662
5663 db_eqaa |= S_028804_MAX_ANCHOR_SAMPLES(log_z_samples) | S_028804_PS_ITER_SAMPLES(log_ps_iter_samples) |
5664 S_028804_MASK_EXPORT_NUM_SAMPLES(log_samples) | S_028804_ALPHA_TO_MASK_NUM_SAMPLES(log_samples);
5665 }
5666
5667 if (radv_get_line_mode(cmd_buffer) == VK_LINE_RASTERIZATION_MODE_RECTANGULAR_SMOOTH_KHR)
5668 db_eqaa |= S_028804_OVERRASTERIZATION_AMOUNT(log_samples);
5669 }
5670
5671 /* GFX12 programs it in SPI_PS_INPUT_ENA.COVERAGE_TO_SHADER_SELECT */
5672 pa_sc_aa_config |=
5673 S_028BE0_COVERAGE_TO_SHADER_SELECT(pdev->info.gfx_level < GFX12 && ps && ps->info.ps.reads_fully_covered);
5674
5675 if (pdev->info.gfx_level >= GFX12) {
5676 radeon_set_context_reg(cmd_buffer->cs, R_028C5C_PA_SC_SAMPLE_PROPERTIES,
5677 S_028C5C_MAX_SAMPLE_DIST(max_sample_dist));
5678
5679 radeon_set_context_reg(cmd_buffer->cs, R_028078_DB_EQAA, db_eqaa);
5680 } else {
5681 radeon_set_context_reg(cmd_buffer->cs, R_028804_DB_EQAA, db_eqaa);
5682 }
5683
5684 radeon_set_context_reg(cmd_buffer->cs, R_028BE0_PA_SC_AA_CONFIG, pa_sc_aa_config);
5685 radeon_set_context_reg(
5686 cmd_buffer->cs, R_028A48_PA_SC_MODE_CNTL_0,
5687 S_028A48_ALTERNATE_RBS_PER_TILE(pdev->info.gfx_level >= GFX9) | S_028A48_VPORT_SCISSOR_ENABLE(1) |
5688 S_028A48_LINE_STIPPLE_ENABLE(d->vk.rs.line.stipple.enable) | S_028A48_MSAA_ENABLE(rasterization_samples > 1));
5689 }
5690
5691 static void
5692 radv_emit_line_rasterization_mode(struct radv_cmd_buffer *cmd_buffer)
5693 {
5694 /* The DX10 diamond test is unnecessary with Vulkan and it decreases line rasterization
5695 * performance.
5696 */
5697 radeon_set_context_reg(
5698 cmd_buffer->cs, R_028BDC_PA_SC_LINE_CNTL,
5699 S_028BDC_PERPENDICULAR_ENDCAP_ENA(radv_get_line_mode(cmd_buffer) == VK_LINE_RASTERIZATION_MODE_RECTANGULAR_KHR));
5700 }
5701
5702 static void
5703 radv_cmd_buffer_flush_dynamic_state(struct radv_cmd_buffer *cmd_buffer, const uint64_t states)
5704 {
5705 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
5706 const struct radv_physical_device *pdev = radv_device_physical(device);
5707
5708 if (states & (RADV_DYNAMIC_VIEWPORT | RADV_DYNAMIC_DEPTH_CLIP_ENABLE | RADV_DYNAMIC_DEPTH_CLIP_NEGATIVE_ONE_TO_ONE |
5709 RADV_DYNAMIC_DEPTH_CLAMP_ENABLE))
5710 radv_emit_viewport(cmd_buffer);
5711
5712 if (states & (RADV_DYNAMIC_SCISSOR | RADV_DYNAMIC_VIEWPORT) && !pdev->info.has_gfx9_scissor_bug)
5713 radv_emit_scissor(cmd_buffer);
5714
5715 if (states & RADV_DYNAMIC_LINE_WIDTH)
5716 radv_emit_line_width(cmd_buffer);
5717
5718 if (states & RADV_DYNAMIC_BLEND_CONSTANTS)
5719 radv_emit_blend_constants(cmd_buffer);
5720
5721 if (states & (RADV_DYNAMIC_STENCIL_REFERENCE | RADV_DYNAMIC_STENCIL_WRITE_MASK | RADV_DYNAMIC_STENCIL_COMPARE_MASK))
5722 radv_emit_stencil(cmd_buffer);
5723
5724 if (states & RADV_DYNAMIC_DEPTH_BOUNDS)
5725 radv_emit_depth_bounds(cmd_buffer);
5726
5727 if (states & RADV_DYNAMIC_DEPTH_BIAS)
5728 radv_emit_depth_bias(cmd_buffer);
5729
5730 if (states &
5731 (RADV_DYNAMIC_DISCARD_RECTANGLE | RADV_DYNAMIC_DISCARD_RECTANGLE_ENABLE | RADV_DYNAMIC_DISCARD_RECTANGLE_MODE))
5732 radv_emit_discard_rectangle(cmd_buffer);
5733
5734 if (states & RADV_DYNAMIC_CONSERVATIVE_RAST_MODE)
5735 radv_emit_conservative_rast_mode(cmd_buffer);
5736
5737 if (states & RADV_DYNAMIC_SAMPLE_LOCATIONS)
5738 radv_emit_sample_locations(cmd_buffer);
5739
5740 if (states & RADV_DYNAMIC_LINE_STIPPLE)
5741 radv_emit_line_stipple(cmd_buffer);
5742
5743 if (states & (RADV_DYNAMIC_CULL_MODE | RADV_DYNAMIC_FRONT_FACE | RADV_DYNAMIC_DEPTH_BIAS_ENABLE |
5744 RADV_DYNAMIC_PRIMITIVE_TOPOLOGY | RADV_DYNAMIC_POLYGON_MODE | RADV_DYNAMIC_PROVOKING_VERTEX_MODE |
5745 RADV_DYNAMIC_LINE_RASTERIZATION_MODE))
5746 radv_emit_culling(cmd_buffer);
5747
5748 if (states & (RADV_DYNAMIC_PROVOKING_VERTEX_MODE | RADV_DYNAMIC_PRIMITIVE_TOPOLOGY))
5749 radv_emit_provoking_vertex_mode(cmd_buffer);
5750
5751 if ((states & RADV_DYNAMIC_PRIMITIVE_TOPOLOGY) ||
5752 (pdev->info.gfx_level >= GFX12 && states & RADV_DYNAMIC_PATCH_CONTROL_POINTS))
5753 radv_emit_primitive_topology(cmd_buffer);
5754
5755 if (states & (RADV_DYNAMIC_DEPTH_TEST_ENABLE | RADV_DYNAMIC_DEPTH_WRITE_ENABLE | RADV_DYNAMIC_DEPTH_COMPARE_OP |
5756 RADV_DYNAMIC_DEPTH_BOUNDS_TEST_ENABLE | RADV_DYNAMIC_STENCIL_TEST_ENABLE | RADV_DYNAMIC_STENCIL_OP))
5757 radv_emit_depth_control(cmd_buffer);
5758
5759 if (states & RADV_DYNAMIC_STENCIL_OP)
5760 radv_emit_stencil_control(cmd_buffer);
5761
5762 if (states & RADV_DYNAMIC_FRAGMENT_SHADING_RATE)
5763 radv_emit_fragment_shading_rate(cmd_buffer);
5764
5765 if (states & RADV_DYNAMIC_PRIMITIVE_RESTART_ENABLE)
5766 radv_emit_primitive_restart_enable(cmd_buffer);
5767
5768 if (states & (RADV_DYNAMIC_RASTERIZER_DISCARD_ENABLE | RADV_DYNAMIC_DEPTH_CLIP_ENABLE |
5769 RADV_DYNAMIC_DEPTH_CLIP_NEGATIVE_ONE_TO_ONE | RADV_DYNAMIC_DEPTH_CLAMP_ENABLE))
5770 radv_emit_clipping(cmd_buffer);
5771
5772 if (states & (RADV_DYNAMIC_LOGIC_OP | RADV_DYNAMIC_LOGIC_OP_ENABLE | RADV_DYNAMIC_COLOR_WRITE_MASK |
5773 RADV_DYNAMIC_COLOR_BLEND_ENABLE | RADV_DYNAMIC_COLOR_BLEND_EQUATION))
5774 radv_emit_logic_op(cmd_buffer);
5775
5776 if (states & (RADV_DYNAMIC_COLOR_WRITE_ENABLE | RADV_DYNAMIC_COLOR_WRITE_MASK))
5777 radv_emit_color_write(cmd_buffer);
5778
5779 if (states & RADV_DYNAMIC_VERTEX_INPUT)
5780 radv_emit_vertex_input(cmd_buffer);
5781
5782 if (states & RADV_DYNAMIC_PATCH_CONTROL_POINTS)
5783 radv_emit_patch_control_points(cmd_buffer);
5784
5785 if (states & RADV_DYNAMIC_TESS_DOMAIN_ORIGIN)
5786 radv_emit_tess_domain_origin(cmd_buffer);
5787
5788 if (states & RADV_DYNAMIC_ALPHA_TO_COVERAGE_ENABLE)
5789 radv_emit_alpha_to_coverage_enable(cmd_buffer);
5790
5791 if (states & RADV_DYNAMIC_SAMPLE_MASK)
5792 radv_emit_sample_mask(cmd_buffer);
5793
5794 if (states & (RADV_DYNAMIC_DEPTH_CLAMP_ENABLE | RADV_DYNAMIC_DEPTH_CLIP_ENABLE))
5795 radv_emit_depth_clamp_enable(cmd_buffer);
5796
5797 if (states & (RADV_DYNAMIC_COLOR_BLEND_ENABLE | RADV_DYNAMIC_COLOR_WRITE_MASK | RADV_DYNAMIC_COLOR_BLEND_EQUATION |
5798 RADV_DYNAMIC_ALPHA_TO_COVERAGE_ENABLE))
5799 radv_emit_color_blend(cmd_buffer);
5800
5801 if (states & (RADV_DYNAMIC_LINE_RASTERIZATION_MODE | RADV_DYNAMIC_PRIMITIVE_TOPOLOGY | RADV_DYNAMIC_POLYGON_MODE))
5802 radv_emit_line_rasterization_mode(cmd_buffer);
5803
5804 if (states & (RADV_DYNAMIC_RASTERIZATION_SAMPLES | RADV_DYNAMIC_LINE_RASTERIZATION_MODE |
5805 RADV_DYNAMIC_PRIMITIVE_TOPOLOGY | RADV_DYNAMIC_POLYGON_MODE))
5806 radv_emit_rasterization_samples(cmd_buffer);
5807
5808 if (states & (RADV_DYNAMIC_LINE_STIPPLE_ENABLE | RADV_DYNAMIC_CONSERVATIVE_RAST_MODE |
5809 RADV_DYNAMIC_SAMPLE_LOCATIONS | RADV_DYNAMIC_RASTERIZATION_SAMPLES |
5810 RADV_DYNAMIC_LINE_RASTERIZATION_MODE | RADV_DYNAMIC_PRIMITIVE_TOPOLOGY | RADV_DYNAMIC_POLYGON_MODE))
5811 radv_emit_msaa_state(cmd_buffer);
5812
5813 /* RADV_DYNAMIC_ATTACHMENT_FEEDBACK_LOOP_ENABLE is handled by radv_emit_db_shader_control. */
5814
5815 cmd_buffer->state.dirty_dynamic &= ~states;
5816 }
5817
5818 static void
5819 radv_flush_push_descriptors(struct radv_cmd_buffer *cmd_buffer, struct radv_descriptor_state *descriptors_state)
5820 {
5821 struct radv_descriptor_set *set = (struct radv_descriptor_set *)&descriptors_state->push_set.set;
5822 unsigned bo_offset;
5823
5824 if (!radv_cmd_buffer_upload_data(cmd_buffer, set->header.size, set->header.mapped_ptr, &bo_offset))
5825 return;
5826
5827 set->header.va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
5828 set->header.va += bo_offset;
5829 }
5830
5831 void
5832 radv_upload_indirect_descriptor_sets(struct radv_cmd_buffer *cmd_buffer,
5833 struct radv_descriptor_state *descriptors_state)
5834 {
5835 uint32_t size = MAX_SETS * 4;
5836 uint32_t offset;
5837 void *ptr;
5838
5839 if (!radv_cmd_buffer_upload_alloc(cmd_buffer, size, &offset, &ptr))
5840 return;
5841
5842 descriptors_state->indirect_descriptor_sets_va = radv_buffer_get_va(cmd_buffer->upload.upload_bo) + offset;
5843
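/* Store only the low 32 bits of each descriptor set VA; entries for unbound sets are written as 0. */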
5844 for (unsigned i = 0; i < MAX_SETS; i++) {
5845 uint32_t *uptr = ((uint32_t *)ptr) + i;
5846 uint64_t set_va = 0;
5847 if (descriptors_state->valid & (1u << i))
5848 set_va = radv_descriptor_get_va(descriptors_state, i);
5849
5850 uptr[0] = set_va & 0xffffffff;
5851 }
5852 }
5853
5854 ALWAYS_INLINE static void
5855 radv_flush_descriptors(struct radv_cmd_buffer *cmd_buffer, VkShaderStageFlags stages, VkPipelineBindPoint bind_point)
5856 {
5857 struct radv_descriptor_state *descriptors_state = radv_get_descriptors_state(cmd_buffer, bind_point);
5858 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
5859 struct radeon_cmdbuf *cs = cmd_buffer->cs;
5860 bool flush_indirect_descriptors;
5861
5862 if (!descriptors_state->dirty)
5863 return;
5864
5865 flush_indirect_descriptors = descriptors_state->need_indirect_descriptor_sets;
5866
5867 if (flush_indirect_descriptors)
5868 radv_upload_indirect_descriptor_sets(cmd_buffer, descriptors_state);
5869
5870 ASSERTED unsigned cdw_max = radeon_check_space(device->ws, cs, MAX_SETS * MESA_VULKAN_SHADER_STAGES * 4);
5871
5872 if (stages & VK_SHADER_STAGE_COMPUTE_BIT) {
5873 struct radv_shader *compute_shader = bind_point == VK_PIPELINE_BIND_POINT_COMPUTE
5874 ? cmd_buffer->state.shaders[MESA_SHADER_COMPUTE]
5875 : cmd_buffer->state.rt_prolog;
5876
5877 radv_emit_descriptors_per_stage(device, cs, compute_shader, descriptors_state);
5878 } else {
5879 radv_foreach_stage(stage, stages & ~VK_SHADER_STAGE_TASK_BIT_EXT)
5880 {
5881 if (!cmd_buffer->state.shaders[stage])
5882 continue;
5883
5884 radv_emit_descriptors_per_stage(device, cs, cmd_buffer->state.shaders[stage], descriptors_state);
5885 }
5886
5887 if (stages & VK_SHADER_STAGE_TASK_BIT_EXT) {
5888 radv_emit_descriptors_per_stage(device, cmd_buffer->gang.cs, cmd_buffer->state.shaders[MESA_SHADER_TASK],
5889 descriptors_state);
5890 }
5891 }
5892
5893 descriptors_state->dirty = 0;
5894
5895 assert(cmd_buffer->cs->cdw <= cdw_max);
5896
5897 if (radv_device_fault_detection_enabled(device))
5898 radv_save_descriptors(cmd_buffer, bind_point);
5899 }
5900
5901 static void
5902 radv_emit_all_inline_push_consts(const struct radv_device *device, struct radeon_cmdbuf *cs,
5903 const struct radv_shader *shader, const uint32_t *values, bool *need_push_constants)
5904 {
5905 if (radv_get_user_sgpr_info(shader, AC_UD_PUSH_CONSTANTS)->sgpr_idx != -1)
5906 *need_push_constants |= true;
5907
5908 const uint64_t mask = shader->info.inline_push_constant_mask;
5909 if (!mask)
5910 return;
5911
5912 const uint8_t base = ffs(mask) - 1;
5913 if (mask == u_bit_consecutive64(base, util_last_bit64(mask) - base)) {
5914 /* consecutive inline push constants */
5915 radv_emit_inline_push_consts(device, cs, shader, AC_UD_INLINE_PUSH_CONSTANTS, values + base);
5916 } else {
5917 /* sparse inline push constants */
5918 uint32_t consts[AC_MAX_INLINE_PUSH_CONSTS];
5919 unsigned num_consts = 0;
5920 u_foreach_bit64 (idx, mask)
5921 consts[num_consts++] = values[idx];
5922 radv_emit_inline_push_consts(device, cs, shader, AC_UD_INLINE_PUSH_CONSTANTS, consts);
5923 }
5924 }
5925
5926 ALWAYS_INLINE static VkShaderStageFlags
5927 radv_must_flush_constants(const struct radv_cmd_buffer *cmd_buffer, VkShaderStageFlags stages,
5928 VkPipelineBindPoint bind_point)
5929 {
5930 const struct radv_push_constant_state *push_constants = radv_get_push_constants_state(cmd_buffer, bind_point);
5931
5932 if (push_constants->size || push_constants->dynamic_offset_count)
5933 return stages & cmd_buffer->push_constant_stages;
5934
5935 return 0;
5936 }
5937
5938 static void
5939 radv_flush_constants(struct radv_cmd_buffer *cmd_buffer, VkShaderStageFlags stages, VkPipelineBindPoint bind_point)
5940 {
5941 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
5942 struct radeon_cmdbuf *cs = cmd_buffer->cs;
5943 struct radv_descriptor_state *descriptors_state = radv_get_descriptors_state(cmd_buffer, bind_point);
5944 const struct radv_push_constant_state *push_constants = radv_get_push_constants_state(cmd_buffer, bind_point);
5945 struct radv_shader *shader, *prev_shader;
5946 bool need_push_constants = false;
5947 unsigned offset;
5948 void *ptr;
5949 uint64_t va;
5950 uint32_t internal_stages = stages;
5951 uint32_t dirty_stages = 0;
5952
5953 switch (bind_point) {
5954 case VK_PIPELINE_BIND_POINT_GRAPHICS:
5955 break;
5956 case VK_PIPELINE_BIND_POINT_COMPUTE:
5957 dirty_stages = RADV_RT_STAGE_BITS;
5958 break;
5959 case VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR:
5960 internal_stages = VK_SHADER_STAGE_COMPUTE_BIT;
5961 dirty_stages = VK_SHADER_STAGE_COMPUTE_BIT;
5962 break;
5963 default:
5964 unreachable("Unhandled bind point");
5965 }
5966
5967 if (internal_stages & VK_SHADER_STAGE_COMPUTE_BIT) {
5968 struct radv_shader *compute_shader = bind_point == VK_PIPELINE_BIND_POINT_COMPUTE
5969 ? cmd_buffer->state.shaders[MESA_SHADER_COMPUTE]
5970 : cmd_buffer->state.rt_prolog;
5971
5972 radv_emit_all_inline_push_consts(device, cs, compute_shader, (uint32_t *)cmd_buffer->push_constants,
5973 &need_push_constants);
5974 } else {
5975 radv_foreach_stage(stage, internal_stages & ~VK_SHADER_STAGE_TASK_BIT_EXT)
5976 {
5977 shader = radv_get_shader(cmd_buffer->state.shaders, stage);
5978
5979 if (!shader)
5980 continue;
5981
5982 radv_emit_all_inline_push_consts(device, cs, shader, (uint32_t *)cmd_buffer->push_constants,
5983 &need_push_constants);
5984 }
5985
5986 if (internal_stages & VK_SHADER_STAGE_TASK_BIT_EXT) {
5987 radv_emit_all_inline_push_consts(device, cmd_buffer->gang.cs, cmd_buffer->state.shaders[MESA_SHADER_TASK],
5988 (uint32_t *)cmd_buffer->push_constants, &need_push_constants);
5989 }
5990 }
5991
5992 if (need_push_constants) {
5993 if (!radv_cmd_buffer_upload_alloc(cmd_buffer, push_constants->size + 16 * push_constants->dynamic_offset_count,
5994 &offset, &ptr))
5995 return;
5996
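/* Upload the raw push constants followed by the dynamic buffer descriptors
 * (16 bytes per dynamic offset).
 */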
5997 memcpy(ptr, cmd_buffer->push_constants, push_constants->size);
5998 memcpy((char *)ptr + push_constants->size, descriptors_state->dynamic_buffers,
5999 16 * push_constants->dynamic_offset_count);
6000
6001 va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
6002 va += offset;
6003
6004 ASSERTED unsigned cdw_max = radeon_check_space(device->ws, cmd_buffer->cs, MESA_VULKAN_SHADER_STAGES * 4);
6005
6006 if (internal_stages & VK_SHADER_STAGE_COMPUTE_BIT) {
6007 struct radv_shader *compute_shader = bind_point == VK_PIPELINE_BIND_POINT_COMPUTE
6008 ? cmd_buffer->state.shaders[MESA_SHADER_COMPUTE]
6009 : cmd_buffer->state.rt_prolog;
6010
6011 radv_emit_userdata_address(device, cs, compute_shader, AC_UD_PUSH_CONSTANTS, va);
6012 } else {
6013 prev_shader = NULL;
6014 radv_foreach_stage(stage, internal_stages & ~VK_SHADER_STAGE_TASK_BIT_EXT)
6015 {
6016 shader = radv_get_shader(cmd_buffer->state.shaders, stage);
6017
6018 /* Avoid redundantly emitting the address for merged stages. */
6019 if (shader && shader != prev_shader) {
6020 radv_emit_userdata_address(device, cs, shader, AC_UD_PUSH_CONSTANTS, va);
6021
6022 prev_shader = shader;
6023 }
6024 }
6025
6026 if (internal_stages & VK_SHADER_STAGE_TASK_BIT_EXT) {
6027 radv_emit_userdata_address(device, cmd_buffer->gang.cs, cmd_buffer->state.shaders[MESA_SHADER_TASK],
6028 AC_UD_PUSH_CONSTANTS, va);
6029 }
6030 }
6031
6032 assert(cmd_buffer->cs->cdw <= cdw_max);
6033 }
6034
6035 cmd_buffer->push_constant_stages &= ~stages;
6036 cmd_buffer->push_constant_stages |= dirty_stages;
6037 }
6038
6039 void
6040 radv_get_vbo_info(const struct radv_cmd_buffer *cmd_buffer, uint32_t idx, struct radv_vbo_info *vbo_info)
6041 {
6042 const struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
6043 const struct radv_physical_device *pdev = radv_device_physical(device);
6044 const struct radv_vertex_input_state *vi_state = &cmd_buffer->state.vertex_input;
6045 const uint32_t binding = vi_state->bindings[idx];
6046
6047 memset(vbo_info, 0, sizeof(*vbo_info));
6048
6049 vbo_info->binding = binding;
6050 vbo_info->stride = cmd_buffer->vertex_bindings[binding].stride;
6051
6052 vbo_info->attrib_offset = vi_state->offsets[idx];
6053 vbo_info->attrib_index_offset = vi_state->attrib_index_offset[idx];
6054 vbo_info->attrib_format_size = vi_state->format_sizes[idx];
6055
6056 if (!(vi_state->nontrivial_formats & BITFIELD_BIT(idx))) {
6057 const struct ac_vtx_format_info *vtx_info_table =
6058 ac_get_vtx_format_info_table(pdev->info.gfx_level, pdev->info.family);
6059 const struct ac_vtx_format_info *vtx_info = &vtx_info_table[vi_state->formats[idx]];
6060 const uint32_t hw_format = vtx_info->hw_format[vtx_info->num_channels - 1];
6061
6062 if (pdev->info.gfx_level >= GFX10) {
6063 vbo_info->non_trivial_format |= vtx_info->dst_sel | S_008F0C_FORMAT_GFX10(hw_format);
6064 } else {
6065 vbo_info->non_trivial_format |=
6066 vtx_info->dst_sel | S_008F0C_NUM_FORMAT((hw_format >> 4) & 0x7) | S_008F0C_DATA_FORMAT(hw_format & 0xf);
6067 }
6068 }
6069
6070 const struct radv_buffer *buffer = cmd_buffer->vertex_binding_buffers[binding];
6071
6072 if (!buffer)
6073 return;
6074
6075 const uint32_t offset = cmd_buffer->vertex_bindings[binding].offset;
6076
6077 vbo_info->va = radv_buffer_get_va(buffer->bo) + buffer->offset + offset;
6078
6079 if (cmd_buffer->vertex_bindings[binding].size) {
6080 vbo_info->size = cmd_buffer->vertex_bindings[binding].size;
6081 } else {
6082 vbo_info->size = vk_buffer_range(&buffer->vk, offset, VK_WHOLE_SIZE);
6083 }
6084 }
6085
6086 static void
6087 radv_write_vertex_descriptors(const struct radv_cmd_buffer *cmd_buffer, const struct radv_shader *vs, void *vb_ptr)
6088 {
6089 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
6090 const struct radv_physical_device *pdev = radv_device_physical(device);
6091 enum amd_gfx_level chip = pdev->info.gfx_level;
6092 unsigned desc_index = 0;
6093 uint32_t mask = vs->info.vs.vb_desc_usage_mask;
6094 const bool uses_dynamic_inputs = vs->info.vs.dynamic_inputs;
6095 const struct radv_vertex_input_state *vi_state = &cmd_buffer->state.vertex_input;
6096
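/* Each vertex buffer descriptor is a 4-dword buffer resource: base address, stride and
 * address high bits, num_records, and the dword3 format/swizzle word.
 */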
6097 while (mask) {
6098 unsigned i = u_bit_scan(&mask);
6099 uint32_t *desc = &((uint32_t *)vb_ptr)[desc_index++ * 4];
6100
6101 if (uses_dynamic_inputs && !(vi_state->attribute_mask & BITFIELD_BIT(i))) {
6102 /* No vertex attribute description given: assume that the shader doesn't use this
6103 * location (vb_desc_usage_mask can be larger than attribute usage) and use a null
6104 * descriptor to avoid hangs (prologs load all attributes, even if there are holes).
6105 */
6106 memset(desc, 0, 4 * 4);
6107 continue;
6108 }
6109
6110 struct radv_vbo_info vbo_info;
6111 radv_get_vbo_info(cmd_buffer, i, &vbo_info);
6112
6113 uint32_t rsrc_word3;
6114
6115 if (uses_dynamic_inputs && vbo_info.non_trivial_format) {
6116 rsrc_word3 = vbo_info.non_trivial_format;
6117 } else {
6118 rsrc_word3 = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
6119 S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);
6120
6121 if (pdev->info.gfx_level >= GFX10) {
6122 rsrc_word3 |= S_008F0C_FORMAT_GFX10(V_008F0C_GFX10_FORMAT_32_UINT);
6123 } else {
6124 rsrc_word3 |=
6125 S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_UINT) | S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
6126 }
6127 }
6128
6129 if (!vbo_info.va) {
6130 if (uses_dynamic_inputs) {
6131 /* Stride needs to be non-zero on GFX9, or else bounds checking is disabled. We need
6132 * to include the format/word3 so that the alpha channel is 1 for formats without an
6133 * alpha channel.
6134 */
6135 desc[0] = 0;
6136 desc[1] = S_008F04_STRIDE(16);
6137 desc[2] = 0;
6138 desc[3] = rsrc_word3;
6139 } else {
6140 memset(desc, 0, 4 * 4);
6141 }
6142
6143 continue;
6144 }
6145
6146 const unsigned stride = vbo_info.stride;
6147 uint32_t num_records = vbo_info.size;
6148
6149 if (vs->info.vs.use_per_attribute_vb_descs) {
6150 const uint32_t attrib_end = vbo_info.attrib_offset + vbo_info.attrib_format_size;
6151
6152 if (num_records < attrib_end) {
6153 num_records = 0; /* not enough space for one vertex */
6154 } else if (stride == 0) {
6155 num_records = 1; /* only one vertex */
6156 } else {
6157 num_records = (num_records - attrib_end) / stride + 1;
6158 /* If attrib_offset>stride, then the compiler will increase the vertex index by
6159 * attrib_offset/stride and decrease the offset by attrib_offset%stride. This is
6160 * only allowed with static strides.
6161 */
6162 num_records += vbo_info.attrib_index_offset;
6163 }
6164
6165 /* GFX10 uses OOB_SELECT_RAW if stride==0, so convert num_records from elements
6166 * into bytes in that case. GFX8 always uses bytes.
6167 */
6168 if (num_records && (chip == GFX8 || (chip != GFX9 && !stride))) {
6169 num_records = (num_records - 1) * stride + attrib_end;
6170 } else if (!num_records) {
6171 /* On GFX9, it seems bounds checking is disabled if both
6172 * num_records and stride are zero. This doesn't seem necessary on GFX8, GFX10 and
6173 * GFX10.3 but it doesn't hurt.
6174 */
6175 if (uses_dynamic_inputs) {
6176 desc[0] = 0;
6177 desc[1] = S_008F04_STRIDE(16);
6178 desc[2] = 0;
6179 desc[3] = rsrc_word3;
6180 } else {
6181 memset(desc, 0, 16);
6182 }
6183
6184 continue;
6185 }
6186 } else {
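/* Without per-attribute descriptors, num_records counts whole vertices when a non-zero
 * stride is set (except on GFX8, which always uses bytes).
 */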
6187 if (chip != GFX8 && stride)
6188 num_records = DIV_ROUND_UP(num_records, stride);
6189 }
6190
6191 if (chip >= GFX10) {
6192 /* OOB_SELECT chooses the out-of-bounds check:
6193 * - 1: index >= NUM_RECORDS (Structured)
6194 * - 3: offset >= NUM_RECORDS (Raw)
6195 */
6196 int oob_select = stride ? V_008F0C_OOB_SELECT_STRUCTURED : V_008F0C_OOB_SELECT_RAW;
6197 rsrc_word3 |= S_008F0C_OOB_SELECT(oob_select) | S_008F0C_RESOURCE_LEVEL(chip < GFX11);
6198 }
6199
6200 uint64_t va = vbo_info.va;
6201 if (uses_dynamic_inputs)
6202 va += vbo_info.attrib_offset;
6203
6204 desc[0] = va;
6205 desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) | S_008F04_STRIDE(stride);
6206 desc[2] = num_records;
6207 desc[3] = rsrc_word3;
6208 }
6209 }
6210
6211 static void
6212 radv_flush_vertex_descriptors(struct radv_cmd_buffer *cmd_buffer)
6213 {
6214 struct radv_shader *vs = radv_get_shader(cmd_buffer->state.shaders, MESA_SHADER_VERTEX);
6215 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
6216
6217 if (!vs->info.vs.vb_desc_usage_mask)
6218 return;
6219
6220 /* Mesh shaders don't have vertex descriptors. */
6221 assert(!cmd_buffer->state.mesh_shading);
6222
6223 unsigned vb_desc_alloc_size = util_bitcount(vs->info.vs.vb_desc_usage_mask) * 16;
6224 unsigned vb_offset;
6225 void *vb_ptr;
6226 uint64_t va;
6227
6228 /* allocate some descriptor state for vertex buffers */
6229 if (!radv_cmd_buffer_upload_alloc(cmd_buffer, vb_desc_alloc_size, &vb_offset, &vb_ptr))
6230 return;
6231
6232 radv_write_vertex_descriptors(cmd_buffer, vs, vb_ptr);
6233
6234 va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
6235 va += vb_offset;
6236
6237 radv_emit_userdata_address(device, cmd_buffer->cs, vs, AC_UD_VS_VERTEX_BUFFERS, va);
6238
6239 cmd_buffer->state.vb_va = va;
6240 cmd_buffer->state.vb_size = vb_desc_alloc_size;
6241 cmd_buffer->state.prefetch_L2_mask |= RADV_PREFETCH_VBO_DESCRIPTORS;
6242
6243 if (radv_device_fault_detection_enabled(device))
6244 radv_save_vertex_descriptors(cmd_buffer, (uintptr_t)vb_ptr);
6245
6246 cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_VERTEX_BUFFER;
6247 }
6248
6249 static void
6250 radv_emit_streamout_buffers(struct radv_cmd_buffer *cmd_buffer, uint64_t va)
6251 {
6252 const struct radv_shader *last_vgt_shader = cmd_buffer->state.last_vgt_shader;
6253 uint32_t streamout_buffers_offset = radv_get_user_sgpr_loc(last_vgt_shader, AC_UD_STREAMOUT_BUFFERS);
6254 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
6255
6256 if (!streamout_buffers_offset)
6257 return;
6258
6259 radv_emit_shader_pointer(device, cmd_buffer->cs, streamout_buffers_offset, va, false);
6260
6261 if (cmd_buffer->state.gs_copy_shader) {
6262 streamout_buffers_offset = radv_get_user_sgpr_loc(cmd_buffer->state.gs_copy_shader, AC_UD_STREAMOUT_BUFFERS);
6263 if (streamout_buffers_offset)
6264 radv_emit_shader_pointer(device, cmd_buffer->cs, streamout_buffers_offset, va, false);
6265 }
6266 }
6267
6268 static void
6269 radv_emit_streamout_state(struct radv_cmd_buffer *cmd_buffer, uint64_t va)
6270 {
6271 const struct radv_shader *last_vgt_shader = cmd_buffer->state.last_vgt_shader;
6272 const uint32_t streamout_state_offset = radv_get_user_sgpr_loc(last_vgt_shader, AC_UD_STREAMOUT_STATE);
6273 const struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
6274
6275 if (!streamout_state_offset)
6276 return;
6277
6278 radv_emit_shader_pointer(device, cmd_buffer->cs, streamout_state_offset, va, false);
6279 }
6280
6281 static void
6282 radv_flush_streamout_descriptors(struct radv_cmd_buffer *cmd_buffer)
6283 {
6284 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
6285 const struct radv_physical_device *pdev = radv_device_physical(device);
6286
6287 if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_STREAMOUT_BUFFER) {
6288 struct radv_streamout_binding *sb = cmd_buffer->streamout_bindings;
6289 struct radv_streamout_state *so = &cmd_buffer->state.streamout;
6290 unsigned so_offset;
6291 uint64_t desc_va;
6292 void *so_ptr;
6293
6294 /* Allocate some descriptor state for streamout buffers. */
6295 if (!radv_cmd_buffer_upload_alloc(cmd_buffer, MAX_SO_BUFFERS * 16, &so_offset, &so_ptr))
6296 return;
6297
6298 for (uint32_t i = 0; i < MAX_SO_BUFFERS; i++) {
6299 struct radv_buffer *buffer = sb[i].buffer;
6300 uint32_t *desc = &((uint32_t *)so_ptr)[i * 4];
6301 uint32_t size = 0;
6302 uint64_t va = 0;
6303
6304 if (so->enabled_mask & (1 << i)) {
6305 va = radv_buffer_get_va(buffer->bo) + buffer->offset;
6306
6307 va += sb[i].offset;
6308
6309 /* Set the descriptor.
6310 *
6311 * On GFX8, the format must be non-INVALID, otherwise
6312 * the buffer will be considered not bound and store
6313 * instructions will be no-ops.
6314 */
6315 size = 0xffffffff;
6316
6317 if (pdev->use_ngg_streamout) {
6318 /* With NGG streamout, the buffer size is used to determine the max emit per buffer
6319 * and also acts as a disable bit when it's 0.
6320 */
6321 size = radv_is_streamout_enabled(cmd_buffer) ? sb[i].size : 0;
6322 }
6323 }
6324
6325 ac_build_raw_buffer_descriptor(pdev->info.gfx_level, va, size, desc);
6326 }
6327
6328 desc_va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
6329 desc_va += so_offset;
6330
6331 radv_emit_streamout_buffers(cmd_buffer, desc_va);
6332
6333 if (pdev->info.gfx_level >= GFX12) {
6334 const uint8_t first_target = ffs(so->enabled_mask) - 1;
6335 unsigned state_offset;
6336 uint64_t state_va;
6337 void *state_ptr;
6338
6339 /* The layout is:
6340 * struct {
6341 * struct {
6342 * uint32_t ordered_id; // equal for all buffers
6343 * uint32_t dwords_written;
6344 * } buffer[4];
6345 * };
6346 *
6347 * The buffer must be initialized to 0 and the address must be aligned to 64 bytes
6348 * because it's faster when the atomic doesn't straddle a 64B block boundary.
6349 */
6350 if (!radv_cmd_buffer_upload_alloc_aligned(cmd_buffer, MAX_SO_BUFFERS * 8, 64, &state_offset, &state_ptr))
6351 return;
6352
6353 memset(state_ptr, 0, MAX_SO_BUFFERS * 8);
6354
6355 state_va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
6356 state_va += state_offset;
6357
6358 /* The first enabled streamout target will contain the ordered ID/offset buffer for all
6359 * targets.
6360 */
6361 state_va += first_target * 8;
6362
6363 radv_emit_streamout_state(cmd_buffer, state_va);
6364 }
6365 }
6366
6367 cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_STREAMOUT_BUFFER;
6368 }
6369
6370 static void
6371 radv_flush_shader_query_state_gfx(struct radv_cmd_buffer *cmd_buffer)
6372 {
6373 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
6374 const struct radv_physical_device *pdev = radv_device_physical(device);
6375 const struct radv_shader *last_vgt_shader = cmd_buffer->state.last_vgt_shader;
6376 const uint32_t shader_query_state_offset = radv_get_user_sgpr_loc(last_vgt_shader, AC_UD_SHADER_QUERY_STATE);
6377 enum radv_shader_query_state shader_query_state = radv_shader_query_none;
6378
6379 if (!shader_query_state_offset)
6380 return;
6381
6382 assert(last_vgt_shader->info.is_ngg || last_vgt_shader->info.stage == MESA_SHADER_GEOMETRY);
6383
6384 /* By default, shader queries are disabled, but they are enabled if the command buffer has active GDS
6385 * queries or if it's a secondary command buffer that inherits the number of generated
6386 * primitives.
6387 */
6388 if (cmd_buffer->state.active_pipeline_gds_queries ||
6389 (cmd_buffer->state.inherited_pipeline_statistics &
6390 (VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT |
6391 VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_INVOCATIONS_BIT)) ||
6392 (pdev->emulate_mesh_shader_queries && (cmd_buffer->state.inherited_pipeline_statistics &
6393 VK_QUERY_PIPELINE_STATISTIC_MESH_SHADER_INVOCATIONS_BIT_EXT)))
6394 shader_query_state |= radv_shader_query_pipeline_stat;
6395
6396 if (cmd_buffer->state.active_prims_gen_gds_queries)
6397 shader_query_state |= radv_shader_query_prim_gen;
6398
6399 if (cmd_buffer->state.active_prims_xfb_gds_queries && radv_is_streamout_enabled(cmd_buffer)) {
6400 shader_query_state |= radv_shader_query_prim_xfb | radv_shader_query_prim_gen;
6401 }
6402
6403 radeon_set_sh_reg(cmd_buffer->cs, shader_query_state_offset, shader_query_state);
6404 }
6405
6406 static void
6407 radv_flush_shader_query_state_ace(struct radv_cmd_buffer *cmd_buffer, struct radv_shader *task_shader)
6408 {
6409 const uint32_t shader_query_state_offset = radv_get_user_sgpr_loc(task_shader, AC_UD_SHADER_QUERY_STATE);
6410 enum radv_shader_query_state shader_query_state = radv_shader_query_none;
6411
6412 if (!shader_query_state_offset)
6413 return;
6414
6415 /* By default, shader queries are disabled, but they are enabled if the command buffer has active ACE
6416 * queries or if it's a secondary command buffer that inherits the number of task shader
6417 * invocations query.
6418 */
6419 if (cmd_buffer->state.active_pipeline_ace_queries ||
6420 (cmd_buffer->state.inherited_pipeline_statistics & VK_QUERY_PIPELINE_STATISTIC_TASK_SHADER_INVOCATIONS_BIT_EXT))
6421 shader_query_state |= radv_shader_query_pipeline_stat;
6422
6423 radeon_set_sh_reg(cmd_buffer->gang.cs, shader_query_state_offset, shader_query_state);
6424 }
6425
6426 static void
6427 radv_flush_shader_query_state(struct radv_cmd_buffer *cmd_buffer)
6428 {
6429 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
6430 const struct radv_physical_device *pdev = radv_device_physical(device);
6431
6432 radv_flush_shader_query_state_gfx(cmd_buffer);
6433
6434 if (radv_cmdbuf_has_stage(cmd_buffer, MESA_SHADER_TASK) && pdev->emulate_mesh_shader_queries)
6435 radv_flush_shader_query_state_ace(cmd_buffer, cmd_buffer->state.shaders[MESA_SHADER_TASK]);
6436
6437 cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_SHADER_QUERY;
6438 }
6439
6440 static void
6441 radv_flush_force_vrs_state(struct radv_cmd_buffer *cmd_buffer)
6442 {
6443 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
6444 const struct radv_physical_device *pdev = radv_device_physical(device);
6445 const struct radv_shader *last_vgt_shader = cmd_buffer->state.last_vgt_shader;
6446 uint32_t force_vrs_rates_offset;
6447
6448 if (!last_vgt_shader->info.force_vrs_per_vertex) {
6449 /* Un-set the SGPR index so we know to re-emit it later. */
6450 cmd_buffer->state.last_force_vrs_rates_offset = -1;
6451 return;
6452 }
6453
6454 if (cmd_buffer->state.gs_copy_shader) {
6455 force_vrs_rates_offset = radv_get_user_sgpr_loc(cmd_buffer->state.gs_copy_shader, AC_UD_FORCE_VRS_RATES);
6456 } else {
6457 force_vrs_rates_offset = radv_get_user_sgpr_loc(last_vgt_shader, AC_UD_FORCE_VRS_RATES);
6458 }
6459
6460 enum amd_gfx_level gfx_level = pdev->info.gfx_level;
6461 uint32_t vrs_rates = 0;
6462
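/* GFX11+ uses the packed VRS_SHADING_RATE enum; older chips pack the two rate components
 * into 2-bit fields at bits [3:2] and [5:4], as the shifts below show.
 */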
6463 switch (device->force_vrs) {
6464 case RADV_FORCE_VRS_2x2:
6465 vrs_rates = gfx_level >= GFX11 ? V_0283D0_VRS_SHADING_RATE_2X2 : (1u << 2) | (1u << 4);
6466 break;
6467 case RADV_FORCE_VRS_2x1:
6468 vrs_rates = gfx_level >= GFX11 ? V_0283D0_VRS_SHADING_RATE_2X1 : (1u << 2) | (0u << 4);
6469 break;
6470 case RADV_FORCE_VRS_1x2:
6471 vrs_rates = gfx_level >= GFX11 ? V_0283D0_VRS_SHADING_RATE_1X2 : (0u << 2) | (1u << 4);
6472 break;
6473 default:
6474 break;
6475 }
6476
6477 if (cmd_buffer->state.last_vrs_rates != vrs_rates ||
6478 cmd_buffer->state.last_force_vrs_rates_offset != force_vrs_rates_offset) {
6479 radeon_set_sh_reg(cmd_buffer->cs, force_vrs_rates_offset, vrs_rates);
6480 }
6481
6482 cmd_buffer->state.last_vrs_rates = vrs_rates;
6483 cmd_buffer->state.last_force_vrs_rates_offset = force_vrs_rates_offset;
6484 }
6485
6486 static void
6487 radv_upload_graphics_shader_descriptors(struct radv_cmd_buffer *cmd_buffer)
6488 {
6489 if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_VERTEX_BUFFER)
6490 radv_flush_vertex_descriptors(cmd_buffer);
6491
6492 radv_flush_streamout_descriptors(cmd_buffer);
6493
6494 VkShaderStageFlags stages = VK_SHADER_STAGE_ALL_GRAPHICS;
6495 radv_flush_descriptors(cmd_buffer, stages, VK_PIPELINE_BIND_POINT_GRAPHICS);
6496
6497 const VkShaderStageFlags pc_stages = radv_must_flush_constants(cmd_buffer, stages, VK_PIPELINE_BIND_POINT_GRAPHICS);
6498 if (pc_stages)
6499 radv_flush_constants(cmd_buffer, pc_stages, VK_PIPELINE_BIND_POINT_GRAPHICS);
6500
6501 radv_flush_force_vrs_state(cmd_buffer);
6502 }
6503
6504 struct radv_draw_info {
6505 /**
6506 * Number of vertices.
6507 */
6508 uint32_t count;
6509
6510 /**
6511 * First instance id.
6512 */
6513 uint32_t first_instance;
6514
6515 /**
6516 * Number of instances.
6517 */
6518 uint32_t instance_count;
6519
6520 /**
6521 * Whether it's an indexed draw.
6522 */
6523 bool indexed;
6524
6525 /**
6526 * Indirect draw parameters resource.
6527 */
6528 struct radv_buffer *indirect;
6529 uint64_t indirect_offset;
6530 uint32_t stride;
6531
6532 /**
6533 * Draw count parameters resource.
6534 */
6535 struct radv_buffer *count_buffer;
6536 uint64_t count_buffer_offset;
6537
6538 /**
6539 * Stream output parameters resource.
6540 */
6541 struct radv_buffer *strmout_buffer;
6542 uint64_t strmout_buffer_offset;
6543 };
6544
6545 struct radv_prim_vertex_count {
6546 uint8_t min;  /* number of vertices needed to emit the first primitive */
6547 uint8_t incr; /* number of additional vertices needed per subsequent primitive */
6548 };
6549
6550 static inline unsigned
6551 radv_prims_for_vertices(struct radv_prim_vertex_count *info, unsigned num)
6552 {
6553 if (num == 0)
6554 return 0;
6555
6556 if (info->incr == 0)
6557 return 0;
6558
6559 if (num < info->min)
6560 return 0;
6561
6562 return 1 + ((num - info->min) / info->incr);
6563 }
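/* For example: a triangle strip has {min = 3, incr = 1} in prim_size_table below,
 * so num = 6 vertices gives 1 + (6 - 3) / 1 = 4 primitives. */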
6564
6565 static const struct radv_prim_vertex_count prim_size_table[] = {
6566 [V_008958_DI_PT_NONE] = {0, 0}, [V_008958_DI_PT_POINTLIST] = {1, 1},
6567 [V_008958_DI_PT_LINELIST] = {2, 2}, [V_008958_DI_PT_LINESTRIP] = {2, 1},
6568 [V_008958_DI_PT_TRILIST] = {3, 3}, [V_008958_DI_PT_TRIFAN] = {3, 1},
6569 [V_008958_DI_PT_TRISTRIP] = {3, 1}, [V_008958_DI_PT_LINELIST_ADJ] = {4, 4},
6570 [V_008958_DI_PT_LINESTRIP_ADJ] = {4, 1}, [V_008958_DI_PT_TRILIST_ADJ] = {6, 6},
6571 [V_008958_DI_PT_TRISTRIP_ADJ] = {6, 2}, [V_008958_DI_PT_RECTLIST] = {3, 3},
6572 [V_008958_DI_PT_LINELOOP] = {2, 1}, [V_008958_DI_PT_POLYGON] = {3, 1},
6573 [V_008958_DI_PT_2D_TRI_STRIP] = {0, 0},
6574 };
6575
6576 static uint32_t
6577 radv_get_ia_multi_vgt_param(struct radv_cmd_buffer *cmd_buffer, bool instanced_draw, bool indirect_draw,
6578 bool count_from_stream_output, uint32_t draw_vertex_count, unsigned topology,
6579 bool prim_restart_enable, unsigned patch_control_points, unsigned num_tess_patches)
6580 {
6581 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
6582 const struct radv_physical_device *pdev = radv_device_physical(device);
6583 const struct radeon_info *gpu_info = &pdev->info;
6584 const unsigned max_primgroup_in_wave = 2;
6585 /* SWITCH_ON_EOP(0) is always preferable. */
6586 bool wd_switch_on_eop = false;
6587 bool ia_switch_on_eop = false;
6588 bool ia_switch_on_eoi = false;
6589 bool partial_vs_wave = false;
6590 bool partial_es_wave = cmd_buffer->state.ia_multi_vgt_param.partial_es_wave;
6591 bool multi_instances_smaller_than_primgroup;
6592 struct radv_prim_vertex_count prim_vertex_count = prim_size_table[topology];
6593 unsigned primgroup_size;
6594
6595 if (radv_cmdbuf_has_stage(cmd_buffer, MESA_SHADER_TESS_CTRL)) {
6596 primgroup_size = num_tess_patches;
6597 } else if (radv_cmdbuf_has_stage(cmd_buffer, MESA_SHADER_GEOMETRY)) {
6598 primgroup_size = 64;
6599 } else {
6600 primgroup_size = 128; /* recommended without a GS */
6601 }
6602
6603 /* GS requirement. */
6604 if (radv_cmdbuf_has_stage(cmd_buffer, MESA_SHADER_GEOMETRY) && gpu_info->gfx_level <= GFX8) {
6605 unsigned gs_table_depth = pdev->gs_table_depth;
6606 if (SI_GS_PER_ES / primgroup_size >= gs_table_depth - 3)
6607 partial_es_wave = true;
6608 }
6609
6610 if (radv_cmdbuf_has_stage(cmd_buffer, MESA_SHADER_TESS_CTRL)) {
6611 if (topology == V_008958_DI_PT_PATCH) {
6612 prim_vertex_count.min = patch_control_points;
6613 prim_vertex_count.incr = 1;
6614 }
6615 }
6616
6617 multi_instances_smaller_than_primgroup = indirect_draw;
6618 if (!multi_instances_smaller_than_primgroup && instanced_draw) {
6619 uint32_t num_prims = radv_prims_for_vertices(&prim_vertex_count, draw_vertex_count);
6620 if (num_prims < primgroup_size)
6621 multi_instances_smaller_than_primgroup = true;
6622 }
6623
6624 ia_switch_on_eoi = cmd_buffer->state.ia_multi_vgt_param.ia_switch_on_eoi;
6625 partial_vs_wave = cmd_buffer->state.ia_multi_vgt_param.partial_vs_wave;
6626
6627 if (gpu_info->gfx_level >= GFX7) {
6628 /* WD_SWITCH_ON_EOP has no effect on GPUs with fewer than
6629 * 4 shader engines; set it to 1 to pass the assertion below.
6630 * The other cases are hardware requirements. */
6631 if (gpu_info->max_se < 4 || topology == V_008958_DI_PT_POLYGON || topology == V_008958_DI_PT_LINELOOP ||
6632 topology == V_008958_DI_PT_TRIFAN || topology == V_008958_DI_PT_TRISTRIP_ADJ ||
6633 (prim_restart_enable && (gpu_info->family < CHIP_POLARIS10 ||
6634 (topology != V_008958_DI_PT_POINTLIST && topology != V_008958_DI_PT_LINESTRIP))))
6635 wd_switch_on_eop = true;
6636
6637 /* Hawaii hangs if instancing is enabled and WD_SWITCH_ON_EOP is 0.
6638 * We can't know whether instancing is used for indirect draws, so
6639 * treat them as always problematic. */
6640 if (gpu_info->family == CHIP_HAWAII && (instanced_draw || indirect_draw))
6641 wd_switch_on_eop = true;
6642
6643 /* Performance recommendation for 4 SE Gfx7-8 parts if
6644 * instances are smaller than a primgroup.
6645 * Assume indirect draws always use small instances.
6646 * This is needed for good VS wave utilization.
6647 */
6648 if (gpu_info->gfx_level <= GFX8 && gpu_info->max_se == 4 && multi_instances_smaller_than_primgroup)
6649 wd_switch_on_eop = true;
6650
6651 /* Hardware requirement when drawing primitives from a stream
6652 * output buffer.
6653 */
6654 if (count_from_stream_output)
6655 wd_switch_on_eop = true;
6656
6657 /* Required on GFX7 and later. */
6658 if (gpu_info->max_se > 2 && !wd_switch_on_eop)
6659 ia_switch_on_eoi = true;
6660
6661 /* Required by Hawaii and, for some special cases, by GFX8. */
6662 if (ia_switch_on_eoi &&
6663 (gpu_info->family == CHIP_HAWAII ||
6664 (gpu_info->gfx_level == GFX8 &&
6665 /* max primgroup in wave is always 2 - leave this for documentation */
6666 (radv_cmdbuf_has_stage(cmd_buffer, MESA_SHADER_GEOMETRY) || max_primgroup_in_wave != 2))))
6667 partial_vs_wave = true;
6668
6669 /* Instancing bug on Bonaire. */
6670 if (gpu_info->family == CHIP_BONAIRE && ia_switch_on_eoi && (instanced_draw || indirect_draw))
6671 partial_vs_wave = true;
6672
6673 /* If the WD switch is false, the IA switch must be false too. */
6674 assert(wd_switch_on_eop || !ia_switch_on_eop);
6675 }
6676 /* If SWITCH_ON_EOI is set, PARTIAL_ES_WAVE must be set too. */
6677 if (gpu_info->gfx_level <= GFX8 && ia_switch_on_eoi)
6678 partial_es_wave = true;
6679
6680 if (radv_cmdbuf_has_stage(cmd_buffer, MESA_SHADER_GEOMETRY)) {
6681 /* GS hw bug with single-primitive instances and SWITCH_ON_EOI.
6682 * The hw doc says all multi-SE chips are affected, but amdgpu-pro Vulkan
6683 * only applies it to Hawaii. Do what amdgpu-pro Vulkan does.
6684 */
6685 if (gpu_info->family == CHIP_HAWAII && ia_switch_on_eoi) {
6686 bool set_vgt_flush = indirect_draw;
6687 if (!set_vgt_flush && instanced_draw) {
6688 uint32_t num_prims = radv_prims_for_vertices(&prim_vertex_count, draw_vertex_count);
6689 if (num_prims <= 1)
6690 set_vgt_flush = true;
6691 }
6692 if (set_vgt_flush)
6693 cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_VGT_FLUSH;
6694 }
6695 }
6696
6697 /* Workaround for a VGT hang when strip primitive types are used with
6698 * primitive restart.
6699 */
6700 if (prim_restart_enable && (topology == V_008958_DI_PT_LINESTRIP || topology == V_008958_DI_PT_TRISTRIP ||
6701 topology == V_008958_DI_PT_LINESTRIP_ADJ || topology == V_008958_DI_PT_TRISTRIP_ADJ)) {
6702 partial_vs_wave = true;
6703 }
6704
6705 return cmd_buffer->state.ia_multi_vgt_param.base | S_028AA8_PRIMGROUP_SIZE(primgroup_size - 1) |
6706 S_028AA8_SWITCH_ON_EOP(ia_switch_on_eop) | S_028AA8_SWITCH_ON_EOI(ia_switch_on_eoi) |
6707 S_028AA8_PARTIAL_VS_WAVE_ON(partial_vs_wave) | S_028AA8_PARTIAL_ES_WAVE_ON(partial_es_wave) |
6708 S_028AA8_WD_SWITCH_ON_EOP(gpu_info->gfx_level >= GFX7 ? wd_switch_on_eop : 0);
6709 }
6710
6711 static void
6712 radv_emit_ia_multi_vgt_param(struct radv_cmd_buffer *cmd_buffer, bool instanced_draw, bool indirect_draw,
6713 bool count_from_stream_output, uint32_t draw_vertex_count)
6714 {
6715 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
6716 const struct radv_physical_device *pdev = radv_device_physical(device);
6717 const struct radeon_info *gpu_info = &pdev->info;
6718 struct radv_cmd_state *state = &cmd_buffer->state;
6719 const unsigned patch_control_points = state->dynamic.vk.ts.patch_control_points;
6720 const unsigned topology = state->dynamic.vk.ia.primitive_topology;
6721 const bool prim_restart_enable = state->dynamic.vk.ia.primitive_restart_enable;
6722 struct radeon_cmdbuf *cs = cmd_buffer->cs;
6723 unsigned ia_multi_vgt_param;
6724
6725 ia_multi_vgt_param = radv_get_ia_multi_vgt_param(cmd_buffer, instanced_draw, indirect_draw, count_from_stream_output,
6726 draw_vertex_count, topology, prim_restart_enable,
6727 patch_control_points, state->tess_num_patches);
6728
6729 if (state->last_ia_multi_vgt_param != ia_multi_vgt_param) {
6730 if (gpu_info->gfx_level == GFX9) {
6731 radeon_set_uconfig_reg_idx(&pdev->info, cs, R_030960_IA_MULTI_VGT_PARAM, 4, ia_multi_vgt_param);
6732 } else if (gpu_info->gfx_level >= GFX7) {
6733 radeon_set_context_reg_idx(cs, R_028AA8_IA_MULTI_VGT_PARAM, 1, ia_multi_vgt_param);
6734 } else {
6735 radeon_set_context_reg(cs, R_028AA8_IA_MULTI_VGT_PARAM, ia_multi_vgt_param);
6736 }
6737 state->last_ia_multi_vgt_param = ia_multi_vgt_param;
6738 }
6739 }
6740
6741 static void
6742 gfx10_emit_ge_cntl(struct radv_cmd_buffer *cmd_buffer)
6743 {
6744 const struct radv_shader *last_vgt_shader = cmd_buffer->state.last_vgt_shader;
6745 struct radv_cmd_state *state = &cmd_buffer->state;
6746 bool break_wave_at_eoi = false;
6747 unsigned primgroup_size;
6748 unsigned ge_cntl;
6749
6750 if (last_vgt_shader->info.is_ngg)
6751 return;
6752
6753 if (radv_cmdbuf_has_stage(cmd_buffer, MESA_SHADER_TESS_CTRL)) {
6754 const struct radv_shader *tes = radv_get_shader(cmd_buffer->state.shaders, MESA_SHADER_TESS_EVAL);
6755
6756 primgroup_size = state->tess_num_patches;
6757
6758 if (cmd_buffer->state.shaders[MESA_SHADER_TESS_CTRL]->info.uses_prim_id || tes->info.uses_prim_id ||
6759 (tes->info.merged_shader_compiled_separately &&
6760 cmd_buffer->state.shaders[MESA_SHADER_GEOMETRY]->info.uses_prim_id)) {
6761 break_wave_at_eoi = true;
6762 }
6763 } else if (radv_cmdbuf_has_stage(cmd_buffer, MESA_SHADER_GEOMETRY)) {
6764 const struct radv_legacy_gs_info *gs_state = &cmd_buffer->state.shaders[MESA_SHADER_GEOMETRY]->info.gs_ring_info;
6765 primgroup_size = gs_state->gs_prims_per_subgroup;
6766 } else {
6767 primgroup_size = 128; /* recommended without a GS and tess */
6768 }
6769
6770 ge_cntl = S_03096C_PRIM_GRP_SIZE_GFX10(primgroup_size) | S_03096C_VERT_GRP_SIZE(256) | /* disable vertex grouping */
6771 S_03096C_PACKET_TO_ONE_PA(0) /* this should only be set if LINE_STIPPLE_TEX_ENA == 1 */ |
6772 S_03096C_BREAK_WAVE_AT_EOI(break_wave_at_eoi);
6773
6774 if (state->last_ge_cntl != ge_cntl) {
6775 radeon_set_uconfig_reg(cmd_buffer->cs, R_03096C_GE_CNTL, ge_cntl);
6776 state->last_ge_cntl = ge_cntl;
6777 }
6778 }
6779
6780 static void
6781 radv_emit_draw_registers(struct radv_cmd_buffer *cmd_buffer, const struct radv_draw_info *draw_info)
6782 {
6783 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
6784 const struct radv_physical_device *pdev = radv_device_physical(device);
6785 const struct radeon_info *gpu_info = &pdev->info;
6786 struct radv_cmd_state *state = &cmd_buffer->state;
6787 struct radeon_cmdbuf *cs = cmd_buffer->cs;
6788 uint32_t topology = state->dynamic.vk.ia.primitive_topology;
6789 bool disable_instance_packing = false;
6790
6791 /* Draw state. */
6792 if (gpu_info->gfx_level >= GFX10) {
6793 gfx10_emit_ge_cntl(cmd_buffer);
6794 } else {
6795 radv_emit_ia_multi_vgt_param(cmd_buffer, draw_info->instance_count > 1, draw_info->indirect,
6796 !!draw_info->strmout_buffer, draw_info->indirect ? 0 : draw_info->count);
6797 }
6798
6799 /* RDNA2 is affected by a hardware bug: when instance packing is enabled for adjacent primitive
6800 * topologies and instance_count > 1, the pipeline statistics generated by the GE are incorrect. The
6801 * workaround (disabling instance packing) needs to be applied to both indexed and non-indexed draws.
6802 */
6803 if (gpu_info->gfx_level == GFX10_3 && state->active_pipeline_queries > 0 &&
6804 (draw_info->instance_count > 1 || draw_info->indirect) &&
6805 (topology == V_008958_DI_PT_LINELIST_ADJ || topology == V_008958_DI_PT_LINESTRIP_ADJ ||
6806 topology == V_008958_DI_PT_TRILIST_ADJ || topology == V_008958_DI_PT_TRISTRIP_ADJ)) {
6807 disable_instance_packing = true;
6808 }
6809
6810 if ((draw_info->indexed && state->index_type != state->last_index_type) ||
6811 (gpu_info->gfx_level == GFX10_3 &&
6812 (state->last_index_type == -1 ||
6813 disable_instance_packing != G_028A7C_DISABLE_INSTANCE_PACKING(state->last_index_type)))) {
6814 uint32_t index_type = state->index_type | S_028A7C_DISABLE_INSTANCE_PACKING(disable_instance_packing);
6815
6816 if (pdev->info.gfx_level >= GFX9) {
6817 radeon_set_uconfig_reg_idx(&pdev->info, cs, R_03090C_VGT_INDEX_TYPE, 2, index_type);
6818 } else {
6819 radeon_emit(cs, PKT3(PKT3_INDEX_TYPE, 0, 0));
6820 radeon_emit(cs, index_type);
6821 }
6822
6823 state->last_index_type = index_type;
6824 }
6825 }
6826
6827 static void
6828 radv_stage_flush(struct radv_cmd_buffer *cmd_buffer, VkPipelineStageFlags2 src_stage_mask)
6829 {
6830 /* For simplicity, if the barrier wants to wait for the task shader,
6831 * just make it wait for the mesh shader too.
6832 */
6833 if (src_stage_mask & VK_PIPELINE_STAGE_2_TASK_SHADER_BIT_EXT)
6834 src_stage_mask |= VK_PIPELINE_STAGE_2_MESH_SHADER_BIT_EXT;
6835
6836 if (src_stage_mask & (VK_PIPELINE_STAGE_2_COPY_BIT | VK_PIPELINE_STAGE_2_RESOLVE_BIT | VK_PIPELINE_STAGE_2_BLIT_BIT |
6837 VK_PIPELINE_STAGE_2_CLEAR_BIT)) {
6838 /* Be conservative for now. */
6839 src_stage_mask |= VK_PIPELINE_STAGE_2_ALL_TRANSFER_BIT;
6840 }
6841
6842 if (src_stage_mask &
6843 (VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT | VK_PIPELINE_STAGE_2_ALL_TRANSFER_BIT |
6844 VK_PIPELINE_STAGE_2_COMMAND_PREPROCESS_BIT_NV | VK_PIPELINE_STAGE_2_ACCELERATION_STRUCTURE_BUILD_BIT_KHR |
6845 VK_PIPELINE_STAGE_2_ACCELERATION_STRUCTURE_COPY_BIT_KHR | VK_PIPELINE_STAGE_2_RAY_TRACING_SHADER_BIT_KHR |
6846 VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT | VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT)) {
6847 cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_CS_PARTIAL_FLUSH;
6848 }
6849
6850 if (src_stage_mask & (VK_PIPELINE_STAGE_2_FRAGMENT_SHADER_BIT | VK_PIPELINE_STAGE_2_EARLY_FRAGMENT_TESTS_BIT |
6851 VK_PIPELINE_STAGE_2_LATE_FRAGMENT_TESTS_BIT | VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT |
6852 VK_PIPELINE_STAGE_2_ALL_TRANSFER_BIT | VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT |
6853 VK_PIPELINE_STAGE_2_ALL_GRAPHICS_BIT | VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT)) {
6854 cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_PS_PARTIAL_FLUSH;
6855 } else if (src_stage_mask &
6856 (VK_PIPELINE_STAGE_2_DRAW_INDIRECT_BIT | VK_PIPELINE_STAGE_2_VERTEX_INPUT_BIT |
6857 VK_PIPELINE_STAGE_2_VERTEX_SHADER_BIT | VK_PIPELINE_STAGE_2_TESSELLATION_CONTROL_SHADER_BIT |
6858 VK_PIPELINE_STAGE_2_TESSELLATION_EVALUATION_SHADER_BIT | VK_PIPELINE_STAGE_2_GEOMETRY_SHADER_BIT |
6859 VK_PIPELINE_STAGE_2_MESH_SHADER_BIT_EXT | VK_PIPELINE_STAGE_2_TRANSFORM_FEEDBACK_BIT_EXT |
6860 VK_PIPELINE_STAGE_2_PRE_RASTERIZATION_SHADERS_BIT)) {
6861 cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_VS_PARTIAL_FLUSH;
6862 }
6863 }
6864
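/* Roughly: returns true when the render backends (CB/DB) are coherent with L2
 * (GFX9, or GFX10+ parts without tcc_rb_non_coherent), so RB writes don't need an
 * extra L2 writeback/invalidation before other L2 clients read the data; see the
 * barrier notes below.
 */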
6865 static bool
6866 can_skip_buffer_l2_flushes(struct radv_device *device)
6867 {
6868 const struct radv_physical_device *pdev = radv_device_physical(device);
6869 return pdev->info.gfx_level == GFX9 || (pdev->info.gfx_level >= GFX10 && !pdev->info.tcc_rb_non_coherent);
6870 }
6871
6872 /*
6873 * In Vulkan, barriers have two kinds of operations:
6874 *
6875 * - availability (implemented with radv_src_access_flush)
6876 * - visibility (implemented with radv_dst_access_flush)
6877 *
6878 * For a memory operation to observe the result of a previous memory operation,
6879 * one needs to do an availability operation on the source memory and then a
6880 * visibility operation on the destination memory.
6881 *
6882 * The complication is that the availability and visibility operations do not
6883 * need to be in the same barrier.
6884 *
6885 * The cleanest way to implement this is to define the availability operation as
6886 * bringing the caches to a "state of rest", in which none of the caches below
6887 * that level are dirty.
6888 *
6889 * For GFX8 and earlier this would be VRAM/GTT with none of the caches dirty.
6890 *
6891 * For GFX9+ we can define the state at rest to be L2 instead of VRAM for all
6892 * buffers and for images marked as coherent, and VRAM/GTT for non-coherent
6893 * images. However, given the existence of memory barriers which do not specify
6894 * the image/buffer, it often devolves to just VRAM/GTT anyway.
6895 *
6896 * To help reduce invalidations for GPUs that have L2 coherency between the
6897 * RB and the shader caches, we always invalidate L2 on the src side, as we can
6898 * use our knowledge of past usage to optimize flushes away.
6899 */
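/* A sketch of how this typically plays out for a non-L2-coherent color image:
 * a barrier from VK_ACCESS_2_COLOR_ATTACHMENT_WRITE_BIT to
 * VK_ACCESS_2_SHADER_SAMPLED_READ_BIT yields FLUSH_AND_INV_CB (plus CB_META when
 * the image has CB metadata) on the src side, and INV_VCACHE | INV_L2 (plus
 * L2_METADATA where applicable) on the dst side, per radv_src_access_flush() and
 * radv_dst_access_flush() below.
 */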
6900
6901 enum radv_cmd_flush_bits
6902 radv_src_access_flush(struct radv_cmd_buffer *cmd_buffer, VkPipelineStageFlags2 src_stages, VkAccessFlags2 src_flags,
6903 const struct radv_image *image)
6904 {
6905 src_flags = vk_expand_src_access_flags2(src_stages, src_flags);
6906
6907 bool has_CB_meta = true, has_DB_meta = true;
6908 bool image_is_coherent = image ? image->l2_coherent : false;
6909 enum radv_cmd_flush_bits flush_bits = 0;
6910
6911 if (image) {
6912 if (!radv_image_has_CB_metadata(image))
6913 has_CB_meta = false;
6914 if (!radv_image_has_htile(image))
6915 has_DB_meta = false;
6916 }
6917
6918 if (src_flags & VK_ACCESS_2_COMMAND_PREPROCESS_WRITE_BIT_NV)
6919 flush_bits |= RADV_CMD_FLAG_INV_L2;
6920
6921 if (src_flags & (VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT | VK_ACCESS_2_ACCELERATION_STRUCTURE_WRITE_BIT_KHR)) {
6922 /* Since the STORAGE bit isn't set, we know that this is a meta operation.
6923 * On the dst flush side we skip CB/DB flushes for images without the STORAGE
6924 * bit, so flush CB/DB on the src side here. */
6925 if (image && !(image->vk.usage & VK_IMAGE_USAGE_STORAGE_BIT)) {
6926 if (vk_format_is_depth_or_stencil(image->vk.format)) {
6927 flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB;
6928 } else {
6929 flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB;
6930 }
6931 }
6932
6933 if (!image_is_coherent)
6934 flush_bits |= RADV_CMD_FLAG_INV_L2;
6935 }
6936
6937 if (src_flags &
6938 (VK_ACCESS_2_TRANSFORM_FEEDBACK_WRITE_BIT_EXT | VK_ACCESS_2_TRANSFORM_FEEDBACK_COUNTER_WRITE_BIT_EXT)) {
6939 if (!image_is_coherent)
6940 flush_bits |= RADV_CMD_FLAG_WB_L2;
6941 }
6942
6943 if (src_flags & VK_ACCESS_2_COLOR_ATTACHMENT_WRITE_BIT) {
6944 flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB;
6945 if (has_CB_meta)
6946 flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB_META;
6947 }
6948
6949 if (src_flags & VK_ACCESS_2_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT) {
6950 flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB;
6951 if (has_DB_meta)
6952 flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB_META;
6953 }
6954
6955 if (src_flags & VK_ACCESS_2_TRANSFER_WRITE_BIT) {
6956 flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB | RADV_CMD_FLAG_FLUSH_AND_INV_DB;
6957
6958 if (!image_is_coherent)
6959 flush_bits |= RADV_CMD_FLAG_INV_L2;
6960 if (has_CB_meta)
6961 flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB_META;
6962 if (has_DB_meta)
6963 flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB_META;
6964 }
6965
6966 return flush_bits;
6967 }
6968
6969 enum radv_cmd_flush_bits
6970 radv_dst_access_flush(struct radv_cmd_buffer *cmd_buffer, VkPipelineStageFlags2 dst_stages, VkAccessFlags2 dst_flags,
6971 const struct radv_image *image)
6972 {
6973 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
6974 const struct radv_physical_device *pdev = radv_device_physical(device);
6975 bool has_CB_meta = true, has_DB_meta = true;
6976 enum radv_cmd_flush_bits flush_bits = 0;
6977 bool flush_CB = true, flush_DB = true;
6978 bool image_is_coherent = image ? image->l2_coherent : false;
6979 bool flush_L2_metadata = false;
6980
6981 dst_flags = vk_expand_dst_access_flags2(dst_stages, dst_flags);
6982
6983 if (image) {
6984 if (!(image->vk.usage & VK_IMAGE_USAGE_STORAGE_BIT)) {
6985 flush_CB = false;
6986 flush_DB = false;
6987 }
6988
6989 if (!radv_image_has_CB_metadata(image))
6990 has_CB_meta = false;
6991 if (!radv_image_has_htile(image))
6992 has_DB_meta = false;
6993 }
6994
6995 flush_L2_metadata = (has_CB_meta || has_DB_meta) && pdev->info.gfx_level < GFX12;
6996
6997 /* None of the L2 invalidations below apply to the CB/DB caches. So if there are no incoherent
6998 * images left in L2 in CB/DB mode, the data is already usable by all the other L2 clients. */
6999 image_is_coherent |= can_skip_buffer_l2_flushes(device) && !cmd_buffer->state.rb_noncoherent_dirty;
7000
7001 if (dst_flags & VK_ACCESS_2_INDIRECT_COMMAND_READ_BIT) {
7002 /* SMEM loads are used to read compute dispatch size in shaders */
7003 if (!device->load_grid_size_from_user_sgpr)
7004 flush_bits |= RADV_CMD_FLAG_INV_SCACHE;
7005
7006 /* Ensure the DGC meta shader can read the commands. */
7007 if (radv_uses_device_generated_commands(device)) {
7008 flush_bits |= RADV_CMD_FLAG_INV_SCACHE | RADV_CMD_FLAG_INV_VCACHE;
7009
7010 if (pdev->info.gfx_level < GFX9)
7011 flush_bits |= RADV_CMD_FLAG_INV_L2;
7012 }
7013 }
7014
7015 if (dst_flags & VK_ACCESS_2_UNIFORM_READ_BIT)
7016 flush_bits |= RADV_CMD_FLAG_INV_VCACHE | RADV_CMD_FLAG_INV_SCACHE;
7017
7018 if (dst_flags & (VK_ACCESS_2_VERTEX_ATTRIBUTE_READ_BIT | VK_ACCESS_2_INPUT_ATTACHMENT_READ_BIT |
7019 VK_ACCESS_2_TRANSFER_READ_BIT)) {
7020 flush_bits |= RADV_CMD_FLAG_INV_VCACHE;
7021
7022 if (flush_L2_metadata)
7023 flush_bits |= RADV_CMD_FLAG_INV_L2_METADATA;
7024 if (!image_is_coherent)
7025 flush_bits |= RADV_CMD_FLAG_INV_L2;
7026 }
7027
7028 if (dst_flags & VK_ACCESS_2_DESCRIPTOR_BUFFER_READ_BIT_EXT)
7029 flush_bits |= RADV_CMD_FLAG_INV_SCACHE;
7030
7031 if (dst_flags & (VK_ACCESS_2_SHADER_STORAGE_READ_BIT | VK_ACCESS_2_SHADER_BINDING_TABLE_READ_BIT_KHR |
7032 VK_ACCESS_2_ACCELERATION_STRUCTURE_READ_BIT_KHR | VK_ACCESS_2_SHADER_SAMPLED_READ_BIT)) {
7033 if (dst_flags & (VK_ACCESS_2_SHADER_STORAGE_READ_BIT | VK_ACCESS_2_SHADER_BINDING_TABLE_READ_BIT_KHR |
7034 VK_ACCESS_2_ACCELERATION_STRUCTURE_READ_BIT_KHR)) {
7035 /* Unlike LLVM, ACO uses SMEM for SSBOs and we have to
7036 * invalidate the scalar cache. */
7037 if (!pdev->use_llvm && !image)
7038 flush_bits |= RADV_CMD_FLAG_INV_SCACHE;
7039 }
7040
7041 flush_bits |= RADV_CMD_FLAG_INV_VCACHE;
7042 if (flush_L2_metadata)
7043 flush_bits |= RADV_CMD_FLAG_INV_L2_METADATA;
7044 if (!image_is_coherent)
7045 flush_bits |= RADV_CMD_FLAG_INV_L2;
7046 }
7047
7048 if (dst_flags & VK_ACCESS_2_COMMAND_PREPROCESS_READ_BIT_NV) {
7049 flush_bits |= RADV_CMD_FLAG_INV_VCACHE;
7050 if (pdev->info.gfx_level < GFX9)
7051 flush_bits |= RADV_CMD_FLAG_INV_L2;
7052 }
7053
7054 if (dst_flags & VK_ACCESS_2_COLOR_ATTACHMENT_READ_BIT) {
7055 if (flush_CB)
7056 flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB;
7057 if (has_CB_meta)
7058 flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB_META;
7059 }
7060
7061 if (dst_flags & VK_ACCESS_2_DEPTH_STENCIL_ATTACHMENT_READ_BIT) {
7062 if (flush_DB)
7063 flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB;
7064 if (has_DB_meta)
7065 flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB_META;
7066 }
7067
7068 return flush_bits;
7069 }
7070
7071 void
7072 radv_emit_resolve_barrier(struct radv_cmd_buffer *cmd_buffer, const struct radv_resolve_barrier *barrier)
7073 {
7074 struct radv_rendering_state *render = &cmd_buffer->state.render;
7075
7076 for (uint32_t i = 0; i < render->color_att_count; i++) {
7077 struct radv_image_view *iview = render->color_att[i].iview;
7078 if (!iview)
7079 continue;
7080
7081 cmd_buffer->state.flush_bits |=
7082 radv_src_access_flush(cmd_buffer, barrier->src_stage_mask, barrier->src_access_mask, iview->image);
7083 }
7084 if (render->ds_att.iview) {
7085 cmd_buffer->state.flush_bits |= radv_src_access_flush(cmd_buffer, barrier->src_stage_mask,
7086 barrier->src_access_mask, render->ds_att.iview->image);
7087 }
7088
7089 radv_stage_flush(cmd_buffer, barrier->src_stage_mask);
7090
7091 for (uint32_t i = 0; i < render->color_att_count; i++) {
7092 struct radv_image_view *iview = render->color_att[i].iview;
7093 if (!iview)
7094 continue;
7095
7096 cmd_buffer->state.flush_bits |=
7097 radv_dst_access_flush(cmd_buffer, barrier->dst_stage_mask, barrier->dst_access_mask, iview->image);
7098 }
7099 if (render->ds_att.iview) {
7100 cmd_buffer->state.flush_bits |= radv_dst_access_flush(cmd_buffer, barrier->dst_stage_mask,
7101 barrier->dst_access_mask, render->ds_att.iview->image);
7102 }
7103
7104 radv_gang_barrier(cmd_buffer, barrier->src_stage_mask, barrier->dst_stage_mask);
7105 }
7106
7107 static void
7108 radv_handle_image_transition_separate(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
7109 VkImageLayout src_layout, VkImageLayout dst_layout,
7110 VkImageLayout src_stencil_layout, VkImageLayout dst_stencil_layout,
7111 uint32_t src_family_index, uint32_t dst_family_index,
7112 const VkImageSubresourceRange *range,
7113 struct radv_sample_locations_state *sample_locs)
7114 {
7115 /* If we have a stencil layout that's different from depth, we need to
7116 * perform the stencil transition separately.
7117 */
7118 if ((range->aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT) &&
7119 (src_layout != src_stencil_layout || dst_layout != dst_stencil_layout)) {
7120 VkImageSubresourceRange aspect_range = *range;
7121 /* Depth-only transitions. */
7122 if (range->aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) {
7123 aspect_range.aspectMask = VK_IMAGE_ASPECT_DEPTH_BIT;
7124 radv_handle_image_transition(cmd_buffer, image, src_layout, dst_layout, src_family_index, dst_family_index,
7125 &aspect_range, sample_locs);
7126 }
7127
7128 /* Stencil-only transitions. */
7129 aspect_range.aspectMask = VK_IMAGE_ASPECT_STENCIL_BIT;
7130 radv_handle_image_transition(cmd_buffer, image, src_stencil_layout, dst_stencil_layout, src_family_index,
7131 dst_family_index, &aspect_range, sample_locs);
7132 } else {
7133 radv_handle_image_transition(cmd_buffer, image, src_layout, dst_layout, src_family_index, dst_family_index, range,
7134 sample_locs);
7135 }
7136 }
7137
7138 static void
7139 radv_handle_rendering_image_transition(struct radv_cmd_buffer *cmd_buffer, struct radv_image_view *view,
7140 uint32_t layer_count, uint32_t view_mask, VkImageLayout initial_layout,
7141 VkImageLayout initial_stencil_layout, VkImageLayout final_layout,
7142 VkImageLayout final_stencil_layout,
7143 struct radv_sample_locations_state *sample_locs)
7144 {
7145 VkImageSubresourceRange range;
7146 range.aspectMask = view->image->vk.aspects;
7147 range.baseMipLevel = view->vk.base_mip_level;
7148 range.levelCount = 1;
7149
7150 if (view_mask) {
7151 while (view_mask) {
7152 int start, count;
7153 u_bit_scan_consecutive_range(&view_mask, &start, &count);
7154
7155 range.baseArrayLayer = view->vk.base_array_layer + start;
7156 range.layerCount = count;
7157
7158 radv_handle_image_transition_separate(cmd_buffer, view->image, initial_layout, final_layout,
7159 initial_stencil_layout, final_stencil_layout, 0, 0, &range, sample_locs);
7160 }
7161 } else {
7162 range.baseArrayLayer = view->vk.base_array_layer;
7163 range.layerCount = layer_count;
7164 radv_handle_image_transition_separate(cmd_buffer, view->image, initial_layout, final_layout,
7165 initial_stencil_layout, final_stencil_layout, 0, 0, &range, sample_locs);
7166 }
7167 }
7168
7169 VKAPI_ATTR VkResult VKAPI_CALL
7170 radv_BeginCommandBuffer(VkCommandBuffer commandBuffer, const VkCommandBufferBeginInfo *pBeginInfo)
7171 {
7172 VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
7173 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
7174 const struct radv_physical_device *pdev = radv_device_physical(device);
7175 VkResult result = VK_SUCCESS;
7176
7177 vk_command_buffer_begin(&cmd_buffer->vk, pBeginInfo);
7178
7179 if (cmd_buffer->qf == RADV_QUEUE_SPARSE)
7180 return result;
7181
7182 memset(&cmd_buffer->state, 0, sizeof(cmd_buffer->state));
7183 cmd_buffer->state.last_index_type = -1;
7184 cmd_buffer->state.last_num_instances = -1;
7185 cmd_buffer->state.last_vertex_offset_valid = false;
7186 cmd_buffer->state.last_first_instance = -1;
7187 cmd_buffer->state.last_drawid = -1;
7188 cmd_buffer->state.last_subpass_color_count = MAX_RTS;
7189 cmd_buffer->state.predication_type = -1;
7190 cmd_buffer->state.mesh_shading = false;
7191 cmd_buffer->state.last_vrs_rates = -1;
7192 cmd_buffer->state.last_force_vrs_rates_offset = -1;
7193
7194 radv_reset_tracked_regs(cmd_buffer);
7195
7196 cmd_buffer->usage_flags = pBeginInfo->flags;
7197
7198 cmd_buffer->state.dirty |=
7199 RADV_CMD_DIRTY_GUARDBAND | RADV_CMD_DIRTY_OCCLUSION_QUERY | RADV_CMD_DIRTY_DB_SHADER_CONTROL;
7200 cmd_buffer->state.dirty_dynamic |= RADV_DYNAMIC_ALL;
7201
7202 if (cmd_buffer->qf == RADV_QUEUE_GENERAL)
7203 vk_dynamic_graphics_state_init(&cmd_buffer->state.dynamic.vk);
7204
7205 if (cmd_buffer->qf == RADV_QUEUE_COMPUTE || device->vk.enabled_features.taskShader) {
7206 uint32_t pred_value = 0;
7207 uint32_t pred_offset;
7208 if (!radv_cmd_buffer_upload_data(cmd_buffer, 4, &pred_value, &pred_offset))
7209 vk_command_buffer_set_error(&cmd_buffer->vk, VK_ERROR_OUT_OF_HOST_MEMORY);
7210
7211 cmd_buffer->state.mec_inv_pred_emitted = false;
7212 cmd_buffer->state.mec_inv_pred_va = radv_buffer_get_va(cmd_buffer->upload.upload_bo) + pred_offset;
7213 }
7214
7215 if (pdev->info.gfx_level >= GFX9 && cmd_buffer->qf == RADV_QUEUE_GENERAL) {
7216 unsigned num_db = pdev->info.max_render_backends;
7217 unsigned fence_offset, eop_bug_offset;
7218 void *fence_ptr;
7219
7220 radv_cmd_buffer_upload_alloc(cmd_buffer, 8, &fence_offset, &fence_ptr);
7221 memset(fence_ptr, 0, 8);
7222
7223 cmd_buffer->gfx9_fence_va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
7224 cmd_buffer->gfx9_fence_va += fence_offset;
7225
7226 radv_emit_clear_data(cmd_buffer, V_370_PFP, cmd_buffer->gfx9_fence_va, 8);
7227
7228 if (pdev->info.gfx_level == GFX9) {
7229 /* Allocate a buffer for the EOP bug on GFX9. */
7230 radv_cmd_buffer_upload_alloc(cmd_buffer, 16 * num_db, &eop_bug_offset, &fence_ptr);
7231 memset(fence_ptr, 0, 16 * num_db);
7232 cmd_buffer->gfx9_eop_bug_va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
7233 cmd_buffer->gfx9_eop_bug_va += eop_bug_offset;
7234
7235 radv_emit_clear_data(cmd_buffer, V_370_PFP, cmd_buffer->gfx9_eop_bug_va, 16 * num_db);
7236 }
7237 }
7238
7239 if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY &&
7240 (pBeginInfo->flags & VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT)) {
7241
7242 char gcbiar_data[VK_GCBIARR_DATA_SIZE(MAX_RTS)];
7243 const VkRenderingInfo *resume_info =
7244 vk_get_command_buffer_inheritance_as_rendering_resume(cmd_buffer->vk.level, pBeginInfo, gcbiar_data);
7245 if (resume_info) {
7246 radv_CmdBeginRendering(commandBuffer, resume_info);
7247 } else {
7248 const VkCommandBufferInheritanceRenderingInfo *inheritance_info =
7249 vk_get_command_buffer_inheritance_rendering_info(cmd_buffer->vk.level, pBeginInfo);
7250
7251 radv_cmd_buffer_reset_rendering(cmd_buffer);
7252 struct radv_rendering_state *render = &cmd_buffer->state.render;
7253 render->active = true;
7254 render->view_mask = inheritance_info->viewMask;
7255 render->max_samples = inheritance_info->rasterizationSamples;
7256 render->color_att_count = inheritance_info->colorAttachmentCount;
7257 for (uint32_t i = 0; i < render->color_att_count; i++) {
7258 render->color_att[i] = (struct radv_attachment){
7259 .format = inheritance_info->pColorAttachmentFormats[i],
7260 };
7261 }
7262 assert(inheritance_info->depthAttachmentFormat == VK_FORMAT_UNDEFINED ||
7263 inheritance_info->stencilAttachmentFormat == VK_FORMAT_UNDEFINED ||
7264 inheritance_info->depthAttachmentFormat == inheritance_info->stencilAttachmentFormat);
7265 render->ds_att = (struct radv_attachment){.iview = NULL};
7266 if (inheritance_info->depthAttachmentFormat != VK_FORMAT_UNDEFINED)
7267 render->ds_att.format = inheritance_info->depthAttachmentFormat;
7268 if (inheritance_info->stencilAttachmentFormat != VK_FORMAT_UNDEFINED)
7269 render->ds_att.format = inheritance_info->stencilAttachmentFormat;
7270
7271 if (vk_format_has_depth(render->ds_att.format))
7272 render->ds_att_aspects |= VK_IMAGE_ASPECT_DEPTH_BIT;
7273 if (vk_format_has_stencil(render->ds_att.format))
7274 render->ds_att_aspects |= VK_IMAGE_ASPECT_STENCIL_BIT;
7275 }
7276
7277 cmd_buffer->state.inherited_pipeline_statistics = pBeginInfo->pInheritanceInfo->pipelineStatistics;
7278
7279 if (cmd_buffer->state.inherited_pipeline_statistics &
7280 (VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT |
7281 VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_INVOCATIONS_BIT))
7282 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_SHADER_QUERY;
7283
7284 cmd_buffer->state.inherited_occlusion_queries = pBeginInfo->pInheritanceInfo->occlusionQueryEnable;
7285 cmd_buffer->state.inherited_query_control_flags = pBeginInfo->pInheritanceInfo->queryFlags;
7286 if (cmd_buffer->state.inherited_occlusion_queries)
7287 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_OCCLUSION_QUERY;
7288 }
7289
7290 if (radv_device_fault_detection_enabled(device))
7291 radv_cmd_buffer_trace_emit(cmd_buffer);
7292
7293 radv_describe_begin_cmd_buffer(cmd_buffer);
7294
7295 return result;
7296 }
7297
7298 VKAPI_ATTR void VKAPI_CALL
7299 radv_CmdBindVertexBuffers2(VkCommandBuffer commandBuffer, uint32_t firstBinding, uint32_t bindingCount,
7300 const VkBuffer *pBuffers, const VkDeviceSize *pOffsets, const VkDeviceSize *pSizes,
7301 const VkDeviceSize *pStrides)
7302 {
7303 VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
7304 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
7305 struct radv_vertex_binding *vb = cmd_buffer->vertex_bindings;
7306 const struct radv_vertex_input_state *vi_state = &cmd_buffer->state.vertex_input;
7307
7308 /* We have to defer setting up the vertex buffer descriptors since we need
7309 * the buffer stride from the pipeline. */
7310
7311 assert(firstBinding + bindingCount <= MAX_VBS);
7312
7313 if (firstBinding + bindingCount > cmd_buffer->used_vertex_bindings)
7314 cmd_buffer->used_vertex_bindings = firstBinding + bindingCount;
7315
7316 uint32_t misaligned_mask_invalid = 0;
7317
7318 for (uint32_t i = 0; i < bindingCount; i++) {
7319 VK_FROM_HANDLE(radv_buffer, buffer, pBuffers[i]);
7320 uint32_t idx = firstBinding + i;
7321 VkDeviceSize size = pSizes ? pSizes[i] : 0;
7322 /* If pStrides is NULL, don't overwrite the strides specified by CmdSetVertexInputEXT. */
7323 VkDeviceSize stride = pStrides ? pStrides[i] : vb[idx].stride;
7324
7325 if (!!cmd_buffer->vertex_binding_buffers[idx] != !!buffer ||
7326 (buffer && ((vb[idx].offset & 0x3) != (pOffsets[i] & 0x3) || (vb[idx].stride & 0x3) != (stride & 0x3)))) {
7327 misaligned_mask_invalid |= vi_state->bindings_match_attrib ? BITFIELD_BIT(idx) : 0xffffffff;
7328 }
7329
7330 cmd_buffer->vertex_binding_buffers[idx] = buffer;
7331 vb[idx].offset = pOffsets[i];
7332 vb[idx].size = buffer ? vk_buffer_range(&buffer->vk, pOffsets[i], size) : size;
7333 vb[idx].stride = stride;
7334
7335 uint32_t bit = BITFIELD_BIT(idx);
7336 if (buffer) {
7337 radv_cs_add_buffer(device->ws, cmd_buffer->cs, cmd_buffer->vertex_binding_buffers[idx]->bo);
7338 cmd_buffer->state.vbo_bound_mask |= bit;
7339 } else {
7340 cmd_buffer->state.vbo_bound_mask &= ~bit;
7341 }
7342 }
7343
7344 if (misaligned_mask_invalid) {
7345 cmd_buffer->state.vbo_misaligned_mask_invalid = misaligned_mask_invalid;
7346 cmd_buffer->state.vbo_misaligned_mask &= ~misaligned_mask_invalid;
7347 cmd_buffer->state.vbo_unaligned_mask &= ~misaligned_mask_invalid;
7348 }
7349
7350 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_VERTEX_BUFFER;
7351 cmd_buffer->state.dirty_dynamic |= RADV_DYNAMIC_VERTEX_INPUT;
7352 }
7353
7354 static uint32_t
7355 vk_to_index_type(VkIndexType type)
7356 {
7357 switch (type) {
7358 case VK_INDEX_TYPE_UINT8_KHR:
7359 return V_028A7C_VGT_INDEX_8;
7360 case VK_INDEX_TYPE_UINT16:
7361 return V_028A7C_VGT_INDEX_16;
7362 case VK_INDEX_TYPE_UINT32:
7363 return V_028A7C_VGT_INDEX_32;
7364 default:
7365 unreachable("invalid index type");
7366 }
7367 }
7368
7369 static uint32_t
7370 radv_get_vgt_index_size(uint32_t type)
7371 {
7372 uint32_t index_type = G_028A7C_INDEX_TYPE(type);
7373 switch (index_type) {
7374 case V_028A7C_VGT_INDEX_8:
7375 return 1;
7376 case V_028A7C_VGT_INDEX_16:
7377 return 2;
7378 case V_028A7C_VGT_INDEX_32:
7379 return 4;
7380 default:
7381 unreachable("invalid index type");
7382 }
7383 }
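/* Example: VK_INDEX_TYPE_UINT16 maps to V_028A7C_VGT_INDEX_16, i.e. a 2-byte index,
 * so radv_CmdBindIndexBuffer2KHR() below computes max_index_count as the bound
 * range divided by 2.
 */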
7384
7385 VKAPI_ATTR void VKAPI_CALL
7386 radv_CmdBindIndexBuffer2KHR(VkCommandBuffer commandBuffer, VkBuffer buffer, VkDeviceSize offset, VkDeviceSize size,
7387 VkIndexType indexType)
7388 {
7389 VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
7390 VK_FROM_HANDLE(radv_buffer, index_buffer, buffer);
7391 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
7392 const struct radv_physical_device *pdev = radv_device_physical(device);
7393
7394 cmd_buffer->state.index_type = vk_to_index_type(indexType);
7395
7396 if (index_buffer) {
7397 cmd_buffer->state.index_va = radv_buffer_get_va(index_buffer->bo);
7398 cmd_buffer->state.index_va += index_buffer->offset + offset;
7399
7400 int index_size = radv_get_vgt_index_size(vk_to_index_type(indexType));
7401 cmd_buffer->state.max_index_count = (vk_buffer_range(&index_buffer->vk, offset, size)) / index_size;
7402 radv_cs_add_buffer(device->ws, cmd_buffer->cs, index_buffer->bo);
7403 } else {
7404 cmd_buffer->state.index_va = 0;
7405 cmd_buffer->state.max_index_count = 0;
7406
7407 if (pdev->info.has_null_index_buffer_clamping_bug)
7408 cmd_buffer->state.index_va = 0x2; /* dummy non-zero address for parts that mishandle a null index buffer VA */
7409 }
7410
7411 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_INDEX_BUFFER;
7412
7413 /* Primitive restart state depends on the index type. */
7414 if (cmd_buffer->state.dynamic.vk.ia.primitive_restart_enable)
7415 cmd_buffer->state.dirty_dynamic |= RADV_DYNAMIC_PRIMITIVE_RESTART_ENABLE;
7416 }
7417
7418 static void
7419 radv_bind_descriptor_set(struct radv_cmd_buffer *cmd_buffer, VkPipelineBindPoint bind_point,
7420 struct radv_descriptor_set *set, unsigned idx)
7421 {
7422 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
7423 struct radeon_winsys *ws = device->ws;
7424
7425 radv_set_descriptor_set(cmd_buffer, bind_point, set, idx);
7426
7427 assert(set);
7428 assert(!(set->header.layout->flags & VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR));
7429
7430 if (!device->use_global_bo_list) {
7431 for (unsigned j = 0; j < set->header.buffer_count; ++j)
7432 if (set->descriptors[j])
7433 radv_cs_add_buffer(ws, cmd_buffer->cs, set->descriptors[j]);
7434 }
7435
7436 if (set->header.bo)
7437 radv_cs_add_buffer(ws, cmd_buffer->cs, set->header.bo);
7438 }
7439
7440 static void
7441 radv_bind_descriptor_sets(struct radv_cmd_buffer *cmd_buffer,
7442 const VkBindDescriptorSetsInfoKHR *pBindDescriptorSetsInfo, VkPipelineBindPoint bind_point)
7443 {
7444 VK_FROM_HANDLE(radv_pipeline_layout, layout, pBindDescriptorSetsInfo->layout);
7445 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
7446 const struct radv_physical_device *pdev = radv_device_physical(device);
7447 const struct radv_instance *instance = radv_physical_device_instance(pdev);
7448 const bool no_dynamic_bounds = instance->debug_flags & RADV_DEBUG_NO_DYNAMIC_BOUNDS;
7449 struct radv_descriptor_state *descriptors_state = radv_get_descriptors_state(cmd_buffer, bind_point);
7450 unsigned dyn_idx = 0;
7451
7452 for (unsigned i = 0; i < pBindDescriptorSetsInfo->descriptorSetCount; ++i) {
7453 unsigned set_idx = i + pBindDescriptorSetsInfo->firstSet;
7454 VK_FROM_HANDLE(radv_descriptor_set, set, pBindDescriptorSetsInfo->pDescriptorSets[i]);
7455
7456 if (!set)
7457 continue;
7458
7459 /* If the set is already bound we only need to update the
7460 * (potentially changed) dynamic offsets. */
7461 if (descriptors_state->sets[set_idx] != set || !(descriptors_state->valid & (1u << set_idx))) {
7462 radv_bind_descriptor_set(cmd_buffer, bind_point, set, set_idx);
7463 }
7464
7465 for (unsigned j = 0; j < set->header.layout->dynamic_offset_count; ++j, ++dyn_idx) {
7466 unsigned idx = j + layout->set[i + pBindDescriptorSetsInfo->firstSet].dynamic_offset_start;
7467 uint32_t *dst = descriptors_state->dynamic_buffers + idx * 4;
7468 assert(dyn_idx < pBindDescriptorSetsInfo->dynamicOffsetCount);
7469
7470 struct radv_descriptor_range *range = set->header.dynamic_descriptors + j;
7471
7472 if (!range->va) {
7473 memset(dst, 0, 4 * 4);
7474 } else {
7475 uint64_t va = range->va + pBindDescriptorSetsInfo->pDynamicOffsets[dyn_idx];
7476 const uint32_t size = no_dynamic_bounds ? 0xffffffffu : range->size;
7477
7478 ac_build_raw_buffer_descriptor(pdev->info.gfx_level, va, size, dst);
7479 }
7480
7481 cmd_buffer->push_constant_stages |= set->header.layout->dynamic_shader_stages;
7482 }
7483 }
7484 }
7485
7486 VKAPI_ATTR void VKAPI_CALL
7487 radv_CmdBindDescriptorSets2KHR(VkCommandBuffer commandBuffer,
7488 const VkBindDescriptorSetsInfoKHR *pBindDescriptorSetsInfo)
7489 {
7490 VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
7491
7492 if (pBindDescriptorSetsInfo->stageFlags & VK_SHADER_STAGE_COMPUTE_BIT) {
7493 radv_bind_descriptor_sets(cmd_buffer, pBindDescriptorSetsInfo, VK_PIPELINE_BIND_POINT_COMPUTE);
7494 }
7495
7496 if (pBindDescriptorSetsInfo->stageFlags & RADV_GRAPHICS_STAGE_BITS) {
7497 radv_bind_descriptor_sets(cmd_buffer, pBindDescriptorSetsInfo, VK_PIPELINE_BIND_POINT_GRAPHICS);
7498 }
7499
7500 if (pBindDescriptorSetsInfo->stageFlags & RADV_RT_STAGE_BITS) {
7501 radv_bind_descriptor_sets(cmd_buffer, pBindDescriptorSetsInfo, VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR);
7502 }
7503 }
7504
7505 static bool
7506 radv_init_push_descriptor_set(struct radv_cmd_buffer *cmd_buffer, struct radv_descriptor_set *set,
7507 struct radv_descriptor_set_layout *layout, VkPipelineBindPoint bind_point)
7508 {
7509 struct radv_descriptor_state *descriptors_state = radv_get_descriptors_state(cmd_buffer, bind_point);
7510 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
7511 set->header.size = layout->size;
7512
7513 if (set->header.layout != layout) {
7514 if (set->header.layout)
7515 vk_descriptor_set_layout_unref(&device->vk, &set->header.layout->vk);
7516 vk_descriptor_set_layout_ref(&layout->vk);
7517 set->header.layout = layout;
7518 }
7519
7520 if (descriptors_state->push_set.capacity < set->header.size) {
7521 size_t new_size = MAX2(set->header.size, 1024);
7522 new_size = MAX2(new_size, 2 * descriptors_state->push_set.capacity);
7523 new_size = MIN2(new_size, 96 * MAX_PUSH_DESCRIPTORS);
7524
7525 free(set->header.mapped_ptr);
7526 set->header.mapped_ptr = malloc(new_size);
7527
7528 if (!set->header.mapped_ptr) {
7529 descriptors_state->push_set.capacity = 0;
7530 vk_command_buffer_set_error(&cmd_buffer->vk, VK_ERROR_OUT_OF_HOST_MEMORY);
7531 return false;
7532 }
7533
7534 descriptors_state->push_set.capacity = new_size;
7535 }
7536
7537 return true;
7538 }
7539
7540 void
7541 radv_meta_push_descriptor_set(struct radv_cmd_buffer *cmd_buffer, VkPipelineBindPoint pipelineBindPoint,
7542 VkPipelineLayout _layout, uint32_t set, uint32_t descriptorWriteCount,
7543 const VkWriteDescriptorSet *pDescriptorWrites)
7544 {
7545 VK_FROM_HANDLE(radv_pipeline_layout, layout, _layout);
7546 struct radv_descriptor_set *push_set = (struct radv_descriptor_set *)&cmd_buffer->meta_push_descriptors;
7547 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
7548 unsigned bo_offset;
7549
7550 assert(set == 0);
7551 assert(layout->set[set].layout->flags & VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR);
7552
7553 push_set->header.size = layout->set[set].layout->size;
7554 push_set->header.layout = layout->set[set].layout;
7555
7556 if (!radv_cmd_buffer_upload_alloc(cmd_buffer, push_set->header.size, &bo_offset,
7557 (void **)&push_set->header.mapped_ptr))
7558 return;
7559
7560 push_set->header.va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
7561 push_set->header.va += bo_offset;
7562
7563 radv_cmd_update_descriptor_sets(device, cmd_buffer, radv_descriptor_set_to_handle(push_set), descriptorWriteCount,
7564 pDescriptorWrites, 0, NULL);
7565
7566 radv_set_descriptor_set(cmd_buffer, pipelineBindPoint, push_set, set);
7567 }
7568
7569 static void
7570 radv_push_descriptor_set(struct radv_cmd_buffer *cmd_buffer, const VkPushDescriptorSetInfoKHR *pPushDescriptorSetInfo,
7571 VkPipelineBindPoint bind_point)
7572 {
7573 VK_FROM_HANDLE(radv_pipeline_layout, layout, pPushDescriptorSetInfo->layout);
7574 struct radv_descriptor_state *descriptors_state = radv_get_descriptors_state(cmd_buffer, bind_point);
7575 struct radv_descriptor_set *push_set = (struct radv_descriptor_set *)&descriptors_state->push_set.set;
7576 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
7577
7578 assert(layout->set[pPushDescriptorSetInfo->set].layout->flags &
7579 VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR);
7580
7581 if (!radv_init_push_descriptor_set(cmd_buffer, push_set, layout->set[pPushDescriptorSetInfo->set].layout,
7582 bind_point))
7583 return;
7584
7585 /* Check that there are no inline uniform block updates when calling vkCmdPushDescriptorSetKHR()
7586 * because that is invalid according to the Vulkan spec.
7587 */
7588 for (int i = 0; i < pPushDescriptorSetInfo->descriptorWriteCount; i++) {
7589 ASSERTED const VkWriteDescriptorSet *writeset = &pPushDescriptorSetInfo->pDescriptorWrites[i];
7590 assert(writeset->descriptorType != VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK);
7591 }
7592
7593 radv_cmd_update_descriptor_sets(device, cmd_buffer, radv_descriptor_set_to_handle(push_set),
7594 pPushDescriptorSetInfo->descriptorWriteCount,
7595 pPushDescriptorSetInfo->pDescriptorWrites, 0, NULL);
7596
7597 radv_set_descriptor_set(cmd_buffer, bind_point, push_set, pPushDescriptorSetInfo->set);
7598
7599 radv_flush_push_descriptors(cmd_buffer, descriptors_state);
7600 }
7601
7602 VKAPI_ATTR void VKAPI_CALL
7603 radv_CmdPushDescriptorSet2KHR(VkCommandBuffer commandBuffer, const VkPushDescriptorSetInfoKHR *pPushDescriptorSetInfo)
7604 {
7605 VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
7606
7607 if (pPushDescriptorSetInfo->stageFlags & VK_SHADER_STAGE_COMPUTE_BIT) {
7608 radv_push_descriptor_set(cmd_buffer, pPushDescriptorSetInfo, VK_PIPELINE_BIND_POINT_COMPUTE);
7609 }
7610
7611 if (pPushDescriptorSetInfo->stageFlags & RADV_GRAPHICS_STAGE_BITS) {
7612 radv_push_descriptor_set(cmd_buffer, pPushDescriptorSetInfo, VK_PIPELINE_BIND_POINT_GRAPHICS);
7613 }
7614
7615 if (pPushDescriptorSetInfo->stageFlags & RADV_RT_STAGE_BITS) {
7616 radv_push_descriptor_set(cmd_buffer, pPushDescriptorSetInfo, VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR);
7617 }
7618 }
7619
7620 VKAPI_ATTR void VKAPI_CALL
7621 radv_CmdPushDescriptorSetWithTemplate2KHR(
7622 VkCommandBuffer commandBuffer, const VkPushDescriptorSetWithTemplateInfoKHR *pPushDescriptorSetWithTemplateInfo)
7623 {
7624 VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
7625 VK_FROM_HANDLE(radv_pipeline_layout, layout, pPushDescriptorSetWithTemplateInfo->layout);
7626 VK_FROM_HANDLE(radv_descriptor_update_template, templ, pPushDescriptorSetWithTemplateInfo->descriptorUpdateTemplate);
7627 struct radv_descriptor_state *descriptors_state = radv_get_descriptors_state(cmd_buffer, templ->bind_point);
7628 struct radv_descriptor_set *push_set = (struct radv_descriptor_set *)&descriptors_state->push_set.set;
7629 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
7630
7631 assert(layout->set[pPushDescriptorSetWithTemplateInfo->set].layout->flags &
7632 VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR);
7633
7634 if (!radv_init_push_descriptor_set(cmd_buffer, push_set, layout->set[pPushDescriptorSetWithTemplateInfo->set].layout,
7635 templ->bind_point))
7636 return;
7637
7638 radv_cmd_update_descriptor_set_with_template(device, cmd_buffer, push_set,
7639 pPushDescriptorSetWithTemplateInfo->descriptorUpdateTemplate,
7640 pPushDescriptorSetWithTemplateInfo->pData);
7641
7642 radv_set_descriptor_set(cmd_buffer, templ->bind_point, push_set, pPushDescriptorSetWithTemplateInfo->set);
7643
7644 radv_flush_push_descriptors(cmd_buffer, descriptors_state);
7645 }
7646
7647 VKAPI_ATTR void VKAPI_CALL
7648 radv_CmdPushConstants2KHR(VkCommandBuffer commandBuffer, const VkPushConstantsInfoKHR *pPushConstantsInfo)
7649 {
7650 VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
7651 memcpy(cmd_buffer->push_constants + pPushConstantsInfo->offset, pPushConstantsInfo->pValues,
7652 pPushConstantsInfo->size);
7653 cmd_buffer->push_constant_stages |= pPushConstantsInfo->stageFlags;
7654 }
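/* Note: the values are only staged in the CPU-side push constant buffer here; they
 * are emitted to the GPU later via radv_flush_constants(), e.g. from
 * radv_upload_graphics_shader_descriptors() above, before a draw or dispatch.
 */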
7655
7656 VKAPI_ATTR VkResult VKAPI_CALL
7657 radv_EndCommandBuffer(VkCommandBuffer commandBuffer)
7658 {
7659 VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
7660 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
7661 const struct radv_physical_device *pdev = radv_device_physical(device);
7662
7663 if (cmd_buffer->qf == RADV_QUEUE_SPARSE)
7664 return vk_command_buffer_end(&cmd_buffer->vk);
7665
7666 radv_emit_mip_change_flush_default(cmd_buffer);
7667
7668 const bool is_gfx_or_ace = cmd_buffer->qf == RADV_QUEUE_GENERAL || cmd_buffer->qf == RADV_QUEUE_COMPUTE;
7669
7670 if (is_gfx_or_ace) {
7671 if (pdev->info.gfx_level == GFX6)
7672 cmd_buffer->state.flush_bits |=
7673 RADV_CMD_FLAG_CS_PARTIAL_FLUSH | RADV_CMD_FLAG_PS_PARTIAL_FLUSH | RADV_CMD_FLAG_WB_L2;
7674
7675 /* Make sure to sync all pending active queries at the end of
7676 * the command buffer.
7677 */
7678 cmd_buffer->state.flush_bits |= cmd_buffer->active_query_flush_bits;
7679
7680 /* Flush non-coherent images on GFX9+ so we can assume they're clean at the start of a
7681 * command buffer.
7682 */
7683 if (cmd_buffer->state.rb_noncoherent_dirty && !can_skip_buffer_l2_flushes(device))
7684 cmd_buffer->state.flush_bits |= radv_src_access_flush(
7685 cmd_buffer, VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT,
7686 VK_ACCESS_2_COLOR_ATTACHMENT_WRITE_BIT | VK_ACCESS_2_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT, NULL);
7687
7688 /* Since NGG streamout uses GDS, we need to make GDS idle when
7689 * we leave the IB, otherwise another process might overwrite
7690 * it while our shaders are busy.
7691 */
7692 if (cmd_buffer->gds_needed)
7693 cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_PS_PARTIAL_FLUSH;
7694 }
7695
7696 /* Finalize the internal compute command stream, if it exists. */
7697 if (cmd_buffer->gang.cs) {
7698 VkResult result = radv_gang_finalize(cmd_buffer);
7699 if (result != VK_SUCCESS)
7700 return vk_error(cmd_buffer, result);
7701 }
7702
7703 if (is_gfx_or_ace) {
7704 radv_emit_cache_flush(cmd_buffer);
7705
7706 /* Make sure CP DMA is idle at the end of IBs because the kernel
7707 * doesn't wait for it.
7708 */
7709 radv_cp_dma_wait_for_idle(cmd_buffer);
7710 }
7711
7712 radv_describe_end_cmd_buffer(cmd_buffer);
7713
7714 VkResult result = device->ws->cs_finalize(cmd_buffer->cs);
7715 if (result != VK_SUCCESS)
7716 return vk_error(cmd_buffer, result);
7717
7718 return vk_command_buffer_end(&cmd_buffer->vk);
7719 }
7720
7721 static void
7722 radv_emit_compute_pipeline(struct radv_cmd_buffer *cmd_buffer, struct radv_compute_pipeline *pipeline)
7723 {
7724 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
7725 const struct radv_physical_device *pdev = radv_device_physical(device);
7726
7727 if (pipeline == cmd_buffer->state.emitted_compute_pipeline)
7728 return;
7729
7730 radeon_check_space(device->ws, cmd_buffer->cs, pdev->info.gfx_level >= GFX10 ? 19 : 16);
7731
7732 if (pipeline->base.type == RADV_PIPELINE_COMPUTE) {
7733 radv_emit_compute_shader(pdev, cmd_buffer->cs, cmd_buffer->state.shaders[MESA_SHADER_COMPUTE]);
7734 } else {
7735 radv_emit_compute_shader(pdev, cmd_buffer->cs, cmd_buffer->state.rt_prolog);
7736 }
7737
7738 cmd_buffer->state.emitted_compute_pipeline = pipeline;
7739
7740 if (radv_device_fault_detection_enabled(device))
7741 radv_save_pipeline(cmd_buffer, &pipeline->base);
7742 }
7743
7744 static void
7745 radv_mark_descriptor_sets_dirty(struct radv_cmd_buffer *cmd_buffer, VkPipelineBindPoint bind_point)
7746 {
7747 struct radv_descriptor_state *descriptors_state = radv_get_descriptors_state(cmd_buffer, bind_point);
7748
7749 descriptors_state->dirty |= descriptors_state->valid;
7750 }
7751
7752 static void
7753 radv_bind_vs_input_state(struct radv_cmd_buffer *cmd_buffer, const struct radv_graphics_pipeline *pipeline)
7754 {
7755 const struct radv_shader *vs_shader = radv_get_shader(cmd_buffer->state.shaders, MESA_SHADER_VERTEX);
7756 const struct radv_vertex_input_state *src = &pipeline->vertex_input;
7757
7758 /* Bind the vertex input state from the pipeline when it's static. */
7759 if (!vs_shader || !vs_shader->info.vs.vb_desc_usage_mask || (pipeline->dynamic_states & RADV_DYNAMIC_VERTEX_INPUT))
7760 return;
7761
7762 cmd_buffer->state.vertex_input = *src;
7763
7764 if (!(pipeline->dynamic_states & RADV_DYNAMIC_VERTEX_INPUT_BINDING_STRIDE)) {
7765 for (uint32_t i = 0; i < MAX_VBS; i++)
7766 cmd_buffer->vertex_bindings[i].stride = pipeline->binding_stride[i];
7767 }
7768
7769 /* When the vertex input state is static but the VS has been compiled without it (GPL), the
7770 * driver needs to compile a VS prolog.
7771 */
7772 if (!vs_shader->info.vs.has_prolog)
7773 return;
7774
7775 cmd_buffer->state.vbo_misaligned_mask = 0;
7776 cmd_buffer->state.vbo_unaligned_mask = 0;
7777 cmd_buffer->state.vbo_misaligned_mask_invalid = src->attribute_mask;
7778
7779 cmd_buffer->state.dirty_dynamic |= RADV_DYNAMIC_VERTEX_INPUT;
7780 }
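/* Illustrative sketch (application side, not driver code; 'cmd' is an assumed VkCommandBuffer):
 * the static path above only applies when the pipeline was created without
 * VK_DYNAMIC_STATE_VERTEX_INPUT_EXT. With that dynamic state enabled, the same information reaches
 * the driver through radv_CmdSetVertexInputEXT instead, e.g.:
 *
 *    const VkVertexInputBindingDescription2EXT binding = {
 *       .sType = VK_STRUCTURE_TYPE_VERTEX_INPUT_BINDING_DESCRIPTION_2_EXT,
 *       .binding = 0, .stride = 16, .inputRate = VK_VERTEX_INPUT_RATE_VERTEX, .divisor = 1,
 *    };
 *    const VkVertexInputAttributeDescription2EXT attrib = {
 *       .sType = VK_STRUCTURE_TYPE_VERTEX_INPUT_ATTRIBUTE_DESCRIPTION_2_EXT,
 *       .location = 0, .binding = 0, .format = VK_FORMAT_R32G32B32A32_SFLOAT, .offset = 0,
 *    };
 *    vkCmdSetVertexInputEXT(cmd, 1, &binding, 1, &attrib);
 */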
7781
7782 static void
7783 radv_bind_multisample_state(struct radv_cmd_buffer *cmd_buffer, const struct radv_multisample_state *ms)
7784 {
7785 if (ms->sample_shading_enable) {
7786 cmd_buffer->state.ms.sample_shading_enable = true;
7787 cmd_buffer->state.ms.min_sample_shading = ms->min_sample_shading;
7788 }
7789 }
7790
7791 static void
7792 radv_bind_custom_blend_mode(struct radv_cmd_buffer *cmd_buffer, unsigned custom_blend_mode)
7793 {
7794 /* Re-emit CB_COLOR_CONTROL when the custom blending mode changes. */
7795 if (cmd_buffer->state.custom_blend_mode != custom_blend_mode)
7796 cmd_buffer->state.dirty_dynamic |= RADV_DYNAMIC_LOGIC_OP | RADV_DYNAMIC_LOGIC_OP_ENABLE;
7797
7798 cmd_buffer->state.custom_blend_mode = custom_blend_mode;
7799 }
7800
7801 static void
7802 radv_bind_pre_rast_shader(struct radv_cmd_buffer *cmd_buffer, const struct radv_shader *shader)
7803 {
7804 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
7805 const struct radv_physical_device *pdev = radv_device_physical(device);
7806 bool mesh_shading = shader->info.stage == MESA_SHADER_MESH;
7807 const struct radv_userdata_info *loc;
7808
7809 assert(shader->info.stage == MESA_SHADER_VERTEX || shader->info.stage == MESA_SHADER_TESS_CTRL ||
7810 shader->info.stage == MESA_SHADER_TESS_EVAL || shader->info.stage == MESA_SHADER_GEOMETRY ||
7811 shader->info.stage == MESA_SHADER_MESH);
7812
7813 if (radv_get_user_sgpr_info(shader, AC_UD_NGG_PROVOKING_VTX)->sgpr_idx != -1) {
7814 /* Re-emit the provoking vertex mode state because the SGPR idx can be different. */
7815 cmd_buffer->state.dirty_dynamic |= RADV_DYNAMIC_PROVOKING_VERTEX_MODE;
7816 }
7817
7818 if (radv_get_user_sgpr_info(shader, AC_UD_STREAMOUT_BUFFERS)->sgpr_idx != -1) {
7819 /* Re-emit the streamout buffers because the SGPR idx can be different; with NGG streamout
7820 * they always need to be emitted since a buffer size of 0 is used to disable streamout.
7821 */
7822 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_STREAMOUT_BUFFER;
7823
7824 if (pdev->use_ngg_streamout && pdev->info.gfx_level < GFX12) {
7825 /* GFX11 needs GDS OA for streamout. */
7826 cmd_buffer->gds_oa_needed = true;
7827 }
7828 }
7829
7830 if (radv_get_user_sgpr_info(shader, AC_UD_NUM_VERTS_PER_PRIM)->sgpr_idx != -1) {
7831 /* Re-emit the primitive topology because the SGPR idx can be different. */
7832 cmd_buffer->state.dirty_dynamic |= RADV_DYNAMIC_PRIMITIVE_TOPOLOGY;
7833 }
7834
7835 if (radv_get_user_sgpr_info(shader, AC_UD_SHADER_QUERY_STATE)->sgpr_idx != -1) {
7836 /* Re-emit shader query state when SGPR exists but location potentially changed. */
7837 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_SHADER_QUERY;
7838 }
7839
7840 const bool needs_vtx_sgpr =
7841 shader->info.stage == MESA_SHADER_VERTEX || shader->info.stage == MESA_SHADER_MESH ||
7842 (shader->info.stage == MESA_SHADER_GEOMETRY && !shader->info.merged_shader_compiled_separately) ||
7843 (shader->info.stage == MESA_SHADER_TESS_CTRL && !shader->info.merged_shader_compiled_separately);
7844
7845 loc = radv_get_user_sgpr_info(shader, AC_UD_VS_BASE_VERTEX_START_INSTANCE);
7846 if (needs_vtx_sgpr && loc->sgpr_idx != -1) {
7847 cmd_buffer->state.vtx_base_sgpr = shader->info.user_data_0 + loc->sgpr_idx * 4;
7848 cmd_buffer->state.vtx_emit_num = loc->num_sgprs;
7849 cmd_buffer->state.uses_drawid = shader->info.vs.needs_draw_id;
7850 cmd_buffer->state.uses_baseinstance = shader->info.vs.needs_base_instance;
7851
7852 if (shader->info.merged_shader_compiled_separately) {
7853 /* Merged shaders compiled separately (e.g. VS+TCS) always declare these user SGPRs
7854 * because the input arguments must match.
7855 */
7856 cmd_buffer->state.uses_drawid = true;
7857 cmd_buffer->state.uses_baseinstance = true;
7858 }
7859
7860 /* Re-emit some vertex states because the SGPR idx can be different. */
7861 cmd_buffer->state.last_first_instance = -1;
7862 cmd_buffer->state.last_vertex_offset_valid = false;
7863 cmd_buffer->state.last_drawid = -1;
7864 }
7865
7866 if (mesh_shading != cmd_buffer->state.mesh_shading) {
7867 /* Re-emit VRS state because the combiner is different (vertex vs primitive). Re-emit
7868 * primitive topology because the mesh shading pipeline clobbered it.
7869 */
7870 cmd_buffer->state.dirty_dynamic |= RADV_DYNAMIC_FRAGMENT_SHADING_RATE | RADV_DYNAMIC_PRIMITIVE_TOPOLOGY;
7871 }
7872
7873 cmd_buffer->state.mesh_shading = mesh_shading;
7874 }
7875
7876 static void
7877 radv_bind_vertex_shader(struct radv_cmd_buffer *cmd_buffer, const struct radv_shader *vs)
7878 {
7879 radv_bind_pre_rast_shader(cmd_buffer, vs);
7880
7881 /* Re-emit states that need to be updated when the vertex shader is compiled separately
7882 * because shader configs are combined.
7883 */
7884 if (vs->info.merged_shader_compiled_separately && vs->info.next_stage == MESA_SHADER_TESS_CTRL) {
7885 cmd_buffer->state.dirty_dynamic |= RADV_DYNAMIC_PATCH_CONTROL_POINTS;
7886 }
7887
7888 /* Can't put anything else here due to merged shaders */
7889 }
7890
7891 static void
7892 radv_bind_tess_ctrl_shader(struct radv_cmd_buffer *cmd_buffer, const struct radv_shader *tcs)
7893 {
7894 radv_bind_pre_rast_shader(cmd_buffer, tcs);
7895
7896 cmd_buffer->tess_rings_needed = true;
7897
7898 /* Always re-emit patch control points/domain origin when a new pipeline with tessellation is
7899 * bound because a bunch of parameters (user SGPRs, TCS vertices out, ccw, etc) can be different.
7900 */
7901 cmd_buffer->state.dirty_dynamic |= RADV_DYNAMIC_PATCH_CONTROL_POINTS | RADV_DYNAMIC_TESS_DOMAIN_ORIGIN;
7902
7903 /* Re-emit the VS prolog when the tessellation control shader is compiled separately because
7904 * shader configs are combined and need to be updated.
7905 */
7906 if (tcs->info.merged_shader_compiled_separately)
7907 cmd_buffer->state.emitted_vs_prolog = NULL;
7908 }
7909
7910 static void
7911 radv_bind_tess_eval_shader(struct radv_cmd_buffer *cmd_buffer, const struct radv_shader *tes)
7912 {
7913 radv_bind_pre_rast_shader(cmd_buffer, tes);
7914
7915 /* Can't put anything else here due to merged shaders */
7916 }
7917
7918 static void
7919 radv_bind_geometry_shader(struct radv_cmd_buffer *cmd_buffer, const struct radv_shader *gs)
7920 {
7921 radv_bind_pre_rast_shader(cmd_buffer, gs);
7922
7923 cmd_buffer->esgs_ring_size_needed = MAX2(cmd_buffer->esgs_ring_size_needed, gs->info.gs_ring_info.esgs_ring_size);
7924 cmd_buffer->gsvs_ring_size_needed = MAX2(cmd_buffer->gsvs_ring_size_needed, gs->info.gs_ring_info.gsvs_ring_size);
7925
7926 /* Re-emit the VS prolog when the geometry shader is compiled separately because shader configs
7927 * are combined and need to be updated.
7928 */
7929 if (gs->info.merged_shader_compiled_separately)
7930 cmd_buffer->state.emitted_vs_prolog = NULL;
7931 }
7932
7933 static void
7934 radv_bind_gs_copy_shader(struct radv_cmd_buffer *cmd_buffer, struct radv_shader *gs_copy_shader)
7935 {
7936 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
7937
7938 cmd_buffer->state.gs_copy_shader = gs_copy_shader;
7939
7940 if (gs_copy_shader) {
7941 cmd_buffer->shader_upload_seq = MAX2(cmd_buffer->shader_upload_seq, gs_copy_shader->upload_seq);
7942
7943 radv_cs_add_buffer(device->ws, cmd_buffer->cs, gs_copy_shader->bo);
7944 }
7945 }
7946
7947 static void
7948 radv_bind_mesh_shader(struct radv_cmd_buffer *cmd_buffer, const struct radv_shader *ms)
7949 {
7950 radv_bind_pre_rast_shader(cmd_buffer, ms);
7951
7952 cmd_buffer->mesh_scratch_ring_needed |= ms->info.ms.needs_ms_scratch_ring;
7953 }
7954
7955 static void
7956 radv_bind_fragment_shader(struct radv_cmd_buffer *cmd_buffer, const struct radv_shader *ps)
7957 {
7958 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
7959 const struct radv_physical_device *pdev = radv_device_physical(device);
7960 const enum amd_gfx_level gfx_level = pdev->info.gfx_level;
7961 const struct radv_shader *previous_ps = cmd_buffer->state.shaders[MESA_SHADER_FRAGMENT];
7962 const float min_sample_shading = 1.0f;
7963
7964 if (ps->info.ps.needs_sample_positions) {
7965 cmd_buffer->sample_positions_needed = true;
7966 }
7967
7968 /* Re-emit the FS state because the SGPR idx can be different. */
7969 if (radv_get_user_sgpr_info(ps, AC_UD_PS_STATE)->sgpr_idx != -1) {
7970 cmd_buffer->state.dirty_dynamic |= RADV_DYNAMIC_RASTERIZATION_SAMPLES | RADV_DYNAMIC_LINE_RASTERIZATION_MODE;
7971 }
7972
7973 /* Re-emit the conservative rasterization mode because inner coverage is different. */
7974 if (!previous_ps || previous_ps->info.ps.reads_fully_covered != ps->info.ps.reads_fully_covered)
7975 cmd_buffer->state.dirty_dynamic |= RADV_DYNAMIC_CONSERVATIVE_RAST_MODE;
7976
7977 if (gfx_level >= GFX10_3 && (!previous_ps || previous_ps->info.ps.force_sample_iter_shading_rate !=
7978 ps->info.ps.force_sample_iter_shading_rate))
7979 cmd_buffer->state.dirty_dynamic |= RADV_DYNAMIC_RASTERIZATION_SAMPLES | RADV_DYNAMIC_FRAGMENT_SHADING_RATE;
7980
7981 if (cmd_buffer->state.ms.sample_shading_enable != ps->info.ps.uses_sample_shading) {
7982 cmd_buffer->state.ms.sample_shading_enable = ps->info.ps.uses_sample_shading;
7983 cmd_buffer->state.dirty_dynamic |= RADV_DYNAMIC_RASTERIZATION_SAMPLES;
7984
7985 if (gfx_level >= GFX10_3)
7986 cmd_buffer->state.dirty_dynamic |= RADV_DYNAMIC_FRAGMENT_SHADING_RATE;
7987 }
7988
7989 if (cmd_buffer->state.ms.min_sample_shading != min_sample_shading) {
7990 cmd_buffer->state.ms.min_sample_shading = min_sample_shading;
7991 cmd_buffer->state.dirty_dynamic |= RADV_DYNAMIC_RASTERIZATION_SAMPLES;
7992 }
7993
7994 if (!previous_ps || previous_ps->info.regs.ps.db_shader_control != ps->info.regs.ps.db_shader_control ||
7995 previous_ps->info.ps.pops_is_per_sample != ps->info.ps.pops_is_per_sample)
7996 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DB_SHADER_CONTROL;
7997
7998 if (!previous_ps || cmd_buffer->state.uses_fbfetch_output != ps->info.ps.uses_fbfetch_output) {
7999 cmd_buffer->state.uses_fbfetch_output = ps->info.ps.uses_fbfetch_output;
8000 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_FBFETCH_OUTPUT;
8001 }
8002
8003 /* Re-emit the PS epilog when a new fragment shader is bound. */
8004 if (ps->info.ps.has_epilog)
8005 cmd_buffer->state.emitted_ps_epilog = NULL;
8006 }
8007
8008 static void
8009 radv_bind_task_shader(struct radv_cmd_buffer *cmd_buffer, const struct radv_shader *ts)
8010 {
8011 if (!radv_gang_init(cmd_buffer))
8012 return;
8013
8014 cmd_buffer->task_rings_needed = true;
8015 }
8016
8017 static void
8018 radv_bind_rt_prolog(struct radv_cmd_buffer *cmd_buffer, struct radv_shader *rt_prolog)
8019 {
8020 cmd_buffer->state.rt_prolog = rt_prolog;
8021
8022 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
8023 const unsigned max_scratch_waves = radv_get_max_scratch_waves(device, rt_prolog);
8024 cmd_buffer->compute_scratch_waves_wanted = MAX2(cmd_buffer->compute_scratch_waves_wanted, max_scratch_waves);
8025
8026 cmd_buffer->shader_upload_seq = MAX2(cmd_buffer->shader_upload_seq, rt_prolog->upload_seq);
8027
8028 radv_cs_add_buffer(device->ws, cmd_buffer->cs, rt_prolog->bo);
8029 }
8030
8031 /* This function binds/unbinds a shader to the cmdbuffer state. */
8032 static void
8033 radv_bind_shader(struct radv_cmd_buffer *cmd_buffer, struct radv_shader *shader, gl_shader_stage stage)
8034 {
8035 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
8036
8037 if (!shader) {
8038 cmd_buffer->state.shaders[stage] = NULL;
8039 cmd_buffer->state.active_stages &= ~mesa_to_vk_shader_stage(stage);
8040
8041 /* Reset some dynamic states when a shader stage is unbound. */
8042 switch (stage) {
8043 case MESA_SHADER_FRAGMENT:
8044 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DB_SHADER_CONTROL;
8045 cmd_buffer->state.dirty_dynamic |= RADV_DYNAMIC_CONSERVATIVE_RAST_MODE | RADV_DYNAMIC_RASTERIZATION_SAMPLES |
8046 RADV_DYNAMIC_FRAGMENT_SHADING_RATE;
8047 break;
8048 default:
8049 break;
8050 }
8051 return;
8052 }
8053
8054 switch (stage) {
8055 case MESA_SHADER_VERTEX:
8056 radv_bind_vertex_shader(cmd_buffer, shader);
8057 break;
8058 case MESA_SHADER_TESS_CTRL:
8059 radv_bind_tess_ctrl_shader(cmd_buffer, shader);
8060 break;
8061 case MESA_SHADER_TESS_EVAL:
8062 radv_bind_tess_eval_shader(cmd_buffer, shader);
8063 break;
8064 case MESA_SHADER_GEOMETRY:
8065 radv_bind_geometry_shader(cmd_buffer, shader);
8066 break;
8067 case MESA_SHADER_FRAGMENT:
8068 radv_bind_fragment_shader(cmd_buffer, shader);
8069 break;
8070 case MESA_SHADER_MESH:
8071 radv_bind_mesh_shader(cmd_buffer, shader);
8072 break;
8073 case MESA_SHADER_TASK:
8074 radv_bind_task_shader(cmd_buffer, shader);
8075 break;
8076 case MESA_SHADER_COMPUTE: {
8077 cmd_buffer->compute_scratch_size_per_wave_needed =
8078 MAX2(cmd_buffer->compute_scratch_size_per_wave_needed, shader->config.scratch_bytes_per_wave);
8079
8080 const unsigned max_stage_waves = radv_get_max_scratch_waves(device, shader);
8081 cmd_buffer->compute_scratch_waves_wanted = MAX2(cmd_buffer->compute_scratch_waves_wanted, max_stage_waves);
8082 break;
8083 }
8084 case MESA_SHADER_INTERSECTION:
8085 /* no-op */
8086 break;
8087 default:
8088 unreachable("invalid shader stage");
8089 }
8090
8091 cmd_buffer->state.shaders[stage] = shader;
8092 cmd_buffer->state.active_stages |= mesa_to_vk_shader_stage(stage);
8093
8094 if (mesa_to_vk_shader_stage(stage) & RADV_GRAPHICS_STAGE_BITS) {
8095 cmd_buffer->scratch_size_per_wave_needed =
8096 MAX2(cmd_buffer->scratch_size_per_wave_needed, shader->config.scratch_bytes_per_wave);
8097
8098 const unsigned max_stage_waves = radv_get_max_scratch_waves(device, shader);
8099 cmd_buffer->scratch_waves_wanted = MAX2(cmd_buffer->scratch_waves_wanted, max_stage_waves);
8100 }
8101
8102 cmd_buffer->shader_upload_seq = MAX2(cmd_buffer->shader_upload_seq, shader->upload_seq);
8103
8104 radv_cs_add_buffer(device->ws, cmd_buffer->cs, shader->bo);
8105 }
8106
8107 static void
8108 radv_reset_shader_object_state(struct radv_cmd_buffer *cmd_buffer, VkPipelineBindPoint pipelineBindPoint)
8109 {
8110 switch (pipelineBindPoint) {
8111 case VK_PIPELINE_BIND_POINT_COMPUTE:
8112 if (cmd_buffer->state.shader_objs[MESA_SHADER_COMPUTE]) {
8113 radv_bind_shader(cmd_buffer, NULL, MESA_SHADER_COMPUTE);
8114 cmd_buffer->state.shader_objs[MESA_SHADER_COMPUTE] = NULL;
8115 }
8116 break;
8117 case VK_PIPELINE_BIND_POINT_GRAPHICS:
8118 radv_foreach_stage(s, RADV_GRAPHICS_STAGE_BITS)
8119 {
8120 if (cmd_buffer->state.shader_objs[s]) {
8121 radv_bind_shader(cmd_buffer, NULL, s);
8122 cmd_buffer->state.shader_objs[s] = NULL;
8123 }
8124 }
8125 break;
8126 default:
8127 break;
8128 }
8129
8130 cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_GRAPHICS_SHADERS;
8131 }
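/* Illustrative sketch (application side, not driver code; all handles are assumed to exist):
 * binding a pipeline overrides shader objects previously bound with vkCmdBindShadersEXT for the
 * stages it contains, which is why the helper above clears the shader-object state first, e.g.:
 *
 *    const VkShaderStageFlagBits stage = VK_SHADER_STAGE_COMPUTE_BIT;
 *    vkCmdBindShadersEXT(cmd, 1, &stage, &cs_shader_object);               // shader object path
 *    vkCmdBindPipeline(cmd, VK_PIPELINE_BIND_POINT_COMPUTE, cs_pipeline);  // overrides the object
 */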
8132
8133 VKAPI_ATTR void VKAPI_CALL
8134 radv_CmdBindPipeline(VkCommandBuffer commandBuffer, VkPipelineBindPoint pipelineBindPoint, VkPipeline _pipeline)
8135 {
8136 VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
8137 VK_FROM_HANDLE(radv_pipeline, pipeline, _pipeline);
8138 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
8139 const struct radv_physical_device *pdev = radv_device_physical(device);
8140
8141 radv_reset_shader_object_state(cmd_buffer, pipelineBindPoint);
8142
8143 switch (pipelineBindPoint) {
8144 case VK_PIPELINE_BIND_POINT_COMPUTE: {
8145 struct radv_compute_pipeline *compute_pipeline = radv_pipeline_to_compute(pipeline);
8146
8147 if (cmd_buffer->state.compute_pipeline == compute_pipeline)
8148 return;
8149 radv_mark_descriptor_sets_dirty(cmd_buffer, pipelineBindPoint);
8150
8151 radv_bind_shader(cmd_buffer, compute_pipeline->base.shaders[MESA_SHADER_COMPUTE], MESA_SHADER_COMPUTE);
8152
8153 cmd_buffer->state.compute_pipeline = compute_pipeline;
8154 cmd_buffer->push_constant_stages |= VK_SHADER_STAGE_COMPUTE_BIT;
8155 break;
8156 }
8157 case VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR: {
8158 struct radv_ray_tracing_pipeline *rt_pipeline = radv_pipeline_to_ray_tracing(pipeline);
8159
8160 if (cmd_buffer->state.rt_pipeline == rt_pipeline)
8161 return;
8162 radv_mark_descriptor_sets_dirty(cmd_buffer, pipelineBindPoint);
8163
8164 radv_bind_shader(cmd_buffer, rt_pipeline->base.base.shaders[MESA_SHADER_INTERSECTION], MESA_SHADER_INTERSECTION);
8165 radv_bind_rt_prolog(cmd_buffer, rt_pipeline->prolog);
8166
8167 for (unsigned i = 0; i < rt_pipeline->stage_count; ++i) {
8168 struct radv_shader *shader = rt_pipeline->stages[i].shader;
8169 if (shader)
8170 radv_cs_add_buffer(device->ws, cmd_buffer->cs, shader->bo);
8171 }
8172
8173 cmd_buffer->state.rt_pipeline = rt_pipeline;
8174 cmd_buffer->push_constant_stages |= RADV_RT_STAGE_BITS;
8175
8176 /* Bind the stack size when it's not dynamic. */
8177 if (rt_pipeline->stack_size != -1u)
8178 cmd_buffer->state.rt_stack_size = rt_pipeline->stack_size;
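/* Illustrative sketch (application side, not driver code; 'cmd' and the size are assumptions):
 * when the pipeline was created with VK_DYNAMIC_STATE_RAY_TRACING_PIPELINE_STACK_SIZE_KHR,
 * stack_size is -1u here and the value is expected to arrive later via
 *
 *    vkCmdSetRayTracingPipelineStackSizeKHR(cmd, 8192);
 */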
8179
8180 break;
8181 }
8182 case VK_PIPELINE_BIND_POINT_GRAPHICS: {
8183 struct radv_graphics_pipeline *graphics_pipeline = radv_pipeline_to_graphics(pipeline);
8184
8185 /* Bind the non-dynamic graphics state from the pipeline unconditionally because some PSO
8186 * state might have been overwritten between two binds of the same pipeline.
8187 */
8188 radv_bind_dynamic_state(cmd_buffer, &graphics_pipeline->dynamic_state);
8189
8190 if (cmd_buffer->state.graphics_pipeline == graphics_pipeline)
8191 return;
8192 radv_mark_descriptor_sets_dirty(cmd_buffer, pipelineBindPoint);
8193
8194 radv_foreach_stage(
8195 stage, (cmd_buffer->state.active_stages | graphics_pipeline->active_stages) & RADV_GRAPHICS_STAGE_BITS)
8196 {
8197 radv_bind_shader(cmd_buffer, graphics_pipeline->base.shaders[stage], stage);
8198 }
8199
8200 radv_bind_gs_copy_shader(cmd_buffer, graphics_pipeline->base.gs_copy_shader);
8201
8202 cmd_buffer->state.last_vgt_shader = graphics_pipeline->base.shaders[graphics_pipeline->last_vgt_api_stage];
8203
8204 cmd_buffer->state.graphics_pipeline = graphics_pipeline;
8205
8206 cmd_buffer->state.has_nggc = graphics_pipeline->has_ngg_culling;
8207 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_PIPELINE;
8208 cmd_buffer->push_constant_stages |= graphics_pipeline->active_stages;
8209
8210 /* Prefetch all pipeline shaders at first draw time. */
8211 cmd_buffer->state.prefetch_L2_mask |= RADV_PREFETCH_SHADERS;
8212
8213 if (pdev->info.has_vgt_flush_ngg_legacy_bug &&
8214 (!cmd_buffer->state.emitted_graphics_pipeline ||
8215 (cmd_buffer->state.emitted_graphics_pipeline->is_ngg && !cmd_buffer->state.graphics_pipeline->is_ngg))) {
8216 /* Transitioning from NGG to legacy GS requires
8217 * VGT_FLUSH on GFX10 and Navi21. VGT_FLUSH
8218 * is also emitted at the beginning of IBs when legacy
8219 * GS ring pointers are set.
8220 */
8221 cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_VGT_FLUSH;
8222 }
8223
8224 cmd_buffer->state.uses_dynamic_patch_control_points =
8225 !!(graphics_pipeline->dynamic_states & RADV_DYNAMIC_PATCH_CONTROL_POINTS);
8226
8227 if (graphics_pipeline->active_stages & VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT) {
8228 if (!cmd_buffer->state.uses_dynamic_patch_control_points) {
8229 /* Bind the tessellation state from the pipeline when it's not dynamic. */
8230 struct radv_shader *tcs = cmd_buffer->state.shaders[MESA_SHADER_TESS_CTRL];
8231
8232 cmd_buffer->state.tess_num_patches = tcs->info.num_tess_patches;
8233 cmd_buffer->state.tess_lds_size = tcs->info.tcs.num_lds_blocks;
8234 }
8235 }
8236
8237 const struct radv_shader *vs = radv_get_shader(graphics_pipeline->base.shaders, MESA_SHADER_VERTEX);
8238 if (vs) {
8239 /* Re-emit the VS prolog when a new vertex shader is bound. */
8240 if (vs->info.vs.has_prolog) {
8241 cmd_buffer->state.emitted_vs_prolog = NULL;
8242 cmd_buffer->state.dirty_dynamic |= RADV_DYNAMIC_VERTEX_INPUT;
8243 }
8244
8245 /* Re-emit the vertex buffer descriptors because they are really tied to the pipeline. */
8246 if (vs->info.vs.vb_desc_usage_mask) {
8247 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_VERTEX_BUFFER;
8248 }
8249 }
8250
8251 if (!cmd_buffer->state.emitted_graphics_pipeline ||
8252 cmd_buffer->state.spi_shader_col_format != graphics_pipeline->spi_shader_col_format) {
8253 cmd_buffer->state.spi_shader_col_format = graphics_pipeline->spi_shader_col_format;
8254 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_COLOR_OUTPUT;
8255 if (pdev->info.rbplus_allowed)
8256 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_RBPLUS;
8257 }
8258
8259 if (!cmd_buffer->state.emitted_graphics_pipeline ||
8260 cmd_buffer->state.cb_shader_mask != graphics_pipeline->cb_shader_mask) {
8261 cmd_buffer->state.cb_shader_mask = graphics_pipeline->cb_shader_mask;
8262 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_COLOR_OUTPUT;
8263 }
8264
8265 radv_bind_vs_input_state(cmd_buffer, graphics_pipeline);
8266
8267 radv_bind_multisample_state(cmd_buffer, &graphics_pipeline->ms);
8268
8269 radv_bind_custom_blend_mode(cmd_buffer, graphics_pipeline->custom_blend_mode);
8270
8271 cmd_buffer->state.db_render_control = graphics_pipeline->db_render_control;
8272
8273 cmd_buffer->state.rast_prim = graphics_pipeline->rast_prim;
8274
8275 cmd_buffer->state.ia_multi_vgt_param = graphics_pipeline->ia_multi_vgt_param;
8276
8277 cmd_buffer->state.uses_out_of_order_rast = graphics_pipeline->uses_out_of_order_rast;
8278 cmd_buffer->state.uses_vrs = graphics_pipeline->uses_vrs;
8279 cmd_buffer->state.uses_vrs_attachment = graphics_pipeline->uses_vrs_attachment;
8280 cmd_buffer->state.uses_vrs_coarse_shading = graphics_pipeline->uses_vrs_coarse_shading;
8281 break;
8282 }
8283 default:
8284 assert(!"invalid bind point");
8285 break;
8286 }
8287
8288 cmd_buffer->push_constant_state[vk_to_bind_point(pipelineBindPoint)].size = pipeline->push_constant_size;
8289 cmd_buffer->push_constant_state[vk_to_bind_point(pipelineBindPoint)].dynamic_offset_count =
8290 pipeline->dynamic_offset_count;
8291 cmd_buffer->descriptors[vk_to_bind_point(pipelineBindPoint)].need_indirect_descriptor_sets =
8292 pipeline->need_indirect_descriptor_sets;
8293 }
8294
8295 VKAPI_ATTR void VKAPI_CALL
8296 radv_CmdSetViewport(VkCommandBuffer commandBuffer, uint32_t firstViewport, uint32_t viewportCount,
8297 const VkViewport *pViewports)
8298 {
8299 VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
8300 struct radv_cmd_state *state = &cmd_buffer->state;
8301 ASSERTED const uint32_t total_count = firstViewport + viewportCount;
8302
8303 assert(firstViewport < MAX_VIEWPORTS);
8304 assert(total_count >= 1 && total_count <= MAX_VIEWPORTS);
8305
8306 if (state->dynamic.vk.vp.viewport_count < total_count)
8307 state->dynamic.vk.vp.viewport_count = total_count;
8308
8309 memcpy(state->dynamic.vk.vp.viewports + firstViewport, pViewports, viewportCount * sizeof(*pViewports));
8310 for (unsigned i = 0; i < viewportCount; i++) {
8311 radv_get_viewport_xform(&pViewports[i], state->dynamic.hw_vp.xform[i + firstViewport].scale,
8312 state->dynamic.hw_vp.xform[i + firstViewport].translate);
8313 }
8314
8315 state->dirty_dynamic |= RADV_DYNAMIC_VIEWPORT;
8316 state->dirty |= RADV_CMD_DIRTY_GUARDBAND;
8317 }
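/* Illustrative sketch (application side, not driver code; 'cmd' is an assumed VkCommandBuffer):
 *
 *    const VkViewport vp = { .x = 0.0f, .y = 0.0f, .width = 1920.0f, .height = 1080.0f,
 *                            .minDepth = 0.0f, .maxDepth = 1.0f };
 *    vkCmdSetViewport(cmd, 0, 1, &vp);
 *
 * Any viewport change also invalidates the guardband, hence RADV_CMD_DIRTY_GUARDBAND above.
 */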
8318
8319 VKAPI_ATTR void VKAPI_CALL
8320 radv_CmdSetScissor(VkCommandBuffer commandBuffer, uint32_t firstScissor, uint32_t scissorCount,
8321 const VkRect2D *pScissors)
8322 {
8323 VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
8324 struct radv_cmd_state *state = &cmd_buffer->state;
8325 ASSERTED const uint32_t total_count = firstScissor + scissorCount;
8326
8327 assert(firstScissor < MAX_SCISSORS);
8328 assert(total_count >= 1 && total_count <= MAX_SCISSORS);
8329
8330 if (state->dynamic.vk.vp.scissor_count < total_count)
8331 state->dynamic.vk.vp.scissor_count = total_count;
8332
8333 memcpy(state->dynamic.vk.vp.scissors + firstScissor, pScissors, scissorCount * sizeof(*pScissors));
8334
8335 state->dirty_dynamic |= RADV_DYNAMIC_SCISSOR;
8336 }
8337
8338 VKAPI_ATTR void VKAPI_CALL
8339 radv_CmdSetLineWidth(VkCommandBuffer commandBuffer, float lineWidth)
8340 {
8341 VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
8342 struct radv_cmd_state *state = &cmd_buffer->state;
8343
8344 state->dynamic.vk.rs.line.width = lineWidth;
8345
8346 state->dirty_dynamic |= RADV_DYNAMIC_LINE_WIDTH;
8347 state->dirty |= RADV_CMD_DIRTY_GUARDBAND;
8348 }
8349
8350 VKAPI_ATTR void VKAPI_CALL
8351 radv_CmdSetBlendConstants(VkCommandBuffer commandBuffer, const float blendConstants[4])
8352 {
8353 VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
8354 struct radv_cmd_state *state = &cmd_buffer->state;
8355
8356 memcpy(state->dynamic.vk.cb.blend_constants, blendConstants, sizeof(float) * 4);
8357
8358 state->dirty_dynamic |= RADV_DYNAMIC_BLEND_CONSTANTS;
8359 }
8360
8361 VKAPI_ATTR void VKAPI_CALL
8362 radv_CmdSetDepthBounds(VkCommandBuffer commandBuffer, float minDepthBounds, float maxDepthBounds)
8363 {
8364 VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
8365 struct radv_cmd_state *state = &cmd_buffer->state;
8366
8367 state->dynamic.vk.ds.depth.bounds_test.min = minDepthBounds;
8368 state->dynamic.vk.ds.depth.bounds_test.max = maxDepthBounds;
8369
8370 state->dirty_dynamic |= RADV_DYNAMIC_DEPTH_BOUNDS;
8371 }
8372
8373 VKAPI_ATTR void VKAPI_CALL
8374 radv_CmdSetStencilCompareMask(VkCommandBuffer commandBuffer, VkStencilFaceFlags faceMask, uint32_t compareMask)
8375 {
8376 VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
8377 struct radv_cmd_state *state = &cmd_buffer->state;
8378
8379 if (faceMask & VK_STENCIL_FACE_FRONT_BIT)
8380 state->dynamic.vk.ds.stencil.front.compare_mask = compareMask;
8381 if (faceMask & VK_STENCIL_FACE_BACK_BIT)
8382 state->dynamic.vk.ds.stencil.back.compare_mask = compareMask;
8383
8384 state->dirty_dynamic |= RADV_DYNAMIC_STENCIL_COMPARE_MASK;
8385 }
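/* Illustrative sketch (application side, not driver code; 'cmd' is an assumed VkCommandBuffer):
 * passing VK_STENCIL_FACE_FRONT_AND_BACK updates both faces in one call, e.g.:
 *
 *    vkCmdSetStencilCompareMask(cmd, VK_STENCIL_FACE_FRONT_AND_BACK, 0xff);
 */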
8386
8387 VKAPI_ATTR void VKAPI_CALL
8388 radv_CmdSetStencilWriteMask(VkCommandBuffer commandBuffer, VkStencilFaceFlags faceMask, uint32_t writeMask)
8389 {
8390 VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
8391 struct radv_cmd_state *state = &cmd_buffer->state;
8392
8393 if (faceMask & VK_STENCIL_FACE_FRONT_BIT)
8394 state->dynamic.vk.ds.stencil.front.write_mask = writeMask;
8395 if (faceMask & VK_STENCIL_FACE_BACK_BIT)
8396 state->dynamic.vk.ds.stencil.back.write_mask = writeMask;
8397
8398 state->dirty_dynamic |= RADV_DYNAMIC_STENCIL_WRITE_MASK;
8399 }
8400
8401 VKAPI_ATTR void VKAPI_CALL
8402 radv_CmdSetStencilReference(VkCommandBuffer commandBuffer, VkStencilFaceFlags faceMask, uint32_t reference)
8403 {
8404 VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
8405 struct radv_cmd_state *state = &cmd_buffer->state;
8406
8407 if (faceMask & VK_STENCIL_FACE_FRONT_BIT)
8408 state->dynamic.vk.ds.stencil.front.reference = reference;
8409 if (faceMask & VK_STENCIL_FACE_BACK_BIT)
8410 state->dynamic.vk.ds.stencil.back.reference = reference;
8411
8412 state->dirty_dynamic |= RADV_DYNAMIC_STENCIL_REFERENCE;
8413 }
8414
8415 VKAPI_ATTR void VKAPI_CALL
8416 radv_CmdSetDiscardRectangleEXT(VkCommandBuffer commandBuffer, uint32_t firstDiscardRectangle,
8417 uint32_t discardRectangleCount, const VkRect2D *pDiscardRectangles)
8418 {
8419 VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
8420 struct radv_cmd_state *state = &cmd_buffer->state;
8421 ASSERTED const uint32_t total_count = firstDiscardRectangle + discardRectangleCount;
8422
8423 assert(firstDiscardRectangle < MAX_DISCARD_RECTANGLES);
8424 assert(total_count >= 1 && total_count <= MAX_DISCARD_RECTANGLES);
8425
8426 typed_memcpy(&state->dynamic.vk.dr.rectangles[firstDiscardRectangle], pDiscardRectangles, discardRectangleCount);
8427
8428 state->dirty_dynamic |= RADV_DYNAMIC_DISCARD_RECTANGLE;
8429 }
8430
8431 VKAPI_ATTR void VKAPI_CALL
8432 radv_CmdSetSampleLocationsEXT(VkCommandBuffer commandBuffer, const VkSampleLocationsInfoEXT *pSampleLocationsInfo)
8433 {
8434 VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
8435 struct radv_cmd_state *state = &cmd_buffer->state;
8436
8437 assert(pSampleLocationsInfo->sampleLocationsCount <= MAX_SAMPLE_LOCATIONS);
8438
8439 state->dynamic.sample_location.per_pixel = pSampleLocationsInfo->sampleLocationsPerPixel;
8440 state->dynamic.sample_location.grid_size = pSampleLocationsInfo->sampleLocationGridSize;
8441 state->dynamic.sample_location.count = pSampleLocationsInfo->sampleLocationsCount;
8442 typed_memcpy(&state->dynamic.sample_location.locations[0], pSampleLocationsInfo->pSampleLocations,
8443 pSampleLocationsInfo->sampleLocationsCount);
8444
8445 state->dirty_dynamic |= RADV_DYNAMIC_SAMPLE_LOCATIONS;
8446 }
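/* Illustrative sketch (application side, not driver code; 'cmd' is an assumed VkCommandBuffer):
 * custom sample positions for 2x MSAA could be supplied as, e.g.:
 *
 *    const VkSampleLocationEXT locs[2] = { { 0.25f, 0.25f }, { 0.75f, 0.75f } };
 *    const VkSampleLocationsInfoEXT info = {
 *       .sType = VK_STRUCTURE_TYPE_SAMPLE_LOCATIONS_INFO_EXT,
 *       .sampleLocationsPerPixel = VK_SAMPLE_COUNT_2_BIT,
 *       .sampleLocationGridSize = { 1, 1 },
 *       .sampleLocationsCount = 2,
 *       .pSampleLocations = locs,
 *    };
 *    vkCmdSetSampleLocationsEXT(cmd, &info);
 */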
8447
8448 VKAPI_ATTR void VKAPI_CALL
8449 radv_CmdSetLineStippleKHR(VkCommandBuffer commandBuffer, uint32_t lineStippleFactor, uint16_t lineStipplePattern)
8450 {
8451 VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
8452 struct radv_cmd_state *state = &cmd_buffer->state;
8453
8454 state->dynamic.vk.rs.line.stipple.factor = lineStippleFactor;
8455 state->dynamic.vk.rs.line.stipple.pattern = lineStipplePattern;
8456
8457 state->dirty_dynamic |= RADV_DYNAMIC_LINE_STIPPLE;
8458 }
8459
8460 VKAPI_ATTR void VKAPI_CALL
8461 radv_CmdSetCullMode(VkCommandBuffer commandBuffer, VkCullModeFlags cullMode)
8462 {
8463 VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
8464 struct radv_cmd_state *state = &cmd_buffer->state;
8465
8466 state->dynamic.vk.rs.cull_mode = cullMode;
8467
8468 state->dirty_dynamic |= RADV_DYNAMIC_CULL_MODE;
8469 }
8470
8471 VKAPI_ATTR void VKAPI_CALL
8472 radv_CmdSetFrontFace(VkCommandBuffer commandBuffer, VkFrontFace frontFace)
8473 {
8474 VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
8475 struct radv_cmd_state *state = &cmd_buffer->state;
8476
8477 state->dynamic.vk.rs.front_face = frontFace;
8478
8479 state->dirty_dynamic |= RADV_DYNAMIC_FRONT_FACE;
8480 }
8481
8482 VKAPI_ATTR void VKAPI_CALL
8483 radv_CmdSetPrimitiveTopology(VkCommandBuffer commandBuffer, VkPrimitiveTopology primitiveTopology)
8484 {
8485 VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
8486 struct radv_cmd_state *state = &cmd_buffer->state;
8487 unsigned primitive_topology = radv_translate_prim(primitiveTopology);
8488
8489 if (radv_primitive_topology_is_line_list(state->dynamic.vk.ia.primitive_topology) !=
8490 radv_primitive_topology_is_line_list(primitive_topology))
8491 state->dirty_dynamic |= RADV_DYNAMIC_LINE_STIPPLE;
8492
8493 if (radv_prim_is_points_or_lines(state->dynamic.vk.ia.primitive_topology) !=
8494 radv_prim_is_points_or_lines(primitive_topology))
8495 state->dirty |= RADV_CMD_DIRTY_GUARDBAND;
8496
8497 state->dynamic.vk.ia.primitive_topology = primitive_topology;
8498
8499 state->dirty_dynamic |= RADV_DYNAMIC_PRIMITIVE_TOPOLOGY;
8500 }
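/* Illustrative sketch (application side, not driver code; 'cmd' is an assumed VkCommandBuffer):
 * the line-stipple and guardband re-derivation above only happens when the topology class changes,
 * e.g. switching from triangles to lines:
 *
 *    vkCmdSetPrimitiveTopology(cmd, VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST);
 *    vkCmdSetPrimitiveTopology(cmd, VK_PRIMITIVE_TOPOLOGY_LINE_LIST);
 */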
8501
8502 VKAPI_ATTR void VKAPI_CALL
8503 radv_CmdSetViewportWithCount(VkCommandBuffer commandBuffer, uint32_t viewportCount, const VkViewport *pViewports)
8504 {
8505 radv_CmdSetViewport(commandBuffer, 0, viewportCount, pViewports);
8506 }
8507
8508 VKAPI_ATTR void VKAPI_CALL
8509 radv_CmdSetScissorWithCount(VkCommandBuffer commandBuffer, uint32_t scissorCount, const VkRect2D *pScissors)
8510 {
8511 radv_CmdSetScissor(commandBuffer, 0, scissorCount, pScissors);
8512 }
8513
8514 VKAPI_ATTR void VKAPI_CALL
8515 radv_CmdSetDepthTestEnable(VkCommandBuffer commandBuffer, VkBool32 depthTestEnable)
8516
8517 {
8518 VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
8519 struct radv_cmd_state *state = &cmd_buffer->state;
8520
8521 state->dynamic.vk.ds.depth.test_enable = depthTestEnable;
8522
8523 state->dirty_dynamic |= RADV_DYNAMIC_DEPTH_TEST_ENABLE;
8524 }
8525
8526 VKAPI_ATTR void VKAPI_CALL
8527 radv_CmdSetDepthWriteEnable(VkCommandBuffer commandBuffer, VkBool32 depthWriteEnable)
8528 {
8529 VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
8530 struct radv_cmd_state *state = &cmd_buffer->state;
8531
8532 state->dynamic.vk.ds.depth.write_enable = depthWriteEnable;
8533
8534 state->dirty_dynamic |= RADV_DYNAMIC_DEPTH_WRITE_ENABLE;
8535 }
8536
8537 VKAPI_ATTR void VKAPI_CALL
8538 radv_CmdSetDepthCompareOp(VkCommandBuffer commandBuffer, VkCompareOp depthCompareOp)
8539 {
8540 VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
8541 struct radv_cmd_state *state = &cmd_buffer->state;
8542
8543 state->dynamic.vk.ds.depth.compare_op = depthCompareOp;
8544
8545 state->dirty_dynamic |= RADV_DYNAMIC_DEPTH_COMPARE_OP;
8546 }
8547
8548 VKAPI_ATTR void VKAPI_CALL
8549 radv_CmdSetDepthBoundsTestEnable(VkCommandBuffer commandBuffer, VkBool32 depthBoundsTestEnable)
8550 {
8551 VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
8552 struct radv_cmd_state *state = &cmd_buffer->state;
8553
8554 state->dynamic.vk.ds.depth.bounds_test.enable = depthBoundsTestEnable;
8555
8556 state->dirty_dynamic |= RADV_DYNAMIC_DEPTH_BOUNDS_TEST_ENABLE;
8557 }
8558
8559 VKAPI_ATTR void VKAPI_CALL
8560 radv_CmdSetStencilTestEnable(VkCommandBuffer commandBuffer, VkBool32 stencilTestEnable)
8561 {
8562 VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
8563 struct radv_cmd_state *state = &cmd_buffer->state;
8564
8565 state->dynamic.vk.ds.stencil.test_enable = stencilTestEnable;
8566
8567 state->dirty_dynamic |= RADV_DYNAMIC_STENCIL_TEST_ENABLE;
8568 }
8569
8570 VKAPI_ATTR void VKAPI_CALL
8571 radv_CmdSetStencilOp(VkCommandBuffer commandBuffer, VkStencilFaceFlags faceMask, VkStencilOp failOp, VkStencilOp passOp,
8572 VkStencilOp depthFailOp, VkCompareOp compareOp)
8573 {
8574 VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
8575 struct radv_cmd_state *state = &cmd_buffer->state;
8576
8577 if (faceMask & VK_STENCIL_FACE_FRONT_BIT) {
8578 state->dynamic.vk.ds.stencil.front.op.fail = failOp;
8579 state->dynamic.vk.ds.stencil.front.op.pass = passOp;
8580 state->dynamic.vk.ds.stencil.front.op.depth_fail = depthFailOp;
8581 state->dynamic.vk.ds.stencil.front.op.compare = compareOp;
8582 }
8583
8584 if (faceMask & VK_STENCIL_FACE_BACK_BIT) {
8585 state->dynamic.vk.ds.stencil.back.op.fail = failOp;
8586 state->dynamic.vk.ds.stencil.back.op.pass = passOp;
8587 state->dynamic.vk.ds.stencil.back.op.depth_fail = depthFailOp;
8588 state->dynamic.vk.ds.stencil.back.op.compare = compareOp;
8589 }
8590
8591 state->dirty_dynamic |= RADV_DYNAMIC_STENCIL_OP;
8592 }
8593
8594 VKAPI_ATTR void VKAPI_CALL
8595 radv_CmdSetFragmentShadingRateKHR(VkCommandBuffer commandBuffer, const VkExtent2D *pFragmentSize,
8596 const VkFragmentShadingRateCombinerOpKHR combinerOps[2])
8597 {
8598 VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
8599 struct radv_cmd_state *state = &cmd_buffer->state;
8600
8601 state->dynamic.vk.fsr.fragment_size = *pFragmentSize;
8602 for (unsigned i = 0; i < 2; i++)
8603 state->dynamic.vk.fsr.combiner_ops[i] = combinerOps[i];
8604
8605 state->dirty_dynamic |= RADV_DYNAMIC_FRAGMENT_SHADING_RATE;
8606 }
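/* Illustrative sketch (application side, not driver code; 'cmd' is an assumed VkCommandBuffer):
 * requesting 2x2 coarse shading with both combiners left as KEEP could look like:
 *
 *    const VkExtent2D size = { 2, 2 };
 *    const VkFragmentShadingRateCombinerOpKHR ops[2] = {
 *       VK_FRAGMENT_SHADING_RATE_COMBINER_OP_KEEP_KHR,
 *       VK_FRAGMENT_SHADING_RATE_COMBINER_OP_KEEP_KHR,
 *    };
 *    vkCmdSetFragmentShadingRateKHR(cmd, &size, ops);
 */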
8607
8608 VKAPI_ATTR void VKAPI_CALL
8609 radv_CmdSetDepthBiasEnable(VkCommandBuffer commandBuffer, VkBool32 depthBiasEnable)
8610 {
8611 VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
8612 struct radv_cmd_state *state = &cmd_buffer->state;
8613
8614 state->dynamic.vk.rs.depth_bias.enable = depthBiasEnable;
8615
8616 state->dirty_dynamic |= RADV_DYNAMIC_DEPTH_BIAS_ENABLE;
8617 }
8618
8619 VKAPI_ATTR void VKAPI_CALL
8620 radv_CmdSetPrimitiveRestartEnable(VkCommandBuffer commandBuffer, VkBool32 primitiveRestartEnable)
8621 {
8622 VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
8623 struct radv_cmd_state *state = &cmd_buffer->state;
8624
8625 state->dynamic.vk.ia.primitive_restart_enable = primitiveRestartEnable;
8626
8627 state->dirty_dynamic |= RADV_DYNAMIC_PRIMITIVE_RESTART_ENABLE;
8628 }
8629
8630 VKAPI_ATTR void VKAPI_CALL
8631 radv_CmdSetRasterizerDiscardEnable(VkCommandBuffer commandBuffer, VkBool32 rasterizerDiscardEnable)
8632 {
8633 VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
8634 struct radv_cmd_state *state = &cmd_buffer->state;
8635
8636 state->dynamic.vk.rs.rasterizer_discard_enable = rasterizerDiscardEnable;
8637
8638 state->dirty_dynamic |= RADV_DYNAMIC_RASTERIZER_DISCARD_ENABLE;
8639 }
8640
8641 VKAPI_ATTR void VKAPI_CALL
8642 radv_CmdSetPatchControlPointsEXT(VkCommandBuffer commandBuffer, uint32_t patchControlPoints)
8643 {
8644 VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
8645 struct radv_cmd_state *state = &cmd_buffer->state;
8646
8647 state->dynamic.vk.ts.patch_control_points = patchControlPoints;
8648
8649 state->dirty_dynamic |= RADV_DYNAMIC_PATCH_CONTROL_POINTS;
8650 }
8651
8652 VKAPI_ATTR void VKAPI_CALL
8653 radv_CmdSetLogicOpEXT(VkCommandBuffer commandBuffer, VkLogicOp logicOp)
8654 {
8655 VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
8656 struct radv_cmd_state *state = &cmd_buffer->state;
8657 unsigned logic_op = radv_translate_blend_logic_op(logicOp);
8658
8659 state->dynamic.vk.cb.logic_op = logic_op;
8660
8661 state->dirty_dynamic |= RADV_DYNAMIC_LOGIC_OP;
8662 }
8663
8664 VKAPI_ATTR void VKAPI_CALL
8665 radv_CmdSetColorWriteEnableEXT(VkCommandBuffer commandBuffer, uint32_t attachmentCount,
8666 const VkBool32 *pColorWriteEnables)
8667 {
8668 VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
8669 struct radv_cmd_state *state = &cmd_buffer->state;
8670 uint8_t color_write_enable = 0;
8671
8672 assert(attachmentCount <= MAX_RTS);
8673
8674 for (uint32_t i = 0; i < attachmentCount; i++) {
8675 if (pColorWriteEnables[i]) {
8676 color_write_enable |= BITFIELD_BIT(i);
8677 }
8678 }
8679
8680 state->dynamic.vk.cb.color_write_enables = color_write_enable;
8681
8682 state->dirty_dynamic |= RADV_DYNAMIC_COLOR_WRITE_ENABLE;
8683 }
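/* Illustrative sketch (application side, not driver code; 'cmd' is an assumed VkCommandBuffer):
 * the per-attachment booleans are packed into the color_write_enable bitmask above, e.g.:
 *
 *    const VkBool32 enables[2] = { VK_TRUE, VK_FALSE };  // write RT0, mask off RT1
 *    vkCmdSetColorWriteEnableEXT(cmd, 2, enables);       // -> color_write_enable == 0x1
 */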
8684
8685 VKAPI_ATTR void VKAPI_CALL
8686 radv_CmdSetVertexInputEXT(VkCommandBuffer commandBuffer, uint32_t vertexBindingDescriptionCount,
8687 const VkVertexInputBindingDescription2EXT *pVertexBindingDescriptions,
8688 uint32_t vertexAttributeDescriptionCount,
8689 const VkVertexInputAttributeDescription2EXT *pVertexAttributeDescriptions)
8690 {
8691 VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
8692 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
8693 const struct radv_physical_device *pdev = radv_device_physical(device);
8694 struct radv_cmd_state *state = &cmd_buffer->state;
8695 struct radv_vertex_input_state *vi_state = &state->vertex_input;
8696
8697 const VkVertexInputBindingDescription2EXT *bindings[MAX_VBS];
8698 for (unsigned i = 0; i < vertexBindingDescriptionCount; i++)
8699 bindings[pVertexBindingDescriptions[i].binding] = &pVertexBindingDescriptions[i];
8700
8701 state->vbo_misaligned_mask = 0;
8702 state->vbo_unaligned_mask = 0;
8703 state->vbo_misaligned_mask_invalid = 0;
8704
8705 vi_state->attribute_mask = 0;
8706 vi_state->instance_rate_inputs = 0;
8707 vi_state->nontrivial_divisors = 0;
8708 vi_state->zero_divisors = 0;
8709 vi_state->post_shuffle = 0;
8710 vi_state->alpha_adjust_lo = 0;
8711 vi_state->alpha_adjust_hi = 0;
8712 vi_state->nontrivial_formats = 0;
8713 vi_state->bindings_match_attrib = true;
8714
8715 enum amd_gfx_level chip = pdev->info.gfx_level;
8716 enum radeon_family family = pdev->info.family;
8717 const struct ac_vtx_format_info *vtx_info_table = ac_get_vtx_format_info_table(chip, family);
8718
8719 for (unsigned i = 0; i < vertexAttributeDescriptionCount; i++) {
8720 const VkVertexInputAttributeDescription2EXT *attrib = &pVertexAttributeDescriptions[i];
8721 const VkVertexInputBindingDescription2EXT *binding = bindings[attrib->binding];
8722 unsigned loc = attrib->location;
8723
8724 vi_state->attribute_mask |= 1u << loc;
8725 vi_state->bindings[loc] = attrib->binding;
8726 if (attrib->binding != loc)
8727 vi_state->bindings_match_attrib = false;
8728 if (binding->inputRate == VK_VERTEX_INPUT_RATE_INSTANCE) {
8729 vi_state->instance_rate_inputs |= 1u << loc;
8730 vi_state->divisors[loc] = binding->divisor;
8731 if (binding->divisor == 0) {
8732 vi_state->zero_divisors |= 1u << loc;
8733 } else if (binding->divisor > 1) {
8734 vi_state->nontrivial_divisors |= 1u << loc;
8735 }
8736 }
8737 cmd_buffer->vertex_bindings[attrib->binding].stride = binding->stride;
8738 vi_state->offsets[loc] = attrib->offset;
8739
8740 enum pipe_format format = vk_format_map[attrib->format];
8741 const struct ac_vtx_format_info *vtx_info = &vtx_info_table[format];
8742
8743 vi_state->formats[loc] = format;
8744 uint8_t format_align_req_minus_1 = vtx_info->chan_byte_size >= 4 ? 3 : (vtx_info->element_size - 1);
8745 vi_state->format_align_req_minus_1[loc] = format_align_req_minus_1;
8746 uint8_t component_align_req_minus_1 =
8747 MIN2(vtx_info->chan_byte_size ? vtx_info->chan_byte_size : vtx_info->element_size, 4) - 1;
8748 vi_state->component_align_req_minus_1[loc] = component_align_req_minus_1;
8749 vi_state->format_sizes[loc] = vtx_info->element_size;
8750 vi_state->alpha_adjust_lo |= (vtx_info->alpha_adjust & 0x1) << loc;
8751 vi_state->alpha_adjust_hi |= (vtx_info->alpha_adjust >> 1) << loc;
8752 if (G_008F0C_DST_SEL_X(vtx_info->dst_sel) == V_008F0C_SQ_SEL_Z)
8753 vi_state->post_shuffle |= BITFIELD_BIT(loc);
8754
8755 if (!(vtx_info->has_hw_format & BITFIELD_BIT(vtx_info->num_channels - 1)))
8756 vi_state->nontrivial_formats |= BITFIELD_BIT(loc);
8757
8758 if (state->vbo_bound_mask & BITFIELD_BIT(attrib->binding)) {
8759 uint32_t stride = binding->stride;
8760 uint64_t offset = cmd_buffer->vertex_bindings[attrib->binding].offset + vi_state->offsets[loc];
8761 if ((chip == GFX6 || chip >= GFX10) && ((stride | offset) & format_align_req_minus_1))
8762 state->vbo_misaligned_mask |= BITFIELD_BIT(loc);
8763 if ((stride | offset) & component_align_req_minus_1)
8764 state->vbo_unaligned_mask |= BITFIELD_BIT(loc);
8765 }
8766 }
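/* Worked example for the alignment bookkeeping above, assuming vtx_info values typical of a plain
 * 3x32-bit float format (chan_byte_size == 4, element_size == 12): format_align_req_minus_1 is 3
 * (dword alignment) and component_align_req_minus_1 is MIN2(4, 4) - 1 == 3, so a bound vertex
 * buffer whose stride or offset is not a multiple of 4 sets the corresponding bit in
 * vbo_misaligned_mask (GFX6/GFX10+) or vbo_unaligned_mask, which the driver later uses when
 * selecting the vertex-fetch prolog.
 */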
8767
8768 state->dirty_dynamic |= RADV_DYNAMIC_VERTEX_INPUT;
8769 state->dirty |= RADV_CMD_DIRTY_VERTEX_BUFFER;
8770 }
8771
8772 VKAPI_ATTR void VKAPI_CALL
8773 radv_CmdSetPolygonModeEXT(VkCommandBuffer commandBuffer, VkPolygonMode polygonMode)
8774 {
8775 VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
8776 struct radv_cmd_state *state = &cmd_buffer->state;
8777 unsigned polygon_mode = radv_translate_fill(polygonMode);
8778
8779 if (radv_polygon_mode_is_points_or_lines(state->dynamic.vk.rs.polygon_mode) !=
8780 radv_polygon_mode_is_points_or_lines(polygon_mode))
8781 state->dirty |= RADV_CMD_DIRTY_GUARDBAND;
8782
8783 state->dynamic.vk.rs.polygon_mode = polygon_mode;
8784
8785 state->dirty_dynamic |= RADV_DYNAMIC_POLYGON_MODE;
8786 }
8787
8788 VKAPI_ATTR void VKAPI_CALL
8789 radv_CmdSetTessellationDomainOriginEXT(VkCommandBuffer commandBuffer, VkTessellationDomainOrigin domainOrigin)
8790 {
8791 VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
8792 struct radv_cmd_state *state = &cmd_buffer->state;
8793
8794 state->dynamic.vk.ts.domain_origin = domainOrigin;
8795
8796 state->dirty_dynamic |= RADV_DYNAMIC_TESS_DOMAIN_ORIGIN;
8797 }
8798
8799 VKAPI_ATTR void VKAPI_CALL
8800 radv_CmdSetLogicOpEnableEXT(VkCommandBuffer commandBuffer, VkBool32 logicOpEnable)
8801 {
8802 VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
8803 struct radv_cmd_state *state = &cmd_buffer->state;
8804
8805 state->dynamic.vk.cb.logic_op_enable = logicOpEnable;
8806
8807 state->dirty_dynamic |= RADV_DYNAMIC_LOGIC_OP_ENABLE;
8808 }
8809
8810 VKAPI_ATTR void VKAPI_CALL
8811 radv_CmdSetLineStippleEnableEXT(VkCommandBuffer commandBuffer, VkBool32 stippledLineEnable)
8812 {
8813 VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
8814 struct radv_cmd_state *state = &cmd_buffer->state;
8815
8816 state->dynamic.vk.rs.line.stipple.enable = stippledLineEnable;
8817
8818 state->dirty_dynamic |= RADV_DYNAMIC_LINE_STIPPLE_ENABLE;
8819 }
8820
8821 VKAPI_ATTR void VKAPI_CALL
8822 radv_CmdSetAlphaToCoverageEnableEXT(VkCommandBuffer commandBuffer, VkBool32 alphaToCoverageEnable)
8823 {
8824 VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
8825 struct radv_cmd_state *state = &cmd_buffer->state;
8826
8827 state->dynamic.vk.ms.alpha_to_coverage_enable = alphaToCoverageEnable;
8828
8829 state->dirty_dynamic |= RADV_DYNAMIC_ALPHA_TO_COVERAGE_ENABLE;
8830 }
8831
8832 VKAPI_ATTR void VKAPI_CALL
8833 radv_CmdSetAlphaToOneEnableEXT(VkCommandBuffer commandBuffer, VkBool32 alphaToOneEnable)
8834 {
8835 VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
8836 struct radv_cmd_state *state = &cmd_buffer->state;
8837
8838 state->dynamic.vk.ms.alpha_to_one_enable = alphaToOneEnable;
8839
8840 state->dirty_dynamic |= RADV_DYNAMIC_ALPHA_TO_ONE_ENABLE;
8841 }
8842
8843 VKAPI_ATTR void VKAPI_CALL
8844 radv_CmdSetSampleMaskEXT(VkCommandBuffer commandBuffer, VkSampleCountFlagBits samples, const VkSampleMask *pSampleMask)
8845 {
8846 VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
8847 struct radv_cmd_state *state = &cmd_buffer->state;
8848
8849 state->dynamic.vk.ms.sample_mask = pSampleMask[0] & 0xffff;
8850
8851 state->dirty_dynamic |= RADV_DYNAMIC_SAMPLE_MASK;
8852 }
8853
8854 VKAPI_ATTR void VKAPI_CALL
8855 radv_CmdSetDepthClipEnableEXT(VkCommandBuffer commandBuffer, VkBool32 depthClipEnable)
8856 {
8857 VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
8858 struct radv_cmd_state *state = &cmd_buffer->state;
8859
8860 state->dynamic.vk.rs.depth_clip_enable = depthClipEnable;
8861
8862 state->dirty_dynamic |= RADV_DYNAMIC_DEPTH_CLIP_ENABLE;
8863 }
8864
8865 VKAPI_ATTR void VKAPI_CALL
8866 radv_CmdSetConservativeRasterizationModeEXT(VkCommandBuffer commandBuffer,
8867 VkConservativeRasterizationModeEXT conservativeRasterizationMode)
8868 {
8869 VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
8870 struct radv_cmd_state *state = &cmd_buffer->state;
8871
8872 state->dynamic.vk.rs.conservative_mode = conservativeRasterizationMode;
8873
8874 state->dirty_dynamic |= RADV_DYNAMIC_CONSERVATIVE_RAST_MODE;
8875 }
8876
8877 VKAPI_ATTR void VKAPI_CALL
8878 radv_CmdSetDepthClipNegativeOneToOneEXT(VkCommandBuffer commandBuffer, VkBool32 negativeOneToOne)
8879 {
8880 VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
8881 struct radv_cmd_state *state = &cmd_buffer->state;
8882
8883 state->dynamic.vk.vp.depth_clip_negative_one_to_one = negativeOneToOne;
8884
8885 state->dirty_dynamic |= RADV_DYNAMIC_DEPTH_CLIP_NEGATIVE_ONE_TO_ONE;
8886 }
8887
8888 VKAPI_ATTR void VKAPI_CALL
8889 radv_CmdSetProvokingVertexModeEXT(VkCommandBuffer commandBuffer, VkProvokingVertexModeEXT provokingVertexMode)
8890 {
8891 VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
8892 struct radv_cmd_state *state = &cmd_buffer->state;
8893
8894 state->dynamic.vk.rs.provoking_vertex = provokingVertexMode;
8895
8896 state->dirty_dynamic |= RADV_DYNAMIC_PROVOKING_VERTEX_MODE;
8897 }
8898
8899 VKAPI_ATTR void VKAPI_CALL
8900 radv_CmdSetDepthClampEnableEXT(VkCommandBuffer commandBuffer, VkBool32 depthClampEnable)
8901 {
8902 VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
8903 struct radv_cmd_state *state = &cmd_buffer->state;
8904
8905 state->dynamic.vk.rs.depth_clamp_enable = depthClampEnable;
8906
8907 state->dirty_dynamic |= RADV_DYNAMIC_DEPTH_CLAMP_ENABLE;
8908 }
8909
8910 VKAPI_ATTR void VKAPI_CALL
8911 radv_CmdSetColorWriteMaskEXT(VkCommandBuffer commandBuffer, uint32_t firstAttachment, uint32_t attachmentCount,
8912 const VkColorComponentFlags *pColorWriteMasks)
8913 {
8914 VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
8915 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
8916 const struct radv_physical_device *pdev = radv_device_physical(device);
8917 struct radv_cmd_state *state = &cmd_buffer->state;
8918
8919 assert(firstAttachment + attachmentCount <= MAX_RTS);
8920
8921 for (uint32_t i = 0; i < attachmentCount; i++) {
8922 uint32_t idx = firstAttachment + i;
8923
8924 state->dynamic.vk.cb.attachments[idx].write_mask = pColorWriteMasks[i];
8925 }
8926
8927 state->dirty_dynamic |= RADV_DYNAMIC_COLOR_WRITE_MASK;
8928
8929 if (pdev->info.rbplus_allowed)
8930 state->dirty |= RADV_CMD_DIRTY_RBPLUS;
8931 }
8932
8933 VKAPI_ATTR void VKAPI_CALL
8934 radv_CmdSetColorBlendEnableEXT(VkCommandBuffer commandBuffer, uint32_t firstAttachment, uint32_t attachmentCount,
8935 const VkBool32 *pColorBlendEnables)
8936 {
8937 VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
8938 struct radv_cmd_state *state = &cmd_buffer->state;
8939
8940 assert(firstAttachment + attachmentCount <= MAX_RTS);
8941
8942 for (uint32_t i = 0; i < attachmentCount; i++) {
8943 uint32_t idx = firstAttachment + i;
8944
8945 state->dynamic.vk.cb.attachments[idx].blend_enable = pColorBlendEnables[i];
8946 }
8947
8948 state->dirty_dynamic |= RADV_DYNAMIC_COLOR_BLEND_ENABLE;
8949 }
8950
8951 VKAPI_ATTR void VKAPI_CALL
8952 radv_CmdSetRasterizationSamplesEXT(VkCommandBuffer commandBuffer, VkSampleCountFlagBits rasterizationSamples)
8953 {
8954 VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
8955 struct radv_cmd_state *state = &cmd_buffer->state;
8956
8957 state->dynamic.vk.ms.rasterization_samples = rasterizationSamples;
8958
8959 state->dirty_dynamic |= RADV_DYNAMIC_RASTERIZATION_SAMPLES;
8960 }
8961
8962 VKAPI_ATTR void VKAPI_CALL
8963 radv_CmdSetLineRasterizationModeEXT(VkCommandBuffer commandBuffer, VkLineRasterizationModeKHR lineRasterizationMode)
8964 {
8965 VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
8966 struct radv_cmd_state *state = &cmd_buffer->state;
8967
8968 state->dynamic.vk.rs.line.mode = lineRasterizationMode;
8969
8970 state->dirty_dynamic |= RADV_DYNAMIC_LINE_RASTERIZATION_MODE;
8971 }
8972
8973 VKAPI_ATTR void VKAPI_CALL
8974 radv_CmdSetColorBlendEquationEXT(VkCommandBuffer commandBuffer, uint32_t firstAttachment, uint32_t attachmentCount,
8975 const VkColorBlendEquationEXT *pColorBlendEquations)
8976 {
8977 VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
8978 struct radv_cmd_state *state = &cmd_buffer->state;
8979
8980 assert(firstAttachment + attachmentCount <= MAX_RTS);
8981 for (uint32_t i = 0; i < attachmentCount; i++) {
8982 unsigned idx = firstAttachment + i;
8983
8984 state->dynamic.vk.cb.attachments[idx].src_color_blend_factor = pColorBlendEquations[i].srcColorBlendFactor;
8985 state->dynamic.vk.cb.attachments[idx].dst_color_blend_factor = pColorBlendEquations[i].dstColorBlendFactor;
8986 state->dynamic.vk.cb.attachments[idx].color_blend_op = pColorBlendEquations[i].colorBlendOp;
8987 state->dynamic.vk.cb.attachments[idx].src_alpha_blend_factor = pColorBlendEquations[i].srcAlphaBlendFactor;
8988 state->dynamic.vk.cb.attachments[idx].dst_alpha_blend_factor = pColorBlendEquations[i].dstAlphaBlendFactor;
8989 state->dynamic.vk.cb.attachments[idx].alpha_blend_op = pColorBlendEquations[i].alphaBlendOp;
8990 }
8991
8992 state->dirty_dynamic |= RADV_DYNAMIC_COLOR_BLEND_EQUATION;
8993 }
8994
8995 VKAPI_ATTR void VKAPI_CALL
8996 radv_CmdSetSampleLocationsEnableEXT(VkCommandBuffer commandBuffer, VkBool32 sampleLocationsEnable)
8997 {
8998 VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
8999 struct radv_cmd_state *state = &cmd_buffer->state;
9000
9001 state->dynamic.vk.ms.sample_locations_enable = sampleLocationsEnable;
9002
9003 state->dirty_dynamic |= RADV_DYNAMIC_SAMPLE_LOCATIONS_ENABLE;
9004 }
9005
9006 VKAPI_ATTR void VKAPI_CALL
9007 radv_CmdSetDiscardRectangleEnableEXT(VkCommandBuffer commandBuffer, VkBool32 discardRectangleEnable)
9008 {
9009 VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
9010 struct radv_cmd_state *state = &cmd_buffer->state;
9011
9012 state->dynamic.vk.dr.enable = discardRectangleEnable;
9013 state->dynamic.vk.dr.rectangle_count = discardRectangleEnable ? MAX_DISCARD_RECTANGLES : 0;
9014
9015 state->dirty_dynamic |= RADV_DYNAMIC_DISCARD_RECTANGLE_ENABLE;
9016 }
9017
9018 VKAPI_ATTR void VKAPI_CALL
9019 radv_CmdSetDiscardRectangleModeEXT(VkCommandBuffer commandBuffer, VkDiscardRectangleModeEXT discardRectangleMode)
9020 {
9021 VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
9022 struct radv_cmd_state *state = &cmd_buffer->state;
9023
9024 state->dynamic.vk.dr.mode = discardRectangleMode;
9025
9026 state->dirty_dynamic |= RADV_DYNAMIC_DISCARD_RECTANGLE_MODE;
9027 }
9028
9029 VKAPI_ATTR void VKAPI_CALL
9030 radv_CmdSetAttachmentFeedbackLoopEnableEXT(VkCommandBuffer commandBuffer, VkImageAspectFlags aspectMask)
9031 {
9032 VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
9033 struct radv_cmd_state *state = &cmd_buffer->state;
9034
9035 state->dynamic.feedback_loop_aspects = aspectMask;
9036
9037 state->dirty_dynamic |= RADV_DYNAMIC_ATTACHMENT_FEEDBACK_LOOP_ENABLE;
9038 }
9039
9040 VKAPI_ATTR void VKAPI_CALL
9041 radv_CmdSetDepthBias2EXT(VkCommandBuffer commandBuffer, const VkDepthBiasInfoEXT *pDepthBiasInfo)
9042 {
9043 VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
9044 struct radv_cmd_state *state = &cmd_buffer->state;
9045
9046 const VkDepthBiasRepresentationInfoEXT *dbr_info =
9047 vk_find_struct_const(pDepthBiasInfo->pNext, DEPTH_BIAS_REPRESENTATION_INFO_EXT);
9048
9049 state->dynamic.vk.rs.depth_bias.constant = pDepthBiasInfo->depthBiasConstantFactor;
9050 state->dynamic.vk.rs.depth_bias.clamp = pDepthBiasInfo->depthBiasClamp;
9051 state->dynamic.vk.rs.depth_bias.slope = pDepthBiasInfo->depthBiasSlopeFactor;
9052 state->dynamic.vk.rs.depth_bias.representation =
9053 dbr_info ? dbr_info->depthBiasRepresentation : VK_DEPTH_BIAS_REPRESENTATION_LEAST_REPRESENTABLE_VALUE_FORMAT_EXT;
9054
9055 state->dirty_dynamic |= RADV_DYNAMIC_DEPTH_BIAS;
9056 }
9057
9058 VKAPI_ATTR void VKAPI_CALL
9059 radv_CmdSetRenderingAttachmentLocationsKHR(VkCommandBuffer commandBuffer,
9060 const VkRenderingAttachmentLocationInfoKHR *pLocationInfo)
9061 {
9062 VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
9063 struct radv_cmd_state *state = &cmd_buffer->state;
9064
9065 assume(pLocationInfo->colorAttachmentCount <= MESA_VK_MAX_COLOR_ATTACHMENTS);
9066 for (uint32_t i = 0; i < pLocationInfo->colorAttachmentCount; i++) {
9067 state->dynamic.vk.cal.color_map[i] = pLocationInfo->pColorAttachmentLocations[i] == VK_ATTACHMENT_UNUSED
9068 ? MESA_VK_ATTACHMENT_UNUSED
9069 : pLocationInfo->pColorAttachmentLocations[i];
9070 }
9071
9072 state->dirty_dynamic |= RADV_DYNAMIC_COLOR_ATTACHMENT_MAP;
9073 state->dirty |= RADV_CMD_DIRTY_FBFETCH_OUTPUT;
9074 }
9075
9076 VKAPI_ATTR void VKAPI_CALL
9077 radv_CmdSetRenderingInputAttachmentIndicesKHR(VkCommandBuffer commandBuffer,
9078 const VkRenderingInputAttachmentIndexInfoKHR *pLocationInfo)
9079 {
9080 VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
9081 struct radv_cmd_state *state = &cmd_buffer->state;
9082
9083 assume(pLocationInfo->colorAttachmentCount <= MESA_VK_MAX_COLOR_ATTACHMENTS);
9084 for (uint32_t i = 0; i < pLocationInfo->colorAttachmentCount; i++) {
9085 uint8_t val;
9086
9087 if (!pLocationInfo->pColorAttachmentInputIndices) {
9088 val = i;
9089 } else if (pLocationInfo->pColorAttachmentInputIndices[i] == VK_ATTACHMENT_UNUSED) {
9090 val = MESA_VK_ATTACHMENT_UNUSED;
9091 } else {
9092 val = pLocationInfo->pColorAttachmentInputIndices[i];
9093 }
9094
9095 state->dynamic.vk.ial.color_map[i] = val;
9096 }
9097
9098 state->dynamic.vk.ial.depth_att = (pLocationInfo->pDepthInputAttachmentIndex == NULL ||
9099 *pLocationInfo->pDepthInputAttachmentIndex == VK_ATTACHMENT_UNUSED)
9100 ? MESA_VK_ATTACHMENT_UNUSED
9101 : *pLocationInfo->pDepthInputAttachmentIndex;
9102 state->dynamic.vk.ial.stencil_att = (pLocationInfo->pStencilInputAttachmentIndex == NULL ||
9103 *pLocationInfo->pStencilInputAttachmentIndex == VK_ATTACHMENT_UNUSED)
9104 ? MESA_VK_ATTACHMENT_UNUSED
9105 : *pLocationInfo->pStencilInputAttachmentIndex;
9106
9107 state->dirty_dynamic |= RADV_DYNAMIC_INPUT_ATTACHMENT_MAP;
9108 state->dirty |= RADV_CMD_DIRTY_FBFETCH_OUTPUT;
9109 }
9110
9111 static void
9112 radv_handle_color_fbfetch_output(struct radv_cmd_buffer *cmd_buffer, uint32_t index)
9113 {
9114 const struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
9115 struct radv_rendering_state *render = &cmd_buffer->state.render;
9116 const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
9117 struct radv_attachment *att = &render->color_att[index];
9118
9119 if (!att->iview)
9120 return;
9121
9122 const struct radv_image *image = att->iview->image;
9123 if (!(image->vk.usage & VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT))
9124 return;
9125
9126 if (!radv_layout_dcc_compressed(device, image, att->iview->vk.base_mip_level, att->layout,
9127 radv_image_queue_family_mask(att->iview->image, cmd_buffer->qf, cmd_buffer->qf)))
9128 return;
9129
9130 const uint32_t color_att_idx = d->vk.cal.color_map[index];
9131 if (color_att_idx == MESA_VK_ATTACHMENT_UNUSED)
9132 return;
9133
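   /* Only continue when the input attachment map for the remapped location points back to
    * that same location, i.e. the color output can be fetched back as an input attachment.
    */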
9134 if (d->vk.ial.color_map[color_att_idx] != color_att_idx)
9135 return;
9136
9137 const VkImageSubresourceRange range = {
9138 .aspectMask = att->iview->vk.aspects,
9139 .baseMipLevel = att->iview->vk.base_mip_level,
9140 .levelCount = att->iview->vk.level_count,
9141 .baseArrayLayer = att->iview->vk.base_array_layer,
9142 .layerCount = att->iview->vk.layer_count,
9143 };
9144
9145 /* Consider previous rendering work for WAW hazards. */
9146 cmd_buffer->state.flush_bits |= radv_src_access_flush(cmd_buffer, VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT,
9147 VK_ACCESS_2_COLOR_ATTACHMENT_WRITE_BIT, att->iview->image);
9148
9149 /* Force a transition to FEEDBACK_LOOP_OPTIMAL to decompress DCC. */
9150 radv_handle_image_transition(cmd_buffer, att->iview->image, att->layout,
9151 VK_IMAGE_LAYOUT_ATTACHMENT_FEEDBACK_LOOP_OPTIMAL_EXT, RADV_QUEUE_GENERAL,
9152 RADV_QUEUE_GENERAL, &range, NULL);
9153
9154 att->layout = VK_IMAGE_LAYOUT_ATTACHMENT_FEEDBACK_LOOP_OPTIMAL_EXT;
9155
9156 cmd_buffer->state.flush_bits |= radv_dst_access_flush(
9157 cmd_buffer, VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT,
9158 VK_ACCESS_2_INPUT_ATTACHMENT_READ_BIT | VK_ACCESS_2_COLOR_ATTACHMENT_READ_BIT, att->iview->image);
9159
9160 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_FRAMEBUFFER;
9161 }
9162
9163 static void
9164 radv_handle_depth_fbfetch_output(struct radv_cmd_buffer *cmd_buffer)
9165 {
9166 const struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
9167 struct radv_rendering_state *render = &cmd_buffer->state.render;
9168 const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
9169 struct radv_attachment *att = &render->ds_att;
9170
9171 if (!att->iview)
9172 return;
9173
9174 const struct radv_image *image = att->iview->image;
9175 if (!(image->vk.usage & VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT))
9176 return;
9177
9178 if (!radv_layout_is_htile_compressed(
9179 device, att->iview->image, att->layout,
9180 radv_image_queue_family_mask(att->iview->image, cmd_buffer->qf, cmd_buffer->qf)))
9181 return;
9182
9183 if (d->vk.ial.depth_att == MESA_VK_ATTACHMENT_UNUSED && d->vk.ial.stencil_att == MESA_VK_ATTACHMENT_UNUSED)
9184 return;
9185
9186 const VkImageSubresourceRange range = {
9187 .aspectMask = att->iview->vk.aspects,
9188 .baseMipLevel = att->iview->vk.base_mip_level,
9189 .levelCount = att->iview->vk.level_count,
9190 .baseArrayLayer = att->iview->vk.base_array_layer,
9191 .layerCount = att->iview->vk.layer_count,
9192 };
9193
9194 /* Consider previous rendering work for WAW hazards. */
9195 cmd_buffer->state.flush_bits |=
9196 radv_src_access_flush(cmd_buffer, VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT,
9197 VK_ACCESS_2_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT, att->iview->image);
9198
9199 /* Force a transition to FEEDBACK_LOOP_OPTIMAL to decompress HTILE. */
9200 radv_handle_image_transition(cmd_buffer, att->iview->image, att->layout,
9201 VK_IMAGE_LAYOUT_ATTACHMENT_FEEDBACK_LOOP_OPTIMAL_EXT, RADV_QUEUE_GENERAL,
9202 RADV_QUEUE_GENERAL, &range, NULL);
9203
9204 att->layout = VK_IMAGE_LAYOUT_ATTACHMENT_FEEDBACK_LOOP_OPTIMAL_EXT;
9205 att->stencil_layout = VK_IMAGE_LAYOUT_ATTACHMENT_FEEDBACK_LOOP_OPTIMAL_EXT;
9206
9207 cmd_buffer->state.flush_bits |= radv_dst_access_flush(
9208 cmd_buffer, VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT,
9209 VK_ACCESS_2_INPUT_ATTACHMENT_READ_BIT | VK_ACCESS_2_DEPTH_STENCIL_ATTACHMENT_READ_BIT, att->iview->image);
9210
9211 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_FRAMEBUFFER;
9212 }
9213
9214 static void
9215 radv_handle_fbfetch_output(struct radv_cmd_buffer *cmd_buffer)
9216 {
9217 const struct radv_rendering_state *render = &cmd_buffer->state.render;
9218
9219 cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_FBFETCH_OUTPUT;
9220
9221 /* Nothing to do when dynamic rendering doesn't use concurrent input attachment writes. */
9222 if (render->has_input_attachment_no_concurrent_writes)
9223 return;
9224
9225 /* Nothing to do when the bound fragment shader doesn't use subpass input attachments. */
9226 if (!cmd_buffer->state.uses_fbfetch_output)
9227 return;
9228
9229 /* Check if any color attachments are compressed and also used as input attachments. */
9230 for (uint32_t i = 0; i < render->color_att_count; i++) {
9231 radv_handle_color_fbfetch_output(cmd_buffer, i);
9232 }
9233
9234 /* Check if the depth/stencil attachment is compressed and also used as an input attachment. */
9235 radv_handle_depth_fbfetch_output(cmd_buffer);
9236 }
9237
9238 VKAPI_ATTR void VKAPI_CALL
9239 radv_CmdExecuteCommands(VkCommandBuffer commandBuffer, uint32_t commandBufferCount, const VkCommandBuffer *pCmdBuffers)
9240 {
9241 VK_FROM_HANDLE(radv_cmd_buffer, primary, commandBuffer);
9242 struct radv_device *device = radv_cmd_buffer_device(primary);
9243 const struct radv_physical_device *pdev = radv_device_physical(device);
9244
9245 assert(commandBufferCount > 0);
9246
9247 radv_emit_mip_change_flush_default(primary);
9248
9249 /* Emit pending flushes on primary prior to executing secondary */
9250 radv_emit_cache_flush(primary);
9251
9252 /* Make sure CP DMA is idle on primary prior to executing secondary. */
9253 radv_cp_dma_wait_for_idle(primary);
9254
9255 for (uint32_t i = 0; i < commandBufferCount; i++) {
9256 VK_FROM_HANDLE(radv_cmd_buffer, secondary, pCmdBuffers[i]);
9257
9258 /* Do not launch an IB2 for secondary command buffers that contain
9259 * DRAW_{INDEX}_INDIRECT_{MULTI} on GFX6-7 because it's illegal and hangs the GPU.
9260 */
9261 const bool allow_ib2 = !secondary->state.uses_draw_indirect || pdev->info.gfx_level >= GFX8;
9262
9263 primary->scratch_size_per_wave_needed =
9264 MAX2(primary->scratch_size_per_wave_needed, secondary->scratch_size_per_wave_needed);
9265 primary->scratch_waves_wanted = MAX2(primary->scratch_waves_wanted, secondary->scratch_waves_wanted);
9266 primary->compute_scratch_size_per_wave_needed =
9267 MAX2(primary->compute_scratch_size_per_wave_needed, secondary->compute_scratch_size_per_wave_needed);
9268 primary->compute_scratch_waves_wanted =
9269 MAX2(primary->compute_scratch_waves_wanted, secondary->compute_scratch_waves_wanted);
9270
9271 if (secondary->esgs_ring_size_needed > primary->esgs_ring_size_needed)
9272 primary->esgs_ring_size_needed = secondary->esgs_ring_size_needed;
9273 if (secondary->gsvs_ring_size_needed > primary->gsvs_ring_size_needed)
9274 primary->gsvs_ring_size_needed = secondary->gsvs_ring_size_needed;
9275 if (secondary->tess_rings_needed)
9276 primary->tess_rings_needed = true;
9277 if (secondary->task_rings_needed)
9278 primary->task_rings_needed = true;
9279 if (secondary->mesh_scratch_ring_needed)
9280 primary->mesh_scratch_ring_needed = true;
9281 if (secondary->sample_positions_needed)
9282 primary->sample_positions_needed = true;
9283 if (secondary->gds_needed)
9284 primary->gds_needed = true;
9285 if (secondary->gds_oa_needed)
9286 primary->gds_oa_needed = true;
9287
9288 primary->shader_upload_seq = MAX2(primary->shader_upload_seq, secondary->shader_upload_seq);
9289
9290 primary->state.uses_fbfetch_output |= secondary->state.uses_fbfetch_output;
9291
9292 if (!secondary->state.render.has_image_views) {
9293 if (primary->state.dirty & RADV_CMD_DIRTY_FBFETCH_OUTPUT)
9294 radv_handle_fbfetch_output(primary);
9295
9296 if (primary->state.render.active && (primary->state.dirty & RADV_CMD_DIRTY_FRAMEBUFFER)) {
9297 /* Emit the framebuffer state from primary if secondary
9298 * has been recorded without a framebuffer, otherwise
9299 * fast color/depth clears can't work.
9300 */
9301 radv_emit_framebuffer_state(primary);
9302 }
9303 }
9304
9305 if (secondary->gang.cs) {
9306 if (!radv_gang_init(primary))
9307 return;
9308
9309 struct radeon_cmdbuf *ace_primary = primary->gang.cs;
9310 struct radeon_cmdbuf *ace_secondary = secondary->gang.cs;
9311
9312 /* Emit pending flushes on primary prior to executing secondary. */
9313 radv_gang_cache_flush(primary);
9314
9315 /* Wait for gang semaphores, if necessary. */
9316 if (radv_flush_gang_leader_semaphore(primary))
9317 radv_wait_gang_leader(primary);
9318 if (radv_flush_gang_follower_semaphore(primary))
9319 radv_wait_gang_follower(primary);
9320
9321 /* Execute the secondary compute cmdbuf.
9322 * Don't use IB2 packets because they are not supported on compute queues.
9323 */
9324 device->ws->cs_execute_secondary(ace_primary, ace_secondary, false);
9325 }
9326
9327 /* Update pending ACE internal flush bits from the secondary cmdbuf */
9328 primary->gang.flush_bits |= secondary->gang.flush_bits;
9329
9330 /* Increment gang semaphores if secondary was dirty.
9331 * This happens when the secondary cmdbuf has a barrier which
9332 * isn't consumed by a draw call.
9333 */
9334 if (radv_gang_leader_sem_dirty(secondary))
9335 primary->gang.sem.leader_value++;
9336 if (radv_gang_follower_sem_dirty(secondary))
9337 primary->gang.sem.follower_value++;
9338
9339 device->ws->cs_execute_secondary(primary->cs, secondary->cs, allow_ib2);
9340
9341 /* When the secondary command buffer is compute only we don't
9342 * need to re-emit the current graphics pipeline.
9343 */
9344 if (secondary->state.emitted_graphics_pipeline) {
9345 primary->state.emitted_graphics_pipeline = secondary->state.emitted_graphics_pipeline;
9346 }
9347
9348 /* When the secondary command buffer is graphics only we don't
9349 * need to re-emit the current compute pipeline.
9350 */
9351 if (secondary->state.emitted_compute_pipeline) {
9352 primary->state.emitted_compute_pipeline = secondary->state.emitted_compute_pipeline;
9353 }
9354
9355 if (secondary->state.last_ia_multi_vgt_param) {
9356 primary->state.last_ia_multi_vgt_param = secondary->state.last_ia_multi_vgt_param;
9357 }
9358
9359 if (secondary->state.last_ge_cntl) {
9360 primary->state.last_ge_cntl = secondary->state.last_ge_cntl;
9361 }
9362
9363 primary->state.last_num_instances = secondary->state.last_num_instances;
9364 primary->state.last_subpass_color_count = secondary->state.last_subpass_color_count;
9365
9366 if (secondary->state.last_index_type != -1) {
9367 primary->state.last_index_type = secondary->state.last_index_type;
9368 }
9369
9370 primary->state.last_vrs_rates = secondary->state.last_vrs_rates;
9371 primary->state.last_force_vrs_rates_offset = secondary->state.last_force_vrs_rates_offset;
9372
9373 primary->state.rb_noncoherent_dirty |= secondary->state.rb_noncoherent_dirty;
9374
9375 primary->state.uses_draw_indirect |= secondary->state.uses_draw_indirect;
9376
9377 for (uint32_t reg = 0; reg < RADV_NUM_ALL_TRACKED_REGS; reg++) {
9378 if (!BITSET_TEST(secondary->tracked_regs.reg_saved_mask, reg))
9379 continue;
9380
9381 BITSET_SET(primary->tracked_regs.reg_saved_mask, reg);
9382 primary->tracked_regs.reg_value[reg] = secondary->tracked_regs.reg_value[reg];
9383 }
9384
9385 memcpy(primary->tracked_regs.spi_ps_input_cntl, secondary->tracked_regs.spi_ps_input_cntl,
9386 sizeof(primary->tracked_regs.spi_ps_input_cntl));
9387 }
9388
9389 /* After executing commands from secondary buffers we have to dirty
9390 * some states.
9391 */
9392 primary->state.dirty_dynamic |= RADV_DYNAMIC_ALL;
9393 primary->state.dirty |= RADV_CMD_DIRTY_PIPELINE | RADV_CMD_DIRTY_INDEX_BUFFER | RADV_CMD_DIRTY_GUARDBAND |
9394 RADV_CMD_DIRTY_SHADER_QUERY | RADV_CMD_DIRTY_OCCLUSION_QUERY |
9395 RADV_CMD_DIRTY_DB_SHADER_CONTROL | RADV_CMD_DIRTY_COLOR_OUTPUT;
9396 radv_mark_descriptor_sets_dirty(primary, VK_PIPELINE_BIND_POINT_GRAPHICS);
9397 radv_mark_descriptor_sets_dirty(primary, VK_PIPELINE_BIND_POINT_COMPUTE);
9398
9399 primary->state.last_first_instance = -1;
9400 primary->state.last_drawid = -1;
9401 primary->state.last_vertex_offset_valid = false;
9402 }
9403
9404 static void
9405 radv_mark_noncoherent_rb(struct radv_cmd_buffer *cmd_buffer)
9406 {
9407 struct radv_rendering_state *render = &cmd_buffer->state.render;
9408
9409 /* Have to be conservative in cmdbuffers with inherited attachments. */
9410 if (!render->has_image_views) {
9411 cmd_buffer->state.rb_noncoherent_dirty = true;
9412 return;
9413 }
9414
9415 for (uint32_t i = 0; i < render->color_att_count; i++) {
9416 if (render->color_att[i].iview && !render->color_att[i].iview->image->l2_coherent) {
9417 cmd_buffer->state.rb_noncoherent_dirty = true;
9418 return;
9419 }
9420 }
9421 if (render->ds_att.iview && !render->ds_att.iview->image->l2_coherent)
9422 cmd_buffer->state.rb_noncoherent_dirty = true;
9423 }
9424
9425 static VkImageLayout
9426 attachment_initial_layout(const VkRenderingAttachmentInfo *att)
9427 {
9428 const VkRenderingAttachmentInitialLayoutInfoMESA *layout_info =
9429 vk_find_struct_const(att->pNext, RENDERING_ATTACHMENT_INITIAL_LAYOUT_INFO_MESA);
9430 if (layout_info != NULL)
9431 return layout_info->initialLayout;
9432
9433 return att->imageLayout;
9434 }
9435
9436 VKAPI_ATTR void VKAPI_CALL
9437 radv_CmdBeginRendering(VkCommandBuffer commandBuffer, const VkRenderingInfo *pRenderingInfo)
9438 {
9439 VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
9440 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
9441 const struct radv_physical_device *pdev = radv_device_physical(device);
9442
9443 const struct VkSampleLocationsInfoEXT *sample_locs_info =
9444 vk_find_struct_const(pRenderingInfo->pNext, SAMPLE_LOCATIONS_INFO_EXT);
9445
9446 struct radv_sample_locations_state sample_locations = {
9447 .count = 0,
9448 };
9449 if (sample_locs_info) {
9450 sample_locations = (struct radv_sample_locations_state){
9451 .per_pixel = sample_locs_info->sampleLocationsPerPixel,
9452 .grid_size = sample_locs_info->sampleLocationGridSize,
9453 .count = sample_locs_info->sampleLocationsCount,
9454 };
9455 typed_memcpy(sample_locations.locations, sample_locs_info->pSampleLocations,
9456 sample_locs_info->sampleLocationsCount);
9457 }
9458
9459 /* Dynamic rendering does not have implicit transitions, so limit the marker to
9460 * when a render pass is used.
9461 * Additionally, some internal meta operations called inside a barrier may issue
9462 * render calls (with dynamic rendering), so this makes sure those cases don't
9463 * create a nested barrier scope.
9464 */
9465 if (cmd_buffer->vk.render_pass)
9466 radv_describe_barrier_start(cmd_buffer, RGP_BARRIER_EXTERNAL_RENDER_PASS_SYNC);
9467 uint32_t color_samples = 0, ds_samples = 0;
9468 struct radv_attachment color_att[MAX_RTS];
9469 for (uint32_t i = 0; i < pRenderingInfo->colorAttachmentCount; i++) {
9470 const VkRenderingAttachmentInfo *att_info = &pRenderingInfo->pColorAttachments[i];
9471
9472 color_att[i] = (struct radv_attachment){.iview = NULL};
9473 if (att_info->imageView == VK_NULL_HANDLE)
9474 continue;
9475
9476 VK_FROM_HANDLE(radv_image_view, iview, att_info->imageView);
9477 color_att[i].format = iview->vk.format;
9478 color_att[i].iview = iview;
9479 color_att[i].layout = att_info->imageLayout;
9480 radv_initialise_color_surface(device, &color_att[i].cb, iview);
9481
9482 if (att_info->resolveMode != VK_RESOLVE_MODE_NONE && att_info->resolveImageView != VK_NULL_HANDLE) {
9483 color_att[i].resolve_mode = att_info->resolveMode;
9484 color_att[i].resolve_iview = radv_image_view_from_handle(att_info->resolveImageView);
9485 color_att[i].resolve_layout = att_info->resolveImageLayout;
9486 }
9487
9488 color_samples = MAX2(color_samples, color_att[i].iview->vk.image->samples);
9489
9490 VkImageLayout initial_layout = attachment_initial_layout(att_info);
9491 if (initial_layout != color_att[i].layout) {
9492 assert(!(pRenderingInfo->flags & VK_RENDERING_RESUMING_BIT));
9493 radv_handle_rendering_image_transition(cmd_buffer, color_att[i].iview, pRenderingInfo->layerCount,
9494 pRenderingInfo->viewMask, initial_layout, VK_IMAGE_LAYOUT_UNDEFINED,
9495 color_att[i].layout, VK_IMAGE_LAYOUT_UNDEFINED, &sample_locations);
9496 }
9497 }
9498
9499 struct radv_attachment ds_att = {.iview = NULL};
9500 VkImageAspectFlags ds_att_aspects = 0;
9501 const VkRenderingAttachmentInfo *d_att_info = pRenderingInfo->pDepthAttachment;
9502 const VkRenderingAttachmentInfo *s_att_info = pRenderingInfo->pStencilAttachment;
9503 if ((d_att_info != NULL && d_att_info->imageView != VK_NULL_HANDLE) ||
9504 (s_att_info != NULL && s_att_info->imageView != VK_NULL_HANDLE)) {
9505 struct radv_image_view *d_iview = NULL, *s_iview = NULL;
9506 struct radv_image_view *d_res_iview = NULL, *s_res_iview = NULL;
9507 VkImageLayout initial_depth_layout = VK_IMAGE_LAYOUT_UNDEFINED;
9508 VkImageLayout initial_stencil_layout = VK_IMAGE_LAYOUT_UNDEFINED;
9509
9510 if (d_att_info != NULL && d_att_info->imageView != VK_NULL_HANDLE) {
9511 d_iview = radv_image_view_from_handle(d_att_info->imageView);
9512 initial_depth_layout = attachment_initial_layout(d_att_info);
9513 ds_att.layout = d_att_info->imageLayout;
9514
9515 if (d_att_info->resolveMode != VK_RESOLVE_MODE_NONE && d_att_info->resolveImageView != VK_NULL_HANDLE) {
9516 d_res_iview = radv_image_view_from_handle(d_att_info->resolveImageView);
9517 ds_att.resolve_mode = d_att_info->resolveMode;
9518 ds_att.resolve_layout = d_att_info->resolveImageLayout;
9519 }
9520 }
9521
9522 if (s_att_info != NULL && s_att_info->imageView != VK_NULL_HANDLE) {
9523 s_iview = radv_image_view_from_handle(s_att_info->imageView);
9524 initial_stencil_layout = attachment_initial_layout(s_att_info);
9525 ds_att.stencil_layout = s_att_info->imageLayout;
9526
9527 if (s_att_info->resolveMode != VK_RESOLVE_MODE_NONE && s_att_info->resolveImageView != VK_NULL_HANDLE) {
9528 s_res_iview = radv_image_view_from_handle(s_att_info->resolveImageView);
9529 ds_att.stencil_resolve_mode = s_att_info->resolveMode;
9530 ds_att.stencil_resolve_layout = s_att_info->resolveImageLayout;
9531 }
9532 }
9533
9534 assert(d_iview == NULL || s_iview == NULL || d_iview == s_iview);
9535 ds_att.iview = d_iview ? d_iview : s_iview; ds_att.format = ds_att.iview->vk.format;
9536
9537 if (d_iview && s_iview) {
9538 ds_att_aspects = VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT;
9539 } else if (d_iview) {
9540 ds_att_aspects = VK_IMAGE_ASPECT_DEPTH_BIT;
9541 } else {
9542 ds_att_aspects = VK_IMAGE_ASPECT_STENCIL_BIT;
9543 }
9544
9545 radv_initialise_ds_surface(device, &ds_att.ds, ds_att.iview, ds_att_aspects);
9546
9547 assert(d_res_iview == NULL || s_res_iview == NULL || d_res_iview == s_res_iview);
9548 ds_att.resolve_iview = d_res_iview ? d_res_iview : s_res_iview;
9549
9550 ds_samples = ds_att.iview->vk.image->samples;
9551
9552 if (initial_depth_layout != ds_att.layout || initial_stencil_layout != ds_att.stencil_layout) {
9553 assert(!(pRenderingInfo->flags & VK_RENDERING_RESUMING_BIT));
9554 radv_handle_rendering_image_transition(cmd_buffer, ds_att.iview, pRenderingInfo->layerCount,
9555 pRenderingInfo->viewMask, initial_depth_layout, initial_stencil_layout,
9556 ds_att.layout, ds_att.stencil_layout, &sample_locations);
9557 }
9558 }
9559 if (cmd_buffer->vk.render_pass)
9560 radv_describe_barrier_end(cmd_buffer);
9561
9562 const VkRenderingFragmentShadingRateAttachmentInfoKHR *fsr_info =
9563 vk_find_struct_const(pRenderingInfo->pNext, RENDERING_FRAGMENT_SHADING_RATE_ATTACHMENT_INFO_KHR);
9564 struct radv_attachment vrs_att = {.iview = NULL};
9565 VkExtent2D vrs_texel_size = {.width = 0};
9566 if (fsr_info && fsr_info->imageView) {
9567 VK_FROM_HANDLE(radv_image_view, iview, fsr_info->imageView);
9568 vrs_att = (struct radv_attachment){
9569 .format = iview->vk.format,
9570 .iview = iview,
9571 .layout = fsr_info->imageLayout,
9572 };
9573 vrs_texel_size = fsr_info->shadingRateAttachmentTexelSize;
9574 }
9575
9576 /* Now that we've done any layout transitions which may invoke meta, we can
9577 * fill out the actual rendering info and set up for the client's render pass.
9578 */
9579 radv_cmd_buffer_reset_rendering(cmd_buffer);
9580
9581 struct radv_rendering_state *render = &cmd_buffer->state.render;
9582 render->active = true;
9583 render->has_image_views = true;
9584 render->has_input_attachment_no_concurrent_writes =
9585 !!(pRenderingInfo->flags & VK_RENDERING_INPUT_ATTACHMENT_NO_CONCURRENT_WRITES_BIT_MESA);
9586 render->area = pRenderingInfo->renderArea;
9587 render->view_mask = pRenderingInfo->viewMask;
9588 render->layer_count = pRenderingInfo->layerCount;
9589 render->color_samples = color_samples;
9590 render->ds_samples = ds_samples;
9591 render->max_samples = MAX2(color_samples, ds_samples);
9592 render->sample_locations = sample_locations;
9593 render->color_att_count = pRenderingInfo->colorAttachmentCount;
9594 typed_memcpy(render->color_att, color_att, render->color_att_count);
9595 render->ds_att = ds_att;
9596 render->ds_att_aspects = ds_att_aspects;
9597 render->vrs_att = vrs_att;
9598 render->vrs_texel_size = vrs_texel_size;
9599 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_FRAMEBUFFER | RADV_CMD_DIRTY_FBFETCH_OUTPUT;
9600
9601 if (pdev->info.rbplus_allowed)
9602 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_RBPLUS;
9603
9604 cmd_buffer->state.dirty_dynamic |= RADV_DYNAMIC_DEPTH_BIAS | RADV_DYNAMIC_STENCIL_TEST_ENABLE;
9605 if (pdev->info.gfx_level >= GFX12)
9606 cmd_buffer->state.dirty_dynamic |= RADV_DYNAMIC_RASTERIZATION_SAMPLES;
9607
9608 if (render->vrs_att.iview && pdev->info.gfx_level == GFX10_3) {
9609 if (render->ds_att.iview &&
9610 radv_htile_enabled(render->ds_att.iview->image, render->ds_att.iview->vk.base_mip_level)) {
9611 /* When we have a VRS attachment and a depth/stencil attachment, we just need to copy the
9612 * VRS rates to the HTILE buffer of the attachment.
9613 */
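 /* The HTILE metadata of this level is wrapped in a temporary radv_buffer below so that
  * radv_copy_vrs_htile() can write it like a regular buffer.
  */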
9614 struct radv_image_view *ds_iview = render->ds_att.iview;
9615 struct radv_image *ds_image = ds_iview->image;
9616 uint32_t level = ds_iview->vk.base_mip_level;
9617
9618 /* HTILE buffer */
9619 uint64_t htile_offset = ds_image->bindings[0].offset + ds_image->planes[0].surface.meta_offset +
9620 ds_image->planes[0].surface.u.gfx9.meta_levels[level].offset;
9621 uint64_t htile_size = ds_image->planes[0].surface.u.gfx9.meta_levels[level].size;
9622 struct radv_buffer htile_buffer;
9623
9624 radv_buffer_init(&htile_buffer, device, ds_image->bindings[0].bo, htile_size, htile_offset);
9625
9626 assert(render->area.offset.x + render->area.extent.width <= ds_image->vk.extent.width &&
9627 render->area.offset.y + render->area.extent.height <= ds_image->vk.extent.height);
9628
9629 /* Copy the VRS rates to the HTILE buffer. */
9630 radv_copy_vrs_htile(cmd_buffer, render->vrs_att.iview, &render->area, ds_image, &htile_buffer, true);
9631
9632 radv_buffer_finish(&htile_buffer);
9633 } else {
9634 /* When a subpass uses a VRS attachment without binding a depth/stencil attachment, or when
9635 * HTILE isn't enabled, we use a fallback that copies the VRS rates to our internal HTILE buffer.
9636 */
9637 struct radv_image *ds_image = radv_cmd_buffer_get_vrs_image(cmd_buffer);
9638
9639 if (ds_image && render->area.offset.x < ds_image->vk.extent.width &&
9640 render->area.offset.y < ds_image->vk.extent.height) {
9641 /* HTILE buffer */
9642 struct radv_buffer *htile_buffer = device->vrs.buffer;
9643
9644 VkRect2D area = render->area;
9645 area.extent.width = MIN2(area.extent.width, ds_image->vk.extent.width - area.offset.x);
9646 area.extent.height = MIN2(area.extent.height, ds_image->vk.extent.height - area.offset.y);
9647
9648 /* Copy the VRS rates to the HTILE buffer. */
9649 radv_copy_vrs_htile(cmd_buffer, render->vrs_att.iview, &area, ds_image, htile_buffer, false);
9650 }
9651 }
9652 }
9653
9654 const uint32_t minx = render->area.offset.x;
9655 const uint32_t miny = render->area.offset.y;
9656 const uint32_t maxx = minx + render->area.extent.width;
9657 const uint32_t maxy = miny + render->area.extent.height;
9658
9659 radeon_check_space(device->ws, cmd_buffer->cs, 6);
9660
9661 if (pdev->info.gfx_level >= GFX12) {
9662 radeon_set_context_reg(cmd_buffer->cs, R_028204_PA_SC_WINDOW_SCISSOR_TL,
9663 S_028204_TL_X(minx) | S_028204_TL_Y_GFX12(miny));
9664 radeon_set_context_reg(cmd_buffer->cs, R_028208_PA_SC_WINDOW_SCISSOR_BR,
9665 S_028208_BR_X(maxx - 1) | S_028208_BR_Y(maxy - 1)); /* inclusive */
9666 } else {
9667 radeon_set_context_reg(cmd_buffer->cs, R_028204_PA_SC_WINDOW_SCISSOR_TL,
9668 S_028204_TL_X(minx) | S_028204_TL_Y_GFX6(miny));
9669 radeon_set_context_reg(cmd_buffer->cs, R_028208_PA_SC_WINDOW_SCISSOR_BR,
9670 S_028208_BR_X(maxx) | S_028208_BR_Y(maxy));
9671 }
9672
9673 radv_emit_fb_mip_change_flush(cmd_buffer);
9674
9675 if (!(pRenderingInfo->flags & VK_RENDERING_RESUMING_BIT))
9676 radv_cmd_buffer_clear_rendering(cmd_buffer, pRenderingInfo);
9677 }
9678
9679 VKAPI_ATTR void VKAPI_CALL
9680 radv_CmdEndRendering(VkCommandBuffer commandBuffer)
9681 {
9682 VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
9683
9684 radv_mark_noncoherent_rb(cmd_buffer);
9685 radv_cmd_buffer_resolve_rendering(cmd_buffer);
9686 radv_cmd_buffer_reset_rendering(cmd_buffer);
9687 }
9688
9689 static void
9690 radv_emit_view_index_per_stage(struct radeon_cmdbuf *cs, const struct radv_shader *shader, uint32_t base_reg,
9691 unsigned index)
9692 {
9693 const uint32_t view_index_offset = radv_get_user_sgpr_loc(shader, AC_UD_VIEW_INDEX);
9694
9695 if (!view_index_offset)
9696 return;
9697
9698 radeon_set_sh_reg(cs, view_index_offset, index);
9699 }
9700
9701 static void
9702 radv_emit_view_index(const struct radv_cmd_state *cmd_state, struct radeon_cmdbuf *cs, unsigned index)
9703 {
9704 radv_foreach_stage(stage, cmd_state->active_stages & ~VK_SHADER_STAGE_TASK_BIT_EXT)
9705 {
9706 const struct radv_shader *shader = radv_get_shader(cmd_state->shaders, stage);
9707
9708 radv_emit_view_index_per_stage(cs, shader, shader->info.user_data_0, index);
9709 }
9710
9711 if (cmd_state->gs_copy_shader) {
9712 radv_emit_view_index_per_stage(cs, cmd_state->gs_copy_shader, R_00B130_SPI_SHADER_USER_DATA_VS_0, index);
9713 }
9714 }
9715
9716 /**
9717 * Emulates predication for MEC using COND_EXEC.
9718 * When the current command buffer is predicating, emit a COND_EXEC packet
9719 * so that the MEC skips the next few dwords worth of packets.
9720 *
9721 * To make it work with inverted conditional rendering, we allocate
9722 * space in the upload BO and emit some packets to invert the condition.
9723 */
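/*
 * Rough sketch of the emitted stream for the inverted case (illustrative only,
 * the exact dword counts come from radv_emit_cond_exec()):
 *
 *   COPY_DATA  imm 1 -> inv_va         ; assume the API condition is 0
 *   COND_EXEC  predication_va, 6       ; if the API value is 0, skip the next COPY_DATA
 *   COPY_DATA  imm 0 -> inv_va         ; API value != 0, so the inverted value becomes 0
 *   COND_EXEC  inv_va, dwords          ; predicate the caller's packets on the inverted value
 */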
9724 static void
9725 radv_cs_emit_compute_predication(const struct radv_device *device, struct radv_cmd_state *state,
9726 struct radeon_cmdbuf *cs, uint64_t inv_va, bool *inv_emitted, unsigned dwords)
9727 {
9728 const struct radv_physical_device *pdev = radv_device_physical(device);
9729
9730 if (!state->predicating)
9731 return;
9732
9733 uint64_t va = state->predication_va;
9734
9735 if (!state->predication_type) {
9736 /* Invert the condition the first time it is needed. */
9737 if (!*inv_emitted) {
9738 const enum amd_gfx_level gfx_level = pdev->info.gfx_level;
9739
9740 *inv_emitted = true;
9741
9742 /* Write 1 to the inverted predication VA. */
9743 radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
9744 radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_IMM) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) |
9745 COPY_DATA_WR_CONFIRM | (gfx_level == GFX6 ? COPY_DATA_ENGINE_PFP : 0));
9746 radeon_emit(cs, 1);
9747 radeon_emit(cs, 0);
9748 radeon_emit(cs, inv_va);
9749 radeon_emit(cs, inv_va >> 32);
9750
9751 /* If the API predication VA == 0, skip next command. */
9752 radv_emit_cond_exec(device, cs, va, 6 /* 1x COPY_DATA size */);
9753
9754 /* Write 0 to the new predication VA (when the API condition != 0) */
9755 radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
9756 radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_IMM) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) |
9757 COPY_DATA_WR_CONFIRM | (gfx_level == GFX6 ? COPY_DATA_ENGINE_PFP : 0));
9758 radeon_emit(cs, 0);
9759 radeon_emit(cs, 0);
9760 radeon_emit(cs, inv_va);
9761 radeon_emit(cs, inv_va >> 32);
9762 }
9763
9764 va = inv_va;
9765 }
9766
9767 radv_emit_cond_exec(device, cs, va, dwords);
9768 }
9769
9770 static void
9771 radv_cs_emit_draw_packet(struct radv_cmd_buffer *cmd_buffer, uint32_t vertex_count, uint32_t use_opaque)
9772 {
9773 radeon_emit(cmd_buffer->cs, PKT3(PKT3_DRAW_INDEX_AUTO, 1, cmd_buffer->state.predicating));
9774 radeon_emit(cmd_buffer->cs, vertex_count);
9775 radeon_emit(cmd_buffer->cs, V_0287F0_DI_SRC_SEL_AUTO_INDEX | use_opaque);
9776 }
9777
9778 /**
9779 * Emit a PKT3_DRAW_INDEX_2 packet to render "index_count" vertices.
9780 *
9781 * The starting address "index_va" may point anywhere within the index buffer. The number of
9782 * indexes allocated in the index buffer *past that point* is specified by "max_index_count".
9783 * Hardware uses this information to return 0 for out-of-bounds reads.
9784 */
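/*
 * For example, radv_emit_draw_packets_indexed() below derives the arguments as:
 *
 *   index_va        = state->index_va + firstIndex * index_size;
 *   max_index_count = MAX2(state->max_index_count, firstIndex) - firstIndex;
 *
 * so that indices read past the end of the bound index buffer return 0.
 */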
9785 static void
9786 radv_cs_emit_draw_indexed_packet(struct radv_cmd_buffer *cmd_buffer, uint64_t index_va, uint32_t max_index_count,
9787 uint32_t index_count, bool not_eop)
9788 {
9789 radeon_emit(cmd_buffer->cs, PKT3(PKT3_DRAW_INDEX_2, 4, cmd_buffer->state.predicating));
9790 radeon_emit(cmd_buffer->cs, max_index_count);
9791 radeon_emit(cmd_buffer->cs, index_va);
9792 radeon_emit(cmd_buffer->cs, index_va >> 32);
9793 radeon_emit(cmd_buffer->cs, index_count);
9794 /* NOT_EOP allows merging multiple draws into 1 wave, but only user VGPRs
9795 * can be changed between draws and GS fast launch must be disabled.
9796 * NOT_EOP doesn't work on gfx6-gfx9 and gfx12.
9797 */
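 /* In practice, callers only set not_eop on GFX10-GFX11 when the draw id is unused and the
  * vertex offset doesn't change between draws (see "can_eop" in radv_emit_draw_packets_indexed()).
  */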
9798 radeon_emit(cmd_buffer->cs, V_0287F0_DI_SRC_SEL_DMA | S_0287F0_NOT_EOP(not_eop));
9799 }
9800
9801 /* MUST inline this function to avoid massive perf loss in drawoverhead */
9802 ALWAYS_INLINE static void
9803 radv_cs_emit_indirect_draw_packet(struct radv_cmd_buffer *cmd_buffer, bool indexed, uint32_t draw_count,
9804 uint64_t count_va, uint32_t stride)
9805 {
9806 struct radeon_cmdbuf *cs = cmd_buffer->cs;
9807 const unsigned di_src_sel = indexed ? V_0287F0_DI_SRC_SEL_DMA : V_0287F0_DI_SRC_SEL_AUTO_INDEX;
9808 bool draw_id_enable = cmd_buffer->state.uses_drawid;
9809 uint32_t base_reg = cmd_buffer->state.vtx_base_sgpr;
9810 uint32_t vertex_offset_reg, start_instance_reg = 0, draw_id_reg = 0;
9811 bool predicating = cmd_buffer->state.predicating;
9812 assert(base_reg);
9813
9814 /* just reset draw state for vertex data */
9815 cmd_buffer->state.last_first_instance = -1;
9816 cmd_buffer->state.last_num_instances = -1;
9817 cmd_buffer->state.last_drawid = -1;
9818 cmd_buffer->state.last_vertex_offset_valid = false;
9819
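 /* User SGPR layout starting at vtx_base_sgpr: vertex offset, then the draw id (if used) at +4,
  * then the base instance. The *_reg values are dword offsets from SI_SH_REG_OFFSET, which is
  * what the indirect draw packets expect.
  */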
9820 vertex_offset_reg = (base_reg - SI_SH_REG_OFFSET) >> 2;
9821 if (cmd_buffer->state.uses_baseinstance)
9822 start_instance_reg = ((base_reg + (draw_id_enable ? 8 : 4)) - SI_SH_REG_OFFSET) >> 2;
9823 if (draw_id_enable)
9824 draw_id_reg = ((base_reg + 4) - SI_SH_REG_OFFSET) >> 2;
9825
9826 if (draw_count == 1 && !count_va && !draw_id_enable) {
9827 radeon_emit(cs, PKT3(indexed ? PKT3_DRAW_INDEX_INDIRECT : PKT3_DRAW_INDIRECT, 3, predicating));
9828 radeon_emit(cs, 0);
9829 radeon_emit(cs, vertex_offset_reg);
9830 radeon_emit(cs, start_instance_reg);
9831 radeon_emit(cs, di_src_sel);
9832 } else {
9833 radeon_emit(cs, PKT3(indexed ? PKT3_DRAW_INDEX_INDIRECT_MULTI : PKT3_DRAW_INDIRECT_MULTI, 8, predicating));
9834 radeon_emit(cs, 0);
9835 radeon_emit(cs, vertex_offset_reg);
9836 radeon_emit(cs, start_instance_reg);
9837 radeon_emit(cs, draw_id_reg | S_2C3_DRAW_INDEX_ENABLE(draw_id_enable) | S_2C3_COUNT_INDIRECT_ENABLE(!!count_va));
9838 radeon_emit(cs, draw_count); /* count */
9839 radeon_emit(cs, count_va); /* count_addr */
9840 radeon_emit(cs, count_va >> 32);
9841 radeon_emit(cs, stride); /* stride */
9842 radeon_emit(cs, di_src_sel);
9843 }
9844
9845 cmd_buffer->state.uses_draw_indirect = true;
9846 }
9847
9848 ALWAYS_INLINE static void
9849 radv_cs_emit_indirect_mesh_draw_packet(struct radv_cmd_buffer *cmd_buffer, uint32_t draw_count, uint64_t count_va,
9850 uint32_t stride)
9851 {
9852 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
9853 const struct radv_physical_device *pdev = radv_device_physical(device);
9854 const struct radv_shader *mesh_shader = cmd_buffer->state.shaders[MESA_SHADER_MESH];
9855 struct radeon_cmdbuf *cs = cmd_buffer->cs;
9856 uint32_t base_reg = cmd_buffer->state.vtx_base_sgpr;
9857 bool predicating = cmd_buffer->state.predicating;
9858 assert(base_reg || (!cmd_buffer->state.uses_drawid && !mesh_shader->info.cs.uses_grid_size));
9859
9860 /* Reset draw state. */
9861 cmd_buffer->state.last_first_instance = -1;
9862 cmd_buffer->state.last_num_instances = -1;
9863 cmd_buffer->state.last_drawid = -1;
9864 cmd_buffer->state.last_vertex_offset_valid = false;
9865
9866 uint32_t xyz_dim_enable = mesh_shader->info.cs.uses_grid_size;
9867 uint32_t xyz_dim_reg = !xyz_dim_enable ? 0 : (base_reg - SI_SH_REG_OFFSET) >> 2;
9868 uint32_t draw_id_enable = !!cmd_buffer->state.uses_drawid;
9869 uint32_t draw_id_reg = !draw_id_enable ? 0 : (base_reg + (xyz_dim_enable ? 12 : 0) - SI_SH_REG_OFFSET) >> 2;
9870
9871 uint32_t mode1_enable = !pdev->mesh_fast_launch_2;
9872
9873 radeon_emit(cs, PKT3(PKT3_DISPATCH_MESH_INDIRECT_MULTI, 7, predicating) | PKT3_RESET_FILTER_CAM_S(1));
9874 radeon_emit(cs, 0); /* data_offset */
9875 radeon_emit(cs, S_4C1_XYZ_DIM_REG(xyz_dim_reg) | S_4C1_DRAW_INDEX_REG(draw_id_reg));
9876 if (pdev->info.gfx_level >= GFX11)
9877 radeon_emit(cs, S_4C2_DRAW_INDEX_ENABLE(draw_id_enable) | S_4C2_COUNT_INDIRECT_ENABLE(!!count_va) |
9878 S_4C2_XYZ_DIM_ENABLE(xyz_dim_enable) | S_4C2_MODE1_ENABLE(mode1_enable));
9879 else
9880 radeon_emit(cs, S_4C2_DRAW_INDEX_ENABLE(draw_id_enable) | S_4C2_COUNT_INDIRECT_ENABLE(!!count_va));
9881 radeon_emit(cs, draw_count);
9882 radeon_emit(cs, count_va);
9883 radeon_emit(cs, count_va >> 32);
9884 radeon_emit(cs, stride);
9885 radeon_emit(cs, V_0287F0_DI_SRC_SEL_AUTO_INDEX);
9886 }
9887
9888 ALWAYS_INLINE static void
9889 radv_cs_emit_dispatch_taskmesh_direct_ace_packet(const struct radv_device *device,
9890 const struct radv_cmd_state *cmd_state, struct radeon_cmdbuf *ace_cs,
9891 const uint32_t x, const uint32_t y, const uint32_t z)
9892 {
9893 const struct radv_shader *task_shader = cmd_state->shaders[MESA_SHADER_TASK];
9894 const bool predicating = cmd_state->predicating;
9895 const uint32_t dispatch_initiator =
9896 device->dispatch_initiator_task | S_00B800_CS_W32_EN(task_shader->info.wave_size == 32);
9897 const uint32_t ring_entry_reg = radv_get_user_sgpr(task_shader, AC_UD_TASK_RING_ENTRY);
9898
9899 radeon_emit(ace_cs, PKT3(PKT3_DISPATCH_TASKMESH_DIRECT_ACE, 4, predicating) | PKT3_SHADER_TYPE_S(1));
9900 radeon_emit(ace_cs, x);
9901 radeon_emit(ace_cs, y);
9902 radeon_emit(ace_cs, z);
9903 radeon_emit(ace_cs, dispatch_initiator);
9904 radeon_emit(ace_cs, ring_entry_reg & 0xFFFF);
9905 }
9906
9907 ALWAYS_INLINE static void
9908 radv_cs_emit_dispatch_taskmesh_indirect_multi_ace_packet(const struct radv_device *device,
9909 const struct radv_cmd_state *cmd_state,
9910 struct radeon_cmdbuf *ace_cs, uint64_t data_va,
9911 uint32_t draw_count, uint64_t count_va, uint32_t stride)
9912 {
9913 assert((data_va & 0x03) == 0);
9914 assert((count_va & 0x03) == 0);
9915
9916 const struct radv_shader *task_shader = cmd_state->shaders[MESA_SHADER_TASK];
9917
9918 const uint32_t dispatch_initiator =
9919 device->dispatch_initiator_task | S_00B800_CS_W32_EN(task_shader->info.wave_size == 32);
9920 const uint32_t ring_entry_reg = radv_get_user_sgpr(task_shader, AC_UD_TASK_RING_ENTRY);
9921 const uint32_t xyz_dim_reg = radv_get_user_sgpr(task_shader, AC_UD_CS_GRID_SIZE);
9922 const uint32_t draw_id_reg = radv_get_user_sgpr(task_shader, AC_UD_CS_TASK_DRAW_ID);
9923
9924 radeon_emit(ace_cs, PKT3(PKT3_DISPATCH_TASKMESH_INDIRECT_MULTI_ACE, 9, 0) | PKT3_SHADER_TYPE_S(1));
9925 radeon_emit(ace_cs, data_va);
9926 radeon_emit(ace_cs, data_va >> 32);
9927 radeon_emit(ace_cs, S_AD2_RING_ENTRY_REG(ring_entry_reg));
9928 radeon_emit(ace_cs, S_AD3_COUNT_INDIRECT_ENABLE(!!count_va) | S_AD3_DRAW_INDEX_ENABLE(!!draw_id_reg) |
9929 S_AD3_XYZ_DIM_ENABLE(!!xyz_dim_reg) | S_AD3_DRAW_INDEX_REG(draw_id_reg));
9930 radeon_emit(ace_cs, S_AD4_XYZ_DIM_REG(xyz_dim_reg));
9931 radeon_emit(ace_cs, draw_count);
9932 radeon_emit(ace_cs, count_va);
9933 radeon_emit(ace_cs, count_va >> 32);
9934 radeon_emit(ace_cs, stride);
9935 radeon_emit(ace_cs, dispatch_initiator);
9936 }
9937
9938 ALWAYS_INLINE static void
9939 radv_cs_emit_dispatch_taskmesh_gfx_packet(const struct radv_device *device, const struct radv_cmd_state *cmd_state,
9940 struct radeon_cmdbuf *cs)
9941 {
9942 const struct radv_physical_device *pdev = radv_device_physical(device);
9943 const struct radv_shader *mesh_shader = cmd_state->shaders[MESA_SHADER_MESH];
9944 const bool predicating = cmd_state->predicating;
9945
9946 const uint32_t ring_entry_reg = radv_get_user_sgpr(mesh_shader, AC_UD_TASK_RING_ENTRY);
9947
9948 uint32_t xyz_dim_en = mesh_shader->info.cs.uses_grid_size;
9949 uint32_t xyz_dim_reg = !xyz_dim_en ? 0 : (cmd_state->vtx_base_sgpr - SI_SH_REG_OFFSET) >> 2;
9950 uint32_t mode1_en = !pdev->mesh_fast_launch_2;
9951 uint32_t linear_dispatch_en = cmd_state->shaders[MESA_SHADER_TASK]->info.cs.linear_taskmesh_dispatch;
9952 const bool sqtt_en = !!device->sqtt.bo;
9953
9954 radeon_emit(cs, PKT3(PKT3_DISPATCH_TASKMESH_GFX, 2, predicating) | PKT3_RESET_FILTER_CAM_S(1));
9955 radeon_emit(cs, S_4D0_RING_ENTRY_REG(ring_entry_reg) | S_4D0_XYZ_DIM_REG(xyz_dim_reg));
9956 if (pdev->info.gfx_level >= GFX11)
9957 radeon_emit(cs, S_4D1_XYZ_DIM_ENABLE(xyz_dim_en) | S_4D1_MODE1_ENABLE(mode1_en) |
9958 S_4D1_LINEAR_DISPATCH_ENABLE(linear_dispatch_en) | S_4D1_THREAD_TRACE_MARKER_ENABLE(sqtt_en));
9959 else
9960 radeon_emit(cs, S_4D1_THREAD_TRACE_MARKER_ENABLE(sqtt_en));
9961 radeon_emit(cs, V_0287F0_DI_SRC_SEL_AUTO_INDEX);
9962 }
9963
9964 ALWAYS_INLINE static void
9965 radv_emit_userdata_vertex_internal(struct radv_cmd_buffer *cmd_buffer, const struct radv_draw_info *info,
9966 const uint32_t vertex_offset)
9967 {
9968 struct radv_cmd_state *state = &cmd_buffer->state;
9969 struct radeon_cmdbuf *cs = cmd_buffer->cs;
9970 const bool uses_baseinstance = state->uses_baseinstance;
9971 const bool uses_drawid = state->uses_drawid;
9972
9973 radeon_set_sh_reg_seq(cs, state->vtx_base_sgpr, state->vtx_emit_num);
9974
9975 radeon_emit(cs, vertex_offset);
9976 state->last_vertex_offset_valid = true;
9977 state->last_vertex_offset = vertex_offset;
9978 if (uses_drawid) {
9979 radeon_emit(cs, 0);
9980 state->last_drawid = 0;
9981 }
9982 if (uses_baseinstance) {
9983 radeon_emit(cs, info->first_instance);
9984 state->last_first_instance = info->first_instance;
9985 }
9986 }
9987
9988 ALWAYS_INLINE static void
9989 radv_emit_userdata_vertex(struct radv_cmd_buffer *cmd_buffer, const struct radv_draw_info *info,
9990 const uint32_t vertex_offset)
9991 {
9992 const struct radv_cmd_state *state = &cmd_buffer->state;
9993 const bool uses_baseinstance = state->uses_baseinstance;
9994 const bool uses_drawid = state->uses_drawid;
9995
9996 if (!state->last_vertex_offset_valid || vertex_offset != state->last_vertex_offset ||
9997 (uses_drawid && 0 != state->last_drawid) ||
9998 (uses_baseinstance && info->first_instance != state->last_first_instance))
9999 radv_emit_userdata_vertex_internal(cmd_buffer, info, vertex_offset);
10000 }
10001
10002 ALWAYS_INLINE static void
10003 radv_emit_userdata_vertex_drawid(struct radv_cmd_buffer *cmd_buffer, uint32_t vertex_offset, uint32_t drawid)
10004 {
10005 struct radv_cmd_state *state = &cmd_buffer->state;
10006 struct radeon_cmdbuf *cs = cmd_buffer->cs;
10007 radeon_set_sh_reg_seq(cs, state->vtx_base_sgpr, 1 + !!drawid);
10008 radeon_emit(cs, vertex_offset);
10009 state->last_vertex_offset_valid = true;
10010 state->last_vertex_offset = vertex_offset;
10011 if (drawid)
10012 radeon_emit(cs, drawid);
10013 }
10014
10015 ALWAYS_INLINE static void
10016 radv_emit_userdata_mesh(struct radv_cmd_buffer *cmd_buffer, const uint32_t x, const uint32_t y, const uint32_t z)
10017 {
10018 struct radv_cmd_state *state = &cmd_buffer->state;
10019 const struct radv_shader *mesh_shader = state->shaders[MESA_SHADER_MESH];
10020 struct radeon_cmdbuf *cs = cmd_buffer->cs;
10021 const bool uses_drawid = state->uses_drawid;
10022 const bool uses_grid_size = mesh_shader->info.cs.uses_grid_size;
10023
10024 if (!uses_drawid && !uses_grid_size)
10025 return;
10026
10027 radeon_set_sh_reg_seq(cs, state->vtx_base_sgpr, state->vtx_emit_num);
10028 if (uses_grid_size) {
10029 radeon_emit(cs, x);
10030 radeon_emit(cs, y);
10031 radeon_emit(cs, z);
10032 }
10033 if (uses_drawid) {
10034 radeon_emit(cs, 0);
10035 state->last_drawid = 0;
10036 }
10037 }
10038
10039 ALWAYS_INLINE static void
10040 radv_emit_userdata_task(const struct radv_cmd_state *cmd_state, struct radeon_cmdbuf *ace_cs, uint32_t x, uint32_t y,
10041 uint32_t z)
10042 {
10043 const struct radv_shader *task_shader = cmd_state->shaders[MESA_SHADER_TASK];
10044
10045 const uint32_t xyz_offset = radv_get_user_sgpr_loc(task_shader, AC_UD_CS_GRID_SIZE);
10046 const uint32_t draw_id_offset = radv_get_user_sgpr_loc(task_shader, AC_UD_CS_TASK_DRAW_ID);
10047
10048 if (xyz_offset) {
10049 radeon_set_sh_reg_seq(ace_cs, xyz_offset, 3);
10050 radeon_emit(ace_cs, x);
10051 radeon_emit(ace_cs, y);
10052 radeon_emit(ace_cs, z);
10053 }
10054
10055 if (draw_id_offset) {
10056 radeon_set_sh_reg_seq(ace_cs, draw_id_offset, 1);
10057 radeon_emit(ace_cs, 0);
10058 }
10059 }
10060
10061 ALWAYS_INLINE static void
10062 radv_emit_draw_packets_indexed(struct radv_cmd_buffer *cmd_buffer, const struct radv_draw_info *info,
10063 uint32_t drawCount, const VkMultiDrawIndexedInfoEXT *minfo, uint32_t stride,
10064 const int32_t *vertexOffset)
10065
10066 {
10067 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
10068 const struct radv_physical_device *pdev = radv_device_physical(device);
10069 struct radv_cmd_state *state = &cmd_buffer->state;
10070 struct radeon_cmdbuf *cs = cmd_buffer->cs;
10071 const int index_size = radv_get_vgt_index_size(state->index_type);
10072 unsigned i = 0;
10073 const bool uses_drawid = state->uses_drawid;
10074 const bool can_eop = !uses_drawid && pdev->info.gfx_level >= GFX10 && pdev->info.gfx_level < GFX12;
10075
10076 if (uses_drawid) {
10077 if (vertexOffset) {
10078 radv_emit_userdata_vertex(cmd_buffer, info, *vertexOffset);
10079 vk_foreach_multi_draw_indexed (draw, i, minfo, drawCount, stride) {
10080 uint32_t remaining_indexes = MAX2(state->max_index_count, draw->firstIndex) - draw->firstIndex;
10081 uint64_t index_va = state->index_va + draw->firstIndex * index_size;
10082
10083 /* Handle draw calls with 0-sized index buffers if the GPU can't support them. */
10084 if (!remaining_indexes && pdev->info.has_zero_index_buffer_bug)
10085 radv_handle_zero_index_buffer_bug(cmd_buffer, &index_va, &remaining_indexes);
10086
10087 if (i > 0)
10088 radeon_set_sh_reg(cs, state->vtx_base_sgpr + sizeof(uint32_t), i);
10089
10090 if (!state->render.view_mask) {
10091 radv_cs_emit_draw_indexed_packet(cmd_buffer, index_va, remaining_indexes, draw->indexCount, false);
10092 } else {
10093 u_foreach_bit (view, state->render.view_mask) {
10094 radv_emit_view_index(&cmd_buffer->state, cmd_buffer->cs, view);
10095
10096 radv_cs_emit_draw_indexed_packet(cmd_buffer, index_va, remaining_indexes, draw->indexCount, false);
10097 }
10098 }
10099 }
10100 } else {
10101 vk_foreach_multi_draw_indexed (draw, i, minfo, drawCount, stride) {
10102 uint32_t remaining_indexes = MAX2(state->max_index_count, draw->firstIndex) - draw->firstIndex;
10103 uint64_t index_va = state->index_va + draw->firstIndex * index_size;
10104
10105 /* Handle draw calls with 0-sized index buffers if the GPU can't support them. */
10106 if (!remaining_indexes && pdev->info.has_zero_index_buffer_bug)
10107 radv_handle_zero_index_buffer_bug(cmd_buffer, &index_va, &remaining_indexes);
10108
10109 if (i > 0) {
10110 assert(state->last_vertex_offset_valid);
10111 if (state->last_vertex_offset != draw->vertexOffset)
10112 radv_emit_userdata_vertex_drawid(cmd_buffer, draw->vertexOffset, i);
10113 else
10114 radeon_set_sh_reg(cs, state->vtx_base_sgpr + sizeof(uint32_t), i);
10115 } else
10116 radv_emit_userdata_vertex(cmd_buffer, info, draw->vertexOffset);
10117
10118 if (!state->render.view_mask) {
10119 radv_cs_emit_draw_indexed_packet(cmd_buffer, index_va, remaining_indexes, draw->indexCount, false);
10120 } else {
10121 u_foreach_bit (view, state->render.view_mask) {
10122 radv_emit_view_index(&cmd_buffer->state, cmd_buffer->cs, view);
10123
10124 radv_cs_emit_draw_indexed_packet(cmd_buffer, index_va, remaining_indexes, draw->indexCount, false);
10125 }
10126 }
10127 }
10128 }
10129 if (drawCount > 1) {
10130 state->last_drawid = drawCount - 1;
10131 }
10132 } else {
10133 if (vertexOffset) {
10134 if (pdev->info.gfx_level == GFX10) {
10135 /* GFX10 has a bug: when consecutive draw packets use NOT_EOP, the last
10136 * draw (the one without NOT_EOP) must not have count == 0.
10137 */
10138 while (drawCount > 1) {
10139 const VkMultiDrawIndexedInfoEXT *last =
10140 (const VkMultiDrawIndexedInfoEXT *)(((const uint8_t *)minfo) + (drawCount - 1) * stride);
10141 if (last->indexCount)
10142 break;
10143 drawCount--;
10144 }
10145 }
10146
10147 radv_emit_userdata_vertex(cmd_buffer, info, *vertexOffset);
10148 vk_foreach_multi_draw_indexed (draw, i, minfo, drawCount, stride) {
10149 uint32_t remaining_indexes = MAX2(state->max_index_count, draw->firstIndex) - draw->firstIndex;
10150 uint64_t index_va = state->index_va + draw->firstIndex * index_size;
10151
10152 /* Handle draw calls with 0-sized index buffers if the GPU can't support them. */
10153 if (!remaining_indexes && pdev->info.has_zero_index_buffer_bug)
10154 radv_handle_zero_index_buffer_bug(cmd_buffer, &index_va, &remaining_indexes);
10155
10156 if (!state->render.view_mask) {
10157 radv_cs_emit_draw_indexed_packet(cmd_buffer, index_va, remaining_indexes, draw->indexCount,
10158 can_eop && i < drawCount - 1);
10159 } else {
10160 u_foreach_bit (view, state->render.view_mask) {
10161 radv_emit_view_index(&cmd_buffer->state, cmd_buffer->cs, view);
10162
10163 radv_cs_emit_draw_indexed_packet(cmd_buffer, index_va, remaining_indexes, draw->indexCount, false);
10164 }
10165 }
10166 }
10167 } else {
10168 vk_foreach_multi_draw_indexed (draw, i, minfo, drawCount, stride) {
10169 uint32_t remaining_indexes = MAX2(state->max_index_count, draw->firstIndex) - draw->firstIndex;
10170 uint64_t index_va = state->index_va + draw->firstIndex * index_size;
10171
10172 /* Handle draw calls with 0-sized index buffers if the GPU can't support them. */
10173 if (!remaining_indexes && pdev->info.has_zero_index_buffer_bug)
10174 radv_handle_zero_index_buffer_bug(cmd_buffer, &index_va, &remaining_indexes);
10175
10176 const VkMultiDrawIndexedInfoEXT *next =
10177 (const VkMultiDrawIndexedInfoEXT *)(i < drawCount - 1 ? ((uint8_t *)draw + stride) : NULL);
10178 const bool offset_changes = next && next->vertexOffset != draw->vertexOffset;
10179 radv_emit_userdata_vertex(cmd_buffer, info, draw->vertexOffset);
10180
10181 if (!state->render.view_mask) {
10182 radv_cs_emit_draw_indexed_packet(cmd_buffer, index_va, remaining_indexes, draw->indexCount,
10183 can_eop && !offset_changes && i < drawCount - 1);
10184 } else {
10185 u_foreach_bit (view, state->render.view_mask) {
10186 radv_emit_view_index(&cmd_buffer->state, cmd_buffer->cs, view);
10187
10188 radv_cs_emit_draw_indexed_packet(cmd_buffer, index_va, remaining_indexes, draw->indexCount, false);
10189 }
10190 }
10191 }
10192 }
10193 if (drawCount > 1) {
10194 state->last_drawid = drawCount - 1;
10195 }
10196 }
10197 }
10198
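/* Emit direct (non-indexed) draw packets for each entry of a multi-draw, replaying the draw for every view when multiview is enabled. */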
10199 ALWAYS_INLINE static void
10200 radv_emit_direct_draw_packets(struct radv_cmd_buffer *cmd_buffer, const struct radv_draw_info *info, uint32_t drawCount,
10201 const VkMultiDrawInfoEXT *minfo, uint32_t use_opaque, uint32_t stride)
10202 {
10203 unsigned i = 0;
10204 const uint32_t view_mask = cmd_buffer->state.render.view_mask;
10205 const bool uses_drawid = cmd_buffer->state.uses_drawid;
10206 uint32_t last_start = 0;
10207
10208 vk_foreach_multi_draw (draw, i, minfo, drawCount, stride) {
10209 if (!i)
10210 radv_emit_userdata_vertex(cmd_buffer, info, draw->firstVertex);
10211 else
10212 radv_emit_userdata_vertex_drawid(cmd_buffer, draw->firstVertex, uses_drawid ? i : 0);
10213
10214 if (!view_mask) {
10215 radv_cs_emit_draw_packet(cmd_buffer, draw->vertexCount, use_opaque);
10216 } else {
10217 u_foreach_bit (view, view_mask) {
10218 radv_emit_view_index(&cmd_buffer->state, cmd_buffer->cs, view);
10219 radv_cs_emit_draw_packet(cmd_buffer, draw->vertexCount, use_opaque);
10220 }
10221 }
10222 last_start = draw->firstVertex;
10223 }
10224 if (drawCount > 1) {
10225 struct radv_cmd_state *state = &cmd_buffer->state;
10226 assert(state->last_vertex_offset_valid);
10227 state->last_vertex_offset = last_start;
10228 if (uses_drawid)
10229 state->last_drawid = drawCount - 1;
10230 }
10231 }
10232
10233 static void
10234 radv_cs_emit_mesh_dispatch_packet(struct radv_cmd_buffer *cmd_buffer, uint32_t x, uint32_t y, uint32_t z)
10235 {
10236 radeon_emit(cmd_buffer->cs, PKT3(PKT3_DISPATCH_MESH_DIRECT, 3, cmd_buffer->state.predicating));
10237 radeon_emit(cmd_buffer->cs, x);
10238 radeon_emit(cmd_buffer->cs, y);
10239 radeon_emit(cmd_buffer->cs, z);
10240 radeon_emit(cmd_buffer->cs, S_0287F0_SOURCE_SELECT(V_0287F0_DI_SRC_SEL_AUTO_INDEX));
10241 }
10242
10243 ALWAYS_INLINE static void
10244 radv_emit_direct_mesh_draw_packet(struct radv_cmd_buffer *cmd_buffer, uint32_t x, uint32_t y, uint32_t z)
10245 {
10246 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
10247 const struct radv_physical_device *pdev = radv_device_physical(device);
10248 const uint32_t view_mask = cmd_buffer->state.render.view_mask;
10249
10250 radv_emit_userdata_mesh(cmd_buffer, x, y, z);
10251
10252 if (pdev->mesh_fast_launch_2) {
10253 if (!view_mask) {
10254 radv_cs_emit_mesh_dispatch_packet(cmd_buffer, x, y, z);
10255 } else {
10256 u_foreach_bit (view, view_mask) {
10257 radv_emit_view_index(&cmd_buffer->state, cmd_buffer->cs, view);
10258 radv_cs_emit_mesh_dispatch_packet(cmd_buffer, x, y, z);
10259 }
10260 }
10261 } else {
10262 const uint32_t count = x * y * z;
10263 if (!view_mask) {
10264 radv_cs_emit_draw_packet(cmd_buffer, count, 0);
10265 } else {
10266 u_foreach_bit (view, view_mask) {
10267 radv_emit_view_index(&cmd_buffer->state, cmd_buffer->cs, view);
10268 radv_cs_emit_draw_packet(cmd_buffer, count, 0);
10269 }
10270 }
10271 }
10272 }
10273
10274 ALWAYS_INLINE static void
10275 radv_emit_indirect_mesh_draw_packets(struct radv_cmd_buffer *cmd_buffer, const struct radv_draw_info *info)
10276 {
10277 const struct radv_cmd_state *state = &cmd_buffer->state;
10278 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
10279 struct radeon_winsys *ws = device->ws;
10280 struct radeon_cmdbuf *cs = cmd_buffer->cs;
10281 const uint64_t va = radv_buffer_get_va(info->indirect->bo) + info->indirect->offset + info->indirect_offset;
10282 const uint64_t count_va = !info->count_buffer ? 0
10283 : radv_buffer_get_va(info->count_buffer->bo) +
10284 info->count_buffer->offset + info->count_buffer_offset;
10285
10286 radv_cs_add_buffer(ws, cs, info->indirect->bo);
10287
10288 if (info->count_buffer) {
10289 radv_cs_add_buffer(ws, cs, info->count_buffer->bo);
10290 }
10291
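/* Set the base address used by the indirect dispatch packet (SET_BASE with base index 1). */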
10292 radeon_emit(cs, PKT3(PKT3_SET_BASE, 2, 0));
10293 radeon_emit(cs, 1);
10294 radeon_emit(cs, va);
10295 radeon_emit(cs, va >> 32);
10296
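/* Zero the draw ID user SGPR; it follows the 3 grid size dwords when the grid size is used. */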
10297 if (state->uses_drawid) {
10298 const struct radv_shader *mesh_shader = state->shaders[MESA_SHADER_MESH];
10299 unsigned reg = state->vtx_base_sgpr + (mesh_shader->info.cs.uses_grid_size ? 12 : 0);
10300 radeon_set_sh_reg_seq(cs, reg, 1);
10301 radeon_emit(cs, 0);
10302 }
10303
10304 if (!state->render.view_mask) {
10305 radv_cs_emit_indirect_mesh_draw_packet(cmd_buffer, info->count, count_va, info->stride);
10306 } else {
10307 u_foreach_bit (i, state->render.view_mask) {
10308 radv_emit_view_index(&cmd_buffer->state, cs, i);
10309 radv_cs_emit_indirect_mesh_draw_packet(cmd_buffer, info->count, count_va, info->stride);
10310 }
10311 }
10312 }
10313
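/* Emit a direct task+mesh draw: the task dispatch goes to the ACE (compute) command stream and a matching GFX dispatch to the graphics stream, once per view. */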
10314 ALWAYS_INLINE static void
10315 radv_emit_direct_taskmesh_draw_packets(const struct radv_device *device, struct radv_cmd_state *cmd_state,
10316 struct radeon_cmdbuf *cs, struct radeon_cmdbuf *ace_cs, uint32_t x, uint32_t y,
10317 uint32_t z)
10318 {
10319 const uint32_t view_mask = cmd_state->render.view_mask;
10320 const unsigned num_views = MAX2(1, util_bitcount(view_mask));
10321 const unsigned ace_predication_size = num_views * 6; /* DISPATCH_TASKMESH_DIRECT_ACE size */
10322
10323 radv_emit_userdata_task(cmd_state, ace_cs, x, y, z);
10324 radv_cs_emit_compute_predication(device, cmd_state, ace_cs, cmd_state->mec_inv_pred_va,
10325 &cmd_state->mec_inv_pred_emitted, ace_predication_size);
10326
10327 if (!view_mask) {
10328 radv_cs_emit_dispatch_taskmesh_direct_ace_packet(device, cmd_state, ace_cs, x, y, z);
10329 radv_cs_emit_dispatch_taskmesh_gfx_packet(device, cmd_state, cs);
10330 } else {
10331 u_foreach_bit (view, view_mask) {
10332 radv_emit_view_index(cmd_state, cs, view);
10333
10334 radv_cs_emit_dispatch_taskmesh_direct_ace_packet(device, cmd_state, ace_cs, x, y, z);
10335 radv_cs_emit_dispatch_taskmesh_gfx_packet(device, cmd_state, cs);
10336 }
10337 }
10338 }
10339
10340 static void
10341 radv_emit_indirect_taskmesh_draw_packets(const struct radv_device *device, struct radv_cmd_state *cmd_state,
10342 struct radeon_cmdbuf *cs, struct radeon_cmdbuf *ace_cs,
10343 const struct radv_draw_info *info, uint64_t workaround_cond_va)
10344 {
10345 const struct radv_physical_device *pdev = radv_device_physical(device);
10346 const uint32_t view_mask = cmd_state->render.view_mask;
10347 struct radeon_winsys *ws = device->ws;
10348 const unsigned num_views = MAX2(1, util_bitcount(view_mask));
10349 unsigned ace_predication_size = num_views * 11; /* DISPATCH_TASKMESH_INDIRECT_MULTI_ACE size */
10350
10351 const uint64_t va = radv_buffer_get_va(info->indirect->bo) + info->indirect->offset + info->indirect_offset;
10352 const uint64_t count_va = !info->count_buffer ? 0
10353 : radv_buffer_get_va(info->count_buffer->bo) +
10354 info->count_buffer->offset + info->count_buffer_offset;
10355
10356 if (count_va)
10357 radv_cs_add_buffer(ws, ace_cs, info->count_buffer->bo);
10358
10359 if (pdev->info.has_taskmesh_indirect0_bug && count_va) {
10360 /* MEC firmware bug workaround.
10361 * When the count buffer contains zero, DISPATCH_TASKMESH_INDIRECT_MULTI_ACE hangs.
10362 * - We must ensure that DISPATCH_TASKMESH_INDIRECT_MULTI_ACE
10363 * is only executed when the count buffer contains non-zero.
10364 * - Furthermore, we must also ensure that each DISPATCH_TASKMESH_GFX packet
10365 * has a matching ACE packet.
10366 *
10367 * As a workaround:
10368 * - Reserve a dword in the upload buffer and initialize it to 1 for the workaround
10369 * - When count != 0, write 0 to the workaround BO and execute the indirect dispatch
10370 * - When workaround BO != 0 (count was 0), execute an empty direct dispatch
10371 */
10372 radeon_emit(ace_cs, PKT3(PKT3_COPY_DATA, 4, 0));
10373 radeon_emit(ace_cs,
10374 COPY_DATA_SRC_SEL(COPY_DATA_IMM) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) | COPY_DATA_WR_CONFIRM);
10375 radeon_emit(ace_cs, 1);
10376 radeon_emit(ace_cs, 0);
10377 radeon_emit(ace_cs, workaround_cond_va);
10378 radeon_emit(ace_cs, workaround_cond_va >> 32);
10379
10380 /* 2x COND_EXEC + 1x COPY_DATA + Nx DISPATCH_TASKMESH_DIRECT_ACE */
10381 ace_predication_size += 2 * 5 + 6 + 6 * num_views;
10382 }
10383
10384 radv_cs_add_buffer(ws, ace_cs, info->indirect->bo);
10385 radv_cs_emit_compute_predication(device, cmd_state, ace_cs, cmd_state->mec_inv_pred_va,
10386 &cmd_state->mec_inv_pred_emitted, ace_predication_size);
10387
10388 if (workaround_cond_va) {
10389 radv_emit_cond_exec(device, ace_cs, count_va,
10390 6 + 11 * num_views /* 1x COPY_DATA + Nx DISPATCH_TASKMESH_INDIRECT_MULTI_ACE */);
10391
10392 radeon_emit(ace_cs, PKT3(PKT3_COPY_DATA, 4, 0));
10393 radeon_emit(ace_cs,
10394 COPY_DATA_SRC_SEL(COPY_DATA_IMM) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) | COPY_DATA_WR_CONFIRM);
10395 radeon_emit(ace_cs, 0);
10396 radeon_emit(ace_cs, 0);
10397 radeon_emit(ace_cs, workaround_cond_va);
10398 radeon_emit(ace_cs, workaround_cond_va >> 32);
10399 }
10400
10401 if (!view_mask) {
10402 radv_cs_emit_dispatch_taskmesh_indirect_multi_ace_packet(device, cmd_state, ace_cs, va, info->count, count_va,
10403 info->stride);
10404 radv_cs_emit_dispatch_taskmesh_gfx_packet(device, cmd_state, cs);
10405 } else {
10406 u_foreach_bit (view, view_mask) {
10407 radv_emit_view_index(cmd_state, cs, view);
10408
10409 radv_cs_emit_dispatch_taskmesh_indirect_multi_ace_packet(device, cmd_state, ace_cs, va, info->count, count_va,
10410 info->stride);
10411 radv_cs_emit_dispatch_taskmesh_gfx_packet(device, cmd_state, cs);
10412 }
10413 }
10414
10415 if (workaround_cond_va) {
10416 radv_emit_cond_exec(device, ace_cs, workaround_cond_va, 6 * num_views /* Nx DISPATCH_TASKMESH_DIRECT_ACE */);
10417
10418 for (unsigned v = 0; v < num_views; ++v) {
10419 radv_cs_emit_dispatch_taskmesh_direct_ace_packet(device, cmd_state, ace_cs, 0, 0, 0);
10420 }
10421 }
10422 }
10423
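/* Emit an indirect draw, optionally with a count buffer, replayed per view when multiview is enabled. */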
10424 static void
10425 radv_emit_indirect_draw_packets(struct radv_cmd_buffer *cmd_buffer, const struct radv_draw_info *info)
10426 {
10427 const struct radv_cmd_state *state = &cmd_buffer->state;
10428 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
10429 struct radeon_winsys *ws = device->ws;
10430 struct radeon_cmdbuf *cs = cmd_buffer->cs;
10431 const uint64_t va = radv_buffer_get_va(info->indirect->bo) + info->indirect->offset + info->indirect_offset;
10432 const uint64_t count_va = info->count_buffer ? radv_buffer_get_va(info->count_buffer->bo) +
10433 info->count_buffer->offset + info->count_buffer_offset
10434 : 0;
10435
10436 radv_cs_add_buffer(ws, cs, info->indirect->bo);
10437
10438 radeon_emit(cs, PKT3(PKT3_SET_BASE, 2, 0));
10439 radeon_emit(cs, 1);
10440 radeon_emit(cs, va);
10441 radeon_emit(cs, va >> 32);
10442
10443 if (info->count_buffer) {
10444 radv_cs_add_buffer(ws, cs, info->count_buffer->bo);
10445 }
10446
10447 if (!state->render.view_mask) {
10448 radv_cs_emit_indirect_draw_packet(cmd_buffer, info->indexed, info->count, count_va, info->stride);
10449 } else {
10450 u_foreach_bit (i, state->render.view_mask) {
10451 radv_emit_view_index(&cmd_buffer->state, cs, i);
10452
10453 radv_cs_emit_indirect_draw_packet(cmd_buffer, info->indexed, info->count, count_va, info->stride);
10454 }
10455 }
10456 }
10457
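/* Return the mask of dynamic states that can affect the currently bound pipeline or shader objects. */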
10458 static uint64_t
10459 radv_get_needed_dynamic_states(struct radv_cmd_buffer *cmd_buffer)
10460 {
10461 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
10462 const struct radv_physical_device *pdev = radv_device_physical(device);
10463 uint64_t dynamic_states = RADV_DYNAMIC_ALL;
10464
10465 if (cmd_buffer->state.graphics_pipeline)
10466 return cmd_buffer->state.graphics_pipeline->needed_dynamic_state;
10467
10468 /* Clear unnecessary dynamic states for shader objects. */
10469 if (!cmd_buffer->state.shaders[MESA_SHADER_TESS_CTRL])
10470 dynamic_states &= ~(RADV_DYNAMIC_PATCH_CONTROL_POINTS | RADV_DYNAMIC_TESS_DOMAIN_ORIGIN);
10471
10472 if (pdev->info.gfx_level >= GFX10_3) {
10473 if (cmd_buffer->state.shaders[MESA_SHADER_MESH])
10474 dynamic_states &= ~(RADV_DYNAMIC_VERTEX_INPUT | RADV_DYNAMIC_VERTEX_INPUT_BINDING_STRIDE |
10475 RADV_DYNAMIC_PRIMITIVE_RESTART_ENABLE | RADV_DYNAMIC_PRIMITIVE_TOPOLOGY);
10476 } else {
10477 dynamic_states &= ~RADV_DYNAMIC_FRAGMENT_SHADING_RATE;
10478 }
10479
10480 return dynamic_states;
10481 }
10482
10483 /*
10484 * Vega and Raven have a bug which triggers if there are multiple context
10485 * register contexts active at the same time with different scissor values.
10486 *
10487 * There are two possible workarounds:
10488 * 1) Wait for PS_PARTIAL_FLUSH every time the scissor is changed. That way
10489 * there is only ever 1 active set of scissor values at the same time.
10490 *
10491 * 2) Whenever the hardware switches contexts we have to set the scissor
10492 * registers again even if it is a noop. That way the new context gets
10493 * the correct scissor values.
10494 *
10495 * This implements option 2. radv_need_late_scissor_emission needs to
10496 * return true on affected HW if radv_emit_all_graphics_states sets
10497 * any context registers.
10498 */
10499 static bool
10500 radv_need_late_scissor_emission(struct radv_cmd_buffer *cmd_buffer, const struct radv_draw_info *info)
10501 {
10502 if (cmd_buffer->state.context_roll_without_scissor_emitted || info->strmout_buffer)
10503 return true;
10504
10505 uint64_t used_dynamic_states = radv_get_needed_dynamic_states(cmd_buffer);
10506
10507 used_dynamic_states &= ~RADV_DYNAMIC_VERTEX_INPUT;
10508
10509 if (cmd_buffer->state.dirty_dynamic & used_dynamic_states)
10510 return true;
10511
10512 /* Index, vertex and streamout buffers don't change context regs.
10513 * We assume that any other dirty flag causes context rolls.
10514 */
10515 uint64_t used_states = RADV_CMD_DIRTY_ALL;
10516 used_states &= ~(RADV_CMD_DIRTY_INDEX_BUFFER | RADV_CMD_DIRTY_VERTEX_BUFFER | RADV_CMD_DIRTY_STREAMOUT_BUFFER);
10517
10518 return cmd_buffer->state.dirty & used_states;
10519 }
10520
10521 ALWAYS_INLINE static uint32_t
10522 radv_get_ngg_culling_settings(struct radv_cmd_buffer *cmd_buffer, bool vp_y_inverted)
10523 {
10524 const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
10525
10526 /* Disable shader culling entirely when conservative overestimate is used.
10527 * The face culling algorithm can delete very tiny triangles (even if unintended).
10528 */
10529 if (d->vk.rs.conservative_mode == VK_CONSERVATIVE_RASTERIZATION_MODE_OVERESTIMATE_EXT)
10530 return radv_nggc_none;
10531
10532 /* With graphics pipeline library, NGG culling is unconditionally compiled into shaders
10533 * because we don't know the primitive topology at compile time, so we should
10534 * disable it dynamically for points or lines.
10535 */
10536 const unsigned num_vertices_per_prim = radv_conv_prim_to_gs_out(d->vk.ia.primitive_topology, true) + 1;
10537 if (num_vertices_per_prim != 3)
10538 return radv_nggc_none;
10539
10540 /* Cull every triangle when rasterizer discard is enabled. */
10541 if (d->vk.rs.rasterizer_discard_enable)
10542 return radv_nggc_front_face | radv_nggc_back_face;
10543
10544 uint32_t nggc_settings = radv_nggc_none;
10545
10546 /* The culling code needs to know whether the front face is CW or CCW. */
10547 bool ccw = d->vk.rs.front_face == VK_FRONT_FACE_COUNTER_CLOCKWISE;
10548
10549 /* Take inverted viewport into account. */
10550 ccw ^= vp_y_inverted;
10551
10552 if (ccw)
10553 nggc_settings |= radv_nggc_face_is_ccw;
10554
10555 /* Face culling settings. */
10556 if (d->vk.rs.cull_mode & VK_CULL_MODE_FRONT_BIT)
10557 nggc_settings |= radv_nggc_front_face;
10558 if (d->vk.rs.cull_mode & VK_CULL_MODE_BACK_BIT)
10559 nggc_settings |= radv_nggc_back_face;
10560
10561 /* Small primitive culling assumes a sample position at (0.5, 0.5)
10562 * so don't enable it with user sample locations.
10563 */
10564 if (!d->vk.ms.sample_locations_enable) {
10565 nggc_settings |= radv_nggc_small_primitives;
10566
10567 /* small_prim_precision = num_samples / 2^subpixel_bits
10568 * num_samples is also always a power of two, so the small prim precision can only be
10569 * a power of two between 2^-2 and 2^-6, therefore it's enough to remember the exponent.
10570 */
10571 unsigned rasterization_samples = radv_get_rasterization_samples(cmd_buffer);
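/* Note: subpixel_bits holds the divisor itself (256 = 2^8), so util_logbase2() yields the exponent used in the formula above. */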
10572 unsigned subpixel_bits = 256;
10573 int32_t small_prim_precision_log2 = util_logbase2(rasterization_samples) - util_logbase2(subpixel_bits);
10574 nggc_settings |= ((uint32_t)small_prim_precision_log2 << 24u);
10575 }
10576
10577 return nggc_settings;
10578 }
10579
10580 static void
10581 radv_emit_ngg_culling_state(struct radv_cmd_buffer *cmd_buffer)
10582 {
10583 const struct radv_shader *last_vgt_shader = cmd_buffer->state.last_vgt_shader;
10584
10585 /* Get viewport transform. */
10586 float vp_scale[2], vp_translate[2];
10587 memcpy(vp_scale, cmd_buffer->state.dynamic.hw_vp.xform[0].scale, 2 * sizeof(float));
10588 memcpy(vp_translate, cmd_buffer->state.dynamic.hw_vp.xform[0].translate, 2 * sizeof(float));
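/* The viewport is Y-inverted when NDC y = -1 maps to a larger window-space Y than NDC y = +1 (i.e. negative Y scale). */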
10589 bool vp_y_inverted = (-vp_scale[1] + vp_translate[1]) > (vp_scale[1] + vp_translate[1]);
10590
10591 /* Get current culling settings. */
10592 uint32_t nggc_settings = radv_get_ngg_culling_settings(cmd_buffer, vp_y_inverted);
10593
10594 if ((cmd_buffer->state.dirty & RADV_CMD_DIRTY_PIPELINE) ||
10595 (cmd_buffer->state.dirty_dynamic & (RADV_DYNAMIC_VIEWPORT | RADV_DYNAMIC_RASTERIZATION_SAMPLES))) {
10596 /* Correction for inverted Y */
10597 if (vp_y_inverted) {
10598 vp_scale[1] = -vp_scale[1];
10599 vp_translate[1] = -vp_translate[1];
10600 }
10601
10602 /* Correction for number of samples per pixel. */
10603 for (unsigned i = 0; i < 2; ++i) {
10604 vp_scale[i] *= (float)cmd_buffer->state.dynamic.vk.ms.rasterization_samples;
10605 vp_translate[i] *= (float)cmd_buffer->state.dynamic.vk.ms.rasterization_samples;
10606 }
10607
10608 uint32_t vp_reg_values[4] = {fui(vp_scale[0]), fui(vp_scale[1]), fui(vp_translate[0]), fui(vp_translate[1])};
10609 const uint32_t ngg_viewport_offset = radv_get_user_sgpr_loc(last_vgt_shader, AC_UD_NGG_VIEWPORT);
10610 radeon_set_sh_reg_seq(cmd_buffer->cs, ngg_viewport_offset, 4);
10611 radeon_emit_array(cmd_buffer->cs, vp_reg_values, 4);
10612 }
10613
10614 const uint32_t ngg_culling_settings_offset = radv_get_user_sgpr_loc(last_vgt_shader, AC_UD_NGG_CULLING_SETTINGS);
10615
10616 radeon_set_sh_reg(cmd_buffer->cs, ngg_culling_settings_offset, nggc_settings);
10617 }
10618
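/* Emit the PS state user SGPR (sample count, PS iter mask, line rasterization mode and rasterized primitive type). */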
10619 static void
10620 radv_emit_fs_state(struct radv_cmd_buffer *cmd_buffer)
10621 {
10622 const struct radv_shader *ps = cmd_buffer->state.shaders[MESA_SHADER_FRAGMENT];
10623
10624 if (!ps)
10625 return;
10626
10627 const uint32_t ps_state_offset = radv_get_user_sgpr_loc(ps, AC_UD_PS_STATE);
10628 if (!ps_state_offset)
10629 return;
10630
10631 const unsigned rasterization_samples = radv_get_rasterization_samples(cmd_buffer);
10632 const unsigned ps_iter_samples = radv_get_ps_iter_samples(cmd_buffer);
10633 const uint16_t ps_iter_mask = ac_get_ps_iter_mask(ps_iter_samples);
10634 const unsigned rast_prim = radv_get_rasterization_prim(cmd_buffer);
10635 const unsigned ps_state = SET_SGPR_FIELD(PS_STATE_NUM_SAMPLES, rasterization_samples) |
10636 SET_SGPR_FIELD(PS_STATE_PS_ITER_MASK, ps_iter_mask) |
10637 SET_SGPR_FIELD(PS_STATE_LINE_RAST_MODE, radv_get_line_mode(cmd_buffer)) |
10638 SET_SGPR_FIELD(PS_STATE_RAST_PRIM, rast_prim);
10639
10640 radeon_set_sh_reg(cmd_buffer->cs, ps_state_offset, ps_state);
10641 }
10642
10643 static void
10644 radv_emit_db_shader_control(struct radv_cmd_buffer *cmd_buffer)
10645 {
10646 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
10647 const struct radv_physical_device *pdev = radv_device_physical(device);
10648 const struct radeon_info *gpu_info = &pdev->info;
10649 const struct radv_shader *ps = cmd_buffer->state.shaders[MESA_SHADER_FRAGMENT];
10650 const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
10651 const bool uses_ds_feedback_loop =
10652 !!(d->feedback_loop_aspects & (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT));
10653 const unsigned rasterization_samples = radv_get_rasterization_samples(cmd_buffer);
10654
10655 uint32_t db_shader_control;
10656
10657 if (ps) {
10658 db_shader_control = ps->info.regs.ps.db_shader_control;
10659 } else {
10660 db_shader_control = S_02880C_CONSERVATIVE_Z_EXPORT(V_02880C_EXPORT_ANY_Z) |
10661 S_02880C_Z_ORDER(V_02880C_EARLY_Z_THEN_LATE_Z) |
10662 S_02880C_DUAL_QUAD_DISABLE(gpu_info->has_rbplus && !gpu_info->rbplus_allowed);
10663 }
10664
10665 /* When a depth/stencil attachment is used inside feedback loops, use LATE_Z to make sure shader invocations read the
10666 * correct value.
10667 * Also apply the bug workaround for smoothing (overrasterization) on GFX6.
10668 */
10669 if (uses_ds_feedback_loop || (gpu_info->gfx_level == GFX6 &&
10670 radv_get_line_mode(cmd_buffer) == VK_LINE_RASTERIZATION_MODE_RECTANGULAR_SMOOTH_KHR))
10671 db_shader_control = (db_shader_control & C_02880C_Z_ORDER) | S_02880C_Z_ORDER(V_02880C_LATE_Z);
10672
10673 if (ps && ps->info.ps.pops) {
10674 /* POPS_OVERLAP_NUM_SAMPLES (OVERRIDE_INTRINSIC_RATE on GFX11, must always be enabled for POPS) controls the
10675 * interlock granularity.
10676 * PixelInterlock: 1x.
10677 * SampleInterlock: MSAA_EXPOSED_SAMPLES (much faster at common edges of adjacent primitives with MSAA).
10678 */
10679 if (gpu_info->gfx_level >= GFX11) {
10680 db_shader_control |= S_02880C_OVERRIDE_INTRINSIC_RATE_ENABLE(1);
10681 if (ps->info.ps.pops_is_per_sample)
10682 db_shader_control |= S_02880C_OVERRIDE_INTRINSIC_RATE(util_logbase2(rasterization_samples));
10683 } else {
10684 if (ps->info.ps.pops_is_per_sample)
10685 db_shader_control |= S_02880C_POPS_OVERLAP_NUM_SAMPLES(util_logbase2(rasterization_samples));
10686
10687 if (gpu_info->has_pops_missed_overlap_bug) {
10688 radeon_set_context_reg(cmd_buffer->cs, R_028060_DB_DFSM_CONTROL,
10689 S_028060_PUNCHOUT_MODE(V_028060_FORCE_OFF) |
10690 S_028060_POPS_DRAIN_PS_ON_OVERLAP(rasterization_samples >= 8));
10691 }
10692 }
10693 } else if (gpu_info->has_export_conflict_bug && rasterization_samples == 1) {
10694 for (uint32_t i = 0; i < MAX_RTS; i++) {
10695 if (d->vk.cb.attachments[i].write_mask && d->vk.cb.attachments[i].blend_enable) {
10696 db_shader_control |= S_02880C_OVERRIDE_INTRINSIC_RATE_ENABLE(1) | S_02880C_OVERRIDE_INTRINSIC_RATE(2);
10697 break;
10698 }
10699 }
10700 }
10701
10702 if (pdev->info.gfx_level >= GFX12) {
10703 radeon_opt_set_context_reg(cmd_buffer, R_02806C_DB_SHADER_CONTROL, RADV_TRACKED_DB_SHADER_CONTROL,
10704 db_shader_control);
10705 } else {
10706 radeon_opt_set_context_reg(cmd_buffer, R_02880C_DB_SHADER_CONTROL, RADV_TRACKED_DB_SHADER_CONTROL,
10707 db_shader_control);
10708 }
10709
10710 cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_DB_SHADER_CONTROL;
10711 }
10712
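/* Emit VGT_STRMOUT_CONFIG/VGT_STRMOUT_BUFFER_CONFIG and the vertex strides for legacy (non-NGG) streamout. */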
10713 static void
10714 radv_emit_streamout_enable_state(struct radv_cmd_buffer *cmd_buffer)
10715 {
10716 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
10717 const struct radv_physical_device *pdev = radv_device_physical(device);
10718 const struct radv_streamout_state *so = &cmd_buffer->state.streamout;
10719 const bool streamout_enabled = radv_is_streamout_enabled(cmd_buffer);
10720 uint32_t enabled_stream_buffers_mask = 0;
10721
10722 assert(!pdev->use_ngg_streamout);
10723
10724 if (streamout_enabled && cmd_buffer->state.last_vgt_shader) {
10725 const struct radv_shader_info *info = &cmd_buffer->state.last_vgt_shader->info;
10726
10727 enabled_stream_buffers_mask = info->so.enabled_stream_buffers_mask;
10728
10729 u_foreach_bit (i, so->enabled_mask) {
10730 radeon_set_context_reg(cmd_buffer->cs, R_028AD4_VGT_STRMOUT_VTX_STRIDE_0 + 16 * i, info->so.strides[i]);
10731 }
10732 }
10733
10734 radeon_set_context_reg_seq(cmd_buffer->cs, R_028B94_VGT_STRMOUT_CONFIG, 2);
10735 radeon_emit(cmd_buffer->cs, S_028B94_STREAMOUT_0_EN(streamout_enabled) | S_028B94_RAST_STREAM(0) |
10736 S_028B94_STREAMOUT_1_EN(streamout_enabled) |
10737 S_028B94_STREAMOUT_2_EN(streamout_enabled) |
10738 S_028B94_STREAMOUT_3_EN(streamout_enabled));
10739 radeon_emit(cmd_buffer->cs, so->hw_enabled_mask & enabled_stream_buffers_mask);
10740
10741 cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_STREAMOUT_ENABLE;
10742 }
10743
10744 static gl_shader_stage
10745 radv_cmdbuf_get_last_vgt_api_stage(const struct radv_cmd_buffer *cmd_buffer)
10746 {
10747 if (cmd_buffer->state.active_stages & VK_SHADER_STAGE_MESH_BIT_EXT)
10748 return MESA_SHADER_MESH;
10749
10750 return util_last_bit(cmd_buffer->state.active_stages & BITFIELD_MASK(MESA_SHADER_FRAGMENT)) - 1;
10751 }
10752
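/* Emit CB_SHADER_MASK and the compacted SPI_SHADER_COL_FORMAT. */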
10753 static void
10754 radv_emit_color_output_state(struct radv_cmd_buffer *cmd_buffer)
10755 {
10756 const struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
10757 const struct radv_physical_device *pdev = radv_device_physical(device);
10758
10759 uint32_t col_format_compacted = radv_compact_spi_shader_col_format(cmd_buffer->state.spi_shader_col_format);
10760
10761 if (pdev->info.gfx_level >= GFX12) {
10762 radeon_set_context_reg(cmd_buffer->cs, R_028854_CB_SHADER_MASK, cmd_buffer->state.cb_shader_mask);
10763 radeon_set_context_reg(cmd_buffer->cs, R_028654_SPI_SHADER_COL_FORMAT, col_format_compacted);
10764 } else {
10765 radeon_set_context_reg(cmd_buffer->cs, R_02823C_CB_SHADER_MASK, cmd_buffer->state.cb_shader_mask);
10766 radeon_set_context_reg(cmd_buffer->cs, R_028714_SPI_SHADER_COL_FORMAT, col_format_compacted);
10767 }
10768
10769 cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_COLOR_OUTPUT;
10770 }
10771
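/* Flush all dirty graphics state (pipeline/shaders, framebuffer, dynamic state, draw registers) before a draw. */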
10772 static void
10773 radv_emit_all_graphics_states(struct radv_cmd_buffer *cmd_buffer, const struct radv_draw_info *info)
10774 {
10775 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
10776 const struct radv_physical_device *pdev = radv_device_physical(device);
10777 struct radv_shader_part *ps_epilog = NULL;
10778
10779 if (cmd_buffer->state.shaders[MESA_SHADER_FRAGMENT] &&
10780 cmd_buffer->state.shaders[MESA_SHADER_FRAGMENT]->info.ps.has_epilog) {
10781 if ((cmd_buffer->state.emitted_graphics_pipeline != cmd_buffer->state.graphics_pipeline ||
10782 ((cmd_buffer->state.dirty & (RADV_CMD_DIRTY_GRAPHICS_SHADERS | RADV_CMD_DIRTY_FRAMEBUFFER)) ||
10783 (cmd_buffer->state.dirty_dynamic &
10784 (RADV_DYNAMIC_COLOR_WRITE_MASK | RADV_DYNAMIC_COLOR_BLEND_ENABLE | RADV_DYNAMIC_ALPHA_TO_COVERAGE_ENABLE |
10785 RADV_DYNAMIC_COLOR_BLEND_EQUATION | RADV_DYNAMIC_ALPHA_TO_ONE_ENABLE |
10786 RADV_DYNAMIC_COLOR_ATTACHMENT_MAP))))) {
10787 ps_epilog = lookup_ps_epilog(cmd_buffer);
10788 if (!ps_epilog) {
10789 vk_command_buffer_set_error(&cmd_buffer->vk, VK_ERROR_OUT_OF_HOST_MEMORY);
10790 return;
10791 }
10792
10793 uint32_t col_format = ps_epilog->spi_shader_col_format;
10794 uint32_t cb_shader_mask = ps_epilog->cb_shader_mask;
10795
10796 assert(cmd_buffer->state.custom_blend_mode == 0);
10797
10798 if (radv_needs_null_export_workaround(device, cmd_buffer->state.shaders[MESA_SHADER_FRAGMENT], 0) &&
10799 !col_format)
10800 col_format = V_028714_SPI_SHADER_32_R;
10801
10802 if (cmd_buffer->state.spi_shader_col_format != col_format) {
10803 cmd_buffer->state.spi_shader_col_format = col_format;
10804 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_COLOR_OUTPUT;
10805 if (pdev->info.rbplus_allowed)
10806 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_RBPLUS;
10807 }
10808
10809 if (cmd_buffer->state.cb_shader_mask != cb_shader_mask) {
10810 cmd_buffer->state.cb_shader_mask = cb_shader_mask;
10811 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_COLOR_OUTPUT;
10812 }
10813 }
10814 }
10815
10816 /* Determine whether GFX9 late scissor workaround should be applied based on:
10817 * 1. radv_need_late_scissor_emission
10818 * 2. any dirty dynamic flags that may cause context rolls
10819 */
10820 const bool late_scissor_emission =
10821 pdev->info.has_gfx9_scissor_bug ? radv_need_late_scissor_emission(cmd_buffer, info) : false;
10822
10823 if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_RBPLUS)
10824 radv_emit_rbplus_state(cmd_buffer);
10825
10826 if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_SHADER_QUERY)
10827 radv_flush_shader_query_state(cmd_buffer);
10828
10829 if ((cmd_buffer->state.dirty & RADV_CMD_DIRTY_OCCLUSION_QUERY) ||
10830 (cmd_buffer->state.dirty_dynamic & (RADV_DYNAMIC_RASTERIZATION_SAMPLES | RADV_DYNAMIC_PRIMITIVE_TOPOLOGY)))
10831 radv_flush_occlusion_query_state(cmd_buffer);
10832
10833 if (((cmd_buffer->state.dirty & RADV_CMD_DIRTY_PIPELINE) ||
10834 (cmd_buffer->state.dirty_dynamic &
10835 (RADV_DYNAMIC_CULL_MODE | RADV_DYNAMIC_FRONT_FACE | RADV_DYNAMIC_RASTERIZER_DISCARD_ENABLE |
10836 RADV_DYNAMIC_VIEWPORT | RADV_DYNAMIC_CONSERVATIVE_RAST_MODE | RADV_DYNAMIC_RASTERIZATION_SAMPLES |
10837 RADV_DYNAMIC_PRIMITIVE_TOPOLOGY | RADV_DYNAMIC_SAMPLE_LOCATIONS_ENABLE))) &&
10838 cmd_buffer->state.has_nggc)
10839 radv_emit_ngg_culling_state(cmd_buffer);
10840
10841 if ((cmd_buffer->state.dirty & RADV_CMD_DIRTY_FRAMEBUFFER) ||
10842 (cmd_buffer->state.dirty_dynamic &
10843 (RADV_DYNAMIC_COLOR_WRITE_MASK | RADV_DYNAMIC_RASTERIZATION_SAMPLES | RADV_DYNAMIC_LINE_RASTERIZATION_MODE |
10844 RADV_DYNAMIC_PRIMITIVE_TOPOLOGY | RADV_DYNAMIC_POLYGON_MODE)))
10845 radv_emit_binning_state(cmd_buffer);
10846
10847 if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_PIPELINE) {
10848 radv_emit_graphics_pipeline(cmd_buffer);
10849 } else if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_GRAPHICS_SHADERS) {
10850 radv_emit_graphics_shaders(cmd_buffer);
10851 }
10852
10853 if (ps_epilog)
10854 radv_emit_ps_epilog_state(cmd_buffer, ps_epilog);
10855
10856 if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_COLOR_OUTPUT)
10857 radv_emit_color_output_state(cmd_buffer);
10858
10859 if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_FRAMEBUFFER)
10860 radv_emit_framebuffer_state(cmd_buffer);
10861
10862 if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_GUARDBAND)
10863 radv_emit_guardband_state(cmd_buffer);
10864
10865 if ((cmd_buffer->state.dirty & RADV_CMD_DIRTY_DB_SHADER_CONTROL) ||
10866 (cmd_buffer->state.dirty_dynamic &
10867 (RADV_DYNAMIC_COLOR_WRITE_MASK | RADV_DYNAMIC_COLOR_BLEND_ENABLE | RADV_DYNAMIC_RASTERIZATION_SAMPLES |
10868 RADV_DYNAMIC_LINE_RASTERIZATION_MODE | RADV_DYNAMIC_PRIMITIVE_TOPOLOGY | RADV_DYNAMIC_POLYGON_MODE |
10869 RADV_DYNAMIC_ATTACHMENT_FEEDBACK_LOOP_ENABLE)))
10870 radv_emit_db_shader_control(cmd_buffer);
10871
10872 if (info->indexed && info->indirect && cmd_buffer->state.dirty & RADV_CMD_DIRTY_INDEX_BUFFER)
10873 radv_emit_index_buffer(cmd_buffer);
10874
10875 if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_STREAMOUT_ENABLE)
10876 radv_emit_streamout_enable_state(cmd_buffer);
10877
10878 const uint64_t dynamic_states = cmd_buffer->state.dirty_dynamic & radv_get_needed_dynamic_states(cmd_buffer);
10879
10880 if (dynamic_states) {
10881 radv_cmd_buffer_flush_dynamic_state(cmd_buffer, dynamic_states);
10882
10883 if (dynamic_states & (RADV_DYNAMIC_RASTERIZATION_SAMPLES | RADV_DYNAMIC_LINE_RASTERIZATION_MODE |
10884 RADV_DYNAMIC_PRIMITIVE_TOPOLOGY | RADV_DYNAMIC_POLYGON_MODE))
10885 radv_emit_fs_state(cmd_buffer);
10886 }
10887
10888 radv_emit_draw_registers(cmd_buffer, info);
10889
10890 if (late_scissor_emission) {
10891 radv_emit_scissor(cmd_buffer);
10892 cmd_buffer->state.context_roll_without_scissor_emitted = false;
10893 }
10894 }
10895
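/* Bind graphics shader objects: select per-stage variants, determine the last VGT stage and update the derived state (push constants, NGG info, color output). */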
10896 static void
10897 radv_bind_graphics_shaders(struct radv_cmd_buffer *cmd_buffer)
10898 {
10899 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
10900 const struct radv_physical_device *pdev = radv_device_physical(device);
10901 uint32_t push_constant_size = 0, dynamic_offset_count = 0;
10902 bool need_indirect_descriptor_sets = false;
10903
10904 for (unsigned s = 0; s <= MESA_SHADER_MESH; s++) {
10905 const struct radv_shader_object *shader_obj = cmd_buffer->state.shader_objs[s];
10906 struct radv_shader *shader = NULL;
10907
10908 if (s == MESA_SHADER_COMPUTE)
10909 continue;
10910
10911 if (!shader_obj) {
10912 radv_bind_shader(cmd_buffer, NULL, s);
10913 continue;
10914 }
10915
10916 /* Select shader variants. */
10917 if (s == MESA_SHADER_VERTEX && (cmd_buffer->state.shader_objs[MESA_SHADER_TESS_CTRL] ||
10918 cmd_buffer->state.shader_objs[MESA_SHADER_GEOMETRY])) {
10919 if (cmd_buffer->state.shader_objs[MESA_SHADER_TESS_CTRL]) {
10920 shader = shader_obj->as_ls.shader;
10921 } else {
10922 shader = shader_obj->as_es.shader;
10923 }
10924 } else if (s == MESA_SHADER_TESS_EVAL && cmd_buffer->state.shader_objs[MESA_SHADER_GEOMETRY]) {
10925 shader = shader_obj->as_es.shader;
10926 } else {
10927 shader = shader_obj->shader;
10928 }
10929
10930 radv_bind_shader(cmd_buffer, shader, s);
10931 if (!shader)
10932 continue;
10933
10934 /* Compute push constants/indirect descriptors state. */
10935 need_indirect_descriptor_sets |= radv_get_user_sgpr_info(shader, AC_UD_INDIRECT_DESCRIPTOR_SETS)->sgpr_idx != -1;
10936 push_constant_size += shader_obj->push_constant_size;
10937 dynamic_offset_count += shader_obj->dynamic_offset_count;
10938 }
10939
10940 /* Determine the last VGT shader. */
10941 const gl_shader_stage last_vgt_api_stage = radv_cmdbuf_get_last_vgt_api_stage(cmd_buffer);
10942
10943 assume(last_vgt_api_stage != MESA_SHADER_NONE);
10944 if (pdev->info.has_vgt_flush_ngg_legacy_bug &&
10945 (!cmd_buffer->state.last_vgt_shader || (cmd_buffer->state.last_vgt_shader->info.is_ngg &&
10946 !cmd_buffer->state.shaders[last_vgt_api_stage]->info.is_ngg))) {
10947 /* Transitioning from NGG to legacy GS requires VGT_FLUSH on GFX10 and Navi21. VGT_FLUSH is
10948 * also emitted at the beginning of IBs when legacy GS ring pointers are set.
10949 */
10950 cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_VGT_FLUSH;
10951 }
10952
10953 cmd_buffer->state.last_vgt_shader = cmd_buffer->state.shaders[last_vgt_api_stage];
10954
10955 struct radv_shader *gs_copy_shader = cmd_buffer->state.shader_objs[MESA_SHADER_GEOMETRY]
10956 ? cmd_buffer->state.shader_objs[MESA_SHADER_GEOMETRY]->gs.copy_shader
10957 : NULL;
10958
10959 radv_bind_gs_copy_shader(cmd_buffer, gs_copy_shader);
10960
10961 /* Determine NGG GS info. */
10962 if (cmd_buffer->state.shaders[MESA_SHADER_GEOMETRY] &&
10963 cmd_buffer->state.shaders[MESA_SHADER_GEOMETRY]->info.is_ngg &&
10964 cmd_buffer->state.shaders[MESA_SHADER_GEOMETRY]->info.merged_shader_compiled_separately) {
10965 struct radv_shader *es = cmd_buffer->state.shaders[MESA_SHADER_TESS_EVAL]
10966 ? cmd_buffer->state.shaders[MESA_SHADER_TESS_EVAL]
10967 : cmd_buffer->state.shaders[MESA_SHADER_VERTEX];
10968 struct radv_shader *gs = cmd_buffer->state.shaders[MESA_SHADER_GEOMETRY];
10969
10970 gfx10_get_ngg_info(device, &es->info, &gs->info, &gs->info.ngg_info);
10971 radv_precompute_registers_hw_ngg(device, &gs->config, &gs->info);
10972 }
10973
10974 /* Determine the rasterized primitive. */
10975 if (cmd_buffer->state.active_stages &
10976 (VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT | VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT |
10977 VK_SHADER_STAGE_GEOMETRY_BIT | VK_SHADER_STAGE_MESH_BIT_EXT)) {
10978 cmd_buffer->state.rast_prim = radv_get_vgt_gs_out(cmd_buffer->state.shaders, 0);
10979 }
10980
10981 const struct radv_shader *vs = radv_get_shader(cmd_buffer->state.shaders, MESA_SHADER_VERTEX);
10982 if (vs) {
10983 /* Re-emit the VS prolog when a new vertex shader is bound. */
10984 if (vs->info.vs.has_prolog) {
10985 cmd_buffer->state.emitted_vs_prolog = NULL;
10986 cmd_buffer->state.dirty_dynamic |= RADV_DYNAMIC_VERTEX_INPUT;
10987 }
10988
10989 /* Re-emit the vertex buffer descriptors because they are really tied to the pipeline. */
10990 if (vs->info.vs.vb_desc_usage_mask) {
10991 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_VERTEX_BUFFER;
10992 }
10993 }
10994
10995 const struct radv_shader *ps = cmd_buffer->state.shaders[MESA_SHADER_FRAGMENT];
10996 if (ps && !ps->info.ps.has_epilog) {
10997 uint32_t col_format = 0, cb_shader_mask = 0;
10998 if (radv_needs_null_export_workaround(device, ps, 0))
10999 col_format = V_028714_SPI_SHADER_32_R;
11000
11001 if (cmd_buffer->state.spi_shader_col_format != col_format) {
11002 cmd_buffer->state.spi_shader_col_format = col_format;
11003 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_COLOR_OUTPUT;
11004 if (pdev->info.rbplus_allowed)
11005 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_RBPLUS;
11006 }
11007
11008 if (cmd_buffer->state.cb_shader_mask != cb_shader_mask) {
11009 cmd_buffer->state.cb_shader_mask = cb_shader_mask;
11010 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_COLOR_OUTPUT;
11011 }
11012 }
11013
11014 /* Update push constants/indirect descriptors state. */
11015 struct radv_descriptor_state *descriptors_state =
11016 radv_get_descriptors_state(cmd_buffer, VK_PIPELINE_BIND_POINT_GRAPHICS);
11017 struct radv_push_constant_state *pc_state = &cmd_buffer->push_constant_state[VK_PIPELINE_BIND_POINT_GRAPHICS];
11018
11019 descriptors_state->need_indirect_descriptor_sets = need_indirect_descriptor_sets;
11020 pc_state->size = push_constant_size;
11021 pc_state->dynamic_offset_count = dynamic_offset_count;
11022
11023 if (pdev->info.gfx_level <= GFX9) {
11024 cmd_buffer->state.ia_multi_vgt_param = radv_compute_ia_multi_vgt_param(device, cmd_buffer->state.shaders);
11025 }
11026
11027 if (cmd_buffer->state.active_stages &
11028 (VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT | VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT)) {
11029 cmd_buffer->state.uses_dynamic_patch_control_points = true;
11030 }
11031 }
11032
11033 /* MUST inline this function to avoid massive perf loss in drawoverhead */
11034 ALWAYS_INLINE static bool
11035 radv_before_draw(struct radv_cmd_buffer *cmd_buffer, const struct radv_draw_info *info, uint32_t drawCount, bool dgc)
11036 {
11037 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
11038 const struct radv_physical_device *pdev = radv_device_physical(device);
11039 const bool has_prefetch = pdev->info.gfx_level >= GFX7;
11040
11041 ASSERTED const unsigned cdw_max = radeon_check_space(device->ws, cmd_buffer->cs, 4096 + 128 * (drawCount - 1));
11042
11043 if (likely(!info->indirect)) {
11044 /* GFX6-GFX7 treat instance_count==0 as instance_count==1. There is
11045 * no workaround for indirect draws, but we can at least skip
11046 * direct draws.
11047 */
11048 if (unlikely(!info->instance_count))
11049 return false;
11050
11051 /* Handle count == 0. */
11052 if (unlikely(!info->count && !info->strmout_buffer))
11053 return false;
11054 }
11055
11056 if (!info->indexed && pdev->info.gfx_level >= GFX7) {
11057 /* On GFX7 and later, non-indexed draws overwrite VGT_INDEX_TYPE,
11058 * so the state must be re-emitted before the next indexed
11059 * draw.
11060 */
11061 cmd_buffer->state.last_index_type = -1;
11062 }
11063
11064 if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_FBFETCH_OUTPUT)
11065 radv_handle_fbfetch_output(cmd_buffer);
11066
11067 if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_GRAPHICS_SHADERS) {
11068 radv_bind_graphics_shaders(cmd_buffer);
11069 }
11070
11071 /* Use optimal packet order based on whether we need to sync the
11072 * pipeline.
11073 */
11074 if (cmd_buffer->state.flush_bits & (RADV_CMD_FLAG_FLUSH_AND_INV_CB | RADV_CMD_FLAG_FLUSH_AND_INV_DB |
11075 RADV_CMD_FLAG_PS_PARTIAL_FLUSH | RADV_CMD_FLAG_CS_PARTIAL_FLUSH)) {
11076 /* If we have to wait for idle, set all states first so that
11077 * all SET packets are processed in parallel with previous draw
11078 * calls. Then upload descriptors, set shader pointers, draw,
11079 * and prefetch at the end. This keeps the time the CUs are
11080 * idle very short (there are only SET_SH packets between the
11081 * wait and the draw).
11082 */
11083 radv_emit_all_graphics_states(cmd_buffer, info);
11084 radv_emit_cache_flush(cmd_buffer);
11085 /* <-- CUs are idle here --> */
11086
11087 radv_upload_graphics_shader_descriptors(cmd_buffer);
11088 } else {
11089 const bool need_prefetch = has_prefetch && cmd_buffer->state.prefetch_L2_mask;
11090
11091 /* If we don't wait for idle, start prefetches first, then set
11092 * states, and draw at the end.
11093 */
11094 radv_emit_cache_flush(cmd_buffer);
11095
11096 if (need_prefetch) {
11097 /* Only prefetch the vertex shader and VBO descriptors
11098 * in order to start the draw as soon as possible.
11099 */
11100 radv_emit_prefetch_L2(cmd_buffer, true);
11101 }
11102
11103 radv_upload_graphics_shader_descriptors(cmd_buffer);
11104
11105 radv_emit_all_graphics_states(cmd_buffer, info);
11106 }
11107
11108 if (!dgc)
11109 radv_describe_draw(cmd_buffer);
11110 if (likely(!info->indirect)) {
11111 struct radv_cmd_state *state = &cmd_buffer->state;
11112 struct radeon_cmdbuf *cs = cmd_buffer->cs;
11113 assert(state->vtx_base_sgpr);
11114 if (state->last_num_instances != info->instance_count) {
11115 radeon_emit(cs, PKT3(PKT3_NUM_INSTANCES, 0, false));
11116 radeon_emit(cs, info->instance_count);
11117 state->last_num_instances = info->instance_count;
11118 }
11119 }
11120 assert(cmd_buffer->cs->cdw <= cdw_max);
11121
11122 return true;
11123 }
11124
11125 ALWAYS_INLINE static bool
11126 radv_before_taskmesh_draw(struct radv_cmd_buffer *cmd_buffer, const struct radv_draw_info *info, uint32_t drawCount,
11127 bool dgc)
11128 {
11129 /* For direct draws, this makes sure we don't draw anything.
11130 * For indirect draws, this is necessary to prevent a GPU hang (on MEC version < 100).
11131 */
11132 if (unlikely(!info->count))
11133 return false;
11134
11135 if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_GRAPHICS_SHADERS) {
11136 radv_bind_graphics_shaders(cmd_buffer);
11137 }
11138
11139 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
11140 struct radeon_cmdbuf *ace_cs = cmd_buffer->gang.cs;
11141 struct radv_shader *task_shader = cmd_buffer->state.shaders[MESA_SHADER_TASK];
11142
11143 assert(!task_shader || ace_cs);
11144
11145 const VkShaderStageFlags stages =
11146 VK_SHADER_STAGE_MESH_BIT_EXT | VK_SHADER_STAGE_FRAGMENT_BIT | (task_shader ? VK_SHADER_STAGE_TASK_BIT_EXT : 0);
11147 const bool need_task_semaphore = task_shader && radv_flush_gang_leader_semaphore(cmd_buffer);
11148
11149 ASSERTED const unsigned cdw_max = radeon_check_space(device->ws, cmd_buffer->cs, 4096 + 128 * (drawCount - 1));
11150 ASSERTED const unsigned ace_cdw_max =
11151 !ace_cs ? 0 : radeon_check_space(device->ws, ace_cs, 4096 + 128 * (drawCount - 1));
11152
11153 radv_emit_all_graphics_states(cmd_buffer, info);
11154
11155 radv_emit_cache_flush(cmd_buffer);
11156
11157 if (task_shader) {
11158 radv_gang_cache_flush(cmd_buffer);
11159
11160 if (need_task_semaphore) {
11161 radv_wait_gang_leader(cmd_buffer);
11162 }
11163 }
11164
11165 radv_flush_descriptors(cmd_buffer, stages, VK_PIPELINE_BIND_POINT_GRAPHICS);
11166
11167 const VkShaderStageFlags pc_stages = radv_must_flush_constants(cmd_buffer, stages, VK_PIPELINE_BIND_POINT_GRAPHICS);
11168 if (pc_stages)
11169 radv_flush_constants(cmd_buffer, pc_stages, VK_PIPELINE_BIND_POINT_GRAPHICS);
11170
11171 if (!dgc)
11172 radv_describe_draw(cmd_buffer);
11173 if (likely(!info->indirect)) {
11174 struct radv_cmd_state *state = &cmd_buffer->state;
11175 if (unlikely(state->last_num_instances != 1)) {
11176 struct radeon_cmdbuf *cs = cmd_buffer->cs;
11177 radeon_emit(cs, PKT3(PKT3_NUM_INSTANCES, 0, false));
11178 radeon_emit(cs, 1);
11179 state->last_num_instances = 1;
11180 }
11181 }
11182
11183 assert(cmd_buffer->cs->cdw <= cdw_max);
11184 assert(!ace_cs || ace_cs->cdw <= ace_cdw_max);
11185
11186 cmd_buffer->state.last_index_type = -1;
11187
11188 return true;
11189 }
11190
11191 ALWAYS_INLINE static void
11192 radv_after_draw(struct radv_cmd_buffer *cmd_buffer, bool dgc)
11193 {
11194 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
11195 const struct radv_physical_device *pdev = radv_device_physical(device);
11196 const struct radeon_info *gpu_info = &pdev->info;
11197 bool has_prefetch = pdev->info.gfx_level >= GFX7;
11198 /* Start prefetches after the draw has been started. Both will
11199 * run in parallel, but starting the draw first is more
11200 * important.
11201 */
11202 if (has_prefetch && cmd_buffer->state.prefetch_L2_mask) {
11203 radv_emit_prefetch_L2(cmd_buffer, false);
11204 }
11205
11206 /* Workaround for a VGT hang when streamout is enabled.
11207 * It must be done after drawing.
11208 */
11209 if (radv_is_streamout_enabled(cmd_buffer) &&
11210 (gpu_info->family == CHIP_HAWAII || gpu_info->family == CHIP_TONGA || gpu_info->family == CHIP_FIJI)) {
11211 cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_VGT_STREAMOUT_SYNC;
11212 }
11213
11214 radv_cmd_buffer_after_draw(cmd_buffer, RADV_CMD_FLAG_PS_PARTIAL_FLUSH, dgc);
11215 }
11216
11217 VKAPI_ATTR void VKAPI_CALL
11218 radv_CmdDraw(VkCommandBuffer commandBuffer, uint32_t vertexCount, uint32_t instanceCount, uint32_t firstVertex,
11219 uint32_t firstInstance)
11220 {
11221 VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
11222 struct radv_draw_info info;
11223
11224 info.count = vertexCount;
11225 info.instance_count = instanceCount;
11226 info.first_instance = firstInstance;
11227 info.strmout_buffer = NULL;
11228 info.indirect = NULL;
11229 info.indexed = false;
11230
11231 if (!radv_before_draw(cmd_buffer, &info, 1, false))
11232 return;
11233 const VkMultiDrawInfoEXT minfo = {firstVertex, vertexCount};
11234 radv_emit_direct_draw_packets(cmd_buffer, &info, 1, &minfo, 0, 0);
11235 radv_after_draw(cmd_buffer, false);
11236 }
11237
11238 VKAPI_ATTR void VKAPI_CALL
11239 radv_CmdDrawMultiEXT(VkCommandBuffer commandBuffer, uint32_t drawCount, const VkMultiDrawInfoEXT *pVertexInfo,
11240 uint32_t instanceCount, uint32_t firstInstance, uint32_t stride)
11241 {
11242 VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
11243 struct radv_draw_info info;
11244
11245 if (!drawCount)
11246 return;
11247
11248 info.count = pVertexInfo->vertexCount;
11249 info.instance_count = instanceCount;
11250 info.first_instance = firstInstance;
11251 info.strmout_buffer = NULL;
11252 info.indirect = NULL;
11253 info.indexed = false;
11254
11255 if (!radv_before_draw(cmd_buffer, &info, drawCount, false))
11256 return;
11257 radv_emit_direct_draw_packets(cmd_buffer, &info, drawCount, pVertexInfo, 0, stride);
11258 radv_after_draw(cmd_buffer, false);
11259 }
11260
11261 VKAPI_ATTR void VKAPI_CALL
11262 radv_CmdDrawIndexed(VkCommandBuffer commandBuffer, uint32_t indexCount, uint32_t instanceCount, uint32_t firstIndex,
11263 int32_t vertexOffset, uint32_t firstInstance)
11264 {
11265 VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
11266 struct radv_draw_info info;
11267
11268 info.indexed = true;
11269 info.count = indexCount;
11270 info.instance_count = instanceCount;
11271 info.first_instance = firstInstance;
11272 info.strmout_buffer = NULL;
11273 info.indirect = NULL;
11274
11275 if (!radv_before_draw(cmd_buffer, &info, 1, false))
11276 return;
11277 const VkMultiDrawIndexedInfoEXT minfo = {firstIndex, indexCount, vertexOffset};
11278 radv_emit_draw_packets_indexed(cmd_buffer, &info, 1, &minfo, 0, NULL);
11279 radv_after_draw(cmd_buffer, false);
11280 }
11281
11282 VKAPI_ATTR void VKAPI_CALL
11283 radv_CmdDrawMultiIndexedEXT(VkCommandBuffer commandBuffer, uint32_t drawCount,
11284 const VkMultiDrawIndexedInfoEXT *pIndexInfo, uint32_t instanceCount, uint32_t firstInstance,
11285 uint32_t stride, const int32_t *pVertexOffset)
11286 {
11287 VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
11288 struct radv_draw_info info;
11289
11290 if (!drawCount)
11291 return;
11292
11293 const VkMultiDrawIndexedInfoEXT *minfo = pIndexInfo;
11294 info.indexed = true;
11295 info.count = minfo->indexCount;
11296 info.instance_count = instanceCount;
11297 info.first_instance = firstInstance;
11298 info.strmout_buffer = NULL;
11299 info.indirect = NULL;
11300
11301 if (!radv_before_draw(cmd_buffer, &info, drawCount, false))
11302 return;
11303 radv_emit_draw_packets_indexed(cmd_buffer, &info, drawCount, pIndexInfo, stride, pVertexOffset);
11304 radv_after_draw(cmd_buffer, false);
11305 }
11306
11307 VKAPI_ATTR void VKAPI_CALL
11308 radv_CmdDrawIndirect(VkCommandBuffer commandBuffer, VkBuffer _buffer, VkDeviceSize offset, uint32_t drawCount,
11309 uint32_t stride)
11310 {
11311 VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
11312 VK_FROM_HANDLE(radv_buffer, buffer, _buffer);
11313 struct radv_draw_info info;
11314
11315 info.count = drawCount;
11316 info.indirect = buffer;
11317 info.indirect_offset = offset;
11318 info.stride = stride;
11319 info.strmout_buffer = NULL;
11320 info.count_buffer = NULL;
11321 info.indexed = false;
11322 info.instance_count = 0;
11323
11324 if (!radv_before_draw(cmd_buffer, &info, 1, false))
11325 return;
11326 radv_emit_indirect_draw_packets(cmd_buffer, &info);
11327 radv_after_draw(cmd_buffer, false);
11328 }
11329
11330 VKAPI_ATTR void VKAPI_CALL
11331 radv_CmdDrawIndexedIndirect(VkCommandBuffer commandBuffer, VkBuffer _buffer, VkDeviceSize offset, uint32_t drawCount,
11332 uint32_t stride)
11333 {
11334 VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
11335 VK_FROM_HANDLE(radv_buffer, buffer, _buffer);
11336 struct radv_draw_info info;
11337
11338 info.indexed = true;
11339 info.count = drawCount;
11340 info.indirect = buffer;
11341 info.indirect_offset = offset;
11342 info.stride = stride;
11343 info.count_buffer = NULL;
11344 info.strmout_buffer = NULL;
11345 info.instance_count = 0;
11346
11347 if (!radv_before_draw(cmd_buffer, &info, 1, false))
11348 return;
11349 radv_emit_indirect_draw_packets(cmd_buffer, &info);
11350 radv_after_draw(cmd_buffer, false);
11351 }
11352
11353 VKAPI_ATTR void VKAPI_CALL
11354 radv_CmdDrawIndirectCount(VkCommandBuffer commandBuffer, VkBuffer _buffer, VkDeviceSize offset, VkBuffer _countBuffer,
11355 VkDeviceSize countBufferOffset, uint32_t maxDrawCount, uint32_t stride)
11356 {
11357 VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
11358 VK_FROM_HANDLE(radv_buffer, buffer, _buffer);
11359 VK_FROM_HANDLE(radv_buffer, count_buffer, _countBuffer);
11360 struct radv_draw_info info;
11361
11362 info.count = maxDrawCount;
11363 info.indirect = buffer;
11364 info.indirect_offset = offset;
11365 info.count_buffer = count_buffer;
11366 info.count_buffer_offset = countBufferOffset;
11367 info.stride = stride;
11368 info.strmout_buffer = NULL;
11369 info.indexed = false;
11370 info.instance_count = 0;
11371
11372 if (!radv_before_draw(cmd_buffer, &info, 1, false))
11373 return;
11374 radv_emit_indirect_draw_packets(cmd_buffer, &info);
11375 radv_after_draw(cmd_buffer, false);
11376 }
11377
11378 VKAPI_ATTR void VKAPI_CALL
11379 radv_CmdDrawIndexedIndirectCount(VkCommandBuffer commandBuffer, VkBuffer _buffer, VkDeviceSize offset,
11380 VkBuffer _countBuffer, VkDeviceSize countBufferOffset, uint32_t maxDrawCount,
11381 uint32_t stride)
11382 {
11383 VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
11384 VK_FROM_HANDLE(radv_buffer, buffer, _buffer);
11385 VK_FROM_HANDLE(radv_buffer, count_buffer, _countBuffer);
11386 struct radv_draw_info info;
11387
11388 info.indexed = true;
11389 info.count = maxDrawCount;
11390 info.indirect = buffer;
11391 info.indirect_offset = offset;
11392 info.count_buffer = count_buffer;
11393 info.count_buffer_offset = countBufferOffset;
11394 info.stride = stride;
11395 info.strmout_buffer = NULL;
11396 info.instance_count = 0;
11397
11398 if (!radv_before_draw(cmd_buffer, &info, 1, false))
11399 return;
11400 radv_emit_indirect_draw_packets(cmd_buffer, &info);
11401 radv_after_draw(cmd_buffer, false);
11402 }
11403
11404 VKAPI_ATTR void VKAPI_CALL
11405 radv_CmdDrawMeshTasksEXT(VkCommandBuffer commandBuffer, uint32_t x, uint32_t y, uint32_t z)
11406 {
11407 VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
11408 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
11409 struct radv_draw_info info;
11410
11411 info.count = x * y * z;
11412 info.instance_count = 1;
11413 info.first_instance = 0;
11414 info.stride = 0;
11415 info.indexed = false;
11416 info.strmout_buffer = NULL;
11417 info.count_buffer = NULL;
11418 info.indirect = NULL;
11419
11420 if (!radv_before_taskmesh_draw(cmd_buffer, &info, 1, false))
11421 return;
11422
11423 if (radv_cmdbuf_has_stage(cmd_buffer, MESA_SHADER_TASK)) {
11424 radv_emit_direct_taskmesh_draw_packets(device, &cmd_buffer->state, cmd_buffer->cs, cmd_buffer->gang.cs, x, y, z);
11425 } else {
11426 radv_emit_direct_mesh_draw_packet(cmd_buffer, x, y, z);
11427 }
11428
11429 radv_after_draw(cmd_buffer, false);
11430 }
11431
11432 VKAPI_ATTR void VKAPI_CALL
11433 radv_CmdDrawMeshTasksIndirectEXT(VkCommandBuffer commandBuffer, VkBuffer _buffer, VkDeviceSize offset,
11434 uint32_t drawCount, uint32_t stride)
11435 {
11436 if (!drawCount)
11437 return;
11438
11439 VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
11440 VK_FROM_HANDLE(radv_buffer, buffer, _buffer);
11441 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
11442 struct radv_draw_info info;
11443
11444 info.indirect = buffer;
11445 info.indirect_offset = offset;
11446 info.stride = stride;
11447 info.count = drawCount;
11448 info.strmout_buffer = NULL;
11449 info.count_buffer = NULL;
11450 info.indexed = false;
11451 info.instance_count = 0;
11452
11453 if (!radv_before_taskmesh_draw(cmd_buffer, &info, drawCount, false))
11454 return;
11455
11456 if (radv_cmdbuf_has_stage(cmd_buffer, MESA_SHADER_TASK)) {
11457 radv_emit_indirect_taskmesh_draw_packets(device, &cmd_buffer->state, cmd_buffer->cs, cmd_buffer->gang.cs, &info,
11458 0);
11459 } else {
11460 radv_emit_indirect_mesh_draw_packets(cmd_buffer, &info);
11461 }
11462
11463 radv_after_draw(cmd_buffer, false);
11464 }
11465
11466 VKAPI_ATTR void VKAPI_CALL
11467 radv_CmdDrawMeshTasksIndirectCountEXT(VkCommandBuffer commandBuffer, VkBuffer _buffer, VkDeviceSize offset,
11468 VkBuffer _countBuffer, VkDeviceSize countBufferOffset, uint32_t maxDrawCount,
11469 uint32_t stride)
11470 {
11471
11472 VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
11473 VK_FROM_HANDLE(radv_buffer, buffer, _buffer);
11474 VK_FROM_HANDLE(radv_buffer, count_buffer, _countBuffer);
11475 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
11476 const struct radv_physical_device *pdev = radv_device_physical(device);
11477 struct radv_draw_info info;
11478
11479 info.indirect = buffer;
11480 info.indirect_offset = offset;
11481 info.stride = stride;
11482 info.count = maxDrawCount;
11483 info.strmout_buffer = NULL;
11484 info.count_buffer = count_buffer;
11485 info.count_buffer_offset = countBufferOffset;
11486 info.indexed = false;
11487 info.instance_count = 0;
11488
11489 if (!radv_before_taskmesh_draw(cmd_buffer, &info, maxDrawCount, false))
11490 return;
11491
11492 if (radv_cmdbuf_has_stage(cmd_buffer, MESA_SHADER_TASK)) {
11493 uint64_t workaround_cond_va = 0;
11494
11495 if (pdev->info.has_taskmesh_indirect0_bug && info.count_buffer) {
11496 /* Allocate a 32-bit value for the MEC firmware bug workaround. */
11497 uint32_t workaround_cond_init = 0;
11498 uint32_t workaround_cond_off;
11499
11500 if (!radv_cmd_buffer_upload_data(cmd_buffer, 4, &workaround_cond_init, &workaround_cond_off))
11501 vk_command_buffer_set_error(&cmd_buffer->vk, VK_ERROR_OUT_OF_HOST_MEMORY);
11502
11503 workaround_cond_va = radv_buffer_get_va(cmd_buffer->upload.upload_bo) + workaround_cond_off;
11504 }
11505
11506 radv_emit_indirect_taskmesh_draw_packets(device, &cmd_buffer->state, cmd_buffer->cs, cmd_buffer->gang.cs, &info,
11507 workaround_cond_va);
11508 } else {
11509 radv_emit_indirect_mesh_draw_packets(cmd_buffer, &info);
11510 }
11511
11512 radv_after_draw(cmd_buffer, false);
11513 }
11514
11515 /* TODO: Use these functions with the normal dispatch path. */
11516 static void radv_dgc_before_dispatch(struct radv_cmd_buffer *cmd_buffer);
11517 static void radv_dgc_after_dispatch(struct radv_cmd_buffer *cmd_buffer);
11518
11519 VKAPI_ATTR void VKAPI_CALL
11520 radv_CmdPreprocessGeneratedCommandsNV(VkCommandBuffer commandBuffer,
11521 const VkGeneratedCommandsInfoNV *pGeneratedCommandsInfo)
11522 {
11523 VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
11524 VK_FROM_HANDLE(radv_indirect_command_layout, layout, pGeneratedCommandsInfo->indirectCommandsLayout);
11525 VK_FROM_HANDLE(radv_pipeline, pipeline, pGeneratedCommandsInfo->pipeline);
11526
11527 if (!radv_dgc_can_preprocess(layout, pipeline))
11528 return;
11529
11530 /* VK_EXT_conditional_rendering says that copy commands should not be
11531 * affected by conditional rendering.
11532 */
11533 const bool old_predicating = cmd_buffer->state.predicating;
11534 cmd_buffer->state.predicating = false;
11535
11536 radv_prepare_dgc(cmd_buffer, pGeneratedCommandsInfo, false);
11537
11538 /* Restore conditional rendering. */
11539 cmd_buffer->state.predicating = old_predicating;
11540 }
11541
11542 static void
11543 radv_dgc_execute_ib(struct radv_cmd_buffer *cmd_buffer, const VkGeneratedCommandsInfoNV *pGeneratedCommandsInfo)
11544 {
11545 VK_FROM_HANDLE(radv_buffer, prep_buffer, pGeneratedCommandsInfo->preprocessBuffer);
11546 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
11547 const bool has_task_shader = radv_dgc_with_task_shader(pGeneratedCommandsInfo);
11548
11549 const uint32_t cmdbuf_size = radv_get_indirect_gfx_cmdbuf_size(pGeneratedCommandsInfo);
11550 const uint64_t ib_va =
11551 radv_buffer_get_va(prep_buffer->bo) + prep_buffer->offset + pGeneratedCommandsInfo->preprocessOffset;
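/* Note: the generated cmdbuf size is presumably in bytes, so ">> 2" converts it to a DWORD
 * count before it is passed to cs_execute_ib() below. */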
11552
11553 device->ws->cs_execute_ib(cmd_buffer->cs, NULL, ib_va, cmdbuf_size >> 2, cmd_buffer->state.predicating);
11554
11555 if (has_task_shader) {
11556 const uint32_t ace_cmdbuf_size = radv_get_indirect_ace_cmdbuf_size(pGeneratedCommandsInfo);
11557 const uint64_t ace_ib_va = ib_va + radv_get_indirect_ace_cmdbuf_offset(pGeneratedCommandsInfo);
11558
11559 assert(cmd_buffer->gang.cs);
11560 device->ws->cs_execute_ib(cmd_buffer->gang.cs, NULL, ace_ib_va, ace_cmdbuf_size >> 2,
11561 cmd_buffer->state.predicating);
11562 }
11563 }
11564
11565 VKAPI_ATTR void VKAPI_CALL
11566 radv_CmdExecuteGeneratedCommandsNV(VkCommandBuffer commandBuffer, VkBool32 isPreprocessed,
11567 const VkGeneratedCommandsInfoNV *pGeneratedCommandsInfo)
11568 {
11569 VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
11570 VK_FROM_HANDLE(radv_indirect_command_layout, layout, pGeneratedCommandsInfo->indirectCommandsLayout);
11571 VK_FROM_HANDLE(radv_pipeline, pipeline, pGeneratedCommandsInfo->pipeline);
11572 VK_FROM_HANDLE(radv_buffer, prep_buffer, pGeneratedCommandsInfo->preprocessBuffer);
11573 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
11574 const bool compute = layout->pipeline_bind_point == VK_PIPELINE_BIND_POINT_COMPUTE;
11575 const bool use_predication = radv_use_dgc_predication(cmd_buffer, pGeneratedCommandsInfo);
11576 const struct radv_physical_device *pdev = radv_device_physical(device);
11577
11578 /* Secondary command buffers are needed for the full extension but can't use
11579 * PKT3_INDIRECT_BUFFER.
11580 */
11581 assert(cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY);
11582
11583 if (use_predication) {
11584 VK_FROM_HANDLE(radv_buffer, seq_count_buffer, pGeneratedCommandsInfo->sequencesCountBuffer);
11585 const uint64_t va = radv_buffer_get_va(seq_count_buffer->bo) + seq_count_buffer->offset +
11586 pGeneratedCommandsInfo->sequencesCountOffset;
11587
11588 radv_begin_conditional_rendering(cmd_buffer, va, true);
11589 }
11590
11591 if (!radv_dgc_can_preprocess(layout, pipeline)) {
11592 /* Suspend conditional rendering when the DGC execute is called on the compute queue to
11593 * generate a cmdbuf that skips dispatches when necessary. This is because the
11594 * compute queue is missing IB2 which means it's not possible to skip the cmdbuf entirely.
11595 * It should also be suspended when task shaders are used because the DGC ACE IB would be
11596 * uninitialized otherwise.
11597 */
11598 const bool suspend_cond_render =
11599 (cmd_buffer->qf == RADV_QUEUE_COMPUTE || radv_dgc_with_task_shader(pGeneratedCommandsInfo));
11600 const bool old_predicating = cmd_buffer->state.predicating;
11601
11602 if (suspend_cond_render && cmd_buffer->state.predicating) {
11603 cmd_buffer->state.predicating = false;
11604 }
11605
11606 radv_prepare_dgc(cmd_buffer, pGeneratedCommandsInfo, old_predicating);
11607
11608 if (suspend_cond_render) {
11609 cmd_buffer->state.predicating = old_predicating;
11610 }
11611
11612 cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_CS_PARTIAL_FLUSH | RADV_CMD_FLAG_INV_VCACHE | RADV_CMD_FLAG_INV_L2;
11613
11614 if (radv_dgc_with_task_shader(pGeneratedCommandsInfo)) {
11615 /* Make sure the DGC ACE IB will wait for the DGC prepare shader before the execution
11616 * starts.
11617 */
11618 radv_gang_barrier(cmd_buffer, VK_PIPELINE_STAGE_2_COMMAND_PREPROCESS_BIT_NV,
11619 VK_PIPELINE_STAGE_2_DRAW_INDIRECT_BIT);
11620 }
11621 }
11622
11623 if (compute) {
11624 radv_dgc_before_dispatch(cmd_buffer);
11625
11626 if (!pGeneratedCommandsInfo->pipeline)
11627 cmd_buffer->has_indirect_pipeline_binds = true;
11628 } else {
11629 struct radv_graphics_pipeline *graphics_pipeline = radv_pipeline_to_graphics(pipeline);
11630 struct radv_draw_info info;
11631
11632 info.count = pGeneratedCommandsInfo->sequencesCount;
11633 info.indirect = prep_buffer; /* We're not really going to use it this way, but it's a good
11634 signal that this is not direct. */
11635 info.indirect_offset = 0;
11636 info.stride = 0;
11637 info.strmout_buffer = NULL;
11638 info.count_buffer = NULL;
11639 info.indexed = layout->indexed;
11640 info.instance_count = 0;
11641
11642 if (radv_pipeline_has_stage(graphics_pipeline, MESA_SHADER_MESH)) {
11643 if (!radv_before_taskmesh_draw(cmd_buffer, &info, 1, true))
11644 return;
11645 } else {
11646 if (!radv_before_draw(cmd_buffer, &info, 1, true))
11647 return;
11648 }
11649 }
11650
11651 const uint32_t view_mask = cmd_buffer->state.render.view_mask;
11652
11653 if (!radv_cmd_buffer_uses_mec(cmd_buffer)) {
11654 radeon_emit(cmd_buffer->cs, PKT3(PKT3_PFP_SYNC_ME, 0, cmd_buffer->state.predicating));
11655 radeon_emit(cmd_buffer->cs, 0);
11656 }
11657
11658 radv_cs_add_buffer(device->ws, cmd_buffer->cs, prep_buffer->bo);
11659
11660 if (compute || !view_mask) {
11661 radv_dgc_execute_ib(cmd_buffer, pGeneratedCommandsInfo);
11662 } else {
11663 u_foreach_bit (view, view_mask) {
11664 radv_emit_view_index(&cmd_buffer->state, cmd_buffer->cs, view);
11665
11666 radv_dgc_execute_ib(cmd_buffer, pGeneratedCommandsInfo);
11667 }
11668 }
11669
11670 if (compute) {
11671 cmd_buffer->push_constant_stages |= VK_SHADER_STAGE_COMPUTE_BIT;
11672
11673 if (!pGeneratedCommandsInfo->pipeline)
11674 radv_mark_descriptor_sets_dirty(cmd_buffer, VK_PIPELINE_BIND_POINT_COMPUTE);
11675
11676 radv_dgc_after_dispatch(cmd_buffer);
11677 } else {
11678 struct radv_graphics_pipeline *graphics_pipeline = radv_pipeline_to_graphics(pipeline);
11679
11680 if (layout->binds_index_buffer) {
11681 cmd_buffer->state.last_index_type = -1;
11682 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_INDEX_BUFFER;
11683 }
11684
11685 if (layout->bind_vbo_mask)
11686 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_VERTEX_BUFFER;
11687
11688 cmd_buffer->push_constant_stages |= graphics_pipeline->active_stages;
11689
11690 if (!layout->indexed && pdev->info.gfx_level >= GFX7) {
11691 /* On GFX7 and later, non-indexed draws overwrite VGT_INDEX_TYPE, so the state must be
11692 * re-emitted before the next indexed draw.
11693 */
11694 cmd_buffer->state.last_index_type = -1;
11695 }
11696
11697 cmd_buffer->state.last_num_instances = -1;
11698 cmd_buffer->state.last_vertex_offset_valid = false;
11699 cmd_buffer->state.last_first_instance = -1;
11700 cmd_buffer->state.last_drawid = -1;
11701
11702 radv_after_draw(cmd_buffer, true);
11703 }
11704
11705 if (use_predication) {
11706 radv_end_conditional_rendering(cmd_buffer);
11707 }
11708 }
11709
11710 static void
11711 radv_save_dispatch_size(struct radv_cmd_buffer *cmd_buffer, uint64_t indirect_va)
11712 {
11713 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
11714
11715 struct radeon_cmdbuf *cs = cmd_buffer->cs;
11716 radeon_check_space(device->ws, cs, 18);
11717
11718 uint64_t va = radv_buffer_get_va(device->trace_bo) + offsetof(struct radv_trace_data, indirect_dispatch);
11719
11720 for (uint32_t i = 0; i < 3; i++) {
11721 radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
11722 radeon_emit(cs,
11723 COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) | COPY_DATA_WR_CONFIRM);
11724 radeon_emit(cs, indirect_va);
11725 radeon_emit(cs, indirect_va >> 32);
11726 radeon_emit(cs, va);
11727 radeon_emit(cs, va >> 32);
11728
11729 indirect_va += 4;
11730 va += 4;
11731 }
11732 }
11733
11734 static void
11735 radv_emit_dispatch_packets(struct radv_cmd_buffer *cmd_buffer, const struct radv_shader *compute_shader,
11736 const struct radv_dispatch_info *info)
11737 {
11738 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
11739 const struct radv_physical_device *pdev = radv_device_physical(device);
11740 unsigned dispatch_initiator = device->dispatch_initiator;
11741 struct radeon_winsys *ws = device->ws;
11742 bool predicating = cmd_buffer->state.predicating;
11743 struct radeon_cmdbuf *cs = cmd_buffer->cs;
11744 const uint32_t grid_size_offset = radv_get_user_sgpr_loc(compute_shader, AC_UD_CS_GRID_SIZE);
11745
11746 radv_describe_dispatch(cmd_buffer, info);
11747
11748 ASSERTED unsigned cdw_max = radeon_check_space(ws, cs, 30);
11749
11750 if (compute_shader->info.wave_size == 32) {
11751 assert(pdev->info.gfx_level >= GFX10);
11752 dispatch_initiator |= S_00B800_CS_W32_EN(1);
11753 }
11754
11755 if (info->ordered)
11756 dispatch_initiator &= ~S_00B800_ORDER_MODE(1);
11757
11758 if (info->va) {
11759 if (radv_device_fault_detection_enabled(device))
11760 radv_save_dispatch_size(cmd_buffer, info->va);
11761
11762 if (info->indirect)
11763 radv_cs_add_buffer(ws, cs, info->indirect);
11764
11765 if (info->unaligned) {
11766 radeon_set_sh_reg_seq(cs, R_00B81C_COMPUTE_NUM_THREAD_X, 3);
11767 if (pdev->info.gfx_level >= GFX12) {
11768 radeon_emit(cs, S_00B81C_NUM_THREAD_FULL_GFX12(compute_shader->info.cs.block_size[0]));
11769 radeon_emit(cs, S_00B820_NUM_THREAD_FULL_GFX12(compute_shader->info.cs.block_size[1]));
11770 } else {
11771 radeon_emit(cs, S_00B81C_NUM_THREAD_FULL_GFX6(compute_shader->info.cs.block_size[0]));
11772 radeon_emit(cs, S_00B820_NUM_THREAD_FULL_GFX6(compute_shader->info.cs.block_size[1]));
11773 }
11774 radeon_emit(cs, S_00B824_NUM_THREAD_FULL(compute_shader->info.cs.block_size[2]));
11775
11776 dispatch_initiator |= S_00B800_USE_THREAD_DIMENSIONS(1);
11777 }
11778
11779 if (grid_size_offset) {
11780 if (device->load_grid_size_from_user_sgpr) {
11781 assert(pdev->info.gfx_level >= GFX10_3);
11782 radeon_emit(cs, PKT3(PKT3_LOAD_SH_REG_INDEX, 3, 0));
11783 radeon_emit(cs, info->va);
11784 radeon_emit(cs, info->va >> 32);
11785 radeon_emit(cs, (grid_size_offset - SI_SH_REG_OFFSET) >> 2);
11786 radeon_emit(cs, 3);
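/* The LOAD_SH_REG_INDEX packet above reads 3 consecutive DWORDs (the x/y/z grid size) from
 * memory at info->va into the grid-size user SGPRs; the destination is encoded as a DWORD
 * index relative to SI_SH_REG_OFFSET, hence "(grid_size_offset - SI_SH_REG_OFFSET) >> 2". */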
11787 } else {
11788 radv_emit_shader_pointer(device, cmd_buffer->cs, grid_size_offset, info->va, true);
11789 }
11790 }
11791
11792 if (radv_cmd_buffer_uses_mec(cmd_buffer)) {
11793 uint64_t indirect_va = info->va;
11794 const bool needs_align32_workaround = pdev->info.has_async_compute_align32_bug &&
11795 cmd_buffer->qf == RADV_QUEUE_COMPUTE &&
11796 !util_is_aligned(indirect_va, 32);
11797 const unsigned ace_predication_size =
11798 4 /* DISPATCH_INDIRECT */ + (needs_align32_workaround ? 6 * 3 /* 3x COPY_DATA */ : 0);
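/* The predication size is counted in DWORDs: 4 for the DISPATCH_INDIRECT packet, plus 3 x 6
 * for the COPY_DATA packets emitted by the align32 workaround when it is needed. */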
11799
11800 radv_cs_emit_compute_predication(device, &cmd_buffer->state, cs, cmd_buffer->state.mec_inv_pred_va,
11801 &cmd_buffer->state.mec_inv_pred_emitted, ace_predication_size);
11802
11803 if (needs_align32_workaround) {
11804 const uint64_t unaligned_va = indirect_va;
11805 UNUSED void *ptr;
11806 uint32_t offset;
11807
11808 if (!radv_cmd_buffer_upload_alloc_aligned(cmd_buffer, sizeof(VkDispatchIndirectCommand), 32, &offset, &ptr))
11809 return;
11810
11811 indirect_va = radv_buffer_get_va(cmd_buffer->upload.upload_bo) + offset;
11812
11813 for (uint32_t i = 0; i < 3; i++) {
11814 const uint64_t src_va = unaligned_va + i * 4;
11815 const uint64_t dst_va = indirect_va + i * 4;
11816
11817 radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
11818 radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) |
11819 COPY_DATA_WR_CONFIRM);
11820 radeon_emit(cs, src_va);
11821 radeon_emit(cs, src_va >> 32);
11822 radeon_emit(cs, dst_va);
11823 radeon_emit(cs, dst_va >> 32);
11824 }
11825 }
11826
11827 radeon_emit(cs, PKT3(PKT3_DISPATCH_INDIRECT, 2, 0) | PKT3_SHADER_TYPE_S(1));
11828 radeon_emit(cs, indirect_va);
11829 radeon_emit(cs, indirect_va >> 32);
11830 radeon_emit(cs, dispatch_initiator);
11831 } else {
11832 radeon_emit(cs, PKT3(PKT3_SET_BASE, 2, 0) | PKT3_SHADER_TYPE_S(1));
11833 radeon_emit(cs, 1);
11834 radeon_emit(cs, info->va);
11835 radeon_emit(cs, info->va >> 32);
11836
11837 if (cmd_buffer->qf == RADV_QUEUE_COMPUTE) {
11838 radv_cs_emit_compute_predication(device, &cmd_buffer->state, cs, cmd_buffer->state.mec_inv_pred_va,
11839 &cmd_buffer->state.mec_inv_pred_emitted, 3 /* PKT3_DISPATCH_INDIRECT */);
11840 predicating = false;
11841 }
11842
11843 radeon_emit(cs, PKT3(PKT3_DISPATCH_INDIRECT, 1, predicating) | PKT3_SHADER_TYPE_S(1));
11844 radeon_emit(cs, 0);
11845 radeon_emit(cs, dispatch_initiator);
11846 }
11847 } else {
11848 const unsigned *cs_block_size = compute_shader->info.cs.block_size;
11849 unsigned blocks[3] = {info->blocks[0], info->blocks[1], info->blocks[2]};
11850 unsigned offsets[3] = {info->offsets[0], info->offsets[1], info->offsets[2]};
11851
11852 if (info->unaligned) {
11853 unsigned remainder[3];
11854
11855 /* If aligned, these should be an entire block size,
11856 * not 0.
11857 */
11858 remainder[0] = blocks[0] + cs_block_size[0] - ALIGN_NPOT(blocks[0], cs_block_size[0]);
11859 remainder[1] = blocks[1] + cs_block_size[1] - ALIGN_NPOT(blocks[1], cs_block_size[1]);
11860 remainder[2] = blocks[2] + cs_block_size[2] - ALIGN_NPOT(blocks[2], cs_block_size[2]);
11861
11862 blocks[0] = DIV_ROUND_UP(blocks[0], cs_block_size[0]);
11863 blocks[1] = DIV_ROUND_UP(blocks[1], cs_block_size[1]);
11864 blocks[2] = DIV_ROUND_UP(blocks[2], cs_block_size[2]);
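/* Worked example with hypothetical numbers: for blocks[0] = 70 and cs_block_size[0] = 64,
 * ALIGN_NPOT(70, 64) = 128, so remainder[0] = 70 + 64 - 128 = 6 and blocks[0] becomes
 * DIV_ROUND_UP(70, 64) = 2, i.e. two thread groups along X with a partial last group of 6
 * threads. If blocks[0] were already a multiple of the block size, remainder[0] would be a
 * full 64, as noted above. */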
11865
11866 for (unsigned i = 0; i < 3; ++i) {
11867 assert(offsets[i] % cs_block_size[i] == 0);
11868 offsets[i] /= cs_block_size[i];
11869 }
11870
11871 radeon_set_sh_reg_seq(cs, R_00B81C_COMPUTE_NUM_THREAD_X, 3);
11872 if (pdev->info.gfx_level >= GFX12) {
11873 radeon_emit(cs,
11874 S_00B81C_NUM_THREAD_FULL_GFX12(cs_block_size[0]) | S_00B81C_NUM_THREAD_PARTIAL(remainder[0]));
11875 radeon_emit(cs,
11876 S_00B820_NUM_THREAD_FULL_GFX12(cs_block_size[1]) | S_00B820_NUM_THREAD_PARTIAL(remainder[1]));
11877 } else {
11878 radeon_emit(cs,
11879 S_00B81C_NUM_THREAD_FULL_GFX6(cs_block_size[0]) | S_00B81C_NUM_THREAD_PARTIAL(remainder[0]));
11880 radeon_emit(cs,
11881 S_00B820_NUM_THREAD_FULL_GFX6(cs_block_size[1]) | S_00B820_NUM_THREAD_PARTIAL(remainder[1]));
11882 }
11883 radeon_emit(cs, S_00B824_NUM_THREAD_FULL(cs_block_size[2]) | S_00B824_NUM_THREAD_PARTIAL(remainder[2]));
11884
11885 dispatch_initiator |= S_00B800_PARTIAL_TG_EN(1);
11886 }
11887
11888 if (grid_size_offset) {
11889 if (device->load_grid_size_from_user_sgpr) {
11890 radeon_set_sh_reg_seq(cs, grid_size_offset, 3);
11891 radeon_emit(cs, blocks[0]);
11892 radeon_emit(cs, blocks[1]);
11893 radeon_emit(cs, blocks[2]);
11894 } else {
11895 uint32_t offset;
11896 if (!radv_cmd_buffer_upload_data(cmd_buffer, 12, blocks, &offset))
11897 return;
11898
11899 uint64_t va = radv_buffer_get_va(cmd_buffer->upload.upload_bo) + offset;
11900 radv_emit_shader_pointer(device, cmd_buffer->cs, grid_size_offset, va, true);
11901 }
11902 }
11903
11904 if (offsets[0] || offsets[1] || offsets[2]) {
11905 radeon_set_sh_reg_seq(cs, R_00B810_COMPUTE_START_X, 3);
11906 radeon_emit(cs, offsets[0]);
11907 radeon_emit(cs, offsets[1]);
11908 radeon_emit(cs, offsets[2]);
11909
11910 /* The blocks in the packet are not counts but end values. */
11911 for (unsigned i = 0; i < 3; ++i)
11912 blocks[i] += offsets[i];
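/* Example with hypothetical values: a vkCmdDispatchBase() with base {16, 0, 0} and group
 * count {4, 1, 1} programs COMPUTE_START_X = 16 and an end value of 20, i.e. groups 16..19
 * are executed. */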
11913 } else {
11914 dispatch_initiator |= S_00B800_FORCE_START_AT_000(1);
11915 }
11916
11917 if (cmd_buffer->qf == RADV_QUEUE_COMPUTE) {
11918 radv_cs_emit_compute_predication(device, &cmd_buffer->state, cs, cmd_buffer->state.mec_inv_pred_va,
11919 &cmd_buffer->state.mec_inv_pred_emitted, 5 /* DISPATCH_DIRECT size */);
11920 predicating = false;
11921 }
11922
11923 if (pdev->info.has_async_compute_threadgroup_bug && cmd_buffer->qf == RADV_QUEUE_COMPUTE) {
11924 for (unsigned i = 0; i < 3; i++) {
11925 if (info->unaligned) {
11926 /* info->blocks is already in thread dimensions for unaligned dispatches. */
11927 blocks[i] = info->blocks[i];
11928 } else {
11929 /* Force the async compute dispatch to be in "thread" dim mode to work around a hw bug. */
11930 blocks[i] *= cs_block_size[i];
11931 }
11932
11933 dispatch_initiator |= S_00B800_USE_THREAD_DIMENSIONS(1);
11934 }
11935 }
11936
11937 radeon_emit(cs, PKT3(PKT3_DISPATCH_DIRECT, 3, predicating) | PKT3_SHADER_TYPE_S(1));
11938 radeon_emit(cs, blocks[0]);
11939 radeon_emit(cs, blocks[1]);
11940 radeon_emit(cs, blocks[2]);
11941 radeon_emit(cs, dispatch_initiator);
11942 }
11943
11944 assert(cmd_buffer->cs->cdw <= cdw_max);
11945 }
11946
11947 static void
11948 radv_upload_compute_shader_descriptors(struct radv_cmd_buffer *cmd_buffer, VkPipelineBindPoint bind_point)
11949 {
11950 radv_flush_descriptors(cmd_buffer, VK_SHADER_STAGE_COMPUTE_BIT, bind_point);
11951 const VkShaderStageFlags stages =
11952 bind_point == VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR ? RADV_RT_STAGE_BITS : VK_SHADER_STAGE_COMPUTE_BIT;
11953 const VkShaderStageFlags pc_stages = radv_must_flush_constants(cmd_buffer, stages, bind_point);
11954 if (pc_stages)
11955 radv_flush_constants(cmd_buffer, pc_stages, bind_point);
11956 }
11957
11958 static void
11959 radv_emit_rt_stack_size(struct radv_cmd_buffer *cmd_buffer)
11960 {
11961 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
11962 const struct radv_shader *rt_prolog = cmd_buffer->state.rt_prolog;
11963 unsigned rsrc2 = rt_prolog->config.rsrc2;
11964 if (cmd_buffer->state.rt_stack_size)
11965 rsrc2 |= S_00B12C_SCRATCH_EN(1);
11966
11967 radeon_check_space(device->ws, cmd_buffer->cs, 3);
11968 radeon_set_sh_reg(cmd_buffer->cs, rt_prolog->info.regs.pgm_rsrc2, rsrc2);
11969 }
11970
11971 static void
11972 radv_dispatch(struct radv_cmd_buffer *cmd_buffer, const struct radv_dispatch_info *info,
11973 struct radv_compute_pipeline *pipeline, struct radv_shader *compute_shader,
11974 VkPipelineBindPoint bind_point)
11975 {
11976 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
11977 const struct radv_physical_device *pdev = radv_device_physical(device);
11978 bool has_prefetch = pdev->info.gfx_level >= GFX7;
11979 bool pipeline_is_dirty = pipeline != cmd_buffer->state.emitted_compute_pipeline;
11980
11981 if (compute_shader->info.cs.regalloc_hang_bug)
11982 cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_PS_PARTIAL_FLUSH | RADV_CMD_FLAG_CS_PARTIAL_FLUSH;
11983
11984 if (cmd_buffer->state.flush_bits & (RADV_CMD_FLAG_FLUSH_AND_INV_CB | RADV_CMD_FLAG_FLUSH_AND_INV_DB |
11985 RADV_CMD_FLAG_PS_PARTIAL_FLUSH | RADV_CMD_FLAG_CS_PARTIAL_FLUSH)) {
11986 /* If we have to wait for idle, set all states first, so that
11987 * all SET packets are processed in parallel with previous draw
11988 * calls. Then upload descriptors, set shader pointers,
11989 * dispatch, and prefetch at the end. This ensures that the
11990 * time the CUs are idle is very short. (there are only SET_SH
11991 * packets between the wait and the draw)
11992 */
11993 radv_emit_compute_pipeline(cmd_buffer, pipeline);
11994 if (bind_point == VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR)
11995 radv_emit_rt_stack_size(cmd_buffer);
11996 radv_emit_cache_flush(cmd_buffer);
11997 /* <-- CUs are idle here --> */
11998
11999 radv_upload_compute_shader_descriptors(cmd_buffer, bind_point);
12000
12001 radv_emit_dispatch_packets(cmd_buffer, compute_shader, info);
12002 /* <-- CUs are busy here --> */
12003
12004 /* Start prefetches after the dispatch has been started. Both
12005 * will run in parallel, but starting the dispatch first is
12006 * more important.
12007 */
12008 if (has_prefetch && pipeline_is_dirty) {
12009 radv_emit_shader_prefetch(cmd_buffer, compute_shader);
12010 }
12011 } else {
12012 /* If we don't wait for idle, start prefetches first, then set
12013 * states, and dispatch at the end.
12014 */
12015 radv_emit_cache_flush(cmd_buffer);
12016
12017 if (has_prefetch && pipeline_is_dirty) {
12018 radv_emit_shader_prefetch(cmd_buffer, compute_shader);
12019 }
12020
12021 radv_upload_compute_shader_descriptors(cmd_buffer, bind_point);
12022
12023 radv_emit_compute_pipeline(cmd_buffer, pipeline);
12024 if (bind_point == VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR)
12025 radv_emit_rt_stack_size(cmd_buffer);
12026 radv_emit_dispatch_packets(cmd_buffer, compute_shader, info);
12027 }
12028
12029 if (pipeline_is_dirty) {
12030 /* Raytracing uses compute shaders but has separate bind points and pipelines.
12031 * So if we set compute userdata & shader registers we should dirty the raytracing
12032 * ones and the other way around.
12033 *
12034 * We only need to do this when the pipeline is dirty because when we switch between
12035 * the two we always need to switch pipelines.
12036 */
12037 radv_mark_descriptor_sets_dirty(cmd_buffer, bind_point == VK_PIPELINE_BIND_POINT_COMPUTE
12038 ? VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR
12039 : VK_PIPELINE_BIND_POINT_COMPUTE);
12040 }
12041
12042 if (compute_shader->info.cs.regalloc_hang_bug)
12043 cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_CS_PARTIAL_FLUSH;
12044
12045 radv_cmd_buffer_after_draw(cmd_buffer, RADV_CMD_FLAG_CS_PARTIAL_FLUSH, false);
12046 }
12047
12048 static void
12049 radv_dgc_before_dispatch(struct radv_cmd_buffer *cmd_buffer)
12050 {
12051 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
12052 const struct radv_physical_device *pdev = radv_device_physical(device);
12053 struct radv_compute_pipeline *pipeline = cmd_buffer->state.compute_pipeline;
12054 struct radv_shader *compute_shader = cmd_buffer->state.shaders[MESA_SHADER_COMPUTE];
12055 bool pipeline_is_dirty = pipeline != cmd_buffer->state.emitted_compute_pipeline;
12056
12057 /* We will have run the DGC patch shaders before, so we can assume that there is something to
12058 * flush. Otherwise, this just splits radv_dispatch in two: one pre-dispatch part and one
12059 * post-dispatch part. */
12060
12061 if (compute_shader->info.cs.regalloc_hang_bug)
12062 cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_PS_PARTIAL_FLUSH | RADV_CMD_FLAG_CS_PARTIAL_FLUSH;
12063
12064 if (pipeline)
12065 radv_emit_compute_pipeline(cmd_buffer, pipeline);
12066 radv_emit_cache_flush(cmd_buffer);
12067
12068 radv_upload_compute_shader_descriptors(cmd_buffer, VK_PIPELINE_BIND_POINT_COMPUTE);
12069
12070 if (pipeline_is_dirty) {
12071 const bool has_prefetch = pdev->info.gfx_level >= GFX7;
12072
12073 if (has_prefetch)
12074 radv_emit_shader_prefetch(cmd_buffer, compute_shader);
12075
12076 /* Raytracing uses compute shaders but has separate bind points and pipelines.
12077 * So if we set compute userdata & shader registers we should dirty the raytracing
12078 * ones and the other way around.
12079 *
12080 * We only need to do this when the pipeline is dirty because when we switch between
12081 * the two we always need to switch pipelines.
12082 */
12083 radv_mark_descriptor_sets_dirty(cmd_buffer, VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR);
12084 }
12085 }
12086
12087 static void
12088 radv_dgc_after_dispatch(struct radv_cmd_buffer *cmd_buffer)
12089 {
12090 struct radv_shader *compute_shader = cmd_buffer->state.shaders[MESA_SHADER_COMPUTE];
12091
12092 if (compute_shader->info.cs.regalloc_hang_bug)
12093 cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_CS_PARTIAL_FLUSH;
12094
12095 radv_cmd_buffer_after_draw(cmd_buffer, RADV_CMD_FLAG_CS_PARTIAL_FLUSH, true);
12096 }
12097
12098 void
12099 radv_compute_dispatch(struct radv_cmd_buffer *cmd_buffer, const struct radv_dispatch_info *info)
12100 {
12101 radv_dispatch(cmd_buffer, info, cmd_buffer->state.compute_pipeline, cmd_buffer->state.shaders[MESA_SHADER_COMPUTE],
12102 VK_PIPELINE_BIND_POINT_COMPUTE);
12103 }
12104
12105 VKAPI_ATTR void VKAPI_CALL
12106 radv_CmdDispatchBase(VkCommandBuffer commandBuffer, uint32_t base_x, uint32_t base_y, uint32_t base_z, uint32_t x,
12107 uint32_t y, uint32_t z)
12108 {
12109 VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
12110 struct radv_dispatch_info info = {0};
12111
12112 info.blocks[0] = x;
12113 info.blocks[1] = y;
12114 info.blocks[2] = z;
12115
12116 info.offsets[0] = base_x;
12117 info.offsets[1] = base_y;
12118 info.offsets[2] = base_z;
12119 radv_compute_dispatch(cmd_buffer, &info);
12120 }
12121
12122 VKAPI_ATTR void VKAPI_CALL
12123 radv_CmdDispatchIndirect(VkCommandBuffer commandBuffer, VkBuffer _buffer, VkDeviceSize offset)
12124 {
12125 VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
12126 VK_FROM_HANDLE(radv_buffer, buffer, _buffer);
12127 struct radv_dispatch_info info = {0};
12128
12129 info.indirect = buffer->bo;
12130 info.va = radv_buffer_get_va(buffer->bo) + buffer->offset + offset;
12131
12132 radv_compute_dispatch(cmd_buffer, &info);
12133 }
12134
12135 void
12136 radv_unaligned_dispatch(struct radv_cmd_buffer *cmd_buffer, uint32_t x, uint32_t y, uint32_t z)
12137 {
12138 struct radv_dispatch_info info = {0};
12139
12140 info.blocks[0] = x;
12141 info.blocks[1] = y;
12142 info.blocks[2] = z;
12143 info.unaligned = 1;
12144
12145 radv_compute_dispatch(cmd_buffer, &info);
12146 }
12147
12148 void
12149 radv_indirect_dispatch(struct radv_cmd_buffer *cmd_buffer, struct radeon_winsys_bo *bo, uint64_t va)
12150 {
12151 struct radv_dispatch_info info = {0};
12152
12153 info.indirect = bo;
12154 info.va = va;
12155
12156 radv_compute_dispatch(cmd_buffer, &info);
12157 }
12158
12159 static void
12160 radv_trace_trace_rays(struct radv_cmd_buffer *cmd_buffer, const VkTraceRaysIndirectCommand2KHR *cmd,
12161 uint64_t indirect_va)
12162 {
12163 if (!cmd || indirect_va)
12164 return;
12165
12166 struct radv_rra_ray_history_data *data = malloc(sizeof(struct radv_rra_ray_history_data));
12167 if (!data)
12168 return;
12169
12170 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
12171 uint32_t width = DIV_ROUND_UP(cmd->width, device->rra_trace.ray_history_resolution_scale);
12172 uint32_t height = DIV_ROUND_UP(cmd->height, device->rra_trace.ray_history_resolution_scale);
12173 uint32_t depth = DIV_ROUND_UP(cmd->depth, device->rra_trace.ray_history_resolution_scale);
12174
12175 struct radv_rra_ray_history_counter counter = {
12176 .dispatch_size = {width, height, depth},
12177 .hit_shader_count = cmd->hitShaderBindingTableSize / cmd->hitShaderBindingTableStride,
12178 .miss_shader_count = cmd->missShaderBindingTableSize / cmd->missShaderBindingTableStride,
12179 .shader_count = cmd_buffer->state.rt_pipeline->stage_count,
12180 .pipeline_api_hash = cmd_buffer->state.rt_pipeline->base.base.pipeline_hash,
12181 .mode = 1,
12182 .stride = sizeof(uint32_t),
12183 .data_size = 0,
12184 .ray_id_begin = 0,
12185 .ray_id_end = 0xFFFFFFFF,
12186 .pipeline_type = RADV_RRA_PIPELINE_RAY_TRACING,
12187 };
12188
12189 struct radv_rra_ray_history_dispatch_size dispatch_size = {
12190 .size = {width, height, depth},
12191 };
12192
12193 struct radv_rra_ray_history_traversal_flags traversal_flags = {0};
12194
12195 data->metadata = (struct radv_rra_ray_history_metadata){
12196 .counter_info.type = RADV_RRA_COUNTER_INFO,
12197 .counter_info.size = sizeof(struct radv_rra_ray_history_counter),
12198 .counter = counter,
12199
12200 .dispatch_size_info.type = RADV_RRA_DISPATCH_SIZE,
12201 .dispatch_size_info.size = sizeof(struct radv_rra_ray_history_dispatch_size),
12202 .dispatch_size = dispatch_size,
12203
12204 .traversal_flags_info.type = RADV_RRA_TRAVERSAL_FLAGS,
12205 .traversal_flags_info.size = sizeof(struct radv_rra_ray_history_traversal_flags),
12206 .traversal_flags = traversal_flags,
12207 };
12208
12209 uint32_t dispatch_index = util_dynarray_num_elements(&cmd_buffer->ray_history, struct radv_rra_ray_history_data *)
12210 << 16;
12211
12212 util_dynarray_append(&cmd_buffer->ray_history, struct radv_rra_ray_history_data *, data);
12213
12214 cmd_buffer->state.flush_bits |=
12215 RADV_CMD_FLAG_INV_SCACHE | RADV_CMD_FLAG_CS_PARTIAL_FLUSH |
12216 radv_src_access_flush(cmd_buffer, VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT, VK_ACCESS_2_SHADER_WRITE_BIT, NULL) |
12217 radv_dst_access_flush(cmd_buffer, VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT, VK_ACCESS_2_SHADER_READ_BIT, NULL);
12218
12219 radv_update_buffer_cp(cmd_buffer,
12220 device->rra_trace.ray_history_addr + offsetof(struct radv_ray_history_header, dispatch_index),
12221 &dispatch_index, sizeof(dispatch_index));
12222 }
12223
12224 enum radv_rt_mode {
12225 radv_rt_mode_direct,
12226 radv_rt_mode_indirect,
12227 radv_rt_mode_indirect2,
12228 };
12229
12230 static void
12231 radv_upload_trace_rays_params(struct radv_cmd_buffer *cmd_buffer, VkTraceRaysIndirectCommand2KHR *tables,
12232 enum radv_rt_mode mode, uint64_t *launch_size_va, uint64_t *sbt_va)
12233 {
12234 uint32_t upload_size = mode == radv_rt_mode_direct ? sizeof(VkTraceRaysIndirectCommand2KHR)
12235 : offsetof(VkTraceRaysIndirectCommand2KHR, width);
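/* In direct mode the whole VkTraceRaysIndirectCommand2KHR (SBT regions plus width/height/depth)
 * is uploaded so that launch_size_va can point at the copied width field; in the indirect
 * modes only the SBT portion before "width" is needed because the launch size lives at the
 * caller-provided indirect VA. */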
12236
12237 uint32_t offset;
12238 if (!radv_cmd_buffer_upload_data(cmd_buffer, upload_size, tables, &offset))
12239 return;
12240
12241 uint64_t upload_va = radv_buffer_get_va(cmd_buffer->upload.upload_bo) + offset;
12242
12243 if (mode == radv_rt_mode_direct)
12244 *launch_size_va = upload_va + offsetof(VkTraceRaysIndirectCommand2KHR, width);
12245 if (sbt_va)
12246 *sbt_va = upload_va;
12247 }
12248
12249 static void
12250 radv_trace_rays(struct radv_cmd_buffer *cmd_buffer, VkTraceRaysIndirectCommand2KHR *tables, uint64_t indirect_va,
12251 enum radv_rt_mode mode)
12252 {
12253 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
12254 const struct radv_physical_device *pdev = radv_device_physical(device);
12255 const struct radv_instance *instance = radv_physical_device_instance(pdev);
12256
12257 if (instance->debug_flags & RADV_DEBUG_NO_RT)
12258 return;
12259
12260 if (unlikely(device->rra_trace.ray_history_buffer))
12261 radv_trace_trace_rays(cmd_buffer, tables, indirect_va);
12262
12263 struct radv_compute_pipeline *pipeline = &cmd_buffer->state.rt_pipeline->base;
12264 struct radv_shader *rt_prolog = cmd_buffer->state.rt_prolog;
12265
12266 /* Reserve scratch for stacks manually since it is not handled by the compute path. */
12267 uint32_t scratch_bytes_per_wave = rt_prolog->config.scratch_bytes_per_wave;
12268 uint32_t wave_size = rt_prolog->info.wave_size;
12269
12270 /* The hardware register is specified as a multiple of 64 or 256 DWORDS. */
12271 unsigned scratch_alloc_granule = pdev->info.gfx_level >= GFX11 ? 256 : 1024;
12272 scratch_bytes_per_wave += align(cmd_buffer->state.rt_stack_size * wave_size, scratch_alloc_granule);
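/* Rough example with hypothetical numbers: a 1 KiB per-lane RT stack and wave64 add
 * align(1024 * 64, scratch_alloc_granule) = 64 KiB of scratch per wave on top of what the
 * prolog itself requested. */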
12273
12274 cmd_buffer->compute_scratch_size_per_wave_needed =
12275 MAX2(cmd_buffer->compute_scratch_size_per_wave_needed, scratch_bytes_per_wave);
12276
12277 /* Since the workgroup size is 8x4 (or 8x8), 1D dispatches can only fill 8 threads per wave at most. To increase
12278 * occupancy, it's beneficial to convert to a 2D dispatch in these cases. */
12279 if (tables && tables->height == 1 && tables->width >= cmd_buffer->state.rt_prolog->info.cs.block_size[0])
12280 tables->height = ACO_RT_CONVERTED_2D_LAUNCH_SIZE;
12281
12282 struct radv_dispatch_info info = {0};
12283 info.unaligned = true;
12284
12285 uint64_t launch_size_va = 0;
12286 uint64_t sbt_va = 0;
12287
12288 if (mode != radv_rt_mode_indirect2) {
12289 launch_size_va = indirect_va;
12290 radv_upload_trace_rays_params(cmd_buffer, tables, mode, &launch_size_va, &sbt_va);
12291 } else {
12292 launch_size_va = indirect_va + offsetof(VkTraceRaysIndirectCommand2KHR, width);
12293 sbt_va = indirect_va;
12294 }
12295
12296 uint32_t remaining_ray_count = 0;
12297
12298 if (mode == radv_rt_mode_direct) {
12299 info.blocks[0] = tables->width;
12300 info.blocks[1] = tables->height;
12301 info.blocks[2] = tables->depth;
12302
12303 if (tables->height == ACO_RT_CONVERTED_2D_LAUNCH_SIZE) {
12304 /* We need the ray count for the 2D dispatch to be a multiple of the y block size for the division to work, and
12305 * a multiple of the x block size because the invocation offset must be a multiple of the block size when
12306 * dispatching the remaining rays. Fortunately, the x block size is itself a multiple of the y block size, so
12307 * we only need to ensure that the ray count is a multiple of the x block size. */
12308 remaining_ray_count = tables->width % rt_prolog->info.cs.block_size[0];
12309
12310 uint32_t ray_count = tables->width - remaining_ray_count;
12311 info.blocks[0] = ray_count / rt_prolog->info.cs.block_size[1];
12312 info.blocks[1] = rt_prolog->info.cs.block_size[1];
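/* Worked example, assuming a hypothetical block size of {64, 8}: a 1000x1x1 direct launch
 * gives remaining_ray_count = 1000 % 64 = 40, so this dispatch covers 960 rays as a 120x8
 * grid (blocks[0] = 960 / 8, blocks[1] = 8) and the 40 leftover rays are handled by the
 * second dispatch below with offsets[0] = 960. */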
12313 }
12314 } else
12315 info.va = launch_size_va;
12316
12317 ASSERTED unsigned cdw_max = radeon_check_space(device->ws, cmd_buffer->cs, 15);
12318
12319 const uint32_t sbt_descriptors_offset = radv_get_user_sgpr_loc(rt_prolog, AC_UD_CS_SBT_DESCRIPTORS);
12320 if (sbt_descriptors_offset) {
12321 radv_emit_shader_pointer(device, cmd_buffer->cs, sbt_descriptors_offset, sbt_va, true);
12322 }
12323
12324 const uint32_t ray_launch_size_addr_offset = radv_get_user_sgpr_loc(rt_prolog, AC_UD_CS_RAY_LAUNCH_SIZE_ADDR);
12325 if (ray_launch_size_addr_offset) {
12326 radv_emit_shader_pointer(device, cmd_buffer->cs, ray_launch_size_addr_offset, launch_size_va, true);
12327 }
12328
12329 const uint32_t ray_dynamic_callback_stack_base_offset =
12330 radv_get_user_sgpr_loc(rt_prolog, AC_UD_CS_RAY_DYNAMIC_CALLABLE_STACK_BASE);
12331 if (ray_dynamic_callback_stack_base_offset) {
12332 const struct radv_shader_info *cs_info = &rt_prolog->info;
12333 radeon_set_sh_reg(cmd_buffer->cs, ray_dynamic_callback_stack_base_offset,
12334 rt_prolog->config.scratch_bytes_per_wave / cs_info->wave_size);
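/* This presumably points the dynamic callable stack at the first per-lane scratch byte past
 * the prolog's static allocation (scratch_bytes_per_wave divided by the wave size gives the
 * per-lane amount). */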
12335 }
12336
12337 const uint32_t traversal_shader_addr_offset = radv_get_user_sgpr_loc(rt_prolog, AC_UD_CS_TRAVERSAL_SHADER_ADDR);
12338 struct radv_shader *traversal_shader = cmd_buffer->state.shaders[MESA_SHADER_INTERSECTION];
12339 if (traversal_shader_addr_offset && traversal_shader) {
12340 uint64_t traversal_va = traversal_shader->va | radv_rt_priority_traversal;
12341 radv_emit_shader_pointer(device, cmd_buffer->cs, traversal_shader_addr_offset, traversal_va, true);
12342 }
12343
12344 assert(cmd_buffer->cs->cdw <= cdw_max);
12345
12346 radv_dispatch(cmd_buffer, &info, pipeline, rt_prolog, VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR);
12347
12348 if (remaining_ray_count) {
12349 info.blocks[0] = remaining_ray_count;
12350 info.blocks[1] = 1;
12351 info.offsets[0] = tables->width - remaining_ray_count;
12352
12353 /* Reset the ray launch size so the prolog doesn't think this is a converted dispatch */
12354 tables->height = 1;
12355 radv_upload_trace_rays_params(cmd_buffer, tables, mode, &launch_size_va, NULL);
12356 if (ray_launch_size_addr_offset) {
12357 radv_emit_shader_pointer(device, cmd_buffer->cs, ray_launch_size_addr_offset, launch_size_va, true);
12358 }
12359
12360 radv_dispatch(cmd_buffer, &info, pipeline, rt_prolog, VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR);
12361 }
12362 }
12363
12364 VKAPI_ATTR void VKAPI_CALL
12365 radv_CmdTraceRaysKHR(VkCommandBuffer commandBuffer, const VkStridedDeviceAddressRegionKHR *pRaygenShaderBindingTable,
12366 const VkStridedDeviceAddressRegionKHR *pMissShaderBindingTable,
12367 const VkStridedDeviceAddressRegionKHR *pHitShaderBindingTable,
12368 const VkStridedDeviceAddressRegionKHR *pCallableShaderBindingTable, uint32_t width,
12369 uint32_t height, uint32_t depth)
12370 {
12371 VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
12372
12373 VkTraceRaysIndirectCommand2KHR tables = {
12374 .raygenShaderRecordAddress = pRaygenShaderBindingTable->deviceAddress,
12375 .raygenShaderRecordSize = pRaygenShaderBindingTable->size,
12376 .missShaderBindingTableAddress = pMissShaderBindingTable->deviceAddress,
12377 .missShaderBindingTableSize = pMissShaderBindingTable->size,
12378 .missShaderBindingTableStride = pMissShaderBindingTable->stride,
12379 .hitShaderBindingTableAddress = pHitShaderBindingTable->deviceAddress,
12380 .hitShaderBindingTableSize = pHitShaderBindingTable->size,
12381 .hitShaderBindingTableStride = pHitShaderBindingTable->stride,
12382 .callableShaderBindingTableAddress = pCallableShaderBindingTable->deviceAddress,
12383 .callableShaderBindingTableSize = pCallableShaderBindingTable->size,
12384 .callableShaderBindingTableStride = pCallableShaderBindingTable->stride,
12385 .width = width,
12386 .height = height,
12387 .depth = depth,
12388 };
12389
12390 radv_trace_rays(cmd_buffer, &tables, 0, radv_rt_mode_direct);
12391 }
12392
12393 VKAPI_ATTR void VKAPI_CALL
12394 radv_CmdTraceRaysIndirectKHR(VkCommandBuffer commandBuffer,
12395 const VkStridedDeviceAddressRegionKHR *pRaygenShaderBindingTable,
12396 const VkStridedDeviceAddressRegionKHR *pMissShaderBindingTable,
12397 const VkStridedDeviceAddressRegionKHR *pHitShaderBindingTable,
12398 const VkStridedDeviceAddressRegionKHR *pCallableShaderBindingTable,
12399 VkDeviceAddress indirectDeviceAddress)
12400 {
12401 VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
12402 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
12403
12404 assert(device->use_global_bo_list);
12405
12406 VkTraceRaysIndirectCommand2KHR tables = {
12407 .raygenShaderRecordAddress = pRaygenShaderBindingTable->deviceAddress,
12408 .raygenShaderRecordSize = pRaygenShaderBindingTable->size,
12409 .missShaderBindingTableAddress = pMissShaderBindingTable->deviceAddress,
12410 .missShaderBindingTableSize = pMissShaderBindingTable->size,
12411 .missShaderBindingTableStride = pMissShaderBindingTable->stride,
12412 .hitShaderBindingTableAddress = pHitShaderBindingTable->deviceAddress,
12413 .hitShaderBindingTableSize = pHitShaderBindingTable->size,
12414 .hitShaderBindingTableStride = pHitShaderBindingTable->stride,
12415 .callableShaderBindingTableAddress = pCallableShaderBindingTable->deviceAddress,
12416 .callableShaderBindingTableSize = pCallableShaderBindingTable->size,
12417 .callableShaderBindingTableStride = pCallableShaderBindingTable->stride,
12418 };
12419
12420 radv_trace_rays(cmd_buffer, &tables, indirectDeviceAddress, radv_rt_mode_indirect);
12421 }
12422
12423 VKAPI_ATTR void VKAPI_CALL
12424 radv_CmdTraceRaysIndirect2KHR(VkCommandBuffer commandBuffer, VkDeviceAddress indirectDeviceAddress)
12425 {
12426 VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
12427 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
12428
12429 assert(device->use_global_bo_list);
12430
12431 radv_trace_rays(cmd_buffer, NULL, indirectDeviceAddress, radv_rt_mode_indirect2);
12432 }
12433
12434 VKAPI_ATTR void VKAPI_CALL
12435 radv_CmdSetRayTracingPipelineStackSizeKHR(VkCommandBuffer commandBuffer, uint32_t size)
12436 {
12437 VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
12438 cmd_buffer->state.rt_stack_size = size;
12439 }
12440
12441 /*
12442 * For HTILE we have the following interesting clear words:
12443 * 0xfffff30f: Uncompressed, full depth range, for depth+stencil HTILE
12444 * 0xfffc000f: Uncompressed, full depth range, for depth only HTILE.
12445 * 0xfffffff0: Clear depth to 1.0
12446 * 0x00000000: Clear depth to 0.0
12447 */
12448 static void
12449 radv_initialize_htile(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
12450 const VkImageSubresourceRange *range)
12451 {
12452 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
12453 struct radv_cmd_state *state = &cmd_buffer->state;
12454 uint32_t htile_value = radv_get_htile_initial_value(device, image);
12455 VkClearDepthStencilValue value = {0};
12456 struct radv_barrier_data barrier = {0};
12457
12458 barrier.layout_transitions.init_mask_ram = 1;
12459 radv_describe_layout_transition(cmd_buffer, &barrier);
12460
12461 /* When transitioning from the LAYOUT_UNDEFINED layout, not everyone is consistent
12462 * in considering previous rendering work for WAW hazards. */
12463 state->flush_bits |= radv_src_access_flush(cmd_buffer, VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT,
12464 VK_ACCESS_2_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT, image);
12465
12466 if (image->planes[0].surface.has_stencil &&
12467 !(range->aspectMask == (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT))) {
12468 /* Flush caches before performing a separate aspect initialization because it's a
12469 * read-modify-write operation.
12470 */
12471 state->flush_bits |=
12472 radv_dst_access_flush(cmd_buffer, VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT, VK_ACCESS_2_SHADER_READ_BIT, image);
12473 }
12474
12475 state->flush_bits |= radv_clear_htile(cmd_buffer, image, range, htile_value);
12476
12477 radv_set_ds_clear_metadata(cmd_buffer, image, range, value, range->aspectMask);
12478
12479 if (radv_image_is_tc_compat_htile(image) && (range->aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT)) {
12480 /* Initialize the TC-compat metadata value to 0 because by
12481 * default DB_Z_INFO.RANGE_PRECISION is set to 1, and we only
12482 * have to conditionally update its value when performing
12483 * a fast depth clear.
12484 */
12485 radv_set_tc_compat_zrange_metadata(cmd_buffer, image, range, 0);
12486 }
12487 }
12488
12489 static void
12490 radv_handle_depth_image_transition(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
12491 VkImageLayout src_layout, VkImageLayout dst_layout, unsigned src_queue_mask,
12492 unsigned dst_queue_mask, const VkImageSubresourceRange *range,
12493 struct radv_sample_locations_state *sample_locs)
12494 {
12495 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
12496
12497 if (!radv_htile_enabled(image, range->baseMipLevel))
12498 return;
12499
12500 if (src_layout == VK_IMAGE_LAYOUT_UNDEFINED) {
12501 radv_initialize_htile(cmd_buffer, image, range);
12502 } else if (radv_layout_is_htile_compressed(device, image, src_layout, src_queue_mask) &&
12503 !radv_layout_is_htile_compressed(device, image, dst_layout, dst_queue_mask)) {
12504 cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB | RADV_CMD_FLAG_FLUSH_AND_INV_DB_META;
12505
12506 radv_expand_depth_stencil(cmd_buffer, image, range, sample_locs);
12507
12508 cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB | RADV_CMD_FLAG_FLUSH_AND_INV_DB_META;
12509 }
12510 }
12511
12512 static uint32_t
12513 radv_init_cmask(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image, const VkImageSubresourceRange *range,
12514 uint32_t value)
12515 {
12516 struct radv_barrier_data barrier = {0};
12517
12518 barrier.layout_transitions.init_mask_ram = 1;
12519 radv_describe_layout_transition(cmd_buffer, &barrier);
12520
12521 return radv_clear_cmask(cmd_buffer, image, range, value);
12522 }
12523
12524 uint32_t
12525 radv_init_fmask(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image, const VkImageSubresourceRange *range)
12526 {
12527 static const uint32_t fmask_clear_values[4] = {0x00000000, 0x02020202, 0xE4E4E4E4, 0x76543210};
12528 uint32_t log2_samples = util_logbase2(image->vk.samples);
12529 uint32_t value = fmask_clear_values[log2_samples];
12530 struct radv_barrier_data barrier = {0};
12531
12532 barrier.layout_transitions.init_mask_ram = 1;
12533 radv_describe_layout_transition(cmd_buffer, &barrier);
12534
12535 return radv_clear_fmask(cmd_buffer, image, range, value);
12536 }
12537
12538 uint32_t
12539 radv_init_dcc(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image, const VkImageSubresourceRange *range,
12540 uint32_t value)
12541 {
12542 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
12543 const struct radv_physical_device *pdev = radv_device_physical(device);
12544 struct radv_barrier_data barrier = {0};
12545 uint32_t flush_bits = 0;
12546 unsigned size = 0;
12547
12548 barrier.layout_transitions.init_mask_ram = 1;
12549 radv_describe_layout_transition(cmd_buffer, &barrier);
12550
12551 flush_bits |= radv_clear_dcc(cmd_buffer, image, range, value);
12552
12553 if (pdev->info.gfx_level == GFX8) {
12554 /* When DCC is enabled with mipmaps, some levels might not
12555 * support fast clears and we have to initialize them as "fully
12556 * expanded".
12557 */
12558 /* Compute the size of all fast clearable DCC levels. */
12559 for (unsigned i = 0; i < image->planes[0].surface.num_meta_levels; i++) {
12560 struct legacy_surf_dcc_level *dcc_level = &image->planes[0].surface.u.legacy.color.dcc_level[i];
12561 unsigned dcc_fast_clear_size = dcc_level->dcc_slice_fast_clear_size * image->vk.array_layers;
12562
12563 if (!dcc_fast_clear_size)
12564 break;
12565
12566 size = dcc_level->dcc_offset + dcc_fast_clear_size;
12567 }
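/* At this point "size" ends right after the last fast-clearable level's DCC (for example, if
 * only levels 0 and 1 support fast clears, it is dcc_level[1].dcc_offset plus that level's
 * fast-clear size); everything past it is filled with 0xffffffff below to mark those levels
 * as fully expanded. */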
12568
12569 /* Initialize the mipmap levels without DCC. */
12570 if (size != image->planes[0].surface.meta_size) {
12571 flush_bits |= radv_fill_buffer(cmd_buffer, image, image->bindings[0].bo,
12572 radv_image_get_va(image, 0) + image->planes[0].surface.meta_offset + size,
12573 image->planes[0].surface.meta_size - size, 0xffffffff);
12574 }
12575 }
12576
12577 return flush_bits;
12578 }
12579
12580 /**
12581 * Initialize DCC/FMASK/CMASK metadata for a color image.
12582 */
12583 static void
12584 radv_init_color_image_metadata(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image, VkImageLayout src_layout,
12585 VkImageLayout dst_layout, unsigned src_queue_mask, unsigned dst_queue_mask,
12586 const VkImageSubresourceRange *range)
12587 {
12588 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
12589 uint32_t flush_bits = 0;
12590
12591 /* When transitioning from the LAYOUT_UNDEFINED layout, not everyone is
12592 * consistent in considering previous rendering work for WAW hazards.
12593 */
12594 cmd_buffer->state.flush_bits |= radv_src_access_flush(cmd_buffer, VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT,
12595 VK_ACCESS_2_COLOR_ATTACHMENT_WRITE_BIT, image);
12596
12597 if (radv_image_has_cmask(image)) {
12598 static const uint32_t cmask_clear_values[4] = {0xffffffff, 0xdddddddd, 0xeeeeeeee, 0xffffffff};
12599 uint32_t log2_samples = util_logbase2(image->vk.samples);
12600
12601 flush_bits |= radv_init_cmask(cmd_buffer, image, range, cmask_clear_values[log2_samples]);
12602 }
12603
12604 if (radv_image_has_fmask(image)) {
12605 flush_bits |= radv_init_fmask(cmd_buffer, image, range);
12606 }
12607
12608 if (radv_dcc_enabled(image, range->baseMipLevel)) {
12609 uint32_t value = 0xffffffffu; /* Fully expanded mode. */
12610
12611 if (radv_layout_dcc_compressed(device, image, range->baseMipLevel, dst_layout, dst_queue_mask)) {
12612 value = 0u;
12613 }
12614
12615 flush_bits |= radv_init_dcc(cmd_buffer, image, range, value);
12616 }
12617
12618 if (radv_image_has_cmask(image) || radv_dcc_enabled(image, range->baseMipLevel)) {
12619 radv_update_fce_metadata(cmd_buffer, image, range, false);
12620
12621 uint32_t color_values[2] = {0};
12622 radv_set_color_clear_metadata(cmd_buffer, image, range, color_values);
12623 }
12624
12625 cmd_buffer->state.flush_bits |= flush_bits;
12626 }
12627
12628 static void
12629 radv_retile_transition(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image, VkImageLayout src_layout,
12630 VkImageLayout dst_layout, unsigned dst_queue_mask)
12631 {
12632 /* If the image is read-only, we don't have to retile DCC because it can't change. */
12633 if (!(image->vk.usage & RADV_IMAGE_USAGE_WRITE_BITS))
12634 return;
12635
12636 if (src_layout != VK_IMAGE_LAYOUT_PRESENT_SRC_KHR &&
12637 (dst_layout == VK_IMAGE_LAYOUT_PRESENT_SRC_KHR || (dst_queue_mask & (1u << RADV_QUEUE_FOREIGN))))
12638 radv_retile_dcc(cmd_buffer, image);
12639 }
12640
12641 static bool
12642 radv_image_need_retile(const struct radv_cmd_buffer *cmd_buffer, const struct radv_image *image)
12643 {
12644 return cmd_buffer->qf != RADV_QUEUE_TRANSFER && image->planes[0].surface.display_dcc_offset &&
12645 image->planes[0].surface.display_dcc_offset != image->planes[0].surface.meta_offset;
12646 }
12647
12648 /**
12649 * Handle color image transitions for DCC/FMASK/CMASK.
12650 */
12651 static void
12652 radv_handle_color_image_transition(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
12653 VkImageLayout src_layout, VkImageLayout dst_layout, unsigned src_queue_mask,
12654 unsigned dst_queue_mask, const VkImageSubresourceRange *range)
12655 {
12656 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
12657 bool dcc_decompressed = false, fast_clear_flushed = false;
12658
12659 if (!radv_image_has_cmask(image) && !radv_image_has_fmask(image) && !radv_dcc_enabled(image, range->baseMipLevel))
12660 return;
12661
12662 if (src_layout == VK_IMAGE_LAYOUT_UNDEFINED) {
12663 radv_init_color_image_metadata(cmd_buffer, image, src_layout, dst_layout, src_queue_mask, dst_queue_mask, range);
12664
12665 if (radv_image_need_retile(cmd_buffer, image))
12666 radv_retile_transition(cmd_buffer, image, src_layout, dst_layout, dst_queue_mask);
12667 return;
12668 }
12669
12670 if (radv_dcc_enabled(image, range->baseMipLevel)) {
12671 if (src_layout == VK_IMAGE_LAYOUT_PREINITIALIZED) {
12672 cmd_buffer->state.flush_bits |= radv_init_dcc(cmd_buffer, image, range, 0xffffffffu);
12673 } else if (radv_layout_dcc_compressed(device, image, range->baseMipLevel, src_layout, src_queue_mask) &&
12674 !radv_layout_dcc_compressed(device, image, range->baseMipLevel, dst_layout, dst_queue_mask)) {
12675 radv_decompress_dcc(cmd_buffer, image, range);
12676 dcc_decompressed = true;
12677 } else if (radv_layout_can_fast_clear(device, image, range->baseMipLevel, src_layout, src_queue_mask) &&
12678 !radv_layout_can_fast_clear(device, image, range->baseMipLevel, dst_layout, dst_queue_mask)) {
12679 radv_fast_clear_flush_image_inplace(cmd_buffer, image, range);
12680 fast_clear_flushed = true;
12681 }
12682
12683 if (radv_image_need_retile(cmd_buffer, image))
12684 radv_retile_transition(cmd_buffer, image, src_layout, dst_layout, dst_queue_mask);
12685 } else if (radv_image_has_cmask(image) || radv_image_has_fmask(image)) {
12686 if (radv_layout_can_fast_clear(device, image, range->baseMipLevel, src_layout, src_queue_mask) &&
12687 !radv_layout_can_fast_clear(device, image, range->baseMipLevel, dst_layout, dst_queue_mask)) {
12688 radv_fast_clear_flush_image_inplace(cmd_buffer, image, range);
12689 fast_clear_flushed = true;
12690 }
12691 }
12692
12693 /* MSAA color decompress. */
12694 const enum radv_fmask_compression src_fmask_comp =
12695 radv_layout_fmask_compression(device, image, src_layout, src_queue_mask);
12696 const enum radv_fmask_compression dst_fmask_comp =
12697 radv_layout_fmask_compression(device, image, dst_layout, dst_queue_mask);
12698 if (src_fmask_comp <= dst_fmask_comp)
12699 return;
12700
12701 if (src_fmask_comp == RADV_FMASK_COMPRESSION_FULL) {
12702 if (radv_dcc_enabled(image, range->baseMipLevel) && !radv_image_use_dcc_image_stores(device, image) &&
12703 !dcc_decompressed) {
12704 /* A DCC decompress is required before expanding FMASK
12705 * when DCC stores aren't supported to avoid being in
12706 * a state where DCC is compressed and the main
12707 * surface is uncompressed.
12708 */
12709 radv_decompress_dcc(cmd_buffer, image, range);
12710 } else if (!fast_clear_flushed) {
12711 /* A FMASK decompress is required before expanding
12712 * FMASK.
12713 */
12714 radv_fast_clear_flush_image_inplace(cmd_buffer, image, range);
12715 }
12716 }
12717
12718 if (dst_fmask_comp == RADV_FMASK_COMPRESSION_NONE) {
12719 struct radv_barrier_data barrier = {0};
12720 barrier.layout_transitions.fmask_color_expand = 1;
12721 radv_describe_layout_transition(cmd_buffer, &barrier);
12722
12723 radv_expand_fmask_image_inplace(cmd_buffer, image, range);
12724 }
12725 }
12726
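/* Entry point for image layout transitions. For exclusive images with a queue
 * family ownership transfer, only one half of the release/acquire pair performs
 * the transition (on the most capable queue); the work is then dispatched to the
 * depth or color transition handler.
 */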
12727 static void
12728 radv_handle_image_transition(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image, VkImageLayout src_layout,
12729 VkImageLayout dst_layout, uint32_t src_family_index, uint32_t dst_family_index,
12730 const VkImageSubresourceRange *range, struct radv_sample_locations_state *sample_locs)
12731 {
12732 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
12733 const struct radv_physical_device *pdev = radv_device_physical(device);
12734 enum radv_queue_family src_qf = vk_queue_to_radv(pdev, src_family_index);
12735 enum radv_queue_family dst_qf = vk_queue_to_radv(pdev, dst_family_index);
12736 if (image->exclusive && src_family_index != dst_family_index) {
12737 /* This is an acquire or a release operation and there will be
12738 * a corresponding release/acquire. Do the transition in the
12739 * most flexible queue. */
12740
12741 assert(src_qf == cmd_buffer->qf || dst_qf == cmd_buffer->qf);
12742
12743 if (src_family_index == VK_QUEUE_FAMILY_EXTERNAL || src_family_index == VK_QUEUE_FAMILY_FOREIGN_EXT)
12744 return;
12745
12746 if (cmd_buffer->qf == RADV_QUEUE_TRANSFER)
12747 return;
12748
12749 if (cmd_buffer->qf == RADV_QUEUE_COMPUTE && (src_qf == RADV_QUEUE_GENERAL || dst_qf == RADV_QUEUE_GENERAL))
12750 return;
12751 }
12752
12753 unsigned src_queue_mask = radv_image_queue_family_mask(image, src_qf, cmd_buffer->qf);
12754 unsigned dst_queue_mask = radv_image_queue_family_mask(image, dst_qf, cmd_buffer->qf);
12755
12756 if (src_layout == dst_layout && src_queue_mask == dst_queue_mask)
12757 return;
12758
12759 if (image->vk.aspects & VK_IMAGE_ASPECT_DEPTH_BIT) {
12760 radv_handle_depth_image_transition(cmd_buffer, image, src_layout, dst_layout, src_queue_mask, dst_queue_mask,
12761 range, sample_locs);
12762 } else {
12763 radv_handle_color_image_transition(cmd_buffer, image, src_layout, dst_layout, src_queue_mask, dst_queue_mask,
12764 range);
12765 }
12766 }
12767
12768 static void
12769 radv_cp_dma_wait_for_stages(struct radv_cmd_buffer *cmd_buffer, VkPipelineStageFlags2 stage_mask)
12770 {
12771 /* Make sure CP DMA is idle because the driver might have performed a DMA operation for copying a
12772    * buffer (or an MSAA image using FMASK). Note that updating a buffer is considered a clear
12773 * operation but it might also use a CP DMA copy in some rare situations. Other operations using
12774 * a CP DMA clear are implicitly synchronized (see CP_DMA_SYNC).
12775 */
12776 if (stage_mask &
12777 (VK_PIPELINE_STAGE_2_COPY_BIT | VK_PIPELINE_STAGE_2_CLEAR_BIT | VK_PIPELINE_STAGE_2_ALL_TRANSFER_BIT |
12778 VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT | VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT))
12779 radv_cp_dma_wait_for_idle(cmd_buffer);
12780 }
12781
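/* Emit all cache flushes/invalidations accumulated in state.flush_bits.
 * On compute queues, bits that only make sense on graphics (CB/DB flushes,
 * partial flushes, pipeline stats) are dropped first.
 */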
12782 void
12783 radv_emit_cache_flush(struct radv_cmd_buffer *cmd_buffer)
12784 {
12785 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
12786 const struct radv_physical_device *pdev = radv_device_physical(device);
12787 bool is_compute = cmd_buffer->qf == RADV_QUEUE_COMPUTE;
12788
12789 if (is_compute)
12790 cmd_buffer->state.flush_bits &=
12791 ~(RADV_CMD_FLAG_FLUSH_AND_INV_CB | RADV_CMD_FLAG_FLUSH_AND_INV_CB_META | RADV_CMD_FLAG_FLUSH_AND_INV_DB |
12792 RADV_CMD_FLAG_FLUSH_AND_INV_DB_META | RADV_CMD_FLAG_INV_L2_METADATA | RADV_CMD_FLAG_PS_PARTIAL_FLUSH |
12793 RADV_CMD_FLAG_VS_PARTIAL_FLUSH | RADV_CMD_FLAG_VGT_FLUSH | RADV_CMD_FLAG_START_PIPELINE_STATS |
12794 RADV_CMD_FLAG_STOP_PIPELINE_STATS);
12795
12796 if (!cmd_buffer->state.flush_bits) {
12797 radv_describe_barrier_end_delayed(cmd_buffer);
12798 return;
12799 }
12800
12801 radv_cs_emit_cache_flush(device->ws, cmd_buffer->cs, pdev->info.gfx_level, &cmd_buffer->gfx9_fence_idx,
12802 cmd_buffer->gfx9_fence_va, radv_cmd_buffer_uses_mec(cmd_buffer),
12803 cmd_buffer->state.flush_bits, &cmd_buffer->state.sqtt_flush_bits,
12804 cmd_buffer->gfx9_eop_bug_va);
12805
12806 if (radv_device_fault_detection_enabled(device))
12807 radv_cmd_buffer_trace_emit(cmd_buffer);
12808
12809 if (cmd_buffer->state.flush_bits & RADV_CMD_FLAG_INV_L2)
12810 cmd_buffer->state.rb_noncoherent_dirty = false;
12811
12812 /* Clear the caches that have been flushed to avoid syncing too much
12813    * when there are pending active queries.
12814 */
12815 cmd_buffer->active_query_flush_bits &= ~cmd_buffer->state.flush_bits;
12816
12817 cmd_buffer->state.flush_bits = 0;
12818
12819 /* If the driver used a compute shader for resetting a query pool, it
12820 * should be finished at this point.
12821 */
12822 cmd_buffer->pending_reset_query = false;
12823
12824 radv_describe_barrier_end_delayed(cmd_buffer);
12825 }
12826
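/* Common implementation of vkCmdPipelineBarrier2 and vkCmdWaitEvents2: gather
 * the source/destination stage and access masks from all dependency infos,
 * translate them into flush bits, perform image layout transitions and keep
 * CP DMA (or SDMA on the transfer queue) synchronized.
 */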
12827 static void
12828 radv_barrier(struct radv_cmd_buffer *cmd_buffer, uint32_t dep_count, const VkDependencyInfo *dep_infos,
12829 enum rgp_barrier_reason reason)
12830 {
12831 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
12832 enum radv_cmd_flush_bits src_flush_bits = 0;
12833 enum radv_cmd_flush_bits dst_flush_bits = 0;
12834 VkPipelineStageFlags2 src_stage_mask = 0;
12835 VkPipelineStageFlags2 dst_stage_mask = 0;
12836
12837 if (cmd_buffer->state.render.active)
12838 radv_mark_noncoherent_rb(cmd_buffer);
12839
12840 radv_describe_barrier_start(cmd_buffer, reason);
12841
12842 for (uint32_t dep_idx = 0; dep_idx < dep_count; dep_idx++) {
12843 const VkDependencyInfo *dep_info = &dep_infos[dep_idx];
12844
12845 for (uint32_t i = 0; i < dep_info->memoryBarrierCount; i++) {
12846 const VkMemoryBarrier2 *barrier = &dep_info->pMemoryBarriers[i];
12847 src_stage_mask |= barrier->srcStageMask;
12848 src_flush_bits |= radv_src_access_flush(cmd_buffer, barrier->srcStageMask, barrier->srcAccessMask, NULL);
12849 dst_stage_mask |= barrier->dstStageMask;
12850 dst_flush_bits |= radv_dst_access_flush(cmd_buffer, barrier->dstStageMask, barrier->dstAccessMask, NULL);
12851 }
12852
12853 for (uint32_t i = 0; i < dep_info->bufferMemoryBarrierCount; i++) {
12854 const VkBufferMemoryBarrier2 *barrier = &dep_info->pBufferMemoryBarriers[i];
12855 src_stage_mask |= barrier->srcStageMask;
12856 src_flush_bits |= radv_src_access_flush(cmd_buffer, barrier->srcStageMask, barrier->srcAccessMask, NULL);
12857 dst_stage_mask |= barrier->dstStageMask;
12858 dst_flush_bits |= radv_dst_access_flush(cmd_buffer, barrier->dstStageMask, barrier->dstAccessMask, NULL);
12859 }
12860
12861 for (uint32_t i = 0; i < dep_info->imageMemoryBarrierCount; i++) {
12862 const VkImageMemoryBarrier2 *barrier = &dep_info->pImageMemoryBarriers[i];
12863 VK_FROM_HANDLE(radv_image, image, barrier->image);
12864
12865 src_stage_mask |= barrier->srcStageMask;
12866 src_flush_bits |= radv_src_access_flush(cmd_buffer, barrier->srcStageMask, barrier->srcAccessMask, image);
12867 dst_stage_mask |= barrier->dstStageMask;
12868 dst_flush_bits |= radv_dst_access_flush(cmd_buffer, barrier->dstStageMask, barrier->dstAccessMask, image);
12869 }
12870 }
12871
12872 /* The Vulkan spec 1.1.98 says:
12873 *
12874 * "An execution dependency with only
12875 * VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT in the destination stage mask
12876 * will only prevent that stage from executing in subsequently
12877 * submitted commands. As this stage does not perform any actual
12878 * execution, this is not observable - in effect, it does not delay
12879 * processing of subsequent commands. Similarly an execution dependency
12880 * with only VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT in the source stage mask
12881 * will effectively not wait for any prior commands to complete."
12882 */
12883 if (dst_stage_mask != VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT)
12884 radv_stage_flush(cmd_buffer, src_stage_mask);
12885 cmd_buffer->state.flush_bits |= src_flush_bits;
12886
12887 radv_gang_barrier(cmd_buffer, src_stage_mask, 0);
12888
12889 for (uint32_t dep_idx = 0; dep_idx < dep_count; dep_idx++) {
12890 const VkDependencyInfo *dep_info = &dep_infos[dep_idx];
12891
12892 for (uint32_t i = 0; i < dep_info->imageMemoryBarrierCount; i++) {
12893 VK_FROM_HANDLE(radv_image, image, dep_info->pImageMemoryBarriers[i].image);
12894
12895 const struct VkSampleLocationsInfoEXT *sample_locs_info =
12896 vk_find_struct_const(dep_info->pImageMemoryBarriers[i].pNext, SAMPLE_LOCATIONS_INFO_EXT);
12897 struct radv_sample_locations_state sample_locations;
12898
12899 if (sample_locs_info) {
12900 assert(image->vk.create_flags & VK_IMAGE_CREATE_SAMPLE_LOCATIONS_COMPATIBLE_DEPTH_BIT_EXT);
12901 sample_locations.per_pixel = sample_locs_info->sampleLocationsPerPixel;
12902 sample_locations.grid_size = sample_locs_info->sampleLocationGridSize;
12903 sample_locations.count = sample_locs_info->sampleLocationsCount;
12904 typed_memcpy(&sample_locations.locations[0], sample_locs_info->pSampleLocations,
12905 sample_locs_info->sampleLocationsCount);
12906 }
12907
12908 radv_handle_image_transition(
12909 cmd_buffer, image, dep_info->pImageMemoryBarriers[i].oldLayout, dep_info->pImageMemoryBarriers[i].newLayout,
12910 dep_info->pImageMemoryBarriers[i].srcQueueFamilyIndex,
12911 dep_info->pImageMemoryBarriers[i].dstQueueFamilyIndex, &dep_info->pImageMemoryBarriers[i].subresourceRange,
12912 sample_locs_info ? &sample_locations : NULL);
12913 }
12914 }
12915
12916 radv_gang_barrier(cmd_buffer, 0, dst_stage_mask);
12917
12918 if (cmd_buffer->qf == RADV_QUEUE_TRANSFER) {
12919 /* SDMA NOP packet waits for all pending SDMA operations to complete.
12920 * Note that GFX9+ is supposed to have RAW dependency tracking, but it's buggy
12921       * so we can't rely on it for now.
12922 */
12923 radeon_check_space(device->ws, cmd_buffer->cs, 1);
12924 radeon_emit(cmd_buffer->cs, SDMA_PACKET(SDMA_OPCODE_NOP, 0, 0));
12925 } else {
12926 const bool is_gfx_or_ace = cmd_buffer->qf == RADV_QUEUE_GENERAL || cmd_buffer->qf == RADV_QUEUE_COMPUTE;
12927 if (is_gfx_or_ace)
12928 radv_cp_dma_wait_for_stages(cmd_buffer, src_stage_mask);
12929 }
12930
12931 cmd_buffer->state.flush_bits |= dst_flush_bits;
12932
12933 radv_describe_barrier_end(cmd_buffer);
12934 }
12935
12936 VKAPI_ATTR void VKAPI_CALL
12937 radv_CmdPipelineBarrier2(VkCommandBuffer commandBuffer, const VkDependencyInfo *pDependencyInfo)
12938 {
12939 VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
12940 enum rgp_barrier_reason barrier_reason;
12941
12942 if (cmd_buffer->vk.runtime_rp_barrier) {
12943 barrier_reason = RGP_BARRIER_EXTERNAL_RENDER_PASS_SYNC;
12944 } else {
12945 barrier_reason = RGP_BARRIER_EXTERNAL_CMD_PIPELINE_BARRIER;
12946 }
12947
12948 radv_barrier(cmd_buffer, 1, pDependencyInfo, barrier_reason);
12949 }
12950
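/* Write 'value' to the event BO once the requested stages are idle: a simple
 * PFP/ME write for top-of-pipe style stages, otherwise an EOP event
 * (PS_DONE, CS_DONE or BOTTOM_OF_PIPE_TS).
 */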
12951 static void
12952 write_event(struct radv_cmd_buffer *cmd_buffer, struct radv_event *event, VkPipelineStageFlags2 stageMask,
12953 unsigned value)
12954 {
12955 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
12956 const struct radv_physical_device *pdev = radv_device_physical(device);
12957 struct radeon_cmdbuf *cs = cmd_buffer->cs;
12958 uint64_t va = radv_buffer_get_va(event->bo);
12959
12960 if (cmd_buffer->qf == RADV_QUEUE_VIDEO_DEC || cmd_buffer->qf == RADV_QUEUE_VIDEO_ENC) {
12961 radv_vcn_write_event(cmd_buffer, event, value);
12962 return;
12963 }
12964
12965 radv_emit_cache_flush(cmd_buffer);
12966
12967 radv_cs_add_buffer(device->ws, cs, event->bo);
12968
12969 ASSERTED unsigned cdw_max = radeon_check_space(device->ws, cs, 28);
12970
12971 if (stageMask & (VK_PIPELINE_STAGE_2_COPY_BIT | VK_PIPELINE_STAGE_2_RESOLVE_BIT | VK_PIPELINE_STAGE_2_BLIT_BIT |
12972 VK_PIPELINE_STAGE_2_CLEAR_BIT)) {
12973 /* Be conservative for now. */
12974 stageMask |= VK_PIPELINE_STAGE_2_ALL_TRANSFER_BIT;
12975 }
12976
12977 /* Flags that only require a top-of-pipe event. */
12978 VkPipelineStageFlags2 top_of_pipe_flags = VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT;
12979
12980 /* Flags that only require a post-index-fetch event. */
12981 VkPipelineStageFlags2 post_index_fetch_flags =
12982 top_of_pipe_flags | VK_PIPELINE_STAGE_2_DRAW_INDIRECT_BIT | VK_PIPELINE_STAGE_2_VERTEX_INPUT_BIT;
12983
12984 /* Flags that only require signaling post PS. */
12985 VkPipelineStageFlags2 post_ps_flags =
12986 post_index_fetch_flags | VK_PIPELINE_STAGE_2_VERTEX_SHADER_BIT |
12987 VK_PIPELINE_STAGE_2_TESSELLATION_CONTROL_SHADER_BIT | VK_PIPELINE_STAGE_2_TESSELLATION_EVALUATION_SHADER_BIT |
12988 VK_PIPELINE_STAGE_2_GEOMETRY_SHADER_BIT | VK_PIPELINE_STAGE_2_MESH_SHADER_BIT_EXT |
12989 VK_PIPELINE_STAGE_2_TRANSFORM_FEEDBACK_BIT_EXT | VK_PIPELINE_STAGE_2_PRE_RASTERIZATION_SHADERS_BIT |
12990 VK_PIPELINE_STAGE_2_FRAGMENT_SHADING_RATE_ATTACHMENT_BIT_KHR | VK_PIPELINE_STAGE_2_EARLY_FRAGMENT_TESTS_BIT |
12991 VK_PIPELINE_STAGE_2_FRAGMENT_SHADER_BIT;
12992
12993 /* Flags that only require signaling post CS. */
12994 VkPipelineStageFlags2 post_cs_flags = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT;
12995
12996 radv_cp_dma_wait_for_stages(cmd_buffer, stageMask);
12997
12998 if (!(stageMask & ~top_of_pipe_flags)) {
12999 /* Just need to sync the PFP engine. */
13000 radv_write_data(cmd_buffer, V_370_PFP, va, 1, &value, false);
13001 } else if (!(stageMask & ~post_index_fetch_flags)) {
13002 /* Sync ME because PFP reads index and indirect buffers. */
13003 radv_write_data(cmd_buffer, V_370_ME, va, 1, &value, false);
13004 } else {
13005 unsigned event_type;
13006
13007 if (!(stageMask & ~post_ps_flags)) {
13008 /* Sync previous fragment shaders. */
13009 event_type = V_028A90_PS_DONE;
13010 } else if (!(stageMask & ~post_cs_flags)) {
13011 /* Sync previous compute shaders. */
13012 event_type = V_028A90_CS_DONE;
13013 } else {
13014 /* Otherwise, sync all prior GPU work. */
13015 event_type = V_028A90_BOTTOM_OF_PIPE_TS;
13016 }
13017
13018 radv_cs_emit_write_event_eop(cs, pdev->info.gfx_level, cmd_buffer->qf, event_type, 0, EOP_DST_SEL_MEM,
13019 EOP_DATA_SEL_VALUE_32BIT, va, value, cmd_buffer->gfx9_eop_bug_va);
13020 }
13021
13022 assert(cmd_buffer->cs->cdw <= cdw_max);
13023 }
13024
13025 VKAPI_ATTR void VKAPI_CALL
13026 radv_CmdSetEvent2(VkCommandBuffer commandBuffer, VkEvent _event, const VkDependencyInfo *pDependencyInfo)
13027 {
13028 VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
13029 VK_FROM_HANDLE(radv_event, event, _event);
13030 VkPipelineStageFlags2 src_stage_mask = 0;
13031
13032 for (uint32_t i = 0; i < pDependencyInfo->memoryBarrierCount; i++)
13033 src_stage_mask |= pDependencyInfo->pMemoryBarriers[i].srcStageMask;
13034 for (uint32_t i = 0; i < pDependencyInfo->bufferMemoryBarrierCount; i++)
13035 src_stage_mask |= pDependencyInfo->pBufferMemoryBarriers[i].srcStageMask;
13036 for (uint32_t i = 0; i < pDependencyInfo->imageMemoryBarrierCount; i++)
13037 src_stage_mask |= pDependencyInfo->pImageMemoryBarriers[i].srcStageMask;
13038
13039 write_event(cmd_buffer, event, src_stage_mask, 1);
13040 }
13041
13042 VKAPI_ATTR void VKAPI_CALL
13043 radv_CmdResetEvent2(VkCommandBuffer commandBuffer, VkEvent _event, VkPipelineStageFlags2 stageMask)
13044 {
13045 VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
13046 VK_FROM_HANDLE(radv_event, event, _event);
13047
13048 write_event(cmd_buffer, event, stageMask, 0);
13049 }
13050
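/* For each event, make the GPU poll the event BO until it contains 1
 * (WAIT_REG_MEM), then apply the dependency like a regular barrier.
 */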
13051 VKAPI_ATTR void VKAPI_CALL
13052 radv_CmdWaitEvents2(VkCommandBuffer commandBuffer, uint32_t eventCount, const VkEvent *pEvents,
13053 const VkDependencyInfo *pDependencyInfos)
13054 {
13055 VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
13056 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
13057 struct radeon_cmdbuf *cs = cmd_buffer->cs;
13058
13059 if (cmd_buffer->qf == RADV_QUEUE_VIDEO_DEC || cmd_buffer->qf == RADV_QUEUE_VIDEO_ENC)
13060 return;
13061
13062 for (unsigned i = 0; i < eventCount; ++i) {
13063 VK_FROM_HANDLE(radv_event, event, pEvents[i]);
13064 uint64_t va = radv_buffer_get_va(event->bo);
13065
13066 radv_cs_add_buffer(device->ws, cs, event->bo);
13067
13068 ASSERTED unsigned cdw_max = radeon_check_space(device->ws, cs, 7);
13069
13070 radv_cp_wait_mem(cs, cmd_buffer->qf, WAIT_REG_MEM_EQUAL, va, 1, 0xffffffff);
13071 assert(cmd_buffer->cs->cdw <= cdw_max);
13072 }
13073
13074 radv_barrier(cmd_buffer, eventCount, pDependencyInfos, RGP_BARRIER_EXTERNAL_CMD_WAIT_EVENTS);
13075 }
13076
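/* Emit SET_PREDICATION. GFX9+ takes the 64-bit address in two DWORDs after the
 * operation; older ASICs pack the high address bits together with the operation.
 */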
13077 void
13078 radv_emit_set_predication_state(struct radv_cmd_buffer *cmd_buffer, bool draw_visible, unsigned pred_op, uint64_t va)
13079 {
13080 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
13081 const struct radv_physical_device *pdev = radv_device_physical(device);
13082 uint32_t op = 0;
13083
13084 radeon_check_space(device->ws, cmd_buffer->cs, 4);
13085
13086 if (va) {
13087 assert(pred_op == PREDICATION_OP_BOOL32 || pred_op == PREDICATION_OP_BOOL64);
13088
13089 op = PRED_OP(pred_op);
13090
13091       /* PREDICATION_DRAW_VISIBLE means that the rendering commands are
13092        * discarded if the 32-bit value is zero; with DRAW_NOT_VISIBLE they
13093        * are discarded if the value is non-zero.
13094 */
13095 op |= draw_visible ? PREDICATION_DRAW_VISIBLE : PREDICATION_DRAW_NOT_VISIBLE;
13096 }
13097 if (pdev->info.gfx_level >= GFX9) {
13098 radeon_emit(cmd_buffer->cs, PKT3(PKT3_SET_PREDICATION, 2, 0));
13099 radeon_emit(cmd_buffer->cs, op);
13100 radeon_emit(cmd_buffer->cs, va);
13101 radeon_emit(cmd_buffer->cs, va >> 32);
13102 } else {
13103 radeon_emit(cmd_buffer->cs, PKT3(PKT3_SET_PREDICATION, 1, 0));
13104 radeon_emit(cmd_buffer->cs, va);
13105 radeon_emit(cmd_buffer->cs, op | ((va >> 32) & 0xFF));
13106 }
13107 }
13108
13109 void
13110 radv_begin_conditional_rendering(struct radv_cmd_buffer *cmd_buffer, uint64_t va, bool draw_visible)
13111 {
13112 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
13113 const struct radv_physical_device *pdev = radv_device_physical(device);
13114 struct radeon_cmdbuf *cs = cmd_buffer->cs;
13115 unsigned pred_op = PREDICATION_OP_BOOL32;
13116
13117 radv_emit_cache_flush(cmd_buffer);
13118
13119 if (cmd_buffer->qf == RADV_QUEUE_GENERAL) {
13120 if (!pdev->info.has_32bit_predication) {
13121 uint64_t pred_value = 0, pred_va;
13122 unsigned pred_offset;
13123
13124 /* From the Vulkan spec 1.1.107:
13125 *
13126 * "If the 32-bit value at offset in buffer memory is zero,
13127 * then the rendering commands are discarded, otherwise they
13128 * are executed as normal. If the value of the predicate in
13129 * buffer memory changes while conditional rendering is
13130 * active, the rendering commands may be discarded in an
13131 * implementation-dependent way. Some implementations may
13132 * latch the value of the predicate upon beginning conditional
13133 * rendering while others may read it before every rendering
13134 * command."
13135 *
13136 * But, the AMD hardware treats the predicate as a 64-bit
13137 * value which means we need a workaround in the driver.
13138          * Luckily, we are not required to support the case where the
13139          * value changes while predication is active.
13140 *
13141 * The workaround is as follows:
13142          * 1) allocate a 64-bit value in the upload BO and initialize it
13143 * to 0
13144 * 2) copy the 32-bit predicate value to the upload BO
13145 * 3) use the new allocated VA address for predication
13146 *
13147 * Based on the conditionalrender demo, it's faster to do the
13148 * COPY_DATA in ME (+ sync PFP) instead of PFP.
13149 */
13150 radv_cmd_buffer_upload_data(cmd_buffer, 8, &pred_value, &pred_offset);
13151
13152 pred_va = radv_buffer_get_va(cmd_buffer->upload.upload_bo) + pred_offset;
13153
13154 radeon_check_space(device->ws, cmd_buffer->cs, 8);
13155
13156 radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
13157 radeon_emit(
13158 cs, COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) | COPY_DATA_WR_CONFIRM);
13159 radeon_emit(cs, va);
13160 radeon_emit(cs, va >> 32);
13161 radeon_emit(cs, pred_va);
13162 radeon_emit(cs, pred_va >> 32);
13163
13164 radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
13165 radeon_emit(cs, 0);
13166
13167 va = pred_va;
13168 pred_op = PREDICATION_OP_BOOL64;
13169 }
13170
13171 radv_emit_set_predication_state(cmd_buffer, draw_visible, pred_op, va);
13172 } else {
13173       /* The compute queue doesn't support predication; it's emulated elsewhere. */
13174 }
13175
13176 /* Store conditional rendering user info. */
13177 cmd_buffer->state.predicating = true;
13178 cmd_buffer->state.predication_type = draw_visible;
13179 cmd_buffer->state.predication_op = pred_op;
13180 cmd_buffer->state.predication_va = va;
13181 cmd_buffer->state.mec_inv_pred_emitted = false;
13182 }
13183
13184 void
13185 radv_end_conditional_rendering(struct radv_cmd_buffer *cmd_buffer)
13186 {
13187 if (cmd_buffer->qf == RADV_QUEUE_GENERAL) {
13188 radv_emit_set_predication_state(cmd_buffer, false, 0, 0);
13189 } else {
13190 /* Compute queue doesn't support predication, no need to emit anything here. */
13191 }
13192
13193 /* Reset conditional rendering user info. */
13194 cmd_buffer->state.predicating = false;
13195 cmd_buffer->state.predication_type = -1;
13196 cmd_buffer->state.predication_op = 0;
13197 cmd_buffer->state.predication_va = 0;
13198 cmd_buffer->state.mec_inv_pred_emitted = false;
13199 }
13200
13201 /* VK_EXT_conditional_rendering */
13202 VKAPI_ATTR void VKAPI_CALL
13203 radv_CmdBeginConditionalRenderingEXT(VkCommandBuffer commandBuffer,
13204 const VkConditionalRenderingBeginInfoEXT *pConditionalRenderingBegin)
13205 {
13206 VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
13207 VK_FROM_HANDLE(radv_buffer, buffer, pConditionalRenderingBegin->buffer);
13208 bool draw_visible = true;
13209 uint64_t va;
13210
13211 va = radv_buffer_get_va(buffer->bo) + buffer->offset + pConditionalRenderingBegin->offset;
13212
13213 /* By default, if the 32-bit value at offset in buffer memory is zero,
13214 * then the rendering commands are discarded, otherwise they are
13215 * executed as normal. If the inverted flag is set, all commands are
13216    * discarded if the value is non-zero.
13217 */
13218 if (pConditionalRenderingBegin->flags & VK_CONDITIONAL_RENDERING_INVERTED_BIT_EXT) {
13219 draw_visible = false;
13220 }
13221
13222 radv_begin_conditional_rendering(cmd_buffer, va, draw_visible);
13223 }
13224
13225 VKAPI_ATTR void VKAPI_CALL
13226 radv_CmdEndConditionalRenderingEXT(VkCommandBuffer commandBuffer)
13227 {
13228 VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
13229
13230 radv_end_conditional_rendering(cmd_buffer);
13231 }
13232
13233 /* VK_EXT_transform_feedback */
13234 VKAPI_ATTR void VKAPI_CALL
13235 radv_CmdBindTransformFeedbackBuffersEXT(VkCommandBuffer commandBuffer, uint32_t firstBinding, uint32_t bindingCount,
13236 const VkBuffer *pBuffers, const VkDeviceSize *pOffsets,
13237 const VkDeviceSize *pSizes)
13238 {
13239 VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
13240 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
13241 struct radv_streamout_binding *sb = cmd_buffer->streamout_bindings;
13242 uint8_t enabled_mask = 0;
13243
13244 assert(firstBinding + bindingCount <= MAX_SO_BUFFERS);
13245 for (uint32_t i = 0; i < bindingCount; i++) {
13246 uint32_t idx = firstBinding + i;
13247
13248 sb[idx].buffer = radv_buffer_from_handle(pBuffers[i]);
13249 sb[idx].offset = pOffsets[i];
13250
13251 if (!pSizes || pSizes[i] == VK_WHOLE_SIZE) {
13252 sb[idx].size = sb[idx].buffer->vk.size - sb[idx].offset;
13253 } else {
13254 sb[idx].size = pSizes[i];
13255 }
13256
13257 radv_cs_add_buffer(device->ws, cmd_buffer->cs, sb[idx].buffer->bo);
13258
13259 enabled_mask |= 1 << idx;
13260 }
13261
13262 cmd_buffer->state.streamout.enabled_mask |= enabled_mask;
13263
13264 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_STREAMOUT_BUFFER;
13265 }
13266
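/* Track whether streamout is enabled and derive the hardware enable mask
 * (the enabled buffer mask replicated into four 4-bit groups). Legacy (non-NGG)
 * streamout marks the enable state dirty when this changes; NGG streamout
 * re-emits the buffer descriptors and shader query state instead.
 */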
13267 static void
13268 radv_set_streamout_enable(struct radv_cmd_buffer *cmd_buffer, bool enable)
13269 {
13270 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
13271 const struct radv_physical_device *pdev = radv_device_physical(device);
13272 struct radv_streamout_state *so = &cmd_buffer->state.streamout;
13273 bool old_streamout_enabled = radv_is_streamout_enabled(cmd_buffer);
13274 uint32_t old_hw_enabled_mask = so->hw_enabled_mask;
13275
13276 so->streamout_enabled = enable;
13277
13278 so->hw_enabled_mask =
13279 so->enabled_mask | (so->enabled_mask << 4) | (so->enabled_mask << 8) | (so->enabled_mask << 12);
13280
13281 if (!pdev->use_ngg_streamout && ((old_streamout_enabled != radv_is_streamout_enabled(cmd_buffer)) ||
13282 (old_hw_enabled_mask != so->hw_enabled_mask)))
13283 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_STREAMOUT_ENABLE;
13284
13285 if (pdev->use_ngg_streamout) {
13286       /* Re-emit streamout descriptors because with NGG streamout, a buffer size of 0 acts like a
13287 * disable bit and this is needed when streamout needs to be ignored in shaders.
13288 */
13289 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_SHADER_QUERY | RADV_CMD_DIRTY_STREAMOUT_BUFFER;
13290 }
13291 }
13292
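/* Flush VGT streamout and wait (WAIT_REG_MEM on CP_STRMOUT_CNTL) until the
 * hardware signals that the buffer offsets have been updated. Needed on
 * pre-NGG hardware before the offsets are read back or reprogrammed.
 */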
13293 static void
13294 radv_flush_vgt_streamout(struct radv_cmd_buffer *cmd_buffer)
13295 {
13296 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
13297 const struct radv_physical_device *pdev = radv_device_physical(device);
13298 struct radeon_cmdbuf *cs = cmd_buffer->cs;
13299 unsigned reg_strmout_cntl;
13300
13301 ASSERTED unsigned cdw_max = radeon_check_space(device->ws, cmd_buffer->cs, 14);
13302
13303 /* The register is at different places on different ASICs. */
13304 if (pdev->info.gfx_level >= GFX9) {
13305 reg_strmout_cntl = R_0300FC_CP_STRMOUT_CNTL;
13306 radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 3, 0));
13307 radeon_emit(cs, S_370_DST_SEL(V_370_MEM_MAPPED_REGISTER) | S_370_ENGINE_SEL(V_370_ME));
13308 radeon_emit(cs, R_0300FC_CP_STRMOUT_CNTL >> 2);
13309 radeon_emit(cs, 0);
13310 radeon_emit(cs, 0);
13311 } else if (pdev->info.gfx_level >= GFX7) {
13312 reg_strmout_cntl = R_0300FC_CP_STRMOUT_CNTL;
13313 radeon_set_uconfig_reg(cs, reg_strmout_cntl, 0);
13314 } else {
13315 reg_strmout_cntl = R_0084FC_CP_STRMOUT_CNTL;
13316 radeon_set_config_reg(cs, reg_strmout_cntl, 0);
13317 }
13318
13319 radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
13320 radeon_emit(cs, EVENT_TYPE(V_028A90_SO_VGTSTREAMOUT_FLUSH) | EVENT_INDEX(0));
13321
13322 radeon_emit(cs, PKT3(PKT3_WAIT_REG_MEM, 5, 0));
13323 radeon_emit(cs, WAIT_REG_MEM_EQUAL); /* wait until the register is equal to the reference value */
13324 radeon_emit(cs, reg_strmout_cntl >> 2); /* register */
13325 radeon_emit(cs, 0);
13326 radeon_emit(cs, S_0084FC_OFFSET_UPDATE_DONE(1)); /* reference value */
13327 radeon_emit(cs, S_0084FC_OFFSET_UPDATE_DONE(1)); /* mask */
13328 radeon_emit(cs, 4); /* poll interval */
13329
13330 assert(cs->cdw <= cdw_max);
13331 }
13332
13333 VKAPI_ATTR void VKAPI_CALL
13334 radv_CmdBeginTransformFeedbackEXT(VkCommandBuffer commandBuffer, uint32_t firstCounterBuffer,
13335 uint32_t counterBufferCount, const VkBuffer *pCounterBuffers,
13336 const VkDeviceSize *pCounterBufferOffsets)
13337 {
13338 VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
13339 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
13340 const struct radv_physical_device *pdev = radv_device_physical(device);
13341 struct radv_streamout_binding *sb = cmd_buffer->streamout_bindings;
13342 struct radv_streamout_state *so = &cmd_buffer->state.streamout;
13343 struct radeon_cmdbuf *cs = cmd_buffer->cs;
13344 bool first_target = true;
13345
13346 assert(firstCounterBuffer + counterBufferCount <= MAX_SO_BUFFERS);
13347 if (!pdev->use_ngg_streamout)
13348 radv_flush_vgt_streamout(cmd_buffer);
13349
13350 ASSERTED unsigned cdw_max = radeon_check_space(device->ws, cmd_buffer->cs, MAX_SO_BUFFERS * 10);
13351
13352 u_foreach_bit (i, so->enabled_mask) {
13353 int32_t counter_buffer_idx = i - firstCounterBuffer;
13354 if (counter_buffer_idx >= 0 && counter_buffer_idx >= counterBufferCount)
13355 counter_buffer_idx = -1;
13356
13357 bool append = counter_buffer_idx >= 0 && pCounterBuffers && pCounterBuffers[counter_buffer_idx];
13358 uint64_t va = 0;
13359
13360 if (append) {
13361 VK_FROM_HANDLE(radv_buffer, buffer, pCounterBuffers[counter_buffer_idx]);
13362 uint64_t counter_buffer_offset = 0;
13363
13364 if (pCounterBufferOffsets)
13365 counter_buffer_offset = pCounterBufferOffsets[counter_buffer_idx];
13366
13367 va += radv_buffer_get_va(buffer->bo);
13368 va += buffer->offset + counter_buffer_offset;
13369
13370 radv_cs_add_buffer(device->ws, cs, buffer->bo);
13371 }
13372
13373 if (pdev->info.gfx_level >= GFX12) {
13374 /* Only the first streamout target holds information. */
13375 if (first_target) {
13376 if (append) {
13377 radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
13378 radeon_emit(
13379 cs, COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) | COPY_DATA_DST_SEL(COPY_DATA_REG) | COPY_DATA_WR_CONFIRM);
13380 radeon_emit(cs, va);
13381 radeon_emit(cs, va >> 32);
13382 radeon_emit(cs, (R_0309B0_GE_GS_ORDERED_ID_BASE >> 2));
13383 radeon_emit(cs, 0);
13384 } else {
13385 radeon_set_uconfig_reg(cs, R_0309B0_GE_GS_ORDERED_ID_BASE, 0);
13386 }
13387
13388 first_target = false;
13389 }
13390 } else if (pdev->use_ngg_streamout) {
13391 if (append) {
13392 radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
13393 radeon_emit(cs,
13394 COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) | COPY_DATA_DST_SEL(COPY_DATA_REG) | COPY_DATA_WR_CONFIRM);
13395 radeon_emit(cs, va);
13396 radeon_emit(cs, va >> 32);
13397 radeon_emit(cs, (R_031088_GDS_STRMOUT_DWORDS_WRITTEN_0 >> 2) + i);
13398 radeon_emit(cs, 0);
13399 } else {
13400 /* The PKT3 CAM bit workaround seems needed for initializing this GDS register to zero. */
13401 radeon_set_uconfig_perfctr_reg(pdev->info.gfx_level, cmd_buffer->qf, cs,
13402 R_031088_GDS_STRMOUT_DWORDS_WRITTEN_0 + i * 4, 0);
13403 }
13404 } else {
13405 /* AMD GCN binds streamout buffers as shader resources.
13406 * VGT only counts primitives and tells the shader through
13407 * SGPRs what to do.
13408 */
13409 radeon_set_context_reg(cs, R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 + 16 * i, sb[i].size >> 2);
13410
13411 cmd_buffer->state.context_roll_without_scissor_emitted = true;
13412
13413 if (append) {
13414 radeon_emit(cs, PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0));
13415 radeon_emit(cs, STRMOUT_SELECT_BUFFER(i) | STRMOUT_DATA_TYPE(1) | /* offset in bytes */
13416 STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_FROM_MEM)); /* control */
13417 radeon_emit(cs, 0); /* unused */
13418 radeon_emit(cs, 0); /* unused */
13419 radeon_emit(cs, va); /* src address lo */
13420 radeon_emit(cs, va >> 32); /* src address hi */
13421 } else {
13422 /* Start from the beginning. */
13423 radeon_emit(cs, PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0));
13424 radeon_emit(cs, STRMOUT_SELECT_BUFFER(i) | STRMOUT_DATA_TYPE(1) | /* offset in bytes */
13425 STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_FROM_PACKET)); /* control */
13426 radeon_emit(cs, 0); /* unused */
13427 radeon_emit(cs, 0); /* unused */
13428 radeon_emit(cs, 0); /* unused */
13429 radeon_emit(cs, 0); /* unused */
13430 }
13431 }
13432 }
13433
13434 assert(cs->cdw <= cdw_max);
13435
13436 radv_set_streamout_enable(cmd_buffer, true);
13437
13438 if (!pdev->use_ngg_streamout)
13439 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_STREAMOUT_ENABLE;
13440 }
13441
13442 VKAPI_ATTR void VKAPI_CALL
13443 radv_CmdEndTransformFeedbackEXT(VkCommandBuffer commandBuffer, uint32_t firstCounterBuffer, uint32_t counterBufferCount,
13444 const VkBuffer *pCounterBuffers, const VkDeviceSize *pCounterBufferOffsets)
13445 {
13446 VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
13447 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
13448 const struct radv_physical_device *pdev = radv_device_physical(device);
13449 struct radv_streamout_state *so = &cmd_buffer->state.streamout;
13450 struct radeon_cmdbuf *cs = cmd_buffer->cs;
13451
13452 assert(firstCounterBuffer + counterBufferCount <= MAX_SO_BUFFERS);
13453
13454 if (pdev->info.gfx_level >= GFX12) {
13455 /* Nothing to do. The streamout state buffer already contains the next ordered ID, which
13456 * is the only thing we need to restore.
13457 */
13458 radv_set_streamout_enable(cmd_buffer, false);
13459 return;
13460 }
13461
13462 if (pdev->use_ngg_streamout) {
13463 /* Wait for streamout to finish before reading GDS_STRMOUT registers. */
13464 cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_VS_PARTIAL_FLUSH;
13465 radv_emit_cache_flush(cmd_buffer);
13466 } else {
13467 radv_flush_vgt_streamout(cmd_buffer);
13468 }
13469
13470 ASSERTED unsigned cdw_max = radeon_check_space(device->ws, cmd_buffer->cs, MAX_SO_BUFFERS * 12);
13471
13472 u_foreach_bit (i, so->enabled_mask) {
13473 int32_t counter_buffer_idx = i - firstCounterBuffer;
13474 if (counter_buffer_idx >= 0 && counter_buffer_idx >= counterBufferCount)
13475 counter_buffer_idx = -1;
13476
13477 bool append = counter_buffer_idx >= 0 && pCounterBuffers && pCounterBuffers[counter_buffer_idx];
13478 uint64_t va = 0;
13479
13480 if (append) {
13481 VK_FROM_HANDLE(radv_buffer, buffer, pCounterBuffers[counter_buffer_idx]);
13482 uint64_t counter_buffer_offset = 0;
13483
13484 if (pCounterBufferOffsets)
13485 counter_buffer_offset = pCounterBufferOffsets[counter_buffer_idx];
13486
13487 va += radv_buffer_get_va(buffer->bo);
13488 va += buffer->offset + counter_buffer_offset;
13489
13490 radv_cs_add_buffer(device->ws, cs, buffer->bo);
13491 }
13492
13493 if (pdev->use_ngg_streamout) {
13494 if (append) {
13495 radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
13496 radeon_emit(cs,
13497 COPY_DATA_SRC_SEL(COPY_DATA_REG) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) | COPY_DATA_WR_CONFIRM);
13498 radeon_emit(cs, (R_031088_GDS_STRMOUT_DWORDS_WRITTEN_0 >> 2) + i);
13499 radeon_emit(cs, 0);
13500 radeon_emit(cs, va);
13501 radeon_emit(cs, va >> 32);
13502 }
13503 } else {
13504 if (append) {
13505 radeon_emit(cs, PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0));
13506 radeon_emit(cs, STRMOUT_SELECT_BUFFER(i) | STRMOUT_DATA_TYPE(1) | /* offset in bytes */
13507 STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_NONE) |
13508 STRMOUT_STORE_BUFFER_FILLED_SIZE); /* control */
13509 radeon_emit(cs, va); /* dst address lo */
13510 radeon_emit(cs, va >> 32); /* dst address hi */
13511 radeon_emit(cs, 0); /* unused */
13512 radeon_emit(cs, 0); /* unused */
13513 }
13514
13515 /* Deactivate transform feedback by zeroing the buffer size.
13516 * The counters (primitives generated, primitives emitted) may
13517          * be enabled even if no buffer is bound. This ensures
13518 * that the primitives-emitted query won't increment.
13519 */
13520 radeon_set_context_reg(cs, R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 + 16 * i, 0);
13521
13522 cmd_buffer->state.context_roll_without_scissor_emitted = true;
13523 }
13524 }
13525
13526 assert(cmd_buffer->cs->cdw <= cdw_max);
13527
13528 radv_set_streamout_enable(cmd_buffer, false);
13529 }
13530
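/* Program the vertex stride and load VGT_STRMOUT_DRAW_OPAQUE_BUFFER_FILLED_SIZE
 * from the counter buffer for vkCmdDrawIndirectByteCountEXT. GFX10+ uses
 * LOAD_CONTEXT_REG_INDEX (preceded by a PFP sync) because COPY_DATA can hang there.
 */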
13531 static void
13532 radv_emit_strmout_buffer(struct radv_cmd_buffer *cmd_buffer, const struct radv_draw_info *draw_info)
13533 {
13534 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
13535 const struct radv_physical_device *pdev = radv_device_physical(device);
13536 const enum amd_gfx_level gfx_level = pdev->info.gfx_level;
13537 uint64_t va = radv_buffer_get_va(draw_info->strmout_buffer->bo);
13538 struct radeon_cmdbuf *cs = cmd_buffer->cs;
13539
13540 va += draw_info->strmout_buffer->offset + draw_info->strmout_buffer_offset;
13541
13542 radeon_set_context_reg(cs, R_028B30_VGT_STRMOUT_DRAW_OPAQUE_VERTEX_STRIDE, draw_info->stride);
13543
13544 if (gfx_level >= GFX10) {
13545 /* Emitting a COPY_DATA packet should be enough because RADV doesn't support preemption
13546       * (shadow memory), but for unknown reasons it can lead to GPU hangs on GFX10+.
13547 */
13548 radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
13549 radeon_emit(cs, 0);
13550
13551 radeon_emit(cs, PKT3(PKT3_LOAD_CONTEXT_REG_INDEX, 3, 0));
13552 radeon_emit(cs, va);
13553 radeon_emit(cs, va >> 32);
13554 radeon_emit(cs, (R_028B2C_VGT_STRMOUT_DRAW_OPAQUE_BUFFER_FILLED_SIZE - SI_CONTEXT_REG_OFFSET) >> 2);
13555 radeon_emit(cs, 1); /* 1 DWORD */
13556 } else {
13557 radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
13558 radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) | COPY_DATA_DST_SEL(COPY_DATA_REG) | COPY_DATA_WR_CONFIRM);
13559 radeon_emit(cs, va);
13560 radeon_emit(cs, va >> 32);
13561 radeon_emit(cs, R_028B2C_VGT_STRMOUT_DRAW_OPAQUE_BUFFER_FILLED_SIZE >> 2);
13562 radeon_emit(cs, 0); /* unused */
13563 }
13564
13565 radv_cs_add_buffer(device->ws, cs, draw_info->strmout_buffer->bo);
13566 }
13567
13568 VKAPI_ATTR void VKAPI_CALL
13569 radv_CmdDrawIndirectByteCountEXT(VkCommandBuffer commandBuffer, uint32_t instanceCount, uint32_t firstInstance,
13570 VkBuffer _counterBuffer, VkDeviceSize counterBufferOffset, uint32_t counterOffset,
13571 uint32_t vertexStride)
13572 {
13573 VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
13574 VK_FROM_HANDLE(radv_buffer, counterBuffer, _counterBuffer);
13575 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
13576 const struct radv_physical_device *pdev = radv_device_physical(device);
13577 struct radv_draw_info info;
13578
13579 info.count = 0;
13580 info.instance_count = instanceCount;
13581 info.first_instance = firstInstance;
13582 info.strmout_buffer = counterBuffer;
13583 info.strmout_buffer_offset = counterBufferOffset;
13584 info.stride = vertexStride;
13585 info.indexed = false;
13586 info.indirect = NULL;
13587
13588 if (!radv_before_draw(cmd_buffer, &info, 1, false))
13589 return;
13590 struct VkMultiDrawInfoEXT minfo = {0, 0};
13591 radv_emit_strmout_buffer(cmd_buffer, &info);
13592 radv_emit_direct_draw_packets(cmd_buffer, &info, 1, &minfo, S_0287F0_USE_OPAQUE(1), 0);
13593
13594 if (pdev->info.gfx_level == GFX12) {
13595 /* DrawTransformFeedback requires 3 SQ_NON_EVENTs after the packet. */
13596 for (unsigned i = 0; i < 3; i++) {
13597 radeon_emit(cmd_buffer->cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
13598 radeon_emit(cmd_buffer->cs, EVENT_TYPE(V_028A90_SQ_NON_EVENT) | EVENT_INDEX(0));
13599 }
13600 }
13601
13602 radv_after_draw(cmd_buffer, false);
13603 }
13604
13605 /* VK_AMD_buffer_marker */
13606 VKAPI_ATTR void VKAPI_CALL
13607 radv_CmdWriteBufferMarker2AMD(VkCommandBuffer commandBuffer, VkPipelineStageFlags2 stage, VkBuffer dstBuffer,
13608 VkDeviceSize dstOffset, uint32_t marker)
13609 {
13610 VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
13611 VK_FROM_HANDLE(radv_buffer, buffer, dstBuffer);
13612 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
13613 const struct radv_physical_device *pdev = radv_device_physical(device);
13614 struct radeon_cmdbuf *cs = cmd_buffer->cs;
13615 const uint64_t va = radv_buffer_get_va(buffer->bo) + buffer->offset + dstOffset;
13616
13617 if (cmd_buffer->qf == RADV_QUEUE_TRANSFER) {
13618 radeon_check_space(device->ws, cmd_buffer->cs, 4);
13619 radeon_emit(cmd_buffer->cs, SDMA_PACKET(SDMA_OPCODE_FENCE, 0, SDMA_FENCE_MTYPE_UC));
13620 radeon_emit(cs, va);
13621 radeon_emit(cs, va >> 32);
13622 radeon_emit(cs, marker);
13623 return;
13624 }
13625
13626 radv_emit_cache_flush(cmd_buffer);
13627
13628 ASSERTED unsigned cdw_max = radeon_check_space(device->ws, cmd_buffer->cs, 12);
13629
13630 if (!(stage & ~VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT)) {
13631 radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
13632 radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_IMM) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) | COPY_DATA_WR_CONFIRM);
13633 radeon_emit(cs, marker);
13634 radeon_emit(cs, 0);
13635 radeon_emit(cs, va);
13636 radeon_emit(cs, va >> 32);
13637 } else {
13638 radv_cs_emit_write_event_eop(cs, pdev->info.gfx_level, cmd_buffer->qf, V_028A90_BOTTOM_OF_PIPE_TS, 0,
13639 EOP_DST_SEL_MEM, EOP_DATA_SEL_VALUE_32BIT, va, marker, cmd_buffer->gfx9_eop_bug_va);
13640 }
13641
13642 assert(cmd_buffer->cs->cdw <= cdw_max);
13643 }
13644
13645 VKAPI_ATTR void VKAPI_CALL
13646 radv_CmdBindPipelineShaderGroupNV(VkCommandBuffer commandBuffer, VkPipelineBindPoint pipelineBindPoint,
13647 VkPipeline pipeline, uint32_t groupIndex)
13648 {
13649 fprintf(stderr, "radv: unimplemented vkCmdBindPipelineShaderGroupNV\n");
13650 abort();
13651 }
13652
13653 /* VK_NV_device_generated_commands_compute */
13654 VKAPI_ATTR void VKAPI_CALL
13655 radv_CmdUpdatePipelineIndirectBufferNV(VkCommandBuffer commandBuffer, VkPipelineBindPoint pipelineBindPoint,
13656 VkPipeline _pipeline)
13657 {
13658 VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
13659 VK_FROM_HANDLE(radv_pipeline, pipeline, _pipeline);
13660 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
13661 const struct radv_compute_pipeline *compute_pipeline = radv_pipeline_to_compute(pipeline);
13662 const struct radeon_cmdbuf *cs = &compute_pipeline->indirect.cs;
13663 const uint64_t va = compute_pipeline->indirect.va;
13664 struct radv_compute_pipeline_metadata metadata;
13665 uint32_t offset = 0;
13666
13667 radv_get_compute_shader_metadata(device, compute_pipeline->base.shaders[MESA_SHADER_COMPUTE], &metadata);
13668
13669 radv_write_data(cmd_buffer, V_370_ME, va + offset, sizeof(metadata) / 4, (const uint32_t *)&metadata, false);
13670 offset += sizeof(metadata);
13671
13672 radv_write_data(cmd_buffer, V_370_ME, va + offset, 1, (const uint32_t *)&cs->cdw, false);
13673 offset += sizeof(uint32_t);
13674
13675 radv_write_data(cmd_buffer, V_370_ME, va + offset, cs->cdw, (const uint32_t *)cs->buf, false);
13676 offset += cs->cdw * sizeof(uint32_t);
13677
13678 assert(offset < compute_pipeline->indirect.size);
13679 }
13680
13681 /* VK_EXT_descriptor_buffer */
13682 VKAPI_ATTR void VKAPI_CALL
13683 radv_CmdBindDescriptorBuffersEXT(VkCommandBuffer commandBuffer, uint32_t bufferCount,
13684 const VkDescriptorBufferBindingInfoEXT *pBindingInfos)
13685 {
13686 VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
13687
13688 for (uint32_t i = 0; i < bufferCount; i++) {
13689 cmd_buffer->descriptor_buffers[i] = pBindingInfos[i].address;
13690 }
13691 }
13692
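/* Resolve each (buffer index, offset) pair to a descriptor buffer GPU address
 * and invalidate the corresponding bound descriptor set for this bind point.
 */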
13693 static void
13694 radv_set_descriptor_buffer_offsets(struct radv_cmd_buffer *cmd_buffer,
13695 const VkSetDescriptorBufferOffsetsInfoEXT *pSetDescriptorBufferOffsetsInfo,
13696 VkPipelineBindPoint bind_point)
13697 {
13698 struct radv_descriptor_state *descriptors_state = radv_get_descriptors_state(cmd_buffer, bind_point);
13699
13700 for (unsigned i = 0; i < pSetDescriptorBufferOffsetsInfo->setCount; i++) {
13701 const uint32_t buffer_idx = pSetDescriptorBufferOffsetsInfo->pBufferIndices[i];
13702 const uint64_t offset = pSetDescriptorBufferOffsetsInfo->pOffsets[i];
13703 unsigned idx = i + pSetDescriptorBufferOffsetsInfo->firstSet;
13704
13705 descriptors_state->descriptor_buffers[idx] = cmd_buffer->descriptor_buffers[buffer_idx] + offset;
13706
13707 radv_set_descriptor_set(cmd_buffer, bind_point, NULL, idx);
13708 }
13709 }
13710
13711 VKAPI_ATTR void VKAPI_CALL
13712 radv_CmdSetDescriptorBufferOffsets2EXT(VkCommandBuffer commandBuffer,
13713 const VkSetDescriptorBufferOffsetsInfoEXT *pSetDescriptorBufferOffsetsInfo)
13714 {
13715 VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
13716
13717 if (pSetDescriptorBufferOffsetsInfo->stageFlags & VK_SHADER_STAGE_COMPUTE_BIT) {
13718 radv_set_descriptor_buffer_offsets(cmd_buffer, pSetDescriptorBufferOffsetsInfo, VK_PIPELINE_BIND_POINT_COMPUTE);
13719 }
13720
13721 if (pSetDescriptorBufferOffsetsInfo->stageFlags & RADV_GRAPHICS_STAGE_BITS) {
13722 radv_set_descriptor_buffer_offsets(cmd_buffer, pSetDescriptorBufferOffsetsInfo, VK_PIPELINE_BIND_POINT_GRAPHICS);
13723 }
13724
13725 if (pSetDescriptorBufferOffsetsInfo->stageFlags & RADV_RT_STAGE_BITS) {
13726 radv_set_descriptor_buffer_offsets(cmd_buffer, pSetDescriptorBufferOffsetsInfo,
13727 VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR);
13728 }
13729 }
13730
13731 VKAPI_ATTR void VKAPI_CALL
13732 radv_CmdBindDescriptorBufferEmbeddedSamplers2EXT(
13733 VkCommandBuffer commandBuffer,
13734 const VkBindDescriptorBufferEmbeddedSamplersInfoEXT *pBindDescriptorBufferEmbeddedSamplersInfo)
13735 {
13736 /* This is a no-op because embedded samplers are inlined at compile time. */
13737 }
13738
13739 /* VK_EXT_shader_object */
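/* Unbind any previously bound pipeline for the given bind point so that all
 * state comes from shader objects, and reset the derived pipeline state.
 */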
13740 static void
13741 radv_reset_pipeline_state(struct radv_cmd_buffer *cmd_buffer, VkPipelineBindPoint pipelineBindPoint)
13742 {
13743 switch (pipelineBindPoint) {
13744 case VK_PIPELINE_BIND_POINT_COMPUTE:
13745 if (cmd_buffer->state.compute_pipeline) {
13746 radv_bind_shader(cmd_buffer, NULL, MESA_SHADER_COMPUTE);
13747 cmd_buffer->state.compute_pipeline = NULL;
13748 }
13749 if (cmd_buffer->state.emitted_compute_pipeline) {
13750 cmd_buffer->state.emitted_compute_pipeline = NULL;
13751 }
13752 break;
13753 case VK_PIPELINE_BIND_POINT_GRAPHICS:
13754 if (cmd_buffer->state.graphics_pipeline) {
13755 radv_foreach_stage(s, cmd_buffer->state.graphics_pipeline->active_stages)
13756 {
13757 radv_bind_shader(cmd_buffer, NULL, s);
13758 }
13759 cmd_buffer->state.graphics_pipeline = NULL;
13760
13761 cmd_buffer->state.gs_copy_shader = NULL;
13762 cmd_buffer->state.last_vgt_shader = NULL;
13763 cmd_buffer->state.has_nggc = false;
13764 cmd_buffer->state.emitted_vs_prolog = NULL;
13765 cmd_buffer->state.spi_shader_col_format = 0;
13766 cmd_buffer->state.cb_shader_mask = 0;
13767 cmd_buffer->state.ms.sample_shading_enable = false;
13768 cmd_buffer->state.ms.min_sample_shading = 1.0f;
13769 cmd_buffer->state.rast_prim = 0;
13770 cmd_buffer->state.uses_out_of_order_rast = false;
13771 cmd_buffer->state.uses_vrs_attachment = false;
13772 }
13773 if (cmd_buffer->state.emitted_graphics_pipeline) {
13774 radv_bind_custom_blend_mode(cmd_buffer, 0);
13775
13776 if (cmd_buffer->state.db_render_control) {
13777 cmd_buffer->state.db_render_control = 0;
13778 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_FRAMEBUFFER;
13779 }
13780
13781 cmd_buffer->state.uses_vrs = false;
13782 cmd_buffer->state.uses_vrs_coarse_shading = false;
13783
13784 cmd_buffer->state.emitted_graphics_pipeline = NULL;
13785 }
13786 break;
13787 default:
13788 break;
13789 }
13790
13791 cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_PIPELINE;
13792 }
13793
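/* Bind a compute shader object: emit its shader registers immediately and
 * refresh the push constant / indirect descriptor bookkeeping it requires.
 */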
13794 static void
13795 radv_bind_compute_shader(struct radv_cmd_buffer *cmd_buffer, struct radv_shader_object *shader_obj)
13796 {
13797 struct radv_shader *shader = shader_obj ? shader_obj->shader : NULL;
13798 struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
13799 const struct radv_physical_device *pdev = radv_device_physical(device);
13800 struct radeon_cmdbuf *cs = cmd_buffer->cs;
13801
13802 radv_bind_shader(cmd_buffer, shader, MESA_SHADER_COMPUTE);
13803
13804 if (!shader_obj)
13805 return;
13806
13807 ASSERTED const unsigned cdw_max = radeon_check_space(device->ws, cmd_buffer->cs, 128);
13808
13809 radv_emit_compute_shader(pdev, cs, shader);
13810
13811 /* Update push constants/indirect descriptors state. */
13812 struct radv_descriptor_state *descriptors_state =
13813 radv_get_descriptors_state(cmd_buffer, VK_PIPELINE_BIND_POINT_COMPUTE);
13814 struct radv_push_constant_state *pc_state = &cmd_buffer->push_constant_state[VK_PIPELINE_BIND_POINT_COMPUTE];
13815
13816 descriptors_state->need_indirect_descriptor_sets =
13817 radv_get_user_sgpr_info(shader, AC_UD_INDIRECT_DESCRIPTOR_SETS)->sgpr_idx != -1;
13818 pc_state->size = shader_obj->push_constant_size;
13819 pc_state->dynamic_offset_count = shader_obj->dynamic_offset_count;
13820
13821 assert(cmd_buffer->cs->cdw <= cdw_max);
13822 }
13823
13824 VKAPI_ATTR void VKAPI_CALL
13825 radv_CmdBindShadersEXT(VkCommandBuffer commandBuffer, uint32_t stageCount, const VkShaderStageFlagBits *pStages,
13826 const VkShaderEXT *pShaders)
13827 {
13828 VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
13829 VkShaderStageFlagBits bound_stages = 0;
13830
13831 for (uint32_t i = 0; i < stageCount; i++) {
13832 const gl_shader_stage stage = vk_to_mesa_shader_stage(pStages[i]);
13833
13834 if (!pShaders) {
13835 cmd_buffer->state.shader_objs[stage] = NULL;
13836 continue;
13837 }
13838
13839 VK_FROM_HANDLE(radv_shader_object, shader_obj, pShaders[i]);
13840
13841 cmd_buffer->state.shader_objs[stage] = shader_obj;
13842
13843 bound_stages |= pStages[i];
13844 }
13845
13846 if (bound_stages & VK_SHADER_STAGE_COMPUTE_BIT) {
13847 radv_reset_pipeline_state(cmd_buffer, VK_PIPELINE_BIND_POINT_COMPUTE);
13848 radv_mark_descriptor_sets_dirty(cmd_buffer, VK_PIPELINE_BIND_POINT_COMPUTE);
13849
13850 radv_bind_compute_shader(cmd_buffer, cmd_buffer->state.shader_objs[MESA_SHADER_COMPUTE]);
13851 }
13852
13853 if (bound_stages & RADV_GRAPHICS_STAGE_BITS) {
13854 radv_reset_pipeline_state(cmd_buffer, VK_PIPELINE_BIND_POINT_GRAPHICS);
13855 radv_mark_descriptor_sets_dirty(cmd_buffer, VK_PIPELINE_BIND_POINT_GRAPHICS);
13856
13857 /* Graphics shaders are handled at draw time because of shader variants. */
13858 }
13859
13860 cmd_buffer->state.dirty |= RADV_CMD_DIRTY_GRAPHICS_SHADERS;
13861 }
13862
13863 VKAPI_ATTR void VKAPI_CALL
13864 radv_CmdSetCoverageModulationModeNV(VkCommandBuffer commandBuffer, VkCoverageModulationModeNV coverageModulationMode)
13865 {
13866 unreachable("Not supported by RADV.");
13867 }
13868
13869 VKAPI_ATTR void VKAPI_CALL
13870 radv_CmdSetCoverageModulationTableEnableNV(VkCommandBuffer commandBuffer, VkBool32 coverageModulationTableEnable)
13871 {
13872 unreachable("Not supported by RADV.");
13873 }
13874
13875 VKAPI_ATTR void VKAPI_CALL
13876 radv_CmdSetCoverageModulationTableNV(VkCommandBuffer commandBuffer, uint32_t coverageModulationTableCount,
13877 const float *pCoverageModulationTable)
13878 {
13879 unreachable("Not supported by RADV.");
13880 }
13881
13882 VKAPI_ATTR void VKAPI_CALL
13883 radv_CmdSetCoverageReductionModeNV(VkCommandBuffer commandBuffer, VkCoverageReductionModeNV coverageReductionMode)
13884 {
13885 unreachable("Not supported by RADV.");
13886 }
13887
13888 VKAPI_ATTR void VKAPI_CALL
13889 radv_CmdSetCoverageToColorEnableNV(VkCommandBuffer commandBuffer, VkBool32 coverageToColorEnable)
13890 {
13891 unreachable("Not supported by RADV.");
13892 }
13893
13894 VKAPI_ATTR void VKAPI_CALL
13895 radv_CmdSetCoverageToColorLocationNV(VkCommandBuffer commandBuffer, uint32_t coverageToColorLocation)
13896 {
13897 unreachable("Not supported by RADV.");
13898 }
13899
13900 VKAPI_ATTR void VKAPI_CALL
13901 radv_CmdSetRepresentativeFragmentTestEnableNV(VkCommandBuffer commandBuffer, VkBool32 representativeFragmentTestEnable)
13902 {
13903 unreachable("Not supported by RADV.");
13904 }
13905
13906 VKAPI_ATTR void VKAPI_CALL
13907 radv_CmdSetShadingRateImageEnableNV(VkCommandBuffer commandBuffer, VkBool32 shadingRateImageEnable)
13908 {
13909 unreachable("Not supported by RADV.");
13910 }
13911
13912 VKAPI_ATTR void VKAPI_CALL
13913 radv_CmdSetViewportSwizzleNV(VkCommandBuffer commandBuffer, uint32_t firstViewport, uint32_t viewportCount,
13914 const VkViewportSwizzleNV *pViewportSwizzles)
13915 {
13916 unreachable("Not supported by RADV.");
13917 }
13918
13919 VKAPI_ATTR void VKAPI_CALL
13920 radv_CmdSetViewportWScalingEnableNV(VkCommandBuffer commandBuffer, VkBool32 viewportWScalingEnable)
13921 {
13922 unreachable("Not supported by RADV.");
13923 }
13924