1 /*
2  * Copyright © 2016 Red Hat.
3  * Copyright © 2016 Bas Nieuwenhuizen
4  *
5  * based in part on anv driver which is:
6  * Copyright © 2015 Intel Corporation
7  *
8  * SPDX-License-Identifier: MIT
9  */
10 
11 #include "radv_cmd_buffer.h"
12 #include "meta/radv_meta.h"
13 #include "radv_cp_dma.h"
14 #include "radv_cs.h"
15 #include "radv_debug.h"
16 #include "radv_device_generated_commands.h"
17 #include "radv_event.h"
18 #include "radv_pipeline_rt.h"
19 #include "radv_radeon_winsys.h"
20 #include "radv_rmv.h"
21 #include "radv_rra.h"
22 #include "radv_shader.h"
23 #include "radv_shader_object.h"
24 #include "radv_sqtt.h"
25 #include "sid.h"
26 #include "vk_command_pool.h"
27 #include "vk_common_entrypoints.h"
28 #include "vk_enum_defines.h"
29 #include "vk_format.h"
30 #include "vk_framebuffer.h"
31 #include "vk_render_pass.h"
32 #include "vk_synchronization.h"
33 #include "vk_util.h"
34 
35 #include "ac_debug.h"
36 #include "ac_descriptors.h"
37 #include "ac_nir.h"
38 #include "ac_shader_args.h"
39 
40 #include "aco_interface.h"
41 
42 #include "util/fast_idiv_by_const.h"
43 
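/* Bitflags for the vertex-buffer descriptors and shader stages that can be prefetched
 * (RADV_PREFETCH_SHADERS is the shader-only mask). */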
44 enum {
45    RADV_PREFETCH_VBO_DESCRIPTORS = (1 << 0),
46    RADV_PREFETCH_VS = (1 << 1),
47    RADV_PREFETCH_TCS = (1 << 2),
48    RADV_PREFETCH_TES = (1 << 3),
49    RADV_PREFETCH_GS = (1 << 4),
50    RADV_PREFETCH_PS = (1 << 5),
51    RADV_PREFETCH_MS = (1 << 6),
52    RADV_PREFETCH_SHADERS = (RADV_PREFETCH_VS | RADV_PREFETCH_TCS | RADV_PREFETCH_TES | RADV_PREFETCH_GS |
53                             RADV_PREFETCH_PS | RADV_PREFETCH_MS)
54 };
55 
56 static void radv_handle_image_transition(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
57                                          VkImageLayout src_layout, VkImageLayout dst_layout, uint32_t src_family_index,
58                                          uint32_t dst_family_index, const VkImageSubresourceRange *range,
59                                          struct radv_sample_locations_state *sample_locs);
60 
61 static void
62 radv_bind_dynamic_state(struct radv_cmd_buffer *cmd_buffer, const struct radv_dynamic_state *src)
63 {
64    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
65    const struct radv_physical_device *pdev = radv_device_physical(device);
66    struct radv_dynamic_state *dest = &cmd_buffer->state.dynamic;
67    uint64_t copy_mask = src->mask;
68    uint64_t dest_mask = 0;
69 
70    dest->vk.dr.rectangle_count = src->vk.dr.rectangle_count;
71    dest->sample_location.count = src->sample_location.count;
72 
73    if (copy_mask & RADV_DYNAMIC_VIEWPORT) {
74       if (dest->vk.vp.viewport_count != src->vk.vp.viewport_count) {
75          dest->vk.vp.viewport_count = src->vk.vp.viewport_count;
76          dest_mask |= RADV_DYNAMIC_VIEWPORT;
77       }
78 
79       if (memcmp(&dest->vk.vp.viewports, &src->vk.vp.viewports, src->vk.vp.viewport_count * sizeof(VkViewport))) {
80          typed_memcpy(dest->vk.vp.viewports, src->vk.vp.viewports, src->vk.vp.viewport_count);
81          typed_memcpy(dest->hw_vp.xform, src->hw_vp.xform, src->vk.vp.viewport_count);
82          dest_mask |= RADV_DYNAMIC_VIEWPORT;
83       }
84    }
85 
86    if (copy_mask & RADV_DYNAMIC_SCISSOR) {
87       if (dest->vk.vp.scissor_count != src->vk.vp.scissor_count) {
88          dest->vk.vp.scissor_count = src->vk.vp.scissor_count;
89          dest_mask |= RADV_DYNAMIC_SCISSOR;
90       }
91 
92       if (memcmp(&dest->vk.vp.scissors, &src->vk.vp.scissors, src->vk.vp.scissor_count * sizeof(VkRect2D))) {
93          typed_memcpy(dest->vk.vp.scissors, src->vk.vp.scissors, src->vk.vp.scissor_count);
94          dest_mask |= RADV_DYNAMIC_SCISSOR;
95       }
96    }
97 
98    if (copy_mask & RADV_DYNAMIC_BLEND_CONSTANTS) {
99       if (memcmp(&dest->vk.cb.blend_constants, &src->vk.cb.blend_constants, sizeof(src->vk.cb.blend_constants))) {
100          typed_memcpy(dest->vk.cb.blend_constants, src->vk.cb.blend_constants, 4);
101          dest_mask |= RADV_DYNAMIC_BLEND_CONSTANTS;
102       }
103    }
104 
105    if (copy_mask & RADV_DYNAMIC_DISCARD_RECTANGLE) {
106       if (memcmp(&dest->vk.dr.rectangles, &src->vk.dr.rectangles, src->vk.dr.rectangle_count * sizeof(VkRect2D))) {
107          typed_memcpy(dest->vk.dr.rectangles, src->vk.dr.rectangles, src->vk.dr.rectangle_count);
108          dest_mask |= RADV_DYNAMIC_DISCARD_RECTANGLE;
109       }
110    }
111 
112    if (copy_mask & RADV_DYNAMIC_SAMPLE_LOCATIONS) {
113       if (dest->sample_location.per_pixel != src->sample_location.per_pixel ||
114           dest->sample_location.grid_size.width != src->sample_location.grid_size.width ||
115           dest->sample_location.grid_size.height != src->sample_location.grid_size.height ||
116           memcmp(&dest->sample_location.locations, &src->sample_location.locations,
117                  src->sample_location.count * sizeof(VkSampleLocationEXT))) {
118          dest->sample_location.per_pixel = src->sample_location.per_pixel;
119          dest->sample_location.grid_size = src->sample_location.grid_size;
120          typed_memcpy(dest->sample_location.locations, src->sample_location.locations, src->sample_location.count);
121          dest_mask |= RADV_DYNAMIC_SAMPLE_LOCATIONS;
122       }
123    }
124 
125    if (copy_mask & RADV_DYNAMIC_COLOR_WRITE_MASK) {
126       for (uint32_t i = 0; i < MAX_RTS; i++) {
127          if (dest->vk.cb.attachments[i].write_mask != src->vk.cb.attachments[i].write_mask) {
128             dest->vk.cb.attachments[i].write_mask = src->vk.cb.attachments[i].write_mask;
129             dest_mask |= RADV_DYNAMIC_COLOR_WRITE_MASK;
130          }
131       }
132    }
133 
134    if (copy_mask & RADV_DYNAMIC_COLOR_BLEND_ENABLE) {
135       for (uint32_t i = 0; i < MAX_RTS; i++) {
136          if (dest->vk.cb.attachments[i].blend_enable != src->vk.cb.attachments[i].blend_enable) {
137             dest->vk.cb.attachments[i].blend_enable = src->vk.cb.attachments[i].blend_enable;
138             dest_mask |= RADV_DYNAMIC_COLOR_BLEND_ENABLE;
139          }
140       }
141    }
142 
143    if (copy_mask & RADV_DYNAMIC_COLOR_BLEND_EQUATION) {
144       for (uint32_t i = 0; i < MAX_RTS; i++) {
145          if (dest->vk.cb.attachments[i].src_color_blend_factor != src->vk.cb.attachments[i].src_color_blend_factor ||
146              dest->vk.cb.attachments[i].dst_color_blend_factor != src->vk.cb.attachments[i].dst_color_blend_factor ||
147              dest->vk.cb.attachments[i].color_blend_op != src->vk.cb.attachments[i].color_blend_op ||
148              dest->vk.cb.attachments[i].src_alpha_blend_factor != src->vk.cb.attachments[i].src_alpha_blend_factor ||
149              dest->vk.cb.attachments[i].dst_alpha_blend_factor != src->vk.cb.attachments[i].dst_alpha_blend_factor ||
150              dest->vk.cb.attachments[i].alpha_blend_op != src->vk.cb.attachments[i].alpha_blend_op) {
151             dest->vk.cb.attachments[i].src_color_blend_factor = src->vk.cb.attachments[i].src_color_blend_factor;
152             dest->vk.cb.attachments[i].dst_color_blend_factor = src->vk.cb.attachments[i].dst_color_blend_factor;
153             dest->vk.cb.attachments[i].color_blend_op = src->vk.cb.attachments[i].color_blend_op;
154             dest->vk.cb.attachments[i].src_alpha_blend_factor = src->vk.cb.attachments[i].src_alpha_blend_factor;
155             dest->vk.cb.attachments[i].dst_alpha_blend_factor = src->vk.cb.attachments[i].dst_alpha_blend_factor;
156             dest->vk.cb.attachments[i].alpha_blend_op = src->vk.cb.attachments[i].alpha_blend_op;
157             dest_mask |= RADV_DYNAMIC_COLOR_BLEND_EQUATION;
158          }
159       }
160    }
161 
162    if (memcmp(&dest->vk.cal.color_map, &src->vk.cal.color_map, sizeof(src->vk.cal.color_map))) {
163       typed_memcpy(dest->vk.cal.color_map, src->vk.cal.color_map, MAX_RTS);
164       dest_mask |= RADV_DYNAMIC_COLOR_ATTACHMENT_MAP;
165    }
166 
167    if (memcmp(&dest->vk.ial, &src->vk.ial, sizeof(src->vk.ial))) {
168       typed_memcpy(dest->vk.ial.color_map, src->vk.ial.color_map, MAX_RTS);
169       dest->vk.ial.depth_att = src->vk.ial.depth_att;
170       dest->vk.ial.stencil_att = src->vk.ial.stencil_att;
171       dest_mask |= RADV_DYNAMIC_INPUT_ATTACHMENT_MAP;
172    }
173 
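/* Copy one dynamic-state field from src to dest when its flag is requested in copy_mask
 * and the value differs, and record the change in dest_mask. */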
174 #define RADV_CMP_COPY(field, flag)                                                                                     \
175    if (copy_mask & flag) {                                                                                             \
176       if (dest->field != src->field) {                                                                                 \
177          dest->field = src->field;                                                                                     \
178          dest_mask |= flag;                                                                                            \
179       }                                                                                                                \
180    }
181 
182    RADV_CMP_COPY(vk.ia.primitive_topology, RADV_DYNAMIC_PRIMITIVE_TOPOLOGY);
183    RADV_CMP_COPY(vk.ia.primitive_restart_enable, RADV_DYNAMIC_PRIMITIVE_RESTART_ENABLE);
184 
185    RADV_CMP_COPY(vk.vp.depth_clip_negative_one_to_one, RADV_DYNAMIC_DEPTH_CLIP_NEGATIVE_ONE_TO_ONE);
186 
187    RADV_CMP_COPY(vk.ts.patch_control_points, RADV_DYNAMIC_PATCH_CONTROL_POINTS);
188    RADV_CMP_COPY(vk.ts.domain_origin, RADV_DYNAMIC_TESS_DOMAIN_ORIGIN);
189 
190    RADV_CMP_COPY(vk.rs.line.width, RADV_DYNAMIC_LINE_WIDTH);
191    RADV_CMP_COPY(vk.rs.depth_bias.constant, RADV_DYNAMIC_DEPTH_BIAS);
192    RADV_CMP_COPY(vk.rs.depth_bias.clamp, RADV_DYNAMIC_DEPTH_BIAS);
193    RADV_CMP_COPY(vk.rs.depth_bias.slope, RADV_DYNAMIC_DEPTH_BIAS);
194    RADV_CMP_COPY(vk.rs.depth_bias.representation, RADV_DYNAMIC_DEPTH_BIAS);
195    RADV_CMP_COPY(vk.rs.line.stipple.factor, RADV_DYNAMIC_LINE_STIPPLE);
196    RADV_CMP_COPY(vk.rs.line.stipple.pattern, RADV_DYNAMIC_LINE_STIPPLE);
197    RADV_CMP_COPY(vk.rs.cull_mode, RADV_DYNAMIC_CULL_MODE);
198    RADV_CMP_COPY(vk.rs.front_face, RADV_DYNAMIC_FRONT_FACE);
199    RADV_CMP_COPY(vk.rs.depth_bias.enable, RADV_DYNAMIC_DEPTH_BIAS_ENABLE);
200    RADV_CMP_COPY(vk.rs.rasterizer_discard_enable, RADV_DYNAMIC_RASTERIZER_DISCARD_ENABLE);
201    RADV_CMP_COPY(vk.rs.polygon_mode, RADV_DYNAMIC_POLYGON_MODE);
202    RADV_CMP_COPY(vk.rs.line.stipple.enable, RADV_DYNAMIC_LINE_STIPPLE_ENABLE);
203    RADV_CMP_COPY(vk.rs.depth_clip_enable, RADV_DYNAMIC_DEPTH_CLIP_ENABLE);
204    RADV_CMP_COPY(vk.rs.conservative_mode, RADV_DYNAMIC_CONSERVATIVE_RAST_MODE);
205    RADV_CMP_COPY(vk.rs.provoking_vertex, RADV_DYNAMIC_PROVOKING_VERTEX_MODE);
206    RADV_CMP_COPY(vk.rs.depth_clamp_enable, RADV_DYNAMIC_DEPTH_CLAMP_ENABLE);
207    RADV_CMP_COPY(vk.rs.line.mode, RADV_DYNAMIC_LINE_RASTERIZATION_MODE);
208 
209    RADV_CMP_COPY(vk.ms.alpha_to_coverage_enable, RADV_DYNAMIC_ALPHA_TO_COVERAGE_ENABLE);
210    RADV_CMP_COPY(vk.ms.alpha_to_one_enable, RADV_DYNAMIC_ALPHA_TO_ONE_ENABLE);
211    RADV_CMP_COPY(vk.ms.sample_mask, RADV_DYNAMIC_SAMPLE_MASK);
212    RADV_CMP_COPY(vk.ms.rasterization_samples, RADV_DYNAMIC_RASTERIZATION_SAMPLES);
213    RADV_CMP_COPY(vk.ms.sample_locations_enable, RADV_DYNAMIC_SAMPLE_LOCATIONS_ENABLE);
214 
215    RADV_CMP_COPY(vk.ds.depth.bounds_test.min, RADV_DYNAMIC_DEPTH_BOUNDS);
216    RADV_CMP_COPY(vk.ds.depth.bounds_test.max, RADV_DYNAMIC_DEPTH_BOUNDS);
217    RADV_CMP_COPY(vk.ds.stencil.front.compare_mask, RADV_DYNAMIC_STENCIL_COMPARE_MASK);
218    RADV_CMP_COPY(vk.ds.stencil.back.compare_mask, RADV_DYNAMIC_STENCIL_COMPARE_MASK);
219    RADV_CMP_COPY(vk.ds.stencil.front.write_mask, RADV_DYNAMIC_STENCIL_WRITE_MASK);
220    RADV_CMP_COPY(vk.ds.stencil.back.write_mask, RADV_DYNAMIC_STENCIL_WRITE_MASK);
221    RADV_CMP_COPY(vk.ds.stencil.front.reference, RADV_DYNAMIC_STENCIL_REFERENCE);
222    RADV_CMP_COPY(vk.ds.stencil.back.reference, RADV_DYNAMIC_STENCIL_REFERENCE);
223    RADV_CMP_COPY(vk.ds.depth.test_enable, RADV_DYNAMIC_DEPTH_TEST_ENABLE);
224    RADV_CMP_COPY(vk.ds.depth.write_enable, RADV_DYNAMIC_DEPTH_WRITE_ENABLE);
225    RADV_CMP_COPY(vk.ds.depth.compare_op, RADV_DYNAMIC_DEPTH_COMPARE_OP);
226    RADV_CMP_COPY(vk.ds.depth.bounds_test.enable, RADV_DYNAMIC_DEPTH_BOUNDS_TEST_ENABLE);
227    RADV_CMP_COPY(vk.ds.stencil.test_enable, RADV_DYNAMIC_STENCIL_TEST_ENABLE);
228    RADV_CMP_COPY(vk.ds.stencil.front.op.fail, RADV_DYNAMIC_STENCIL_OP);
229    RADV_CMP_COPY(vk.ds.stencil.front.op.pass, RADV_DYNAMIC_STENCIL_OP);
230    RADV_CMP_COPY(vk.ds.stencil.front.op.depth_fail, RADV_DYNAMIC_STENCIL_OP);
231    RADV_CMP_COPY(vk.ds.stencil.front.op.compare, RADV_DYNAMIC_STENCIL_OP);
232    RADV_CMP_COPY(vk.ds.stencil.back.op.fail, RADV_DYNAMIC_STENCIL_OP);
233    RADV_CMP_COPY(vk.ds.stencil.back.op.pass, RADV_DYNAMIC_STENCIL_OP);
234    RADV_CMP_COPY(vk.ds.stencil.back.op.depth_fail, RADV_DYNAMIC_STENCIL_OP);
235    RADV_CMP_COPY(vk.ds.stencil.back.op.compare, RADV_DYNAMIC_STENCIL_OP);
236 
237    RADV_CMP_COPY(vk.cb.logic_op, RADV_DYNAMIC_LOGIC_OP);
238    RADV_CMP_COPY(vk.cb.color_write_enables, RADV_DYNAMIC_COLOR_WRITE_ENABLE);
239    RADV_CMP_COPY(vk.cb.logic_op_enable, RADV_DYNAMIC_LOGIC_OP_ENABLE);
240 
241    RADV_CMP_COPY(vk.fsr.fragment_size.width, RADV_DYNAMIC_FRAGMENT_SHADING_RATE);
242    RADV_CMP_COPY(vk.fsr.fragment_size.height, RADV_DYNAMIC_FRAGMENT_SHADING_RATE);
243    RADV_CMP_COPY(vk.fsr.combiner_ops[0], RADV_DYNAMIC_FRAGMENT_SHADING_RATE);
244    RADV_CMP_COPY(vk.fsr.combiner_ops[1], RADV_DYNAMIC_FRAGMENT_SHADING_RATE);
245 
246    RADV_CMP_COPY(vk.dr.enable, RADV_DYNAMIC_DISCARD_RECTANGLE_ENABLE);
247    RADV_CMP_COPY(vk.dr.mode, RADV_DYNAMIC_DISCARD_RECTANGLE_MODE);
248 
249    RADV_CMP_COPY(feedback_loop_aspects, RADV_DYNAMIC_ATTACHMENT_FEEDBACK_LOOP_ENABLE);
250 
251 #undef RADV_CMP_COPY
252 
253    cmd_buffer->state.dirty_dynamic |= dest_mask;
254 
255    /* Handle driver specific states that need to be re-emitted when PSO are bound. */
256    if (dest_mask & (RADV_DYNAMIC_VIEWPORT | RADV_DYNAMIC_POLYGON_MODE | RADV_DYNAMIC_LINE_WIDTH |
257                     RADV_DYNAMIC_PRIMITIVE_TOPOLOGY)) {
258       cmd_buffer->state.dirty |= RADV_CMD_DIRTY_GUARDBAND;
259    }
260 
261    if (pdev->info.rbplus_allowed && (dest_mask & RADV_DYNAMIC_COLOR_WRITE_MASK)) {
262       cmd_buffer->state.dirty |= RADV_CMD_DIRTY_RBPLUS;
263    }
264 
265    if (dest_mask & (RADV_DYNAMIC_COLOR_ATTACHMENT_MAP | RADV_DYNAMIC_INPUT_ATTACHMENT_MAP)) {
266       cmd_buffer->state.dirty |= RADV_CMD_DIRTY_FBFETCH_OUTPUT;
267    }
268 }
269 
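/* Compute queues are executed by the MEC (compute micro-engine) on GFX7 and later. */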
270 bool
271 radv_cmd_buffer_uses_mec(struct radv_cmd_buffer *cmd_buffer)
272 {
273    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
274    const struct radv_physical_device *pdev = radv_device_physical(device);
275    return cmd_buffer->qf == RADV_QUEUE_COMPUTE && pdev->info.gfx_level >= GFX7;
276 }
277 
278 static void
279 radv_write_data(struct radv_cmd_buffer *cmd_buffer, const unsigned engine_sel, const uint64_t va, const unsigned count,
280                 const uint32_t *data, const bool predicating)
281 {
282    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
283 
284    radv_cs_write_data(device, cmd_buffer->cs, cmd_buffer->qf, engine_sel, va, count, data, predicating);
285 }
286 
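/* Zero a GPU buffer range by writing size/4 dwords of zeroes through radv_write_data(). */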
287 static void
288 radv_emit_clear_data(struct radv_cmd_buffer *cmd_buffer, unsigned engine_sel, uint64_t va, unsigned size)
289 {
290    uint32_t *zeroes = alloca(size);
291    memset(zeroes, 0, size);
292    radv_write_data(cmd_buffer, engine_sel, va, size / 4, zeroes, false);
293 }
294 
295 static void
296 radv_cmd_buffer_finish_shader_part_cache(struct radv_cmd_buffer *cmd_buffer)
297 {
298    ralloc_free(cmd_buffer->vs_prologs.table);
299    ralloc_free(cmd_buffer->ps_epilogs.table);
300 }
301 
302 static bool
303 radv_cmd_buffer_init_shader_part_cache(struct radv_device *device, struct radv_cmd_buffer *cmd_buffer)
304 {
305    if (device->vs_prologs.ops) {
306       if (!_mesa_set_init(&cmd_buffer->vs_prologs, NULL, device->vs_prologs.ops->hash, device->vs_prologs.ops->equals))
307          return false;
308    }
309    if (device->ps_epilogs.ops) {
310       if (!_mesa_set_init(&cmd_buffer->ps_epilogs, NULL, device->ps_epilogs.ops->hash, device->ps_epilogs.ops->equals))
311          return false;
312    }
313    return true;
314 }
315 
316 static void
317 radv_destroy_cmd_buffer(struct vk_command_buffer *vk_cmd_buffer)
318 {
319    struct radv_cmd_buffer *cmd_buffer = container_of(vk_cmd_buffer, struct radv_cmd_buffer, vk);
320    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
321 
322    if (cmd_buffer->qf != RADV_QUEUE_SPARSE) {
323       util_dynarray_fini(&cmd_buffer->ray_history);
324 
325       radv_rra_accel_struct_buffers_unref(device, cmd_buffer->accel_struct_buffers);
326       _mesa_set_destroy(cmd_buffer->accel_struct_buffers, NULL);
327 
328       list_for_each_entry_safe (struct radv_cmd_buffer_upload, up, &cmd_buffer->upload.list, list) {
329          radv_rmv_log_command_buffer_bo_destroy(device, up->upload_bo);
330          radv_bo_destroy(device, &cmd_buffer->vk.base, up->upload_bo);
331          list_del(&up->list);
332          free(up);
333       }
334 
335       if (cmd_buffer->upload.upload_bo) {
336          radv_rmv_log_command_buffer_bo_destroy(device, cmd_buffer->upload.upload_bo);
337          radv_bo_destroy(device, &cmd_buffer->vk.base, cmd_buffer->upload.upload_bo);
338       }
339 
340       if (cmd_buffer->cs)
341          device->ws->cs_destroy(cmd_buffer->cs);
342       if (cmd_buffer->gang.cs)
343          device->ws->cs_destroy(cmd_buffer->gang.cs);
344       if (cmd_buffer->transfer.copy_temp)
345          radv_bo_destroy(device, &cmd_buffer->vk.base, cmd_buffer->transfer.copy_temp);
346 
347       radv_cmd_buffer_finish_shader_part_cache(cmd_buffer);
348 
349       for (unsigned i = 0; i < MAX_BIND_POINTS; i++) {
350          struct radv_descriptor_set_header *set = &cmd_buffer->descriptors[i].push_set.set;
351          free(set->mapped_ptr);
352          if (set->layout)
353             vk_descriptor_set_layout_unref(&device->vk, &set->layout->vk);
354          vk_object_base_finish(&set->base);
355       }
356 
357       vk_object_base_finish(&cmd_buffer->meta_push_descriptors.base);
358    }
359 
360    vk_command_buffer_finish(&cmd_buffer->vk);
361    vk_free(&cmd_buffer->vk.pool->alloc, cmd_buffer);
362 }
363 
364 static VkResult
365 radv_create_cmd_buffer(struct vk_command_pool *pool, VkCommandBufferLevel level,
366                        struct vk_command_buffer **cmd_buffer_out)
367 {
368    struct radv_device *device = container_of(pool->base.device, struct radv_device, vk);
369    const struct radv_physical_device *pdev = radv_device_physical(device);
370    struct radv_cmd_buffer *cmd_buffer;
371    unsigned ring;
372    cmd_buffer = vk_zalloc(&pool->alloc, sizeof(*cmd_buffer), 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
373    if (cmd_buffer == NULL)
374       return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
375 
376    VkResult result = vk_command_buffer_init(pool, &cmd_buffer->vk, &radv_cmd_buffer_ops, level);
377    if (result != VK_SUCCESS) {
378       vk_free(&cmd_buffer->vk.pool->alloc, cmd_buffer);
379       return result;
380    }
381 
382    cmd_buffer->qf = vk_queue_to_radv(pdev, pool->queue_family_index);
383 
384    if (cmd_buffer->qf != RADV_QUEUE_SPARSE) {
385       list_inithead(&cmd_buffer->upload.list);
386 
387       if (!radv_cmd_buffer_init_shader_part_cache(device, cmd_buffer)) {
388          radv_destroy_cmd_buffer(&cmd_buffer->vk);
389          return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
390       }
391 
392       ring = radv_queue_family_to_ring(pdev, cmd_buffer->qf);
393 
394       cmd_buffer->cs =
395          device->ws->cs_create(device->ws, ring, cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY);
396       if (!cmd_buffer->cs) {
397          radv_destroy_cmd_buffer(&cmd_buffer->vk);
398          return vk_error(device, VK_ERROR_OUT_OF_DEVICE_MEMORY);
399       }
400 
401       vk_object_base_init(&device->vk, &cmd_buffer->meta_push_descriptors.base, VK_OBJECT_TYPE_DESCRIPTOR_SET);
402 
403       for (unsigned i = 0; i < MAX_BIND_POINTS; i++)
404          vk_object_base_init(&device->vk, &cmd_buffer->descriptors[i].push_set.set.base, VK_OBJECT_TYPE_DESCRIPTOR_SET);
405 
406       cmd_buffer->accel_struct_buffers = _mesa_pointer_set_create(NULL);
407       util_dynarray_init(&cmd_buffer->ray_history, NULL);
408    }
409 
410    *cmd_buffer_out = &cmd_buffer->vk;
411 
412    return VK_SUCCESS;
413 }
414 
415 void
416 radv_cmd_buffer_reset_rendering(struct radv_cmd_buffer *cmd_buffer)
417 {
418    memset(&cmd_buffer->state.render, 0, sizeof(cmd_buffer->state.render));
419 }
420 
421 static void
422 radv_reset_tracked_regs(struct radv_cmd_buffer *cmd_buffer)
423 {
424    struct radv_tracked_regs *tracked_regs = &cmd_buffer->tracked_regs;
425 
426    /* Mark all registers as unknown. */
427    memset(tracked_regs->reg_value, 0, RADV_NUM_ALL_TRACKED_REGS * sizeof(uint32_t));
428    BITSET_ZERO(tracked_regs->reg_saved_mask);
429 
430    /* 0xffffffff is an impossible value for SPI_PS_INPUT_CNTL_n registers */
431    memset(tracked_regs->spi_ps_input_cntl, 0xff, sizeof(uint32_t) * 32);
432 }
433 
434 static void
435 radv_reset_cmd_buffer(struct vk_command_buffer *vk_cmd_buffer, UNUSED VkCommandBufferResetFlags flags)
436 {
437    struct radv_cmd_buffer *cmd_buffer = container_of(vk_cmd_buffer, struct radv_cmd_buffer, vk);
438    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
439 
440    vk_command_buffer_reset(&cmd_buffer->vk);
441 
442    if (cmd_buffer->qf == RADV_QUEUE_SPARSE)
443       return;
444 
445    device->ws->cs_reset(cmd_buffer->cs);
446    if (cmd_buffer->gang.cs)
447       device->ws->cs_reset(cmd_buffer->gang.cs);
448 
449    list_for_each_entry_safe (struct radv_cmd_buffer_upload, up, &cmd_buffer->upload.list, list) {
450       radv_rmv_log_command_buffer_bo_destroy(device, up->upload_bo);
451       radv_bo_destroy(device, &cmd_buffer->vk.base, up->upload_bo);
452       list_del(&up->list);
453       free(up);
454    }
455 
456    util_dynarray_clear(&cmd_buffer->ray_history);
457 
458    radv_rra_accel_struct_buffers_unref(device, cmd_buffer->accel_struct_buffers);
459 
460    cmd_buffer->push_constant_stages = 0;
461    cmd_buffer->scratch_size_per_wave_needed = 0;
462    cmd_buffer->scratch_waves_wanted = 0;
463    cmd_buffer->compute_scratch_size_per_wave_needed = 0;
464    cmd_buffer->compute_scratch_waves_wanted = 0;
465    cmd_buffer->esgs_ring_size_needed = 0;
466    cmd_buffer->gsvs_ring_size_needed = 0;
467    cmd_buffer->tess_rings_needed = false;
468    cmd_buffer->task_rings_needed = false;
469    cmd_buffer->mesh_scratch_ring_needed = false;
470    cmd_buffer->gds_needed = false;
471    cmd_buffer->gds_oa_needed = false;
472    cmd_buffer->sample_positions_needed = false;
473    cmd_buffer->gang.sem.leader_value = 0;
474    cmd_buffer->gang.sem.emitted_leader_value = 0;
475    cmd_buffer->gang.sem.va = 0;
476    cmd_buffer->shader_upload_seq = 0;
477    cmd_buffer->has_indirect_pipeline_binds = false;
478 
479    if (cmd_buffer->upload.upload_bo)
480       radv_cs_add_buffer(device->ws, cmd_buffer->cs, cmd_buffer->upload.upload_bo);
481    cmd_buffer->upload.offset = 0;
482 
483    memset(cmd_buffer->vertex_binding_buffers, 0, sizeof(struct radv_buffer *) * cmd_buffer->used_vertex_bindings);
484    cmd_buffer->used_vertex_bindings = 0;
485 
486    for (unsigned i = 0; i < MAX_BIND_POINTS; i++) {
487       cmd_buffer->descriptors[i].dirty = 0;
488       cmd_buffer->descriptors[i].valid = 0;
489    }
490 
491    radv_cmd_buffer_reset_rendering(cmd_buffer);
492 }
493 
494 const struct vk_command_buffer_ops radv_cmd_buffer_ops = {
495    .create = radv_create_cmd_buffer,
496    .reset = radv_reset_cmd_buffer,
497    .destroy = radv_destroy_cmd_buffer,
498 };
499 
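/* Grow the upload BO to at least min_needed bytes (and at least twice the current size).
 * The previous BO is kept on the upload list so its contents remain valid until the
 * command buffer is reset or destroyed. */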
500 static bool
501 radv_cmd_buffer_resize_upload_buf(struct radv_cmd_buffer *cmd_buffer, uint64_t min_needed)
502 {
503    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
504    uint64_t new_size;
505    struct radeon_winsys_bo *bo = NULL;
506    struct radv_cmd_buffer_upload *upload;
507 
508    new_size = MAX2(min_needed, 16 * 1024);
509    new_size = MAX2(new_size, 2 * cmd_buffer->upload.size);
510 
511    VkResult result = radv_bo_create(
512       device, &cmd_buffer->vk.base, new_size, 4096, device->ws->cs_domain(device->ws),
513       RADEON_FLAG_CPU_ACCESS | RADEON_FLAG_NO_INTERPROCESS_SHARING | RADEON_FLAG_32BIT | RADEON_FLAG_GTT_WC,
514       RADV_BO_PRIORITY_UPLOAD_BUFFER, 0, true, &bo);
515 
516    if (result != VK_SUCCESS) {
517       vk_command_buffer_set_error(&cmd_buffer->vk, result);
518       return false;
519    }
520 
521    radv_cs_add_buffer(device->ws, cmd_buffer->cs, bo);
522    if (cmd_buffer->upload.upload_bo) {
523       upload = malloc(sizeof(*upload));
524 
525       if (!upload) {
526          vk_command_buffer_set_error(&cmd_buffer->vk, VK_ERROR_OUT_OF_HOST_MEMORY);
527          radv_bo_destroy(device, &cmd_buffer->vk.base, bo);
528          return false;
529       }
530 
531       memcpy(upload, &cmd_buffer->upload, sizeof(*upload));
532       list_add(&upload->list, &cmd_buffer->upload.list);
533    }
534 
535    cmd_buffer->upload.upload_bo = bo;
536    cmd_buffer->upload.size = new_size;
537    cmd_buffer->upload.offset = 0;
538    cmd_buffer->upload.map = radv_buffer_map(device->ws, cmd_buffer->upload.upload_bo);
539 
540    if (!cmd_buffer->upload.map) {
541       vk_command_buffer_set_error(&cmd_buffer->vk, VK_ERROR_OUT_OF_DEVICE_MEMORY);
542       return false;
543    }
544 
545    radv_rmv_log_command_buffer_bo_create(device, cmd_buffer->upload.upload_bo, 0, cmd_buffer->upload.size, 0);
546 
547    return true;
548 }
549 
550 bool
551 radv_cmd_buffer_upload_alloc_aligned(struct radv_cmd_buffer *cmd_buffer, unsigned size, unsigned alignment,
552                                      unsigned *out_offset, void **ptr)
553 {
554    assert(size % 4 == 0);
555 
556    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
557    const struct radv_physical_device *pdev = radv_device_physical(device);
558    const struct radeon_info *gpu_info = &pdev->info;
559 
560    /* Align to the scalar cache line size if it results in this allocation
561     * being placed in less of them.
562     */
563    unsigned offset = cmd_buffer->upload.offset;
564    unsigned line_size = gpu_info->gfx_level >= GFX10 ? 64 : 32;
565    unsigned gap = align(offset, line_size) - offset;
566    if ((size & (line_size - 1)) > gap)
567       offset = align(offset, line_size);
568 
569    if (alignment)
570       offset = align(offset, alignment);
571    if (offset + size > cmd_buffer->upload.size) {
572       if (!radv_cmd_buffer_resize_upload_buf(cmd_buffer, size))
573          return false;
574       offset = 0;
575    }
576 
577    *out_offset = offset;
578    *ptr = cmd_buffer->upload.map + offset;
579 
580    cmd_buffer->upload.offset = offset + size;
581    return true;
582 }
583 
584 bool
585 radv_cmd_buffer_upload_alloc(struct radv_cmd_buffer *cmd_buffer, unsigned size, unsigned *out_offset, void **ptr)
586 {
587    return radv_cmd_buffer_upload_alloc_aligned(cmd_buffer, size, 0, out_offset, ptr);
588 }
589 
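/* Allocate upload space, copy the given data into it and return the offset within the upload BO. */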
590 bool
591 radv_cmd_buffer_upload_data(struct radv_cmd_buffer *cmd_buffer, unsigned size, const void *data, unsigned *out_offset)
592 {
593    uint8_t *ptr;
594 
595    if (!radv_cmd_buffer_upload_alloc(cmd_buffer, size, out_offset, (void **)&ptr))
596       return false;
597    assert(ptr);
598 
599    memcpy(ptr, data, size);
600    return true;
601 }
602 
603 void
604 radv_cmd_buffer_trace_emit(struct radv_cmd_buffer *cmd_buffer)
605 {
606    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
607    struct radeon_cmdbuf *cs = cmd_buffer->cs;
608    uint64_t va;
609 
610    if (cmd_buffer->qf != RADV_QUEUE_GENERAL && cmd_buffer->qf != RADV_QUEUE_COMPUTE)
611       return;
612 
613    va = radv_buffer_get_va(device->trace_bo);
614    if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY)
615       va += offsetof(struct radv_trace_data, primary_id);
616    else
617       va += offsetof(struct radv_trace_data, secondary_id);
618 
619    ++cmd_buffer->state.trace_id;
620    radv_write_data(cmd_buffer, V_370_ME, va, 1, &cmd_buffer->state.trace_id, false);
621 
622    radeon_check_space(device->ws, cs, 2);
623 
624    radeon_emit(cs, PKT3(PKT3_NOP, 0, 0));
625    radeon_emit(cs, AC_ENCODE_TRACE_POINT(cmd_buffer->state.trace_id));
626 }
627 
628 void
629 radv_cmd_buffer_annotate(struct radv_cmd_buffer *cmd_buffer, const char *annotation)
630 {
631    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
632 
633    device->ws->cs_annotate(cmd_buffer->cs, annotation);
634 }
635 
636 static void
637 radv_gang_barrier(struct radv_cmd_buffer *cmd_buffer, VkPipelineStageFlags2 src_stage_mask,
638                   VkPipelineStageFlags2 dst_stage_mask)
639 {
640    /* Update flush bits from the main cmdbuf, except the stage flush. */
641    cmd_buffer->gang.flush_bits |=
642       cmd_buffer->state.flush_bits & RADV_CMD_FLUSH_ALL_COMPUTE & ~RADV_CMD_FLAG_CS_PARTIAL_FLUSH;
643 
644    /* Add stage flush only when necessary. */
645    if (src_stage_mask & (VK_PIPELINE_STAGE_2_TASK_SHADER_BIT_EXT | VK_PIPELINE_STAGE_2_ALL_TRANSFER_BIT |
646                          VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT | VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT |
647                          VK_PIPELINE_STAGE_2_COMMAND_PREPROCESS_BIT_NV))
648       cmd_buffer->gang.flush_bits |= RADV_CMD_FLAG_CS_PARTIAL_FLUSH;
649 
650    /* Block task shaders when we have to wait for CP DMA on the GFX cmdbuf. */
651    if (src_stage_mask &
652        (VK_PIPELINE_STAGE_2_COPY_BIT | VK_PIPELINE_STAGE_2_CLEAR_BIT | VK_PIPELINE_STAGE_2_ALL_TRANSFER_BIT |
653         VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT | VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT))
654       dst_stage_mask |= cmd_buffer->state.dma_is_busy ? VK_PIPELINE_STAGE_2_TASK_SHADER_BIT_EXT : 0;
655 
656    /* Increment the GFX/ACE semaphore when task shaders are blocked. */
657    if (dst_stage_mask & (VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT | VK_PIPELINE_STAGE_2_DRAW_INDIRECT_BIT |
658                          VK_PIPELINE_STAGE_2_TASK_SHADER_BIT_EXT))
659       cmd_buffer->gang.sem.leader_value++;
660 }
661 
662 void
663 radv_gang_cache_flush(struct radv_cmd_buffer *cmd_buffer)
664 {
665    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
666    const struct radv_physical_device *pdev = radv_device_physical(device);
667    struct radeon_cmdbuf *ace_cs = cmd_buffer->gang.cs;
668    const uint32_t flush_bits = cmd_buffer->gang.flush_bits;
669    enum rgp_flush_bits sqtt_flush_bits = 0;
670 
671    radv_cs_emit_cache_flush(device->ws, ace_cs, pdev->info.gfx_level, NULL, 0, RADV_QUEUE_COMPUTE, flush_bits,
672                             &sqtt_flush_bits, 0);
673 
674    cmd_buffer->gang.flush_bits = 0;
675 }
676 
677 static bool
678 radv_gang_sem_init(struct radv_cmd_buffer *cmd_buffer)
679 {
680    if (cmd_buffer->gang.sem.va)
681       return true;
682 
683    /* DWORD 0: GFX->ACE semaphore (GFX blocks ACE, i.e. ACE waits for GFX)
684     * DWORD 1: ACE->GFX semaphore
685     */
686    uint64_t sem_init = 0;
687    uint32_t va_off = 0;
688    if (!radv_cmd_buffer_upload_data(cmd_buffer, sizeof(uint64_t), &sem_init, &va_off)) {
689       vk_command_buffer_set_error(&cmd_buffer->vk, VK_ERROR_OUT_OF_HOST_MEMORY);
690       return false;
691    }
692 
693    cmd_buffer->gang.sem.va = radv_buffer_get_va(cmd_buffer->upload.upload_bo) + va_off;
694    return true;
695 }
696 
697 static bool
698 radv_gang_leader_sem_dirty(const struct radv_cmd_buffer *cmd_buffer)
699 {
700    return cmd_buffer->gang.sem.leader_value != cmd_buffer->gang.sem.emitted_leader_value;
701 }
702 
703 static bool
704 radv_gang_follower_sem_dirty(const struct radv_cmd_buffer *cmd_buffer)
705 {
706    return cmd_buffer->gang.sem.follower_value != cmd_buffer->gang.sem.emitted_follower_value;
707 }
708 
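/* Write a value to one dword of the gang semaphore via a bottom-of-pipe EOP event. */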
709 ALWAYS_INLINE static bool
710 radv_flush_gang_semaphore(struct radv_cmd_buffer *cmd_buffer, struct radeon_cmdbuf *cs, const enum radv_queue_family qf,
711                           const uint32_t va_off, const uint32_t value)
712 {
713    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
714    const struct radv_physical_device *pdev = radv_device_physical(device);
715 
716    if (!radv_gang_sem_init(cmd_buffer))
717       return false;
718 
719    ASSERTED unsigned cdw_max = radeon_check_space(device->ws, cs, 12);
720 
721    radv_cs_emit_write_event_eop(cs, pdev->info.gfx_level, qf, V_028A90_BOTTOM_OF_PIPE_TS, 0, EOP_DST_SEL_MEM,
722                                 EOP_DATA_SEL_VALUE_32BIT, cmd_buffer->gang.sem.va + va_off, value,
723                                 cmd_buffer->gfx9_eop_bug_va);
724 
725    assert(cmd_buffer->cs->cdw <= cdw_max);
726    return true;
727 }
728 
729 ALWAYS_INLINE static bool
730 radv_flush_gang_leader_semaphore(struct radv_cmd_buffer *cmd_buffer)
731 {
732    if (!radv_gang_leader_sem_dirty(cmd_buffer))
733       return false;
734 
735    /* Gang leader writes a value to the semaphore which the follower can wait for. */
736    cmd_buffer->gang.sem.emitted_leader_value = cmd_buffer->gang.sem.leader_value;
737    return radv_flush_gang_semaphore(cmd_buffer, cmd_buffer->cs, cmd_buffer->qf, 0, cmd_buffer->gang.sem.leader_value);
738 }
739 
740 ALWAYS_INLINE static bool
741 radv_flush_gang_follower_semaphore(struct radv_cmd_buffer *cmd_buffer)
742 {
743    if (!radv_gang_follower_sem_dirty(cmd_buffer))
744       return false;
745 
746    /* Follower writes a value to the semaphore which the gang leader can wait for. */
747    cmd_buffer->gang.sem.emitted_follower_value = cmd_buffer->gang.sem.follower_value;
748    return radv_flush_gang_semaphore(cmd_buffer, cmd_buffer->gang.cs, RADV_QUEUE_COMPUTE, 4,
749                                     cmd_buffer->gang.sem.follower_value);
750 }
751 
752 ALWAYS_INLINE static void
753 radv_wait_gang_semaphore(struct radv_cmd_buffer *cmd_buffer, struct radeon_cmdbuf *cs, const enum radv_queue_family qf,
754                          const uint32_t va_off, const uint32_t value)
755 {
756    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
757 
758    assert(cmd_buffer->gang.sem.va);
759    radeon_check_space(device->ws, cs, 7);
760    radv_cp_wait_mem(cs, qf, WAIT_REG_MEM_GREATER_OR_EQUAL, cmd_buffer->gang.sem.va + va_off, value, 0xffffffff);
761 }
762 
763 ALWAYS_INLINE static void
764 radv_wait_gang_leader(struct radv_cmd_buffer *cmd_buffer)
765 {
766    /* Follower waits for the semaphore which the gang leader wrote. */
767    radv_wait_gang_semaphore(cmd_buffer, cmd_buffer->gang.cs, RADV_QUEUE_COMPUTE, 0, cmd_buffer->gang.sem.leader_value);
768 }
769 
770 ALWAYS_INLINE static void
771 radv_wait_gang_follower(struct radv_cmd_buffer *cmd_buffer)
772 {
773    /* Gang leader waits for the semaphore which the follower wrote. */
774    radv_wait_gang_semaphore(cmd_buffer, cmd_buffer->cs, cmd_buffer->qf, 4, cmd_buffer->gang.sem.follower_value);
775 }
776 
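/* Lazily create the internal compute (ACE) cmdbuf that acts as the gang follower. */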
777 bool
778 radv_gang_init(struct radv_cmd_buffer *cmd_buffer)
779 {
780    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
781 
782    if (cmd_buffer->gang.cs)
783       return true;
784 
785    struct radeon_cmdbuf *ace_cs =
786       device->ws->cs_create(device->ws, AMD_IP_COMPUTE, cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY);
787 
788    if (!ace_cs) {
789       vk_command_buffer_set_error(&cmd_buffer->vk, VK_ERROR_OUT_OF_DEVICE_MEMORY);
790       return false;
791    }
792 
793    cmd_buffer->gang.cs = ace_cs;
794    return true;
795 }
796 
797 static VkResult
798 radv_gang_finalize(struct radv_cmd_buffer *cmd_buffer)
799 {
800    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
801 
802    assert(cmd_buffer->gang.cs);
803    struct radeon_cmdbuf *ace_cs = cmd_buffer->gang.cs;
804 
805    /* Emit pending cache flush. */
806    radv_gang_cache_flush(cmd_buffer);
807 
808    /* Clear the leader<->follower semaphores if they exist.
809     * This is necessary in case the same cmd buffer is submitted again in the future.
810     */
811    if (cmd_buffer->gang.sem.va) {
812       uint64_t leader2follower_va = cmd_buffer->gang.sem.va;
813       uint64_t follower2leader_va = cmd_buffer->gang.sem.va + 4;
814       const uint32_t zero = 0;
815 
816       /* Follower: write 0 to the leader->follower semaphore. */
817       radv_cs_write_data(device, ace_cs, RADV_QUEUE_COMPUTE, V_370_ME, leader2follower_va, 1, &zero, false);
818 
819       /* Leader: write 0 to the follower->leader semaphore. */
820       radv_write_data(cmd_buffer, V_370_ME, follower2leader_va, 1, &zero, false);
821    }
822 
823    return device->ws->cs_finalize(ace_cs);
824 }
825 
826 static void
827 radv_cmd_buffer_after_draw(struct radv_cmd_buffer *cmd_buffer, enum radv_cmd_flush_bits flags, bool dgc)
828 {
829    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
830    const struct radv_physical_device *pdev = radv_device_physical(device);
831    const struct radv_instance *instance = radv_physical_device_instance(pdev);
832 
833    if (unlikely(device->sqtt.bo) && !dgc) {
834       radeon_check_space(device->ws, cmd_buffer->cs, 2);
835 
836       radeon_emit(cmd_buffer->cs, PKT3(PKT3_EVENT_WRITE, 0, cmd_buffer->state.predicating));
837       radeon_emit(cmd_buffer->cs, EVENT_TYPE(V_028A90_THREAD_TRACE_MARKER) | EVENT_INDEX(0));
838    }
839 
840    if (instance->debug_flags & RADV_DEBUG_SYNC_SHADERS) {
841       enum rgp_flush_bits sqtt_flush_bits = 0;
842       assert(flags & (RADV_CMD_FLAG_PS_PARTIAL_FLUSH | RADV_CMD_FLAG_CS_PARTIAL_FLUSH));
843 
844       /* Force wait for graphics or compute engines to be idle. */
845       radv_cs_emit_cache_flush(device->ws, cmd_buffer->cs, pdev->info.gfx_level, &cmd_buffer->gfx9_fence_idx,
846                                cmd_buffer->gfx9_fence_va, cmd_buffer->qf, flags, &sqtt_flush_bits,
847                                cmd_buffer->gfx9_eop_bug_va);
848 
849       if ((flags & RADV_CMD_FLAG_PS_PARTIAL_FLUSH) && radv_cmdbuf_has_stage(cmd_buffer, MESA_SHADER_TASK)) {
850          /* Force wait for compute engines to be idle on the internal cmdbuf. */
851          radv_cs_emit_cache_flush(device->ws, cmd_buffer->gang.cs, pdev->info.gfx_level, NULL, 0, RADV_QUEUE_COMPUTE,
852                                   RADV_CMD_FLAG_CS_PARTIAL_FLUSH, &sqtt_flush_bits, 0);
853       }
854    }
855 
856    if (radv_device_fault_detection_enabled(device))
857       radv_cmd_buffer_trace_emit(cmd_buffer);
858 }
859 
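/* Store the bound pipeline pointer in the per-ring slot of the trace BO. */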
860 static void
861 radv_save_pipeline(struct radv_cmd_buffer *cmd_buffer, struct radv_pipeline *pipeline)
862 {
863    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
864    const struct radv_physical_device *pdev = radv_device_physical(device);
865    enum amd_ip_type ring;
866    uint32_t data[2];
867    uint64_t va;
868 
869    va = radv_buffer_get_va(device->trace_bo);
870 
871    ring = radv_queue_family_to_ring(pdev, cmd_buffer->qf);
872 
873    switch (ring) {
874    case AMD_IP_GFX:
875       va += offsetof(struct radv_trace_data, gfx_ring_pipeline);
876       break;
877    case AMD_IP_COMPUTE:
878       va += offsetof(struct radv_trace_data, comp_ring_pipeline);
879       break;
880    default:
881       assert(!"invalid IP type");
882    }
883 
884    uint64_t pipeline_address = (uintptr_t)pipeline;
885    data[0] = pipeline_address;
886    data[1] = pipeline_address >> 32;
887 
888    radv_write_data(cmd_buffer, V_370_ME, va, 2, data, false);
889 }
890 
891 static void
892 radv_save_vertex_descriptors(struct radv_cmd_buffer *cmd_buffer, uint64_t vb_ptr)
893 {
894    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
895    uint32_t data[2];
896    uint64_t va;
897 
898    va = radv_buffer_get_va(device->trace_bo) + offsetof(struct radv_trace_data, vertex_descriptors);
899 
900    data[0] = vb_ptr;
901    data[1] = vb_ptr >> 32;
902 
903    radv_write_data(cmd_buffer, V_370_ME, va, 2, data, false);
904 }
905 
906 static void
907 radv_save_vs_prolog(struct radv_cmd_buffer *cmd_buffer, const struct radv_shader_part *prolog)
908 {
909    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
910    uint32_t data[2];
911    uint64_t va;
912 
913    va = radv_buffer_get_va(device->trace_bo) + offsetof(struct radv_trace_data, vertex_prolog);
914 
915    uint64_t prolog_address = (uintptr_t)prolog;
916    data[0] = prolog_address;
917    data[1] = prolog_address >> 32;
918 
919    radv_write_data(cmd_buffer, V_370_ME, va, 2, data, false);
920 }
921 
922 void
923 radv_set_descriptor_set(struct radv_cmd_buffer *cmd_buffer, VkPipelineBindPoint bind_point,
924                         struct radv_descriptor_set *set, unsigned idx)
925 {
926    struct radv_descriptor_state *descriptors_state = radv_get_descriptors_state(cmd_buffer, bind_point);
927 
928    descriptors_state->sets[idx] = set;
929 
930    descriptors_state->valid |= (1u << idx); /* active descriptors */
931    descriptors_state->dirty |= (1u << idx);
932 }
933 
934 static void
935 radv_save_descriptors(struct radv_cmd_buffer *cmd_buffer, VkPipelineBindPoint bind_point)
936 {
937    struct radv_descriptor_state *descriptors_state = radv_get_descriptors_state(cmd_buffer, bind_point);
938    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
939    uint32_t data[MAX_SETS * 2] = {0};
940    uint64_t va;
941    va = radv_buffer_get_va(device->trace_bo) + offsetof(struct radv_trace_data, descriptor_sets);
942 
943    u_foreach_bit (i, descriptors_state->valid) {
944       struct radv_descriptor_set *set = descriptors_state->sets[i];
945       data[i * 2] = (uint64_t)(uintptr_t)set;
946       data[i * 2 + 1] = (uint64_t)(uintptr_t)set >> 32;
947    }
948 
949    radv_write_data(cmd_buffer, V_370_ME, va, MAX_SETS * 2, data, false);
950 }
951 
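/* Emit a user SGPR pointer for the given user-data slot if the shader actually uses it. */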
952 static void
953 radv_emit_userdata_address(const struct radv_device *device, struct radeon_cmdbuf *cs, const struct radv_shader *shader,
954                            int idx, uint64_t va)
955 {
956    const uint32_t offset = radv_get_user_sgpr_loc(shader, idx);
957 
958    if (!offset)
959       return;
960 
961    radv_emit_shader_pointer(device, cs, offset, va, false);
962 }
963 
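/* Return the GPU address of the bound descriptor set, or the bound descriptor buffer
 * address when no set is present. */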
964 uint64_t
965 radv_descriptor_get_va(const struct radv_descriptor_state *descriptors_state, unsigned set_idx)
966 {
967    struct radv_descriptor_set *set = descriptors_state->sets[set_idx];
968    uint64_t va;
969 
970    if (set) {
971       va = set->header.va;
972    } else {
973       va = descriptors_state->descriptor_buffers[set_idx];
974    }
975 
976    return va;
977 }
978 
979 static void
980 radv_emit_descriptors_per_stage(const struct radv_device *device, struct radeon_cmdbuf *cs,
981                                 const struct radv_shader *shader, const struct radv_descriptor_state *descriptors_state)
982 {
983    const uint32_t indirect_descriptor_sets_offset = radv_get_user_sgpr_loc(shader, AC_UD_INDIRECT_DESCRIPTOR_SETS);
984 
985    if (indirect_descriptor_sets_offset) {
986       radv_emit_shader_pointer(device, cs, indirect_descriptor_sets_offset,
987                                descriptors_state->indirect_descriptor_sets_va, false);
988    } else {
989       const struct radv_userdata_locations *locs = &shader->info.user_sgprs_locs;
990       const uint32_t sh_base = shader->info.user_data_0;
991       unsigned mask = locs->descriptor_sets_enabled;
992 
993       mask &= descriptors_state->dirty & descriptors_state->valid;
994 
995       while (mask) {
996          int start, count;
997 
998          u_bit_scan_consecutive_range(&mask, &start, &count);
999 
1000          const struct radv_userdata_info *loc = &locs->descriptor_sets[start];
1001          const unsigned sh_offset = sh_base + loc->sgpr_idx * 4;
1002 
1003          radv_emit_shader_pointer_head(cs, sh_offset, count, true);
1004          for (int i = 0; i < count; i++) {
1005             uint64_t va = radv_descriptor_get_va(descriptors_state, start + i);
1006 
1007             radv_emit_shader_pointer_body(device, cs, va, true);
1008          }
1009       }
1010    }
1011 }
1012 
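/* Primitive type seen by the rasterizer: taken from the precomputed rast_prim when
 * TES/GS/MS is active, otherwise derived from the dynamic topology. */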
1013 static unsigned
1014 radv_get_rasterization_prim(const struct radv_cmd_buffer *cmd_buffer)
1015 {
1016    const struct radv_shader *last_vgt_shader = cmd_buffer->state.last_vgt_shader;
1017    const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
1018 
1019    if (cmd_buffer->state.active_stages &
1020        (VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT | VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT |
1021         VK_SHADER_STAGE_GEOMETRY_BIT | VK_SHADER_STAGE_MESH_BIT_EXT)) {
1022       /* Ignore dynamic primitive topology for TES/GS/MS stages. */
1023       return cmd_buffer->state.rast_prim;
1024    }
1025 
1026    return radv_conv_prim_to_gs_out(d->vk.ia.primitive_topology, last_vgt_shader->info.is_ngg);
1027 }
1028 
1029 static ALWAYS_INLINE VkLineRasterizationModeEXT
1030 radv_get_line_mode(const struct radv_cmd_buffer *cmd_buffer)
1031 {
1032    const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
1033 
1034    const unsigned rast_prim = radv_get_rasterization_prim(cmd_buffer);
1035 
1036    bool draw_lines = radv_rast_prim_is_line(rast_prim) || radv_polygon_mode_is_line(d->vk.rs.polygon_mode);
1037    draw_lines &= !radv_rast_prim_is_point(rast_prim);
1038    draw_lines &= !radv_polygon_mode_is_point(d->vk.rs.polygon_mode);
1039    if (draw_lines)
1040       return d->vk.rs.line.mode;
1041 
1042    return VK_LINE_RASTERIZATION_MODE_DEFAULT_EXT;
1043 }
1044 
1045 static ALWAYS_INLINE unsigned
1046 radv_get_rasterization_samples(struct radv_cmd_buffer *cmd_buffer)
1047 {
1048    const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
1049 
1050    VkLineRasterizationModeEXT line_mode = radv_get_line_mode(cmd_buffer);
1051 
1052    if (line_mode == VK_LINE_RASTERIZATION_MODE_BRESENHAM_KHR) {
1053       /* From the Vulkan spec 1.3.221:
1054        *
1055        * "When Bresenham lines are being rasterized, sample locations may all be treated as being at
1056        * the pixel center (this may affect attribute and depth interpolation)."
1057        *
1058        * "One consequence of this is that Bresenham lines cover the same pixels regardless of the
1059        * number of rasterization samples, and cover all samples in those pixels (unless masked out
1060        * or killed)."
1061        */
1062       return 1;
1063    }
1064 
1065    if (line_mode == VK_LINE_RASTERIZATION_MODE_RECTANGULAR_SMOOTH_KHR) {
1066       return RADV_NUM_SMOOTH_AA_SAMPLES;
1067    }
1068 
1069    return MAX2(1, d->vk.ms.rasterization_samples);
1070 }
1071 
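/* PS invocations per pixel when sample shading is enabled:
 * ceil(min_sample_shading * color_samples), rounded up to a power of two. */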
1072 static ALWAYS_INLINE unsigned
1073 radv_get_ps_iter_samples(struct radv_cmd_buffer *cmd_buffer)
1074 {
1075    const struct radv_rendering_state *render = &cmd_buffer->state.render;
1076    unsigned ps_iter_samples = 1;
1077 
1078    if (cmd_buffer->state.ms.sample_shading_enable) {
1079       unsigned rasterization_samples = radv_get_rasterization_samples(cmd_buffer);
1080       unsigned color_samples = MAX2(render->color_samples, rasterization_samples);
1081 
1082       ps_iter_samples = ceilf(cmd_buffer->state.ms.min_sample_shading * color_samples);
1083       ps_iter_samples = util_next_power_of_two(ps_iter_samples);
1084    }
1085 
1086    return ps_iter_samples;
1087 }
1088 
1089 /**
1090  * Convert the user sample locations to hardware sample locations (the values
1091  * that will be emitted by PA_SC_AA_SAMPLE_LOCS_PIXEL_*).
1092  */
1093 static void
1094 radv_convert_user_sample_locs(const struct radv_sample_locations_state *state, uint32_t x, uint32_t y,
1095                               VkOffset2D *sample_locs)
1096 {
1097    uint32_t x_offset = x % state->grid_size.width;
1098    uint32_t y_offset = y % state->grid_size.height;
1099    uint32_t num_samples = (uint32_t)state->per_pixel;
1100    uint32_t pixel_offset;
1101 
1102    pixel_offset = (x_offset + y_offset * state->grid_size.width) * num_samples;
1103 
1104    assert(pixel_offset <= MAX_SAMPLE_LOCATIONS);
1105    const VkSampleLocationEXT *user_locs = &state->locations[pixel_offset];
1106 
1107    for (uint32_t i = 0; i < num_samples; i++) {
1108       float shifted_pos_x = user_locs[i].x - 0.5;
1109       float shifted_pos_y = user_locs[i].y - 0.5;
1110 
1111       int32_t scaled_pos_x = floorf(shifted_pos_x * 16);
1112       int32_t scaled_pos_y = floorf(shifted_pos_y * 16);
1113 
1114       sample_locs[i].x = CLAMP(scaled_pos_x, -8, 7);
1115       sample_locs[i].y = CLAMP(scaled_pos_y, -8, 7);
1116    }
1117 }
1118 
1119 /**
1120  * Compute the PA_SC_AA_SAMPLE_LOCS_PIXEL_* mask based on hardware sample
1121  * locations.
1122  */
1123 static void
1124 radv_compute_sample_locs_pixel(uint32_t num_samples, VkOffset2D *sample_locs, uint32_t *sample_locs_pixel)
1125 {
1126    for (uint32_t i = 0; i < num_samples; i++) {
1127       uint32_t sample_reg_idx = i / 4;
1128       uint32_t sample_loc_idx = i % 4;
1129       int32_t pos_x = sample_locs[i].x;
1130       int32_t pos_y = sample_locs[i].y;
1131 
1132       uint32_t shift_x = 8 * sample_loc_idx;
1133       uint32_t shift_y = shift_x + 4;
1134 
1135       sample_locs_pixel[sample_reg_idx] |= (pos_x & 0xf) << shift_x;
1136       sample_locs_pixel[sample_reg_idx] |= (pos_y & 0xf) << shift_y;
1137    }
1138 }
1139 
1140 /**
1141  * Compute the PA_SC_CENTROID_PRIORITY_* mask based on the top left hardware
1142  * sample locations.
1143  */
1144 static uint64_t
1145 radv_compute_centroid_priority(struct radv_cmd_buffer *cmd_buffer, VkOffset2D *sample_locs, uint32_t num_samples)
1146 {
1147    uint32_t *centroid_priorities = alloca(num_samples * sizeof(*centroid_priorities));
1148    uint32_t sample_mask = num_samples - 1;
1149    uint32_t *distances = alloca(num_samples * sizeof(*distances));
1150    uint64_t centroid_priority = 0;
1151 
1152    /* Compute the distances from center for each sample. */
1153    for (int i = 0; i < num_samples; i++) {
1154       distances[i] = (sample_locs[i].x * sample_locs[i].x) + (sample_locs[i].y * sample_locs[i].y);
1155    }
1156 
1157    /* Compute the centroid priorities by looking at the distances array. */
1158    for (int i = 0; i < num_samples; i++) {
1159       uint32_t min_idx = 0;
1160 
1161       for (int j = 1; j < num_samples; j++) {
1162          if (distances[j] < distances[min_idx])
1163             min_idx = j;
1164       }
1165 
1166       centroid_priorities[i] = min_idx;
1167       distances[min_idx] = 0xffffffff;
1168    }
1169 
1170    /* Compute the final centroid priority. */
1171    for (int i = 0; i < 8; i++) {
1172       centroid_priority |= centroid_priorities[i & sample_mask] << (i * 4);
1173    }
1174 
1175    return centroid_priority << 32 | centroid_priority;
1176 }
1177 
1178 /**
1179  * Emit the sample locations that are specified with VK_EXT_sample_locations.
1180  */
1181 static void
1182 radv_emit_sample_locations(struct radv_cmd_buffer *cmd_buffer)
1183 {
1184    const struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
1185    const struct radv_physical_device *pdev = radv_device_physical(device);
1186    const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
1187    uint32_t num_samples = (uint32_t)d->sample_location.per_pixel;
1188    struct radeon_cmdbuf *cs = cmd_buffer->cs;
1189    uint32_t sample_locs_pixel[4][2] = {0};
1190    VkOffset2D sample_locs[4][8]; /* 8 is the max. sample count supported */
1191    uint64_t centroid_priority;
1192 
1193    if (!d->sample_location.count || !d->vk.ms.sample_locations_enable)
1194       return;
1195 
1196    /* Convert the user sample locations to hardware sample locations. */
1197    radv_convert_user_sample_locs(&d->sample_location, 0, 0, sample_locs[0]);
1198    radv_convert_user_sample_locs(&d->sample_location, 1, 0, sample_locs[1]);
1199    radv_convert_user_sample_locs(&d->sample_location, 0, 1, sample_locs[2]);
1200    radv_convert_user_sample_locs(&d->sample_location, 1, 1, sample_locs[3]);
1201 
1202    /* Compute the PA_SC_AA_SAMPLE_LOCS_PIXEL_* mask. */
1203    for (uint32_t i = 0; i < 4; i++) {
1204       radv_compute_sample_locs_pixel(num_samples, sample_locs[i], sample_locs_pixel[i]);
1205    }
1206 
1207    /* Compute the PA_SC_CENTROID_PRIORITY_* mask. */
1208    centroid_priority = radv_compute_centroid_priority(cmd_buffer, sample_locs[0], num_samples);
1209 
1210    /* Emit the specified user sample locations. */
1211    switch (num_samples) {
1212    case 2:
1213    case 4:
1214       radeon_set_context_reg(cs, R_028BF8_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0, sample_locs_pixel[0][0]);
1215       radeon_set_context_reg(cs, R_028C08_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_0, sample_locs_pixel[1][0]);
1216       radeon_set_context_reg(cs, R_028C18_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_0, sample_locs_pixel[2][0]);
1217       radeon_set_context_reg(cs, R_028C28_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_0, sample_locs_pixel[3][0]);
1218       break;
1219    case 8:
1220       radeon_set_context_reg(cs, R_028BF8_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0, sample_locs_pixel[0][0]);
1221       radeon_set_context_reg(cs, R_028C08_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_0, sample_locs_pixel[1][0]);
1222       radeon_set_context_reg(cs, R_028C18_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_0, sample_locs_pixel[2][0]);
1223       radeon_set_context_reg(cs, R_028C28_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_0, sample_locs_pixel[3][0]);
1224       radeon_set_context_reg(cs, R_028BFC_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_1, sample_locs_pixel[0][1]);
1225       radeon_set_context_reg(cs, R_028C0C_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_1, sample_locs_pixel[1][1]);
1226       radeon_set_context_reg(cs, R_028C1C_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_1, sample_locs_pixel[2][1]);
1227       radeon_set_context_reg(cs, R_028C2C_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_1, sample_locs_pixel[3][1]);
1228       break;
1229    default:
1230       unreachable("invalid number of samples");
1231    }
1232 
1233    if (pdev->info.gfx_level >= GFX12) {
1234       radeon_set_context_reg_seq(cs, R_028BF0_PA_SC_CENTROID_PRIORITY_0, 2);
1235    } else {
1236       radeon_set_context_reg_seq(cs, R_028BD4_PA_SC_CENTROID_PRIORITY_0, 2);
1237    }
1238    radeon_emit(cs, centroid_priority);
1239    radeon_emit(cs, centroid_priority >> 32);
1240 }
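/* Note: sample_locs[0..3] hold the per-pixel locations for the four pixels of
 * the 2x2 sample-location grid, i.e. (0,0), (1,0), (0,1) and (1,1).  2x/4x MSAA
 * fits in the *_0 registers, while 8x MSAA also needs the *_1 registers.
 */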
1241 
1242 static void
1243 radv_emit_inline_push_consts(const struct radv_device *device, struct radeon_cmdbuf *cs,
1244                              const struct radv_shader *shader, int idx, const uint32_t *values)
1245 {
1246    const struct radv_userdata_info *loc = &shader->info.user_sgprs_locs.shader_data[idx];
1247    const uint32_t base_reg = shader->info.user_data_0;
1248 
1249    if (loc->sgpr_idx == -1)
1250       return;
1251 
1252    radeon_check_space(device->ws, cs, 2 + loc->num_sgprs);
1253 
1254    radeon_set_sh_reg_seq(cs, base_reg + loc->sgpr_idx * 4, loc->num_sgprs);
1255    radeon_emit_array(cs, values, loc->num_sgprs);
1256 }
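/* Example (assuming a hardware VS stage whose inline push constants start at
 * user SGPR 2): user_data_0 would be R_00B130_SPI_SHADER_USER_DATA_VS_0, so the
 * values land at R_00B130 + 2 * 4, i.e. SPI_SHADER_USER_DATA_VS_2, one dword
 * per SGPR.
 */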
1257 
1258 struct radv_bin_size_entry {
1259    unsigned bpp;
1260    VkExtent2D extent;
1261 };
1262 
1263 static VkExtent2D
1264 radv_gfx10_compute_bin_size(struct radv_cmd_buffer *cmd_buffer)
1265 {
1266    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
1267    const struct radv_physical_device *pdev = radv_device_physical(device);
1268    const struct radv_rendering_state *render = &cmd_buffer->state.render;
1269    const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
1270    VkExtent2D extent = {512, 512};
1271 
1272    const unsigned db_tag_size = 64;
1273    const unsigned db_tag_count = 312;
1274    const unsigned color_tag_size = 1024;
1275    const unsigned color_tag_count = 31;
1276    const unsigned fmask_tag_size = 256;
1277    const unsigned fmask_tag_count = 44;
1278 
1279    const unsigned rb_count = pdev->info.max_render_backends;
1280    const unsigned pipe_count = MAX2(rb_count, pdev->info.num_tcc_blocks);
1281 
1282    const unsigned db_tag_part = (db_tag_count * rb_count / pipe_count) * db_tag_size * pipe_count;
1283    const unsigned color_tag_part = (color_tag_count * rb_count / pipe_count) * color_tag_size * pipe_count;
1284    const unsigned fmask_tag_part = (fmask_tag_count * rb_count / pipe_count) * fmask_tag_size * pipe_count;
1285 
1286    const unsigned total_samples = radv_get_rasterization_samples(cmd_buffer);
1287    const unsigned samples_log = util_logbase2_ceil(total_samples);
1288 
1289    unsigned color_bytes_per_pixel = 0;
1290    unsigned fmask_bytes_per_pixel = 0;
1291 
1292    for (unsigned i = 0; i < render->color_att_count; ++i) {
1293       struct radv_image_view *iview = render->color_att[i].iview;
1294 
1295       if (!iview)
1296          continue;
1297 
1298       if (!d->vk.cb.attachments[i].write_mask)
1299          continue;
1300 
1301       color_bytes_per_pixel += vk_format_get_blocksize(render->color_att[i].format);
1302 
1303       if (total_samples > 1) {
1304          assert(samples_log <= 3);
1305          const unsigned fmask_array[] = {0, 1, 1, 4};
1306          fmask_bytes_per_pixel += fmask_array[samples_log];
1307       }
1308    }
1309 
1310    color_bytes_per_pixel *= total_samples;
1311    color_bytes_per_pixel = MAX2(color_bytes_per_pixel, 1);
1312 
1313    const unsigned color_pixel_count_log = util_logbase2(color_tag_part / color_bytes_per_pixel);
1314    extent.width = 1ull << ((color_pixel_count_log + 1) / 2);
1315    extent.height = 1ull << (color_pixel_count_log / 2);
1316 
1317    if (fmask_bytes_per_pixel) {
1318       const unsigned fmask_pixel_count_log = util_logbase2(fmask_tag_part / fmask_bytes_per_pixel);
1319 
1320       const VkExtent2D fmask_extent = (VkExtent2D){.width = 1ull << ((fmask_pixel_count_log + 1) / 2),
1321                                                    .height = 1ull << (color_pixel_count_log / 2)};
1322 
1323       if (fmask_extent.width * fmask_extent.height < extent.width * extent.height)
1324          extent = fmask_extent;
1325    }
1326 
1327    if (render->ds_att.iview) {
1328       /* Coefficients taken from AMDVLK */
1329       unsigned depth_coeff = vk_format_has_depth(render->ds_att.format) ? 5 : 0;
1330       unsigned stencil_coeff = vk_format_has_stencil(render->ds_att.format) ? 1 : 0;
1331       unsigned db_bytes_per_pixel = (depth_coeff + stencil_coeff) * total_samples;
1332 
1333       const unsigned db_pixel_count_log = util_logbase2(db_tag_part / db_bytes_per_pixel);
1334 
1335       const VkExtent2D db_extent =
1336          (VkExtent2D){.width = 1ull << ((db_pixel_count_log + 1) / 2), .height = 1ull << (color_pixel_count_log / 2)};
1337 
1338       if (db_extent.width * db_extent.height < extent.width * extent.height)
1339          extent = db_extent;
1340    }
1341 
1342    extent.width = MAX2(extent.width, 128);
1343    extent.height = MAX2(extent.width, pdev->info.gfx_level >= GFX12 ? 128 : 64);
1344 
1345    return extent;
1346 }
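/* Worked example (hypothetical GPU with 16 RBs and 16 TCC blocks, one RGBA8
 * attachment, single sampled): pipe_count = 16, color_tag_part =
 * (31 * 16 / 16) * 1024 * 16 = 507904 bytes and color_bytes_per_pixel = 4, so
 * color_pixel_count_log = log2(126976) = 16 and the color-limited bin is
 * (1 << 8) x (1 << 8) = 256x256 before the final minimum-size clamps.
 */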
1347 
1348 static VkExtent2D
1349 radv_gfx9_compute_bin_size(struct radv_cmd_buffer *cmd_buffer)
1350 {
1351    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
1352    const struct radv_physical_device *pdev = radv_device_physical(device);
1353    const struct radv_rendering_state *render = &cmd_buffer->state.render;
1354    const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
1355    static const struct radv_bin_size_entry color_size_table[][3][9] = {
1356       {
1357          /* One RB / SE */
1358          {
1359             /* One shader engine */
1360             {0, {128, 128}},
1361             {1, {64, 128}},
1362             {2, {32, 128}},
1363             {3, {16, 128}},
1364             {17, {0, 0}},
1365             {UINT_MAX, {0, 0}},
1366          },
1367          {
1368             /* Two shader engines */
1369             {0, {128, 128}},
1370             {2, {64, 128}},
1371             {3, {32, 128}},
1372             {5, {16, 128}},
1373             {17, {0, 0}},
1374             {UINT_MAX, {0, 0}},
1375          },
1376          {
1377             /* Four shader engines */
1378             {0, {128, 128}},
1379             {3, {64, 128}},
1380             {5, {16, 128}},
1381             {17, {0, 0}},
1382             {UINT_MAX, {0, 0}},
1383          },
1384       },
1385       {
1386          /* Two RB / SE */
1387          {
1388             /* One shader engine */
1389             {0, {128, 128}},
1390             {2, {64, 128}},
1391             {3, {32, 128}},
1392             {5, {16, 128}},
1393             {33, {0, 0}},
1394             {UINT_MAX, {0, 0}},
1395          },
1396          {
1397             /* Two shader engines */
1398             {0, {128, 128}},
1399             {3, {64, 128}},
1400             {5, {32, 128}},
1401             {9, {16, 128}},
1402             {33, {0, 0}},
1403             {UINT_MAX, {0, 0}},
1404          },
1405          {
1406             /* Four shader engines */
1407             {0, {256, 256}},
1408             {2, {128, 256}},
1409             {3, {128, 128}},
1410             {5, {64, 128}},
1411             {9, {16, 128}},
1412             {33, {0, 0}},
1413             {UINT_MAX, {0, 0}},
1414          },
1415       },
1416       {
1417          /* Four RB / SE */
1418          {
1419             /* One shader engine */
1420             {0, {128, 256}},
1421             {2, {128, 128}},
1422             {3, {64, 128}},
1423             {5, {32, 128}},
1424             {9, {16, 128}},
1425             {33, {0, 0}},
1426             {UINT_MAX, {0, 0}},
1427          },
1428          {
1429             /* Two shader engines */
1430             {0, {256, 256}},
1431             {2, {128, 256}},
1432             {3, {128, 128}},
1433             {5, {64, 128}},
1434             {9, {32, 128}},
1435             {17, {16, 128}},
1436             {33, {0, 0}},
1437             {UINT_MAX, {0, 0}},
1438          },
1439          {
1440             /* Four shader engines */
1441             {0, {256, 512}},
1442             {2, {256, 256}},
1443             {3, {128, 256}},
1444             {5, {128, 128}},
1445             {9, {64, 128}},
1446             {17, {16, 128}},
1447             {33, {0, 0}},
1448             {UINT_MAX, {0, 0}},
1449          },
1450       },
1451    };
1452    static const struct radv_bin_size_entry ds_size_table[][3][9] = {
1453       {
1454          // One RB / SE
1455          {
1456             // One shader engine
1457             {0, {128, 256}},
1458             {2, {128, 128}},
1459             {4, {64, 128}},
1460             {7, {32, 128}},
1461             {13, {16, 128}},
1462             {49, {0, 0}},
1463             {UINT_MAX, {0, 0}},
1464          },
1465          {
1466             // Two shader engines
1467             {0, {256, 256}},
1468             {2, {128, 256}},
1469             {4, {128, 128}},
1470             {7, {64, 128}},
1471             {13, {32, 128}},
1472             {25, {16, 128}},
1473             {49, {0, 0}},
1474             {UINT_MAX, {0, 0}},
1475          },
1476          {
1477             // Four shader engines
1478             {0, {256, 512}},
1479             {2, {256, 256}},
1480             {4, {128, 256}},
1481             {7, {128, 128}},
1482             {13, {64, 128}},
1483             {25, {16, 128}},
1484             {49, {0, 0}},
1485             {UINT_MAX, {0, 0}},
1486          },
1487       },
1488       {
1489          // Two RB / SE
1490          {
1491             // One shader engine
1492             {0, {256, 256}},
1493             {2, {128, 256}},
1494             {4, {128, 128}},
1495             {7, {64, 128}},
1496             {13, {32, 128}},
1497             {25, {16, 128}},
1498             {97, {0, 0}},
1499             {UINT_MAX, {0, 0}},
1500          },
1501          {
1502             // Two shader engines
1503             {0, {256, 512}},
1504             {2, {256, 256}},
1505             {4, {128, 256}},
1506             {7, {128, 128}},
1507             {13, {64, 128}},
1508             {25, {32, 128}},
1509             {49, {16, 128}},
1510             {97, {0, 0}},
1511             {UINT_MAX, {0, 0}},
1512          },
1513          {
1514             // Four shader engines
1515             {0, {512, 512}},
1516             {2, {256, 512}},
1517             {4, {256, 256}},
1518             {7, {128, 256}},
1519             {13, {128, 128}},
1520             {25, {64, 128}},
1521             {49, {16, 128}},
1522             {97, {0, 0}},
1523             {UINT_MAX, {0, 0}},
1524          },
1525       },
1526       {
1527          // Four RB / SE
1528          {
1529             // One shader engine
1530             {0, {256, 512}},
1531             {2, {256, 256}},
1532             {4, {128, 256}},
1533             {7, {128, 128}},
1534             {13, {64, 128}},
1535             {25, {32, 128}},
1536             {49, {16, 128}},
1537             {UINT_MAX, {0, 0}},
1538          },
1539          {
1540             // Two shader engines
1541             {0, {512, 512}},
1542             {2, {256, 512}},
1543             {4, {256, 256}},
1544             {7, {128, 256}},
1545             {13, {128, 128}},
1546             {25, {64, 128}},
1547             {49, {32, 128}},
1548             {97, {16, 128}},
1549             {UINT_MAX, {0, 0}},
1550          },
1551          {
1552             // Four shader engines
1553             {0, {512, 512}},
1554             {4, {256, 512}},
1555             {7, {256, 256}},
1556             {13, {128, 256}},
1557             {25, {128, 128}},
1558             {49, {64, 128}},
1559             {97, {16, 128}},
1560             {UINT_MAX, {0, 0}},
1561          },
1562       },
1563    };
1564 
1565    VkExtent2D extent = {512, 512};
1566 
1567    unsigned log_num_rb_per_se = util_logbase2_ceil(pdev->info.max_render_backends / pdev->info.max_se);
1568    unsigned log_num_se = util_logbase2_ceil(pdev->info.max_se);
1569 
1570    unsigned total_samples = radv_get_rasterization_samples(cmd_buffer);
1571    unsigned ps_iter_samples = radv_get_ps_iter_samples(cmd_buffer);
1572    unsigned effective_samples = total_samples;
1573    unsigned color_bytes_per_pixel = 0;
1574 
1575    for (unsigned i = 0; i < render->color_att_count; ++i) {
1576       struct radv_image_view *iview = render->color_att[i].iview;
1577 
1578       if (!iview)
1579          continue;
1580 
1581       if (!d->vk.cb.attachments[i].write_mask)
1582          continue;
1583 
1584       color_bytes_per_pixel += vk_format_get_blocksize(render->color_att[i].format);
1585    }
1586 
1587    /* MSAA images typically don't use all samples all the time. */
1588    if (effective_samples >= 2 && ps_iter_samples <= 1)
1589       effective_samples = 2;
1590    color_bytes_per_pixel *= effective_samples;
1591 
1592    const struct radv_bin_size_entry *color_entry = color_size_table[log_num_rb_per_se][log_num_se];
1593    while (color_entry[1].bpp <= color_bytes_per_pixel)
1594       ++color_entry;
1595 
1596    extent = color_entry->extent;
1597 
1598    if (render->ds_att.iview) {
1599       /* Coefficients taken from AMDVLK */
1600       unsigned depth_coeff = vk_format_has_depth(render->ds_att.format) ? 5 : 0;
1601       unsigned stencil_coeff = vk_format_has_stencil(render->ds_att.format) ? 1 : 0;
1602       unsigned ds_bytes_per_pixel = 4 * (depth_coeff + stencil_coeff) * total_samples;
1603 
1604       const struct radv_bin_size_entry *ds_entry = ds_size_table[log_num_rb_per_se][log_num_se];
1605       while (ds_entry[1].bpp <= ds_bytes_per_pixel)
1606          ++ds_entry;
1607 
1608       if (ds_entry->extent.width * ds_entry->extent.height < extent.width * extent.height)
1609          extent = ds_entry->extent;
1610    }
1611 
1612    return extent;
1613 }
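/* Table lookup example (hypothetical GFX9 part with 4 RBs per SE and 4 SEs, one
 * RGBA16F attachment, 4x MSAA with ps_iter_samples == 1): effective_samples is
 * clamped to 2, so color_bytes_per_pixel = 8 * 2 = 16.  In the "Four RB / SE" /
 * "Four shader engines" color table the walk stops at the {9, {64, 128}} entry,
 * because the next threshold (17) exceeds 16, giving a 64x128 bin before the
 * depth/stencil table is considered.
 */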
1614 
1615 static unsigned
1616 radv_get_disabled_binning_state(struct radv_cmd_buffer *cmd_buffer)
1617 {
1618    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
1619    const struct radv_physical_device *pdev = radv_device_physical(device);
1620    const struct radv_rendering_state *render = &cmd_buffer->state.render;
1621    const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
1622    uint32_t pa_sc_binner_cntl_0;
1623 
1624    if (pdev->info.gfx_level >= GFX12) {
1625       const uint32_t bin_size_x = 128, bin_size_y = 128;
1626 
1627       pa_sc_binner_cntl_0 =
1628          S_028C44_BINNING_MODE(V_028C44_BINNING_DISABLED) | S_028C44_BIN_SIZE_X_EXTEND(util_logbase2(bin_size_x) - 5) |
1629          S_028C44_BIN_SIZE_Y_EXTEND(util_logbase2(bin_size_y) - 5) | S_028C44_DISABLE_START_OF_PRIM(1) |
1630          S_028C44_FPOVS_PER_BATCH(63) | S_028C44_OPTIMAL_BIN_SELECTION(1) | S_028C44_FLUSH_ON_BINNING_TRANSITION(1);
1631    } else if (pdev->info.gfx_level >= GFX10) {
1632       const unsigned binning_disabled =
1633          pdev->info.gfx_level >= GFX11_5 ? V_028C44_BINNING_DISABLED : V_028C44_DISABLE_BINNING_USE_NEW_SC;
1634       unsigned min_bytes_per_pixel = 0;
1635 
1636       for (unsigned i = 0; i < render->color_att_count; ++i) {
1637          struct radv_image_view *iview = render->color_att[i].iview;
1638 
1639          if (!iview)
1640             continue;
1641 
1642          if (!d->vk.cb.attachments[i].write_mask)
1643             continue;
1644 
1645          unsigned bytes = vk_format_get_blocksize(render->color_att[i].format);
1646          if (!min_bytes_per_pixel || bytes < min_bytes_per_pixel)
1647             min_bytes_per_pixel = bytes;
1648       }
1649 
1650       pa_sc_binner_cntl_0 = S_028C44_BINNING_MODE(binning_disabled) | S_028C44_BIN_SIZE_X(0) | S_028C44_BIN_SIZE_Y(0) |
1651                             S_028C44_BIN_SIZE_X_EXTEND(2) |                                /* 128 */
1652                             S_028C44_BIN_SIZE_Y_EXTEND(min_bytes_per_pixel <= 4 ? 2 : 1) | /* 128 or 64 */
1653                             S_028C44_DISABLE_START_OF_PRIM(1) | S_028C44_FLUSH_ON_BINNING_TRANSITION(1);
1654    } else {
1655       pa_sc_binner_cntl_0 =
1656          S_028C44_BINNING_MODE(V_028C44_DISABLE_BINNING_USE_LEGACY_SC) | S_028C44_DISABLE_START_OF_PRIM(1) |
1657          S_028C44_FLUSH_ON_BINNING_TRANSITION(pdev->info.family == CHIP_VEGA12 || pdev->info.family == CHIP_VEGA20 ||
1658                                               pdev->info.family >= CHIP_RAVEN2);
1659    }
1660 
1661    return pa_sc_binner_cntl_0;
1662 }
1663 
1664 static unsigned
1665 radv_get_binning_state(struct radv_cmd_buffer *cmd_buffer)
1666 {
1667    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
1668    const struct radv_physical_device *pdev = radv_device_physical(device);
1669    unsigned pa_sc_binner_cntl_0;
1670    VkExtent2D bin_size;
1671 
1672    if (pdev->info.gfx_level >= GFX10) {
1673       bin_size = radv_gfx10_compute_bin_size(cmd_buffer);
1674    } else {
1675       assert(pdev->info.gfx_level == GFX9);
1676       bin_size = radv_gfx9_compute_bin_size(cmd_buffer);
1677    }
1678 
1679    if (device->pbb_allowed && bin_size.width && bin_size.height) {
1680       const struct radv_binning_settings *settings = &pdev->binning_settings;
1681 
1682       pa_sc_binner_cntl_0 =
1683          S_028C44_BINNING_MODE(V_028C44_BINNING_ALLOWED) | S_028C44_BIN_SIZE_X(bin_size.width == 16) |
1684          S_028C44_BIN_SIZE_Y(bin_size.height == 16) |
1685          S_028C44_BIN_SIZE_X_EXTEND(util_logbase2(MAX2(bin_size.width, 32)) - 5) |
1686          S_028C44_BIN_SIZE_Y_EXTEND(util_logbase2(MAX2(bin_size.height, 32)) - 5) |
1687          S_028C44_CONTEXT_STATES_PER_BIN(settings->context_states_per_bin - 1) |
1688          S_028C44_PERSISTENT_STATES_PER_BIN(settings->persistent_states_per_bin - 1) |
1689          S_028C44_DISABLE_START_OF_PRIM(1) | S_028C44_FPOVS_PER_BATCH(settings->fpovs_per_batch) |
1690          S_028C44_OPTIMAL_BIN_SELECTION(1) |
1691          S_028C44_FLUSH_ON_BINNING_TRANSITION(pdev->info.family == CHIP_VEGA12 || pdev->info.family == CHIP_VEGA20 ||
1692                                               pdev->info.family >= CHIP_RAVEN2);
1693    } else {
1694       pa_sc_binner_cntl_0 = radv_get_disabled_binning_state(cmd_buffer);
1695    }
1696 
1697    return pa_sc_binner_cntl_0;
1698 }
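/* Encoding note (illustrative): BIN_SIZE_X/Y = 1 selects a 16-pixel bin edge,
 * otherwise BIN_SIZE_X/Y_EXTEND holds log2(size) - 5 (32 -> 0, 64 -> 1,
 * 128 -> 2, 256 -> 3, 512 -> 4).  A 256x128 bin therefore programs
 * BIN_SIZE_X = 0, BIN_SIZE_X_EXTEND = 3, BIN_SIZE_Y = 0, BIN_SIZE_Y_EXTEND = 2.
 */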
1699 
1700 static void
1701 radv_emit_binning_state(struct radv_cmd_buffer *cmd_buffer)
1702 {
1703    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
1704    const struct radv_physical_device *pdev = radv_device_physical(device);
1705    unsigned pa_sc_binner_cntl_0;
1706 
1707    if (pdev->info.gfx_level < GFX9)
1708       return;
1709 
1710    pa_sc_binner_cntl_0 = radv_get_binning_state(cmd_buffer);
1711 
1712    radeon_opt_set_context_reg(cmd_buffer, R_028C44_PA_SC_BINNER_CNTL_0, RADV_TRACKED_PA_SC_BINNER_CNTL_0,
1713                               pa_sc_binner_cntl_0);
1714 }
1715 
1716 static void
1717 radv_emit_shader_prefetch(struct radv_cmd_buffer *cmd_buffer, struct radv_shader *shader)
1718 {
1719    uint64_t va;
1720 
1721    if (!shader)
1722       return;
1723 
1724    va = radv_shader_get_va(shader);
1725 
1726    radv_cp_dma_prefetch(cmd_buffer, va, shader->code_size);
1727 }
1728 
1729 ALWAYS_INLINE static void
1730 radv_emit_prefetch_L2(struct radv_cmd_buffer *cmd_buffer, bool first_stage_only)
1731 {
1732    struct radv_cmd_state *state = &cmd_buffer->state;
1733    uint32_t mask = state->prefetch_L2_mask;
1734 
1735    /* Fast prefetch path for starting draws as soon as possible. */
1736    if (first_stage_only)
1737       mask &= RADV_PREFETCH_VS | RADV_PREFETCH_VBO_DESCRIPTORS | RADV_PREFETCH_MS;
1738 
1739    if (mask & RADV_PREFETCH_VS)
1740       radv_emit_shader_prefetch(cmd_buffer, cmd_buffer->state.shaders[MESA_SHADER_VERTEX]);
1741 
1742    if (mask & RADV_PREFETCH_MS)
1743       radv_emit_shader_prefetch(cmd_buffer, cmd_buffer->state.shaders[MESA_SHADER_MESH]);
1744 
1745    if (mask & RADV_PREFETCH_VBO_DESCRIPTORS)
1746       radv_cp_dma_prefetch(cmd_buffer, state->vb_va, state->vb_size);
1747 
1748    if (mask & RADV_PREFETCH_TCS)
1749       radv_emit_shader_prefetch(cmd_buffer, cmd_buffer->state.shaders[MESA_SHADER_TESS_CTRL]);
1750 
1751    if (mask & RADV_PREFETCH_TES)
1752       radv_emit_shader_prefetch(cmd_buffer, cmd_buffer->state.shaders[MESA_SHADER_TESS_EVAL]);
1753 
1754    if (mask & RADV_PREFETCH_GS) {
1755       radv_emit_shader_prefetch(cmd_buffer, cmd_buffer->state.shaders[MESA_SHADER_GEOMETRY]);
1756       if (cmd_buffer->state.gs_copy_shader)
1757          radv_emit_shader_prefetch(cmd_buffer, cmd_buffer->state.gs_copy_shader);
1758    }
1759 
1760    if (mask & RADV_PREFETCH_PS) {
1761       radv_emit_shader_prefetch(cmd_buffer, cmd_buffer->state.shaders[MESA_SHADER_FRAGMENT]);
1762    }
1763 
1764    state->prefetch_L2_mask &= ~mask;
1765 }
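/* Usage sketch (how the two-phase prefetch is typically driven by the draw
 * path): a first call with first_stage_only = true pushes only the VS/MS
 * binaries and the vertex buffer descriptors into L2 before the draw packet,
 * then a second call with first_stage_only = false prefetches the remaining
 * stages, which the cleared prefetch_L2_mask bits keep from being repeated.
 */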
1766 
1767 static void
1768 radv_emit_rbplus_state(struct radv_cmd_buffer *cmd_buffer)
1769 {
1770    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
1771    const struct radv_physical_device *pdev = radv_device_physical(device);
1772 
1773    assert(pdev->info.rbplus_allowed);
1774 
1775    const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
1776    struct radv_rendering_state *render = &cmd_buffer->state.render;
1777 
1778    unsigned sx_ps_downconvert = 0;
1779    unsigned sx_blend_opt_epsilon = 0;
1780    unsigned sx_blend_opt_control = 0;
1781 
1782    for (unsigned i = 0; i < render->color_att_count; i++) {
1783       unsigned format, swap;
1784       bool has_alpha, has_rgb;
1785       if (render->color_att[i].iview == NULL) {
1786          /* We don't set the DISABLE bits, because the HW can't have holes,
1787           * so the SPI color format is set to 32-bit 1-component. */
1788          sx_ps_downconvert |= V_028754_SX_RT_EXPORT_32_R << (i * 4);
1789          continue;
1790       }
1791 
1792       struct radv_color_buffer_info *cb = &render->color_att[i].cb;
1793 
1794       format = pdev->info.gfx_level >= GFX11 ? G_028C70_FORMAT_GFX11(cb->ac.cb_color_info)
1795                                              : G_028C70_FORMAT_GFX6(cb->ac.cb_color_info);
1796       swap = G_028C70_COMP_SWAP(cb->ac.cb_color_info);
1797       has_alpha = pdev->info.gfx_level >= GFX11 ? !G_028C74_FORCE_DST_ALPHA_1_GFX11(cb->ac.cb_color_attrib)
1798                                                 : !G_028C74_FORCE_DST_ALPHA_1_GFX6(cb->ac.cb_color_attrib);
1799 
1800       uint32_t spi_format = (cmd_buffer->state.spi_shader_col_format >> (i * 4)) & 0xf;
1801       uint32_t colormask = d->vk.cb.attachments[i].write_mask;
1802 
1803       if (format == V_028C70_COLOR_8 || format == V_028C70_COLOR_16 || format == V_028C70_COLOR_32)
1804          has_rgb = !has_alpha;
1805       else
1806          has_rgb = true;
1807 
1808       /* Check the colormask and export format. */
1809       if (!(colormask & 0x7))
1810          has_rgb = false;
1811       if (!(colormask & 0x8))
1812          has_alpha = false;
1813 
1814       if (spi_format == V_028714_SPI_SHADER_ZERO) {
1815          has_rgb = false;
1816          has_alpha = false;
1817       }
1818 
1819       /* The HW doesn't quite blend correctly with rgb9e5 if we disable the alpha
1820        * optimization, even though it has no alpha. */
1821       if (has_rgb && format == V_028C70_COLOR_5_9_9_9)
1822          has_alpha = true;
1823 
1824       /* Disable value checking for disabled channels. */
1825       if (!has_rgb)
1826          sx_blend_opt_control |= S_02875C_MRT0_COLOR_OPT_DISABLE(1) << (i * 4);
1827       if (!has_alpha)
1828          sx_blend_opt_control |= S_02875C_MRT0_ALPHA_OPT_DISABLE(1) << (i * 4);
1829 
1830       /* Enable down-conversion for 32bpp and smaller formats. */
1831       switch (format) {
1832       case V_028C70_COLOR_8:
1833       case V_028C70_COLOR_8_8:
1834       case V_028C70_COLOR_8_8_8_8:
1835          /* For 1 and 2-channel formats, use the superset thereof. */
1836          if (spi_format == V_028714_SPI_SHADER_FP16_ABGR || spi_format == V_028714_SPI_SHADER_UINT16_ABGR ||
1837              spi_format == V_028714_SPI_SHADER_SINT16_ABGR) {
1838             sx_ps_downconvert |= V_028754_SX_RT_EXPORT_8_8_8_8 << (i * 4);
1839 
1840             if (G_028C70_NUMBER_TYPE(cb->ac.cb_color_info) != V_028C70_NUMBER_SRGB)
1841                sx_blend_opt_epsilon |= V_028758_8BIT_FORMAT_0_5 << (i * 4);
1842          }
1843          break;
1844 
1845       case V_028C70_COLOR_5_6_5:
1846          if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) {
1847             sx_ps_downconvert |= V_028754_SX_RT_EXPORT_5_6_5 << (i * 4);
1848             sx_blend_opt_epsilon |= V_028758_6BIT_FORMAT_0_5 << (i * 4);
1849          }
1850          break;
1851 
1852       case V_028C70_COLOR_1_5_5_5:
1853          if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) {
1854             sx_ps_downconvert |= V_028754_SX_RT_EXPORT_1_5_5_5 << (i * 4);
1855             sx_blend_opt_epsilon |= V_028758_5BIT_FORMAT_0_5 << (i * 4);
1856          }
1857          break;
1858 
1859       case V_028C70_COLOR_4_4_4_4:
1860          if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) {
1861             sx_ps_downconvert |= V_028754_SX_RT_EXPORT_4_4_4_4 << (i * 4);
1862             sx_blend_opt_epsilon |= V_028758_4BIT_FORMAT_0_5 << (i * 4);
1863          }
1864          break;
1865 
1866       case V_028C70_COLOR_32:
1867          if (swap == V_028C70_SWAP_STD && spi_format == V_028714_SPI_SHADER_32_R)
1868             sx_ps_downconvert |= V_028754_SX_RT_EXPORT_32_R << (i * 4);
1869          else if (swap == V_028C70_SWAP_ALT_REV && spi_format == V_028714_SPI_SHADER_32_AR)
1870             sx_ps_downconvert |= V_028754_SX_RT_EXPORT_32_A << (i * 4);
1871          break;
1872 
1873       case V_028C70_COLOR_16:
1874       case V_028C70_COLOR_16_16:
1875          /* For 1-channel formats, use the superset thereof. */
1876          if (spi_format == V_028714_SPI_SHADER_UNORM16_ABGR || spi_format == V_028714_SPI_SHADER_SNORM16_ABGR ||
1877              spi_format == V_028714_SPI_SHADER_UINT16_ABGR || spi_format == V_028714_SPI_SHADER_SINT16_ABGR) {
1878             if (swap == V_028C70_SWAP_STD || swap == V_028C70_SWAP_STD_REV)
1879                sx_ps_downconvert |= V_028754_SX_RT_EXPORT_16_16_GR << (i * 4);
1880             else
1881                sx_ps_downconvert |= V_028754_SX_RT_EXPORT_16_16_AR << (i * 4);
1882          }
1883          break;
1884 
1885       case V_028C70_COLOR_10_11_11:
1886          if (spi_format == V_028714_SPI_SHADER_FP16_ABGR)
1887             sx_ps_downconvert |= V_028754_SX_RT_EXPORT_10_11_11 << (i * 4);
1888          break;
1889 
1890       case V_028C70_COLOR_2_10_10_10:
1891          if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) {
1892             sx_ps_downconvert |= V_028754_SX_RT_EXPORT_2_10_10_10 << (i * 4);
1893             sx_blend_opt_epsilon |= V_028758_10BIT_FORMAT_0_5 << (i * 4);
1894          }
1895          break;
1896       case V_028C70_COLOR_5_9_9_9:
1897          if (spi_format == V_028714_SPI_SHADER_FP16_ABGR)
1898             sx_ps_downconvert |= V_028754_SX_RT_EXPORT_9_9_9_E5 << (i * 4);
1899          break;
1900       }
1901    }
1902 
1903    /* Do not set the DISABLE bits for the unused attachments, as that
1904     * breaks dual source blending in SkQP and does not seem to improve
1905     * performance. */
1906 
1907    radeon_opt_set_context_reg3(cmd_buffer, R_028754_SX_PS_DOWNCONVERT, RADV_TRACKED_SX_PS_DOWNCONVERT,
1908                                sx_ps_downconvert, sx_blend_opt_epsilon, sx_blend_opt_control);
1909 
1910    cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_RBPLUS;
1911 }
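/* Packing example (hypothetical MRT1 with an R8G8B8A8_UNORM attachment and an
 * FP16_ABGR export): the switch above selects SX_RT_EXPORT_8_8_8_8 plus the
 * 8-bit 0.5 epsilon, both shifted into bits [7:4] of SX_PS_DOWNCONVERT and
 * SX_BLEND_OPT_EPSILON respectively (one 4-bit field per render target).
 */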
1912 
1913 static void
1914 radv_emit_ps_epilog_state(struct radv_cmd_buffer *cmd_buffer, struct radv_shader_part *ps_epilog)
1915 {
1916    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
1917    const struct radv_physical_device *pdev = radv_device_physical(device);
1918    struct radv_shader *ps_shader = cmd_buffer->state.shaders[MESA_SHADER_FRAGMENT];
1919 
1920    if (cmd_buffer->state.emitted_ps_epilog == ps_epilog)
1921       return;
1922 
1923    if (ps_epilog->spi_shader_z_format) {
1924       if (pdev->info.gfx_level >= GFX12) {
1925          radeon_set_context_reg(cmd_buffer->cs, R_028650_SPI_SHADER_Z_FORMAT, ps_epilog->spi_shader_z_format);
1926       } else {
1927          radeon_set_context_reg(cmd_buffer->cs, R_028710_SPI_SHADER_Z_FORMAT, ps_epilog->spi_shader_z_format);
1928       }
1929    }
1930 
1931    assert(ps_shader->config.num_shared_vgprs == 0);
1932    if (G_00B848_VGPRS(ps_epilog->rsrc1) > G_00B848_VGPRS(ps_shader->config.rsrc1)) {
1933       uint32_t rsrc1 = ps_shader->config.rsrc1;
1934       rsrc1 = (rsrc1 & C_00B848_VGPRS) | (ps_epilog->rsrc1 & ~C_00B848_VGPRS);
1935       radeon_set_sh_reg(cmd_buffer->cs, ps_shader->info.regs.pgm_rsrc1, rsrc1);
1936    }
1937 
1938    radv_cs_add_buffer(device->ws, cmd_buffer->cs, ps_epilog->bo);
1939 
1940    assert((ps_epilog->va >> 32) == pdev->info.address32_hi);
1941 
1942    const uint32_t epilog_pc_offset = radv_get_user_sgpr_loc(ps_shader, AC_UD_EPILOG_PC);
1943    radv_emit_shader_pointer(device, cmd_buffer->cs, epilog_pc_offset, ps_epilog->va, false);
1944 
1945    cmd_buffer->shader_upload_seq = MAX2(cmd_buffer->shader_upload_seq, ps_epilog->upload_seq);
1946 
1947    cmd_buffer->state.emitted_ps_epilog = ps_epilog;
1948 }
1949 
1950 void
1951 radv_emit_compute_shader(const struct radv_physical_device *pdev, struct radeon_cmdbuf *cs,
1952                          const struct radv_shader *shader)
1953 {
1954    uint64_t va = radv_shader_get_va(shader);
1955 
1956    radeon_set_sh_reg(cs, shader->info.regs.pgm_lo, va >> 8);
1957 
1958    radeon_set_sh_reg_seq(cs, shader->info.regs.pgm_rsrc1, 2);
1959    radeon_emit(cs, shader->config.rsrc1);
1960    radeon_emit(cs, shader->config.rsrc2);
1961    if (pdev->info.gfx_level >= GFX10) {
1962       radeon_set_sh_reg(cs, shader->info.regs.pgm_rsrc3, shader->config.rsrc3);
1963    }
1964 
1965    radeon_set_sh_reg(cs, R_00B854_COMPUTE_RESOURCE_LIMITS, shader->info.regs.cs.compute_resource_limits);
1966    radeon_set_sh_reg_seq(cs, R_00B81C_COMPUTE_NUM_THREAD_X, 3);
1967    radeon_emit(cs, shader->info.regs.cs.compute_num_thread_x);
1968    radeon_emit(cs, shader->info.regs.cs.compute_num_thread_y);
1969    radeon_emit(cs, shader->info.regs.cs.compute_num_thread_z);
1970 }
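/* Note: PGM_LO takes va >> 8 because shader code is allocated with 256-byte
 * alignment; e.g. a shader placed at 0x800001000 would program
 * PGM_LO = 0x8000010 (example address, for illustration only).
 */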
1971 
1972 static void
1973 radv_emit_vgt_gs_mode(struct radv_cmd_buffer *cmd_buffer)
1974 {
1975    const struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
1976    const struct radv_physical_device *pdev = radv_device_physical(device);
1977    const struct radv_shader_info *info = &cmd_buffer->state.last_vgt_shader->info;
1978    unsigned vgt_primitiveid_en = 0;
1979    uint32_t vgt_gs_mode = 0;
1980 
1981    if (info->is_ngg)
1982       return;
1983 
1984    if (info->stage == MESA_SHADER_GEOMETRY) {
1985       vgt_gs_mode = ac_vgt_gs_mode(info->gs.vertices_out, pdev->info.gfx_level);
1986    } else if (info->outinfo.export_prim_id || info->uses_prim_id) {
1987       vgt_gs_mode = S_028A40_MODE(V_028A40_GS_SCENARIO_A);
1988       vgt_primitiveid_en |= S_028A84_PRIMITIVEID_EN(1);
1989    }
1990 
1991    radeon_opt_set_context_reg(cmd_buffer, R_028A84_VGT_PRIMITIVEID_EN, RADV_TRACKED_VGT_PRIMITIVEID_EN,
1992                               vgt_primitiveid_en);
1993    radeon_opt_set_context_reg(cmd_buffer, R_028A40_VGT_GS_MODE, RADV_TRACKED_VGT_GS_MODE, vgt_gs_mode);
1994 }
1995 
1996 static void
1997 radv_emit_hw_vs(struct radv_cmd_buffer *cmd_buffer, const struct radv_shader *shader)
1998 {
1999    const struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
2000    const struct radv_physical_device *pdev = radv_device_physical(device);
2001    const uint64_t va = radv_shader_get_va(shader);
2002 
2003    radeon_set_sh_reg_seq(cmd_buffer->cs, shader->info.regs.pgm_lo, 4);
2004    radeon_emit(cmd_buffer->cs, va >> 8);
2005    radeon_emit(cmd_buffer->cs, S_00B124_MEM_BASE(va >> 40));
2006    radeon_emit(cmd_buffer->cs, shader->config.rsrc1);
2007    radeon_emit(cmd_buffer->cs, shader->config.rsrc2);
2008 
2009    radeon_opt_set_context_reg(cmd_buffer, R_0286C4_SPI_VS_OUT_CONFIG, RADV_TRACKED_SPI_VS_OUT_CONFIG,
2010                               shader->info.regs.spi_vs_out_config);
2011    radeon_opt_set_context_reg(cmd_buffer, R_02870C_SPI_SHADER_POS_FORMAT, RADV_TRACKED_SPI_SHADER_POS_FORMAT,
2012                               shader->info.regs.spi_shader_pos_format);
2013    radeon_opt_set_context_reg(cmd_buffer, R_02881C_PA_CL_VS_OUT_CNTL, RADV_TRACKED_PA_CL_VS_OUT_CNTL,
2014                               shader->info.regs.pa_cl_vs_out_cntl);
2015 
2016    if (pdev->info.gfx_level <= GFX8)
2017       radeon_opt_set_context_reg(cmd_buffer, R_028AB4_VGT_REUSE_OFF, RADV_TRACKED_VGT_REUSE_OFF,
2018                                  shader->info.regs.vs.vgt_reuse_off);
2019 
2020    if (pdev->info.gfx_level >= GFX7) {
2021       radeon_set_sh_reg_idx(&pdev->info, cmd_buffer->cs, R_00B118_SPI_SHADER_PGM_RSRC3_VS, 3,
2022                             shader->info.regs.vs.spi_shader_pgm_rsrc3_vs);
2023       radeon_set_sh_reg(cmd_buffer->cs, R_00B11C_SPI_SHADER_LATE_ALLOC_VS,
2024                         shader->info.regs.vs.spi_shader_late_alloc_vs);
2025 
2026       if (pdev->info.gfx_level >= GFX10) {
2027          radeon_set_uconfig_reg(cmd_buffer->cs, R_030980_GE_PC_ALLOC, shader->info.regs.ge_pc_alloc);
2028 
2029          if (shader->info.stage == MESA_SHADER_TESS_EVAL) {
2030             radeon_opt_set_context_reg(cmd_buffer, R_028A44_VGT_GS_ONCHIP_CNTL, RADV_TRACKED_VGT_GS_ONCHIP_CNTL,
2031                                        shader->info.regs.vgt_gs_onchip_cntl);
2032          }
2033       }
2034    }
2035 }
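/* VA split example (arbitrary address): for va = 0x801234567800 the sequence
 * above writes the low dword of va >> 8 (0x12345678) to PGM_LO, while the next
 * dword carries MEM_BASE(va >> 40) = MEM_BASE(0x80), covering the address bits
 * that do not fit in the 32-bit low register.
 */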
2036 
2037 static void
2038 radv_emit_hw_es(struct radv_cmd_buffer *cmd_buffer, const struct radv_shader *shader)
2039 {
2040    const uint64_t va = radv_shader_get_va(shader);
2041 
2042    radeon_set_sh_reg_seq(cmd_buffer->cs, shader->info.regs.pgm_lo, 4);
2043    radeon_emit(cmd_buffer->cs, va >> 8);
2044    radeon_emit(cmd_buffer->cs, S_00B324_MEM_BASE(va >> 40));
2045    radeon_emit(cmd_buffer->cs, shader->config.rsrc1);
2046    radeon_emit(cmd_buffer->cs, shader->config.rsrc2);
2047 }
2048 
2049 static void
2050 radv_emit_hw_ls(struct radv_cmd_buffer *cmd_buffer, const struct radv_shader *shader)
2051 {
2052    const uint64_t va = radv_shader_get_va(shader);
2053 
2054    radeon_set_sh_reg(cmd_buffer->cs, shader->info.regs.pgm_lo, va >> 8);
2055 
2056    radeon_set_sh_reg(cmd_buffer->cs, shader->info.regs.pgm_rsrc1, shader->config.rsrc1);
2057 }
2058 
2059 static void
2060 radv_emit_hw_ngg(struct radv_cmd_buffer *cmd_buffer, const struct radv_shader *es, const struct radv_shader *shader)
2061 {
2062    const struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
2063    const struct radv_physical_device *pdev = radv_device_physical(device);
2064    const uint64_t va = radv_shader_get_va(shader);
2065    gl_shader_stage es_type;
2066    const struct gfx10_ngg_info *ngg_state = &shader->info.ngg_info;
2067 
2068    if (shader->info.stage == MESA_SHADER_GEOMETRY) {
2069       if (shader->info.merged_shader_compiled_separately) {
2070          es_type = es->info.stage;
2071       } else {
2072          es_type = shader->info.gs.es_type;
2073       }
2074    } else {
2075       es_type = shader->info.stage;
2076    }
2077 
2078    if (!shader->info.merged_shader_compiled_separately) {
2079       radeon_set_sh_reg(cmd_buffer->cs, shader->info.regs.pgm_lo, va >> 8);
2080 
2081       radeon_set_sh_reg_seq(cmd_buffer->cs, shader->info.regs.pgm_rsrc1, 2);
2082       radeon_emit(cmd_buffer->cs, shader->config.rsrc1);
2083       radeon_emit(cmd_buffer->cs, shader->config.rsrc2);
2084    }
2085 
2086    const struct radv_vs_output_info *outinfo = &shader->info.outinfo;
2087 
2088    const bool es_enable_prim_id = outinfo->export_prim_id || (es && es->info.uses_prim_id);
2089    bool break_wave_at_eoi = false;
2090 
2091    if (es_type == MESA_SHADER_TESS_EVAL) {
2092       if (es_enable_prim_id || (shader->info.uses_prim_id))
2093          break_wave_at_eoi = true;
2094    }
2095 
2096    if (pdev->info.gfx_level >= GFX12) {
2097       radeon_opt_set_context_reg(cmd_buffer, R_028818_PA_CL_VS_OUT_CNTL, RADV_TRACKED_PA_CL_VS_OUT_CNTL,
2098                                  shader->info.regs.pa_cl_vs_out_cntl);
2099 
2100       radeon_opt_set_context_reg(cmd_buffer, R_028B3C_VGT_GS_INSTANCE_CNT, RADV_TRACKED_VGT_GS_INSTANCE_CNT,
2101                                  shader->info.regs.vgt_gs_instance_cnt);
2102 
2103       radeon_set_uconfig_reg(cmd_buffer->cs, R_030988_VGT_PRIMITIVEID_EN, shader->info.regs.ngg.vgt_primitiveid_en);
2104 
2105       radeon_opt_set_context_reg2(cmd_buffer, R_028648_SPI_SHADER_IDX_FORMAT, RADV_TRACKED_SPI_SHADER_IDX_FORMAT,
2106                                   shader->info.regs.ngg.spi_shader_idx_format, shader->info.regs.spi_shader_pos_format);
2107    } else {
2108       radeon_opt_set_context_reg(cmd_buffer, R_02881C_PA_CL_VS_OUT_CNTL, RADV_TRACKED_PA_CL_VS_OUT_CNTL,
2109                                  shader->info.regs.pa_cl_vs_out_cntl);
2110 
2111       radeon_opt_set_context_reg(cmd_buffer, R_028B90_VGT_GS_INSTANCE_CNT, RADV_TRACKED_VGT_GS_INSTANCE_CNT,
2112                                  shader->info.regs.vgt_gs_instance_cnt);
2113 
2114       radeon_opt_set_context_reg(cmd_buffer, R_028A84_VGT_PRIMITIVEID_EN, RADV_TRACKED_VGT_PRIMITIVEID_EN,
2115                                  shader->info.regs.ngg.vgt_primitiveid_en | S_028A84_PRIMITIVEID_EN(es_enable_prim_id));
2116 
2117       radeon_opt_set_context_reg2(cmd_buffer, R_028708_SPI_SHADER_IDX_FORMAT, RADV_TRACKED_SPI_SHADER_IDX_FORMAT,
2118                                   shader->info.regs.ngg.spi_shader_idx_format, shader->info.regs.spi_shader_pos_format);
2119 
2120       radeon_opt_set_context_reg(cmd_buffer, R_0286C4_SPI_VS_OUT_CONFIG, RADV_TRACKED_SPI_VS_OUT_CONFIG,
2121                                  shader->info.regs.spi_vs_out_config);
2122    }
2123 
2124    radeon_opt_set_context_reg(cmd_buffer, R_0287FC_GE_MAX_OUTPUT_PER_SUBGROUP, RADV_TRACKED_GE_MAX_OUTPUT_PER_SUBGROUP,
2125                               shader->info.regs.ngg.ge_max_output_per_subgroup);
2126 
2127    radeon_opt_set_context_reg(cmd_buffer, R_028B4C_GE_NGG_SUBGRP_CNTL, RADV_TRACKED_GE_NGG_SUBGRP_CNTL,
2128                               shader->info.regs.ngg.ge_ngg_subgrp_cntl);
2129 
2130    uint32_t ge_cntl = shader->info.regs.ngg.ge_cntl;
2131    if (pdev->info.gfx_level >= GFX11) {
2132       ge_cntl |= S_03096C_BREAK_PRIMGRP_AT_EOI(break_wave_at_eoi);
2133    } else {
2134       ge_cntl |= S_03096C_BREAK_WAVE_AT_EOI(break_wave_at_eoi);
2135 
2136       /* Bug workaround for a possible hang with non-tessellation cases.
2137        * Tessellation always sets GE_CNTL.VERT_GRP_SIZE = 0
2138        *
2139        * Requirement: GE_CNTL.VERT_GRP_SIZE = VGT_GS_ONCHIP_CNTL.ES_VERTS_PER_SUBGRP - 5
2140        */
2141       if (pdev->info.gfx_level == GFX10 && es_type != MESA_SHADER_TESS_EVAL && ngg_state->hw_max_esverts != 256) {
2142          ge_cntl &= C_03096C_VERT_GRP_SIZE;
2143 
2144          if (ngg_state->hw_max_esverts > 5) {
2145             ge_cntl |= S_03096C_VERT_GRP_SIZE(ngg_state->hw_max_esverts - 5);
2146          }
2147       }
2148 
2149       radeon_opt_set_context_reg(cmd_buffer, R_028A44_VGT_GS_ONCHIP_CNTL, RADV_TRACKED_VGT_GS_ONCHIP_CNTL,
2150                                  shader->info.regs.vgt_gs_onchip_cntl);
2151    }
2152 
2153    radeon_set_uconfig_reg(cmd_buffer->cs, R_03096C_GE_CNTL, ge_cntl);
2154 
2155    if (pdev->info.gfx_level >= GFX12) {
2156       radeon_set_sh_reg(cmd_buffer->cs, R_00B220_SPI_SHADER_PGM_RSRC4_GS, shader->info.regs.spi_shader_pgm_rsrc4_gs);
2157    } else {
2158       if (pdev->info.gfx_level >= GFX7) {
2159          radeon_set_sh_reg_idx(&pdev->info, cmd_buffer->cs, R_00B21C_SPI_SHADER_PGM_RSRC3_GS, 3,
2160                                shader->info.regs.spi_shader_pgm_rsrc3_gs);
2161       }
2162 
2163       radeon_set_sh_reg_idx(&pdev->info, cmd_buffer->cs, R_00B204_SPI_SHADER_PGM_RSRC4_GS, 3,
2164                             shader->info.regs.spi_shader_pgm_rsrc4_gs);
2165 
2166       radeon_set_uconfig_reg(cmd_buffer->cs, R_030980_GE_PC_ALLOC, shader->info.regs.ge_pc_alloc);
2167    }
2168 }
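/* Workaround example (hypothetical NGG subgroup): on GFX10 without tessellation
 * and with hw_max_esverts = 128, the code above reprograms
 * GE_CNTL.VERT_GRP_SIZE to 128 - 5 = 123 instead of the value baked into
 * ge_cntl at compile time.
 */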
2169 
2170 static void
2171 radv_emit_hw_hs(struct radv_cmd_buffer *cmd_buffer, const struct radv_shader *shader)
2172 {
2173    const struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
2174    const struct radv_physical_device *pdev = radv_device_physical(device);
2175    const uint64_t va = radv_shader_get_va(shader);
2176 
2177    if (pdev->info.gfx_level >= GFX9) {
2178       radeon_set_sh_reg(cmd_buffer->cs, shader->info.regs.pgm_lo, va >> 8);
2179       radeon_set_sh_reg(cmd_buffer->cs, shader->info.regs.pgm_rsrc1, shader->config.rsrc1);
2180    } else {
2181       radeon_set_sh_reg_seq(cmd_buffer->cs, shader->info.regs.pgm_lo, 4);
2182       radeon_emit(cmd_buffer->cs, va >> 8);
2183       radeon_emit(cmd_buffer->cs, S_00B424_MEM_BASE(va >> 40));
2184       radeon_emit(cmd_buffer->cs, shader->config.rsrc1);
2185       radeon_emit(cmd_buffer->cs, shader->config.rsrc2);
2186    }
2187 }
2188 
2189 static void
2190 radv_emit_vertex_shader(struct radv_cmd_buffer *cmd_buffer)
2191 {
2192    const struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
2193    const struct radv_physical_device *pdev = radv_device_physical(device);
2194    const struct radv_shader *vs = cmd_buffer->state.shaders[MESA_SHADER_VERTEX];
2195 
2196    if (vs->info.merged_shader_compiled_separately) {
2197       assert(vs->info.next_stage == MESA_SHADER_TESS_CTRL || vs->info.next_stage == MESA_SHADER_GEOMETRY);
2198 
2199       const struct radv_shader *next_stage = cmd_buffer->state.shaders[vs->info.next_stage];
2200 
2201       if (!vs->info.vs.has_prolog) {
2202          uint32_t rsrc1, rsrc2;
2203 
2204          radeon_set_sh_reg(cmd_buffer->cs, vs->info.regs.pgm_lo, vs->va >> 8);
2205 
2206          if (vs->info.next_stage == MESA_SHADER_TESS_CTRL) {
2207             radv_shader_combine_cfg_vs_tcs(vs, next_stage, &rsrc1, NULL);
2208 
2209             radeon_set_sh_reg(cmd_buffer->cs, vs->info.regs.pgm_rsrc1, rsrc1);
2210          } else {
2211             radv_shader_combine_cfg_vs_gs(vs, next_stage, &rsrc1, &rsrc2);
2212 
2213             unsigned lds_size;
2214             if (next_stage->info.is_ngg) {
2215                lds_size = DIV_ROUND_UP(next_stage->info.ngg_info.lds_size, pdev->info.lds_encode_granularity);
2216             } else {
2217                lds_size = next_stage->info.gs_ring_info.lds_size;
2218             }
2219 
2220             radeon_set_sh_reg_seq(cmd_buffer->cs, vs->info.regs.pgm_rsrc1, 2);
2221             radeon_emit(cmd_buffer->cs, rsrc1);
2222             radeon_emit(cmd_buffer->cs, rsrc2 | S_00B22C_LDS_SIZE(lds_size));
2223          }
2224       }
2225 
2226       const uint32_t next_stage_pc_offset = radv_get_user_sgpr_loc(vs, AC_UD_NEXT_STAGE_PC);
2227       radv_emit_shader_pointer(device, cmd_buffer->cs, next_stage_pc_offset, next_stage->va, false);
2228       return;
2229    }
2230 
2231    if (vs->info.vs.as_ls)
2232       radv_emit_hw_ls(cmd_buffer, vs);
2233    else if (vs->info.vs.as_es)
2234       radv_emit_hw_es(cmd_buffer, vs);
2235    else if (vs->info.is_ngg)
2236       radv_emit_hw_ngg(cmd_buffer, NULL, vs);
2237    else
2238       radv_emit_hw_vs(cmd_buffer, vs);
2239 }
2240 
2241 static void
2242 radv_emit_tess_ctrl_shader(struct radv_cmd_buffer *cmd_buffer)
2243 {
2244    const struct radv_shader *tcs = cmd_buffer->state.shaders[MESA_SHADER_TESS_CTRL];
2245 
2246    if (tcs->info.merged_shader_compiled_separately) {
2247       /* When VS+TCS are compiled separately on GFX9+, the VS will jump to the TCS and everything is
2248        * emitted as part of the VS.
2249        */
2250       return;
2251    }
2252 
2253    radv_emit_hw_hs(cmd_buffer, tcs);
2254 }
2255 
2256 static void
2257 radv_emit_tess_eval_shader(struct radv_cmd_buffer *cmd_buffer)
2258 {
2259    const struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
2260    const struct radv_physical_device *pdev = radv_device_physical(device);
2261    const struct radv_shader *tes = cmd_buffer->state.shaders[MESA_SHADER_TESS_EVAL];
2262 
2263    if (tes->info.merged_shader_compiled_separately) {
2264       assert(tes->info.next_stage == MESA_SHADER_GEOMETRY);
2265 
2266       const struct radv_shader *gs = cmd_buffer->state.shaders[MESA_SHADER_GEOMETRY];
2267       uint32_t rsrc1, rsrc2;
2268 
2269       radv_shader_combine_cfg_tes_gs(tes, gs, &rsrc1, &rsrc2);
2270 
2271       radeon_set_sh_reg(cmd_buffer->cs, tes->info.regs.pgm_lo, tes->va >> 8);
2272 
2273       unsigned lds_size;
2274       if (gs->info.is_ngg) {
2275          lds_size = DIV_ROUND_UP(gs->info.ngg_info.lds_size, pdev->info.lds_encode_granularity);
2276       } else {
2277          lds_size = gs->info.gs_ring_info.lds_size;
2278       }
2279 
2280       radeon_set_sh_reg_seq(cmd_buffer->cs, tes->info.regs.pgm_rsrc1, 2);
2281       radeon_emit(cmd_buffer->cs, rsrc1);
2282       radeon_emit(cmd_buffer->cs, rsrc2 | S_00B22C_LDS_SIZE(lds_size));
2283 
2284       const uint32_t next_stage_pc_offset = radv_get_user_sgpr_loc(tes, AC_UD_NEXT_STAGE_PC);
2285       radv_emit_shader_pointer(device, cmd_buffer->cs, next_stage_pc_offset, gs->va, false);
2286       return;
2287    }
2288 
2289    if (tes->info.is_ngg) {
2290       radv_emit_hw_ngg(cmd_buffer, NULL, tes);
2291    } else if (tes->info.tes.as_es) {
2292       radv_emit_hw_es(cmd_buffer, tes);
2293    } else {
2294       radv_emit_hw_vs(cmd_buffer, tes);
2295    }
2296 }
2297 
2298 static void
2299 radv_emit_hw_gs(struct radv_cmd_buffer *cmd_buffer, const struct radv_shader *gs)
2300 {
2301    const struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
2302    const struct radv_physical_device *pdev = radv_device_physical(device);
2303    const struct radv_legacy_gs_info *gs_state = &gs->info.gs_ring_info;
2304    const uint64_t va = radv_shader_get_va(gs);
2305 
2306    radeon_opt_set_context_reg3(cmd_buffer, R_028A60_VGT_GSVS_RING_OFFSET_1, RADV_TRACKED_VGT_GSVS_RING_OFFSET_1,
2307                                gs->info.regs.gs.vgt_gsvs_ring_offset[0], gs->info.regs.gs.vgt_gsvs_ring_offset[1],
2308                                gs->info.regs.gs.vgt_gsvs_ring_offset[2]);
2309 
2310    radeon_opt_set_context_reg(cmd_buffer, R_028AB0_VGT_GSVS_RING_ITEMSIZE, RADV_TRACKED_VGT_GSVS_RING_ITEMSIZE,
2311                               gs->info.regs.gs.vgt_gsvs_ring_itemsize);
2312 
2313    radeon_opt_set_context_reg4(cmd_buffer, R_028B5C_VGT_GS_VERT_ITEMSIZE, RADV_TRACKED_VGT_GS_VERT_ITEMSIZE,
2314                                gs->info.regs.gs.vgt_gs_vert_itemsize[0], gs->info.regs.gs.vgt_gs_vert_itemsize[1],
2315                                gs->info.regs.gs.vgt_gs_vert_itemsize[2], gs->info.regs.gs.vgt_gs_vert_itemsize[3]);
2316 
2317    radeon_opt_set_context_reg(cmd_buffer, R_028B90_VGT_GS_INSTANCE_CNT, RADV_TRACKED_VGT_GS_INSTANCE_CNT,
2318                               gs->info.regs.gs.vgt_gs_instance_cnt);
2319 
2320    if (pdev->info.gfx_level >= GFX9) {
2321       if (!gs->info.merged_shader_compiled_separately) {
2322          radeon_set_sh_reg(cmd_buffer->cs, gs->info.regs.pgm_lo, va >> 8);
2323 
2324          radeon_set_sh_reg_seq(cmd_buffer->cs, gs->info.regs.pgm_rsrc1, 2);
2325          radeon_emit(cmd_buffer->cs, gs->config.rsrc1);
2326          radeon_emit(cmd_buffer->cs, gs->config.rsrc2 | S_00B22C_LDS_SIZE(gs_state->lds_size));
2327       }
2328 
2329       radeon_opt_set_context_reg(cmd_buffer, R_028A44_VGT_GS_ONCHIP_CNTL, RADV_TRACKED_VGT_GS_ONCHIP_CNTL,
2330                                  gs->info.regs.vgt_gs_onchip_cntl);
2331 
2332       if (pdev->info.gfx_level == GFX9) {
2333          radeon_opt_set_context_reg(cmd_buffer, R_028A94_VGT_GS_MAX_PRIMS_PER_SUBGROUP,
2334                                     RADV_TRACKED_VGT_GS_MAX_PRIMS_PER_SUBGROUP,
2335                                     gs->info.regs.gs.vgt_gs_max_prims_per_subgroup);
2336       }
2337    } else {
2338       radeon_set_sh_reg_seq(cmd_buffer->cs, gs->info.regs.pgm_lo, 4);
2339       radeon_emit(cmd_buffer->cs, va >> 8);
2340       radeon_emit(cmd_buffer->cs, S_00B224_MEM_BASE(va >> 40));
2341       radeon_emit(cmd_buffer->cs, gs->config.rsrc1);
2342       radeon_emit(cmd_buffer->cs, gs->config.rsrc2);
2343 
2344       /* GFX6-8: ESGS offchip ring buffer is allocated according to VGT_ESGS_RING_ITEMSIZE.
2345        * GFX9+: Only used to set the GS input VGPRs, emulated in shaders.
2346        */
2347       radeon_opt_set_context_reg(cmd_buffer, R_028AAC_VGT_ESGS_RING_ITEMSIZE, RADV_TRACKED_VGT_ESGS_RING_ITEMSIZE,
2348                                  gs->info.regs.gs.vgt_esgs_ring_itemsize);
2349    }
2350 
2351    if (pdev->info.gfx_level >= GFX7) {
2352       radeon_set_sh_reg_idx(&pdev->info, cmd_buffer->cs, R_00B21C_SPI_SHADER_PGM_RSRC3_GS, 3,
2353                             gs->info.regs.spi_shader_pgm_rsrc3_gs);
2354    }
2355 
2356    if (pdev->info.gfx_level >= GFX10) {
2357       radeon_set_sh_reg_idx(&pdev->info, cmd_buffer->cs, R_00B204_SPI_SHADER_PGM_RSRC4_GS, 3,
2358                             gs->info.regs.spi_shader_pgm_rsrc4_gs);
2359    }
2360 }
2361 
2362 static void
2363 radv_emit_geometry_shader(struct radv_cmd_buffer *cmd_buffer)
2364 {
2365    const struct radv_shader *gs = cmd_buffer->state.shaders[MESA_SHADER_GEOMETRY];
2366    const struct radv_shader *es = cmd_buffer->state.shaders[MESA_SHADER_TESS_EVAL]
2367                                      ? cmd_buffer->state.shaders[MESA_SHADER_TESS_EVAL]
2368                                      : cmd_buffer->state.shaders[MESA_SHADER_VERTEX];
2369    if (gs->info.is_ngg) {
2370       radv_emit_hw_ngg(cmd_buffer, es, gs);
2371    } else {
2372       radv_emit_hw_gs(cmd_buffer, gs);
2373       radv_emit_hw_vs(cmd_buffer, cmd_buffer->state.gs_copy_shader);
2374    }
2375 
2376    radeon_opt_set_context_reg(cmd_buffer, R_028B38_VGT_GS_MAX_VERT_OUT, RADV_TRACKED_VGT_GS_MAX_VERT_OUT,
2377                               gs->info.regs.vgt_gs_max_vert_out);
2378 
2379    if (gs->info.merged_shader_compiled_separately) {
2380       const uint32_t vgt_esgs_ring_itemsize_offset = radv_get_user_sgpr_loc(gs, AC_UD_VGT_ESGS_RING_ITEMSIZE);
2381 
2382       assert(vgt_esgs_ring_itemsize_offset);
2383 
2384       radeon_set_sh_reg(cmd_buffer->cs, vgt_esgs_ring_itemsize_offset, es->info.esgs_itemsize / 4);
2385 
2386       if (gs->info.is_ngg) {
2387          const uint32_t ngg_lds_layout_offset = radv_get_user_sgpr_loc(gs, AC_UD_NGG_LDS_LAYOUT);
2388 
2389          assert(ngg_lds_layout_offset);
2390          assert(!(gs->info.ngg_info.esgs_ring_size & 0xffff0000) && !(gs->info.ngg_info.scratch_lds_base & 0xffff0000));
2391 
2392          radeon_set_sh_reg(cmd_buffer->cs, ngg_lds_layout_offset,
2393                            SET_SGPR_FIELD(NGG_LDS_LAYOUT_GS_OUT_VERTEX_BASE, gs->info.ngg_info.esgs_ring_size) |
2394                               SET_SGPR_FIELD(NGG_LDS_LAYOUT_SCRATCH_BASE, gs->info.ngg_info.scratch_lds_base));
2395       }
2396    }
2397 }
2398 
2399 static void
2400 radv_emit_vgt_gs_out(struct radv_cmd_buffer *cmd_buffer, uint32_t vgt_gs_out_prim_type)
2401 {
2402    const struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
2403    const struct radv_physical_device *pdev = radv_device_physical(device);
2404 
2405    if (pdev->info.gfx_level >= GFX11) {
2406       radeon_set_uconfig_reg(cmd_buffer->cs, R_030998_VGT_GS_OUT_PRIM_TYPE, vgt_gs_out_prim_type);
2407    } else {
2408       radeon_opt_set_context_reg(cmd_buffer, R_028A6C_VGT_GS_OUT_PRIM_TYPE, RADV_TRACKED_VGT_GS_OUT_PRIM_TYPE,
2409                                  vgt_gs_out_prim_type);
2410    }
2411 }
2412 
2413 static void
2414 radv_emit_mesh_shader(struct radv_cmd_buffer *cmd_buffer)
2415 {
2416    const struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
2417    const struct radv_physical_device *pdev = radv_device_physical(device);
2418    const struct radv_shader *ms = cmd_buffer->state.shaders[MESA_SHADER_MESH];
2419    const uint32_t gs_out = radv_conv_gl_prim_to_gs_out(ms->info.ms.output_prim);
2420 
2421    radv_emit_hw_ngg(cmd_buffer, NULL, ms);
2422    radeon_opt_set_context_reg(cmd_buffer, R_028B38_VGT_GS_MAX_VERT_OUT, RADV_TRACKED_VGT_GS_MAX_VERT_OUT,
2423                               ms->info.regs.vgt_gs_max_vert_out);
2424    radeon_set_uconfig_reg_idx(&pdev->info, cmd_buffer->cs, R_030908_VGT_PRIMITIVE_TYPE, 1, V_008958_DI_PT_POINTLIST);
2425 
2426    if (pdev->mesh_fast_launch_2) {
2427       radeon_set_sh_reg_seq(cmd_buffer->cs, R_00B2B0_SPI_SHADER_GS_MESHLET_DIM, 2);
2428       radeon_emit(cmd_buffer->cs, ms->info.regs.ms.spi_shader_gs_meshlet_dim);
2429       radeon_emit(cmd_buffer->cs, ms->info.regs.ms.spi_shader_gs_meshlet_exp_alloc);
2430    }
2431 
2432    radv_emit_vgt_gs_out(cmd_buffer, gs_out);
2433 }
2434 
2435 enum radv_ps_in_type {
2436    radv_ps_in_interpolated,
2437    radv_ps_in_flat,
2438    radv_ps_in_explicit,
2439    radv_ps_in_explicit_strict,
2440    radv_ps_in_interpolated_fp16,
2441    radv_ps_in_interpolated_fp16_hi,
2442    radv_ps_in_per_prim_gfx103,
2443    radv_ps_in_per_prim_gfx11,
2444 };
2445 
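/* Convert a parameter-export offset from the last pre-rasterization stage into an
 * SPI_PS_INPUT_CNTL value. Small worked example derived from the code below: a flat-shaded
 * input exported at offset 3 becomes S_028644_OFFSET(3) | S_028644_FLAT_SHADE(1), while a
 * DEFAULT_VAL constant becomes S_028644_OFFSET(0x20) plus the matching DEFAULT_VAL index.
 */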
2446 static uint32_t
2447 offset_to_ps_input(const uint32_t offset, const enum radv_ps_in_type type)
2448 {
2449    assert(offset != AC_EXP_PARAM_UNDEFINED);
2450 
2451    if (offset >= AC_EXP_PARAM_DEFAULT_VAL_0000 && offset <= AC_EXP_PARAM_DEFAULT_VAL_1111) {
2452       /* The input is a DEFAULT_VAL constant. */
2453       return S_028644_OFFSET(0x20) | S_028644_DEFAULT_VAL(offset - AC_EXP_PARAM_DEFAULT_VAL_0000);
2454    }
2455 
2456    assert(offset <= AC_EXP_PARAM_OFFSET_31);
2457    uint32_t ps_input_cntl = S_028644_OFFSET(offset);
2458 
2459    switch (type) {
2460    case radv_ps_in_explicit_strict:
2461       /* Rotate parameter cache contents to strict vertex order. */
2462       ps_input_cntl |= S_028644_ROTATE_PC_PTR(1);
2463       FALLTHROUGH;
2464    case radv_ps_in_explicit:
2465       /* Force parameter cache to be read in passthrough mode. */
2466       ps_input_cntl |= S_028644_OFFSET(1 << 5);
2467       FALLTHROUGH;
2468    case radv_ps_in_flat:
2469       ps_input_cntl |= S_028644_FLAT_SHADE(1);
2470       break;
2471    case radv_ps_in_interpolated_fp16_hi:
2472       ps_input_cntl |= S_028644_ATTR1_VALID(1);
2473       FALLTHROUGH;
2474    case radv_ps_in_interpolated_fp16:
2475       /* These must be set even if only the high 16 bits are used. */
2476       ps_input_cntl |= S_028644_FP16_INTERP_MODE(1) | S_028644_ATTR0_VALID(1);
2477       break;
2478    case radv_ps_in_per_prim_gfx11:
2479       ps_input_cntl |= S_028644_PRIM_ATTR(1);
2480       break;
2481    case radv_ps_in_interpolated:
2482    case radv_ps_in_per_prim_gfx103:
2483       break;
2484    }
2485 
2486    return ps_input_cntl;
2487 }
2488 
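/* Append one SPI_PS_INPUT_CNTL entry for the given varying slot. If the last VGT shader does
 * not export the slot, either substitute the constant 0.0 default (use_default_0) or skip it.
 */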
2489 static void
2490 slot_to_ps_input(const struct radv_vs_output_info *outinfo, unsigned slot, uint32_t *ps_input_cntl, unsigned *ps_offset,
2491                  const bool use_default_0, const enum radv_ps_in_type type)
2492 {
2493    unsigned vs_offset = outinfo->vs_output_param_offset[slot];
2494 
2495    if (vs_offset == AC_EXP_PARAM_UNDEFINED) {
2496       if (use_default_0)
2497          vs_offset = AC_EXP_PARAM_DEFAULT_VAL_0000;
2498       else
2499          return;
2500    }
2501 
2502    ps_input_cntl[*ps_offset] = offset_to_ps_input(vs_offset, type);
2503    ++(*ps_offset);
2504 }
2505 
2506 static void
2507 input_mask_to_ps_inputs(const struct radv_vs_output_info *outinfo, const struct radv_shader *ps, uint32_t input_mask,
2508                         uint32_t *ps_input_cntl, unsigned *ps_offset, const enum radv_ps_in_type default_type)
2509 {
2510    u_foreach_bit (i, input_mask) {
2511       unsigned vs_offset = outinfo->vs_output_param_offset[VARYING_SLOT_VAR0 + i];
2512       if (vs_offset == AC_EXP_PARAM_UNDEFINED) {
2513          ps_input_cntl[*ps_offset] = S_028644_OFFSET(0x20);
2514          ++(*ps_offset);
2515          continue;
2516       }
2517 
2518       enum radv_ps_in_type type = default_type;
2519 
2520       if (ps->info.ps.explicit_shaded_mask & BITFIELD_BIT(*ps_offset))
2521          type = radv_ps_in_explicit;
2522       else if (ps->info.ps.explicit_strict_shaded_mask & BITFIELD_BIT(*ps_offset))
2523          type = radv_ps_in_explicit_strict;
2524       else if (ps->info.ps.float16_hi_shaded_mask & BITFIELD_BIT(*ps_offset))
2525          type = radv_ps_in_interpolated_fp16_hi;
2526       else if (ps->info.ps.float16_shaded_mask & BITFIELD_BIT(*ps_offset))
2527          type = radv_ps_in_interpolated_fp16;
2528       else if (ps->info.ps.float32_shaded_mask & BITFIELD_BIT(*ps_offset))
2529          type = radv_ps_in_interpolated;
2530 
2531       ps_input_cntl[*ps_offset] = offset_to_ps_input(vs_offset, type);
2532       ++(*ps_offset);
2533    }
2534 }
2535 
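/* Build and emit SPI_PS_INPUT_CNTL_0..31, mapping fragment shader inputs to the parameter
 * exports of the last VGT shader. Per-vertex inputs are added first; per-primitive (mesh)
 * inputs must come last as required by the hardware.
 */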
2536 static void
2537 radv_emit_ps_inputs(struct radv_cmd_buffer *cmd_buffer)
2538 {
2539    const struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
2540    const struct radv_physical_device *pdev = radv_device_physical(device);
2541    const struct radv_shader *ps = cmd_buffer->state.shaders[MESA_SHADER_FRAGMENT];
2542    const struct radv_shader *last_vgt_shader = cmd_buffer->state.last_vgt_shader;
2543    const struct radv_vs_output_info *outinfo = &last_vgt_shader->info.outinfo;
2544    const bool mesh = last_vgt_shader->info.stage == MESA_SHADER_MESH;
2545    const bool gfx11plus = pdev->info.gfx_level >= GFX11;
2546    const enum radv_ps_in_type per_prim = gfx11plus ? radv_ps_in_per_prim_gfx11 : radv_ps_in_per_prim_gfx103;
2547 
2548    uint32_t ps_input_cntl[32];
2549    unsigned ps_offset = 0;
2550 
2551    if (!mesh) {
2552       if (ps->info.ps.prim_id_input)
2553          slot_to_ps_input(outinfo, VARYING_SLOT_PRIMITIVE_ID, ps_input_cntl, &ps_offset, false, radv_ps_in_flat);
2554 
2555       if (ps->info.ps.layer_input)
2556          slot_to_ps_input(outinfo, VARYING_SLOT_LAYER, ps_input_cntl, &ps_offset, true, radv_ps_in_flat);
2557 
2558       if (ps->info.ps.viewport_index_input)
2559          slot_to_ps_input(outinfo, VARYING_SLOT_VIEWPORT, ps_input_cntl, &ps_offset, true, radv_ps_in_flat);
2560    }
2561 
2562    if (ps->info.ps.has_pcoord)
2563       ps_input_cntl[ps_offset++] = S_028644_PT_SPRITE_TEX(1) | S_028644_OFFSET(0x20);
2564 
2565    if (ps->info.ps.input_clips_culls_mask & 0x0f)
2566       slot_to_ps_input(outinfo, VARYING_SLOT_CLIP_DIST0, ps_input_cntl, &ps_offset, false, radv_ps_in_interpolated);
2567 
2568    if (ps->info.ps.input_clips_culls_mask & 0xf0)
2569       slot_to_ps_input(outinfo, VARYING_SLOT_CLIP_DIST1, ps_input_cntl, &ps_offset, false, radv_ps_in_interpolated);
2570 
2571    input_mask_to_ps_inputs(outinfo, ps, ps->info.ps.input_mask, ps_input_cntl, &ps_offset, radv_ps_in_flat);
2572 
2573    /* Per-primitive PS inputs: the HW needs these to be last. */
2574    if (mesh) {
2575       if (ps->info.ps.prim_id_input)
2576          slot_to_ps_input(outinfo, VARYING_SLOT_PRIMITIVE_ID, ps_input_cntl, &ps_offset, false, per_prim);
2577 
2578       if (ps->info.ps.layer_input)
2579          slot_to_ps_input(outinfo, VARYING_SLOT_LAYER, ps_input_cntl, &ps_offset, true, per_prim);
2580 
2581       if (ps->info.ps.viewport_index_input)
2582          slot_to_ps_input(outinfo, VARYING_SLOT_VIEWPORT, ps_input_cntl, &ps_offset, true, per_prim);
2583    }
2584 
2585    input_mask_to_ps_inputs(outinfo, ps, ps->info.ps.input_per_primitive_mask, ps_input_cntl, &ps_offset, per_prim);
2586 
2587    if (pdev->info.gfx_level >= GFX12) {
2588       radeon_set_sh_reg(cmd_buffer->cs, R_00B0C4_SPI_SHADER_GS_OUT_CONFIG_PS,
2589                         last_vgt_shader->info.regs.spi_vs_out_config | ps->info.regs.ps.spi_gs_out_config_ps);
2590 
2591       radeon_opt_set_context_regn(cmd_buffer, R_028664_SPI_PS_INPUT_CNTL_0, ps_input_cntl,
2592                                   cmd_buffer->tracked_regs.spi_ps_input_cntl, ps_offset);
2593    } else {
2594       radeon_opt_set_context_regn(cmd_buffer, R_028644_SPI_PS_INPUT_CNTL_0, ps_input_cntl,
2595                                   cmd_buffer->tracked_regs.spi_ps_input_cntl, ps_offset);
2596    }
2597 }
2598 
2599 static void
2600 radv_emit_fragment_shader(struct radv_cmd_buffer *cmd_buffer)
2601 {
2602    const struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
2603    const struct radv_physical_device *pdev = radv_device_physical(device);
2604    const struct radv_shader *ps = cmd_buffer->state.shaders[MESA_SHADER_FRAGMENT];
2605    const uint64_t va = radv_shader_get_va(ps);
2606 
2607    radeon_set_sh_reg_seq(cmd_buffer->cs, ps->info.regs.pgm_lo, 4);
2608    radeon_emit(cmd_buffer->cs, va >> 8);
2609    radeon_emit(cmd_buffer->cs, S_00B024_MEM_BASE(va >> 40));
2610    radeon_emit(cmd_buffer->cs, ps->config.rsrc1);
2611    radeon_emit(cmd_buffer->cs, ps->config.rsrc2);
2612 
2613    if (pdev->info.gfx_level >= GFX12) {
2614       radeon_opt_set_context_reg2(cmd_buffer, R_02865C_SPI_PS_INPUT_ENA, RADV_TRACKED_SPI_PS_INPUT_ENA,
2615                                   ps->config.spi_ps_input_ena, ps->config.spi_ps_input_addr);
2616 
2617       radeon_opt_set_context_reg(cmd_buffer, R_028640_SPI_PS_IN_CONTROL, RADV_TRACKED_SPI_PS_IN_CONTROL,
2618                                  ps->info.regs.ps.spi_ps_in_control);
2619 
2620       radeon_set_context_reg(cmd_buffer->cs, R_028650_SPI_SHADER_Z_FORMAT, ps->info.regs.ps.spi_shader_z_format);
2621 
2622       radeon_set_context_reg(cmd_buffer->cs, R_028BBC_PA_SC_HISZ_CONTROL, ps->info.regs.ps.pa_sc_hisz_control);
2623    } else {
2624       radeon_opt_set_context_reg2(cmd_buffer, R_0286CC_SPI_PS_INPUT_ENA, RADV_TRACKED_SPI_PS_INPUT_ENA,
2625                                   ps->config.spi_ps_input_ena, ps->config.spi_ps_input_addr);
2626 
2627       radeon_opt_set_context_reg(cmd_buffer, R_0286D8_SPI_PS_IN_CONTROL, RADV_TRACKED_SPI_PS_IN_CONTROL,
2628                                  ps->info.regs.ps.spi_ps_in_control);
2629 
2630       radeon_opt_set_context_reg(cmd_buffer, R_028710_SPI_SHADER_Z_FORMAT, RADV_TRACKED_SPI_SHADER_Z_FORMAT,
2631                                  ps->info.regs.ps.spi_shader_z_format);
2632 
2633       if (pdev->info.gfx_level >= GFX9 && pdev->info.gfx_level < GFX11)
2634          radeon_opt_set_context_reg(cmd_buffer, R_028C40_PA_SC_SHADER_CONTROL, RADV_TRACKED_PA_SC_SHADER_CONTROL,
2635                                     ps->info.regs.ps.pa_sc_shader_control);
2636    }
2637 }
2638 
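/* Emit vertex reuse state: reuse is disabled for legacy tess+GS on GFX10.3 as a hang
 * workaround, and the reuse depth is reduced to 14 on Polaris10..GFX9 when tessellation uses
 * fractional-odd spacing.
 */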
2639 static void
2640 radv_emit_vgt_reuse(struct radv_cmd_buffer *cmd_buffer, const struct radv_vgt_shader_key *key)
2641 {
2642    const struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
2643    const struct radv_physical_device *pdev = radv_device_physical(device);
2644    const struct radv_shader *tes = radv_get_shader(cmd_buffer->state.shaders, MESA_SHADER_TESS_EVAL);
2645 
2646    if (pdev->info.gfx_level == GFX10_3) {
2647       /* Legacy Tess+GS should disable reuse to prevent hangs on GFX10.3. */
2648       const bool has_legacy_tess_gs = key->tess && key->gs && !key->ngg;
2649 
2650       radeon_opt_set_context_reg(cmd_buffer, R_028AB4_VGT_REUSE_OFF, RADV_TRACKED_VGT_REUSE_OFF,
2651                                  S_028AB4_REUSE_OFF(has_legacy_tess_gs));
2652    }
2653 
2654    if (pdev->info.family >= CHIP_POLARIS10 && pdev->info.gfx_level < GFX10) {
2655       unsigned vtx_reuse_depth = 30;
2656       if (tes && tes->info.tes.spacing == TESS_SPACING_FRACTIONAL_ODD) {
2657          vtx_reuse_depth = 14;
2658       }
2659       radeon_opt_set_context_reg(cmd_buffer, R_028C58_VGT_VERTEX_REUSE_BLOCK_CNTL,
2660                                  RADV_TRACKED_VGT_VERTEX_REUSE_BLOCK_CNTL, S_028C58_VTX_REUSE_DEPTH(vtx_reuse_depth));
2661    }
2662 }
2663 
2664 static void
2665 radv_emit_vgt_shader_config_gfx12(struct radv_cmd_buffer *cmd_buffer, const struct radv_vgt_shader_key *key)
2666 {
2667    const bool ngg_wave_id_en = key->ngg_streamout || (key->mesh && key->mesh_scratch_ring);
2668    uint32_t stages = 0;
2669 
2670    stages |= S_028A98_GS_EN(key->gs) | S_028A98_GS_FAST_LAUNCH(key->mesh) | S_028A98_GS_W32_EN(key->gs_wave32) |
2671              S_028A98_NGG_WAVE_ID_EN(ngg_wave_id_en) | S_028A98_PRIMGEN_PASSTHRU_NO_MSG(key->ngg_passthrough);
2672 
2673    if (key->tess)
2674       stages |= S_028A98_HS_EN(1) | S_028A98_HS_W32_EN(key->hs_wave32);
2675 
2676    radeon_opt_set_context_reg(cmd_buffer, R_028A98_VGT_SHADER_STAGES_EN, RADV_TRACKED_VGT_SHADER_STAGES_EN, stages);
2677 }
2678 
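/* Select the HW stage mapping in VGT_SHADER_STAGES_EN for GFX6-11. Reading the branches below:
 * e.g. NGG VS+TCS+TES sets LS_EN + HS_EN + ES_EN(DS) + PRIMGEN_EN, while legacy VS+GS sets
 * ES_EN(REAL) + GS_EN and routes the GS copy shader through VS_EN(COPY_SHADER).
 */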
2679 static void
2680 radv_emit_vgt_shader_config_gfx6(struct radv_cmd_buffer *cmd_buffer, const struct radv_vgt_shader_key *key)
2681 {
2682    const struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
2683    const struct radv_physical_device *pdev = radv_device_physical(device);
2684    uint32_t stages = 0;
2685 
2686    if (key->tess) {
2687       stages |=
2688          S_028B54_LS_EN(V_028B54_LS_STAGE_ON) | S_028B54_HS_EN(1) | S_028B54_DYNAMIC_HS(pdev->info.gfx_level != GFX9);
2689 
2690       if (key->gs)
2691          stages |= S_028B54_ES_EN(V_028B54_ES_STAGE_DS) | S_028B54_GS_EN(1);
2692       else if (key->ngg)
2693          stages |= S_028B54_ES_EN(V_028B54_ES_STAGE_DS);
2694       else
2695          stages |= S_028B54_VS_EN(V_028B54_VS_STAGE_DS);
2696    } else if (key->gs) {
2697       stages |= S_028B54_ES_EN(V_028B54_ES_STAGE_REAL) | S_028B54_GS_EN(1);
2698    } else if (key->mesh) {
2699       assert(!key->ngg_passthrough);
2700       unsigned gs_fast_launch = pdev->mesh_fast_launch_2 ? 2 : 1;
2701       stages |=
2702          S_028B54_GS_EN(1) | S_028B54_GS_FAST_LAUNCH(gs_fast_launch) | S_028B54_NGG_WAVE_ID_EN(key->mesh_scratch_ring);
2703    } else if (key->ngg) {
2704       stages |= S_028B54_ES_EN(V_028B54_ES_STAGE_REAL);
2705    }
2706 
2707    if (key->ngg) {
2708       stages |= S_028B54_PRIMGEN_EN(1) | S_028B54_NGG_WAVE_ID_EN(key->ngg_streamout) |
2709                 S_028B54_PRIMGEN_PASSTHRU_EN(key->ngg_passthrough) |
2710                 S_028B54_PRIMGEN_PASSTHRU_NO_MSG(key->ngg_passthrough && pdev->info.family >= CHIP_NAVI23);
2711    } else if (key->gs) {
2712       stages |= S_028B54_VS_EN(V_028B54_VS_STAGE_COPY_SHADER);
2713    }
2714 
2715    if (pdev->info.gfx_level >= GFX9)
2716       stages |= S_028B54_MAX_PRIMGRP_IN_WAVE(2);
2717 
2718    if (pdev->info.gfx_level >= GFX10) {
2719       stages |= S_028B54_HS_W32_EN(key->hs_wave32) | S_028B54_GS_W32_EN(key->gs_wave32) |
2720                 S_028B54_VS_W32_EN(pdev->info.gfx_level < GFX11 && key->vs_wave32);
2721       /* Legacy GS only supports Wave64. Read it as an implication. */
2722       assert(!(key->gs && !key->ngg) || !key->gs_wave32);
2723    }
2724 
2725    radeon_opt_set_context_reg(cmd_buffer, R_028B54_VGT_SHADER_STAGES_EN, RADV_TRACKED_VGT_SHADER_STAGES_EN, stages);
2726 }
2727 
2728 static void
2729 radv_emit_vgt_shader_config(struct radv_cmd_buffer *cmd_buffer, const struct radv_vgt_shader_key *key)
2730 {
2731    const struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
2732    const struct radv_physical_device *pdev = radv_device_physical(device);
2733 
2734    if (pdev->info.gfx_level >= GFX12) {
2735       radv_emit_vgt_shader_config_gfx12(cmd_buffer, key);
2736    } else {
2737       radv_emit_vgt_shader_config_gfx6(cmd_buffer, key);
2738    }
2739 }
2740 
2741 static void
2742 gfx103_emit_vgt_draw_payload_cntl(struct radv_cmd_buffer *cmd_buffer)
2743 {
2744    const struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
2745    const struct radv_physical_device *pdev = radv_device_physical(device);
2746    const struct radv_shader *mesh_shader = cmd_buffer->state.shaders[MESA_SHADER_MESH];
2747    const bool enable_vrs = cmd_buffer->state.uses_vrs;
2748    bool enable_prim_payload = false;
2749 
2750    /* Enables the second channel of the primitive export instruction.
2751     * This channel contains: VRS rate x, y, viewport and layer.
2752     */
2753    if (mesh_shader) {
2754       const struct radv_vs_output_info *outinfo = &mesh_shader->info.outinfo;
2755 
2756       enable_prim_payload = (outinfo->writes_viewport_index_per_primitive || outinfo->writes_layer_per_primitive ||
2757                              outinfo->writes_primitive_shading_rate_per_primitive);
2758    }
2759 
2760    const uint32_t vgt_draw_payload_cntl =
2761       S_028A98_EN_VRS_RATE(enable_vrs) | S_028A98_EN_PRIM_PAYLOAD(enable_prim_payload);
2762 
2763    if (pdev->info.gfx_level >= GFX12) {
2764       radeon_opt_set_context_reg(cmd_buffer, R_028AA0_VGT_DRAW_PAYLOAD_CNTL, RADV_TRACKED_VGT_DRAW_PAYLOAD_CNTL,
2765                                  vgt_draw_payload_cntl);
2766    } else {
2767       radeon_opt_set_context_reg(cmd_buffer, R_028A98_VGT_DRAW_PAYLOAD_CNTL, RADV_TRACKED_VGT_DRAW_PAYLOAD_CNTL,
2768                                  vgt_draw_payload_cntl);
2769    }
2770 }
2771 
2772 static void
2773 gfx103_emit_vrs_state(struct radv_cmd_buffer *cmd_buffer)
2774 {
2775    const struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
2776    const struct radv_physical_device *pdev = radv_device_physical(device);
2777    const struct radv_shader *ps = cmd_buffer->state.shaders[MESA_SHADER_FRAGMENT];
2778    const bool force_vrs_per_vertex = cmd_buffer->state.last_vgt_shader->info.force_vrs_per_vertex;
2779    const bool enable_vrs_coarse_shading = cmd_buffer->state.uses_vrs_coarse_shading;
2780    uint32_t mode = V_028064_SC_VRS_COMB_MODE_PASSTHRU;
2781    uint8_t rate_x = 0, rate_y = 0;
2782 
2783    if (enable_vrs_coarse_shading) {
2784       /* When per-draw VRS is not enabled at all, try enabling VRS coarse shading 2x2 if the driver
2785        * determined that it's safe to enable.
2786        */
2787       mode = V_028064_SC_VRS_COMB_MODE_OVERRIDE;
2788       rate_x = rate_y = 1;
2789    } else if (force_vrs_per_vertex) {
2790       /* Otherwise, if per-draw VRS is not enabled statically, try forcing per-vertex VRS if
2791        * requested by the user. Note that vkd3d-proton always has to declare VRS as dynamic because
2792        * in DX12 it's fully dynamic.
2793        */
2794       radeon_opt_set_context_reg(cmd_buffer, R_028848_PA_CL_VRS_CNTL, RADV_TRACKED_PA_CL_VRS_CNTL,
2795                                  S_028848_SAMPLE_ITER_COMBINER_MODE(V_028848_SC_VRS_COMB_MODE_OVERRIDE) |
2796                                     S_028848_VERTEX_RATE_COMBINER_MODE(V_028848_SC_VRS_COMB_MODE_OVERRIDE));
2797 
2798       /* If the shader is using discard, turn off coarse shading because discard at 2x2 pixel
2799        * granularity degrades quality too much. MIN allows sample shading but not coarse shading.
2800        */
2801       mode = ps->info.ps.can_discard ? V_028064_SC_VRS_COMB_MODE_MIN : V_028064_SC_VRS_COMB_MODE_PASSTHRU;
2802    }
2803 
2804    if (pdev->info.gfx_level < GFX11) {
2805       radeon_opt_set_context_reg(cmd_buffer, R_028064_DB_VRS_OVERRIDE_CNTL, RADV_TRACKED_DB_VRS_OVERRIDE_CNTL,
2806                                  S_028064_VRS_OVERRIDE_RATE_COMBINER_MODE(mode) | S_028064_VRS_OVERRIDE_RATE_X(rate_x) |
2807                                     S_028064_VRS_OVERRIDE_RATE_Y(rate_y));
2808    }
2809 }
2810 
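/* Emit HW registers for every active graphics stage, then the shared VGT configuration
 * (GS mode, vertex reuse, stage enables) and the GFX10.3+ draw payload/VRS state. Task
 * shaders are emitted on the gang (compute) command stream.
 */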
2811 static void
2812 radv_emit_graphics_shaders(struct radv_cmd_buffer *cmd_buffer)
2813 {
2814    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
2815    const struct radv_physical_device *pdev = radv_device_physical(device);
2816 
2817    radv_foreach_stage(s, cmd_buffer->state.active_stages & RADV_GRAPHICS_STAGE_BITS)
2818    {
2819       switch (s) {
2820       case MESA_SHADER_VERTEX:
2821          radv_emit_vertex_shader(cmd_buffer);
2822          break;
2823       case MESA_SHADER_TESS_CTRL:
2824          radv_emit_tess_ctrl_shader(cmd_buffer);
2825          break;
2826       case MESA_SHADER_TESS_EVAL:
2827          radv_emit_tess_eval_shader(cmd_buffer);
2828          break;
2829       case MESA_SHADER_GEOMETRY:
2830          radv_emit_geometry_shader(cmd_buffer);
2831          break;
2832       case MESA_SHADER_FRAGMENT:
2833          radv_emit_fragment_shader(cmd_buffer);
2834          radv_emit_ps_inputs(cmd_buffer);
2835          break;
2836       case MESA_SHADER_MESH:
2837          radv_emit_mesh_shader(cmd_buffer);
2838          break;
2839       case MESA_SHADER_TASK:
2840          radv_emit_compute_shader(pdev, cmd_buffer->gang.cs, cmd_buffer->state.shaders[MESA_SHADER_TASK]);
2841          break;
2842       default:
2843          unreachable("invalid bind stage");
2844       }
2845    }
2846 
2847    const struct radv_vgt_shader_key vgt_shader_cfg_key =
2848       radv_get_vgt_shader_key(device, cmd_buffer->state.shaders, cmd_buffer->state.gs_copy_shader);
2849 
2850    radv_emit_vgt_gs_mode(cmd_buffer);
2851    radv_emit_vgt_reuse(cmd_buffer, &vgt_shader_cfg_key);
2852    radv_emit_vgt_shader_config(cmd_buffer, &vgt_shader_cfg_key);
2853 
2854    if (pdev->info.gfx_level >= GFX10_3) {
2855       gfx103_emit_vgt_draw_payload_cntl(cmd_buffer);
2856       gfx103_emit_vrs_state(cmd_buffer);
2857    }
2858 
2859    cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_GRAPHICS_SHADERS;
2860 }
2861 
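/* Re-emit shader state when the bound graphics pipeline changes and flag the dirty
 * dynamic state that depends on pipeline properties (rasterization primitive, sample
 * shading, out-of-order rasterization, DB render control).
 */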
2862 static void
2863 radv_emit_graphics_pipeline(struct radv_cmd_buffer *cmd_buffer)
2864 {
2865    struct radv_graphics_pipeline *pipeline = cmd_buffer->state.graphics_pipeline;
2866    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
2867    const struct radv_physical_device *pdev = radv_device_physical(device);
2868 
2869    if (cmd_buffer->state.emitted_graphics_pipeline == pipeline)
2870       return;
2871 
2872    if (cmd_buffer->state.emitted_graphics_pipeline) {
2873       if (radv_rast_prim_is_points_or_lines(cmd_buffer->state.emitted_graphics_pipeline->rast_prim) !=
2874           radv_rast_prim_is_points_or_lines(pipeline->rast_prim))
2875          cmd_buffer->state.dirty |= RADV_CMD_DIRTY_GUARDBAND;
2876 
2877       if (cmd_buffer->state.emitted_graphics_pipeline->rast_prim != pipeline->rast_prim)
2878          cmd_buffer->state.dirty_dynamic |= RADV_DYNAMIC_PRIMITIVE_TOPOLOGY | RADV_DYNAMIC_RASTERIZATION_SAMPLES;
2879 
2880       if (cmd_buffer->state.emitted_graphics_pipeline->ms.min_sample_shading != pipeline->ms.min_sample_shading ||
2881           cmd_buffer->state.emitted_graphics_pipeline->uses_out_of_order_rast != pipeline->uses_out_of_order_rast ||
2882           cmd_buffer->state.emitted_graphics_pipeline->uses_vrs_attachment != pipeline->uses_vrs_attachment)
2883          cmd_buffer->state.dirty_dynamic |= RADV_DYNAMIC_RASTERIZATION_SAMPLES;
2884 
2885       if (cmd_buffer->state.emitted_graphics_pipeline->ms.sample_shading_enable != pipeline->ms.sample_shading_enable) {
2886          cmd_buffer->state.dirty_dynamic |= RADV_DYNAMIC_RASTERIZATION_SAMPLES;
2887          if (pdev->info.gfx_level >= GFX10_3)
2888             cmd_buffer->state.dirty_dynamic |= RADV_DYNAMIC_FRAGMENT_SHADING_RATE;
2889       }
2890 
2891       if (cmd_buffer->state.emitted_graphics_pipeline->db_render_control != pipeline->db_render_control)
2892          cmd_buffer->state.dirty |= RADV_CMD_DIRTY_FRAMEBUFFER;
2893    }
2894 
2895    radv_emit_graphics_shaders(cmd_buffer);
2896 
2897    if (device->pbb_allowed) {
2898       const struct radv_binning_settings *settings = &pdev->binning_settings;
2899 
2900       if ((!cmd_buffer->state.emitted_graphics_pipeline ||
2901            cmd_buffer->state.emitted_graphics_pipeline->base.shaders[MESA_SHADER_FRAGMENT] !=
2902               cmd_buffer->state.graphics_pipeline->base.shaders[MESA_SHADER_FRAGMENT]) &&
2903           (settings->context_states_per_bin > 1 || settings->persistent_states_per_bin > 1)) {
2904          /* Break the batch on PS changes. */
2905          radeon_emit(cmd_buffer->cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
2906          radeon_emit(cmd_buffer->cs, EVENT_TYPE(V_028A90_BREAK_BATCH) | EVENT_INDEX(0));
2907       }
2908    }
2909 
2910    if (pipeline->sqtt_shaders_reloc) {
2911       /* Emit shaders relocation because RGP requires them to be contiguous in memory. */
2912       radv_sqtt_emit_relocated_shaders(cmd_buffer, pipeline);
2913 
2914       struct radv_shader *task_shader = cmd_buffer->state.shaders[MESA_SHADER_TASK];
2915       if (task_shader) {
2916          const struct radv_sqtt_shaders_reloc *reloc = pipeline->sqtt_shaders_reloc;
2917          const uint64_t va = reloc->va[MESA_SHADER_TASK];
2918 
2919          radeon_set_sh_reg(cmd_buffer->gang.cs, task_shader->info.regs.pgm_lo, va >> 8);
2920       }
2921    }
2922 
2923    if (radv_device_fault_detection_enabled(device))
2924       radv_save_pipeline(cmd_buffer, &pipeline->base);
2925 
2926    cmd_buffer->state.emitted_graphics_pipeline = pipeline;
2927 
2928    cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_PIPELINE;
2929 }
2930 
2931 static bool
2932 radv_get_depth_clip_enable(struct radv_cmd_buffer *cmd_buffer)
2933 {
2934    const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
2935 
2936    return d->vk.rs.depth_clip_enable == VK_MESA_DEPTH_CLIP_ENABLE_TRUE ||
2937           (d->vk.rs.depth_clip_enable == VK_MESA_DEPTH_CLIP_ENABLE_NOT_CLAMP && !d->vk.rs.depth_clamp_enable);
2938 }
2939 
2940 enum radv_depth_clamp_mode {
2941    RADV_DEPTH_CLAMP_MODE_VIEWPORT = 0,    /* Clamp to the viewport min/max depth bounds */
2942    RADV_DEPTH_CLAMP_MODE_ZERO_TO_ONE = 1, /* Clamp between 0.0f and 1.0f */
2943    RADV_DEPTH_CLAMP_MODE_DISABLED = 2,    /* Disable depth clamping */
2944 };
2945 
2946 static enum radv_depth_clamp_mode
2947 radv_get_depth_clamp_mode(struct radv_cmd_buffer *cmd_buffer)
2948 {
2949    const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
2950    bool depth_clip_enable = radv_get_depth_clip_enable(cmd_buffer);
2951    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
2952    enum radv_depth_clamp_mode mode;
2953 
2954    mode = RADV_DEPTH_CLAMP_MODE_VIEWPORT;
2955    if (!d->vk.rs.depth_clamp_enable) {
2956       /* For optimal performance, depth clamping should always be enabled except if the application
2957        * disables clamping explicitly or uses depth values outside of the [0.0, 1.0] range.
2958        */
2959       if (!depth_clip_enable || device->vk.enabled_extensions.EXT_depth_range_unrestricted) {
2960          mode = RADV_DEPTH_CLAMP_MODE_DISABLED;
2961       } else {
2962          mode = RADV_DEPTH_CLAMP_MODE_ZERO_TO_ONE;
2963       }
2964    }
2965 
2966    return mode;
2967 }
2968 
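/* Return the viewport depth scale/translate. With negative-one-to-one depth clip enabled,
 * the [0, 1] transform z*S + T is remapped to z*(S/2) + (T + maxDepth)/2, which (assuming the
 * usual S = maxDepth - minDepth, T = minDepth transform computed elsewhere) still maps
 * z = -1 to minDepth and z = 1 to maxDepth.
 */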
2969 static void
2970 radv_get_viewport_zscale_ztranslate(struct radv_cmd_buffer *cmd_buffer, uint32_t vp_idx, float *zscale,
2971                                     float *ztranslate)
2972 {
2973    const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
2974 
2975    if (d->vk.vp.depth_clip_negative_one_to_one) {
2976       *zscale = d->hw_vp.xform[vp_idx].scale[2] * 0.5f;
2977       *ztranslate = (d->hw_vp.xform[vp_idx].translate[2] + d->vk.vp.viewports[vp_idx].maxDepth) * 0.5f;
2978    } else {
2979       *zscale = d->hw_vp.xform[vp_idx].scale[2];
2980       *ztranslate = d->hw_vp.xform[vp_idx].translate[2];
2981    }
2982 }
2983 
2984 static void
2985 radv_get_viewport_zmin_zmax(struct radv_cmd_buffer *cmd_buffer, const VkViewport *viewport, float *zmin, float *zmax)
2986 {
2987    const enum radv_depth_clamp_mode depth_clamp_mode = radv_get_depth_clamp_mode(cmd_buffer);
2988 
2989    if (depth_clamp_mode == RADV_DEPTH_CLAMP_MODE_ZERO_TO_ONE) {
2990       *zmin = 0.0f;
2991       *zmax = 1.0f;
2992    } else {
2993       *zmin = MIN2(viewport->minDepth, viewport->maxDepth);
2994       *zmax = MAX2(viewport->minDepth, viewport->maxDepth);
2995    }
2996 }
2997 
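/* Emit the viewport transforms. GFX12 packs the XY scale/offset, Z scale/offset and zmin/zmax
 * into 8 dwords per viewport, while older chips emit 6 dwords per viewport plus a separate
 * PA_SC_VPORT_ZMIN/ZMAX pair.
 */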
2998 static void
2999 radv_emit_viewport(struct radv_cmd_buffer *cmd_buffer)
3000 {
3001    const struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
3002    const struct radv_physical_device *pdev = radv_device_physical(device);
3003    const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
3004 
3005    assert(d->vk.vp.viewport_count);
3006 
3007    if (pdev->info.gfx_level >= GFX12) {
3008       radeon_set_context_reg_seq(cmd_buffer->cs, R_02843C_PA_CL_VPORT_XSCALE, d->vk.vp.viewport_count * 8);
3009 
3010       for (unsigned i = 0; i < d->vk.vp.viewport_count; i++) {
3011          float zscale, ztranslate, zmin, zmax;
3012 
3013          radv_get_viewport_zscale_ztranslate(cmd_buffer, i, &zscale, &ztranslate);
3014          radv_get_viewport_zmin_zmax(cmd_buffer, &d->vk.vp.viewports[i], &zmin, &zmax);
3015 
3016          radeon_emit(cmd_buffer->cs, fui(d->hw_vp.xform[i].scale[0]));
3017          radeon_emit(cmd_buffer->cs, fui(d->hw_vp.xform[i].translate[0]));
3018          radeon_emit(cmd_buffer->cs, fui(d->hw_vp.xform[i].scale[1]));
3019          radeon_emit(cmd_buffer->cs, fui(d->hw_vp.xform[i].translate[1]));
3020          radeon_emit(cmd_buffer->cs, fui(zscale));
3021          radeon_emit(cmd_buffer->cs, fui(ztranslate));
3022          radeon_emit(cmd_buffer->cs, fui(zmin));
3023          radeon_emit(cmd_buffer->cs, fui(zmax));
3024       }
3025    } else {
3026       radeon_set_context_reg_seq(cmd_buffer->cs, R_02843C_PA_CL_VPORT_XSCALE, d->vk.vp.viewport_count * 6);
3027 
3028       for (unsigned i = 0; i < d->vk.vp.viewport_count; i++) {
3029          float zscale, ztranslate;
3030 
3031          radv_get_viewport_zscale_ztranslate(cmd_buffer, i, &zscale, &ztranslate);
3032 
3033          radeon_emit(cmd_buffer->cs, fui(d->hw_vp.xform[i].scale[0]));
3034          radeon_emit(cmd_buffer->cs, fui(d->hw_vp.xform[i].translate[0]));
3035          radeon_emit(cmd_buffer->cs, fui(d->hw_vp.xform[i].scale[1]));
3036          radeon_emit(cmd_buffer->cs, fui(d->hw_vp.xform[i].translate[1]));
3037          radeon_emit(cmd_buffer->cs, fui(zscale));
3038          radeon_emit(cmd_buffer->cs, fui(ztranslate));
3039       }
3040 
3041       radeon_set_context_reg_seq(cmd_buffer->cs, R_0282D0_PA_SC_VPORT_ZMIN_0, d->vk.vp.viewport_count * 2);
3042       for (unsigned i = 0; i < d->vk.vp.viewport_count; i++) {
3043          float zmin, zmax;
3044 
3045          radv_get_viewport_zmin_zmax(cmd_buffer, &d->vk.vp.viewports[i], &zmin, &zmax);
3046 
3047          radeon_emit(cmd_buffer->cs, fui(zmin));
3048          radeon_emit(cmd_buffer->cs, fui(zmax));
3049       }
3050    }
3051 }
3052 
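/* Derive a scissor rectangle that covers the viewport: the transform spans
 * translate +/- |scale| on each axis, and the upper bound is rounded up so edge pixels are kept.
 */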
3053 static VkRect2D
3054 radv_scissor_from_viewport(const VkViewport *viewport)
3055 {
3056    float scale[3], translate[3];
3057    VkRect2D rect;
3058 
3059    radv_get_viewport_xform(viewport, scale, translate);
3060 
3061    rect.offset.x = translate[0] - fabsf(scale[0]);
3062    rect.offset.y = translate[1] - fabsf(scale[1]);
3063    rect.extent.width = ceilf(translate[0] + fabsf(scale[0])) - rect.offset.x;
3064    rect.extent.height = ceilf(translate[1] + fabsf(scale[1])) - rect.offset.y;
3065 
3066    return rect;
3067 }
3068 
3069 static VkRect2D
3070 radv_intersect_scissor(const VkRect2D *a, const VkRect2D *b)
3071 {
3072    VkRect2D ret;
3073    ret.offset.x = MAX2(a->offset.x, b->offset.x);
3074    ret.offset.y = MAX2(a->offset.y, b->offset.y);
3075    ret.extent.width = MIN2(a->offset.x + a->extent.width, b->offset.x + b->extent.width) - ret.offset.x;
3076    ret.extent.height = MIN2(a->offset.y + a->extent.height, b->offset.y + b->extent.height) - ret.offset.y;
3077    return ret;
3078 }
3079 
3080 static void
3081 radv_emit_scissor(struct radv_cmd_buffer *cmd_buffer)
3082 {
3083    const struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
3084    const struct radv_physical_device *pdev = radv_device_physical(device);
3085    const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
3086    struct radeon_cmdbuf *cs = cmd_buffer->cs;
3087 
3088    if (!d->vk.vp.scissor_count)
3089       return;
3090 
3091    radeon_set_context_reg_seq(cs, R_028250_PA_SC_VPORT_SCISSOR_0_TL, d->vk.vp.scissor_count * 2);
3092    for (unsigned i = 0; i < d->vk.vp.scissor_count; i++) {
3093       VkRect2D viewport_scissor = radv_scissor_from_viewport(d->vk.vp.viewports + i);
3094       VkRect2D scissor = radv_intersect_scissor(&d->vk.vp.scissors[i], &viewport_scissor);
3095 
3096       uint32_t minx = scissor.offset.x;
3097       uint32_t miny = scissor.offset.y;
3098       uint32_t maxx = minx + scissor.extent.width;
3099       uint32_t maxy = miny + scissor.extent.height;
3100 
3101       if (pdev->info.gfx_level >= GFX12) {
3102          /* On GFX12 the bottom-right bounds are inclusive, so an empty scissor is expressed by making TL (1,1) exceed BR (0,0). */
3103          if (maxx == 0 || maxy == 0) {
3104             minx = miny = maxx = maxy = 1;
3105          }
3106 
3107          radeon_emit(cs, S_028250_TL_X(minx) | S_028250_TL_Y_GFX12(miny));
3108          radeon_emit(cs, S_028254_BR_X(maxx - 1) | S_028254_BR_Y(maxy - 1));
3109       } else {
3110          radeon_emit(cs, S_028250_TL_X(minx) | S_028250_TL_Y_GFX6(miny) | S_028250_WINDOW_OFFSET_DISABLE(1));
3111          radeon_emit(cs, S_028254_BR_X(maxx) | S_028254_BR_Y(maxy));
3112       }
3113    }
3114 }
3115 
3116 static void
3117 radv_emit_discard_rectangle(struct radv_cmd_buffer *cmd_buffer)
3118 {
3119    const struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
3120    const struct radv_physical_device *pdev = radv_device_physical(device);
3121    const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
3122    uint32_t cliprect_rule = 0;
3123 
3124    if (!d->vk.dr.enable) {
3125       cliprect_rule = 0xffff;
3126    } else {
3127       for (unsigned i = 0; i < (1u << MAX_DISCARD_RECTANGLES); ++i) {
3128          /* Interpret i as a bitmask, and then set the bit in
3129           * the mask if that combination of rectangles in which
3130           * the pixel is contained should pass the cliprect
3131           * test.
3132           */
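         /* Worked example (assuming MAX_DISCARD_RECTANGLES is 4, i.e. 16 combinations to match
          * the 0xffff default above): with a single INCLUSIVE rectangle, only combinations with
          * bit 0 set pass, so cliprect_rule ends up as 0xaaaa.
          */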
3133          unsigned relevant_subset = i & ((1u << d->vk.dr.rectangle_count) - 1);
3134 
3135          if (d->vk.dr.mode == VK_DISCARD_RECTANGLE_MODE_INCLUSIVE_EXT && !relevant_subset)
3136             continue;
3137 
3138          if (d->vk.dr.mode == VK_DISCARD_RECTANGLE_MODE_EXCLUSIVE_EXT && relevant_subset)
3139             continue;
3140 
3141          cliprect_rule |= 1u << i;
3142       }
3143 
3144       radeon_set_context_reg_seq(cmd_buffer->cs, R_028210_PA_SC_CLIPRECT_0_TL, d->vk.dr.rectangle_count * 2);
3145       for (unsigned i = 0; i < d->vk.dr.rectangle_count; ++i) {
3146          VkRect2D rect = d->vk.dr.rectangles[i];
3147          radeon_emit(cmd_buffer->cs, S_028210_TL_X(rect.offset.x) | S_028210_TL_Y(rect.offset.y));
3148          radeon_emit(cmd_buffer->cs, S_028214_BR_X(rect.offset.x + rect.extent.width) |
3149                                         S_028214_BR_Y(rect.offset.y + rect.extent.height));
3150       }
3151 
3152       if (pdev->info.gfx_level >= GFX12) {
3153          radeon_set_context_reg_seq(cmd_buffer->cs, R_028374_PA_SC_CLIPRECT_0_EXT, d->vk.dr.rectangle_count);
3154          for (unsigned i = 0; i < d->vk.dr.rectangle_count; ++i) {
3155             VkRect2D rect = d->vk.dr.rectangles[i];
3156             radeon_emit(cmd_buffer->cs, S_028374_TL_X_EXT(rect.offset.x >> 15) |
3157                                            S_028374_TL_Y_EXT(rect.offset.y >> 15) |
3158                                            S_028374_BR_X_EXT((rect.offset.x + rect.extent.width) >> 15) |
3159                                            S_028374_BR_Y_EXT((rect.offset.y + rect.extent.height) >> 15));
3160          }
3161       }
3162    }
3163 
3164    radeon_set_context_reg(cmd_buffer->cs, R_02820C_PA_SC_CLIPRECT_RULE, cliprect_rule);
3165 }
3166 
3167 static void
3168 radv_emit_line_width(struct radv_cmd_buffer *cmd_buffer)
3169 {
3170    const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
3171 
3172    radeon_set_context_reg(cmd_buffer->cs, R_028A08_PA_SU_LINE_CNTL,
3173                           S_028A08_WIDTH(CLAMP(d->vk.rs.line.width * 8, 0, 0xFFFF)));
3174 }
3175 
3176 static void
3177 radv_emit_blend_constants(struct radv_cmd_buffer *cmd_buffer)
3178 {
3179    const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
3180 
3181    radeon_set_context_reg_seq(cmd_buffer->cs, R_028414_CB_BLEND_RED, 4);
3182    radeon_emit_array(cmd_buffer->cs, (uint32_t *)d->vk.cb.blend_constants, 4);
3183 }
3184 
3185 static void
3186 radv_emit_stencil(struct radv_cmd_buffer *cmd_buffer)
3187 {
3188    const struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
3189    const struct radv_physical_device *pdev = radv_device_physical(device);
3190    const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
3191 
3192    if (pdev->info.gfx_level >= GFX12) {
3193       radeon_set_context_reg(
3194          cmd_buffer->cs, R_028088_DB_STENCIL_REF,
3195          S_028088_TESTVAL(d->vk.ds.stencil.front.reference) | S_028088_TESTVAL_BF(d->vk.ds.stencil.back.reference));
3196 
3197       radeon_set_context_reg(cmd_buffer->cs, R_028090_DB_STENCIL_READ_MASK,
3198                              S_028090_TESTMASK(d->vk.ds.stencil.front.compare_mask) |
3199                                 S_028090_TESTMASK_BF(d->vk.ds.stencil.back.compare_mask));
3200 
3201       radeon_set_context_reg(cmd_buffer->cs, R_028094_DB_STENCIL_WRITE_MASK,
3202                              S_028094_WRITEMASK(d->vk.ds.stencil.front.write_mask) |
3203                                 S_028094_WRITEMASK_BF(d->vk.ds.stencil.back.write_mask));
3204    } else {
3205       radeon_set_context_reg_seq(cmd_buffer->cs, R_028430_DB_STENCILREFMASK, 2);
3206       radeon_emit(cmd_buffer->cs, S_028430_STENCILTESTVAL(d->vk.ds.stencil.front.reference) |
3207                                      S_028430_STENCILMASK(d->vk.ds.stencil.front.compare_mask) |
3208                                      S_028430_STENCILWRITEMASK(d->vk.ds.stencil.front.write_mask) |
3209                                      S_028430_STENCILOPVAL(1));
3210       radeon_emit(cmd_buffer->cs, S_028434_STENCILTESTVAL_BF(d->vk.ds.stencil.back.reference) |
3211                                      S_028434_STENCILMASK_BF(d->vk.ds.stencil.back.compare_mask) |
3212                                      S_028434_STENCILWRITEMASK_BF(d->vk.ds.stencil.back.write_mask) |
3213                                      S_028434_STENCILOPVAL_BF(1));
3214    }
3215 }
3216 
3217 static void
3218 radv_emit_depth_bounds(struct radv_cmd_buffer *cmd_buffer)
3219 {
3220    const struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
3221    const struct radv_physical_device *pdev = radv_device_physical(device);
3222    const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
3223 
3224    if (pdev->info.gfx_level >= GFX12) {
3225       radeon_set_context_reg_seq(cmd_buffer->cs, R_028050_DB_DEPTH_BOUNDS_MIN, 2);
3226    } else {
3227       radeon_set_context_reg_seq(cmd_buffer->cs, R_028020_DB_DEPTH_BOUNDS_MIN, 2);
3228    }
3229 
3230    radeon_emit(cmd_buffer->cs, fui(d->vk.ds.depth.bounds_test.min));
3231    radeon_emit(cmd_buffer->cs, fui(d->vk.ds.depth.bounds_test.max));
3232 }
3233 
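/* Emit the depth bias state. The slope factor is presumably expected by the hardware in
 * 1/16th units (hence the * 16.0f), and PA_SU_POLY_OFFSET_DB_FMT_CNTL selects how the constant
 * bias is scaled for the bound depth format (16 or 24 fixed-point bits, or the float formula).
 */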
3234 static void
3235 radv_emit_depth_bias(struct radv_cmd_buffer *cmd_buffer)
3236 {
3237    const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
3238    struct radv_rendering_state *render = &cmd_buffer->state.render;
3239    unsigned slope = fui(d->vk.rs.depth_bias.slope * 16.0f);
3240    unsigned pa_su_poly_offset_db_fmt_cntl = 0;
3241 
3242    if (vk_format_has_depth(render->ds_att.format) &&
3243        d->vk.rs.depth_bias.representation != VK_DEPTH_BIAS_REPRESENTATION_FLOAT_EXT) {
3244       VkFormat format = vk_format_depth_only(render->ds_att.format);
3245 
3246       if (format == VK_FORMAT_D16_UNORM) {
3247          pa_su_poly_offset_db_fmt_cntl = S_028B78_POLY_OFFSET_NEG_NUM_DB_BITS(-16);
3248       } else {
3249          assert(format == VK_FORMAT_D32_SFLOAT);
3250          if (d->vk.rs.depth_bias.representation ==
3251              VK_DEPTH_BIAS_REPRESENTATION_LEAST_REPRESENTABLE_VALUE_FORCE_UNORM_EXT) {
3252             pa_su_poly_offset_db_fmt_cntl = S_028B78_POLY_OFFSET_NEG_NUM_DB_BITS(-24);
3253          } else {
3254             pa_su_poly_offset_db_fmt_cntl =
3255                S_028B78_POLY_OFFSET_NEG_NUM_DB_BITS(-23) | S_028B78_POLY_OFFSET_DB_IS_FLOAT_FMT(1);
3256          }
3257       }
3258    }
3259 
3260    radeon_set_context_reg_seq(cmd_buffer->cs, R_028B7C_PA_SU_POLY_OFFSET_CLAMP, 5);
3261    radeon_emit(cmd_buffer->cs, fui(d->vk.rs.depth_bias.clamp));    /* CLAMP */
3262    radeon_emit(cmd_buffer->cs, slope);                             /* FRONT SCALE */
3263    radeon_emit(cmd_buffer->cs, fui(d->vk.rs.depth_bias.constant)); /* FRONT OFFSET */
3264    radeon_emit(cmd_buffer->cs, slope);                             /* BACK SCALE */
3265    radeon_emit(cmd_buffer->cs, fui(d->vk.rs.depth_bias.constant)); /* BACK OFFSET */
3266 
3267    radeon_set_context_reg(cmd_buffer->cs, R_028B78_PA_SU_POLY_OFFSET_DB_FMT_CNTL, pa_su_poly_offset_db_fmt_cntl);
3268 }
3269 
3270 static void
3271 radv_emit_line_stipple(struct radv_cmd_buffer *cmd_buffer)
3272 {
3273    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
3274    const struct radv_physical_device *pdev = radv_device_physical(device);
3275    const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
3276    enum amd_gfx_level gfx_level = pdev->info.gfx_level;
3277    /* GFX9 chips fail linestrip CTS tests unless this is set to 0 = no reset */
3278    uint32_t auto_reset_cntl = (gfx_level == GFX9) ? 0 : 2;
3279 
3280    if (radv_primitive_topology_is_line_list(d->vk.ia.primitive_topology))
3281       auto_reset_cntl = 1;
3282 
3283    radeon_set_context_reg(cmd_buffer->cs, R_028A0C_PA_SC_LINE_STIPPLE,
3284                           S_028A0C_LINE_PATTERN(d->vk.rs.line.stipple.pattern) |
3285                              S_028A0C_REPEAT_COUNT(d->vk.rs.line.stipple.factor - 1) |
3286                              S_028A0C_AUTO_RESET_CNTL(pdev->info.gfx_level < GFX12 ? auto_reset_cntl : 0));
3287 
3288    if (pdev->info.gfx_level >= GFX12) {
3289       radeon_set_context_reg(cmd_buffer->cs, R_028A44_PA_SC_LINE_STIPPLE_RESET,
3290                              S_028A44_AUTO_RESET_CNTL(auto_reset_cntl));
3291    }
3292 }
3293 
3294 static void
3295 radv_emit_culling(struct radv_cmd_buffer *cmd_buffer)
3296 {
3297    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
3298    const struct radv_physical_device *pdev = radv_device_physical(device);
3299    enum amd_gfx_level gfx_level = pdev->info.gfx_level;
3300    const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
3301    unsigned pa_su_sc_mode_cntl;
3302 
3303    pa_su_sc_mode_cntl =
3304       S_028814_CULL_FRONT(!!(d->vk.rs.cull_mode & VK_CULL_MODE_FRONT_BIT)) |
3305       S_028814_CULL_BACK(!!(d->vk.rs.cull_mode & VK_CULL_MODE_BACK_BIT)) | S_028814_FACE(d->vk.rs.front_face) |
3306       S_028814_POLY_OFFSET_FRONT_ENABLE(d->vk.rs.depth_bias.enable) |
3307       S_028814_POLY_OFFSET_BACK_ENABLE(d->vk.rs.depth_bias.enable) |
3308       S_028814_POLY_OFFSET_PARA_ENABLE(d->vk.rs.depth_bias.enable) |
3309       S_028814_POLY_MODE(d->vk.rs.polygon_mode != V_028814_X_DRAW_TRIANGLES) |
3310       S_028814_POLYMODE_FRONT_PTYPE(d->vk.rs.polygon_mode) | S_028814_POLYMODE_BACK_PTYPE(d->vk.rs.polygon_mode) |
3311       S_028814_PROVOKING_VTX_LAST(d->vk.rs.provoking_vertex == VK_PROVOKING_VERTEX_MODE_LAST_VERTEX_EXT);
3312 
3313    if (gfx_level >= GFX10 && gfx_level < GFX12) {
3314       /* Ensure that SC processes the primitive group in the same order as PA produced them.  Needed
3315        * when either POLY_MODE or PERPENDICULAR_ENDCAP_ENA is set.
3316        */
3317       pa_su_sc_mode_cntl |=
3318          S_028814_KEEP_TOGETHER_ENABLE(d->vk.rs.polygon_mode != V_028814_X_DRAW_TRIANGLES ||
3319                                        radv_get_line_mode(cmd_buffer) == VK_LINE_RASTERIZATION_MODE_RECTANGULAR_KHR);
3320    }
3321 
3322    if (pdev->info.gfx_level >= GFX12) {
3323       radeon_set_context_reg(cmd_buffer->cs, R_02881C_PA_SU_SC_MODE_CNTL, pa_su_sc_mode_cntl);
3324    } else {
3325       radeon_set_context_reg(cmd_buffer->cs, R_028814_PA_SU_SC_MODE_CNTL, pa_su_sc_mode_cntl);
3326    }
3327 }
3328 
3329 static void
3330 radv_emit_provoking_vertex_mode(struct radv_cmd_buffer *cmd_buffer)
3331 {
3332    const struct radv_shader *last_vgt_shader = cmd_buffer->state.last_vgt_shader;
3333    const unsigned stage = last_vgt_shader->info.stage;
3334    const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
3335    const uint32_t ngg_provoking_vtx_offset = radv_get_user_sgpr_loc(last_vgt_shader, AC_UD_NGG_PROVOKING_VTX);
3336    unsigned provoking_vtx = 0;
3337 
3338    if (!ngg_provoking_vtx_offset)
3339       return;
3340 
3341    if (d->vk.rs.provoking_vertex == VK_PROVOKING_VERTEX_MODE_LAST_VERTEX_EXT) {
3342       if (stage == MESA_SHADER_VERTEX) {
3343          provoking_vtx = radv_conv_prim_to_gs_out(d->vk.ia.primitive_topology, last_vgt_shader->info.is_ngg);
3344       } else {
3345          assert(stage == MESA_SHADER_GEOMETRY);
3346          provoking_vtx = last_vgt_shader->info.gs.vertices_in - 1;
3347       }
3348    }
3349 
3350    radeon_set_sh_reg(cmd_buffer->cs, ngg_provoking_vtx_offset, provoking_vtx);
3351 }
3352 
3353 static void
3354 radv_emit_primitive_topology(struct radv_cmd_buffer *cmd_buffer)
3355 {
3356    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
3357    const struct radv_physical_device *pdev = radv_device_physical(device);
3358    const struct radv_shader *last_vgt_shader = cmd_buffer->state.last_vgt_shader;
3359    const uint32_t verts_per_prim_offset = radv_get_user_sgpr_loc(last_vgt_shader, AC_UD_NUM_VERTS_PER_PRIM);
3360    const uint32_t vgt_gs_out_prim_type = radv_get_rasterization_prim(cmd_buffer);
3361    const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
3362 
3363    assert(!cmd_buffer->state.mesh_shading);
3364 
3365    if (pdev->info.gfx_level >= GFX7) {
3366       uint32_t vgt_prim = d->vk.ia.primitive_topology;
3367 
3368       if (pdev->info.gfx_level >= GFX12)
3369          vgt_prim |= S_030908_NUM_INPUT_CP(d->vk.ts.patch_control_points);
3370 
3371       radeon_set_uconfig_reg_idx(&pdev->info, cmd_buffer->cs, R_030908_VGT_PRIMITIVE_TYPE, 1, vgt_prim);
3372    } else {
3373       radeon_set_config_reg(cmd_buffer->cs, R_008958_VGT_PRIMITIVE_TYPE, d->vk.ia.primitive_topology);
3374    }
3375 
3376    radv_emit_vgt_gs_out(cmd_buffer, vgt_gs_out_prim_type);
3377 
3378    if (!verts_per_prim_offset)
3379       return;
3380 
3381    radeon_set_sh_reg(cmd_buffer->cs, verts_per_prim_offset,
3382                      radv_conv_prim_to_gs_out(d->vk.ia.primitive_topology, last_vgt_shader->info.is_ngg) + 1);
3383 }
3384 
3385 static void
3386 radv_emit_depth_control(struct radv_cmd_buffer *cmd_buffer)
3387 {
3388    const struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
3389    const struct radv_physical_device *pdev = radv_device_physical(device);
3390    const struct radv_rendering_state *render = &cmd_buffer->state.render;
3391    const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
3392    const bool stencil_test_enable =
3393       d->vk.ds.stencil.test_enable && (render->ds_att_aspects & VK_IMAGE_ASPECT_STENCIL_BIT);
3394    const uint32_t db_depth_control =
3395       S_028800_Z_ENABLE(d->vk.ds.depth.test_enable ? 1 : 0) |
3396       S_028800_Z_WRITE_ENABLE(d->vk.ds.depth.write_enable ? 1 : 0) | S_028800_ZFUNC(d->vk.ds.depth.compare_op) |
3397       S_028800_DEPTH_BOUNDS_ENABLE(d->vk.ds.depth.bounds_test.enable ? 1 : 0) |
3398       S_028800_STENCIL_ENABLE(stencil_test_enable) | S_028800_BACKFACE_ENABLE(stencil_test_enable) |
3399       S_028800_STENCILFUNC(d->vk.ds.stencil.front.op.compare) |
3400       S_028800_STENCILFUNC_BF(d->vk.ds.stencil.back.op.compare);
3401 
3402    if (pdev->info.gfx_level >= GFX12) {
3403       radeon_set_context_reg(cmd_buffer->cs, R_028070_DB_DEPTH_CONTROL, db_depth_control);
3404    } else {
3405       radeon_set_context_reg(cmd_buffer->cs, R_028800_DB_DEPTH_CONTROL, db_depth_control);
3406    }
3407 }
3408 
3409 static void
3410 radv_emit_stencil_control(struct radv_cmd_buffer *cmd_buffer)
3411 {
3412    const struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
3413    const struct radv_physical_device *pdev = radv_device_physical(device);
3414    const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
3415    const uint32_t db_stencil_control =
3416       S_02842C_STENCILFAIL(radv_translate_stencil_op(d->vk.ds.stencil.front.op.fail)) |
3417       S_02842C_STENCILZPASS(radv_translate_stencil_op(d->vk.ds.stencil.front.op.pass)) |
3418       S_02842C_STENCILZFAIL(radv_translate_stencil_op(d->vk.ds.stencil.front.op.depth_fail)) |
3419       S_02842C_STENCILFAIL_BF(radv_translate_stencil_op(d->vk.ds.stencil.back.op.fail)) |
3420       S_02842C_STENCILZPASS_BF(radv_translate_stencil_op(d->vk.ds.stencil.back.op.pass)) |
3421       S_02842C_STENCILZFAIL_BF(radv_translate_stencil_op(d->vk.ds.stencil.back.op.depth_fail));
3422 
3423    if (pdev->info.gfx_level >= GFX12) {
3424       radeon_set_context_reg(cmd_buffer->cs, R_028074_DB_STENCIL_CONTROL, db_stencil_control);
3425    } else {
3426       radeon_set_context_reg(cmd_buffer->cs, R_02842C_DB_STENCIL_CONTROL, db_stencil_control);
3427    }
3428 }
3429 
3430 static bool
3431 radv_should_force_vrs1x1(struct radv_cmd_buffer *cmd_buffer)
3432 {
3433    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
3434    const struct radv_physical_device *pdev = radv_device_physical(device);
3435    const struct radv_shader *ps = cmd_buffer->state.shaders[MESA_SHADER_FRAGMENT];
3436 
3437    return pdev->info.gfx_level >= GFX10_3 &&
3438           (cmd_buffer->state.ms.sample_shading_enable || (ps && ps->info.ps.force_sample_iter_shading_rate));
3439 }
3440 
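/* Emit the per-draw VRS rate (GE_VRS_RATE) and PA_CL_VRS_CNTL: pipeline_comb_mode feeds the
 * vertex (or primitive, for mesh shading) rate combiner, htile_comb_mode feeds the HTILE rate
 * combiner, and the sample-iteration combiner is forced to OVERRIDE when 1x1 shading is
 * required.
 */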
3441 static void
3442 radv_emit_fragment_shading_rate(struct radv_cmd_buffer *cmd_buffer)
3443 {
3444    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
3445    const struct radv_physical_device *pdev = radv_device_physical(device);
3446    const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
3447 
3448    /* When per-vertex VRS is forced and the dynamic fragment shading rate is a no-op, ignore
3449     * it. This is needed for vkd3d-proton because it always declares per-draw VRS as dynamic.
3450     */
3451    if (device->force_vrs != RADV_FORCE_VRS_1x1 && d->vk.fsr.fragment_size.width == 1 &&
3452        d->vk.fsr.fragment_size.height == 1 &&
3453        d->vk.fsr.combiner_ops[0] == VK_FRAGMENT_SHADING_RATE_COMBINER_OP_KEEP_KHR &&
3454        d->vk.fsr.combiner_ops[1] == VK_FRAGMENT_SHADING_RATE_COMBINER_OP_KEEP_KHR)
3455       return;
3456 
3457    uint32_t rate_x = MIN2(2, d->vk.fsr.fragment_size.width) - 1;
3458    uint32_t rate_y = MIN2(2, d->vk.fsr.fragment_size.height) - 1;
3459    uint32_t pipeline_comb_mode = d->vk.fsr.combiner_ops[0];
3460    uint32_t htile_comb_mode = d->vk.fsr.combiner_ops[1];
3461    uint32_t pa_cl_vrs_cntl = 0;
3462 
3463    assert(pdev->info.gfx_level >= GFX10_3);
3464 
3465    if (!cmd_buffer->state.render.vrs_att.iview) {
3466       /* When the current subpass has no VRS attachment, the VRS rates are expected to be 1x1, so we
3467        * can cheat by tweaking the different combiner modes.
3468        */
3469       switch (htile_comb_mode) {
3470       case VK_FRAGMENT_SHADING_RATE_COMBINER_OP_MIN_KHR:
3471          /* The result of min(A, 1x1) is always 1x1. */
3472          FALLTHROUGH;
3473       case VK_FRAGMENT_SHADING_RATE_COMBINER_OP_REPLACE_KHR:
3474          /* Force the per-draw VRS rate to 1x1. */
3475          rate_x = rate_y = 0;
3476 
3477          /* As the result of min(A, 1x1) or replace(A, 1x1) are always 1x1, set the vertex rate
3478           * combiner mode as passthrough.
3479           */
3480          pipeline_comb_mode = V_028848_SC_VRS_COMB_MODE_PASSTHRU;
3481          break;
3482       case VK_FRAGMENT_SHADING_RATE_COMBINER_OP_MAX_KHR:
3483          /* The result of max(A, 1x1) is always A. */
3484          FALLTHROUGH;
3485       case VK_FRAGMENT_SHADING_RATE_COMBINER_OP_KEEP_KHR:
3486          /* Nothing to do here because the SAMPLE_ITER combiner mode should already be passthrough. */
3487          break;
3488       default:
3489          break;
3490       }
3491    }
3492 
3493    /* Emit per-draw VRS rate which is the first combiner. */
3494    radeon_set_uconfig_reg(cmd_buffer->cs, R_03098C_GE_VRS_RATE, S_03098C_RATE_X(rate_x) | S_03098C_RATE_Y(rate_y));
3495 
3496    /* Disable VRS and use the rates from PS_ITER_SAMPLES if:
3497     *
3498     * 1) sample shading is enabled or per-sample interpolation is used by the fragment shader
3499     * 2) the fragment shader requires 1x1 shading rate for some other reason
3500     */
3501    if (radv_should_force_vrs1x1(cmd_buffer)) {
3502       pa_cl_vrs_cntl |= S_028848_SAMPLE_ITER_COMBINER_MODE(V_028848_SC_VRS_COMB_MODE_OVERRIDE);
3503    }
3504 
3505    /* VERTEX_RATE_COMBINER_MODE controls the combiner mode between the
3506     * draw rate and the vertex rate.
3507     */
3508    if (cmd_buffer->state.mesh_shading) {
3509       pa_cl_vrs_cntl |= S_028848_VERTEX_RATE_COMBINER_MODE(V_028848_SC_VRS_COMB_MODE_PASSTHRU) |
3510                         S_028848_PRIMITIVE_RATE_COMBINER_MODE(pipeline_comb_mode);
3511    } else {
3512       pa_cl_vrs_cntl |= S_028848_VERTEX_RATE_COMBINER_MODE(pipeline_comb_mode) |
3513                         S_028848_PRIMITIVE_RATE_COMBINER_MODE(V_028848_SC_VRS_COMB_MODE_PASSTHRU);
3514    }
3515 
3516    /* HTILE_RATE_COMBINER_MODE controls the combiner mode between the primitive rate and the HTILE
3517     * rate.
3518     */
3519    pa_cl_vrs_cntl |= S_028848_HTILE_RATE_COMBINER_MODE(htile_comb_mode);
3520 
3521    radeon_set_context_reg(cmd_buffer->cs, R_028848_PA_CL_VRS_CNTL, pa_cl_vrs_cntl);
3522 }
3523 
3524 static uint32_t
3525 radv_get_primitive_reset_index(const struct radv_cmd_buffer *cmd_buffer)
3526 {
3527    const uint32_t index_type = G_028A7C_INDEX_TYPE(cmd_buffer->state.index_type);
3528    switch (index_type) {
3529    case V_028A7C_VGT_INDEX_8:
3530       return 0xffu;
3531    case V_028A7C_VGT_INDEX_16:
3532       return 0xffffu;
3533    case V_028A7C_VGT_INDEX_32:
3534       return 0xffffffffu;
3535    default:
3536       unreachable("invalid index type");
3537    }
3538 }
3539 
3540 static void
3541 radv_emit_primitive_restart_enable(struct radv_cmd_buffer *cmd_buffer)
3542 {
3543    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
3544    const struct radv_physical_device *pdev = radv_device_physical(device);
3545    const enum amd_gfx_level gfx_level = pdev->info.gfx_level;
3546    const struct radv_dynamic_state *const d = &cmd_buffer->state.dynamic;
3547    struct radeon_cmdbuf *cs = cmd_buffer->cs;
3548    const bool en = d->vk.ia.primitive_restart_enable;
3549 
3550    if (gfx_level >= GFX11) {
3551       radeon_set_uconfig_reg(cs, R_03092C_GE_MULTI_PRIM_IB_RESET_EN,
3552                              S_03092C_RESET_EN(en) |
3553                                 /* This disables primitive restart for non-indexed draws.
3554                                  * By keeping this set, we don't have to unset RESET_EN
3555                                  * for non-indexed draws. */
3556                                 S_03092C_DISABLE_FOR_AUTO_INDEX(1));
3557    } else if (gfx_level >= GFX9) {
3558       radeon_set_uconfig_reg(cs, R_03092C_VGT_MULTI_PRIM_IB_RESET_EN, en);
3559    } else {
3560       radeon_set_context_reg(cs, R_028A94_VGT_MULTI_PRIM_IB_RESET_EN, en);
3561 
3562       /* GFX6-7: All 32 bits are compared.
3563        * GFX8: Only index type bits are compared.
3564        * GFX9+: Default is same as GFX8, MATCH_ALL_BITS=1 selects GFX6-7 behavior
3565        */
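      /* Worked example (sketch): with VK_INDEX_TYPE_UINT16 the restart value is 0xffff, so
       * radv_get_primitive_reset_index() returns 0xffff and, because GFX6-7 compare all 32 bits,
       * VGT_MULTI_PRIM_IB_RESET_INDX must be programmed as 0x0000ffff for the restart comparison
       * to hit.
       */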
3566       if (en && gfx_level <= GFX7) {
3567          const uint32_t primitive_reset_index = radv_get_primitive_reset_index(cmd_buffer);
3568 
3569          radeon_opt_set_context_reg(cmd_buffer, R_02840C_VGT_MULTI_PRIM_IB_RESET_INDX,
3570                                     RADV_TRACKED_VGT_MULTI_PRIM_IB_RESET_INDX, primitive_reset_index);
3571       }
3572    }
3573 }
3574 
3575 static void
3576 radv_emit_clipping(struct radv_cmd_buffer *cmd_buffer)
3577 {
3578    const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
3579    bool depth_clip_enable = radv_get_depth_clip_enable(cmd_buffer);
3580 
3581    radeon_set_context_reg(
3582       cmd_buffer->cs, R_028810_PA_CL_CLIP_CNTL,
3583       S_028810_DX_RASTERIZATION_KILL(d->vk.rs.rasterizer_discard_enable) |
3584          S_028810_ZCLIP_NEAR_DISABLE(!depth_clip_enable) | S_028810_ZCLIP_FAR_DISABLE(!depth_clip_enable) |
3585          S_028810_DX_CLIP_SPACE_DEF(!d->vk.vp.depth_clip_negative_one_to_one) | S_028810_DX_LINEAR_ATTR_CLIP_ENA(1));
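   /* Example reading of DX_CLIP_SPACE_DEF (illustrative): with the default Vulkan [0, 1] clip
    * space the bit is set, while enabling VK_EXT_depth_clip_control with
    * negativeOneToOne = VK_TRUE clears it to select the OpenGL-style [-1, 1] depth clip space.
    */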
3586 }
3587 
3588 static bool
3589 radv_is_mrt0_dual_src(struct radv_cmd_buffer *cmd_buffer)
3590 {
3591    const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
3592 
3593    if (!d->vk.cb.attachments[0].write_mask || !d->vk.cb.attachments[0].blend_enable)
3594       return false;
3595 
3596    return radv_can_enable_dual_src(&d->vk.cb.attachments[0]);
3597 }
3598 
3599 static void
3600 radv_emit_logic_op(struct radv_cmd_buffer *cmd_buffer)
3601 {
3602    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
3603    const struct radv_physical_device *pdev = radv_device_physical(device);
3604    const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
3605    unsigned cb_color_control = 0;
3606 
3607    if (d->vk.cb.logic_op_enable) {
3608       cb_color_control |= S_028808_ROP3(d->vk.cb.logic_op);
3609    } else {
3610       cb_color_control |= S_028808_ROP3(V_028808_ROP3_COPY);
3611    }
3612 
3613    if (pdev->info.has_rbplus) {
3614       /* RB+ doesn't work with dual source blending, logic op and CB_RESOLVE. */
3615       bool mrt0_is_dual_src = radv_is_mrt0_dual_src(cmd_buffer);
3616 
3617       cb_color_control |= S_028808_DISABLE_DUAL_QUAD(mrt0_is_dual_src || d->vk.cb.logic_op_enable ||
3618                                                      cmd_buffer->state.custom_blend_mode == V_028808_CB_RESOLVE);
3619    }
3620 
3621    if (cmd_buffer->state.custom_blend_mode) {
3622       cb_color_control |= S_028808_MODE(cmd_buffer->state.custom_blend_mode);
3623    } else {
3624       bool color_write_enabled = false;
3625 
3626       for (unsigned i = 0; i < MAX_RTS; i++) {
3627          if (d->vk.cb.attachments[i].write_mask) {
3628             color_write_enabled = true;
3629             break;
3630          }
3631       }
3632 
3633       if (color_write_enabled) {
3634          cb_color_control |= S_028808_MODE(V_028808_CB_NORMAL);
3635       } else {
3636          cb_color_control |= S_028808_MODE(V_028808_CB_DISABLE);
3637       }
3638    }
3639 
3640    if (pdev->info.gfx_level >= GFX12) {
3641       radeon_set_context_reg(cmd_buffer->cs, R_028858_CB_COLOR_CONTROL, cb_color_control);
3642    } else {
3643       radeon_set_context_reg(cmd_buffer->cs, R_028808_CB_COLOR_CONTROL, cb_color_control);
3644    }
3645 }
3646 
3647 static void
3648 radv_emit_color_write(struct radv_cmd_buffer *cmd_buffer)
3649 {
3650    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
3651    const struct radv_physical_device *pdev = radv_device_physical(device);
3652    const struct radv_binning_settings *settings = &pdev->binning_settings;
3653    const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
3654    uint32_t color_write_enable = 0, color_write_mask = 0;
3655 
3656    u_foreach_bit (i, d->vk.cb.color_write_enables) {
3657       color_write_enable |= 0xfu << (i * 4);
3658    }
3659 
3660    for (unsigned i = 0; i < MAX_RTS; i++) {
3661       color_write_mask |= d->vk.cb.attachments[i].write_mask << (4 * i);
3662    }
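   /* Worked example (sketch): with two attachments, write_mask 0xf on MRT0 and 0x7 on MRT1, and
    * color_write_enables = 0x1 (only MRT0 enabled), this gives color_write_mask = 0x7f and
    * color_write_enable = 0x0f, so CB_TARGET_MASK below is programmed to 0x0f and MRT1 writes
    * are dropped.
    */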
3663 
3664    if (device->pbb_allowed && settings->context_states_per_bin > 1) {
3665       /* Flush DFSM on CB_TARGET_MASK changes. */
3666       radeon_emit(cmd_buffer->cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
3667       radeon_emit(cmd_buffer->cs, EVENT_TYPE(V_028A90_BREAK_BATCH) | EVENT_INDEX(0));
3668    }
3669 
3670    if (pdev->info.gfx_level >= GFX12) {
3671       radeon_set_context_reg(cmd_buffer->cs, R_028850_CB_TARGET_MASK, color_write_mask & color_write_enable);
3672    } else {
3673       radeon_set_context_reg(cmd_buffer->cs, R_028238_CB_TARGET_MASK, color_write_mask & color_write_enable);
3674    }
3675 }
3676 
3677 static void
3678 radv_emit_patch_control_points(struct radv_cmd_buffer *cmd_buffer)
3679 {
3680    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
3681    const struct radv_physical_device *pdev = radv_device_physical(device);
3682    const struct radv_shader *vs = radv_get_shader(cmd_buffer->state.shaders, MESA_SHADER_VERTEX);
3683    const struct radv_shader *tcs = cmd_buffer->state.shaders[MESA_SHADER_TESS_CTRL];
3684    const struct radv_shader *tes = radv_get_shader(cmd_buffer->state.shaders, MESA_SHADER_TESS_EVAL);
3685    const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
3686    unsigned ls_hs_config;
3687 
3688    /* Compute tessellation info that depends on the number of patch control points when this state
3689     * is dynamic.
3690     */
3691    if (cmd_buffer->state.uses_dynamic_patch_control_points) {
3692       /* Compute the number of patches. */
3693       cmd_buffer->state.tess_num_patches = radv_get_tcs_num_patches(
3694          pdev, d->vk.ts.patch_control_points, tcs->info.tcs.tcs_vertices_out, vs->info.vs.num_linked_outputs,
3695          tcs->info.tcs.num_lds_per_vertex_outputs, tcs->info.tcs.num_lds_per_patch_outputs,
3696          tcs->info.tcs.num_linked_outputs, tcs->info.tcs.num_linked_patch_outputs);
3697 
3698       /* Compute the LDS size. */
3699       cmd_buffer->state.tess_lds_size =
3700          radv_get_tess_lds_size(pdev, d->vk.ts.patch_control_points, tcs->info.tcs.tcs_vertices_out,
3701                                 vs->info.vs.num_linked_outputs, cmd_buffer->state.tess_num_patches,
3702                                 tcs->info.tcs.num_lds_per_vertex_outputs, tcs->info.tcs.num_lds_per_patch_outputs);
3703    }
3704 
3705    ls_hs_config = S_028B58_NUM_PATCHES(cmd_buffer->state.tess_num_patches) |
3706                   /* GFX12 programs patch_vertices in VGT_PRIMITIVE_TYPE.NUM_INPUT_CP. */
3707                   S_028B58_HS_NUM_INPUT_CP(pdev->info.gfx_level < GFX12 ? d->vk.ts.patch_control_points : 0) |
3708                   S_028B58_HS_NUM_OUTPUT_CP(tcs->info.tcs.tcs_vertices_out);
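   /* Illustrative values (an assumed app configuration, not from this code): with
    * vkCmdSetPatchControlPointsEXT(cmd, 4) and a TCS declaring layout(vertices = 4) out,
    * HS_NUM_INPUT_CP is 4 on pre-GFX12 (0 on GFX12, see above) and HS_NUM_OUTPUT_CP is 4, while
    * NUM_PATCHES comes from radv_get_tcs_num_patches().
    */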
3709 
3710    if (pdev->info.gfx_level >= GFX7) {
3711       radeon_set_context_reg_idx(cmd_buffer->cs, R_028B58_VGT_LS_HS_CONFIG, 2, ls_hs_config);
3712    } else {
3713       radeon_set_context_reg(cmd_buffer->cs, R_028B58_VGT_LS_HS_CONFIG, ls_hs_config);
3714    }
3715 
3716    if (pdev->info.gfx_level >= GFX9) {
3717       unsigned hs_rsrc2;
3718 
3719       if (tcs->info.merged_shader_compiled_separately) {
3720          radv_shader_combine_cfg_vs_tcs(cmd_buffer->state.shaders[MESA_SHADER_VERTEX], tcs, NULL, &hs_rsrc2);
3721       } else {
3722          hs_rsrc2 = tcs->config.rsrc2;
3723       }
3724 
3725       if (pdev->info.gfx_level >= GFX10) {
3726          hs_rsrc2 |= S_00B42C_LDS_SIZE_GFX10(cmd_buffer->state.tess_lds_size);
3727       } else {
3728          hs_rsrc2 |= S_00B42C_LDS_SIZE_GFX9(cmd_buffer->state.tess_lds_size);
3729       }
3730 
3731       radeon_set_sh_reg(cmd_buffer->cs, tcs->info.regs.pgm_rsrc2, hs_rsrc2);
3732    } else {
3733       unsigned ls_rsrc2 = vs->config.rsrc2 | S_00B52C_LDS_SIZE(cmd_buffer->state.tess_lds_size);
3734 
3735       radeon_set_sh_reg(cmd_buffer->cs, vs->info.regs.pgm_rsrc2, ls_rsrc2);
3736    }
3737 
3738    /* Emit user SGPRs for dynamic patch control points. */
3739    uint32_t tcs_offchip_layout_offset = radv_get_user_sgpr_loc(tcs, AC_UD_TCS_OFFCHIP_LAYOUT);
3740    if (!tcs_offchip_layout_offset)
3741       return;
3742 
3743    unsigned tcs_offchip_layout =
3744       SET_SGPR_FIELD(TCS_OFFCHIP_LAYOUT_PATCH_CONTROL_POINTS, d->vk.ts.patch_control_points - 1) |
3745       SET_SGPR_FIELD(TCS_OFFCHIP_LAYOUT_OUT_PATCH_CP, tcs->info.tcs.tcs_vertices_out - 1) |
3746       SET_SGPR_FIELD(TCS_OFFCHIP_LAYOUT_NUM_PATCHES, cmd_buffer->state.tess_num_patches - 1) |
3747       SET_SGPR_FIELD(TCS_OFFCHIP_LAYOUT_NUM_LS_OUTPUTS, vs->info.vs.num_linked_outputs) |
3748       SET_SGPR_FIELD(TCS_OFFCHIP_LAYOUT_NUM_HS_OUTPUTS, tcs->info.tcs.num_linked_outputs) |
3749       SET_SGPR_FIELD(TCS_OFFCHIP_LAYOUT_TES_READS_TF, tes->info.tes.reads_tess_factors) |
3750       SET_SGPR_FIELD(TCS_OFFCHIP_LAYOUT_PRIMITIVE_MODE, tes->info.tes._primitive_mode);
3751 
3752    radeon_set_sh_reg(cmd_buffer->cs, tcs_offchip_layout_offset, tcs_offchip_layout);
3753 
3754    tcs_offchip_layout_offset = radv_get_user_sgpr_loc(tes, AC_UD_TCS_OFFCHIP_LAYOUT);
3755    assert(tcs_offchip_layout_offset);
3756 
3757    radeon_set_sh_reg(cmd_buffer->cs, tcs_offchip_layout_offset, tcs_offchip_layout);
3758 }
3759 
3760 static void
3761 radv_emit_conservative_rast_mode(struct radv_cmd_buffer *cmd_buffer)
3762 {
3763    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
3764    const struct radv_physical_device *pdev = radv_device_physical(device);
3765    const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
3766 
3767    if (pdev->info.gfx_level >= GFX9) {
3768       uint32_t pa_sc_conservative_rast;
3769 
3770       if (d->vk.rs.conservative_mode != VK_CONSERVATIVE_RASTERIZATION_MODE_DISABLED_EXT) {
3771          const struct radv_shader *ps = cmd_buffer->state.shaders[MESA_SHADER_FRAGMENT];
3772          const bool uses_inner_coverage = ps && ps->info.ps.reads_fully_covered;
3773 
3774          pa_sc_conservative_rast =
3775             S_028C4C_PREZ_AA_MASK_ENABLE(1) | S_028C4C_POSTZ_AA_MASK_ENABLE(1) | S_028C4C_CENTROID_SAMPLE_OVERRIDE(1);
3776 
3777          /* Inner coverage requires underestimate conservative rasterization. */
3778          if (d->vk.rs.conservative_mode == VK_CONSERVATIVE_RASTERIZATION_MODE_OVERESTIMATE_EXT &&
3779              !uses_inner_coverage) {
3780             pa_sc_conservative_rast |= S_028C4C_OVER_RAST_ENABLE(1) | S_028C4C_UNDER_RAST_SAMPLE_SELECT(1) |
3781                                        S_028C4C_PBB_UNCERTAINTY_REGION_ENABLE(1);
3782          } else {
3783             pa_sc_conservative_rast |= S_028C4C_OVER_RAST_SAMPLE_SELECT(1) | S_028C4C_UNDER_RAST_ENABLE(1);
3784          }
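         /* Example (illustrative): a pipeline requesting
          * VK_CONSERVATIVE_RASTERIZATION_MODE_OVERESTIMATE_EXT whose fragment shader also reads
          * the SPIR-V FullyCoveredEXT built-in (inner coverage) takes the else path above,
          * because inner coverage is only meaningful with underestimate rasterization.
          */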
3785       } else {
3786          pa_sc_conservative_rast = S_028C4C_NULL_SQUAD_AA_MASK_ENABLE(1);
3787       }
3788 
3789       if (pdev->info.gfx_level >= GFX12) {
3790          radeon_set_context_reg(cmd_buffer->cs, R_028C54_PA_SC_CONSERVATIVE_RASTERIZATION_CNTL,
3791                                 pa_sc_conservative_rast);
3792       } else {
3793          radeon_set_context_reg(cmd_buffer->cs, R_028C4C_PA_SC_CONSERVATIVE_RASTERIZATION_CNTL,
3794                                 pa_sc_conservative_rast);
3795       }
3796    }
3797 }
3798 
3799 static void
3800 radv_emit_depth_clamp_enable(struct radv_cmd_buffer *cmd_buffer)
3801 {
3802    const struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
3803    const struct radv_physical_device *pdev = radv_device_physical(device);
3804 
3805    enum radv_depth_clamp_mode mode = radv_get_depth_clamp_mode(cmd_buffer);
3806 
3807    radeon_set_context_reg(
3808       cmd_buffer->cs, R_02800C_DB_RENDER_OVERRIDE,
3809       S_02800C_FORCE_HIS_ENABLE0(V_02800C_FORCE_DISABLE) | S_02800C_FORCE_HIS_ENABLE1(V_02800C_FORCE_DISABLE) |
3810          S_02800C_DISABLE_VIEWPORT_CLAMP(pdev->info.gfx_level < GFX12 && mode == RADV_DEPTH_CLAMP_MODE_DISABLED));
3811 
3812    if (pdev->info.gfx_level >= GFX12) {
3813       radeon_set_context_reg(cmd_buffer->cs, R_028064_DB_VIEWPORT_CONTROL,
3814                              S_028064_DISABLE_VIEWPORT_CLAMP(mode == RADV_DEPTH_CLAMP_MODE_DISABLED));
3815    }
3816 }
3817 
3818 static void
3819 radv_emit_rasterization_samples(struct radv_cmd_buffer *cmd_buffer)
3820 {
3821    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
3822    const struct radv_physical_device *pdev = radv_device_physical(device);
3823    unsigned rasterization_samples = radv_get_rasterization_samples(cmd_buffer);
3824    unsigned ps_iter_samples = radv_get_ps_iter_samples(cmd_buffer);
3825    const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
3826    unsigned spi_baryc_cntl = S_0286E0_FRONT_FACE_ALL_BITS(1);
3827    unsigned pa_sc_mode_cntl_1;
3828    bool has_hiz_his = false;
3829 
3830    if (pdev->info.gfx_level >= GFX12) {
3831       const struct radv_rendering_state *render = &cmd_buffer->state.render;
3832 
3833       if (render->ds_att.iview) {
3834          const struct radeon_surf *surf = &render->ds_att.iview->image->planes[0].surface;
3835          has_hiz_his = surf->u.gfx9.zs.hiz.offset || surf->u.gfx9.zs.his.offset;
3836       }
3837    }
3838 
3839    pa_sc_mode_cntl_1 =
3840       S_028A4C_WALK_FENCE_ENABLE(1) | // TODO linear dst fixes
3841       S_028A4C_WALK_FENCE_SIZE(pdev->info.num_tile_pipes == 2 ? 2 : 3) |
3842       S_028A4C_OUT_OF_ORDER_PRIMITIVE_ENABLE(cmd_buffer->state.uses_out_of_order_rast) |
3843       S_028A4C_OUT_OF_ORDER_WATER_MARK(pdev->info.gfx_level >= GFX12 ? 0 : 0x7) |
3844       /* always 1: */
3845       S_028A4C_SUPERTILE_WALK_ORDER_ENABLE(1) | S_028A4C_TILE_WALK_ORDER_ENABLE(1) |
3846       S_028A4C_MULTI_SHADER_ENGINE_PRIM_DISCARD_ENABLE(1) | S_028A4C_FORCE_EOV_CNTDWN_ENABLE(1) |
3847       S_028A4C_FORCE_EOV_REZ_ENABLE(1) |
3848       /* This should only be set when VRS surfaces aren't enabled on GFX11, otherwise the GPU might
3849        * hang.
3850        */
3851       S_028A4C_WALK_ALIGN8_PRIM_FITS_ST(pdev->info.gfx_level < GFX11 || !cmd_buffer->state.uses_vrs_attachment ||
3852                                         (pdev->info.gfx_level >= GFX12 && !has_hiz_his));
3853 
3854    if (!d->sample_location.count)
3855       radv_emit_default_sample_locations(pdev, cmd_buffer->cs, rasterization_samples);
3856 
3857    if (ps_iter_samples > 1) {
3858       spi_baryc_cntl |= S_0286E0_POS_FLOAT_LOCATION(2);
3859       pa_sc_mode_cntl_1 |= S_028A4C_PS_ITER_SAMPLE(1);
3860    }
3861 
3862    if (radv_should_force_vrs1x1(cmd_buffer)) {
3863       /* Make sure sample shading is enabled even if only MSAA1x is used because the SAMPLE_ITER
3864        * combiner is in passthrough mode if PS_ITER_SAMPLE is 0, and it uses the per-draw rate. The
3865        * default VRS rate when sample shading is enabled is 1x1.
3866        */
3867       if (!G_028A4C_PS_ITER_SAMPLE(pa_sc_mode_cntl_1))
3868          pa_sc_mode_cntl_1 |= S_028A4C_PS_ITER_SAMPLE(1);
3869    }
3870 
3871    if (pdev->info.gfx_level >= GFX12) {
3872       radeon_set_context_reg(cmd_buffer->cs, R_028658_SPI_BARYC_CNTL, spi_baryc_cntl);
3873    } else {
3874       radeon_set_context_reg(cmd_buffer->cs, R_0286E0_SPI_BARYC_CNTL, spi_baryc_cntl);
3875    }
3876 
3877    radeon_set_context_reg(cmd_buffer->cs, R_028A4C_PA_SC_MODE_CNTL_1, pa_sc_mode_cntl_1);
3878 }
3879 
3880 static void
3881 radv_emit_fb_color_state(struct radv_cmd_buffer *cmd_buffer, int index, struct radv_color_buffer_info *cb,
3882                          struct radv_image_view *iview, VkImageLayout layout)
3883 {
3884    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
3885    const struct radv_physical_device *pdev = radv_device_physical(device);
3886    bool is_vi = pdev->info.gfx_level >= GFX8;
3887    uint32_t cb_fdcc_control = cb->ac.cb_dcc_control;
3888    uint32_t cb_color_info = cb->ac.cb_color_info;
3889    struct radv_image *image = iview->image;
3890 
3891    if (!radv_layout_dcc_compressed(device, image, iview->vk.base_mip_level, layout,
3892                                    radv_image_queue_family_mask(image, cmd_buffer->qf, cmd_buffer->qf))) {
3893       if (pdev->info.gfx_level >= GFX11) {
3894          cb_fdcc_control &= C_028C78_FDCC_ENABLE;
3895       } else {
3896          cb_color_info &= C_028C70_DCC_ENABLE;
3897       }
3898    }
3899 
3900    const enum radv_fmask_compression fmask_comp = radv_layout_fmask_compression(
3901       device, image, layout, radv_image_queue_family_mask(image, cmd_buffer->qf, cmd_buffer->qf));
3902    if (fmask_comp == RADV_FMASK_COMPRESSION_NONE) {
3903       cb_color_info &= C_028C70_COMPRESSION;
3904    }
3905 
3906    if (pdev->info.gfx_level >= GFX12) {
3907       radeon_set_context_reg(cmd_buffer->cs, R_028C60_CB_COLOR0_BASE + index * 0x24, cb->ac.cb_color_base);
3908       radeon_set_context_reg(cmd_buffer->cs, R_028C64_CB_COLOR0_VIEW + index * 0x24, cb->ac.cb_color_view);
3909       radeon_set_context_reg(cmd_buffer->cs, R_028C68_CB_COLOR0_VIEW2 + index * 0x24, cb->ac.cb_color_view2);
3910       radeon_set_context_reg(cmd_buffer->cs, R_028C6C_CB_COLOR0_ATTRIB + index * 0x24, cb->ac.cb_color_attrib);
3911       radeon_set_context_reg(cmd_buffer->cs, R_028C70_CB_COLOR0_FDCC_CONTROL + index * 0x24, cb_fdcc_control);
3912       radeon_set_context_reg(cmd_buffer->cs, R_028C78_CB_COLOR0_ATTRIB2 + index * 0x24, cb->ac.cb_color_attrib2);
3913       radeon_set_context_reg(cmd_buffer->cs, R_028C7C_CB_COLOR0_ATTRIB3 + index * 0x24, cb->ac.cb_color_attrib3);
3914       radeon_set_context_reg(cmd_buffer->cs, R_028E40_CB_COLOR0_BASE_EXT + index * 4,
3915                              S_028E40_BASE_256B(cb->ac.cb_color_base >> 32));
3916       radeon_set_context_reg(cmd_buffer->cs, R_028EC0_CB_COLOR0_INFO + index * 4, cb->ac.cb_color_info);
3917    } else if (pdev->info.gfx_level >= GFX11) {
3918       radeon_set_context_reg_seq(cmd_buffer->cs, R_028C6C_CB_COLOR0_VIEW + index * 0x3c, 4);
3919       radeon_emit(cmd_buffer->cs, cb->ac.cb_color_view);    /* CB_COLOR0_VIEW */
3920       radeon_emit(cmd_buffer->cs, cb->ac.cb_color_info);    /* CB_COLOR0_INFO */
3921       radeon_emit(cmd_buffer->cs, cb->ac.cb_color_attrib);  /* CB_COLOR0_ATTRIB */
3922       radeon_emit(cmd_buffer->cs, cb_fdcc_control);         /* CB_COLOR0_FDCC_CONTROL */
3923 
3924       radeon_set_context_reg(cmd_buffer->cs, R_028C60_CB_COLOR0_BASE + index * 0x3c, cb->ac.cb_color_base);
3925       radeon_set_context_reg(cmd_buffer->cs, R_028E40_CB_COLOR0_BASE_EXT + index * 4,
3926                              S_028E40_BASE_256B(cb->ac.cb_color_base >> 32));
3927       radeon_set_context_reg(cmd_buffer->cs, R_028C94_CB_COLOR0_DCC_BASE + index * 0x3c, cb->ac.cb_dcc_base);
3928       radeon_set_context_reg(cmd_buffer->cs, R_028EA0_CB_COLOR0_DCC_BASE_EXT + index * 4,
3929                              S_028EA0_BASE_256B(cb->ac.cb_dcc_base >> 32));
3930       radeon_set_context_reg(cmd_buffer->cs, R_028EC0_CB_COLOR0_ATTRIB2 + index * 4, cb->ac.cb_color_attrib2);
3931       radeon_set_context_reg(cmd_buffer->cs, R_028EE0_CB_COLOR0_ATTRIB3 + index * 4, cb->ac.cb_color_attrib3);
3932    } else if (pdev->info.gfx_level >= GFX10) {
3933       radeon_set_context_reg_seq(cmd_buffer->cs, R_028C60_CB_COLOR0_BASE + index * 0x3c, 11);
3934       radeon_emit(cmd_buffer->cs, cb->ac.cb_color_base);
3935       radeon_emit(cmd_buffer->cs, 0);
3936       radeon_emit(cmd_buffer->cs, 0);
3937       radeon_emit(cmd_buffer->cs, cb->ac.cb_color_view);
3938       radeon_emit(cmd_buffer->cs, cb_color_info);
3939       radeon_emit(cmd_buffer->cs, cb->ac.cb_color_attrib);
3940       radeon_emit(cmd_buffer->cs, cb->ac.cb_dcc_control);
3941       radeon_emit(cmd_buffer->cs, cb->ac.cb_color_cmask);
3942       radeon_emit(cmd_buffer->cs, 0);
3943       radeon_emit(cmd_buffer->cs, cb->ac.cb_color_fmask);
3944       radeon_emit(cmd_buffer->cs, 0);
3945 
3946       radeon_set_context_reg(cmd_buffer->cs, R_028C94_CB_COLOR0_DCC_BASE + index * 0x3c, cb->ac.cb_dcc_base);
3947 
3948       radeon_set_context_reg(cmd_buffer->cs, R_028E40_CB_COLOR0_BASE_EXT + index * 4,
3949                              S_028E40_BASE_256B(cb->ac.cb_color_base >> 32));
3950       radeon_set_context_reg(cmd_buffer->cs, R_028E60_CB_COLOR0_CMASK_BASE_EXT + index * 4,
3951                              S_028E60_BASE_256B(cb->ac.cb_color_cmask >> 32));
3952       radeon_set_context_reg(cmd_buffer->cs, R_028E80_CB_COLOR0_FMASK_BASE_EXT + index * 4,
3953                              S_028E80_BASE_256B(cb->ac.cb_color_fmask >> 32));
3954       radeon_set_context_reg(cmd_buffer->cs, R_028EA0_CB_COLOR0_DCC_BASE_EXT + index * 4,
3955                              S_028EA0_BASE_256B(cb->ac.cb_dcc_base >> 32));
3956       radeon_set_context_reg(cmd_buffer->cs, R_028EC0_CB_COLOR0_ATTRIB2 + index * 4, cb->ac.cb_color_attrib2);
3957       radeon_set_context_reg(cmd_buffer->cs, R_028EE0_CB_COLOR0_ATTRIB3 + index * 4, cb->ac.cb_color_attrib3);
3958    } else if (pdev->info.gfx_level == GFX9) {
3959       radeon_set_context_reg_seq(cmd_buffer->cs, R_028C60_CB_COLOR0_BASE + index * 0x3c, 11);
3960       radeon_emit(cmd_buffer->cs, cb->ac.cb_color_base);
3961       radeon_emit(cmd_buffer->cs, S_028C64_BASE_256B(cb->ac.cb_color_base >> 32));
3962       radeon_emit(cmd_buffer->cs, cb->ac.cb_color_attrib2);
3963       radeon_emit(cmd_buffer->cs, cb->ac.cb_color_view);
3964       radeon_emit(cmd_buffer->cs, cb_color_info);
3965       radeon_emit(cmd_buffer->cs, cb->ac.cb_color_attrib);
3966       radeon_emit(cmd_buffer->cs, cb->ac.cb_dcc_control);
3967       radeon_emit(cmd_buffer->cs, cb->ac.cb_color_cmask);
3968       radeon_emit(cmd_buffer->cs, S_028C80_BASE_256B(cb->ac.cb_color_cmask >> 32));
3969       radeon_emit(cmd_buffer->cs, cb->ac.cb_color_fmask);
3970       radeon_emit(cmd_buffer->cs, S_028C88_BASE_256B(cb->ac.cb_color_fmask >> 32));
3971 
3972       radeon_set_context_reg_seq(cmd_buffer->cs, R_028C94_CB_COLOR0_DCC_BASE + index * 0x3c, 2);
3973       radeon_emit(cmd_buffer->cs, cb->ac.cb_dcc_base);
3974       radeon_emit(cmd_buffer->cs, S_028C98_BASE_256B(cb->ac.cb_dcc_base >> 32));
3975 
3976       radeon_set_context_reg(cmd_buffer->cs, R_0287A0_CB_MRT0_EPITCH + index * 4, cb->ac.cb_mrt_epitch);
3977    } else {
3978       radeon_set_context_reg_seq(cmd_buffer->cs, R_028C60_CB_COLOR0_BASE + index * 0x3c, 6);
3979       radeon_emit(cmd_buffer->cs, cb->ac.cb_color_base);
3980       radeon_emit(cmd_buffer->cs, cb->ac.cb_color_pitch);
3981       radeon_emit(cmd_buffer->cs, cb->ac.cb_color_slice);
3982       radeon_emit(cmd_buffer->cs, cb->ac.cb_color_view);
3983       radeon_emit(cmd_buffer->cs, cb_color_info);
3984       radeon_emit(cmd_buffer->cs, cb->ac.cb_color_attrib);
3985 
3986       if (pdev->info.gfx_level == GFX8)
3987          radeon_set_context_reg(cmd_buffer->cs, R_028C78_CB_COLOR0_DCC_CONTROL + index * 0x3c, cb->ac.cb_dcc_control);
3988 
3989       radeon_set_context_reg_seq(cmd_buffer->cs, R_028C7C_CB_COLOR0_CMASK + index * 0x3c, 4);
3990       radeon_emit(cmd_buffer->cs, cb->ac.cb_color_cmask);
3991       radeon_emit(cmd_buffer->cs, cb->ac.cb_color_cmask_slice);
3992       radeon_emit(cmd_buffer->cs, cb->ac.cb_color_fmask);
3993       radeon_emit(cmd_buffer->cs, cb->ac.cb_color_fmask_slice);
3994 
3995       if (is_vi) { /* DCC BASE */
3996          radeon_set_context_reg(cmd_buffer->cs, R_028C94_CB_COLOR0_DCC_BASE + index * 0x3c, cb->ac.cb_dcc_base);
3997       }
3998    }
3999 
4000    if (pdev->info.gfx_level >= GFX11 ? G_028C78_FDCC_ENABLE(cb_fdcc_control) : G_028C70_DCC_ENABLE(cb_color_info)) {
4001       /* Drawing with DCC enabled also compresses colorbuffers. */
4002       VkImageSubresourceRange range = {
4003          .aspectMask = iview->vk.aspects,
4004          .baseMipLevel = iview->vk.base_mip_level,
4005          .levelCount = iview->vk.level_count,
4006          .baseArrayLayer = iview->vk.base_array_layer,
4007          .layerCount = iview->vk.layer_count,
4008       };
4009 
4010       radv_update_dcc_metadata(cmd_buffer, image, &range, true);
4011    }
4012 }
4013 
4014 static void
4015 radv_update_zrange_precision(struct radv_cmd_buffer *cmd_buffer, struct radv_ds_buffer_info *ds,
4016                              const struct radv_image_view *iview, bool requires_cond_exec)
4017 {
4018    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
4019    const struct radv_physical_device *pdev = radv_device_physical(device);
4020    const struct radv_image *image = iview->image;
4021    uint32_t db_z_info = ds->ac.db_z_info;
4022    uint32_t db_z_info_reg;
4023 
4024    if (!pdev->info.has_tc_compat_zrange_bug || !radv_image_is_tc_compat_htile(image))
4025       return;
4026 
4027    db_z_info &= C_028040_ZRANGE_PRECISION;
4028 
4029    if (pdev->info.gfx_level == GFX9) {
4030       db_z_info_reg = R_028038_DB_Z_INFO;
4031    } else {
4032       db_z_info_reg = R_028040_DB_Z_INFO;
4033    }
4034 
4035    /* When we don't know the last fast clear value, we need to emit a
4036     * conditional packet that will eventually skip the following
4037     * SET_CONTEXT_REG packet.
4038     */
4039    if (requires_cond_exec) {
4040       uint64_t va = radv_get_tc_compat_zrange_va(image, iview->vk.base_mip_level);
4041 
4042       radv_emit_cond_exec(device, cmd_buffer->cs, va, 3 /* SET_CONTEXT_REG size */);
4043    }
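   /* Sketch of how the predicate interacts with the packet below (derived from
    * radv_update_tc_compat_zrange_metadata() in this file): the metadata word is written as
    * UINT_MAX when the last depth clear value was 0.0f and 0 otherwise, so the conditional
    * execution only lets the 3-dword SET_CONTEXT_REG through when clearing ZRANGE_PRECISION is
    * actually needed.
    */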
4044 
4045    radeon_set_context_reg(cmd_buffer->cs, db_z_info_reg, db_z_info);
4046 }
4047 
4048 static struct radv_image *
4049 radv_cmd_buffer_get_vrs_image(struct radv_cmd_buffer *cmd_buffer)
4050 {
4051    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
4052 
4053    if (!device->vrs.image) {
4054       VkResult result;
4055 
4056       /* The global VRS state is initialized on-demand to avoid wasting VRAM. */
4057       result = radv_device_init_vrs_state(device);
4058       if (result != VK_SUCCESS) {
4059          vk_command_buffer_set_error(&cmd_buffer->vk, result);
4060          return NULL;
4061       }
4062    }
4063 
4064    return device->vrs.image;
4065 }
4066 
4067 static void
4068 radv_emit_fb_ds_state(struct radv_cmd_buffer *cmd_buffer, struct radv_ds_buffer_info *ds, struct radv_image_view *iview,
4069                       bool depth_compressed, bool stencil_compressed)
4070 {
4071    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
4072    const struct radv_physical_device *pdev = radv_device_physical(device);
4073    uint64_t db_htile_data_base = ds->ac.u.gfx6.db_htile_data_base;
4074    uint32_t db_htile_surface = ds->ac.u.gfx6.db_htile_surface;
4075    uint32_t db_render_control = ds->db_render_control | cmd_buffer->state.db_render_control;
4076    uint32_t db_z_info = ds->ac.db_z_info;
4077 
4078    if (!depth_compressed)
4079       db_render_control |= S_028000_DEPTH_COMPRESS_DISABLE(1);
4080    if (!stencil_compressed)
4081       db_render_control |= S_028000_STENCIL_COMPRESS_DISABLE(1);
4082 
4083    if (pdev->info.gfx_level == GFX10_3) {
4084       if (!cmd_buffer->state.render.vrs_att.iview) {
4085          db_htile_surface &= C_028ABC_VRS_HTILE_ENCODING;
4086       } else {
4087          /* On GFX10.3, when a subpass uses a VRS attachment but HTILE can't be enabled, we fall
4088           * back to our internal HTILE buffer.
4089           */
4090          if (!radv_htile_enabled(iview->image, iview->vk.base_mip_level) && radv_cmd_buffer_get_vrs_image(cmd_buffer)) {
4091             struct radv_buffer *htile_buffer = device->vrs.buffer;
4092 
4093             assert(!G_028038_TILE_SURFACE_ENABLE(db_z_info) && !db_htile_data_base && !db_htile_surface);
4094             db_z_info |= S_028038_TILE_SURFACE_ENABLE(1);
4095             db_htile_data_base = radv_buffer_get_va(htile_buffer->bo) >> 8;
4096             db_htile_surface = S_028ABC_FULL_CACHE(1) | S_028ABC_PIPE_ALIGNED(1) |
4097                                S_028ABC_VRS_HTILE_ENCODING(V_028ABC_VRS_HTILE_4BIT_ENCODING);
4098          }
4099       }
4100    }
4101 
4102    if (pdev->info.gfx_level < GFX12) {
4103       radeon_set_context_reg(cmd_buffer->cs, R_028000_DB_RENDER_CONTROL, db_render_control);
4104       radeon_set_context_reg(cmd_buffer->cs, R_028008_DB_DEPTH_VIEW, ds->ac.db_depth_view);
4105       radeon_set_context_reg(cmd_buffer->cs, R_028ABC_DB_HTILE_SURFACE, db_htile_surface);
4106    }
4107 
4108    radeon_set_context_reg(cmd_buffer->cs, R_028010_DB_RENDER_OVERRIDE2, ds->db_render_override2);
4109 
4110    if (pdev->info.gfx_level >= GFX12) {
4111       radeon_set_context_reg(cmd_buffer->cs, R_028004_DB_DEPTH_VIEW, ds->ac.db_depth_view);
4112       radeon_set_context_reg(cmd_buffer->cs, R_028008_DB_DEPTH_VIEW1, ds->ac.u.gfx12.db_depth_view1);
4113       radeon_set_context_reg(cmd_buffer->cs, R_028014_DB_DEPTH_SIZE_XY, ds->ac.db_depth_size);
4114       radeon_set_context_reg(cmd_buffer->cs, R_028018_DB_Z_INFO, ds->ac.db_z_info);
4115       radeon_set_context_reg(cmd_buffer->cs, R_02801C_DB_STENCIL_INFO, ds->ac.db_stencil_info);
4116       radeon_set_context_reg(cmd_buffer->cs, R_028020_DB_Z_READ_BASE, ds->ac.db_depth_base);
4117       radeon_set_context_reg(cmd_buffer->cs, R_028024_DB_Z_READ_BASE_HI, S_028024_BASE_HI(ds->ac.db_depth_base >> 32));
4118       radeon_set_context_reg(cmd_buffer->cs, R_028028_DB_Z_WRITE_BASE, ds->ac.db_depth_base);
4119       radeon_set_context_reg(cmd_buffer->cs, R_02802C_DB_Z_WRITE_BASE_HI, S_02802C_BASE_HI(ds->ac.db_depth_base >> 32));
4120       radeon_set_context_reg(cmd_buffer->cs, R_028030_DB_STENCIL_READ_BASE, ds->ac.db_stencil_base);
4121       radeon_set_context_reg(cmd_buffer->cs, R_028034_DB_STENCIL_READ_BASE_HI,
4122                              S_028034_BASE_HI(ds->ac.db_stencil_base >> 32));
4123       radeon_set_context_reg(cmd_buffer->cs, R_028038_DB_STENCIL_WRITE_BASE, ds->ac.db_stencil_base);
4124       radeon_set_context_reg(cmd_buffer->cs, R_02803C_DB_STENCIL_WRITE_BASE_HI,
4125                              S_02803C_BASE_HI(ds->ac.db_stencil_base >> 32));
4126       radeon_set_context_reg(cmd_buffer->cs, R_028B94_PA_SC_HIZ_INFO, ds->ac.u.gfx12.hiz_info);
4127       radeon_set_context_reg(cmd_buffer->cs, R_028B98_PA_SC_HIS_INFO, ds->ac.u.gfx12.his_info);
4128 
4129       if (ds->ac.u.gfx12.hiz_info) {
4130          radeon_set_context_reg(cmd_buffer->cs, R_028B9C_PA_SC_HIZ_BASE, ds->ac.u.gfx12.hiz_base);
4131          radeon_set_context_reg(cmd_buffer->cs, R_028BA0_PA_SC_HIZ_BASE_EXT,
4132                                 S_028BA0_BASE_256B(ds->ac.u.gfx12.hiz_base >> 32));
4133          radeon_set_context_reg(cmd_buffer->cs, R_028BA4_PA_SC_HIZ_SIZE_XY, ds->ac.u.gfx12.hiz_size_xy);
4134       }
4135       if (ds->ac.u.gfx12.his_info) {
4136          radeon_set_context_reg(cmd_buffer->cs, R_028BA8_PA_SC_HIS_BASE, ds->ac.u.gfx12.his_base);
4137          radeon_set_context_reg(cmd_buffer->cs, R_028BAC_PA_SC_HIS_BASE_EXT,
4138                                 S_028BAC_BASE_256B(ds->ac.u.gfx12.his_base >> 32));
4139          radeon_set_context_reg(cmd_buffer->cs, R_028BB0_PA_SC_HIS_SIZE_XY, ds->ac.u.gfx12.his_size_xy);
4140       }
4141    } else if (pdev->info.gfx_level >= GFX10) {
4142       radeon_set_context_reg(cmd_buffer->cs, R_028014_DB_HTILE_DATA_BASE, db_htile_data_base);
4143       radeon_set_context_reg(cmd_buffer->cs, R_02801C_DB_DEPTH_SIZE_XY, ds->ac.db_depth_size);
4144 
4145       if (pdev->info.gfx_level >= GFX11) {
4146          radeon_set_context_reg_seq(cmd_buffer->cs, R_028040_DB_Z_INFO, 6);
4147       } else {
4148          radeon_set_context_reg_seq(cmd_buffer->cs, R_02803C_DB_DEPTH_INFO, 7);
4149          radeon_emit(cmd_buffer->cs, S_02803C_RESOURCE_LEVEL(1));
4150       }
4151       radeon_emit(cmd_buffer->cs, db_z_info);
4152       radeon_emit(cmd_buffer->cs, ds->ac.db_stencil_info);
4153       radeon_emit(cmd_buffer->cs, ds->ac.db_depth_base);
4154       radeon_emit(cmd_buffer->cs, ds->ac.db_stencil_base);
4155       radeon_emit(cmd_buffer->cs, ds->ac.db_depth_base);
4156       radeon_emit(cmd_buffer->cs, ds->ac.db_stencil_base);
4157 
4158       radeon_set_context_reg_seq(cmd_buffer->cs, R_028068_DB_Z_READ_BASE_HI, 5);
4159       radeon_emit(cmd_buffer->cs, S_028068_BASE_HI(ds->ac.db_depth_base >> 32));
4160       radeon_emit(cmd_buffer->cs, S_02806C_BASE_HI(ds->ac.db_stencil_base >> 32));
4161       radeon_emit(cmd_buffer->cs, S_028070_BASE_HI(ds->ac.db_depth_base >> 32));
4162       radeon_emit(cmd_buffer->cs, S_028074_BASE_HI(ds->ac.db_stencil_base >> 32));
4163       radeon_emit(cmd_buffer->cs, S_028078_BASE_HI(db_htile_data_base >> 32));
4164    } else if (pdev->info.gfx_level == GFX9) {
4165       radeon_set_context_reg_seq(cmd_buffer->cs, R_028014_DB_HTILE_DATA_BASE, 3);
4166       radeon_emit(cmd_buffer->cs, db_htile_data_base);
4167       radeon_emit(cmd_buffer->cs, S_028018_BASE_HI(db_htile_data_base >> 32));
4168       radeon_emit(cmd_buffer->cs, ds->ac.db_depth_size);
4169 
4170       radeon_set_context_reg_seq(cmd_buffer->cs, R_028038_DB_Z_INFO, 10);
4171       radeon_emit(cmd_buffer->cs, db_z_info);                                         /* DB_Z_INFO */
4172       radeon_emit(cmd_buffer->cs, ds->ac.db_stencil_info);                            /* DB_STENCIL_INFO */
4173       radeon_emit(cmd_buffer->cs, ds->ac.db_depth_base);                              /* DB_Z_READ_BASE */
4174       radeon_emit(cmd_buffer->cs, S_028044_BASE_HI(ds->ac.db_depth_base >> 32));      /* DB_Z_READ_BASE_HI */
4175       radeon_emit(cmd_buffer->cs, ds->ac.db_stencil_base);                            /* DB_STENCIL_READ_BASE */
4176       radeon_emit(cmd_buffer->cs, S_02804C_BASE_HI(ds->ac.db_stencil_base >> 32));    /* DB_STENCIL_READ_BASE_HI */
4177       radeon_emit(cmd_buffer->cs, ds->ac.db_depth_base);                              /* DB_Z_WRITE_BASE */
4178       radeon_emit(cmd_buffer->cs, S_028054_BASE_HI(ds->ac.db_depth_base >> 32));      /* DB_Z_WRITE_BASE_HI */
4179       radeon_emit(cmd_buffer->cs, ds->ac.db_stencil_base);                            /* DB_STENCIL_WRITE_BASE */
4180       radeon_emit(cmd_buffer->cs, S_02805C_BASE_HI(ds->ac.db_stencil_base >> 32));    /* DB_STENCIL_WRITE_BASE_HI */
4181 
4182       radeon_set_context_reg_seq(cmd_buffer->cs, R_028068_DB_Z_INFO2, 2);
4183       radeon_emit(cmd_buffer->cs, ds->ac.u.gfx6.db_z_info2);
4184       radeon_emit(cmd_buffer->cs, ds->ac.u.gfx6.db_stencil_info2);
4185    } else {
4186       radeon_set_context_reg(cmd_buffer->cs, R_028014_DB_HTILE_DATA_BASE, db_htile_data_base);
4187 
4188       radeon_set_context_reg_seq(cmd_buffer->cs, R_02803C_DB_DEPTH_INFO, 9);
4189       radeon_emit(cmd_buffer->cs, ds->ac.u.gfx6.db_depth_info);  /* R_02803C_DB_DEPTH_INFO */
4190       radeon_emit(cmd_buffer->cs, db_z_info);                    /* R_028040_DB_Z_INFO */
4191       radeon_emit(cmd_buffer->cs, ds->ac.db_stencil_info);       /* R_028044_DB_STENCIL_INFO */
4192       radeon_emit(cmd_buffer->cs, ds->ac.db_depth_base);         /* R_028048_DB_Z_READ_BASE */
4193       radeon_emit(cmd_buffer->cs, ds->ac.db_stencil_base);       /* R_02804C_DB_STENCIL_READ_BASE */
4194       radeon_emit(cmd_buffer->cs, ds->ac.db_depth_base);         /* R_028050_DB_Z_WRITE_BASE */
4195       radeon_emit(cmd_buffer->cs, ds->ac.db_stencil_base);       /* R_028054_DB_STENCIL_WRITE_BASE */
4196       radeon_emit(cmd_buffer->cs, ds->ac.db_depth_size);         /* R_028058_DB_DEPTH_SIZE */
4197       radeon_emit(cmd_buffer->cs, ds->ac.u.gfx6.db_depth_slice); /* R_02805C_DB_DEPTH_SLICE */
4198    }
4199 
4200    /* Update the ZRANGE_PRECISION value for the TC-compat bug. */
4201    radv_update_zrange_precision(cmd_buffer, ds, iview, true);
4202 }
4203 
4204 static void
4205 radv_emit_null_ds_state(struct radv_cmd_buffer *cmd_buffer)
4206 {
4207    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
4208    const struct radv_physical_device *pdev = radv_device_physical(device);
4209    const enum amd_gfx_level gfx_level = pdev->info.gfx_level;
4210 
4211    if (pdev->info.gfx_level >= GFX12) {
4212       radeon_set_context_reg_seq(cmd_buffer->cs, R_028018_DB_Z_INFO, 2);
4213       radeon_emit(cmd_buffer->cs, S_028018_FORMAT(V_028018_Z_INVALID) | S_028018_NUM_SAMPLES(3));
4214       radeon_emit(cmd_buffer->cs, S_02801C_FORMAT(V_02801C_STENCIL_INVALID) | S_02801C_TILE_STENCIL_DISABLE(1));
4215 
4216       radeon_set_context_reg(cmd_buffer->cs, R_028B94_PA_SC_HIZ_INFO, S_028B94_SURFACE_ENABLE(0));
4217       radeon_set_context_reg(cmd_buffer->cs, R_028B98_PA_SC_HIS_INFO, S_028B98_SURFACE_ENABLE(0));
4218    } else {
4219       if (gfx_level == GFX9) {
4220          radeon_set_context_reg_seq(cmd_buffer->cs, R_028038_DB_Z_INFO, 2);
4221       } else {
4222          radeon_set_context_reg_seq(cmd_buffer->cs, R_028040_DB_Z_INFO, 2);
4223       }
4224 
4225       /* On GFX11+, the hw intentionally looks at DB_Z_INFO.NUM_SAMPLES when there is no bound
4226        * depth/stencil buffer and it clamps the number of samples like MIN2(DB_Z_INFO.NUM_SAMPLES,
4227        * PA_SC_AA_CONFIG.MSAA_EXPOSED_SAMPLES). Use 8x for DB_Z_INFO.NUM_SAMPLES to make sure it's not
4228        * the constraining factor. This affects VRS, occlusion queries and POPS.
4229        */
4230       radeon_emit(cmd_buffer->cs,
4231                   S_028040_FORMAT(V_028040_Z_INVALID) | S_028040_NUM_SAMPLES(pdev->info.gfx_level >= GFX11 ? 3 : 0));
4232       radeon_emit(cmd_buffer->cs, S_028044_FORMAT(V_028044_STENCIL_INVALID));
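      /* Encoding note (inferred from the comment above, treat as a sketch): NUM_SAMPLES is a
       * log2 field, so the value 3 programs 8x samples and keeps DB_Z_INFO.NUM_SAMPLES from
       * being the limiting term in MIN2(DB_Z_INFO.NUM_SAMPLES, PA_SC_AA_CONFIG.MSAA_EXPOSED_SAMPLES).
       */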
4233       uint32_t db_render_control = 0;
4234 
4235       if (gfx_level == GFX11 || gfx_level == GFX11_5)
4236          radv_gfx11_set_db_render_control(device, 1, &db_render_control);
4237 
4238       radeon_set_context_reg(cmd_buffer->cs, R_028000_DB_RENDER_CONTROL, db_render_control);
4239    }
4240 
4241    radeon_set_context_reg(cmd_buffer->cs, R_028010_DB_RENDER_OVERRIDE2,
4242                           S_028010_CENTROID_COMPUTATION_MODE(gfx_level >= GFX10_3));
4243 }
4244 /**
4245  * Update the fast clear depth/stencil values if the image is bound as a
4246  * depth/stencil buffer.
4247  */
4248 static void
4249 radv_update_bound_fast_clear_ds(struct radv_cmd_buffer *cmd_buffer, const struct radv_image_view *iview,
4250                                 VkClearDepthStencilValue ds_clear_value, VkImageAspectFlags aspects)
4251 {
4252    const struct radv_image *image = iview->image;
4253    struct radeon_cmdbuf *cs = cmd_buffer->cs;
4254 
4255    if (cmd_buffer->state.render.ds_att.iview == NULL || cmd_buffer->state.render.ds_att.iview->image != image)
4256       return;
4257 
4258    if (aspects == (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)) {
4259       radeon_set_context_reg_seq(cs, R_028028_DB_STENCIL_CLEAR, 2);
4260       radeon_emit(cs, ds_clear_value.stencil);
4261       radeon_emit(cs, fui(ds_clear_value.depth));
4262    } else if (aspects == VK_IMAGE_ASPECT_DEPTH_BIT) {
4263       radeon_set_context_reg(cs, R_02802C_DB_DEPTH_CLEAR, fui(ds_clear_value.depth));
4264    } else {
4265       assert(aspects == VK_IMAGE_ASPECT_STENCIL_BIT);
4266       radeon_set_context_reg(cs, R_028028_DB_STENCIL_CLEAR, ds_clear_value.stencil);
4267    }
4268 
4269    /* Update the ZRANGE_PRECISION value for the TC-compat bug. This is
4270     * only needed when clearing Z to 0.0.
4271     */
4272    if ((aspects & VK_IMAGE_ASPECT_DEPTH_BIT) && ds_clear_value.depth == 0.0) {
4273       radv_update_zrange_precision(cmd_buffer, &cmd_buffer->state.render.ds_att.ds, iview, false);
4274    }
4275 
4276    cmd_buffer->state.context_roll_without_scissor_emitted = true;
4277 }
4278 
4279 /**
4280  * Set the clear depth/stencil values to the image's metadata.
4281  */
4282 static void
4283 radv_set_ds_clear_metadata(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
4284                            const VkImageSubresourceRange *range, VkClearDepthStencilValue ds_clear_value,
4285                            VkImageAspectFlags aspects)
4286 {
4287    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
4288    struct radeon_cmdbuf *cs = cmd_buffer->cs;
4289    uint32_t level_count = vk_image_subresource_level_count(&image->vk, range);
4290 
4291    if (aspects == (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)) {
4292       uint64_t va = radv_get_ds_clear_value_va(image, range->baseMipLevel);
4293 
4294       /* Use the fastest way when both aspects are used. */
4295       ASSERTED unsigned cdw_end = radv_cs_write_data_head(device, cmd_buffer->cs, cmd_buffer->qf, V_370_PFP, va,
4296                                                           2 * level_count, cmd_buffer->state.predicating);
4297 
4298       for (uint32_t l = 0; l < level_count; l++) {
4299          radeon_emit(cs, ds_clear_value.stencil);
4300          radeon_emit(cs, fui(ds_clear_value.depth));
4301       }
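      /* Layout sketch (assumption based on the writes above): each level stores two dwords,
       * stencil first and then the depth value as raw float bits, so a clear to depth 1.0 /
       * stencil 0 writes the pair { 0x00000000, 0x3f800000 } for every level in the range.
       */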
4302 
4303       assert(cmd_buffer->cs->cdw == cdw_end);
4304    } else {
4305       /* Otherwise we need one WRITE_DATA packet per level. */
4306       for (uint32_t l = 0; l < level_count; l++) {
4307          uint64_t va = radv_get_ds_clear_value_va(image, range->baseMipLevel + l);
4308          unsigned value;
4309 
4310          if (aspects == VK_IMAGE_ASPECT_DEPTH_BIT) {
4311             value = fui(ds_clear_value.depth);
4312             va += 4;
4313          } else {
4314             assert(aspects == VK_IMAGE_ASPECT_STENCIL_BIT);
4315             value = ds_clear_value.stencil;
4316          }
4317 
4318          radv_write_data(cmd_buffer, V_370_PFP, va, 1, &value, cmd_buffer->state.predicating);
4319       }
4320    }
4321 }
4322 
4323 /**
4324  * Update the TC-compat metadata value for this image.
4325  */
4326 static void
4327 radv_set_tc_compat_zrange_metadata(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
4328                                    const VkImageSubresourceRange *range, uint32_t value)
4329 {
4330    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
4331    const struct radv_physical_device *pdev = radv_device_physical(device);
4332    struct radeon_cmdbuf *cs = cmd_buffer->cs;
4333 
4334    if (!pdev->info.has_tc_compat_zrange_bug)
4335       return;
4336 
4337    uint64_t va = radv_get_tc_compat_zrange_va(image, range->baseMipLevel);
4338    uint32_t level_count = vk_image_subresource_level_count(&image->vk, range);
4339 
4340    ASSERTED unsigned cdw_end = radv_cs_write_data_head(device, cmd_buffer->cs, cmd_buffer->qf, V_370_PFP, va,
4341                                                        level_count, cmd_buffer->state.predicating);
4342 
4343    for (uint32_t l = 0; l < level_count; l++)
4344       radeon_emit(cs, value);
4345 
4346    assert(cmd_buffer->cs->cdw == cdw_end);
4347 }
4348 
4349 static void
4350 radv_update_tc_compat_zrange_metadata(struct radv_cmd_buffer *cmd_buffer, const struct radv_image_view *iview,
4351                                       VkClearDepthStencilValue ds_clear_value)
4352 {
4353    VkImageSubresourceRange range = {
4354       .aspectMask = iview->vk.aspects,
4355       .baseMipLevel = iview->vk.base_mip_level,
4356       .levelCount = iview->vk.level_count,
4357       .baseArrayLayer = iview->vk.base_array_layer,
4358       .layerCount = iview->vk.layer_count,
4359    };
4360    uint32_t cond_val;
4361 
4362    /* Conditionally set DB_Z_INFO.ZRANGE_PRECISION to 0 when the last
4363     * depth clear value is 0.0f.
4364     */
4365    cond_val = ds_clear_value.depth == 0.0f ? UINT_MAX : 0;
4366 
4367    radv_set_tc_compat_zrange_metadata(cmd_buffer, iview->image, &range, cond_val);
4368 }
4369 
4370 /**
4371  * Update the clear depth/stencil values for this image.
4372  */
4373 void
4374 radv_update_ds_clear_metadata(struct radv_cmd_buffer *cmd_buffer, const struct radv_image_view *iview,
4375                               VkClearDepthStencilValue ds_clear_value, VkImageAspectFlags aspects)
4376 {
4377    VkImageSubresourceRange range = {
4378       .aspectMask = iview->vk.aspects,
4379       .baseMipLevel = iview->vk.base_mip_level,
4380       .levelCount = iview->vk.level_count,
4381       .baseArrayLayer = iview->vk.base_array_layer,
4382       .layerCount = iview->vk.layer_count,
4383    };
4384    struct radv_image *image = iview->image;
4385 
4386    assert(radv_htile_enabled(image, range.baseMipLevel));
4387 
4388    radv_set_ds_clear_metadata(cmd_buffer, iview->image, &range, ds_clear_value, aspects);
4389 
4390    if (radv_image_is_tc_compat_htile(image) && (aspects & VK_IMAGE_ASPECT_DEPTH_BIT)) {
4391       radv_update_tc_compat_zrange_metadata(cmd_buffer, iview, ds_clear_value);
4392    }
4393 
4394    radv_update_bound_fast_clear_ds(cmd_buffer, iview, ds_clear_value, aspects);
4395 }
4396 
4397 /**
4398  * Load the clear depth/stencil values from the image's metadata.
4399  */
4400 static void
4401 radv_load_ds_clear_metadata(struct radv_cmd_buffer *cmd_buffer, const struct radv_image_view *iview)
4402 {
4403    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
4404    const struct radv_physical_device *pdev = radv_device_physical(device);
4405    struct radeon_cmdbuf *cs = cmd_buffer->cs;
4406    const struct radv_image *image = iview->image;
4407    VkImageAspectFlags aspects = vk_format_aspects(image->vk.format);
4408    uint64_t va = radv_get_ds_clear_value_va(image, iview->vk.base_mip_level);
4409    unsigned reg_offset = 0, reg_count = 0;
4410 
4411    assert(radv_image_has_htile(image));
4412 
4413    if (aspects & VK_IMAGE_ASPECT_STENCIL_BIT) {
4414       ++reg_count;
4415    } else {
4416       ++reg_offset;
4417       va += 4;
4418    }
4419    if (aspects & VK_IMAGE_ASPECT_DEPTH_BIT)
4420       ++reg_count;
4421 
4422    uint32_t reg = R_028028_DB_STENCIL_CLEAR + 4 * reg_offset;
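   /* Worked example (sketch): for a depth-only view (e.g. VK_FORMAT_D32_SFLOAT), the stencil
    * branch above bumps reg_offset to 1 and advances va by 4, so a single dword is loaded from
    * the depth slot of the metadata into R_02802C_DB_DEPTH_CLEAR.
    */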
4423 
4424    if (pdev->info.has_load_ctx_reg_pkt) {
4425       radeon_emit(cs, PKT3(PKT3_LOAD_CONTEXT_REG_INDEX, 3, 0));
4426       radeon_emit(cs, va);
4427       radeon_emit(cs, va >> 32);
4428       radeon_emit(cs, (reg - SI_CONTEXT_REG_OFFSET) >> 2);
4429       radeon_emit(cs, reg_count);
4430    } else {
4431       radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
4432       radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) | COPY_DATA_DST_SEL(COPY_DATA_REG) |
4433                          (reg_count == 2 ? COPY_DATA_COUNT_SEL : 0));
4434       radeon_emit(cs, va);
4435       radeon_emit(cs, va >> 32);
4436       radeon_emit(cs, reg >> 2);
4437       radeon_emit(cs, 0);
4438 
4439       radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
4440       radeon_emit(cs, 0);
4441    }
4442 }
4443 
4444 /*
4445  * With DCC, some clear colors don't require a CMASK (fast clear) eliminate
4446  * before the image can be used as a texture. This sets a predicate value
4447  * that determines whether the CMASK eliminate is required.
4448  */
4449 void
4450 radv_update_fce_metadata(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
4451                          const VkImageSubresourceRange *range, bool value)
4452 {
4453    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
4454 
4455    if (!image->fce_pred_offset)
4456       return;
4457 
4458    uint64_t pred_val = value;
4459    uint64_t va = radv_image_get_fce_pred_va(image, range->baseMipLevel);
4460    uint32_t level_count = vk_image_subresource_level_count(&image->vk, range);
4461 
4462    ASSERTED unsigned cdw_end =
4463       radv_cs_write_data_head(device, cmd_buffer->cs, cmd_buffer->qf, V_370_PFP, va, 2 * level_count, false);
4464 
4465    for (uint32_t l = 0; l < level_count; l++) {
4466       radeon_emit(cmd_buffer->cs, pred_val);
4467       radeon_emit(cmd_buffer->cs, pred_val >> 32);
4468    }
4469 
4470    assert(cmd_buffer->cs->cdw == cdw_end);
4471 }
4472 
4473 /**
4474  * Update the DCC predicate to reflect the compression state.
4475  */
4476 void
4477 radv_update_dcc_metadata(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
4478                          const VkImageSubresourceRange *range, bool value)
4479 {
4480    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
4481 
4482    if (image->dcc_pred_offset == 0)
4483       return;
4484 
4485    uint64_t pred_val = value;
4486    uint64_t va = radv_image_get_dcc_pred_va(image, range->baseMipLevel);
4487    uint32_t level_count = vk_image_subresource_level_count(&image->vk, range);
4488 
4489    assert(radv_dcc_enabled(image, range->baseMipLevel));
4490 
4491    ASSERTED unsigned cdw_end =
4492       radv_cs_write_data_head(device, cmd_buffer->cs, cmd_buffer->qf, V_370_PFP, va, 2 * level_count, false);
4493 
4494    for (uint32_t l = 0; l < level_count; l++) {
4495       radeon_emit(cmd_buffer->cs, pred_val);
4496       radeon_emit(cmd_buffer->cs, pred_val >> 32);
4497    }
4498 
4499    assert(cmd_buffer->cs->cdw == cdw_end);
4500 }
4501 
4502 /**
4503  * Update the fast clear color values if the image is bound as a color buffer.
4504  */
4505 static void
4506 radv_update_bound_fast_clear_color(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image, int cb_idx,
4507                                    uint32_t color_values[2])
4508 {
4509    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
4510    struct radeon_cmdbuf *cs = cmd_buffer->cs;
4511 
4512    if (cb_idx >= cmd_buffer->state.render.color_att_count || cmd_buffer->state.render.color_att[cb_idx].iview == NULL ||
4513        cmd_buffer->state.render.color_att[cb_idx].iview->image != image)
4514       return;
4515 
4516    ASSERTED unsigned cdw_max = radeon_check_space(device->ws, cmd_buffer->cs, 4);
4517 
4518    radeon_set_context_reg_seq(cs, R_028C8C_CB_COLOR0_CLEAR_WORD0 + cb_idx * 0x3c, 2);
4519    radeon_emit(cs, color_values[0]);
4520    radeon_emit(cs, color_values[1]);
4521 
4522    assert(cmd_buffer->cs->cdw <= cdw_max);
4523 
4524    cmd_buffer->state.context_roll_without_scissor_emitted = true;
4525 }
4526 
4527 /**
4528  * Set the clear color values to the image's metadata.
4529  */
4530 static void
4531 radv_set_color_clear_metadata(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
4532                               const VkImageSubresourceRange *range, uint32_t color_values[2])
4533 {
4534    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
4535    struct radeon_cmdbuf *cs = cmd_buffer->cs;
4536    uint32_t level_count = vk_image_subresource_level_count(&image->vk, range);
4537 
4538    assert(radv_image_has_cmask(image) || radv_dcc_enabled(image, range->baseMipLevel));
4539 
4540    if (radv_image_has_clear_value(image)) {
4541       uint64_t va = radv_image_get_fast_clear_va(image, range->baseMipLevel);
4542 
4543       ASSERTED unsigned cdw_end = radv_cs_write_data_head(device, cmd_buffer->cs, cmd_buffer->qf, V_370_PFP, va,
4544                                                           2 * level_count, cmd_buffer->state.predicating);
4545 
4546       for (uint32_t l = 0; l < level_count; l++) {
4547          radeon_emit(cs, color_values[0]);
4548          radeon_emit(cs, color_values[1]);
4549       }
4550 
4551       assert(cmd_buffer->cs->cdw == cdw_end);
4552    } else {
4553       /* Without clear value metadata, only the default value (zero) can be set. */
4554       assert(color_values[0] == 0 && color_values[1] == 0);
4555    }
4556 }
4557 
4558 /**
4559  * Update the clear color values for this image.
4560  */
4561 void
4562 radv_update_color_clear_metadata(struct radv_cmd_buffer *cmd_buffer, const struct radv_image_view *iview, int cb_idx,
4563                                  uint32_t color_values[2])
4564 {
4565    struct radv_image *image = iview->image;
4566    VkImageSubresourceRange range = {
4567       .aspectMask = iview->vk.aspects,
4568       .baseMipLevel = iview->vk.base_mip_level,
4569       .levelCount = iview->vk.level_count,
4570       .baseArrayLayer = iview->vk.base_array_layer,
4571       .layerCount = iview->vk.layer_count,
4572    };
4573 
4574    assert(radv_image_has_cmask(image) || radv_dcc_enabled(image, iview->vk.base_mip_level));
4575 
4576    /* There is no need to update the clear value for images that are fast cleared with the
4577     * comp-to-single mode because the hardware reads the value from the image directly.
4578     */
4579    if (iview->image->support_comp_to_single)
4580       return;
4581 
4582    radv_set_color_clear_metadata(cmd_buffer, image, &range, color_values);
4583 
4584    radv_update_bound_fast_clear_color(cmd_buffer, image, cb_idx, color_values);
4585 }
4586 
4587 /**
4588  * Load the clear color values from the image's metadata.
4589  */
4590 static void
4591 radv_load_color_clear_metadata(struct radv_cmd_buffer *cmd_buffer, struct radv_image_view *iview, int cb_idx)
4592 {
4593    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
4594    const struct radv_physical_device *pdev = radv_device_physical(device);
4595    struct radeon_cmdbuf *cs = cmd_buffer->cs;
4596    struct radv_image *image = iview->image;
4597 
4598    if (!radv_image_has_cmask(image) && !radv_dcc_enabled(image, iview->vk.base_mip_level))
4599       return;
4600 
4601    if (iview->image->support_comp_to_single)
4602       return;
4603 
4604    if (!radv_image_has_clear_value(image)) {
4605       uint32_t color_values[2] = {0, 0};
4606       radv_update_bound_fast_clear_color(cmd_buffer, image, cb_idx, color_values);
4607       return;
4608    }
4609 
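        /* Note: per-target CB context registers are laid out 0x3c bytes apart, hence the
         * cb_idx * 0x3c stride below (the same stride used for the
         * R_028C70_CB_COLOR0_INFO + i * 0x3C writes elsewhere in this file).
         */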
4610    uint64_t va = radv_image_get_fast_clear_va(image, iview->vk.base_mip_level);
4611    uint32_t reg = R_028C8C_CB_COLOR0_CLEAR_WORD0 + cb_idx * 0x3c;
4612 
4613    if (pdev->info.has_load_ctx_reg_pkt) {
4614       radeon_emit(cs, PKT3(PKT3_LOAD_CONTEXT_REG_INDEX, 3, cmd_buffer->state.predicating));
4615       radeon_emit(cs, va);
4616       radeon_emit(cs, va >> 32);
4617       radeon_emit(cs, (reg - SI_CONTEXT_REG_OFFSET) >> 2);
4618       radeon_emit(cs, 2);
4619    } else {
4620       radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, cmd_buffer->state.predicating));
4621       radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) | COPY_DATA_DST_SEL(COPY_DATA_REG) | COPY_DATA_COUNT_SEL);
4622       radeon_emit(cs, va);
4623       radeon_emit(cs, va >> 32);
4624       radeon_emit(cs, reg >> 2);
4625       radeon_emit(cs, 0);
4626 
4627       radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, cmd_buffer->state.predicating));
4628       radeon_emit(cs, 0);
4629    }
4630 }
4631 
4632 /* GFX9+ metadata cache flushing workaround. Metadata cache coherency is
4633  * broken if the CB caches data of multiple mips of the same image at the
4634  * same time.
4635  *
4636  * Insert some flushes to avoid this.
4637  */
4638 static void
4639 radv_emit_fb_mip_change_flush(struct radv_cmd_buffer *cmd_buffer)
4640 {
4641    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
4642    const struct radv_physical_device *pdev = radv_device_physical(device);
4643    struct radv_rendering_state *render = &cmd_buffer->state.render;
4644    bool color_mip_changed = false;
4645 
4646    /* Entire workaround is not applicable before GFX9 */
4647    if (pdev->info.gfx_level < GFX9)
4648       return;
4649 
4650    for (int i = 0; i < render->color_att_count; ++i) {
4651       struct radv_image_view *iview = render->color_att[i].iview;
4652       if (!iview)
4653          continue;
4654 
4655       if ((radv_image_has_cmask(iview->image) || radv_dcc_enabled(iview->image, iview->vk.base_mip_level) ||
4656            radv_dcc_enabled(iview->image, cmd_buffer->state.cb_mip[i])) &&
4657           cmd_buffer->state.cb_mip[i] != iview->vk.base_mip_level)
4658          color_mip_changed = true;
4659 
4660       cmd_buffer->state.cb_mip[i] = iview->vk.base_mip_level;
4661    }
4662 
4663    if (color_mip_changed) {
4664       cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB | RADV_CMD_FLAG_FLUSH_AND_INV_CB_META;
4665    }
4666 
4667    const struct radv_image_view *iview = render->ds_att.iview;
4668    if (iview) {
4669       if ((radv_htile_enabled(iview->image, iview->vk.base_mip_level) ||
4670            radv_htile_enabled(iview->image, cmd_buffer->state.ds_mip)) &&
4671           cmd_buffer->state.ds_mip != iview->vk.base_mip_level) {
4672          cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB | RADV_CMD_FLAG_FLUSH_AND_INV_DB_META;
4673       }
4674 
4675       cmd_buffer->state.ds_mip = iview->vk.base_mip_level;
4676    }
4677 }
4678 
4679 /* This function does the flushes for mip changes if the levels are not zero for
4680  * all render targets. This way we can assume at the start of the next cmd_buffer
4681  * that rendering to mip 0 doesn't need any flushes. Since that is the most common
4682  * case, this saves some flushes. */
4683 static void
4684 radv_emit_mip_change_flush_default(struct radv_cmd_buffer *cmd_buffer)
4685 {
4686    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
4687    const struct radv_physical_device *pdev = radv_device_physical(device);
4688 
4689    /* Entire workaround is not applicable before GFX9 */
4690    if (pdev->info.gfx_level < GFX9)
4691       return;
4692 
4693    bool need_color_mip_flush = false;
4694    for (unsigned i = 0; i < 8; ++i) {
4695       if (cmd_buffer->state.cb_mip[i]) {
4696          need_color_mip_flush = true;
4697          break;
4698       }
4699    }
4700 
4701    if (need_color_mip_flush) {
4702       cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB | RADV_CMD_FLAG_FLUSH_AND_INV_CB_META;
4703    }
4704 
4705    if (cmd_buffer->state.ds_mip) {
4706       cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB | RADV_CMD_FLAG_FLUSH_AND_INV_DB_META;
4707    }
4708 
4709    memset(cmd_buffer->state.cb_mip, 0, sizeof(cmd_buffer->state.cb_mip));
4710    cmd_buffer->state.ds_mip = 0;
4711 }
4712 
4713 static void
4714 radv_emit_framebuffer_state(struct radv_cmd_buffer *cmd_buffer)
4715 {
4716    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
4717    const struct radv_physical_device *pdev = radv_device_physical(device);
4718    struct radv_rendering_state *render = &cmd_buffer->state.render;
4719    int i;
4720    bool disable_constant_encode_ac01 = false;
4721    unsigned color_invalid = pdev->info.gfx_level >= GFX12   ? S_028EC0_FORMAT(V_028EC0_COLOR_INVALID)
4722                             : pdev->info.gfx_level >= GFX11 ? S_028C70_FORMAT_GFX11(V_028C70_COLOR_INVALID)
4723                                                             : S_028C70_FORMAT_GFX6(V_028C70_COLOR_INVALID);
4724    VkExtent2D extent = {MAX_FRAMEBUFFER_WIDTH, MAX_FRAMEBUFFER_HEIGHT};
4725 
4726    ASSERTED unsigned cdw_max = radeon_check_space(device->ws, cmd_buffer->cs, 51 + MAX_RTS * 70);
4727 
4728    for (i = 0; i < render->color_att_count; ++i) {
4729       struct radv_image_view *iview = render->color_att[i].iview;
4730       if (!iview) {
4731          if (pdev->info.gfx_level >= GFX12) {
4732             radeon_set_context_reg(cmd_buffer->cs, R_028EC0_CB_COLOR0_INFO + i * 4, color_invalid);
4733          } else {
4734             radeon_set_context_reg(cmd_buffer->cs, R_028C70_CB_COLOR0_INFO + i * 0x3C, color_invalid);
4735          }
4736          continue;
4737       }
4738 
4739       VkImageLayout layout = render->color_att[i].layout;
4740 
4741       radv_cs_add_buffer(device->ws, cmd_buffer->cs, iview->image->bindings[0].bo);
4742 
4743       assert(iview->vk.aspects & (VK_IMAGE_ASPECT_COLOR_BIT | VK_IMAGE_ASPECT_PLANE_0_BIT |
4744                                   VK_IMAGE_ASPECT_PLANE_1_BIT | VK_IMAGE_ASPECT_PLANE_2_BIT));
4745 
4746       if (iview->image->disjoint && iview->vk.aspects == VK_IMAGE_ASPECT_COLOR_BIT) {
4747          for (uint32_t plane_id = 0; plane_id < iview->image->plane_count; plane_id++) {
4748             radv_cs_add_buffer(device->ws, cmd_buffer->cs, iview->image->bindings[plane_id].bo);
4749          }
4750       } else {
4751          uint32_t plane_id = iview->image->disjoint ? iview->plane_id : 0;
4752          radv_cs_add_buffer(device->ws, cmd_buffer->cs, iview->image->bindings[plane_id].bo);
4753       }
4754 
4755       radv_emit_fb_color_state(cmd_buffer, i, &render->color_att[i].cb, iview, layout);
4756 
4757       radv_load_color_clear_metadata(cmd_buffer, iview, i);
4758 
4759       if (pdev->info.gfx_level >= GFX9 && iview->image->dcc_sign_reinterpret) {
4760          /* Disable constant encoding with the clear value of "1" when the DCC signedness differs,
4761           * because the hardware would fill in "1" instead of the actual clear value.
4762           */
4763          disable_constant_encode_ac01 = true;
4764       }
4765 
4766       extent.width = MIN2(extent.width, iview->vk.extent.width);
4767       extent.height = MIN2(extent.height, iview->vk.extent.height);
4768    }
4769    for (; i < cmd_buffer->state.last_subpass_color_count; i++) {
4770       if (pdev->info.gfx_level >= GFX12) {
4771          radeon_set_context_reg(cmd_buffer->cs, R_028EC0_CB_COLOR0_INFO + i * 4, color_invalid);
4772       } else {
4773          radeon_set_context_reg(cmd_buffer->cs, R_028C70_CB_COLOR0_INFO + i * 0x3C, color_invalid);
4774       }
4775    }
4776    cmd_buffer->state.last_subpass_color_count = render->color_att_count;
4777 
4778    if (render->ds_att.iview) {
4779       struct radv_image_view *iview = render->ds_att.iview;
4780       const struct radv_image *image = iview->image;
4781       radv_cs_add_buffer(device->ws, cmd_buffer->cs, image->bindings[0].bo);
4782 
4783       uint32_t qf_mask = radv_image_queue_family_mask(image, cmd_buffer->qf, cmd_buffer->qf);
4784       bool depth_compressed = radv_layout_is_htile_compressed(device, image, render->ds_att.layout, qf_mask);
4785       bool stencil_compressed = radv_layout_is_htile_compressed(device, image, render->ds_att.stencil_layout, qf_mask);
4786 
4787       radv_emit_fb_ds_state(cmd_buffer, &render->ds_att.ds, iview, depth_compressed, stencil_compressed);
4788 
4789       if (depth_compressed || stencil_compressed) {
4790          /* Only load the depth/stencil fast clear values when
4791           * compressed rendering is enabled.
4792           */
4793          radv_load_ds_clear_metadata(cmd_buffer, iview);
4794       }
4795 
4796       extent.width = MIN2(extent.width, iview->vk.extent.width);
4797       extent.height = MIN2(extent.height, iview->vk.extent.height);
4798    } else if (pdev->info.gfx_level == GFX10_3 && render->vrs_att.iview && radv_cmd_buffer_get_vrs_image(cmd_buffer)) {
4799       /* When a subpass uses a VRS attachment without binding a depth/stencil attachment, we have to
4800        * bind our internal depth buffer that contains the VRS data as part of HTILE.
4801        */
4802       VkImageLayout layout = VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL;
4803       struct radv_buffer *htile_buffer = device->vrs.buffer;
4804       struct radv_image *image = device->vrs.image;
4805       struct radv_ds_buffer_info ds;
4806       struct radv_image_view iview;
4807 
4808       radv_image_view_init(&iview, device,
4809                            &(VkImageViewCreateInfo){
4810                               .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO,
4811                               .image = radv_image_to_handle(image),
4812                               .viewType = radv_meta_get_view_type(image),
4813                               .format = image->vk.format,
4814                               .subresourceRange =
4815                                  {
4816                                     .aspectMask = VK_IMAGE_ASPECT_DEPTH_BIT,
4817                                     .baseMipLevel = 0,
4818                                     .levelCount = 1,
4819                                     .baseArrayLayer = 0,
4820                                     .layerCount = 1,
4821                                  },
4822                            },
4823                            0, NULL);
4824 
4825       radv_initialise_vrs_surface(image, htile_buffer, &ds);
4826 
4827       radv_cs_add_buffer(device->ws, cmd_buffer->cs, htile_buffer->bo);
4828 
4829       bool depth_compressed = radv_layout_is_htile_compressed(
4830          device, image, layout, radv_image_queue_family_mask(image, cmd_buffer->qf, cmd_buffer->qf));
4831       radv_emit_fb_ds_state(cmd_buffer, &ds, &iview, depth_compressed, false);
4832 
4833       radv_image_view_finish(&iview);
4834    } else {
4835       radv_emit_null_ds_state(cmd_buffer);
4836    }
4837 
4838    if (pdev->info.gfx_level >= GFX11) {
4839       bool vrs_surface_enable = render->vrs_att.iview != NULL;
4840       unsigned xmax = 0, ymax = 0;
4841       uint64_t va = 0;
4842 
4843       if (vrs_surface_enable) {
4844          const struct radv_image_view *vrs_iview = render->vrs_att.iview;
4845          struct radv_image *vrs_image = vrs_iview->image;
4846 
4847          va = radv_image_get_va(vrs_image, 0);
4848          va |= vrs_image->planes[0].surface.tile_swizzle << 8;
4849 
4850          xmax = vrs_iview->vk.extent.width - 1;
4851          ymax = vrs_iview->vk.extent.height - 1;
4852       }
4853 
4854       radeon_set_context_reg_seq(cmd_buffer->cs, R_0283F0_PA_SC_VRS_RATE_BASE, 3);
4855       radeon_emit(cmd_buffer->cs, va >> 8);
4856       radeon_emit(cmd_buffer->cs, S_0283F4_BASE_256B(va >> 40));
4857       radeon_emit(cmd_buffer->cs, S_0283F8_X_MAX(xmax) | S_0283F8_Y_MAX(ymax));
4858 
4859       radeon_set_context_reg(cmd_buffer->cs, R_0283D0_PA_SC_VRS_OVERRIDE_CNTL,
4860                              S_0283D0_VRS_SURFACE_ENABLE(vrs_surface_enable));
4861    }
4862 
4863    if (pdev->info.gfx_level >= GFX8 && pdev->info.gfx_level < GFX12) {
4864       bool disable_constant_encode = pdev->info.has_dcc_constant_encode;
4865       enum amd_gfx_level gfx_level = pdev->info.gfx_level;
4866 
4867       if (pdev->info.gfx_level >= GFX11) {
4868          const bool has_dedicated_vram = pdev->info.has_dedicated_vram;
4869 
4870          radeon_set_context_reg(cmd_buffer->cs, R_028424_CB_FDCC_CONTROL,
4871                                 S_028424_SAMPLE_MASK_TRACKER_WATERMARK(has_dedicated_vram ? 0 : 15));
4872       } else {
4873          uint8_t watermark = gfx_level >= GFX10 ? 6 : 4;
4874 
4875          radeon_set_context_reg(cmd_buffer->cs, R_028424_CB_DCC_CONTROL,
4876                                 S_028424_OVERWRITE_COMBINER_MRT_SHARING_DISABLE(gfx_level <= GFX9) |
4877                                    S_028424_OVERWRITE_COMBINER_WATERMARK(watermark) |
4878                                    S_028424_DISABLE_CONSTANT_ENCODE_AC01(disable_constant_encode_ac01) |
4879                                    S_028424_DISABLE_CONSTANT_ENCODE_REG(disable_constant_encode));
4880       }
4881    }
4882 
4883    if (pdev->info.gfx_level >= GFX12) {
4884       radeon_set_context_reg(cmd_buffer->cs, R_028184_PA_SC_SCREEN_SCISSOR_BR,
4885                              S_028034_BR_X(extent.width) | S_028034_BR_Y(extent.height));
4886    } else {
4887       radeon_set_context_reg(cmd_buffer->cs, R_028034_PA_SC_SCREEN_SCISSOR_BR,
4888                              S_028034_BR_X(extent.width) | S_028034_BR_Y(extent.height));
4889    }
4890 
4891    assert(cmd_buffer->cs->cdw <= cdw_max);
4892 
4893    cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_FRAMEBUFFER;
4894 }
4895 
4896 static void
4897 radv_emit_guardband_state(struct radv_cmd_buffer *cmd_buffer)
4898 {
4899    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
4900    const struct radv_physical_device *pdev = radv_device_physical(device);
4901    const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
4902    unsigned rast_prim = radv_get_rasterization_prim(cmd_buffer);
4903    const bool draw_points = radv_rast_prim_is_point(rast_prim) || radv_polygon_mode_is_point(d->vk.rs.polygon_mode);
4904    const bool draw_lines = radv_rast_prim_is_line(rast_prim) || radv_polygon_mode_is_line(d->vk.rs.polygon_mode);
4905    struct radeon_cmdbuf *cs = cmd_buffer->cs;
4906    int i;
4907    float scale[3], translate[3], guardband_x = INFINITY, guardband_y = INFINITY;
4908    float discard_x = 1.0f, discard_y = 1.0f;
4909    const float max_range = 32767.0f;
4910 
4911    if (!d->vk.vp.viewport_count)
4912       return;
4913 
4914    for (i = 0; i < d->vk.vp.viewport_count; i++) {
4915       radv_get_viewport_xform(d->vk.vp.viewports + i, scale, translate);
4916       scale[0] = fabsf(scale[0]);
4917       scale[1] = fabsf(scale[1]);
4918 
4919       if (scale[0] < 0.5)
4920          scale[0] = 0.5;
4921       if (scale[1] < 0.5)
4922          scale[1] = 0.5;
4923 
4924       guardband_x = MIN2(guardband_x, (max_range - fabsf(translate[0])) / scale[0]);
4925       guardband_y = MIN2(guardband_y, (max_range - fabsf(translate[1])) / scale[1]);
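           /* Illustrative arithmetic: for a full-screen 1920x1080 viewport
            * (scale = 960x540, translate = 960x540) this gives
            * guardband_x = (32767 - 960) / 960 ~= 33.1, i.e. geometry may extend to
            * roughly +/-33 in NDC before it has to be clipped.
            */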
4926 
4927       if (draw_points || draw_lines) {
4928          /* When rendering wide points or lines, we need to be more conservative about when to
4929           * discard them entirely. */
4930          float pixels;
4931 
4932          if (draw_points) {
4933             pixels = 8191.875f;
4934          } else {
4935             pixels = d->vk.rs.line.width;
4936          }
4937 
4938          /* Add half the point size / line width. */
4939          discard_x += pixels / (2.0 * scale[0]);
4940          discard_y += pixels / (2.0 * scale[1]);
4941 
4942          /* Discard primitives that would lie entirely outside the clip region. */
4943          discard_x = MIN2(discard_x, guardband_x);
4944          discard_y = MIN2(discard_y, guardband_y);
4945       }
4946    }
4947 
4948    if (pdev->info.gfx_level >= GFX12) {
4949       radeon_set_context_reg_seq(cs, R_02842C_PA_CL_GB_VERT_CLIP_ADJ, 4);
4950    } else {
4951       radeon_set_context_reg_seq(cs, R_028BE8_PA_CL_GB_VERT_CLIP_ADJ, 4);
4952    }
4953    radeon_emit(cs, fui(guardband_y));
4954    radeon_emit(cs, fui(discard_y));
4955    radeon_emit(cs, fui(guardband_x));
4956    radeon_emit(cs, fui(discard_x));
4957 
4958    cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_GUARDBAND;
4959 }
4960 
4961 /* Bind an internal index buffer for GPUs that hang with 0-sized index buffers, to handle robustness2
4962  * which requires out-of-bounds accesses to return 0.
4963  */
4964 static void
4965 radv_handle_zero_index_buffer_bug(struct radv_cmd_buffer *cmd_buffer, uint64_t *index_va, uint32_t *remaining_indexes)
4966 {
4967    const uint32_t zero = 0;
4968    uint32_t offset;
4969 
4970    if (!radv_cmd_buffer_upload_data(cmd_buffer, sizeof(uint32_t), &zero, &offset)) {
4971       vk_command_buffer_set_error(&cmd_buffer->vk, VK_ERROR_OUT_OF_HOST_MEMORY);
4972       return;
4973    }
4974 
4975    *index_va = radv_buffer_get_va(cmd_buffer->upload.upload_bo) + offset;
4976    *remaining_indexes = 1;
4977 }
4978 
4979 static void
4980 radv_emit_index_buffer(struct radv_cmd_buffer *cmd_buffer)
4981 {
4982    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
4983    const struct radv_physical_device *pdev = radv_device_physical(device);
4984    struct radeon_cmdbuf *cs = cmd_buffer->cs;
4985    struct radv_cmd_state *state = &cmd_buffer->state;
4986    uint32_t max_index_count = state->max_index_count;
4987    uint64_t index_va = state->index_va;
4988 
4989    /* With device generated commands the index buffer bind may be part of the
4990     * indirect command buffer, in which case the app may not have bound one yet. */
4991    if (state->index_type < 0)
4992       return;
4993 
4994    /* Handle indirect draw calls with NULL index buffer if the GPU doesn't support them. */
4995    if (!max_index_count && pdev->info.has_zero_index_buffer_bug) {
4996       radv_handle_zero_index_buffer_bug(cmd_buffer, &index_va, &max_index_count);
4997    }
4998 
4999    radeon_emit(cs, PKT3(PKT3_INDEX_BASE, 1, 0));
5000    radeon_emit(cs, index_va);
5001    radeon_emit(cs, index_va >> 32);
5002 
5003    radeon_emit(cs, PKT3(PKT3_INDEX_BUFFER_SIZE, 0, 0));
5004    radeon_emit(cs, max_index_count);
5005 
5006    cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_INDEX_BUFFER;
5007 }
5008 
5009 static void
5010 radv_flush_occlusion_query_state(struct radv_cmd_buffer *cmd_buffer)
5011 {
5012    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
5013    const struct radv_physical_device *pdev = radv_device_physical(device);
5014    const enum amd_gfx_level gfx_level = pdev->info.gfx_level;
5015    const bool enable_occlusion_queries =
5016       cmd_buffer->state.active_occlusion_queries || cmd_buffer->state.inherited_occlusion_queries;
5017    uint32_t db_count_control;
5018 
5019    if (!enable_occlusion_queries) {
5020       db_count_control = S_028004_ZPASS_INCREMENT_DISABLE(gfx_level < GFX11);
5021    } else {
5022       bool gfx10_perfect =
5023          gfx_level >= GFX10 && (cmd_buffer->state.perfect_occlusion_queries_enabled ||
5024                                 cmd_buffer->state.inherited_query_control_flags & VK_QUERY_CONTROL_PRECISE_BIT);
5025 
5026       if (gfx_level >= GFX7) {
5027          /* Always enable PERFECT_ZPASS_COUNTS due to issues with partially
5028           * covered tiles, discards, and early depth testing. For more details,
5029           * see https://gitlab.freedesktop.org/mesa/mesa/-/issues/3218 */
5030          db_count_control = S_028004_PERFECT_ZPASS_COUNTS(1) |
5031                             S_028004_DISABLE_CONSERVATIVE_ZPASS_COUNTS(gfx10_perfect) | S_028004_ZPASS_ENABLE(1) |
5032                             S_028004_SLICE_EVEN_ENABLE(1) | S_028004_SLICE_ODD_ENABLE(1);
5033       } else {
5034          db_count_control = S_028004_PERFECT_ZPASS_COUNTS(1);
5035       }
5036 
5037       if (gfx_level < GFX12) {
5038          const uint32_t rasterization_samples = radv_get_rasterization_samples(cmd_buffer);
5039          const uint32_t sample_rate = util_logbase2(rasterization_samples);
5040 
5041          db_count_control |= S_028004_SAMPLE_RATE(sample_rate);
5042       }
5043    }
5044 
5045    if (pdev->info.gfx_level >= GFX12) {
5046       radeon_opt_set_context_reg(cmd_buffer, R_028060_DB_COUNT_CONTROL, RADV_TRACKED_DB_COUNT_CONTROL,
5047                                  db_count_control);
5048    } else {
5049       radeon_opt_set_context_reg(cmd_buffer, R_028004_DB_COUNT_CONTROL, RADV_TRACKED_DB_COUNT_CONTROL,
5050                                  db_count_control);
5051    }
5052 
5053    cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_OCCLUSION_QUERY;
5054 }
5055 
5056 unsigned
5057 radv_instance_rate_prolog_index(unsigned num_attributes, uint32_t instance_rate_inputs)
5058 {
5059    /* instance_rate_vs_prologs is a flattened array of arrays of arrays of different sizes, i.e. a
5060     * single array sorted in ascending order by:
5061     * - total number of attributes
5062     * - number of instanced attributes
5063     * - index of first instanced attribute
5064     */
5065 
5066    /* From total number of attributes to offset. */
5067    static const uint16_t total_to_offset[16] = {0, 1, 4, 10, 20, 35, 56, 84, 120, 165, 220, 286, 364, 455, 560, 680};
5068    unsigned start_index = total_to_offset[num_attributes - 1];
5069 
5070    /* From number of instanced attributes to offset. This would require a different LUT depending on
5071     * the total number of attributes, but we can exploit a pattern to use just the LUT for 16 total
5072     * attributes.
5073     */
5074    static const uint8_t count_to_offset_total16[16] = {0,   16,  31,  45,  58,  70,  81,  91,
5075                                                        100, 108, 115, 121, 126, 130, 133, 135};
5076    unsigned count = util_bitcount(instance_rate_inputs);
5077    unsigned offset_from_start_index = count_to_offset_total16[count - 1] - ((16 - num_attributes) * (count - 1));
5078 
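        /* Worked example (illustrative): num_attributes = 4 and instance_rate_inputs = 0x6
         * (attributes 1 and 2 instanced) give start_index = 10, count = 2,
         * offset_from_start_index = 16 - 12 = 4 and first = 1, i.e. prolog index 15.
         */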
5079    unsigned first = ffs(instance_rate_inputs) - 1;
5080    return start_index + offset_from_start_index + first;
5081 }
5082 
5083 static struct radv_shader_part *
5084 lookup_vs_prolog(struct radv_cmd_buffer *cmd_buffer, const struct radv_shader *vs_shader, uint32_t *nontrivial_divisors)
5085 {
5086    assert(vs_shader->info.vs.dynamic_inputs);
5087 
5088    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
5089    const struct radv_physical_device *pdev = radv_device_physical(device);
5090    const struct radv_vertex_input_state *vi_state = &cmd_buffer->state.vertex_input;
5091 
5092    unsigned num_attributes = util_last_bit(vs_shader->info.vs.vb_desc_usage_mask);
5093    uint32_t attribute_mask = BITFIELD_MASK(num_attributes);
5094 
5095    uint32_t instance_rate_inputs = vi_state->instance_rate_inputs & attribute_mask;
5096    uint32_t zero_divisors = vi_state->zero_divisors & attribute_mask;
5097    *nontrivial_divisors = vi_state->nontrivial_divisors & attribute_mask;
5098    uint32_t misaligned_mask = cmd_buffer->state.vbo_misaligned_mask;
5099    uint32_t unaligned_mask = cmd_buffer->state.vbo_unaligned_mask;
5100    if (cmd_buffer->state.vbo_misaligned_mask_invalid) {
5101       bool misalignment_possible = pdev->info.gfx_level == GFX6 || pdev->info.gfx_level >= GFX10;
5102       u_foreach_bit (index, cmd_buffer->state.vbo_misaligned_mask_invalid & attribute_mask) {
5103          uint8_t binding = vi_state->bindings[index];
5104          if (!(cmd_buffer->state.vbo_bound_mask & BITFIELD_BIT(binding)))
5105             continue;
5106 
5107          uint8_t format_req = vi_state->format_align_req_minus_1[index];
5108          uint8_t component_req = vi_state->component_align_req_minus_1[index];
5109          uint64_t vb_offset = cmd_buffer->vertex_bindings[binding].offset;
5110          uint64_t vb_stride = cmd_buffer->vertex_bindings[binding].stride;
5111 
5112          VkDeviceSize offset = vb_offset + vi_state->offsets[index];
5113 
5114          if (misalignment_possible && ((offset | vb_stride) & format_req))
5115             misaligned_mask |= BITFIELD_BIT(index);
5116          if ((offset | vb_stride) & component_req)
5117             unaligned_mask |= BITFIELD_BIT(index);
5118       }
5119       cmd_buffer->state.vbo_misaligned_mask = misaligned_mask;
5120       cmd_buffer->state.vbo_unaligned_mask = unaligned_mask;
5121       cmd_buffer->state.vbo_misaligned_mask_invalid &= ~attribute_mask;
5122    }
5123    misaligned_mask |= vi_state->nontrivial_formats | unaligned_mask;
5124    misaligned_mask &= attribute_mask;
5125    unaligned_mask &= attribute_mask;
5126 
5127    const bool can_use_simple_input =
5128       cmd_buffer->state.shaders[MESA_SHADER_VERTEX] &&
5129       !cmd_buffer->state.shaders[MESA_SHADER_VERTEX]->info.merged_shader_compiled_separately &&
5130       cmd_buffer->state.shaders[MESA_SHADER_VERTEX]->info.is_ngg == pdev->use_ngg &&
5131       cmd_buffer->state.shaders[MESA_SHADER_VERTEX]->info.wave_size == pdev->ge_wave_size;
5132 
5133    /* The instance ID input VGPR is placed differently when as_ls=true. as_ls is also needed to
5134     * work around the LS VGPR initialization bug.
5135     */
5136    bool as_ls = vs_shader->info.vs.as_ls && (instance_rate_inputs || pdev->info.has_ls_vgpr_init_bug);
5137 
5138    /* try to use a pre-compiled prolog first */
5139    struct radv_shader_part *prolog = NULL;
5140    if (can_use_simple_input && !as_ls && !misaligned_mask && !vi_state->alpha_adjust_lo && !vi_state->alpha_adjust_hi) {
5141       if (!instance_rate_inputs) {
5142          prolog = device->simple_vs_prologs[num_attributes - 1];
5143       } else if (num_attributes <= 16 && !*nontrivial_divisors && !zero_divisors &&
5144                  util_bitcount(instance_rate_inputs) ==
5145                     (util_last_bit(instance_rate_inputs) - ffs(instance_rate_inputs) + 1)) {
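              /* This bitcount check means the instanced attributes form one contiguous
               * range, which is the only layout the pre-compiled prologs cover.
               */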
5146          unsigned index = radv_instance_rate_prolog_index(num_attributes, instance_rate_inputs);
5147          prolog = device->instance_rate_vs_prologs[index];
5148       }
5149    }
5150    if (prolog)
5151       return prolog;
5152 
5153    struct radv_vs_prolog_key key;
5154    memset(&key, 0, sizeof(key));
5155    key.instance_rate_inputs = instance_rate_inputs;
5156    key.nontrivial_divisors = *nontrivial_divisors;
5157    key.zero_divisors = zero_divisors;
5158    /* If the attribute is aligned, post shuffle is implemented using DST_SEL instead. */
5159    key.post_shuffle = vi_state->post_shuffle & misaligned_mask;
5160    key.alpha_adjust_hi = vi_state->alpha_adjust_hi & attribute_mask & ~unaligned_mask;
5161    key.alpha_adjust_lo = vi_state->alpha_adjust_lo & attribute_mask & ~unaligned_mask;
5162    u_foreach_bit (index, misaligned_mask)
5163       key.formats[index] = vi_state->formats[index];
5164    key.num_attributes = num_attributes;
5165    key.misaligned_mask = misaligned_mask;
5166    key.unaligned_mask = unaligned_mask;
5167    key.as_ls = as_ls;
5168    key.is_ngg = vs_shader->info.is_ngg;
5169    key.wave32 = vs_shader->info.wave_size == 32;
5170 
5171    if (vs_shader->info.merged_shader_compiled_separately) {
5172       assert(vs_shader->info.next_stage == MESA_SHADER_TESS_CTRL || vs_shader->info.next_stage == MESA_SHADER_GEOMETRY);
5173       key.next_stage = vs_shader->info.next_stage;
5174    } else {
5175       key.next_stage = vs_shader->info.stage;
5176    }
5177 
5178    return radv_shader_part_cache_get(device, &device->vs_prologs, &cmd_buffer->vs_prologs, &key);
5179 }
5180 
5181 static void
5182 emit_prolog_regs(struct radv_cmd_buffer *cmd_buffer, const struct radv_shader *vs_shader,
5183                  const struct radv_shader_part *prolog)
5184 {
5185    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
5186    const struct radv_physical_device *pdev = radv_device_physical(device);
5187    uint32_t rsrc1, rsrc2;
5188 
5189    /* no need to re-emit anything in this case */
5190    if (cmd_buffer->state.emitted_vs_prolog == prolog)
5191       return;
5192 
5193    enum amd_gfx_level chip = pdev->info.gfx_level;
5194 
5195    assert(cmd_buffer->state.emitted_graphics_pipeline == cmd_buffer->state.graphics_pipeline);
5196 
5197    if (vs_shader->info.merged_shader_compiled_separately) {
5198       if (vs_shader->info.next_stage == MESA_SHADER_GEOMETRY) {
5199          radv_shader_combine_cfg_vs_gs(vs_shader, cmd_buffer->state.shaders[MESA_SHADER_GEOMETRY], &rsrc1, &rsrc2);
5200       } else {
5201          assert(vs_shader->info.next_stage == MESA_SHADER_TESS_CTRL);
5202 
5203          radv_shader_combine_cfg_vs_tcs(vs_shader, cmd_buffer->state.shaders[MESA_SHADER_TESS_CTRL], &rsrc1, &rsrc2);
5204       }
5205    } else {
5206       rsrc1 = vs_shader->config.rsrc1;
5207    }
5208 
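        /* Take the larger of the prolog's and the main shader's register counts so the
         * combined program gets a large enough VGPR (and, before GFX10, SGPR) allocation;
         * the SGPR field is only honored on older chips, hence the gfx_level check.
         */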
5209    if (chip < GFX10 && G_00B228_SGPRS(prolog->rsrc1) > G_00B228_SGPRS(rsrc1))
5210       rsrc1 = (rsrc1 & C_00B228_SGPRS) | (prolog->rsrc1 & ~C_00B228_SGPRS);
5211 
5212    if (G_00B848_VGPRS(prolog->rsrc1) > G_00B848_VGPRS(rsrc1))
5213       rsrc1 = (rsrc1 & C_00B848_VGPRS) | (prolog->rsrc1 & ~C_00B848_VGPRS);
5214 
5215    radeon_set_sh_reg(cmd_buffer->cs, vs_shader->info.regs.pgm_lo, prolog->va >> 8);
5216    radeon_set_sh_reg(cmd_buffer->cs, vs_shader->info.regs.pgm_rsrc1, rsrc1);
5217 
5218    if (vs_shader->info.merged_shader_compiled_separately) {
5219       if (vs_shader->info.next_stage == MESA_SHADER_GEOMETRY) {
5220          const struct radv_shader *gs = cmd_buffer->state.shaders[MESA_SHADER_GEOMETRY];
5221          unsigned lds_size;
5222 
5223          if (gs->info.is_ngg) {
5224             lds_size = DIV_ROUND_UP(gs->info.ngg_info.lds_size, pdev->info.lds_encode_granularity);
5225          } else {
5226             lds_size = gs->info.gs_ring_info.lds_size;
5227          }
5228 
5229          radeon_set_sh_reg(cmd_buffer->cs, vs_shader->info.regs.pgm_rsrc2, rsrc2 | S_00B22C_LDS_SIZE(lds_size));
5230       } else {
5231          radeon_set_sh_reg(cmd_buffer->cs, vs_shader->info.regs.pgm_rsrc2, rsrc2);
5232       }
5233    }
5234 
5235    radv_cs_add_buffer(device->ws, cmd_buffer->cs, prolog->bo);
5236 }
5237 
5238 static void
5239 emit_prolog_inputs(struct radv_cmd_buffer *cmd_buffer, const struct radv_shader *vs_shader,
5240                    uint32_t nontrivial_divisors)
5241 {
5242    /* no need to re-emit anything in this case */
5243    if (!nontrivial_divisors && cmd_buffer->state.emitted_vs_prolog &&
5244        !cmd_buffer->state.emitted_vs_prolog->nontrivial_divisors)
5245       return;
5246 
5247    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
5248    const struct radv_vertex_input_state *vi_state = &cmd_buffer->state.vertex_input;
5249    uint64_t input_va = radv_shader_get_va(vs_shader);
5250 
5251    if (nontrivial_divisors) {
5252       unsigned inputs_offset;
5253       uint32_t *inputs;
5254       unsigned size = 8 + util_bitcount(nontrivial_divisors) * 8;
5255       if (!radv_cmd_buffer_upload_alloc(cmd_buffer, size, &inputs_offset, (void **)&inputs))
5256          return;
5257 
5258       *(inputs++) = input_va;
5259       *(inputs++) = input_va >> 32;
5260 
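           /* Each nontrivial divisor takes two dwords: log2(div) | (1 << 8) and ~0u for
            * power-of-two divisors (e.g. div = 4 stores 0x102 and 0xffffffff), otherwise
            * the packed pre/post shifts + increment and the multiplier from
            * util_compute_fast_udiv_info().
            */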
5261       u_foreach_bit (index, nontrivial_divisors) {
5262          uint32_t div = vi_state->divisors[index];
5263          if (div == 0) {
5264             *(inputs++) = 0;
5265             *(inputs++) = 1;
5266          } else if (util_is_power_of_two_or_zero(div)) {
5267             *(inputs++) = util_logbase2(div) | (1 << 8);
5268             *(inputs++) = 0xffffffffu;
5269          } else {
5270             struct util_fast_udiv_info info = util_compute_fast_udiv_info(div, 32, 32);
5271             *(inputs++) = info.pre_shift | (info.increment << 8) | (info.post_shift << 16);
5272             *(inputs++) = info.multiplier;
5273          }
5274       }
5275 
5276       input_va = radv_buffer_get_va(cmd_buffer->upload.upload_bo) + inputs_offset;
5277    }
5278 
5279    const uint32_t vs_prolog_inputs_offset = radv_get_user_sgpr_loc(vs_shader, AC_UD_VS_PROLOG_INPUTS);
5280    radv_emit_shader_pointer(device, cmd_buffer->cs, vs_prolog_inputs_offset, input_va, true);
5281 }
5282 
5283 static void
5284 radv_emit_vertex_input(struct radv_cmd_buffer *cmd_buffer)
5285 {
5286    const struct radv_shader *vs_shader = radv_get_shader(cmd_buffer->state.shaders, MESA_SHADER_VERTEX);
5287    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
5288 
5289    assert(!cmd_buffer->state.mesh_shading);
5290 
5291    if (!vs_shader->info.vs.has_prolog)
5292       return;
5293 
5294    uint32_t nontrivial_divisors;
5295    struct radv_shader_part *prolog = lookup_vs_prolog(cmd_buffer, vs_shader, &nontrivial_divisors);
5296    if (!prolog) {
5297       vk_command_buffer_set_error(&cmd_buffer->vk, VK_ERROR_OUT_OF_HOST_MEMORY);
5298       return;
5299    }
5300    emit_prolog_regs(cmd_buffer, vs_shader, prolog);
5301    emit_prolog_inputs(cmd_buffer, vs_shader, nontrivial_divisors);
5302 
5303    cmd_buffer->shader_upload_seq = MAX2(cmd_buffer->shader_upload_seq, prolog->upload_seq);
5304 
5305    cmd_buffer->state.emitted_vs_prolog = prolog;
5306 
5307    if (radv_device_fault_detection_enabled(device))
5308       radv_save_vs_prolog(cmd_buffer, prolog);
5309 }
5310 
5311 static void
5312 radv_emit_tess_domain_origin(struct radv_cmd_buffer *cmd_buffer)
5313 {
5314    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
5315    const struct radv_physical_device *pdev = radv_device_physical(device);
5316    const struct radv_shader *tes = radv_get_shader(cmd_buffer->state.shaders, MESA_SHADER_TESS_EVAL);
5317    const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
5318    unsigned type = 0, partitioning = 0;
5319    unsigned topology;
5320 
5321    switch (tes->info.tes._primitive_mode) {
5322    case TESS_PRIMITIVE_TRIANGLES:
5323       type = V_028B6C_TESS_TRIANGLE;
5324       break;
5325    case TESS_PRIMITIVE_QUADS:
5326       type = V_028B6C_TESS_QUAD;
5327       break;
5328    case TESS_PRIMITIVE_ISOLINES:
5329       type = V_028B6C_TESS_ISOLINE;
5330       break;
5331    default:
5332       unreachable("Invalid tess primitive type");
5333    }
5334 
5335    switch (tes->info.tes.spacing) {
5336    case TESS_SPACING_EQUAL:
5337       partitioning = V_028B6C_PART_INTEGER;
5338       break;
5339    case TESS_SPACING_FRACTIONAL_ODD:
5340       partitioning = V_028B6C_PART_FRAC_ODD;
5341       break;
5342    case TESS_SPACING_FRACTIONAL_EVEN:
5343       partitioning = V_028B6C_PART_FRAC_EVEN;
5344       break;
5345    default:
5346       unreachable("Invalid tess spacing type");
5347    }
5348 
5349    if (tes->info.tes.point_mode) {
5350       topology = V_028B6C_OUTPUT_POINT;
5351    } else if (tes->info.tes._primitive_mode == TESS_PRIMITIVE_ISOLINES) {
5352       topology = V_028B6C_OUTPUT_LINE;
5353    } else {
5354       bool ccw = tes->info.tes.ccw;
5355 
5356       if (d->vk.ts.domain_origin != VK_TESSELLATION_DOMAIN_ORIGIN_UPPER_LEFT) {
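           /* A non-upper-left domain origin mirrors the tessellation domain vertically,
            * which flips the triangle winding, hence the ccw toggle below.
            */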
5357          ccw = !ccw;
5358       }
5359 
5360       topology = ccw ? V_028B6C_OUTPUT_TRIANGLE_CCW : V_028B6C_OUTPUT_TRIANGLE_CW;
5361    }
5362 
5363    uint32_t vgt_tf_param = S_028B6C_TYPE(type) | S_028B6C_PARTITIONING(partitioning) | S_028B6C_TOPOLOGY(topology) |
5364                            S_028B6C_DISTRIBUTION_MODE(pdev->tess_distribution_mode);
5365 
5366    if (pdev->info.gfx_level >= GFX12) {
5367       vgt_tf_param |= S_028AA4_TEMPORAL(gfx12_load_last_use_discard);
5368 
5369       radeon_set_context_reg(cmd_buffer->cs, R_028AA4_VGT_TF_PARAM, vgt_tf_param);
5370    } else {
5371       radeon_set_context_reg(cmd_buffer->cs, R_028B6C_VGT_TF_PARAM, vgt_tf_param);
5372    }
5373 }
5374 
5375 static void
5376 radv_emit_alpha_to_coverage_enable(struct radv_cmd_buffer *cmd_buffer)
5377 {
5378    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
5379    const struct radv_physical_device *pdev = radv_device_physical(device);
5380    const struct radv_instance *instance = radv_physical_device_instance(pdev);
5381    const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
5382    unsigned db_alpha_to_mask = 0;
5383 
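        /* The per-pixel ALPHA_TO_MASK_OFFSETs dither the alpha-to-coverage threshold
         * across the quad; RADV_DEBUG_NO_ATOC_DITHERING selects identical offsets to
         * disable that dithering.
         */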
5384    if (instance->debug_flags & RADV_DEBUG_NO_ATOC_DITHERING) {
5385       db_alpha_to_mask = S_028B70_ALPHA_TO_MASK_OFFSET0(2) | S_028B70_ALPHA_TO_MASK_OFFSET1(2) |
5386                          S_028B70_ALPHA_TO_MASK_OFFSET2(2) | S_028B70_ALPHA_TO_MASK_OFFSET3(2) |
5387                          S_028B70_OFFSET_ROUND(0);
5388    } else {
5389       db_alpha_to_mask = S_028B70_ALPHA_TO_MASK_OFFSET0(3) | S_028B70_ALPHA_TO_MASK_OFFSET1(1) |
5390                          S_028B70_ALPHA_TO_MASK_OFFSET2(0) | S_028B70_ALPHA_TO_MASK_OFFSET3(2) |
5391                          S_028B70_OFFSET_ROUND(1);
5392    }
5393 
5394    db_alpha_to_mask |= S_028B70_ALPHA_TO_MASK_ENABLE(d->vk.ms.alpha_to_coverage_enable);
5395 
5396    if (pdev->info.gfx_level >= GFX12) {
5397       radeon_set_context_reg(cmd_buffer->cs, R_02807C_DB_ALPHA_TO_MASK, db_alpha_to_mask);
5398    } else {
5399       radeon_set_context_reg(cmd_buffer->cs, R_028B70_DB_ALPHA_TO_MASK, db_alpha_to_mask);
5400    }
5401 }
5402 
5403 static void
5404 radv_emit_sample_mask(struct radv_cmd_buffer *cmd_buffer)
5405 {
5406    const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
5407 
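        /* Each PA_SC_AA_MASK register covers two pixels of the quad, 16 bits per pixel,
         * so the same sample mask is replicated into both halves of both registers.
         */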
5408    radeon_set_context_reg_seq(cmd_buffer->cs, R_028C38_PA_SC_AA_MASK_X0Y0_X1Y0, 2);
5409    radeon_emit(cmd_buffer->cs, d->vk.ms.sample_mask | ((uint32_t)d->vk.ms.sample_mask << 16));
5410    radeon_emit(cmd_buffer->cs, d->vk.ms.sample_mask | ((uint32_t)d->vk.ms.sample_mask << 16));
5411 }
5412 
5413 static void
5414 radv_emit_color_blend(struct radv_cmd_buffer *cmd_buffer)
5415 {
5416    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
5417    const struct radv_physical_device *pdev = radv_device_physical(device);
5418    const enum amd_gfx_level gfx_level = pdev->info.gfx_level;
5419    const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
5420    unsigned cb_blend_control[MAX_RTS], sx_mrt_blend_opt[MAX_RTS];
5421    bool mrt0_is_dual_src = radv_is_mrt0_dual_src(cmd_buffer);
5422 
5423    for (unsigned i = 0; i < MAX_RTS; i++) {
5424       VkBlendOp eqRGB = d->vk.cb.attachments[i].color_blend_op;
5425       VkBlendFactor srcRGB = d->vk.cb.attachments[i].src_color_blend_factor;
5426       VkBlendFactor dstRGB = d->vk.cb.attachments[i].dst_color_blend_factor;
5427       VkBlendOp eqA = d->vk.cb.attachments[i].alpha_blend_op;
5428       VkBlendFactor srcA = d->vk.cb.attachments[i].src_alpha_blend_factor;
5429       VkBlendFactor dstA = d->vk.cb.attachments[i].dst_alpha_blend_factor;
5430       unsigned srcRGB_opt, dstRGB_opt, srcA_opt, dstA_opt;
5431       unsigned blend_cntl = 0;
5432 
5433       cb_blend_control[i] = sx_mrt_blend_opt[i] = 0;
5434 
5435       /* Ignore other blend targets if dual-source blending is enabled to prevent wrong behaviour.
5436        */
5437       if (i > 0 && mrt0_is_dual_src)
5438          continue;
5439 
5440       if (!d->vk.cb.attachments[i].blend_enable) {
5441          sx_mrt_blend_opt[i] |= S_028760_COLOR_COMB_FCN(V_028760_OPT_COMB_BLEND_DISABLED) |
5442                                 S_028760_ALPHA_COMB_FCN(V_028760_OPT_COMB_BLEND_DISABLED);
5443          continue;
5444       }
5445 
5446       radv_normalize_blend_factor(eqRGB, &srcRGB, &dstRGB);
5447       radv_normalize_blend_factor(eqA, &srcA, &dstA);
5448 
5449       /* Blending optimizations for RB+.
5450        * These transformations don't change the behavior.
5451        *
5452        * First, get rid of DST in the blend factors:
5453        *    func(src * DST, dst * 0) ---> func(src * 0, dst * SRC)
5454        */
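           /* For example, ADD(src * DST_COLOR, dst * ZERO) and ADD(src * ZERO, dst * SRC_COLOR)
            * both evaluate to src * dst, so moving the DST factor to the destination side
            * preserves the result while removing the DST dependency from the source factor.
            */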
5455       radv_blend_remove_dst(&eqRGB, &srcRGB, &dstRGB, VK_BLEND_FACTOR_DST_COLOR, VK_BLEND_FACTOR_SRC_COLOR);
5456 
5457       radv_blend_remove_dst(&eqA, &srcA, &dstA, VK_BLEND_FACTOR_DST_COLOR, VK_BLEND_FACTOR_SRC_COLOR);
5458 
5459       radv_blend_remove_dst(&eqA, &srcA, &dstA, VK_BLEND_FACTOR_DST_ALPHA, VK_BLEND_FACTOR_SRC_ALPHA);
5460 
5461       /* Look up the ideal settings from tables. */
5462       srcRGB_opt = radv_translate_blend_opt_factor(srcRGB, false);
5463       dstRGB_opt = radv_translate_blend_opt_factor(dstRGB, false);
5464       srcA_opt = radv_translate_blend_opt_factor(srcA, true);
5465       dstA_opt = radv_translate_blend_opt_factor(dstA, true);
5466 
5467       /* Handle interdependencies. */
5468       if (radv_blend_factor_uses_dst(srcRGB))
5469          dstRGB_opt = V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_NONE;
5470       if (radv_blend_factor_uses_dst(srcA))
5471          dstA_opt = V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_NONE;
5472 
5473       if (srcRGB == VK_BLEND_FACTOR_SRC_ALPHA_SATURATE &&
5474           (dstRGB == VK_BLEND_FACTOR_ZERO || dstRGB == VK_BLEND_FACTOR_SRC_ALPHA ||
5475            dstRGB == VK_BLEND_FACTOR_SRC_ALPHA_SATURATE))
5476          dstRGB_opt = V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_A0;
5477 
5478       /* Set the final value. */
5479       sx_mrt_blend_opt[i] = S_028760_COLOR_SRC_OPT(srcRGB_opt) | S_028760_COLOR_DST_OPT(dstRGB_opt) |
5480                             S_028760_COLOR_COMB_FCN(radv_translate_blend_opt_function(eqRGB)) |
5481                             S_028760_ALPHA_SRC_OPT(srcA_opt) | S_028760_ALPHA_DST_OPT(dstA_opt) |
5482                             S_028760_ALPHA_COMB_FCN(radv_translate_blend_opt_function(eqA));
5483 
5484       blend_cntl |= S_028780_ENABLE(1);
5485       blend_cntl |= S_028780_COLOR_COMB_FCN(radv_translate_blend_function(eqRGB));
5486       blend_cntl |= S_028780_COLOR_SRCBLEND(radv_translate_blend_factor(gfx_level, srcRGB));
5487       blend_cntl |= S_028780_COLOR_DESTBLEND(radv_translate_blend_factor(gfx_level, dstRGB));
5488       if (srcA != srcRGB || dstA != dstRGB || eqA != eqRGB) {
5489          blend_cntl |= S_028780_SEPARATE_ALPHA_BLEND(1);
5490          blend_cntl |= S_028780_ALPHA_COMB_FCN(radv_translate_blend_function(eqA));
5491          blend_cntl |= S_028780_ALPHA_SRCBLEND(radv_translate_blend_factor(gfx_level, srcA));
5492          blend_cntl |= S_028780_ALPHA_DESTBLEND(radv_translate_blend_factor(gfx_level, dstA));
5493       }
5494       cb_blend_control[i] = blend_cntl;
5495    }
5496 
5497    if (pdev->info.has_rbplus) {
5498       /* Disable RB+ blend optimizations for dual source blending. */
5499       if (mrt0_is_dual_src) {
5500          for (unsigned i = 0; i < MAX_RTS; i++) {
5501             sx_mrt_blend_opt[i] =
5502                S_028760_COLOR_COMB_FCN(V_028760_OPT_COMB_NONE) | S_028760_ALPHA_COMB_FCN(V_028760_OPT_COMB_NONE);
5503          }
5504       }
5505 
5506       /* Disable RB+ blend optimizations on GFX11 when alpha-to-coverage is enabled. */
5507       if (gfx_level >= GFX11 && d->vk.ms.alpha_to_coverage_enable) {
5508          sx_mrt_blend_opt[0] =
5509             S_028760_COLOR_COMB_FCN(V_028760_OPT_COMB_NONE) | S_028760_ALPHA_COMB_FCN(V_028760_OPT_COMB_NONE);
5510       }
5511    }
5512 
5513    radeon_set_context_reg_seq(cmd_buffer->cs, R_028780_CB_BLEND0_CONTROL, MAX_RTS);
5514    radeon_emit_array(cmd_buffer->cs, cb_blend_control, MAX_RTS);
5515 
5516    if (pdev->info.has_rbplus) {
5517       radeon_set_context_reg_seq(cmd_buffer->cs, R_028760_SX_MRT0_BLEND_OPT, MAX_RTS);
5518       radeon_emit_array(cmd_buffer->cs, sx_mrt_blend_opt, MAX_RTS);
5519    }
5520 }
5521 
5522 static struct radv_shader_part *
5523 lookup_ps_epilog(struct radv_cmd_buffer *cmd_buffer)
5524 {
5525    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
5526    const struct radv_shader *ps = cmd_buffer->state.shaders[MESA_SHADER_FRAGMENT];
5527    const struct radv_rendering_state *render = &cmd_buffer->state.render;
5528    const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
5529    const struct radv_physical_device *pdev = radv_device_physical(device);
5530    struct radv_ps_epilog_state state = {0};
5531    uint8_t color_remap[MAX_RTS];
5532 
5533    memset(color_remap, MESA_VK_ATTACHMENT_UNUSED, sizeof(color_remap));
5534 
5535    state.color_attachment_count = render->color_att_count;
5536    for (unsigned i = 0; i < render->color_att_count; ++i) {
5537       state.color_attachment_formats[i] = render->color_att[i].format;
5538    }
5539 
5540    for (unsigned i = 0; i < MAX_RTS; i++) {
5541       VkBlendOp eqRGB = d->vk.cb.attachments[i].color_blend_op;
5542       VkBlendFactor srcRGB = d->vk.cb.attachments[i].src_color_blend_factor;
5543       VkBlendFactor dstRGB = d->vk.cb.attachments[i].dst_color_blend_factor;
5544 
5545       state.color_write_mask |= d->vk.cb.attachments[i].write_mask << (4 * i);
5546       state.color_blend_enable |= d->vk.cb.attachments[i].blend_enable << (4 * i);
5547 
5548       radv_normalize_blend_factor(eqRGB, &srcRGB, &dstRGB);
5549 
5550       if (srcRGB == VK_BLEND_FACTOR_SRC_ALPHA || dstRGB == VK_BLEND_FACTOR_SRC_ALPHA ||
5551           srcRGB == VK_BLEND_FACTOR_SRC_ALPHA_SATURATE || dstRGB == VK_BLEND_FACTOR_SRC_ALPHA_SATURATE ||
5552           srcRGB == VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA || dstRGB == VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA)
5553          state.need_src_alpha |= 1 << i;
5554 
5555       state.color_attachment_mappings[i] = d->vk.cal.color_map[i];
5556       if (state.color_attachment_mappings[i] != MESA_VK_ATTACHMENT_UNUSED)
5557          color_remap[state.color_attachment_mappings[i]] = i;
5558    }
5559 
5560    state.mrt0_is_dual_src = radv_is_mrt0_dual_src(cmd_buffer);
5561 
5562    if (d->vk.ms.alpha_to_coverage_enable) {
5563       /* Select a color export format with alpha when alpha to coverage is enabled. */
5564       state.need_src_alpha |= 0x1;
5565    }
5566 
5567    state.alpha_to_one = d->vk.ms.alpha_to_one_enable;
5568 
5569    if (ps) {
5570       state.colors_written = ps->info.ps.colors_written;
5571 
5572       if (ps->info.ps.exports_mrtz_via_epilog) {
5573          assert(pdev->info.gfx_level >= GFX11);
5574          state.export_depth = ps->info.ps.writes_z;
5575          state.export_stencil = ps->info.ps.writes_stencil;
5576          state.export_sample_mask = ps->info.ps.writes_sample_mask;
5577          state.alpha_to_coverage_via_mrtz = d->vk.ms.alpha_to_coverage_enable;
5578       }
5579    }
5580 
5581    struct radv_ps_epilog_key key = radv_generate_ps_epilog_key(device, &state);
5582 
5583    /* Determine the actual colors written if outputs are remapped. */
5584    uint32_t colors_written = 0;
5585    for (uint32_t i = 0; i < MAX_RTS; i++) {
5586       if (!((ps->info.ps.colors_written >> (i * 4)) & 0xf))
5587          continue;
5588 
5589       if (color_remap[i] == MESA_VK_ATTACHMENT_UNUSED)
5590          continue;
5591 
5592       colors_written |= 0xfu << (4 * color_remap[i]);
5593    }
5594 
5595    /* Mask out color attachments that aren't exported by the FS, to match the IO shader arguments. */
5596    key.spi_shader_col_format &= colors_written;
5597 
5598    return radv_shader_part_cache_get(device, &device->ps_epilogs, &cmd_buffer->ps_epilogs, &key);
5599 }
5600 
5601 static void
5602 radv_emit_msaa_state(struct radv_cmd_buffer *cmd_buffer)
5603 {
5604    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
5605    const struct radv_physical_device *pdev = radv_device_physical(device);
5606    const struct radv_shader *ps = cmd_buffer->state.shaders[MESA_SHADER_FRAGMENT];
5607    unsigned rasterization_samples = radv_get_rasterization_samples(cmd_buffer);
5608    const struct radv_rendering_state *render = &cmd_buffer->state.render;
5609    const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
5610    unsigned log_samples = util_logbase2(rasterization_samples);
5611    unsigned pa_sc_aa_config = 0;
5612    unsigned max_sample_dist = 0;
5613    unsigned db_eqaa;
5614 
5615    db_eqaa = S_028804_HIGH_QUALITY_INTERSECTIONS(1) | S_028804_INCOHERENT_EQAA_READS(pdev->info.gfx_level < GFX12) |
5616              S_028804_STATIC_ANCHOR_ASSOCIATIONS(1);
5617 
5618    if (pdev->info.gfx_level >= GFX9 && d->vk.rs.conservative_mode != VK_CONSERVATIVE_RASTERIZATION_MODE_DISABLED_EXT) {
5619       /* Adjust MSAA state if conservative rasterization is enabled. */
5620       db_eqaa |= S_028804_OVERRASTERIZATION_AMOUNT(4);
5621       pa_sc_aa_config |= S_028BE0_AA_MASK_CENTROID_DTMN(1);
5622    }
5623 
5624    if (!d->sample_location.count) {
5625       max_sample_dist = radv_get_default_max_sample_dist(log_samples);
5626    } else {
5627       uint32_t num_samples = (uint32_t)d->sample_location.per_pixel;
5628       VkOffset2D sample_locs[4][8]; /* 8 is the max. sample count supported */
5629 
5630       /* Convert the user sample locations to hardware sample locations. */
5631       radv_convert_user_sample_locs(&d->sample_location, 0, 0, sample_locs[0]);
5632       radv_convert_user_sample_locs(&d->sample_location, 1, 0, sample_locs[1]);
5633       radv_convert_user_sample_locs(&d->sample_location, 0, 1, sample_locs[2]);
5634       radv_convert_user_sample_locs(&d->sample_location, 1, 1, sample_locs[3]);
5635 
5636       /* Compute the maximum sample distance from the specified locations. */
5637       for (unsigned i = 0; i < 4; ++i) {
5638          for (uint32_t j = 0; j < num_samples; j++) {
5639             VkOffset2D offset = sample_locs[i][j];
5640             max_sample_dist = MAX2(max_sample_dist, MAX2(abs(offset.x), abs(offset.y)));
5641          }
5642       }
5643    }
5644 
5645    if (rasterization_samples > 1) {
5646       unsigned z_samples = MAX2(render->ds_samples, rasterization_samples);
5647       unsigned ps_iter_samples = radv_get_ps_iter_samples(cmd_buffer);
5648       unsigned log_z_samples = util_logbase2(z_samples);
5649       unsigned log_ps_iter_samples = util_logbase2(ps_iter_samples);
5650       bool uses_underestimate = d->vk.rs.conservative_mode == VK_CONSERVATIVE_RASTERIZATION_MODE_UNDERESTIMATE_EXT;
5651 
5652       pa_sc_aa_config |=
5653          S_028BE0_MSAA_NUM_SAMPLES(uses_underestimate ? 0 : log_samples) | S_028BE0_MSAA_EXPOSED_SAMPLES(log_samples);
5654 
5655       if (pdev->info.gfx_level >= GFX12) {
5656          pa_sc_aa_config |= S_028BE0_PS_ITER_SAMPLES(log_ps_iter_samples);
5657 
5658          db_eqaa |= S_028078_MASK_EXPORT_NUM_SAMPLES(log_samples) | S_028078_ALPHA_TO_MASK_NUM_SAMPLES(log_samples);
5659       } else {
5660          pa_sc_aa_config |= S_028BE0_MAX_SAMPLE_DIST(max_sample_dist) |
5661                             S_028BE0_COVERED_CENTROID_IS_CENTER(pdev->info.gfx_level >= GFX10_3);
5662 
5663          db_eqaa |= S_028804_MAX_ANCHOR_SAMPLES(log_z_samples) | S_028804_PS_ITER_SAMPLES(log_ps_iter_samples) |
5664                     S_028804_MASK_EXPORT_NUM_SAMPLES(log_samples) | S_028804_ALPHA_TO_MASK_NUM_SAMPLES(log_samples);
5665       }
5666 
5667       if (radv_get_line_mode(cmd_buffer) == VK_LINE_RASTERIZATION_MODE_RECTANGULAR_SMOOTH_KHR)
5668          db_eqaa |= S_028804_OVERRASTERIZATION_AMOUNT(log_samples);
5669    }
5670 
5671    /* GFX12 programs it in SPI_PS_INPUT_ENA.COVERAGE_TO_SHADER_SELECT */
5672    pa_sc_aa_config |=
5673       S_028BE0_COVERAGE_TO_SHADER_SELECT(pdev->info.gfx_level < GFX12 && ps && ps->info.ps.reads_fully_covered);
5674 
5675    if (pdev->info.gfx_level >= GFX12) {
5676       radeon_set_context_reg(cmd_buffer->cs, R_028C5C_PA_SC_SAMPLE_PROPERTIES,
5677                              S_028C5C_MAX_SAMPLE_DIST(max_sample_dist));
5678 
5679       radeon_set_context_reg(cmd_buffer->cs, R_028078_DB_EQAA, db_eqaa);
5680    } else {
5681       radeon_set_context_reg(cmd_buffer->cs, R_028804_DB_EQAA, db_eqaa);
5682    }
5683 
5684    radeon_set_context_reg(cmd_buffer->cs, R_028BE0_PA_SC_AA_CONFIG, pa_sc_aa_config);
5685    radeon_set_context_reg(
5686       cmd_buffer->cs, R_028A48_PA_SC_MODE_CNTL_0,
5687       S_028A48_ALTERNATE_RBS_PER_TILE(pdev->info.gfx_level >= GFX9) | S_028A48_VPORT_SCISSOR_ENABLE(1) |
5688          S_028A48_LINE_STIPPLE_ENABLE(d->vk.rs.line.stipple.enable) | S_028A48_MSAA_ENABLE(rasterization_samples > 1));
5689 }
5690 
5691 static void
5692 radv_emit_line_rasterization_mode(struct radv_cmd_buffer *cmd_buffer)
5693 {
5694    /* The DX10 diamond test is unnecessary with Vulkan and it decreases line rasterization
5695     * performance.
5696     */
5697    radeon_set_context_reg(
5698       cmd_buffer->cs, R_028BDC_PA_SC_LINE_CNTL,
5699       S_028BDC_PERPENDICULAR_ENDCAP_ENA(radv_get_line_mode(cmd_buffer) == VK_LINE_RASTERIZATION_MODE_RECTANGULAR_KHR));
5700 }
5701 
5702 static void
5703 radv_cmd_buffer_flush_dynamic_state(struct radv_cmd_buffer *cmd_buffer, const uint64_t states)
5704 {
5705    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
5706    const struct radv_physical_device *pdev = radv_device_physical(device);
5707 
5708    if (states & (RADV_DYNAMIC_VIEWPORT | RADV_DYNAMIC_DEPTH_CLIP_ENABLE | RADV_DYNAMIC_DEPTH_CLIP_NEGATIVE_ONE_TO_ONE |
5709                  RADV_DYNAMIC_DEPTH_CLAMP_ENABLE))
5710       radv_emit_viewport(cmd_buffer);
5711 
5712    if (states & (RADV_DYNAMIC_SCISSOR | RADV_DYNAMIC_VIEWPORT) && !pdev->info.has_gfx9_scissor_bug)
5713       radv_emit_scissor(cmd_buffer);
5714 
5715    if (states & RADV_DYNAMIC_LINE_WIDTH)
5716       radv_emit_line_width(cmd_buffer);
5717 
5718    if (states & RADV_DYNAMIC_BLEND_CONSTANTS)
5719       radv_emit_blend_constants(cmd_buffer);
5720 
5721    if (states & (RADV_DYNAMIC_STENCIL_REFERENCE | RADV_DYNAMIC_STENCIL_WRITE_MASK | RADV_DYNAMIC_STENCIL_COMPARE_MASK))
5722       radv_emit_stencil(cmd_buffer);
5723 
5724    if (states & RADV_DYNAMIC_DEPTH_BOUNDS)
5725       radv_emit_depth_bounds(cmd_buffer);
5726 
5727    if (states & RADV_DYNAMIC_DEPTH_BIAS)
5728       radv_emit_depth_bias(cmd_buffer);
5729 
5730    if (states &
5731        (RADV_DYNAMIC_DISCARD_RECTANGLE | RADV_DYNAMIC_DISCARD_RECTANGLE_ENABLE | RADV_DYNAMIC_DISCARD_RECTANGLE_MODE))
5732       radv_emit_discard_rectangle(cmd_buffer);
5733 
5734    if (states & RADV_DYNAMIC_CONSERVATIVE_RAST_MODE)
5735       radv_emit_conservative_rast_mode(cmd_buffer);
5736 
5737    if (states & RADV_DYNAMIC_SAMPLE_LOCATIONS)
5738       radv_emit_sample_locations(cmd_buffer);
5739 
5740    if (states & RADV_DYNAMIC_LINE_STIPPLE)
5741       radv_emit_line_stipple(cmd_buffer);
5742 
5743    if (states & (RADV_DYNAMIC_CULL_MODE | RADV_DYNAMIC_FRONT_FACE | RADV_DYNAMIC_DEPTH_BIAS_ENABLE |
5744                  RADV_DYNAMIC_PRIMITIVE_TOPOLOGY | RADV_DYNAMIC_POLYGON_MODE | RADV_DYNAMIC_PROVOKING_VERTEX_MODE |
5745                  RADV_DYNAMIC_LINE_RASTERIZATION_MODE))
5746       radv_emit_culling(cmd_buffer);
5747 
5748    if (states & (RADV_DYNAMIC_PROVOKING_VERTEX_MODE | RADV_DYNAMIC_PRIMITIVE_TOPOLOGY))
5749       radv_emit_provoking_vertex_mode(cmd_buffer);
5750 
5751    if ((states & RADV_DYNAMIC_PRIMITIVE_TOPOLOGY) ||
5752        (pdev->info.gfx_level >= GFX12 && states & RADV_DYNAMIC_PATCH_CONTROL_POINTS))
5753       radv_emit_primitive_topology(cmd_buffer);
5754 
5755    if (states & (RADV_DYNAMIC_DEPTH_TEST_ENABLE | RADV_DYNAMIC_DEPTH_WRITE_ENABLE | RADV_DYNAMIC_DEPTH_COMPARE_OP |
5756                  RADV_DYNAMIC_DEPTH_BOUNDS_TEST_ENABLE | RADV_DYNAMIC_STENCIL_TEST_ENABLE | RADV_DYNAMIC_STENCIL_OP))
5757       radv_emit_depth_control(cmd_buffer);
5758 
5759    if (states & RADV_DYNAMIC_STENCIL_OP)
5760       radv_emit_stencil_control(cmd_buffer);
5761 
5762    if (states & RADV_DYNAMIC_FRAGMENT_SHADING_RATE)
5763       radv_emit_fragment_shading_rate(cmd_buffer);
5764 
5765    if (states & RADV_DYNAMIC_PRIMITIVE_RESTART_ENABLE)
5766       radv_emit_primitive_restart_enable(cmd_buffer);
5767 
5768    if (states & (RADV_DYNAMIC_RASTERIZER_DISCARD_ENABLE | RADV_DYNAMIC_DEPTH_CLIP_ENABLE |
5769                  RADV_DYNAMIC_DEPTH_CLIP_NEGATIVE_ONE_TO_ONE | RADV_DYNAMIC_DEPTH_CLAMP_ENABLE))
5770       radv_emit_clipping(cmd_buffer);
5771 
5772    if (states & (RADV_DYNAMIC_LOGIC_OP | RADV_DYNAMIC_LOGIC_OP_ENABLE | RADV_DYNAMIC_COLOR_WRITE_MASK |
5773                  RADV_DYNAMIC_COLOR_BLEND_ENABLE | RADV_DYNAMIC_COLOR_BLEND_EQUATION))
5774       radv_emit_logic_op(cmd_buffer);
5775 
5776    if (states & (RADV_DYNAMIC_COLOR_WRITE_ENABLE | RADV_DYNAMIC_COLOR_WRITE_MASK))
5777       radv_emit_color_write(cmd_buffer);
5778 
5779    if (states & RADV_DYNAMIC_VERTEX_INPUT)
5780       radv_emit_vertex_input(cmd_buffer);
5781 
5782    if (states & RADV_DYNAMIC_PATCH_CONTROL_POINTS)
5783       radv_emit_patch_control_points(cmd_buffer);
5784 
5785    if (states & RADV_DYNAMIC_TESS_DOMAIN_ORIGIN)
5786       radv_emit_tess_domain_origin(cmd_buffer);
5787 
5788    if (states & RADV_DYNAMIC_ALPHA_TO_COVERAGE_ENABLE)
5789       radv_emit_alpha_to_coverage_enable(cmd_buffer);
5790 
5791    if (states & RADV_DYNAMIC_SAMPLE_MASK)
5792       radv_emit_sample_mask(cmd_buffer);
5793 
5794    if (states & (RADV_DYNAMIC_DEPTH_CLAMP_ENABLE | RADV_DYNAMIC_DEPTH_CLIP_ENABLE))
5795       radv_emit_depth_clamp_enable(cmd_buffer);
5796 
5797    if (states & (RADV_DYNAMIC_COLOR_BLEND_ENABLE | RADV_DYNAMIC_COLOR_WRITE_MASK | RADV_DYNAMIC_COLOR_BLEND_EQUATION |
5798                  RADV_DYNAMIC_ALPHA_TO_COVERAGE_ENABLE))
5799       radv_emit_color_blend(cmd_buffer);
5800 
5801    if (states & (RADV_DYNAMIC_LINE_RASTERIZATION_MODE | RADV_DYNAMIC_PRIMITIVE_TOPOLOGY | RADV_DYNAMIC_POLYGON_MODE))
5802       radv_emit_line_rasterization_mode(cmd_buffer);
5803 
5804    if (states & (RADV_DYNAMIC_RASTERIZATION_SAMPLES | RADV_DYNAMIC_LINE_RASTERIZATION_MODE |
5805                  RADV_DYNAMIC_PRIMITIVE_TOPOLOGY | RADV_DYNAMIC_POLYGON_MODE))
5806       radv_emit_rasterization_samples(cmd_buffer);
5807 
5808    if (states & (RADV_DYNAMIC_LINE_STIPPLE_ENABLE | RADV_DYNAMIC_CONSERVATIVE_RAST_MODE |
5809                  RADV_DYNAMIC_SAMPLE_LOCATIONS | RADV_DYNAMIC_RASTERIZATION_SAMPLES |
5810                  RADV_DYNAMIC_LINE_RASTERIZATION_MODE | RADV_DYNAMIC_PRIMITIVE_TOPOLOGY | RADV_DYNAMIC_POLYGON_MODE))
5811       radv_emit_msaa_state(cmd_buffer);
5812 
5813    /* RADV_DYNAMIC_ATTACHMENT_FEEDBACK_LOOP_ENABLE is handled by radv_emit_db_shader_control. */
5814 
5815    cmd_buffer->state.dirty_dynamic &= ~states;
5816 }
5817 
5818 static void
5819 radv_flush_push_descriptors(struct radv_cmd_buffer *cmd_buffer, struct radv_descriptor_state *descriptors_state)
5820 {
5821    struct radv_descriptor_set *set = (struct radv_descriptor_set *)&descriptors_state->push_set.set;
5822    unsigned bo_offset;
5823 
5824    if (!radv_cmd_buffer_upload_data(cmd_buffer, set->header.size, set->header.mapped_ptr, &bo_offset))
5825       return;
5826 
5827    set->header.va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
5828    set->header.va += bo_offset;
5829 }
5830 
5831 void
5832 radv_upload_indirect_descriptor_sets(struct radv_cmd_buffer *cmd_buffer,
5833                                      struct radv_descriptor_state *descriptors_state)
5834 {
5835    uint32_t size = MAX_SETS * 4;
5836    uint32_t offset;
5837    void *ptr;
5838 
5839    if (!radv_cmd_buffer_upload_alloc(cmd_buffer, size, &offset, &ptr))
5840       return;
5841 
5842    descriptors_state->indirect_descriptor_sets_va = radv_buffer_get_va(cmd_buffer->upload.upload_bo) + offset;
5843 
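   /* One dword per set: store the low 32 bits of each bound set's VA, or 0 for sets that are not
    * currently valid, which is what the MAX_SETS * 4 size above accounts for.
    */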
5844    for (unsigned i = 0; i < MAX_SETS; i++) {
5845       uint32_t *uptr = ((uint32_t *)ptr) + i;
5846       uint64_t set_va = 0;
5847       if (descriptors_state->valid & (1u << i))
5848          set_va = radv_descriptor_get_va(descriptors_state, i);
5849 
5850       uptr[0] = set_va & 0xffffffff;
5851    }
5852 }
5853 
5854 ALWAYS_INLINE static void
5855 radv_flush_descriptors(struct radv_cmd_buffer *cmd_buffer, VkShaderStageFlags stages, VkPipelineBindPoint bind_point)
5856 {
5857    struct radv_descriptor_state *descriptors_state = radv_get_descriptors_state(cmd_buffer, bind_point);
5858    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
5859    struct radeon_cmdbuf *cs = cmd_buffer->cs;
5860    bool flush_indirect_descriptors;
5861 
5862    if (!descriptors_state->dirty)
5863       return;
5864 
5865    flush_indirect_descriptors = descriptors_state->need_indirect_descriptor_sets;
5866 
5867    if (flush_indirect_descriptors)
5868       radv_upload_indirect_descriptor_sets(cmd_buffer, descriptors_state);
5869 
5870    ASSERTED unsigned cdw_max = radeon_check_space(device->ws, cs, MAX_SETS * MESA_VULKAN_SHADER_STAGES * 4);
5871 
5872    if (stages & VK_SHADER_STAGE_COMPUTE_BIT) {
5873       struct radv_shader *compute_shader = bind_point == VK_PIPELINE_BIND_POINT_COMPUTE
5874                                               ? cmd_buffer->state.shaders[MESA_SHADER_COMPUTE]
5875                                               : cmd_buffer->state.rt_prolog;
5876 
5877       radv_emit_descriptors_per_stage(device, cs, compute_shader, descriptors_state);
5878    } else {
5879       radv_foreach_stage(stage, stages & ~VK_SHADER_STAGE_TASK_BIT_EXT)
5880       {
5881          if (!cmd_buffer->state.shaders[stage])
5882             continue;
5883 
5884          radv_emit_descriptors_per_stage(device, cs, cmd_buffer->state.shaders[stage], descriptors_state);
5885       }
5886 
5887       if (stages & VK_SHADER_STAGE_TASK_BIT_EXT) {
5888          radv_emit_descriptors_per_stage(device, cmd_buffer->gang.cs, cmd_buffer->state.shaders[MESA_SHADER_TASK],
5889                                          descriptors_state);
5890       }
5891    }
5892 
5893    descriptors_state->dirty = 0;
5894 
5895    assert(cmd_buffer->cs->cdw <= cdw_max);
5896 
5897    if (radv_device_fault_detection_enabled(device))
5898       radv_save_descriptors(cmd_buffer, bind_point);
5899 }
5900 
5901 static void
5902 radv_emit_all_inline_push_consts(const struct radv_device *device, struct radeon_cmdbuf *cs,
5903                                  const struct radv_shader *shader, const uint32_t *values, bool *need_push_constants)
5904 {
5905    if (radv_get_user_sgpr_info(shader, AC_UD_PUSH_CONSTANTS)->sgpr_idx != -1)
5906       *need_push_constants |= true;
5907 
5908    const uint64_t mask = shader->info.inline_push_constant_mask;
5909    if (!mask)
5910       return;
5911 
5912    const uint8_t base = ffs(mask) - 1;
5913    if (mask == u_bit_consecutive64(base, util_last_bit64(mask) - base)) {
5914       /* consecutive inline push constants */
5915       radv_emit_inline_push_consts(device, cs, shader, AC_UD_INLINE_PUSH_CONSTANTS, values + base);
5916    } else {
5917       /* sparse inline push constants */
5918       uint32_t consts[AC_MAX_INLINE_PUSH_CONSTS];
5919       unsigned num_consts = 0;
5920       u_foreach_bit64 (idx, mask)
5921          consts[num_consts++] = values[idx];
5922       radv_emit_inline_push_consts(device, cs, shader, AC_UD_INLINE_PUSH_CONSTANTS, consts);
5923    }
5924 }
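/* Illustration of the mask handling above, with example values: a mask of 0x7 has base 0 and equals
 * u_bit_consecutive64(0, 3), so values[0..2] are emitted directly; a sparse mask such as 0x5 instead
 * gathers values[0] and values[2] into the temporary consts[] array first.
 */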
5925 
5926 ALWAYS_INLINE static VkShaderStageFlags
5927 radv_must_flush_constants(const struct radv_cmd_buffer *cmd_buffer, VkShaderStageFlags stages,
5928                           VkPipelineBindPoint bind_point)
5929 {
5930    const struct radv_push_constant_state *push_constants = radv_get_push_constants_state(cmd_buffer, bind_point);
5931 
5932    if (push_constants->size || push_constants->dynamic_offset_count)
5933       return stages & cmd_buffer->push_constant_stages;
5934 
5935    return 0;
5936 }
5937 
5938 static void
5939 radv_flush_constants(struct radv_cmd_buffer *cmd_buffer, VkShaderStageFlags stages, VkPipelineBindPoint bind_point)
5940 {
5941    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
5942    struct radeon_cmdbuf *cs = cmd_buffer->cs;
5943    struct radv_descriptor_state *descriptors_state = radv_get_descriptors_state(cmd_buffer, bind_point);
5944    const struct radv_push_constant_state *push_constants = radv_get_push_constants_state(cmd_buffer, bind_point);
5945    struct radv_shader *shader, *prev_shader;
5946    bool need_push_constants = false;
5947    unsigned offset;
5948    void *ptr;
5949    uint64_t va;
5950    uint32_t internal_stages = stages;
5951    uint32_t dirty_stages = 0;
5952 
5953    switch (bind_point) {
5954    case VK_PIPELINE_BIND_POINT_GRAPHICS:
5955       break;
5956    case VK_PIPELINE_BIND_POINT_COMPUTE:
5957       dirty_stages = RADV_RT_STAGE_BITS;
5958       break;
5959    case VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR:
5960       internal_stages = VK_SHADER_STAGE_COMPUTE_BIT;
5961       dirty_stages = VK_SHADER_STAGE_COMPUTE_BIT;
5962       break;
5963    default:
5964       unreachable("Unhandled bind point");
5965    }
5966 
5967    if (internal_stages & VK_SHADER_STAGE_COMPUTE_BIT) {
5968       struct radv_shader *compute_shader = bind_point == VK_PIPELINE_BIND_POINT_COMPUTE
5969                                               ? cmd_buffer->state.shaders[MESA_SHADER_COMPUTE]
5970                                               : cmd_buffer->state.rt_prolog;
5971 
5972       radv_emit_all_inline_push_consts(device, cs, compute_shader, (uint32_t *)cmd_buffer->push_constants,
5973                                        &need_push_constants);
5974    } else {
5975       radv_foreach_stage(stage, internal_stages & ~VK_SHADER_STAGE_TASK_BIT_EXT)
5976       {
5977          shader = radv_get_shader(cmd_buffer->state.shaders, stage);
5978 
5979          if (!shader)
5980             continue;
5981 
5982          radv_emit_all_inline_push_consts(device, cs, shader, (uint32_t *)cmd_buffer->push_constants,
5983                                           &need_push_constants);
5984       }
5985 
5986       if (internal_stages & VK_SHADER_STAGE_TASK_BIT_EXT) {
5987          radv_emit_all_inline_push_consts(device, cmd_buffer->gang.cs, cmd_buffer->state.shaders[MESA_SHADER_TASK],
5988                                           (uint32_t *)cmd_buffer->push_constants, &need_push_constants);
5989       }
5990    }
5991 
5992    if (need_push_constants) {
5993       if (!radv_cmd_buffer_upload_alloc(cmd_buffer, push_constants->size + 16 * push_constants->dynamic_offset_count,
5994                                         &offset, &ptr))
5995          return;
5996 
5997       memcpy(ptr, cmd_buffer->push_constants, push_constants->size);
5998       memcpy((char *)ptr + push_constants->size, descriptors_state->dynamic_buffers,
5999              16 * push_constants->dynamic_offset_count);
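      /* Resulting upload layout: bytes [0, size) hold the raw push constants and bytes
       * [size, size + 16 * dynamic_offset_count) hold the 16-byte dynamic buffer descriptors.
       */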
6000 
6001       va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
6002       va += offset;
6003 
6004       ASSERTED unsigned cdw_max = radeon_check_space(device->ws, cmd_buffer->cs, MESA_VULKAN_SHADER_STAGES * 4);
6005 
6006       if (internal_stages & VK_SHADER_STAGE_COMPUTE_BIT) {
6007          struct radv_shader *compute_shader = bind_point == VK_PIPELINE_BIND_POINT_COMPUTE
6008                                                  ? cmd_buffer->state.shaders[MESA_SHADER_COMPUTE]
6009                                                  : cmd_buffer->state.rt_prolog;
6010 
6011          radv_emit_userdata_address(device, cs, compute_shader, AC_UD_PUSH_CONSTANTS, va);
6012       } else {
6013          prev_shader = NULL;
6014          radv_foreach_stage(stage, internal_stages & ~VK_SHADER_STAGE_TASK_BIT_EXT)
6015          {
6016             shader = radv_get_shader(cmd_buffer->state.shaders, stage);
6017 
6018             /* Avoid redundantly emitting the address for merged stages. */
6019             if (shader && shader != prev_shader) {
6020                radv_emit_userdata_address(device, cs, shader, AC_UD_PUSH_CONSTANTS, va);
6021 
6022                prev_shader = shader;
6023             }
6024          }
6025 
6026          if (internal_stages & VK_SHADER_STAGE_TASK_BIT_EXT) {
6027             radv_emit_userdata_address(device, cmd_buffer->gang.cs, cmd_buffer->state.shaders[MESA_SHADER_TASK],
6028                                        AC_UD_PUSH_CONSTANTS, va);
6029          }
6030       }
6031 
6032       assert(cmd_buffer->cs->cdw <= cdw_max);
6033    }
6034 
6035    cmd_buffer->push_constant_stages &= ~stages;
6036    cmd_buffer->push_constant_stages |= dirty_stages;
6037 }
6038 
6039 void
6040 radv_get_vbo_info(const struct radv_cmd_buffer *cmd_buffer, uint32_t idx, struct radv_vbo_info *vbo_info)
6041 {
6042    const struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
6043    const struct radv_physical_device *pdev = radv_device_physical(device);
6044    const struct radv_vertex_input_state *vi_state = &cmd_buffer->state.vertex_input;
6045    const uint32_t binding = vi_state->bindings[idx];
6046 
6047    memset(vbo_info, 0, sizeof(*vbo_info));
6048 
6049    vbo_info->binding = binding;
6050    vbo_info->stride = cmd_buffer->vertex_bindings[binding].stride;
6051 
6052    vbo_info->attrib_offset = vi_state->offsets[idx];
6053    vbo_info->attrib_index_offset = vi_state->attrib_index_offset[idx];
6054    vbo_info->attrib_format_size = vi_state->format_sizes[idx];
6055 
6056    if (!(vi_state->nontrivial_formats & BITFIELD_BIT(idx))) {
6057       const struct ac_vtx_format_info *vtx_info_table =
6058          ac_get_vtx_format_info_table(pdev->info.gfx_level, pdev->info.family);
6059       const struct ac_vtx_format_info *vtx_info = &vtx_info_table[vi_state->formats[idx]];
6060       const uint32_t hw_format = vtx_info->hw_format[vtx_info->num_channels - 1];
6061 
6062       if (pdev->info.gfx_level >= GFX10) {
6063          vbo_info->non_trivial_format |= vtx_info->dst_sel | S_008F0C_FORMAT_GFX10(hw_format);
6064       } else {
6065          vbo_info->non_trivial_format |=
6066             vtx_info->dst_sel | S_008F0C_NUM_FORMAT((hw_format >> 4) & 0x7) | S_008F0C_DATA_FORMAT(hw_format & 0xf);
6067       }
6068    }
6069 
6070    const struct radv_buffer *buffer = cmd_buffer->vertex_binding_buffers[binding];
6071 
6072    if (!buffer)
6073       return;
6074 
6075    const uint32_t offset = cmd_buffer->vertex_bindings[binding].offset;
6076 
6077    vbo_info->va = radv_buffer_get_va(buffer->bo) + buffer->offset + offset;
6078 
6079    if (cmd_buffer->vertex_bindings[binding].size) {
6080       vbo_info->size = cmd_buffer->vertex_bindings[binding].size;
6081    } else {
6082       vbo_info->size = vk_buffer_range(&buffer->vk, offset, VK_WHOLE_SIZE);
6083    }
6084 }
6085 
6086 static void
6087 radv_write_vertex_descriptors(const struct radv_cmd_buffer *cmd_buffer, const struct radv_shader *vs, void *vb_ptr)
6088 {
6089    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
6090    const struct radv_physical_device *pdev = radv_device_physical(device);
6091    enum amd_gfx_level chip = pdev->info.gfx_level;
6092    unsigned desc_index = 0;
6093    uint32_t mask = vs->info.vs.vb_desc_usage_mask;
6094    const bool uses_dynamic_inputs = vs->info.vs.dynamic_inputs;
6095    const struct radv_vertex_input_state *vi_state = &cmd_buffer->state.vertex_input;
6096 
6097    while (mask) {
6098       unsigned i = u_bit_scan(&mask);
6099       uint32_t *desc = &((uint32_t *)vb_ptr)[desc_index++ * 4];
6100 
6101       if (uses_dynamic_inputs && !(vi_state->attribute_mask & BITFIELD_BIT(i))) {
6102          /* No vertex attribute description given: assume that the shader doesn't use this
6103           * location (vb_desc_usage_mask can be larger than attribute usage) and use a null
6104           * descriptor to avoid hangs (prologs load all attributes, even if there are holes).
6105           */
6106          memset(desc, 0, 4 * 4);
6107          continue;
6108       }
6109 
6110       struct radv_vbo_info vbo_info;
6111       radv_get_vbo_info(cmd_buffer, i, &vbo_info);
6112 
6113       uint32_t rsrc_word3;
6114 
6115       if (uses_dynamic_inputs && vbo_info.non_trivial_format) {
6116          rsrc_word3 = vbo_info.non_trivial_format;
6117       } else {
6118          rsrc_word3 = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
6119                       S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);
6120 
6121          if (pdev->info.gfx_level >= GFX10) {
6122             rsrc_word3 |= S_008F0C_FORMAT_GFX10(V_008F0C_GFX10_FORMAT_32_UINT);
6123          } else {
6124             rsrc_word3 |=
6125                S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_UINT) | S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
6126          }
6127       }
6128 
6129       if (!vbo_info.va) {
6130          if (uses_dynamic_inputs) {
6131             /* Stride needs to be non-zero on GFX9, or else bounds checking is disabled. We need
6132              * to include the format/word3 so that the alpha channel is 1 for formats without an
6133              * alpha channel.
6134              */
6135             desc[0] = 0;
6136             desc[1] = S_008F04_STRIDE(16);
6137             desc[2] = 0;
6138             desc[3] = rsrc_word3;
6139          } else {
6140             memset(desc, 0, 4 * 4);
6141          }
6142 
6143          continue;
6144       }
6145 
6146       const unsigned stride = vbo_info.stride;
6147       uint32_t num_records = vbo_info.size;
6148 
6149       if (vs->info.vs.use_per_attribute_vb_descs) {
6150          const uint32_t attrib_end = vbo_info.attrib_offset + vbo_info.attrib_format_size;
6151 
6152          if (num_records < attrib_end) {
6153             num_records = 0; /* not enough space for one vertex */
6154          } else if (stride == 0) {
6155             num_records = 1; /* only one vertex */
6156          } else {
6157             num_records = (num_records - attrib_end) / stride + 1;
6158             /* If attrib_offset>stride, then the compiler will increase the vertex index by
6159              * attrib_offset/stride and decrease the offset by attrib_offset%stride. This is
6160              * only allowed with static strides.
6161              */
6162             num_records += vbo_info.attrib_index_offset;
6163          }
6164 
6165          /* GFX10 uses OOB_SELECT_RAW if stride==0, so convert num_records from elements
6166           * into bytes in that case. GFX8 always uses bytes.
6167           */
6168          if (num_records && (chip == GFX8 || (chip != GFX9 && !stride))) {
6169             num_records = (num_records - 1) * stride + attrib_end;
6170          } else if (!num_records) {
6171             /* On GFX9, it seems bounds checking is disabled if both
6172              * num_records and stride are zero. This doesn't seem necessary on GFX8, GFX10 and
6173              * GFX10.3 but it doesn't hurt.
6174              */
6175             if (uses_dynamic_inputs) {
6176                desc[0] = 0;
6177                desc[1] = S_008F04_STRIDE(16);
6178                desc[2] = 0;
6179                desc[3] = rsrc_word3;
6180             } else {
6181                memset(desc, 0, 16);
6182             }
6183 
6184             continue;
6185          }
6186       } else {
6187          if (chip != GFX8 && stride)
6188             num_records = DIV_ROUND_UP(num_records, stride);
6189       }
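      /* Worked example for the per-attribute path above (made-up numbers, attrib_index_offset == 0):
       * with stride=16, attrib_offset=4 and an 8-byte format, attrib_end is 12, so a 100-byte binding
       * gives (100 - 12) / 16 + 1 = 6 records; on GFX8 this is converted back to bytes as
       * (6 - 1) * 16 + 12 = 92.
       */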
6190 
6191       if (chip >= GFX10) {
6192          /* OOB_SELECT chooses the out-of-bounds check:
6193           * - 1: index >= NUM_RECORDS (Structured)
6194           * - 3: offset >= NUM_RECORDS (Raw)
6195           */
6196          int oob_select = stride ? V_008F0C_OOB_SELECT_STRUCTURED : V_008F0C_OOB_SELECT_RAW;
6197          rsrc_word3 |= S_008F0C_OOB_SELECT(oob_select) | S_008F0C_RESOURCE_LEVEL(chip < GFX11);
6198       }
6199 
6200       uint64_t va = vbo_info.va;
6201       if (uses_dynamic_inputs)
6202          va += vbo_info.attrib_offset;
6203 
6204       desc[0] = va;
6205       desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) | S_008F04_STRIDE(stride);
6206       desc[2] = num_records;
6207       desc[3] = rsrc_word3;
6208    }
6209 }
6210 
6211 static void
6212 radv_flush_vertex_descriptors(struct radv_cmd_buffer *cmd_buffer)
6213 {
6214    struct radv_shader *vs = radv_get_shader(cmd_buffer->state.shaders, MESA_SHADER_VERTEX);
6215    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
6216 
6217    if (!vs->info.vs.vb_desc_usage_mask)
6218       return;
6219 
6220    /* Mesh shaders don't have vertex descriptors. */
6221    assert(!cmd_buffer->state.mesh_shading);
6222 
6223    unsigned vb_desc_alloc_size = util_bitcount(vs->info.vs.vb_desc_usage_mask) * 16;
6224    unsigned vb_offset;
6225    void *vb_ptr;
6226    uint64_t va;
6227 
6228    /* allocate some descriptor state for vertex buffers */
6229    if (!radv_cmd_buffer_upload_alloc(cmd_buffer, vb_desc_alloc_size, &vb_offset, &vb_ptr))
6230       return;
6231 
6232    radv_write_vertex_descriptors(cmd_buffer, vs, vb_ptr);
6233 
6234    va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
6235    va += vb_offset;
6236 
6237    radv_emit_userdata_address(device, cmd_buffer->cs, vs, AC_UD_VS_VERTEX_BUFFERS, va);
6238 
6239    cmd_buffer->state.vb_va = va;
6240    cmd_buffer->state.vb_size = vb_desc_alloc_size;
6241    cmd_buffer->state.prefetch_L2_mask |= RADV_PREFETCH_VBO_DESCRIPTORS;
6242 
6243    if (radv_device_fault_detection_enabled(device))
6244       radv_save_vertex_descriptors(cmd_buffer, (uintptr_t)vb_ptr);
6245 
6246    cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_VERTEX_BUFFER;
6247 }
6248 
6249 static void
6250 radv_emit_streamout_buffers(struct radv_cmd_buffer *cmd_buffer, uint64_t va)
6251 {
6252    const struct radv_shader *last_vgt_shader = cmd_buffer->state.last_vgt_shader;
6253    uint32_t streamout_buffers_offset = radv_get_user_sgpr_loc(last_vgt_shader, AC_UD_STREAMOUT_BUFFERS);
6254    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
6255 
6256    if (!streamout_buffers_offset)
6257       return;
6258 
6259    radv_emit_shader_pointer(device, cmd_buffer->cs, streamout_buffers_offset, va, false);
6260 
6261    if (cmd_buffer->state.gs_copy_shader) {
6262       streamout_buffers_offset = radv_get_user_sgpr_loc(cmd_buffer->state.gs_copy_shader, AC_UD_STREAMOUT_BUFFERS);
6263       if (streamout_buffers_offset)
6264          radv_emit_shader_pointer(device, cmd_buffer->cs, streamout_buffers_offset, va, false);
6265    }
6266 }
6267 
6268 static void
6269 radv_emit_streamout_state(struct radv_cmd_buffer *cmd_buffer, uint64_t va)
6270 {
6271    const struct radv_shader *last_vgt_shader = cmd_buffer->state.last_vgt_shader;
6272    const uint32_t streamout_state_offset = radv_get_user_sgpr_loc(last_vgt_shader, AC_UD_STREAMOUT_STATE);
6273    const struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
6274 
6275    if (!streamout_state_offset)
6276       return;
6277 
6278    radv_emit_shader_pointer(device, cmd_buffer->cs, streamout_state_offset, va, false);
6279 }
6280 
6281 static void
6282 radv_flush_streamout_descriptors(struct radv_cmd_buffer *cmd_buffer)
6283 {
6284    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
6285    const struct radv_physical_device *pdev = radv_device_physical(device);
6286 
6287    if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_STREAMOUT_BUFFER) {
6288       struct radv_streamout_binding *sb = cmd_buffer->streamout_bindings;
6289       struct radv_streamout_state *so = &cmd_buffer->state.streamout;
6290       unsigned so_offset;
6291       uint64_t desc_va;
6292       void *so_ptr;
6293 
6294       /* Allocate some descriptor state for streamout buffers. */
6295       if (!radv_cmd_buffer_upload_alloc(cmd_buffer, MAX_SO_BUFFERS * 16, &so_offset, &so_ptr))
6296          return;
6297 
6298       for (uint32_t i = 0; i < MAX_SO_BUFFERS; i++) {
6299          struct radv_buffer *buffer = sb[i].buffer;
6300          uint32_t *desc = &((uint32_t *)so_ptr)[i * 4];
6301          uint32_t size = 0;
6302          uint64_t va = 0;
6303 
6304          if (so->enabled_mask & (1 << i)) {
6305             va = radv_buffer_get_va(buffer->bo) + buffer->offset;
6306 
6307             va += sb[i].offset;
6308 
6309             /* Set the descriptor.
6310              *
6311              * On GFX8, the format must be non-INVALID, otherwise
6312              * the buffer will be considered not bound and store
6313              * instructions will be no-ops.
6314              */
6315             size = 0xffffffff;
6316 
6317             if (pdev->use_ngg_streamout) {
6318                /* With NGG streamout, the buffer size is used to determine the max emit per buffer
6319                 * and also acts as a disable bit when it's 0.
6320                 */
6321                size = radv_is_streamout_enabled(cmd_buffer) ? sb[i].size : 0;
6322             }
6323          }
6324 
6325          ac_build_raw_buffer_descriptor(pdev->info.gfx_level, va, size, desc);
6326       }
6327 
6328       desc_va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
6329       desc_va += so_offset;
6330 
6331       radv_emit_streamout_buffers(cmd_buffer, desc_va);
6332 
6333       if (pdev->info.gfx_level >= GFX12) {
6334          const uint8_t first_target = ffs(so->enabled_mask) - 1;
6335          unsigned state_offset;
6336          uint64_t state_va;
6337          void *state_ptr;
6338 
6339          /* The layout is:
6340           *    struct {
6341           *       struct {
6342           *          uint32_t ordered_id; // equal for all buffers
6343           *          uint32_t dwords_written;
6344           *       } buffer[4];
6345           *    };
6346           *
6347           * The buffer must be initialized to 0 and the address must be aligned to 64
6348           * because it's faster when the atomic doesn't straddle a 64B block boundary.
6349           */
6350          if (!radv_cmd_buffer_upload_alloc_aligned(cmd_buffer, MAX_SO_BUFFERS * 8, 64, &state_offset, &state_ptr))
6351             return;
6352 
6353          memset(state_ptr, 0, MAX_SO_BUFFERS * 8);
6354 
6355          state_va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
6356          state_va += state_offset;
6357 
6358          /* The first enabled streamout target will contain the ordered ID/offset buffer for all
6359           * targets.
6360           */
6361          state_va += first_target * 8;
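         /* For example, if enabled_mask is 0x6, first_target is 1 and state_va points at
          * buffer[1]'s {ordered_id, dwords_written} pair in the layout described above.
          */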
6362 
6363          radv_emit_streamout_state(cmd_buffer, state_va);
6364       }
6365    }
6366 
6367    cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_STREAMOUT_BUFFER;
6368 }
6369 
6370 static void
6371 radv_flush_shader_query_state_gfx(struct radv_cmd_buffer *cmd_buffer)
6372 {
6373    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
6374    const struct radv_physical_device *pdev = radv_device_physical(device);
6375    const struct radv_shader *last_vgt_shader = cmd_buffer->state.last_vgt_shader;
6376    const uint32_t shader_query_state_offset = radv_get_user_sgpr_loc(last_vgt_shader, AC_UD_SHADER_QUERY_STATE);
6377    enum radv_shader_query_state shader_query_state = radv_shader_query_none;
6378 
6379    if (!shader_query_state_offset)
6380       return;
6381 
6382    assert(last_vgt_shader->info.is_ngg || last_vgt_shader->info.stage == MESA_SHADER_GEOMETRY);
6383 
6384    /* By default shader queries are disabled but they are enabled if the command buffer has active GDS
6385     * queries or if it's a secondary command buffer that inherits the number of generated
6386     * primitives.
6387     */
6388    if (cmd_buffer->state.active_pipeline_gds_queries ||
6389        (cmd_buffer->state.inherited_pipeline_statistics &
6390         (VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT |
6391          VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_INVOCATIONS_BIT)) ||
6392        (pdev->emulate_mesh_shader_queries && (cmd_buffer->state.inherited_pipeline_statistics &
6393                                               VK_QUERY_PIPELINE_STATISTIC_MESH_SHADER_INVOCATIONS_BIT_EXT)))
6394       shader_query_state |= radv_shader_query_pipeline_stat;
6395 
6396    if (cmd_buffer->state.active_prims_gen_gds_queries)
6397       shader_query_state |= radv_shader_query_prim_gen;
6398 
6399    if (cmd_buffer->state.active_prims_xfb_gds_queries && radv_is_streamout_enabled(cmd_buffer)) {
6400       shader_query_state |= radv_shader_query_prim_xfb | radv_shader_query_prim_gen;
6401    }
6402 
6403    radeon_set_sh_reg(cmd_buffer->cs, shader_query_state_offset, shader_query_state);
6404 }
6405 
6406 static void
6407 radv_flush_shader_query_state_ace(struct radv_cmd_buffer *cmd_buffer, struct radv_shader *task_shader)
6408 {
6409    const uint32_t shader_query_state_offset = radv_get_user_sgpr_loc(task_shader, AC_UD_SHADER_QUERY_STATE);
6410    enum radv_shader_query_state shader_query_state = radv_shader_query_none;
6411 
6412    if (!shader_query_state_offset)
6413       return;
6414 
6415    /* By default shader queries are disabled but they are enabled if the command buffer has active ACE
6416     * queries or if it's a secondary command buffer that inherits the number of task shader
6417     * invocations query.
6418     */
6419    if (cmd_buffer->state.active_pipeline_ace_queries ||
6420        (cmd_buffer->state.inherited_pipeline_statistics & VK_QUERY_PIPELINE_STATISTIC_TASK_SHADER_INVOCATIONS_BIT_EXT))
6421       shader_query_state |= radv_shader_query_pipeline_stat;
6422 
6423    radeon_set_sh_reg(cmd_buffer->gang.cs, shader_query_state_offset, shader_query_state);
6424 }
6425 
6426 static void
6427 radv_flush_shader_query_state(struct radv_cmd_buffer *cmd_buffer)
6428 {
6429    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
6430    const struct radv_physical_device *pdev = radv_device_physical(device);
6431 
6432    radv_flush_shader_query_state_gfx(cmd_buffer);
6433 
6434    if (radv_cmdbuf_has_stage(cmd_buffer, MESA_SHADER_TASK) && pdev->emulate_mesh_shader_queries)
6435       radv_flush_shader_query_state_ace(cmd_buffer, cmd_buffer->state.shaders[MESA_SHADER_TASK]);
6436 
6437    cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_SHADER_QUERY;
6438 }
6439 
6440 static void
6441 radv_flush_force_vrs_state(struct radv_cmd_buffer *cmd_buffer)
6442 {
6443    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
6444    const struct radv_physical_device *pdev = radv_device_physical(device);
6445    const struct radv_shader *last_vgt_shader = cmd_buffer->state.last_vgt_shader;
6446    uint32_t force_vrs_rates_offset;
6447 
6448    if (!last_vgt_shader->info.force_vrs_per_vertex) {
6449       /* Un-set the SGPR index so we know to re-emit it later. */
6450       cmd_buffer->state.last_force_vrs_rates_offset = -1;
6451       return;
6452    }
6453 
6454    if (cmd_buffer->state.gs_copy_shader) {
6455       force_vrs_rates_offset = radv_get_user_sgpr_loc(cmd_buffer->state.gs_copy_shader, AC_UD_FORCE_VRS_RATES);
6456    } else {
6457       force_vrs_rates_offset = radv_get_user_sgpr_loc(last_vgt_shader, AC_UD_FORCE_VRS_RATES);
6458    }
6459 
6460    enum amd_gfx_level gfx_level = pdev->info.gfx_level;
6461    uint32_t vrs_rates = 0;
6462 
6463    switch (device->force_vrs) {
6464    case RADV_FORCE_VRS_2x2:
6465       vrs_rates = gfx_level >= GFX11 ? V_0283D0_VRS_SHADING_RATE_2X2 : (1u << 2) | (1u << 4);
6466       break;
6467    case RADV_FORCE_VRS_2x1:
6468       vrs_rates = gfx_level >= GFX11 ? V_0283D0_VRS_SHADING_RATE_2X1 : (1u << 2) | (0u << 4);
6469       break;
6470    case RADV_FORCE_VRS_1x2:
6471       vrs_rates = gfx_level >= GFX11 ? V_0283D0_VRS_SHADING_RATE_1X2 : (0u << 2) | (1u << 4);
6472       break;
6473    default:
6474       break;
6475    }
6476 
6477    if (cmd_buffer->state.last_vrs_rates != vrs_rates ||
6478        cmd_buffer->state.last_force_vrs_rates_offset != force_vrs_rates_offset) {
6479       radeon_set_sh_reg(cmd_buffer->cs, force_vrs_rates_offset, vrs_rates);
6480    }
6481 
6482    cmd_buffer->state.last_vrs_rates = vrs_rates;
6483    cmd_buffer->state.last_force_vrs_rates_offset = force_vrs_rates_offset;
6484 }
6485 
6486 static void
6487 radv_upload_graphics_shader_descriptors(struct radv_cmd_buffer *cmd_buffer)
6488 {
6489    if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_VERTEX_BUFFER)
6490       radv_flush_vertex_descriptors(cmd_buffer);
6491 
6492    radv_flush_streamout_descriptors(cmd_buffer);
6493 
6494    VkShaderStageFlags stages = VK_SHADER_STAGE_ALL_GRAPHICS;
6495    radv_flush_descriptors(cmd_buffer, stages, VK_PIPELINE_BIND_POINT_GRAPHICS);
6496 
6497    const VkShaderStageFlags pc_stages = radv_must_flush_constants(cmd_buffer, stages, VK_PIPELINE_BIND_POINT_GRAPHICS);
6498    if (pc_stages)
6499       radv_flush_constants(cmd_buffer, pc_stages, VK_PIPELINE_BIND_POINT_GRAPHICS);
6500 
6501    radv_flush_force_vrs_state(cmd_buffer);
6502 }
6503 
6504 struct radv_draw_info {
6505    /**
6506     * Number of vertices.
6507     */
6508    uint32_t count;
6509 
6510    /**
6511     * First instance id.
6512     */
6513    uint32_t first_instance;
6514 
6515    /**
6516     * Number of instances.
6517     */
6518    uint32_t instance_count;
6519 
6520    /**
6521     * Whether it's an indexed draw.
6522     */
6523    bool indexed;
6524 
6525    /**
6526     * Indirect draw parameters resource.
6527     */
6528    struct radv_buffer *indirect;
6529    uint64_t indirect_offset;
6530    uint32_t stride;
6531 
6532    /**
6533     * Draw count parameters resource.
6534     */
6535    struct radv_buffer *count_buffer;
6536    uint64_t count_buffer_offset;
6537 
6538    /**
6539     * Stream output parameters resource.
6540     */
6541    struct radv_buffer *strmout_buffer;
6542    uint64_t strmout_buffer_offset;
6543 };
6544 
6545 struct radv_prim_vertex_count {
6546    uint8_t min;
6547    uint8_t incr;
6548 };
6549 
6550 static inline unsigned
6551 radv_prims_for_vertices(struct radv_prim_vertex_count *info, unsigned num)
6552 {
6553    if (num == 0)
6554       return 0;
6555 
6556    if (info->incr == 0)
6557       return 0;
6558 
6559    if (num < info->min)
6560       return 0;
6561 
6562    return 1 + ((num - info->min) / info->incr);
6563 }
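/* Example evaluations using the per-topology {min, incr} pairs from prim_size_table below: a triangle
 * strip ({3, 1}) with num == 6 gives 1 + (6 - 3) / 1 = 4 primitives, while a line list ({2, 2}) with
 * num == 7 gives 1 + (7 - 2) / 2 = 3 complete lines.
 */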
6564 
6565 static const struct radv_prim_vertex_count prim_size_table[] = {
6566    [V_008958_DI_PT_NONE] = {0, 0},          [V_008958_DI_PT_POINTLIST] = {1, 1},
6567    [V_008958_DI_PT_LINELIST] = {2, 2},      [V_008958_DI_PT_LINESTRIP] = {2, 1},
6568    [V_008958_DI_PT_TRILIST] = {3, 3},       [V_008958_DI_PT_TRIFAN] = {3, 1},
6569    [V_008958_DI_PT_TRISTRIP] = {3, 1},      [V_008958_DI_PT_LINELIST_ADJ] = {4, 4},
6570    [V_008958_DI_PT_LINESTRIP_ADJ] = {4, 1}, [V_008958_DI_PT_TRILIST_ADJ] = {6, 6},
6571    [V_008958_DI_PT_TRISTRIP_ADJ] = {6, 2},  [V_008958_DI_PT_RECTLIST] = {3, 3},
6572    [V_008958_DI_PT_LINELOOP] = {2, 1},      [V_008958_DI_PT_POLYGON] = {3, 1},
6573    [V_008958_DI_PT_2D_TRI_STRIP] = {0, 0},
6574 };
6575 
6576 static uint32_t
6577 radv_get_ia_multi_vgt_param(struct radv_cmd_buffer *cmd_buffer, bool instanced_draw, bool indirect_draw,
6578                             bool count_from_stream_output, uint32_t draw_vertex_count, unsigned topology,
6579                             bool prim_restart_enable, unsigned patch_control_points, unsigned num_tess_patches)
6580 {
6581    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
6582    const struct radv_physical_device *pdev = radv_device_physical(device);
6583    const struct radeon_info *gpu_info = &pdev->info;
6584    const unsigned max_primgroup_in_wave = 2;
6585    /* SWITCH_ON_EOP(0) is always preferable. */
6586    bool wd_switch_on_eop = false;
6587    bool ia_switch_on_eop = false;
6588    bool ia_switch_on_eoi = false;
6589    bool partial_vs_wave = false;
6590    bool partial_es_wave = cmd_buffer->state.ia_multi_vgt_param.partial_es_wave;
6591    bool multi_instances_smaller_than_primgroup;
6592    struct radv_prim_vertex_count prim_vertex_count = prim_size_table[topology];
6593    unsigned primgroup_size;
6594 
6595    if (radv_cmdbuf_has_stage(cmd_buffer, MESA_SHADER_TESS_CTRL)) {
6596       primgroup_size = num_tess_patches;
6597    } else if (radv_cmdbuf_has_stage(cmd_buffer, MESA_SHADER_GEOMETRY)) {
6598       primgroup_size = 64;
6599    } else {
6600       primgroup_size = 128; /* recommended without a GS */
6601    }
6602 
6603    /* GS requirement. */
6604    if (radv_cmdbuf_has_stage(cmd_buffer, MESA_SHADER_GEOMETRY) && gpu_info->gfx_level <= GFX8) {
6605       unsigned gs_table_depth = pdev->gs_table_depth;
6606       if (SI_GS_PER_ES / primgroup_size >= gs_table_depth - 3)
6607          partial_es_wave = true;
6608    }
6609 
6610    if (radv_cmdbuf_has_stage(cmd_buffer, MESA_SHADER_TESS_CTRL)) {
6611       if (topology == V_008958_DI_PT_PATCH) {
6612          prim_vertex_count.min = patch_control_points;
6613          prim_vertex_count.incr = 1;
6614       }
6615    }
6616 
6617    multi_instances_smaller_than_primgroup = indirect_draw;
6618    if (!multi_instances_smaller_than_primgroup && instanced_draw) {
6619       uint32_t num_prims = radv_prims_for_vertices(&prim_vertex_count, draw_vertex_count);
6620       if (num_prims < primgroup_size)
6621          multi_instances_smaller_than_primgroup = true;
6622    }
6623 
6624    ia_switch_on_eoi = cmd_buffer->state.ia_multi_vgt_param.ia_switch_on_eoi;
6625    partial_vs_wave = cmd_buffer->state.ia_multi_vgt_param.partial_vs_wave;
6626 
6627    if (gpu_info->gfx_level >= GFX7) {
6628       /* WD_SWITCH_ON_EOP has no effect on GPUs with less than
6629        * 4 shader engines. Set 1 to pass the assertion below.
6630        * The other cases are hardware requirements. */
6631       if (gpu_info->max_se < 4 || topology == V_008958_DI_PT_POLYGON || topology == V_008958_DI_PT_LINELOOP ||
6632           topology == V_008958_DI_PT_TRIFAN || topology == V_008958_DI_PT_TRISTRIP_ADJ ||
6633           (prim_restart_enable && (gpu_info->family < CHIP_POLARIS10 ||
6634                                    (topology != V_008958_DI_PT_POINTLIST && topology != V_008958_DI_PT_LINESTRIP))))
6635          wd_switch_on_eop = true;
6636 
6637       /* Hawaii hangs if instancing is enabled and WD_SWITCH_ON_EOP is 0.
6638        * We don't know that for indirect drawing, so treat it as
6639        * always problematic. */
6640       if (gpu_info->family == CHIP_HAWAII && (instanced_draw || indirect_draw))
6641          wd_switch_on_eop = true;
6642 
6643       /* Performance recommendation for 4 SE Gfx7-8 parts if
6644        * instances are smaller than a primgroup.
6645        * Assume indirect draws always use small instances.
6646        * This is needed for good VS wave utilization.
6647        */
6648       if (gpu_info->gfx_level <= GFX8 && gpu_info->max_se == 4 && multi_instances_smaller_than_primgroup)
6649          wd_switch_on_eop = true;
6650 
6651       /* Hardware requirement when drawing primitives from a stream
6652        * output buffer.
6653        */
6654       if (count_from_stream_output)
6655          wd_switch_on_eop = true;
6656 
6657       /* Required on GFX7 and later. */
6658       if (gpu_info->max_se > 2 && !wd_switch_on_eop)
6659          ia_switch_on_eoi = true;
6660 
6661       /* Required by Hawaii and, for some special cases, by GFX8. */
6662       if (ia_switch_on_eoi &&
6663           (gpu_info->family == CHIP_HAWAII ||
6664            (gpu_info->gfx_level == GFX8 &&
6665             /* max primgroup in wave is always 2 - leave this for documentation */
6666             (radv_cmdbuf_has_stage(cmd_buffer, MESA_SHADER_GEOMETRY) || max_primgroup_in_wave != 2))))
6667          partial_vs_wave = true;
6668 
6669       /* Instancing bug on Bonaire. */
6670       if (gpu_info->family == CHIP_BONAIRE && ia_switch_on_eoi && (instanced_draw || indirect_draw))
6671          partial_vs_wave = true;
6672 
6673       /* If the WD switch is false, the IA switch must be false too. */
6674       assert(wd_switch_on_eop || !ia_switch_on_eop);
6675    }
6676    /* If SWITCH_ON_EOI is set, PARTIAL_ES_WAVE must be set too. */
6677    if (gpu_info->gfx_level <= GFX8 && ia_switch_on_eoi)
6678       partial_es_wave = true;
6679 
6680    if (radv_cmdbuf_has_stage(cmd_buffer, MESA_SHADER_GEOMETRY)) {
6681       /* GS hw bug with single-primitive instances and SWITCH_ON_EOI.
6682        * The hw doc says all multi-SE chips are affected, but amdgpu-pro Vulkan
6683        * only applies it to Hawaii. Do what amdgpu-pro Vulkan does.
6684        */
6685       if (gpu_info->family == CHIP_HAWAII && ia_switch_on_eoi) {
6686          bool set_vgt_flush = indirect_draw;
6687          if (!set_vgt_flush && instanced_draw) {
6688             uint32_t num_prims = radv_prims_for_vertices(&prim_vertex_count, draw_vertex_count);
6689             if (num_prims <= 1)
6690                set_vgt_flush = true;
6691          }
6692          if (set_vgt_flush)
6693             cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_VGT_FLUSH;
6694       }
6695    }
6696 
6697    /* Workaround for a VGT hang when strip primitive types are used with
6698     * primitive restart.
6699     */
6700    if (prim_restart_enable && (topology == V_008958_DI_PT_LINESTRIP || topology == V_008958_DI_PT_TRISTRIP ||
6701                                topology == V_008958_DI_PT_LINESTRIP_ADJ || topology == V_008958_DI_PT_TRISTRIP_ADJ)) {
6702       partial_vs_wave = true;
6703    }
6704 
6705    return cmd_buffer->state.ia_multi_vgt_param.base | S_028AA8_PRIMGROUP_SIZE(primgroup_size - 1) |
6706           S_028AA8_SWITCH_ON_EOP(ia_switch_on_eop) | S_028AA8_SWITCH_ON_EOI(ia_switch_on_eoi) |
6707           S_028AA8_PARTIAL_VS_WAVE_ON(partial_vs_wave) | S_028AA8_PARTIAL_ES_WAVE_ON(partial_es_wave) |
6708           S_028AA8_WD_SWITCH_ON_EOP(gpu_info->gfx_level >= GFX7 ? wd_switch_on_eop : 0);
6709 }
6710 
6711 static void
6712 radv_emit_ia_multi_vgt_param(struct radv_cmd_buffer *cmd_buffer, bool instanced_draw, bool indirect_draw,
6713                              bool count_from_stream_output, uint32_t draw_vertex_count)
6714 {
6715    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
6716    const struct radv_physical_device *pdev = radv_device_physical(device);
6717    const struct radeon_info *gpu_info = &pdev->info;
6718    struct radv_cmd_state *state = &cmd_buffer->state;
6719    const unsigned patch_control_points = state->dynamic.vk.ts.patch_control_points;
6720    const unsigned topology = state->dynamic.vk.ia.primitive_topology;
6721    const bool prim_restart_enable = state->dynamic.vk.ia.primitive_restart_enable;
6722    struct radeon_cmdbuf *cs = cmd_buffer->cs;
6723    unsigned ia_multi_vgt_param;
6724 
6725    ia_multi_vgt_param = radv_get_ia_multi_vgt_param(cmd_buffer, instanced_draw, indirect_draw, count_from_stream_output,
6726                                                     draw_vertex_count, topology, prim_restart_enable,
6727                                                     patch_control_points, state->tess_num_patches);
6728 
6729    if (state->last_ia_multi_vgt_param != ia_multi_vgt_param) {
6730       if (gpu_info->gfx_level == GFX9) {
6731          radeon_set_uconfig_reg_idx(&pdev->info, cs, R_030960_IA_MULTI_VGT_PARAM, 4, ia_multi_vgt_param);
6732       } else if (gpu_info->gfx_level >= GFX7) {
6733          radeon_set_context_reg_idx(cs, R_028AA8_IA_MULTI_VGT_PARAM, 1, ia_multi_vgt_param);
6734       } else {
6735          radeon_set_context_reg(cs, R_028AA8_IA_MULTI_VGT_PARAM, ia_multi_vgt_param);
6736       }
6737       state->last_ia_multi_vgt_param = ia_multi_vgt_param;
6738    }
6739 }
6740 
6741 static void
6742 gfx10_emit_ge_cntl(struct radv_cmd_buffer *cmd_buffer)
6743 {
6744    const struct radv_shader *last_vgt_shader = cmd_buffer->state.last_vgt_shader;
6745    struct radv_cmd_state *state = &cmd_buffer->state;
6746    bool break_wave_at_eoi = false;
6747    unsigned primgroup_size;
6748    unsigned ge_cntl;
6749 
6750    if (last_vgt_shader->info.is_ngg)
6751       return;
6752 
6753    if (radv_cmdbuf_has_stage(cmd_buffer, MESA_SHADER_TESS_CTRL)) {
6754       const struct radv_shader *tes = radv_get_shader(cmd_buffer->state.shaders, MESA_SHADER_TESS_EVAL);
6755 
6756       primgroup_size = state->tess_num_patches;
6757 
6758       if (cmd_buffer->state.shaders[MESA_SHADER_TESS_CTRL]->info.uses_prim_id || tes->info.uses_prim_id ||
6759           (tes->info.merged_shader_compiled_separately &&
6760            cmd_buffer->state.shaders[MESA_SHADER_GEOMETRY]->info.uses_prim_id)) {
6761          break_wave_at_eoi = true;
6762       }
6763    } else if (radv_cmdbuf_has_stage(cmd_buffer, MESA_SHADER_GEOMETRY)) {
6764       const struct radv_legacy_gs_info *gs_state = &cmd_buffer->state.shaders[MESA_SHADER_GEOMETRY]->info.gs_ring_info;
6765       primgroup_size = gs_state->gs_prims_per_subgroup;
6766    } else {
6767       primgroup_size = 128; /* recommended without a GS and tess */
6768    }
6769 
6770    ge_cntl = S_03096C_PRIM_GRP_SIZE_GFX10(primgroup_size) | S_03096C_VERT_GRP_SIZE(256) | /* disable vertex grouping */
6771              S_03096C_PACKET_TO_ONE_PA(0) /* this should only be set if LINE_STIPPLE_TEX_ENA == 1 */ |
6772              S_03096C_BREAK_WAVE_AT_EOI(break_wave_at_eoi);
6773 
6774    if (state->last_ge_cntl != ge_cntl) {
6775       radeon_set_uconfig_reg(cmd_buffer->cs, R_03096C_GE_CNTL, ge_cntl);
6776       state->last_ge_cntl = ge_cntl;
6777    }
6778 }
6779 
6780 static void
6781 radv_emit_draw_registers(struct radv_cmd_buffer *cmd_buffer, const struct radv_draw_info *draw_info)
6782 {
6783    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
6784    const struct radv_physical_device *pdev = radv_device_physical(device);
6785    const struct radeon_info *gpu_info = &pdev->info;
6786    struct radv_cmd_state *state = &cmd_buffer->state;
6787    struct radeon_cmdbuf *cs = cmd_buffer->cs;
6788    uint32_t topology = state->dynamic.vk.ia.primitive_topology;
6789    bool disable_instance_packing = false;
6790 
6791    /* Draw state. */
6792    if (gpu_info->gfx_level >= GFX10) {
6793       gfx10_emit_ge_cntl(cmd_buffer);
6794    } else {
6795       radv_emit_ia_multi_vgt_param(cmd_buffer, draw_info->instance_count > 1, draw_info->indirect,
6796                                    !!draw_info->strmout_buffer, draw_info->indirect ? 0 : draw_info->count);
6797    }
6798 
6799    /* RDNA2 is affected by a hardware bug: when instance packing is enabled for adjacent primitive
6800     * topologies and instance_count > 1, the pipeline stats generated by the GE are incorrect. The
6801     * workaround (disabling instance packing) needs to be applied for both indexed and non-indexed draws.
6802     */
6803    if (gpu_info->gfx_level == GFX10_3 && state->active_pipeline_queries > 0 &&
6804        (draw_info->instance_count > 1 || draw_info->indirect) &&
6805        (topology == V_008958_DI_PT_LINELIST_ADJ || topology == V_008958_DI_PT_LINESTRIP_ADJ ||
6806         topology == V_008958_DI_PT_TRILIST_ADJ || topology == V_008958_DI_PT_TRISTRIP_ADJ)) {
6807       disable_instance_packing = true;
6808    }
6809 
6810    if ((draw_info->indexed && state->index_type != state->last_index_type) ||
6811        (gpu_info->gfx_level == GFX10_3 &&
6812         (state->last_index_type == -1 ||
6813          disable_instance_packing != G_028A7C_DISABLE_INSTANCE_PACKING(state->last_index_type)))) {
6814       uint32_t index_type = state->index_type | S_028A7C_DISABLE_INSTANCE_PACKING(disable_instance_packing);
6815 
6816       if (pdev->info.gfx_level >= GFX9) {
6817          radeon_set_uconfig_reg_idx(&pdev->info, cs, R_03090C_VGT_INDEX_TYPE, 2, index_type);
6818       } else {
6819          radeon_emit(cs, PKT3(PKT3_INDEX_TYPE, 0, 0));
6820          radeon_emit(cs, index_type);
6821       }
6822 
6823       state->last_index_type = index_type;
6824    }
6825 }
6826 
6827 static void
6828 radv_stage_flush(struct radv_cmd_buffer *cmd_buffer, VkPipelineStageFlags2 src_stage_mask)
6829 {
6830    /* For simplicity, if the barrier wants to wait for the task shader,
6831     * just make it wait for the mesh shader too.
6832     */
6833    if (src_stage_mask & VK_PIPELINE_STAGE_2_TASK_SHADER_BIT_EXT)
6834       src_stage_mask |= VK_PIPELINE_STAGE_2_MESH_SHADER_BIT_EXT;
6835 
6836    if (src_stage_mask & (VK_PIPELINE_STAGE_2_COPY_BIT | VK_PIPELINE_STAGE_2_RESOLVE_BIT | VK_PIPELINE_STAGE_2_BLIT_BIT |
6837                          VK_PIPELINE_STAGE_2_CLEAR_BIT)) {
6838       /* Be conservative for now. */
6839       src_stage_mask |= VK_PIPELINE_STAGE_2_ALL_TRANSFER_BIT;
6840    }
6841 
6842    if (src_stage_mask &
6843        (VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT | VK_PIPELINE_STAGE_2_ALL_TRANSFER_BIT |
6844         VK_PIPELINE_STAGE_2_COMMAND_PREPROCESS_BIT_NV | VK_PIPELINE_STAGE_2_ACCELERATION_STRUCTURE_BUILD_BIT_KHR |
6845         VK_PIPELINE_STAGE_2_ACCELERATION_STRUCTURE_COPY_BIT_KHR | VK_PIPELINE_STAGE_2_RAY_TRACING_SHADER_BIT_KHR |
6846         VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT | VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT)) {
6847       cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_CS_PARTIAL_FLUSH;
6848    }
6849 
6850    if (src_stage_mask & (VK_PIPELINE_STAGE_2_FRAGMENT_SHADER_BIT | VK_PIPELINE_STAGE_2_EARLY_FRAGMENT_TESTS_BIT |
6851                          VK_PIPELINE_STAGE_2_LATE_FRAGMENT_TESTS_BIT | VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT |
6852                          VK_PIPELINE_STAGE_2_ALL_TRANSFER_BIT | VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT |
6853                          VK_PIPELINE_STAGE_2_ALL_GRAPHICS_BIT | VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT)) {
6854       cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_PS_PARTIAL_FLUSH;
6855    } else if (src_stage_mask &
6856               (VK_PIPELINE_STAGE_2_DRAW_INDIRECT_BIT | VK_PIPELINE_STAGE_2_VERTEX_INPUT_BIT |
6857                VK_PIPELINE_STAGE_2_VERTEX_SHADER_BIT | VK_PIPELINE_STAGE_2_TESSELLATION_CONTROL_SHADER_BIT |
6858                VK_PIPELINE_STAGE_2_TESSELLATION_EVALUATION_SHADER_BIT | VK_PIPELINE_STAGE_2_GEOMETRY_SHADER_BIT |
6859                VK_PIPELINE_STAGE_2_MESH_SHADER_BIT_EXT | VK_PIPELINE_STAGE_2_TRANSFORM_FEEDBACK_BIT_EXT |
6860                VK_PIPELINE_STAGE_2_PRE_RASTERIZATION_SHADERS_BIT)) {
6861       cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_VS_PARTIAL_FLUSH;
6862    }
6863 }
6864 
6865 static bool
6866 can_skip_buffer_l2_flushes(struct radv_device *device)
6867 {
6868    const struct radv_physical_device *pdev = radv_device_physical(device);
6869    return pdev->info.gfx_level == GFX9 || (pdev->info.gfx_level >= GFX10 && !pdev->info.tcc_rb_non_coherent);
6870 }
6871 
6872 /*
6873  * In Vulkan, barriers have two kinds of operations:
6874  *
6875  * - visibility (implemented with radv_src_access_flush)
6876  * - availability (implemented with radv_dst_access_flush)
6877  *
6878  * For a memory operation to observe the result of a previous memory operation,
6879  * one needs to do a visibility operation from the source memory and then an
6880  * availability operation to the target memory.
6881  *
6882  * The complication is that the availability and visibility operations do not
6883  * need to be in the same barrier.
6884  *
6885  * The cleanest way to implement this is to define the visibility operation to
6886  * bring the caches to a "state of rest", in which none of the caches below that
6887  * level are dirty.
6888  *
6889  * For GFX8 and earlier this would be VRAM/GTT with none of the caches dirty.
6890  *
6891  * For GFX9+ we can define the state at rest to be L2 instead of VRAM for all
6892  * buffers and for images marked as coherent, and VRAM/GTT for non-coherent
6893  * images. However, given the existence of memory barriers which do not specify
6894  * the image/buffer, it often devolves to just VRAM/GTT anyway.
6895  *
6896  * To help reduce the invalidations for GPUs that have L2 coherency between the
6897  * RB and the shader caches, we always invalidate L2 on the src side, as we can
6898  * use our knowledge of past usage to optimize flushes away.
6899  */
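/* A concrete illustration of the two halves, using the handlers below: a transfer write on the src side
 * yields FLUSH_AND_INV_CB | FLUSH_AND_INV_DB (plus the metadata flushes and, for non-coherent images, an
 * L2 invalidation), while a vertex attribute read on the dst side yields at least INV_VCACHE. The two
 * operations do not have to come from the same barrier.
 */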
6900 
6901 enum radv_cmd_flush_bits
6902 radv_src_access_flush(struct radv_cmd_buffer *cmd_buffer, VkPipelineStageFlags2 src_stages, VkAccessFlags2 src_flags,
6903                       const struct radv_image *image)
6904 {
6905    src_flags = vk_expand_src_access_flags2(src_stages, src_flags);
6906 
6907    bool has_CB_meta = true, has_DB_meta = true;
6908    bool image_is_coherent = image ? image->l2_coherent : false;
6909    enum radv_cmd_flush_bits flush_bits = 0;
6910 
6911    if (image) {
6912       if (!radv_image_has_CB_metadata(image))
6913          has_CB_meta = false;
6914       if (!radv_image_has_htile(image))
6915          has_DB_meta = false;
6916    }
6917 
6918    if (src_flags & VK_ACCESS_2_COMMAND_PREPROCESS_WRITE_BIT_NV)
6919       flush_bits |= RADV_CMD_FLAG_INV_L2;
6920 
6921    if (src_flags & (VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT | VK_ACCESS_2_ACCELERATION_STRUCTURE_WRITE_BIT_KHR)) {
6922       /* Since the STORAGE bit isn't set, we know that this is a meta operation.
6923        * On the dst flush side we skip CB/DB flushes without the STORAGE bit, so
6924        * set it here. */
6925       if (image && !(image->vk.usage & VK_IMAGE_USAGE_STORAGE_BIT)) {
6926          if (vk_format_is_depth_or_stencil(image->vk.format)) {
6927             flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB;
6928          } else {
6929             flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB;
6930          }
6931       }
6932 
6933       if (!image_is_coherent)
6934          flush_bits |= RADV_CMD_FLAG_INV_L2;
6935    }
6936 
6937    if (src_flags &
6938        (VK_ACCESS_2_TRANSFORM_FEEDBACK_WRITE_BIT_EXT | VK_ACCESS_2_TRANSFORM_FEEDBACK_COUNTER_WRITE_BIT_EXT)) {
6939       if (!image_is_coherent)
6940          flush_bits |= RADV_CMD_FLAG_WB_L2;
6941    }
6942 
6943    if (src_flags & VK_ACCESS_2_COLOR_ATTACHMENT_WRITE_BIT) {
6944       flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB;
6945       if (has_CB_meta)
6946          flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB_META;
6947    }
6948 
6949    if (src_flags & VK_ACCESS_2_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT) {
6950       flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB;
6951       if (has_DB_meta)
6952          flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB_META;
6953    }
6954 
6955    if (src_flags & VK_ACCESS_2_TRANSFER_WRITE_BIT) {
6956       flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB | RADV_CMD_FLAG_FLUSH_AND_INV_DB;
6957 
6958       if (!image_is_coherent)
6959          flush_bits |= RADV_CMD_FLAG_INV_L2;
6960       if (has_CB_meta)
6961          flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB_META;
6962       if (has_DB_meta)
6963          flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB_META;
6964    }
6965 
6966    return flush_bits;
6967 }
6968 
6969 enum radv_cmd_flush_bits
6970 radv_dst_access_flush(struct radv_cmd_buffer *cmd_buffer, VkPipelineStageFlags2 dst_stages, VkAccessFlags2 dst_flags,
6971                       const struct radv_image *image)
6972 {
6973    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
6974    const struct radv_physical_device *pdev = radv_device_physical(device);
6975    bool has_CB_meta = true, has_DB_meta = true;
6976    enum radv_cmd_flush_bits flush_bits = 0;
6977    bool flush_CB = true, flush_DB = true;
6978    bool image_is_coherent = image ? image->l2_coherent : false;
6979    bool flush_L2_metadata = false;
6980 
6981    dst_flags = vk_expand_dst_access_flags2(dst_stages, dst_flags);
6982 
6983    if (image) {
6984       if (!(image->vk.usage & VK_IMAGE_USAGE_STORAGE_BIT)) {
6985          flush_CB = false;
6986          flush_DB = false;
6987       }
6988 
6989       if (!radv_image_has_CB_metadata(image))
6990          has_CB_meta = false;
6991       if (!radv_image_has_htile(image))
6992          has_DB_meta = false;
6993    }
6994 
6995    flush_L2_metadata = (has_CB_meta || has_DB_meta) && pdev->info.gfx_level < GFX12;
6996 
6997    /* None of the L2 invalidations below target the CB/DB. So if there are no incoherent images
6998     * sitting in L2 in CB/DB mode, the data is already usable from all the other L2 clients. */
6999    image_is_coherent |= can_skip_buffer_l2_flushes(device) && !cmd_buffer->state.rb_noncoherent_dirty;
7000 
7001    if (dst_flags & VK_ACCESS_2_INDIRECT_COMMAND_READ_BIT) {
7002       /* SMEM loads are used to read the compute dispatch size in shaders. */
7003       if (!device->load_grid_size_from_user_sgpr)
7004          flush_bits |= RADV_CMD_FLAG_INV_SCACHE;
7005 
7006       /* Ensure the DGC meta shader can read the commands. */
7007       if (radv_uses_device_generated_commands(device)) {
7008          flush_bits |= RADV_CMD_FLAG_INV_SCACHE | RADV_CMD_FLAG_INV_VCACHE;
7009 
7010          if (pdev->info.gfx_level < GFX9)
7011             flush_bits |= RADV_CMD_FLAG_INV_L2;
7012       }
7013    }
7014 
7015    if (dst_flags & VK_ACCESS_2_UNIFORM_READ_BIT)
7016       flush_bits |= RADV_CMD_FLAG_INV_VCACHE | RADV_CMD_FLAG_INV_SCACHE;
7017 
7018    if (dst_flags & (VK_ACCESS_2_VERTEX_ATTRIBUTE_READ_BIT | VK_ACCESS_2_INPUT_ATTACHMENT_READ_BIT |
7019                     VK_ACCESS_2_TRANSFER_READ_BIT)) {
7020       flush_bits |= RADV_CMD_FLAG_INV_VCACHE;
7021 
7022       if (flush_L2_metadata)
7023          flush_bits |= RADV_CMD_FLAG_INV_L2_METADATA;
7024       if (!image_is_coherent)
7025          flush_bits |= RADV_CMD_FLAG_INV_L2;
7026    }
7027 
7028    if (dst_flags & VK_ACCESS_2_DESCRIPTOR_BUFFER_READ_BIT_EXT)
7029       flush_bits |= RADV_CMD_FLAG_INV_SCACHE;
7030 
7031    if (dst_flags & (VK_ACCESS_2_SHADER_STORAGE_READ_BIT | VK_ACCESS_2_SHADER_BINDING_TABLE_READ_BIT_KHR |
7032                     VK_ACCESS_2_ACCELERATION_STRUCTURE_READ_BIT_KHR | VK_ACCESS_2_SHADER_SAMPLED_READ_BIT)) {
7033       if (dst_flags & (VK_ACCESS_2_SHADER_STORAGE_READ_BIT | VK_ACCESS_2_SHADER_BINDING_TABLE_READ_BIT_KHR |
7034                        VK_ACCESS_2_ACCELERATION_STRUCTURE_READ_BIT_KHR)) {
7035          /* Unlike LLVM, ACO uses SMEM for SSBOs and we have to
7036           * invalidate the scalar cache. */
7037          if (!pdev->use_llvm && !image)
7038             flush_bits |= RADV_CMD_FLAG_INV_SCACHE;
7039       }
7040 
7041       flush_bits |= RADV_CMD_FLAG_INV_VCACHE;
7042       if (flush_L2_metadata)
7043          flush_bits |= RADV_CMD_FLAG_INV_L2_METADATA;
7044       if (!image_is_coherent)
7045          flush_bits |= RADV_CMD_FLAG_INV_L2;
7046    }
7047 
7048    if (dst_flags & VK_ACCESS_2_COMMAND_PREPROCESS_READ_BIT_NV) {
7049       flush_bits |= RADV_CMD_FLAG_INV_VCACHE;
7050       if (pdev->info.gfx_level < GFX9)
7051          flush_bits |= RADV_CMD_FLAG_INV_L2;
7052    }
7053 
7054    if (dst_flags & VK_ACCESS_2_COLOR_ATTACHMENT_READ_BIT) {
7055       if (flush_CB)
7056          flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB;
7057       if (has_CB_meta)
7058          flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_CB_META;
7059    }
7060 
7061    if (dst_flags & VK_ACCESS_2_DEPTH_STENCIL_ATTACHMENT_READ_BIT) {
7062       if (flush_DB)
7063          flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB;
7064       if (has_DB_meta)
7065          flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB_META;
7066    }
7067 
7068    return flush_bits;
7069 }
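/* Editorial example based on the function above: on a GPU where can_skip_buffer_l2_flushes()
 * returns true and no non-coherent RB writes are pending (rb_noncoherent_dirty == false),
 * image_is_coherent is forced to true, so a VK_ACCESS_2_TRANSFER_READ_BIT destination access
 * only needs RADV_CMD_FLAG_INV_VCACHE (plus INV_L2_METADATA on pre-GFX12 when the image has
 * CB/DB metadata) instead of a full INV_L2.
 */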
7070 
7071 void
7072 radv_emit_resolve_barrier(struct radv_cmd_buffer *cmd_buffer, const struct radv_resolve_barrier *barrier)
7073 {
7074    struct radv_rendering_state *render = &cmd_buffer->state.render;
7075 
7076    for (uint32_t i = 0; i < render->color_att_count; i++) {
7077       struct radv_image_view *iview = render->color_att[i].iview;
7078       if (!iview)
7079          continue;
7080 
7081       cmd_buffer->state.flush_bits |=
7082          radv_src_access_flush(cmd_buffer, barrier->src_stage_mask, barrier->src_access_mask, iview->image);
7083    }
7084    if (render->ds_att.iview) {
7085       cmd_buffer->state.flush_bits |= radv_src_access_flush(cmd_buffer, barrier->src_stage_mask,
7086                                                             barrier->src_access_mask, render->ds_att.iview->image);
7087    }
7088 
7089    radv_stage_flush(cmd_buffer, barrier->src_stage_mask);
7090 
7091    for (uint32_t i = 0; i < render->color_att_count; i++) {
7092       struct radv_image_view *iview = render->color_att[i].iview;
7093       if (!iview)
7094          continue;
7095 
7096       cmd_buffer->state.flush_bits |=
7097          radv_dst_access_flush(cmd_buffer, barrier->dst_stage_mask, barrier->dst_access_mask, iview->image);
7098    }
7099    if (render->ds_att.iview) {
7100       cmd_buffer->state.flush_bits |= radv_dst_access_flush(cmd_buffer, barrier->dst_stage_mask,
7101                                                             barrier->dst_access_mask, render->ds_att.iview->image);
7102    }
7103 
7104    radv_gang_barrier(cmd_buffer, barrier->src_stage_mask, barrier->dst_stage_mask);
7105 }
7106 
7107 static void
7108 radv_handle_image_transition_separate(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
7109                                       VkImageLayout src_layout, VkImageLayout dst_layout,
7110                                       VkImageLayout src_stencil_layout, VkImageLayout dst_stencil_layout,
7111                                       uint32_t src_family_index, uint32_t dst_family_index,
7112                                       const VkImageSubresourceRange *range,
7113                                       struct radv_sample_locations_state *sample_locs)
7114 {
7115    /* If we have a stencil layout that's different from depth, we need to
7116     * perform the stencil transition separately.
7117     */
7118    if ((range->aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT) &&
7119        (src_layout != src_stencil_layout || dst_layout != dst_stencil_layout)) {
7120       VkImageSubresourceRange aspect_range = *range;
7121       /* Depth-only transitions. */
7122       if (range->aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) {
7123          aspect_range.aspectMask = VK_IMAGE_ASPECT_DEPTH_BIT;
7124          radv_handle_image_transition(cmd_buffer, image, src_layout, dst_layout, src_family_index, dst_family_index,
7125                                       &aspect_range, sample_locs);
7126       }
7127 
7128       /* Stencil-only transitions. */
7129       aspect_range.aspectMask = VK_IMAGE_ASPECT_STENCIL_BIT;
7130       radv_handle_image_transition(cmd_buffer, image, src_stencil_layout, dst_stencil_layout, src_family_index,
7131                                    dst_family_index, &aspect_range, sample_locs);
7132    } else {
7133       radv_handle_image_transition(cmd_buffer, image, src_layout, dst_layout, src_family_index, dst_family_index, range,
7134                                    sample_locs);
7135    }
7136 }
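/* Editorial example: for a depth/stencil image where depth transitions
 * DEPTH_ATTACHMENT_OPTIMAL -> GENERAL while stencil stays in STENCIL_READ_ONLY_OPTIMAL,
 * the helper above issues one transition restricted to VK_IMAGE_ASPECT_DEPTH_BIT and one
 * restricted to VK_IMAGE_ASPECT_STENCIL_BIT; when both aspects share the same layouts it
 * falls through to a single combined transition.
 */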
7137 
7138 static void
7139 radv_handle_rendering_image_transition(struct radv_cmd_buffer *cmd_buffer, struct radv_image_view *view,
7140                                        uint32_t layer_count, uint32_t view_mask, VkImageLayout initial_layout,
7141                                        VkImageLayout initial_stencil_layout, VkImageLayout final_layout,
7142                                        VkImageLayout final_stencil_layout,
7143                                        struct radv_sample_locations_state *sample_locs)
7144 {
7145    VkImageSubresourceRange range;
7146    range.aspectMask = view->image->vk.aspects;
7147    range.baseMipLevel = view->vk.base_mip_level;
7148    range.levelCount = 1;
7149 
7150    if (view_mask) {
7151       while (view_mask) {
7152          int start, count;
7153          u_bit_scan_consecutive_range(&view_mask, &start, &count);
7154 
7155          range.baseArrayLayer = view->vk.base_array_layer + start;
7156          range.layerCount = count;
7157 
7158          radv_handle_image_transition_separate(cmd_buffer, view->image, initial_layout, final_layout,
7159                                                initial_stencil_layout, final_stencil_layout, 0, 0, &range, sample_locs);
7160       }
7161    } else {
7162       range.baseArrayLayer = view->vk.base_array_layer;
7163       range.layerCount = layer_count;
7164       radv_handle_image_transition_separate(cmd_buffer, view->image, initial_layout, final_layout,
7165                                             initial_stencil_layout, final_stencil_layout, 0, 0, &range, sample_locs);
7166    }
7167 }
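/* Editorial example: with view_mask = 0x6 (views 1 and 2), u_bit_scan_consecutive_range()
 * yields start = 1, count = 2, so a single transition covers
 * baseArrayLayer = view->vk.base_array_layer + 1 with layerCount = 2; with view_mask = 0x5
 * the loop issues two separate one-layer transitions.
 */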
7168 
7169 VKAPI_ATTR VkResult VKAPI_CALL
7170 radv_BeginCommandBuffer(VkCommandBuffer commandBuffer, const VkCommandBufferBeginInfo *pBeginInfo)
7171 {
7172    VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
7173    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
7174    const struct radv_physical_device *pdev = radv_device_physical(device);
7175    VkResult result = VK_SUCCESS;
7176 
7177    vk_command_buffer_begin(&cmd_buffer->vk, pBeginInfo);
7178 
7179    if (cmd_buffer->qf == RADV_QUEUE_SPARSE)
7180       return result;
7181 
7182    memset(&cmd_buffer->state, 0, sizeof(cmd_buffer->state));
7183    cmd_buffer->state.last_index_type = -1;
7184    cmd_buffer->state.last_num_instances = -1;
7185    cmd_buffer->state.last_vertex_offset_valid = false;
7186    cmd_buffer->state.last_first_instance = -1;
7187    cmd_buffer->state.last_drawid = -1;
7188    cmd_buffer->state.last_subpass_color_count = MAX_RTS;
7189    cmd_buffer->state.predication_type = -1;
7190    cmd_buffer->state.mesh_shading = false;
7191    cmd_buffer->state.last_vrs_rates = -1;
7192    cmd_buffer->state.last_force_vrs_rates_offset = -1;
7193 
7194    radv_reset_tracked_regs(cmd_buffer);
7195 
7196    cmd_buffer->usage_flags = pBeginInfo->flags;
7197 
7198    cmd_buffer->state.dirty |=
7199       RADV_CMD_DIRTY_GUARDBAND | RADV_CMD_DIRTY_OCCLUSION_QUERY | RADV_CMD_DIRTY_DB_SHADER_CONTROL;
7200    cmd_buffer->state.dirty_dynamic |= RADV_DYNAMIC_ALL;
7201 
7202    if (cmd_buffer->qf == RADV_QUEUE_GENERAL)
7203       vk_dynamic_graphics_state_init(&cmd_buffer->state.dynamic.vk);
7204 
7205    if (cmd_buffer->qf == RADV_QUEUE_COMPUTE || device->vk.enabled_features.taskShader) {
7206       uint32_t pred_value = 0;
7207       uint32_t pred_offset;
7208       if (!radv_cmd_buffer_upload_data(cmd_buffer, 4, &pred_value, &pred_offset))
7209          vk_command_buffer_set_error(&cmd_buffer->vk, VK_ERROR_OUT_OF_HOST_MEMORY);
7210 
7211       cmd_buffer->state.mec_inv_pred_emitted = false;
7212       cmd_buffer->state.mec_inv_pred_va = radv_buffer_get_va(cmd_buffer->upload.upload_bo) + pred_offset;
7213    }
7214 
7215    if (pdev->info.gfx_level >= GFX9 && cmd_buffer->qf == RADV_QUEUE_GENERAL) {
7216       unsigned num_db = pdev->info.max_render_backends;
7217       unsigned fence_offset, eop_bug_offset;
7218       void *fence_ptr;
7219 
7220       radv_cmd_buffer_upload_alloc(cmd_buffer, 8, &fence_offset, &fence_ptr);
7221       memset(fence_ptr, 0, 8);
7222 
7223       cmd_buffer->gfx9_fence_va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
7224       cmd_buffer->gfx9_fence_va += fence_offset;
7225 
7226       radv_emit_clear_data(cmd_buffer, V_370_PFP, cmd_buffer->gfx9_fence_va, 8);
7227 
7228       if (pdev->info.gfx_level == GFX9) {
7229          /* Allocate a buffer for the EOP bug on GFX9. */
7230          radv_cmd_buffer_upload_alloc(cmd_buffer, 16 * num_db, &eop_bug_offset, &fence_ptr);
7231          memset(fence_ptr, 0, 16 * num_db);
7232          cmd_buffer->gfx9_eop_bug_va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
7233          cmd_buffer->gfx9_eop_bug_va += eop_bug_offset;
7234 
7235          radv_emit_clear_data(cmd_buffer, V_370_PFP, cmd_buffer->gfx9_eop_bug_va, 16 * num_db);
7236       }
7237    }
7238 
7239    if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY &&
7240        (pBeginInfo->flags & VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT)) {
7241 
7242       char gcbiar_data[VK_GCBIARR_DATA_SIZE(MAX_RTS)];
7243       const VkRenderingInfo *resume_info =
7244          vk_get_command_buffer_inheritance_as_rendering_resume(cmd_buffer->vk.level, pBeginInfo, gcbiar_data);
7245       if (resume_info) {
7246          radv_CmdBeginRendering(commandBuffer, resume_info);
7247       } else {
7248          const VkCommandBufferInheritanceRenderingInfo *inheritance_info =
7249             vk_get_command_buffer_inheritance_rendering_info(cmd_buffer->vk.level, pBeginInfo);
7250 
7251          radv_cmd_buffer_reset_rendering(cmd_buffer);
7252          struct radv_rendering_state *render = &cmd_buffer->state.render;
7253          render->active = true;
7254          render->view_mask = inheritance_info->viewMask;
7255          render->max_samples = inheritance_info->rasterizationSamples;
7256          render->color_att_count = inheritance_info->colorAttachmentCount;
7257          for (uint32_t i = 0; i < render->color_att_count; i++) {
7258             render->color_att[i] = (struct radv_attachment){
7259                .format = inheritance_info->pColorAttachmentFormats[i],
7260             };
7261          }
7262          assert(inheritance_info->depthAttachmentFormat == VK_FORMAT_UNDEFINED ||
7263                 inheritance_info->stencilAttachmentFormat == VK_FORMAT_UNDEFINED ||
7264                 inheritance_info->depthAttachmentFormat == inheritance_info->stencilAttachmentFormat);
7265          render->ds_att = (struct radv_attachment){.iview = NULL};
7266          if (inheritance_info->depthAttachmentFormat != VK_FORMAT_UNDEFINED)
7267             render->ds_att.format = inheritance_info->depthAttachmentFormat;
7268          if (inheritance_info->stencilAttachmentFormat != VK_FORMAT_UNDEFINED)
7269             render->ds_att.format = inheritance_info->stencilAttachmentFormat;
7270 
7271          if (vk_format_has_depth(render->ds_att.format))
7272             render->ds_att_aspects |= VK_IMAGE_ASPECT_DEPTH_BIT;
7273          if (vk_format_has_stencil(render->ds_att.format))
7274             render->ds_att_aspects |= VK_IMAGE_ASPECT_STENCIL_BIT;
7275       }
7276 
7277       cmd_buffer->state.inherited_pipeline_statistics = pBeginInfo->pInheritanceInfo->pipelineStatistics;
7278 
7279       if (cmd_buffer->state.inherited_pipeline_statistics &
7280           (VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT |
7281            VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_INVOCATIONS_BIT))
7282          cmd_buffer->state.dirty |= RADV_CMD_DIRTY_SHADER_QUERY;
7283 
7284       cmd_buffer->state.inherited_occlusion_queries = pBeginInfo->pInheritanceInfo->occlusionQueryEnable;
7285       cmd_buffer->state.inherited_query_control_flags = pBeginInfo->pInheritanceInfo->queryFlags;
7286       if (cmd_buffer->state.inherited_occlusion_queries)
7287          cmd_buffer->state.dirty |= RADV_CMD_DIRTY_OCCLUSION_QUERY;
7288    }
7289 
7290    if (radv_device_fault_detection_enabled(device))
7291       radv_cmd_buffer_trace_emit(cmd_buffer);
7292 
7293    radv_describe_begin_cmd_buffer(cmd_buffer);
7294 
7295    return result;
7296 }
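/* Editorial note: a secondary command buffer recorded with
 * VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT either resumes the inherited dynamic
 * rendering (when resume info is available) or, as above, reconstructs a minimal
 * radv_rendering_state from VkCommandBufferInheritanceRenderingInfo: view mask, sample
 * count and attachment formats only, with no image views bound.
 */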
7297 
7298 VKAPI_ATTR void VKAPI_CALL
7299 radv_CmdBindVertexBuffers2(VkCommandBuffer commandBuffer, uint32_t firstBinding, uint32_t bindingCount,
7300                            const VkBuffer *pBuffers, const VkDeviceSize *pOffsets, const VkDeviceSize *pSizes,
7301                            const VkDeviceSize *pStrides)
7302 {
7303    VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
7304    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
7305    struct radv_vertex_binding *vb = cmd_buffer->vertex_bindings;
7306    const struct radv_vertex_input_state *vi_state = &cmd_buffer->state.vertex_input;
7307 
7308    /* We have to defer setting up the vertex buffers since we need the buffer
7309     * stride from the pipeline. */
7310 
7311    assert(firstBinding + bindingCount <= MAX_VBS);
7312 
7313    if (firstBinding + bindingCount > cmd_buffer->used_vertex_bindings)
7314       cmd_buffer->used_vertex_bindings = firstBinding + bindingCount;
7315 
7316    uint32_t misaligned_mask_invalid = 0;
7317 
7318    for (uint32_t i = 0; i < bindingCount; i++) {
7319       VK_FROM_HANDLE(radv_buffer, buffer, pBuffers[i]);
7320       uint32_t idx = firstBinding + i;
7321       VkDeviceSize size = pSizes ? pSizes[i] : 0;
7322       /* If pStrides is NULL, don't overwrite the strides specified by CmdSetVertexInputEXT. */
7323       VkDeviceSize stride = pStrides ? pStrides[i] : vb[idx].stride;
7324 
7325       if (!!cmd_buffer->vertex_binding_buffers[idx] != !!buffer ||
7326           (buffer && ((vb[idx].offset & 0x3) != (pOffsets[i] & 0x3) || (vb[idx].stride & 0x3) != (stride & 0x3)))) {
7327          misaligned_mask_invalid |= vi_state->bindings_match_attrib ? BITFIELD_BIT(idx) : 0xffffffff;
7328       }
7329 
7330       cmd_buffer->vertex_binding_buffers[idx] = buffer;
7331       vb[idx].offset = pOffsets[i];
7332       vb[idx].size = buffer ? vk_buffer_range(&buffer->vk, pOffsets[i], size) : size;
7333       vb[idx].stride = stride;
7334 
7335       uint32_t bit = BITFIELD_BIT(idx);
7336       if (buffer) {
7337          radv_cs_add_buffer(device->ws, cmd_buffer->cs, cmd_buffer->vertex_binding_buffers[idx]->bo);
7338          cmd_buffer->state.vbo_bound_mask |= bit;
7339       } else {
7340          cmd_buffer->state.vbo_bound_mask &= ~bit;
7341       }
7342    }
7343 
7344    if (misaligned_mask_invalid) {
7345       cmd_buffer->state.vbo_misaligned_mask_invalid = misaligned_mask_invalid;
7346       cmd_buffer->state.vbo_misaligned_mask &= ~misaligned_mask_invalid;
7347       cmd_buffer->state.vbo_unaligned_mask &= ~misaligned_mask_invalid;
7348    }
7349 
7350    cmd_buffer->state.dirty |= RADV_CMD_DIRTY_VERTEX_BUFFER;
7351    cmd_buffer->state.dirty_dynamic |= RADV_DYNAMIC_VERTEX_INPUT;
7352 }
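/* Editorial note: the checks above only invalidate the cached per-binding alignment state
 * (vbo_misaligned_mask / vbo_unaligned_mask) when a binding's buffer presence, offset & 0x3
 * or stride & 0x3 changes; untouched bindings keep their previously computed bits, and the
 * invalidated ones are recomputed when the dirty RADV_DYNAMIC_VERTEX_INPUT state is flushed.
 */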
7353 
7354 static uint32_t
7355 vk_to_index_type(VkIndexType type)
7356 {
7357    switch (type) {
7358    case VK_INDEX_TYPE_UINT8_KHR:
7359       return V_028A7C_VGT_INDEX_8;
7360    case VK_INDEX_TYPE_UINT16:
7361       return V_028A7C_VGT_INDEX_16;
7362    case VK_INDEX_TYPE_UINT32:
7363       return V_028A7C_VGT_INDEX_32;
7364    default:
7365       unreachable("invalid index type");
7366    }
7367 }
7368 
7369 static uint32_t
7370 radv_get_vgt_index_size(uint32_t type)
7371 {
7372    uint32_t index_type = G_028A7C_INDEX_TYPE(type);
7373    switch (index_type) {
7374    case V_028A7C_VGT_INDEX_8:
7375       return 1;
7376    case V_028A7C_VGT_INDEX_16:
7377       return 2;
7378    case V_028A7C_VGT_INDEX_32:
7379       return 4;
7380    default:
7381       unreachable("invalid index type");
7382    }
7383 }
7384 
7385 VKAPI_ATTR void VKAPI_CALL
7386 radv_CmdBindIndexBuffer2KHR(VkCommandBuffer commandBuffer, VkBuffer buffer, VkDeviceSize offset, VkDeviceSize size,
7387                             VkIndexType indexType)
7388 {
7389    VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
7390    VK_FROM_HANDLE(radv_buffer, index_buffer, buffer);
7391    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
7392    const struct radv_physical_device *pdev = radv_device_physical(device);
7393 
7394    cmd_buffer->state.index_type = vk_to_index_type(indexType);
7395 
7396    if (index_buffer) {
7397       cmd_buffer->state.index_va = radv_buffer_get_va(index_buffer->bo);
7398       cmd_buffer->state.index_va += index_buffer->offset + offset;
7399 
7400       int index_size = radv_get_vgt_index_size(vk_to_index_type(indexType));
7401       cmd_buffer->state.max_index_count = (vk_buffer_range(&index_buffer->vk, offset, size)) / index_size;
7402       radv_cs_add_buffer(device->ws, cmd_buffer->cs, index_buffer->bo);
7403    } else {
7404       cmd_buffer->state.index_va = 0;
7405       cmd_buffer->state.max_index_count = 0;
7406 
7407       if (pdev->info.has_null_index_buffer_clamping_bug)
7408          cmd_buffer->state.index_va = 0x2;
7409    }
7410 
7411    cmd_buffer->state.dirty |= RADV_CMD_DIRTY_INDEX_BUFFER;
7412 
7413    /* Primitive restart state depends on the index type. */
7414    if (cmd_buffer->state.dynamic.vk.ia.primitive_restart_enable)
7415       cmd_buffer->state.dirty_dynamic |= RADV_DYNAMIC_PRIMITIVE_RESTART_ENABLE;
7416 }
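/* Editorial example: binding a VK_INDEX_TYPE_UINT16 buffer with size = 64 gives
 * index_size = 2 and max_index_count = 32. With a NULL buffer, the index VA and
 * max_index_count are zeroed, except that a dummy VA of 0x2 is used on parts with the
 * null-index-buffer clamping bug.
 */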
7417 
7418 static void
7419 radv_bind_descriptor_set(struct radv_cmd_buffer *cmd_buffer, VkPipelineBindPoint bind_point,
7420                          struct radv_descriptor_set *set, unsigned idx)
7421 {
7422    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
7423    struct radeon_winsys *ws = device->ws;
7424 
7425    radv_set_descriptor_set(cmd_buffer, bind_point, set, idx);
7426 
7427    assert(set);
7428    assert(!(set->header.layout->flags & VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR));
7429 
7430    if (!device->use_global_bo_list) {
7431       for (unsigned j = 0; j < set->header.buffer_count; ++j)
7432          if (set->descriptors[j])
7433             radv_cs_add_buffer(ws, cmd_buffer->cs, set->descriptors[j]);
7434    }
7435 
7436    if (set->header.bo)
7437       radv_cs_add_buffer(ws, cmd_buffer->cs, set->header.bo);
7438 }
7439 
7440 static void
7441 radv_bind_descriptor_sets(struct radv_cmd_buffer *cmd_buffer,
7442                           const VkBindDescriptorSetsInfoKHR *pBindDescriptorSetsInfo, VkPipelineBindPoint bind_point)
7443 {
7444    VK_FROM_HANDLE(radv_pipeline_layout, layout, pBindDescriptorSetsInfo->layout);
7445    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
7446    const struct radv_physical_device *pdev = radv_device_physical(device);
7447    const struct radv_instance *instance = radv_physical_device_instance(pdev);
7448    const bool no_dynamic_bounds = instance->debug_flags & RADV_DEBUG_NO_DYNAMIC_BOUNDS;
7449    struct radv_descriptor_state *descriptors_state = radv_get_descriptors_state(cmd_buffer, bind_point);
7450    unsigned dyn_idx = 0;
7451 
7452    for (unsigned i = 0; i < pBindDescriptorSetsInfo->descriptorSetCount; ++i) {
7453       unsigned set_idx = i + pBindDescriptorSetsInfo->firstSet;
7454       VK_FROM_HANDLE(radv_descriptor_set, set, pBindDescriptorSetsInfo->pDescriptorSets[i]);
7455 
7456       if (!set)
7457          continue;
7458 
7459       /* If the set is already bound we only need to update the
7460        * (potentially changed) dynamic offsets. */
7461       if (descriptors_state->sets[set_idx] != set || !(descriptors_state->valid & (1u << set_idx))) {
7462          radv_bind_descriptor_set(cmd_buffer, bind_point, set, set_idx);
7463       }
7464 
7465       for (unsigned j = 0; j < set->header.layout->dynamic_offset_count; ++j, ++dyn_idx) {
7466          unsigned idx = j + layout->set[i + pBindDescriptorSetsInfo->firstSet].dynamic_offset_start;
7467          uint32_t *dst = descriptors_state->dynamic_buffers + idx * 4;
7468          assert(dyn_idx < pBindDescriptorSetsInfo->dynamicOffsetCount);
7469 
7470          struct radv_descriptor_range *range = set->header.dynamic_descriptors + j;
7471 
7472          if (!range->va) {
7473             memset(dst, 0, 4 * 4);
7474          } else {
7475             uint64_t va = range->va + pBindDescriptorSetsInfo->pDynamicOffsets[dyn_idx];
7476             const uint32_t size = no_dynamic_bounds ? 0xffffffffu : range->size;
7477 
7478             ac_build_raw_buffer_descriptor(pdev->info.gfx_level, va, size, dst);
7479          }
7480 
7481          cmd_buffer->push_constant_stages |= set->header.layout->dynamic_shader_stages;
7482       }
7483    }
7484 }
7485 
7486 VKAPI_ATTR void VKAPI_CALL
7487 radv_CmdBindDescriptorSets2KHR(VkCommandBuffer commandBuffer,
7488                                const VkBindDescriptorSetsInfoKHR *pBindDescriptorSetsInfo)
7489 {
7490    VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
7491 
7492    if (pBindDescriptorSetsInfo->stageFlags & VK_SHADER_STAGE_COMPUTE_BIT) {
7493       radv_bind_descriptor_sets(cmd_buffer, pBindDescriptorSetsInfo, VK_PIPELINE_BIND_POINT_COMPUTE);
7494    }
7495 
7496    if (pBindDescriptorSetsInfo->stageFlags & RADV_GRAPHICS_STAGE_BITS) {
7497       radv_bind_descriptor_sets(cmd_buffer, pBindDescriptorSetsInfo, VK_PIPELINE_BIND_POINT_GRAPHICS);
7498    }
7499 
7500    if (pBindDescriptorSetsInfo->stageFlags & RADV_RT_STAGE_BITS) {
7501       radv_bind_descriptor_sets(cmd_buffer, pBindDescriptorSetsInfo, VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR);
7502    }
7503 }
7504 
7505 static bool
7506 radv_init_push_descriptor_set(struct radv_cmd_buffer *cmd_buffer, struct radv_descriptor_set *set,
7507                               struct radv_descriptor_set_layout *layout, VkPipelineBindPoint bind_point)
7508 {
7509    struct radv_descriptor_state *descriptors_state = radv_get_descriptors_state(cmd_buffer, bind_point);
7510    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
7511    set->header.size = layout->size;
7512 
7513    if (set->header.layout != layout) {
7514       if (set->header.layout)
7515          vk_descriptor_set_layout_unref(&device->vk, &set->header.layout->vk);
7516       vk_descriptor_set_layout_ref(&layout->vk);
7517       set->header.layout = layout;
7518    }
7519 
7520    if (descriptors_state->push_set.capacity < set->header.size) {
7521       size_t new_size = MAX2(set->header.size, 1024);
7522       new_size = MAX2(new_size, 2 * descriptors_state->push_set.capacity);
7523       new_size = MIN2(new_size, 96 * MAX_PUSH_DESCRIPTORS);
7524 
7525       free(set->header.mapped_ptr);
7526       set->header.mapped_ptr = malloc(new_size);
7527 
7528       if (!set->header.mapped_ptr) {
7529          descriptors_state->push_set.capacity = 0;
7530          vk_command_buffer_set_error(&cmd_buffer->vk, VK_ERROR_OUT_OF_HOST_MEMORY);
7531          return false;
7532       }
7533 
7534       descriptors_state->push_set.capacity = new_size;
7535    }
7536 
7537    return true;
7538 }
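/* Editorial example: the first push descriptor set with header.size = 256 allocates
 * MAX2(256, 1024) = 1024 bytes of host memory; a later set needing 1500 bytes grows the
 * allocation to MAX2(1500, 2 * 1024) = 2048 bytes, capped at 96 * MAX_PUSH_DESCRIPTORS.
 */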
7539 
7540 void
7541 radv_meta_push_descriptor_set(struct radv_cmd_buffer *cmd_buffer, VkPipelineBindPoint pipelineBindPoint,
7542                               VkPipelineLayout _layout, uint32_t set, uint32_t descriptorWriteCount,
7543                               const VkWriteDescriptorSet *pDescriptorWrites)
7544 {
7545    VK_FROM_HANDLE(radv_pipeline_layout, layout, _layout);
7546    struct radv_descriptor_set *push_set = (struct radv_descriptor_set *)&cmd_buffer->meta_push_descriptors;
7547    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
7548    unsigned bo_offset;
7549 
7550    assert(set == 0);
7551    assert(layout->set[set].layout->flags & VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR);
7552 
7553    push_set->header.size = layout->set[set].layout->size;
7554    push_set->header.layout = layout->set[set].layout;
7555 
7556    if (!radv_cmd_buffer_upload_alloc(cmd_buffer, push_set->header.size, &bo_offset,
7557                                      (void **)&push_set->header.mapped_ptr))
7558       return;
7559 
7560    push_set->header.va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
7561    push_set->header.va += bo_offset;
7562 
7563    radv_cmd_update_descriptor_sets(device, cmd_buffer, radv_descriptor_set_to_handle(push_set), descriptorWriteCount,
7564                                    pDescriptorWrites, 0, NULL);
7565 
7566    radv_set_descriptor_set(cmd_buffer, pipelineBindPoint, push_set, set);
7567 }
7568 
7569 static void
7570 radv_push_descriptor_set(struct radv_cmd_buffer *cmd_buffer, const VkPushDescriptorSetInfoKHR *pPushDescriptorSetInfo,
7571                          VkPipelineBindPoint bind_point)
7572 {
7573    VK_FROM_HANDLE(radv_pipeline_layout, layout, pPushDescriptorSetInfo->layout);
7574    struct radv_descriptor_state *descriptors_state = radv_get_descriptors_state(cmd_buffer, bind_point);
7575    struct radv_descriptor_set *push_set = (struct radv_descriptor_set *)&descriptors_state->push_set.set;
7576    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
7577 
7578    assert(layout->set[pPushDescriptorSetInfo->set].layout->flags &
7579           VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR);
7580 
7581    if (!radv_init_push_descriptor_set(cmd_buffer, push_set, layout->set[pPushDescriptorSetInfo->set].layout,
7582                                       bind_point))
7583       return;
7584 
7585    /* Check that there are no inline uniform block updates when calling vkCmdPushDescriptorSetKHR()
7586     * because that is invalid according to the Vulkan spec.
7587     */
7588    for (int i = 0; i < pPushDescriptorSetInfo->descriptorWriteCount; i++) {
7589       ASSERTED const VkWriteDescriptorSet *writeset = &pPushDescriptorSetInfo->pDescriptorWrites[i];
7590       assert(writeset->descriptorType != VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK);
7591    }
7592 
7593    radv_cmd_update_descriptor_sets(device, cmd_buffer, radv_descriptor_set_to_handle(push_set),
7594                                    pPushDescriptorSetInfo->descriptorWriteCount,
7595                                    pPushDescriptorSetInfo->pDescriptorWrites, 0, NULL);
7596 
7597    radv_set_descriptor_set(cmd_buffer, bind_point, push_set, pPushDescriptorSetInfo->set);
7598 
7599    radv_flush_push_descriptors(cmd_buffer, descriptors_state);
7600 }
7601 
7602 VKAPI_ATTR void VKAPI_CALL
7603 radv_CmdPushDescriptorSet2KHR(VkCommandBuffer commandBuffer, const VkPushDescriptorSetInfoKHR *pPushDescriptorSetInfo)
7604 {
7605    VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
7606 
7607    if (pPushDescriptorSetInfo->stageFlags & VK_SHADER_STAGE_COMPUTE_BIT) {
7608       radv_push_descriptor_set(cmd_buffer, pPushDescriptorSetInfo, VK_PIPELINE_BIND_POINT_COMPUTE);
7609    }
7610 
7611    if (pPushDescriptorSetInfo->stageFlags & RADV_GRAPHICS_STAGE_BITS) {
7612       radv_push_descriptor_set(cmd_buffer, pPushDescriptorSetInfo, VK_PIPELINE_BIND_POINT_GRAPHICS);
7613    }
7614 
7615    if (pPushDescriptorSetInfo->stageFlags & RADV_RT_STAGE_BITS) {
7616       radv_push_descriptor_set(cmd_buffer, pPushDescriptorSetInfo, VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR);
7617    }
7618 }
7619 
7620 VKAPI_ATTR void VKAPI_CALL
7621 radv_CmdPushDescriptorSetWithTemplate2KHR(
7622    VkCommandBuffer commandBuffer, const VkPushDescriptorSetWithTemplateInfoKHR *pPushDescriptorSetWithTemplateInfo)
7623 {
7624    VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
7625    VK_FROM_HANDLE(radv_pipeline_layout, layout, pPushDescriptorSetWithTemplateInfo->layout);
7626    VK_FROM_HANDLE(radv_descriptor_update_template, templ, pPushDescriptorSetWithTemplateInfo->descriptorUpdateTemplate);
7627    struct radv_descriptor_state *descriptors_state = radv_get_descriptors_state(cmd_buffer, templ->bind_point);
7628    struct radv_descriptor_set *push_set = (struct radv_descriptor_set *)&descriptors_state->push_set.set;
7629    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
7630 
7631    assert(layout->set[pPushDescriptorSetWithTemplateInfo->set].layout->flags &
7632           VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR);
7633 
7634    if (!radv_init_push_descriptor_set(cmd_buffer, push_set, layout->set[pPushDescriptorSetWithTemplateInfo->set].layout,
7635                                       templ->bind_point))
7636       return;
7637 
7638    radv_cmd_update_descriptor_set_with_template(device, cmd_buffer, push_set,
7639                                                 pPushDescriptorSetWithTemplateInfo->descriptorUpdateTemplate,
7640                                                 pPushDescriptorSetWithTemplateInfo->pData);
7641 
7642    radv_set_descriptor_set(cmd_buffer, templ->bind_point, push_set, pPushDescriptorSetWithTemplateInfo->set);
7643 
7644    radv_flush_push_descriptors(cmd_buffer, descriptors_state);
7645 }
7646 
7647 VKAPI_ATTR void VKAPI_CALL
7648 radv_CmdPushConstants2KHR(VkCommandBuffer commandBuffer, const VkPushConstantsInfoKHR *pPushConstantsInfo)
7649 {
7650    VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
7651    memcpy(cmd_buffer->push_constants + pPushConstantsInfo->offset, pPushConstantsInfo->pValues,
7652           pPushConstantsInfo->size);
7653    cmd_buffer->push_constant_stages |= pPushConstantsInfo->stageFlags;
7654 }
7655 
7656 VKAPI_ATTR VkResult VKAPI_CALL
7657 radv_EndCommandBuffer(VkCommandBuffer commandBuffer)
7658 {
7659    VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
7660    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
7661    const struct radv_physical_device *pdev = radv_device_physical(device);
7662 
7663    if (cmd_buffer->qf == RADV_QUEUE_SPARSE)
7664       return vk_command_buffer_end(&cmd_buffer->vk);
7665 
7666    radv_emit_mip_change_flush_default(cmd_buffer);
7667 
7668    const bool is_gfx_or_ace = cmd_buffer->qf == RADV_QUEUE_GENERAL || cmd_buffer->qf == RADV_QUEUE_COMPUTE;
7669 
7670    if (is_gfx_or_ace) {
7671       if (pdev->info.gfx_level == GFX6)
7672          cmd_buffer->state.flush_bits |=
7673             RADV_CMD_FLAG_CS_PARTIAL_FLUSH | RADV_CMD_FLAG_PS_PARTIAL_FLUSH | RADV_CMD_FLAG_WB_L2;
7674 
7675       /* Make sure to sync all pending active queries at the end of the
7676        * command buffer.
7677        */
7678       cmd_buffer->state.flush_bits |= cmd_buffer->active_query_flush_bits;
7679 
7680       /* Flush noncoherent images on GFX9+ so we can assume they're clean at the start of a
7681        * command buffer.
7682        */
7683       if (cmd_buffer->state.rb_noncoherent_dirty && !can_skip_buffer_l2_flushes(device))
7684          cmd_buffer->state.flush_bits |= radv_src_access_flush(
7685             cmd_buffer, VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT,
7686             VK_ACCESS_2_COLOR_ATTACHMENT_WRITE_BIT | VK_ACCESS_2_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT, NULL);
7687 
7688       /* Since NGG streamout uses GDS, we need to make GDS idle when
7689        * we leave the IB, otherwise another process might overwrite
7690        * it while our shaders are busy.
7691        */
7692       if (cmd_buffer->gds_needed)
7693          cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_PS_PARTIAL_FLUSH;
7694    }
7695 
7696    /* Finalize the internal compute command stream, if it exists. */
7697    if (cmd_buffer->gang.cs) {
7698       VkResult result = radv_gang_finalize(cmd_buffer);
7699       if (result != VK_SUCCESS)
7700          return vk_error(cmd_buffer, result);
7701    }
7702 
7703    if (is_gfx_or_ace) {
7704       radv_emit_cache_flush(cmd_buffer);
7705 
7706       /* Make sure CP DMA is idle at the end of IBs because the kernel
7707        * doesn't wait for it.
7708        */
7709       radv_cp_dma_wait_for_idle(cmd_buffer);
7710    }
7711 
7712    radv_describe_end_cmd_buffer(cmd_buffer);
7713 
7714    VkResult result = device->ws->cs_finalize(cmd_buffer->cs);
7715    if (result != VK_SUCCESS)
7716       return vk_error(cmd_buffer, result);
7717 
7718    return vk_command_buffer_end(&cmd_buffer->vk);
7719 }
7720 
7721 static void
7722 radv_emit_compute_pipeline(struct radv_cmd_buffer *cmd_buffer, struct radv_compute_pipeline *pipeline)
7723 {
7724    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
7725    const struct radv_physical_device *pdev = radv_device_physical(device);
7726 
7727    if (pipeline == cmd_buffer->state.emitted_compute_pipeline)
7728       return;
7729 
7730    radeon_check_space(device->ws, cmd_buffer->cs, pdev->info.gfx_level >= GFX10 ? 19 : 16);
7731 
7732    if (pipeline->base.type == RADV_PIPELINE_COMPUTE) {
7733       radv_emit_compute_shader(pdev, cmd_buffer->cs, cmd_buffer->state.shaders[MESA_SHADER_COMPUTE]);
7734    } else {
7735       radv_emit_compute_shader(pdev, cmd_buffer->cs, cmd_buffer->state.rt_prolog);
7736    }
7737 
7738    cmd_buffer->state.emitted_compute_pipeline = pipeline;
7739 
7740    if (radv_device_fault_detection_enabled(device))
7741       radv_save_pipeline(cmd_buffer, &pipeline->base);
7742 }
7743 
7744 static void
7745 radv_mark_descriptor_sets_dirty(struct radv_cmd_buffer *cmd_buffer, VkPipelineBindPoint bind_point)
7746 {
7747    struct radv_descriptor_state *descriptors_state = radv_get_descriptors_state(cmd_buffer, bind_point);
7748 
7749    descriptors_state->dirty |= descriptors_state->valid;
7750 }
7751 
7752 static void
7753 radv_bind_vs_input_state(struct radv_cmd_buffer *cmd_buffer, const struct radv_graphics_pipeline *pipeline)
7754 {
7755    const struct radv_shader *vs_shader = radv_get_shader(cmd_buffer->state.shaders, MESA_SHADER_VERTEX);
7756    const struct radv_vertex_input_state *src = &pipeline->vertex_input;
7757 
7758    /* Bind the vertex input state from the pipeline when it's static. */
7759    if (!vs_shader || !vs_shader->info.vs.vb_desc_usage_mask || (pipeline->dynamic_states & RADV_DYNAMIC_VERTEX_INPUT))
7760       return;
7761 
7762    cmd_buffer->state.vertex_input = *src;
7763 
7764    if (!(pipeline->dynamic_states & RADV_DYNAMIC_VERTEX_INPUT_BINDING_STRIDE)) {
7765       for (uint32_t i = 0; i < MAX_VBS; i++)
7766          cmd_buffer->vertex_bindings[i].stride = pipeline->binding_stride[i];
7767    }
7768 
7769    /* When the vertex input state is static but the VS has been compiled without it (GPL), the
7770     * driver needs to compile a VS prolog.
7771     */
7772    if (!vs_shader->info.vs.has_prolog)
7773       return;
7774 
7775    cmd_buffer->state.vbo_misaligned_mask = 0;
7776    cmd_buffer->state.vbo_unaligned_mask = 0;
7777    cmd_buffer->state.vbo_misaligned_mask_invalid = src->attribute_mask;
7778 
7779    cmd_buffer->state.dirty_dynamic |= RADV_DYNAMIC_VERTEX_INPUT;
7780 }
7781 
7782 static void
7783 radv_bind_multisample_state(struct radv_cmd_buffer *cmd_buffer, const struct radv_multisample_state *ms)
7784 {
7785    if (ms->sample_shading_enable) {
7786       cmd_buffer->state.ms.sample_shading_enable = true;
7787       cmd_buffer->state.ms.min_sample_shading = ms->min_sample_shading;
7788    }
7789 }
7790 
7791 static void
7792 radv_bind_custom_blend_mode(struct radv_cmd_buffer *cmd_buffer, unsigned custom_blend_mode)
7793 {
7794    /* Re-emit CB_COLOR_CONTROL when the custom blending mode changes. */
7795    if (cmd_buffer->state.custom_blend_mode != custom_blend_mode)
7796       cmd_buffer->state.dirty_dynamic |= RADV_DYNAMIC_LOGIC_OP | RADV_DYNAMIC_LOGIC_OP_ENABLE;
7797 
7798    cmd_buffer->state.custom_blend_mode = custom_blend_mode;
7799 }
7800 
7801 static void
7802 radv_bind_pre_rast_shader(struct radv_cmd_buffer *cmd_buffer, const struct radv_shader *shader)
7803 {
7804    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
7805    const struct radv_physical_device *pdev = radv_device_physical(device);
7806    bool mesh_shading = shader->info.stage == MESA_SHADER_MESH;
7807    const struct radv_userdata_info *loc;
7808 
7809    assert(shader->info.stage == MESA_SHADER_VERTEX || shader->info.stage == MESA_SHADER_TESS_CTRL ||
7810           shader->info.stage == MESA_SHADER_TESS_EVAL || shader->info.stage == MESA_SHADER_GEOMETRY ||
7811           shader->info.stage == MESA_SHADER_MESH);
7812 
7813    if (radv_get_user_sgpr_info(shader, AC_UD_NGG_PROVOKING_VTX)->sgpr_idx != -1) {
7814       /* Re-emit the provoking vertex mode state because the SGPR idx can be different. */
7815       cmd_buffer->state.dirty_dynamic |= RADV_DYNAMIC_PROVOKING_VERTEX_MODE;
7816    }
7817 
7818    if (radv_get_user_sgpr_info(shader, AC_UD_STREAMOUT_BUFFERS)->sgpr_idx != -1) {
7819       /* Re-emit the streamout buffers because the SGPR idx can be different and with NGG streamout
7820        * they always need to be emitted because a buffer size of 0 is used to disable streamout.
7821        */
7822       cmd_buffer->state.dirty |= RADV_CMD_DIRTY_STREAMOUT_BUFFER;
7823 
7824       if (pdev->use_ngg_streamout && pdev->info.gfx_level < GFX12) {
7825          /* GFX11 needs GDS OA for streamout. */
7826          cmd_buffer->gds_oa_needed = true;
7827       }
7828    }
7829 
7830    if (radv_get_user_sgpr_info(shader, AC_UD_NUM_VERTS_PER_PRIM)->sgpr_idx != -1) {
7831       /* Re-emit the primitive topology because the SGPR idx can be different. */
7832       cmd_buffer->state.dirty_dynamic |= RADV_DYNAMIC_PRIMITIVE_TOPOLOGY;
7833    }
7834 
7835    if (radv_get_user_sgpr_info(shader, AC_UD_SHADER_QUERY_STATE)->sgpr_idx != -1) {
7836       /* Re-emit shader query state when SGPR exists but location potentially changed. */
7837       cmd_buffer->state.dirty |= RADV_CMD_DIRTY_SHADER_QUERY;
7838    }
7839 
7840    const bool needs_vtx_sgpr =
7841       shader->info.stage == MESA_SHADER_VERTEX || shader->info.stage == MESA_SHADER_MESH ||
7842       (shader->info.stage == MESA_SHADER_GEOMETRY && !shader->info.merged_shader_compiled_separately) ||
7843       (shader->info.stage == MESA_SHADER_TESS_CTRL && !shader->info.merged_shader_compiled_separately);
7844 
7845    loc = radv_get_user_sgpr_info(shader, AC_UD_VS_BASE_VERTEX_START_INSTANCE);
7846    if (needs_vtx_sgpr && loc->sgpr_idx != -1) {
7847       cmd_buffer->state.vtx_base_sgpr = shader->info.user_data_0 + loc->sgpr_idx * 4;
7848       cmd_buffer->state.vtx_emit_num = loc->num_sgprs;
7849       cmd_buffer->state.uses_drawid = shader->info.vs.needs_draw_id;
7850       cmd_buffer->state.uses_baseinstance = shader->info.vs.needs_base_instance;
7851 
7852       if (shader->info.merged_shader_compiled_separately) {
7853          /* Merged shaders compiled separately (e.g. VS+TCS) always declare these user SGPRs
7854           * because the input arguments must match.
7855           */
7856          cmd_buffer->state.uses_drawid = true;
7857          cmd_buffer->state.uses_baseinstance = true;
7858       }
7859 
7860       /* Re-emit some vertex states because the SGPR idx can be different. */
7861       cmd_buffer->state.last_first_instance = -1;
7862       cmd_buffer->state.last_vertex_offset_valid = false;
7863       cmd_buffer->state.last_drawid = -1;
7864    }
7865 
7866    if (mesh_shading != cmd_buffer->state.mesh_shading) {
7867       /* Re-emit VRS state because the combiner is different (vertex vs primitive). Re-emit
7868        * primitive topology because the mesh shading pipeline clobbered it.
7869        */
7870       cmd_buffer->state.dirty_dynamic |= RADV_DYNAMIC_FRAGMENT_SHADING_RATE | RADV_DYNAMIC_PRIMITIVE_TOPOLOGY;
7871    }
7872 
7873    cmd_buffer->state.mesh_shading = mesh_shading;
7874 }
7875 
7876 static void
7877 radv_bind_vertex_shader(struct radv_cmd_buffer *cmd_buffer, const struct radv_shader *vs)
7878 {
7879    radv_bind_pre_rast_shader(cmd_buffer, vs);
7880 
7881    /* Re-emit states that need to be updated when the vertex shader is compiled separately
7882     * because shader configs are combined.
7883     */
7884    if (vs->info.merged_shader_compiled_separately && vs->info.next_stage == MESA_SHADER_TESS_CTRL) {
7885       cmd_buffer->state.dirty_dynamic |= RADV_DYNAMIC_PATCH_CONTROL_POINTS;
7886    }
7887 
7888    /* Can't put anything else here due to merged shaders */
7889 }
7890 
7891 static void
7892 radv_bind_tess_ctrl_shader(struct radv_cmd_buffer *cmd_buffer, const struct radv_shader *tcs)
7893 {
7894    radv_bind_pre_rast_shader(cmd_buffer, tcs);
7895 
7896    cmd_buffer->tess_rings_needed = true;
7897 
7898    /* Always re-emit patch control points/domain origin when a new pipeline with tessellation is
7899     * bound because a bunch of parameters (user SGPRs, TCS vertices out, ccw, etc) can be different.
7900     */
7901    cmd_buffer->state.dirty_dynamic |= RADV_DYNAMIC_PATCH_CONTROL_POINTS | RADV_DYNAMIC_TESS_DOMAIN_ORIGIN;
7902 
7903    /* Re-emit the VS prolog when the tessellation control shader is compiled separately because
7904     * shader configs are combined and need to be updated.
7905     */
7906    if (tcs->info.merged_shader_compiled_separately)
7907       cmd_buffer->state.emitted_vs_prolog = NULL;
7908 }
7909 
7910 static void
7911 radv_bind_tess_eval_shader(struct radv_cmd_buffer *cmd_buffer, const struct radv_shader *tes)
7912 {
7913    radv_bind_pre_rast_shader(cmd_buffer, tes);
7914 
7915    /* Can't put anything else here due to merged shaders */
7916 }
7917 
7918 static void
7919 radv_bind_geometry_shader(struct radv_cmd_buffer *cmd_buffer, const struct radv_shader *gs)
7920 {
7921    radv_bind_pre_rast_shader(cmd_buffer, gs);
7922 
7923    cmd_buffer->esgs_ring_size_needed = MAX2(cmd_buffer->esgs_ring_size_needed, gs->info.gs_ring_info.esgs_ring_size);
7924    cmd_buffer->gsvs_ring_size_needed = MAX2(cmd_buffer->gsvs_ring_size_needed, gs->info.gs_ring_info.gsvs_ring_size);
7925 
7926    /* Re-emit the VS prolog when the geometry shader is compiled separately because shader configs
7927     * are combined and need to be updated.
7928     */
7929    if (gs->info.merged_shader_compiled_separately)
7930       cmd_buffer->state.emitted_vs_prolog = NULL;
7931 }
7932 
7933 static void
7934 radv_bind_gs_copy_shader(struct radv_cmd_buffer *cmd_buffer, struct radv_shader *gs_copy_shader)
7935 {
7936    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
7937 
7938    cmd_buffer->state.gs_copy_shader = gs_copy_shader;
7939 
7940    if (gs_copy_shader) {
7941       cmd_buffer->shader_upload_seq = MAX2(cmd_buffer->shader_upload_seq, gs_copy_shader->upload_seq);
7942 
7943       radv_cs_add_buffer(device->ws, cmd_buffer->cs, gs_copy_shader->bo);
7944    }
7945 }
7946 
7947 static void
7948 radv_bind_mesh_shader(struct radv_cmd_buffer *cmd_buffer, const struct radv_shader *ms)
7949 {
7950    radv_bind_pre_rast_shader(cmd_buffer, ms);
7951 
7952    cmd_buffer->mesh_scratch_ring_needed |= ms->info.ms.needs_ms_scratch_ring;
7953 }
7954 
7955 static void
7956 radv_bind_fragment_shader(struct radv_cmd_buffer *cmd_buffer, const struct radv_shader *ps)
7957 {
7958    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
7959    const struct radv_physical_device *pdev = radv_device_physical(device);
7960    const enum amd_gfx_level gfx_level = pdev->info.gfx_level;
7961    const struct radv_shader *previous_ps = cmd_buffer->state.shaders[MESA_SHADER_FRAGMENT];
7962    const float min_sample_shading = 1.0f;
7963 
7964    if (ps->info.ps.needs_sample_positions) {
7965       cmd_buffer->sample_positions_needed = true;
7966    }
7967 
7968    /* Re-emit the FS state because the SGPR idx can be different. */
7969    if (radv_get_user_sgpr_info(ps, AC_UD_PS_STATE)->sgpr_idx != -1) {
7970       cmd_buffer->state.dirty_dynamic |= RADV_DYNAMIC_RASTERIZATION_SAMPLES | RADV_DYNAMIC_LINE_RASTERIZATION_MODE;
7971    }
7972 
7973    /* Re-emit the conservative rasterization mode because inner coverage is different. */
7974    if (!previous_ps || previous_ps->info.ps.reads_fully_covered != ps->info.ps.reads_fully_covered)
7975       cmd_buffer->state.dirty_dynamic |= RADV_DYNAMIC_CONSERVATIVE_RAST_MODE;
7976 
7977    if (gfx_level >= GFX10_3 && (!previous_ps || previous_ps->info.ps.force_sample_iter_shading_rate !=
7978                                                    ps->info.ps.force_sample_iter_shading_rate))
7979       cmd_buffer->state.dirty_dynamic |= RADV_DYNAMIC_RASTERIZATION_SAMPLES | RADV_DYNAMIC_FRAGMENT_SHADING_RATE;
7980 
7981    if (cmd_buffer->state.ms.sample_shading_enable != ps->info.ps.uses_sample_shading) {
7982       cmd_buffer->state.ms.sample_shading_enable = ps->info.ps.uses_sample_shading;
7983       cmd_buffer->state.dirty_dynamic |= RADV_DYNAMIC_RASTERIZATION_SAMPLES;
7984 
7985       if (gfx_level >= GFX10_3)
7986          cmd_buffer->state.dirty_dynamic |= RADV_DYNAMIC_FRAGMENT_SHADING_RATE;
7987    }
7988 
7989    if (cmd_buffer->state.ms.min_sample_shading != min_sample_shading) {
7990       cmd_buffer->state.ms.min_sample_shading = min_sample_shading;
7991       cmd_buffer->state.dirty_dynamic |= RADV_DYNAMIC_RASTERIZATION_SAMPLES;
7992    }
7993 
7994    if (!previous_ps || previous_ps->info.regs.ps.db_shader_control != ps->info.regs.ps.db_shader_control ||
7995        previous_ps->info.ps.pops_is_per_sample != ps->info.ps.pops_is_per_sample)
7996       cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DB_SHADER_CONTROL;
7997 
7998    if (!previous_ps || cmd_buffer->state.uses_fbfetch_output != ps->info.ps.uses_fbfetch_output) {
7999       cmd_buffer->state.uses_fbfetch_output = ps->info.ps.uses_fbfetch_output;
8000       cmd_buffer->state.dirty |= RADV_CMD_DIRTY_FBFETCH_OUTPUT;
8001    }
8002 
8003    /* Re-emit the PS epilog when a new fragment shader is bound. */
8004    if (ps->info.ps.has_epilog)
8005       cmd_buffer->state.emitted_ps_epilog = NULL;
8006 }
8007 
8008 static void
8009 radv_bind_task_shader(struct radv_cmd_buffer *cmd_buffer, const struct radv_shader *ts)
8010 {
8011    if (!radv_gang_init(cmd_buffer))
8012       return;
8013 
8014    cmd_buffer->task_rings_needed = true;
8015 }
8016 
8017 static void
8018 radv_bind_rt_prolog(struct radv_cmd_buffer *cmd_buffer, struct radv_shader *rt_prolog)
8019 {
8020    cmd_buffer->state.rt_prolog = rt_prolog;
8021 
8022    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
8023    const unsigned max_scratch_waves = radv_get_max_scratch_waves(device, rt_prolog);
8024    cmd_buffer->compute_scratch_waves_wanted = MAX2(cmd_buffer->compute_scratch_waves_wanted, max_scratch_waves);
8025 
8026    cmd_buffer->shader_upload_seq = MAX2(cmd_buffer->shader_upload_seq, rt_prolog->upload_seq);
8027 
8028    radv_cs_add_buffer(device->ws, cmd_buffer->cs, rt_prolog->bo);
8029 }
8030 
8031 /* This function binds or unbinds a shader in the cmdbuffer state. */
8032 static void
8033 radv_bind_shader(struct radv_cmd_buffer *cmd_buffer, struct radv_shader *shader, gl_shader_stage stage)
8034 {
8035    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
8036 
8037    if (!shader) {
8038       cmd_buffer->state.shaders[stage] = NULL;
8039       cmd_buffer->state.active_stages &= ~mesa_to_vk_shader_stage(stage);
8040 
8041       /* Reset some dynamic states when a shader stage is unbound. */
8042       switch (stage) {
8043       case MESA_SHADER_FRAGMENT:
8044          cmd_buffer->state.dirty |= RADV_CMD_DIRTY_DB_SHADER_CONTROL;
8045          cmd_buffer->state.dirty_dynamic |= RADV_DYNAMIC_CONSERVATIVE_RAST_MODE | RADV_DYNAMIC_RASTERIZATION_SAMPLES |
8046                                             RADV_DYNAMIC_FRAGMENT_SHADING_RATE;
8047          break;
8048       default:
8049          break;
8050       }
8051       return;
8052    }
8053 
8054    switch (stage) {
8055    case MESA_SHADER_VERTEX:
8056       radv_bind_vertex_shader(cmd_buffer, shader);
8057       break;
8058    case MESA_SHADER_TESS_CTRL:
8059       radv_bind_tess_ctrl_shader(cmd_buffer, shader);
8060       break;
8061    case MESA_SHADER_TESS_EVAL:
8062       radv_bind_tess_eval_shader(cmd_buffer, shader);
8063       break;
8064    case MESA_SHADER_GEOMETRY:
8065       radv_bind_geometry_shader(cmd_buffer, shader);
8066       break;
8067    case MESA_SHADER_FRAGMENT:
8068       radv_bind_fragment_shader(cmd_buffer, shader);
8069       break;
8070    case MESA_SHADER_MESH:
8071       radv_bind_mesh_shader(cmd_buffer, shader);
8072       break;
8073    case MESA_SHADER_TASK:
8074       radv_bind_task_shader(cmd_buffer, shader);
8075       break;
8076    case MESA_SHADER_COMPUTE: {
8077       cmd_buffer->compute_scratch_size_per_wave_needed =
8078          MAX2(cmd_buffer->compute_scratch_size_per_wave_needed, shader->config.scratch_bytes_per_wave);
8079 
8080       const unsigned max_stage_waves = radv_get_max_scratch_waves(device, shader);
8081       cmd_buffer->compute_scratch_waves_wanted = MAX2(cmd_buffer->compute_scratch_waves_wanted, max_stage_waves);
8082       break;
8083    }
8084    case MESA_SHADER_INTERSECTION:
8085       /* no-op */
8086       break;
8087    default:
8088       unreachable("invalid shader stage");
8089    }
8090 
8091    cmd_buffer->state.shaders[stage] = shader;
8092    cmd_buffer->state.active_stages |= mesa_to_vk_shader_stage(stage);
8093 
8094    if (mesa_to_vk_shader_stage(stage) & RADV_GRAPHICS_STAGE_BITS) {
8095       cmd_buffer->scratch_size_per_wave_needed =
8096          MAX2(cmd_buffer->scratch_size_per_wave_needed, shader->config.scratch_bytes_per_wave);
8097 
8098       const unsigned max_stage_waves = radv_get_max_scratch_waves(device, shader);
8099       cmd_buffer->scratch_waves_wanted = MAX2(cmd_buffer->scratch_waves_wanted, max_stage_waves);
8100    }
8101 
8102    cmd_buffer->shader_upload_seq = MAX2(cmd_buffer->shader_upload_seq, shader->upload_seq);
8103 
8104    radv_cs_add_buffer(device->ws, cmd_buffer->cs, shader->bo);
8105 }
8106 
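/* Unbind shader objects for the given bind point; binding a pipeline is expected to replace
 * any previously bound shader objects for the stages it covers.
 */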
8107 static void
8108 radv_reset_shader_object_state(struct radv_cmd_buffer *cmd_buffer, VkPipelineBindPoint pipelineBindPoint)
8109 {
8110    switch (pipelineBindPoint) {
8111    case VK_PIPELINE_BIND_POINT_COMPUTE:
8112       if (cmd_buffer->state.shader_objs[MESA_SHADER_COMPUTE]) {
8113          radv_bind_shader(cmd_buffer, NULL, MESA_SHADER_COMPUTE);
8114          cmd_buffer->state.shader_objs[MESA_SHADER_COMPUTE] = NULL;
8115       }
8116       break;
8117    case VK_PIPELINE_BIND_POINT_GRAPHICS:
8118       radv_foreach_stage(s, RADV_GRAPHICS_STAGE_BITS)
8119       {
8120          if (cmd_buffer->state.shader_objs[s]) {
8121             radv_bind_shader(cmd_buffer, NULL, s);
8122             cmd_buffer->state.shader_objs[s] = NULL;
8123          }
8124       }
8125       break;
8126    default:
8127       break;
8128    }
8129 
8130    cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_GRAPHICS_SHADERS;
8131 }
8132 
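/* Bind a compute, ray tracing or graphics pipeline: shader object state is reset first, then
 * the pipeline shaders are bound and the state tracked by the command buffer is updated.
 */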
8133 VKAPI_ATTR void VKAPI_CALL
8134 radv_CmdBindPipeline(VkCommandBuffer commandBuffer, VkPipelineBindPoint pipelineBindPoint, VkPipeline _pipeline)
8135 {
8136    VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
8137    VK_FROM_HANDLE(radv_pipeline, pipeline, _pipeline);
8138    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
8139    const struct radv_physical_device *pdev = radv_device_physical(device);
8140 
8141    radv_reset_shader_object_state(cmd_buffer, pipelineBindPoint);
8142 
8143    switch (pipelineBindPoint) {
8144    case VK_PIPELINE_BIND_POINT_COMPUTE: {
8145       struct radv_compute_pipeline *compute_pipeline = radv_pipeline_to_compute(pipeline);
8146 
8147       if (cmd_buffer->state.compute_pipeline == compute_pipeline)
8148          return;
8149       radv_mark_descriptor_sets_dirty(cmd_buffer, pipelineBindPoint);
8150 
8151       radv_bind_shader(cmd_buffer, compute_pipeline->base.shaders[MESA_SHADER_COMPUTE], MESA_SHADER_COMPUTE);
8152 
8153       cmd_buffer->state.compute_pipeline = compute_pipeline;
8154       cmd_buffer->push_constant_stages |= VK_SHADER_STAGE_COMPUTE_BIT;
8155       break;
8156    }
8157    case VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR: {
8158       struct radv_ray_tracing_pipeline *rt_pipeline = radv_pipeline_to_ray_tracing(pipeline);
8159 
8160       if (cmd_buffer->state.rt_pipeline == rt_pipeline)
8161          return;
8162       radv_mark_descriptor_sets_dirty(cmd_buffer, pipelineBindPoint);
8163 
8164       radv_bind_shader(cmd_buffer, rt_pipeline->base.base.shaders[MESA_SHADER_INTERSECTION], MESA_SHADER_INTERSECTION);
8165       radv_bind_rt_prolog(cmd_buffer, rt_pipeline->prolog);
8166 
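      /* Keep every RT stage shader BO resident while this pipeline is bound. */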
8167       for (unsigned i = 0; i < rt_pipeline->stage_count; ++i) {
8168          struct radv_shader *shader = rt_pipeline->stages[i].shader;
8169          if (shader)
8170             radv_cs_add_buffer(device->ws, cmd_buffer->cs, shader->bo);
8171       }
8172 
8173       cmd_buffer->state.rt_pipeline = rt_pipeline;
8174       cmd_buffer->push_constant_stages |= RADV_RT_STAGE_BITS;
8175 
8176       /* Bind the stack size when it's not dynamic. */
8177       if (rt_pipeline->stack_size != -1u)
8178          cmd_buffer->state.rt_stack_size = rt_pipeline->stack_size;
8179 
8180       break;
8181    }
8182    case VK_PIPELINE_BIND_POINT_GRAPHICS: {
8183       struct radv_graphics_pipeline *graphics_pipeline = radv_pipeline_to_graphics(pipeline);
8184 
8185       /* Bind the non-dynamic graphics state from the pipeline unconditionally because some PSO state
8186        * might have been overwritten between two binds of the same pipeline.
8187        */
8188       radv_bind_dynamic_state(cmd_buffer, &graphics_pipeline->dynamic_state);
8189 
8190       if (cmd_buffer->state.graphics_pipeline == graphics_pipeline)
8191          return;
8192       radv_mark_descriptor_sets_dirty(cmd_buffer, pipelineBindPoint);
8193 
8194       radv_foreach_stage(
8195          stage, (cmd_buffer->state.active_stages | graphics_pipeline->active_stages) & RADV_GRAPHICS_STAGE_BITS)
8196       {
8197          radv_bind_shader(cmd_buffer, graphics_pipeline->base.shaders[stage], stage);
8198       }
8199 
8200       radv_bind_gs_copy_shader(cmd_buffer, graphics_pipeline->base.gs_copy_shader);
8201 
8202       cmd_buffer->state.last_vgt_shader = graphics_pipeline->base.shaders[graphics_pipeline->last_vgt_api_stage];
8203 
8204       cmd_buffer->state.graphics_pipeline = graphics_pipeline;
8205 
8206       cmd_buffer->state.has_nggc = graphics_pipeline->has_ngg_culling;
8207       cmd_buffer->state.dirty |= RADV_CMD_DIRTY_PIPELINE;
8208       cmd_buffer->push_constant_stages |= graphics_pipeline->active_stages;
8209 
8210       /* Prefetch all pipeline shaders at first draw time. */
8211       cmd_buffer->state.prefetch_L2_mask |= RADV_PREFETCH_SHADERS;
8212 
8213       if (pdev->info.has_vgt_flush_ngg_legacy_bug &&
8214           (!cmd_buffer->state.emitted_graphics_pipeline ||
8215            (cmd_buffer->state.emitted_graphics_pipeline->is_ngg && !cmd_buffer->state.graphics_pipeline->is_ngg))) {
8216          /* Transitioning from NGG to legacy GS requires
8217           * VGT_FLUSH on GFX10 and Navi21. VGT_FLUSH
8218           * is also emitted at the beginning of IBs when legacy
8219           * GS ring pointers are set.
8220           */
8221          cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_VGT_FLUSH;
8222       }
8223 
8224       cmd_buffer->state.uses_dynamic_patch_control_points =
8225          !!(graphics_pipeline->dynamic_states & RADV_DYNAMIC_PATCH_CONTROL_POINTS);
8226 
8227       if (graphics_pipeline->active_stages & VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT) {
8228          if (!cmd_buffer->state.uses_dynamic_patch_control_points) {
8229             /* Bind the tessellation state from the pipeline when it's not dynamic. */
8230             struct radv_shader *tcs = cmd_buffer->state.shaders[MESA_SHADER_TESS_CTRL];
8231 
8232             cmd_buffer->state.tess_num_patches = tcs->info.num_tess_patches;
8233             cmd_buffer->state.tess_lds_size = tcs->info.tcs.num_lds_blocks;
8234          }
8235       }
8236 
8237       const struct radv_shader *vs = radv_get_shader(graphics_pipeline->base.shaders, MESA_SHADER_VERTEX);
8238       if (vs) {
8239          /* Re-emit the VS prolog when a new vertex shader is bound. */
8240          if (vs->info.vs.has_prolog) {
8241             cmd_buffer->state.emitted_vs_prolog = NULL;
8242             cmd_buffer->state.dirty_dynamic |= RADV_DYNAMIC_VERTEX_INPUT;
8243          }
8244 
8245          /* Re-emit the vertex buffer descriptors because they are really tied to the pipeline. */
8246          if (vs->info.vs.vb_desc_usage_mask) {
8247             cmd_buffer->state.dirty |= RADV_CMD_DIRTY_VERTEX_BUFFER;
8248          }
8249       }
8250 
8251       if (!cmd_buffer->state.emitted_graphics_pipeline ||
8252           cmd_buffer->state.spi_shader_col_format != graphics_pipeline->spi_shader_col_format) {
8253          cmd_buffer->state.spi_shader_col_format = graphics_pipeline->spi_shader_col_format;
8254          cmd_buffer->state.dirty |= RADV_CMD_DIRTY_COLOR_OUTPUT;
8255          if (pdev->info.rbplus_allowed)
8256             cmd_buffer->state.dirty |= RADV_CMD_DIRTY_RBPLUS;
8257       }
8258 
8259       if (!cmd_buffer->state.emitted_graphics_pipeline ||
8260           cmd_buffer->state.cb_shader_mask != graphics_pipeline->cb_shader_mask) {
8261          cmd_buffer->state.cb_shader_mask = graphics_pipeline->cb_shader_mask;
8262          cmd_buffer->state.dirty |= RADV_CMD_DIRTY_COLOR_OUTPUT;
8263       }
8264 
8265       radv_bind_vs_input_state(cmd_buffer, graphics_pipeline);
8266 
8267       radv_bind_multisample_state(cmd_buffer, &graphics_pipeline->ms);
8268 
8269       radv_bind_custom_blend_mode(cmd_buffer, graphics_pipeline->custom_blend_mode);
8270 
8271       cmd_buffer->state.db_render_control = graphics_pipeline->db_render_control;
8272 
8273       cmd_buffer->state.rast_prim = graphics_pipeline->rast_prim;
8274 
8275       cmd_buffer->state.ia_multi_vgt_param = graphics_pipeline->ia_multi_vgt_param;
8276 
8277       cmd_buffer->state.uses_out_of_order_rast = graphics_pipeline->uses_out_of_order_rast;
8278       cmd_buffer->state.uses_vrs = graphics_pipeline->uses_vrs;
8279       cmd_buffer->state.uses_vrs_attachment = graphics_pipeline->uses_vrs_attachment;
8280       cmd_buffer->state.uses_vrs_coarse_shading = graphics_pipeline->uses_vrs_coarse_shading;
8281       break;
8282    }
8283    default:
8284       assert(!"invalid bind point");
8285       break;
8286    }
8287 
8288    cmd_buffer->push_constant_state[vk_to_bind_point(pipelineBindPoint)].size = pipeline->push_constant_size;
8289    cmd_buffer->push_constant_state[vk_to_bind_point(pipelineBindPoint)].dynamic_offset_count =
8290       pipeline->dynamic_offset_count;
8291    cmd_buffer->descriptors[vk_to_bind_point(pipelineBindPoint)].need_indirect_descriptor_sets =
8292       pipeline->need_indirect_descriptor_sets;
8293 }
8294 
8295 VKAPI_ATTR void VKAPI_CALL
8296 radv_CmdSetViewport(VkCommandBuffer commandBuffer, uint32_t firstViewport, uint32_t viewportCount,
8297                     const VkViewport *pViewports)
8298 {
8299    VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
8300    struct radv_cmd_state *state = &cmd_buffer->state;
8301    ASSERTED const uint32_t total_count = firstViewport + viewportCount;
8302 
8303    assert(firstViewport < MAX_VIEWPORTS);
8304    assert(total_count >= 1 && total_count <= MAX_VIEWPORTS);
8305 
8306    if (state->dynamic.vk.vp.viewport_count < total_count)
8307       state->dynamic.vk.vp.viewport_count = total_count;
8308 
8309    memcpy(state->dynamic.vk.vp.viewports + firstViewport, pViewports, viewportCount * sizeof(*pViewports));
8310    for (unsigned i = 0; i < viewportCount; i++) {
8311       radv_get_viewport_xform(&pViewports[i], state->dynamic.hw_vp.xform[i + firstViewport].scale,
8312                               state->dynamic.hw_vp.xform[i + firstViewport].translate);
8313    }
8314 
8315    state->dirty_dynamic |= RADV_DYNAMIC_VIEWPORT;
8316    state->dirty |= RADV_CMD_DIRTY_GUARDBAND;
8317 }
8318 
8319 VKAPI_ATTR void VKAPI_CALL
8320 radv_CmdSetScissor(VkCommandBuffer commandBuffer, uint32_t firstScissor, uint32_t scissorCount,
8321                    const VkRect2D *pScissors)
8322 {
8323    VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
8324    struct radv_cmd_state *state = &cmd_buffer->state;
8325    ASSERTED const uint32_t total_count = firstScissor + scissorCount;
8326 
8327    assert(firstScissor < MAX_SCISSORS);
8328    assert(total_count >= 1 && total_count <= MAX_SCISSORS);
8329 
8330    if (state->dynamic.vk.vp.scissor_count < total_count)
8331       state->dynamic.vk.vp.scissor_count = total_count;
8332 
8333    memcpy(state->dynamic.vk.vp.scissors + firstScissor, pScissors, scissorCount * sizeof(*pScissors));
8334 
8335    state->dirty_dynamic |= RADV_DYNAMIC_SCISSOR;
8336 }
8337 
8338 VKAPI_ATTR void VKAPI_CALL
8339 radv_CmdSetLineWidth(VkCommandBuffer commandBuffer, float lineWidth)
8340 {
8341    VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
8342    struct radv_cmd_state *state = &cmd_buffer->state;
8343 
8344    state->dynamic.vk.rs.line.width = lineWidth;
8345 
8346    state->dirty_dynamic |= RADV_DYNAMIC_LINE_WIDTH;
8347    state->dirty |= RADV_CMD_DIRTY_GUARDBAND;
8348 }
8349 
8350 VKAPI_ATTR void VKAPI_CALL
8351 radv_CmdSetBlendConstants(VkCommandBuffer commandBuffer, const float blendConstants[4])
8352 {
8353    VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
8354    struct radv_cmd_state *state = &cmd_buffer->state;
8355 
8356    memcpy(state->dynamic.vk.cb.blend_constants, blendConstants, sizeof(float) * 4);
8357 
8358    state->dirty_dynamic |= RADV_DYNAMIC_BLEND_CONSTANTS;
8359 }
8360 
8361 VKAPI_ATTR void VKAPI_CALL
8362 radv_CmdSetDepthBounds(VkCommandBuffer commandBuffer, float minDepthBounds, float maxDepthBounds)
8363 {
8364    VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
8365    struct radv_cmd_state *state = &cmd_buffer->state;
8366 
8367    state->dynamic.vk.ds.depth.bounds_test.min = minDepthBounds;
8368    state->dynamic.vk.ds.depth.bounds_test.max = maxDepthBounds;
8369 
8370    state->dirty_dynamic |= RADV_DYNAMIC_DEPTH_BOUNDS;
8371 }
8372 
8373 VKAPI_ATTR void VKAPI_CALL
8374 radv_CmdSetStencilCompareMask(VkCommandBuffer commandBuffer, VkStencilFaceFlags faceMask, uint32_t compareMask)
8375 {
8376    VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
8377    struct radv_cmd_state *state = &cmd_buffer->state;
8378 
8379    if (faceMask & VK_STENCIL_FACE_FRONT_BIT)
8380       state->dynamic.vk.ds.stencil.front.compare_mask = compareMask;
8381    if (faceMask & VK_STENCIL_FACE_BACK_BIT)
8382       state->dynamic.vk.ds.stencil.back.compare_mask = compareMask;
8383 
8384    state->dirty_dynamic |= RADV_DYNAMIC_STENCIL_COMPARE_MASK;
8385 }
8386 
8387 VKAPI_ATTR void VKAPI_CALL
8388 radv_CmdSetStencilWriteMask(VkCommandBuffer commandBuffer, VkStencilFaceFlags faceMask, uint32_t writeMask)
8389 {
8390    VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
8391    struct radv_cmd_state *state = &cmd_buffer->state;
8392 
8393    if (faceMask & VK_STENCIL_FACE_FRONT_BIT)
8394       state->dynamic.vk.ds.stencil.front.write_mask = writeMask;
8395    if (faceMask & VK_STENCIL_FACE_BACK_BIT)
8396       state->dynamic.vk.ds.stencil.back.write_mask = writeMask;
8397 
8398    state->dirty_dynamic |= RADV_DYNAMIC_STENCIL_WRITE_MASK;
8399 }
8400 
8401 VKAPI_ATTR void VKAPI_CALL
8402 radv_CmdSetStencilReference(VkCommandBuffer commandBuffer, VkStencilFaceFlags faceMask, uint32_t reference)
8403 {
8404    VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
8405    struct radv_cmd_state *state = &cmd_buffer->state;
8406 
8407    if (faceMask & VK_STENCIL_FACE_FRONT_BIT)
8408       state->dynamic.vk.ds.stencil.front.reference = reference;
8409    if (faceMask & VK_STENCIL_FACE_BACK_BIT)
8410       state->dynamic.vk.ds.stencil.back.reference = reference;
8411 
8412    state->dirty_dynamic |= RADV_DYNAMIC_STENCIL_REFERENCE;
8413 }
8414 
8415 VKAPI_ATTR void VKAPI_CALL
8416 radv_CmdSetDiscardRectangleEXT(VkCommandBuffer commandBuffer, uint32_t firstDiscardRectangle,
8417                                uint32_t discardRectangleCount, const VkRect2D *pDiscardRectangles)
8418 {
8419    VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
8420    struct radv_cmd_state *state = &cmd_buffer->state;
8421    ASSERTED const uint32_t total_count = firstDiscardRectangle + discardRectangleCount;
8422 
8423    assert(firstDiscardRectangle < MAX_DISCARD_RECTANGLES);
8424    assert(total_count >= 1 && total_count <= MAX_DISCARD_RECTANGLES);
8425 
8426    typed_memcpy(&state->dynamic.vk.dr.rectangles[firstDiscardRectangle], pDiscardRectangles, discardRectangleCount);
8427 
8428    state->dirty_dynamic |= RADV_DYNAMIC_DISCARD_RECTANGLE;
8429 }
8430 
8431 VKAPI_ATTR void VKAPI_CALL
8432 radv_CmdSetSampleLocationsEXT(VkCommandBuffer commandBuffer, const VkSampleLocationsInfoEXT *pSampleLocationsInfo)
8433 {
8434    VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
8435    struct radv_cmd_state *state = &cmd_buffer->state;
8436 
8437    assert(pSampleLocationsInfo->sampleLocationsCount <= MAX_SAMPLE_LOCATIONS);
8438 
8439    state->dynamic.sample_location.per_pixel = pSampleLocationsInfo->sampleLocationsPerPixel;
8440    state->dynamic.sample_location.grid_size = pSampleLocationsInfo->sampleLocationGridSize;
8441    state->dynamic.sample_location.count = pSampleLocationsInfo->sampleLocationsCount;
8442    typed_memcpy(&state->dynamic.sample_location.locations[0], pSampleLocationsInfo->pSampleLocations,
8443                 pSampleLocationsInfo->sampleLocationsCount);
8444 
8445    state->dirty_dynamic |= RADV_DYNAMIC_SAMPLE_LOCATIONS;
8446 }
8447 
8448 VKAPI_ATTR void VKAPI_CALL
8449 radv_CmdSetLineStippleKHR(VkCommandBuffer commandBuffer, uint32_t lineStippleFactor, uint16_t lineStipplePattern)
8450 {
8451    VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
8452    struct radv_cmd_state *state = &cmd_buffer->state;
8453 
8454    state->dynamic.vk.rs.line.stipple.factor = lineStippleFactor;
8455    state->dynamic.vk.rs.line.stipple.pattern = lineStipplePattern;
8456 
8457    state->dirty_dynamic |= RADV_DYNAMIC_LINE_STIPPLE;
8458 }
8459 
8460 VKAPI_ATTR void VKAPI_CALL
8461 radv_CmdSetCullMode(VkCommandBuffer commandBuffer, VkCullModeFlags cullMode)
8462 {
8463    VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
8464    struct radv_cmd_state *state = &cmd_buffer->state;
8465 
8466    state->dynamic.vk.rs.cull_mode = cullMode;
8467 
8468    state->dirty_dynamic |= RADV_DYNAMIC_CULL_MODE;
8469 }
8470 
8471 VKAPI_ATTR void VKAPI_CALL
8472 radv_CmdSetFrontFace(VkCommandBuffer commandBuffer, VkFrontFace frontFace)
8473 {
8474    VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
8475    struct radv_cmd_state *state = &cmd_buffer->state;
8476 
8477    state->dynamic.vk.rs.front_face = frontFace;
8478 
8479    state->dirty_dynamic |= RADV_DYNAMIC_FRONT_FACE;
8480 }
8481 
8482 VKAPI_ATTR void VKAPI_CALL
8483 radv_CmdSetPrimitiveTopology(VkCommandBuffer commandBuffer, VkPrimitiveTopology primitiveTopology)
8484 {
8485    VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
8486    struct radv_cmd_state *state = &cmd_buffer->state;
8487    unsigned primitive_topology = radv_translate_prim(primitiveTopology);
8488 
8489    if (radv_primitive_topology_is_line_list(state->dynamic.vk.ia.primitive_topology) !=
8490        radv_primitive_topology_is_line_list(primitive_topology))
8491       state->dirty_dynamic |= RADV_DYNAMIC_LINE_STIPPLE;
8492 
8493    if (radv_prim_is_points_or_lines(state->dynamic.vk.ia.primitive_topology) !=
8494        radv_prim_is_points_or_lines(primitive_topology))
8495       state->dirty |= RADV_CMD_DIRTY_GUARDBAND;
8496 
8497    state->dynamic.vk.ia.primitive_topology = primitive_topology;
8498 
8499    state->dirty_dynamic |= RADV_DYNAMIC_PRIMITIVE_TOPOLOGY;
8500 }
8501 
8502 VKAPI_ATTR void VKAPI_CALL
8503 radv_CmdSetViewportWithCount(VkCommandBuffer commandBuffer, uint32_t viewportCount, const VkViewport *pViewports)
8504 {
8505    radv_CmdSetViewport(commandBuffer, 0, viewportCount, pViewports);
8506 }
8507 
8508 VKAPI_ATTR void VKAPI_CALL
8509 radv_CmdSetScissorWithCount(VkCommandBuffer commandBuffer, uint32_t scissorCount, const VkRect2D *pScissors)
8510 {
8511    radv_CmdSetScissor(commandBuffer, 0, scissorCount, pScissors);
8512 }
8513 
8514 VKAPI_ATTR void VKAPI_CALL
8515 radv_CmdSetDepthTestEnable(VkCommandBuffer commandBuffer, VkBool32 depthTestEnable)
8516 
8517 {
8518    VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
8519    struct radv_cmd_state *state = &cmd_buffer->state;
8520 
8521    state->dynamic.vk.ds.depth.test_enable = depthTestEnable;
8522 
8523    state->dirty_dynamic |= RADV_DYNAMIC_DEPTH_TEST_ENABLE;
8524 }
8525 
8526 VKAPI_ATTR void VKAPI_CALL
8527 radv_CmdSetDepthWriteEnable(VkCommandBuffer commandBuffer, VkBool32 depthWriteEnable)
8528 {
8529    VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
8530    struct radv_cmd_state *state = &cmd_buffer->state;
8531 
8532    state->dynamic.vk.ds.depth.write_enable = depthWriteEnable;
8533 
8534    state->dirty_dynamic |= RADV_DYNAMIC_DEPTH_WRITE_ENABLE;
8535 }
8536 
8537 VKAPI_ATTR void VKAPI_CALL
8538 radv_CmdSetDepthCompareOp(VkCommandBuffer commandBuffer, VkCompareOp depthCompareOp)
8539 {
8540    VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
8541    struct radv_cmd_state *state = &cmd_buffer->state;
8542 
8543    state->dynamic.vk.ds.depth.compare_op = depthCompareOp;
8544 
8545    state->dirty_dynamic |= RADV_DYNAMIC_DEPTH_COMPARE_OP;
8546 }
8547 
8548 VKAPI_ATTR void VKAPI_CALL
8549 radv_CmdSetDepthBoundsTestEnable(VkCommandBuffer commandBuffer, VkBool32 depthBoundsTestEnable)
8550 {
8551    VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
8552    struct radv_cmd_state *state = &cmd_buffer->state;
8553 
8554    state->dynamic.vk.ds.depth.bounds_test.enable = depthBoundsTestEnable;
8555 
8556    state->dirty_dynamic |= RADV_DYNAMIC_DEPTH_BOUNDS_TEST_ENABLE;
8557 }
8558 
8559 VKAPI_ATTR void VKAPI_CALL
8560 radv_CmdSetStencilTestEnable(VkCommandBuffer commandBuffer, VkBool32 stencilTestEnable)
8561 {
8562    VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
8563    struct radv_cmd_state *state = &cmd_buffer->state;
8564 
8565    state->dynamic.vk.ds.stencil.test_enable = stencilTestEnable;
8566 
8567    state->dirty_dynamic |= RADV_DYNAMIC_STENCIL_TEST_ENABLE;
8568 }
8569 
8570 VKAPI_ATTR void VKAPI_CALL
8571 radv_CmdSetStencilOp(VkCommandBuffer commandBuffer, VkStencilFaceFlags faceMask, VkStencilOp failOp, VkStencilOp passOp,
8572                      VkStencilOp depthFailOp, VkCompareOp compareOp)
8573 {
8574    VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
8575    struct radv_cmd_state *state = &cmd_buffer->state;
8576 
8577    if (faceMask & VK_STENCIL_FACE_FRONT_BIT) {
8578       state->dynamic.vk.ds.stencil.front.op.fail = failOp;
8579       state->dynamic.vk.ds.stencil.front.op.pass = passOp;
8580       state->dynamic.vk.ds.stencil.front.op.depth_fail = depthFailOp;
8581       state->dynamic.vk.ds.stencil.front.op.compare = compareOp;
8582    }
8583 
8584    if (faceMask & VK_STENCIL_FACE_BACK_BIT) {
8585       state->dynamic.vk.ds.stencil.back.op.fail = failOp;
8586       state->dynamic.vk.ds.stencil.back.op.pass = passOp;
8587       state->dynamic.vk.ds.stencil.back.op.depth_fail = depthFailOp;
8588       state->dynamic.vk.ds.stencil.back.op.compare = compareOp;
8589    }
8590 
8591    state->dirty_dynamic |= RADV_DYNAMIC_STENCIL_OP;
8592 }
8593 
8594 VKAPI_ATTR void VKAPI_CALL
8595 radv_CmdSetFragmentShadingRateKHR(VkCommandBuffer commandBuffer, const VkExtent2D *pFragmentSize,
8596                                   const VkFragmentShadingRateCombinerOpKHR combinerOps[2])
8597 {
8598    VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
8599    struct radv_cmd_state *state = &cmd_buffer->state;
8600 
8601    state->dynamic.vk.fsr.fragment_size = *pFragmentSize;
8602    for (unsigned i = 0; i < 2; i++)
8603       state->dynamic.vk.fsr.combiner_ops[i] = combinerOps[i];
8604 
8605    state->dirty_dynamic |= RADV_DYNAMIC_FRAGMENT_SHADING_RATE;
8606 }
8607 
8608 VKAPI_ATTR void VKAPI_CALL
8609 radv_CmdSetDepthBiasEnable(VkCommandBuffer commandBuffer, VkBool32 depthBiasEnable)
8610 {
8611    VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
8612    struct radv_cmd_state *state = &cmd_buffer->state;
8613 
8614    state->dynamic.vk.rs.depth_bias.enable = depthBiasEnable;
8615 
8616    state->dirty_dynamic |= RADV_DYNAMIC_DEPTH_BIAS_ENABLE;
8617 }
8618 
8619 VKAPI_ATTR void VKAPI_CALL
8620 radv_CmdSetPrimitiveRestartEnable(VkCommandBuffer commandBuffer, VkBool32 primitiveRestartEnable)
8621 {
8622    VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
8623    struct radv_cmd_state *state = &cmd_buffer->state;
8624 
8625    state->dynamic.vk.ia.primitive_restart_enable = primitiveRestartEnable;
8626 
8627    state->dirty_dynamic |= RADV_DYNAMIC_PRIMITIVE_RESTART_ENABLE;
8628 }
8629 
8630 VKAPI_ATTR void VKAPI_CALL
8631 radv_CmdSetRasterizerDiscardEnable(VkCommandBuffer commandBuffer, VkBool32 rasterizerDiscardEnable)
8632 {
8633    VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
8634    struct radv_cmd_state *state = &cmd_buffer->state;
8635 
8636    state->dynamic.vk.rs.rasterizer_discard_enable = rasterizerDiscardEnable;
8637 
8638    state->dirty_dynamic |= RADV_DYNAMIC_RASTERIZER_DISCARD_ENABLE;
8639 }
8640 
8641 VKAPI_ATTR void VKAPI_CALL
8642 radv_CmdSetPatchControlPointsEXT(VkCommandBuffer commandBuffer, uint32_t patchControlPoints)
8643 {
8644    VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
8645    struct radv_cmd_state *state = &cmd_buffer->state;
8646 
8647    state->dynamic.vk.ts.patch_control_points = patchControlPoints;
8648 
8649    state->dirty_dynamic |= RADV_DYNAMIC_PATCH_CONTROL_POINTS;
8650 }
8651 
8652 VKAPI_ATTR void VKAPI_CALL
8653 radv_CmdSetLogicOpEXT(VkCommandBuffer commandBuffer, VkLogicOp logicOp)
8654 {
8655    VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
8656    struct radv_cmd_state *state = &cmd_buffer->state;
8657    unsigned logic_op = radv_translate_blend_logic_op(logicOp);
8658 
8659    state->dynamic.vk.cb.logic_op = logic_op;
8660 
8661    state->dirty_dynamic |= RADV_DYNAMIC_LOGIC_OP;
8662 }
8663 
8664 VKAPI_ATTR void VKAPI_CALL
8665 radv_CmdSetColorWriteEnableEXT(VkCommandBuffer commandBuffer, uint32_t attachmentCount,
8666                                const VkBool32 *pColorWriteEnables)
8667 {
8668    VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
8669    struct radv_cmd_state *state = &cmd_buffer->state;
8670    uint8_t color_write_enable = 0;
8671 
8672    assert(attachmentCount <= MAX_RTS);
8673 
8674    for (uint32_t i = 0; i < attachmentCount; i++) {
8675       if (pColorWriteEnables[i]) {
8676          color_write_enable |= BITFIELD_BIT(i);
8677       }
8678    }
8679 
8680    state->dynamic.vk.cb.color_write_enables = color_write_enable;
8681 
8682    state->dirty_dynamic |= RADV_DYNAMIC_COLOR_WRITE_ENABLE;
8683 }
8684 
8685 VKAPI_ATTR void VKAPI_CALL
8686 radv_CmdSetVertexInputEXT(VkCommandBuffer commandBuffer, uint32_t vertexBindingDescriptionCount,
8687                           const VkVertexInputBindingDescription2EXT *pVertexBindingDescriptions,
8688                           uint32_t vertexAttributeDescriptionCount,
8689                           const VkVertexInputAttributeDescription2EXT *pVertexAttributeDescriptions)
8690 {
8691    VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
8692    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
8693    const struct radv_physical_device *pdev = radv_device_physical(device);
8694    struct radv_cmd_state *state = &cmd_buffer->state;
8695    struct radv_vertex_input_state *vi_state = &state->vertex_input;
8696 
8697    const VkVertexInputBindingDescription2EXT *bindings[MAX_VBS];
8698    for (unsigned i = 0; i < vertexBindingDescriptionCount; i++)
8699       bindings[pVertexBindingDescriptions[i].binding] = &pVertexBindingDescriptions[i];
8700 
8701    state->vbo_misaligned_mask = 0;
8702    state->vbo_unaligned_mask = 0;
8703    state->vbo_misaligned_mask_invalid = 0;
8704 
8705    vi_state->attribute_mask = 0;
8706    vi_state->instance_rate_inputs = 0;
8707    vi_state->nontrivial_divisors = 0;
8708    vi_state->zero_divisors = 0;
8709    vi_state->post_shuffle = 0;
8710    vi_state->alpha_adjust_lo = 0;
8711    vi_state->alpha_adjust_hi = 0;
8712    vi_state->nontrivial_formats = 0;
8713    vi_state->bindings_match_attrib = true;
8714 
8715    enum amd_gfx_level chip = pdev->info.gfx_level;
8716    enum radeon_family family = pdev->info.family;
8717    const struct ac_vtx_format_info *vtx_info_table = ac_get_vtx_format_info_table(chip, family);
8718 
8719    for (unsigned i = 0; i < vertexAttributeDescriptionCount; i++) {
8720       const VkVertexInputAttributeDescription2EXT *attrib = &pVertexAttributeDescriptions[i];
8721       const VkVertexInputBindingDescription2EXT *binding = bindings[attrib->binding];
8722       unsigned loc = attrib->location;
8723 
8724       vi_state->attribute_mask |= 1u << loc;
8725       vi_state->bindings[loc] = attrib->binding;
8726       if (attrib->binding != loc)
8727          vi_state->bindings_match_attrib = false;
8728       if (binding->inputRate == VK_VERTEX_INPUT_RATE_INSTANCE) {
8729          vi_state->instance_rate_inputs |= 1u << loc;
8730          vi_state->divisors[loc] = binding->divisor;
8731          if (binding->divisor == 0) {
8732             vi_state->zero_divisors |= 1u << loc;
8733          } else if (binding->divisor > 1) {
8734             vi_state->nontrivial_divisors |= 1u << loc;
8735          }
8736       }
8737       cmd_buffer->vertex_bindings[attrib->binding].stride = binding->stride;
8738       vi_state->offsets[loc] = attrib->offset;
8739 
8740       enum pipe_format format = vk_format_map[attrib->format];
8741       const struct ac_vtx_format_info *vtx_info = &vtx_info_table[format];
8742 
8743       vi_state->formats[loc] = format;
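      /* Alignment requirements: dword alignment when channels are at least 4 bytes wide,
       * otherwise the full element size; the per-component requirement follows the channel
       * size (or the element size for packed formats), capped at 4 bytes.
       */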
8744       uint8_t format_align_req_minus_1 = vtx_info->chan_byte_size >= 4 ? 3 : (vtx_info->element_size - 1);
8745       vi_state->format_align_req_minus_1[loc] = format_align_req_minus_1;
8746       uint8_t component_align_req_minus_1 =
8747          MIN2(vtx_info->chan_byte_size ? vtx_info->chan_byte_size : vtx_info->element_size, 4) - 1;
8748       vi_state->component_align_req_minus_1[loc] = component_align_req_minus_1;
8749       vi_state->format_sizes[loc] = vtx_info->element_size;
8750       vi_state->alpha_adjust_lo |= (vtx_info->alpha_adjust & 0x1) << loc;
8751       vi_state->alpha_adjust_hi |= (vtx_info->alpha_adjust >> 1) << loc;
8752       if (G_008F0C_DST_SEL_X(vtx_info->dst_sel) == V_008F0C_SQ_SEL_Z)
8753          vi_state->post_shuffle |= BITFIELD_BIT(loc);
8754 
8755       if (!(vtx_info->has_hw_format & BITFIELD_BIT(vtx_info->num_channels - 1)))
8756          vi_state->nontrivial_formats |= BITFIELD_BIT(loc);
8757 
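      /* If a vertex buffer is already bound for this binding, re-check its stride and offset
       * against these requirements so misaligned and unaligned attributes are flagged now.
       */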
8758       if (state->vbo_bound_mask & BITFIELD_BIT(attrib->binding)) {
8759          uint32_t stride = binding->stride;
8760          uint64_t offset = cmd_buffer->vertex_bindings[attrib->binding].offset + vi_state->offsets[loc];
8761          if ((chip == GFX6 || chip >= GFX10) && ((stride | offset) & format_align_req_minus_1))
8762             state->vbo_misaligned_mask |= BITFIELD_BIT(loc);
8763          if ((stride | offset) & component_align_req_minus_1)
8764             state->vbo_unaligned_mask |= BITFIELD_BIT(loc);
8765       }
8766    }
8767 
8768    state->dirty_dynamic |= RADV_DYNAMIC_VERTEX_INPUT;
8769    state->dirty |= RADV_CMD_DIRTY_VERTEX_BUFFER;
8770 }
8771 
8772 VKAPI_ATTR void VKAPI_CALL
8773 radv_CmdSetPolygonModeEXT(VkCommandBuffer commandBuffer, VkPolygonMode polygonMode)
8774 {
8775    VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
8776    struct radv_cmd_state *state = &cmd_buffer->state;
8777    unsigned polygon_mode = radv_translate_fill(polygonMode);
8778 
8779    if (radv_polygon_mode_is_points_or_lines(state->dynamic.vk.rs.polygon_mode) !=
8780        radv_polygon_mode_is_points_or_lines(polygon_mode))
8781       state->dirty |= RADV_CMD_DIRTY_GUARDBAND;
8782 
8783    state->dynamic.vk.rs.polygon_mode = polygon_mode;
8784 
8785    state->dirty_dynamic |= RADV_DYNAMIC_POLYGON_MODE;
8786 }
8787 
8788 VKAPI_ATTR void VKAPI_CALL
8789 radv_CmdSetTessellationDomainOriginEXT(VkCommandBuffer commandBuffer, VkTessellationDomainOrigin domainOrigin)
8790 {
8791    VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
8792    struct radv_cmd_state *state = &cmd_buffer->state;
8793 
8794    state->dynamic.vk.ts.domain_origin = domainOrigin;
8795 
8796    state->dirty_dynamic |= RADV_DYNAMIC_TESS_DOMAIN_ORIGIN;
8797 }
8798 
8799 VKAPI_ATTR void VKAPI_CALL
8800 radv_CmdSetLogicOpEnableEXT(VkCommandBuffer commandBuffer, VkBool32 logicOpEnable)
8801 {
8802    VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
8803    struct radv_cmd_state *state = &cmd_buffer->state;
8804 
8805    state->dynamic.vk.cb.logic_op_enable = logicOpEnable;
8806 
8807    state->dirty_dynamic |= RADV_DYNAMIC_LOGIC_OP_ENABLE;
8808 }
8809 
8810 VKAPI_ATTR void VKAPI_CALL
8811 radv_CmdSetLineStippleEnableEXT(VkCommandBuffer commandBuffer, VkBool32 stippledLineEnable)
8812 {
8813    VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
8814    struct radv_cmd_state *state = &cmd_buffer->state;
8815 
8816    state->dynamic.vk.rs.line.stipple.enable = stippledLineEnable;
8817 
8818    state->dirty_dynamic |= RADV_DYNAMIC_LINE_STIPPLE_ENABLE;
8819 }
8820 
8821 VKAPI_ATTR void VKAPI_CALL
8822 radv_CmdSetAlphaToCoverageEnableEXT(VkCommandBuffer commandBuffer, VkBool32 alphaToCoverageEnable)
8823 {
8824    VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
8825    struct radv_cmd_state *state = &cmd_buffer->state;
8826 
8827    state->dynamic.vk.ms.alpha_to_coverage_enable = alphaToCoverageEnable;
8828 
8829    state->dirty_dynamic |= RADV_DYNAMIC_ALPHA_TO_COVERAGE_ENABLE;
8830 }
8831 
8832 VKAPI_ATTR void VKAPI_CALL
8833 radv_CmdSetAlphaToOneEnableEXT(VkCommandBuffer commandBuffer, VkBool32 alphaToOneEnable)
8834 {
8835    VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
8836    struct radv_cmd_state *state = &cmd_buffer->state;
8837 
8838    state->dynamic.vk.ms.alpha_to_one_enable = alphaToOneEnable;
8839 
8840    state->dirty_dynamic |= RADV_DYNAMIC_ALPHA_TO_ONE_ENABLE;
8841 }
8842 
8843 VKAPI_ATTR void VKAPI_CALL
8844 radv_CmdSetSampleMaskEXT(VkCommandBuffer commandBuffer, VkSampleCountFlagBits samples, const VkSampleMask *pSampleMask)
8845 {
8846    VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
8847    struct radv_cmd_state *state = &cmd_buffer->state;
8848 
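   /* Only the first mask word is consumed and only its low 16 bits are kept, which covers the
    * sample counts the driver supports.
    */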
8849    state->dynamic.vk.ms.sample_mask = pSampleMask[0] & 0xffff;
8850 
8851    state->dirty_dynamic |= RADV_DYNAMIC_SAMPLE_MASK;
8852 }
8853 
8854 VKAPI_ATTR void VKAPI_CALL
8855 radv_CmdSetDepthClipEnableEXT(VkCommandBuffer commandBuffer, VkBool32 depthClipEnable)
8856 {
8857    VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
8858    struct radv_cmd_state *state = &cmd_buffer->state;
8859 
8860    state->dynamic.vk.rs.depth_clip_enable = depthClipEnable;
8861 
8862    state->dirty_dynamic |= RADV_DYNAMIC_DEPTH_CLIP_ENABLE;
8863 }
8864 
8865 VKAPI_ATTR void VKAPI_CALL
8866 radv_CmdSetConservativeRasterizationModeEXT(VkCommandBuffer commandBuffer,
8867                                             VkConservativeRasterizationModeEXT conservativeRasterizationMode)
8868 {
8869    VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
8870    struct radv_cmd_state *state = &cmd_buffer->state;
8871 
8872    state->dynamic.vk.rs.conservative_mode = conservativeRasterizationMode;
8873 
8874    state->dirty_dynamic |= RADV_DYNAMIC_CONSERVATIVE_RAST_MODE;
8875 }
8876 
8877 VKAPI_ATTR void VKAPI_CALL
8878 radv_CmdSetDepthClipNegativeOneToOneEXT(VkCommandBuffer commandBuffer, VkBool32 negativeOneToOne)
8879 {
8880    VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
8881    struct radv_cmd_state *state = &cmd_buffer->state;
8882 
8883    state->dynamic.vk.vp.depth_clip_negative_one_to_one = negativeOneToOne;
8884 
8885    state->dirty_dynamic |= RADV_DYNAMIC_DEPTH_CLIP_NEGATIVE_ONE_TO_ONE;
8886 }
8887 
8888 VKAPI_ATTR void VKAPI_CALL
8889 radv_CmdSetProvokingVertexModeEXT(VkCommandBuffer commandBuffer, VkProvokingVertexModeEXT provokingVertexMode)
8890 {
8891    VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
8892    struct radv_cmd_state *state = &cmd_buffer->state;
8893 
8894    state->dynamic.vk.rs.provoking_vertex = provokingVertexMode;
8895 
8896    state->dirty_dynamic |= RADV_DYNAMIC_PROVOKING_VERTEX_MODE;
8897 }
8898 
8899 VKAPI_ATTR void VKAPI_CALL
8900 radv_CmdSetDepthClampEnableEXT(VkCommandBuffer commandBuffer, VkBool32 depthClampEnable)
8901 {
8902    VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
8903    struct radv_cmd_state *state = &cmd_buffer->state;
8904 
8905    state->dynamic.vk.rs.depth_clamp_enable = depthClampEnable;
8906 
8907    state->dirty_dynamic |= RADV_DYNAMIC_DEPTH_CLAMP_ENABLE;
8908 }
8909 
8910 VKAPI_ATTR void VKAPI_CALL
8911 radv_CmdSetColorWriteMaskEXT(VkCommandBuffer commandBuffer, uint32_t firstAttachment, uint32_t attachmentCount,
8912                              const VkColorComponentFlags *pColorWriteMasks)
8913 {
8914    VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
8915    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
8916    const struct radv_physical_device *pdev = radv_device_physical(device);
8917    struct radv_cmd_state *state = &cmd_buffer->state;
8918 
8919    assert(firstAttachment + attachmentCount <= MAX_RTS);
8920 
8921    for (uint32_t i = 0; i < attachmentCount; i++) {
8922       uint32_t idx = firstAttachment + i;
8923 
8924       state->dynamic.vk.cb.attachments[idx].write_mask = pColorWriteMasks[i];
8925    }
8926 
8927    state->dirty_dynamic |= RADV_DYNAMIC_COLOR_WRITE_MASK;
8928 
8929    if (pdev->info.rbplus_allowed)
8930       state->dirty |= RADV_CMD_DIRTY_RBPLUS;
8931 }
8932 
8933 VKAPI_ATTR void VKAPI_CALL
8934 radv_CmdSetColorBlendEnableEXT(VkCommandBuffer commandBuffer, uint32_t firstAttachment, uint32_t attachmentCount,
8935                                const VkBool32 *pColorBlendEnables)
8936 {
8937    VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
8938    struct radv_cmd_state *state = &cmd_buffer->state;
8939 
8940    assert(firstAttachment + attachmentCount <= MAX_RTS);
8941 
8942    for (uint32_t i = 0; i < attachmentCount; i++) {
8943       uint32_t idx = firstAttachment + i;
8944 
8945       state->dynamic.vk.cb.attachments[idx].blend_enable = pColorBlendEnables[i];
8946    }
8947 
8948    state->dirty_dynamic |= RADV_DYNAMIC_COLOR_BLEND_ENABLE;
8949 }
8950 
8951 VKAPI_ATTR void VKAPI_CALL
8952 radv_CmdSetRasterizationSamplesEXT(VkCommandBuffer commandBuffer, VkSampleCountFlagBits rasterizationSamples)
8953 {
8954    VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
8955    struct radv_cmd_state *state = &cmd_buffer->state;
8956 
8957    state->dynamic.vk.ms.rasterization_samples = rasterizationSamples;
8958 
8959    state->dirty_dynamic |= RADV_DYNAMIC_RASTERIZATION_SAMPLES;
8960 }
8961 
8962 VKAPI_ATTR void VKAPI_CALL
8963 radv_CmdSetLineRasterizationModeEXT(VkCommandBuffer commandBuffer, VkLineRasterizationModeKHR lineRasterizationMode)
8964 {
8965    VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
8966    struct radv_cmd_state *state = &cmd_buffer->state;
8967 
8968    state->dynamic.vk.rs.line.mode = lineRasterizationMode;
8969 
8970    state->dirty_dynamic |= RADV_DYNAMIC_LINE_RASTERIZATION_MODE;
8971 }
8972 
8973 VKAPI_ATTR void VKAPI_CALL
8974 radv_CmdSetColorBlendEquationEXT(VkCommandBuffer commandBuffer, uint32_t firstAttachment, uint32_t attachmentCount,
8975                                  const VkColorBlendEquationEXT *pColorBlendEquations)
8976 {
8977    VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
8978    struct radv_cmd_state *state = &cmd_buffer->state;
8979 
8980    assert(firstAttachment + attachmentCount <= MAX_RTS);
8981    for (uint32_t i = 0; i < attachmentCount; i++) {
8982       unsigned idx = firstAttachment + i;
8983 
8984       state->dynamic.vk.cb.attachments[idx].src_color_blend_factor = pColorBlendEquations[i].srcColorBlendFactor;
8985       state->dynamic.vk.cb.attachments[idx].dst_color_blend_factor = pColorBlendEquations[i].dstColorBlendFactor;
8986       state->dynamic.vk.cb.attachments[idx].color_blend_op = pColorBlendEquations[i].colorBlendOp;
8987       state->dynamic.vk.cb.attachments[idx].src_alpha_blend_factor = pColorBlendEquations[i].srcAlphaBlendFactor;
8988       state->dynamic.vk.cb.attachments[idx].dst_alpha_blend_factor = pColorBlendEquations[i].dstAlphaBlendFactor;
8989       state->dynamic.vk.cb.attachments[idx].alpha_blend_op = pColorBlendEquations[i].alphaBlendOp;
8990    }
8991 
8992    state->dirty_dynamic |= RADV_DYNAMIC_COLOR_BLEND_EQUATION;
8993 }
8994 
8995 VKAPI_ATTR void VKAPI_CALL
8996 radv_CmdSetSampleLocationsEnableEXT(VkCommandBuffer commandBuffer, VkBool32 sampleLocationsEnable)
8997 {
8998    VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
8999    struct radv_cmd_state *state = &cmd_buffer->state;
9000 
9001    state->dynamic.vk.ms.sample_locations_enable = sampleLocationsEnable;
9002 
9003    state->dirty_dynamic |= RADV_DYNAMIC_SAMPLE_LOCATIONS_ENABLE;
9004 }
9005 
9006 VKAPI_ATTR void VKAPI_CALL
9007 radv_CmdSetDiscardRectangleEnableEXT(VkCommandBuffer commandBuffer, VkBool32 discardRectangleEnable)
9008 {
9009    VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
9010    struct radv_cmd_state *state = &cmd_buffer->state;
9011 
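   /* The rectangle count is not known here, so conservatively assume the maximum when enabling. */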
9012    state->dynamic.vk.dr.enable = discardRectangleEnable;
9013    state->dynamic.vk.dr.rectangle_count = discardRectangleEnable ? MAX_DISCARD_RECTANGLES : 0;
9014 
9015    state->dirty_dynamic |= RADV_DYNAMIC_DISCARD_RECTANGLE_ENABLE;
9016 }
9017 
9018 VKAPI_ATTR void VKAPI_CALL
9019 radv_CmdSetDiscardRectangleModeEXT(VkCommandBuffer commandBuffer, VkDiscardRectangleModeEXT discardRectangleMode)
9020 {
9021    VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
9022    struct radv_cmd_state *state = &cmd_buffer->state;
9023 
9024    state->dynamic.vk.dr.mode = discardRectangleMode;
9025 
9026    state->dirty_dynamic |= RADV_DYNAMIC_DISCARD_RECTANGLE_MODE;
9027 }
9028 
9029 VKAPI_ATTR void VKAPI_CALL
9030 radv_CmdSetAttachmentFeedbackLoopEnableEXT(VkCommandBuffer commandBuffer, VkImageAspectFlags aspectMask)
9031 {
9032    VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
9033    struct radv_cmd_state *state = &cmd_buffer->state;
9034 
9035    state->dynamic.feedback_loop_aspects = aspectMask;
9036 
9037    state->dirty_dynamic |= RADV_DYNAMIC_ATTACHMENT_FEEDBACK_LOOP_ENABLE;
9038 }
9039 
9040 VKAPI_ATTR void VKAPI_CALL
9041 radv_CmdSetDepthBias2EXT(VkCommandBuffer commandBuffer, const VkDepthBiasInfoEXT *pDepthBiasInfo)
9042 {
9043    VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
9044    struct radv_cmd_state *state = &cmd_buffer->state;
9045 
9046    const VkDepthBiasRepresentationInfoEXT *dbr_info =
9047       vk_find_struct_const(pDepthBiasInfo->pNext, DEPTH_BIAS_REPRESENTATION_INFO_EXT);
9048 
9049    state->dynamic.vk.rs.depth_bias.constant = pDepthBiasInfo->depthBiasConstantFactor;
9050    state->dynamic.vk.rs.depth_bias.clamp = pDepthBiasInfo->depthBiasClamp;
9051    state->dynamic.vk.rs.depth_bias.slope = pDepthBiasInfo->depthBiasSlopeFactor;
9052    state->dynamic.vk.rs.depth_bias.representation =
9053       dbr_info ? dbr_info->depthBiasRepresentation : VK_DEPTH_BIAS_REPRESENTATION_LEAST_REPRESENTABLE_VALUE_FORMAT_EXT;
9054 
9055    state->dirty_dynamic |= RADV_DYNAMIC_DEPTH_BIAS;
9056 }
9057 
9058 VKAPI_ATTR void VKAPI_CALL
9059 radv_CmdSetRenderingAttachmentLocationsKHR(VkCommandBuffer commandBuffer,
9060                                            const VkRenderingAttachmentLocationInfoKHR *pLocationInfo)
9061 {
9062    VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
9063    struct radv_cmd_state *state = &cmd_buffer->state;
9064 
9065    assume(pLocationInfo->colorAttachmentCount <= MESA_VK_MAX_COLOR_ATTACHMENTS);
9066    for (uint32_t i = 0; i < pLocationInfo->colorAttachmentCount; i++) {
9067       state->dynamic.vk.cal.color_map[i] = pLocationInfo->pColorAttachmentLocations[i] == VK_ATTACHMENT_UNUSED
9068                                               ? MESA_VK_ATTACHMENT_UNUSED
9069                                               : pLocationInfo->pColorAttachmentLocations[i];
9070    }
9071 
9072    state->dirty_dynamic |= RADV_DYNAMIC_COLOR_ATTACHMENT_MAP;
9073    state->dirty |= RADV_CMD_DIRTY_FBFETCH_OUTPUT;
9074 }
9075 
9076 VKAPI_ATTR void VKAPI_CALL
9077 radv_CmdSetRenderingInputAttachmentIndicesKHR(VkCommandBuffer commandBuffer,
9078                                               const VkRenderingInputAttachmentIndexInfoKHR *pLocationInfo)
9079 {
9080    VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
9081    struct radv_cmd_state *state = &cmd_buffer->state;
9082 
9083    assume(pLocationInfo->colorAttachmentCount <= MESA_VK_MAX_COLOR_ATTACHMENTS);
9084    for (uint32_t i = 0; i < pLocationInfo->colorAttachmentCount; i++) {
9085       uint8_t val;
9086 
9087       if (!pLocationInfo->pColorAttachmentInputIndices) {
9088          val = i;
9089       } else if (pLocationInfo->pColorAttachmentInputIndices[i] == VK_ATTACHMENT_UNUSED) {
9090          val = MESA_VK_ATTACHMENT_UNUSED;
9091       } else {
9092          val = pLocationInfo->pColorAttachmentInputIndices[i];
9093       }
9094 
9095       state->dynamic.vk.ial.color_map[i] = val;
9096    }
9097 
9098    state->dynamic.vk.ial.depth_att = (pLocationInfo->pDepthInputAttachmentIndex == NULL ||
9099                                       *pLocationInfo->pDepthInputAttachmentIndex == VK_ATTACHMENT_UNUSED)
9100                                         ? MESA_VK_ATTACHMENT_UNUSED
9101                                         : *pLocationInfo->pDepthInputAttachmentIndex;
9102    state->dynamic.vk.ial.stencil_att = (pLocationInfo->pStencilInputAttachmentIndex == NULL ||
9103                                         *pLocationInfo->pStencilInputAttachmentIndex == VK_ATTACHMENT_UNUSED)
9104                                           ? MESA_VK_ATTACHMENT_UNUSED
9105                                           : *pLocationInfo->pStencilInputAttachmentIndex;
9106 
9107    state->dirty_dynamic |= RADV_DYNAMIC_INPUT_ATTACHMENT_MAP;
9108    state->dirty |= RADV_CMD_DIRTY_FBFETCH_OUTPUT;
9109 }
9110 
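/* If a possibly DCC-compressed color attachment is also read as an input attachment by the
 * bound fragment shader, transition it to the feedback-loop layout (decompressing DCC) and
 * emit the flushes required to avoid hazards with in-flight rendering.
 */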
9111 static void
9112 radv_handle_color_fbfetch_output(struct radv_cmd_buffer *cmd_buffer, uint32_t index)
9113 {
9114    const struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
9115    struct radv_rendering_state *render = &cmd_buffer->state.render;
9116    const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
9117    struct radv_attachment *att = &render->color_att[index];
9118 
9119    if (!att->iview)
9120       return;
9121 
9122    const struct radv_image *image = att->iview->image;
9123    if (!(image->vk.usage & VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT))
9124       return;
9125 
9126    if (!radv_layout_dcc_compressed(device, image, att->iview->vk.base_mip_level, att->layout,
9127                                    radv_image_queue_family_mask(att->iview->image, cmd_buffer->qf, cmd_buffer->qf)))
9128       return;
9129 
9130    const uint32_t color_att_idx = d->vk.cal.color_map[index];
9131    if (color_att_idx == MESA_VK_ATTACHMENT_UNUSED)
9132       return;
9133 
9134    if (d->vk.ial.color_map[color_att_idx] != color_att_idx)
9135       return;
9136 
9137    const VkImageSubresourceRange range = {
9138       .aspectMask = att->iview->vk.aspects,
9139       .baseMipLevel = att->iview->vk.base_mip_level,
9140       .levelCount = att->iview->vk.level_count,
9141       .baseArrayLayer = att->iview->vk.base_array_layer,
9142       .layerCount = att->iview->vk.layer_count,
9143    };
9144 
9145    /* Consider previous rendering work for WAW hazards. */
9146    cmd_buffer->state.flush_bits |= radv_src_access_flush(cmd_buffer, VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT,
9147                                                          VK_ACCESS_2_COLOR_ATTACHMENT_WRITE_BIT, att->iview->image);
9148 
9149    /* Force a transition to FEEDBACK_LOOP_OPTIMAL to decompress DCC. */
9150    radv_handle_image_transition(cmd_buffer, att->iview->image, att->layout,
9151                                 VK_IMAGE_LAYOUT_ATTACHMENT_FEEDBACK_LOOP_OPTIMAL_EXT, RADV_QUEUE_GENERAL,
9152                                 RADV_QUEUE_GENERAL, &range, NULL);
9153 
9154    att->layout = VK_IMAGE_LAYOUT_ATTACHMENT_FEEDBACK_LOOP_OPTIMAL_EXT;
9155 
9156    cmd_buffer->state.flush_bits |= radv_dst_access_flush(
9157       cmd_buffer, VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT,
9158       VK_ACCESS_2_INPUT_ATTACHMENT_READ_BIT | VK_ACCESS_2_COLOR_ATTACHMENT_READ_BIT, att->iview->image);
9159 
9160    cmd_buffer->state.dirty |= RADV_CMD_DIRTY_FRAMEBUFFER;
9161 }
9162 
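/* Same idea as the color path above, but for a depth/stencil attachment whose HTILE is
 * compressed and which is also read as an input attachment.
 */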
9163 static void
9164 radv_handle_depth_fbfetch_output(struct radv_cmd_buffer *cmd_buffer)
9165 {
9166    const struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
9167    struct radv_rendering_state *render = &cmd_buffer->state.render;
9168    const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
9169    struct radv_attachment *att = &render->ds_att;
9170 
9171    if (!att->iview)
9172       return;
9173 
9174    const struct radv_image *image = att->iview->image;
9175    if (!(image->vk.usage & VK_IMAGE_USAGE_INPUT_ATTACHMENT_BIT))
9176       return;
9177 
9178    if (!radv_layout_is_htile_compressed(
9179           device, att->iview->image, att->layout,
9180           radv_image_queue_family_mask(att->iview->image, cmd_buffer->qf, cmd_buffer->qf)))
9181       return;
9182 
9183    if (d->vk.ial.depth_att == MESA_VK_ATTACHMENT_UNUSED && d->vk.ial.stencil_att == MESA_VK_ATTACHMENT_UNUSED)
9184       return;
9185 
9186    const VkImageSubresourceRange range = {
9187       .aspectMask = att->iview->vk.aspects,
9188       .baseMipLevel = att->iview->vk.base_mip_level,
9189       .levelCount = att->iview->vk.level_count,
9190       .baseArrayLayer = att->iview->vk.base_array_layer,
9191       .layerCount = att->iview->vk.layer_count,
9192    };
9193 
9194    /* Consider previous rendering work for WAW hazards. */
9195    cmd_buffer->state.flush_bits |=
9196       radv_src_access_flush(cmd_buffer, VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT,
9197                             VK_ACCESS_2_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT, att->iview->image);
9198 
9199    /* Force a transition to FEEDBACK_LOOP_OPTIMAL to decompress HTILE. */
9200    radv_handle_image_transition(cmd_buffer, att->iview->image, att->layout,
9201                                 VK_IMAGE_LAYOUT_ATTACHMENT_FEEDBACK_LOOP_OPTIMAL_EXT, RADV_QUEUE_GENERAL,
9202                                 RADV_QUEUE_GENERAL, &range, NULL);
9203 
9204    att->layout = VK_IMAGE_LAYOUT_ATTACHMENT_FEEDBACK_LOOP_OPTIMAL_EXT;
9205    att->stencil_layout = VK_IMAGE_LAYOUT_ATTACHMENT_FEEDBACK_LOOP_OPTIMAL_EXT;
9206 
9207    cmd_buffer->state.flush_bits |= radv_dst_access_flush(
9208       cmd_buffer, VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT,
9209       VK_ACCESS_2_INPUT_ATTACHMENT_READ_BIT | VK_ACCESS_2_DEPTH_STENCIL_ATTACHMENT_READ_BIT, att->iview->image);
9210 
9211    cmd_buffer->state.dirty |= RADV_CMD_DIRTY_FRAMEBUFFER;
9212 }
9213 
9214 static void
9215 radv_handle_fbfetch_output(struct radv_cmd_buffer *cmd_buffer)
9216 {
9217    const struct radv_rendering_state *render = &cmd_buffer->state.render;
9218 
9219    cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_FBFETCH_OUTPUT;
9220 
9221    /* Nothing to do when dynamic rendering doesn't use concurrent input attachment writes. */
9222    if (render->has_input_attachment_no_concurrent_writes)
9223       return;
9224 
9225    /* Nothing to do when the bound fragment shader doesn't use subpass input attachments. */
9226    if (!cmd_buffer->state.uses_fbfetch_output)
9227       return;
9228 
9229    /* Check if any color attachments are compressed and also used as input attachments. */
9230    for (uint32_t i = 0; i < render->color_att_count; i++) {
9231       radv_handle_color_fbfetch_output(cmd_buffer, i);
9232    }
9233 
9234    /* Check if the depth/stencil attachment is compressed and also used as input attachment. */
9235    radv_handle_depth_fbfetch_output(cmd_buffer);
9236 }
9237 
9238 VKAPI_ATTR void VKAPI_CALL
9239 radv_CmdExecuteCommands(VkCommandBuffer commandBuffer, uint32_t commandBufferCount, const VkCommandBuffer *pCmdBuffers)
9240 {
9241    VK_FROM_HANDLE(radv_cmd_buffer, primary, commandBuffer);
9242    struct radv_device *device = radv_cmd_buffer_device(primary);
9243    const struct radv_physical_device *pdev = radv_device_physical(device);
9244 
9245    assert(commandBufferCount > 0);
9246 
9247    radv_emit_mip_change_flush_default(primary);
9248 
9249    /* Emit pending flushes on primary prior to executing secondary */
9250    radv_emit_cache_flush(primary);
9251 
9252    /* Make sure CP DMA is idle on primary prior to executing secondary. */
9253    radv_cp_dma_wait_for_idle(primary);
9254 
9255    for (uint32_t i = 0; i < commandBufferCount; i++) {
9256       VK_FROM_HANDLE(radv_cmd_buffer, secondary, pCmdBuffers[i]);
9257 
9258       /* Do not launch an IB2 for secondary command buffers that contain
9259        * DRAW_{INDEX}_INDIRECT_{MULTI} on GFX6-7 because it's illegal and hangs the GPU.
9260        */
9261       const bool allow_ib2 = !secondary->state.uses_draw_indirect || pdev->info.gfx_level >= GFX8;
9262 
9263       primary->scratch_size_per_wave_needed =
9264          MAX2(primary->scratch_size_per_wave_needed, secondary->scratch_size_per_wave_needed);
9265       primary->scratch_waves_wanted = MAX2(primary->scratch_waves_wanted, secondary->scratch_waves_wanted);
9266       primary->compute_scratch_size_per_wave_needed =
9267          MAX2(primary->compute_scratch_size_per_wave_needed, secondary->compute_scratch_size_per_wave_needed);
9268       primary->compute_scratch_waves_wanted =
9269          MAX2(primary->compute_scratch_waves_wanted, secondary->compute_scratch_waves_wanted);
9270 
9271       if (secondary->esgs_ring_size_needed > primary->esgs_ring_size_needed)
9272          primary->esgs_ring_size_needed = secondary->esgs_ring_size_needed;
9273       if (secondary->gsvs_ring_size_needed > primary->gsvs_ring_size_needed)
9274          primary->gsvs_ring_size_needed = secondary->gsvs_ring_size_needed;
9275       if (secondary->tess_rings_needed)
9276          primary->tess_rings_needed = true;
9277       if (secondary->task_rings_needed)
9278          primary->task_rings_needed = true;
9279       if (secondary->mesh_scratch_ring_needed)
9280          primary->mesh_scratch_ring_needed = true;
9281       if (secondary->sample_positions_needed)
9282          primary->sample_positions_needed = true;
9283       if (secondary->gds_needed)
9284          primary->gds_needed = true;
9285       if (secondary->gds_oa_needed)
9286          primary->gds_oa_needed = true;
9287 
9288       primary->shader_upload_seq = MAX2(primary->shader_upload_seq, secondary->shader_upload_seq);
9289 
9290       primary->state.uses_fbfetch_output |= secondary->state.uses_fbfetch_output;
9291 
9292       if (!secondary->state.render.has_image_views) {
9293          if (primary->state.dirty & RADV_CMD_DIRTY_FBFETCH_OUTPUT)
9294             radv_handle_fbfetch_output(primary);
9295 
9296          if (primary->state.render.active && (primary->state.dirty & RADV_CMD_DIRTY_FRAMEBUFFER)) {
9297             /* Emit the framebuffer state from primary if secondary
9298              * has been recorded without a framebuffer, otherwise
9299              * fast color/depth clears can't work.
9300              */
9301             radv_emit_framebuffer_state(primary);
9302          }
9303       }
9304 
9305       if (secondary->gang.cs) {
9306          if (!radv_gang_init(primary))
9307             return;
9308 
9309          struct radeon_cmdbuf *ace_primary = primary->gang.cs;
9310          struct radeon_cmdbuf *ace_secondary = secondary->gang.cs;
9311 
9312          /* Emit pending flushes on primary prior to executing secondary. */
9313          radv_gang_cache_flush(primary);
9314 
9315          /* Wait for gang semaphores, if necessary. */
9316          if (radv_flush_gang_leader_semaphore(primary))
9317             radv_wait_gang_leader(primary);
9318          if (radv_flush_gang_follower_semaphore(primary))
9319             radv_wait_gang_follower(primary);
9320 
9321          /* Execute the secondary compute cmdbuf.
9322           * Don't use IB2 packets because they are not supported on compute queues.
9323           */
9324          device->ws->cs_execute_secondary(ace_primary, ace_secondary, false);
9325       }
9326 
9327       /* Update pending ACE internal flush bits from the secondary cmdbuf */
9328       primary->gang.flush_bits |= secondary->gang.flush_bits;
9329 
9330       /* Increment gang semaphores if secondary was dirty.
9331        * This happens when the secondary cmdbuf has a barrier which
9332        * isn't consumed by a draw call.
9333        */
9334       if (radv_gang_leader_sem_dirty(secondary))
9335          primary->gang.sem.leader_value++;
9336       if (radv_gang_follower_sem_dirty(secondary))
9337          primary->gang.sem.follower_value++;
9338 
9339       device->ws->cs_execute_secondary(primary->cs, secondary->cs, allow_ib2);
9340 
9341       /* When the secondary command buffer is compute only we don't
9342        * need to re-emit the current graphics pipeline.
9343        */
9344       if (secondary->state.emitted_graphics_pipeline) {
9345          primary->state.emitted_graphics_pipeline = secondary->state.emitted_graphics_pipeline;
9346       }
9347 
9348       /* When the secondary command buffer is graphics only we don't
9349        * need to re-emit the current compute pipeline.
9350        */
9351       if (secondary->state.emitted_compute_pipeline) {
9352          primary->state.emitted_compute_pipeline = secondary->state.emitted_compute_pipeline;
9353       }
9354 
9355       if (secondary->state.last_ia_multi_vgt_param) {
9356          primary->state.last_ia_multi_vgt_param = secondary->state.last_ia_multi_vgt_param;
9357       }
9358 
9359       if (secondary->state.last_ge_cntl) {
9360          primary->state.last_ge_cntl = secondary->state.last_ge_cntl;
9361       }
9362 
9363       primary->state.last_num_instances = secondary->state.last_num_instances;
9364       primary->state.last_subpass_color_count = secondary->state.last_subpass_color_count;
9365 
9366       if (secondary->state.last_index_type != -1) {
9367          primary->state.last_index_type = secondary->state.last_index_type;
9368       }
9369 
9370       primary->state.last_vrs_rates = secondary->state.last_vrs_rates;
9371       primary->state.last_force_vrs_rates_offset = secondary->state.last_force_vrs_rates_offset;
9372 
9373       primary->state.rb_noncoherent_dirty |= secondary->state.rb_noncoherent_dirty;
9374 
9375       primary->state.uses_draw_indirect |= secondary->state.uses_draw_indirect;
9376 
9377       for (uint32_t reg = 0; reg < RADV_NUM_ALL_TRACKED_REGS; reg++) {
9378          if (!BITSET_TEST(secondary->tracked_regs.reg_saved_mask, reg))
9379             continue;
9380 
9381          BITSET_SET(primary->tracked_regs.reg_saved_mask, reg);
9382          primary->tracked_regs.reg_value[reg] = secondary->tracked_regs.reg_value[reg];
9383       }
9384 
9385       memcpy(primary->tracked_regs.spi_ps_input_cntl, secondary->tracked_regs.spi_ps_input_cntl,
9386              sizeof(primary->tracked_regs.spi_ps_input_cntl));
9387    }
9388 
9389    /* After executing commands from secondary buffers we have to dirty
9390     * some states.
9391     */
9392    primary->state.dirty_dynamic |= RADV_DYNAMIC_ALL;
9393    primary->state.dirty |= RADV_CMD_DIRTY_PIPELINE | RADV_CMD_DIRTY_INDEX_BUFFER | RADV_CMD_DIRTY_GUARDBAND |
9394                            RADV_CMD_DIRTY_SHADER_QUERY | RADV_CMD_DIRTY_OCCLUSION_QUERY |
9395                            RADV_CMD_DIRTY_DB_SHADER_CONTROL | RADV_CMD_DIRTY_COLOR_OUTPUT;
9396    radv_mark_descriptor_sets_dirty(primary, VK_PIPELINE_BIND_POINT_GRAPHICS);
9397    radv_mark_descriptor_sets_dirty(primary, VK_PIPELINE_BIND_POINT_COMPUTE);
9398 
9399    primary->state.last_first_instance = -1;
9400    primary->state.last_drawid = -1;
9401    primary->state.last_vertex_offset_valid = false;
9402 }
9403 
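/*
 * Editorial sketch (not part of the driver): radv_CmdExecuteCommands() above makes
 * the primary inherit the worst-case resource needs of every secondary it executes.
 * The struct and helper below are hypothetical and only illustrate that max/OR
 * merging pattern.
 */
struct sketch_cmdbuf_needs {
   uint32_t scratch_size_per_wave;
   uint32_t scratch_waves;
   bool tess_rings;
};

static inline void
sketch_merge_needs(struct sketch_cmdbuf_needs *primary, const struct sketch_cmdbuf_needs *secondary)
{
   /* Scratch requirements are maxima: the primary must request at least as much. */
   primary->scratch_size_per_wave = MAX2(primary->scratch_size_per_wave, secondary->scratch_size_per_wave);
   primary->scratch_waves = MAX2(primary->scratch_waves, secondary->scratch_waves);
   /* Ring/flag needs are sticky: once any secondary needs them, the primary does too. */
   primary->tess_rings |= secondary->tess_rings;
}
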
9404 static void
9405 radv_mark_noncoherent_rb(struct radv_cmd_buffer *cmd_buffer)
9406 {
9407    struct radv_rendering_state *render = &cmd_buffer->state.render;
9408 
9409    /* Have to be conservative in cmdbuffers with inherited attachments. */
9410    if (!render->has_image_views) {
9411       cmd_buffer->state.rb_noncoherent_dirty = true;
9412       return;
9413    }
9414 
9415    for (uint32_t i = 0; i < render->color_att_count; i++) {
9416       if (render->color_att[i].iview && !render->color_att[i].iview->image->l2_coherent) {
9417          cmd_buffer->state.rb_noncoherent_dirty = true;
9418          return;
9419       }
9420    }
9421    if (render->ds_att.iview && !render->ds_att.iview->image->l2_coherent)
9422       cmd_buffer->state.rb_noncoherent_dirty = true;
9423 }
9424 
9425 static VkImageLayout
9426 attachment_initial_layout(const VkRenderingAttachmentInfo *att)
9427 {
9428    const VkRenderingAttachmentInitialLayoutInfoMESA *layout_info =
9429       vk_find_struct_const(att->pNext, RENDERING_ATTACHMENT_INITIAL_LAYOUT_INFO_MESA);
9430    if (layout_info != NULL)
9431       return layout_info->initialLayout;
9432 
9433    return att->imageLayout;
9434 }
9435 
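/*
 * Editorial sketch (not part of the driver): attachment_initial_layout() above
 * relies on vk_find_struct_const() to search the pNext chain for a structure with
 * a given sType. A minimal sketch of that lookup, written against the core
 * VkBaseInStructure type; illustrative only.
 */
static inline const void *
sketch_find_in_chain(const void *chain, VkStructureType wanted)
{
   for (const VkBaseInStructure *s = chain; s != NULL; s = s->pNext) {
      if (s->sType == wanted)
         return s;
   }
   return NULL;
}
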
9436 VKAPI_ATTR void VKAPI_CALL
9437 radv_CmdBeginRendering(VkCommandBuffer commandBuffer, const VkRenderingInfo *pRenderingInfo)
9438 {
9439    VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
9440    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
9441    const struct radv_physical_device *pdev = radv_device_physical(device);
9442 
9443    const struct VkSampleLocationsInfoEXT *sample_locs_info =
9444       vk_find_struct_const(pRenderingInfo->pNext, SAMPLE_LOCATIONS_INFO_EXT);
9445 
9446    struct radv_sample_locations_state sample_locations = {
9447       .count = 0,
9448    };
9449    if (sample_locs_info) {
9450       sample_locations = (struct radv_sample_locations_state){
9451          .per_pixel = sample_locs_info->sampleLocationsPerPixel,
9452          .grid_size = sample_locs_info->sampleLocationGridSize,
9453          .count = sample_locs_info->sampleLocationsCount,
9454       };
9455       typed_memcpy(sample_locations.locations, sample_locs_info->pSampleLocations,
9456                    sample_locs_info->sampleLocationsCount);
9457    }
9458 
9459    /* Dynamic rendering does not have implicit transitions, so limit the marker to
9460     * when a render pass is used.
9461     * Additionally, some internal meta operations called inside a barrier may issue
9462     * render calls (with dynamic rendering), so this makes sure those cases don't
9463     * create a nested barrier scope.
9464     */
9465    if (cmd_buffer->vk.render_pass)
9466       radv_describe_barrier_start(cmd_buffer, RGP_BARRIER_EXTERNAL_RENDER_PASS_SYNC);
9467    uint32_t color_samples = 0, ds_samples = 0;
9468    struct radv_attachment color_att[MAX_RTS];
9469    for (uint32_t i = 0; i < pRenderingInfo->colorAttachmentCount; i++) {
9470       const VkRenderingAttachmentInfo *att_info = &pRenderingInfo->pColorAttachments[i];
9471 
9472       color_att[i] = (struct radv_attachment){.iview = NULL};
9473       if (att_info->imageView == VK_NULL_HANDLE)
9474          continue;
9475 
9476       VK_FROM_HANDLE(radv_image_view, iview, att_info->imageView);
9477       color_att[i].format = iview->vk.format;
9478       color_att[i].iview = iview;
9479       color_att[i].layout = att_info->imageLayout;
9480       radv_initialise_color_surface(device, &color_att[i].cb, iview);
9481 
9482       if (att_info->resolveMode != VK_RESOLVE_MODE_NONE && att_info->resolveImageView != VK_NULL_HANDLE) {
9483          color_att[i].resolve_mode = att_info->resolveMode;
9484          color_att[i].resolve_iview = radv_image_view_from_handle(att_info->resolveImageView);
9485          color_att[i].resolve_layout = att_info->resolveImageLayout;
9486       }
9487 
9488       color_samples = MAX2(color_samples, color_att[i].iview->vk.image->samples);
9489 
9490       VkImageLayout initial_layout = attachment_initial_layout(att_info);
9491       if (initial_layout != color_att[i].layout) {
9492          assert(!(pRenderingInfo->flags & VK_RENDERING_RESUMING_BIT));
9493          radv_handle_rendering_image_transition(cmd_buffer, color_att[i].iview, pRenderingInfo->layerCount,
9494                                                 pRenderingInfo->viewMask, initial_layout, VK_IMAGE_LAYOUT_UNDEFINED,
9495                                                 color_att[i].layout, VK_IMAGE_LAYOUT_UNDEFINED, &sample_locations);
9496       }
9497    }
9498 
9499    struct radv_attachment ds_att = {.iview = NULL};
9500    VkImageAspectFlags ds_att_aspects = 0;
9501    const VkRenderingAttachmentInfo *d_att_info = pRenderingInfo->pDepthAttachment;
9502    const VkRenderingAttachmentInfo *s_att_info = pRenderingInfo->pStencilAttachment;
9503    if ((d_att_info != NULL && d_att_info->imageView != VK_NULL_HANDLE) ||
9504        (s_att_info != NULL && s_att_info->imageView != VK_NULL_HANDLE)) {
9505       struct radv_image_view *d_iview = NULL, *s_iview = NULL;
9506       struct radv_image_view *d_res_iview = NULL, *s_res_iview = NULL;
9507       VkImageLayout initial_depth_layout = VK_IMAGE_LAYOUT_UNDEFINED;
9508       VkImageLayout initial_stencil_layout = VK_IMAGE_LAYOUT_UNDEFINED;
9509 
9510       if (d_att_info != NULL && d_att_info->imageView != VK_NULL_HANDLE) {
9511          d_iview = radv_image_view_from_handle(d_att_info->imageView);
9512          initial_depth_layout = attachment_initial_layout(d_att_info);
9513          ds_att.layout = d_att_info->imageLayout;
9514 
9515          if (d_att_info->resolveMode != VK_RESOLVE_MODE_NONE && d_att_info->resolveImageView != VK_NULL_HANDLE) {
9516             d_res_iview = radv_image_view_from_handle(d_att_info->resolveImageView);
9517             ds_att.resolve_mode = d_att_info->resolveMode;
9518             ds_att.resolve_layout = d_att_info->resolveImageLayout;
9519          }
9520       }
9521 
9522       if (s_att_info != NULL && s_att_info->imageView != VK_NULL_HANDLE) {
9523          s_iview = radv_image_view_from_handle(s_att_info->imageView);
9524          initial_stencil_layout = attachment_initial_layout(s_att_info);
9525          ds_att.stencil_layout = s_att_info->imageLayout;
9526 
9527          if (s_att_info->resolveMode != VK_RESOLVE_MODE_NONE && s_att_info->resolveImageView != VK_NULL_HANDLE) {
9528             s_res_iview = radv_image_view_from_handle(s_att_info->resolveImageView);
9529             ds_att.stencil_resolve_mode = s_att_info->resolveMode;
9530             ds_att.stencil_resolve_layout = s_att_info->resolveImageLayout;
9531          }
9532       }
9533 
9534       assert(d_iview == NULL || s_iview == NULL || d_iview == s_iview);
9535       ds_att.iview = d_iview ? d_iview : s_iview, ds_att.format = ds_att.iview->vk.format;
9536 
9537       if (d_iview && s_iview) {
9538          ds_att_aspects = VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT;
9539       } else if (d_iview) {
9540          ds_att_aspects = VK_IMAGE_ASPECT_DEPTH_BIT;
9541       } else {
9542          ds_att_aspects = VK_IMAGE_ASPECT_STENCIL_BIT;
9543       }
9544 
9545       radv_initialise_ds_surface(device, &ds_att.ds, ds_att.iview, ds_att_aspects);
9546 
9547       assert(d_res_iview == NULL || s_res_iview == NULL || d_res_iview == s_res_iview);
9548       ds_att.resolve_iview = d_res_iview ? d_res_iview : s_res_iview;
9549 
9550       ds_samples = ds_att.iview->vk.image->samples;
9551 
9552       if (initial_depth_layout != ds_att.layout || initial_stencil_layout != ds_att.stencil_layout) {
9553          assert(!(pRenderingInfo->flags & VK_RENDERING_RESUMING_BIT));
9554          radv_handle_rendering_image_transition(cmd_buffer, ds_att.iview, pRenderingInfo->layerCount,
9555                                                 pRenderingInfo->viewMask, initial_depth_layout, initial_stencil_layout,
9556                                                 ds_att.layout, ds_att.stencil_layout, &sample_locations);
9557       }
9558    }
9559    if (cmd_buffer->vk.render_pass)
9560       radv_describe_barrier_end(cmd_buffer);
9561 
9562    const VkRenderingFragmentShadingRateAttachmentInfoKHR *fsr_info =
9563       vk_find_struct_const(pRenderingInfo->pNext, RENDERING_FRAGMENT_SHADING_RATE_ATTACHMENT_INFO_KHR);
9564    struct radv_attachment vrs_att = {.iview = NULL};
9565    VkExtent2D vrs_texel_size = {.width = 0};
9566    if (fsr_info && fsr_info->imageView) {
9567       VK_FROM_HANDLE(radv_image_view, iview, fsr_info->imageView);
9568       vrs_att = (struct radv_attachment){
9569          .format = iview->vk.format,
9570          .iview = iview,
9571          .layout = fsr_info->imageLayout,
9572       };
9573       vrs_texel_size = fsr_info->shadingRateAttachmentTexelSize;
9574    }
9575 
9576    /* Now that we've done any layout transitions which may invoke meta, we can
9577     * fill out the actual rendering info and set up for the client's render pass.
9578     */
9579    radv_cmd_buffer_reset_rendering(cmd_buffer);
9580 
9581    struct radv_rendering_state *render = &cmd_buffer->state.render;
9582    render->active = true;
9583    render->has_image_views = true;
9584    render->has_input_attachment_no_concurrent_writes =
9585       !!(pRenderingInfo->flags & VK_RENDERING_INPUT_ATTACHMENT_NO_CONCURRENT_WRITES_BIT_MESA);
9586    render->area = pRenderingInfo->renderArea;
9587    render->view_mask = pRenderingInfo->viewMask;
9588    render->layer_count = pRenderingInfo->layerCount;
9589    render->color_samples = color_samples;
9590    render->ds_samples = ds_samples;
9591    render->max_samples = MAX2(color_samples, ds_samples);
9592    render->sample_locations = sample_locations;
9593    render->color_att_count = pRenderingInfo->colorAttachmentCount;
9594    typed_memcpy(render->color_att, color_att, render->color_att_count);
9595    render->ds_att = ds_att;
9596    render->ds_att_aspects = ds_att_aspects;
9597    render->vrs_att = vrs_att;
9598    render->vrs_texel_size = vrs_texel_size;
9599    cmd_buffer->state.dirty |= RADV_CMD_DIRTY_FRAMEBUFFER | RADV_CMD_DIRTY_FBFETCH_OUTPUT;
9600 
9601    if (pdev->info.rbplus_allowed)
9602       cmd_buffer->state.dirty |= RADV_CMD_DIRTY_RBPLUS;
9603 
9604    cmd_buffer->state.dirty_dynamic |= RADV_DYNAMIC_DEPTH_BIAS | RADV_DYNAMIC_STENCIL_TEST_ENABLE;
9605    if (pdev->info.gfx_level >= GFX12)
9606       cmd_buffer->state.dirty_dynamic |= RADV_DYNAMIC_RASTERIZATION_SAMPLES;
9607 
9608    if (render->vrs_att.iview && pdev->info.gfx_level == GFX10_3) {
9609       if (render->ds_att.iview &&
9610           radv_htile_enabled(render->ds_att.iview->image, render->ds_att.iview->vk.base_mip_level)) {
9611          /* When we have a VRS attachment and a depth/stencil attachment, we just need to copy the
9612           * VRS rates to the HTILE buffer of the attachment.
9613           */
9614          struct radv_image_view *ds_iview = render->ds_att.iview;
9615          struct radv_image *ds_image = ds_iview->image;
9616          uint32_t level = ds_iview->vk.base_mip_level;
9617 
9618          /* HTILE buffer */
9619          uint64_t htile_offset = ds_image->bindings[0].offset + ds_image->planes[0].surface.meta_offset +
9620                                  ds_image->planes[0].surface.u.gfx9.meta_levels[level].offset;
9621          uint64_t htile_size = ds_image->planes[0].surface.u.gfx9.meta_levels[level].size;
9622          struct radv_buffer htile_buffer;
9623 
9624          radv_buffer_init(&htile_buffer, device, ds_image->bindings[0].bo, htile_size, htile_offset);
9625 
9626          assert(render->area.offset.x + render->area.extent.width <= ds_image->vk.extent.width &&
9627                 render->area.offset.y + render->area.extent.height <= ds_image->vk.extent.height);
9628 
9629          /* Copy the VRS rates to the HTILE buffer. */
9630          radv_copy_vrs_htile(cmd_buffer, render->vrs_att.iview, &render->area, ds_image, &htile_buffer, true);
9631 
9632          radv_buffer_finish(&htile_buffer);
9633       } else {
9634          /* When a subpass uses a VRS attachment without binding a depth/stencil attachment, or when
9635           * HTILE isn't enabled, we use a fallback that copies the VRS rates to our internal HTILE buffer.
9636           */
9637          struct radv_image *ds_image = radv_cmd_buffer_get_vrs_image(cmd_buffer);
9638 
9639          if (ds_image && render->area.offset.x < ds_image->vk.extent.width &&
9640              render->area.offset.y < ds_image->vk.extent.height) {
9641             /* HTILE buffer */
9642             struct radv_buffer *htile_buffer = device->vrs.buffer;
9643 
9644             VkRect2D area = render->area;
9645             area.extent.width = MIN2(area.extent.width, ds_image->vk.extent.width - area.offset.x);
9646             area.extent.height = MIN2(area.extent.height, ds_image->vk.extent.height - area.offset.y);
9647 
9648             /* Copy the VRS rates to the HTILE buffer. */
9649             radv_copy_vrs_htile(cmd_buffer, render->vrs_att.iview, &area, ds_image, htile_buffer, false);
9650          }
9651       }
9652    }
9653 
9654    const uint32_t minx = render->area.offset.x;
9655    const uint32_t miny = render->area.offset.y;
9656    const uint32_t maxx = minx + render->area.extent.width;
9657    const uint32_t maxy = miny + render->area.extent.height;
9658 
9659    radeon_check_space(device->ws, cmd_buffer->cs, 6);
9660 
9661    if (pdev->info.gfx_level >= GFX12) {
9662       radeon_set_context_reg(cmd_buffer->cs, R_028204_PA_SC_WINDOW_SCISSOR_TL,
9663                              S_028204_TL_X(minx) | S_028204_TL_Y_GFX12(miny));
9664       radeon_set_context_reg(cmd_buffer->cs, R_028208_PA_SC_WINDOW_SCISSOR_BR,
9665                              S_028208_BR_X(maxx - 1) | S_028208_BR_Y(maxy - 1)); /* inclusive */
9666    } else {
9667       radeon_set_context_reg(cmd_buffer->cs, R_028204_PA_SC_WINDOW_SCISSOR_TL,
9668                              S_028204_TL_X(minx) | S_028204_TL_Y_GFX6(miny));
9669       radeon_set_context_reg(cmd_buffer->cs, R_028208_PA_SC_WINDOW_SCISSOR_BR,
9670                              S_028208_BR_X(maxx) | S_028208_BR_Y(maxy));
9671    }
9672 
9673    radv_emit_fb_mip_change_flush(cmd_buffer);
9674 
9675    if (!(pRenderingInfo->flags & VK_RENDERING_RESUMING_BIT))
9676       radv_cmd_buffer_clear_rendering(cmd_buffer, pRenderingInfo);
9677 }
9678 
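/*
 * Editorial sketch (not part of the driver): a minimal, hypothetical example of the
 * VkRenderingInfo an application might hand to vkCmdBeginRendering(), i.e. the
 * structure radv_CmdBeginRendering() above parses. All values are illustrative;
 * extension structs (sample locations, FSR attachment, ...) are omitted.
 */
static inline void
sketch_fill_rendering_info(VkImageView color_view, VkExtent2D extent,
                           VkRenderingAttachmentInfo *color_att_out, VkRenderingInfo *info_out)
{
   /* One color attachment, cleared on load and stored on end. */
   *color_att_out = (VkRenderingAttachmentInfo){
      .sType = VK_STRUCTURE_TYPE_RENDERING_ATTACHMENT_INFO,
      .imageView = color_view,
      .imageLayout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL,
      .loadOp = VK_ATTACHMENT_LOAD_OP_CLEAR,
      .storeOp = VK_ATTACHMENT_STORE_OP_STORE,
      .clearValue.color.float32 = {0.0f, 0.0f, 0.0f, 1.0f},
   };

   /* The rendering info points at caller-owned attachment storage. */
   *info_out = (VkRenderingInfo){
      .sType = VK_STRUCTURE_TYPE_RENDERING_INFO,
      .renderArea = {.offset = {0, 0}, .extent = extent},
      .layerCount = 1,
      .colorAttachmentCount = 1,
      .pColorAttachments = color_att_out,
   };
}
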
9679 VKAPI_ATTR void VKAPI_CALL
9680 radv_CmdEndRendering(VkCommandBuffer commandBuffer)
9681 {
9682    VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
9683 
9684    radv_mark_noncoherent_rb(cmd_buffer);
9685    radv_cmd_buffer_resolve_rendering(cmd_buffer);
9686    radv_cmd_buffer_reset_rendering(cmd_buffer);
9687 }
9688 
9689 static void
9690 radv_emit_view_index_per_stage(struct radeon_cmdbuf *cs, const struct radv_shader *shader, uint32_t base_reg,
9691                                unsigned index)
9692 {
9693    const uint32_t view_index_offset = radv_get_user_sgpr_loc(shader, AC_UD_VIEW_INDEX);
9694 
9695    if (!view_index_offset)
9696       return;
9697 
9698    radeon_set_sh_reg(cs, view_index_offset, index);
9699 }
9700 
9701 static void
9702 radv_emit_view_index(const struct radv_cmd_state *cmd_state, struct radeon_cmdbuf *cs, unsigned index)
9703 {
9704    radv_foreach_stage(stage, cmd_state->active_stages & ~VK_SHADER_STAGE_TASK_BIT_EXT)
9705    {
9706       const struct radv_shader *shader = radv_get_shader(cmd_state->shaders, stage);
9707 
9708       radv_emit_view_index_per_stage(cs, shader, shader->info.user_data_0, index);
9709    }
9710 
9711    if (cmd_state->gs_copy_shader) {
9712       radv_emit_view_index_per_stage(cs, cmd_state->gs_copy_shader, R_00B130_SPI_SHADER_USER_DATA_VS_0, index);
9713    }
9714 }
9715 
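/*
 * Editorial sketch (not part of the driver): with a non-zero view mask, the draw
 * paths below re-emit the view index (see radv_emit_view_index() above) and replay
 * the draw once per set bit. The callback is hypothetical; the loop only
 * illustrates the iteration pattern.
 */
static inline void
sketch_replay_per_view(uint32_t view_mask, void (*emit_draw)(void *ctx, unsigned view), void *ctx)
{
   for (unsigned view = 0; view_mask; view++, view_mask >>= 1) {
      if (view_mask & 1)
         emit_draw(ctx, view); /* one copy of the draw per active view */
   }
}
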
9716 /**
9717  * Emulates predication for MEC using COND_EXEC.
9718  * When the current command buffer is predicating, emit a COND_EXEC packet
9719  * so that the MEC skips the next few dwords worth of packets.
9720  *
9721  * To make it work with inverted conditional rendering, we allocate
9722  * space in the upload BO and emit some packets to invert the condition.
9723  */
9724 static void
9725 radv_cs_emit_compute_predication(const struct radv_device *device, struct radv_cmd_state *state,
9726                                  struct radeon_cmdbuf *cs, uint64_t inv_va, bool *inv_emitted, unsigned dwords)
9727 {
9728    const struct radv_physical_device *pdev = radv_device_physical(device);
9729 
9730    if (!state->predicating)
9731       return;
9732 
9733    uint64_t va = state->predication_va;
9734 
9735    if (!state->predication_type) {
9736       /* Invert the condition the first time it is needed. */
9737       if (!*inv_emitted) {
9738          const enum amd_gfx_level gfx_level = pdev->info.gfx_level;
9739 
9740          *inv_emitted = true;
9741 
9742          /* Write 1 to the inverted predication VA. */
9743          radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
9744          radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_IMM) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) |
9745                             COPY_DATA_WR_CONFIRM | (gfx_level == GFX6 ? COPY_DATA_ENGINE_PFP : 0));
9746          radeon_emit(cs, 1);
9747          radeon_emit(cs, 0);
9748          radeon_emit(cs, inv_va);
9749          radeon_emit(cs, inv_va >> 32);
9750 
9751          /* If the value at the API predication VA is 0, skip the next packet. */
9752          radv_emit_cond_exec(device, cs, va, 6 /* 1x COPY_DATA size */);
9753 
9754          /* Write 0 to the new predication VA (when the API condition != 0) */
9755          radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
9756          radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_IMM) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) |
9757                             COPY_DATA_WR_CONFIRM | (gfx_level == GFX6 ? COPY_DATA_ENGINE_PFP : 0));
9758          radeon_emit(cs, 0);
9759          radeon_emit(cs, 0);
9760          radeon_emit(cs, inv_va);
9761          radeon_emit(cs, inv_va >> 32);
9762       }
9763 
9764       va = inv_va;
9765    }
9766 
9767    radv_emit_cond_exec(device, cs, va, dwords);
9768 }
9769 
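/*
 * Editorial sketch (not part of the driver): the effect of the two COPY_DATA
 * packets plus the first COND_EXEC emitted above, modelled in plain C. COND_EXEC
 * skips the following packet when the value at its address is 0, so the inverted
 * value ends up non-zero exactly when the API condition is zero.
 */
static inline uint32_t
sketch_invert_predication_value(uint32_t api_value)
{
   uint32_t inv_value = 1; /* first COPY_DATA: write 1 */
   if (api_value != 0)     /* COND_EXEC skips the next write when api_value == 0 */
      inv_value = 0;       /* second COPY_DATA: write 0 */
   return inv_value;       /* non-zero exactly when the API condition is zero */
}
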
9770 static void
9771 radv_cs_emit_draw_packet(struct radv_cmd_buffer *cmd_buffer, uint32_t vertex_count, uint32_t use_opaque)
9772 {
9773    radeon_emit(cmd_buffer->cs, PKT3(PKT3_DRAW_INDEX_AUTO, 1, cmd_buffer->state.predicating));
9774    radeon_emit(cmd_buffer->cs, vertex_count);
9775    radeon_emit(cmd_buffer->cs, V_0287F0_DI_SRC_SEL_AUTO_INDEX | use_opaque);
9776 }
9777 
9778 /**
9779  * Emit a PKT3_DRAW_INDEX_2 packet to render "index_count" vertices.
9780  *
9781  * The starting address "index_va" may point anywhere within the index buffer. The number of
9782  * indexes allocated in the index buffer *past that point* is specified by "max_index_count".
9783  * Hardware uses this information to return 0 for out-of-bounds reads.
9784  */
9785 static void
9786 radv_cs_emit_draw_indexed_packet(struct radv_cmd_buffer *cmd_buffer, uint64_t index_va, uint32_t max_index_count,
9787                                  uint32_t index_count, bool not_eop)
9788 {
9789    radeon_emit(cmd_buffer->cs, PKT3(PKT3_DRAW_INDEX_2, 4, cmd_buffer->state.predicating));
9790    radeon_emit(cmd_buffer->cs, max_index_count);
9791    radeon_emit(cmd_buffer->cs, index_va);
9792    radeon_emit(cmd_buffer->cs, index_va >> 32);
9793    radeon_emit(cmd_buffer->cs, index_count);
9794    /* NOT_EOP allows merging multiple draws into 1 wave, but only user VGPRs
9795     * can be changed between draws and GS fast launch must be disabled.
9796     * NOT_EOP doesn't work on gfx6-gfx9 and gfx12.
9797     */
9798    radeon_emit(cmd_buffer->cs, V_0287F0_DI_SRC_SEL_DMA | S_0287F0_NOT_EOP(not_eop));
9799 }
9800 
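/*
 * Editorial sketch (not part of the driver): how the indexed draw paths further
 * below derive the index_va and max_index_count arguments for
 * radv_cs_emit_draw_indexed_packet() from the bound index buffer and firstIndex.
 * Parameter names are illustrative only.
 */
static inline void
sketch_indexed_draw_range(uint64_t index_buffer_va, uint32_t bound_index_count, uint32_t first_index,
                          uint32_t index_size /* 1, 2 or 4 bytes */, uint64_t *index_va, uint32_t *max_index_count)
{
   /* Starting address of the first index consumed by this draw. */
   *index_va = index_buffer_va + (uint64_t)first_index * index_size;
   /* Indexes left past that point; hardware returns 0 for reads beyond this. */
   *max_index_count = MAX2(bound_index_count, first_index) - first_index;
}
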
9801 /* MUST inline this function to avoid massive perf loss in drawoverhead */
9802 ALWAYS_INLINE static void
9803 radv_cs_emit_indirect_draw_packet(struct radv_cmd_buffer *cmd_buffer, bool indexed, uint32_t draw_count,
9804                                   uint64_t count_va, uint32_t stride)
9805 {
9806    struct radeon_cmdbuf *cs = cmd_buffer->cs;
9807    const unsigned di_src_sel = indexed ? V_0287F0_DI_SRC_SEL_DMA : V_0287F0_DI_SRC_SEL_AUTO_INDEX;
9808    bool draw_id_enable = cmd_buffer->state.uses_drawid;
9809    uint32_t base_reg = cmd_buffer->state.vtx_base_sgpr;
9810    uint32_t vertex_offset_reg, start_instance_reg = 0, draw_id_reg = 0;
9811    bool predicating = cmd_buffer->state.predicating;
9812    assert(base_reg);
9813 
9814    /* just reset draw state for vertex data */
9815    cmd_buffer->state.last_first_instance = -1;
9816    cmd_buffer->state.last_num_instances = -1;
9817    cmd_buffer->state.last_drawid = -1;
9818    cmd_buffer->state.last_vertex_offset_valid = false;
9819 
9820    vertex_offset_reg = (base_reg - SI_SH_REG_OFFSET) >> 2;
9821    if (cmd_buffer->state.uses_baseinstance)
9822       start_instance_reg = ((base_reg + (draw_id_enable ? 8 : 4)) - SI_SH_REG_OFFSET) >> 2;
9823    if (draw_id_enable)
9824       draw_id_reg = ((base_reg + 4) - SI_SH_REG_OFFSET) >> 2;
9825 
9826    if (draw_count == 1 && !count_va && !draw_id_enable) {
9827       radeon_emit(cs, PKT3(indexed ? PKT3_DRAW_INDEX_INDIRECT : PKT3_DRAW_INDIRECT, 3, predicating));
9828       radeon_emit(cs, 0);
9829       radeon_emit(cs, vertex_offset_reg);
9830       radeon_emit(cs, start_instance_reg);
9831       radeon_emit(cs, di_src_sel);
9832    } else {
9833       radeon_emit(cs, PKT3(indexed ? PKT3_DRAW_INDEX_INDIRECT_MULTI : PKT3_DRAW_INDIRECT_MULTI, 8, predicating));
9834       radeon_emit(cs, 0);
9835       radeon_emit(cs, vertex_offset_reg);
9836       radeon_emit(cs, start_instance_reg);
9837       radeon_emit(cs, draw_id_reg | S_2C3_DRAW_INDEX_ENABLE(draw_id_enable) | S_2C3_COUNT_INDIRECT_ENABLE(!!count_va));
9838       radeon_emit(cs, draw_count); /* count */
9839       radeon_emit(cs, count_va);   /* count_addr */
9840       radeon_emit(cs, count_va >> 32);
9841       radeon_emit(cs, stride); /* stride */
9842       radeon_emit(cs, di_src_sel);
9843    }
9844 
9845    cmd_buffer->state.uses_draw_indirect = true;
9846 }
9847 
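/*
 * Editorial sketch (not part of the driver): the indirect draw packets above
 * address user SGPRs by their dword index relative to the SH register space,
 * which is what the (reg - SI_SH_REG_OFFSET) >> 2 expressions compute.
 */
static inline uint32_t
sketch_sh_reg_to_packet_index(uint32_t reg_offset /* byte offset, e.g. vtx_base_sgpr */)
{
   return (reg_offset - SI_SH_REG_OFFSET) >> 2; /* bytes past the SH base -> dword index */
}
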
9848 ALWAYS_INLINE static void
9849 radv_cs_emit_indirect_mesh_draw_packet(struct radv_cmd_buffer *cmd_buffer, uint32_t draw_count, uint64_t count_va,
9850                                        uint32_t stride)
9851 {
9852    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
9853    const struct radv_physical_device *pdev = radv_device_physical(device);
9854    const struct radv_shader *mesh_shader = cmd_buffer->state.shaders[MESA_SHADER_MESH];
9855    struct radeon_cmdbuf *cs = cmd_buffer->cs;
9856    uint32_t base_reg = cmd_buffer->state.vtx_base_sgpr;
9857    bool predicating = cmd_buffer->state.predicating;
9858    assert(base_reg || (!cmd_buffer->state.uses_drawid && !mesh_shader->info.cs.uses_grid_size));
9859 
9860    /* Reset draw state. */
9861    cmd_buffer->state.last_first_instance = -1;
9862    cmd_buffer->state.last_num_instances = -1;
9863    cmd_buffer->state.last_drawid = -1;
9864    cmd_buffer->state.last_vertex_offset_valid = false;
9865 
9866    uint32_t xyz_dim_enable = mesh_shader->info.cs.uses_grid_size;
9867    uint32_t xyz_dim_reg = !xyz_dim_enable ? 0 : (base_reg - SI_SH_REG_OFFSET) >> 2;
9868    uint32_t draw_id_enable = !!cmd_buffer->state.uses_drawid;
9869    uint32_t draw_id_reg = !draw_id_enable ? 0 : (base_reg + (xyz_dim_enable ? 12 : 0) - SI_SH_REG_OFFSET) >> 2;
9870 
9871    uint32_t mode1_enable = !pdev->mesh_fast_launch_2;
9872 
9873    radeon_emit(cs, PKT3(PKT3_DISPATCH_MESH_INDIRECT_MULTI, 7, predicating) | PKT3_RESET_FILTER_CAM_S(1));
9874    radeon_emit(cs, 0); /* data_offset */
9875    radeon_emit(cs, S_4C1_XYZ_DIM_REG(xyz_dim_reg) | S_4C1_DRAW_INDEX_REG(draw_id_reg));
9876    if (pdev->info.gfx_level >= GFX11)
9877       radeon_emit(cs, S_4C2_DRAW_INDEX_ENABLE(draw_id_enable) | S_4C2_COUNT_INDIRECT_ENABLE(!!count_va) |
9878                          S_4C2_XYZ_DIM_ENABLE(xyz_dim_enable) | S_4C2_MODE1_ENABLE(mode1_enable));
9879    else
9880       radeon_emit(cs, S_4C2_DRAW_INDEX_ENABLE(draw_id_enable) | S_4C2_COUNT_INDIRECT_ENABLE(!!count_va));
9881    radeon_emit(cs, draw_count);
9882    radeon_emit(cs, count_va);
9883    radeon_emit(cs, count_va >> 32);
9884    radeon_emit(cs, stride);
9885    radeon_emit(cs, V_0287F0_DI_SRC_SEL_AUTO_INDEX);
9886 }
9887 
9888 ALWAYS_INLINE static void
9889 radv_cs_emit_dispatch_taskmesh_direct_ace_packet(const struct radv_device *device,
9890                                                  const struct radv_cmd_state *cmd_state, struct radeon_cmdbuf *ace_cs,
9891                                                  const uint32_t x, const uint32_t y, const uint32_t z)
9892 {
9893    const struct radv_shader *task_shader = cmd_state->shaders[MESA_SHADER_TASK];
9894    const bool predicating = cmd_state->predicating;
9895    const uint32_t dispatch_initiator =
9896       device->dispatch_initiator_task | S_00B800_CS_W32_EN(task_shader->info.wave_size == 32);
9897    const uint32_t ring_entry_reg = radv_get_user_sgpr(task_shader, AC_UD_TASK_RING_ENTRY);
9898 
9899    radeon_emit(ace_cs, PKT3(PKT3_DISPATCH_TASKMESH_DIRECT_ACE, 4, predicating) | PKT3_SHADER_TYPE_S(1));
9900    radeon_emit(ace_cs, x);
9901    radeon_emit(ace_cs, y);
9902    radeon_emit(ace_cs, z);
9903    radeon_emit(ace_cs, dispatch_initiator);
9904    radeon_emit(ace_cs, ring_entry_reg & 0xFFFF);
9905 }
9906 
9907 ALWAYS_INLINE static void
9908 radv_cs_emit_dispatch_taskmesh_indirect_multi_ace_packet(const struct radv_device *device,
9909                                                          const struct radv_cmd_state *cmd_state,
9910                                                          struct radeon_cmdbuf *ace_cs, uint64_t data_va,
9911                                                          uint32_t draw_count, uint64_t count_va, uint32_t stride)
9912 {
9913    assert((data_va & 0x03) == 0);
9914    assert((count_va & 0x03) == 0);
9915 
9916    const struct radv_shader *task_shader = cmd_state->shaders[MESA_SHADER_TASK];
9917 
9918    const uint32_t dispatch_initiator =
9919       device->dispatch_initiator_task | S_00B800_CS_W32_EN(task_shader->info.wave_size == 32);
9920    const uint32_t ring_entry_reg = radv_get_user_sgpr(task_shader, AC_UD_TASK_RING_ENTRY);
9921    const uint32_t xyz_dim_reg = radv_get_user_sgpr(task_shader, AC_UD_CS_GRID_SIZE);
9922    const uint32_t draw_id_reg = radv_get_user_sgpr(task_shader, AC_UD_CS_TASK_DRAW_ID);
9923 
9924    radeon_emit(ace_cs, PKT3(PKT3_DISPATCH_TASKMESH_INDIRECT_MULTI_ACE, 9, 0) | PKT3_SHADER_TYPE_S(1));
9925    radeon_emit(ace_cs, data_va);
9926    radeon_emit(ace_cs, data_va >> 32);
9927    radeon_emit(ace_cs, S_AD2_RING_ENTRY_REG(ring_entry_reg));
9928    radeon_emit(ace_cs, S_AD3_COUNT_INDIRECT_ENABLE(!!count_va) | S_AD3_DRAW_INDEX_ENABLE(!!draw_id_reg) |
9929                           S_AD3_XYZ_DIM_ENABLE(!!xyz_dim_reg) | S_AD3_DRAW_INDEX_REG(draw_id_reg));
9930    radeon_emit(ace_cs, S_AD4_XYZ_DIM_REG(xyz_dim_reg));
9931    radeon_emit(ace_cs, draw_count);
9932    radeon_emit(ace_cs, count_va);
9933    radeon_emit(ace_cs, count_va >> 32);
9934    radeon_emit(ace_cs, stride);
9935    radeon_emit(ace_cs, dispatch_initiator);
9936 }
9937 
9938 ALWAYS_INLINE static void
9939 radv_cs_emit_dispatch_taskmesh_gfx_packet(const struct radv_device *device, const struct radv_cmd_state *cmd_state,
9940                                           struct radeon_cmdbuf *cs)
9941 {
9942    const struct radv_physical_device *pdev = radv_device_physical(device);
9943    const struct radv_shader *mesh_shader = cmd_state->shaders[MESA_SHADER_MESH];
9944    const bool predicating = cmd_state->predicating;
9945 
9946    const uint32_t ring_entry_reg = radv_get_user_sgpr(mesh_shader, AC_UD_TASK_RING_ENTRY);
9947 
9948    uint32_t xyz_dim_en = mesh_shader->info.cs.uses_grid_size;
9949    uint32_t xyz_dim_reg = !xyz_dim_en ? 0 : (cmd_state->vtx_base_sgpr - SI_SH_REG_OFFSET) >> 2;
9950    uint32_t mode1_en = !pdev->mesh_fast_launch_2;
9951    uint32_t linear_dispatch_en = cmd_state->shaders[MESA_SHADER_TASK]->info.cs.linear_taskmesh_dispatch;
9952    const bool sqtt_en = !!device->sqtt.bo;
9953 
9954    radeon_emit(cs, PKT3(PKT3_DISPATCH_TASKMESH_GFX, 2, predicating) | PKT3_RESET_FILTER_CAM_S(1));
9955    radeon_emit(cs, S_4D0_RING_ENTRY_REG(ring_entry_reg) | S_4D0_XYZ_DIM_REG(xyz_dim_reg));
9956    if (pdev->info.gfx_level >= GFX11)
9957       radeon_emit(cs, S_4D1_XYZ_DIM_ENABLE(xyz_dim_en) | S_4D1_MODE1_ENABLE(mode1_en) |
9958                          S_4D1_LINEAR_DISPATCH_ENABLE(linear_dispatch_en) | S_4D1_THREAD_TRACE_MARKER_ENABLE(sqtt_en));
9959    else
9960       radeon_emit(cs, S_4D1_THREAD_TRACE_MARKER_ENABLE(sqtt_en));
9961    radeon_emit(cs, V_0287F0_DI_SRC_SEL_AUTO_INDEX);
9962 }
9963 
9964 ALWAYS_INLINE static void
9965 radv_emit_userdata_vertex_internal(struct radv_cmd_buffer *cmd_buffer, const struct radv_draw_info *info,
9966                                    const uint32_t vertex_offset)
9967 {
9968    struct radv_cmd_state *state = &cmd_buffer->state;
9969    struct radeon_cmdbuf *cs = cmd_buffer->cs;
9970    const bool uses_baseinstance = state->uses_baseinstance;
9971    const bool uses_drawid = state->uses_drawid;
9972 
9973    radeon_set_sh_reg_seq(cs, state->vtx_base_sgpr, state->vtx_emit_num);
9974 
9975    radeon_emit(cs, vertex_offset);
9976    state->last_vertex_offset_valid = true;
9977    state->last_vertex_offset = vertex_offset;
9978    if (uses_drawid) {
9979       radeon_emit(cs, 0);
9980       state->last_drawid = 0;
9981    }
9982    if (uses_baseinstance) {
9983       radeon_emit(cs, info->first_instance);
9984       state->last_first_instance = info->first_instance;
9985    }
9986 }
9987 
9988 ALWAYS_INLINE static void
9989 radv_emit_userdata_vertex(struct radv_cmd_buffer *cmd_buffer, const struct radv_draw_info *info,
9990                           const uint32_t vertex_offset)
9991 {
9992    const struct radv_cmd_state *state = &cmd_buffer->state;
9993    const bool uses_baseinstance = state->uses_baseinstance;
9994    const bool uses_drawid = state->uses_drawid;
9995 
9996    if (!state->last_vertex_offset_valid || vertex_offset != state->last_vertex_offset ||
9997        (uses_drawid && 0 != state->last_drawid) ||
9998        (uses_baseinstance && info->first_instance != state->last_first_instance))
9999       radv_emit_userdata_vertex_internal(cmd_buffer, info, vertex_offset);
10000 }
10001 
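/*
 * Editorial sketch (not part of the driver): radv_emit_userdata_vertex() above
 * skips the SET_SH_REG packets when every tracked value is unchanged. The same
 * idea, reduced to one hypothetical tracked register.
 */
struct sketch_tracked_sgpr {
   uint32_t last_value;
   bool last_value_valid;
};

static inline bool
sketch_sgpr_needs_emit(struct sketch_tracked_sgpr *t, uint32_t value)
{
   if (t->last_value_valid && t->last_value == value)
      return false; /* already programmed; skip the packet */
   t->last_value = value;
   t->last_value_valid = true;
   return true;
}
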
10002 ALWAYS_INLINE static void
10003 radv_emit_userdata_vertex_drawid(struct radv_cmd_buffer *cmd_buffer, uint32_t vertex_offset, uint32_t drawid)
10004 {
10005    struct radv_cmd_state *state = &cmd_buffer->state;
10006    struct radeon_cmdbuf *cs = cmd_buffer->cs;
10007    radeon_set_sh_reg_seq(cs, state->vtx_base_sgpr, 1 + !!drawid);
10008    radeon_emit(cs, vertex_offset);
10009    state->last_vertex_offset_valid = true;
10010    state->last_vertex_offset = vertex_offset;
10011    if (drawid)
10012       radeon_emit(cs, drawid);
10013 }
10014 
10015 ALWAYS_INLINE static void
10016 radv_emit_userdata_mesh(struct radv_cmd_buffer *cmd_buffer, const uint32_t x, const uint32_t y, const uint32_t z)
10017 {
10018    struct radv_cmd_state *state = &cmd_buffer->state;
10019    const struct radv_shader *mesh_shader = state->shaders[MESA_SHADER_MESH];
10020    struct radeon_cmdbuf *cs = cmd_buffer->cs;
10021    const bool uses_drawid = state->uses_drawid;
10022    const bool uses_grid_size = mesh_shader->info.cs.uses_grid_size;
10023 
10024    if (!uses_drawid && !uses_grid_size)
10025       return;
10026 
10027    radeon_set_sh_reg_seq(cs, state->vtx_base_sgpr, state->vtx_emit_num);
10028    if (uses_grid_size) {
10029       radeon_emit(cs, x);
10030       radeon_emit(cs, y);
10031       radeon_emit(cs, z);
10032    }
10033    if (uses_drawid) {
10034       radeon_emit(cs, 0);
10035       state->last_drawid = 0;
10036    }
10037 }
10038 
10039 ALWAYS_INLINE static void
10040 radv_emit_userdata_task(const struct radv_cmd_state *cmd_state, struct radeon_cmdbuf *ace_cs, uint32_t x, uint32_t y,
10041                         uint32_t z)
10042 {
10043    const struct radv_shader *task_shader = cmd_state->shaders[MESA_SHADER_TASK];
10044 
10045    const uint32_t xyz_offset = radv_get_user_sgpr_loc(task_shader, AC_UD_CS_GRID_SIZE);
10046    const uint32_t draw_id_offset = radv_get_user_sgpr_loc(task_shader, AC_UD_CS_TASK_DRAW_ID);
10047 
10048    if (xyz_offset) {
10049       radeon_set_sh_reg_seq(ace_cs, xyz_offset, 3);
10050       radeon_emit(ace_cs, x);
10051       radeon_emit(ace_cs, y);
10052       radeon_emit(ace_cs, z);
10053    }
10054 
10055    if (draw_id_offset) {
10056       radeon_set_sh_reg_seq(ace_cs, draw_id_offset, 1);
10057       radeon_emit(ace_cs, 0);
10058    }
10059 }
10060 
10061 ALWAYS_INLINE static void
10062 radv_emit_draw_packets_indexed(struct radv_cmd_buffer *cmd_buffer, const struct radv_draw_info *info,
10063                                uint32_t drawCount, const VkMultiDrawIndexedInfoEXT *minfo, uint32_t stride,
10064                                const int32_t *vertexOffset)
10065 
10066 {
10067    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
10068    const struct radv_physical_device *pdev = radv_device_physical(device);
10069    struct radv_cmd_state *state = &cmd_buffer->state;
10070    struct radeon_cmdbuf *cs = cmd_buffer->cs;
10071    const int index_size = radv_get_vgt_index_size(state->index_type);
10072    unsigned i = 0;
10073    const bool uses_drawid = state->uses_drawid;
10074    const bool can_eop = !uses_drawid && pdev->info.gfx_level >= GFX10 && pdev->info.gfx_level < GFX12;
10075 
10076    if (uses_drawid) {
10077       if (vertexOffset) {
10078          radv_emit_userdata_vertex(cmd_buffer, info, *vertexOffset);
10079          vk_foreach_multi_draw_indexed (draw, i, minfo, drawCount, stride) {
10080             uint32_t remaining_indexes = MAX2(state->max_index_count, draw->firstIndex) - draw->firstIndex;
10081             uint64_t index_va = state->index_va + draw->firstIndex * index_size;
10082 
10083             /* Handle draw calls with 0-sized index buffers if the GPU can't support them. */
10084             if (!remaining_indexes && pdev->info.has_zero_index_buffer_bug)
10085                radv_handle_zero_index_buffer_bug(cmd_buffer, &index_va, &remaining_indexes);
10086 
10087             if (i > 0)
10088                radeon_set_sh_reg(cs, state->vtx_base_sgpr + sizeof(uint32_t), i);
10089 
10090             if (!state->render.view_mask) {
10091                radv_cs_emit_draw_indexed_packet(cmd_buffer, index_va, remaining_indexes, draw->indexCount, false);
10092             } else {
10093                u_foreach_bit (view, state->render.view_mask) {
10094                   radv_emit_view_index(&cmd_buffer->state, cmd_buffer->cs, view);
10095 
10096                   radv_cs_emit_draw_indexed_packet(cmd_buffer, index_va, remaining_indexes, draw->indexCount, false);
10097                }
10098             }
10099          }
10100       } else {
10101          vk_foreach_multi_draw_indexed (draw, i, minfo, drawCount, stride) {
10102             uint32_t remaining_indexes = MAX2(state->max_index_count, draw->firstIndex) - draw->firstIndex;
10103             uint64_t index_va = state->index_va + draw->firstIndex * index_size;
10104 
10105             /* Handle draw calls with 0-sized index buffers if the GPU can't support them. */
10106             if (!remaining_indexes && pdev->info.has_zero_index_buffer_bug)
10107                radv_handle_zero_index_buffer_bug(cmd_buffer, &index_va, &remaining_indexes);
10108 
10109             if (i > 0) {
10110                assert(state->last_vertex_offset_valid);
10111                if (state->last_vertex_offset != draw->vertexOffset)
10112                   radv_emit_userdata_vertex_drawid(cmd_buffer, draw->vertexOffset, i);
10113                else
10114                   radeon_set_sh_reg(cs, state->vtx_base_sgpr + sizeof(uint32_t), i);
10115             } else
10116                radv_emit_userdata_vertex(cmd_buffer, info, draw->vertexOffset);
10117 
10118             if (!state->render.view_mask) {
10119                radv_cs_emit_draw_indexed_packet(cmd_buffer, index_va, remaining_indexes, draw->indexCount, false);
10120             } else {
10121                u_foreach_bit (view, state->render.view_mask) {
10122                   radv_emit_view_index(&cmd_buffer->state, cmd_buffer->cs, view);
10123 
10124                   radv_cs_emit_draw_indexed_packet(cmd_buffer, index_va, remaining_indexes, draw->indexCount, false);
10125                }
10126             }
10127          }
10128       }
10129       if (drawCount > 1) {
10130          state->last_drawid = drawCount - 1;
10131       }
10132    } else {
10133       if (vertexOffset) {
10134          if (pdev->info.gfx_level == GFX10) {
10135             /* GFX10 has a bug where, in a sequence of consecutive draw packets using NOT_EOP,
10136              * the final draw (the one without NOT_EOP) must not have count == 0.
10137              */
10138             while (drawCount > 1) {
10139                const VkMultiDrawIndexedInfoEXT *last =
10140                   (const VkMultiDrawIndexedInfoEXT *)(((const uint8_t *)minfo) + (drawCount - 1) * stride);
10141                if (last->indexCount)
10142                   break;
10143                drawCount--;
10144             }
10145          }
10146 
10147          radv_emit_userdata_vertex(cmd_buffer, info, *vertexOffset);
10148          vk_foreach_multi_draw_indexed (draw, i, minfo, drawCount, stride) {
10149             uint32_t remaining_indexes = MAX2(state->max_index_count, draw->firstIndex) - draw->firstIndex;
10150             uint64_t index_va = state->index_va + draw->firstIndex * index_size;
10151 
10152             /* Handle draw calls with 0-sized index buffers if the GPU can't support them. */
10153             if (!remaining_indexes && pdev->info.has_zero_index_buffer_bug)
10154                radv_handle_zero_index_buffer_bug(cmd_buffer, &index_va, &remaining_indexes);
10155 
10156             if (!state->render.view_mask) {
10157                radv_cs_emit_draw_indexed_packet(cmd_buffer, index_va, remaining_indexes, draw->indexCount,
10158                                                 can_eop && i < drawCount - 1);
10159             } else {
10160                u_foreach_bit (view, state->render.view_mask) {
10161                   radv_emit_view_index(&cmd_buffer->state, cmd_buffer->cs, view);
10162 
10163                   radv_cs_emit_draw_indexed_packet(cmd_buffer, index_va, remaining_indexes, draw->indexCount, false);
10164                }
10165             }
10166          }
10167       } else {
10168          vk_foreach_multi_draw_indexed (draw, i, minfo, drawCount, stride) {
10169             uint32_t remaining_indexes = MAX2(state->max_index_count, draw->firstIndex) - draw->firstIndex;
10170             uint64_t index_va = state->index_va + draw->firstIndex * index_size;
10171 
10172             /* Handle draw calls with 0-sized index buffers if the GPU can't support them. */
10173             if (!remaining_indexes && pdev->info.has_zero_index_buffer_bug)
10174                radv_handle_zero_index_buffer_bug(cmd_buffer, &index_va, &remaining_indexes);
10175 
10176             const VkMultiDrawIndexedInfoEXT *next =
10177                (const VkMultiDrawIndexedInfoEXT *)(i < drawCount - 1 ? ((uint8_t *)draw + stride) : NULL);
10178             const bool offset_changes = next && next->vertexOffset != draw->vertexOffset;
10179             radv_emit_userdata_vertex(cmd_buffer, info, draw->vertexOffset);
10180 
10181             if (!state->render.view_mask) {
10182                radv_cs_emit_draw_indexed_packet(cmd_buffer, index_va, remaining_indexes, draw->indexCount,
10183                                                 can_eop && !offset_changes && i < drawCount - 1);
10184             } else {
10185                u_foreach_bit (view, state->render.view_mask) {
10186                   radv_emit_view_index(&cmd_buffer->state, cmd_buffer->cs, view);
10187 
10188                   radv_cs_emit_draw_indexed_packet(cmd_buffer, index_va, remaining_indexes, draw->indexCount, false);
10189                }
10190             }
10191          }
10192       }
10193       if (drawCount > 1) {
10194          state->last_drawid = drawCount - 1;
10195       }
10196    }
10197 }
10198 
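/*
 * Editorial sketch (not part of the driver): the GFX10 workaround in
 * radv_emit_draw_packets_indexed() above drops trailing zero-count draws so that
 * the final draw, which is submitted without NOT_EOP, never has count == 0.
 * The same trimming, shown on a plain array of index counts.
 */
static inline uint32_t
sketch_trim_trailing_zero_draws(const uint32_t *index_counts, uint32_t draw_count)
{
   while (draw_count > 1 && index_counts[draw_count - 1] == 0)
      draw_count--; /* drop draws that would end the sequence with count == 0 */
   return draw_count;
}
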
10199 ALWAYS_INLINE static void
10200 radv_emit_direct_draw_packets(struct radv_cmd_buffer *cmd_buffer, const struct radv_draw_info *info, uint32_t drawCount,
10201                               const VkMultiDrawInfoEXT *minfo, uint32_t use_opaque, uint32_t stride)
10202 {
10203    unsigned i = 0;
10204    const uint32_t view_mask = cmd_buffer->state.render.view_mask;
10205    const bool uses_drawid = cmd_buffer->state.uses_drawid;
10206    uint32_t last_start = 0;
10207 
10208    vk_foreach_multi_draw (draw, i, minfo, drawCount, stride) {
10209       if (!i)
10210          radv_emit_userdata_vertex(cmd_buffer, info, draw->firstVertex);
10211       else
10212          radv_emit_userdata_vertex_drawid(cmd_buffer, draw->firstVertex, uses_drawid ? i : 0);
10213 
10214       if (!view_mask) {
10215          radv_cs_emit_draw_packet(cmd_buffer, draw->vertexCount, use_opaque);
10216       } else {
10217          u_foreach_bit (view, view_mask) {
10218             radv_emit_view_index(&cmd_buffer->state, cmd_buffer->cs, view);
10219             radv_cs_emit_draw_packet(cmd_buffer, draw->vertexCount, use_opaque);
10220          }
10221       }
10222       last_start = draw->firstVertex;
10223    }
10224    if (drawCount > 1) {
10225       struct radv_cmd_state *state = &cmd_buffer->state;
10226       assert(state->last_vertex_offset_valid);
10227       state->last_vertex_offset = last_start;
10228       if (uses_drawid)
10229          state->last_drawid = drawCount - 1;
10230    }
10231 }
10232 
10233 static void
10234 radv_cs_emit_mesh_dispatch_packet(struct radv_cmd_buffer *cmd_buffer, uint32_t x, uint32_t y, uint32_t z)
10235 {
10236    radeon_emit(cmd_buffer->cs, PKT3(PKT3_DISPATCH_MESH_DIRECT, 3, cmd_buffer->state.predicating));
10237    radeon_emit(cmd_buffer->cs, x);
10238    radeon_emit(cmd_buffer->cs, y);
10239    radeon_emit(cmd_buffer->cs, z);
10240    radeon_emit(cmd_buffer->cs, S_0287F0_SOURCE_SELECT(V_0287F0_DI_SRC_SEL_AUTO_INDEX));
10241 }
10242 
10243 ALWAYS_INLINE static void
10244 radv_emit_direct_mesh_draw_packet(struct radv_cmd_buffer *cmd_buffer, uint32_t x, uint32_t y, uint32_t z)
10245 {
10246    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
10247    const struct radv_physical_device *pdev = radv_device_physical(device);
10248    const uint32_t view_mask = cmd_buffer->state.render.view_mask;
10249 
10250    radv_emit_userdata_mesh(cmd_buffer, x, y, z);
10251 
10252    if (pdev->mesh_fast_launch_2) {
10253       if (!view_mask) {
10254          radv_cs_emit_mesh_dispatch_packet(cmd_buffer, x, y, z);
10255       } else {
10256          u_foreach_bit (view, view_mask) {
10257             radv_emit_view_index(&cmd_buffer->state, cmd_buffer->cs, view);
10258             radv_cs_emit_mesh_dispatch_packet(cmd_buffer, x, y, z);
10259          }
10260       }
10261    } else {
10262       const uint32_t count = x * y * z;
10263       if (!view_mask) {
10264          radv_cs_emit_draw_packet(cmd_buffer, count, 0);
10265       } else {
10266          u_foreach_bit (view, view_mask) {
10267             radv_emit_view_index(&cmd_buffer->state, cmd_buffer->cs, view);
10268             radv_cs_emit_draw_packet(cmd_buffer, count, 0);
10269          }
10270       }
10271    }
10272 }
10273 
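/* Indirect mesh draw (no task shader): the indirect argument buffer address is
 * programmed through PKT3_SET_BASE (selector 1), the draw-id user SGPR is reset
 * to 0 when the shader reads it, then one indirect mesh draw packet is emitted
 * per view.
 */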
10274 ALWAYS_INLINE static void
10275 radv_emit_indirect_mesh_draw_packets(struct radv_cmd_buffer *cmd_buffer, const struct radv_draw_info *info)
10276 {
10277    const struct radv_cmd_state *state = &cmd_buffer->state;
10278    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
10279    struct radeon_winsys *ws = device->ws;
10280    struct radeon_cmdbuf *cs = cmd_buffer->cs;
10281    const uint64_t va = radv_buffer_get_va(info->indirect->bo) + info->indirect->offset + info->indirect_offset;
10282    const uint64_t count_va = !info->count_buffer ? 0
10283                                                  : radv_buffer_get_va(info->count_buffer->bo) +
10284                                                       info->count_buffer->offset + info->count_buffer_offset;
10285 
10286    radv_cs_add_buffer(ws, cs, info->indirect->bo);
10287 
10288    if (info->count_buffer) {
10289       radv_cs_add_buffer(ws, cs, info->count_buffer->bo);
10290    }
10291 
10292    radeon_emit(cs, PKT3(PKT3_SET_BASE, 2, 0));
10293    radeon_emit(cs, 1);
10294    radeon_emit(cs, va);
10295    radeon_emit(cs, va >> 32);
10296 
10297    if (state->uses_drawid) {
10298       const struct radv_shader *mesh_shader = state->shaders[MESA_SHADER_MESH];
10299       unsigned reg = state->vtx_base_sgpr + (mesh_shader->info.cs.uses_grid_size ? 12 : 0);
10300       radeon_set_sh_reg_seq(cs, reg, 1);
10301       radeon_emit(cs, 0);
10302    }
10303 
10304    if (!state->render.view_mask) {
10305       radv_cs_emit_indirect_mesh_draw_packet(cmd_buffer, info->count, count_va, info->stride);
10306    } else {
10307       u_foreach_bit (i, state->render.view_mask) {
10308          radv_emit_view_index(&cmd_buffer->state, cs, i);
10309          radv_cs_emit_indirect_mesh_draw_packet(cmd_buffer, info->count, count_va, info->stride);
10310       }
10311    }
10312 }
10313 
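/* Direct task+mesh draw: the task payload setup and DISPATCH_TASKMESH_DIRECT_ACE
 * go to the gang (ACE) command stream, and a matching DISPATCH_TASKMESH_GFX
 * packet is emitted on the graphics stream for each view. Compute predication
 * is emitted first, sized to cover all ACE dispatch packets that follow.
 */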
10314 ALWAYS_INLINE static void
10315 radv_emit_direct_taskmesh_draw_packets(const struct radv_device *device, struct radv_cmd_state *cmd_state,
10316                                        struct radeon_cmdbuf *cs, struct radeon_cmdbuf *ace_cs, uint32_t x, uint32_t y,
10317                                        uint32_t z)
10318 {
10319    const uint32_t view_mask = cmd_state->render.view_mask;
10320    const unsigned num_views = MAX2(1, util_bitcount(view_mask));
10321    const unsigned ace_predication_size = num_views * 6; /* DISPATCH_TASKMESH_DIRECT_ACE size */
10322 
10323    radv_emit_userdata_task(cmd_state, ace_cs, x, y, z);
10324    radv_cs_emit_compute_predication(device, cmd_state, ace_cs, cmd_state->mec_inv_pred_va,
10325                                     &cmd_state->mec_inv_pred_emitted, ace_predication_size);
10326 
10327    if (!view_mask) {
10328       radv_cs_emit_dispatch_taskmesh_direct_ace_packet(device, cmd_state, ace_cs, x, y, z);
10329       radv_cs_emit_dispatch_taskmesh_gfx_packet(device, cmd_state, cs);
10330    } else {
10331       u_foreach_bit (view, view_mask) {
10332          radv_emit_view_index(cmd_state, cs, view);
10333 
10334          radv_cs_emit_dispatch_taskmesh_direct_ace_packet(device, cmd_state, ace_cs, x, y, z);
10335          radv_cs_emit_dispatch_taskmesh_gfx_packet(device, cmd_state, cs);
10336       }
10337    }
10338 }
10339 
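/* Indirect task+mesh draw: like the direct path, but the ACE stream consumes the
 * indirect/count buffers. See the inline comment below for the
 * DISPATCH_TASKMESH_INDIRECT_MULTI_ACE zero-count firmware workaround.
 */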
10340 static void
10341 radv_emit_indirect_taskmesh_draw_packets(const struct radv_device *device, struct radv_cmd_state *cmd_state,
10342                                          struct radeon_cmdbuf *cs, struct radeon_cmdbuf *ace_cs,
10343                                          const struct radv_draw_info *info, uint64_t workaround_cond_va)
10344 {
10345    const struct radv_physical_device *pdev = radv_device_physical(device);
10346    const uint32_t view_mask = cmd_state->render.view_mask;
10347    struct radeon_winsys *ws = device->ws;
10348    const unsigned num_views = MAX2(1, util_bitcount(view_mask));
10349    unsigned ace_predication_size = num_views * 11; /* DISPATCH_TASKMESH_INDIRECT_MULTI_ACE size */
10350 
10351    const uint64_t va = radv_buffer_get_va(info->indirect->bo) + info->indirect->offset + info->indirect_offset;
10352    const uint64_t count_va = !info->count_buffer ? 0
10353                                                  : radv_buffer_get_va(info->count_buffer->bo) +
10354                                                       info->count_buffer->offset + info->count_buffer_offset;
10355 
10356    if (count_va)
10357       radv_cs_add_buffer(ws, ace_cs, info->count_buffer->bo);
10358 
10359    if (pdev->info.has_taskmesh_indirect0_bug && count_va) {
10360       /* MEC firmware bug workaround.
10361        * When the count buffer contains zero, DISPATCH_TASKMESH_INDIRECT_MULTI_ACE hangs.
10362        * - We must ensure that DISPATCH_TASKMESH_INDIRECT_MULTI_ACE
10363        *   is only executed when the count buffer contains non-zero.
10364        * - Furthermore, we must also ensure that each DISPATCH_TASKMESH_GFX packet
10365        *   has a matching ACE packet.
10366        *
10367        * As a workaround:
10368        * - Reserve a dword in the upload buffer and initialize it to 1 for the workaround
10369        * - When count != 0, write 0 to the workaround BO and execute the indirect dispatch
10370        * - When workaround BO != 0 (count was 0), execute an empty direct dispatch
10371        */
10372       radeon_emit(ace_cs, PKT3(PKT3_COPY_DATA, 4, 0));
10373       radeon_emit(ace_cs,
10374                   COPY_DATA_SRC_SEL(COPY_DATA_IMM) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) | COPY_DATA_WR_CONFIRM);
10375       radeon_emit(ace_cs, 1);
10376       radeon_emit(ace_cs, 0);
10377       radeon_emit(ace_cs, workaround_cond_va);
10378       radeon_emit(ace_cs, workaround_cond_va >> 32);
10379 
10380       /* 2x COND_EXEC + 1x COPY_DATA + Nx DISPATCH_TASKMESH_DIRECT_ACE */
10381       ace_predication_size += 2 * 5 + 6 + 6 * num_views;
10382    }
10383 
10384    radv_cs_add_buffer(ws, ace_cs, info->indirect->bo);
10385    radv_cs_emit_compute_predication(device, cmd_state, ace_cs, cmd_state->mec_inv_pred_va,
10386                                     &cmd_state->mec_inv_pred_emitted, ace_predication_size);
10387 
10388    if (workaround_cond_va) {
10389       radv_emit_cond_exec(device, ace_cs, count_va,
10390                           6 + 11 * num_views /* 1x COPY_DATA + Nx DISPATCH_TASKMESH_INDIRECT_MULTI_ACE */);
10391 
10392       radeon_emit(ace_cs, PKT3(PKT3_COPY_DATA, 4, 0));
10393       radeon_emit(ace_cs,
10394                   COPY_DATA_SRC_SEL(COPY_DATA_IMM) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) | COPY_DATA_WR_CONFIRM);
10395       radeon_emit(ace_cs, 0);
10396       radeon_emit(ace_cs, 0);
10397       radeon_emit(ace_cs, workaround_cond_va);
10398       radeon_emit(ace_cs, workaround_cond_va >> 32);
10399    }
10400 
10401    if (!view_mask) {
10402       radv_cs_emit_dispatch_taskmesh_indirect_multi_ace_packet(device, cmd_state, ace_cs, va, info->count, count_va,
10403                                                                info->stride);
10404       radv_cs_emit_dispatch_taskmesh_gfx_packet(device, cmd_state, cs);
10405    } else {
10406       u_foreach_bit (view, view_mask) {
10407          radv_emit_view_index(cmd_state, cs, view);
10408 
10409          radv_cs_emit_dispatch_taskmesh_indirect_multi_ace_packet(device, cmd_state, ace_cs, va, info->count, count_va,
10410                                                                   info->stride);
10411          radv_cs_emit_dispatch_taskmesh_gfx_packet(device, cmd_state, cs);
10412       }
10413    }
10414 
10415    if (workaround_cond_va) {
10416       radv_emit_cond_exec(device, ace_cs, workaround_cond_va, 6 * num_views /* Nx DISPATCH_TASKMESH_DIRECT_ACE */);
10417 
10418       for (unsigned v = 0; v < num_views; ++v) {
10419          radv_cs_emit_dispatch_taskmesh_direct_ace_packet(device, cmd_state, ace_cs, 0, 0, 0);
10420       }
10421    }
10422 }
10423 
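/* Indirect draw path (non-mesh): PKT3_SET_BASE (selector 1) supplies the
 * indirect argument buffer address, and one indirect draw packet per view
 * consumes up to info->count records, optionally limited by the count buffer
 * address.
 */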
10424 static void
10425 radv_emit_indirect_draw_packets(struct radv_cmd_buffer *cmd_buffer, const struct radv_draw_info *info)
10426 {
10427    const struct radv_cmd_state *state = &cmd_buffer->state;
10428    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
10429    struct radeon_winsys *ws = device->ws;
10430    struct radeon_cmdbuf *cs = cmd_buffer->cs;
10431    const uint64_t va = radv_buffer_get_va(info->indirect->bo) + info->indirect->offset + info->indirect_offset;
10432    const uint64_t count_va = info->count_buffer ? radv_buffer_get_va(info->count_buffer->bo) +
10433                                                      info->count_buffer->offset + info->count_buffer_offset
10434                                                 : 0;
10435 
10436    radv_cs_add_buffer(ws, cs, info->indirect->bo);
10437 
10438    radeon_emit(cs, PKT3(PKT3_SET_BASE, 2, 0));
10439    radeon_emit(cs, 1);
10440    radeon_emit(cs, va);
10441    radeon_emit(cs, va >> 32);
10442 
10443    if (info->count_buffer) {
10444       radv_cs_add_buffer(ws, cs, info->count_buffer->bo);
10445    }
10446 
10447    if (!state->render.view_mask) {
10448       radv_cs_emit_indirect_draw_packet(cmd_buffer, info->indexed, info->count, count_va, info->stride);
10449    } else {
10450       u_foreach_bit (i, state->render.view_mask) {
10451          radv_emit_view_index(&cmd_buffer->state, cs, i);
10452 
10453          radv_cs_emit_indirect_draw_packet(cmd_buffer, info->indexed, info->count, count_va, info->stride);
10454       }
10455    }
10456 }
10457 
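/* Returns the mask of dynamic states that can affect the current draw. Graphics
 * pipelines precompute this; for shader objects we start from RADV_DYNAMIC_ALL
 * and drop states that the bound stages or the GPU generation cannot use.
 */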
10458 static uint64_t
10459 radv_get_needed_dynamic_states(struct radv_cmd_buffer *cmd_buffer)
10460 {
10461    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
10462    const struct radv_physical_device *pdev = radv_device_physical(device);
10463    uint64_t dynamic_states = RADV_DYNAMIC_ALL;
10464 
10465    if (cmd_buffer->state.graphics_pipeline)
10466       return cmd_buffer->state.graphics_pipeline->needed_dynamic_state;
10467 
10468    /* Clear unnecessary dynamic states for shader objects. */
10469    if (!cmd_buffer->state.shaders[MESA_SHADER_TESS_CTRL])
10470       dynamic_states &= ~(RADV_DYNAMIC_PATCH_CONTROL_POINTS | RADV_DYNAMIC_TESS_DOMAIN_ORIGIN);
10471 
10472    if (pdev->info.gfx_level >= GFX10_3) {
10473       if (cmd_buffer->state.shaders[MESA_SHADER_MESH])
10474          dynamic_states &= ~(RADV_DYNAMIC_VERTEX_INPUT | RADV_DYNAMIC_VERTEX_INPUT_BINDING_STRIDE |
10475                              RADV_DYNAMIC_PRIMITIVE_RESTART_ENABLE | RADV_DYNAMIC_PRIMITIVE_TOPOLOGY);
10476    } else {
10477       dynamic_states &= ~RADV_DYNAMIC_FRAGMENT_SHADING_RATE;
10478    }
10479 
10480    return dynamic_states;
10481 }
10482 
10483 /*
10484  * Vega and raven have a bug which triggers if there are multiple context
10485  * register contexts active at the same time with different scissor values.
10486  *
10487  * There are two possible workarounds:
10488  * 1) Wait for PS_PARTIAL_FLUSH every time the scissor is changed. That way
10489  *    there is only ever 1 active set of scissor values at the same time.
10490  *
10491  * 2) Whenever the hardware switches contexts we have to set the scissor
10492  *    registers again even if it is a noop. That way the new context gets
10493  *    the correct scissor values.
10494  *
10495  * This implements option 2. radv_need_late_scissor_emission needs to
10496  * return true on affected HW if radv_emit_all_graphics_states sets
10497  * any context registers.
10498  */
10499 static bool
10500 radv_need_late_scissor_emission(struct radv_cmd_buffer *cmd_buffer, const struct radv_draw_info *info)
10501 {
10502    if (cmd_buffer->state.context_roll_without_scissor_emitted || info->strmout_buffer)
10503       return true;
10504 
10505    uint64_t used_dynamic_states = radv_get_needed_dynamic_states(cmd_buffer);
10506 
10507    used_dynamic_states &= ~RADV_DYNAMIC_VERTEX_INPUT;
10508 
10509    if (cmd_buffer->state.dirty_dynamic & used_dynamic_states)
10510       return true;
10511 
10512    /* Index, vertex and streamout buffers don't change context regs.
10513     * We assume that any other dirty flag causes context rolls.
10514     */
10515    uint64_t used_states = RADV_CMD_DIRTY_ALL;
10516    used_states &= ~(RADV_CMD_DIRTY_INDEX_BUFFER | RADV_CMD_DIRTY_VERTEX_BUFFER | RADV_CMD_DIRTY_STREAMOUT_BUFFER);
10517 
10518    return cmd_buffer->state.dirty & used_states;
10519 }
10520 
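/* Computes the NGG culling settings word consumed by the shader: the winding
 * (face-is-CCW) bit, front/back face culling bits, small primitive culling
 * enable, and the small-prim precision exponent packed into the top bits.
 */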
10521 ALWAYS_INLINE static uint32_t
10522 radv_get_ngg_culling_settings(struct radv_cmd_buffer *cmd_buffer, bool vp_y_inverted)
10523 {
10524    const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
10525 
10526    /* Disable shader culling entirely when conservative overestimate is used.
10527     * The face culling algorithm can delete very tiny triangles (even if unintended).
10528     */
10529    if (d->vk.rs.conservative_mode == VK_CONSERVATIVE_RASTERIZATION_MODE_OVERESTIMATE_EXT)
10530       return radv_nggc_none;
10531 
10532    /* With graphics pipeline library, NGG culling is unconditionally compiled into shaders
10533     * because we don't know the primitive topology at compile time, so we should
10534     * disable it dynamically for points or lines.
10535     */
10536    const unsigned num_vertices_per_prim = radv_conv_prim_to_gs_out(d->vk.ia.primitive_topology, true) + 1;
10537    if (num_vertices_per_prim != 3)
10538       return radv_nggc_none;
10539 
10540    /* Cull every triangle when rasterizer discard is enabled. */
10541    if (d->vk.rs.rasterizer_discard_enable)
10542       return radv_nggc_front_face | radv_nggc_back_face;
10543 
10544    uint32_t nggc_settings = radv_nggc_none;
10545 
10546    /* The culling code needs to know whether face is CW or CCW. */
10547    bool ccw = d->vk.rs.front_face == VK_FRONT_FACE_COUNTER_CLOCKWISE;
10548 
10549    /* Take inverted viewport into account. */
10550    ccw ^= vp_y_inverted;
10551 
10552    if (ccw)
10553       nggc_settings |= radv_nggc_face_is_ccw;
10554 
10555    /* Face culling settings. */
10556    if (d->vk.rs.cull_mode & VK_CULL_MODE_FRONT_BIT)
10557       nggc_settings |= radv_nggc_front_face;
10558    if (d->vk.rs.cull_mode & VK_CULL_MODE_BACK_BIT)
10559       nggc_settings |= radv_nggc_back_face;
10560 
10561    /* Small primitive culling assumes a sample position at (0.5, 0.5)
10562     * so don't enable it with user sample locations.
10563     */
10564    if (!d->vk.ms.sample_locations_enable) {
10565       nggc_settings |= radv_nggc_small_primitives;
10566 
10567       /* small_prim_precision = num_samples / 2^subpixel_bits
10568        * num_samples is also always a power of two, so the small prim precision can only be
10569        * a power of two between 2^-2 and 2^-6, therefore it's enough to remember the exponent.
10570        */
10571       unsigned rasterization_samples = radv_get_rasterization_samples(cmd_buffer);
10572       unsigned subpixel_bits = 256;
10573       int32_t small_prim_precision_log2 = util_logbase2(rasterization_samples) - util_logbase2(subpixel_bits);
10574       nggc_settings |= ((uint32_t)small_prim_precision_log2 << 24u);
10575    }
10576 
10577    return nggc_settings;
10578 }
10579 
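/* Re-emits the NGG culling user SGPRs: the viewport scale/translate (scaled by
 * the rasterization sample count, with inverted Y corrected) only when the
 * relevant state is dirty, and the culling settings word on every call.
 */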
10580 static void
10581 radv_emit_ngg_culling_state(struct radv_cmd_buffer *cmd_buffer)
10582 {
10583    const struct radv_shader *last_vgt_shader = cmd_buffer->state.last_vgt_shader;
10584 
10585    /* Get viewport transform. */
10586    float vp_scale[2], vp_translate[2];
10587    memcpy(vp_scale, cmd_buffer->state.dynamic.hw_vp.xform[0].scale, 2 * sizeof(float));
10588    memcpy(vp_translate, cmd_buffer->state.dynamic.hw_vp.xform[0].translate, 2 * sizeof(float));
10589    bool vp_y_inverted = (-vp_scale[1] + vp_translate[1]) > (vp_scale[1] + vp_translate[1]);
10590 
10591    /* Get current culling settings. */
10592    uint32_t nggc_settings = radv_get_ngg_culling_settings(cmd_buffer, vp_y_inverted);
10593 
10594    if ((cmd_buffer->state.dirty & RADV_CMD_DIRTY_PIPELINE) ||
10595        (cmd_buffer->state.dirty_dynamic & (RADV_DYNAMIC_VIEWPORT | RADV_DYNAMIC_RASTERIZATION_SAMPLES))) {
10596       /* Correction for inverted Y */
10597       if (vp_y_inverted) {
10598          vp_scale[1] = -vp_scale[1];
10599          vp_translate[1] = -vp_translate[1];
10600       }
10601 
10602       /* Correction for number of samples per pixel. */
10603       for (unsigned i = 0; i < 2; ++i) {
10604          vp_scale[i] *= (float)cmd_buffer->state.dynamic.vk.ms.rasterization_samples;
10605          vp_translate[i] *= (float)cmd_buffer->state.dynamic.vk.ms.rasterization_samples;
10606       }
10607 
10608       uint32_t vp_reg_values[4] = {fui(vp_scale[0]), fui(vp_scale[1]), fui(vp_translate[0]), fui(vp_translate[1])};
10609       const uint32_t ngg_viewport_offset = radv_get_user_sgpr_loc(last_vgt_shader, AC_UD_NGG_VIEWPORT);
10610       radeon_set_sh_reg_seq(cmd_buffer->cs, ngg_viewport_offset, 4);
10611       radeon_emit_array(cmd_buffer->cs, vp_reg_values, 4);
10612    }
10613 
10614    const uint32_t ngg_culling_settings_offset = radv_get_user_sgpr_loc(last_vgt_shader, AC_UD_NGG_CULLING_SETTINGS);
10615 
10616    radeon_set_sh_reg(cmd_buffer->cs, ngg_culling_settings_offset, nggc_settings);
10617 }
10618 
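/* Packs the dynamic fragment shader state (sample count, PS iteration mask,
 * line rasterization mode, rasterized primitive type) into the PS_STATE user
 * SGPR, when the bound fragment shader declares it.
 */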
10619 static void
10620 radv_emit_fs_state(struct radv_cmd_buffer *cmd_buffer)
10621 {
10622    const struct radv_shader *ps = cmd_buffer->state.shaders[MESA_SHADER_FRAGMENT];
10623 
10624    if (!ps)
10625       return;
10626 
10627    const uint32_t ps_state_offset = radv_get_user_sgpr_loc(ps, AC_UD_PS_STATE);
10628    if (!ps_state_offset)
10629       return;
10630 
10631    const unsigned rasterization_samples = radv_get_rasterization_samples(cmd_buffer);
10632    const unsigned ps_iter_samples = radv_get_ps_iter_samples(cmd_buffer);
10633    const uint16_t ps_iter_mask = ac_get_ps_iter_mask(ps_iter_samples);
10634    const unsigned rast_prim = radv_get_rasterization_prim(cmd_buffer);
10635    const unsigned ps_state = SET_SGPR_FIELD(PS_STATE_NUM_SAMPLES, rasterization_samples) |
10636                              SET_SGPR_FIELD(PS_STATE_PS_ITER_MASK, ps_iter_mask) |
10637                              SET_SGPR_FIELD(PS_STATE_LINE_RAST_MODE, radv_get_line_mode(cmd_buffer)) |
10638                              SET_SGPR_FIELD(PS_STATE_RAST_PRIM, rast_prim);
10639 
10640    radeon_set_sh_reg(cmd_buffer->cs, ps_state_offset, ps_state);
10641 }
10642 
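/* Computes DB_SHADER_CONTROL from the bound fragment shader (or from defaults
 * when rasterizing without one), applies the feedback-loop, POPS and export
 * conflict adjustments, and writes the tracked context register at its GFX12
 * or pre-GFX12 offset.
 */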
10643 static void
10644 radv_emit_db_shader_control(struct radv_cmd_buffer *cmd_buffer)
10645 {
10646    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
10647    const struct radv_physical_device *pdev = radv_device_physical(device);
10648    const struct radeon_info *gpu_info = &pdev->info;
10649    const struct radv_shader *ps = cmd_buffer->state.shaders[MESA_SHADER_FRAGMENT];
10650    const struct radv_dynamic_state *d = &cmd_buffer->state.dynamic;
10651    const bool uses_ds_feedback_loop =
10652       !!(d->feedback_loop_aspects & (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT));
10653    const unsigned rasterization_samples = radv_get_rasterization_samples(cmd_buffer);
10654 
10655    uint32_t db_shader_control;
10656 
10657    if (ps) {
10658       db_shader_control = ps->info.regs.ps.db_shader_control;
10659    } else {
10660       db_shader_control = S_02880C_CONSERVATIVE_Z_EXPORT(V_02880C_EXPORT_ANY_Z) |
10661                           S_02880C_Z_ORDER(V_02880C_EARLY_Z_THEN_LATE_Z) |
10662                           S_02880C_DUAL_QUAD_DISABLE(gpu_info->has_rbplus && !gpu_info->rbplus_allowed);
10663    }
10664 
10665    /* When a depth/stencil attachment is used inside feedback loops, use LATE_Z to make sure shader invocations read the
10666     * correct value.
10667     * Also apply the bug workaround for smoothing (overrasterization) on GFX6.
10668     */
10669    if (uses_ds_feedback_loop || (gpu_info->gfx_level == GFX6 &&
10670                                  radv_get_line_mode(cmd_buffer) == VK_LINE_RASTERIZATION_MODE_RECTANGULAR_SMOOTH_KHR))
10671       db_shader_control = (db_shader_control & C_02880C_Z_ORDER) | S_02880C_Z_ORDER(V_02880C_LATE_Z);
10672 
10673    if (ps && ps->info.ps.pops) {
10674       /* POPS_OVERLAP_NUM_SAMPLES (OVERRIDE_INTRINSIC_RATE on GFX11, must always be enabled for POPS) controls the
10675        * interlock granularity.
10676        * PixelInterlock: 1x.
10677        * SampleInterlock: MSAA_EXPOSED_SAMPLES (much faster at common edges of adjacent primitives with MSAA).
10678        */
10679       if (gpu_info->gfx_level >= GFX11) {
10680          db_shader_control |= S_02880C_OVERRIDE_INTRINSIC_RATE_ENABLE(1);
10681          if (ps->info.ps.pops_is_per_sample)
10682             db_shader_control |= S_02880C_OVERRIDE_INTRINSIC_RATE(util_logbase2(rasterization_samples));
10683       } else {
10684          if (ps->info.ps.pops_is_per_sample)
10685             db_shader_control |= S_02880C_POPS_OVERLAP_NUM_SAMPLES(util_logbase2(rasterization_samples));
10686 
10687          if (gpu_info->has_pops_missed_overlap_bug) {
10688             radeon_set_context_reg(cmd_buffer->cs, R_028060_DB_DFSM_CONTROL,
10689                                    S_028060_PUNCHOUT_MODE(V_028060_FORCE_OFF) |
10690                                       S_028060_POPS_DRAIN_PS_ON_OVERLAP(rasterization_samples >= 8));
10691          }
10692       }
10693    } else if (gpu_info->has_export_conflict_bug && rasterization_samples == 1) {
10694       for (uint32_t i = 0; i < MAX_RTS; i++) {
10695          if (d->vk.cb.attachments[i].write_mask && d->vk.cb.attachments[i].blend_enable) {
10696             db_shader_control |= S_02880C_OVERRIDE_INTRINSIC_RATE_ENABLE(1) | S_02880C_OVERRIDE_INTRINSIC_RATE(2);
10697             break;
10698          }
10699       }
10700    }
10701 
10702    if (pdev->info.gfx_level >= GFX12) {
10703       radeon_opt_set_context_reg(cmd_buffer, R_02806C_DB_SHADER_CONTROL, RADV_TRACKED_DB_SHADER_CONTROL,
10704                                  db_shader_control);
10705    } else {
10706       radeon_opt_set_context_reg(cmd_buffer, R_02880C_DB_SHADER_CONTROL, RADV_TRACKED_DB_SHADER_CONTROL,
10707                                  db_shader_control);
10708    }
10709 
10710    cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_DB_SHADER_CONTROL;
10711 }
10712 
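/* Legacy (non-NGG) streamout enable: programs the per-buffer vertex strides and
 * the streamout enable/buffer-config registers according to whether streamout
 * is currently enabled and which buffers the last VGT shader writes.
 */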
10713 static void
10714 radv_emit_streamout_enable_state(struct radv_cmd_buffer *cmd_buffer)
10715 {
10716    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
10717    const struct radv_physical_device *pdev = radv_device_physical(device);
10718    const struct radv_streamout_state *so = &cmd_buffer->state.streamout;
10719    const bool streamout_enabled = radv_is_streamout_enabled(cmd_buffer);
10720    uint32_t enabled_stream_buffers_mask = 0;
10721 
10722    assert(!pdev->use_ngg_streamout);
10723 
10724    if (streamout_enabled && cmd_buffer->state.last_vgt_shader) {
10725       const struct radv_shader_info *info = &cmd_buffer->state.last_vgt_shader->info;
10726 
10727       enabled_stream_buffers_mask = info->so.enabled_stream_buffers_mask;
10728 
10729       u_foreach_bit (i, so->enabled_mask) {
10730          radeon_set_context_reg(cmd_buffer->cs, R_028AD4_VGT_STRMOUT_VTX_STRIDE_0 + 16 * i, info->so.strides[i]);
10731       }
10732    }
10733 
10734    radeon_set_context_reg_seq(cmd_buffer->cs, R_028B94_VGT_STRMOUT_CONFIG, 2);
10735    radeon_emit(cmd_buffer->cs, S_028B94_STREAMOUT_0_EN(streamout_enabled) | S_028B94_RAST_STREAM(0) |
10736                                   S_028B94_STREAMOUT_1_EN(streamout_enabled) |
10737                                   S_028B94_STREAMOUT_2_EN(streamout_enabled) |
10738                                   S_028B94_STREAMOUT_3_EN(streamout_enabled));
10739    radeon_emit(cmd_buffer->cs, so->hw_enabled_mask & enabled_stream_buffers_mask);
10740 
10741    cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_STREAMOUT_ENABLE;
10742 }
10743 
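/* Returns MESA_SHADER_MESH when a mesh shader is active, otherwise the highest
 * enabled pre-rasterization stage.
 */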
10744 static gl_shader_stage
10745 radv_cmdbuf_get_last_vgt_api_stage(const struct radv_cmd_buffer *cmd_buffer)
10746 {
10747    if (cmd_buffer->state.active_stages & VK_SHADER_STAGE_MESH_BIT_EXT)
10748       return MESA_SHADER_MESH;
10749 
10750    return util_last_bit(cmd_buffer->state.active_stages & BITFIELD_MASK(MESA_SHADER_FRAGMENT)) - 1;
10751 }
10752 
10753 static void
10754 radv_emit_color_output_state(struct radv_cmd_buffer *cmd_buffer)
10755 {
10756    const struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
10757    const struct radv_physical_device *pdev = radv_device_physical(device);
10758 
10759    uint32_t col_format_compacted = radv_compact_spi_shader_col_format(cmd_buffer->state.spi_shader_col_format);
10760 
10761    if (pdev->info.gfx_level >= GFX12) {
10762       radeon_set_context_reg(cmd_buffer->cs, R_028854_CB_SHADER_MASK, cmd_buffer->state.cb_shader_mask);
10763       radeon_set_context_reg(cmd_buffer->cs, R_028654_SPI_SHADER_COL_FORMAT, col_format_compacted);
10764    } else {
10765       radeon_set_context_reg(cmd_buffer->cs, R_02823C_CB_SHADER_MASK, cmd_buffer->state.cb_shader_mask);
10766       radeon_set_context_reg(cmd_buffer->cs, R_028714_SPI_SHADER_COL_FORMAT, col_format_compacted);
10767    }
10768 
10769    cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_COLOR_OUTPUT;
10770 }
10771 
10772 static void
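/* Emits every piece of dirty graphics state needed for the upcoming draw,
 * including the PS epilog lookup and the color output state it may dirty, and
 * finally the GFX9 late scissor workaround when a context roll happened.
 */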
10773 radv_emit_all_graphics_states(struct radv_cmd_buffer *cmd_buffer, const struct radv_draw_info *info)
10774 {
10775    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
10776    const struct radv_physical_device *pdev = radv_device_physical(device);
10777    struct radv_shader_part *ps_epilog = NULL;
10778 
10779    if (cmd_buffer->state.shaders[MESA_SHADER_FRAGMENT] &&
10780        cmd_buffer->state.shaders[MESA_SHADER_FRAGMENT]->info.ps.has_epilog) {
10781       if ((cmd_buffer->state.emitted_graphics_pipeline != cmd_buffer->state.graphics_pipeline ||
10782            ((cmd_buffer->state.dirty & (RADV_CMD_DIRTY_GRAPHICS_SHADERS | RADV_CMD_DIRTY_FRAMEBUFFER)) ||
10783             (cmd_buffer->state.dirty_dynamic &
10784              (RADV_DYNAMIC_COLOR_WRITE_MASK | RADV_DYNAMIC_COLOR_BLEND_ENABLE | RADV_DYNAMIC_ALPHA_TO_COVERAGE_ENABLE |
10785               RADV_DYNAMIC_COLOR_BLEND_EQUATION | RADV_DYNAMIC_ALPHA_TO_ONE_ENABLE |
10786               RADV_DYNAMIC_COLOR_ATTACHMENT_MAP))))) {
10787          ps_epilog = lookup_ps_epilog(cmd_buffer);
10788          if (!ps_epilog) {
10789             vk_command_buffer_set_error(&cmd_buffer->vk, VK_ERROR_OUT_OF_HOST_MEMORY);
10790             return;
10791          }
10792 
10793          uint32_t col_format = ps_epilog->spi_shader_col_format;
10794          uint32_t cb_shader_mask = ps_epilog->cb_shader_mask;
10795 
10796          assert(cmd_buffer->state.custom_blend_mode == 0);
10797 
10798          if (radv_needs_null_export_workaround(device, cmd_buffer->state.shaders[MESA_SHADER_FRAGMENT], 0) &&
10799              !col_format)
10800             col_format = V_028714_SPI_SHADER_32_R;
10801 
10802          if (cmd_buffer->state.spi_shader_col_format != col_format) {
10803             cmd_buffer->state.spi_shader_col_format = col_format;
10804             cmd_buffer->state.dirty |= RADV_CMD_DIRTY_COLOR_OUTPUT;
10805             if (pdev->info.rbplus_allowed)
10806                cmd_buffer->state.dirty |= RADV_CMD_DIRTY_RBPLUS;
10807          }
10808 
10809          if (cmd_buffer->state.cb_shader_mask != cb_shader_mask) {
10810             cmd_buffer->state.cb_shader_mask = cb_shader_mask;
10811             cmd_buffer->state.dirty |= RADV_CMD_DIRTY_COLOR_OUTPUT;
10812          }
10813       }
10814    }
10815 
10816    /* Determine whether GFX9 late scissor workaround should be applied based on:
10817     * 1. radv_need_late_scissor_emission
10818     * 2. any dirty dynamic flags that may cause context rolls
10819     */
10820    const bool late_scissor_emission =
10821       pdev->info.has_gfx9_scissor_bug ? radv_need_late_scissor_emission(cmd_buffer, info) : false;
10822 
10823    if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_RBPLUS)
10824       radv_emit_rbplus_state(cmd_buffer);
10825 
10826    if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_SHADER_QUERY)
10827       radv_flush_shader_query_state(cmd_buffer);
10828 
10829    if ((cmd_buffer->state.dirty & RADV_CMD_DIRTY_OCCLUSION_QUERY) ||
10830        (cmd_buffer->state.dirty_dynamic & (RADV_DYNAMIC_RASTERIZATION_SAMPLES | RADV_DYNAMIC_PRIMITIVE_TOPOLOGY)))
10831       radv_flush_occlusion_query_state(cmd_buffer);
10832 
10833    if (((cmd_buffer->state.dirty & RADV_CMD_DIRTY_PIPELINE) ||
10834         (cmd_buffer->state.dirty_dynamic &
10835          (RADV_DYNAMIC_CULL_MODE | RADV_DYNAMIC_FRONT_FACE | RADV_DYNAMIC_RASTERIZER_DISCARD_ENABLE |
10836           RADV_DYNAMIC_VIEWPORT | RADV_DYNAMIC_CONSERVATIVE_RAST_MODE | RADV_DYNAMIC_RASTERIZATION_SAMPLES |
10837           RADV_DYNAMIC_PRIMITIVE_TOPOLOGY | RADV_DYNAMIC_SAMPLE_LOCATIONS_ENABLE))) &&
10838        cmd_buffer->state.has_nggc)
10839       radv_emit_ngg_culling_state(cmd_buffer);
10840 
10841    if ((cmd_buffer->state.dirty & RADV_CMD_DIRTY_FRAMEBUFFER) ||
10842        (cmd_buffer->state.dirty_dynamic &
10843         (RADV_DYNAMIC_COLOR_WRITE_MASK | RADV_DYNAMIC_RASTERIZATION_SAMPLES | RADV_DYNAMIC_LINE_RASTERIZATION_MODE |
10844          RADV_DYNAMIC_PRIMITIVE_TOPOLOGY | RADV_DYNAMIC_POLYGON_MODE)))
10845       radv_emit_binning_state(cmd_buffer);
10846 
10847    if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_PIPELINE) {
10848       radv_emit_graphics_pipeline(cmd_buffer);
10849    } else if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_GRAPHICS_SHADERS) {
10850       radv_emit_graphics_shaders(cmd_buffer);
10851    }
10852 
10853    if (ps_epilog)
10854       radv_emit_ps_epilog_state(cmd_buffer, ps_epilog);
10855 
10856    if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_COLOR_OUTPUT)
10857       radv_emit_color_output_state(cmd_buffer);
10858 
10859    if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_FRAMEBUFFER)
10860       radv_emit_framebuffer_state(cmd_buffer);
10861 
10862    if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_GUARDBAND)
10863       radv_emit_guardband_state(cmd_buffer);
10864 
10865    if ((cmd_buffer->state.dirty & RADV_CMD_DIRTY_DB_SHADER_CONTROL) ||
10866        (cmd_buffer->state.dirty_dynamic &
10867         (RADV_DYNAMIC_COLOR_WRITE_MASK | RADV_DYNAMIC_COLOR_BLEND_ENABLE | RADV_DYNAMIC_RASTERIZATION_SAMPLES |
10868          RADV_DYNAMIC_LINE_RASTERIZATION_MODE | RADV_DYNAMIC_PRIMITIVE_TOPOLOGY | RADV_DYNAMIC_POLYGON_MODE |
10869          RADV_DYNAMIC_ATTACHMENT_FEEDBACK_LOOP_ENABLE)))
10870       radv_emit_db_shader_control(cmd_buffer);
10871 
10872    if (info->indexed && info->indirect && cmd_buffer->state.dirty & RADV_CMD_DIRTY_INDEX_BUFFER)
10873       radv_emit_index_buffer(cmd_buffer);
10874 
10875    if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_STREAMOUT_ENABLE)
10876       radv_emit_streamout_enable_state(cmd_buffer);
10877 
10878    const uint64_t dynamic_states = cmd_buffer->state.dirty_dynamic & radv_get_needed_dynamic_states(cmd_buffer);
10879 
10880    if (dynamic_states) {
10881       radv_cmd_buffer_flush_dynamic_state(cmd_buffer, dynamic_states);
10882 
10883       if (dynamic_states & (RADV_DYNAMIC_RASTERIZATION_SAMPLES | RADV_DYNAMIC_LINE_RASTERIZATION_MODE |
10884                             RADV_DYNAMIC_PRIMITIVE_TOPOLOGY | RADV_DYNAMIC_POLYGON_MODE))
10885          radv_emit_fs_state(cmd_buffer);
10886    }
10887 
10888    radv_emit_draw_registers(cmd_buffer, info);
10889 
10890    if (late_scissor_emission) {
10891       radv_emit_scissor(cmd_buffer);
10892       cmd_buffer->state.context_roll_without_scissor_emitted = false;
10893    }
10894 }
10895 
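/* Shader-object path: selects the correct variant for each graphics stage
 * (LS/ES variants when tessellation or geometry follow), rebinds the shaders,
 * and recomputes the derived state that pipelines normally precompute (push
 * constant size, last VGT shader, NGG GS info, color output, IA_MULTI_VGT_PARAM).
 */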
10896 static void
10897 radv_bind_graphics_shaders(struct radv_cmd_buffer *cmd_buffer)
10898 {
10899    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
10900    const struct radv_physical_device *pdev = radv_device_physical(device);
10901    uint32_t push_constant_size = 0, dynamic_offset_count = 0;
10902    bool need_indirect_descriptor_sets = false;
10903 
10904    for (unsigned s = 0; s <= MESA_SHADER_MESH; s++) {
10905       const struct radv_shader_object *shader_obj = cmd_buffer->state.shader_objs[s];
10906       struct radv_shader *shader = NULL;
10907 
10908       if (s == MESA_SHADER_COMPUTE)
10909          continue;
10910 
10911       if (!shader_obj) {
10912          radv_bind_shader(cmd_buffer, NULL, s);
10913          continue;
10914       }
10915 
10916       /* Select shader variants. */
10917       if (s == MESA_SHADER_VERTEX && (cmd_buffer->state.shader_objs[MESA_SHADER_TESS_CTRL] ||
10918                                       cmd_buffer->state.shader_objs[MESA_SHADER_GEOMETRY])) {
10919          if (cmd_buffer->state.shader_objs[MESA_SHADER_TESS_CTRL]) {
10920             shader = shader_obj->as_ls.shader;
10921          } else {
10922             shader = shader_obj->as_es.shader;
10923          }
10924       } else if (s == MESA_SHADER_TESS_EVAL && cmd_buffer->state.shader_objs[MESA_SHADER_GEOMETRY]) {
10925          shader = shader_obj->as_es.shader;
10926       } else {
10927          shader = shader_obj->shader;
10928       }
10929 
10930       radv_bind_shader(cmd_buffer, shader, s);
10931       if (!shader)
10932          continue;
10933 
10934       /* Compute push constants/indirect descriptors state. */
10935       need_indirect_descriptor_sets |= radv_get_user_sgpr_info(shader, AC_UD_INDIRECT_DESCRIPTOR_SETS)->sgpr_idx != -1;
10936       push_constant_size += shader_obj->push_constant_size;
10937       dynamic_offset_count += shader_obj->dynamic_offset_count;
10938    }
10939 
10940    /* Determine the last VGT shader. */
10941    const gl_shader_stage last_vgt_api_stage = radv_cmdbuf_get_last_vgt_api_stage(cmd_buffer);
10942 
10943    assume(last_vgt_api_stage != MESA_SHADER_NONE);
10944    if (pdev->info.has_vgt_flush_ngg_legacy_bug &&
10945        (!cmd_buffer->state.last_vgt_shader || (cmd_buffer->state.last_vgt_shader->info.is_ngg &&
10946                                                !cmd_buffer->state.shaders[last_vgt_api_stage]->info.is_ngg))) {
10947       /* Transitioning from NGG to legacy GS requires VGT_FLUSH on GFX10 and Navi21. VGT_FLUSH is
10948        * also emitted at the beginning of IBs when legacy GS ring pointers are set.
10949        */
10950       cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_VGT_FLUSH;
10951    }
10952 
10953    cmd_buffer->state.last_vgt_shader = cmd_buffer->state.shaders[last_vgt_api_stage];
10954 
10955    struct radv_shader *gs_copy_shader = cmd_buffer->state.shader_objs[MESA_SHADER_GEOMETRY]
10956                                            ? cmd_buffer->state.shader_objs[MESA_SHADER_GEOMETRY]->gs.copy_shader
10957                                            : NULL;
10958 
10959    radv_bind_gs_copy_shader(cmd_buffer, gs_copy_shader);
10960 
10961    /* Determine NGG GS info. */
10962    if (cmd_buffer->state.shaders[MESA_SHADER_GEOMETRY] &&
10963        cmd_buffer->state.shaders[MESA_SHADER_GEOMETRY]->info.is_ngg &&
10964        cmd_buffer->state.shaders[MESA_SHADER_GEOMETRY]->info.merged_shader_compiled_separately) {
10965       struct radv_shader *es = cmd_buffer->state.shaders[MESA_SHADER_TESS_EVAL]
10966                                   ? cmd_buffer->state.shaders[MESA_SHADER_TESS_EVAL]
10967                                   : cmd_buffer->state.shaders[MESA_SHADER_VERTEX];
10968       struct radv_shader *gs = cmd_buffer->state.shaders[MESA_SHADER_GEOMETRY];
10969 
10970       gfx10_get_ngg_info(device, &es->info, &gs->info, &gs->info.ngg_info);
10971       radv_precompute_registers_hw_ngg(device, &gs->config, &gs->info);
10972    }
10973 
10974    /* Determine the rasterized primitive. */
10975    if (cmd_buffer->state.active_stages &
10976        (VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT | VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT |
10977         VK_SHADER_STAGE_GEOMETRY_BIT | VK_SHADER_STAGE_MESH_BIT_EXT)) {
10978       cmd_buffer->state.rast_prim = radv_get_vgt_gs_out(cmd_buffer->state.shaders, 0);
10979    }
10980 
10981    const struct radv_shader *vs = radv_get_shader(cmd_buffer->state.shaders, MESA_SHADER_VERTEX);
10982    if (vs) {
10983       /* Re-emit the VS prolog when a new vertex shader is bound. */
10984       if (vs->info.vs.has_prolog) {
10985          cmd_buffer->state.emitted_vs_prolog = NULL;
10986          cmd_buffer->state.dirty_dynamic |= RADV_DYNAMIC_VERTEX_INPUT;
10987       }
10988 
10989       /* Re-emit the vertex buffer descriptors because they are really tied to the pipeline. */
10990       if (vs->info.vs.vb_desc_usage_mask) {
10991          cmd_buffer->state.dirty |= RADV_CMD_DIRTY_VERTEX_BUFFER;
10992       }
10993    }
10994 
10995    const struct radv_shader *ps = cmd_buffer->state.shaders[MESA_SHADER_FRAGMENT];
10996    if (ps && !ps->info.ps.has_epilog) {
10997       uint32_t col_format = 0, cb_shader_mask = 0;
10998       if (radv_needs_null_export_workaround(device, ps, 0))
10999          col_format = V_028714_SPI_SHADER_32_R;
11000 
11001       if (cmd_buffer->state.spi_shader_col_format != col_format) {
11002          cmd_buffer->state.spi_shader_col_format = col_format;
11003          cmd_buffer->state.dirty |= RADV_CMD_DIRTY_COLOR_OUTPUT;
11004          if (pdev->info.rbplus_allowed)
11005             cmd_buffer->state.dirty |= RADV_CMD_DIRTY_RBPLUS;
11006       }
11007 
11008       if (cmd_buffer->state.cb_shader_mask != cb_shader_mask) {
11009          cmd_buffer->state.cb_shader_mask = cb_shader_mask;
11010          cmd_buffer->state.dirty |= RADV_CMD_DIRTY_COLOR_OUTPUT;
11011       }
11012    }
11013 
11014    /* Update push constants/indirect descriptors state. */
11015    struct radv_descriptor_state *descriptors_state =
11016       radv_get_descriptors_state(cmd_buffer, VK_PIPELINE_BIND_POINT_GRAPHICS);
11017    struct radv_push_constant_state *pc_state = &cmd_buffer->push_constant_state[VK_PIPELINE_BIND_POINT_GRAPHICS];
11018 
11019    descriptors_state->need_indirect_descriptor_sets = need_indirect_descriptor_sets;
11020    pc_state->size = push_constant_size;
11021    pc_state->dynamic_offset_count = dynamic_offset_count;
11022 
11023    if (pdev->info.gfx_level <= GFX9) {
11024       cmd_buffer->state.ia_multi_vgt_param = radv_compute_ia_multi_vgt_param(device, cmd_buffer->state.shaders);
11025    }
11026 
11027    if (cmd_buffer->state.active_stages &
11028        (VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT | VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT)) {
11029       cmd_buffer->state.uses_dynamic_patch_control_points = true;
11030    }
11031 }
11032 
11033 /* MUST inline this function to avoid massive perf loss in drawoverhead */
11034 ALWAYS_INLINE static bool
11035 radv_before_draw(struct radv_cmd_buffer *cmd_buffer, const struct radv_draw_info *info, uint32_t drawCount, bool dgc)
11036 {
11037    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
11038    const struct radv_physical_device *pdev = radv_device_physical(device);
11039    const bool has_prefetch = pdev->info.gfx_level >= GFX7;
11040 
11041    ASSERTED const unsigned cdw_max = radeon_check_space(device->ws, cmd_buffer->cs, 4096 + 128 * (drawCount - 1));
11042 
11043    if (likely(!info->indirect)) {
11044       /* GFX6-GFX7 treat instance_count==0 as instance_count==1. There is
11045        * no workaround for indirect draws, but we can at least skip
11046        * direct draws.
11047        */
11048       if (unlikely(!info->instance_count))
11049          return false;
11050 
11051       /* Handle count == 0. */
11052       if (unlikely(!info->count && !info->strmout_buffer))
11053          return false;
11054    }
11055 
11056    if (!info->indexed && pdev->info.gfx_level >= GFX7) {
11057       /* On GFX7 and later, non-indexed draws overwrite VGT_INDEX_TYPE,
11058        * so the state must be re-emitted before the next indexed
11059        * draw.
11060        */
11061       cmd_buffer->state.last_index_type = -1;
11062    }
11063 
11064    if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_FBFETCH_OUTPUT)
11065       radv_handle_fbfetch_output(cmd_buffer);
11066 
11067    if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_GRAPHICS_SHADERS) {
11068       radv_bind_graphics_shaders(cmd_buffer);
11069    }
11070 
11071    /* Use optimal packet order based on whether we need to sync the
11072     * pipeline.
11073     */
11074    if (cmd_buffer->state.flush_bits & (RADV_CMD_FLAG_FLUSH_AND_INV_CB | RADV_CMD_FLAG_FLUSH_AND_INV_DB |
11075                                        RADV_CMD_FLAG_PS_PARTIAL_FLUSH | RADV_CMD_FLAG_CS_PARTIAL_FLUSH)) {
11076       /* If we have to wait for idle, set all states first, so that
11077        * all SET packets are processed in parallel with previous draw
11078        * calls. Then upload descriptors, set shader pointers, and
11079        * draw, and prefetch at the end. This ensures that the time
11080        * the CUs are idle is very short. (there are only SET_SH
11081        * packets between the wait and the draw)
11082        */
11083       radv_emit_all_graphics_states(cmd_buffer, info);
11084       radv_emit_cache_flush(cmd_buffer);
11085       /* <-- CUs are idle here --> */
11086 
11087       radv_upload_graphics_shader_descriptors(cmd_buffer);
11088    } else {
11089       const bool need_prefetch = has_prefetch && cmd_buffer->state.prefetch_L2_mask;
11090 
11091       /* If we don't wait for idle, start prefetches first, then set
11092        * states, and draw at the end.
11093        */
11094       radv_emit_cache_flush(cmd_buffer);
11095 
11096       if (need_prefetch) {
11097          /* Only prefetch the vertex shader and VBO descriptors
11098           * in order to start the draw as soon as possible.
11099           */
11100          radv_emit_prefetch_L2(cmd_buffer, true);
11101       }
11102 
11103       radv_upload_graphics_shader_descriptors(cmd_buffer);
11104 
11105       radv_emit_all_graphics_states(cmd_buffer, info);
11106    }
11107 
11108    if (!dgc)
11109       radv_describe_draw(cmd_buffer);
11110    if (likely(!info->indirect)) {
11111       struct radv_cmd_state *state = &cmd_buffer->state;
11112       struct radeon_cmdbuf *cs = cmd_buffer->cs;
11113       assert(state->vtx_base_sgpr);
11114       if (state->last_num_instances != info->instance_count) {
11115          radeon_emit(cs, PKT3(PKT3_NUM_INSTANCES, 0, false));
11116          radeon_emit(cs, info->instance_count);
11117          state->last_num_instances = info->instance_count;
11118       }
11119    }
11120    assert(cmd_buffer->cs->cdw <= cdw_max);
11121 
11122    return true;
11123 }
11124 
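/* Task/mesh counterpart of radv_before_draw(): rejects zero-count draws, also
 * flushes the gang (ACE) caches and waits on the gang leader semaphore when a
 * task shader is bound, then flushes descriptors and push constants for the
 * task/mesh/fragment stages.
 */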
11125 ALWAYS_INLINE static bool
11126 radv_before_taskmesh_draw(struct radv_cmd_buffer *cmd_buffer, const struct radv_draw_info *info, uint32_t drawCount,
11127                           bool dgc)
11128 {
11129    /* For direct draws, this makes sure we don't draw anything.
11130     * For indirect draws, this is necessary to prevent a GPU hang (on MEC version < 100).
11131     */
11132    if (unlikely(!info->count))
11133       return false;
11134 
11135    if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_GRAPHICS_SHADERS) {
11136       radv_bind_graphics_shaders(cmd_buffer);
11137    }
11138 
11139    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
11140    struct radeon_cmdbuf *ace_cs = cmd_buffer->gang.cs;
11141    struct radv_shader *task_shader = cmd_buffer->state.shaders[MESA_SHADER_TASK];
11142 
11143    assert(!task_shader || ace_cs);
11144 
11145    const VkShaderStageFlags stages =
11146       VK_SHADER_STAGE_MESH_BIT_EXT | VK_SHADER_STAGE_FRAGMENT_BIT | (task_shader ? VK_SHADER_STAGE_TASK_BIT_EXT : 0);
11147    const bool need_task_semaphore = task_shader && radv_flush_gang_leader_semaphore(cmd_buffer);
11148 
11149    ASSERTED const unsigned cdw_max = radeon_check_space(device->ws, cmd_buffer->cs, 4096 + 128 * (drawCount - 1));
11150    ASSERTED const unsigned ace_cdw_max =
11151       !ace_cs ? 0 : radeon_check_space(device->ws, ace_cs, 4096 + 128 * (drawCount - 1));
11152 
11153    radv_emit_all_graphics_states(cmd_buffer, info);
11154 
11155    radv_emit_cache_flush(cmd_buffer);
11156 
11157    if (task_shader) {
11158       radv_gang_cache_flush(cmd_buffer);
11159 
11160       if (need_task_semaphore) {
11161          radv_wait_gang_leader(cmd_buffer);
11162       }
11163    }
11164 
11165    radv_flush_descriptors(cmd_buffer, stages, VK_PIPELINE_BIND_POINT_GRAPHICS);
11166 
11167    const VkShaderStageFlags pc_stages = radv_must_flush_constants(cmd_buffer, stages, VK_PIPELINE_BIND_POINT_GRAPHICS);
11168    if (pc_stages)
11169       radv_flush_constants(cmd_buffer, pc_stages, VK_PIPELINE_BIND_POINT_GRAPHICS);
11170 
11171    if (!dgc)
11172       radv_describe_draw(cmd_buffer);
11173    if (likely(!info->indirect)) {
11174       struct radv_cmd_state *state = &cmd_buffer->state;
11175       if (unlikely(state->last_num_instances != 1)) {
11176          struct radeon_cmdbuf *cs = cmd_buffer->cs;
11177          radeon_emit(cs, PKT3(PKT3_NUM_INSTANCES, 0, false));
11178          radeon_emit(cs, 1);
11179          state->last_num_instances = 1;
11180       }
11181    }
11182 
11183    assert(cmd_buffer->cs->cdw <= cdw_max);
11184    assert(!ace_cs || ace_cs->cdw <= ace_cdw_max);
11185 
11186    cmd_buffer->state.last_index_type = -1;
11187 
11188    return true;
11189 }
11190 
11191 ALWAYS_INLINE static void
11192 radv_after_draw(struct radv_cmd_buffer *cmd_buffer, bool dgc)
11193 {
11194    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
11195    const struct radv_physical_device *pdev = radv_device_physical(device);
11196    const struct radeon_info *gpu_info = &pdev->info;
11197    bool has_prefetch = pdev->info.gfx_level >= GFX7;
11198    /* Start prefetches after the draw has been started. Both will
11199     * run in parallel, but starting the draw first is more
11200     * important.
11201     */
11202    if (has_prefetch && cmd_buffer->state.prefetch_L2_mask) {
11203       radv_emit_prefetch_L2(cmd_buffer, false);
11204    }
11205 
11206    /* Workaround for a VGT hang when streamout is enabled.
11207     * It must be done after drawing.
11208     */
11209    if (radv_is_streamout_enabled(cmd_buffer) &&
11210        (gpu_info->family == CHIP_HAWAII || gpu_info->family == CHIP_TONGA || gpu_info->family == CHIP_FIJI)) {
11211       cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_VGT_STREAMOUT_SYNC;
11212    }
11213 
11214    radv_cmd_buffer_after_draw(cmd_buffer, RADV_CMD_FLAG_PS_PARTIAL_FLUSH, dgc);
11215 }
11216 
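/* The vkCmdDraw* entry points below all follow the same pattern: fill a
 * radv_draw_info, validate and emit state in radv_before_draw() (or the
 * task/mesh variant), emit the draw packets, then run radv_after_draw() for
 * prefetches and post-draw workarounds.
 */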
11217 VKAPI_ATTR void VKAPI_CALL
11218 radv_CmdDraw(VkCommandBuffer commandBuffer, uint32_t vertexCount, uint32_t instanceCount, uint32_t firstVertex,
11219              uint32_t firstInstance)
11220 {
11221    VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
11222    struct radv_draw_info info;
11223 
11224    info.count = vertexCount;
11225    info.instance_count = instanceCount;
11226    info.first_instance = firstInstance;
11227    info.strmout_buffer = NULL;
11228    info.indirect = NULL;
11229    info.indexed = false;
11230 
11231    if (!radv_before_draw(cmd_buffer, &info, 1, false))
11232       return;
11233    const VkMultiDrawInfoEXT minfo = {firstVertex, vertexCount};
11234    radv_emit_direct_draw_packets(cmd_buffer, &info, 1, &minfo, 0, 0);
11235    radv_after_draw(cmd_buffer, false);
11236 }
11237 
11238 VKAPI_ATTR void VKAPI_CALL
11239 radv_CmdDrawMultiEXT(VkCommandBuffer commandBuffer, uint32_t drawCount, const VkMultiDrawInfoEXT *pVertexInfo,
11240                      uint32_t instanceCount, uint32_t firstInstance, uint32_t stride)
11241 {
11242    VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
11243    struct radv_draw_info info;
11244 
11245    if (!drawCount)
11246       return;
11247 
11248    info.count = pVertexInfo->vertexCount;
11249    info.instance_count = instanceCount;
11250    info.first_instance = firstInstance;
11251    info.strmout_buffer = NULL;
11252    info.indirect = NULL;
11253    info.indexed = false;
11254 
11255    if (!radv_before_draw(cmd_buffer, &info, drawCount, false))
11256       return;
11257    radv_emit_direct_draw_packets(cmd_buffer, &info, drawCount, pVertexInfo, 0, stride);
11258    radv_after_draw(cmd_buffer, false);
11259 }
11260 
11261 VKAPI_ATTR void VKAPI_CALL
11262 radv_CmdDrawIndexed(VkCommandBuffer commandBuffer, uint32_t indexCount, uint32_t instanceCount, uint32_t firstIndex,
11263                     int32_t vertexOffset, uint32_t firstInstance)
11264 {
11265    VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
11266    struct radv_draw_info info;
11267 
11268    info.indexed = true;
11269    info.count = indexCount;
11270    info.instance_count = instanceCount;
11271    info.first_instance = firstInstance;
11272    info.strmout_buffer = NULL;
11273    info.indirect = NULL;
11274 
11275    if (!radv_before_draw(cmd_buffer, &info, 1, false))
11276       return;
11277    const VkMultiDrawIndexedInfoEXT minfo = {firstIndex, indexCount, vertexOffset};
11278    radv_emit_draw_packets_indexed(cmd_buffer, &info, 1, &minfo, 0, NULL);
11279    radv_after_draw(cmd_buffer, false);
11280 }
11281 
11282 VKAPI_ATTR void VKAPI_CALL
11283 radv_CmdDrawMultiIndexedEXT(VkCommandBuffer commandBuffer, uint32_t drawCount,
11284                             const VkMultiDrawIndexedInfoEXT *pIndexInfo, uint32_t instanceCount, uint32_t firstInstance,
11285                             uint32_t stride, const int32_t *pVertexOffset)
11286 {
11287    VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
11288    struct radv_draw_info info;
11289 
11290    if (!drawCount)
11291       return;
11292 
11293    const VkMultiDrawIndexedInfoEXT *minfo = pIndexInfo;
11294    info.indexed = true;
11295    info.count = minfo->indexCount;
11296    info.instance_count = instanceCount;
11297    info.first_instance = firstInstance;
11298    info.strmout_buffer = NULL;
11299    info.indirect = NULL;
11300 
11301    if (!radv_before_draw(cmd_buffer, &info, drawCount, false))
11302       return;
11303    radv_emit_draw_packets_indexed(cmd_buffer, &info, drawCount, pIndexInfo, stride, pVertexOffset);
11304    radv_after_draw(cmd_buffer, false);
11305 }
11306 
11307 VKAPI_ATTR void VKAPI_CALL
11308 radv_CmdDrawIndirect(VkCommandBuffer commandBuffer, VkBuffer _buffer, VkDeviceSize offset, uint32_t drawCount,
11309                      uint32_t stride)
11310 {
11311    VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
11312    VK_FROM_HANDLE(radv_buffer, buffer, _buffer);
11313    struct radv_draw_info info;
11314 
11315    info.count = drawCount;
11316    info.indirect = buffer;
11317    info.indirect_offset = offset;
11318    info.stride = stride;
11319    info.strmout_buffer = NULL;
11320    info.count_buffer = NULL;
11321    info.indexed = false;
11322    info.instance_count = 0;
11323 
11324    if (!radv_before_draw(cmd_buffer, &info, 1, false))
11325       return;
11326    radv_emit_indirect_draw_packets(cmd_buffer, &info);
11327    radv_after_draw(cmd_buffer, false);
11328 }
11329 
11330 VKAPI_ATTR void VKAPI_CALL
11331 radv_CmdDrawIndexedIndirect(VkCommandBuffer commandBuffer, VkBuffer _buffer, VkDeviceSize offset, uint32_t drawCount,
11332                             uint32_t stride)
11333 {
11334    VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
11335    VK_FROM_HANDLE(radv_buffer, buffer, _buffer);
11336    struct radv_draw_info info;
11337 
11338    info.indexed = true;
11339    info.count = drawCount;
11340    info.indirect = buffer;
11341    info.indirect_offset = offset;
11342    info.stride = stride;
11343    info.count_buffer = NULL;
11344    info.strmout_buffer = NULL;
11345    info.instance_count = 0;
11346 
11347    if (!radv_before_draw(cmd_buffer, &info, 1, false))
11348       return;
11349    radv_emit_indirect_draw_packets(cmd_buffer, &info);
11350    radv_after_draw(cmd_buffer, false);
11351 }
11352 
11353 VKAPI_ATTR void VKAPI_CALL
11354 radv_CmdDrawIndirectCount(VkCommandBuffer commandBuffer, VkBuffer _buffer, VkDeviceSize offset, VkBuffer _countBuffer,
11355                           VkDeviceSize countBufferOffset, uint32_t maxDrawCount, uint32_t stride)
11356 {
11357    VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
11358    VK_FROM_HANDLE(radv_buffer, buffer, _buffer);
11359    VK_FROM_HANDLE(radv_buffer, count_buffer, _countBuffer);
11360    struct radv_draw_info info;
11361 
11362    info.count = maxDrawCount;
11363    info.indirect = buffer;
11364    info.indirect_offset = offset;
11365    info.count_buffer = count_buffer;
11366    info.count_buffer_offset = countBufferOffset;
11367    info.stride = stride;
11368    info.strmout_buffer = NULL;
11369    info.indexed = false;
11370    info.instance_count = 0;
11371 
11372    if (!radv_before_draw(cmd_buffer, &info, 1, false))
11373       return;
11374    radv_emit_indirect_draw_packets(cmd_buffer, &info);
11375    radv_after_draw(cmd_buffer, false);
11376 }
11377 
11378 VKAPI_ATTR void VKAPI_CALL
11379 radv_CmdDrawIndexedIndirectCount(VkCommandBuffer commandBuffer, VkBuffer _buffer, VkDeviceSize offset,
11380                                  VkBuffer _countBuffer, VkDeviceSize countBufferOffset, uint32_t maxDrawCount,
11381                                  uint32_t stride)
11382 {
11383    VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
11384    VK_FROM_HANDLE(radv_buffer, buffer, _buffer);
11385    VK_FROM_HANDLE(radv_buffer, count_buffer, _countBuffer);
11386    struct radv_draw_info info;
11387 
11388    info.indexed = true;
11389    info.count = maxDrawCount;
11390    info.indirect = buffer;
11391    info.indirect_offset = offset;
11392    info.count_buffer = count_buffer;
11393    info.count_buffer_offset = countBufferOffset;
11394    info.stride = stride;
11395    info.strmout_buffer = NULL;
11396    info.instance_count = 0;
11397 
11398    if (!radv_before_draw(cmd_buffer, &info, 1, false))
11399       return;
11400    radv_emit_indirect_draw_packets(cmd_buffer, &info);
11401    radv_after_draw(cmd_buffer, false);
11402 }
11403 
11404 VKAPI_ATTR void VKAPI_CALL
11405 radv_CmdDrawMeshTasksEXT(VkCommandBuffer commandBuffer, uint32_t x, uint32_t y, uint32_t z)
11406 {
11407    VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
11408    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
11409    struct radv_draw_info info;
11410 
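   /* For a direct mesh/task draw, the draw count tracks the total number of dispatched workgroups. */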
11411    info.count = x * y * z;
11412    info.instance_count = 1;
11413    info.first_instance = 0;
11414    info.stride = 0;
11415    info.indexed = false;
11416    info.strmout_buffer = NULL;
11417    info.count_buffer = NULL;
11418    info.indirect = NULL;
11419 
11420    if (!radv_before_taskmesh_draw(cmd_buffer, &info, 1, false))
11421       return;
11422 
11423    if (radv_cmdbuf_has_stage(cmd_buffer, MESA_SHADER_TASK)) {
11424       radv_emit_direct_taskmesh_draw_packets(device, &cmd_buffer->state, cmd_buffer->cs, cmd_buffer->gang.cs, x, y, z);
11425    } else {
11426       radv_emit_direct_mesh_draw_packet(cmd_buffer, x, y, z);
11427    }
11428 
11429    radv_after_draw(cmd_buffer, false);
11430 }
11431 
11432 VKAPI_ATTR void VKAPI_CALL
11433 radv_CmdDrawMeshTasksIndirectEXT(VkCommandBuffer commandBuffer, VkBuffer _buffer, VkDeviceSize offset,
11434                                  uint32_t drawCount, uint32_t stride)
11435 {
11436    if (!drawCount)
11437       return;
11438 
11439    VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
11440    VK_FROM_HANDLE(radv_buffer, buffer, _buffer);
11441    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
11442    struct radv_draw_info info;
11443 
11444    info.indirect = buffer;
11445    info.indirect_offset = offset;
11446    info.stride = stride;
11447    info.count = drawCount;
11448    info.strmout_buffer = NULL;
11449    info.count_buffer = NULL;
11450    info.indexed = false;
11451    info.instance_count = 0;
11452 
11453    if (!radv_before_taskmesh_draw(cmd_buffer, &info, drawCount, false))
11454       return;
11455 
11456    if (radv_cmdbuf_has_stage(cmd_buffer, MESA_SHADER_TASK)) {
11457       radv_emit_indirect_taskmesh_draw_packets(device, &cmd_buffer->state, cmd_buffer->cs, cmd_buffer->gang.cs, &info,
11458                                                0);
11459    } else {
11460       radv_emit_indirect_mesh_draw_packets(cmd_buffer, &info);
11461    }
11462 
11463    radv_after_draw(cmd_buffer, false);
11464 }
11465 
11466 VKAPI_ATTR void VKAPI_CALL
11467 radv_CmdDrawMeshTasksIndirectCountEXT(VkCommandBuffer commandBuffer, VkBuffer _buffer, VkDeviceSize offset,
11468                                       VkBuffer _countBuffer, VkDeviceSize countBufferOffset, uint32_t maxDrawCount,
11469                                       uint32_t stride)
11470 {
11471 
11472    VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
11473    VK_FROM_HANDLE(radv_buffer, buffer, _buffer);
11474    VK_FROM_HANDLE(radv_buffer, count_buffer, _countBuffer);
11475    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
11476    const struct radv_physical_device *pdev = radv_device_physical(device);
11477    struct radv_draw_info info;
11478 
11479    info.indirect = buffer;
11480    info.indirect_offset = offset;
11481    info.stride = stride;
11482    info.count = maxDrawCount;
11483    info.strmout_buffer = NULL;
11484    info.count_buffer = count_buffer;
11485    info.count_buffer_offset = countBufferOffset;
11486    info.indexed = false;
11487    info.instance_count = 0;
11488 
11489    if (!radv_before_taskmesh_draw(cmd_buffer, &info, maxDrawCount, false))
11490       return;
11491 
11492    if (radv_cmdbuf_has_stage(cmd_buffer, MESA_SHADER_TASK)) {
11493       uint64_t workaround_cond_va = 0;
11494 
11495       if (pdev->info.has_taskmesh_indirect0_bug && info.count_buffer) {
11496          /* Allocate a 32-bit value for the MEC firmware bug workaround. */
11497          uint32_t workaround_cond_init = 0;
11498          uint32_t workaround_cond_off;
11499 
11500          if (!radv_cmd_buffer_upload_data(cmd_buffer, 4, &workaround_cond_init, &workaround_cond_off))
11501             vk_command_buffer_set_error(&cmd_buffer->vk, VK_ERROR_OUT_OF_HOST_MEMORY);
11502 
11503          workaround_cond_va = radv_buffer_get_va(cmd_buffer->upload.upload_bo) + workaround_cond_off;
11504       }
11505 
11506       radv_emit_indirect_taskmesh_draw_packets(device, &cmd_buffer->state, cmd_buffer->cs, cmd_buffer->gang.cs, &info,
11507                                                workaround_cond_va);
11508    } else {
11509       radv_emit_indirect_mesh_draw_packets(cmd_buffer, &info);
11510    }
11511 
11512    radv_after_draw(cmd_buffer, false);
11513 }
11514 
11515 /* TODO: Use these functions with the normal dispatch path. */
11516 static void radv_dgc_before_dispatch(struct radv_cmd_buffer *cmd_buffer);
11517 static void radv_dgc_after_dispatch(struct radv_cmd_buffer *cmd_buffer);
11518 
11519 VKAPI_ATTR void VKAPI_CALL
11520 radv_CmdPreprocessGeneratedCommandsNV(VkCommandBuffer commandBuffer,
11521                                       const VkGeneratedCommandsInfoNV *pGeneratedCommandsInfo)
11522 {
11523    VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
11524    VK_FROM_HANDLE(radv_indirect_command_layout, layout, pGeneratedCommandsInfo->indirectCommandsLayout);
11525    VK_FROM_HANDLE(radv_pipeline, pipeline, pGeneratedCommandsInfo->pipeline);
11526 
11527    if (!radv_dgc_can_preprocess(layout, pipeline))
11528       return;
11529 
11530    /* VK_EXT_conditional_rendering says that copy commands should not be
11531     * affected by conditional rendering.
11532     */
11533    const bool old_predicating = cmd_buffer->state.predicating;
11534    cmd_buffer->state.predicating = false;
11535 
11536    radv_prepare_dgc(cmd_buffer, pGeneratedCommandsInfo, false);
11537 
11538    /* Restore conditional rendering. */
11539    cmd_buffer->state.predicating = old_predicating;
11540 }
11541 
11542 static void
11543 radv_dgc_execute_ib(struct radv_cmd_buffer *cmd_buffer, const VkGeneratedCommandsInfoNV *pGeneratedCommandsInfo)
11544 {
11545    VK_FROM_HANDLE(radv_buffer, prep_buffer, pGeneratedCommandsInfo->preprocessBuffer);
11546    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
11547    const bool has_task_shader = radv_dgc_with_task_shader(pGeneratedCommandsInfo);
11548 
11549    const uint32_t cmdbuf_size = radv_get_indirect_gfx_cmdbuf_size(pGeneratedCommandsInfo);
11550    const uint64_t ib_va =
11551       radv_buffer_get_va(prep_buffer->bo) + prep_buffer->offset + pGeneratedCommandsInfo->preprocessOffset;
11552 
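   /* Chain to the prepared DGC IB; the size is converted from bytes to dwords for cs_execute_ib. */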
11553    device->ws->cs_execute_ib(cmd_buffer->cs, NULL, ib_va, cmdbuf_size >> 2, cmd_buffer->state.predicating);
11554 
11555    if (has_task_shader) {
11556       const uint32_t ace_cmdbuf_size = radv_get_indirect_ace_cmdbuf_size(pGeneratedCommandsInfo);
11557       const uint64_t ace_ib_va = ib_va + radv_get_indirect_ace_cmdbuf_offset(pGeneratedCommandsInfo);
11558 
11559       assert(cmd_buffer->gang.cs);
11560       device->ws->cs_execute_ib(cmd_buffer->gang.cs, NULL, ace_ib_va, ace_cmdbuf_size >> 2,
11561                                 cmd_buffer->state.predicating);
11562    }
11563 }
11564 
11565 VKAPI_ATTR void VKAPI_CALL
11566 radv_CmdExecuteGeneratedCommandsNV(VkCommandBuffer commandBuffer, VkBool32 isPreprocessed,
11567                                    const VkGeneratedCommandsInfoNV *pGeneratedCommandsInfo)
11568 {
11569    VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
11570    VK_FROM_HANDLE(radv_indirect_command_layout, layout, pGeneratedCommandsInfo->indirectCommandsLayout);
11571    VK_FROM_HANDLE(radv_pipeline, pipeline, pGeneratedCommandsInfo->pipeline);
11572    VK_FROM_HANDLE(radv_buffer, prep_buffer, pGeneratedCommandsInfo->preprocessBuffer);
11573    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
11574    const bool compute = layout->pipeline_bind_point == VK_PIPELINE_BIND_POINT_COMPUTE;
11575    const bool use_predication = radv_use_dgc_predication(cmd_buffer, pGeneratedCommandsInfo);
11576    const struct radv_physical_device *pdev = radv_device_physical(device);
11577 
11578    /* Secondary command buffers are needed for the full extension but can't use
11579     * PKT3_INDIRECT_BUFFER.
11580     */
11581    assert(cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY);
11582 
11583    if (use_predication) {
11584       VK_FROM_HANDLE(radv_buffer, seq_count_buffer, pGeneratedCommandsInfo->sequencesCountBuffer);
11585       const uint64_t va = radv_buffer_get_va(seq_count_buffer->bo) + seq_count_buffer->offset +
11586                           pGeneratedCommandsInfo->sequencesCountOffset;
11587 
11588       radv_begin_conditional_rendering(cmd_buffer, va, true);
11589    }
11590 
11591    if (!radv_dgc_can_preprocess(layout, pipeline)) {
11592       /* Suspend conditional rendering when the DGC execute is called on the compute queue to
11593        * generate a cmdbuf which skips dispatches when necessary. This is because the
11594        * compute queue is missing IB2, which means it's not possible to skip the cmdbuf entirely.
11595        * It should also be suspended when task shaders are used because the DGC ACE IB would be
11596        * uninitialized otherwise.
11597        */
11598       const bool suspend_cond_render =
11599          (cmd_buffer->qf == RADV_QUEUE_COMPUTE || radv_dgc_with_task_shader(pGeneratedCommandsInfo));
11600       const bool old_predicating = cmd_buffer->state.predicating;
11601 
11602       if (suspend_cond_render && cmd_buffer->state.predicating) {
11603          cmd_buffer->state.predicating = false;
11604       }
11605 
11606       radv_prepare_dgc(cmd_buffer, pGeneratedCommandsInfo, old_predicating);
11607 
11608       if (suspend_cond_render) {
11609          cmd_buffer->state.predicating = old_predicating;
11610       }
11611 
11612       cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_CS_PARTIAL_FLUSH | RADV_CMD_FLAG_INV_VCACHE | RADV_CMD_FLAG_INV_L2;
11613 
11614       if (radv_dgc_with_task_shader(pGeneratedCommandsInfo)) {
11615          /* Make sure the DGC ACE IB will wait for the DGC prepare shader before the execution
11616           * starts.
11617           */
11618          radv_gang_barrier(cmd_buffer, VK_PIPELINE_STAGE_2_COMMAND_PREPROCESS_BIT_NV,
11619                            VK_PIPELINE_STAGE_2_DRAW_INDIRECT_BIT);
11620       }
11621    }
11622 
11623    if (compute) {
11624       radv_dgc_before_dispatch(cmd_buffer);
11625 
11626       if (!pGeneratedCommandsInfo->pipeline)
11627          cmd_buffer->has_indirect_pipeline_binds = true;
11628    } else {
11629       struct radv_graphics_pipeline *graphics_pipeline = radv_pipeline_to_graphics(pipeline);
11630       struct radv_draw_info info;
11631 
11632       info.count = pGeneratedCommandsInfo->sequencesCount;
11633       info.indirect = prep_buffer; /* We're not really going to use it this way, but it's a good
11634                                       signal that this is not direct. */
11635       info.indirect_offset = 0;
11636       info.stride = 0;
11637       info.strmout_buffer = NULL;
11638       info.count_buffer = NULL;
11639       info.indexed = layout->indexed;
11640       info.instance_count = 0;
11641 
11642       if (radv_pipeline_has_stage(graphics_pipeline, MESA_SHADER_MESH)) {
11643          if (!radv_before_taskmesh_draw(cmd_buffer, &info, 1, true))
11644             return;
11645       } else {
11646          if (!radv_before_draw(cmd_buffer, &info, 1, true))
11647             return;
11648       }
11649    }
11650 
11651    const uint32_t view_mask = cmd_buffer->state.render.view_mask;
11652 
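   /* Stall the PFP until the ME has caught up so the generated IB isn't fetched before it has been
    * written. The MEC has no PFP, so this only applies to the gfx queue.
    */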
11653    if (!radv_cmd_buffer_uses_mec(cmd_buffer)) {
11654       radeon_emit(cmd_buffer->cs, PKT3(PKT3_PFP_SYNC_ME, 0, cmd_buffer->state.predicating));
11655       radeon_emit(cmd_buffer->cs, 0);
11656    }
11657 
11658    radv_cs_add_buffer(device->ws, cmd_buffer->cs, prep_buffer->bo);
11659 
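   /* With multiview enabled, execute the generated commands once per active view. */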
11660    if (compute || !view_mask) {
11661       radv_dgc_execute_ib(cmd_buffer, pGeneratedCommandsInfo);
11662    } else {
11663       u_foreach_bit (view, view_mask) {
11664          radv_emit_view_index(&cmd_buffer->state, cmd_buffer->cs, view);
11665 
11666          radv_dgc_execute_ib(cmd_buffer, pGeneratedCommandsInfo);
11667       }
11668    }
11669 
11670    if (compute) {
11671       cmd_buffer->push_constant_stages |= VK_SHADER_STAGE_COMPUTE_BIT;
11672 
11673       if (!pGeneratedCommandsInfo->pipeline)
11674          radv_mark_descriptor_sets_dirty(cmd_buffer, VK_PIPELINE_BIND_POINT_COMPUTE);
11675 
11676       radv_dgc_after_dispatch(cmd_buffer);
11677    } else {
11678       struct radv_graphics_pipeline *graphics_pipeline = radv_pipeline_to_graphics(pipeline);
11679 
11680       if (layout->binds_index_buffer) {
11681          cmd_buffer->state.last_index_type = -1;
11682          cmd_buffer->state.dirty |= RADV_CMD_DIRTY_INDEX_BUFFER;
11683       }
11684 
11685       if (layout->bind_vbo_mask)
11686          cmd_buffer->state.dirty |= RADV_CMD_DIRTY_VERTEX_BUFFER;
11687 
11688       cmd_buffer->push_constant_stages |= graphics_pipeline->active_stages;
11689 
11690       if (!layout->indexed && pdev->info.gfx_level >= GFX7) {
11691          /* On GFX7 and later, non-indexed draws overwrite VGT_INDEX_TYPE, so the state must be
11692           * re-emitted before the next indexed draw.
11693           */
11694          cmd_buffer->state.last_index_type = -1;
11695       }
11696 
11697       cmd_buffer->state.last_num_instances = -1;
11698       cmd_buffer->state.last_vertex_offset_valid = false;
11699       cmd_buffer->state.last_first_instance = -1;
11700       cmd_buffer->state.last_drawid = -1;
11701 
11702       radv_after_draw(cmd_buffer, true);
11703    }
11704 
11705    if (use_predication) {
11706       radv_end_conditional_rendering(cmd_buffer);
11707    }
11708 }
11709 
11710 static void
11711 radv_save_dispatch_size(struct radv_cmd_buffer *cmd_buffer, uint64_t indirect_va)
11712 {
11713    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
11714 
11715    struct radeon_cmdbuf *cs = cmd_buffer->cs;
11716    radeon_check_space(device->ws, cs, 18);
11717 
11718    uint64_t va = radv_buffer_get_va(device->trace_bo) + offsetof(struct radv_trace_data, indirect_dispatch);
11719 
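   /* Copy the three indirect dispatch size dwords next to the trace data so they are available when
    * debugging a GPU hang.
    */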
11720    for (uint32_t i = 0; i < 3; i++) {
11721       radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
11722       radeon_emit(cs,
11723                   COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) | COPY_DATA_WR_CONFIRM);
11724       radeon_emit(cs, indirect_va);
11725       radeon_emit(cs, indirect_va >> 32);
11726       radeon_emit(cs, va);
11727       radeon_emit(cs, va >> 32);
11728 
11729       indirect_va += 4;
11730       va += 4;
11731    }
11732 }
11733 
11734 static void
11735 radv_emit_dispatch_packets(struct radv_cmd_buffer *cmd_buffer, const struct radv_shader *compute_shader,
11736                            const struct radv_dispatch_info *info)
11737 {
11738    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
11739    const struct radv_physical_device *pdev = radv_device_physical(device);
11740    unsigned dispatch_initiator = device->dispatch_initiator;
11741    struct radeon_winsys *ws = device->ws;
11742    bool predicating = cmd_buffer->state.predicating;
11743    struct radeon_cmdbuf *cs = cmd_buffer->cs;
11744    const uint32_t grid_size_offset = radv_get_user_sgpr_loc(compute_shader, AC_UD_CS_GRID_SIZE);
11745 
11746    radv_describe_dispatch(cmd_buffer, info);
11747 
11748    ASSERTED unsigned cdw_max = radeon_check_space(ws, cs, 30);
11749 
11750    if (compute_shader->info.wave_size == 32) {
11751       assert(pdev->info.gfx_level >= GFX10);
11752       dispatch_initiator |= S_00B800_CS_W32_EN(1);
11753    }
11754 
11755    if (info->ordered)
11756       dispatch_initiator &= ~S_00B800_ORDER_MODE(1);
11757 
11758    if (info->va) {
11759       if (radv_device_fault_detection_enabled(device))
11760          radv_save_dispatch_size(cmd_buffer, info->va);
11761 
11762       if (info->indirect)
11763          radv_cs_add_buffer(ws, cs, info->indirect);
11764 
11765       if (info->unaligned) {
11766          radeon_set_sh_reg_seq(cs, R_00B81C_COMPUTE_NUM_THREAD_X, 3);
11767          if (pdev->info.gfx_level >= GFX12) {
11768             radeon_emit(cs, S_00B81C_NUM_THREAD_FULL_GFX12(compute_shader->info.cs.block_size[0]));
11769             radeon_emit(cs, S_00B820_NUM_THREAD_FULL_GFX12(compute_shader->info.cs.block_size[1]));
11770          } else {
11771             radeon_emit(cs, S_00B81C_NUM_THREAD_FULL_GFX6(compute_shader->info.cs.block_size[0]));
11772             radeon_emit(cs, S_00B820_NUM_THREAD_FULL_GFX6(compute_shader->info.cs.block_size[1]));
11773          }
11774          radeon_emit(cs, S_00B824_NUM_THREAD_FULL(compute_shader->info.cs.block_size[2]));
11775 
11776          dispatch_initiator |= S_00B800_USE_THREAD_DIMENSIONS(1);
11777       }
11778 
11779       if (grid_size_offset) {
11780          if (device->load_grid_size_from_user_sgpr) {
11781             assert(pdev->info.gfx_level >= GFX10_3);
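            /* LOAD_SH_REG_INDEX loads the three grid-size dwords from memory directly into the user SGPRs. */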
11782             radeon_emit(cs, PKT3(PKT3_LOAD_SH_REG_INDEX, 3, 0));
11783             radeon_emit(cs, info->va);
11784             radeon_emit(cs, info->va >> 32);
11785             radeon_emit(cs, (grid_size_offset - SI_SH_REG_OFFSET) >> 2);
11786             radeon_emit(cs, 3);
11787          } else {
11788             radv_emit_shader_pointer(device, cmd_buffer->cs, grid_size_offset, info->va, true);
11789          }
11790       }
11791 
11792       if (radv_cmd_buffer_uses_mec(cmd_buffer)) {
11793          uint64_t indirect_va = info->va;
11794          const bool needs_align32_workaround = pdev->info.has_async_compute_align32_bug &&
11795                                                cmd_buffer->qf == RADV_QUEUE_COMPUTE &&
11796                                                !util_is_aligned(indirect_va, 32);
11797          const unsigned ace_predication_size =
11798             4 /* DISPATCH_INDIRECT */ + (needs_align32_workaround ? 6 * 3 /* 3x COPY_DATA */ : 0);
11799 
11800          radv_cs_emit_compute_predication(device, &cmd_buffer->state, cs, cmd_buffer->state.mec_inv_pred_va,
11801                                           &cmd_buffer->state.mec_inv_pred_emitted, ace_predication_size);
11802 
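         /* Some MEC firmware requires 32-byte aligned indirect arguments; when the workaround applies,
          * copy them into an aligned slot of the upload BO and dispatch from there.
          */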
11803          if (needs_align32_workaround) {
11804             const uint64_t unaligned_va = indirect_va;
11805             UNUSED void *ptr;
11806             uint32_t offset;
11807 
11808             if (!radv_cmd_buffer_upload_alloc_aligned(cmd_buffer, sizeof(VkDispatchIndirectCommand), 32, &offset, &ptr))
11809                return;
11810 
11811             indirect_va = radv_buffer_get_va(cmd_buffer->upload.upload_bo) + offset;
11812 
11813             for (uint32_t i = 0; i < 3; i++) {
11814                const uint64_t src_va = unaligned_va + i * 4;
11815                const uint64_t dst_va = indirect_va + i * 4;
11816 
11817                radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
11818                radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) |
11819                                   COPY_DATA_WR_CONFIRM);
11820                radeon_emit(cs, src_va);
11821                radeon_emit(cs, src_va >> 32);
11822                radeon_emit(cs, dst_va);
11823                radeon_emit(cs, dst_va >> 32);
11824             }
11825          }
11826 
11827          radeon_emit(cs, PKT3(PKT3_DISPATCH_INDIRECT, 2, 0) | PKT3_SHADER_TYPE_S(1));
11828          radeon_emit(cs, indirect_va);
11829          radeon_emit(cs, indirect_va >> 32);
11830          radeon_emit(cs, dispatch_initiator);
11831       } else {
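         /* On the gfx queue, program the indirect arguments base via SET_BASE (index 1) and issue
          * DISPATCH_INDIRECT with a zero offset relative to it.
          */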
11832          radeon_emit(cs, PKT3(PKT3_SET_BASE, 2, 0) | PKT3_SHADER_TYPE_S(1));
11833          radeon_emit(cs, 1);
11834          radeon_emit(cs, info->va);
11835          radeon_emit(cs, info->va >> 32);
11836 
11837          if (cmd_buffer->qf == RADV_QUEUE_COMPUTE) {
11838             radv_cs_emit_compute_predication(device, &cmd_buffer->state, cs, cmd_buffer->state.mec_inv_pred_va,
11839                                              &cmd_buffer->state.mec_inv_pred_emitted, 3 /* PKT3_DISPATCH_INDIRECT */);
11840             predicating = false;
11841          }
11842 
11843          radeon_emit(cs, PKT3(PKT3_DISPATCH_INDIRECT, 1, predicating) | PKT3_SHADER_TYPE_S(1));
11844          radeon_emit(cs, 0);
11845          radeon_emit(cs, dispatch_initiator);
11846       }
11847    } else {
11848       const unsigned *cs_block_size = compute_shader->info.cs.block_size;
11849       unsigned blocks[3] = {info->blocks[0], info->blocks[1], info->blocks[2]};
11850       unsigned offsets[3] = {info->offsets[0], info->offsets[1], info->offsets[2]};
11851 
11852       if (info->unaligned) {
11853          unsigned remainder[3];
11854 
11855          /* If aligned, these should be an entire block size,
11856           * not 0.
11857           */
11858          remainder[0] = blocks[0] + cs_block_size[0] - ALIGN_NPOT(blocks[0], cs_block_size[0]);
11859          remainder[1] = blocks[1] + cs_block_size[1] - ALIGN_NPOT(blocks[1], cs_block_size[1]);
11860          remainder[2] = blocks[2] + cs_block_size[2] - ALIGN_NPOT(blocks[2], cs_block_size[2]);
11861 
11862          blocks[0] = DIV_ROUND_UP(blocks[0], cs_block_size[0]);
11863          blocks[1] = DIV_ROUND_UP(blocks[1], cs_block_size[1]);
11864          blocks[2] = DIV_ROUND_UP(blocks[2], cs_block_size[2]);
11865 
11866          for (unsigned i = 0; i < 3; ++i) {
11867             assert(offsets[i] % cs_block_size[i] == 0);
11868             offsets[i] /= cs_block_size[i];
11869          }
11870 
11871          radeon_set_sh_reg_seq(cs, R_00B81C_COMPUTE_NUM_THREAD_X, 3);
11872          if (pdev->info.gfx_level >= GFX12) {
11873             radeon_emit(cs,
11874                         S_00B81C_NUM_THREAD_FULL_GFX12(cs_block_size[0]) | S_00B81C_NUM_THREAD_PARTIAL(remainder[0]));
11875             radeon_emit(cs,
11876                         S_00B820_NUM_THREAD_FULL_GFX12(cs_block_size[1]) | S_00B820_NUM_THREAD_PARTIAL(remainder[1]));
11877          } else {
11878             radeon_emit(cs,
11879                         S_00B81C_NUM_THREAD_FULL_GFX6(cs_block_size[0]) | S_00B81C_NUM_THREAD_PARTIAL(remainder[0]));
11880             radeon_emit(cs,
11881                         S_00B820_NUM_THREAD_FULL_GFX6(cs_block_size[1]) | S_00B820_NUM_THREAD_PARTIAL(remainder[1]));
11882          }
11883          radeon_emit(cs, S_00B824_NUM_THREAD_FULL(cs_block_size[2]) | S_00B824_NUM_THREAD_PARTIAL(remainder[2]));
11884 
11885          dispatch_initiator |= S_00B800_PARTIAL_TG_EN(1);
11886       }
11887 
11888       if (grid_size_offset) {
11889          if (device->load_grid_size_from_user_sgpr) {
11890             radeon_set_sh_reg_seq(cs, grid_size_offset, 3);
11891             radeon_emit(cs, blocks[0]);
11892             radeon_emit(cs, blocks[1]);
11893             radeon_emit(cs, blocks[2]);
11894          } else {
11895             uint32_t offset;
11896             if (!radv_cmd_buffer_upload_data(cmd_buffer, 12, blocks, &offset))
11897                return;
11898 
11899             uint64_t va = radv_buffer_get_va(cmd_buffer->upload.upload_bo) + offset;
11900             radv_emit_shader_pointer(device, cmd_buffer->cs, grid_size_offset, va, true);
11901          }
11902       }
11903 
11904       if (offsets[0] || offsets[1] || offsets[2]) {
11905          radeon_set_sh_reg_seq(cs, R_00B810_COMPUTE_START_X, 3);
11906          radeon_emit(cs, offsets[0]);
11907          radeon_emit(cs, offsets[1]);
11908          radeon_emit(cs, offsets[2]);
11909 
11910          /* The blocks in the packet are not counts but end values. */
11911          for (unsigned i = 0; i < 3; ++i)
11912             blocks[i] += offsets[i];
11913       } else {
11914          dispatch_initiator |= S_00B800_FORCE_START_AT_000(1);
11915       }
11916 
11917       if (cmd_buffer->qf == RADV_QUEUE_COMPUTE) {
11918          radv_cs_emit_compute_predication(device, &cmd_buffer->state, cs, cmd_buffer->state.mec_inv_pred_va,
11919                                           &cmd_buffer->state.mec_inv_pred_emitted, 5 /* DISPATCH_DIRECT size */);
11920          predicating = false;
11921       }
11922 
11923       if (pdev->info.has_async_compute_threadgroup_bug && cmd_buffer->qf == RADV_QUEUE_COMPUTE) {
11924          for (unsigned i = 0; i < 3; i++) {
11925             if (info->unaligned) {
11926                /* info->blocks is already in thread dimensions for unaligned dispatches. */
11927                blocks[i] = info->blocks[i];
11928             } else {
11929                /* Force the async compute dispatch to be in "thread" dim mode to work around a hw bug. */
11930                blocks[i] *= cs_block_size[i];
11931             }
11932 
11933             dispatch_initiator |= S_00B800_USE_THREAD_DIMENSIONS(1);
11934          }
11935       }
11936 
11937       radeon_emit(cs, PKT3(PKT3_DISPATCH_DIRECT, 3, predicating) | PKT3_SHADER_TYPE_S(1));
11938       radeon_emit(cs, blocks[0]);
11939       radeon_emit(cs, blocks[1]);
11940       radeon_emit(cs, blocks[2]);
11941       radeon_emit(cs, dispatch_initiator);
11942    }
11943 
11944    assert(cmd_buffer->cs->cdw <= cdw_max);
11945 }
11946 
11947 static void
11948 radv_upload_compute_shader_descriptors(struct radv_cmd_buffer *cmd_buffer, VkPipelineBindPoint bind_point)
11949 {
11950    radv_flush_descriptors(cmd_buffer, VK_SHADER_STAGE_COMPUTE_BIT, bind_point);
11951    const VkShaderStageFlags stages =
11952       bind_point == VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR ? RADV_RT_STAGE_BITS : VK_SHADER_STAGE_COMPUTE_BIT;
11953    const VkShaderStageFlags pc_stages = radv_must_flush_constants(cmd_buffer, stages, bind_point);
11954    if (pc_stages)
11955       radv_flush_constants(cmd_buffer, pc_stages, bind_point);
11956 }
11957 
11958 static void
11959 radv_emit_rt_stack_size(struct radv_cmd_buffer *cmd_buffer)
11960 {
11961    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
11962    const struct radv_shader *rt_prolog = cmd_buffer->state.rt_prolog;
11963    unsigned rsrc2 = rt_prolog->config.rsrc2;
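   /* A non-zero dynamic RT stack size requires scratch, so force SCRATCH_EN in RSRC2. */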
11964    if (cmd_buffer->state.rt_stack_size)
11965       rsrc2 |= S_00B12C_SCRATCH_EN(1);
11966 
11967    radeon_check_space(device->ws, cmd_buffer->cs, 3);
11968    radeon_set_sh_reg(cmd_buffer->cs, rt_prolog->info.regs.pgm_rsrc2, rsrc2);
11969 }
11970 
11971 static void
11972 radv_dispatch(struct radv_cmd_buffer *cmd_buffer, const struct radv_dispatch_info *info,
11973               struct radv_compute_pipeline *pipeline, struct radv_shader *compute_shader,
11974               VkPipelineBindPoint bind_point)
11975 {
11976    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
11977    const struct radv_physical_device *pdev = radv_device_physical(device);
11978    bool has_prefetch = pdev->info.gfx_level >= GFX7;
11979    bool pipeline_is_dirty = pipeline != cmd_buffer->state.emitted_compute_pipeline;
11980 
11981    if (compute_shader->info.cs.regalloc_hang_bug)
11982       cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_PS_PARTIAL_FLUSH | RADV_CMD_FLAG_CS_PARTIAL_FLUSH;
11983 
11984    if (cmd_buffer->state.flush_bits & (RADV_CMD_FLAG_FLUSH_AND_INV_CB | RADV_CMD_FLAG_FLUSH_AND_INV_DB |
11985                                        RADV_CMD_FLAG_PS_PARTIAL_FLUSH | RADV_CMD_FLAG_CS_PARTIAL_FLUSH)) {
11986       /* If we have to wait for idle, set all states first, so that
11987        * all SET packets are processed in parallel with previous draw
11988        * calls. Then upload descriptors, set shader pointers, and
11989        * dispatch, and prefetch at the end. This ensures that the
11990        * time the CUs are idle is very short. (there are only SET_SH
11991        * packets between the wait and the draw)
11992        */
11993       radv_emit_compute_pipeline(cmd_buffer, pipeline);
11994       if (bind_point == VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR)
11995          radv_emit_rt_stack_size(cmd_buffer);
11996       radv_emit_cache_flush(cmd_buffer);
11997       /* <-- CUs are idle here --> */
11998 
11999       radv_upload_compute_shader_descriptors(cmd_buffer, bind_point);
12000 
12001       radv_emit_dispatch_packets(cmd_buffer, compute_shader, info);
12002       /* <-- CUs are busy here --> */
12003 
12004       /* Start prefetches after the dispatch has been started. Both
12005        * will run in parallel, but starting the dispatch first is
12006        * more important.
12007        */
12008       if (has_prefetch && pipeline_is_dirty) {
12009          radv_emit_shader_prefetch(cmd_buffer, compute_shader);
12010       }
12011    } else {
12012       /* If we don't wait for idle, start prefetches first, then set
12013        * states, and dispatch at the end.
12014        */
12015       radv_emit_cache_flush(cmd_buffer);
12016 
12017       if (has_prefetch && pipeline_is_dirty) {
12018          radv_emit_shader_prefetch(cmd_buffer, compute_shader);
12019       }
12020 
12021       radv_upload_compute_shader_descriptors(cmd_buffer, bind_point);
12022 
12023       radv_emit_compute_pipeline(cmd_buffer, pipeline);
12024       if (bind_point == VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR)
12025          radv_emit_rt_stack_size(cmd_buffer);
12026       radv_emit_dispatch_packets(cmd_buffer, compute_shader, info);
12027    }
12028 
12029    if (pipeline_is_dirty) {
12030       /* Raytracing uses compute shaders but has separate bind points and pipelines.
12031        * So if we set compute userdata & shader registers we should dirty the raytracing
12032        * ones and the other way around.
12033        *
12034        * We only need to do this when the pipeline is dirty because when we switch between
12035        * the two we always need to switch pipelines.
12036        */
12037       radv_mark_descriptor_sets_dirty(cmd_buffer, bind_point == VK_PIPELINE_BIND_POINT_COMPUTE
12038                                                      ? VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR
12039                                                      : VK_PIPELINE_BIND_POINT_COMPUTE);
12040    }
12041 
12042    if (compute_shader->info.cs.regalloc_hang_bug)
12043       cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_CS_PARTIAL_FLUSH;
12044 
12045    radv_cmd_buffer_after_draw(cmd_buffer, RADV_CMD_FLAG_CS_PARTIAL_FLUSH, false);
12046 }
12047 
12048 static void
12049 radv_dgc_before_dispatch(struct radv_cmd_buffer *cmd_buffer)
12050 {
12051    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
12052    const struct radv_physical_device *pdev = radv_device_physical(device);
12053    struct radv_compute_pipeline *pipeline = cmd_buffer->state.compute_pipeline;
12054    struct radv_shader *compute_shader = cmd_buffer->state.shaders[MESA_SHADER_COMPUTE];
12055    bool pipeline_is_dirty = pipeline != cmd_buffer->state.emitted_compute_pipeline;
12056 
12057    /* We will have run the DGC patch shaders before, so we can assume that there is something to
12058     * flush. Apart from that, this just splits radv_dispatch in two: a pre-dispatch half and a
12059     * post-dispatch half. */
12060 
12061    if (compute_shader->info.cs.regalloc_hang_bug)
12062       cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_PS_PARTIAL_FLUSH | RADV_CMD_FLAG_CS_PARTIAL_FLUSH;
12063 
12064    if (pipeline)
12065       radv_emit_compute_pipeline(cmd_buffer, pipeline);
12066    radv_emit_cache_flush(cmd_buffer);
12067 
12068    radv_upload_compute_shader_descriptors(cmd_buffer, VK_PIPELINE_BIND_POINT_COMPUTE);
12069 
12070    if (pipeline_is_dirty) {
12071       const bool has_prefetch = pdev->info.gfx_level >= GFX7;
12072 
12073       if (has_prefetch)
12074          radv_emit_shader_prefetch(cmd_buffer, compute_shader);
12075 
12076       /* Raytracing uses compute shaders but has separate bind points and pipelines.
12077        * So if we set compute userdata & shader registers we should dirty the raytracing
12078        * ones and the other way around.
12079        *
12080        * We only need to do this when the pipeline is dirty because when we switch between
12081        * the two we always need to switch pipelines.
12082        */
12083       radv_mark_descriptor_sets_dirty(cmd_buffer, VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR);
12084    }
12085 }
12086 
12087 static void
12088 radv_dgc_after_dispatch(struct radv_cmd_buffer *cmd_buffer)
12089 {
12090    struct radv_shader *compute_shader = cmd_buffer->state.shaders[MESA_SHADER_COMPUTE];
12091 
12092    if (compute_shader->info.cs.regalloc_hang_bug)
12093       cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_CS_PARTIAL_FLUSH;
12094 
12095    radv_cmd_buffer_after_draw(cmd_buffer, RADV_CMD_FLAG_CS_PARTIAL_FLUSH, true);
12096 }
12097 
12098 void
12099 radv_compute_dispatch(struct radv_cmd_buffer *cmd_buffer, const struct radv_dispatch_info *info)
12100 {
12101    radv_dispatch(cmd_buffer, info, cmd_buffer->state.compute_pipeline, cmd_buffer->state.shaders[MESA_SHADER_COMPUTE],
12102                  VK_PIPELINE_BIND_POINT_COMPUTE);
12103 }
12104 
12105 VKAPI_ATTR void VKAPI_CALL
12106 radv_CmdDispatchBase(VkCommandBuffer commandBuffer, uint32_t base_x, uint32_t base_y, uint32_t base_z, uint32_t x,
12107                      uint32_t y, uint32_t z)
12108 {
12109    VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
12110    struct radv_dispatch_info info = {0};
12111 
12112    info.blocks[0] = x;
12113    info.blocks[1] = y;
12114    info.blocks[2] = z;
12115 
12116    info.offsets[0] = base_x;
12117    info.offsets[1] = base_y;
12118    info.offsets[2] = base_z;
12119    radv_compute_dispatch(cmd_buffer, &info);
12120 }
12121 
12122 VKAPI_ATTR void VKAPI_CALL
12123 radv_CmdDispatchIndirect(VkCommandBuffer commandBuffer, VkBuffer _buffer, VkDeviceSize offset)
12124 {
12125    VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
12126    VK_FROM_HANDLE(radv_buffer, buffer, _buffer);
12127    struct radv_dispatch_info info = {0};
12128 
12129    info.indirect = buffer->bo;
12130    info.va = radv_buffer_get_va(buffer->bo) + buffer->offset + offset;
12131 
12132    radv_compute_dispatch(cmd_buffer, &info);
12133 }
12134 
12135 void
12136 radv_unaligned_dispatch(struct radv_cmd_buffer *cmd_buffer, uint32_t x, uint32_t y, uint32_t z)
12137 {
12138    struct radv_dispatch_info info = {0};
12139 
12140    info.blocks[0] = x;
12141    info.blocks[1] = y;
12142    info.blocks[2] = z;
12143    info.unaligned = 1;
12144 
12145    radv_compute_dispatch(cmd_buffer, &info);
12146 }
12147 
12148 void
12149 radv_indirect_dispatch(struct radv_cmd_buffer *cmd_buffer, struct radeon_winsys_bo *bo, uint64_t va)
12150 {
12151    struct radv_dispatch_info info = {0};
12152 
12153    info.indirect = bo;
12154    info.va = va;
12155 
12156    radv_compute_dispatch(cmd_buffer, &info);
12157 }
12158 
12159 static void
12160 radv_trace_trace_rays(struct radv_cmd_buffer *cmd_buffer, const VkTraceRaysIndirectCommand2KHR *cmd,
12161                       uint64_t indirect_va)
12162 {
12163    if (!cmd || indirect_va)
12164       return;
12165 
12166    struct radv_rra_ray_history_data *data = malloc(sizeof(struct radv_rra_ray_history_data));
12167    if (!data)
12168       return;
12169 
12170    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
12171    uint32_t width = DIV_ROUND_UP(cmd->width, device->rra_trace.ray_history_resolution_scale);
12172    uint32_t height = DIV_ROUND_UP(cmd->height, device->rra_trace.ray_history_resolution_scale);
12173    uint32_t depth = DIV_ROUND_UP(cmd->depth, device->rra_trace.ray_history_resolution_scale);
12174 
12175    struct radv_rra_ray_history_counter counter = {
12176       .dispatch_size = {width, height, depth},
12177       .hit_shader_count = cmd->hitShaderBindingTableSize / cmd->hitShaderBindingTableStride,
12178       .miss_shader_count = cmd->missShaderBindingTableSize / cmd->missShaderBindingTableStride,
12179       .shader_count = cmd_buffer->state.rt_pipeline->stage_count,
12180       .pipeline_api_hash = cmd_buffer->state.rt_pipeline->base.base.pipeline_hash,
12181       .mode = 1,
12182       .stride = sizeof(uint32_t),
12183       .data_size = 0,
12184       .ray_id_begin = 0,
12185       .ray_id_end = 0xFFFFFFFF,
12186       .pipeline_type = RADV_RRA_PIPELINE_RAY_TRACING,
12187    };
12188 
12189    struct radv_rra_ray_history_dispatch_size dispatch_size = {
12190       .size = {width, height, depth},
12191    };
12192 
12193    struct radv_rra_ray_history_traversal_flags traversal_flags = {0};
12194 
12195    data->metadata = (struct radv_rra_ray_history_metadata){
12196       .counter_info.type = RADV_RRA_COUNTER_INFO,
12197       .counter_info.size = sizeof(struct radv_rra_ray_history_counter),
12198       .counter = counter,
12199 
12200       .dispatch_size_info.type = RADV_RRA_DISPATCH_SIZE,
12201       .dispatch_size_info.size = sizeof(struct radv_rra_ray_history_dispatch_size),
12202       .dispatch_size = dispatch_size,
12203 
12204       .traversal_flags_info.type = RADV_RRA_TRAVERSAL_FLAGS,
12205       .traversal_flags_info.size = sizeof(struct radv_rra_ray_history_traversal_flags),
12206       .traversal_flags = traversal_flags,
12207    };
12208 
12209    uint32_t dispatch_index = util_dynarray_num_elements(&cmd_buffer->ray_history, struct radv_rra_ray_history_data *)
12210                              << 16;
12211 
12212    util_dynarray_append(&cmd_buffer->ray_history, struct radv_rra_ray_history_data *, data);
12213 
12214    cmd_buffer->state.flush_bits |=
12215       RADV_CMD_FLAG_INV_SCACHE | RADV_CMD_FLAG_CS_PARTIAL_FLUSH |
12216       radv_src_access_flush(cmd_buffer, VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT, VK_ACCESS_2_SHADER_WRITE_BIT, NULL) |
12217       radv_dst_access_flush(cmd_buffer, VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT, VK_ACCESS_2_SHADER_READ_BIT, NULL);
12218 
12219    radv_update_buffer_cp(cmd_buffer,
12220                          device->rra_trace.ray_history_addr + offsetof(struct radv_ray_history_header, dispatch_index),
12221                          &dispatch_index, sizeof(dispatch_index));
12222 }
12223 
12224 enum radv_rt_mode {
12225    radv_rt_mode_direct,
12226    radv_rt_mode_indirect,
12227    radv_rt_mode_indirect2,
12228 };
12229 
12230 static void
12231 radv_upload_trace_rays_params(struct radv_cmd_buffer *cmd_buffer, VkTraceRaysIndirectCommand2KHR *tables,
12232                               enum radv_rt_mode mode, uint64_t *launch_size_va, uint64_t *sbt_va)
12233 {
12234    uint32_t upload_size = mode == radv_rt_mode_direct ? sizeof(VkTraceRaysIndirectCommand2KHR)
12235                                                       : offsetof(VkTraceRaysIndirectCommand2KHR, width);
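   /* Direct mode also uploads the launch size (width/height/depth); the indirect modes only need the SBT regions. */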
12236 
12237    uint32_t offset;
12238    if (!radv_cmd_buffer_upload_data(cmd_buffer, upload_size, tables, &offset))
12239       return;
12240 
12241    uint64_t upload_va = radv_buffer_get_va(cmd_buffer->upload.upload_bo) + offset;
12242 
12243    if (mode == radv_rt_mode_direct)
12244       *launch_size_va = upload_va + offsetof(VkTraceRaysIndirectCommand2KHR, width);
12245    if (sbt_va)
12246       *sbt_va = upload_va;
12247 }
12248 
12249 static void
12250 radv_trace_rays(struct radv_cmd_buffer *cmd_buffer, VkTraceRaysIndirectCommand2KHR *tables, uint64_t indirect_va,
12251                 enum radv_rt_mode mode)
12252 {
12253    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
12254    const struct radv_physical_device *pdev = radv_device_physical(device);
12255    const struct radv_instance *instance = radv_physical_device_instance(pdev);
12256 
12257    if (instance->debug_flags & RADV_DEBUG_NO_RT)
12258       return;
12259 
12260    if (unlikely(device->rra_trace.ray_history_buffer))
12261       radv_trace_trace_rays(cmd_buffer, tables, indirect_va);
12262 
12263    struct radv_compute_pipeline *pipeline = &cmd_buffer->state.rt_pipeline->base;
12264    struct radv_shader *rt_prolog = cmd_buffer->state.rt_prolog;
12265 
12266    /* Reserve scratch for stacks manually since it is not handled by the compute path. */
12267    uint32_t scratch_bytes_per_wave = rt_prolog->config.scratch_bytes_per_wave;
12268    uint32_t wave_size = rt_prolog->info.wave_size;
12269 
12270    /* The hardware register is specified as a multiple of 64 or 256 DWORDS. */
12271    unsigned scratch_alloc_granule = pdev->info.gfx_level >= GFX11 ? 256 : 1024;
12272    scratch_bytes_per_wave += align(cmd_buffer->state.rt_stack_size * wave_size, scratch_alloc_granule);
12273 
12274    cmd_buffer->compute_scratch_size_per_wave_needed =
12275       MAX2(cmd_buffer->compute_scratch_size_per_wave_needed, scratch_bytes_per_wave);
12276 
12277    /* Since the workgroup size is 8x4 (or 8x8), 1D dispatches can only fill 8 threads per wave at most. To increase
12278     * occupancy, it's beneficial to convert to a 2D dispatch in these cases. */
12279    if (tables && tables->height == 1 && tables->width >= cmd_buffer->state.rt_prolog->info.cs.block_size[0])
12280       tables->height = ACO_RT_CONVERTED_2D_LAUNCH_SIZE;
12281 
12282    struct radv_dispatch_info info = {0};
12283    info.unaligned = true;
12284 
12285    uint64_t launch_size_va = 0;
12286    uint64_t sbt_va = 0;
12287 
12288    if (mode != radv_rt_mode_indirect2) {
12289       launch_size_va = indirect_va;
12290       radv_upload_trace_rays_params(cmd_buffer, tables, mode, &launch_size_va, &sbt_va);
12291    } else {
12292       launch_size_va = indirect_va + offsetof(VkTraceRaysIndirectCommand2KHR, width);
12293       sbt_va = indirect_va;
12294    }
12295 
12296    uint32_t remaining_ray_count = 0;
12297 
12298    if (mode == radv_rt_mode_direct) {
12299       info.blocks[0] = tables->width;
12300       info.blocks[1] = tables->height;
12301       info.blocks[2] = tables->depth;
12302 
12303       if (tables->height == ACO_RT_CONVERTED_2D_LAUNCH_SIZE) {
12304          /* We need the ray count for the 2D dispatch to be a multiple of the y block size for the division to work, and
12305           * a multiple of the x block size because the invocation offset must be a multiple of the block size when
12306           * dispatching the remaining rays. Fortunately, the x block size is itself a multiple of the y block size, so
12307           * we only need to ensure that the ray count is a multiple of the x block size. */
12308          remaining_ray_count = tables->width % rt_prolog->info.cs.block_size[0];
12309 
12310          uint32_t ray_count = tables->width - remaining_ray_count;
12311          info.blocks[0] = ray_count / rt_prolog->info.cs.block_size[1];
12312          info.blocks[1] = rt_prolog->info.cs.block_size[1];
12313       }
12314    } else
12315       info.va = launch_size_va;
12316 
12317    ASSERTED unsigned cdw_max = radeon_check_space(device->ws, cmd_buffer->cs, 15);
12318 
12319    const uint32_t sbt_descriptors_offset = radv_get_user_sgpr_loc(rt_prolog, AC_UD_CS_SBT_DESCRIPTORS);
12320    if (sbt_descriptors_offset) {
12321       radv_emit_shader_pointer(device, cmd_buffer->cs, sbt_descriptors_offset, sbt_va, true);
12322    }
12323 
12324    const uint32_t ray_launch_size_addr_offset = radv_get_user_sgpr_loc(rt_prolog, AC_UD_CS_RAY_LAUNCH_SIZE_ADDR);
12325    if (ray_launch_size_addr_offset) {
12326       radv_emit_shader_pointer(device, cmd_buffer->cs, ray_launch_size_addr_offset, launch_size_va, true);
12327    }
12328 
12329    const uint32_t ray_dynamic_callback_stack_base_offset =
12330       radv_get_user_sgpr_loc(rt_prolog, AC_UD_CS_RAY_DYNAMIC_CALLABLE_STACK_BASE);
12331    if (ray_dynamic_callback_stack_base_offset) {
12332       const struct radv_shader_info *cs_info = &rt_prolog->info;
12333       radeon_set_sh_reg(cmd_buffer->cs, ray_dynamic_callback_stack_base_offset,
12334                         rt_prolog->config.scratch_bytes_per_wave / cs_info->wave_size);
12335    }
12336 
12337    const uint32_t traversal_shader_addr_offset = radv_get_user_sgpr_loc(rt_prolog, AC_UD_CS_TRAVERSAL_SHADER_ADDR);
12338    struct radv_shader *traversal_shader = cmd_buffer->state.shaders[MESA_SHADER_INTERSECTION];
12339    if (traversal_shader_addr_offset && traversal_shader) {
12340       uint64_t traversal_va = traversal_shader->va | radv_rt_priority_traversal;
12341       radv_emit_shader_pointer(device, cmd_buffer->cs, traversal_shader_addr_offset, traversal_va, true);
12342    }
12343 
12344    assert(cmd_buffer->cs->cdw <= cdw_max);
12345 
12346    radv_dispatch(cmd_buffer, &info, pipeline, rt_prolog, VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR);
12347 
12348    if (remaining_ray_count) {
12349       info.blocks[0] = remaining_ray_count;
12350       info.blocks[1] = 1;
12351       info.offsets[0] = tables->width - remaining_ray_count;
12352 
12353       /* Reset the ray launch size so the prolog doesn't think this is a converted dispatch. */
12354       tables->height = 1;
12355       radv_upload_trace_rays_params(cmd_buffer, tables, mode, &launch_size_va, NULL);
12356       if (ray_launch_size_addr_offset) {
12357          radv_emit_shader_pointer(device, cmd_buffer->cs, ray_launch_size_addr_offset, launch_size_va, true);
12358       }
12359 
12360       radv_dispatch(cmd_buffer, &info, pipeline, rt_prolog, VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR);
12361    }
12362 }
12363 
12364 VKAPI_ATTR void VKAPI_CALL
12365 radv_CmdTraceRaysKHR(VkCommandBuffer commandBuffer, const VkStridedDeviceAddressRegionKHR *pRaygenShaderBindingTable,
12366                      const VkStridedDeviceAddressRegionKHR *pMissShaderBindingTable,
12367                      const VkStridedDeviceAddressRegionKHR *pHitShaderBindingTable,
12368                      const VkStridedDeviceAddressRegionKHR *pCallableShaderBindingTable, uint32_t width,
12369                      uint32_t height, uint32_t depth)
12370 {
12371    VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
12372 
12373    VkTraceRaysIndirectCommand2KHR tables = {
12374       .raygenShaderRecordAddress = pRaygenShaderBindingTable->deviceAddress,
12375       .raygenShaderRecordSize = pRaygenShaderBindingTable->size,
12376       .missShaderBindingTableAddress = pMissShaderBindingTable->deviceAddress,
12377       .missShaderBindingTableSize = pMissShaderBindingTable->size,
12378       .missShaderBindingTableStride = pMissShaderBindingTable->stride,
12379       .hitShaderBindingTableAddress = pHitShaderBindingTable->deviceAddress,
12380       .hitShaderBindingTableSize = pHitShaderBindingTable->size,
12381       .hitShaderBindingTableStride = pHitShaderBindingTable->stride,
12382       .callableShaderBindingTableAddress = pCallableShaderBindingTable->deviceAddress,
12383       .callableShaderBindingTableSize = pCallableShaderBindingTable->size,
12384       .callableShaderBindingTableStride = pCallableShaderBindingTable->stride,
12385       .width = width,
12386       .height = height,
12387       .depth = depth,
12388    };
12389 
12390    radv_trace_rays(cmd_buffer, &tables, 0, radv_rt_mode_direct);
12391 }
12392 
12393 VKAPI_ATTR void VKAPI_CALL
12394 radv_CmdTraceRaysIndirectKHR(VkCommandBuffer commandBuffer,
12395                              const VkStridedDeviceAddressRegionKHR *pRaygenShaderBindingTable,
12396                              const VkStridedDeviceAddressRegionKHR *pMissShaderBindingTable,
12397                              const VkStridedDeviceAddressRegionKHR *pHitShaderBindingTable,
12398                              const VkStridedDeviceAddressRegionKHR *pCallableShaderBindingTable,
12399                              VkDeviceAddress indirectDeviceAddress)
12400 {
12401    VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
12402    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
12403 
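   /* The SBT and indirect data are referenced by raw device address, which relies on the global BO list. */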
12404    assert(device->use_global_bo_list);
12405 
12406    VkTraceRaysIndirectCommand2KHR tables = {
12407       .raygenShaderRecordAddress = pRaygenShaderBindingTable->deviceAddress,
12408       .raygenShaderRecordSize = pRaygenShaderBindingTable->size,
12409       .missShaderBindingTableAddress = pMissShaderBindingTable->deviceAddress,
12410       .missShaderBindingTableSize = pMissShaderBindingTable->size,
12411       .missShaderBindingTableStride = pMissShaderBindingTable->stride,
12412       .hitShaderBindingTableAddress = pHitShaderBindingTable->deviceAddress,
12413       .hitShaderBindingTableSize = pHitShaderBindingTable->size,
12414       .hitShaderBindingTableStride = pHitShaderBindingTable->stride,
12415       .callableShaderBindingTableAddress = pCallableShaderBindingTable->deviceAddress,
12416       .callableShaderBindingTableSize = pCallableShaderBindingTable->size,
12417       .callableShaderBindingTableStride = pCallableShaderBindingTable->stride,
12418    };
12419 
12420    radv_trace_rays(cmd_buffer, &tables, indirectDeviceAddress, radv_rt_mode_indirect);
12421 }
12422 
12423 VKAPI_ATTR void VKAPI_CALL
12424 radv_CmdTraceRaysIndirect2KHR(VkCommandBuffer commandBuffer, VkDeviceAddress indirectDeviceAddress)
12425 {
12426    VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
12427    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
12428 
12429    assert(device->use_global_bo_list);
12430 
12431    radv_trace_rays(cmd_buffer, NULL, indirectDeviceAddress, radv_rt_mode_indirect2);
12432 }
12433 
12434 VKAPI_ATTR void VKAPI_CALL
12435 radv_CmdSetRayTracingPipelineStackSizeKHR(VkCommandBuffer commandBuffer, uint32_t size)
12436 {
12437    VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
12438    cmd_buffer->state.rt_stack_size = size;
12439 }
12440 
12441 /*
12442  * For HTILE we have the following interesting clear words:
12443  *   0xfffff30f: Uncompressed, full depth range, for depth+stencil HTILE
12444  *   0xfffc000f: Uncompressed, full depth range, for depth only HTILE.
12445  *   0xfffffff0: Clear depth to 1.0
12446  *   0x00000000: Clear depth to 0.0
12447  */
12448 static void
12449 radv_initialize_htile(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
12450                       const VkImageSubresourceRange *range)
12451 {
12452    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
12453    struct radv_cmd_state *state = &cmd_buffer->state;
12454    uint32_t htile_value = radv_get_htile_initial_value(device, image);
12455    VkClearDepthStencilValue value = {0};
12456    struct radv_barrier_data barrier = {0};
12457 
12458    barrier.layout_transitions.init_mask_ram = 1;
12459    radv_describe_layout_transition(cmd_buffer, &barrier);
12460 
12461    /* When transitioning from the LAYOUT_UNDEFINED layout, not everyone is consistent
12462     * in considering previous rendering work for WAW hazards. */
12463    state->flush_bits |= radv_src_access_flush(cmd_buffer, VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT,
12464                                               VK_ACCESS_2_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT, image);
12465 
12466    if (image->planes[0].surface.has_stencil &&
12467        !(range->aspectMask == (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT))) {
12468       /* Flush caches before performing a separate aspect initialization because it's a
12469        * read-modify-write operation.
12470        */
12471       state->flush_bits |=
12472          radv_dst_access_flush(cmd_buffer, VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT, VK_ACCESS_2_SHADER_READ_BIT, image);
12473    }
12474 
12475    state->flush_bits |= radv_clear_htile(cmd_buffer, image, range, htile_value);
12476 
12477    radv_set_ds_clear_metadata(cmd_buffer, image, range, value, range->aspectMask);
12478 
12479    if (radv_image_is_tc_compat_htile(image) && (range->aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT)) {
12480       /* Initialize the TC-compat metadata value to 0 because by
12481        * default DB_Z_INFO.RANGE_PRECISION is set to 1, and we only
12482        * have to conditionally update its value when performing
12483        * a fast depth clear.
12484        */
12485       radv_set_tc_compat_zrange_metadata(cmd_buffer, image, range, 0);
12486    }
12487 }
12488 
12489 static void
12490 radv_handle_depth_image_transition(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
12491                                    VkImageLayout src_layout, VkImageLayout dst_layout, unsigned src_queue_mask,
12492                                    unsigned dst_queue_mask, const VkImageSubresourceRange *range,
12493                                    struct radv_sample_locations_state *sample_locs)
12494 {
12495    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
12496 
12497    if (!radv_htile_enabled(image, range->baseMipLevel))
12498       return;
12499 
12500    if (src_layout == VK_IMAGE_LAYOUT_UNDEFINED) {
12501       radv_initialize_htile(cmd_buffer, image, range);
12502    } else if (radv_layout_is_htile_compressed(device, image, src_layout, src_queue_mask) &&
12503               !radv_layout_is_htile_compressed(device, image, dst_layout, dst_queue_mask)) {
12504       cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB | RADV_CMD_FLAG_FLUSH_AND_INV_DB_META;
12505 
12506       radv_expand_depth_stencil(cmd_buffer, image, range, sample_locs);
12507 
12508       cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_FLUSH_AND_INV_DB | RADV_CMD_FLAG_FLUSH_AND_INV_DB_META;
12509    }
12510 }
12511 
12512 static uint32_t
radv_init_cmask(struct radv_cmd_buffer * cmd_buffer,struct radv_image * image,const VkImageSubresourceRange * range,uint32_t value)12513 radv_init_cmask(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image, const VkImageSubresourceRange *range,
12514                 uint32_t value)
12515 {
12516    struct radv_barrier_data barrier = {0};
12517 
12518    barrier.layout_transitions.init_mask_ram = 1;
12519    radv_describe_layout_transition(cmd_buffer, &barrier);
12520 
12521    return radv_clear_cmask(cmd_buffer, image, range, value);
12522 }
12523 
12524 uint32_t
radv_init_fmask(struct radv_cmd_buffer * cmd_buffer,struct radv_image * image,const VkImageSubresourceRange * range)12525 radv_init_fmask(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image, const VkImageSubresourceRange *range)
12526 {
12527    static const uint32_t fmask_clear_values[4] = {0x00000000, 0x02020202, 0xE4E4E4E4, 0x76543210};
12528    uint32_t log2_samples = util_logbase2(image->vk.samples);
12529    uint32_t value = fmask_clear_values[log2_samples];
12530    struct radv_barrier_data barrier = {0};
12531 
12532    barrier.layout_transitions.init_mask_ram = 1;
12533    radv_describe_layout_transition(cmd_buffer, &barrier);
12534 
12535    return radv_clear_fmask(cmd_buffer, image, range, value);
12536 }
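
/* Note (added commentary, not from the original source): the fmask_clear_values
 * table in radv_init_fmask() above looks like the "expanded"/identity FMASK
 * encoding for each sample count, i.e. each sample slot points at its own color
 * fragment. For example, 0x76543210 packs the indices 7..0 in 4-bit fields for
 * 8x MSAA, and 0xE4E4E4E4 packs 3..0 in 2-bit fields for 4x MSAA. This is an
 * interpretation of the constants, not a statement taken from the surrounding
 * code.
 */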
12537 
12538 uint32_t
radv_init_dcc(struct radv_cmd_buffer * cmd_buffer,struct radv_image * image,const VkImageSubresourceRange * range,uint32_t value)12539 radv_init_dcc(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image, const VkImageSubresourceRange *range,
12540               uint32_t value)
12541 {
12542    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
12543    const struct radv_physical_device *pdev = radv_device_physical(device);
12544    struct radv_barrier_data barrier = {0};
12545    uint32_t flush_bits = 0;
12546    unsigned size = 0;
12547 
12548    barrier.layout_transitions.init_mask_ram = 1;
12549    radv_describe_layout_transition(cmd_buffer, &barrier);
12550 
12551    flush_bits |= radv_clear_dcc(cmd_buffer, image, range, value);
12552 
12553    if (pdev->info.gfx_level == GFX8) {
12554       /* When DCC is enabled with mipmaps, some levels might not
12555        * support fast clears and we have to initialize them as "fully
12556        * expanded".
12557        */
12558       /* Compute the size of all fast clearable DCC levels. */
12559       for (unsigned i = 0; i < image->planes[0].surface.num_meta_levels; i++) {
12560          struct legacy_surf_dcc_level *dcc_level = &image->planes[0].surface.u.legacy.color.dcc_level[i];
12561          unsigned dcc_fast_clear_size = dcc_level->dcc_slice_fast_clear_size * image->vk.array_layers;
12562 
12563          if (!dcc_fast_clear_size)
12564             break;
12565 
12566          size = dcc_level->dcc_offset + dcc_fast_clear_size;
12567       }
12568 
12569       /* Initialize the mipmap levels without DCC. */
12570       if (size != image->planes[0].surface.meta_size) {
12571          flush_bits |= radv_fill_buffer(cmd_buffer, image, image->bindings[0].bo,
12572                                         radv_image_get_va(image, 0) + image->planes[0].surface.meta_offset + size,
12573                                         image->planes[0].surface.meta_size - size, 0xffffffff);
12574       }
12575    }
12576 
12577    return flush_bits;
12578 }
12579 
12580 /**
12581  * Initialize DCC/FMASK/CMASK metadata for a color image.
12582  */
12583 static void
radv_init_color_image_metadata(struct radv_cmd_buffer * cmd_buffer,struct radv_image * image,VkImageLayout src_layout,VkImageLayout dst_layout,unsigned src_queue_mask,unsigned dst_queue_mask,const VkImageSubresourceRange * range)12584 radv_init_color_image_metadata(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image, VkImageLayout src_layout,
12585                                VkImageLayout dst_layout, unsigned src_queue_mask, unsigned dst_queue_mask,
12586                                const VkImageSubresourceRange *range)
12587 {
12588    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
12589    uint32_t flush_bits = 0;
12590 
12591    /* When transitioning from the LAYOUT_UNDEFINED layout, not everyone is
12592     * consistent in considering previous rendering work for WAW hazards.
12593     */
12594    cmd_buffer->state.flush_bits |= radv_src_access_flush(cmd_buffer, VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT,
12595                                                          VK_ACCESS_2_COLOR_ATTACHMENT_WRITE_BIT, image);
12596 
12597    if (radv_image_has_cmask(image)) {
12598       static const uint32_t cmask_clear_values[4] = {0xffffffff, 0xdddddddd, 0xeeeeeeee, 0xffffffff};
12599       uint32_t log2_samples = util_logbase2(image->vk.samples);
12600 
12601       flush_bits |= radv_init_cmask(cmd_buffer, image, range, cmask_clear_values[log2_samples]);
12602    }
12603 
12604    if (radv_image_has_fmask(image)) {
12605       flush_bits |= radv_init_fmask(cmd_buffer, image, range);
12606    }
12607 
12608    if (radv_dcc_enabled(image, range->baseMipLevel)) {
12609       uint32_t value = 0xffffffffu; /* Fully expanded mode. */
12610 
12611       if (radv_layout_dcc_compressed(device, image, range->baseMipLevel, dst_layout, dst_queue_mask)) {
12612          value = 0u;
12613       }
12614 
12615       flush_bits |= radv_init_dcc(cmd_buffer, image, range, value);
12616    }
12617 
12618    if (radv_image_has_cmask(image) || radv_dcc_enabled(image, range->baseMipLevel)) {
12619       radv_update_fce_metadata(cmd_buffer, image, range, false);
12620 
12621       uint32_t color_values[2] = {0};
12622       radv_set_color_clear_metadata(cmd_buffer, image, range, color_values);
12623    }
12624 
12625    cmd_buffer->state.flush_bits |= flush_bits;
12626 }
12627 
12628 static void
radv_retile_transition(struct radv_cmd_buffer * cmd_buffer,struct radv_image * image,VkImageLayout src_layout,VkImageLayout dst_layout,unsigned dst_queue_mask)12629 radv_retile_transition(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image, VkImageLayout src_layout,
12630                        VkImageLayout dst_layout, unsigned dst_queue_mask)
12631 {
12632    /* If the image is read-only, we don't have to retile DCC because it can't change. */
12633    if (!(image->vk.usage & RADV_IMAGE_USAGE_WRITE_BITS))
12634       return;
12635 
12636    if (src_layout != VK_IMAGE_LAYOUT_PRESENT_SRC_KHR &&
12637        (dst_layout == VK_IMAGE_LAYOUT_PRESENT_SRC_KHR || (dst_queue_mask & (1u << RADV_QUEUE_FOREIGN))))
12638       radv_retile_dcc(cmd_buffer, image);
12639 }
12640 
12641 static bool
radv_image_need_retile(const struct radv_cmd_buffer * cmd_buffer,const struct radv_image * image)12642 radv_image_need_retile(const struct radv_cmd_buffer *cmd_buffer, const struct radv_image *image)
12643 {
12644    return cmd_buffer->qf != RADV_QUEUE_TRANSFER && image->planes[0].surface.display_dcc_offset &&
12645           image->planes[0].surface.display_dcc_offset != image->planes[0].surface.meta_offset;
12646 }
12647 
12648 /**
12649  * Handle color image transitions for DCC/FMASK/CMASK.
12650  */
12651 static void
radv_handle_color_image_transition(struct radv_cmd_buffer * cmd_buffer,struct radv_image * image,VkImageLayout src_layout,VkImageLayout dst_layout,unsigned src_queue_mask,unsigned dst_queue_mask,const VkImageSubresourceRange * range)12652 radv_handle_color_image_transition(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image,
12653                                    VkImageLayout src_layout, VkImageLayout dst_layout, unsigned src_queue_mask,
12654                                    unsigned dst_queue_mask, const VkImageSubresourceRange *range)
12655 {
12656    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
12657    bool dcc_decompressed = false, fast_clear_flushed = false;
12658 
12659    if (!radv_image_has_cmask(image) && !radv_image_has_fmask(image) && !radv_dcc_enabled(image, range->baseMipLevel))
12660       return;
12661 
12662    if (src_layout == VK_IMAGE_LAYOUT_UNDEFINED) {
12663       radv_init_color_image_metadata(cmd_buffer, image, src_layout, dst_layout, src_queue_mask, dst_queue_mask, range);
12664 
12665       if (radv_image_need_retile(cmd_buffer, image))
12666          radv_retile_transition(cmd_buffer, image, src_layout, dst_layout, dst_queue_mask);
12667       return;
12668    }
12669 
12670    if (radv_dcc_enabled(image, range->baseMipLevel)) {
12671       if (src_layout == VK_IMAGE_LAYOUT_PREINITIALIZED) {
12672          cmd_buffer->state.flush_bits |= radv_init_dcc(cmd_buffer, image, range, 0xffffffffu);
12673       } else if (radv_layout_dcc_compressed(device, image, range->baseMipLevel, src_layout, src_queue_mask) &&
12674                  !radv_layout_dcc_compressed(device, image, range->baseMipLevel, dst_layout, dst_queue_mask)) {
12675          radv_decompress_dcc(cmd_buffer, image, range);
12676          dcc_decompressed = true;
12677       } else if (radv_layout_can_fast_clear(device, image, range->baseMipLevel, src_layout, src_queue_mask) &&
12678                  !radv_layout_can_fast_clear(device, image, range->baseMipLevel, dst_layout, dst_queue_mask)) {
12679          radv_fast_clear_flush_image_inplace(cmd_buffer, image, range);
12680          fast_clear_flushed = true;
12681       }
12682 
12683       if (radv_image_need_retile(cmd_buffer, image))
12684          radv_retile_transition(cmd_buffer, image, src_layout, dst_layout, dst_queue_mask);
12685    } else if (radv_image_has_cmask(image) || radv_image_has_fmask(image)) {
12686       if (radv_layout_can_fast_clear(device, image, range->baseMipLevel, src_layout, src_queue_mask) &&
12687           !radv_layout_can_fast_clear(device, image, range->baseMipLevel, dst_layout, dst_queue_mask)) {
12688          radv_fast_clear_flush_image_inplace(cmd_buffer, image, range);
12689          fast_clear_flushed = true;
12690       }
12691    }
12692 
12693    /* MSAA color decompress. */
12694    const enum radv_fmask_compression src_fmask_comp =
12695       radv_layout_fmask_compression(device, image, src_layout, src_queue_mask);
12696    const enum radv_fmask_compression dst_fmask_comp =
12697       radv_layout_fmask_compression(device, image, dst_layout, dst_queue_mask);
12698    if (src_fmask_comp <= dst_fmask_comp)
12699       return;
12700 
12701    if (src_fmask_comp == RADV_FMASK_COMPRESSION_FULL) {
12702       if (radv_dcc_enabled(image, range->baseMipLevel) && !radv_image_use_dcc_image_stores(device, image) &&
12703           !dcc_decompressed) {
12704          /* A DCC decompress is required before expanding FMASK
12705           * when DCC stores aren't supported to avoid being in
12706           * a state where DCC is compressed and the main
12707           * surface is uncompressed.
12708           */
12709          radv_decompress_dcc(cmd_buffer, image, range);
12710       } else if (!fast_clear_flushed) {
12711          /* A FMASK decompress is required before expanding
12712           * FMASK.
12713           */
12714          radv_fast_clear_flush_image_inplace(cmd_buffer, image, range);
12715       }
12716    }
12717 
12718    if (dst_fmask_comp == RADV_FMASK_COMPRESSION_NONE) {
12719       struct radv_barrier_data barrier = {0};
12720       barrier.layout_transitions.fmask_color_expand = 1;
12721       radv_describe_layout_transition(cmd_buffer, &barrier);
12722 
12723       radv_expand_fmask_image_inplace(cmd_buffer, image, range);
12724    }
12725 }
12726 
12727 static void
radv_handle_image_transition(struct radv_cmd_buffer * cmd_buffer,struct radv_image * image,VkImageLayout src_layout,VkImageLayout dst_layout,uint32_t src_family_index,uint32_t dst_family_index,const VkImageSubresourceRange * range,struct radv_sample_locations_state * sample_locs)12728 radv_handle_image_transition(struct radv_cmd_buffer *cmd_buffer, struct radv_image *image, VkImageLayout src_layout,
12729                              VkImageLayout dst_layout, uint32_t src_family_index, uint32_t dst_family_index,
12730                              const VkImageSubresourceRange *range, struct radv_sample_locations_state *sample_locs)
12731 {
12732    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
12733    const struct radv_physical_device *pdev = radv_device_physical(device);
12734    enum radv_queue_family src_qf = vk_queue_to_radv(pdev, src_family_index);
12735    enum radv_queue_family dst_qf = vk_queue_to_radv(pdev, dst_family_index);
12736    if (image->exclusive && src_family_index != dst_family_index) {
12737       /* This is an acquire or a release operation and there will be
12738        * a corresponding release/acquire. Do the transition in the
12739        * most flexible queue. */
12740 
12741       assert(src_qf == cmd_buffer->qf || dst_qf == cmd_buffer->qf);
12742 
12743       if (src_family_index == VK_QUEUE_FAMILY_EXTERNAL || src_family_index == VK_QUEUE_FAMILY_FOREIGN_EXT)
12744          return;
12745 
12746       if (cmd_buffer->qf == RADV_QUEUE_TRANSFER)
12747          return;
12748 
12749       if (cmd_buffer->qf == RADV_QUEUE_COMPUTE && (src_qf == RADV_QUEUE_GENERAL || dst_qf == RADV_QUEUE_GENERAL))
12750          return;
12751    }
12752 
12753    unsigned src_queue_mask = radv_image_queue_family_mask(image, src_qf, cmd_buffer->qf);
12754    unsigned dst_queue_mask = radv_image_queue_family_mask(image, dst_qf, cmd_buffer->qf);
12755 
12756    if (src_layout == dst_layout && src_queue_mask == dst_queue_mask)
12757       return;
12758 
12759    if (image->vk.aspects & VK_IMAGE_ASPECT_DEPTH_BIT) {
12760       radv_handle_depth_image_transition(cmd_buffer, image, src_layout, dst_layout, src_queue_mask, dst_queue_mask,
12761                                          range, sample_locs);
12762    } else {
12763       radv_handle_color_image_transition(cmd_buffer, image, src_layout, dst_layout, src_queue_mask, dst_queue_mask,
12764                                          range);
12765    }
12766 }
12767 
12768 static void
radv_cp_dma_wait_for_stages(struct radv_cmd_buffer * cmd_buffer,VkPipelineStageFlags2 stage_mask)12769 radv_cp_dma_wait_for_stages(struct radv_cmd_buffer *cmd_buffer, VkPipelineStageFlags2 stage_mask)
12770 {
12771    /* Make sure CP DMA is idle because the driver might have performed a DMA operation for copying a
12772     * buffer (or an MSAA image using FMASK). Note that updating a buffer is considered a clear
12773     * operation but it might also use a CP DMA copy in some rare situations. Other operations using
12774     * a CP DMA clear are implicitly synchronized (see CP_DMA_SYNC).
12775     */
12776    if (stage_mask &
12777        (VK_PIPELINE_STAGE_2_COPY_BIT | VK_PIPELINE_STAGE_2_CLEAR_BIT | VK_PIPELINE_STAGE_2_ALL_TRANSFER_BIT |
12778         VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT | VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT))
12779       radv_cp_dma_wait_for_idle(cmd_buffer);
12780 }
12781 
12782 void
radv_emit_cache_flush(struct radv_cmd_buffer * cmd_buffer)12783 radv_emit_cache_flush(struct radv_cmd_buffer *cmd_buffer)
12784 {
12785    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
12786    const struct radv_physical_device *pdev = radv_device_physical(device);
12787    bool is_compute = cmd_buffer->qf == RADV_QUEUE_COMPUTE;
12788 
12789    if (is_compute)
12790       cmd_buffer->state.flush_bits &=
12791          ~(RADV_CMD_FLAG_FLUSH_AND_INV_CB | RADV_CMD_FLAG_FLUSH_AND_INV_CB_META | RADV_CMD_FLAG_FLUSH_AND_INV_DB |
12792            RADV_CMD_FLAG_FLUSH_AND_INV_DB_META | RADV_CMD_FLAG_INV_L2_METADATA | RADV_CMD_FLAG_PS_PARTIAL_FLUSH |
12793            RADV_CMD_FLAG_VS_PARTIAL_FLUSH | RADV_CMD_FLAG_VGT_FLUSH | RADV_CMD_FLAG_START_PIPELINE_STATS |
12794            RADV_CMD_FLAG_STOP_PIPELINE_STATS);
12795 
12796    if (!cmd_buffer->state.flush_bits) {
12797       radv_describe_barrier_end_delayed(cmd_buffer);
12798       return;
12799    }
12800 
12801    radv_cs_emit_cache_flush(device->ws, cmd_buffer->cs, pdev->info.gfx_level, &cmd_buffer->gfx9_fence_idx,
12802                             cmd_buffer->gfx9_fence_va, radv_cmd_buffer_uses_mec(cmd_buffer),
12803                             cmd_buffer->state.flush_bits, &cmd_buffer->state.sqtt_flush_bits,
12804                             cmd_buffer->gfx9_eop_bug_va);
12805 
12806    if (radv_device_fault_detection_enabled(device))
12807       radv_cmd_buffer_trace_emit(cmd_buffer);
12808 
12809    if (cmd_buffer->state.flush_bits & RADV_CMD_FLAG_INV_L2)
12810       cmd_buffer->state.rb_noncoherent_dirty = false;
12811 
12812    /* Clear the caches that have been flushed to avoid syncing too much
12813     * when there are pending active queries.
12814     */
12815    cmd_buffer->active_query_flush_bits &= ~cmd_buffer->state.flush_bits;
12816 
12817    cmd_buffer->state.flush_bits = 0;
12818 
12819    /* If the driver used a compute shader for resetting a query pool, it
12820     * should be finished at this point.
12821     */
12822    cmd_buffer->pending_reset_query = false;
12823 
12824    radv_describe_barrier_end_delayed(cmd_buffer);
12825 }
12826 
12827 static void
radv_barrier(struct radv_cmd_buffer * cmd_buffer,uint32_t dep_count,const VkDependencyInfo * dep_infos,enum rgp_barrier_reason reason)12828 radv_barrier(struct radv_cmd_buffer *cmd_buffer, uint32_t dep_count, const VkDependencyInfo *dep_infos,
12829              enum rgp_barrier_reason reason)
12830 {
12831    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
12832    enum radv_cmd_flush_bits src_flush_bits = 0;
12833    enum radv_cmd_flush_bits dst_flush_bits = 0;
12834    VkPipelineStageFlags2 src_stage_mask = 0;
12835    VkPipelineStageFlags2 dst_stage_mask = 0;
12836 
12837    if (cmd_buffer->state.render.active)
12838       radv_mark_noncoherent_rb(cmd_buffer);
12839 
12840    radv_describe_barrier_start(cmd_buffer, reason);
12841 
12842    for (uint32_t dep_idx = 0; dep_idx < dep_count; dep_idx++) {
12843       const VkDependencyInfo *dep_info = &dep_infos[dep_idx];
12844 
12845       for (uint32_t i = 0; i < dep_info->memoryBarrierCount; i++) {
12846          const VkMemoryBarrier2 *barrier = &dep_info->pMemoryBarriers[i];
12847          src_stage_mask |= barrier->srcStageMask;
12848          src_flush_bits |= radv_src_access_flush(cmd_buffer, barrier->srcStageMask, barrier->srcAccessMask, NULL);
12849          dst_stage_mask |= barrier->dstStageMask;
12850          dst_flush_bits |= radv_dst_access_flush(cmd_buffer, barrier->dstStageMask, barrier->dstAccessMask, NULL);
12851       }
12852 
12853       for (uint32_t i = 0; i < dep_info->bufferMemoryBarrierCount; i++) {
12854          const VkBufferMemoryBarrier2 *barrier = &dep_info->pBufferMemoryBarriers[i];
12855          src_stage_mask |= barrier->srcStageMask;
12856          src_flush_bits |= radv_src_access_flush(cmd_buffer, barrier->srcStageMask, barrier->srcAccessMask, NULL);
12857          dst_stage_mask |= barrier->dstStageMask;
12858          dst_flush_bits |= radv_dst_access_flush(cmd_buffer, barrier->dstStageMask, barrier->dstAccessMask, NULL);
12859       }
12860 
12861       for (uint32_t i = 0; i < dep_info->imageMemoryBarrierCount; i++) {
12862          const VkImageMemoryBarrier2 *barrier = &dep_info->pImageMemoryBarriers[i];
12863          VK_FROM_HANDLE(radv_image, image, barrier->image);
12864 
12865          src_stage_mask |= barrier->srcStageMask;
12866          src_flush_bits |= radv_src_access_flush(cmd_buffer, barrier->srcStageMask, barrier->srcAccessMask, image);
12867          dst_stage_mask |= barrier->dstStageMask;
12868          dst_flush_bits |= radv_dst_access_flush(cmd_buffer, barrier->dstStageMask, barrier->dstAccessMask, image);
12869       }
12870    }
12871 
12872    /* The Vulkan spec 1.1.98 says:
12873     *
12874     * "An execution dependency with only
12875     *  VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT in the destination stage mask
12876     *  will only prevent that stage from executing in subsequently
12877     *  submitted commands. As this stage does not perform any actual
12878     *  execution, this is not observable - in effect, it does not delay
12879     *  processing of subsequent commands. Similarly an execution dependency
12880     *  with only VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT in the source stage mask
12881     *  will effectively not wait for any prior commands to complete."
12882     */
12883    if (dst_stage_mask != VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT)
12884       radv_stage_flush(cmd_buffer, src_stage_mask);
12885    cmd_buffer->state.flush_bits |= src_flush_bits;
12886 
12887    radv_gang_barrier(cmd_buffer, src_stage_mask, 0);
12888 
12889    for (uint32_t dep_idx = 0; dep_idx < dep_count; dep_idx++) {
12890       const VkDependencyInfo *dep_info = &dep_infos[dep_idx];
12891 
12892       for (uint32_t i = 0; i < dep_info->imageMemoryBarrierCount; i++) {
12893          VK_FROM_HANDLE(radv_image, image, dep_info->pImageMemoryBarriers[i].image);
12894 
12895          const struct VkSampleLocationsInfoEXT *sample_locs_info =
12896             vk_find_struct_const(dep_info->pImageMemoryBarriers[i].pNext, SAMPLE_LOCATIONS_INFO_EXT);
12897          struct radv_sample_locations_state sample_locations;
12898 
12899          if (sample_locs_info) {
12900             assert(image->vk.create_flags & VK_IMAGE_CREATE_SAMPLE_LOCATIONS_COMPATIBLE_DEPTH_BIT_EXT);
12901             sample_locations.per_pixel = sample_locs_info->sampleLocationsPerPixel;
12902             sample_locations.grid_size = sample_locs_info->sampleLocationGridSize;
12903             sample_locations.count = sample_locs_info->sampleLocationsCount;
12904             typed_memcpy(&sample_locations.locations[0], sample_locs_info->pSampleLocations,
12905                          sample_locs_info->sampleLocationsCount);
12906          }
12907 
12908          radv_handle_image_transition(
12909             cmd_buffer, image, dep_info->pImageMemoryBarriers[i].oldLayout, dep_info->pImageMemoryBarriers[i].newLayout,
12910             dep_info->pImageMemoryBarriers[i].srcQueueFamilyIndex,
12911             dep_info->pImageMemoryBarriers[i].dstQueueFamilyIndex, &dep_info->pImageMemoryBarriers[i].subresourceRange,
12912             sample_locs_info ? &sample_locations : NULL);
12913       }
12914    }
12915 
12916    radv_gang_barrier(cmd_buffer, 0, dst_stage_mask);
12917 
12918    if (cmd_buffer->qf == RADV_QUEUE_TRANSFER) {
12919       /* SDMA NOP packet waits for all pending SDMA operations to complete.
12920        * Note that GFX9+ is supposed to have RAW dependency tracking, but it's buggy,
12921        * so we can't rely on it for now.
12922        */
12923       radeon_check_space(device->ws, cmd_buffer->cs, 1);
12924       radeon_emit(cmd_buffer->cs, SDMA_PACKET(SDMA_OPCODE_NOP, 0, 0));
12925    } else {
12926       const bool is_gfx_or_ace = cmd_buffer->qf == RADV_QUEUE_GENERAL || cmd_buffer->qf == RADV_QUEUE_COMPUTE;
12927       if (is_gfx_or_ace)
12928          radv_cp_dma_wait_for_stages(cmd_buffer, src_stage_mask);
12929    }
12930 
12931    cmd_buffer->state.flush_bits |= dst_flush_bits;
12932 
12933    radv_describe_barrier_end(cmd_buffer);
12934 }
12935 
12936 VKAPI_ATTR void VKAPI_CALL
radv_CmdPipelineBarrier2(VkCommandBuffer commandBuffer,const VkDependencyInfo * pDependencyInfo)12937 radv_CmdPipelineBarrier2(VkCommandBuffer commandBuffer, const VkDependencyInfo *pDependencyInfo)
12938 {
12939    VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
12940    enum rgp_barrier_reason barrier_reason;
12941 
12942    if (cmd_buffer->vk.runtime_rp_barrier) {
12943       barrier_reason = RGP_BARRIER_EXTERNAL_RENDER_PASS_SYNC;
12944    } else {
12945       barrier_reason = RGP_BARRIER_EXTERNAL_CMD_PIPELINE_BARRIER;
12946    }
12947 
12948    radv_barrier(cmd_buffer, 1, pDependencyInfo, barrier_reason);
12949 }
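
/* Example (added commentary, not part of the driver): a minimal application-side
 * sketch of the kind of vkCmdPipelineBarrier2() call that reaches radv_barrier()
 * and radv_handle_image_transition() above, assuming a valid VkCommandBuffer
 * `cmd` and VkImage `img` (hypothetical names):
 *
 *    VkImageMemoryBarrier2 img_barrier = {
 *       .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER_2,
 *       .srcStageMask = VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT,
 *       .srcAccessMask = VK_ACCESS_2_COLOR_ATTACHMENT_WRITE_BIT,
 *       .dstStageMask = VK_PIPELINE_STAGE_2_FRAGMENT_SHADER_BIT,
 *       .dstAccessMask = VK_ACCESS_2_SHADER_SAMPLED_READ_BIT,
 *       .oldLayout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL,
 *       .newLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL,
 *       .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
 *       .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
 *       .image = img,
 *       .subresourceRange = {VK_IMAGE_ASPECT_COLOR_BIT, 0, 1, 0, 1},
 *    };
 *    const VkDependencyInfo dep = {
 *       .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
 *       .imageMemoryBarrierCount = 1,
 *       .pImageMemoryBarriers = &img_barrier,
 *    };
 *    vkCmdPipelineBarrier2(cmd, &dep);
 */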
12950 
12951 static void
write_event(struct radv_cmd_buffer * cmd_buffer,struct radv_event * event,VkPipelineStageFlags2 stageMask,unsigned value)12952 write_event(struct radv_cmd_buffer *cmd_buffer, struct radv_event *event, VkPipelineStageFlags2 stageMask,
12953             unsigned value)
12954 {
12955    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
12956    const struct radv_physical_device *pdev = radv_device_physical(device);
12957    struct radeon_cmdbuf *cs = cmd_buffer->cs;
12958    uint64_t va = radv_buffer_get_va(event->bo);
12959 
12960    if (cmd_buffer->qf == RADV_QUEUE_VIDEO_DEC || cmd_buffer->qf == RADV_QUEUE_VIDEO_ENC) {
12961       radv_vcn_write_event(cmd_buffer, event, value);
12962       return;
12963    }
12964 
12965    radv_emit_cache_flush(cmd_buffer);
12966 
12967    radv_cs_add_buffer(device->ws, cs, event->bo);
12968 
12969    ASSERTED unsigned cdw_max = radeon_check_space(device->ws, cs, 28);
12970 
12971    if (stageMask & (VK_PIPELINE_STAGE_2_COPY_BIT | VK_PIPELINE_STAGE_2_RESOLVE_BIT | VK_PIPELINE_STAGE_2_BLIT_BIT |
12972                     VK_PIPELINE_STAGE_2_CLEAR_BIT)) {
12973       /* Be conservative for now. */
12974       stageMask |= VK_PIPELINE_STAGE_2_ALL_TRANSFER_BIT;
12975    }
12976 
12977    /* Flags that only require a top-of-pipe event. */
12978    VkPipelineStageFlags2 top_of_pipe_flags = VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT;
12979 
12980    /* Flags that only require a post-index-fetch event. */
12981    VkPipelineStageFlags2 post_index_fetch_flags =
12982       top_of_pipe_flags | VK_PIPELINE_STAGE_2_DRAW_INDIRECT_BIT | VK_PIPELINE_STAGE_2_VERTEX_INPUT_BIT;
12983 
12984    /* Flags that only require signaling post PS. */
12985    VkPipelineStageFlags2 post_ps_flags =
12986       post_index_fetch_flags | VK_PIPELINE_STAGE_2_VERTEX_SHADER_BIT |
12987       VK_PIPELINE_STAGE_2_TESSELLATION_CONTROL_SHADER_BIT | VK_PIPELINE_STAGE_2_TESSELLATION_EVALUATION_SHADER_BIT |
12988       VK_PIPELINE_STAGE_2_GEOMETRY_SHADER_BIT | VK_PIPELINE_STAGE_2_MESH_SHADER_BIT_EXT |
12989       VK_PIPELINE_STAGE_2_TRANSFORM_FEEDBACK_BIT_EXT | VK_PIPELINE_STAGE_2_PRE_RASTERIZATION_SHADERS_BIT |
12990       VK_PIPELINE_STAGE_2_FRAGMENT_SHADING_RATE_ATTACHMENT_BIT_KHR | VK_PIPELINE_STAGE_2_EARLY_FRAGMENT_TESTS_BIT |
12991       VK_PIPELINE_STAGE_2_FRAGMENT_SHADER_BIT;
12992 
12993    /* Flags that only require signaling post CS. */
12994    VkPipelineStageFlags2 post_cs_flags = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT;
12995 
12996    radv_cp_dma_wait_for_stages(cmd_buffer, stageMask);
12997 
12998    if (!(stageMask & ~top_of_pipe_flags)) {
12999       /* Just need to sync the PFP engine. */
13000       radv_write_data(cmd_buffer, V_370_PFP, va, 1, &value, false);
13001    } else if (!(stageMask & ~post_index_fetch_flags)) {
13002       /* Sync ME because PFP reads index and indirect buffers. */
13003       radv_write_data(cmd_buffer, V_370_ME, va, 1, &value, false);
13004    } else {
13005       unsigned event_type;
13006 
13007       if (!(stageMask & ~post_ps_flags)) {
13008          /* Sync previous fragment shaders. */
13009          event_type = V_028A90_PS_DONE;
13010       } else if (!(stageMask & ~post_cs_flags)) {
13011          /* Sync previous compute shaders. */
13012          event_type = V_028A90_CS_DONE;
13013       } else {
13014          /* Otherwise, sync all prior GPU work. */
13015          event_type = V_028A90_BOTTOM_OF_PIPE_TS;
13016       }
13017 
13018       radv_cs_emit_write_event_eop(cs, pdev->info.gfx_level, cmd_buffer->qf, event_type, 0, EOP_DST_SEL_MEM,
13019                                    EOP_DATA_SEL_VALUE_32BIT, va, value, cmd_buffer->gfx9_eop_bug_va);
13020    }
13021 
13022    assert(cmd_buffer->cs->cdw <= cdw_max);
13023 }
13024 
13025 VKAPI_ATTR void VKAPI_CALL
radv_CmdSetEvent2(VkCommandBuffer commandBuffer,VkEvent _event,const VkDependencyInfo * pDependencyInfo)13026 radv_CmdSetEvent2(VkCommandBuffer commandBuffer, VkEvent _event, const VkDependencyInfo *pDependencyInfo)
13027 {
13028    VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
13029    VK_FROM_HANDLE(radv_event, event, _event);
13030    VkPipelineStageFlags2 src_stage_mask = 0;
13031 
13032    for (uint32_t i = 0; i < pDependencyInfo->memoryBarrierCount; i++)
13033       src_stage_mask |= pDependencyInfo->pMemoryBarriers[i].srcStageMask;
13034    for (uint32_t i = 0; i < pDependencyInfo->bufferMemoryBarrierCount; i++)
13035       src_stage_mask |= pDependencyInfo->pBufferMemoryBarriers[i].srcStageMask;
13036    for (uint32_t i = 0; i < pDependencyInfo->imageMemoryBarrierCount; i++)
13037       src_stage_mask |= pDependencyInfo->pImageMemoryBarriers[i].srcStageMask;
13038 
13039    write_event(cmd_buffer, event, src_stage_mask, 1);
13040 }
13041 
13042 VKAPI_ATTR void VKAPI_CALL
radv_CmdResetEvent2(VkCommandBuffer commandBuffer,VkEvent _event,VkPipelineStageFlags2 stageMask)13043 radv_CmdResetEvent2(VkCommandBuffer commandBuffer, VkEvent _event, VkPipelineStageFlags2 stageMask)
13044 {
13045    VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
13046    VK_FROM_HANDLE(radv_event, event, _event);
13047 
13048    write_event(cmd_buffer, event, stageMask, 0);
13049 }
13050 
13051 VKAPI_ATTR void VKAPI_CALL
radv_CmdWaitEvents2(VkCommandBuffer commandBuffer,uint32_t eventCount,const VkEvent * pEvents,const VkDependencyInfo * pDependencyInfos)13052 radv_CmdWaitEvents2(VkCommandBuffer commandBuffer, uint32_t eventCount, const VkEvent *pEvents,
13053                     const VkDependencyInfo *pDependencyInfos)
13054 {
13055    VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
13056    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
13057    struct radeon_cmdbuf *cs = cmd_buffer->cs;
13058 
13059    if (cmd_buffer->qf == RADV_QUEUE_VIDEO_DEC || cmd_buffer->qf == RADV_QUEUE_VIDEO_ENC)
13060       return;
13061 
13062    for (unsigned i = 0; i < eventCount; ++i) {
13063       VK_FROM_HANDLE(radv_event, event, pEvents[i]);
13064       uint64_t va = radv_buffer_get_va(event->bo);
13065 
13066       radv_cs_add_buffer(device->ws, cs, event->bo);
13067 
13068       ASSERTED unsigned cdw_max = radeon_check_space(device->ws, cs, 7);
13069 
13070       radv_cp_wait_mem(cs, cmd_buffer->qf, WAIT_REG_MEM_EQUAL, va, 1, 0xffffffff);
13071       assert(cmd_buffer->cs->cdw <= cdw_max);
13072    }
13073 
13074    radv_barrier(cmd_buffer, eventCount, pDependencyInfos, RGP_BARRIER_EXTERNAL_CMD_WAIT_EVENTS);
13075 }
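
/* Example (added commentary, not part of the driver): a minimal application-side
 * sketch of the split-barrier pattern served by radv_CmdSetEvent2() and
 * radv_CmdWaitEvents2() above, assuming a valid VkCommandBuffer `cmd`, VkEvent
 * `event` and a filled-in VkDependencyInfo `dep` (hypothetical names):
 *
 *    vkCmdSetEvent2(cmd, event, &dep);        // signaled once srcStageMask work is done
 *    ...                                      // unrelated work that may overlap
 *    vkCmdWaitEvents2(cmd, 1, &event, &dep);  // wait and apply the memory dependencies
 */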
13076 
13077 void
radv_emit_set_predication_state(struct radv_cmd_buffer * cmd_buffer,bool draw_visible,unsigned pred_op,uint64_t va)13078 radv_emit_set_predication_state(struct radv_cmd_buffer *cmd_buffer, bool draw_visible, unsigned pred_op, uint64_t va)
13079 {
13080    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
13081    const struct radv_physical_device *pdev = radv_device_physical(device);
13082    uint32_t op = 0;
13083 
13084    radeon_check_space(device->ws, cmd_buffer->cs, 4);
13085 
13086    if (va) {
13087       assert(pred_op == PREDICATION_OP_BOOL32 || pred_op == PREDICATION_OP_BOOL64);
13088 
13089       op = PRED_OP(pred_op);
13090 
13091       /* PREDICATION_DRAW_VISIBLE means that rendering commands are
13092        * discarded if the 32-bit value is zero; with PREDICATION_DRAW_NOT_VISIBLE,
13093        * they are discarded if the value is non-zero.
13094        */
13095       op |= draw_visible ? PREDICATION_DRAW_VISIBLE : PREDICATION_DRAW_NOT_VISIBLE;
13096    }
13097    if (pdev->info.gfx_level >= GFX9) {
13098       radeon_emit(cmd_buffer->cs, PKT3(PKT3_SET_PREDICATION, 2, 0));
13099       radeon_emit(cmd_buffer->cs, op);
13100       radeon_emit(cmd_buffer->cs, va);
13101       radeon_emit(cmd_buffer->cs, va >> 32);
13102    } else {
13103       radeon_emit(cmd_buffer->cs, PKT3(PKT3_SET_PREDICATION, 1, 0));
13104       radeon_emit(cmd_buffer->cs, va);
13105       radeon_emit(cmd_buffer->cs, op | ((va >> 32) & 0xFF));
13106    }
13107 }
13108 
13109 void
radv_begin_conditional_rendering(struct radv_cmd_buffer * cmd_buffer,uint64_t va,bool draw_visible)13110 radv_begin_conditional_rendering(struct radv_cmd_buffer *cmd_buffer, uint64_t va, bool draw_visible)
13111 {
13112    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
13113    const struct radv_physical_device *pdev = radv_device_physical(device);
13114    struct radeon_cmdbuf *cs = cmd_buffer->cs;
13115    unsigned pred_op = PREDICATION_OP_BOOL32;
13116 
13117    radv_emit_cache_flush(cmd_buffer);
13118 
13119    if (cmd_buffer->qf == RADV_QUEUE_GENERAL) {
13120       if (!pdev->info.has_32bit_predication) {
13121          uint64_t pred_value = 0, pred_va;
13122          unsigned pred_offset;
13123 
13124          /* From the Vulkan spec 1.1.107:
13125           *
13126           * "If the 32-bit value at offset in buffer memory is zero,
13127           *  then the rendering commands are discarded, otherwise they
13128           *  are executed as normal. If the value of the predicate in
13129           *  buffer memory changes while conditional rendering is
13130           *  active, the rendering commands may be discarded in an
13131           *  implementation-dependent way. Some implementations may
13132           *  latch the value of the predicate upon beginning conditional
13133           *  rendering while others may read it before every rendering
13134           *  command."
13135           *
13136           * But, the AMD hardware treats the predicate as a 64-bit
13137           * value which means we need a workaround in the driver.
13138           * Luckily, we are not required to handle the predicate value
13139           * changing while predication is active.
13140           *
13141           * The workaround is as follows:
13142           * 1) allocate a 64-bit value in the upload BO and initialize it
13143           *    to 0
13144           * 2) copy the 32-bit predicate value to the upload BO
13145           * 3) use the newly allocated VA address for predication
13146           *
13147           * Based on the conditionalrender demo, it's faster to do the
13148           * COPY_DATA in ME (+ sync PFP) instead of in PFP.
13149           */
13150          radv_cmd_buffer_upload_data(cmd_buffer, 8, &pred_value, &pred_offset);
13151 
13152          pred_va = radv_buffer_get_va(cmd_buffer->upload.upload_bo) + pred_offset;
13153 
13154          radeon_check_space(device->ws, cmd_buffer->cs, 8);
13155 
13156          radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
13157          radeon_emit(
13158             cs, COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) | COPY_DATA_WR_CONFIRM);
13159          radeon_emit(cs, va);
13160          radeon_emit(cs, va >> 32);
13161          radeon_emit(cs, pred_va);
13162          radeon_emit(cs, pred_va >> 32);
13163 
13164          radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
13165          radeon_emit(cs, 0);
13166 
13167          va = pred_va;
13168          pred_op = PREDICATION_OP_BOOL64;
13169       }
13170 
13171       radv_emit_set_predication_state(cmd_buffer, draw_visible, pred_op, va);
13172    } else {
13173       /* Compute queue doesn't support predication and it's emulated elsewhere. */
13174    }
13175 
13176    /* Store conditional rendering user info. */
13177    cmd_buffer->state.predicating = true;
13178    cmd_buffer->state.predication_type = draw_visible;
13179    cmd_buffer->state.predication_op = pred_op;
13180    cmd_buffer->state.predication_va = va;
13181    cmd_buffer->state.mec_inv_pred_emitted = false;
13182 }
13183 
13184 void
radv_end_conditional_rendering(struct radv_cmd_buffer * cmd_buffer)13185 radv_end_conditional_rendering(struct radv_cmd_buffer *cmd_buffer)
13186 {
13187    if (cmd_buffer->qf == RADV_QUEUE_GENERAL) {
13188       radv_emit_set_predication_state(cmd_buffer, false, 0, 0);
13189    } else {
13190       /* Compute queue doesn't support predication, no need to emit anything here. */
13191    }
13192 
13193    /* Reset conditional rendering user info. */
13194    cmd_buffer->state.predicating = false;
13195    cmd_buffer->state.predication_type = -1;
13196    cmd_buffer->state.predication_op = 0;
13197    cmd_buffer->state.predication_va = 0;
13198    cmd_buffer->state.mec_inv_pred_emitted = false;
13199 }
13200 
13201 /* VK_EXT_conditional_rendering */
13202 VKAPI_ATTR void VKAPI_CALL
radv_CmdBeginConditionalRenderingEXT(VkCommandBuffer commandBuffer,const VkConditionalRenderingBeginInfoEXT * pConditionalRenderingBegin)13203 radv_CmdBeginConditionalRenderingEXT(VkCommandBuffer commandBuffer,
13204                                      const VkConditionalRenderingBeginInfoEXT *pConditionalRenderingBegin)
13205 {
13206    VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
13207    VK_FROM_HANDLE(radv_buffer, buffer, pConditionalRenderingBegin->buffer);
13208    bool draw_visible = true;
13209    uint64_t va;
13210 
13211    va = radv_buffer_get_va(buffer->bo) + buffer->offset + pConditionalRenderingBegin->offset;
13212 
13213    /* By default, if the 32-bit value at offset in buffer memory is zero,
13214     * then the rendering commands are discarded, otherwise they are
13215     * executed as normal. If the inverted flag is set, all commands are
13216     * discarded if the value is non-zero.
13217     */
13218    if (pConditionalRenderingBegin->flags & VK_CONDITIONAL_RENDERING_INVERTED_BIT_EXT) {
13219       draw_visible = false;
13220    }
13221 
13222    radv_begin_conditional_rendering(cmd_buffer, va, draw_visible);
13223 }
13224 
13225 VKAPI_ATTR void VKAPI_CALL
radv_CmdEndConditionalRenderingEXT(VkCommandBuffer commandBuffer)13226 radv_CmdEndConditionalRenderingEXT(VkCommandBuffer commandBuffer)
13227 {
13228    VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
13229 
13230    radv_end_conditional_rendering(cmd_buffer);
13231 }
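
/* Example (added commentary, not part of the driver): a minimal application-side
 * sketch of VK_EXT_conditional_rendering usage that exercises the two entry
 * points above, assuming a valid VkCommandBuffer `cmd` and a VkBuffer `pred_buf`
 * whose first four bytes hold the predicate (hypothetical names):
 *
 *    const VkConditionalRenderingBeginInfoEXT info = {
 *       .sType = VK_STRUCTURE_TYPE_CONDITIONAL_RENDERING_BEGIN_INFO_EXT,
 *       .buffer = pred_buf,
 *       .offset = 0,
 *       .flags = 0, // or VK_CONDITIONAL_RENDERING_INVERTED_BIT_EXT
 *    };
 *    vkCmdBeginConditionalRenderingEXT(cmd, &info);
 *    ... // draws/dispatches that are discarded when the predicate is zero
 *    vkCmdEndConditionalRenderingEXT(cmd);
 */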
13232 
13233 /* VK_EXT_transform_feedback */
13234 VKAPI_ATTR void VKAPI_CALL
radv_CmdBindTransformFeedbackBuffersEXT(VkCommandBuffer commandBuffer,uint32_t firstBinding,uint32_t bindingCount,const VkBuffer * pBuffers,const VkDeviceSize * pOffsets,const VkDeviceSize * pSizes)13235 radv_CmdBindTransformFeedbackBuffersEXT(VkCommandBuffer commandBuffer, uint32_t firstBinding, uint32_t bindingCount,
13236                                         const VkBuffer *pBuffers, const VkDeviceSize *pOffsets,
13237                                         const VkDeviceSize *pSizes)
13238 {
13239    VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
13240    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
13241    struct radv_streamout_binding *sb = cmd_buffer->streamout_bindings;
13242    uint8_t enabled_mask = 0;
13243 
13244    assert(firstBinding + bindingCount <= MAX_SO_BUFFERS);
13245    for (uint32_t i = 0; i < bindingCount; i++) {
13246       uint32_t idx = firstBinding + i;
13247 
13248       sb[idx].buffer = radv_buffer_from_handle(pBuffers[i]);
13249       sb[idx].offset = pOffsets[i];
13250 
13251       if (!pSizes || pSizes[i] == VK_WHOLE_SIZE) {
13252          sb[idx].size = sb[idx].buffer->vk.size - sb[idx].offset;
13253       } else {
13254          sb[idx].size = pSizes[i];
13255       }
13256 
13257       radv_cs_add_buffer(device->ws, cmd_buffer->cs, sb[idx].buffer->bo);
13258 
13259       enabled_mask |= 1 << idx;
13260    }
13261 
13262    cmd_buffer->state.streamout.enabled_mask |= enabled_mask;
13263 
13264    cmd_buffer->state.dirty |= RADV_CMD_DIRTY_STREAMOUT_BUFFER;
13265 }
13266 
13267 static void
radv_set_streamout_enable(struct radv_cmd_buffer * cmd_buffer,bool enable)13268 radv_set_streamout_enable(struct radv_cmd_buffer *cmd_buffer, bool enable)
13269 {
13270    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
13271    const struct radv_physical_device *pdev = radv_device_physical(device);
13272    struct radv_streamout_state *so = &cmd_buffer->state.streamout;
13273    bool old_streamout_enabled = radv_is_streamout_enabled(cmd_buffer);
13274    uint32_t old_hw_enabled_mask = so->hw_enabled_mask;
13275 
13276    so->streamout_enabled = enable;
13277 
13278    so->hw_enabled_mask =
13279       so->enabled_mask | (so->enabled_mask << 4) | (so->enabled_mask << 8) | (so->enabled_mask << 12);
13280 
13281    if (!pdev->use_ngg_streamout && ((old_streamout_enabled != radv_is_streamout_enabled(cmd_buffer)) ||
13282                                     (old_hw_enabled_mask != so->hw_enabled_mask)))
13283       cmd_buffer->state.dirty |= RADV_CMD_DIRTY_STREAMOUT_ENABLE;
13284 
13285    if (pdev->use_ngg_streamout) {
13286       /* Re-emit streamout descriptors because with NGG streamout, a buffer size of 0 acts like a
13287        * disable bit and this is needed when streamout needs to be ignored in shaders.
13288        */
13289       cmd_buffer->state.dirty |= RADV_CMD_DIRTY_SHADER_QUERY | RADV_CMD_DIRTY_STREAMOUT_BUFFER;
13290    }
13291 }
13292 
13293 static void
radv_flush_vgt_streamout(struct radv_cmd_buffer * cmd_buffer)13294 radv_flush_vgt_streamout(struct radv_cmd_buffer *cmd_buffer)
13295 {
13296    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
13297    const struct radv_physical_device *pdev = radv_device_physical(device);
13298    struct radeon_cmdbuf *cs = cmd_buffer->cs;
13299    unsigned reg_strmout_cntl;
13300 
13301    ASSERTED unsigned cdw_max = radeon_check_space(device->ws, cmd_buffer->cs, 14);
13302 
13303    /* The register is at different places on different ASICs. */
13304    if (pdev->info.gfx_level >= GFX9) {
13305       reg_strmout_cntl = R_0300FC_CP_STRMOUT_CNTL;
13306       radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 3, 0));
13307       radeon_emit(cs, S_370_DST_SEL(V_370_MEM_MAPPED_REGISTER) | S_370_ENGINE_SEL(V_370_ME));
13308       radeon_emit(cs, R_0300FC_CP_STRMOUT_CNTL >> 2);
13309       radeon_emit(cs, 0);
13310       radeon_emit(cs, 0);
13311    } else if (pdev->info.gfx_level >= GFX7) {
13312       reg_strmout_cntl = R_0300FC_CP_STRMOUT_CNTL;
13313       radeon_set_uconfig_reg(cs, reg_strmout_cntl, 0);
13314    } else {
13315       reg_strmout_cntl = R_0084FC_CP_STRMOUT_CNTL;
13316       radeon_set_config_reg(cs, reg_strmout_cntl, 0);
13317    }
13318 
13319    radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
13320    radeon_emit(cs, EVENT_TYPE(V_028A90_SO_VGTSTREAMOUT_FLUSH) | EVENT_INDEX(0));
13321 
13322    radeon_emit(cs, PKT3(PKT3_WAIT_REG_MEM, 5, 0));
13323    radeon_emit(cs, WAIT_REG_MEM_EQUAL);    /* wait until the register is equal to the reference value */
13324    radeon_emit(cs, reg_strmout_cntl >> 2); /* register */
13325    radeon_emit(cs, 0);
13326    radeon_emit(cs, S_0084FC_OFFSET_UPDATE_DONE(1)); /* reference value */
13327    radeon_emit(cs, S_0084FC_OFFSET_UPDATE_DONE(1)); /* mask */
13328    radeon_emit(cs, 4);                              /* poll interval */
13329 
13330    assert(cs->cdw <= cdw_max);
13331 }
13332 
13333 VKAPI_ATTR void VKAPI_CALL
radv_CmdBeginTransformFeedbackEXT(VkCommandBuffer commandBuffer,uint32_t firstCounterBuffer,uint32_t counterBufferCount,const VkBuffer * pCounterBuffers,const VkDeviceSize * pCounterBufferOffsets)13334 radv_CmdBeginTransformFeedbackEXT(VkCommandBuffer commandBuffer, uint32_t firstCounterBuffer,
13335                                   uint32_t counterBufferCount, const VkBuffer *pCounterBuffers,
13336                                   const VkDeviceSize *pCounterBufferOffsets)
13337 {
13338    VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
13339    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
13340    const struct radv_physical_device *pdev = radv_device_physical(device);
13341    struct radv_streamout_binding *sb = cmd_buffer->streamout_bindings;
13342    struct radv_streamout_state *so = &cmd_buffer->state.streamout;
13343    struct radeon_cmdbuf *cs = cmd_buffer->cs;
13344    bool first_target = true;
13345 
13346    assert(firstCounterBuffer + counterBufferCount <= MAX_SO_BUFFERS);
13347    if (!pdev->use_ngg_streamout)
13348       radv_flush_vgt_streamout(cmd_buffer);
13349 
13350    ASSERTED unsigned cdw_max = radeon_check_space(device->ws, cmd_buffer->cs, MAX_SO_BUFFERS * 10);
13351 
13352    u_foreach_bit (i, so->enabled_mask) {
13353       int32_t counter_buffer_idx = i - firstCounterBuffer;
13354       if (counter_buffer_idx >= 0 && counter_buffer_idx >= counterBufferCount)
13355          counter_buffer_idx = -1;
13356 
13357       bool append = counter_buffer_idx >= 0 && pCounterBuffers && pCounterBuffers[counter_buffer_idx];
13358       uint64_t va = 0;
13359 
13360       if (append) {
13361          VK_FROM_HANDLE(radv_buffer, buffer, pCounterBuffers[counter_buffer_idx]);
13362          uint64_t counter_buffer_offset = 0;
13363 
13364          if (pCounterBufferOffsets)
13365             counter_buffer_offset = pCounterBufferOffsets[counter_buffer_idx];
13366 
13367          va += radv_buffer_get_va(buffer->bo);
13368          va += buffer->offset + counter_buffer_offset;
13369 
13370          radv_cs_add_buffer(device->ws, cs, buffer->bo);
13371       }
13372 
13373       if (pdev->info.gfx_level >= GFX12) {
13374          /* Only the first streamout target holds information. */
13375          if (first_target) {
13376             if (append) {
13377                radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
13378                radeon_emit(
13379                   cs, COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) | COPY_DATA_DST_SEL(COPY_DATA_REG) | COPY_DATA_WR_CONFIRM);
13380                radeon_emit(cs, va);
13381                radeon_emit(cs, va >> 32);
13382                radeon_emit(cs, (R_0309B0_GE_GS_ORDERED_ID_BASE >> 2));
13383                radeon_emit(cs, 0);
13384             } else {
13385                radeon_set_uconfig_reg(cs, R_0309B0_GE_GS_ORDERED_ID_BASE, 0);
13386             }
13387 
13388             first_target = false;
13389          }
13390       } else if (pdev->use_ngg_streamout) {
13391          if (append) {
13392             radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
13393             radeon_emit(cs,
13394                         COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) | COPY_DATA_DST_SEL(COPY_DATA_REG) | COPY_DATA_WR_CONFIRM);
13395             radeon_emit(cs, va);
13396             radeon_emit(cs, va >> 32);
13397             radeon_emit(cs, (R_031088_GDS_STRMOUT_DWORDS_WRITTEN_0 >> 2) + i);
13398             radeon_emit(cs, 0);
13399          } else {
13400             /* The PKT3 CAM bit workaround seems needed for initializing this GDS register to zero. */
13401             radeon_set_uconfig_perfctr_reg(pdev->info.gfx_level, cmd_buffer->qf, cs,
13402                                            R_031088_GDS_STRMOUT_DWORDS_WRITTEN_0 + i * 4, 0);
13403          }
13404       } else {
13405          /* AMD GCN binds streamout buffers as shader resources.
13406           * VGT only counts primitives and tells the shader through
13407           * SGPRs what to do.
13408           */
13409          radeon_set_context_reg(cs, R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 + 16 * i, sb[i].size >> 2);
13410 
13411          cmd_buffer->state.context_roll_without_scissor_emitted = true;
13412 
13413          if (append) {
13414             radeon_emit(cs, PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0));
13415             radeon_emit(cs, STRMOUT_SELECT_BUFFER(i) | STRMOUT_DATA_TYPE(1) |   /* offset in bytes */
13416                                STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_FROM_MEM)); /* control */
13417             radeon_emit(cs, 0);                                                 /* unused */
13418             radeon_emit(cs, 0);                                                 /* unused */
13419             radeon_emit(cs, va);                                                /* src address lo */
13420             radeon_emit(cs, va >> 32);                                          /* src address hi */
13421          } else {
13422             /* Start from the beginning. */
13423             radeon_emit(cs, PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0));
13424             radeon_emit(cs, STRMOUT_SELECT_BUFFER(i) | STRMOUT_DATA_TYPE(1) |      /* offset in bytes */
13425                                STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_FROM_PACKET)); /* control */
13426             radeon_emit(cs, 0);                                                    /* unused */
13427             radeon_emit(cs, 0);                                                    /* unused */
13428             radeon_emit(cs, 0);                                                    /* unused */
13429             radeon_emit(cs, 0);                                                    /* unused */
13430          }
13431       }
13432    }
13433 
13434    assert(cs->cdw <= cdw_max);
13435 
13436    radv_set_streamout_enable(cmd_buffer, true);
13437 
13438    if (!pdev->use_ngg_streamout)
13439       cmd_buffer->state.dirty |= RADV_CMD_DIRTY_STREAMOUT_ENABLE;
13440 }
13441 
13442 VKAPI_ATTR void VKAPI_CALL
radv_CmdEndTransformFeedbackEXT(VkCommandBuffer commandBuffer,uint32_t firstCounterBuffer,uint32_t counterBufferCount,const VkBuffer * pCounterBuffers,const VkDeviceSize * pCounterBufferOffsets)13443 radv_CmdEndTransformFeedbackEXT(VkCommandBuffer commandBuffer, uint32_t firstCounterBuffer, uint32_t counterBufferCount,
13444                                 const VkBuffer *pCounterBuffers, const VkDeviceSize *pCounterBufferOffsets)
13445 {
13446    VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
13447    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
13448    const struct radv_physical_device *pdev = radv_device_physical(device);
13449    struct radv_streamout_state *so = &cmd_buffer->state.streamout;
13450    struct radeon_cmdbuf *cs = cmd_buffer->cs;
13451 
13452    assert(firstCounterBuffer + counterBufferCount <= MAX_SO_BUFFERS);
13453 
13454    if (pdev->info.gfx_level >= GFX12) {
13455       /* Nothing to do. The streamout state buffer already contains the next ordered ID, which
13456        * is the only thing we need to restore.
13457        */
13458       radv_set_streamout_enable(cmd_buffer, false);
13459       return;
13460    }
13461 
13462    if (pdev->use_ngg_streamout) {
13463       /* Wait for streamout to finish before reading GDS_STRMOUT registers. */
13464       cmd_buffer->state.flush_bits |= RADV_CMD_FLAG_VS_PARTIAL_FLUSH;
13465       radv_emit_cache_flush(cmd_buffer);
13466    } else {
13467       radv_flush_vgt_streamout(cmd_buffer);
13468    }
13469 
13470    ASSERTED unsigned cdw_max = radeon_check_space(device->ws, cmd_buffer->cs, MAX_SO_BUFFERS * 12);
13471 
13472    u_foreach_bit (i, so->enabled_mask) {
13473       int32_t counter_buffer_idx = i - firstCounterBuffer;
13474       if (counter_buffer_idx >= 0 && counter_buffer_idx >= counterBufferCount)
13475          counter_buffer_idx = -1;
13476 
13477       bool append = counter_buffer_idx >= 0 && pCounterBuffers && pCounterBuffers[counter_buffer_idx];
13478       uint64_t va = 0;
13479 
13480       if (append) {
13481          VK_FROM_HANDLE(radv_buffer, buffer, pCounterBuffers[counter_buffer_idx]);
13482          uint64_t counter_buffer_offset = 0;
13483 
13484          if (pCounterBufferOffsets)
13485             counter_buffer_offset = pCounterBufferOffsets[counter_buffer_idx];
13486 
13487          va += radv_buffer_get_va(buffer->bo);
13488          va += buffer->offset + counter_buffer_offset;
13489 
13490          radv_cs_add_buffer(device->ws, cs, buffer->bo);
13491       }
13492 
13493       if (pdev->use_ngg_streamout) {
13494          if (append) {
13495             radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
13496             radeon_emit(cs,
13497                         COPY_DATA_SRC_SEL(COPY_DATA_REG) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) | COPY_DATA_WR_CONFIRM);
13498             radeon_emit(cs, (R_031088_GDS_STRMOUT_DWORDS_WRITTEN_0 >> 2) + i);
13499             radeon_emit(cs, 0);
13500             radeon_emit(cs, va);
13501             radeon_emit(cs, va >> 32);
13502          }
13503       } else {
13504          if (append) {
13505             radeon_emit(cs, PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0));
13506             radeon_emit(cs, STRMOUT_SELECT_BUFFER(i) | STRMOUT_DATA_TYPE(1) | /* offset in bytes */
13507                                STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_NONE) |
13508                                STRMOUT_STORE_BUFFER_FILLED_SIZE); /* control */
13509             radeon_emit(cs, va);                                  /* dst address lo */
13510             radeon_emit(cs, va >> 32);                            /* dst address hi */
13511             radeon_emit(cs, 0);                                   /* unused */
13512             radeon_emit(cs, 0);                                   /* unused */
13513          }
13514 
13515          /* Deactivate transform feedback by zeroing the buffer size.
13516           * The counters (primitives generated, primitives emitted) may
13517           * be enabled even if no buffer is bound. This ensures
13518           * that the primitives-emitted query won't increment.
13519           */
13520          radeon_set_context_reg(cs, R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 + 16 * i, 0);
13521 
13522          cmd_buffer->state.context_roll_without_scissor_emitted = true;
13523       }
13524    }
13525 
13526    assert(cmd_buffer->cs->cdw <= cdw_max);
13527 
13528    radv_set_streamout_enable(cmd_buffer, false);
13529 }
13530 
13531 static void
radv_emit_strmout_buffer(struct radv_cmd_buffer * cmd_buffer,const struct radv_draw_info * draw_info)13532 radv_emit_strmout_buffer(struct radv_cmd_buffer *cmd_buffer, const struct radv_draw_info *draw_info)
13533 {
13534    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
13535    const struct radv_physical_device *pdev = radv_device_physical(device);
13536    const enum amd_gfx_level gfx_level = pdev->info.gfx_level;
13537    uint64_t va = radv_buffer_get_va(draw_info->strmout_buffer->bo);
13538    struct radeon_cmdbuf *cs = cmd_buffer->cs;
13539 
13540    va += draw_info->strmout_buffer->offset + draw_info->strmout_buffer_offset;
13541 
13542    radeon_set_context_reg(cs, R_028B30_VGT_STRMOUT_DRAW_OPAQUE_VERTEX_STRIDE, draw_info->stride);
13543 
13544    if (gfx_level >= GFX10) {
13545       /* Emitting a COPY_DATA packet should be enough because RADV doesn't support preemption
13546        * (shadow memory) but for unknown reasons, it can lead to GPU hangs on GFX10+.
13547        */
13548       radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
13549       radeon_emit(cs, 0);
13550 
13551       radeon_emit(cs, PKT3(PKT3_LOAD_CONTEXT_REG_INDEX, 3, 0));
13552       radeon_emit(cs, va);
13553       radeon_emit(cs, va >> 32);
13554       radeon_emit(cs, (R_028B2C_VGT_STRMOUT_DRAW_OPAQUE_BUFFER_FILLED_SIZE - SI_CONTEXT_REG_OFFSET) >> 2);
13555       radeon_emit(cs, 1); /* 1 DWORD */
13556    } else {
13557       radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
13558       radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_SRC_MEM) | COPY_DATA_DST_SEL(COPY_DATA_REG) | COPY_DATA_WR_CONFIRM);
13559       radeon_emit(cs, va);
13560       radeon_emit(cs, va >> 32);
13561       radeon_emit(cs, R_028B2C_VGT_STRMOUT_DRAW_OPAQUE_BUFFER_FILLED_SIZE >> 2);
13562       radeon_emit(cs, 0); /* unused */
13563    }
13564 
13565    radv_cs_add_buffer(device->ws, cs, draw_info->strmout_buffer->bo);
13566 }
13567 
13568 VKAPI_ATTR void VKAPI_CALL
radv_CmdDrawIndirectByteCountEXT(VkCommandBuffer commandBuffer,uint32_t instanceCount,uint32_t firstInstance,VkBuffer _counterBuffer,VkDeviceSize counterBufferOffset,uint32_t counterOffset,uint32_t vertexStride)13569 radv_CmdDrawIndirectByteCountEXT(VkCommandBuffer commandBuffer, uint32_t instanceCount, uint32_t firstInstance,
13570                                  VkBuffer _counterBuffer, VkDeviceSize counterBufferOffset, uint32_t counterOffset,
13571                                  uint32_t vertexStride)
13572 {
13573    VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
13574    VK_FROM_HANDLE(radv_buffer, counterBuffer, _counterBuffer);
13575    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
13576    const struct radv_physical_device *pdev = radv_device_physical(device);
13577    struct radv_draw_info info;
13578 
13579    info.count = 0;
13580    info.instance_count = instanceCount;
13581    info.first_instance = firstInstance;
13582    info.strmout_buffer = counterBuffer;
13583    info.strmout_buffer_offset = counterBufferOffset;
13584    info.stride = vertexStride;
13585    info.indexed = false;
13586    info.indirect = NULL;
13587 
13588    if (!radv_before_draw(cmd_buffer, &info, 1, false))
13589       return;
13590    struct VkMultiDrawInfoEXT minfo = {0, 0};
13591    radv_emit_strmout_buffer(cmd_buffer, &info);
13592    radv_emit_direct_draw_packets(cmd_buffer, &info, 1, &minfo, S_0287F0_USE_OPAQUE(1), 0);
13593 
13594    if (pdev->info.gfx_level == GFX12) {
13595       /* DrawTransformFeedback requires 3 SQ_NON_EVENTs after the packet. */
13596       for (unsigned i = 0; i < 3; i++) {
13597          radeon_emit(cmd_buffer->cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
13598          radeon_emit(cmd_buffer->cs, EVENT_TYPE(V_028A90_SQ_NON_EVENT) | EVENT_INDEX(0));
13599       }
13600    }
13601 
13602    radv_after_draw(cmd_buffer, false);
13603 }
13604 
13605 /* VK_AMD_buffer_marker */
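/* Write a 32-bit marker value to the destination buffer. On the transfer (SDMA)
 * queue this is a single SDMA fence packet; on other queues the marker is written
 * either immediately with COPY_DATA (top-of-pipe only) or through a bottom-of-pipe
 * EOP event.
 */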
13606 VKAPI_ATTR void VKAPI_CALL
13607 radv_CmdWriteBufferMarker2AMD(VkCommandBuffer commandBuffer, VkPipelineStageFlags2 stage, VkBuffer dstBuffer,
13608                               VkDeviceSize dstOffset, uint32_t marker)
13609 {
13610    VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
13611    VK_FROM_HANDLE(radv_buffer, buffer, dstBuffer);
13612    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
13613    const struct radv_physical_device *pdev = radv_device_physical(device);
13614    struct radeon_cmdbuf *cs = cmd_buffer->cs;
13615    const uint64_t va = radv_buffer_get_va(buffer->bo) + buffer->offset + dstOffset;
13616 
13617    if (cmd_buffer->qf == RADV_QUEUE_TRANSFER) {
13618       radeon_check_space(device->ws, cmd_buffer->cs, 4);
13619       radeon_emit(cmd_buffer->cs, SDMA_PACKET(SDMA_OPCODE_FENCE, 0, SDMA_FENCE_MTYPE_UC));
13620       radeon_emit(cs, va);
13621       radeon_emit(cs, va >> 32);
13622       radeon_emit(cs, marker);
13623       return;
13624    }
13625 
13626    radv_emit_cache_flush(cmd_buffer);
13627 
13628    ASSERTED unsigned cdw_max = radeon_check_space(device->ws, cmd_buffer->cs, 12);
13629 
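   /* If the wait stage contains nothing beyond TOP_OF_PIPE, the marker can be
    * written immediately with COPY_DATA; otherwise defer it to a bottom-of-pipe
    * timestamp event.
    */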
13630    if (!(stage & ~VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT)) {
13631       radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0));
13632       radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_IMM) | COPY_DATA_DST_SEL(COPY_DATA_DST_MEM) | COPY_DATA_WR_CONFIRM);
13633       radeon_emit(cs, marker);
13634       radeon_emit(cs, 0);
13635       radeon_emit(cs, va);
13636       radeon_emit(cs, va >> 32);
13637    } else {
13638       radv_cs_emit_write_event_eop(cs, pdev->info.gfx_level, cmd_buffer->qf, V_028A90_BOTTOM_OF_PIPE_TS, 0,
13639                                    EOP_DST_SEL_MEM, EOP_DATA_SEL_VALUE_32BIT, va, marker, cmd_buffer->gfx9_eop_bug_va);
13640    }
13641 
13642    assert(cmd_buffer->cs->cdw <= cdw_max);
13643 }
13644 
13645 VKAPI_ATTR void VKAPI_CALL
13646 radv_CmdBindPipelineShaderGroupNV(VkCommandBuffer commandBuffer, VkPipelineBindPoint pipelineBindPoint,
13647                                   VkPipeline pipeline, uint32_t groupIndex)
13648 {
13649    fprintf(stderr, "radv: unimplemented vkCmdBindPipelineShaderGroupNV\n");
13650    abort();
13651 }
13652 
13653 /* VK_NV_device_generated_commands_compute */
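/* Serialize the compute pipeline into its indirect buffer so it can later be bound
 * from device-generated commands: the shader metadata, then the dword count of the
 * pre-recorded register CS, then the CS contents themselves.
 */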
13654 VKAPI_ATTR void VKAPI_CALL
13655 radv_CmdUpdatePipelineIndirectBufferNV(VkCommandBuffer commandBuffer, VkPipelineBindPoint pipelineBindPoint,
13656                                        VkPipeline _pipeline)
13657 {
13658    VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
13659    VK_FROM_HANDLE(radv_pipeline, pipeline, _pipeline);
13660    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
13661    const struct radv_compute_pipeline *compute_pipeline = radv_pipeline_to_compute(pipeline);
13662    const struct radeon_cmdbuf *cs = &compute_pipeline->indirect.cs;
13663    const uint64_t va = compute_pipeline->indirect.va;
13664    struct radv_compute_pipeline_metadata metadata;
13665    uint32_t offset = 0;
13666 
13667    radv_get_compute_shader_metadata(device, compute_pipeline->base.shaders[MESA_SHADER_COMPUTE], &metadata);
13668 
13669    radv_write_data(cmd_buffer, V_370_ME, va + offset, sizeof(metadata) / 4, (const uint32_t *)&metadata, false);
13670    offset += sizeof(metadata);
13671 
13672    radv_write_data(cmd_buffer, V_370_ME, va + offset, 1, (const uint32_t *)&cs->cdw, false);
13673    offset += sizeof(uint32_t);
13674 
13675    radv_write_data(cmd_buffer, V_370_ME, va + offset, cs->cdw, (const uint32_t *)cs->buf, false);
13676    offset += cs->cdw * sizeof(uint32_t);
13677 
13678    assert(offset < compute_pipeline->indirect.size);
13679 }
13680 
13681 /* VK_EXT_descriptor_buffer */
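/* Only record the descriptor buffer base addresses here; they take effect once
 * offsets are bound through vkCmdSetDescriptorBufferOffsets2EXT.
 */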
13682 VKAPI_ATTR void VKAPI_CALL
13683 radv_CmdBindDescriptorBuffersEXT(VkCommandBuffer commandBuffer, uint32_t bufferCount,
13684                                  const VkDescriptorBufferBindingInfoEXT *pBindingInfos)
13685 {
13686    VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
13687 
13688    for (uint32_t i = 0; i < bufferCount; i++) {
13689       cmd_buffer->descriptor_buffers[i] = pBindingInfos[i].address;
13690    }
13691 }
13692 
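/* Resolve each (buffer index, offset) pair to a descriptor set address and unbind
 * any regular descriptor set previously bound to that slot.
 */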
13693 static void
13694 radv_set_descriptor_buffer_offsets(struct radv_cmd_buffer *cmd_buffer,
13695                                    const VkSetDescriptorBufferOffsetsInfoEXT *pSetDescriptorBufferOffsetsInfo,
13696                                    VkPipelineBindPoint bind_point)
13697 {
13698    struct radv_descriptor_state *descriptors_state = radv_get_descriptors_state(cmd_buffer, bind_point);
13699 
13700    for (unsigned i = 0; i < pSetDescriptorBufferOffsetsInfo->setCount; i++) {
13701       const uint32_t buffer_idx = pSetDescriptorBufferOffsetsInfo->pBufferIndices[i];
13702       const uint64_t offset = pSetDescriptorBufferOffsetsInfo->pOffsets[i];
13703       unsigned idx = i + pSetDescriptorBufferOffsetsInfo->firstSet;
13704 
13705       descriptors_state->descriptor_buffers[idx] = cmd_buffer->descriptor_buffers[buffer_idx] + offset;
13706 
13707       radv_set_descriptor_set(cmd_buffer, bind_point, NULL, idx);
13708    }
13709 }
13710 
13711 VKAPI_ATTR void VKAPI_CALL
13712 radv_CmdSetDescriptorBufferOffsets2EXT(VkCommandBuffer commandBuffer,
13713                                        const VkSetDescriptorBufferOffsetsInfoEXT *pSetDescriptorBufferOffsetsInfo)
13714 {
13715    VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
13716 
13717    if (pSetDescriptorBufferOffsetsInfo->stageFlags & VK_SHADER_STAGE_COMPUTE_BIT) {
13718       radv_set_descriptor_buffer_offsets(cmd_buffer, pSetDescriptorBufferOffsetsInfo, VK_PIPELINE_BIND_POINT_COMPUTE);
13719    }
13720 
13721    if (pSetDescriptorBufferOffsetsInfo->stageFlags & RADV_GRAPHICS_STAGE_BITS) {
13722       radv_set_descriptor_buffer_offsets(cmd_buffer, pSetDescriptorBufferOffsetsInfo, VK_PIPELINE_BIND_POINT_GRAPHICS);
13723    }
13724 
13725    if (pSetDescriptorBufferOffsetsInfo->stageFlags & RADV_RT_STAGE_BITS) {
13726       radv_set_descriptor_buffer_offsets(cmd_buffer, pSetDescriptorBufferOffsetsInfo,
13727                                          VK_PIPELINE_BIND_POINT_RAY_TRACING_KHR);
13728    }
13729 }
13730 
13731 VKAPI_ATTR void VKAPI_CALL
13732 radv_CmdBindDescriptorBufferEmbeddedSamplers2EXT(
13733    VkCommandBuffer commandBuffer,
13734    const VkBindDescriptorBufferEmbeddedSamplersInfoEXT *pBindDescriptorBufferEmbeddedSamplersInfo)
13735 {
13736    /* This is a no-op because embedded samplers are inlined at compile time. */
13737 }
13738 
13739 /* VK_EXT_shader_object */
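/* Unbind any currently bound pipeline state for the given bind point so that
 * subsequently bound shader objects fully own the corresponding hardware state.
 */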
13740 static void
13741 radv_reset_pipeline_state(struct radv_cmd_buffer *cmd_buffer, VkPipelineBindPoint pipelineBindPoint)
13742 {
13743    switch (pipelineBindPoint) {
13744    case VK_PIPELINE_BIND_POINT_COMPUTE:
13745       if (cmd_buffer->state.compute_pipeline) {
13746          radv_bind_shader(cmd_buffer, NULL, MESA_SHADER_COMPUTE);
13747          cmd_buffer->state.compute_pipeline = NULL;
13748       }
13749       if (cmd_buffer->state.emitted_compute_pipeline) {
13750          cmd_buffer->state.emitted_compute_pipeline = NULL;
13751       }
13752       break;
13753    case VK_PIPELINE_BIND_POINT_GRAPHICS:
13754       if (cmd_buffer->state.graphics_pipeline) {
13755          radv_foreach_stage(s, cmd_buffer->state.graphics_pipeline->active_stages)
13756          {
13757             radv_bind_shader(cmd_buffer, NULL, s);
13758          }
13759          cmd_buffer->state.graphics_pipeline = NULL;
13760 
13761          cmd_buffer->state.gs_copy_shader = NULL;
13762          cmd_buffer->state.last_vgt_shader = NULL;
13763          cmd_buffer->state.has_nggc = false;
13764          cmd_buffer->state.emitted_vs_prolog = NULL;
13765          cmd_buffer->state.spi_shader_col_format = 0;
13766          cmd_buffer->state.cb_shader_mask = 0;
13767          cmd_buffer->state.ms.sample_shading_enable = false;
13768          cmd_buffer->state.ms.min_sample_shading = 1.0f;
13769          cmd_buffer->state.rast_prim = 0;
13770          cmd_buffer->state.uses_out_of_order_rast = false;
13771          cmd_buffer->state.uses_vrs_attachment = false;
13772       }
13773       if (cmd_buffer->state.emitted_graphics_pipeline) {
13774          radv_bind_custom_blend_mode(cmd_buffer, 0);
13775 
13776          if (cmd_buffer->state.db_render_control) {
13777             cmd_buffer->state.db_render_control = 0;
13778             cmd_buffer->state.dirty |= RADV_CMD_DIRTY_FRAMEBUFFER;
13779          }
13780 
13781          cmd_buffer->state.uses_vrs = false;
13782          cmd_buffer->state.uses_vrs_coarse_shading = false;
13783 
13784          cmd_buffer->state.emitted_graphics_pipeline = NULL;
13785       }
13786       break;
13787    default:
13788       break;
13789    }
13790 
13791    cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_PIPELINE;
13792 }
13793 
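/* Bind a compute shader object: emit its compute registers immediately and refresh
 * the push constant and indirect descriptor state derived from the shader object.
 */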
13794 static void
13795 radv_bind_compute_shader(struct radv_cmd_buffer *cmd_buffer, struct radv_shader_object *shader_obj)
13796 {
13797    struct radv_shader *shader = shader_obj ? shader_obj->shader : NULL;
13798    struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
13799    const struct radv_physical_device *pdev = radv_device_physical(device);
13800    struct radeon_cmdbuf *cs = cmd_buffer->cs;
13801 
13802    radv_bind_shader(cmd_buffer, shader, MESA_SHADER_COMPUTE);
13803 
13804    if (!shader_obj)
13805       return;
13806 
13807    ASSERTED const unsigned cdw_max = radeon_check_space(device->ws, cmd_buffer->cs, 128);
13808 
13809    radv_emit_compute_shader(pdev, cs, shader);
13810 
13811    /* Update push constants/indirect descriptors state. */
13812    struct radv_descriptor_state *descriptors_state =
13813       radv_get_descriptors_state(cmd_buffer, VK_PIPELINE_BIND_POINT_COMPUTE);
13814    struct radv_push_constant_state *pc_state = &cmd_buffer->push_constant_state[VK_PIPELINE_BIND_POINT_COMPUTE];
13815 
13816    descriptors_state->need_indirect_descriptor_sets =
13817       radv_get_user_sgpr_info(shader, AC_UD_INDIRECT_DESCRIPTOR_SETS)->sgpr_idx != -1;
13818    pc_state->size = shader_obj->push_constant_size;
13819    pc_state->dynamic_offset_count = shader_obj->dynamic_offset_count;
13820 
13821    assert(cmd_buffer->cs->cdw <= cdw_max);
13822 }
13823 
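/* vkCmdBindShadersEXT: record the bound shader objects per stage. The compute
 * shader is emitted right away; graphics stages are only recorded here and resolved
 * at draw time, since the final shader variants are only known then.
 */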
13824 VKAPI_ATTR void VKAPI_CALL
13825 radv_CmdBindShadersEXT(VkCommandBuffer commandBuffer, uint32_t stageCount, const VkShaderStageFlagBits *pStages,
13826                        const VkShaderEXT *pShaders)
13827 {
13828    VK_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
13829    VkShaderStageFlagBits bound_stages = 0;
13830 
13831    for (uint32_t i = 0; i < stageCount; i++) {
13832       const gl_shader_stage stage = vk_to_mesa_shader_stage(pStages[i]);
13833 
13834       if (!pShaders) {
13835          cmd_buffer->state.shader_objs[stage] = NULL;
13836          continue;
13837       }
13838 
13839       VK_FROM_HANDLE(radv_shader_object, shader_obj, pShaders[i]);
13840 
13841       cmd_buffer->state.shader_objs[stage] = shader_obj;
13842 
13843       bound_stages |= pStages[i];
13844    }
13845 
13846    if (bound_stages & VK_SHADER_STAGE_COMPUTE_BIT) {
13847       radv_reset_pipeline_state(cmd_buffer, VK_PIPELINE_BIND_POINT_COMPUTE);
13848       radv_mark_descriptor_sets_dirty(cmd_buffer, VK_PIPELINE_BIND_POINT_COMPUTE);
13849 
13850       radv_bind_compute_shader(cmd_buffer, cmd_buffer->state.shader_objs[MESA_SHADER_COMPUTE]);
13851    }
13852 
13853    if (bound_stages & RADV_GRAPHICS_STAGE_BITS) {
13854       radv_reset_pipeline_state(cmd_buffer, VK_PIPELINE_BIND_POINT_GRAPHICS);
13855       radv_mark_descriptor_sets_dirty(cmd_buffer, VK_PIPELINE_BIND_POINT_GRAPHICS);
13856 
13857       /* Graphics shaders are handled at draw time because of shader variants. */
13858    }
13859 
13860    cmd_buffer->state.dirty |= RADV_CMD_DIRTY_GRAPHICS_SHADERS;
13861 }
13862 
13863 VKAPI_ATTR void VKAPI_CALL
13864 radv_CmdSetCoverageModulationModeNV(VkCommandBuffer commandBuffer, VkCoverageModulationModeNV coverageModulationMode)
13865 {
13866    unreachable("Not supported by RADV.");
13867 }
13868 
13869 VKAPI_ATTR void VKAPI_CALL
13870 radv_CmdSetCoverageModulationTableEnableNV(VkCommandBuffer commandBuffer, VkBool32 coverageModulationTableEnable)
13871 {
13872    unreachable("Not supported by RADV.");
13873 }
13874 
13875 VKAPI_ATTR void VKAPI_CALL
13876 radv_CmdSetCoverageModulationTableNV(VkCommandBuffer commandBuffer, uint32_t coverageModulationTableCount,
13877                                      const float *pCoverageModulationTable)
13878 {
13879    unreachable("Not supported by RADV.");
13880 }
13881 
13882 VKAPI_ATTR void VKAPI_CALL
13883 radv_CmdSetCoverageReductionModeNV(VkCommandBuffer commandBuffer, VkCoverageReductionModeNV coverageReductionMode)
13884 {
13885    unreachable("Not supported by RADV.");
13886 }
13887 
13888 VKAPI_ATTR void VKAPI_CALL
13889 radv_CmdSetCoverageToColorEnableNV(VkCommandBuffer commandBuffer, VkBool32 coverageToColorEnable)
13890 {
13891    unreachable("Not supported by RADV.");
13892 }
13893 
13894 VKAPI_ATTR void VKAPI_CALL
13895 radv_CmdSetCoverageToColorLocationNV(VkCommandBuffer commandBuffer, uint32_t coverageToColorLocation)
13896 {
13897    unreachable("Not supported by RADV.");
13898 }
13899 
13900 VKAPI_ATTR void VKAPI_CALL
13901 radv_CmdSetRepresentativeFragmentTestEnableNV(VkCommandBuffer commandBuffer, VkBool32 representativeFragmentTestEnable)
13902 {
13903    unreachable("Not supported by RADV.");
13904 }
13905 
13906 VKAPI_ATTR void VKAPI_CALL
13907 radv_CmdSetShadingRateImageEnableNV(VkCommandBuffer commandBuffer, VkBool32 shadingRateImageEnable)
13908 {
13909    unreachable("Not supported by RADV.");
13910 }
13911 
13912 VKAPI_ATTR void VKAPI_CALL
13913 radv_CmdSetViewportSwizzleNV(VkCommandBuffer commandBuffer, uint32_t firstViewport, uint32_t viewportCount,
13914                              const VkViewportSwizzleNV *pViewportSwizzles)
13915 {
13916    unreachable("Not supported by RADV.");
13917 }
13918 
13919 VKAPI_ATTR void VKAPI_CALL
13920 radv_CmdSetViewportWScalingEnableNV(VkCommandBuffer commandBuffer, VkBool32 viewportWScalingEnable)
13921 {
13922    unreachable("Not supported by RADV.");
13923 }
13924