xref: /aosp_15_r20/external/mesa3d/src/asahi/vulkan/hk_cmd_draw.c (revision 6104692788411f58d303aa86923a9ff6ecaded22)
1 /*
2  * Copyright 2024 Valve Corporation
3  * Copyright 2024 Alyssa Rosenzweig
4  * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc.
5  * SPDX-License-Identifier: MIT
6  */
7 #include <assert.h>
8 #include "agx_bg_eot.h"
9 #include "agx_bo.h"
10 #include "agx_compile.h"
11 #include "agx_compiler.h"
12 #include "agx_device.h"
13 #include "agx_helpers.h"
14 #include "agx_linker.h"
15 #include "agx_nir_lower_gs.h"
16 #include "agx_nir_lower_vbo.h"
17 #include "agx_ppp.h"
18 #include "agx_tilebuffer.h"
19 #include "agx_usc.h"
20 #include "agx_uvs.h"
21 #include "hk_buffer.h"
22 #include "hk_cmd_buffer.h"
23 #include "hk_device.h"
24 #include "hk_entrypoints.h"
25 #include "hk_image.h"
26 #include "hk_image_view.h"
27 #include "hk_physical_device.h"
28 #include "hk_private.h"
29 #include "hk_shader.h"
30 
31 #include "asahi/genxml/agx_pack.h"
32 #include "asahi/lib/libagx_shaders.h"
33 #include "asahi/lib/shaders/draws.h"
34 #include "asahi/lib/shaders/geometry.h"
35 #include "shaders/query.h"
36 #include "shaders/tessellator.h"
37 #include "util/bitpack_helpers.h"
38 #include "util/blend.h"
39 #include "util/format/format_utils.h"
40 #include "util/format/u_formats.h"
41 #include "util/macros.h"
42 #include "util/ralloc.h"
43 #include "vulkan/vulkan_core.h"
44 #include "layout.h"
45 #include "nir.h"
46 #include "nir_builder.h"
47 #include "nir_lower_blend.h"
48 #include "nir_xfb_info.h"
49 #include "pool.h"
50 #include "shader_enums.h"
51 #include "vk_blend.h"
52 #include "vk_enum_to_str.h"
53 #include "vk_format.h"
54 #include "vk_graphics_state.h"
55 #include "vk_pipeline.h"
56 #include "vk_render_pass.h"
57 #include "vk_standard_sample_locations.h"
58 #include "vk_util.h"
59 
60 #define IS_DIRTY(bit) BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_##bit)
61 
62 #define IS_SHADER_DIRTY(bit)                                                   \
63    (cmd->state.gfx.shaders_dirty & BITFIELD_BIT(MESA_SHADER_##bit))
64 
65 #define IS_LINKED_DIRTY(bit)                                                   \
66    (cmd->state.gfx.linked_dirty & BITFIELD_BIT(MESA_SHADER_##bit))
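
/* Illustrative use of the dirty-tracking macros above (added sketch, not part
 * of the original file): they assume locals named `dyn` and `cmd` in the
 * calling function, e.g.
 *
 *    if (IS_DIRTY(IA_PRIMITIVE_TOPOLOGY) || IS_SHADER_DIRTY(GEOMETRY)) {
 *       // re-emit topology-dependent state here
 *    }
 */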
67 
68 struct hk_draw {
69    struct hk_grid b;
70    struct hk_addr_range index;
71    bool indexed;
72    uint32_t start;
73    uint32_t index_bias;
74    uint32_t start_instance;
75 
76    /* Indicates that the indirect draw consists of raw VDM commands and should
77     * be stream linked to. Used to accelerate tessellation.
78     */
79    bool raw;
80 
81    /* Set within hk_draw() but here so geometry/tessellation can override */
82    bool restart;
83    enum agx_index_size index_size;
84 };
85 
86 static struct hk_draw
87 hk_draw_indirect(uint64_t ptr)
88 {
89    return (struct hk_draw){.b = hk_grid_indirect(ptr)};
90 }
91 
92 static struct hk_draw
93 hk_draw_indexed_indirect(uint64_t ptr, struct hk_addr_range index,
94                          enum agx_index_size index_size, bool restart)
95 {
96    return (struct hk_draw){
97       .b = hk_grid_indirect(ptr),
98       .index = index,
99       .indexed = true,
100       .index_size = index_size,
101       .restart = restart,
102    };
103 }
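
/* Added sketch, not part of the original file: a direct indexed draw can be
 * routed through the indirect path by uploading a 5-dword descriptor
 * (indexCount, instanceCount, firstIndex, vertexOffset, firstInstance),
 * mirroring what hk_draw_without_restart() does further down. The variables
 * used below (index_count, instance_count, first_index, vertex_offset,
 * first_instance, index_range, index_size) are placeholders.
 */
#if 0
   uint32_t desc[5] = {index_count, instance_count, first_index, vertex_offset,
                       first_instance};

   struct hk_draw d = hk_draw_indexed_indirect(
      hk_pool_upload(cmd, desc, sizeof(desc), 4), index_range, index_size,
      false /* no primitive restart */);
#endif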
104 
105 /* XXX: deduplicate */
106 static inline enum mesa_prim
107 vk_conv_topology(VkPrimitiveTopology topology)
108 {
109    switch (topology) {
110    case VK_PRIMITIVE_TOPOLOGY_POINT_LIST:
111       return MESA_PRIM_POINTS;
112    case VK_PRIMITIVE_TOPOLOGY_LINE_LIST:
113       return MESA_PRIM_LINES;
114    case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP:
115       return MESA_PRIM_LINE_STRIP;
116    case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST:
117 #pragma GCC diagnostic push
118 #pragma GCC diagnostic ignored "-Wswitch"
119    case VK_PRIMITIVE_TOPOLOGY_META_RECT_LIST_MESA:
120 #pragma GCC diagnostic pop
121       return MESA_PRIM_TRIANGLES;
122    case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP:
123       return MESA_PRIM_TRIANGLE_STRIP;
124    case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN:
125       return MESA_PRIM_TRIANGLE_FAN;
126    case VK_PRIMITIVE_TOPOLOGY_LINE_LIST_WITH_ADJACENCY:
127       return MESA_PRIM_LINES_ADJACENCY;
128    case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP_WITH_ADJACENCY:
129       return MESA_PRIM_LINE_STRIP_ADJACENCY;
130    case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST_WITH_ADJACENCY:
131       return MESA_PRIM_TRIANGLES_ADJACENCY;
132    case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP_WITH_ADJACENCY:
133       return MESA_PRIM_TRIANGLE_STRIP_ADJACENCY;
134    case VK_PRIMITIVE_TOPOLOGY_PATCH_LIST:
135       return MESA_PRIM_PATCHES;
136    default:
137       unreachable("invalid");
138    }
139 }
140 
141 static void
142 hk_cmd_buffer_dirty_render_pass(struct hk_cmd_buffer *cmd)
143 {
144    struct vk_dynamic_graphics_state *dyn = &cmd->vk.dynamic_graphics_state;
145 
146    /* These depend on color attachment count */
147    BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_CB_COLOR_WRITE_ENABLES);
148    BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_CB_BLEND_ENABLES);
149    BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_CB_BLEND_EQUATIONS);
150    BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_CB_WRITE_MASKS);
151 
152    /* These depend on the depth/stencil format */
153    BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_TEST_ENABLE);
154    BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_WRITE_ENABLE);
155    BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_BOUNDS_TEST_ENABLE);
156    BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_TEST_ENABLE);
157    BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_RS_DEPTH_BIAS_FACTORS);
158 
159    /* This may depend on render targets for ESO */
160    BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_MS_RASTERIZATION_SAMPLES);
161 }
162 
163 void
164 hk_cmd_buffer_begin_graphics(struct hk_cmd_buffer *cmd,
165                              const VkCommandBufferBeginInfo *pBeginInfo)
166 {
167    if (cmd->vk.level != VK_COMMAND_BUFFER_LEVEL_PRIMARY &&
168        (pBeginInfo->flags & VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT)) {
169       char gcbiar_data[VK_GCBIARR_DATA_SIZE(HK_MAX_RTS)];
170       const VkRenderingInfo *resume_info =
171          vk_get_command_buffer_inheritance_as_rendering_resume(
172             cmd->vk.level, pBeginInfo, gcbiar_data);
173       if (resume_info) {
174          hk_CmdBeginRendering(hk_cmd_buffer_to_handle(cmd), resume_info);
175       } else {
176          const VkCommandBufferInheritanceRenderingInfo *inheritance_info =
177             vk_get_command_buffer_inheritance_rendering_info(cmd->vk.level,
178                                                              pBeginInfo);
179          assert(inheritance_info);
180 
181          struct hk_rendering_state *render = &cmd->state.gfx.render;
182          render->flags = inheritance_info->flags;
183          render->area = (VkRect2D){};
184          render->layer_count = 0;
185          render->view_mask = inheritance_info->viewMask;
186          render->tilebuffer.nr_samples = inheritance_info->rasterizationSamples;
187 
188          render->color_att_count = inheritance_info->colorAttachmentCount;
189          for (uint32_t i = 0; i < render->color_att_count; i++) {
190             render->color_att[i].vk_format =
191                inheritance_info->pColorAttachmentFormats[i];
192          }
193          render->depth_att.vk_format = inheritance_info->depthAttachmentFormat;
194          render->stencil_att.vk_format =
195             inheritance_info->stencilAttachmentFormat;
196 
197          hk_cmd_buffer_dirty_render_pass(cmd);
198       }
199    }
200 
201    hk_cmd_buffer_dirty_all(cmd);
202 
203    /* If multiview is disabled, always read 0. If multiview is enabled,
204     * hk_set_view_index will dirty the root each draw.
205     */
206    cmd->state.gfx.descriptors.root.draw.view_index = 0;
207    cmd->state.gfx.descriptors.root_dirty = true;
208 }
209 
210 void
211 hk_cmd_invalidate_graphics_state(struct hk_cmd_buffer *cmd)
212 {
213    hk_cmd_buffer_dirty_all(cmd);
214 
215    /* From the Vulkan 1.3.275 spec:
216     *
217     *    "...There is one exception to this rule - if the primary command
218     *    buffer is inside a render pass instance, then the render pass and
219     *    subpass state is not disturbed by executing secondary command
220     *    buffers."
221     *
222     * We need to reset everything EXCEPT the render pass state.
223     */
224    struct hk_rendering_state render_save = cmd->state.gfx.render;
225    memset(&cmd->state.gfx, 0, sizeof(cmd->state.gfx));
226    cmd->state.gfx.render = render_save;
227 }
228 
229 static void
230 hk_attachment_init(struct hk_attachment *att,
231                    const VkRenderingAttachmentInfo *info)
232 {
233    if (info == NULL || info->imageView == VK_NULL_HANDLE) {
234       *att = (struct hk_attachment){
235          .iview = NULL,
236       };
237       return;
238    }
239 
240    VK_FROM_HANDLE(hk_image_view, iview, info->imageView);
241    *att = (struct hk_attachment){
242       .vk_format = iview->vk.format,
243       .iview = iview,
244    };
245 
246    if (info->resolveMode != VK_RESOLVE_MODE_NONE) {
247       VK_FROM_HANDLE(hk_image_view, res_iview, info->resolveImageView);
248       att->resolve_mode = info->resolveMode;
249       att->resolve_iview = res_iview;
250    }
251 }
252 
253 VKAPI_ATTR void VKAPI_CALL
254 hk_GetRenderingAreaGranularityKHR(
255    VkDevice device, const VkRenderingAreaInfoKHR *pRenderingAreaInfo,
256    VkExtent2D *pGranularity)
257 {
258    *pGranularity = (VkExtent2D){.width = 1, .height = 1};
259 }
260 
261 static struct hk_bg_eot
262 hk_build_bg_eot(struct hk_cmd_buffer *cmd, const VkRenderingInfo *info,
263                 bool store, bool partial_render, bool incomplete_render_area)
264 {
265    struct hk_device *dev = hk_cmd_buffer_device(cmd);
266    struct hk_rendering_state *render = &cmd->state.gfx.render;
267 
268    /* Construct the key */
269    struct agx_bg_eot_key key = {.tib = render->tilebuffer};
270    static_assert(AGX_BG_EOT_NONE == 0, "default initializer");
271 
272    key.tib.layered = (render->cr.layers > 1);
273 
274    bool needs_textures_for_spilled_rts =
275       agx_tilebuffer_spills(&render->tilebuffer) && !partial_render && !store;
276 
277    for (unsigned i = 0; i < info->colorAttachmentCount; ++i) {
278       const VkRenderingAttachmentInfo *att_info = &info->pColorAttachments[i];
279       if (att_info->imageView == VK_NULL_HANDLE)
280          continue;
281 
282       /* Partial render programs exist only to store/load the tilebuffer to
283        * main memory. When render targets are already spilled to main memory,
284        * there's nothing to do.
285        */
286       if (key.tib.spilled[i] && (partial_render || store))
287          continue;
288 
289       if (store) {
290          bool store = att_info->storeOp == VK_ATTACHMENT_STORE_OP_STORE;
291 
292          /* When resolving, we store the intermediate multisampled image as the
293           * resolve is a separate control stream. This could be optimized.
294           */
295          store |= att_info->resolveMode != VK_RESOLVE_MODE_NONE;
296 
297          /* Partial renders always need to flush to memory. */
298          store |= partial_render;
299 
300          if (store)
301             key.op[i] = AGX_EOT_STORE;
302       } else {
303          bool load = att_info->loadOp == VK_ATTACHMENT_LOAD_OP_LOAD;
304          bool clear = att_info->loadOp == VK_ATTACHMENT_LOAD_OP_CLEAR;
305 
306          /* The background program used for partial renders must always load
307           * whatever was stored in the mid-frame end-of-tile program.
308           */
309          load |= partial_render;
310 
311          /* With an incomplete render area, we're forced to load back tiles and
312           * then use the 3D pipe for the clear.
313           */
314          load |= incomplete_render_area;
315 
316          /* Don't read back spilled render targets, they're already in memory */
317          load &= !key.tib.spilled[i];
318 
319          /* Don't apply clears for spilled render targets when we clear the
320           * render area explicitly after.
321           */
322          if (key.tib.spilled[i] && incomplete_render_area)
323             continue;
324 
325          if (load)
326             key.op[i] = AGX_BG_LOAD;
327          else if (clear)
328             key.op[i] = AGX_BG_CLEAR;
329       }
330    }
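
   /* Worked example (added, illustrative): with a complete render area and no
    * spilled render targets, a colour attachment using loadOp = CLEAR and
    * storeOp = STORE ends up with key.op[i] = AGX_BG_CLEAR when building the
    * background program (store = false) and key.op[i] = AGX_EOT_STORE when
    * building the end-of-tile program (store = true).
    */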
331 
332    /* Begin building the pipeline */
333    size_t usc_size = agx_usc_size(3 + HK_MAX_RTS);
334    struct agx_ptr t = hk_pool_usc_alloc(cmd, usc_size, 64);
335    if (!t.cpu)
336       return (struct hk_bg_eot){.usc = t.gpu};
337 
338    struct agx_usc_builder b = agx_usc_builder(t.cpu, usc_size);
339 
340    bool uses_txf = false;
341    unsigned uniforms = 0;
342    unsigned nr_tex = 0;
343 
344    for (unsigned rt = 0; rt < HK_MAX_RTS; ++rt) {
345       const VkRenderingAttachmentInfo *att_info = &info->pColorAttachments[rt];
346       struct hk_image_view *iview = render->color_att[rt].iview;
347 
348       if (key.op[rt] == AGX_BG_LOAD) {
349          uses_txf = true;
350 
351          uint32_t index = key.tib.layered
352                              ? iview->planes[0].layered_background_desc_index
353                              : iview->planes[0].background_desc_index;
354 
355          agx_usc_pack(&b, TEXTURE, cfg) {
356             /* Shifted to match eMRT indexing, could be optimized */
357             cfg.start = rt * 2;
358             cfg.count = 1;
359             cfg.buffer = dev->images.bo->va->addr + index * AGX_TEXTURE_LENGTH;
360          }
361 
362          nr_tex = (rt * 2) + 1;
363       } else if (key.op[rt] == AGX_BG_CLEAR) {
364          static_assert(sizeof(att_info->clearValue.color) == 16, "fixed ABI");
365          uint64_t colour =
366             hk_pool_upload(cmd, &att_info->clearValue.color, 16, 16);
367 
368          agx_usc_uniform(&b, 4 + (8 * rt), 8, colour);
369          uniforms = MAX2(uniforms, 4 + (8 * rt) + 8);
370       } else if (key.op[rt] == AGX_EOT_STORE) {
371          uint32_t index = key.tib.layered
372                              ? iview->planes[0].layered_eot_pbe_desc_index
373                              : iview->planes[0].eot_pbe_desc_index;
374 
375          agx_usc_pack(&b, TEXTURE, cfg) {
376             cfg.start = rt;
377             cfg.count = 1;
378             cfg.buffer = dev->images.bo->va->addr + index * AGX_TEXTURE_LENGTH;
379          }
380 
381          nr_tex = rt + 1;
382       }
383    }
384 
385    if (needs_textures_for_spilled_rts) {
386       hk_usc_upload_spilled_rt_descs(&b, cmd);
387       uniforms = MAX2(uniforms, 4);
388    }
389 
390    if (uses_txf) {
391       agx_usc_push_packed(&b, SAMPLER, dev->rodata.txf_sampler);
392    }
393 
394    /* For attachmentless rendering, we don't know the sample count until
395     * draw-time. But we have trivial bg/eot programs in that case too.
396     */
397    if (key.tib.nr_samples >= 1) {
398       agx_usc_push_packed(&b, SHARED, &key.tib.usc);
399    } else {
400       assert(key.tib.sample_size_B == 0);
401       agx_usc_shared_none(&b);
402 
403       key.tib.nr_samples = 1;
404    }
405 
406    /* Get the shader */
407    key.reserved_preamble = uniforms;
408    /* XXX: locking? */
409    struct agx_bg_eot_shader *shader = agx_get_bg_eot_shader(&dev->bg_eot, &key);
410 
411    agx_usc_pack(&b, SHADER, cfg) {
412       cfg.code = agx_usc_addr(&dev->dev, shader->ptr);
413       cfg.unk_2 = 0;
414    }
415 
416    agx_usc_pack(&b, REGISTERS, cfg)
417       cfg.register_count = shader->info.nr_gprs;
418 
419    if (shader->info.has_preamble) {
420       agx_usc_pack(&b, PRESHADER, cfg) {
421          cfg.code =
422             agx_usc_addr(&dev->dev, shader->ptr + shader->info.preamble_offset);
423       }
424    } else {
425       agx_usc_pack(&b, NO_PRESHADER, cfg)
426          ;
427    }
428 
429    struct hk_bg_eot ret = {.usc = t.gpu};
430 
431    agx_pack(&ret.counts, COUNTS, cfg) {
432       cfg.uniform_register_count = shader->info.push_count;
433       cfg.preshader_register_count = shader->info.nr_preamble_gprs;
434       cfg.texture_state_register_count = nr_tex;
435       cfg.sampler_state_register_count =
436          agx_translate_sampler_state_count(uses_txf ? 1 : 0, false);
437    }
438 
439    return ret;
440 }
441 
442 static bool
443 is_aligned(unsigned x, unsigned pot_alignment)
444 {
445    assert(util_is_power_of_two_nonzero(pot_alignment));
446    return (x & (pot_alignment - 1)) == 0;
447 }
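
/* Example (added, illustrative): assuming a 16 KiB page size,
 * is_aligned(0x8000, AIL_PAGESIZE) is true since (0x8000 & 0x3fff) == 0,
 * while is_aligned(0x8010, AIL_PAGESIZE) is false. The power-of-two assert is
 * what makes the mask trick valid.
 */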
448 
449 static void
450 hk_merge_render_iview(struct hk_rendering_state *render,
451                       struct hk_image_view *iview)
452 {
453    if (iview) {
454       unsigned samples = iview->vk.image->samples;
455       /* TODO: is this right for ycbcr? */
456       unsigned level = iview->vk.base_mip_level;
457       unsigned width = u_minify(iview->vk.image->extent.width, level);
458       unsigned height = u_minify(iview->vk.image->extent.height, level);
459 
460       assert(render->tilebuffer.nr_samples == 0 ||
461              render->tilebuffer.nr_samples == samples);
462       render->tilebuffer.nr_samples = samples;
463 
464       /* TODO: Is this merging logic sound? Not sure how this is supposed to
465        * work conceptually.
466        */
467       render->cr.width = MAX2(render->cr.width, width);
468       render->cr.height = MAX2(render->cr.height, height);
469    }
470 }
471 
472 static void
473 hk_pack_zls_control(struct agx_zls_control_packed *packed,
474                     struct ail_layout *z_layout, struct ail_layout *s_layout,
475                     const VkRenderingAttachmentInfo *attach_z,
476                     const VkRenderingAttachmentInfo *attach_s,
477                     bool incomplete_render_area, bool partial_render)
478 {
479    agx_pack(packed, ZLS_CONTROL, zls_control) {
480       if (z_layout) {
481          zls_control.z_store_enable =
482             attach_z->storeOp == VK_ATTACHMENT_STORE_OP_STORE ||
483             attach_z->resolveMode != VK_RESOLVE_MODE_NONE || partial_render;
484 
485          zls_control.z_load_enable =
486             attach_z->loadOp == VK_ATTACHMENT_LOAD_OP_LOAD || partial_render ||
487             incomplete_render_area;
488 
489          if (ail_is_compressed(z_layout)) {
490             zls_control.z_compress_1 = true;
491             zls_control.z_compress_2 = true;
492          }
493 
494          if (z_layout->format == PIPE_FORMAT_Z16_UNORM) {
495             zls_control.z_format = AGX_ZLS_FORMAT_16;
496          } else {
497             zls_control.z_format = AGX_ZLS_FORMAT_32F;
498          }
499       }
500 
501       if (s_layout) {
502          /* TODO:
503           * dEQP-VK.renderpass.dedicated_allocation.formats.d32_sfloat_s8_uint.input.dont_care.store.self_dep_clear_draw_use_input_aspect
504           * fails without forcing the store
505           * below, maybe due to a VkRenderPass
506           * emulation bug.
507           */
508          zls_control.s_store_enable =
509             attach_s->storeOp == VK_ATTACHMENT_STORE_OP_STORE ||
510             attach_s->resolveMode != VK_RESOLVE_MODE_NONE || partial_render ||
511             true;
512 
513          zls_control.s_load_enable =
514             attach_s->loadOp == VK_ATTACHMENT_LOAD_OP_LOAD || partial_render ||
515             incomplete_render_area;
516 
517          if (ail_is_compressed(s_layout)) {
518             zls_control.s_compress_1 = true;
519             zls_control.s_compress_2 = true;
520          }
521       }
522    }
523 }
524 
525 VKAPI_ATTR void VKAPI_CALL
526 hk_CmdBeginRendering(VkCommandBuffer commandBuffer,
527                      const VkRenderingInfo *pRenderingInfo)
528 {
529    VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer);
530    struct hk_rendering_state *render = &cmd->state.gfx.render;
531 
532    memset(render, 0, sizeof(*render));
533 
534    render->flags = pRenderingInfo->flags;
535    render->area = pRenderingInfo->renderArea;
536    render->view_mask = pRenderingInfo->viewMask;
537    render->layer_count = pRenderingInfo->layerCount;
538    render->tilebuffer.nr_samples = 0;
539 
540    const uint32_t layer_count = render->view_mask
541                                    ? util_last_bit(render->view_mask)
542                                    : render->layer_count;
543 
544    render->color_att_count = pRenderingInfo->colorAttachmentCount;
545    for (uint32_t i = 0; i < render->color_att_count; i++) {
546       hk_attachment_init(&render->color_att[i],
547                          &pRenderingInfo->pColorAttachments[i]);
548    }
549 
550    hk_attachment_init(&render->depth_att, pRenderingInfo->pDepthAttachment);
551    hk_attachment_init(&render->stencil_att, pRenderingInfo->pStencilAttachment);
552 
553    for (uint32_t i = 0; i < render->color_att_count; i++) {
554       hk_merge_render_iview(render, render->color_att[i].iview);
555    }
556 
557    hk_merge_render_iview(render,
558                          render->depth_att.iview ?: render->stencil_att.iview);
559 
560    /* Infer for attachmentless. samples is inferred at draw-time. */
561    render->cr.width =
562       MAX2(render->cr.width, render->area.offset.x + render->area.extent.width);
563 
564    render->cr.height = MAX2(render->cr.height,
565                             render->area.offset.y + render->area.extent.height);
566 
567    render->cr.layers = layer_count;
568 
569    /* Choose a tilebuffer layout given the framebuffer key */
570    enum pipe_format formats[HK_MAX_RTS] = {0};
571    for (unsigned i = 0; i < render->color_att_count; ++i) {
572       formats[i] = vk_format_to_pipe_format(render->color_att[i].vk_format);
573    }
574 
575    /* For now, we force layered=true since it makes compatibility problems way
576     * easier.
577     */
578    render->tilebuffer = agx_build_tilebuffer_layout(
579       formats, render->color_att_count, render->tilebuffer.nr_samples, true);
580 
581    hk_cmd_buffer_dirty_render_pass(cmd);
582 
583    /* Determine whether the render area is complete, enabling us to use a
584     * fast-clear.
585     *
586     * TODO: If it is incomplete but tile aligned, it should be possible to fast
587     * clear with the appropriate settings. This is critical for performance.
588     */
589    bool incomplete_render_area =
590       render->area.offset.x > 0 || render->area.offset.y > 0 ||
591       render->area.extent.width < render->cr.width ||
592       render->area.extent.height < render->cr.height ||
593       (render->view_mask &&
594        render->view_mask != BITFIELD64_MASK(render->cr.layers));
595 
596    render->cr.bg.main = hk_build_bg_eot(cmd, pRenderingInfo, false, false,
597                                         incomplete_render_area);
598    render->cr.bg.partial =
599       hk_build_bg_eot(cmd, pRenderingInfo, false, true, incomplete_render_area);
600 
601    render->cr.eot.main =
602       hk_build_bg_eot(cmd, pRenderingInfo, true, false, incomplete_render_area);
603    render->cr.eot.partial = render->cr.eot.main;
604 
605    render->cr.isp_bgobjvals = 0x300;
606 
607    const VkRenderingAttachmentInfo *attach_z = pRenderingInfo->pDepthAttachment;
608    const VkRenderingAttachmentInfo *attach_s =
609       pRenderingInfo->pStencilAttachment;
610 
611    render->cr.iogpu_unk_214 = 0xc000;
612 
613    struct ail_layout *z_layout = NULL, *s_layout = NULL;
614 
615    if (attach_z != NULL && attach_z != VK_NULL_HANDLE && attach_z->imageView) {
616       struct hk_image_view *view = render->depth_att.iview;
617       struct hk_image *image =
618          container_of(view->vk.image, struct hk_image, vk);
619 
620       z_layout = &image->planes[0].layout;
621 
622       unsigned level = view->vk.base_mip_level;
623       unsigned first_layer = view->vk.base_array_layer;
624 
625       const struct util_format_description *desc =
626          util_format_description(vk_format_to_pipe_format(view->vk.format));
627 
628       assert(desc->format == PIPE_FORMAT_Z32_FLOAT ||
629              desc->format == PIPE_FORMAT_Z16_UNORM ||
630              desc->format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT);
631 
632       render->cr.depth.buffer =
633          hk_image_base_address(image, 0) +
634          ail_get_layer_level_B(z_layout, first_layer, level);
635 
636       /* Main stride in pages */
637       assert((z_layout->depth_px == 1 ||
638               is_aligned(z_layout->layer_stride_B, AIL_PAGESIZE)) &&
639              "Page aligned Z layers");
640 
641       unsigned stride_pages = z_layout->layer_stride_B / AIL_PAGESIZE;
642       render->cr.depth.stride = ((stride_pages - 1) << 14) | 1;
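
      /* Worked example (added, illustrative): if each depth layer occupies
       * exactly four pages (layer_stride_B == 4 * AIL_PAGESIZE), then
       * stride_pages = 4 and the packed value is ((4 - 1) << 14) | 1 = 0xc001,
       * i.e. "pages minus one" in the upper bits with the low bit set.
       */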
643 
644       assert(z_layout->tiling != AIL_TILING_LINEAR && "must tile");
645 
646       if (ail_is_compressed(z_layout)) {
647          render->cr.depth.meta =
648             hk_image_base_address(image, 0) + z_layout->metadata_offset_B +
649             (first_layer * z_layout->compression_layer_stride_B) +
650             z_layout->level_offsets_compressed_B[level];
651 
652          /* Meta stride in cache lines */
653          assert(
654             is_aligned(z_layout->compression_layer_stride_B, AIL_CACHELINE) &&
655             "Cacheline aligned Z meta layers");
656 
657          unsigned stride_lines =
658             z_layout->compression_layer_stride_B / AIL_CACHELINE;
659          render->cr.depth.meta_stride = (stride_lines - 1) << 14;
660       }
661 
662       float clear_depth = attach_z->clearValue.depthStencil.depth;
663 
664       if (z_layout->format == PIPE_FORMAT_Z16_UNORM) {
665          render->cr.isp_bgobjdepth = _mesa_float_to_unorm(clear_depth, 16);
666          render->cr.iogpu_unk_214 |= 0x40000;
667       } else {
668          render->cr.isp_bgobjdepth = fui(clear_depth);
669       }
670    }
671 
672    if (attach_s != NULL && attach_s != VK_NULL_HANDLE && attach_s->imageView) {
673       struct hk_image_view *view = render->stencil_att.iview;
674       struct hk_image *image =
675          container_of(view->vk.image, struct hk_image, vk);
676 
677       /* Stencil is always the last plane (possibly the only plane) */
678       unsigned plane = image->plane_count - 1;
679       s_layout = &image->planes[plane].layout;
680       assert(s_layout->format == PIPE_FORMAT_S8_UINT);
681 
682       unsigned level = view->vk.base_mip_level;
683       unsigned first_layer = view->vk.base_array_layer;
684 
685       render->cr.stencil.buffer =
686          hk_image_base_address(image, plane) +
687          ail_get_layer_level_B(s_layout, first_layer, level);
688 
689       /* Main stride in pages */
690       assert((s_layout->depth_px == 1 ||
691               is_aligned(s_layout->layer_stride_B, AIL_PAGESIZE)) &&
692              "Page aligned S layers");
693       unsigned stride_pages = s_layout->layer_stride_B / AIL_PAGESIZE;
694       render->cr.stencil.stride = ((stride_pages - 1) << 14) | 1;
695 
696       if (ail_is_compressed(s_layout)) {
697          render->cr.stencil.meta =
698             hk_image_base_address(image, plane) + s_layout->metadata_offset_B +
699             (first_layer * s_layout->compression_layer_stride_B) +
700             s_layout->level_offsets_compressed_B[level];
701 
702          /* Meta stride in cache lines */
703          assert(
704             is_aligned(s_layout->compression_layer_stride_B, AIL_CACHELINE) &&
705             "Cacheline aligned S meta layers");
706 
707          unsigned stride_lines =
708             s_layout->compression_layer_stride_B / AIL_CACHELINE;
709 
710          render->cr.stencil.meta_stride = (stride_lines - 1) << 14;
711       }
712 
713       render->cr.isp_bgobjvals |= attach_s->clearValue.depthStencil.stencil;
714    }
715 
716    hk_pack_zls_control(&render->cr.zls_control, z_layout, s_layout, attach_z,
717                        attach_s, incomplete_render_area, false);
718 
719    hk_pack_zls_control(&render->cr.zls_control_partial, z_layout, s_layout,
720                        attach_z, attach_s, incomplete_render_area, true);
721 
722    /* If multiview is disabled, always read 0. If multiview is enabled,
723     * hk_set_view_index will dirty the root each draw.
724     */
725    cmd->state.gfx.descriptors.root.draw.view_index = 0;
726    cmd->state.gfx.descriptors.root_dirty = true;
727 
728    if (render->flags & VK_RENDERING_RESUMING_BIT)
729       return;
730 
731    /* The first control stream of the render pass is special since it gets
732     * the clears. Create it and swap in the clear.
733     */
734    assert(!cmd->current_cs.gfx && "not already in a render pass");
735    struct hk_cs *cs = hk_cmd_buffer_get_cs(cmd, false /* compute */);
736    if (!cs)
737       return;
738 
739    cs->cr.bg.main = render->cr.bg.main;
740    cs->cr.zls_control = render->cr.zls_control;
741 
742    /* Reordering barrier for post-gfx, in case we had any. */
743    hk_cmd_buffer_end_compute_internal(&cmd->current_cs.post_gfx);
744 
745    /* Don't reorder compute across render passes.
746     *
747     * TODO: Check if this is necessary if the proper PipelineBarriers are
748     * handled... there may be CTS bugs...
749     */
750    hk_cmd_buffer_end_compute(cmd);
751 
752    /* If we spill colour attachments, we need to decompress them. This happens
753     * at the start of the render; it is not re-emitted when resuming
754     * secondaries. It could be hoisted to the start of the command buffer but
755     * we're not that clever yet.
756     */
757    if (agx_tilebuffer_spills(&render->tilebuffer)) {
758       struct hk_device *dev = hk_cmd_buffer_device(cmd);
759 
760       perf_debug(dev, "eMRT render pass");
761 
762       for (unsigned i = 0; i < render->color_att_count; ++i) {
763          struct hk_image_view *view = render->color_att[i].iview;
764          if (view) {
765             struct hk_image *image =
766                container_of(view->vk.image, struct hk_image, vk);
767 
768             /* TODO: YCbCr interaction? */
769             uint8_t plane = 0;
770             uint8_t image_plane = view->planes[plane].image_plane;
771             struct ail_layout *layout = &image->planes[image_plane].layout;
772 
773             if (ail_is_level_compressed(layout, view->vk.base_mip_level)) {
774                struct hk_device *dev = hk_cmd_buffer_device(cmd);
775                perf_debug(dev, "Decompressing in-place");
776 
777                struct hk_cs *cs = hk_cmd_buffer_get_cs_general(
778                   cmd, &cmd->current_cs.pre_gfx, true);
779                if (!cs)
780                   return;
781 
782                unsigned level = view->vk.base_mip_level;
783 
784                struct agx_ptr data =
785                   hk_pool_alloc(cmd, sizeof(struct libagx_decompress_push), 64);
786                struct libagx_decompress_push *push = data.cpu;
787                agx_fill_decompress_push(
788                   push, layout, view->vk.base_array_layer, level,
789                   hk_image_base_address(image, image_plane));
790 
791                push->compressed = view->planes[plane].emrt_texture;
792                push->uncompressed = view->planes[plane].emrt_pbe;
793 
794                struct hk_grid grid =
795                   hk_grid(ail_metadata_width_tl(layout, level) * 32,
796                           ail_metadata_height_tl(layout, level), layer_count);
797 
798                struct agx_decompress_key key = {
799                   .nr_samples = layout->sample_count_sa,
800                };
801 
802                struct hk_shader *s =
803                   hk_meta_kernel(dev, agx_nir_decompress, &key, sizeof(key));
804 
805                uint32_t usc = hk_upload_usc_words_kernel(cmd, s, &data.gpu, 8);
806                hk_dispatch_with_usc(dev, cs, s, usc, grid, hk_grid(32, 1, 1));
807             }
808          }
809       }
810    }
811 
812    if (incomplete_render_area) {
813       uint32_t clear_count = 0;
814       VkClearAttachment clear_att[HK_MAX_RTS + 1];
815       for (uint32_t i = 0; i < pRenderingInfo->colorAttachmentCount; i++) {
816          const VkRenderingAttachmentInfo *att_info =
817             &pRenderingInfo->pColorAttachments[i];
818          if (att_info->imageView == VK_NULL_HANDLE ||
819              att_info->loadOp != VK_ATTACHMENT_LOAD_OP_CLEAR)
820             continue;
821 
822          clear_att[clear_count++] = (VkClearAttachment){
823             .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT,
824             .colorAttachment = i,
825             .clearValue = att_info->clearValue,
826          };
827       }
828 
829       clear_att[clear_count] = (VkClearAttachment){
830          .aspectMask = 0,
831       };
832 
833       if (attach_z && attach_z->imageView != VK_NULL_HANDLE &&
834           attach_z->loadOp == VK_ATTACHMENT_LOAD_OP_CLEAR) {
835          clear_att[clear_count].aspectMask |= VK_IMAGE_ASPECT_DEPTH_BIT;
836          clear_att[clear_count].clearValue.depthStencil.depth =
837             attach_z->clearValue.depthStencil.depth;
838       }
839 
840       if (attach_s != NULL && attach_s->imageView != VK_NULL_HANDLE &&
841           attach_s->loadOp == VK_ATTACHMENT_LOAD_OP_CLEAR) {
842          clear_att[clear_count].aspectMask |= VK_IMAGE_ASPECT_STENCIL_BIT;
843          clear_att[clear_count].clearValue.depthStencil.stencil =
844             attach_s->clearValue.depthStencil.stencil;
845       }
846 
847       if (clear_att[clear_count].aspectMask != 0)
848          clear_count++;
849 
850       if (clear_count > 0) {
851          const VkClearRect clear_rect = {
852             .rect = render->area,
853             .baseArrayLayer = 0,
854             .layerCount = render->view_mask ? 1 : render->layer_count,
855          };
856 
857          hk_CmdClearAttachments(hk_cmd_buffer_to_handle(cmd), clear_count,
858                                 clear_att, 1, &clear_rect);
859       }
860    }
861 }
862 
863 VKAPI_ATTR void VKAPI_CALL
864 hk_CmdEndRendering(VkCommandBuffer commandBuffer)
865 {
866    VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer);
867    struct hk_rendering_state *render = &cmd->state.gfx.render;
868 
869    hk_cmd_buffer_end_graphics(cmd);
870 
871    bool need_resolve = false;
872 
873    /* Translate render state back to VK for meta */
874    VkRenderingAttachmentInfo vk_color_att[HK_MAX_RTS];
875    for (uint32_t i = 0; i < render->color_att_count; i++) {
876       if (render->color_att[i].resolve_mode != VK_RESOLVE_MODE_NONE)
877          need_resolve = true;
878 
879       vk_color_att[i] = (VkRenderingAttachmentInfo){
880          .sType = VK_STRUCTURE_TYPE_RENDERING_ATTACHMENT_INFO,
881          .imageView = hk_image_view_to_handle(render->color_att[i].iview),
882          .imageLayout = VK_IMAGE_LAYOUT_GENERAL,
883          .resolveMode = render->color_att[i].resolve_mode,
884          .resolveImageView =
885             hk_image_view_to_handle(render->color_att[i].resolve_iview),
886          .resolveImageLayout = VK_IMAGE_LAYOUT_GENERAL,
887       };
888    }
889 
890    const VkRenderingAttachmentInfo vk_depth_att = {
891       .sType = VK_STRUCTURE_TYPE_RENDERING_ATTACHMENT_INFO,
892       .imageView = hk_image_view_to_handle(render->depth_att.iview),
893       .imageLayout = VK_IMAGE_LAYOUT_GENERAL,
894       .resolveMode = render->depth_att.resolve_mode,
895       .resolveImageView =
896          hk_image_view_to_handle(render->depth_att.resolve_iview),
897       .resolveImageLayout = VK_IMAGE_LAYOUT_GENERAL,
898    };
899    if (render->depth_att.resolve_mode != VK_RESOLVE_MODE_NONE)
900       need_resolve = true;
901 
902    const VkRenderingAttachmentInfo vk_stencil_att = {
903       .sType = VK_STRUCTURE_TYPE_RENDERING_ATTACHMENT_INFO,
904       .imageView = hk_image_view_to_handle(render->stencil_att.iview),
905       .imageLayout = VK_IMAGE_LAYOUT_GENERAL,
906       .resolveMode = render->stencil_att.resolve_mode,
907       .resolveImageView =
908          hk_image_view_to_handle(render->stencil_att.resolve_iview),
909       .resolveImageLayout = VK_IMAGE_LAYOUT_GENERAL,
910    };
911    if (render->stencil_att.resolve_mode != VK_RESOLVE_MODE_NONE)
912       need_resolve = true;
913 
914    const VkRenderingInfo vk_render = {
915       .sType = VK_STRUCTURE_TYPE_RENDERING_INFO,
916       .renderArea = render->area,
917       .layerCount = render->layer_count,
918       .viewMask = render->view_mask,
919       .colorAttachmentCount = render->color_att_count,
920       .pColorAttachments = vk_color_att,
921       .pDepthAttachment = &vk_depth_att,
922       .pStencilAttachment = &vk_stencil_att,
923    };
924 
925    if (render->flags & VK_RENDERING_SUSPENDING_BIT)
926       need_resolve = false;
927 
928    memset(render, 0, sizeof(*render));
929 
930    if (need_resolve) {
931       hk_meta_resolve_rendering(cmd, &vk_render);
932    }
933 }
934 
935 static uint64_t
936 hk_geometry_state(struct hk_cmd_buffer *cmd)
937 {
938    struct hk_device *dev = hk_cmd_buffer_device(cmd);
939 
940    /* We tie heap allocation to geometry state allocation, so allocate now. */
941    if (unlikely(!dev->heap)) {
942       size_t size = 128 * 1024 * 1024;
943       dev->heap = agx_bo_create(&dev->dev, size, 0, 0, "Geometry heap");
944 
945       /* The geometry state buffer is initialized here and then is treated by
946        * the CPU as rodata, even though the GPU uses it for scratch internally.
947        */
948       off_t off = dev->rodata.geometry_state - dev->rodata.bo->va->addr;
949       struct agx_geometry_state *map = dev->rodata.bo->map + off;
950 
951       *map = (struct agx_geometry_state){
952          .heap = dev->heap->va->addr,
953          .heap_size = size,
954       };
955    }
956 
957    /* We need to free all allocations after each command buffer execution */
958    if (!cmd->uses_heap) {
959       uint64_t addr = dev->rodata.geometry_state;
960 
961       /* Zeroing the allocated index frees everything */
962       hk_queue_write(cmd,
963                      addr + offsetof(struct agx_geometry_state, heap_bottom), 0,
964                      true /* after gfx */);
965 
966       cmd->uses_heap = true;
967    }
968 
969    return dev->rodata.geometry_state;
970 }
971 
972 static uint64_t
973 hk_upload_gsi_params(struct hk_cmd_buffer *cmd, struct hk_draw draw)
974 {
975    struct hk_device *dev = hk_cmd_buffer_device(cmd);
976    struct hk_descriptor_state *desc = &cmd->state.gfx.descriptors;
977    struct hk_graphics_state *gfx = &cmd->state.gfx;
978    struct hk_shader *vs = hk_bound_sw_vs_before_gs(gfx);
979 
980    unsigned index_size_B =
981       draw.indexed ? agx_index_size_to_B(draw.index_size) : 0;
982 
983    uint64_t vb;
984    if (cmd->state.gfx.shaders[MESA_SHADER_TESS_EVAL]) {
985       assert(index_size_B == 4);
986 
987       vb = desc->root.draw.tess_params +
988            offsetof(struct libagx_tess_args, tes_buffer);
989    } else {
990       vb = desc->root.root_desc_addr +
991            offsetof(struct hk_root_descriptor_table, draw.vertex_output_buffer);
992    }
993 
994    struct agx_gs_setup_indirect_params gsi = {
995       .index_buffer = draw.index.addr,
996       .index_size_B = index_size_B,
997       .index_buffer_range_el = draw.index.range / index_size_B,
998       .zero_sink = dev->rodata.zero_sink,
999       .draw = draw.b.ptr,
1000       .vertex_buffer = vb,
1001       .ia = desc->root.draw.input_assembly,
1002       .geom = desc->root.draw.geometry_params,
1003       .vs_outputs = vs->b.info.outputs,
1004    };
1005 
1006    return hk_pool_upload(cmd, &gsi, sizeof(gsi), 8);
1007 }
1008 
1009 static uint64_t
1010 hk_upload_ia_params(struct hk_cmd_buffer *cmd, struct hk_draw draw)
1011 {
1012    struct hk_device *dev = hk_cmd_buffer_device(cmd);
1013    assert(!draw.b.indirect && "indirect params written by GPU");
1014 
1015    struct agx_ia_state ia = {.verts_per_instance = draw.b.count[0]};
1016 
1017    if (draw.indexed) {
1018       unsigned index_size_B = agx_index_size_to_B(draw.index_size);
1019       unsigned range_el = draw.index.range / index_size_B;
1020 
1021       ia.index_buffer =
1022          libagx_index_buffer(draw.index.addr, range_el, draw.start,
1023                              index_size_B, dev->rodata.zero_sink);
1024 
1025       ia.index_buffer_range_el =
1026          libagx_index_buffer_range_el(range_el, draw.start);
1027    }
1028 
1029    return hk_pool_upload(cmd, &ia, sizeof(ia), 8);
1030 }
1031 
1032 static enum mesa_prim
1033 hk_gs_in_prim(struct hk_cmd_buffer *cmd)
1034 {
1035    struct vk_dynamic_graphics_state *dyn = &cmd->vk.dynamic_graphics_state;
1036    struct hk_graphics_state *gfx = &cmd->state.gfx;
1037    struct hk_api_shader *tes = gfx->shaders[MESA_SHADER_TESS_EVAL];
1038 
1039    if (tes != NULL)
1040       return tes->variants[HK_GS_VARIANT_RAST].info.ts.out_prim;
1041    else
1042       return vk_conv_topology(dyn->ia.primitive_topology);
1043 }
1044 
1045 static enum mesa_prim
1046 hk_rast_prim(struct hk_cmd_buffer *cmd)
1047 {
1048    struct hk_graphics_state *gfx = &cmd->state.gfx;
1049    struct hk_api_shader *gs = gfx->shaders[MESA_SHADER_GEOMETRY];
1050 
1051    if (gs != NULL)
1052       return gs->variants[HK_GS_VARIANT_RAST].info.gs.out_prim;
1053    else
1054       return hk_gs_in_prim(cmd);
1055 }
1056 
1057 static uint64_t
1058 hk_upload_geometry_params(struct hk_cmd_buffer *cmd, struct hk_draw draw)
1059 {
1060    struct hk_descriptor_state *desc = &cmd->state.gfx.descriptors;
1061    struct vk_dynamic_graphics_state *dyn = &cmd->vk.dynamic_graphics_state;
1062    struct hk_graphics_state *gfx = &cmd->state.gfx;
1063    struct hk_api_shader *gs = gfx->shaders[MESA_SHADER_GEOMETRY];
1064    struct hk_shader *fs = hk_only_variant(gfx->shaders[MESA_SHADER_FRAGMENT]);
1065 
1066    bool rast_disc = dyn->rs.rasterizer_discard_enable;
1067    struct hk_shader *count = hk_count_gs_variant(gs, rast_disc);
1068 
1069    /* XXX: We should deduplicate this logic */
1070    bool restart = (draw.indexed && draw.restart);
1071    bool indirect =
1072       draw.b.indirect || gfx->shaders[MESA_SHADER_TESS_EVAL] || restart;
1073    enum mesa_prim mode = hk_gs_in_prim(cmd);
1074 
1075    if (restart) {
1076       mode = u_decomposed_prim(mode);
1077    }
1078 
1079    struct agx_geometry_params params = {
1080       .state = hk_geometry_state(cmd),
1081       .indirect_desc = cmd->geom_indirect,
1082       .flat_outputs = fs ? fs->info.fs.interp.flat : 0,
1083       .input_topology = mode,
1084 
1085       /* Overridden by the indirect setup kernel. As tess->GS is always indirect,
1086        * we can assume here that we're VS->GS.
1087        */
1088       .input_buffer = desc->root.draw.vertex_output_buffer,
1089       .input_mask = desc->root.draw.vertex_outputs,
1090    };
1091 
1092    if (gfx->xfb_enabled) {
1093       for (unsigned i = 0; i < ARRAY_SIZE(gfx->xfb); ++i) {
1094          params.xfb_base_original[i] = gfx->xfb[i].addr;
1095          params.xfb_size[i] = gfx->xfb[i].range;
1096          params.xfb_offs_ptrs[i] = gfx->xfb_offsets + i * sizeof(uint32_t);
1097       }
1098    }
1099 
1100    for (unsigned i = 0; i < ARRAY_SIZE(gfx->xfb_query); ++i) {
1101       uint64_t q = gfx->xfb_query[i];
1102 
1103       if (q) {
1104          params.xfb_prims_generated_counter[i] = q;
1105          params.prims_generated_counter[i] = q + sizeof(uint64_t);
1106       }
1107    }
1108 
1109    /* Calculate input primitive count for direct draws, and allocate the vertex
1110     * & count buffers. GPU calculates and allocates for indirect draws.
1111     */
1112    unsigned count_buffer_stride = count->info.gs.count_words * 4;
1113 
1114    if (indirect) {
1115       params.count_buffer_stride = count_buffer_stride;
1116       params.vs_grid[2] = params.gs_grid[2] = 1;
1117    } else {
1118       uint32_t verts = draw.b.count[0], instances = draw.b.count[1];
1119 
1120       params.vs_grid[0] = verts;
1121       params.gs_grid[0] = u_decomposed_prims_for_vertices(mode, verts);
1122 
1123       params.primitives_log2 = util_logbase2_ceil(params.gs_grid[0]);
1124       params.input_primitives = params.gs_grid[0] * instances;
1125 
1126       unsigned size = params.input_primitives * count_buffer_stride;
1127       if (size) {
1128          params.count_buffer = hk_pool_alloc(cmd, size, 4).gpu;
1129       }
1130    }
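
   /* Worked example (added, illustrative): a direct draw of 36 vertices and 2
    * instances with MESA_PRIM_TRIANGLES and count->info.gs.count_words == 2
    * gives gs_grid[0] = 12, primitives_log2 = 4, input_primitives = 24 and a
    * count buffer of 24 * 8 = 192 bytes.
    */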
1131 
1132    desc->root_dirty = true;
1133    return hk_pool_upload(cmd, &params, sizeof(params), 8);
1134 }
1135 
1136 /*
1137  * Tessellation has a fast path where the tessellator generates a VDM Index List
1138  * command per patch, as well as a slow path using prefix sums to generate a
1139  * single combined API draw. We need the latter if tessellation is fed into
1140  * another software stage (geometry shading), or if we need accurate primitive
1141  * IDs in the linked fragment shader (since that would require a prefix sum
1142  * anyway).
1143  */
1144 static bool
1145 hk_tess_needs_prefix_sum(struct hk_cmd_buffer *cmd)
1146 {
1147    struct hk_graphics_state *gfx = &cmd->state.gfx;
1148 
1149    return gfx->shaders[MESA_SHADER_GEOMETRY] || gfx->generate_primitive_id;
1150 }
1151 
1152 static uint64_t
1153 hk_upload_tess_params(struct hk_cmd_buffer *cmd, struct hk_draw draw)
1154 {
1155    struct hk_device *dev = hk_cmd_buffer_device(cmd);
1156    struct vk_dynamic_graphics_state *dyn = &cmd->vk.dynamic_graphics_state;
1157    struct hk_graphics_state *gfx = &cmd->state.gfx;
1158    struct hk_shader *tcs = hk_only_variant(gfx->shaders[MESA_SHADER_TESS_CTRL]);
1159    struct hk_shader *tes = hk_any_variant(gfx->shaders[MESA_SHADER_TESS_EVAL]);
1160 
1161    struct libagx_tess_args args = {
1162       .heap = hk_geometry_state(cmd),
1163       .tcs_stride_el = tcs->info.tcs.output_stride / 4,
1164       .statistic = hk_pipeline_stat_addr(
1165          cmd,
1166          VK_QUERY_PIPELINE_STATISTIC_TESSELLATION_EVALUATION_SHADER_INVOCATIONS_BIT),
1167 
1168       .input_patch_size = dyn->ts.patch_control_points,
1169       .output_patch_size = tcs->info.tcs.output_patch_size,
1170       .tcs_patch_constants = tcs->info.tcs.nr_patch_outputs,
1171       .tcs_per_vertex_outputs = tcs->info.tcs.per_vertex_outputs,
1172    };
1173 
1174    bool with_counts = hk_tess_needs_prefix_sum(cmd);
1175 
1176    /* This assumes !with_counts; with counts there is only a single API draw */
1177    uint32_t draw_stride_el = tes->info.ts.point_mode ? 4 : 6;
1178    size_t draw_stride_B = draw_stride_el * sizeof(uint32_t);
1179 
1180    /* heap is allocated by hk_geometry_state */
1181    args.patch_coord_buffer = dev->heap->va->addr;
1182 
1183    if (!draw.b.indirect) {
1184       unsigned in_patches = draw.b.count[0] / args.input_patch_size;
1185       if (in_patches == 0)
1186          unreachable("todo: drop the draw?");
1187 
1188       unsigned unrolled_patches = in_patches * draw.b.count[1];
1189 
1190       uint32_t alloc = 0;
1191       uint32_t tcs_out_offs = alloc;
1192       alloc += unrolled_patches * args.tcs_stride_el * 4 * 32;
1193 
1194       uint32_t patch_coord_offs = alloc;
1195       alloc += unrolled_patches * 4 * 32;
1196 
1197       uint32_t count_offs = alloc;
1198       if (with_counts)
1199          alloc += unrolled_patches * sizeof(uint32_t) * 32;
1200 
1201       uint32_t draw_offs = alloc;
1202 
1203       if (with_counts) {
1204          /* Single API draw */
1205          alloc += 5 * sizeof(uint32_t);
1206       } else {
1207          /* Padding added because VDM overreads */
1208          alloc += (draw_stride_B * unrolled_patches) +
1209                   (AGX_VDM_BARRIER_LENGTH + 0x800);
1210       }
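
      /* Layout of the single allocation below, in order (added summary): TCS
       * outputs at tcs_out_offs, patch coordinate allocations at
       * patch_coord_offs, optional per-patch counts at count_offs, then either
       * one 5-dword API draw or the per-patch VDM draw stream plus over-read
       * padding at draw_offs.
       */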
1211 
1212       struct agx_ptr blob = hk_pool_alloc(cmd, alloc, 4);
1213       args.tcs_buffer = blob.gpu + tcs_out_offs;
1214       args.patches_per_instance = in_patches;
1215       args.coord_allocs = blob.gpu + patch_coord_offs;
1216       args.nr_patches = unrolled_patches;
1217       args.out_draws = blob.gpu + draw_offs;
1218 
1219       gfx->tess_out_draws = args.out_draws;
1220 
1221       if (with_counts) {
1222          args.counts = blob.gpu + count_offs;
1223       } else {
1224          /* Arrange so we return after all generated draws */
1225          uint8_t *ret = (uint8_t *)blob.cpu + draw_offs +
1226                         (draw_stride_B * unrolled_patches);
1227 
1228          agx_pack(ret, VDM_BARRIER, cfg) {
1229             cfg.returns = true;
1230          }
1231       }
1232    } else {
1233       unreachable("todo: indirect with tess");
1234 #if 0
1235       args.tcs_statistic = agx_get_query_address(
1236          batch, ctx->pipeline_statistics[PIPE_STAT_QUERY_HS_INVOCATIONS]);
1237 
1238       args.indirect = agx_indirect_buffer_ptr(batch, indirect);
1239 
1240       /* Allocate 3x indirect global+local grids for VS/TCS/tess */
1241       uint32_t grid_stride = sizeof(uint32_t) * 6;
1242       args.grids = agx_pool_alloc_aligned(&batch->pool, grid_stride * 3, 4).gpu;
1243 
1244       vs_grid = agx_grid_indirect_local(args.grids + 0 * grid_stride);
1245       tcs_grid = agx_grid_indirect_local(args.grids + 1 * grid_stride);
1246       tess_grid = agx_grid_indirect_local(args.grids + 2 * grid_stride);
1247 
1248       args.vertex_outputs = ctx->vs->b.info.outputs;
1249       args.vertex_output_buffer_ptr =
1250          agx_pool_alloc_aligned(&batch->pool, 8, 8).gpu;
1251 
1252       batch->uniforms.vertex_output_buffer_ptr = args.vertex_output_buffer_ptr;
1253 
1254       if (with_counts) {
1255          args.out_draws = agx_pool_alloc_aligned_with_bo(
1256                              &batch->pool, draw_stride, 4, &draw_bo)
1257                              .gpu;
1258       } else {
1259          unreachable("need an extra indirection...");
1260       }
1261 #endif
1262    }
1263 
1264    return hk_pool_upload(cmd, &args, sizeof(args), 8);
1265 }
1266 
1267 static struct hk_api_shader *
1268 hk_build_meta_shader_locked(struct hk_device *dev, struct hk_internal_key *key,
1269                             hk_internal_builder_t builder)
1270 {
1271    /* Try to get the cached shader */
1272    struct hash_entry *ent = _mesa_hash_table_search(dev->kernels.ht, key);
1273    if (ent)
1274       return ent->data;
1275 
1276    nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE,
1277                                                   &agx_nir_options, NULL);
1278    builder(&b, key->key);
1279 
1280    const struct vk_pipeline_robustness_state rs = {
1281       .images = VK_PIPELINE_ROBUSTNESS_IMAGE_BEHAVIOR_DISABLED_EXT,
1282       .storage_buffers = VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_DISABLED_EXT,
1283       .uniform_buffers = VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_DISABLED_EXT,
1284       .vertex_inputs = VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_DISABLED_EXT,
1285    };
1286 
1287    struct vk_shader_compile_info info = {
1288       .stage = b.shader->info.stage,
1289       .nir = b.shader,
1290       .robustness = &rs,
1291    };
1292 
1293    /* We need to link libagx and assign shared before preprocessing, matching
1294     * what the driver would otherwise produce.
1295     */
1296    agx_link_libagx(b.shader, dev->dev.libagx);
1297 
1298    if (info.stage == MESA_SHADER_COMPUTE) {
1299       NIR_PASS(_, b.shader, nir_lower_vars_to_explicit_types,
1300                nir_var_mem_shared, glsl_get_cl_type_size_align);
1301 
1302       /* Commit to the layout so we don't clobber later */
1303       b.shader->info.shared_memory_explicit_layout = true;
1304 
1305       NIR_PASS(_, b.shader, nir_lower_explicit_io, nir_var_mem_shared,
1306                nir_address_format_62bit_generic);
1307    }
1308 
1309    hk_preprocess_nir_internal(dev->vk.physical, b.shader);
1310 
1311    struct hk_api_shader *s;
1312    if (hk_compile_shader(dev, &info, NULL, NULL, &s) != VK_SUCCESS)
1313       return NULL;
1314 
1315    /* ..and cache it before we return. The key is on the stack right now, so
1316     * clone it before using it as a hash table key. The clone is logically owned
1317     * by the hash table.
1318     */
1319    size_t total_key_size = sizeof(*key) + key->key_size;
1320    void *cloned_key = ralloc_memdup(dev->kernels.ht, key, total_key_size);
1321 
1322    _mesa_hash_table_insert(dev->kernels.ht, cloned_key, s);
1323    return s;
1324 }
1325 
1326 struct hk_api_shader *
1327 hk_meta_shader(struct hk_device *dev, hk_internal_builder_t builder, void *data,
1328                size_t data_size)
1329 {
1330    size_t total_key_size = sizeof(struct hk_internal_key) + data_size;
1331 
1332    struct hk_internal_key *key = alloca(total_key_size);
1333    key->builder = builder;
1334    key->key_size = data_size;
1335 
1336    if (data_size)
1337       memcpy(key->key, data, data_size);
1338 
1339    simple_mtx_lock(&dev->kernels.lock);
1340    struct hk_api_shader *s = hk_build_meta_shader_locked(dev, key, builder);
1341    simple_mtx_unlock(&dev->kernels.lock);
1342 
1343    return s;
1344 }
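
/* Illustrative usage (added, not part of the original file): callers in this
 * file reach this through wrappers such as hk_meta_kernel(), passing a small
 * key struct that identifies the variant, e.g.
 *
 *    struct agx_unroll_restart_key key = {.prim = mode, .index_size_B = 4};
 *    struct hk_shader *s =
 *       hk_meta_kernel(dev, agx_nir_unroll_restart, &key, sizeof(key));
 *
 * The builder pointer is stored in the key, so identical requests reuse the
 * cached compiled shader.
 */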
1345 
1346 static struct hk_draw
1347 hk_draw_without_restart(struct hk_cmd_buffer *cmd, struct hk_cs *cs,
1348                         struct hk_draw draw, uint32_t draw_count)
1349 {
1350    struct hk_device *dev = hk_cmd_buffer_device(cmd);
1351    struct hk_graphics_state *gfx = &cmd->state.gfx;
1352    struct vk_dynamic_graphics_state *dyn = &cmd->vk.dynamic_graphics_state;
1353 
1354    perf_debug(dev, "Unrolling primitive restart due to GS/XFB");
1355 
1356    /* The unroll kernel assumes an indirect draw. Synthesize one if needed */
1357    if (!draw.b.indirect) {
1358       uint32_t desc[5] = {draw.b.count[0], draw.b.count[1], draw.start,
1359                           draw.index_bias, draw.start_instance};
1360 
1361       draw =
1362          hk_draw_indexed_indirect(hk_pool_upload(cmd, desc, sizeof(desc), 4),
1363                                   draw.index, draw.index_size, true);
1364    }
1365 
1366    /* Next, we unroll the index buffer used by the indirect draw */
1367    struct agx_unroll_restart_key key = {
1368       .prim = vk_conv_topology(dyn->ia.primitive_topology),
1369       .index_size_B = agx_index_size_to_B(draw.index_size),
1370    };
1371 
1372    struct agx_restart_unroll_params ia = {
1373       .heap = hk_geometry_state(cmd),
1374       .index_buffer = draw.index.addr,
1375       .count = hk_pool_upload(cmd, &draw_count, sizeof(uint32_t), 4),
1376       .draws = draw.b.ptr,
1377       .out_draws = hk_pool_alloc(cmd, 5 * sizeof(uint32_t) * draw_count, 4).gpu,
1378       .max_draws = 1 /* TODO: MDI */,
1379       .restart_index = gfx->index.restart,
1380       .index_buffer_size_el = draw.index.range / key.index_size_B,
1381       .flatshade_first =
1382          dyn->rs.provoking_vertex == VK_PROVOKING_VERTEX_MODE_FIRST_VERTEX_EXT,
1383       .zero_sink = dev->rodata.zero_sink,
1384    };
1385 
1386    struct hk_shader *s =
1387       hk_meta_kernel(dev, agx_nir_unroll_restart, &key, sizeof(key));
1388 
1389    uint64_t params = hk_pool_upload(cmd, &ia, sizeof(ia), 8);
1390    uint32_t usc = hk_upload_usc_words_kernel(cmd, s, &params, sizeof(params));
1391    hk_dispatch_with_usc(dev, cs, s, usc, hk_grid(1024 * draw_count, 1, 1),
1392                         hk_grid(1024, 1, 1));
1393 
1394    struct hk_addr_range out_index = {
1395       .addr = dev->heap->va->addr,
1396       .range = dev->heap->size,
1397    };
1398 
1399    return hk_draw_indexed_indirect(ia.out_draws, out_index, draw.index_size,
1400                                    false /* restart */);
1401 }
1402 
1403 static struct hk_draw
1404 hk_launch_gs_prerast(struct hk_cmd_buffer *cmd, struct hk_cs *cs,
1405                      struct hk_draw draw)
1406 {
1407    struct hk_device *dev = hk_cmd_buffer_device(cmd);
1408    struct hk_graphics_state *gfx = &cmd->state.gfx;
1409    struct hk_descriptor_state *desc = &cmd->state.gfx.descriptors;
1410    struct hk_api_shader *gs = gfx->shaders[MESA_SHADER_GEOMETRY];
1411    struct hk_grid grid_vs, grid_gs;
1412 
1413    struct vk_dynamic_graphics_state *dyn = &cmd->vk.dynamic_graphics_state;
1414    bool rast_disc = dyn->rs.rasterizer_discard_enable;
1415 
1416    hk_ensure_cs_has_space(cmd, cs, 0x2000 /*XXX*/);
1417 
1418    struct hk_shader *vs = hk_bound_sw_vs_before_gs(gfx);
1419    struct hk_shader *main = hk_main_gs_variant(gs, rast_disc);
1420    struct hk_shader *count = hk_count_gs_variant(gs, rast_disc);
1421    struct hk_shader *pre_gs = hk_pre_gs_variant(gs, rast_disc);
1422 
1423    unsigned count_words = count->info.gs.count_words;
1424 
1425    if (false /* TODO */)
1426       perf_debug(dev, "Transform feedbck");
1427    else if (count_words)
1428       perf_debug(dev, "Geometry shader with counts");
1429    else
1430       perf_debug(dev, "Geometry shader without counts");
1431 
1432    enum mesa_prim mode = hk_gs_in_prim(cmd);
1433 
1434    if (draw.indexed && draw.restart) {
1435       draw = hk_draw_without_restart(cmd, cs, draw, 1);
1436       mode = u_decomposed_prim(mode);
1437    }
1438 
1439    /* Setup grids */
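   /* For indirect draws, a small setup kernel runs first to read the API draw
    * on the GPU and write the VS/GS grids into the geometry params; the
    * dispatches below then consume those grids indirectly.
    */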
1440    if (draw.b.indirect) {
1441       struct agx_gs_setup_indirect_key key = {.prim = mode};
1442 
1443       struct hk_shader *gsi =
1444          hk_meta_kernel(dev, agx_nir_gs_setup_indirect, &key, sizeof(key));
1445 
1446       uint64_t push = hk_upload_gsi_params(cmd, draw);
1447       uint32_t usc = hk_upload_usc_words_kernel(cmd, gsi, &push, sizeof(push));
1448 
1449       hk_dispatch_with_usc(dev, cs, gsi, usc, hk_grid(1, 1, 1),
1450                            hk_grid(1, 1, 1));
1451 
1452       uint64_t geometry_params = desc->root.draw.geometry_params;
1453       grid_vs = hk_grid_indirect(geometry_params +
1454                                  offsetof(struct agx_geometry_params, vs_grid));
1455 
1456       grid_gs = hk_grid_indirect(geometry_params +
1457                                  offsetof(struct agx_geometry_params, gs_grid));
1458    } else {
1459       grid_vs = grid_gs = draw.b;
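      /* One GS invocation per decomposed input primitive. For example, a
       * direct 9-vertex triangle-strip draw decomposes to 7 triangles, so the
       * GS grid here is 7 wide (instances stay in Y).
       */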
1460       grid_gs.count[0] = u_decomposed_prims_for_vertices(mode, draw.b.count[0]);
1461    }
1462 
1463    /* Launch the vertex shader first */
1464    hk_reserve_scratch(cmd, cs, vs);
1465    hk_dispatch_with_usc(dev, cs, vs,
1466                         hk_upload_usc_words(cmd, vs,
1467                                             vs->info.stage == MESA_SHADER_VERTEX
1468                                                ? gfx->linked[MESA_SHADER_VERTEX]
1469                                                : vs->only_linked),
1470                         grid_vs, hk_grid(1, 1, 1));
1471 
1472    /* If we need counts, launch the count shader and prefix sum the results. */
1473    if (count_words) {
1474       hk_dispatch_with_local_size(cmd, cs, count, grid_gs, hk_grid(1, 1, 1));
1475 
1476       struct hk_api_shader *prefix_sum = hk_meta_shader(
1477          dev, agx_nir_prefix_sum_gs, &count_words, sizeof(count_words));
1478 
1479       /* XXX: hack */
1480       hk_only_variant(prefix_sum)->info.stage = MESA_SHADER_GEOMETRY;
1481 
1482       hk_dispatch_with_local_size(cmd, cs, hk_only_variant(prefix_sum),
1483                                   hk_grid(1024 * count_words, 1, 1),
1484                                   hk_grid(1024, 1, 1));
1485    }
1486 
1487    /* Pre-GS shader */
1488    hk_dispatch_with_local_size(cmd, cs, pre_gs, hk_grid(1, 1, 1),
1489                                hk_grid(1, 1, 1));
1490 
1491    /* Pre-rast geometry shader */
1492    hk_dispatch_with_local_size(cmd, cs, main, grid_gs, hk_grid(1, 1, 1));
1493 
1494    struct hk_addr_range range = (struct hk_addr_range){
1495       .addr = dev->heap->va->addr,
1496       .range = dev->heap->size,
1497    };
1498 
1499    bool restart = cmd->state.gfx.topology != AGX_PRIMITIVE_POINTS;
1500    return hk_draw_indexed_indirect(cmd->geom_indirect, range,
1501                                    AGX_INDEX_SIZE_U32, restart);
1502 }
1503 
1504 static struct hk_draw
1505 hk_launch_tess(struct hk_cmd_buffer *cmd, struct hk_cs *cs, struct hk_draw draw)
1506 {
1507    struct hk_device *dev = hk_cmd_buffer_device(cmd);
1508    struct hk_graphics_state *gfx = &cmd->state.gfx;
1509    struct hk_grid grid_vs, grid_tcs, grid_tess;
1510 
1511    struct hk_shader *vs = hk_bound_sw_vs(gfx);
1512    struct hk_shader *tcs = hk_only_variant(gfx->shaders[MESA_SHADER_TESS_CTRL]);
1513    struct hk_shader *tes = hk_any_variant(gfx->shaders[MESA_SHADER_TESS_EVAL]);
1514 
1515    struct vk_dynamic_graphics_state *dyn = &cmd->vk.dynamic_graphics_state;
1516    uint32_t input_patch_size = dyn->ts.patch_control_points;
1517 
1518    hk_ensure_cs_has_space(cmd, cs, 0x2000 /*XXX*/);
1519 
1520    perf_debug(dev, "Tessellation");
1521 
1522    uint64_t tcs_stat = hk_pipeline_stat_addr(
1523       cmd, VK_QUERY_PIPELINE_STATISTIC_TESSELLATION_CONTROL_SHADER_PATCHES_BIT);
1524 
1525    /* Setup grids */
1526    if (draw.b.indirect) {
1527       unreachable("todo: indirect tess");
1528 #if 0
1529       struct agx_gs_setup_indirect_key key = {.prim = mode};
1530 
1531       struct hk_shader *gsi =
1532          hk_meta_kernel(dev, agx_nir_gs_setup_indirect, &key, sizeof(key));
1533 
1534       uint64_t push = hk_upload_gsi_params(cmd, draw);
1535       uint32_t usc = hk_upload_usc_words_kernel(cmd, gsi, &push, sizeof(push));
1536 
1537       hk_dispatch_with_usc(dev, cs, gsi, usc, hk_grid(1, 1, 1),
1538                            hk_grid(1, 1, 1));
1539 
1540       uint64_t geometry_params = desc->root.draw.geometry_params;
1541       grid_vs = hk_grid_indirect(geometry_params +
1542                                  offsetof(struct agx_geometry_params, vs_grid));
1543 
1544       grid_gs = hk_grid_indirect(geometry_params +
1545                                  offsetof(struct agx_geometry_params, gs_grid));
1546 #endif
1547    } else {
1548       uint32_t patches = draw.b.count[0] / input_patch_size;
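      /* The TCS runs once per output control point per patch, and the
       * tessellator runs once per patch per instance. For example, 12 input
       * vertices with 3 control points per input patch give 4 patches; with
       * an output patch size of 4 that is a 16-wide TCS grid.
       */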
1549       grid_vs = grid_tcs = draw.b;
1550 
1551       grid_tcs.count[0] = patches * tcs->info.tcs.output_patch_size;
1552       grid_tess = hk_grid(patches * draw.b.count[1], 1, 1);
1553 
1554       /* TCS invocation counter increments once per-patch */
1555       if (tcs_stat) {
1556          perf_debug(dev, "Direct TCS statistic");
1557 
1558          struct libagx_increment_params args = {
1559             .statistic = tcs_stat,
1560             .delta = patches,
1561          };
1562 
1563          struct hk_shader *s =
1564             hk_meta_kernel(dev, agx_nir_increment_statistic, NULL, 0);
1565 
1566          uint64_t push = hk_pool_upload(cmd, &args, sizeof(args), 8);
1567          uint32_t usc = hk_upload_usc_words_kernel(cmd, s, &push, sizeof(push));
1568 
1569          hk_dispatch_with_usc(dev, cs, s, usc, hk_grid(1, 1, 1),
1570                               hk_grid(1, 1, 1));
1571       }
1572    }
1573 
1574    /* First launch the VS and TCS */
1575    hk_reserve_scratch(cmd, cs, vs);
1576    hk_reserve_scratch(cmd, cs, tcs);
1577 
1578    hk_dispatch_with_usc(
1579       dev, cs, vs,
1580       hk_upload_usc_words(cmd, vs, gfx->linked[MESA_SHADER_VERTEX]), grid_vs,
1581       hk_grid(64, 1, 1));
1582 
1583    hk_dispatch_with_usc(
1584       dev, cs, tcs, hk_upload_usc_words(cmd, tcs, tcs->only_linked), grid_tcs,
1585       hk_grid(tcs->info.tcs.output_patch_size, 1, 1));
1586 
1587    /* TODO indirect */
1588 
1589    bool with_counts = hk_tess_needs_prefix_sum(cmd);
1590    uint64_t state = gfx->descriptors.root.draw.tess_params;
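   /* Two strategies follow: with counts we run a counting pass and a prefix
    * sum to allocate index buffer space before tessellating; otherwise a
    * single pass emits VDM commands directly (hence out.raw below).
    */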
1591 
1592    /* If the domain is flipped, we need to flip the winding order */
1593    bool ccw = tes->info.ts.ccw;
1594    ccw ^= dyn->ts.domain_origin == VK_TESSELLATION_DOMAIN_ORIGIN_LOWER_LEFT;
1595 
1596    enum libagx_tess_partitioning partitioning =
1597       tes->info.ts.spacing == TESS_SPACING_EQUAL
1598          ? LIBAGX_TESS_PARTITIONING_INTEGER
1599       : tes->info.ts.spacing == TESS_SPACING_FRACTIONAL_ODD
1600          ? LIBAGX_TESS_PARTITIONING_FRACTIONAL_ODD
1601          : LIBAGX_TESS_PARTITIONING_FRACTIONAL_EVEN;
1602 
1603    enum libagx_tess_output_primitive prim =
1604       tes->info.ts.point_mode ? LIBAGX_TESS_OUTPUT_POINT
1605       : ccw                   ? LIBAGX_TESS_OUTPUT_TRIANGLE_CCW
1606                               : LIBAGX_TESS_OUTPUT_TRIANGLE_CW;
1607 
1608    struct agx_tessellator_key key = {
1609       .prim = tes->info.ts.mode,
1610       .output_primitive = prim,
1611       .partitioning = partitioning,
1612    };
1613 
1614    if (with_counts) {
1615       perf_debug(dev, "Tessellation with counts");
1616 
1617       /* Generate counts */
1618       key.mode = LIBAGX_TESS_MODE_COUNT;
1619       {
1620          struct hk_shader *tess =
1621             hk_meta_kernel(dev, agx_nir_tessellate, &key, sizeof(key));
1622 
1623          hk_dispatch_with_usc(
1624             dev, cs, tess,
1625             hk_upload_usc_words_kernel(cmd, tess, &state, sizeof(state)),
1626             grid_tess, hk_grid(64, 1, 1));
1627       }
1628 
1629       /* Prefix sum counts, allocating index buffer space. */
1630       {
1631          struct hk_shader *sum =
1632             hk_meta_kernel(dev, agx_nir_prefix_sum_tess, NULL, 0);
1633 
1634          hk_dispatch_with_usc(
1635             dev, cs, sum,
1636             hk_upload_usc_words_kernel(cmd, sum, &state, sizeof(state)),
1637             hk_grid(1024, 1, 1), hk_grid(1024, 1, 1));
1638       }
1639 
1640       key.mode = LIBAGX_TESS_MODE_WITH_COUNTS;
1641    } else {
1642       key.mode = LIBAGX_TESS_MODE_VDM;
1643    }
1644 
1645    /* Now we can tessellate */
1646    {
1647       struct hk_shader *tess =
1648          hk_meta_kernel(dev, agx_nir_tessellate, &key, sizeof(key));
1649 
1650       hk_dispatch_with_usc(
1651          dev, cs, tess,
1652          hk_upload_usc_words_kernel(cmd, tess, &state, sizeof(state)),
1653          grid_tess, hk_grid(64, 1, 1));
1654    }
1655 
1656    struct hk_addr_range range = (struct hk_addr_range){
1657       .addr = dev->heap->va->addr,
1658       .range = dev->heap->size,
1659    };
1660 
1661    struct hk_draw out = hk_draw_indexed_indirect(gfx->tess_out_draws, range,
1662                                                  AGX_INDEX_SIZE_U32, false);
1663    out.raw = !with_counts;
1664    return out;
1665 }
1666 
1667 void
1668 hk_cmd_bind_graphics_shader(struct hk_cmd_buffer *cmd,
1669                             const gl_shader_stage stage,
1670                             struct hk_api_shader *shader)
1671 {
1672    struct vk_dynamic_graphics_state *dyn = &cmd->vk.dynamic_graphics_state;
1673 
1674    assert(stage < ARRAY_SIZE(cmd->state.gfx.shaders));
1675    if (cmd->state.gfx.shaders[stage] == shader)
1676       return;
1677 
1678    cmd->state.gfx.shaders[stage] = shader;
1679    cmd->state.gfx.shaders_dirty |= BITFIELD_BIT(stage);
1680 
1681    if (stage == MESA_SHADER_FRAGMENT) {
1682       BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_MS_RASTERIZATION_SAMPLES);
1683    }
1684 }
1685 
1686 static uint32_t
1687 hk_pipeline_bind_group(gl_shader_stage stage)
1688 {
1689    return stage;
1690 }
1691 
1692 static void
1693 hk_flush_shaders(struct hk_cmd_buffer *cmd)
1694 {
1695    if (cmd->state.gfx.shaders_dirty == 0)
1696       return;
1697 
1698    /* Map shader types to shaders */
1699    struct hk_api_shader *type_shader[6] = {
1700       NULL,
1701    };
1702    uint32_t types_dirty = 0;
1703 
1704    const uint32_t gfx_stages =
1705       BITFIELD_BIT(MESA_SHADER_VERTEX) | BITFIELD_BIT(MESA_SHADER_TESS_CTRL) |
1706       BITFIELD_BIT(MESA_SHADER_TESS_EVAL) | BITFIELD_BIT(MESA_SHADER_GEOMETRY) |
1707       BITFIELD_BIT(MESA_SHADER_FRAGMENT);
1708 
1709    /* Geometry shading overrides the restart index, reemit on rebind */
1710    if (IS_SHADER_DIRTY(GEOMETRY)) {
1711       cmd->state.gfx.dirty |= HK_DIRTY_INDEX;
1712    }
1713 
1714    u_foreach_bit(stage, cmd->state.gfx.shaders_dirty & gfx_stages) {
1715       /* TODO: compact? */
1716       uint32_t type = stage;
1717       types_dirty |= BITFIELD_BIT(type);
1718 
1719       /* Only copy non-NULL shaders because mesh/task alias with vertex and
1720        * tessellation stages.
1721        */
1722       if (cmd->state.gfx.shaders[stage] != NULL) {
1723          assert(type < ARRAY_SIZE(type_shader));
1724          assert(type_shader[type] == NULL);
1725          type_shader[type] = cmd->state.gfx.shaders[stage];
1726       }
1727    }
1728 
1729    u_foreach_bit(type, types_dirty) {
1730       struct hk_api_shader *shader = type_shader[type];
1731 
1732       /* We always map index == type */
1733       // const uint32_t idx = type;
1734 
1735       if (shader == NULL)
1736          continue;
1737 
1738       /* TODO */
1739    }
1740 
1741    struct hk_graphics_state *gfx = &cmd->state.gfx;
1742    struct hk_shader *hw_vs = hk_bound_hw_vs(gfx);
1743    struct hk_api_shader *fs = gfx->shaders[MESA_SHADER_FRAGMENT];
1744 
1745    /* If we have a new VS/FS pair, UVS locations may have changed, so we need
1746     * to relink. We do this here because there's no dependence on the fast
1747     * linked shaders.
1748     */
1749    agx_assign_uvs(&gfx->linked_varyings, &hw_vs->info.uvs,
1750                   fs ? hk_only_variant(fs)->info.fs.interp.flat : 0,
1751                   fs ? hk_only_variant(fs)->info.fs.interp.linear : 0);
1752 
1753    struct hk_descriptor_state *desc = &cmd->state.gfx.descriptors;
1754    desc->root_dirty = true;
1755 
1756    for (unsigned i = 0; i < VARYING_SLOT_MAX; ++i) {
1757       desc->root.draw.uvs_index[i] = gfx->linked_varyings.slots[i];
1758    }
1759 }
1760 
1761 static struct agx_shader_part *
1762 hk_get_prolog_epilog_locked(struct hk_device *dev, struct hk_internal_key *key,
1763                             hk_internal_builder_t builder, bool preprocess_nir,
1764                             bool stop, unsigned cf_base)
1765 {
1766    /* Try to get the cached shader */
1767    struct hash_entry *ent = _mesa_hash_table_search(dev->prolog_epilog.ht, key);
1768    if (ent)
1769       return ent->data;
1770 
1771    nir_builder b = nir_builder_init_simple_shader(0, &agx_nir_options, NULL);
1772    builder(&b, key->key);
1773 
1774    if (preprocess_nir)
1775       agx_preprocess_nir(b.shader, dev->dev.libagx);
1776 
1777    struct agx_shader_key backend_key = {
1778       .dev = agx_gather_device_key(&dev->dev),
1779       .libagx = dev->dev.libagx,
1780       .secondary = true,
1781       .no_stop = !stop,
1782    };
1783 
1784    /* We always use dynamic sample shading in the GL driver. Indicate that. */
1785    if (b.shader->info.stage == MESA_SHADER_FRAGMENT) {
1786       backend_key.fs.cf_base = cf_base;
1787 
1788       if (b.shader->info.fs.uses_sample_shading)
1789          backend_key.fs.inside_sample_loop = true;
1790    }
1791 
1792    struct agx_shader_part *part =
1793       rzalloc(dev->prolog_epilog.ht, struct agx_shader_part);
1794 
1795    agx_compile_shader_nir(b.shader, &backend_key, NULL, part);
1796 
1797    ralloc_free(b.shader);
1798 
1799    /* ..and cache it before we return. The key is on the stack right now, so
1800     * clone it before using it as a hash table key. The clone is logically owned
1801     * by the hash table.
1802     */
1803    size_t total_key_size = sizeof(*key) + key->key_size;
1804    void *cloned_key = ralloc_memdup(dev->prolog_epilog.ht, key, total_key_size);
1805 
1806    _mesa_hash_table_insert(dev->prolog_epilog.ht, cloned_key, part);
1807    return part;
1808 }
1809 
1810 static struct agx_shader_part *
1811 hk_get_prolog_epilog(struct hk_device *dev, void *data, size_t data_size,
1812                      hk_internal_builder_t builder, bool preprocess_nir,
1813                      bool stop, unsigned cf_base)
1814 {
1815    /* Build the meta shader key */
1816    size_t total_key_size = sizeof(struct hk_internal_key) + data_size;
1817 
1818    struct hk_internal_key *key = alloca(total_key_size);
1819    key->builder = builder;
1820    key->key_size = data_size;
1821 
1822    if (data_size)
1823       memcpy(key->key, data, data_size);
1824 
1825    simple_mtx_lock(&dev->prolog_epilog.lock);
1826 
1827    struct agx_shader_part *part = hk_get_prolog_epilog_locked(
1828       dev, key, builder, preprocess_nir, stop, cf_base);
1829 
1830    simple_mtx_unlock(&dev->prolog_epilog.lock);
1831    return part;
1832 }
1833 
1834 static struct hk_linked_shader *
1835 hk_get_fast_linked_locked_vs(struct hk_device *dev, struct hk_shader *shader,
1836                              struct hk_fast_link_key_vs *key)
1837 {
1838    struct agx_shader_part *prolog =
1839       hk_get_prolog_epilog(dev, &key->prolog, sizeof(key->prolog),
1840                            agx_nir_vs_prolog, false, false, 0);
1841 
1842    struct hk_linked_shader *linked =
1843       hk_fast_link(dev, false, shader, prolog, NULL, 0);
1844 
1845    struct hk_fast_link_key *key_clone =
1846       ralloc_memdup(shader->linked.ht, key, sizeof(*key));
1847 
1848    /* XXX: Fix this higher up the stack */
1849    linked->b.uses_base_param |= !key->prolog.hw;
1850 
1851    _mesa_hash_table_insert(shader->linked.ht, key_clone, linked);
1852    return linked;
1853 }
1854 
1855 static void
1856 build_fs_prolog(nir_builder *b, const void *key)
1857 {
1858    agx_nir_fs_prolog(b, key);
1859 
1860    /* Lower load_stat_query_address_agx, needed for FS statistics */
1861    NIR_PASS(_, b->shader, hk_lower_uvs_index, 0);
1862 }
1863 
1864 static struct hk_linked_shader *
1865 hk_get_fast_linked_locked_fs(struct hk_device *dev, struct hk_shader *shader,
1866                              struct hk_fast_link_key_fs *key)
1867 {
1868    /* TODO: prolog without fs needs to work too... */
1869    bool needs_prolog = key->prolog.statistics ||
1870                        key->prolog.cull_distance_size ||
1871                        key->prolog.api_sample_mask != 0xff;
1872 
1873    struct agx_shader_part *prolog = NULL;
1874    if (needs_prolog) {
1875       prolog = hk_get_prolog_epilog(dev, &key->prolog, sizeof(key->prolog),
1876                                     build_fs_prolog, false, false,
1877                                     key->prolog.cf_base);
1878    }
1879 
1880    /* If sample shading is used, don't stop at the epilog, there's a
1881     * footer that the fast linker will insert to stop.
1882     */
1883    bool epilog_stop = (key->nr_samples_shaded == 0);
1884 
1885    struct agx_shader_part *epilog =
1886       hk_get_prolog_epilog(dev, &key->epilog, sizeof(key->epilog),
1887                            agx_nir_fs_epilog, true, epilog_stop, 0);
1888 
1889    struct hk_linked_shader *linked =
1890       hk_fast_link(dev, true, shader, prolog, epilog, key->nr_samples_shaded);
1891 
1892    struct hk_fast_link_key *key_clone =
1893       ralloc_memdup(shader->linked.ht, key, sizeof(*key));
1894 
1895    _mesa_hash_table_insert(shader->linked.ht, key_clone, linked);
1896    return linked;
1897 }
1898 
1899 /*
1900  * First, look for a fully linked variant. Else, build the required shader
1901  * parts and link.
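 *
 * Roughly: vertex keys select a vertex-fetch prolog, while fragment keys
 * select an optional statistics/cull-distance/sample-mask prolog plus a
 * render-target/blend epilog (see the _locked helpers above).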
1902  */
1903 static struct hk_linked_shader *
1904 hk_get_fast_linked(struct hk_device *dev, struct hk_shader *shader, void *key)
1905 {
1906    struct hk_linked_shader *linked;
1907    simple_mtx_lock(&shader->linked.lock);
1908 
1909    struct hash_entry *ent = _mesa_hash_table_search(shader->linked.ht, key);
1910 
1911    if (ent)
1912       linked = ent->data;
1913    else if (shader->info.stage == MESA_SHADER_VERTEX)
1914       linked = hk_get_fast_linked_locked_vs(dev, shader, key);
1915    else if (shader->info.stage == MESA_SHADER_FRAGMENT)
1916       linked = hk_get_fast_linked_locked_fs(dev, shader, key);
1917    else
1918       unreachable("invalid stage");
1919 
1920    simple_mtx_unlock(&shader->linked.lock);
1921    return linked;
1922 }
1923 
1924 static void
1925 hk_update_fast_linked(struct hk_cmd_buffer *cmd, struct hk_shader *shader,
1926                       void *key)
1927 {
1928    struct hk_device *dev = hk_cmd_buffer_device(cmd);
1929    struct hk_linked_shader *new = hk_get_fast_linked(dev, shader, key);
1930    gl_shader_stage stage = shader->info.stage;
1931 
1932    if (cmd->state.gfx.linked[stage] != new) {
1933       cmd->state.gfx.linked[stage] = new;
1934       cmd->state.gfx.linked_dirty |= BITFIELD_BIT(stage);
1935    }
1936 }
1937 
1938 static enum agx_polygon_mode
1939 translate_polygon_mode(VkPolygonMode vk_mode)
1940 {
1941    static_assert((enum agx_polygon_mode)VK_POLYGON_MODE_FILL ==
1942                  AGX_POLYGON_MODE_FILL);
1943    static_assert((enum agx_polygon_mode)VK_POLYGON_MODE_LINE ==
1944                  AGX_POLYGON_MODE_LINE);
1945    static_assert((enum agx_polygon_mode)VK_POLYGON_MODE_POINT ==
1946                  AGX_POLYGON_MODE_POINT);
1947 
1948    assert(vk_mode <= VK_POLYGON_MODE_POINT);
1949    return (enum agx_polygon_mode)vk_mode;
1950 }
1951 
1952 static enum agx_zs_func
1953 translate_compare_op(VkCompareOp vk_mode)
1954 {
1955    static_assert((enum agx_zs_func)VK_COMPARE_OP_NEVER == AGX_ZS_FUNC_NEVER);
1956    static_assert((enum agx_zs_func)VK_COMPARE_OP_LESS == AGX_ZS_FUNC_LESS);
1957    static_assert((enum agx_zs_func)VK_COMPARE_OP_EQUAL == AGX_ZS_FUNC_EQUAL);
1958    static_assert((enum agx_zs_func)VK_COMPARE_OP_LESS_OR_EQUAL ==
1959                  AGX_ZS_FUNC_LEQUAL);
1960    static_assert((enum agx_zs_func)VK_COMPARE_OP_GREATER ==
1961                  AGX_ZS_FUNC_GREATER);
1962    static_assert((enum agx_zs_func)VK_COMPARE_OP_NOT_EQUAL ==
1963                  AGX_ZS_FUNC_NOT_EQUAL);
1964    static_assert((enum agx_zs_func)VK_COMPARE_OP_GREATER_OR_EQUAL ==
1965                  AGX_ZS_FUNC_GEQUAL);
1966    static_assert((enum agx_zs_func)VK_COMPARE_OP_ALWAYS == AGX_ZS_FUNC_ALWAYS);
1967 
1968    assert(vk_mode <= VK_COMPARE_OP_ALWAYS);
1969    return (enum agx_zs_func)vk_mode;
1970 }
1971 
1972 static enum agx_stencil_op
1973 translate_stencil_op(VkStencilOp vk_op)
1974 {
1975    static_assert((enum agx_stencil_op)VK_STENCIL_OP_KEEP ==
1976                  AGX_STENCIL_OP_KEEP);
1977    static_assert((enum agx_stencil_op)VK_STENCIL_OP_ZERO ==
1978                  AGX_STENCIL_OP_ZERO);
1979    static_assert((enum agx_stencil_op)VK_STENCIL_OP_REPLACE ==
1980                  AGX_STENCIL_OP_REPLACE);
1981    static_assert((enum agx_stencil_op)VK_STENCIL_OP_INCREMENT_AND_CLAMP ==
1982                  AGX_STENCIL_OP_INCR_SAT);
1983    static_assert((enum agx_stencil_op)VK_STENCIL_OP_DECREMENT_AND_CLAMP ==
1984                  AGX_STENCIL_OP_DECR_SAT);
1985    static_assert((enum agx_stencil_op)VK_STENCIL_OP_INVERT ==
1986                  AGX_STENCIL_OP_INVERT);
1987    static_assert((enum agx_stencil_op)VK_STENCIL_OP_INCREMENT_AND_WRAP ==
1988                  AGX_STENCIL_OP_INCR_WRAP);
1989    static_assert((enum agx_stencil_op)VK_STENCIL_OP_DECREMENT_AND_WRAP ==
1990                  AGX_STENCIL_OP_DECR_WRAP);
1991 
1992    return (enum agx_stencil_op)vk_op;
1993 }
1994 
1995 static void
1996 hk_ppp_push_stencil_face(struct agx_ppp_update *ppp,
1997                          struct vk_stencil_test_face_state s, bool enabled)
1998 {
1999    if (enabled) {
2000       agx_ppp_push(ppp, FRAGMENT_STENCIL, cfg) {
2001          cfg.compare = translate_compare_op(s.op.compare);
2002          cfg.write_mask = s.write_mask;
2003          cfg.read_mask = s.compare_mask;
2004 
2005          cfg.depth_pass = translate_stencil_op(s.op.pass);
2006          cfg.depth_fail = translate_stencil_op(s.op.depth_fail);
2007          cfg.stencil_fail = translate_stencil_op(s.op.fail);
2008       }
2009    } else {
2010       agx_ppp_push(ppp, FRAGMENT_STENCIL, cfg) {
2011          cfg.compare = AGX_ZS_FUNC_ALWAYS;
2012          cfg.write_mask = 0xFF;
2013          cfg.read_mask = 0xFF;
2014 
2015          cfg.depth_pass = AGX_STENCIL_OP_KEEP;
2016          cfg.depth_fail = AGX_STENCIL_OP_KEEP;
2017          cfg.stencil_fail = AGX_STENCIL_OP_KEEP;
2018       }
2019    }
2020 }
2021 
2022 static bool
2023 hk_stencil_test_enabled(struct hk_cmd_buffer *cmd)
2024 {
2025    const struct hk_rendering_state *render = &cmd->state.gfx.render;
2026    struct vk_dynamic_graphics_state *dyn = &cmd->vk.dynamic_graphics_state;
2027 
2028    return dyn->ds.stencil.test_enable &&
2029           render->stencil_att.vk_format != VK_FORMAT_UNDEFINED;
2030 }
2031 
2032 static void
2033 hk_flush_vp_state(struct hk_cmd_buffer *cmd, struct hk_cs *cs, uint8_t **out)
2034 {
2035    const struct vk_dynamic_graphics_state *dyn =
2036       &cmd->vk.dynamic_graphics_state;
2037 
2038    /* We always need at least 1 viewport for the hardware. With rasterizer
2039     * discard the app may not supply any, but we can just program garbage.
2040     */
2041    unsigned count = MAX2(dyn->vp.viewport_count, 1);
2042 
2043    unsigned minx[HK_MAX_VIEWPORTS] = {0}, miny[HK_MAX_VIEWPORTS] = {0};
2044    unsigned maxx[HK_MAX_VIEWPORTS] = {0}, maxy[HK_MAX_VIEWPORTS] = {0};
2045 
2046    /* We implicitly scissor to the viewport. We need to do a min/max dance to
2047     * handle inverted viewports.
2048     */
2049    for (uint32_t i = 0; i < dyn->vp.viewport_count; i++) {
2050       const VkViewport *vp = &dyn->vp.viewports[i];
2051 
2052       minx[i] = MIN2(vp->x, vp->x + vp->width);
2053       miny[i] = MIN2(vp->y, vp->y + vp->height);
2054       maxx[i] = MAX2(vp->x, vp->x + vp->width);
2055       maxy[i] = MAX2(vp->y, vp->y + vp->height);
2056    }
2057 
2058    /* Additionally clamp to the framebuffer so we don't rasterize off-screen
2059     * pixels, which is software-visible and can cause faults with eMRT when
2060     * the framebuffer is not a multiple of the tile size.
2061     *
2062     * TODO: Is this necessary? The GL driver does this but it might be
2063     * cargo-culted at this point.
2064     */
2065    for (unsigned i = 0; i < count; ++i) {
2066       minx[i] = MIN2(minx[i], cmd->state.gfx.render.cr.width);
2067       maxx[i] = MIN2(maxx[i], cmd->state.gfx.render.cr.width);
2068       miny[i] = MIN2(miny[i], cmd->state.gfx.render.cr.height);
2069       maxy[i] = MIN2(maxy[i], cmd->state.gfx.render.cr.height);
2070    }
2071 
2072    /* We additionally apply any API scissors */
2073    for (unsigned i = 0; i < dyn->vp.scissor_count; ++i) {
2074       const VkRect2D *s = &dyn->vp.scissors[i];
2075 
2076       minx[i] = MAX2(minx[i], s->offset.x);
2077       miny[i] = MAX2(miny[i], s->offset.y);
2078       maxx[i] = MIN2(maxx[i], s->offset.x + s->extent.width);
2079       maxy[i] = MIN2(maxy[i], s->offset.y + s->extent.height);
2080    }
2081 
2082    /* Upload a hardware scissor for each viewport, whether there's a
2083     * corresponding API scissor or not.
2084     */
2085    unsigned index = cs->scissor.size / AGX_SCISSOR_LENGTH;
2086    struct agx_scissor_packed *scissors =
2087       util_dynarray_grow_bytes(&cs->scissor, count, AGX_SCISSOR_LENGTH);
2088 
2089    for (unsigned i = 0; i < count; ++i) {
2090       const VkViewport *vp = &dyn->vp.viewports[i];
2091 
2092       agx_pack(scissors + i, SCISSOR, cfg) {
2093          cfg.min_x = minx[i];
2094          cfg.min_y = miny[i];
2095          cfg.max_x = maxx[i];
2096          cfg.max_y = maxy[i];
2097 
2098          /* These settings in conjunction with the PPP control depth clip/clamp
2099           * settings implement depth clip/clamping. Properly setting them
2100           * together is required for conformant depth clip enable.
2101           *
2102           * TODO: Reverse-engineer the finer interactions here.
2103           */
2104          if (dyn->rs.depth_clamp_enable) {
2105             cfg.min_z = MIN2(vp->minDepth, vp->maxDepth);
2106             cfg.max_z = MAX2(vp->minDepth, vp->maxDepth);
2107          } else {
2108             cfg.min_z = 0.0;
2109             cfg.max_z = 1.0;
2110          }
2111       }
2112    }
2113 
2114    /* Upload state */
2115    struct AGX_PPP_HEADER present = {
2116       .depth_bias_scissor = true,
2117       .region_clip = true,
2118       .viewport = true,
2119       .viewport_count = count,
2120    };
2121 
2122    size_t size = agx_ppp_update_size(&present);
2123    struct agx_ptr T = hk_pool_alloc(cmd, size, 64);
2124    if (!T.cpu)
2125       return;
2126 
2127    struct agx_ppp_update ppp = agx_new_ppp_update(T, size, &present);
2128 
2129    agx_ppp_push(&ppp, DEPTH_BIAS_SCISSOR, cfg) {
2130       cfg.scissor = index;
2131 
2132       /* Use the current depth bias, we allocate linearly */
2133       unsigned count = cs->depth_bias.size / AGX_DEPTH_BIAS_LENGTH;
2134       cfg.depth_bias = count ? count - 1 : 0;
2135    };
2136 
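   /* Region clip bounds appear to be expressed in 32x32-pixel tiles, hence
    * the divide-by-32 (rounded up for the max edge) below.
    */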
2137    for (unsigned i = 0; i < count; ++i) {
2138       agx_ppp_push(&ppp, REGION_CLIP, cfg) {
2139          cfg.enable = true;
2140          cfg.min_x = minx[i] / 32;
2141          cfg.min_y = miny[i] / 32;
2142          cfg.max_x = DIV_ROUND_UP(MAX2(maxx[i], 1), 32);
2143          cfg.max_y = DIV_ROUND_UP(MAX2(maxy[i], 1), 32);
2144       }
2145    }
2146 
2147    agx_ppp_push(&ppp, VIEWPORT_CONTROL, cfg)
2148       ;
2149 
2150    /* Upload viewports */
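   /* This is the usual Vulkan viewport transform in centre/half-extent form:
    * window = translate + scale * NDC, so NDC -1/+1 land on the viewport
    * edges and Z in [0, 1] maps onto [minDepth, maxDepth].
    */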
2151    for (unsigned i = 0; i < count; ++i) {
2152       const VkViewport *vp = &dyn->vp.viewports[i];
2153 
2154       agx_ppp_push(&ppp, VIEWPORT, cfg) {
2155          cfg.translate_x = vp->x + 0.5f * vp->width;
2156          cfg.translate_y = vp->y + 0.5f * vp->height;
2157          cfg.translate_z = vp->minDepth;
2158 
2159          cfg.scale_x = vp->width * 0.5f;
2160          cfg.scale_y = vp->height * 0.5f;
2161          cfg.scale_z = vp->maxDepth - vp->minDepth;
2162       }
2163    }
2164 
2165    agx_ppp_fini(out, &ppp);
2166 }
2167 
2168 static enum agx_object_type
2169 translate_object_type(enum mesa_prim topology)
2170 {
2171    static_assert(MESA_PRIM_LINES < MESA_PRIM_LINE_STRIP);
2172    static_assert(MESA_PRIM_TRIANGLES >= MESA_PRIM_LINE_STRIP);
2173 
2174    if (topology == MESA_PRIM_POINTS)
2175       return AGX_OBJECT_TYPE_POINT_SPRITE_UV01;
2176    else if (topology <= MESA_PRIM_LINE_STRIP)
2177       return AGX_OBJECT_TYPE_LINE;
2178    else
2179       return AGX_OBJECT_TYPE_TRIANGLE;
2180 }
2181 
2182 static enum agx_primitive
2183 translate_hw_primitive_topology(enum mesa_prim prim)
2184 {
2185    switch (prim) {
2186    case MESA_PRIM_POINTS:
2187       return AGX_PRIMITIVE_POINTS;
2188    case MESA_PRIM_LINES:
2189       return AGX_PRIMITIVE_LINES;
2190    case MESA_PRIM_LINE_STRIP:
2191       return AGX_PRIMITIVE_LINE_STRIP;
2192    case MESA_PRIM_TRIANGLES:
2193       return AGX_PRIMITIVE_TRIANGLES;
2194    case MESA_PRIM_TRIANGLE_STRIP:
2195       return AGX_PRIMITIVE_TRIANGLE_STRIP;
2196    case MESA_PRIM_TRIANGLE_FAN:
2197       return AGX_PRIMITIVE_TRIANGLE_FAN;
2198    default:
2199       unreachable("Invalid hardware primitive topology");
2200    }
2201 }
2202 
2203 static inline enum agx_vdm_vertex
2204 translate_vdm_vertex(unsigned vtx)
2205 {
2206    static_assert(AGX_VDM_VERTEX_0 == 0);
2207    static_assert(AGX_VDM_VERTEX_1 == 1);
2208    static_assert(AGX_VDM_VERTEX_2 == 2);
2209 
2210    assert(vtx <= 2);
2211    return vtx;
2212 }
2213 
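/* The PPP encoding of the provoking vertex appears to be biased by one
 * relative to the VDM encoding (AGX_PPP_VERTEX_0 == 1), hence the +1 below.
 */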
2214 static inline enum agx_ppp_vertex
2215 translate_ppp_vertex(unsigned vtx)
2216 {
2217    static_assert(AGX_PPP_VERTEX_0 == 0 + 1);
2218    static_assert(AGX_PPP_VERTEX_1 == 1 + 1);
2219    static_assert(AGX_PPP_VERTEX_2 == 2 + 1);
2220 
2221    assert(vtx <= 2);
2222    return vtx + 1;
2223 }
2224 
2225 static void
2226 hk_flush_index(struct hk_cmd_buffer *cmd, struct hk_cs *cs)
2227 {
2228    uint8_t *out = cs->current;
2229    agx_push(out, VDM_STATE, cfg) {
2230       cfg.restart_index_present = true;
2231    }
2232 
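   /* When a geometry shader is bound we draw from the GS-generated 32-bit
    * index buffer (see hk_launch_gs_prerast), which presumably uses an
    * all-ones restart index; otherwise use the application's restart index.
    */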
2233    agx_push(out, VDM_STATE_RESTART_INDEX, cfg) {
2234       if (cmd->state.gfx.shaders[MESA_SHADER_GEOMETRY])
2235          cfg.value = BITFIELD_MASK(32);
2236       else
2237          cfg.value = cmd->state.gfx.index.restart;
2238    }
2239 
2240    cs->current = out;
2241 }
2242 
2243 /*
2244  * Return the given sample positions, packed into a 32-bit word with fixed
2245  * point nibbles for each x/y component of the (at most 4) samples. This is
2246  * suitable for programming the PPP_MULTISAMPLECTL control register.
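 *
 * For example, the standard 2x pattern (0.75, 0.75), (0.25, 0.25) packs as
 * (12, 12) and (4, 4) in 1/16th units, i.e. bytes 0xcc and 0x44, giving
 * 0x44cc with sample 0 in the low byte.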
2247  */
2248 static uint32_t
2249 hk_pack_ppp_multisamplectrl(const struct vk_sample_locations_state *sl)
2250 {
2251    uint32_t ctrl = 0;
2252 
2253    for (int32_t i = sl->per_pixel - 1; i >= 0; i--) {
2254       VkSampleLocationEXT loc = sl->locations[i];
2255 
2256       uint32_t x = CLAMP(loc.x, 0.0f, 0.9375f) * 16.0;
2257       uint32_t y = CLAMP(loc.y, 0.0f, 0.9375f) * 16.0;
2258 
2259       assert(x <= 15);
2260       assert(y <= 15);
2261 
2262       /* Push bytes in reverse order so we can use constant shifts. */
2263       ctrl = (ctrl << 8) | (y << 4) | x;
2264    }
2265 
2266    return ctrl;
2267 }
2268 
2269 /*
2270  * Return the standard sample positions, prepacked as above for efficiency.
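 *
 * E.g. for 4x, the standard positions (0.375, 0.125), (0.875, 0.375),
 * (0.125, 0.625), (0.625, 0.875) pack to bytes 0x26, 0x6e, 0xa2, 0xea,
 * i.e. 0xeaa26e26.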
2271  */
2272 uint32_t
2273 hk_default_sample_positions(unsigned nr_samples)
2274 {
2275    switch (nr_samples) {
2276    case 0:
2277    case 1:
2278       return 0x88;
2279    case 2:
2280       return 0x44cc;
2281    case 4:
2282       return 0xeaa26e26;
2283    default:
2284       unreachable("Invalid sample count");
2285    }
2286 }
2287 
2288 static void
2289 hk_flush_ppp_state(struct hk_cmd_buffer *cmd, struct hk_cs *cs, uint8_t **out)
2290 {
2291    const struct hk_rendering_state *render = &cmd->state.gfx.render;
2292    struct vk_dynamic_graphics_state *dyn = &cmd->vk.dynamic_graphics_state;
2293 
2294    struct hk_graphics_state *gfx = &cmd->state.gfx;
2295    struct hk_shader *hw_vs = hk_bound_hw_vs(gfx);
2296    struct hk_shader *fs = hk_only_variant(gfx->shaders[MESA_SHADER_FRAGMENT]);
2297 
2298    bool hw_vs_dirty = IS_SHADER_DIRTY(VERTEX) || IS_SHADER_DIRTY(TESS_EVAL) ||
2299                       IS_SHADER_DIRTY(GEOMETRY);
2300    bool fs_dirty = IS_SHADER_DIRTY(FRAGMENT);
2301 
2302    struct hk_linked_shader *linked_fs = gfx->linked[MESA_SHADER_FRAGMENT];
2303    bool linked_fs_dirty = IS_LINKED_DIRTY(FRAGMENT);
2304 
2305    bool varyings_dirty = gfx->dirty & HK_DIRTY_VARYINGS;
2306 
2307    bool face_dirty =
2308       IS_DIRTY(DS_DEPTH_TEST_ENABLE) || IS_DIRTY(DS_DEPTH_WRITE_ENABLE) ||
2309       IS_DIRTY(DS_DEPTH_COMPARE_OP) || IS_DIRTY(DS_STENCIL_REFERENCE) ||
2310       IS_DIRTY(RS_LINE_WIDTH) || IS_DIRTY(RS_POLYGON_MODE) || fs_dirty;
2311 
2312    bool stencil_face_dirty =
2313       IS_DIRTY(DS_STENCIL_OP) || IS_DIRTY(DS_STENCIL_COMPARE_MASK) ||
2314       IS_DIRTY(DS_STENCIL_WRITE_MASK) || IS_DIRTY(DS_STENCIL_TEST_ENABLE);
2315 
2316    struct AGX_PPP_HEADER dirty = {
2317       .fragment_control =
2318          IS_DIRTY(DS_STENCIL_TEST_ENABLE) || IS_DIRTY(IA_PRIMITIVE_TOPOLOGY) ||
2319          IS_DIRTY(RS_DEPTH_BIAS_ENABLE) || gfx->dirty & HK_DIRTY_OCCLUSION,
2320 
2321       .fragment_control_2 =
2322          IS_DIRTY(RS_RASTERIZER_DISCARD_ENABLE) || linked_fs_dirty,
2323 
2324       .fragment_front_face = face_dirty,
2325       .fragment_front_face_2 = fs_dirty || IS_DIRTY(IA_PRIMITIVE_TOPOLOGY),
2326       .fragment_front_stencil = stencil_face_dirty,
2327       .fragment_back_face = face_dirty,
2328       .fragment_back_face_2 = fs_dirty || IS_DIRTY(IA_PRIMITIVE_TOPOLOGY),
2329       .fragment_back_stencil = stencil_face_dirty,
2330       .output_select = hw_vs_dirty || linked_fs_dirty || varyings_dirty,
2331       .varying_counts_32 = varyings_dirty,
2332       .varying_counts_16 = varyings_dirty,
2333       .cull =
2334          IS_DIRTY(RS_CULL_MODE) || IS_DIRTY(RS_RASTERIZER_DISCARD_ENABLE) ||
2335          IS_DIRTY(RS_FRONT_FACE) || IS_DIRTY(RS_DEPTH_CLIP_ENABLE) ||
2336          IS_DIRTY(RS_DEPTH_CLAMP_ENABLE) || IS_DIRTY(RS_LINE_MODE) ||
2337          IS_DIRTY(IA_PRIMITIVE_TOPOLOGY) || (gfx->dirty & HK_DIRTY_PROVOKING),
2338       .cull_2 = varyings_dirty,
2339 
2340       /* With a null FS, the fragment shader PPP word is ignored and doesn't
2341        * need to be present.
2342        */
2343       .fragment_shader = fs && (fs_dirty || linked_fs_dirty || varyings_dirty ||
2344                                 gfx->descriptors.root_dirty),
2345 
2346       .occlusion_query = gfx->dirty & HK_DIRTY_OCCLUSION,
2347       .output_size = hw_vs_dirty,
2348       .viewport_count = 1, /* irrelevant */
2349    };
2350 
2351    /* Calculate the update size. If it equals the header, there is nothing to
2352     * update so early-exit.
2353     */
2354    size_t size = agx_ppp_update_size(&dirty);
2355    if (size == AGX_PPP_HEADER_LENGTH)
2356       return;
2357 
2358    /* Otherwise, allocate enough space for the update and push it. */
2359    assert(size > AGX_PPP_HEADER_LENGTH);
2360 
2361    struct agx_ptr T = hk_pool_alloc(cmd, size, 64);
2362    if (!T.cpu)
2363       return;
2364 
2365    struct agx_ppp_update ppp = agx_new_ppp_update(T, size, &dirty);
2366 
2367    if (dirty.fragment_control) {
2368       agx_ppp_push(&ppp, FRAGMENT_CONTROL, cfg) {
2369          cfg.visibility_mode = gfx->occlusion.mode;
2370          cfg.stencil_test_enable = hk_stencil_test_enabled(cmd);
2371 
2372          /* TODO: Consider optimizing this? */
2373          cfg.two_sided_stencil = cfg.stencil_test_enable;
2374 
2375          cfg.depth_bias_enable = dyn->rs.depth_bias.enable &&
2376                                  gfx->object_type == AGX_OBJECT_TYPE_TRIANGLE;
2377 
2378          /* Always enable scissoring so we may scissor to the viewport (TODO:
2379           * optimize this out if the viewport is the default and the app does
2380           * not use the scissor test)
2381           */
2382          cfg.scissor_enable = true;
2383 
2384          /* This avoids broken derivatives along primitive edges */
2385          cfg.disable_tri_merging = gfx->object_type != AGX_OBJECT_TYPE_TRIANGLE;
2386       }
2387    }
2388 
2389    if (dirty.fragment_control_2) {
2390       if (linked_fs) {
2391          /* Annoying, rasterizer_discard seems to be ignored (sometimes?) in the
2392           * main fragment control word and has to be combined into the secondary
2393           * word for reliable behaviour.
2394           */
2395          agx_ppp_push_merged(&ppp, FRAGMENT_CONTROL, cfg,
2396                              linked_fs->b.fragment_control) {
2397 
2398             cfg.tag_write_disable = dyn->rs.rasterizer_discard_enable;
2399          }
2400       } else {
2401          /* If there is no fragment shader, we must disable tag writes to avoid
2402           * executing the missing shader. This optimizes depth-only passes.
2403           */
2404          agx_ppp_push(&ppp, FRAGMENT_CONTROL, cfg) {
2405             cfg.tag_write_disable = true;
2406             cfg.pass_type = AGX_PASS_TYPE_OPAQUE;
2407          }
2408       }
2409    }
2410 
2411    struct agx_fragment_face_packed fragment_face;
2412    struct agx_fragment_face_2_packed fragment_face_2;
2413 
2414    if (dirty.fragment_front_face) {
2415       bool has_z = render->depth_att.vk_format != VK_FORMAT_UNDEFINED;
2416       bool z_test = has_z && dyn->ds.depth.test_enable;
2417 
2418       agx_pack(&fragment_face, FRAGMENT_FACE, cfg) {
2419          cfg.line_width = agx_pack_line_width(dyn->rs.line.width);
2420          cfg.polygon_mode = translate_polygon_mode(dyn->rs.polygon_mode);
2421          cfg.disable_depth_write = !(z_test && dyn->ds.depth.write_enable);
2422 
2423          if (z_test && !gfx->descriptors.root.draw.force_never_in_shader)
2424             cfg.depth_function = translate_compare_op(dyn->ds.depth.compare_op);
2425          else
2426             cfg.depth_function = AGX_ZS_FUNC_ALWAYS;
2427       };
2428 
2429       agx_ppp_push_merged(&ppp, FRAGMENT_FACE, cfg, fragment_face) {
2430          cfg.stencil_reference = dyn->ds.stencil.front.reference;
2431       }
2432    }
2433 
2434    if (dirty.fragment_front_face_2) {
2435       agx_pack(&fragment_face_2, FRAGMENT_FACE_2, cfg) {
2436          cfg.object_type = gfx->object_type;
2437 
2438          /* TODO: flip the default? */
2439          if (fs)
2440             cfg.conservative_depth = 0;
2441       }
2442 
2443       if (fs)
2444          agx_merge(fragment_face_2, fs->frag_face, FRAGMENT_FACE_2);
2445 
2446       agx_ppp_push_packed(&ppp, &fragment_face_2, FRAGMENT_FACE_2);
2447    }
2448 
2449    if (dirty.fragment_front_stencil) {
2450       hk_ppp_push_stencil_face(&ppp, dyn->ds.stencil.front,
2451                                hk_stencil_test_enabled(cmd));
2452    }
2453 
2454    if (dirty.fragment_back_face) {
2455       assert(dirty.fragment_front_face);
2456 
2457       agx_ppp_push_merged(&ppp, FRAGMENT_FACE, cfg, fragment_face) {
2458          cfg.stencil_reference = dyn->ds.stencil.back.reference;
2459       }
2460    }
2461 
2462    if (dirty.fragment_back_face_2) {
2463       assert(dirty.fragment_front_face_2);
2464 
2465       agx_ppp_push_packed(&ppp, &fragment_face_2, FRAGMENT_FACE_2);
2466    }
2467 
2468    if (dirty.fragment_back_stencil) {
2469       hk_ppp_push_stencil_face(&ppp, dyn->ds.stencil.back,
2470                                hk_stencil_test_enabled(cmd));
2471    }
2472 
2473    if (dirty.output_select) {
2474       struct agx_output_select_packed osel = hw_vs->info.uvs.osel;
2475 
2476       if (linked_fs) {
2477          agx_ppp_push_merged_blobs(&ppp, AGX_OUTPUT_SELECT_LENGTH, &osel,
2478                                    &linked_fs->b.osel);
2479       } else {
2480          agx_ppp_push_packed(&ppp, &osel, OUTPUT_SELECT);
2481       }
2482    }
2483 
2484    assert(dirty.varying_counts_32 == dirty.varying_counts_16);
2485 
2486    if (dirty.varying_counts_32) {
2487       agx_ppp_push_packed(&ppp, &gfx->linked_varyings.counts_32,
2488                           VARYING_COUNTS);
2489 
2490       agx_ppp_push_packed(&ppp, &gfx->linked_varyings.counts_16,
2491                           VARYING_COUNTS);
2492    }
2493 
2494    if (dirty.cull) {
2495       agx_ppp_push(&ppp, CULL, cfg) {
2496          cfg.cull_front = dyn->rs.cull_mode & VK_CULL_MODE_FRONT_BIT;
2497          cfg.cull_back = dyn->rs.cull_mode & VK_CULL_MODE_BACK_BIT;
2498          cfg.front_face_ccw = dyn->rs.front_face != VK_FRONT_FACE_CLOCKWISE;
2499          cfg.flat_shading_vertex = translate_ppp_vertex(gfx->provoking);
2500          cfg.rasterizer_discard = dyn->rs.rasterizer_discard_enable;
2501 
2502          /* We do not support unrestricted depth, so clamping is inverted from
2503           * clipping. This implementation seems to pass CTS without unrestricted
2504           * depth support.
2505           *
2506           * TODO: Make sure this is right with gl_FragDepth.
2507           */
2508          cfg.depth_clip = vk_rasterization_state_depth_clip_enable(&dyn->rs);
2509          cfg.depth_clamp = !cfg.depth_clip;
2510 
2511          cfg.primitive_msaa =
2512             gfx->object_type == AGX_OBJECT_TYPE_LINE &&
2513             dyn->rs.line.mode == VK_LINE_RASTERIZATION_MODE_BRESENHAM_KHR;
2514       }
2515    }
2516 
2517    if (dirty.cull_2) {
2518       agx_ppp_push(&ppp, CULL_2, cfg) {
2519          cfg.needs_primitive_id = gfx->generate_primitive_id;
2520       }
2521    }
2522 
2523    if (dirty.fragment_shader) {
2524       /* TODO: Do less often? */
2525       hk_reserve_scratch(cmd, cs, fs);
2526 
2527       agx_ppp_push_packed(&ppp, &linked_fs->fs_counts, FRAGMENT_SHADER_WORD_0);
2528 
2529       agx_ppp_push(&ppp, FRAGMENT_SHADER_WORD_1, cfg) {
2530          cfg.pipeline = hk_upload_usc_words(cmd, fs, linked_fs);
2531       }
2532 
2533       agx_ppp_push(&ppp, FRAGMENT_SHADER_WORD_2, cfg) {
2534          cfg.cf_bindings = gfx->varyings;
2535       }
2536 
2537       agx_ppp_push(&ppp, FRAGMENT_SHADER_WORD_3, cfg)
2538          ;
2539    }
2540 
2541    if (dirty.occlusion_query) {
2542       agx_ppp_push(&ppp, FRAGMENT_OCCLUSION_QUERY, cfg) {
2543          cfg.index = gfx->occlusion.index;
2544       }
2545    }
2546 
2547    if (dirty.output_size) {
2548       agx_ppp_push(&ppp, OUTPUT_SIZE, cfg) {
2549          cfg.count = hw_vs->info.uvs.size;
2550       }
2551    }
2552 
2553    agx_ppp_fini(out, &ppp);
2554 }
2555 
2556 static void
2557 hk_flush_dynamic_state(struct hk_cmd_buffer *cmd, struct hk_cs *cs,
2558                        uint32_t draw_id, struct hk_draw draw)
2559 {
2560    struct hk_device *dev = hk_cmd_buffer_device(cmd);
2561    const struct hk_rendering_state *render = &cmd->state.gfx.render;
2562    struct vk_dynamic_graphics_state *dyn = &cmd->vk.dynamic_graphics_state;
2563 
2564    struct hk_graphics_state *gfx = &cmd->state.gfx;
2565 
2566    struct hk_shader *hw_vs = hk_bound_hw_vs(gfx);
2567    struct hk_shader *sw_vs = hk_bound_sw_vs(gfx);
2568 
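   /* Skip the (fairly large) flush when nothing relevant is dirty (index
    * state alone does not count) and the vertex shader does not consume
    * per-draw parameters (draw ID / base vertex / base instance).
    */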
2569    if (!vk_dynamic_graphics_state_any_dirty(dyn) &&
2570        !(gfx->dirty & ~HK_DIRTY_INDEX) && !gfx->descriptors.root_dirty &&
2571        !gfx->shaders_dirty && !sw_vs->b.info.uses_draw_id &&
2572        !sw_vs->b.info.uses_base_param &&
2573        !(gfx->linked[MESA_SHADER_VERTEX] &&
2574          gfx->linked[MESA_SHADER_VERTEX]->b.uses_base_param))
2575       return;
2576 
2577    struct hk_descriptor_state *desc = &cmd->state.gfx.descriptors;
2578 
2579    assert(cs->current + 0x1000 < cs->end && "already ensured space");
2580    uint8_t *out = cs->current;
2581 
2582    struct hk_shader *fs = hk_only_variant(gfx->shaders[MESA_SHADER_FRAGMENT]);
2583 
2584    bool gt_dirty = IS_SHADER_DIRTY(TESS_CTRL) || IS_SHADER_DIRTY(TESS_EVAL) ||
2585                    IS_SHADER_DIRTY(GEOMETRY);
2586    bool vgt_dirty = IS_SHADER_DIRTY(VERTEX) || gt_dirty;
2587    bool fs_dirty = IS_SHADER_DIRTY(FRAGMENT);
2588 
2589    if (IS_DIRTY(CB_BLEND_CONSTANTS)) {
2590       static_assert(sizeof(desc->root.draw.blend_constant) ==
2591                        sizeof(dyn->cb.blend_constants) &&
2592                     "common size");
2593 
2594       memcpy(desc->root.draw.blend_constant, dyn->cb.blend_constants,
2595              sizeof(dyn->cb.blend_constants));
2596       desc->root_dirty = true;
2597    }
2598 
2599    if (IS_DIRTY(MS_SAMPLE_MASK)) {
2600       desc->root.draw.api_sample_mask = dyn->ms.sample_mask;
2601       desc->root_dirty = true;
2602    }
2603 
2604    if (fs_dirty || IS_DIRTY(DS_DEPTH_TEST_ENABLE) ||
2605        IS_DIRTY(DS_DEPTH_COMPARE_OP)) {
2606 
2607       const struct hk_rendering_state *render = &cmd->state.gfx.render;
2608       bool has_z = render->depth_att.vk_format != VK_FORMAT_UNDEFINED;
2609       bool z_test = has_z && dyn->ds.depth.test_enable;
2610 
2611       desc->root.draw.force_never_in_shader =
2612          z_test && dyn->ds.depth.compare_op == VK_COMPARE_OP_NEVER && fs &&
2613          fs->info.fs.writes_memory;
2614 
2615       desc->root_dirty = true;
2616    }
2617 
2618    /* The main shader must not run tests if the epilog will. */
2619    bool nontrivial_force_early =
2620       fs && (fs->b.info.early_fragment_tests &&
2621              (fs->b.info.writes_sample_mask || fs->info.fs.writes_memory));
2622 
2623    bool epilog_discards = dyn->ms.alpha_to_coverage_enable ||
2624                           (fs && (fs->info.fs.epilog_key.write_z ||
2625                                   fs->info.fs.epilog_key.write_s));
2626    epilog_discards &= !nontrivial_force_early;
2627 
2628    if (fs_dirty || IS_DIRTY(MS_ALPHA_TO_COVERAGE_ENABLE)) {
2629       desc->root.draw.no_epilog_discard = !epilog_discards ? ~0 : 0;
2630       desc->root_dirty = true;
2631    }
2632 
2633    if (IS_DIRTY(VI) || IS_DIRTY(VI_BINDINGS_VALID) ||
2634        IS_DIRTY(VI_BINDING_STRIDES) || vgt_dirty || true /* TODO */) {
2635 
2636       struct hk_fast_link_key_vs key = {
2637          .prolog.hw = (sw_vs == hw_vs),
2638 
2639          /* FIXME: handle pipeline robustness "properly" */
2640          .prolog.robustness.level =
2641             (dev->vk.enabled_features.robustBufferAccess2 ||
2642              dev->vk.enabled_features.pipelineRobustness)
2643                ? AGX_ROBUSTNESS_D3D
2644                : AGX_ROBUSTNESS_GL,
2645 
2646          .prolog.robustness.soft_fault = agx_has_soft_fault(&dev->dev),
2647       };
2648 
2649       if (!key.prolog.hw) {
2650          key.prolog.sw_index_size_B =
2651             draw.indexed ? agx_index_size_to_B(draw.index_size) : 0;
2652       }
2653 
2654       static_assert(sizeof(key.prolog.component_mask) ==
2655                     sizeof(sw_vs->info.vs.attrib_components_read));
2656       BITSET_COPY(key.prolog.component_mask,
2657                   sw_vs->info.vs.attrib_components_read);
2658 
2659       u_foreach_bit(a, dyn->vi->attributes_valid) {
2660          struct vk_vertex_attribute_state attr = dyn->vi->attributes[a];
2661 
2662          assert(dyn->vi->bindings_valid & BITFIELD_BIT(attr.binding));
2663          struct vk_vertex_binding_state binding =
2664             dyn->vi->bindings[attr.binding];
2665 
2666          /* nir_assign_io_var_locations compacts vertex inputs, eliminating
2667           * unused inputs. We need to do the same here to match the locations.
2668           */
2669          unsigned slot =
2670             util_bitcount64(sw_vs->info.vs.attribs_read & BITFIELD_MASK(a));
2671 
2672          key.prolog.attribs[slot] = (struct agx_velem_key){
2673             .format = vk_format_to_pipe_format(attr.format),
2674             .stride = dyn->vi_binding_strides[attr.binding],
2675             .divisor = binding.divisor,
2676             .instanced = binding.input_rate == VK_VERTEX_INPUT_RATE_INSTANCE,
2677          };
2678       }
2679 
2680       hk_update_fast_linked(cmd, sw_vs, &key);
2681    }
2682 
2683    if (IS_DIRTY(VI) || IS_DIRTY(VI_BINDINGS_VALID) || vgt_dirty ||
2684        (gfx->dirty & HK_DIRTY_VB)) {
2685 
2686       uint64_t sink = dev->rodata.zero_sink;
2687 
2688       unsigned slot = 0;
2689       u_foreach_bit(a, sw_vs->info.vs.attribs_read) {
2690          if (dyn->vi->attributes_valid & BITFIELD_BIT(a)) {
2691             struct vk_vertex_attribute_state attr = dyn->vi->attributes[a];
2692             struct hk_addr_range vb = gfx->vb[attr.binding];
2693 
2694             desc->root.draw.attrib_clamps[slot] = agx_calculate_vbo_clamp(
2695                vb.addr, sink, vk_format_to_pipe_format(attr.format), vb.range,
2696                dyn->vi_binding_strides[attr.binding], attr.offset,
2697                &desc->root.draw.attrib_base[slot]);
2698          } else {
2699             desc->root.draw.attrib_base[slot] = sink;
2700             desc->root.draw.attrib_clamps[slot] = 0;
2701          }
2702 
2703          ++slot;
2704       }
2705 
2706       desc->root_dirty = true;
2707    }
2708 
2709    if (vgt_dirty || IS_SHADER_DIRTY(FRAGMENT) ||
2710        IS_DIRTY(MS_RASTERIZATION_SAMPLES) || IS_DIRTY(MS_SAMPLE_MASK) ||
2711        IS_DIRTY(MS_ALPHA_TO_COVERAGE_ENABLE) ||
2712        IS_DIRTY(MS_ALPHA_TO_ONE_ENABLE) || IS_DIRTY(CB_LOGIC_OP) ||
2713        IS_DIRTY(CB_LOGIC_OP_ENABLE) || IS_DIRTY(CB_WRITE_MASKS) ||
2714        IS_DIRTY(CB_COLOR_WRITE_ENABLES) || IS_DIRTY(CB_ATTACHMENT_COUNT) ||
2715        IS_DIRTY(CB_BLEND_ENABLES) || IS_DIRTY(CB_BLEND_EQUATIONS) ||
2716        IS_DIRTY(CB_BLEND_CONSTANTS) ||
2717        desc->root_dirty /* for pipeline stats */ || true) {
2718 
2719       if (fs) {
2720          unsigned samples_shaded = 0;
2721          if (fs->info.fs.epilog_key.sample_shading)
2722             samples_shaded = dyn->ms.rasterization_samples;
2723 
2724          unsigned tib_sample_mask =
2725             BITFIELD_MASK(dyn->ms.rasterization_samples);
2726          unsigned api_sample_mask = dyn->ms.sample_mask & tib_sample_mask;
2727          bool has_sample_mask = api_sample_mask != tib_sample_mask;
2728 
2729          struct hk_fast_link_key_fs key = {
2730             .prolog.statistics = hk_pipeline_stat_addr(
2731                cmd,
2732                VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT),
2733 
2734             .prolog.cull_distance_size =
2735                hw_vs->info.vs.cull_distance_array_size,
2736             .prolog.api_sample_mask = has_sample_mask ? api_sample_mask : 0xff,
2737             .nr_samples_shaded = samples_shaded,
2738          };
2739 
2740          bool prolog_discards =
2741             has_sample_mask || key.prolog.cull_distance_size;
2742 
2743          bool needs_prolog = key.prolog.statistics || prolog_discards;
2744 
2745          if (needs_prolog) {
2746             /* With late main shader tests, the prolog runs tests if neither the
2747              * main shader nor epilog will.
2748              *
2749              * With (nontrivial) early main shader tests, the prolog does not
2750              * run tests, the tests will run at the start of the main shader.
2751              * This ensures tests are after API sample mask and cull distance
2752              * discards.
2753              */
2754             key.prolog.run_zs_tests = !nontrivial_force_early &&
2755                                       !fs->b.info.writes_sample_mask &&
2756                                       !epilog_discards && prolog_discards;
2757 
2758             if (key.prolog.cull_distance_size) {
2759                key.prolog.cf_base = fs->b.info.varyings.fs.nr_cf;
2760             }
2761          }
2762 
2763          key.epilog = (struct agx_fs_epilog_key){
2764             .link = fs->info.fs.epilog_key,
2765             .nr_samples = MAX2(dyn->ms.rasterization_samples, 1),
2766             .blend.alpha_to_coverage = dyn->ms.alpha_to_coverage_enable,
2767             .blend.alpha_to_one = dyn->ms.alpha_to_one_enable,
2768             .blend.logicop_func = dyn->cb.logic_op_enable
2769                                      ? vk_logic_op_to_pipe(dyn->cb.logic_op)
2770                                      : PIPE_LOGICOP_COPY,
2771          };
2772 
2773          key.epilog.link.already_ran_zs |= nontrivial_force_early;
2774 
2775          struct hk_rendering_state *render = &cmd->state.gfx.render;
2776          for (uint32_t i = 0; i < render->color_att_count; i++) {
2777             key.epilog.rt_formats[i] =
2778                vk_format_to_pipe_format(render->color_att[i].vk_format);
2779 
2780             const struct vk_color_blend_attachment_state *cb =
2781                &dyn->cb.attachments[i];
2782 
2783             bool write_enable = dyn->cb.color_write_enables & BITFIELD_BIT(i);
2784             unsigned write_mask = write_enable ? cb->write_mask : 0;
2785 
2786             /* nir_lower_blend always blends, so use a default blend state when
2787              * blending is disabled at an API level.
2788              */
2789             if (!dyn->cb.attachments[i].blend_enable) {
2790                key.epilog.blend.rt[i] = (struct agx_blend_rt_key){
2791                   .colormask = write_mask,
2792                   .rgb_func = PIPE_BLEND_ADD,
2793                   .alpha_func = PIPE_BLEND_ADD,
2794                   .rgb_src_factor = PIPE_BLENDFACTOR_ONE,
2795                   .alpha_src_factor = PIPE_BLENDFACTOR_ONE,
2796                   .rgb_dst_factor = PIPE_BLENDFACTOR_ZERO,
2797                   .alpha_dst_factor = PIPE_BLENDFACTOR_ZERO,
2798                };
2799             } else {
2800                key.epilog.blend.rt[i] = (struct agx_blend_rt_key){
2801                   .colormask = write_mask,
2802 
2803                   .rgb_src_factor =
2804                      vk_blend_factor_to_pipe(cb->src_color_blend_factor),
2805 
2806                   .rgb_dst_factor =
2807                      vk_blend_factor_to_pipe(cb->dst_color_blend_factor),
2808 
2809                   .rgb_func = vk_blend_op_to_pipe(cb->color_blend_op),
2810 
2811                   .alpha_src_factor =
2812                      vk_blend_factor_to_pipe(cb->src_alpha_blend_factor),
2813 
2814                   .alpha_dst_factor =
2815                      vk_blend_factor_to_pipe(cb->dst_alpha_blend_factor),
2816 
2817                   .alpha_func = vk_blend_op_to_pipe(cb->alpha_blend_op),
2818                };
2819             }
2820          }
2821 
2822          hk_update_fast_linked(cmd, fs, &key);
2823       } else {
2824          /* TODO: prolog without fs needs to work too... */
2825          if (cmd->state.gfx.linked[MESA_SHADER_FRAGMENT] != NULL) {
2826             cmd->state.gfx.linked_dirty |= BITFIELD_BIT(MESA_SHADER_FRAGMENT);
2827             cmd->state.gfx.linked[MESA_SHADER_FRAGMENT] = NULL;
2828          }
2829       }
2830    }
2831 
2832    /* If the vertex shader uses draw parameters, vertex uniforms are dirty every
2833     * draw. Fragment uniforms are unaffected.
2834     *
2835     * For a direct draw, we upload the draw parameters as if the draw were
2836     * indirect, so we avoid keying the shader on indirectness.
2837     */
2838    if (gfx->linked[MESA_SHADER_VERTEX]->b.uses_base_param) {
2839       if (draw.b.indirect) {
2840          gfx->draw_params = draw.b.ptr;
2841 
2842          if (draw.indexed) {
2843             gfx->draw_params +=
2844                offsetof(VkDrawIndexedIndirectCommand, vertexOffset);
2845          } else {
2846             gfx->draw_params += offsetof(VkDrawIndirectCommand, firstVertex);
2847          }
2848       } else {
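         /* Lay the two words out exactly like the tail of the indirect
          * command (vertexOffset/firstVertex, then firstInstance) so the
          * vertex shader reads its base parameters the same way for direct
          * and indirect draws.
          */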
2849          uint32_t params[] = {
2850             draw.indexed ? draw.index_bias : draw.start,
2851             draw.start_instance,
2852          };
2853 
2854          gfx->draw_params = hk_pool_upload(cmd, params, sizeof(params), 4);
2855       }
2856    } else {
2857       gfx->draw_params = 0;
2858    }
2859 
2860    if (sw_vs->b.info.uses_draw_id) {
2861       /* TODO: rodata? */
2862       gfx->draw_id_ptr = hk_pool_upload(cmd, &draw_id, 2, 4);
2863    } else {
2864       gfx->draw_id_ptr = 0;
2865    }
2866 
2867    if (IS_DIRTY(IA_PRIMITIVE_TOPOLOGY) || gt_dirty) {
2868       enum mesa_prim prim = hk_rast_prim(cmd);
2869 
2870       gfx->topology = translate_hw_primitive_topology(prim);
2871       gfx->object_type = translate_object_type(prim);
2872    }
2873 
2874    if (IS_DIRTY(IA_PRIMITIVE_TOPOLOGY) || IS_DIRTY(RS_PROVOKING_VERTEX)) {
2875       unsigned provoking;
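      /* 0, 1 and 2 select which vertex of the primitive provokes. Triangle
       * fans decompose as (centre, i + 1, i + 2), so first-vertex provoking
       * maps to vertex 1 of the decomposed triangle (inferred from the fan
       * special case below).
       */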
2876       if (dyn->rs.provoking_vertex == VK_PROVOKING_VERTEX_MODE_LAST_VERTEX_EXT)
2877          provoking = 2;
2878       else if (gfx->topology == AGX_PRIMITIVE_TRIANGLE_FAN)
2879          provoking = 1;
2880       else
2881          provoking = 0;
2882 
2883       if (provoking != gfx->provoking) {
2884          gfx->provoking = provoking;
2885          gfx->dirty |= HK_DIRTY_PROVOKING;
2886 
2887          gfx->descriptors.root.draw.provoking = provoking;
2888          gfx->descriptors.root_dirty = true;
2889       }
2890    }
2891 
2892    /* With attachmentless rendering, we don't know the sample count until draw
2893     * time, so we do a late tilebuffer fix up. But with rasterizer discard,
2894     * rasterization_samples might be 0.
2895     */
2896    if (dyn->ms.rasterization_samples &&
2897        gfx->render.tilebuffer.nr_samples != dyn->ms.rasterization_samples) {
2898 
2899       assert(gfx->render.tilebuffer.nr_samples == 0);
2900 
2901       unsigned nr_samples = MAX2(dyn->ms.rasterization_samples, 1);
2902       gfx->render.tilebuffer.nr_samples = nr_samples;
2903       agx_tilebuffer_pack_usc(&gfx->render.tilebuffer);
2904       cs->tib = gfx->render.tilebuffer;
2905    }
2906 
2907    if (IS_DIRTY(MS_SAMPLE_LOCATIONS) || IS_DIRTY(MS_SAMPLE_LOCATIONS_ENABLE) ||
2908        IS_DIRTY(MS_RASTERIZATION_SAMPLES)) {
2909 
2910       uint32_t ctrl;
2911       if (dyn->ms.sample_locations_enable) {
2912          ctrl = hk_pack_ppp_multisamplectrl(dyn->ms.sample_locations);
2913       } else {
2914          ctrl = hk_default_sample_positions(dyn->ms.rasterization_samples);
2915       }
2916 
2917       bool dont_commit = cmd->in_meta || dyn->ms.rasterization_samples == 0;
2918 
2919       if (!cs->has_sample_locations) {
2920          cs->ppp_multisamplectl = ctrl;
2921 
2922          /* If we're in vk_meta, do not commit to the sample locations yet.
2923           * vk_meta doesn't care, but the app will!
2924           */
2925          cs->has_sample_locations |= !dont_commit;
2926       } else {
2927          assert(dont_commit || cs->ppp_multisamplectl == ctrl);
2928       }
2929 
2930       gfx->descriptors.root.draw.ppp_multisamplectl = ctrl;
2931       gfx->descriptors.root_dirty = true;
2932    }
2933 
2934    /* Link varyings before uploading tessellation state, because the
2935     * gfx->generate_primitive_id boolean needs to be plumbed.
2936     */
2937    struct hk_linked_shader *linked_vs = gfx->linked[MESA_SHADER_VERTEX];
2938    struct hk_linked_shader *linked_fs = gfx->linked[MESA_SHADER_FRAGMENT];
2939    bool linked_vs_dirty = IS_LINKED_DIRTY(VERTEX);
2940    bool linked_fs_dirty = IS_LINKED_DIRTY(FRAGMENT);
2941 
2942    if ((gfx->dirty & HK_DIRTY_PROVOKING) || vgt_dirty || linked_fs_dirty) {
2943       unsigned bindings = linked_fs ? linked_fs->b.cf.nr_bindings : 0;
2944       if (bindings) {
2945          size_t linkage_size =
2946             AGX_CF_BINDING_HEADER_LENGTH + (bindings * AGX_CF_BINDING_LENGTH);
2947 
2948          struct agx_ptr t = hk_pool_usc_alloc(cmd, linkage_size, 16);
2949          if (!t.cpu)
2950             return;
2951 
2952          agx_link_varyings_vs_fs(
2953             t.cpu, &gfx->linked_varyings, hw_vs->info.uvs.user_size,
2954             &linked_fs->b.cf, gfx->provoking, 0, &gfx->generate_primitive_id);
2955 
2956          gfx->varyings = agx_usc_addr(&dev->dev, t.gpu);
2957       } else {
2958          gfx->varyings = 0;
2959       }
2960 
2961       gfx->dirty |= HK_DIRTY_VARYINGS;
2962    }
2963 
2964    if (gfx->shaders[MESA_SHADER_TESS_EVAL] ||
2965        gfx->shaders[MESA_SHADER_GEOMETRY]) {
2966 
2967       struct hk_shader *vs = hk_bound_sw_vs(gfx);
2968       desc->root.draw.vertex_outputs = vs->b.info.outputs;
2969 
2970       /* XXX: We should deduplicate this logic */
2971       bool restart = (draw.indexed && draw.restart);
2972       bool indirect = draw.b.indirect || restart;
2973 
2974       desc->root.draw.input_assembly =
2975          indirect ? hk_pool_alloc(cmd, sizeof(struct agx_ia_state), 4).gpu
2976                   : hk_upload_ia_params(cmd, draw);
2977 
2978       if (!indirect) {
2979          uint32_t verts = draw.b.count[0], instances = draw.b.count[1];
2980          unsigned vb_size =
2981             libagx_tcs_in_size(verts * instances, vs->b.info.outputs);
2982 
2983          /* Allocate if there are any outputs, or use the null sink to trap
2984           * reads if there aren't. Those reads are undefined but should not
2985           * fault. Affects:
2986           *
2987           *    dEQP-VK.pipeline.monolithic.no_position.explicit_declarations.basic.single_view.v0_g1
2988           */
2989          desc->root.draw.vertex_output_buffer =
2990             vb_size ? hk_pool_alloc(cmd, vb_size, 4).gpu
2991                     : dev->rodata.null_sink;
2992       }
2993    }
2994 
2995    if (gfx->shaders[MESA_SHADER_TESS_EVAL]) {
2996       gfx->descriptors.root.draw.tess_params = hk_upload_tess_params(cmd, draw);
2997       gfx->descriptors.root_dirty = true;
2998    }
2999 
3000    if (gfx->shaders[MESA_SHADER_GEOMETRY]) {
3001       /* TODO: size */
3002       cmd->geom_indirect = hk_pool_alloc(cmd, 64, 4).gpu;
3003 
3004       gfx->descriptors.root.draw.geometry_params =
3005          hk_upload_geometry_params(cmd, draw);
3006 
3007       gfx->descriptors.root_dirty = true;
3008    }
3009 
3010    /* Root must be uploaded after the above, which touch the root */
3011    if (gfx->descriptors.root_dirty) {
3012       gfx->root =
3013          hk_cmd_buffer_upload_root(cmd, VK_PIPELINE_BIND_POINT_GRAPHICS);
3014    }
3015 
3016    /* Hardware dynamic state must be deferred until after the root and fast
3017     * linking, since it will use the root address and the linked shaders.
3018     */
3019    if ((gfx->dirty & (HK_DIRTY_PROVOKING | HK_DIRTY_VARYINGS)) ||
3020        IS_DIRTY(RS_RASTERIZER_DISCARD_ENABLE) || linked_vs_dirty || vgt_dirty ||
3021        gfx->descriptors.root_dirty || gfx->draw_id_ptr || gfx->draw_params) {
3022 
3023       /* TODO: Do less often? */
3024       hk_reserve_scratch(cmd, cs, hw_vs);
3025 
3026       agx_push(out, VDM_STATE, cfg) {
3027          cfg.vertex_shader_word_0_present = true;
3028          cfg.vertex_shader_word_1_present = true;
3029          cfg.vertex_outputs_present = true;
3030          cfg.vertex_unknown_present = true;
3031       }
3032 
3033       agx_push_packed(out, hw_vs->counts, VDM_STATE_VERTEX_SHADER_WORD_0);
3034 
3035       struct hk_linked_shader *linked_hw_vs =
3036          (hw_vs == sw_vs) ? linked_vs : hw_vs->only_linked;
3037 
3038       agx_push(out, VDM_STATE_VERTEX_SHADER_WORD_1, cfg) {
3039          cfg.pipeline = hk_upload_usc_words(cmd, hw_vs, linked_hw_vs);
3040       }
3041 
3042       agx_push_packed(out, hw_vs->info.uvs.vdm, VDM_STATE_VERTEX_OUTPUTS);
3043 
3044       agx_push(out, VDM_STATE_VERTEX_UNKNOWN, cfg) {
3045          cfg.flat_shading_control = translate_vdm_vertex(gfx->provoking);
3046          cfg.unknown_4 = cfg.unknown_5 = dyn->rs.rasterizer_discard_enable;
3047          cfg.generate_primitive_id = gfx->generate_primitive_id;
3048       }
3049 
3050       /* Pad up to a multiple of 8 bytes */
3051       memset(out, 0, 4);
3052       out += 4;
3053    }
3054 
3055    if (IS_DIRTY(RS_DEPTH_BIAS_FACTORS)) {
3056       void *ptr =
3057          util_dynarray_grow_bytes(&cs->depth_bias, 1, AGX_DEPTH_BIAS_LENGTH);
3058 
3059       agx_pack(ptr, DEPTH_BIAS, cfg) {
3060          cfg.depth_bias = dyn->rs.depth_bias.constant;
3061          cfg.slope_scale = dyn->rs.depth_bias.slope;
3062          cfg.clamp = dyn->rs.depth_bias.clamp;
3063 
3064          /* Value from the PowerVR driver. */
3065          if (render->depth_att.vk_format == VK_FORMAT_D16_UNORM) {
3066             cfg.depth_bias /= (1 << 15);
3067          }
3068       }
3069    }
3070 
3071    /* Hardware viewport/scissor state is entangled with depth bias. */
3072    if (IS_DIRTY(RS_DEPTH_BIAS_FACTORS) || IS_DIRTY(VP_SCISSORS) ||
3073        IS_DIRTY(VP_SCISSOR_COUNT) || IS_DIRTY(VP_VIEWPORTS) ||
3074        IS_DIRTY(VP_VIEWPORT_COUNT) ||
3075        IS_DIRTY(VP_DEPTH_CLIP_NEGATIVE_ONE_TO_ONE) ||
3076        IS_DIRTY(RS_DEPTH_CLIP_ENABLE) || IS_DIRTY(RS_DEPTH_CLAMP_ENABLE)) {
3077 
3078       hk_flush_vp_state(cmd, cs, &out);
3079    }
3080 
3081    hk_flush_ppp_state(cmd, cs, &out);
3082    cs->current = out;
3083 
3084    vk_dynamic_graphics_state_clear_dirty(dyn);
3085    gfx->shaders_dirty = 0;
3086    gfx->linked_dirty = 0;
3087    gfx->dirty = 0;
3088    gfx->descriptors.root_dirty = false;
3089 }
3090 
3091 static bool
3092 hk_needs_index_robustness(struct hk_cmd_buffer *cmd, struct hk_draw draw)
3093 {
3094    struct hk_graphics_state *gfx = &cmd->state.gfx;
3095    struct hk_device *dev = hk_cmd_buffer_device(cmd);
3096 
3097    if (!draw.indexed)
3098       return false;
3099 
3100    /* If tessellation is used, we'll go through the robust path anyway, so don't
3101     * end up with a tess+geom doom combo.
3102     */
3103    if (gfx->shaders[MESA_SHADER_TESS_EVAL])
3104       return false;
3105 
3106    /* maintenance6 allows a zero index range even without robustness features */
3107    if (draw.index.range == 0)
3108       return true;
3109 
3110    if (!(dev->vk.enabled_features.robustBufferAccess ||
3111          dev->vk.enabled_features.robustBufferAccess2 ||
3112          dev->vk.enabled_features.pipelineRobustness))
3113       return false;
3114 
3115    if (draw.b.indirect) {
3116       return true;
3117    } else {
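      /* For direct draws, compare the bytes the draw can read against the
       * bound range; reads past the end need the robust path.
       */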
3118       uint32_t range_B =
3119          (draw.start + draw.b.count[0]) * agx_index_size_to_B(draw.index_size);
3120 
3121       return range_B > draw.index.range;
3122    }
3123 }
3124 
3125 static void
3126 hk_handle_passthrough_gs(struct hk_cmd_buffer *cmd, struct hk_draw draw)
3127 {
3128    struct hk_graphics_state *gfx = &cmd->state.gfx;
3129    struct hk_api_shader *gs = gfx->shaders[MESA_SHADER_GEOMETRY];
3130 
3131    /* If there's an application geometry shader, there's nothing to un/bind */
3132    if (gs && !gs->is_passthrough)
3133       return;
3134 
3135    /* Determine if we need a geometry shader to emulate XFB or adjacency */
3136    struct vk_dynamic_graphics_state *dyn = &cmd->vk.dynamic_graphics_state;
3137    struct hk_shader *last_sw = hk_bound_sw_vs_before_gs(gfx);
3138    uint32_t xfb_outputs = last_sw->info.xfb_info.output_count;
3139 
3140    VkPrimitiveTopology topology = dyn->ia.primitive_topology;
3141    bool adjacency =
3142       (topology == VK_PRIMITIVE_TOPOLOGY_LINE_LIST_WITH_ADJACENCY) ||
3143       (topology == VK_PRIMITIVE_TOPOLOGY_LINE_STRIP_WITH_ADJACENCY) ||
3144       (topology == VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST_WITH_ADJACENCY) ||
3145       (topology == VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP_WITH_ADJACENCY);
3146 
3147    /* TODO: Don't use a whole GS just for index robustness. */
3148    bool index_robustness = hk_needs_index_robustness(cmd, draw);
3149 
3150    bool needs_gs = xfb_outputs || adjacency || index_robustness;
3151 
3152    /* Various pipeline statistics are implemented in the pre-GS shader. TODO:
3153     * This could easily be optimized.
3154     */
3155    VkQueryPipelineStatisticFlagBits ia_statistics[] = {
3156       VK_QUERY_PIPELINE_STATISTIC_INPUT_ASSEMBLY_PRIMITIVES_BIT,
3157       VK_QUERY_PIPELINE_STATISTIC_CLIPPING_PRIMITIVES_BIT,
3158       VK_QUERY_PIPELINE_STATISTIC_CLIPPING_INVOCATIONS_BIT,
3159    };
3160 
3161    bool ia_stats = false;
3162 
3163    for (unsigned i = 0; i < ARRAY_SIZE(ia_statistics); ++i) {
3164       ia_stats |= hk_pipeline_stat_addr(cmd, ia_statistics[i]) != 0;
3165    }
3166 
3167    needs_gs |= ia_stats;
3168 
3169    /* If we already have a matching GS configuration, we're done */
3170    if ((gs != NULL) == needs_gs)
3171       return;
3172 
3173    /* If we don't need a GS but we do have a passthrough, unbind it */
3174    if (gs) {
3175       assert(!needs_gs && gs->is_passthrough);
3176       hk_cmd_bind_graphics_shader(cmd, MESA_SHADER_GEOMETRY, NULL);
3177       return;
3178    }
3179 
3180    /* Else, we need to bind a passthrough GS */
3181    size_t key_size =
3182       sizeof(struct hk_passthrough_gs_key) + nir_xfb_info_size(xfb_outputs);
3183    struct hk_passthrough_gs_key *key = alloca(key_size);
3184 
3185    *key = (struct hk_passthrough_gs_key){
3186       .prim = u_decomposed_prim(hk_gs_in_prim(cmd)),
3187       .outputs = last_sw->b.info.outputs,
3188       .clip_distance_array_size = last_sw->info.clip_distance_array_size,
3189       .cull_distance_array_size = last_sw->info.cull_distance_array_size,
3190    };
3191 
3192    if (xfb_outputs) {
3193       typed_memcpy(key->xfb_stride, last_sw->info.xfb_stride,
3194                    ARRAY_SIZE(key->xfb_stride));
3195 
3196       memcpy(&key->xfb_info, &last_sw->info.xfb_info,
3197              nir_xfb_info_size(xfb_outputs));
3198    }
3199 
3200    struct hk_device *dev = hk_cmd_buffer_device(cmd);
3201    perf_debug(dev, "Binding passthrough GS for%s%s%s%s\n",
3202               xfb_outputs ? " XFB" : "", adjacency ? " adjacency" : "",
3203               index_robustness ? " robustness" : "",
3204               ia_stats ? " statistics" : "");
3205 
3206    gs = hk_meta_shader(dev, hk_nir_passthrough_gs, key, key_size);
3207    gs->is_passthrough = true;
3208    hk_cmd_bind_graphics_shader(cmd, MESA_SHADER_GEOMETRY, gs);
3209 }
3210 
3211 static struct hk_cs *
3212 hk_flush_gfx_state(struct hk_cmd_buffer *cmd, uint32_t draw_id,
3213                    struct hk_draw draw)
3214 {
3215    struct hk_cs *cs = hk_cmd_buffer_get_cs(cmd, false /* compute */);
3216    if (!cs)
3217       return NULL;
3218 
3219    hk_ensure_cs_has_space(cmd, cs, 0x2000 /* TODO */);
3220 
3221    struct hk_graphics_state *gfx = &cmd->state.gfx;
3222    struct hk_descriptor_state *desc = &gfx->descriptors;
3223    struct hk_device *dev = hk_cmd_buffer_device(cmd);
3224 
3225 #ifndef NDEBUG
3226    if (unlikely(dev->dev.debug & AGX_DBG_DIRTY)) {
3227       hk_cmd_buffer_dirty_all(cmd);
3228    }
3229 #endif
3230 
3231    /* TODO: Try to reduce draw overhead of this */
3232    hk_handle_passthrough_gs(cmd, draw);
3233 
3234    hk_flush_shaders(cmd);
3235 
3236    if (desc->push_dirty)
3237       hk_cmd_buffer_flush_push_descriptors(cmd, desc);
3238 
3239    if ((gfx->dirty & HK_DIRTY_INDEX) &&
3240        (draw.restart || gfx->shaders[MESA_SHADER_GEOMETRY]))
3241       hk_flush_index(cmd, cs);
3242 
3243    hk_flush_dynamic_state(cmd, cs, draw_id, draw);
3244    return cs;
3245 }
3246 
3247 VKAPI_ATTR void VKAPI_CALL
3248 hk_CmdBindIndexBuffer2KHR(VkCommandBuffer commandBuffer, VkBuffer _buffer,
3249                           VkDeviceSize offset, VkDeviceSize size,
3250                           VkIndexType indexType)
3251 {
3252    VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer);
3253    VK_FROM_HANDLE(hk_buffer, buffer, _buffer);
3254 
3255    cmd->state.gfx.index = (struct hk_index_buffer_state){
3256       .buffer = hk_buffer_addr_range(buffer, offset, size),
3257       .size = agx_translate_index_size(vk_index_type_to_bytes(indexType)),
3258       .restart = vk_index_to_restart(indexType),
3259    };
3260 
3261    /* TODO: check if necessary, blob does this */
3262    cmd->state.gfx.index.buffer.range =
3263       align(cmd->state.gfx.index.buffer.range, 4);
3264 
3265    cmd->state.gfx.dirty |= HK_DIRTY_INDEX;
3266 }
3267 
3268 void
3269 hk_cmd_bind_vertex_buffer(struct hk_cmd_buffer *cmd, uint32_t vb_idx,
3270                           struct hk_addr_range addr_range)
3271 {
3272    cmd->state.gfx.vb[vb_idx] = addr_range;
3273    cmd->state.gfx.dirty |= HK_DIRTY_VB;
3274 }
3275 
3276 VKAPI_ATTR void VKAPI_CALL
3277 hk_CmdBindVertexBuffers2(VkCommandBuffer commandBuffer, uint32_t firstBinding,
3278                          uint32_t bindingCount, const VkBuffer *pBuffers,
3279                          const VkDeviceSize *pOffsets,
3280                          const VkDeviceSize *pSizes,
3281                          const VkDeviceSize *pStrides)
3282 {
3283    VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer);
3284 
3285    if (pStrides) {
3286       vk_cmd_set_vertex_binding_strides(&cmd->vk, firstBinding, bindingCount,
3287                                         pStrides);
3288    }
3289 
3290    for (uint32_t i = 0; i < bindingCount; i++) {
3291       VK_FROM_HANDLE(hk_buffer, buffer, pBuffers[i]);
3292       uint32_t idx = firstBinding + i;
3293 
3294       uint64_t size = pSizes ? pSizes[i] : VK_WHOLE_SIZE;
3295       const struct hk_addr_range addr_range =
3296          hk_buffer_addr_range(buffer, pOffsets[i], size);
3297 
3298       hk_cmd_bind_vertex_buffer(cmd, idx, addr_range);
3299    }
3300 }
3301 
3302 static bool
3303 hk_set_view_index(struct hk_cmd_buffer *cmd, uint32_t view_idx)
3304 {
3305    if (cmd->state.gfx.render.view_mask) {
3306       cmd->state.gfx.descriptors.root.draw.view_index = view_idx;
3307       cmd->state.gfx.descriptors.root_dirty = true;
3308    }
3309 
3310    return true;
3311 }
3312 
3313 /*
3314  * Iterator macro to duplicate a draw for each enabled view (when multiview is
3315  * enabled, else always view 0). Along with hk_lower_multiview, this forms the
3316  * world's worst multiview lowering.
3317  */
3318 #define hk_foreach_view(cmd)                                                   \
3319    u_foreach_bit(view_idx, cmd->state.gfx.render.view_mask ?: 1)               \
3320       if (hk_set_view_index(cmd, view_idx))
3321 
3322 static void
3323 hk_ia_update(struct hk_cmd_buffer *cmd, struct hk_cs *cs, struct hk_draw draw,
3324              uint64_t ia_vertices, uint64_t vs_invocations)
3325 {
3326    /* XXX: stream link needed? */
3327    struct hk_device *dev = hk_cmd_buffer_device(cmd);
3328    perf_debug(dev, "Input assembly counters");
3329 
3330    struct agx_increment_ia_counters_key key = {
3331       .index_size_B = draw.restart ? agx_index_size_to_B(draw.index_size) : 0,
3332    };
3333 
3334    uint64_t draw_ptr;
3335    if (draw.b.indirect) {
3336       draw_ptr = draw.b.ptr;
3337    } else {
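      /* Synthesize a minimal (count, instance count, start = 0) descriptor so
       * the counter kernel can treat direct draws like indirect ones.
       */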
3338       uint32_t desc[] = {draw.b.count[0], draw.b.count[1], 0};
3339       draw_ptr = hk_pool_upload(cmd, &desc, sizeof(desc), 4);
3340    }
3341 
3342    struct libagx_increment_ia_counters args = {
3343       .ia_vertices = ia_vertices,
3344       .vs_invocations = vs_invocations,
3345       .restart_index = cmd->state.gfx.index.restart,
3346       .draw = draw_ptr,
3347       .index_buffer = draw.index.addr,
3348       .index_buffer_range_el =
3349          key.index_size_B ? (draw.index.range / key.index_size_B) : 0,
3350    };
3351 
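   /* With primitive restart the kernel walks the index buffer, so give it a
    * full 1024-wide workgroup; without restart a single invocation just adds
    * the counts (assumption based on the key and grid below).
    */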
3352    uint64_t wg_size = key.index_size_B ? 1024 : 1;
3353 
3354    struct hk_shader *s =
3355       hk_meta_kernel(dev, agx_nir_increment_ia_counters, &key, sizeof(key));
3356 
3357    uint64_t push = hk_pool_upload(cmd, &args, sizeof(args), 8);
3358    uint32_t usc = hk_upload_usc_words_kernel(cmd, s, &push, sizeof(push));
3359 
3360    hk_dispatch_with_usc(dev, cs, s, usc, hk_grid(wg_size, 1, 1),
3361                         hk_grid(wg_size, 1, 1));
3362 }
3363 
3364 static void
3365 hk_draw(struct hk_cmd_buffer *cmd, uint16_t draw_id, struct hk_draw draw_)
3366 {
3367    const struct vk_dynamic_graphics_state *dyn =
3368       &cmd->vk.dynamic_graphics_state;
3369 
3370    /* Filter trivial draws so we don't need to worry about null index buffers */
3371    if (!draw_.b.indirect && (draw_.b.count[0] == 0 || draw_.b.count[1] == 0))
3372       return;
3373 
3374    draw_.restart = dyn->ia.primitive_restart_enable;
3375    draw_.index_size = cmd->state.gfx.index.size;
3376 
3377    uint64_t stat_ia_verts = hk_pipeline_stat_addr(
3378       cmd, VK_QUERY_PIPELINE_STATISTIC_INPUT_ASSEMBLY_VERTICES_BIT);
3379 
3380    uint64_t stat_vs_inv = hk_pipeline_stat_addr(
3381       cmd, VK_QUERY_PIPELINE_STATISTIC_VERTEX_SHADER_INVOCATIONS_BIT);
3382 
3383    bool ia_stats = stat_ia_verts || stat_vs_inv;
3384 
3385    hk_foreach_view(cmd) {
3386       struct hk_draw draw = draw_;
3387       struct hk_cs *cs = hk_flush_gfx_state(cmd, draw_id, draw);
3388       /* If we failed to allocate a control stream, we've already lost the
3389        * device. Just drop the draw so we don't crash.
3390        */
3391       if (!cs)
3392          return;
3393 
3394       bool geom = cmd->state.gfx.shaders[MESA_SHADER_GEOMETRY];
3395       bool tess = cmd->state.gfx.shaders[MESA_SHADER_TESS_EVAL];
3396       struct hk_cs *ccs = NULL;
3397       uint8_t *out = cs->current;
3398       assert(cs->current + 0x1000 < cs->end);
3399 
3400       if (geom || tess || ia_stats) {
3401          ccs =
3402             hk_cmd_buffer_get_cs_general(cmd, &cmd->current_cs.pre_gfx, true);
3403          if (!ccs)
3404             return;
3405       }
3406 
3407       if (ia_stats) {
3408          hk_ia_update(cmd, ccs, draw, stat_ia_verts, stat_vs_inv);
3409       }
3410 
3411       if (tess) {
3412          draw = hk_launch_tess(cmd, ccs, draw);
3413 
3414          if (draw.raw) {
3415             assert(!geom);
3416             assert(draw.b.indirect);
3417 
3418             agx_push(out, VDM_STREAM_LINK, cfg) {
3419                cfg.target_lo = draw.b.ptr & BITFIELD_MASK(32);
3420                cfg.target_hi = draw.b.ptr >> 32;
3421                cfg.with_return = true;
3422             }
3423 
3424             cs->current = out;
3425             continue;
3426          }
3427       }
3428 
3429       if (geom) {
3430          draw = hk_launch_gs_prerast(cmd, ccs, draw);
3431 
3432          /* We must not draw if the app specified rasterizer discard. This is
3433           * required both for performance (it is pointless to rasterize and
3434           * there are no side effects) and for correctness (no indirect draw
3435           * descriptor will be filled out).
3436           */
3437          if (dyn->rs.rasterizer_discard_enable)
3438             continue;
3439       }
3440 
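      /* For direct indexed draws, bias the index buffer address by the first
       * index; index_size appears to encode log2 of the index size in bytes,
       * so the shift converts elements to bytes (assumption based on the
       * agx_index_size_to_B usage above).
       */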
3441       uint64_t ib = draw.index.addr;
3442       if (draw.indexed && !draw.b.indirect)
3443          ib += (draw.start << draw.index_size);
3444 
3445       agx_push(out, INDEX_LIST, cfg) {
3446          cfg.primitive = cmd->state.gfx.topology;
3447 
3448          if (draw.b.indirect) {
3449             cfg.indirect_buffer_present = true;
3450          } else {
3451             cfg.instance_count_present = true;
3452             cfg.index_count_present = true;
3453             cfg.start_present = true;
3454          }
3455 
3456          if (draw.indexed) {
3457             cfg.restart_enable = draw.restart;
3458             cfg.index_buffer_hi = ib >> 32;
3459             cfg.index_size = draw.index_size;
3460 
3461             cfg.index_buffer_present = true;
3462             cfg.index_buffer_size_present = true;
3463          }
3464       }
3465 
3466       if (draw.indexed) {
3467          agx_push(out, INDEX_LIST_BUFFER_LO, cfg) {
3468             cfg.buffer_lo = ib;
3469          }
3470       }
3471 
3472       if (draw.b.indirect) {
3473          agx_push(out, INDEX_LIST_INDIRECT_BUFFER, cfg) {
3474             cfg.address_hi = draw.b.ptr >> 32;
3475             cfg.address_lo = draw.b.ptr & BITFIELD_MASK(32);
3476          }
3477       } else {
3478          agx_push(out, INDEX_LIST_COUNT, cfg) {
3479             cfg.count = draw.b.count[0];
3480          }
3481 
3482          agx_push(out, INDEX_LIST_INSTANCES, cfg) {
3483             cfg.count = draw.b.count[1];
3484          }
3485 
3486          agx_push(out, INDEX_LIST_START, cfg) {
3487             cfg.start = draw.indexed ? draw.index_bias : draw.start;
3488          }
3489       }
3490 
3491       if (draw.indexed) {
3492          agx_push(out, INDEX_LIST_BUFFER_SIZE, cfg) {
3493             cfg.size = draw.index.range;
3494          }
3495       }
3496 
3497       cs->current = out;
3498    }
3499 }
3500 
3501 VKAPI_ATTR void VKAPI_CALL
3502 hk_CmdDraw(VkCommandBuffer commandBuffer, uint32_t vertexCount,
3503            uint32_t instanceCount, uint32_t firstVertex, uint32_t firstInstance)
3504 {
3505    VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer);
3506 
3507    struct hk_draw draw = {
3508       .b = hk_grid(vertexCount, instanceCount, 1),
3509       .start = firstVertex,
3510       .start_instance = firstInstance,
3511    };
3512 
3513    hk_draw(cmd, 0, draw);
3514 }
3515 
3516 VKAPI_ATTR void VKAPI_CALL
3517 hk_CmdDrawMultiEXT(VkCommandBuffer commandBuffer, uint32_t drawCount,
3518                    const VkMultiDrawInfoEXT *pVertexInfo,
3519                    uint32_t instanceCount, uint32_t firstInstance,
3520                    uint32_t stride)
3521 {
3522    VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer);
3523 
3524    for (unsigned i = 0; i < drawCount; ++i) {
3525       struct hk_draw draw = {
3526          .b = hk_grid(pVertexInfo->vertexCount, instanceCount, 1),
3527          .start = pVertexInfo->firstVertex,
3528          .start_instance = firstInstance,
3529       };
3530 
3531       hk_draw(cmd, i, draw);
3532       pVertexInfo = ((void *)pVertexInfo) + stride;
3533    }
3534 }
3535 
3536 static void
3537 hk_draw_indexed(VkCommandBuffer commandBuffer, uint16_t draw_id,
3538                 uint32_t indexCount, uint32_t instanceCount,
3539                 uint32_t firstIndex, int32_t vertexOffset,
3540                 uint32_t firstInstance)
3541 {
3542    VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer);
3543 
3544    struct hk_draw draw = {
3545       .b = hk_grid(indexCount, instanceCount, 1),
3546       .indexed = true,
3547       .index = cmd->state.gfx.index.buffer,
3548       .start = firstIndex,
3549       .index_bias = vertexOffset,
3550       .start_instance = firstInstance,
3551    };
3552 
3553    hk_draw(cmd, draw_id, draw);
3554 }
3555 
3556 VKAPI_ATTR void VKAPI_CALL
3557 hk_CmdDrawIndexed(VkCommandBuffer commandBuffer, uint32_t indexCount,
3558                   uint32_t instanceCount, uint32_t firstIndex,
3559                   int32_t vertexOffset, uint32_t firstInstance)
3560 {
3561    hk_draw_indexed(commandBuffer, 0, indexCount, instanceCount, firstIndex,
3562                    vertexOffset, firstInstance);
3563 }
3564 
3565 VKAPI_ATTR void VKAPI_CALL
3566 hk_CmdDrawMultiIndexedEXT(VkCommandBuffer commandBuffer, uint32_t drawCount,
3567                           const VkMultiDrawIndexedInfoEXT *pIndexInfo,
3568                           uint32_t instanceCount, uint32_t firstInstance,
3569                           uint32_t stride, const int32_t *pVertexOffset)
3570 {
3571    for (unsigned i = 0; i < drawCount; ++i) {
3572       const uint32_t vertex_offset =
3573          pVertexOffset != NULL ? *pVertexOffset : pIndexInfo->vertexOffset;
3574 
3575       hk_draw_indexed(commandBuffer, i, pIndexInfo->indexCount, instanceCount,
3576                       pIndexInfo->firstIndex, vertex_offset, firstInstance);
3577 
3578       pIndexInfo = ((void *)pIndexInfo) + stride;
3579    }
3580 }
3581 
3582 static void
3583 hk_draw_indirect_inner(VkCommandBuffer commandBuffer, uint64_t base,
3584                        uint32_t drawCount, uint32_t stride)
3585 {
3586    VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer);
3587 
3588    /* From the Vulkan 1.3.238 spec:
3589     *
3590     *    VUID-vkCmdDrawIndirect-drawCount-00476
3591     *
3592     *    "If drawCount is greater than 1, stride must be a multiple of 4 and
3593     *    must be greater than or equal to sizeof(VkDrawIndirectCommand)"
3594     *
3595     * and
3596     *
3597     *    "If drawCount is less than or equal to one, stride is ignored."
3598     */
3599    if (drawCount > 1) {
3600       assert(stride % 4 == 0);
3601       assert(stride >= sizeof(VkDrawIndirectCommand));
3602    }
3603 
3604    for (unsigned draw_id = 0; draw_id < drawCount; ++draw_id) {
3605       uint64_t addr = base + stride * draw_id;
3606       hk_draw(cmd, draw_id, hk_draw_indirect(addr));
3607    }
3608 }
3609 
3610 VKAPI_ATTR void VKAPI_CALL
3611 hk_CmdDrawIndirect(VkCommandBuffer commandBuffer, VkBuffer _buffer,
3612                    VkDeviceSize offset, uint32_t drawCount, uint32_t stride)
3613 {
3614    VK_FROM_HANDLE(hk_buffer, buffer, _buffer);
3615 
3616    hk_draw_indirect_inner(commandBuffer, hk_buffer_address(buffer, offset),
3617                           drawCount, stride);
3618 }
3619 
3620 static void
3621 hk_draw_indexed_indirect_inner(VkCommandBuffer commandBuffer, uint64_t buffer,
3622                                uint32_t drawCount, uint32_t stride)
3623 {
3624    VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer);
3625 
3626    /* From the Vulkan 1.3.238 spec:
3627     *
3628     *    VUID-vkCmdDrawIndexedIndirect-drawCount-00528
3629     *
3630     *    "If drawCount is greater than 1, stride must be a multiple of 4 and
3631     *    must be greater than or equal to
3632     *    sizeof(VkDrawIndexedIndirectCommand)"
3633     *
3634     * and
3635     *
3636     *    "If drawCount is less than or equal to one, stride is ignored."
3637     */
3638    if (drawCount > 1) {
3639       assert(stride % 4 == 0);
3640       assert(stride >= sizeof(VkDrawIndexedIndirectCommand));
3641    }
3642 
3643    for (unsigned draw_id = 0; draw_id < drawCount; ++draw_id) {
3644       uint64_t addr = buffer + stride * draw_id;
3645 
3646       hk_draw(
3647          cmd, draw_id,
3648          hk_draw_indexed_indirect(addr, cmd->state.gfx.index.buffer, 0, 0));
3649    }
3650 }
3651 
3652 VKAPI_ATTR void VKAPI_CALL
3653 hk_CmdDrawIndexedIndirect(VkCommandBuffer commandBuffer, VkBuffer _buffer,
3654                           VkDeviceSize offset, uint32_t drawCount,
3655                           uint32_t stride)
3656 {
3657    VK_FROM_HANDLE(hk_buffer, buffer, _buffer);
3658 
3659    hk_draw_indexed_indirect_inner(
3660       commandBuffer, hk_buffer_address(buffer, offset), drawCount, stride);
3661 }
3662 
3663 /*
3664  * To implement drawIndirectCount generically, we dispatch a compute kernel to
3665  * patch the indirect buffer and then we dispatch the predicated maxDrawCount
3666  * indirect draws.
3667  */
3668 static void
3669 hk_draw_indirect_count(VkCommandBuffer commandBuffer, VkBuffer _buffer,
3670                        VkDeviceSize offset, VkBuffer countBuffer,
3671                        VkDeviceSize countBufferOffset, uint32_t maxDrawCount,
3672                        uint32_t stride, bool indexed)
3673 {
3674    VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer);
3675    VK_FROM_HANDLE(hk_buffer, buffer, _buffer);
3676    VK_FROM_HANDLE(hk_buffer, count_buffer, countBuffer);
3677 
3678    struct hk_device *dev = hk_cmd_buffer_device(cmd);
3679    struct agx_predicate_indirect_key key = {.indexed = indexed};
3680    struct hk_shader *s =
3681       hk_meta_kernel(dev, agx_nir_predicate_indirect, &key, sizeof(key));
3682 
3683    perf_debug(dev, "Draw indirect count");
3684 
3685    struct hk_cs *cs =
3686       hk_cmd_buffer_get_cs_general(cmd, &cmd->current_cs.pre_gfx, true);
3687    if (!cs)
3688       return;
3689 
3690    hk_ensure_cs_has_space(cmd, cs, 0x2000 /* TODO */);
3691 
3692    assert((stride % 4) == 0 && "aligned");
3693 
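   /* The patched copy is tightly packed: 4 words per VkDrawIndirectCommand,
    * 5 per VkDrawIndexedIndirectCommand.
    */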
3694    size_t out_stride = sizeof(uint32_t) * (indexed ? 5 : 4);
3695    uint64_t patched = hk_pool_alloc(cmd, out_stride * maxDrawCount, 4).gpu;
3696 
3697    struct libagx_predicate_indirect_push push = {
3698       .in = hk_buffer_address(buffer, offset),
3699       .out = patched,
3700       .draw_count = hk_buffer_address(count_buffer, countBufferOffset),
3701       .stride_el = stride / 4,
3702    };
3703 
3704    uint64_t push_ = hk_pool_upload(cmd, &push, sizeof(push), 8);
3705    uint32_t usc = hk_upload_usc_words_kernel(cmd, s, &push_, sizeof(push_));
3706 
3707    hk_dispatch_with_usc(dev, cs, s, usc, hk_grid(maxDrawCount, 1, 1),
3708                         hk_grid(1, 1, 1));
3709 
3710    if (indexed) {
3711       hk_draw_indexed_indirect_inner(commandBuffer, patched, maxDrawCount,
3712                                      out_stride);
3713    } else {
3714       hk_draw_indirect_inner(commandBuffer, patched, maxDrawCount, out_stride);
3715    }
3716 }
3717 
3718 VKAPI_ATTR void VKAPI_CALL
3719 hk_CmdDrawIndirectCount(VkCommandBuffer commandBuffer, VkBuffer _buffer,
3720                         VkDeviceSize offset, VkBuffer countBuffer,
3721                         VkDeviceSize countBufferOffset, uint32_t maxDrawCount,
3722                         uint32_t stride)
3723 {
3724    hk_draw_indirect_count(commandBuffer, _buffer, offset, countBuffer,
3725                           countBufferOffset, maxDrawCount, stride, false);
3726 }
3727 
3728 VKAPI_ATTR void VKAPI_CALL
3729 hk_CmdDrawIndexedIndirectCount(VkCommandBuffer commandBuffer, VkBuffer _buffer,
3730                                VkDeviceSize offset, VkBuffer countBuffer,
3731                                VkDeviceSize countBufferOffset,
3732                                uint32_t maxDrawCount, uint32_t stride)
3733 {
3734    hk_draw_indirect_count(commandBuffer, _buffer, offset, countBuffer,
3735                           countBufferOffset, maxDrawCount, stride, true);
3736 }
3737 
3738 VKAPI_ATTR void VKAPI_CALL
3739 hk_CmdDrawIndirectByteCountEXT(VkCommandBuffer commandBuffer,
3740                                uint32_t instanceCount, uint32_t firstInstance,
3741                                VkBuffer counterBuffer,
3742                                VkDeviceSize counterBufferOffset,
3743                                uint32_t counterOffset, uint32_t vertexStride)
3744 {
3745    unreachable("TODO");
3746 }
3747 
3748 VKAPI_ATTR void VKAPI_CALL
3749 hk_CmdBindTransformFeedbackBuffersEXT(VkCommandBuffer commandBuffer,
3750                                       uint32_t firstBinding,
3751                                       uint32_t bindingCount,
3752                                       const VkBuffer *pBuffers,
3753                                       const VkDeviceSize *pOffsets,
3754                                       const VkDeviceSize *pSizes)
3755 {
3756    VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer);
3757    struct hk_graphics_state *gfx = &cmd->state.gfx;
3758 
3759    for (uint32_t i = 0; i < bindingCount; i++) {
3760       VK_FROM_HANDLE(hk_buffer, buffer, pBuffers[i]);
3761       uint32_t idx = firstBinding + i;
3762       uint64_t size = pSizes ? pSizes[i] : VK_WHOLE_SIZE;
3763 
3764       gfx->xfb[idx] = hk_buffer_addr_range(buffer, pOffsets[i], size);
3765    }
3766 }
3767 
3768 static void
3769 hk_libagx_copy_xfb_counters(nir_builder *b, const void *key)
3770 {
3771    b->shader->info.workgroup_size_variable = true;
3772 
3773    libagx_copy_xfb_counters(b, nir_load_preamble(b, 1, 64));
3774 }
3775 
3776 static void
3777 hk_begin_end_xfb(VkCommandBuffer commandBuffer, uint32_t firstCounterBuffer,
3778                  uint32_t counterBufferCount, const VkBuffer *pCounterBuffers,
3779                  const VkDeviceSize *pCounterBufferOffsets, bool begin)
3780 
3781 {
3782    VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer);
3783    struct hk_device *dev = hk_cmd_buffer_device(cmd);
3784    struct hk_graphics_state *gfx = &cmd->state.gfx;
3785 
3786    gfx->xfb_enabled = begin;
3787 
3788    /* If we haven't reserved XFB offsets yet for the command buffer, do so. */
3789    if (!gfx->xfb_offsets) {
3790       gfx->xfb_offsets = hk_pool_alloc(cmd, 4 * sizeof(uint32_t), 4).gpu;
3791    }
3792 
3793    struct hk_cs *cs =
3794       hk_cmd_buffer_get_cs_general(cmd, &cmd->current_cs.pre_gfx, true);
3795    if (!cs)
3796       return;
3797    hk_ensure_cs_has_space(cmd, cs, 0x2000 /* TODO */);
3798 
3799    struct libagx_xfb_counter_copy params = {};
3800    unsigned copies = 0;
3801 
3802    /* For CmdBeginTransformFeedbackEXT, we need to initialize everything */
3803    if (begin) {
3804       for (copies = 0; copies < 4; ++copies) {
3805          params.dest[copies] = gfx->xfb_offsets + copies * sizeof(uint32_t);
3806       }
3807    }
3808 
3809    for (unsigned i = 0; i < counterBufferCount; ++i) {
3810       if (pCounterBuffers[i] == VK_NULL_HANDLE)
3811          continue;
3812 
3813       VK_FROM_HANDLE(hk_buffer, buffer, pCounterBuffers[i]);
3814 
3815       uint64_t offset = pCounterBufferOffsets ? pCounterBufferOffsets[i] : 0;
3816       uint64_t cb_addr = hk_buffer_address(buffer, offset);
3817       uint32_t cmd_idx = firstCounterBuffer + i;
3818 
3819       if (begin) {
3820          params.src[cmd_idx] = cb_addr;
3821       } else {
3822          params.dest[copies] = cb_addr;
3823          params.src[copies] = gfx->xfb_offsets + cmd_idx * sizeof(uint32_t);
3824          ++copies;
3825       }
3826    }
3827 
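   /* On begin, all four internal offsets get written: counters with a bound
    * buffer are copied from it and the rest are presumably reset by the
    * kernel (their src stays 0), so always dispatch four copies.
    */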
3828    if (begin)
3829       copies = 4;
3830 
3831    if (copies > 0) {
3832       perf_debug(dev, "XFB counter copy");
3833 
3834       struct hk_shader *s =
3835          hk_meta_kernel(dev, hk_libagx_copy_xfb_counters, NULL, 0);
3836 
3837       uint64_t push = hk_pool_upload(cmd, &params, sizeof(params), 8);
3838       uint32_t usc = hk_upload_usc_words_kernel(cmd, s, &push, sizeof(push));
3839 
3840       hk_dispatch_with_usc(dev, cs, s, usc, hk_grid(copies, 1, 1),
3841                            hk_grid(copies, 1, 1));
3842    }
3843 }
3844 
3845 VKAPI_ATTR void VKAPI_CALL
3846 hk_CmdBeginTransformFeedbackEXT(VkCommandBuffer commandBuffer,
3847                                 uint32_t firstCounterBuffer,
3848                                 uint32_t counterBufferCount,
3849                                 const VkBuffer *pCounterBuffers,
3850                                 const VkDeviceSize *pCounterBufferOffsets)
3851 {
3852    hk_begin_end_xfb(commandBuffer, firstCounterBuffer, counterBufferCount,
3853                     pCounterBuffers, pCounterBufferOffsets, true);
3854 }
3855 
3856 VKAPI_ATTR void VKAPI_CALL
3857 hk_CmdEndTransformFeedbackEXT(VkCommandBuffer commandBuffer,
3858                               uint32_t firstCounterBuffer,
3859                               uint32_t counterBufferCount,
3860                               const VkBuffer *pCounterBuffers,
3861                               const VkDeviceSize *pCounterBufferOffsets)
3862 {
3863    hk_begin_end_xfb(commandBuffer, firstCounterBuffer, counterBufferCount,
3864                     pCounterBuffers, pCounterBufferOffsets, false);
3865 }
3866 
3867 VKAPI_ATTR void VKAPI_CALL
3868 hk_CmdBeginConditionalRenderingEXT(
3869    VkCommandBuffer commandBuffer,
3870    const VkConditionalRenderingBeginInfoEXT *pConditionalRenderingBegin)
3871 {
3872    unreachable("stub");
3873 }
3874 
3875 VKAPI_ATTR void VKAPI_CALL
3876 hk_CmdEndConditionalRenderingEXT(VkCommandBuffer commandBuffer)
3877 {
3878    unreachable("stub");
3879 }
3880