1 /*
2 * Copyright 2024 Valve Corporation
3 * Copyright 2024 Alyssa Rosenzweig
4 * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc.
5 * SPDX-License-Identifier: MIT
6 */
7 #include <assert.h>
8 #include "agx_bg_eot.h"
9 #include "agx_bo.h"
10 #include "agx_compile.h"
11 #include "agx_compiler.h"
12 #include "agx_device.h"
13 #include "agx_helpers.h"
14 #include "agx_linker.h"
15 #include "agx_nir_lower_gs.h"
16 #include "agx_nir_lower_vbo.h"
17 #include "agx_ppp.h"
18 #include "agx_tilebuffer.h"
19 #include "agx_usc.h"
20 #include "agx_uvs.h"
21 #include "hk_buffer.h"
22 #include "hk_cmd_buffer.h"
23 #include "hk_device.h"
24 #include "hk_entrypoints.h"
25 #include "hk_image.h"
26 #include "hk_image_view.h"
27 #include "hk_physical_device.h"
28 #include "hk_private.h"
29 #include "hk_shader.h"
30
31 #include "asahi/genxml/agx_pack.h"
32 #include "asahi/lib/libagx_shaders.h"
33 #include "asahi/lib/shaders/draws.h"
34 #include "asahi/lib/shaders/geometry.h"
35 #include "shaders/query.h"
36 #include "shaders/tessellator.h"
37 #include "util/bitpack_helpers.h"
38 #include "util/blend.h"
39 #include "util/format/format_utils.h"
40 #include "util/format/u_formats.h"
41 #include "util/macros.h"
42 #include "util/ralloc.h"
43 #include "vulkan/vulkan_core.h"
44 #include "layout.h"
45 #include "nir.h"
46 #include "nir_builder.h"
47 #include "nir_lower_blend.h"
48 #include "nir_xfb_info.h"
49 #include "pool.h"
50 #include "shader_enums.h"
51 #include "vk_blend.h"
52 #include "vk_enum_to_str.h"
53 #include "vk_format.h"
54 #include "vk_graphics_state.h"
55 #include "vk_pipeline.h"
56 #include "vk_render_pass.h"
57 #include "vk_standard_sample_locations.h"
58 #include "vk_util.h"
59
60 #define IS_DIRTY(bit) BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_##bit)
61
62 #define IS_SHADER_DIRTY(bit) \
63 (cmd->state.gfx.shaders_dirty & BITFIELD_BIT(MESA_SHADER_##bit))
64
65 #define IS_LINKED_DIRTY(bit) \
66 (cmd->state.gfx.linked_dirty & BITFIELD_BIT(MESA_SHADER_##bit))
67
68 struct hk_draw {
69 struct hk_grid b;
70 struct hk_addr_range index;
71 bool indexed;
72 uint32_t start;
73 uint32_t index_bias;
74 uint32_t start_instance;
75
76 /* Indicates that the indirect draw consists of raw VDM commands and should
77 * be stream linked to. Used to accelerate tessellation.
78 */
79 bool raw;
80
81 /* Set within hk_draw() but here so geometry/tessellation can override */
82 bool restart;
83 enum agx_index_size index_size;
84 };
85
86 static struct hk_draw
87 hk_draw_indirect(uint64_t ptr)
88 {
89 return (struct hk_draw){.b = hk_grid_indirect(ptr)};
90 }
91
92 static struct hk_draw
93 hk_draw_indexed_indirect(uint64_t ptr, struct hk_addr_range index,
94 enum agx_index_size index_size, bool restart)
95 {
96 return (struct hk_draw){
97 .b = hk_grid_indirect(ptr),
98 .index = index,
99 .indexed = true,
100 .index_size = index_size,
101 .restart = restart,
102 };
103 }
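/* Illustrative use of the constructor above (not a specific call site; `ptr`
 * and `index_range` are placeholders): wrap a GPU-resident indexed indirect
 * descriptor (VkDrawIndexedIndirectCommand layout) with a u32 index buffer
 * and primitive restart enabled:
 *
 *    struct hk_draw d = hk_draw_indexed_indirect(ptr, index_range,
 *                                                AGX_INDEX_SIZE_U32, true);
 */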
104
105 /* XXX: deduplicate */
106 static inline enum mesa_prim
107 vk_conv_topology(VkPrimitiveTopology topology)
108 {
109 switch (topology) {
110 case VK_PRIMITIVE_TOPOLOGY_POINT_LIST:
111 return MESA_PRIM_POINTS;
112 case VK_PRIMITIVE_TOPOLOGY_LINE_LIST:
113 return MESA_PRIM_LINES;
114 case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP:
115 return MESA_PRIM_LINE_STRIP;
116 case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST:
117 #pragma GCC diagnostic push
118 #pragma GCC diagnostic ignored "-Wswitch"
119 case VK_PRIMITIVE_TOPOLOGY_META_RECT_LIST_MESA:
120 #pragma GCC diagnostic pop
121 return MESA_PRIM_TRIANGLES;
122 case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP:
123 return MESA_PRIM_TRIANGLE_STRIP;
124 case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN:
125 return MESA_PRIM_TRIANGLE_FAN;
126 case VK_PRIMITIVE_TOPOLOGY_LINE_LIST_WITH_ADJACENCY:
127 return MESA_PRIM_LINES_ADJACENCY;
128 case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP_WITH_ADJACENCY:
129 return MESA_PRIM_LINE_STRIP_ADJACENCY;
130 case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST_WITH_ADJACENCY:
131 return MESA_PRIM_TRIANGLES_ADJACENCY;
132 case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP_WITH_ADJACENCY:
133 return MESA_PRIM_TRIANGLE_STRIP_ADJACENCY;
134 case VK_PRIMITIVE_TOPOLOGY_PATCH_LIST:
135 return MESA_PRIM_PATCHES;
136 default:
137 unreachable("invalid");
138 }
139 }
140
141 static void
142 hk_cmd_buffer_dirty_render_pass(struct hk_cmd_buffer *cmd)
143 {
144 struct vk_dynamic_graphics_state *dyn = &cmd->vk.dynamic_graphics_state;
145
146 /* These depend on color attachment count */
147 BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_CB_COLOR_WRITE_ENABLES);
148 BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_CB_BLEND_ENABLES);
149 BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_CB_BLEND_EQUATIONS);
150 BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_CB_WRITE_MASKS);
151
152 /* These depend on the depth/stencil format */
153 BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_TEST_ENABLE);
154 BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_WRITE_ENABLE);
155 BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_BOUNDS_TEST_ENABLE);
156 BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_TEST_ENABLE);
157 BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_RS_DEPTH_BIAS_FACTORS);
158
159 /* This may depend on render targets for ESO */
160 BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_MS_RASTERIZATION_SAMPLES);
161 }
162
163 void
164 hk_cmd_buffer_begin_graphics(struct hk_cmd_buffer *cmd,
165 const VkCommandBufferBeginInfo *pBeginInfo)
166 {
167 if (cmd->vk.level != VK_COMMAND_BUFFER_LEVEL_PRIMARY &&
168 (pBeginInfo->flags & VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT)) {
169 char gcbiar_data[VK_GCBIARR_DATA_SIZE(HK_MAX_RTS)];
170 const VkRenderingInfo *resume_info =
171 vk_get_command_buffer_inheritance_as_rendering_resume(
172 cmd->vk.level, pBeginInfo, gcbiar_data);
173 if (resume_info) {
174 hk_CmdBeginRendering(hk_cmd_buffer_to_handle(cmd), resume_info);
175 } else {
176 const VkCommandBufferInheritanceRenderingInfo *inheritance_info =
177 vk_get_command_buffer_inheritance_rendering_info(cmd->vk.level,
178 pBeginInfo);
179 assert(inheritance_info);
180
181 struct hk_rendering_state *render = &cmd->state.gfx.render;
182 render->flags = inheritance_info->flags;
183 render->area = (VkRect2D){};
184 render->layer_count = 0;
185 render->view_mask = inheritance_info->viewMask;
186 render->tilebuffer.nr_samples = inheritance_info->rasterizationSamples;
187
188 render->color_att_count = inheritance_info->colorAttachmentCount;
189 for (uint32_t i = 0; i < render->color_att_count; i++) {
190 render->color_att[i].vk_format =
191 inheritance_info->pColorAttachmentFormats[i];
192 }
193 render->depth_att.vk_format = inheritance_info->depthAttachmentFormat;
194 render->stencil_att.vk_format =
195 inheritance_info->stencilAttachmentFormat;
196
197 hk_cmd_buffer_dirty_render_pass(cmd);
198 }
199 }
200
201 hk_cmd_buffer_dirty_all(cmd);
202
203 /* If multiview is disabled, always read 0. If multiview is enabled,
204 * hk_set_view_index will dirty the root each draw.
205 */
206 cmd->state.gfx.descriptors.root.draw.view_index = 0;
207 cmd->state.gfx.descriptors.root_dirty = true;
208 }
209
210 void
211 hk_cmd_invalidate_graphics_state(struct hk_cmd_buffer *cmd)
212 {
213 hk_cmd_buffer_dirty_all(cmd);
214
215 /* From the Vulkan 1.3.275 spec:
216 *
217 * "...There is one exception to this rule - if the primary command
218 * buffer is inside a render pass instance, then the render pass and
219 * subpass state is not disturbed by executing secondary command
220 * buffers."
221 *
222 * We need to reset everything EXCEPT the render pass state.
223 */
224 struct hk_rendering_state render_save = cmd->state.gfx.render;
225 memset(&cmd->state.gfx, 0, sizeof(cmd->state.gfx));
226 cmd->state.gfx.render = render_save;
227 }
228
229 static void
230 hk_attachment_init(struct hk_attachment *att,
231 const VkRenderingAttachmentInfo *info)
232 {
233 if (info == NULL || info->imageView == VK_NULL_HANDLE) {
234 *att = (struct hk_attachment){
235 .iview = NULL,
236 };
237 return;
238 }
239
240 VK_FROM_HANDLE(hk_image_view, iview, info->imageView);
241 *att = (struct hk_attachment){
242 .vk_format = iview->vk.format,
243 .iview = iview,
244 };
245
246 if (info->resolveMode != VK_RESOLVE_MODE_NONE) {
247 VK_FROM_HANDLE(hk_image_view, res_iview, info->resolveImageView);
248 att->resolve_mode = info->resolveMode;
249 att->resolve_iview = res_iview;
250 }
251 }
252
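/* We report the finest possible 1x1 granularity; render areas that do not
 * cover the framebuffer are handled explicitly in hk_CmdBeginRendering
 * (see incomplete_render_area below).
 */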
253 VKAPI_ATTR void VKAPI_CALL
254 hk_GetRenderingAreaGranularityKHR(
255 VkDevice device, const VkRenderingAreaInfoKHR *pRenderingAreaInfo,
256 VkExtent2D *pGranularity)
257 {
258 *pGranularity = (VkExtent2D){.width = 1, .height = 1};
259 }
260
261 static struct hk_bg_eot
262 hk_build_bg_eot(struct hk_cmd_buffer *cmd, const VkRenderingInfo *info,
263 bool store, bool partial_render, bool incomplete_render_area)
264 {
265 struct hk_device *dev = hk_cmd_buffer_device(cmd);
266 struct hk_rendering_state *render = &cmd->state.gfx.render;
267
268 /* Construct the key */
269 struct agx_bg_eot_key key = {.tib = render->tilebuffer};
270 static_assert(AGX_BG_EOT_NONE == 0, "default initializer");
271
272 key.tib.layered = (render->cr.layers > 1);
273
274 bool needs_textures_for_spilled_rts =
275 agx_tilebuffer_spills(&render->tilebuffer) && !partial_render && !store;
276
277 for (unsigned i = 0; i < info->colorAttachmentCount; ++i) {
278 const VkRenderingAttachmentInfo *att_info = &info->pColorAttachments[i];
279 if (att_info->imageView == VK_NULL_HANDLE)
280 continue;
281
282 /* Partial render programs exist only to store/load the tilebuffer to
283 * main memory. When render targets are already spilled to main memory,
284 * there's nothing to do.
285 */
286 if (key.tib.spilled[i] && (partial_render || store))
287 continue;
288
289 if (store) {
290 bool store = att_info->storeOp == VK_ATTACHMENT_STORE_OP_STORE;
291
292 /* When resolving, we store the intermediate multisampled image as the
293 * resolve is a separate control stream. This could be optimized.
294 */
295 store |= att_info->resolveMode != VK_RESOLVE_MODE_NONE;
296
297 /* Partial renders always need to flush to memory. */
298 store |= partial_render;
299
300 if (store)
301 key.op[i] = AGX_EOT_STORE;
302 } else {
303 bool load = att_info->loadOp == VK_ATTACHMENT_LOAD_OP_LOAD;
304 bool clear = att_info->loadOp == VK_ATTACHMENT_LOAD_OP_CLEAR;
305
306 /* The background program used for partial renders must always load
307 * whatever was stored in the mid-frame end-of-tile program.
308 */
309 load |= partial_render;
310
311 /* With an incomplete render area, we're forced to load back tiles and
312 * then use the 3D pipe for the clear.
313 */
314 load |= incomplete_render_area;
315
316 /* Don't read back spilled render targets, they're already in memory */
317 load &= !key.tib.spilled[i];
318
319 /* Don't apply clears for spilled render targets when we clear the
320 * render area explicitly after.
321 */
322 if (key.tib.spilled[i] && incomplete_render_area)
323 continue;
324
325 if (load)
326 key.op[i] = AGX_BG_LOAD;
327 else if (clear)
328 key.op[i] = AGX_BG_CLEAR;
329 }
330 }
331
332 /* Begin building the pipeline */
333 size_t usc_size = agx_usc_size(3 + HK_MAX_RTS);
334 struct agx_ptr t = hk_pool_usc_alloc(cmd, usc_size, 64);
335 if (!t.cpu)
336 return (struct hk_bg_eot){.usc = t.gpu};
337
338 struct agx_usc_builder b = agx_usc_builder(t.cpu, usc_size);
339
340 bool uses_txf = false;
341 unsigned uniforms = 0;
342 unsigned nr_tex = 0;
343
344 for (unsigned rt = 0; rt < HK_MAX_RTS; ++rt) {
345 const VkRenderingAttachmentInfo *att_info = &info->pColorAttachments[rt];
346 struct hk_image_view *iview = render->color_att[rt].iview;
347
348 if (key.op[rt] == AGX_BG_LOAD) {
349 uses_txf = true;
350
351 uint32_t index = key.tib.layered
352 ? iview->planes[0].layered_background_desc_index
353 : iview->planes[0].background_desc_index;
354
355 agx_usc_pack(&b, TEXTURE, cfg) {
356 /* Shifted to match eMRT indexing, could be optimized */
357 cfg.start = rt * 2;
358 cfg.count = 1;
359 cfg.buffer = dev->images.bo->va->addr + index * AGX_TEXTURE_LENGTH;
360 }
361
362 nr_tex = (rt * 2) + 1;
363 } else if (key.op[rt] == AGX_BG_CLEAR) {
364 static_assert(sizeof(att_info->clearValue.color) == 16, "fixed ABI");
365 uint64_t colour =
366 hk_pool_upload(cmd, &att_info->clearValue.color, 16, 16);
367
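/* Each render target's 16-byte clear colour is passed through eight 16-bit
 * uniform registers starting at register 4 + 8*rt; the reserved preamble
 * size is grown below to cover them.
 */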
368 agx_usc_uniform(&b, 4 + (8 * rt), 8, colour);
369 uniforms = MAX2(uniforms, 4 + (8 * rt) + 8);
370 } else if (key.op[rt] == AGX_EOT_STORE) {
371 uint32_t index = key.tib.layered
372 ? iview->planes[0].layered_eot_pbe_desc_index
373 : iview->planes[0].eot_pbe_desc_index;
374
375 agx_usc_pack(&b, TEXTURE, cfg) {
376 cfg.start = rt;
377 cfg.count = 1;
378 cfg.buffer = dev->images.bo->va->addr + index * AGX_TEXTURE_LENGTH;
379 }
380
381 nr_tex = rt + 1;
382 }
383 }
384
385 if (needs_textures_for_spilled_rts) {
386 hk_usc_upload_spilled_rt_descs(&b, cmd);
387 uniforms = MAX2(uniforms, 4);
388 }
389
390 if (uses_txf) {
391 agx_usc_push_packed(&b, SAMPLER, dev->rodata.txf_sampler);
392 }
393
394 /* For attachmentless rendering, we don't know the sample count until
395 * draw-time. But we have trivial bg/eot programs in that case too.
396 */
397 if (key.tib.nr_samples >= 1) {
398 agx_usc_push_packed(&b, SHARED, &key.tib.usc);
399 } else {
400 assert(key.tib.sample_size_B == 0);
401 agx_usc_shared_none(&b);
402
403 key.tib.nr_samples = 1;
404 }
405
406 /* Get the shader */
407 key.reserved_preamble = uniforms;
408 /* XXX: locking? */
409 struct agx_bg_eot_shader *shader = agx_get_bg_eot_shader(&dev->bg_eot, &key);
410
411 agx_usc_pack(&b, SHADER, cfg) {
412 cfg.code = agx_usc_addr(&dev->dev, shader->ptr);
413 cfg.unk_2 = 0;
414 }
415
416 agx_usc_pack(&b, REGISTERS, cfg)
417 cfg.register_count = shader->info.nr_gprs;
418
419 if (shader->info.has_preamble) {
420 agx_usc_pack(&b, PRESHADER, cfg) {
421 cfg.code =
422 agx_usc_addr(&dev->dev, shader->ptr + shader->info.preamble_offset);
423 }
424 } else {
425 agx_usc_pack(&b, NO_PRESHADER, cfg)
426 ;
427 }
428
429 struct hk_bg_eot ret = {.usc = t.gpu};
430
431 agx_pack(&ret.counts, COUNTS, cfg) {
432 cfg.uniform_register_count = shader->info.push_count;
433 cfg.preshader_register_count = shader->info.nr_preamble_gprs;
434 cfg.texture_state_register_count = nr_tex;
435 cfg.sampler_state_register_count =
436 agx_translate_sampler_state_count(uses_txf ? 1 : 0, false);
437 }
438
439 return ret;
440 }
441
442 static bool
443 is_aligned(unsigned x, unsigned pot_alignment)
444 {
445 assert(util_is_power_of_two_nonzero(pot_alignment));
446 return (x & (pot_alignment - 1)) == 0;
447 }
448
449 static void
450 hk_merge_render_iview(struct hk_rendering_state *render,
451 struct hk_image_view *iview)
452 {
453 if (iview) {
454 unsigned samples = iview->vk.image->samples;
455 /* TODO: is this right for ycbcr? */
456 unsigned level = iview->vk.base_mip_level;
457 unsigned width = u_minify(iview->vk.image->extent.width, level);
458 unsigned height = u_minify(iview->vk.image->extent.height, level);
459
460 assert(render->tilebuffer.nr_samples == 0 ||
461 render->tilebuffer.nr_samples == samples);
462 render->tilebuffer.nr_samples = samples;
463
464 /* TODO: Is this merging logic sound? Not sure how this is supposed to
465 * work conceptually.
466 */
467 render->cr.width = MAX2(render->cr.width, width);
468 render->cr.height = MAX2(render->cr.height, height);
469 }
470 }
471
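/* ZLS control decides whether depth/stencil are loaded from and stored back
 * to memory around the render pass, and whether their compressed layouts are
 * used.
 */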
472 static void
473 hk_pack_zls_control(struct agx_zls_control_packed *packed,
474 struct ail_layout *z_layout, struct ail_layout *s_layout,
475 const VkRenderingAttachmentInfo *attach_z,
476 const VkRenderingAttachmentInfo *attach_s,
477 bool incomplete_render_area, bool partial_render)
478 {
479 agx_pack(packed, ZLS_CONTROL, zls_control) {
480 if (z_layout) {
481 zls_control.z_store_enable =
482 attach_z->storeOp == VK_ATTACHMENT_STORE_OP_STORE ||
483 attach_z->resolveMode != VK_RESOLVE_MODE_NONE || partial_render;
484
485 zls_control.z_load_enable =
486 attach_z->loadOp == VK_ATTACHMENT_LOAD_OP_LOAD || partial_render ||
487 incomplete_render_area;
488
489 if (ail_is_compressed(z_layout)) {
490 zls_control.z_compress_1 = true;
491 zls_control.z_compress_2 = true;
492 }
493
494 if (z_layout->format == PIPE_FORMAT_Z16_UNORM) {
495 zls_control.z_format = AGX_ZLS_FORMAT_16;
496 } else {
497 zls_control.z_format = AGX_ZLS_FORMAT_32F;
498 }
499 }
500
501 if (s_layout) {
502 /* TODO:
503 * We fail
504 * dEQP-VK.renderpass.dedicated_allocation.formats.d32_sfloat_s8_uint.input.dont_care.store.self_dep_clear_draw_use_input_aspect
505 * without the force below
506 * -- maybe a VkRenderPass emulation bug.
507 */
508 zls_control.s_store_enable =
509 attach_s->storeOp == VK_ATTACHMENT_STORE_OP_STORE ||
510 attach_s->resolveMode != VK_RESOLVE_MODE_NONE || partial_render ||
511 true;
512
513 zls_control.s_load_enable =
514 attach_s->loadOp == VK_ATTACHMENT_LOAD_OP_LOAD || partial_render ||
515 incomplete_render_area;
516
517 if (ail_is_compressed(s_layout)) {
518 zls_control.s_compress_1 = true;
519 zls_control.s_compress_2 = true;
520 }
521 }
522 }
523 }
524
525 VKAPI_ATTR void VKAPI_CALL
526 hk_CmdBeginRendering(VkCommandBuffer commandBuffer,
527 const VkRenderingInfo *pRenderingInfo)
528 {
529 VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer);
530 struct hk_rendering_state *render = &cmd->state.gfx.render;
531
532 memset(render, 0, sizeof(*render));
533
534 render->flags = pRenderingInfo->flags;
535 render->area = pRenderingInfo->renderArea;
536 render->view_mask = pRenderingInfo->viewMask;
537 render->layer_count = pRenderingInfo->layerCount;
538 render->tilebuffer.nr_samples = 0;
539
540 const uint32_t layer_count = render->view_mask
541 ? util_last_bit(render->view_mask)
542 : render->layer_count;
543
544 render->color_att_count = pRenderingInfo->colorAttachmentCount;
545 for (uint32_t i = 0; i < render->color_att_count; i++) {
546 hk_attachment_init(&render->color_att[i],
547 &pRenderingInfo->pColorAttachments[i]);
548 }
549
550 hk_attachment_init(&render->depth_att, pRenderingInfo->pDepthAttachment);
551 hk_attachment_init(&render->stencil_att, pRenderingInfo->pStencilAttachment);
552
553 for (uint32_t i = 0; i < render->color_att_count; i++) {
554 hk_merge_render_iview(render, render->color_att[i].iview);
555 }
556
557 hk_merge_render_iview(render,
558 render->depth_att.iview ?: render->stencil_att.iview);
559
560 /* Infer extents for attachmentless rendering; samples are inferred at draw-time. */
561 render->cr.width =
562 MAX2(render->cr.width, render->area.offset.x + render->area.extent.width);
563
564 render->cr.height = MAX2(render->cr.height,
565 render->area.offset.y + render->area.extent.height);
566
567 render->cr.layers = layer_count;
568
569 /* Choose a tilebuffer layout given the framebuffer key */
570 enum pipe_format formats[HK_MAX_RTS] = {0};
571 for (unsigned i = 0; i < render->color_att_count; ++i) {
572 formats[i] = vk_format_to_pipe_format(render->color_att[i].vk_format);
573 }
574
575 /* For now, we force layered=true since it makes compatibility problems way
576 * easier.
577 */
578 render->tilebuffer = agx_build_tilebuffer_layout(
579 formats, render->color_att_count, render->tilebuffer.nr_samples, true);
580
581 hk_cmd_buffer_dirty_render_pass(cmd);
582
583 /* Determine whether the render area is complete, enabling us to use a
584 * fast-clear.
585 *
586 * TODO: If it is incomplete but tile aligned, it should be possible to fast
587 * clear with the appropriate settings. This is critical for performance.
588 */
589 bool incomplete_render_area =
590 render->area.offset.x > 0 || render->area.offset.y > 0 ||
591 render->area.extent.width < render->cr.width ||
592 render->area.extent.height < render->cr.height ||
593 (render->view_mask &&
594 render->view_mask != BITFIELD64_MASK(render->cr.layers));
595
596 render->cr.bg.main = hk_build_bg_eot(cmd, pRenderingInfo, false, false,
597 incomplete_render_area);
598 render->cr.bg.partial =
599 hk_build_bg_eot(cmd, pRenderingInfo, false, true, incomplete_render_area);
600
601 render->cr.eot.main =
602 hk_build_bg_eot(cmd, pRenderingInfo, true, false, incomplete_render_area);
603 render->cr.eot.partial = render->cr.eot.main;
604
605 render->cr.isp_bgobjvals = 0x300;
606
607 const VkRenderingAttachmentInfo *attach_z = pRenderingInfo->pDepthAttachment;
608 const VkRenderingAttachmentInfo *attach_s =
609 pRenderingInfo->pStencilAttachment;
610
611 render->cr.iogpu_unk_214 = 0xc000;
612
613 struct ail_layout *z_layout = NULL, *s_layout = NULL;
614
615 if (attach_z != NULL && attach_z != VK_NULL_HANDLE && attach_z->imageView) {
616 struct hk_image_view *view = render->depth_att.iview;
617 struct hk_image *image =
618 container_of(view->vk.image, struct hk_image, vk);
619
620 z_layout = &image->planes[0].layout;
621
622 unsigned level = view->vk.base_mip_level;
623 unsigned first_layer = view->vk.base_array_layer;
624
625 const struct util_format_description *desc =
626 util_format_description(vk_format_to_pipe_format(view->vk.format));
627
628 assert(desc->format == PIPE_FORMAT_Z32_FLOAT ||
629 desc->format == PIPE_FORMAT_Z16_UNORM ||
630 desc->format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT);
631
632 render->cr.depth.buffer =
633 hk_image_base_address(image, 0) +
634 ail_get_layer_level_B(z_layout, first_layer, level);
635
636 /* Main stride in pages */
637 assert((z_layout->depth_px == 1 ||
638 is_aligned(z_layout->layer_stride_B, AIL_PAGESIZE)) &&
639 "Page aligned Z layers");
640
641 unsigned stride_pages = z_layout->layer_stride_B / AIL_PAGESIZE;
642 render->cr.depth.stride = ((stride_pages - 1) << 14) | 1;
643
644 assert(z_layout->tiling != AIL_TILING_LINEAR && "must tile");
645
646 if (ail_is_compressed(z_layout)) {
647 render->cr.depth.meta =
648 hk_image_base_address(image, 0) + z_layout->metadata_offset_B +
649 (first_layer * z_layout->compression_layer_stride_B) +
650 z_layout->level_offsets_compressed_B[level];
651
652 /* Meta stride in cache lines */
653 assert(
654 is_aligned(z_layout->compression_layer_stride_B, AIL_CACHELINE) &&
655 "Cacheline aligned Z meta layers");
656
657 unsigned stride_lines =
658 z_layout->compression_layer_stride_B / AIL_CACHELINE;
659 render->cr.depth.meta_stride = (stride_lines - 1) << 14;
660 }
661
662 float clear_depth = attach_z->clearValue.depthStencil.depth;
663
664 if (z_layout->format == PIPE_FORMAT_Z16_UNORM) {
665 render->cr.isp_bgobjdepth = _mesa_float_to_unorm(clear_depth, 16);
666 render->cr.iogpu_unk_214 |= 0x40000;
667 } else {
668 render->cr.isp_bgobjdepth = fui(clear_depth);
669 }
670 }
671
672 if (attach_s != NULL && attach_s != VK_NULL_HANDLE && attach_s->imageView) {
673 struct hk_image_view *view = render->stencil_att.iview;
674 struct hk_image *image =
675 container_of(view->vk.image, struct hk_image, vk);
676
677 /* Stencil is always the last plane (possibly the only plane) */
678 unsigned plane = image->plane_count - 1;
679 s_layout = &image->planes[plane].layout;
680 assert(s_layout->format == PIPE_FORMAT_S8_UINT);
681
682 unsigned level = view->vk.base_mip_level;
683 unsigned first_layer = view->vk.base_array_layer;
684
685 render->cr.stencil.buffer =
686 hk_image_base_address(image, plane) +
687 ail_get_layer_level_B(s_layout, first_layer, level);
688
689 /* Main stride in pages */
690 assert((s_layout->depth_px == 1 ||
691 is_aligned(s_layout->layer_stride_B, AIL_PAGESIZE)) &&
692 "Page aligned S layers");
693 unsigned stride_pages = s_layout->layer_stride_B / AIL_PAGESIZE;
694 render->cr.stencil.stride = ((stride_pages - 1) << 14) | 1;
695
696 if (ail_is_compressed(s_layout)) {
697 render->cr.stencil.meta =
698 hk_image_base_address(image, plane) + s_layout->metadata_offset_B +
699 (first_layer * s_layout->compression_layer_stride_B) +
700 s_layout->level_offsets_compressed_B[level];
701
702 /* Meta stride in cache lines */
703 assert(
704 is_aligned(s_layout->compression_layer_stride_B, AIL_CACHELINE) &&
705 "Cacheline aligned S meta layers");
706
707 unsigned stride_lines =
708 s_layout->compression_layer_stride_B / AIL_CACHELINE;
709
710 render->cr.stencil.meta_stride = (stride_lines - 1) << 14;
711 }
712
713 render->cr.isp_bgobjvals |= attach_s->clearValue.depthStencil.stencil;
714 }
715
716 hk_pack_zls_control(&render->cr.zls_control, z_layout, s_layout, attach_z,
717 attach_s, incomplete_render_area, false);
718
719 hk_pack_zls_control(&render->cr.zls_control_partial, z_layout, s_layout,
720 attach_z, attach_s, incomplete_render_area, true);
721
722 /* If multiview is disabled, always read 0. If multiview is enabled,
723 * hk_set_view_index will dirty the root each draw.
724 */
725 cmd->state.gfx.descriptors.root.draw.view_index = 0;
726 cmd->state.gfx.descriptors.root_dirty = true;
727
728 if (render->flags & VK_RENDERING_RESUMING_BIT)
729 return;
730
731 /* The first control stream of the render pass is special since it gets
732 * the clears. Create it and swap in the clear.
733 */
734 assert(!cmd->current_cs.gfx && "not already in a render pass");
735 struct hk_cs *cs = hk_cmd_buffer_get_cs(cmd, false /* compute */);
736 if (!cs)
737 return;
738
739 cs->cr.bg.main = render->cr.bg.main;
740 cs->cr.zls_control = render->cr.zls_control;
741
742 /* Reordering barrier for post-gfx, in case we had any. */
743 hk_cmd_buffer_end_compute_internal(&cmd->current_cs.post_gfx);
744
745 /* Don't reorder compute across render passes.
746 *
747 * TODO: Check if this is necessary if the proper PipelineBarriers are
748 * handled... there may be CTS bugs...
749 */
750 hk_cmd_buffer_end_compute(cmd);
751
752 /* If we spill colour attachments, we need to decompress them. This happens
753 * at the start of the render; it is not re-emitted when resuming
754 * secondaries. It could be hoisted to the start of the command buffer but
755 * we're not that clever yet.
756 */
757 if (agx_tilebuffer_spills(&render->tilebuffer)) {
758 struct hk_device *dev = hk_cmd_buffer_device(cmd);
759
760 perf_debug(dev, "eMRT render pass");
761
762 for (unsigned i = 0; i < render->color_att_count; ++i) {
763 struct hk_image_view *view = render->color_att[i].iview;
764 if (view) {
765 struct hk_image *image =
766 container_of(view->vk.image, struct hk_image, vk);
767
768 /* TODO: YCbCr interaction? */
769 uint8_t plane = 0;
770 uint8_t image_plane = view->planes[plane].image_plane;
771 struct ail_layout *layout = &image->planes[image_plane].layout;
772
773 if (ail_is_level_compressed(layout, view->vk.base_mip_level)) {
774 struct hk_device *dev = hk_cmd_buffer_device(cmd);
775 perf_debug(dev, "Decompressing in-place");
776
777 struct hk_cs *cs = hk_cmd_buffer_get_cs_general(
778 cmd, &cmd->current_cs.pre_gfx, true);
779 if (!cs)
780 return;
781
782 unsigned level = view->vk.base_mip_level;
783
784 struct agx_ptr data =
785 hk_pool_alloc(cmd, sizeof(struct libagx_decompress_push), 64);
786 struct libagx_decompress_push *push = data.cpu;
787 agx_fill_decompress_push(
788 push, layout, view->vk.base_array_layer, level,
789 hk_image_base_address(image, image_plane));
790
791 push->compressed = view->planes[plane].emrt_texture;
792 push->uncompressed = view->planes[plane].emrt_pbe;
793
794 struct hk_grid grid =
795 hk_grid(ail_metadata_width_tl(layout, level) * 32,
796 ail_metadata_height_tl(layout, level), layer_count);
797
798 struct agx_decompress_key key = {
799 .nr_samples = layout->sample_count_sa,
800 };
801
802 struct hk_shader *s =
803 hk_meta_kernel(dev, agx_nir_decompress, &key, sizeof(key));
804
805 uint32_t usc = hk_upload_usc_words_kernel(cmd, s, &data.gpu, 8);
806 hk_dispatch_with_usc(dev, cs, s, usc, grid, hk_grid(32, 1, 1));
807 }
808 }
809 }
810 }
811
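/* With an incomplete render area we could not fold the clears into the
 * background program, so replay them here as ordinary attachment clears
 * through the 3D pipe, limited to the render area.
 */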
812 if (incomplete_render_area) {
813 uint32_t clear_count = 0;
814 VkClearAttachment clear_att[HK_MAX_RTS + 1];
815 for (uint32_t i = 0; i < pRenderingInfo->colorAttachmentCount; i++) {
816 const VkRenderingAttachmentInfo *att_info =
817 &pRenderingInfo->pColorAttachments[i];
818 if (att_info->imageView == VK_NULL_HANDLE ||
819 att_info->loadOp != VK_ATTACHMENT_LOAD_OP_CLEAR)
820 continue;
821
822 clear_att[clear_count++] = (VkClearAttachment){
823 .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT,
824 .colorAttachment = i,
825 .clearValue = att_info->clearValue,
826 };
827 }
828
829 clear_att[clear_count] = (VkClearAttachment){
830 .aspectMask = 0,
831 };
832
833 if (attach_z && attach_z->imageView != VK_NULL_HANDLE &&
834 attach_z->loadOp == VK_ATTACHMENT_LOAD_OP_CLEAR) {
835 clear_att[clear_count].aspectMask |= VK_IMAGE_ASPECT_DEPTH_BIT;
836 clear_att[clear_count].clearValue.depthStencil.depth =
837 attach_z->clearValue.depthStencil.depth;
838 }
839
840 if (attach_s != NULL && attach_s->imageView != VK_NULL_HANDLE &&
841 attach_s->loadOp == VK_ATTACHMENT_LOAD_OP_CLEAR) {
842 clear_att[clear_count].aspectMask |= VK_IMAGE_ASPECT_STENCIL_BIT;
843 clear_att[clear_count].clearValue.depthStencil.stencil =
844 attach_s->clearValue.depthStencil.stencil;
845 }
846
847 if (clear_att[clear_count].aspectMask != 0)
848 clear_count++;
849
850 if (clear_count > 0) {
851 const VkClearRect clear_rect = {
852 .rect = render->area,
853 .baseArrayLayer = 0,
854 .layerCount = render->view_mask ? 1 : render->layer_count,
855 };
856
857 hk_CmdClearAttachments(hk_cmd_buffer_to_handle(cmd), clear_count,
858 clear_att, 1, &clear_rect);
859 }
860 }
861 }
862
863 VKAPI_ATTR void VKAPI_CALL
864 hk_CmdEndRendering(VkCommandBuffer commandBuffer)
865 {
866 VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer);
867 struct hk_rendering_state *render = &cmd->state.gfx.render;
868
869 hk_cmd_buffer_end_graphics(cmd);
870
871 bool need_resolve = false;
872
873 /* Translate render state back to VK for meta */
874 VkRenderingAttachmentInfo vk_color_att[HK_MAX_RTS];
875 for (uint32_t i = 0; i < render->color_att_count; i++) {
876 if (render->color_att[i].resolve_mode != VK_RESOLVE_MODE_NONE)
877 need_resolve = true;
878
879 vk_color_att[i] = (VkRenderingAttachmentInfo){
880 .sType = VK_STRUCTURE_TYPE_RENDERING_ATTACHMENT_INFO,
881 .imageView = hk_image_view_to_handle(render->color_att[i].iview),
882 .imageLayout = VK_IMAGE_LAYOUT_GENERAL,
883 .resolveMode = render->color_att[i].resolve_mode,
884 .resolveImageView =
885 hk_image_view_to_handle(render->color_att[i].resolve_iview),
886 .resolveImageLayout = VK_IMAGE_LAYOUT_GENERAL,
887 };
888 }
889
890 const VkRenderingAttachmentInfo vk_depth_att = {
891 .sType = VK_STRUCTURE_TYPE_RENDERING_ATTACHMENT_INFO,
892 .imageView = hk_image_view_to_handle(render->depth_att.iview),
893 .imageLayout = VK_IMAGE_LAYOUT_GENERAL,
894 .resolveMode = render->depth_att.resolve_mode,
895 .resolveImageView =
896 hk_image_view_to_handle(render->depth_att.resolve_iview),
897 .resolveImageLayout = VK_IMAGE_LAYOUT_GENERAL,
898 };
899 if (render->depth_att.resolve_mode != VK_RESOLVE_MODE_NONE)
900 need_resolve = true;
901
902 const VkRenderingAttachmentInfo vk_stencil_att = {
903 .sType = VK_STRUCTURE_TYPE_RENDERING_ATTACHMENT_INFO,
904 .imageView = hk_image_view_to_handle(render->stencil_att.iview),
905 .imageLayout = VK_IMAGE_LAYOUT_GENERAL,
906 .resolveMode = render->stencil_att.resolve_mode,
907 .resolveImageView =
908 hk_image_view_to_handle(render->stencil_att.resolve_iview),
909 .resolveImageLayout = VK_IMAGE_LAYOUT_GENERAL,
910 };
911 if (render->stencil_att.resolve_mode != VK_RESOLVE_MODE_NONE)
912 need_resolve = true;
913
914 const VkRenderingInfo vk_render = {
915 .sType = VK_STRUCTURE_TYPE_RENDERING_INFO,
916 .renderArea = render->area,
917 .layerCount = render->layer_count,
918 .viewMask = render->view_mask,
919 .colorAttachmentCount = render->color_att_count,
920 .pColorAttachments = vk_color_att,
921 .pDepthAttachment = &vk_depth_att,
922 .pStencilAttachment = &vk_stencil_att,
923 };
924
925 if (render->flags & VK_RENDERING_SUSPENDING_BIT)
926 need_resolve = false;
927
928 memset(render, 0, sizeof(*render));
929
930 if (need_resolve) {
931 hk_meta_resolve_rendering(cmd, &vk_render);
932 }
933 }
934
935 static uint64_t
936 hk_geometry_state(struct hk_cmd_buffer *cmd)
937 {
938 struct hk_device *dev = hk_cmd_buffer_device(cmd);
939
940 /* We tie heap allocation to geometry state allocation, so allocate now. */
941 if (unlikely(!dev->heap)) {
942 size_t size = 128 * 1024 * 1024;
943 dev->heap = agx_bo_create(&dev->dev, size, 0, 0, "Geometry heap");
944
945 /* The geometry state buffer is initialized here and then is treated by
946 * the CPU as rodata, even though the GPU uses it for scratch internally.
947 */
948 off_t off = dev->rodata.geometry_state - dev->rodata.bo->va->addr;
949 struct agx_geometry_state *map = dev->rodata.bo->map + off;
950
951 *map = (struct agx_geometry_state){
952 .heap = dev->heap->va->addr,
953 .heap_size = size,
954 };
955 }
956
957 /* We need to free all allocations after each command buffer execution */
958 if (!cmd->uses_heap) {
959 uint64_t addr = dev->rodata.geometry_state;
960
961 /* Zeroing the allocated index frees everything */
962 hk_queue_write(cmd,
963 addr + offsetof(struct agx_geometry_state, heap_bottom), 0,
964 true /* after gfx */);
965
966 cmd->uses_heap = true;
967 }
968
969 return dev->rodata.geometry_state;
970 }
971
972 static uint64_t
973 hk_upload_gsi_params(struct hk_cmd_buffer *cmd, struct hk_draw draw)
974 {
975 struct hk_device *dev = hk_cmd_buffer_device(cmd);
976 struct hk_descriptor_state *desc = &cmd->state.gfx.descriptors;
977 struct hk_graphics_state *gfx = &cmd->state.gfx;
978 struct hk_shader *vs = hk_bound_sw_vs_before_gs(gfx);
979
980 unsigned index_size_B =
981 draw.indexed ? agx_index_size_to_B(draw.index_size) : 0;
982
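/* vb points at the slot holding the vertex output buffer pointer: the tess
 * args' tes_buffer for tess->GS, otherwise the root table's
 * draw.vertex_output_buffer. The indirect setup kernel presumably fills in
 * the pointer it allocates there.
 */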
983 uint64_t vb;
984 if (cmd->state.gfx.shaders[MESA_SHADER_TESS_EVAL]) {
985 assert(index_size_B == 4);
986
987 vb = desc->root.draw.tess_params +
988 offsetof(struct libagx_tess_args, tes_buffer);
989 } else {
990 vb = desc->root.root_desc_addr +
991 offsetof(struct hk_root_descriptor_table, draw.vertex_output_buffer);
992 }
993
994 struct agx_gs_setup_indirect_params gsi = {
995 .index_buffer = draw.index.addr,
996 .index_size_B = index_size_B,
997 .index_buffer_range_el = draw.index.range / index_size_B,
998 .zero_sink = dev->rodata.zero_sink,
999 .draw = draw.b.ptr,
1000 .vertex_buffer = vb,
1001 .ia = desc->root.draw.input_assembly,
1002 .geom = desc->root.draw.geometry_params,
1003 .vs_outputs = vs->b.info.outputs,
1004 };
1005
1006 return hk_pool_upload(cmd, &gsi, sizeof(gsi), 8);
1007 }
1008
1009 static uint64_t
1010 hk_upload_ia_params(struct hk_cmd_buffer *cmd, struct hk_draw draw)
1011 {
1012 struct hk_device *dev = hk_cmd_buffer_device(cmd);
1013 assert(!draw.b.indirect && "indirect params written by GPU");
1014
1015 struct agx_ia_state ia = {.verts_per_instance = draw.b.count[0]};
1016
1017 if (draw.indexed) {
1018 unsigned index_size_B = agx_index_size_to_B(draw.index_size);
1019 unsigned range_el = draw.index.range / index_size_B;
1020
1021 ia.index_buffer =
1022 libagx_index_buffer(draw.index.addr, range_el, draw.start,
1023 index_size_B, dev->rodata.zero_sink);
1024
1025 ia.index_buffer_range_el =
1026 libagx_index_buffer_range_el(range_el, draw.start);
1027 }
1028
1029 return hk_pool_upload(cmd, &ia, sizeof(ia), 8);
1030 }
1031
1032 static enum mesa_prim
1033 hk_gs_in_prim(struct hk_cmd_buffer *cmd)
1034 {
1035 struct vk_dynamic_graphics_state *dyn = &cmd->vk.dynamic_graphics_state;
1036 struct hk_graphics_state *gfx = &cmd->state.gfx;
1037 struct hk_api_shader *tes = gfx->shaders[MESA_SHADER_TESS_EVAL];
1038
1039 if (tes != NULL)
1040 return tes->variants[HK_GS_VARIANT_RAST].info.ts.out_prim;
1041 else
1042 return vk_conv_topology(dyn->ia.primitive_topology);
1043 }
1044
1045 static enum mesa_prim
1046 hk_rast_prim(struct hk_cmd_buffer *cmd)
1047 {
1048 struct hk_graphics_state *gfx = &cmd->state.gfx;
1049 struct hk_api_shader *gs = gfx->shaders[MESA_SHADER_GEOMETRY];
1050
1051 if (gs != NULL)
1052 return gs->variants[HK_GS_VARIANT_RAST].info.gs.out_prim;
1053 else
1054 return hk_gs_in_prim(cmd);
1055 }
1056
1057 static uint64_t
1058 hk_upload_geometry_params(struct hk_cmd_buffer *cmd, struct hk_draw draw)
1059 {
1060 struct hk_descriptor_state *desc = &cmd->state.gfx.descriptors;
1061 struct vk_dynamic_graphics_state *dyn = &cmd->vk.dynamic_graphics_state;
1062 struct hk_graphics_state *gfx = &cmd->state.gfx;
1063 struct hk_api_shader *gs = gfx->shaders[MESA_SHADER_GEOMETRY];
1064 struct hk_shader *fs = hk_only_variant(gfx->shaders[MESA_SHADER_FRAGMENT]);
1065
1066 bool rast_disc = dyn->rs.rasterizer_discard_enable;
1067 struct hk_shader *count = hk_count_gs_variant(gs, rast_disc);
1068
1069 /* XXX: We should deduplicate this logic */
1070 bool restart = (draw.indexed && draw.restart);
1071 bool indirect =
1072 draw.b.indirect || gfx->shaders[MESA_SHADER_TESS_EVAL] || restart;
1073 enum mesa_prim mode = hk_gs_in_prim(cmd);
1074
1075 if (restart) {
1076 mode = u_decomposed_prim(mode);
1077 }
1078
1079 struct agx_geometry_params params = {
1080 .state = hk_geometry_state(cmd),
1081 .indirect_desc = cmd->geom_indirect,
1082 .flat_outputs = fs ? fs->info.fs.interp.flat : 0,
1083 .input_topology = mode,
1084
1085 /* Overridden by the indirect setup kernel. As tess->GS is always indirect,
1086 * we can assume here that we're VS->GS.
1087 */
1088 .input_buffer = desc->root.draw.vertex_output_buffer,
1089 .input_mask = desc->root.draw.vertex_outputs,
1090 };
1091
1092 if (gfx->xfb_enabled) {
1093 for (unsigned i = 0; i < ARRAY_SIZE(gfx->xfb); ++i) {
1094 params.xfb_base_original[i] = gfx->xfb[i].addr;
1095 params.xfb_size[i] = gfx->xfb[i].range;
1096 params.xfb_offs_ptrs[i] = gfx->xfb_offsets + i * sizeof(uint32_t);
1097 }
1098 }
1099
1100 for (unsigned i = 0; i < ARRAY_SIZE(gfx->xfb_query); ++i) {
1101 uint64_t q = gfx->xfb_query[i];
1102
1103 if (q) {
1104 params.xfb_prims_generated_counter[i] = q;
1105 params.prims_generated_counter[i] = q + sizeof(uint64_t);
1106 }
1107 }
1108
1109 /* Calculate input primitive count for direct draws, and allocate the vertex
1110 * & count buffers. GPU calculates and allocates for indirect draws.
1111 */
1112 unsigned count_buffer_stride = count->info.gs.count_words * 4;
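/* The count shader writes count_words 32-bit words per input primitive, so
 * for direct draws the count buffer is sized as
 * input_primitives * count_buffer_stride below.
 */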
1113
1114 if (indirect) {
1115 params.count_buffer_stride = count_buffer_stride;
1116 params.vs_grid[2] = params.gs_grid[2] = 1;
1117 } else {
1118 uint32_t verts = draw.b.count[0], instances = draw.b.count[1];
1119
1120 params.vs_grid[0] = verts;
1121 params.gs_grid[0] = u_decomposed_prims_for_vertices(mode, verts);
1122
1123 params.primitives_log2 = util_logbase2_ceil(params.gs_grid[0]);
1124 params.input_primitives = params.gs_grid[0] * instances;
1125
1126 unsigned size = params.input_primitives * count_buffer_stride;
1127 if (size) {
1128 params.count_buffer = hk_pool_alloc(cmd, size, 4).gpu;
1129 }
1130 }
1131
1132 desc->root_dirty = true;
1133 return hk_pool_upload(cmd, &params, sizeof(params), 8);
1134 }
1135
1136 /*
1137 * Tessellation has a fast path where the tessellator generates a VDM Index List
1138 * command per patch, as well as a slow path using prefix sums to generate a
1139 * single combined API draw. We need the latter if tessellation is fed into
1140 * another software stage (geometry shading), or if we need accurate primitive
1141 * IDs in the linked fragment shader (since that would require a prefix sum
1142 * anyway).
1143 */
1144 static bool
1145 hk_tess_needs_prefix_sum(struct hk_cmd_buffer *cmd)
1146 {
1147 struct hk_graphics_state *gfx = &cmd->state.gfx;
1148
1149 return gfx->shaders[MESA_SHADER_GEOMETRY] || gfx->generate_primitive_id;
1150 }
1151
1152 static uint64_t
1153 hk_upload_tess_params(struct hk_cmd_buffer *cmd, struct hk_draw draw)
1154 {
1155 struct hk_device *dev = hk_cmd_buffer_device(cmd);
1156 struct vk_dynamic_graphics_state *dyn = &cmd->vk.dynamic_graphics_state;
1157 struct hk_graphics_state *gfx = &cmd->state.gfx;
1158 struct hk_shader *tcs = hk_only_variant(gfx->shaders[MESA_SHADER_TESS_CTRL]);
1159 struct hk_shader *tes = hk_any_variant(gfx->shaders[MESA_SHADER_TESS_EVAL]);
1160
1161 struct libagx_tess_args args = {
1162 .heap = hk_geometry_state(cmd),
1163 .tcs_stride_el = tcs->info.tcs.output_stride / 4,
1164 .statistic = hk_pipeline_stat_addr(
1165 cmd,
1166 VK_QUERY_PIPELINE_STATISTIC_TESSELLATION_EVALUATION_SHADER_INVOCATIONS_BIT),
1167
1168 .input_patch_size = dyn->ts.patch_control_points,
1169 .output_patch_size = tcs->info.tcs.output_patch_size,
1170 .tcs_patch_constants = tcs->info.tcs.nr_patch_outputs,
1171 .tcs_per_vertex_outputs = tcs->info.tcs.per_vertex_outputs,
1172 };
1173
1174 bool with_counts = hk_tess_needs_prefix_sum(cmd);
1175
1176 /* This assumes !with_counts; with counts there is only a single API draw */
1177 uint32_t draw_stride_el = tes->info.ts.point_mode ? 4 : 6;
1178 size_t draw_stride_B = draw_stride_el * sizeof(uint32_t);
1179
1180 /* heap is allocated by hk_geometry_state */
1181 args.patch_coord_buffer = dev->heap->va->addr;
1182
1183 if (!draw.b.indirect) {
1184 unsigned in_patches = draw.b.count[0] / args.input_patch_size;
1185 if (in_patches == 0)
1186 unreachable("todo: drop the draw?");
1187
1188 unsigned unrolled_patches = in_patches * draw.b.count[1];
1189
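/* Direct draws use one linear blob laid out as: TCS outputs, patch
 * coordinate allocations, optional per-patch counts (with_counts), then the
 * generated draw(s); `alloc` accumulates the offsets.
 */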
1190 uint32_t alloc = 0;
1191 uint32_t tcs_out_offs = alloc;
1192 alloc += unrolled_patches * args.tcs_stride_el * 4 * 32;
1193
1194 uint32_t patch_coord_offs = alloc;
1195 alloc += unrolled_patches * 4 * 32;
1196
1197 uint32_t count_offs = alloc;
1198 if (with_counts)
1199 alloc += unrolled_patches * sizeof(uint32_t) * 32;
1200
1201 uint32_t draw_offs = alloc;
1202
1203 if (with_counts) {
1204 /* Single API draw */
1205 alloc += 5 * sizeof(uint32_t);
1206 } else {
1207 /* Padding added because VDM overreads */
1208 alloc += (draw_stride_B * unrolled_patches) +
1209 (AGX_VDM_BARRIER_LENGTH + 0x800);
1210 }
1211
1212 struct agx_ptr blob = hk_pool_alloc(cmd, alloc, 4);
1213 args.tcs_buffer = blob.gpu + tcs_out_offs;
1214 args.patches_per_instance = in_patches;
1215 args.coord_allocs = blob.gpu + patch_coord_offs;
1216 args.nr_patches = unrolled_patches;
1217 args.out_draws = blob.gpu + draw_offs;
1218
1219 gfx->tess_out_draws = args.out_draws;
1220
1221 if (with_counts) {
1222 args.counts = blob.gpu + count_offs;
1223 } else {
1224 /* Arrange so we return after all generated draws */
1225 uint8_t *ret = (uint8_t *)blob.cpu + draw_offs +
1226 (draw_stride_B * unrolled_patches);
1227
1228 agx_pack(ret, VDM_BARRIER, cfg) {
1229 cfg.returns = true;
1230 }
1231 }
1232 } else {
1233 unreachable("todo: indirect with tess");
1234 #if 0
1235 args.tcs_statistic = agx_get_query_address(
1236 batch, ctx->pipeline_statistics[PIPE_STAT_QUERY_HS_INVOCATIONS]);
1237
1238 args.indirect = agx_indirect_buffer_ptr(batch, indirect);
1239
1240 /* Allocate 3x indirect global+local grids for VS/TCS/tess */
1241 uint32_t grid_stride = sizeof(uint32_t) * 6;
1242 args.grids = agx_pool_alloc_aligned(&batch->pool, grid_stride * 3, 4).gpu;
1243
1244 vs_grid = agx_grid_indirect_local(args.grids + 0 * grid_stride);
1245 tcs_grid = agx_grid_indirect_local(args.grids + 1 * grid_stride);
1246 tess_grid = agx_grid_indirect_local(args.grids + 2 * grid_stride);
1247
1248 args.vertex_outputs = ctx->vs->b.info.outputs;
1249 args.vertex_output_buffer_ptr =
1250 agx_pool_alloc_aligned(&batch->pool, 8, 8).gpu;
1251
1252 batch->uniforms.vertex_output_buffer_ptr = args.vertex_output_buffer_ptr;
1253
1254 if (with_counts) {
1255 args.out_draws = agx_pool_alloc_aligned_with_bo(
1256 &batch->pool, draw_stride, 4, &draw_bo)
1257 .gpu;
1258 } else {
1259 unreachable("need an extra indirection...");
1260 }
1261 #endif
1262 }
1263
1264 return hk_pool_upload(cmd, &args, sizeof(args), 8);
1265 }
1266
1267 static struct hk_api_shader *
1268 hk_build_meta_shader_locked(struct hk_device *dev, struct hk_internal_key *key,
1269 hk_internal_builder_t builder)
1270 {
1271 /* Try to get the cached shader */
1272 struct hash_entry *ent = _mesa_hash_table_search(dev->kernels.ht, key);
1273 if (ent)
1274 return ent->data;
1275
1276 nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE,
1277 &agx_nir_options, NULL);
1278 builder(&b, key->key);
1279
1280 const struct vk_pipeline_robustness_state rs = {
1281 .images = VK_PIPELINE_ROBUSTNESS_IMAGE_BEHAVIOR_DISABLED_EXT,
1282 .storage_buffers = VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_DISABLED_EXT,
1283 .uniform_buffers = VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_DISABLED_EXT,
1284 .vertex_inputs = VK_PIPELINE_ROBUSTNESS_BUFFER_BEHAVIOR_DISABLED_EXT,
1285 };
1286
1287 struct vk_shader_compile_info info = {
1288 .stage = b.shader->info.stage,
1289 .nir = b.shader,
1290 .robustness = &rs,
1291 };
1292
1293 /* We need to link libagx and assign shared before preprocessing, matching
1294 * what the driver would otherwise produce.
1295 */
1296 agx_link_libagx(b.shader, dev->dev.libagx);
1297
1298 if (info.stage == MESA_SHADER_COMPUTE) {
1299 NIR_PASS(_, b.shader, nir_lower_vars_to_explicit_types,
1300 nir_var_mem_shared, glsl_get_cl_type_size_align);
1301
1302 /* Commit to the layout so we don't clobber later */
1303 b.shader->info.shared_memory_explicit_layout = true;
1304
1305 NIR_PASS(_, b.shader, nir_lower_explicit_io, nir_var_mem_shared,
1306 nir_address_format_62bit_generic);
1307 }
1308
1309 hk_preprocess_nir_internal(dev->vk.physical, b.shader);
1310
1311 struct hk_api_shader *s;
1312 if (hk_compile_shader(dev, &info, NULL, NULL, &s) != VK_SUCCESS)
1313 return NULL;
1314
1315 /* ..and cache it before we return. The key is on the stack right now, so
1316 * clone it before using it as a hash table key. The clone is logically owned
1317 * by the hash table.
1318 */
1319 size_t total_key_size = sizeof(*key) + key->key_size;
1320 void *cloned_key = ralloc_memdup(dev->kernels.ht, key, total_key_size);
1321
1322 _mesa_hash_table_insert(dev->kernels.ht, cloned_key, s);
1323 return s;
1324 }
1325
1326 struct hk_api_shader *
1327 hk_meta_shader(struct hk_device *dev, hk_internal_builder_t builder, void *data,
1328 size_t data_size)
1329 {
1330 size_t total_key_size = sizeof(struct hk_internal_key) + data_size;
1331
1332 struct hk_internal_key *key = alloca(total_key_size);
1333 key->builder = builder;
1334 key->key_size = data_size;
1335
1336 if (data_size)
1337 memcpy(key->key, data, data_size);
1338
1339 simple_mtx_lock(&dev->kernels.lock);
1340 struct hk_api_shader *s = hk_build_meta_shader_locked(dev, key, builder);
1341 simple_mtx_unlock(&dev->kernels.lock);
1342
1343 return s;
1344 }
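/* Typical use, matching the call sites later in this file:
 *
 *    struct hk_api_shader *s = hk_meta_shader(
 *       dev, agx_nir_prefix_sum_gs, &count_words, sizeof(count_words));
 *
 * The builder pointer plus the key bytes form the cache key.
 */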
1345
1346 static struct hk_draw
1347 hk_draw_without_restart(struct hk_cmd_buffer *cmd, struct hk_cs *cs,
1348 struct hk_draw draw, uint32_t draw_count)
1349 {
1350 struct hk_device *dev = hk_cmd_buffer_device(cmd);
1351 struct hk_graphics_state *gfx = &cmd->state.gfx;
1352 struct vk_dynamic_graphics_state *dyn = &cmd->vk.dynamic_graphics_state;
1353
1354 perf_debug(dev, "Unrolling primitive restart due to GS/XFB");
1355
1356 /* The unroll kernel assumes an indirect draw. Synthesize one if needed */
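/* (The five words below match the VkDrawIndexedIndirectCommand layout.) */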
1357 if (!draw.b.indirect) {
1358 uint32_t desc[5] = {draw.b.count[0], draw.b.count[1], draw.start,
1359 draw.index_bias, draw.start_instance};
1360
1361 draw =
1362 hk_draw_indexed_indirect(hk_pool_upload(cmd, desc, sizeof(desc), 4),
1363 draw.index, draw.index_size, true);
1364 }
1365
1366 /* Next, we unroll the index buffer used by the indirect draw */
1367 struct agx_unroll_restart_key key = {
1368 .prim = vk_conv_topology(dyn->ia.primitive_topology),
1369 .index_size_B = agx_index_size_to_B(draw.index_size),
1370 };
1371
1372 struct agx_restart_unroll_params ia = {
1373 .heap = hk_geometry_state(cmd),
1374 .index_buffer = draw.index.addr,
1375 .count = hk_pool_upload(cmd, &draw_count, sizeof(uint32_t), 4),
1376 .draws = draw.b.ptr,
1377 .out_draws = hk_pool_alloc(cmd, 5 * sizeof(uint32_t) * draw_count, 4).gpu,
1378 .max_draws = 1 /* TODO: MDI */,
1379 .restart_index = gfx->index.restart,
1380 .index_buffer_size_el = draw.index.range / key.index_size_B,
1381 .flatshade_first =
1382 dyn->rs.provoking_vertex == VK_PROVOKING_VERTEX_MODE_FIRST_VERTEX_EXT,
1383 .zero_sink = dev->rodata.zero_sink,
1384 };
1385
1386 struct hk_shader *s =
1387 hk_meta_kernel(dev, agx_nir_unroll_restart, &key, sizeof(key));
1388
1389 uint64_t params = hk_pool_upload(cmd, &ia, sizeof(ia), 8);
1390 uint32_t usc = hk_upload_usc_words_kernel(cmd, s, &params, sizeof(params));
1391 hk_dispatch_with_usc(dev, cs, s, usc, hk_grid(1024 * draw_count, 1, 1),
1392 hk_grid(1024, 1, 1));
1393
1394 struct hk_addr_range out_index = {
1395 .addr = dev->heap->va->addr,
1396 .range = dev->heap->size,
1397 };
1398
1399 return hk_draw_indexed_indirect(ia.out_draws, out_index, draw.index_size,
1400 false /* restart */);
1401 }
1402
1403 static struct hk_draw
1404 hk_launch_gs_prerast(struct hk_cmd_buffer *cmd, struct hk_cs *cs,
1405 struct hk_draw draw)
1406 {
1407 struct hk_device *dev = hk_cmd_buffer_device(cmd);
1408 struct hk_graphics_state *gfx = &cmd->state.gfx;
1409 struct hk_descriptor_state *desc = &cmd->state.gfx.descriptors;
1410 struct hk_api_shader *gs = gfx->shaders[MESA_SHADER_GEOMETRY];
1411 struct hk_grid grid_vs, grid_gs;
1412
1413 struct vk_dynamic_graphics_state *dyn = &cmd->vk.dynamic_graphics_state;
1414 bool rast_disc = dyn->rs.rasterizer_discard_enable;
1415
1416 hk_ensure_cs_has_space(cmd, cs, 0x2000 /*XXX*/);
1417
1418 struct hk_shader *vs = hk_bound_sw_vs_before_gs(gfx);
1419 struct hk_shader *main = hk_main_gs_variant(gs, rast_disc);
1420 struct hk_shader *count = hk_count_gs_variant(gs, rast_disc);
1421 struct hk_shader *pre_gs = hk_pre_gs_variant(gs, rast_disc);
1422
1423 unsigned count_words = count->info.gs.count_words;
1424
1425 if (false /* TODO */)
1426 perf_debug(dev, "Transform feedbck");
1427 else if (count_words)
1428 perf_debug(dev, "Geometry shader with counts");
1429 else
1430 perf_debug(dev, "Geometry shader without counts");
1431
1432 enum mesa_prim mode = hk_gs_in_prim(cmd);
1433
1434 if (draw.indexed && draw.restart) {
1435 draw = hk_draw_without_restart(cmd, cs, draw, 1);
1436 mode = u_decomposed_prim(mode);
1437 }
1438
1439 /* Setup grids */
1440 if (draw.b.indirect) {
1441 struct agx_gs_setup_indirect_key key = {.prim = mode};
1442
1443 struct hk_shader *gsi =
1444 hk_meta_kernel(dev, agx_nir_gs_setup_indirect, &key, sizeof(key));
1445
1446 uint64_t push = hk_upload_gsi_params(cmd, draw);
1447 uint32_t usc = hk_upload_usc_words_kernel(cmd, gsi, &push, sizeof(push));
1448
1449 hk_dispatch_with_usc(dev, cs, gsi, usc, hk_grid(1, 1, 1),
1450 hk_grid(1, 1, 1));
1451
1452 uint64_t geometry_params = desc->root.draw.geometry_params;
1453 grid_vs = hk_grid_indirect(geometry_params +
1454 offsetof(struct agx_geometry_params, vs_grid));
1455
1456 grid_gs = hk_grid_indirect(geometry_params +
1457 offsetof(struct agx_geometry_params, gs_grid));
1458 } else {
1459 grid_vs = grid_gs = draw.b;
1460 grid_gs.count[0] = u_decomposed_prims_for_vertices(mode, draw.b.count[0]);
1461 }
1462
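/* Software GS pipeline: run the VS as a compute prepass, optionally count
 * and prefix-sum output primitives to size the allocations, run the pre-GS
 * fixup, then the pre-rast GS, which generates the indirect indexed draw
 * consumed by the rasterization pass.
 */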
1463 /* Launch the vertex shader first */
1464 hk_reserve_scratch(cmd, cs, vs);
1465 hk_dispatch_with_usc(dev, cs, vs,
1466 hk_upload_usc_words(cmd, vs,
1467 vs->info.stage == MESA_SHADER_VERTEX
1468 ? gfx->linked[MESA_SHADER_VERTEX]
1469 : vs->only_linked),
1470 grid_vs, hk_grid(1, 1, 1));
1471
1472 /* If we need counts, launch the count shader and prefix sum the results. */
1473 if (count_words) {
1474 hk_dispatch_with_local_size(cmd, cs, count, grid_gs, hk_grid(1, 1, 1));
1475
1476 struct hk_api_shader *prefix_sum = hk_meta_shader(
1477 dev, agx_nir_prefix_sum_gs, &count_words, sizeof(count_words));
1478
1479 /* XXX: hack */
1480 hk_only_variant(prefix_sum)->info.stage = MESA_SHADER_GEOMETRY;
1481
1482 hk_dispatch_with_local_size(cmd, cs, hk_only_variant(prefix_sum),
1483 hk_grid(1024 * count_words, 1, 1),
1484 hk_grid(1024, 1, 1));
1485 }
1486
1487 /* Pre-GS shader */
1488 hk_dispatch_with_local_size(cmd, cs, pre_gs, hk_grid(1, 1, 1),
1489 hk_grid(1, 1, 1));
1490
1491 /* Pre-rast geometry shader */
1492 hk_dispatch_with_local_size(cmd, cs, main, grid_gs, hk_grid(1, 1, 1));
1493
1494 struct hk_addr_range range = (struct hk_addr_range){
1495 .addr = dev->heap->va->addr,
1496 .range = dev->heap->size,
1497 };
1498
1499 bool restart = cmd->state.gfx.topology != AGX_PRIMITIVE_POINTS;
1500 return hk_draw_indexed_indirect(cmd->geom_indirect, range,
1501 AGX_INDEX_SIZE_U32, restart);
1502 }
1503
1504 static struct hk_draw
1505 hk_launch_tess(struct hk_cmd_buffer *cmd, struct hk_cs *cs, struct hk_draw draw)
1506 {
1507 struct hk_device *dev = hk_cmd_buffer_device(cmd);
1508 struct hk_graphics_state *gfx = &cmd->state.gfx;
1509 struct hk_grid grid_vs, grid_tcs, grid_tess;
1510
1511 struct hk_shader *vs = hk_bound_sw_vs(gfx);
1512 struct hk_shader *tcs = hk_only_variant(gfx->shaders[MESA_SHADER_TESS_CTRL]);
1513 struct hk_shader *tes = hk_any_variant(gfx->shaders[MESA_SHADER_TESS_EVAL]);
1514
1515 struct vk_dynamic_graphics_state *dyn = &cmd->vk.dynamic_graphics_state;
1516 uint32_t input_patch_size = dyn->ts.patch_control_points;
1517
1518 hk_ensure_cs_has_space(cmd, cs, 0x2000 /*XXX*/);
1519
1520 perf_debug(dev, "Tessellation");
1521
1522 uint64_t tcs_stat = hk_pipeline_stat_addr(
1523 cmd, VK_QUERY_PIPELINE_STATISTIC_TESSELLATION_CONTROL_SHADER_PATCHES_BIT);
1524
1525 /* Setup grids */
1526 if (draw.b.indirect) {
1527 unreachable("todo: indirect tess");
1528 #if 0
1529 struct agx_gs_setup_indirect_key key = {.prim = mode};
1530
1531 struct hk_shader *gsi =
1532 hk_meta_kernel(dev, agx_nir_gs_setup_indirect, &key, sizeof(key));
1533
1534 uint64_t push = hk_upload_gsi_params(cmd, draw);
1535 uint32_t usc = hk_upload_usc_words_kernel(cmd, gsi, &push, sizeof(push));
1536
1537 hk_dispatch_with_usc(dev, cs, gsi, usc, hk_grid(1, 1, 1),
1538 hk_grid(1, 1, 1));
1539
1540 uint64_t geometry_params = desc->root.draw.geometry_params;
1541 grid_vs = hk_grid_indirect(geometry_params +
1542 offsetof(struct agx_geometry_params, vs_grid));
1543
1544 grid_gs = hk_grid_indirect(geometry_params +
1545 offsetof(struct agx_geometry_params, gs_grid));
1546 #endif
1547 } else {
1548 uint32_t patches = draw.b.count[0] / input_patch_size;
1549 grid_vs = grid_tcs = draw.b;
1550
1551 grid_tcs.count[0] = patches * tcs->info.tcs.output_patch_size;
1552 grid_tess = hk_grid(patches * draw.b.count[1], 1, 1);
1553
1554 /* TCS invocation counter increments once per-patch */
1555 if (tcs_stat) {
1556 perf_debug(dev, "Direct TCS statistic");
1557
1558 struct libagx_increment_params args = {
1559 .statistic = tcs_stat,
1560 .delta = patches,
1561 };
1562
1563 struct hk_shader *s =
1564 hk_meta_kernel(dev, agx_nir_increment_statistic, NULL, 0);
1565
1566 uint64_t push = hk_pool_upload(cmd, &args, sizeof(args), 8);
1567 uint32_t usc = hk_upload_usc_words_kernel(cmd, s, &push, sizeof(push));
1568
1569 hk_dispatch_with_usc(dev, cs, s, usc, hk_grid(1, 1, 1),
1570 hk_grid(1, 1, 1));
1571 }
1572 }
1573
1574 /* First launch the VS and TCS */
1575 hk_reserve_scratch(cmd, cs, vs);
1576 hk_reserve_scratch(cmd, cs, tcs);
1577
1578 hk_dispatch_with_usc(
1579 dev, cs, vs,
1580 hk_upload_usc_words(cmd, vs, gfx->linked[MESA_SHADER_VERTEX]), grid_vs,
1581 hk_grid(64, 1, 1));
1582
1583 hk_dispatch_with_usc(
1584 dev, cs, tcs, hk_upload_usc_words(cmd, tcs, tcs->only_linked), grid_tcs,
1585 hk_grid(tcs->info.tcs.output_patch_size, 1, 1));
1586
1587 /* TODO indirect */
1588
1589 bool with_counts = hk_tess_needs_prefix_sum(cmd);
1590 uint64_t state = gfx->descriptors.root.draw.tess_params;
1591
1592 /* If the domain is flipped, we need to flip the winding order */
1593 bool ccw = tes->info.ts.ccw;
1594 ccw ^= dyn->ts.domain_origin == VK_TESSELLATION_DOMAIN_ORIGIN_LOWER_LEFT;
1595
1596 enum libagx_tess_partitioning partitioning =
1597 tes->info.ts.spacing == TESS_SPACING_EQUAL
1598 ? LIBAGX_TESS_PARTITIONING_INTEGER
1599 : tes->info.ts.spacing == TESS_SPACING_FRACTIONAL_ODD
1600 ? LIBAGX_TESS_PARTITIONING_FRACTIONAL_ODD
1601 : LIBAGX_TESS_PARTITIONING_FRACTIONAL_EVEN;
1602
1603 enum libagx_tess_output_primitive prim =
1604 tes->info.ts.point_mode ? LIBAGX_TESS_OUTPUT_POINT
1605 : ccw ? LIBAGX_TESS_OUTPUT_TRIANGLE_CCW
1606 : LIBAGX_TESS_OUTPUT_TRIANGLE_CW;
1607
1608 struct agx_tessellator_key key = {
1609 .prim = tes->info.ts.mode,
1610 .output_primitive = prim,
1611 .partitioning = partitioning,
1612 };
1613
1614 if (with_counts) {
1615 perf_debug(dev, "Tessellation with counts");
1616
1617 /* Generate counts */
1618 key.mode = LIBAGX_TESS_MODE_COUNT;
1619 {
1620 struct hk_shader *tess =
1621 hk_meta_kernel(dev, agx_nir_tessellate, &key, sizeof(key));
1622
1623 hk_dispatch_with_usc(
1624 dev, cs, tess,
1625 hk_upload_usc_words_kernel(cmd, tess, &state, sizeof(state)),
1626 grid_tess, hk_grid(64, 1, 1));
1627 }
1628
1629 /* Prefix sum counts, allocating index buffer space. */
1630 {
1631 struct hk_shader *sum =
1632 hk_meta_kernel(dev, agx_nir_prefix_sum_tess, NULL, 0);
1633
1634 hk_dispatch_with_usc(
1635 dev, cs, sum,
1636 hk_upload_usc_words_kernel(cmd, sum, &state, sizeof(state)),
1637 hk_grid(1024, 1, 1), hk_grid(1024, 1, 1));
1638 }
1639
1640 key.mode = LIBAGX_TESS_MODE_WITH_COUNTS;
1641 } else {
1642 key.mode = LIBAGX_TESS_MODE_VDM;
1643 }
1644
1645 /* Now we can tessellate */
1646 {
1647 struct hk_shader *tess =
1648 hk_meta_kernel(dev, agx_nir_tessellate, &key, sizeof(key));
1649
1650 hk_dispatch_with_usc(
1651 dev, cs, tess,
1652 hk_upload_usc_words_kernel(cmd, tess, &state, sizeof(state)),
1653 grid_tess, hk_grid(64, 1, 1));
1654 }
1655
1656 struct hk_addr_range range = (struct hk_addr_range){
1657 .addr = dev->heap->va->addr,
1658 .range = dev->heap->size,
1659 };
1660
1661 struct hk_draw out = hk_draw_indexed_indirect(gfx->tess_out_draws, range,
1662 AGX_INDEX_SIZE_U32, false);
1663 out.raw = !with_counts;
1664 return out;
1665 }
1666
1667 void
1668 hk_cmd_bind_graphics_shader(struct hk_cmd_buffer *cmd,
1669 const gl_shader_stage stage,
1670 struct hk_api_shader *shader)
1671 {
1672 struct vk_dynamic_graphics_state *dyn = &cmd->vk.dynamic_graphics_state;
1673
1674 assert(stage < ARRAY_SIZE(cmd->state.gfx.shaders));
1675 if (cmd->state.gfx.shaders[stage] == shader)
1676 return;
1677
1678 cmd->state.gfx.shaders[stage] = shader;
1679 cmd->state.gfx.shaders_dirty |= BITFIELD_BIT(stage);
1680
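   /* The fragment shader's fast-link key depends on the rasterization sample
    * count, so force that state to be revalidated when the FS changes.
    */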
1681 if (stage == MESA_SHADER_FRAGMENT) {
1682 BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_MS_RASTERIZATION_SAMPLES);
1683 }
1684 }
1685
1686 static uint32_t
1687 hk_pipeline_bind_group(gl_shader_stage stage)
1688 {
1689 return stage;
1690 }
1691
1692 static void
1693 hk_flush_shaders(struct hk_cmd_buffer *cmd)
1694 {
1695 if (cmd->state.gfx.shaders_dirty == 0)
1696 return;
1697
1698 /* Map shader types to shaders */
1699 struct hk_api_shader *type_shader[6] = {
1700 NULL,
1701 };
1702 uint32_t types_dirty = 0;
1703
1704 const uint32_t gfx_stages =
1705 BITFIELD_BIT(MESA_SHADER_VERTEX) | BITFIELD_BIT(MESA_SHADER_TESS_CTRL) |
1706 BITFIELD_BIT(MESA_SHADER_TESS_EVAL) | BITFIELD_BIT(MESA_SHADER_GEOMETRY) |
1707 BITFIELD_BIT(MESA_SHADER_FRAGMENT);
1708
1709 /* Geometry shading overrides the restart index; reemit on rebind */
1710 if (IS_SHADER_DIRTY(GEOMETRY)) {
1711 cmd->state.gfx.dirty |= HK_DIRTY_INDEX;
1712 }
1713
1714 u_foreach_bit(stage, cmd->state.gfx.shaders_dirty & gfx_stages) {
1715 /* TODO: compact? */
1716 uint32_t type = stage;
1717 types_dirty |= BITFIELD_BIT(type);
1718
1719 /* Only copy non-NULL shaders because mesh/task alias with vertex and
1720 * tessellation stages.
1721 */
1722 if (cmd->state.gfx.shaders[stage] != NULL) {
1723 assert(type < ARRAY_SIZE(type_shader));
1724 assert(type_shader[type] == NULL);
1725 type_shader[type] = cmd->state.gfx.shaders[stage];
1726 }
1727 }
1728
1729 u_foreach_bit(type, types_dirty) {
1730 struct hk_api_shader *shader = type_shader[type];
1731
1732 /* We always map index == type */
1733 // const uint32_t idx = type;
1734
1735 if (shader == NULL)
1736 continue;
1737
1738 /* TODO */
1739 }
1740
1741 struct hk_graphics_state *gfx = &cmd->state.gfx;
1742 struct hk_shader *hw_vs = hk_bound_hw_vs(gfx);
1743 struct hk_api_shader *fs = gfx->shaders[MESA_SHADER_FRAGMENT];
1744
1745 /* If we have a new VS/FS pair, UVS locations may have changed, so we need to
1746  * relink. We do this here because there's no dependence on the fast-linked
1747  * shaders.
1748  */
1749 agx_assign_uvs(&gfx->linked_varyings, &hw_vs->info.uvs,
1750 fs ? hk_only_variant(fs)->info.fs.interp.flat : 0,
1751 fs ? hk_only_variant(fs)->info.fs.interp.linear : 0);
1752
1753 struct hk_descriptor_state *desc = &cmd->state.gfx.descriptors;
1754 desc->root_dirty = true;
1755
1756 for (unsigned i = 0; i < VARYING_SLOT_MAX; ++i) {
1757 desc->root.draw.uvs_index[i] = gfx->linked_varyings.slots[i];
1758 }
1759 }
1760
1761 static struct agx_shader_part *
1762 hk_get_prolog_epilog_locked(struct hk_device *dev, struct hk_internal_key *key,
1763 hk_internal_builder_t builder, bool preprocess_nir,
1764 bool stop, unsigned cf_base)
1765 {
1766 /* Try to get the cached shader */
1767 struct hash_entry *ent = _mesa_hash_table_search(dev->prolog_epilog.ht, key);
1768 if (ent)
1769 return ent->data;
1770
1771 nir_builder b = nir_builder_init_simple_shader(0, &agx_nir_options, NULL);
1772 builder(&b, key->key);
1773
1774 if (preprocess_nir)
1775 agx_preprocess_nir(b.shader, dev->dev.libagx);
1776
1777 struct agx_shader_key backend_key = {
1778 .dev = agx_gather_device_key(&dev->dev),
1779 .libagx = dev->dev.libagx,
1780 .secondary = true,
1781 .no_stop = !stop,
1782 };
1783
1784 /* We always use dynamic sample shading in the GL driver. Indicate that. */
1785 if (b.shader->info.stage == MESA_SHADER_FRAGMENT) {
1786 backend_key.fs.cf_base = cf_base;
1787
1788 if (b.shader->info.fs.uses_sample_shading)
1789 backend_key.fs.inside_sample_loop = true;
1790 }
1791
1792 struct agx_shader_part *part =
1793 rzalloc(dev->prolog_epilog.ht, struct agx_shader_part);
1794
1795 agx_compile_shader_nir(b.shader, &backend_key, NULL, part);
1796
1797 ralloc_free(b.shader);
1798
1799 /* ...and cache it before we return. The key is on the stack right now, so
1800 * clone it before using it as a hash table key. The clone is logically owned
1801 * by the hash table.
1802 */
1803 size_t total_key_size = sizeof(*key) + key->key_size;
1804 void *cloned_key = ralloc_memdup(dev->prolog_epilog.ht, key, total_key_size);
1805
1806 _mesa_hash_table_insert(dev->prolog_epilog.ht, cloned_key, part);
1807 return part;
1808 }
1809
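/* Look up (or compile and cache) a prolog/epilog shader part. The key is the
 * builder callback plus its key data; the cache lives on the device and is
 * guarded by a mutex so it can be shared across command buffers.
 */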
1810 static struct agx_shader_part *
1811 hk_get_prolog_epilog(struct hk_device *dev, void *data, size_t data_size,
1812 hk_internal_builder_t builder, bool preprocess_nir,
1813 bool stop, unsigned cf_base)
1814 {
1815 /* Build the meta shader key */
1816 size_t total_key_size = sizeof(struct hk_internal_key) + data_size;
1817
1818 struct hk_internal_key *key = alloca(total_key_size);
1819 key->builder = builder;
1820 key->key_size = data_size;
1821
1822 if (data_size)
1823 memcpy(key->key, data, data_size);
1824
1825 simple_mtx_lock(&dev->prolog_epilog.lock);
1826
1827 struct agx_shader_part *part = hk_get_prolog_epilog_locked(
1828 dev, key, builder, preprocess_nir, stop, cf_base);
1829
1830 simple_mtx_unlock(&dev->prolog_epilog.lock);
1831 return part;
1832 }
1833
1834 static struct hk_linked_shader *
1835 hk_get_fast_linked_locked_vs(struct hk_device *dev, struct hk_shader *shader,
1836 struct hk_fast_link_key_vs *key)
1837 {
1838 struct agx_shader_part *prolog =
1839 hk_get_prolog_epilog(dev, &key->prolog, sizeof(key->prolog),
1840 agx_nir_vs_prolog, false, false, 0);
1841
1842 struct hk_linked_shader *linked =
1843 hk_fast_link(dev, false, shader, prolog, NULL, 0);
1844
1845 struct hk_fast_link_key *key_clone =
1846 ralloc_memdup(shader->linked.ht, key, sizeof(*key));
1847
1848 /* XXX: Fix this higher up the stack */
1849 linked->b.uses_base_param |= !key->prolog.hw;
1850
1851 _mesa_hash_table_insert(shader->linked.ht, key_clone, linked);
1852 return linked;
1853 }
1854
1855 static void
1856 build_fs_prolog(nir_builder *b, const void *key)
1857 {
1858 agx_nir_fs_prolog(b, key);
1859
1860 /* Lower load_stat_query_address_agx, needed for FS statistics */
1861 NIR_PASS(_, b->shader, hk_lower_uvs_index, 0);
1862 }
1863
1864 static struct hk_linked_shader *
1865 hk_get_fast_linked_locked_fs(struct hk_device *dev, struct hk_shader *shader,
1866 struct hk_fast_link_key_fs *key)
1867 {
1868 /* TODO: prolog without fs needs to work too... */
1869 bool needs_prolog = key->prolog.statistics ||
1870 key->prolog.cull_distance_size ||
1871 key->prolog.api_sample_mask != 0xff;
1872
1873 struct agx_shader_part *prolog = NULL;
1874 if (needs_prolog) {
1875 prolog = hk_get_prolog_epilog(dev, &key->prolog, sizeof(key->prolog),
1876 build_fs_prolog, false, false,
1877 key->prolog.cf_base);
1878 }
1879
1880 /* If sample shading is used, don't stop at the epilog; the fast linker
1881  * will insert a footer that stops instead.
1882  */
1883 bool epilog_stop = (key->nr_samples_shaded == 0);
1884
1885 struct agx_shader_part *epilog =
1886 hk_get_prolog_epilog(dev, &key->epilog, sizeof(key->epilog),
1887 agx_nir_fs_epilog, true, epilog_stop, 0);
1888
1889 struct hk_linked_shader *linked =
1890 hk_fast_link(dev, true, shader, prolog, epilog, key->nr_samples_shaded);
1891
1892 struct hk_fast_link_key *key_clone =
1893 ralloc_memdup(shader->linked.ht, key, sizeof(*key));
1894
1895 _mesa_hash_table_insert(shader->linked.ht, key_clone, linked);
1896 return linked;
1897 }
1898
1899 /*
1900  * First, look for a fully linked variant. Otherwise, build the required
1901  * shader parts and link them.
1902 */
1903 static struct hk_linked_shader *
1904 hk_get_fast_linked(struct hk_device *dev, struct hk_shader *shader, void *key)
1905 {
1906 struct hk_linked_shader *linked;
1907 simple_mtx_lock(&shader->linked.lock);
1908
1909 struct hash_entry *ent = _mesa_hash_table_search(shader->linked.ht, key);
1910
1911 if (ent)
1912 linked = ent->data;
1913 else if (shader->info.stage == MESA_SHADER_VERTEX)
1914 linked = hk_get_fast_linked_locked_vs(dev, shader, key);
1915 else if (shader->info.stage == MESA_SHADER_FRAGMENT)
1916 linked = hk_get_fast_linked_locked_fs(dev, shader, key);
1917 else
1918 unreachable("invalid stage");
1919
1920 simple_mtx_unlock(&shader->linked.lock);
1921 return linked;
1922 }
1923
1924 static void
1925 hk_update_fast_linked(struct hk_cmd_buffer *cmd, struct hk_shader *shader,
1926 void *key)
1927 {
1928 struct hk_device *dev = hk_cmd_buffer_device(cmd);
1929 struct hk_linked_shader *new = hk_get_fast_linked(dev, shader, key);
1930 gl_shader_stage stage = shader->info.stage;
1931
1932 if (cmd->state.gfx.linked[stage] != new) {
1933 cmd->state.gfx.linked[stage] = new;
1934 cmd->state.gfx.linked_dirty |= BITFIELD_BIT(stage);
1935 }
1936 }
1937
1938 static enum agx_polygon_mode
1939 translate_polygon_mode(VkPolygonMode vk_mode)
1940 {
1941 static_assert((enum agx_polygon_mode)VK_POLYGON_MODE_FILL ==
1942 AGX_POLYGON_MODE_FILL);
1943 static_assert((enum agx_polygon_mode)VK_POLYGON_MODE_LINE ==
1944 AGX_POLYGON_MODE_LINE);
1945 static_assert((enum agx_polygon_mode)VK_POLYGON_MODE_POINT ==
1946 AGX_POLYGON_MODE_POINT);
1947
1948 assert(vk_mode <= VK_POLYGON_MODE_POINT);
1949 return (enum agx_polygon_mode)vk_mode;
1950 }
1951
1952 static enum agx_zs_func
1953 translate_compare_op(VkCompareOp vk_mode)
1954 {
1955 static_assert((enum agx_zs_func)VK_COMPARE_OP_NEVER == AGX_ZS_FUNC_NEVER);
1956 static_assert((enum agx_zs_func)VK_COMPARE_OP_LESS == AGX_ZS_FUNC_LESS);
1957 static_assert((enum agx_zs_func)VK_COMPARE_OP_EQUAL == AGX_ZS_FUNC_EQUAL);
1958 static_assert((enum agx_zs_func)VK_COMPARE_OP_LESS_OR_EQUAL ==
1959 AGX_ZS_FUNC_LEQUAL);
1960 static_assert((enum agx_zs_func)VK_COMPARE_OP_GREATER ==
1961 AGX_ZS_FUNC_GREATER);
1962 static_assert((enum agx_zs_func)VK_COMPARE_OP_NOT_EQUAL ==
1963 AGX_ZS_FUNC_NOT_EQUAL);
1964 static_assert((enum agx_zs_func)VK_COMPARE_OP_GREATER_OR_EQUAL ==
1965 AGX_ZS_FUNC_GEQUAL);
1966 static_assert((enum agx_zs_func)VK_COMPARE_OP_ALWAYS == AGX_ZS_FUNC_ALWAYS);
1967
1968 assert(vk_mode <= VK_COMPARE_OP_ALWAYS);
1969 return (enum agx_zs_func)vk_mode;
1970 }
1971
1972 static enum agx_stencil_op
1973 translate_stencil_op(VkStencilOp vk_op)
1974 {
1975 static_assert((enum agx_stencil_op)VK_STENCIL_OP_KEEP ==
1976 AGX_STENCIL_OP_KEEP);
1977 static_assert((enum agx_stencil_op)VK_STENCIL_OP_ZERO ==
1978 AGX_STENCIL_OP_ZERO);
1979 static_assert((enum agx_stencil_op)VK_STENCIL_OP_REPLACE ==
1980 AGX_STENCIL_OP_REPLACE);
1981 static_assert((enum agx_stencil_op)VK_STENCIL_OP_INCREMENT_AND_CLAMP ==
1982 AGX_STENCIL_OP_INCR_SAT);
1983 static_assert((enum agx_stencil_op)VK_STENCIL_OP_DECREMENT_AND_CLAMP ==
1984 AGX_STENCIL_OP_DECR_SAT);
1985 static_assert((enum agx_stencil_op)VK_STENCIL_OP_INVERT ==
1986 AGX_STENCIL_OP_INVERT);
1987 static_assert((enum agx_stencil_op)VK_STENCIL_OP_INCREMENT_AND_WRAP ==
1988 AGX_STENCIL_OP_INCR_WRAP);
1989 static_assert((enum agx_stencil_op)VK_STENCIL_OP_DECREMENT_AND_WRAP ==
1990 AGX_STENCIL_OP_DECR_WRAP);
1991
1992 return (enum agx_stencil_op)vk_op;
1993 }
1994
1995 static void
1996 hk_ppp_push_stencil_face(struct agx_ppp_update *ppp,
1997 struct vk_stencil_test_face_state s, bool enabled)
1998 {
1999 if (enabled) {
2000 agx_ppp_push(ppp, FRAGMENT_STENCIL, cfg) {
2001 cfg.compare = translate_compare_op(s.op.compare);
2002 cfg.write_mask = s.write_mask;
2003 cfg.read_mask = s.compare_mask;
2004
2005 cfg.depth_pass = translate_stencil_op(s.op.pass);
2006 cfg.depth_fail = translate_stencil_op(s.op.depth_fail);
2007 cfg.stencil_fail = translate_stencil_op(s.op.fail);
2008 }
2009 } else {
2010 agx_ppp_push(ppp, FRAGMENT_STENCIL, cfg) {
2011 cfg.compare = AGX_ZS_FUNC_ALWAYS;
2012 cfg.write_mask = 0xFF;
2013 cfg.read_mask = 0xFF;
2014
2015 cfg.depth_pass = AGX_STENCIL_OP_KEEP;
2016 cfg.depth_fail = AGX_STENCIL_OP_KEEP;
2017 cfg.stencil_fail = AGX_STENCIL_OP_KEEP;
2018 }
2019 }
2020 }
2021
2022 static bool
2023 hk_stencil_test_enabled(struct hk_cmd_buffer *cmd)
2024 {
2025 const struct hk_rendering_state *render = &cmd->state.gfx.render;
2026 struct vk_dynamic_graphics_state *dyn = &cmd->vk.dynamic_graphics_state;
2027
2028 return dyn->ds.stencil.test_enable &&
2029 render->stencil_att.vk_format != VK_FORMAT_UNDEFINED;
2030 }
2031
2032 static void
2033 hk_flush_vp_state(struct hk_cmd_buffer *cmd, struct hk_cs *cs, uint8_t **out)
2034 {
2035 const struct vk_dynamic_graphics_state *dyn =
2036 &cmd->vk.dynamic_graphics_state;
2037
2038 /* We always need at least 1 viewport for the hardware. With rasterizer
2039 * discard the app may not supply any, but we can just program garbage.
2040 */
2041 unsigned count = MAX2(dyn->vp.viewport_count, 1);
2042
2043 unsigned minx[HK_MAX_VIEWPORTS] = {0}, miny[HK_MAX_VIEWPORTS] = {0};
2044 unsigned maxx[HK_MAX_VIEWPORTS] = {0}, maxy[HK_MAX_VIEWPORTS] = {0};
2045
2046 /* We implicitly scissor to the viewport. We need to do a min/max dance to
2047 * handle inverted viewports.
2048 */
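   /* For example, a viewport flipped via a negative height (y = 480,
    * height = -480) yields miny = 0 and maxy = 480.
    */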
2049 for (uint32_t i = 0; i < dyn->vp.viewport_count; i++) {
2050 const VkViewport *vp = &dyn->vp.viewports[i];
2051
2052 minx[i] = MIN2(vp->x, vp->x + vp->width);
2053 miny[i] = MIN2(vp->y, vp->y + vp->height);
2054 maxx[i] = MAX2(vp->x, vp->x + vp->width);
2055 maxy[i] = MAX2(vp->y, vp->y + vp->height);
2056 }
2057
2058 /* Additionally clamp to the framebuffer so we don't rasterize off-screen
2059  * pixels, which is software-visible and can cause faults with eMRT when the
2060  * framebuffer is not a multiple of the tile size.
2061  *
2062  * TODO: Is this necessary? The GL driver does this, but it might be
2063  * cargo-culted at this point.
2064  */
2065 for (unsigned i = 0; i < count; ++i) {
2066 minx[i] = MIN2(minx[i], cmd->state.gfx.render.cr.width);
2067 maxx[i] = MIN2(maxx[i], cmd->state.gfx.render.cr.width);
2068 miny[i] = MIN2(miny[i], cmd->state.gfx.render.cr.height);
2069 maxy[i] = MIN2(maxy[i], cmd->state.gfx.render.cr.height);
2070 }
2071
2072 /* We additionally apply any API scissors */
2073 for (unsigned i = 0; i < dyn->vp.scissor_count; ++i) {
2074 const VkRect2D *s = &dyn->vp.scissors[i];
2075
2076 minx[i] = MAX2(minx[i], s->offset.x);
2077 miny[i] = MAX2(miny[i], s->offset.y);
2078 maxx[i] = MIN2(maxx[i], s->offset.x + s->extent.width);
2079 maxy[i] = MIN2(maxy[i], s->offset.y + s->extent.height);
2080 }
2081
2082 /* Upload a hardware scissor for each viewport, whether there's a
2083 * corresponding API scissor or not.
2084 */
2085 unsigned index = cs->scissor.size / AGX_SCISSOR_LENGTH;
2086 struct agx_scissor_packed *scissors =
2087 util_dynarray_grow_bytes(&cs->scissor, count, AGX_SCISSOR_LENGTH);
2088
2089 for (unsigned i = 0; i < count; ++i) {
2090 const VkViewport *vp = &dyn->vp.viewports[i];
2091
2092 agx_pack(scissors + i, SCISSOR, cfg) {
2093 cfg.min_x = minx[i];
2094 cfg.min_y = miny[i];
2095 cfg.max_x = maxx[i];
2096 cfg.max_y = maxy[i];
2097
2098 /* These settings in conjunction with the PPP control depth clip/clamp
2099 * settings implement depth clip/clamping. Properly setting them
2100 * together is required for conformant depth clip enable.
2101 *
2102 * TODO: Reverse-engineer the finer interactions here.
2103 */
2104 if (dyn->rs.depth_clamp_enable) {
2105 cfg.min_z = MIN2(vp->minDepth, vp->maxDepth);
2106 cfg.max_z = MAX2(vp->minDepth, vp->maxDepth);
2107 } else {
2108 cfg.min_z = 0.0;
2109 cfg.max_z = 1.0;
2110 }
2111 }
2112 }
2113
2114 /* Upload state */
2115 struct AGX_PPP_HEADER present = {
2116 .depth_bias_scissor = true,
2117 .region_clip = true,
2118 .viewport = true,
2119 .viewport_count = count,
2120 };
2121
2122 size_t size = agx_ppp_update_size(&present);
2123 struct agx_ptr T = hk_pool_alloc(cmd, size, 64);
2124 if (!T.cpu)
2125 return;
2126
2127 struct agx_ppp_update ppp = agx_new_ppp_update(T, size, &present);
2128
2129 agx_ppp_push(&ppp, DEPTH_BIAS_SCISSOR, cfg) {
2130 cfg.scissor = index;
2131
2132 /* Use the current depth bias; we allocate linearly */
2133 unsigned count = cs->depth_bias.size / AGX_DEPTH_BIAS_LENGTH;
2134 cfg.depth_bias = count ? count - 1 : 0;
2135 };
2136
2137 for (unsigned i = 0; i < count; ++i) {
2138 agx_ppp_push(&ppp, REGION_CLIP, cfg) {
2139 cfg.enable = true;
2140 cfg.min_x = minx[i] / 32;
2141 cfg.min_y = miny[i] / 32;
2142 cfg.max_x = DIV_ROUND_UP(MAX2(maxx[i], 1), 32);
2143 cfg.max_y = DIV_ROUND_UP(MAX2(maxy[i], 1), 32);
2144 }
2145 }
2146
2147 agx_ppp_push(&ppp, VIEWPORT_CONTROL, cfg)
2148 ;
2149
2150 /* Upload viewports */
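   /* These are the standard Vulkan viewport transform coefficients:
    * x_fb = translate_x + scale_x * x_ndc (and likewise for y), while
    * translate_z/scale_z map a [0, 1] NDC depth to [minDepth, maxDepth].
    */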
2151 for (unsigned i = 0; i < count; ++i) {
2152 const VkViewport *vp = &dyn->vp.viewports[i];
2153
2154 agx_ppp_push(&ppp, VIEWPORT, cfg) {
2155 cfg.translate_x = vp->x + 0.5f * vp->width;
2156 cfg.translate_y = vp->y + 0.5f * vp->height;
2157 cfg.translate_z = vp->minDepth;
2158
2159 cfg.scale_x = vp->width * 0.5f;
2160 cfg.scale_y = vp->height * 0.5f;
2161 cfg.scale_z = vp->maxDepth - vp->minDepth;
2162 }
2163 }
2164
2165 agx_ppp_fini(out, &ppp);
2166 }
2167
2168 static enum agx_object_type
2169 translate_object_type(enum mesa_prim topology)
2170 {
2171 static_assert(MESA_PRIM_LINES < MESA_PRIM_LINE_STRIP);
2172 static_assert(MESA_PRIM_TRIANGLES >= MESA_PRIM_LINE_STRIP);
2173
2174 if (topology == MESA_PRIM_POINTS)
2175 return AGX_OBJECT_TYPE_POINT_SPRITE_UV01;
2176 else if (topology <= MESA_PRIM_LINE_STRIP)
2177 return AGX_OBJECT_TYPE_LINE;
2178 else
2179 return AGX_OBJECT_TYPE_TRIANGLE;
2180 }
2181
2182 static enum agx_primitive
2183 translate_hw_primitive_topology(enum mesa_prim prim)
2184 {
2185 switch (prim) {
2186 case MESA_PRIM_POINTS:
2187 return AGX_PRIMITIVE_POINTS;
2188 case MESA_PRIM_LINES:
2189 return AGX_PRIMITIVE_LINES;
2190 case MESA_PRIM_LINE_STRIP:
2191 return AGX_PRIMITIVE_LINE_STRIP;
2192 case MESA_PRIM_TRIANGLES:
2193 return AGX_PRIMITIVE_TRIANGLES;
2194 case MESA_PRIM_TRIANGLE_STRIP:
2195 return AGX_PRIMITIVE_TRIANGLE_STRIP;
2196 case MESA_PRIM_TRIANGLE_FAN:
2197 return AGX_PRIMITIVE_TRIANGLE_FAN;
2198 default:
2199 unreachable("Invalid hardware primitive topology");
2200 }
2201 }
2202
2203 static inline enum agx_vdm_vertex
2204 translate_vdm_vertex(unsigned vtx)
2205 {
2206 static_assert(AGX_VDM_VERTEX_0 == 0);
2207 static_assert(AGX_VDM_VERTEX_1 == 1);
2208 static_assert(AGX_VDM_VERTEX_2 == 2);
2209
2210 assert(vtx <= 2);
2211 return vtx;
2212 }
2213
2214 static inline enum agx_ppp_vertex
2215 translate_ppp_vertex(unsigned vtx)
2216 {
2217 static_assert(AGX_PPP_VERTEX_0 == 0 + 1);
2218 static_assert(AGX_PPP_VERTEX_1 == 1 + 1);
2219 static_assert(AGX_PPP_VERTEX_2 == 2 + 1);
2220
2221 assert(vtx <= 2);
2222 return vtx + 1;
2223 }
2224
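/* (Re)emit the primitive restart index. With a geometry shader bound, the
 * hardware consumes an internally generated 32-bit index buffer, so an
 * all-ones restart index is programmed instead of the application's value.
 */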
2225 static void
2226 hk_flush_index(struct hk_cmd_buffer *cmd, struct hk_cs *cs)
2227 {
2228 uint8_t *out = cs->current;
2229 agx_push(out, VDM_STATE, cfg) {
2230 cfg.restart_index_present = true;
2231 }
2232
2233 agx_push(out, VDM_STATE_RESTART_INDEX, cfg) {
2234 if (cmd->state.gfx.shaders[MESA_SHADER_GEOMETRY])
2235 cfg.value = BITFIELD_MASK(32);
2236 else
2237 cfg.value = cmd->state.gfx.index.restart;
2238 }
2239
2240 cs->current = out;
2241 }
2242
2243 /*
2244 * Return the given sample positions, packed into a 32-bit word with fixed
2245 * point nibbles for each x/y component of the (at most 4) samples. This is
2246 * suitable for programming the PPP_MULTISAMPLECTL control register.
2247 */
2248 static uint32_t
2249 hk_pack_ppp_multisamplectrl(const struct vk_sample_locations_state *sl)
2250 {
2251 uint32_t ctrl = 0;
2252
2253 for (int32_t i = sl->per_pixel - 1; i >= 0; i--) {
2254 VkSampleLocationEXT loc = sl->locations[i];
2255
2256 uint32_t x = CLAMP(loc.x, 0.0f, 0.9375f) * 16.0;
2257 uint32_t y = CLAMP(loc.y, 0.0f, 0.9375f) * 16.0;
2258
2259 assert(x <= 15);
2260 assert(y <= 15);
2261
2262 /* Push bytes in reverse order so we can use constant shifts. */
2263 ctrl = (ctrl << 8) | (y << 4) | x;
2264 }
2265
2266 return ctrl;
2267 }
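
/* As a worked example of the packing above: a sample at (0.5, 0.5) becomes
 * x = y = 0.5 * 16 = 8, i.e. the byte 0x88, and the standard 2x positions
 * (0.75, 0.75) and (0.25, 0.25) become 0xcc and 0x44, so the prepacked 2x
 * default below is 0x44cc with sample 0 in the low byte.
 */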
2268
2269 /*
2270 * Return the standard sample positions, prepacked as above for efficiency.
2271 */
2272 uint32_t
2273 hk_default_sample_positions(unsigned nr_samples)
2274 {
2275 switch (nr_samples) {
2276 case 0:
2277 case 1:
2278 return 0x88;
2279 case 2:
2280 return 0x44cc;
2281 case 4:
2282 return 0xeaa26e26;
2283 default:
2284 unreachable("Invalid sample count");
2285 }
2286 }
2287
2288 static void
2289 hk_flush_ppp_state(struct hk_cmd_buffer *cmd, struct hk_cs *cs, uint8_t **out)
2290 {
2291 const struct hk_rendering_state *render = &cmd->state.gfx.render;
2292 struct vk_dynamic_graphics_state *dyn = &cmd->vk.dynamic_graphics_state;
2293
2294 struct hk_graphics_state *gfx = &cmd->state.gfx;
2295 struct hk_shader *hw_vs = hk_bound_hw_vs(gfx);
2296 struct hk_shader *fs = hk_only_variant(gfx->shaders[MESA_SHADER_FRAGMENT]);
2297
2298 bool hw_vs_dirty = IS_SHADER_DIRTY(VERTEX) || IS_SHADER_DIRTY(TESS_EVAL) ||
2299 IS_SHADER_DIRTY(GEOMETRY);
2300 bool fs_dirty = IS_SHADER_DIRTY(FRAGMENT);
2301
2302 struct hk_linked_shader *linked_fs = gfx->linked[MESA_SHADER_FRAGMENT];
2303 bool linked_fs_dirty = IS_LINKED_DIRTY(FRAGMENT);
2304
2305 bool varyings_dirty = gfx->dirty & HK_DIRTY_VARYINGS;
2306
2307 bool face_dirty =
2308 IS_DIRTY(DS_DEPTH_TEST_ENABLE) || IS_DIRTY(DS_DEPTH_WRITE_ENABLE) ||
2309 IS_DIRTY(DS_DEPTH_COMPARE_OP) || IS_DIRTY(DS_STENCIL_REFERENCE) ||
2310 IS_DIRTY(RS_LINE_WIDTH) || IS_DIRTY(RS_POLYGON_MODE) || fs_dirty;
2311
2312 bool stencil_face_dirty =
2313 IS_DIRTY(DS_STENCIL_OP) || IS_DIRTY(DS_STENCIL_COMPARE_MASK) ||
2314 IS_DIRTY(DS_STENCIL_WRITE_MASK) || IS_DIRTY(DS_STENCIL_TEST_ENABLE);
2315
2316 struct AGX_PPP_HEADER dirty = {
2317 .fragment_control =
2318 IS_DIRTY(DS_STENCIL_TEST_ENABLE) || IS_DIRTY(IA_PRIMITIVE_TOPOLOGY) ||
2319 IS_DIRTY(RS_DEPTH_BIAS_ENABLE) || gfx->dirty & HK_DIRTY_OCCLUSION,
2320
2321 .fragment_control_2 =
2322 IS_DIRTY(RS_RASTERIZER_DISCARD_ENABLE) || linked_fs_dirty,
2323
2324 .fragment_front_face = face_dirty,
2325 .fragment_front_face_2 = fs_dirty || IS_DIRTY(IA_PRIMITIVE_TOPOLOGY),
2326 .fragment_front_stencil = stencil_face_dirty,
2327 .fragment_back_face = face_dirty,
2328 .fragment_back_face_2 = fs_dirty || IS_DIRTY(IA_PRIMITIVE_TOPOLOGY),
2329 .fragment_back_stencil = stencil_face_dirty,
2330 .output_select = hw_vs_dirty || linked_fs_dirty || varyings_dirty,
2331 .varying_counts_32 = varyings_dirty,
2332 .varying_counts_16 = varyings_dirty,
2333 .cull =
2334 IS_DIRTY(RS_CULL_MODE) || IS_DIRTY(RS_RASTERIZER_DISCARD_ENABLE) ||
2335 IS_DIRTY(RS_FRONT_FACE) || IS_DIRTY(RS_DEPTH_CLIP_ENABLE) ||
2336 IS_DIRTY(RS_DEPTH_CLAMP_ENABLE) || IS_DIRTY(RS_LINE_MODE) ||
2337 IS_DIRTY(IA_PRIMITIVE_TOPOLOGY) || (gfx->dirty & HK_DIRTY_PROVOKING),
2338 .cull_2 = varyings_dirty,
2339
2340 /* With a null FS, the fragment shader PPP word is ignored and doesn't
2341 * need to be present.
2342 */
2343 .fragment_shader = fs && (fs_dirty || linked_fs_dirty || varyings_dirty ||
2344 gfx->descriptors.root_dirty),
2345
2346 .occlusion_query = gfx->dirty & HK_DIRTY_OCCLUSION,
2347 .output_size = hw_vs_dirty,
2348 .viewport_count = 1, /* irrelevant */
2349 };
2350
2351 /* Calculate the update size. If it equals the header, there is nothing to
2352 * update so early-exit.
2353 */
2354 size_t size = agx_ppp_update_size(&dirty);
2355 if (size == AGX_PPP_HEADER_LENGTH)
2356 return;
2357
2358 /* Otherwise, allocate enough space for the update and push it. */
2359 assert(size > AGX_PPP_HEADER_LENGTH);
2360
2361 struct agx_ptr T = hk_pool_alloc(cmd, size, 64);
2362 if (!T.cpu)
2363 return;
2364
2365 struct agx_ppp_update ppp = agx_new_ppp_update(T, size, &dirty);
2366
2367 if (dirty.fragment_control) {
2368 agx_ppp_push(&ppp, FRAGMENT_CONTROL, cfg) {
2369 cfg.visibility_mode = gfx->occlusion.mode;
2370 cfg.stencil_test_enable = hk_stencil_test_enabled(cmd);
2371
2372 /* TODO: Consider optimizing this? */
2373 cfg.two_sided_stencil = cfg.stencil_test_enable;
2374
2375 cfg.depth_bias_enable = dyn->rs.depth_bias.enable &&
2376 gfx->object_type == AGX_OBJECT_TYPE_TRIANGLE;
2377
2378 /* Always enable scissoring so we may scissor to the viewport (TODO:
2379 * optimize this out if the viewport is the default and the app does
2380 * not use the scissor test)
2381 */
2382 cfg.scissor_enable = true;
2383
2384 /* This avoids broken derivatives along primitive edges */
2385 cfg.disable_tri_merging = gfx->object_type != AGX_OBJECT_TYPE_TRIANGLE;
2386 }
2387 }
2388
2389 if (dirty.fragment_control_2) {
2390 if (linked_fs) {
2391 /* Annoyingly, rasterizer_discard seems to be ignored (sometimes?) in the
2392 * main fragment control word and has to be combined into the secondary
2393 * word for reliable behaviour.
2394 */
2395 agx_ppp_push_merged(&ppp, FRAGMENT_CONTROL, cfg,
2396 linked_fs->b.fragment_control) {
2397
2398 cfg.tag_write_disable = dyn->rs.rasterizer_discard_enable;
2399 }
2400 } else {
2401 /* If there is no fragment shader, we must disable tag writes to avoid
2402 * executing the missing shader. This optimizes depth-only passes.
2403 */
2404 agx_ppp_push(&ppp, FRAGMENT_CONTROL, cfg) {
2405 cfg.tag_write_disable = true;
2406 cfg.pass_type = AGX_PASS_TYPE_OPAQUE;
2407 }
2408 }
2409 }
2410
2411 struct agx_fragment_face_packed fragment_face;
2412 struct agx_fragment_face_2_packed fragment_face_2;
2413
2414 if (dirty.fragment_front_face) {
2415 bool has_z = render->depth_att.vk_format != VK_FORMAT_UNDEFINED;
2416 bool z_test = has_z && dyn->ds.depth.test_enable;
2417
2418 agx_pack(&fragment_face, FRAGMENT_FACE, cfg) {
2419 cfg.line_width = agx_pack_line_width(dyn->rs.line.width);
2420 cfg.polygon_mode = translate_polygon_mode(dyn->rs.polygon_mode);
2421 cfg.disable_depth_write = !(z_test && dyn->ds.depth.write_enable);
2422
2423 if (z_test && !gfx->descriptors.root.draw.force_never_in_shader)
2424 cfg.depth_function = translate_compare_op(dyn->ds.depth.compare_op);
2425 else
2426 cfg.depth_function = AGX_ZS_FUNC_ALWAYS;
2427 };
2428
2429 agx_ppp_push_merged(&ppp, FRAGMENT_FACE, cfg, fragment_face) {
2430 cfg.stencil_reference = dyn->ds.stencil.front.reference;
2431 }
2432 }
2433
2434 if (dirty.fragment_front_face_2) {
2435 agx_pack(&fragment_face_2, FRAGMENT_FACE_2, cfg) {
2436 cfg.object_type = gfx->object_type;
2437
2438 /* TODO: flip the default? */
2439 if (fs)
2440 cfg.conservative_depth = 0;
2441 }
2442
2443 if (fs)
2444 agx_merge(fragment_face_2, fs->frag_face, FRAGMENT_FACE_2);
2445
2446 agx_ppp_push_packed(&ppp, &fragment_face_2, FRAGMENT_FACE_2);
2447 }
2448
2449 if (dirty.fragment_front_stencil) {
2450 hk_ppp_push_stencil_face(&ppp, dyn->ds.stencil.front,
2451 hk_stencil_test_enabled(cmd));
2452 }
2453
2454 if (dirty.fragment_back_face) {
2455 assert(dirty.fragment_front_face);
2456
2457 agx_ppp_push_merged(&ppp, FRAGMENT_FACE, cfg, fragment_face) {
2458 cfg.stencil_reference = dyn->ds.stencil.back.reference;
2459 }
2460 }
2461
2462 if (dirty.fragment_back_face_2) {
2463 assert(dirty.fragment_front_face_2);
2464
2465 agx_ppp_push_packed(&ppp, &fragment_face_2, FRAGMENT_FACE_2);
2466 }
2467
2468 if (dirty.fragment_back_stencil) {
2469 hk_ppp_push_stencil_face(&ppp, dyn->ds.stencil.back,
2470 hk_stencil_test_enabled(cmd));
2471 }
2472
2473 if (dirty.output_select) {
2474 struct agx_output_select_packed osel = hw_vs->info.uvs.osel;
2475
2476 if (linked_fs) {
2477 agx_ppp_push_merged_blobs(&ppp, AGX_OUTPUT_SELECT_LENGTH, &osel,
2478 &linked_fs->b.osel);
2479 } else {
2480 agx_ppp_push_packed(&ppp, &osel, OUTPUT_SELECT);
2481 }
2482 }
2483
2484 assert(dirty.varying_counts_32 == dirty.varying_counts_16);
2485
2486 if (dirty.varying_counts_32) {
2487 agx_ppp_push_packed(&ppp, &gfx->linked_varyings.counts_32,
2488 VARYING_COUNTS);
2489
2490 agx_ppp_push_packed(&ppp, &gfx->linked_varyings.counts_16,
2491 VARYING_COUNTS);
2492 }
2493
2494 if (dirty.cull) {
2495 agx_ppp_push(&ppp, CULL, cfg) {
2496 cfg.cull_front = dyn->rs.cull_mode & VK_CULL_MODE_FRONT_BIT;
2497 cfg.cull_back = dyn->rs.cull_mode & VK_CULL_MODE_BACK_BIT;
2498 cfg.front_face_ccw = dyn->rs.front_face != VK_FRONT_FACE_CLOCKWISE;
2499 cfg.flat_shading_vertex = translate_ppp_vertex(gfx->provoking);
2500 cfg.rasterizer_discard = dyn->rs.rasterizer_discard_enable;
2501
2502 /* We do not support unrestricted depth, so clamping is inverted from
2503 * clipping. This implementation seems to pass CTS without unrestricted
2504 * depth support.
2505 *
2506 * TODO: Make sure this is right with gl_FragDepth.
2507 */
2508 cfg.depth_clip = vk_rasterization_state_depth_clip_enable(&dyn->rs);
2509 cfg.depth_clamp = !cfg.depth_clip;
2510
2511 cfg.primitive_msaa =
2512 gfx->object_type == AGX_OBJECT_TYPE_LINE &&
2513 dyn->rs.line.mode == VK_LINE_RASTERIZATION_MODE_BRESENHAM_KHR;
2514 }
2515 }
2516
2517 if (dirty.cull_2) {
2518 agx_ppp_push(&ppp, CULL_2, cfg) {
2519 cfg.needs_primitive_id = gfx->generate_primitive_id;
2520 }
2521 }
2522
2523 if (dirty.fragment_shader) {
2524 /* TODO: Do less often? */
2525 hk_reserve_scratch(cmd, cs, fs);
2526
2527 agx_ppp_push_packed(&ppp, &linked_fs->fs_counts, FRAGMENT_SHADER_WORD_0);
2528
2529 agx_ppp_push(&ppp, FRAGMENT_SHADER_WORD_1, cfg) {
2530 cfg.pipeline = hk_upload_usc_words(cmd, fs, linked_fs);
2531 }
2532
2533 agx_ppp_push(&ppp, FRAGMENT_SHADER_WORD_2, cfg) {
2534 cfg.cf_bindings = gfx->varyings;
2535 }
2536
2537 agx_ppp_push(&ppp, FRAGMENT_SHADER_WORD_3, cfg)
2538 ;
2539 }
2540
2541 if (dirty.occlusion_query) {
2542 agx_ppp_push(&ppp, FRAGMENT_OCCLUSION_QUERY, cfg) {
2543 cfg.index = gfx->occlusion.index;
2544 }
2545 }
2546
2547 if (dirty.output_size) {
2548 agx_ppp_push(&ppp, OUTPUT_SIZE, cfg) {
2549 cfg.count = hw_vs->info.uvs.size;
2550 }
2551 }
2552
2553 agx_ppp_fini(out, &ppp);
2554 }
2555
2556 static void
2557 hk_flush_dynamic_state(struct hk_cmd_buffer *cmd, struct hk_cs *cs,
2558 uint32_t draw_id, struct hk_draw draw)
2559 {
2560 struct hk_device *dev = hk_cmd_buffer_device(cmd);
2561 const struct hk_rendering_state *render = &cmd->state.gfx.render;
2562 struct vk_dynamic_graphics_state *dyn = &cmd->vk.dynamic_graphics_state;
2563
2564 struct hk_graphics_state *gfx = &cmd->state.gfx;
2565
2566 struct hk_shader *hw_vs = hk_bound_hw_vs(gfx);
2567 struct hk_shader *sw_vs = hk_bound_sw_vs(gfx);
2568
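   /* Skip the flush entirely when nothing relevant changed. Shaders that read
    * the draw ID or base vertex/instance parameters always need fresh
    * uniforms, so they bypass the early-out.
    */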
2569 if (!vk_dynamic_graphics_state_any_dirty(dyn) &&
2570 !(gfx->dirty & ~HK_DIRTY_INDEX) && !gfx->descriptors.root_dirty &&
2571 !gfx->shaders_dirty && !sw_vs->b.info.uses_draw_id &&
2572 !sw_vs->b.info.uses_base_param &&
2573 !(gfx->linked[MESA_SHADER_VERTEX] &&
2574 gfx->linked[MESA_SHADER_VERTEX]->b.uses_base_param))
2575 return;
2576
2577 struct hk_descriptor_state *desc = &cmd->state.gfx.descriptors;
2578
2579 assert(cs->current + 0x1000 < cs->end && "already ensured space");
2580 uint8_t *out = cs->current;
2581
2582 struct hk_shader *fs = hk_only_variant(gfx->shaders[MESA_SHADER_FRAGMENT]);
2583
2584 bool gt_dirty = IS_SHADER_DIRTY(TESS_CTRL) || IS_SHADER_DIRTY(TESS_EVAL) ||
2585 IS_SHADER_DIRTY(GEOMETRY);
2586 bool vgt_dirty = IS_SHADER_DIRTY(VERTEX) || gt_dirty;
2587 bool fs_dirty = IS_SHADER_DIRTY(FRAGMENT);
2588
2589 if (IS_DIRTY(CB_BLEND_CONSTANTS)) {
2590 static_assert(sizeof(desc->root.draw.blend_constant) ==
2591 sizeof(dyn->cb.blend_constants) &&
2592 "common size");
2593
2594 memcpy(desc->root.draw.blend_constant, dyn->cb.blend_constants,
2595 sizeof(dyn->cb.blend_constants));
2596 desc->root_dirty = true;
2597 }
2598
2599 if (IS_DIRTY(MS_SAMPLE_MASK)) {
2600 desc->root.draw.api_sample_mask = dyn->ms.sample_mask;
2601 desc->root_dirty = true;
2602 }
2603
2604 if (fs_dirty || IS_DIRTY(DS_DEPTH_TEST_ENABLE) ||
2605 IS_DIRTY(DS_DEPTH_COMPARE_OP)) {
2606
2607 const struct hk_rendering_state *render = &cmd->state.gfx.render;
2608 bool has_z = render->depth_att.vk_format != VK_FORMAT_UNDEFINED;
2609 bool z_test = has_z && dyn->ds.depth.test_enable;
2610
2611 desc->root.draw.force_never_in_shader =
2612 z_test && dyn->ds.depth.compare_op == VK_COMPARE_OP_NEVER && fs &&
2613 fs->info.fs.writes_memory;
2614
2615 desc->root_dirty = true;
2616 }
2617
2618 /* The main shader must not run tests if the epilog will. */
2619 bool nontrivial_force_early =
2620 fs && (fs->b.info.early_fragment_tests &&
2621 (fs->b.info.writes_sample_mask || fs->info.fs.writes_memory));
2622
2623 bool epilog_discards = dyn->ms.alpha_to_coverage_enable ||
2624 (fs && (fs->info.fs.epilog_key.write_z ||
2625 fs->info.fs.epilog_key.write_s));
2626 epilog_discards &= !nontrivial_force_early;
2627
2628 if (fs_dirty || IS_DIRTY(MS_ALPHA_TO_COVERAGE_ENABLE)) {
2629 desc->root.draw.no_epilog_discard = !epilog_discards ? ~0 : 0;
2630 desc->root_dirty = true;
2631 }
2632
2633 if (IS_DIRTY(VI) || IS_DIRTY(VI_BINDINGS_VALID) ||
2634 IS_DIRTY(VI_BINDING_STRIDES) || vgt_dirty || true /* TODO */) {
2635
2636 struct hk_fast_link_key_vs key = {
2637 .prolog.hw = (sw_vs == hw_vs),
2638
2639 /* FIXME: handle pipeline robustness "properly" */
2640 .prolog.robustness.level =
2641 (dev->vk.enabled_features.robustBufferAccess2 ||
2642 dev->vk.enabled_features.pipelineRobustness)
2643 ? AGX_ROBUSTNESS_D3D
2644 : AGX_ROBUSTNESS_GL,
2645
2646 .prolog.robustness.soft_fault = agx_has_soft_fault(&dev->dev),
2647 };
2648
2649 if (!key.prolog.hw) {
2650 key.prolog.sw_index_size_B =
2651 draw.indexed ? agx_index_size_to_B(draw.index_size) : 0;
2652 }
2653
2654 static_assert(sizeof(key.prolog.component_mask) ==
2655 sizeof(sw_vs->info.vs.attrib_components_read));
2656 BITSET_COPY(key.prolog.component_mask,
2657 sw_vs->info.vs.attrib_components_read);
2658
2659 u_foreach_bit(a, dyn->vi->attributes_valid) {
2660 struct vk_vertex_attribute_state attr = dyn->vi->attributes[a];
2661
2662 assert(dyn->vi->bindings_valid & BITFIELD_BIT(attr.binding));
2663 struct vk_vertex_binding_state binding =
2664 dyn->vi->bindings[attr.binding];
2665
2666 /* nir_assign_io_var_locations compacts vertex inputs, eliminating
2667 * unused inputs. We need to do the same here to match the locations.
2668 */
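         /* For example, if only attributes 0 and 3 are read, attribute 3 is
          * assigned slot 1.
          */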
2669 unsigned slot =
2670 util_bitcount64(sw_vs->info.vs.attribs_read & BITFIELD_MASK(a));
2671
2672 key.prolog.attribs[slot] = (struct agx_velem_key){
2673 .format = vk_format_to_pipe_format(attr.format),
2674 .stride = dyn->vi_binding_strides[attr.binding],
2675 .divisor = binding.divisor,
2676 .instanced = binding.input_rate == VK_VERTEX_INPUT_RATE_INSTANCE,
2677 };
2678 }
2679
2680 hk_update_fast_linked(cmd, sw_vs, &key);
2681 }
2682
2683 if (IS_DIRTY(VI) || IS_DIRTY(VI_BINDINGS_VALID) || vgt_dirty ||
2684 (gfx->dirty & HK_DIRTY_VB)) {
2685
2686 uint64_t sink = dev->rodata.zero_sink;
2687
2688 unsigned slot = 0;
2689 u_foreach_bit(a, sw_vs->info.vs.attribs_read) {
2690 if (dyn->vi->attributes_valid & BITFIELD_BIT(a)) {
2691 struct vk_vertex_attribute_state attr = dyn->vi->attributes[a];
2692 struct hk_addr_range vb = gfx->vb[attr.binding];
2693
2694 desc->root.draw.attrib_clamps[slot] = agx_calculate_vbo_clamp(
2695 vb.addr, sink, vk_format_to_pipe_format(attr.format), vb.range,
2696 dyn->vi_binding_strides[attr.binding], attr.offset,
2697 &desc->root.draw.attrib_base[slot]);
2698 } else {
2699 desc->root.draw.attrib_base[slot] = sink;
2700 desc->root.draw.attrib_clamps[slot] = 0;
2701 }
2702
2703 ++slot;
2704 }
2705
2706 desc->root_dirty = true;
2707 }
2708
2709 if (vgt_dirty || IS_SHADER_DIRTY(FRAGMENT) ||
2710 IS_DIRTY(MS_RASTERIZATION_SAMPLES) || IS_DIRTY(MS_SAMPLE_MASK) ||
2711 IS_DIRTY(MS_ALPHA_TO_COVERAGE_ENABLE) ||
2712 IS_DIRTY(MS_ALPHA_TO_ONE_ENABLE) || IS_DIRTY(CB_LOGIC_OP) ||
2713 IS_DIRTY(CB_LOGIC_OP_ENABLE) || IS_DIRTY(CB_WRITE_MASKS) ||
2714 IS_DIRTY(CB_COLOR_WRITE_ENABLES) || IS_DIRTY(CB_ATTACHMENT_COUNT) ||
2715 IS_DIRTY(CB_BLEND_ENABLES) || IS_DIRTY(CB_BLEND_EQUATIONS) ||
2716 IS_DIRTY(CB_BLEND_CONSTANTS) ||
2717 desc->root_dirty /* for pipeline stats */ || true) {
2718
2719 if (fs) {
2720 unsigned samples_shaded = 0;
2721 if (fs->info.fs.epilog_key.sample_shading)
2722 samples_shaded = dyn->ms.rasterization_samples;
2723
2724 unsigned tib_sample_mask =
2725 BITFIELD_MASK(dyn->ms.rasterization_samples);
2726 unsigned api_sample_mask = dyn->ms.sample_mask & tib_sample_mask;
2727 bool has_sample_mask = api_sample_mask != tib_sample_mask;
2728
2729 struct hk_fast_link_key_fs key = {
2730 .prolog.statistics = hk_pipeline_stat_addr(
2731 cmd,
2732 VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT),
2733
2734 .prolog.cull_distance_size =
2735 hw_vs->info.vs.cull_distance_array_size,
2736 .prolog.api_sample_mask = has_sample_mask ? api_sample_mask : 0xff,
2737 .nr_samples_shaded = samples_shaded,
2738 };
2739
2740 bool prolog_discards =
2741 has_sample_mask || key.prolog.cull_distance_size;
2742
2743 bool needs_prolog = key.prolog.statistics || prolog_discards;
2744
2745 if (needs_prolog) {
2746 /* With late main shader tests, the prolog runs tests if neither the
2747 * main shader nor epilog will.
2748 *
2749 * With (nontrivial) early main shader tests, the prolog does not
2750 * run tests, the tests will run at the start of the main shader.
2751 * This ensures tests are after API sample mask and cull distance
2752 * discards.
2753 */
2754 key.prolog.run_zs_tests = !nontrivial_force_early &&
2755 !fs->b.info.writes_sample_mask &&
2756 !epilog_discards && prolog_discards;
2757
2758 if (key.prolog.cull_distance_size) {
2759 key.prolog.cf_base = fs->b.info.varyings.fs.nr_cf;
2760 }
2761 }
2762
2763 key.epilog = (struct agx_fs_epilog_key){
2764 .link = fs->info.fs.epilog_key,
2765 .nr_samples = MAX2(dyn->ms.rasterization_samples, 1),
2766 .blend.alpha_to_coverage = dyn->ms.alpha_to_coverage_enable,
2767 .blend.alpha_to_one = dyn->ms.alpha_to_one_enable,
2768 .blend.logicop_func = dyn->cb.logic_op_enable
2769 ? vk_logic_op_to_pipe(dyn->cb.logic_op)
2770 : PIPE_LOGICOP_COPY,
2771 };
2772
2773 key.epilog.link.already_ran_zs |= nontrivial_force_early;
2774
2775 struct hk_rendering_state *render = &cmd->state.gfx.render;
2776 for (uint32_t i = 0; i < render->color_att_count; i++) {
2777 key.epilog.rt_formats[i] =
2778 vk_format_to_pipe_format(render->color_att[i].vk_format);
2779
2780 const struct vk_color_blend_attachment_state *cb =
2781 &dyn->cb.attachments[i];
2782
2783 bool write_enable = dyn->cb.color_write_enables & BITFIELD_BIT(i);
2784 unsigned write_mask = write_enable ? cb->write_mask : 0;
2785
2786 /* nir_lower_blend always blends, so use a default blend state when
2787 * blending is disabled at an API level.
2788 */
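            /* The default is ADD with ONE/ZERO factors, i.e. src * 1 + dst * 0,
             * which passes the source color through unmodified.
             */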
2789 if (!dyn->cb.attachments[i].blend_enable) {
2790 key.epilog.blend.rt[i] = (struct agx_blend_rt_key){
2791 .colormask = write_mask,
2792 .rgb_func = PIPE_BLEND_ADD,
2793 .alpha_func = PIPE_BLEND_ADD,
2794 .rgb_src_factor = PIPE_BLENDFACTOR_ONE,
2795 .alpha_src_factor = PIPE_BLENDFACTOR_ONE,
2796 .rgb_dst_factor = PIPE_BLENDFACTOR_ZERO,
2797 .alpha_dst_factor = PIPE_BLENDFACTOR_ZERO,
2798 };
2799 } else {
2800 key.epilog.blend.rt[i] = (struct agx_blend_rt_key){
2801 .colormask = write_mask,
2802
2803 .rgb_src_factor =
2804 vk_blend_factor_to_pipe(cb->src_color_blend_factor),
2805
2806 .rgb_dst_factor =
2807 vk_blend_factor_to_pipe(cb->dst_color_blend_factor),
2808
2809 .rgb_func = vk_blend_op_to_pipe(cb->color_blend_op),
2810
2811 .alpha_src_factor =
2812 vk_blend_factor_to_pipe(cb->src_alpha_blend_factor),
2813
2814 .alpha_dst_factor =
2815 vk_blend_factor_to_pipe(cb->dst_alpha_blend_factor),
2816
2817 .alpha_func = vk_blend_op_to_pipe(cb->alpha_blend_op),
2818 };
2819 }
2820 }
2821
2822 hk_update_fast_linked(cmd, fs, &key);
2823 } else {
2824 /* TODO: prolog without fs needs to work too... */
2825 if (cmd->state.gfx.linked[MESA_SHADER_FRAGMENT] != NULL) {
2826 cmd->state.gfx.linked_dirty |= BITFIELD_BIT(MESA_SHADER_FRAGMENT);
2827 cmd->state.gfx.linked[MESA_SHADER_FRAGMENT] = NULL;
2828 }
2829 }
2830 }
2831
2832 /* If the vertex shader uses draw parameters, vertex uniforms are dirty every
2833 * draw. Fragment uniforms are unaffected.
2834 *
2835  * For a direct draw, we upload the draw parameters as if the draw were
2836  * indirect, to avoid keying the shaders on indirectness.
2837 */
2838 if (gfx->linked[MESA_SHADER_VERTEX]->b.uses_base_param) {
2839 if (draw.b.indirect) {
2840 gfx->draw_params = draw.b.ptr;
2841
2842 if (draw.indexed) {
2843 gfx->draw_params +=
2844 offsetof(VkDrawIndexedIndirectCommand, vertexOffset);
2845 } else {
2846 gfx->draw_params += offsetof(VkDrawIndirectCommand, firstVertex);
2847 }
2848 } else {
2849 uint32_t params[] = {
2850 draw.indexed ? draw.index_bias : draw.start,
2851 draw.start_instance,
2852 };
2853
2854 gfx->draw_params = hk_pool_upload(cmd, params, sizeof(params), 4);
2855 }
2856 } else {
2857 gfx->draw_params = 0;
2858 }
2859
2860 if (sw_vs->b.info.uses_draw_id) {
2861 /* TODO: rodata? */
2862 gfx->draw_id_ptr = hk_pool_upload(cmd, &draw_id, 2, 4);
2863 } else {
2864 gfx->draw_id_ptr = 0;
2865 }
2866
2867 if (IS_DIRTY(IA_PRIMITIVE_TOPOLOGY) || gt_dirty) {
2868 enum mesa_prim prim = hk_rast_prim(cmd);
2869
2870 gfx->topology = translate_hw_primitive_topology(prim);
2871 gfx->object_type = translate_object_type(prim);
2872 }
2873
2874 if (IS_DIRTY(IA_PRIMITIVE_TOPOLOGY) || IS_DIRTY(RS_PROVOKING_VERTEX)) {
2875 unsigned provoking;
2876 if (dyn->rs.provoking_vertex == VK_PROVOKING_VERTEX_MODE_LAST_VERTEX_EXT)
2877 provoking = 2;
2878 else if (gfx->topology == AGX_PRIMITIVE_TRIANGLE_FAN)
2879 provoking = 1;
2880 else
2881 provoking = 0;
2882
2883 if (provoking != gfx->provoking) {
2884 gfx->provoking = provoking;
2885 gfx->dirty |= HK_DIRTY_PROVOKING;
2886
2887 gfx->descriptors.root.draw.provoking = provoking;
2888 gfx->descriptors.root_dirty = true;
2889 }
2890 }
2891
2892 /* With attachmentless rendering, we don't know the sample count until draw
2893 * time, so we do a late tilebuffer fix up. But with rasterizer discard,
2894 * rasterization_samples might be 0.
2895 */
2896 if (dyn->ms.rasterization_samples &&
2897 gfx->render.tilebuffer.nr_samples != dyn->ms.rasterization_samples) {
2898
2899 assert(gfx->render.tilebuffer.nr_samples == 0);
2900
2901 unsigned nr_samples = MAX2(dyn->ms.rasterization_samples, 1);
2902 gfx->render.tilebuffer.nr_samples = nr_samples;
2903 agx_tilebuffer_pack_usc(&gfx->render.tilebuffer);
2904 cs->tib = gfx->render.tilebuffer;
2905 }
2906
2907 if (IS_DIRTY(MS_SAMPLE_LOCATIONS) || IS_DIRTY(MS_SAMPLE_LOCATIONS_ENABLE) ||
2908 IS_DIRTY(MS_RASTERIZATION_SAMPLES)) {
2909
2910 uint32_t ctrl;
2911 if (dyn->ms.sample_locations_enable) {
2912 ctrl = hk_pack_ppp_multisamplectrl(dyn->ms.sample_locations);
2913 } else {
2914 ctrl = hk_default_sample_positions(dyn->ms.rasterization_samples);
2915 }
2916
2917 bool dont_commit = cmd->in_meta || dyn->ms.rasterization_samples == 0;
2918
2919 if (!cs->has_sample_locations) {
2920 cs->ppp_multisamplectl = ctrl;
2921
2922 /* If we're in vk_meta, do not commit to the sample locations yet.
2923 * vk_meta doesn't care, but the app will!
2924 */
2925 cs->has_sample_locations |= !dont_commit;
2926 } else {
2927 assert(dont_commit || cs->ppp_multisamplectl == ctrl);
2928 }
2929
2930 gfx->descriptors.root.draw.ppp_multisamplectl = ctrl;
2931 gfx->descriptors.root_dirty = true;
2932 }
2933
2934 /* Link varyings before uploading tessellation state, because the
2935 * gfx->generate_primitive_id boolean needs to be plumbed.
2936 */
2937 struct hk_linked_shader *linked_vs = gfx->linked[MESA_SHADER_VERTEX];
2938 struct hk_linked_shader *linked_fs = gfx->linked[MESA_SHADER_FRAGMENT];
2939 bool linked_vs_dirty = IS_LINKED_DIRTY(VERTEX);
2940 bool linked_fs_dirty = IS_LINKED_DIRTY(FRAGMENT);
2941
2942 if ((gfx->dirty & HK_DIRTY_PROVOKING) || vgt_dirty || linked_fs_dirty) {
2943 unsigned bindings = linked_fs ? linked_fs->b.cf.nr_bindings : 0;
2944 if (bindings) {
2945 size_t linkage_size =
2946 AGX_CF_BINDING_HEADER_LENGTH + (bindings * AGX_CF_BINDING_LENGTH);
2947
2948 struct agx_ptr t = hk_pool_usc_alloc(cmd, linkage_size, 16);
2949 if (!t.cpu)
2950 return;
2951
2952 agx_link_varyings_vs_fs(
2953 t.cpu, &gfx->linked_varyings, hw_vs->info.uvs.user_size,
2954 &linked_fs->b.cf, gfx->provoking, 0, &gfx->generate_primitive_id);
2955
2956 gfx->varyings = agx_usc_addr(&dev->dev, t.gpu);
2957 } else {
2958 gfx->varyings = 0;
2959 }
2960
2961 gfx->dirty |= HK_DIRTY_VARYINGS;
2962 }
2963
2964 if (gfx->shaders[MESA_SHADER_TESS_EVAL] ||
2965 gfx->shaders[MESA_SHADER_GEOMETRY]) {
2966
2967 struct hk_shader *vs = hk_bound_sw_vs(gfx);
2968 desc->root.draw.vertex_outputs = vs->b.info.outputs;
2969
2970 /* XXX: We should deduplicate this logic */
2971 bool restart = (draw.indexed && draw.restart);
2972 bool indirect = draw.b.indirect || restart;
2973
2974 desc->root.draw.input_assembly =
2975 indirect ? hk_pool_alloc(cmd, sizeof(struct agx_ia_state), 4).gpu
2976 : hk_upload_ia_params(cmd, draw);
2977
2978 if (!indirect) {
2979 uint32_t verts = draw.b.count[0], instances = draw.b.count[1];
2980 unsigned vb_size =
2981 libagx_tcs_in_size(verts * instances, vs->b.info.outputs);
2982
2983 /* Allocate if there are any outputs, or use the null sink to trap
2984 * reads if there aren't. Those reads are undefined but should not
2985 * fault. Affects:
2986 *
2987 * dEQP-VK.pipeline.monolithic.no_position.explicit_declarations.basic.single_view.v0_g1
2988 */
2989 desc->root.draw.vertex_output_buffer =
2990 vb_size ? hk_pool_alloc(cmd, vb_size, 4).gpu
2991 : dev->rodata.null_sink;
2992 }
2993 }
2994
2995 if (gfx->shaders[MESA_SHADER_TESS_EVAL]) {
2996 gfx->descriptors.root.draw.tess_params = hk_upload_tess_params(cmd, draw);
2997 gfx->descriptors.root_dirty = true;
2998 }
2999
3000 if (gfx->shaders[MESA_SHADER_GEOMETRY]) {
3001 /* TODO: size */
3002 cmd->geom_indirect = hk_pool_alloc(cmd, 64, 4).gpu;
3003
3004 gfx->descriptors.root.draw.geometry_params =
3005 hk_upload_geometry_params(cmd, draw);
3006
3007 gfx->descriptors.root_dirty = true;
3008 }
3009
3010 /* Root must be uploaded after the above, which touch the root */
3011 if (gfx->descriptors.root_dirty) {
3012 gfx->root =
3013 hk_cmd_buffer_upload_root(cmd, VK_PIPELINE_BIND_POINT_GRAPHICS);
3014 }
3015
3016 /* Hardware dynamic state must be deferred until after the root and fast
3017 * linking, since it will use the root address and the linked shaders.
3018 */
3019 if ((gfx->dirty & (HK_DIRTY_PROVOKING | HK_DIRTY_VARYINGS)) ||
3020 IS_DIRTY(RS_RASTERIZER_DISCARD_ENABLE) || linked_vs_dirty || vgt_dirty ||
3021 gfx->descriptors.root_dirty || gfx->draw_id_ptr || gfx->draw_params) {
3022
3023 /* TODO: Do less often? */
3024 hk_reserve_scratch(cmd, cs, hw_vs);
3025
3026 agx_push(out, VDM_STATE, cfg) {
3027 cfg.vertex_shader_word_0_present = true;
3028 cfg.vertex_shader_word_1_present = true;
3029 cfg.vertex_outputs_present = true;
3030 cfg.vertex_unknown_present = true;
3031 }
3032
3033 agx_push_packed(out, hw_vs->counts, VDM_STATE_VERTEX_SHADER_WORD_0);
3034
3035 struct hk_linked_shader *linked_hw_vs =
3036 (hw_vs == sw_vs) ? linked_vs : hw_vs->only_linked;
3037
3038 agx_push(out, VDM_STATE_VERTEX_SHADER_WORD_1, cfg) {
3039 cfg.pipeline = hk_upload_usc_words(cmd, hw_vs, linked_hw_vs);
3040 }
3041
3042 agx_push_packed(out, hw_vs->info.uvs.vdm, VDM_STATE_VERTEX_OUTPUTS);
3043
3044 agx_push(out, VDM_STATE_VERTEX_UNKNOWN, cfg) {
3045 cfg.flat_shading_control = translate_vdm_vertex(gfx->provoking);
3046 cfg.unknown_4 = cfg.unknown_5 = dyn->rs.rasterizer_discard_enable;
3047 cfg.generate_primitive_id = gfx->generate_primitive_id;
3048 }
3049
3050 /* Pad up to a multiple of 8 bytes */
3051 memset(out, 0, 4);
3052 out += 4;
3053 }
3054
3055 if (IS_DIRTY(RS_DEPTH_BIAS_FACTORS)) {
3056 void *ptr =
3057 util_dynarray_grow_bytes(&cs->depth_bias, 1, AGX_DEPTH_BIAS_LENGTH);
3058
3059 agx_pack(ptr, DEPTH_BIAS, cfg) {
3060 cfg.depth_bias = dyn->rs.depth_bias.constant;
3061 cfg.slope_scale = dyn->rs.depth_bias.slope;
3062 cfg.clamp = dyn->rs.depth_bias.clamp;
3063
3064 /* Value from the PowerVR driver. */
3065 if (render->depth_att.vk_format == VK_FORMAT_D16_UNORM) {
3066 cfg.depth_bias /= (1 << 15);
3067 }
3068 }
3069 }
3070
3071 /* Hardware viewport/scissor state is entangled with depth bias. */
3072 if (IS_DIRTY(RS_DEPTH_BIAS_FACTORS) || IS_DIRTY(VP_SCISSORS) ||
3073 IS_DIRTY(VP_SCISSOR_COUNT) || IS_DIRTY(VP_VIEWPORTS) ||
3074 IS_DIRTY(VP_VIEWPORT_COUNT) ||
3075 IS_DIRTY(VP_DEPTH_CLIP_NEGATIVE_ONE_TO_ONE) ||
3076 IS_DIRTY(RS_DEPTH_CLIP_ENABLE) || IS_DIRTY(RS_DEPTH_CLAMP_ENABLE)) {
3077
3078 hk_flush_vp_state(cmd, cs, &out);
3079 }
3080
3081 hk_flush_ppp_state(cmd, cs, &out);
3082 cs->current = out;
3083
3084 vk_dynamic_graphics_state_clear_dirty(dyn);
3085 gfx->shaders_dirty = 0;
3086 gfx->linked_dirty = 0;
3087 gfx->dirty = 0;
3088 gfx->descriptors.root_dirty = false;
3089 }
3090
3091 static bool
3092 hk_needs_index_robustness(struct hk_cmd_buffer *cmd, struct hk_draw draw)
3093 {
3094 struct hk_graphics_state *gfx = &cmd->state.gfx;
3095 struct hk_device *dev = hk_cmd_buffer_device(cmd);
3096
3097 if (!draw.indexed)
3098 return false;
3099
3100 /* If tessellation is used, we'll go through the robust path anyway, so don't
3101  * end up with a tess+geom doom combo.
3102 */
3103 if (gfx->shaders[MESA_SHADER_TESS_EVAL])
3104 return false;
3105
3106 /* Zero-size index buffers are allowed with maint6 even without robustness features enabled */
3107 if (draw.index.range == 0)
3108 return true;
3109
3110 if (!(dev->vk.enabled_features.robustBufferAccess ||
3111 dev->vk.enabled_features.robustBufferAccess2 ||
3112 dev->vk.enabled_features.pipelineRobustness))
3113 return false;
3114
3115 if (draw.b.indirect) {
3116 return true;
3117 } else {
3118 uint32_t range_B =
3119 (draw.start + draw.b.count[0]) * agx_index_size_to_B(draw.index_size);
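      /* For example, a direct draw of 100 16-bit indices starting at index 10
       * reads (10 + 100) * 2 = 220 bytes, so the robust path is needed only if
       * fewer than 220 bytes of index buffer are bound.
       */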
3120
3121 return range_B > draw.index.range;
3122 }
3123 }
3124
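/* Bind or unbind an internal passthrough geometry shader as needed to emulate
 * transform feedback, adjacency primitives, robust index fetch, and the
 * input-assembly pipeline statistics.
 */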
3125 static void
3126 hk_handle_passthrough_gs(struct hk_cmd_buffer *cmd, struct hk_draw draw)
3127 {
3128 struct hk_graphics_state *gfx = &cmd->state.gfx;
3129 struct hk_api_shader *gs = gfx->shaders[MESA_SHADER_GEOMETRY];
3130
3131 /* If there's an application geometry shader, there's nothing to un/bind */
3132 if (gs && !gs->is_passthrough)
3133 return;
3134
3135 /* Determine if we need a geometry shader to emulate XFB or adjacency */
3136 struct vk_dynamic_graphics_state *dyn = &cmd->vk.dynamic_graphics_state;
3137 struct hk_shader *last_sw = hk_bound_sw_vs_before_gs(gfx);
3138 uint32_t xfb_outputs = last_sw->info.xfb_info.output_count;
3139
3140 VkPrimitiveTopology topology = dyn->ia.primitive_topology;
3141 bool adjacency =
3142 (topology == VK_PRIMITIVE_TOPOLOGY_LINE_LIST_WITH_ADJACENCY) ||
3143 (topology == VK_PRIMITIVE_TOPOLOGY_LINE_STRIP_WITH_ADJACENCY) ||
3144 (topology == VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST_WITH_ADJACENCY) ||
3145 (topology == VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP_WITH_ADJACENCY);
3146
3147 /* TODO: Don't use a whole GS just for index robustness. */
3148 bool index_robustness = hk_needs_index_robustness(cmd, draw);
3149
3150 bool needs_gs = xfb_outputs || adjacency || index_robustness;
3151
3152 /* Various pipeline statistics are implemented in the pre-GS shader. TODO:
3153 * This could easily be optimized.
3154 */
3155 VkQueryPipelineStatisticFlagBits ia_statistics[] = {
3156 VK_QUERY_PIPELINE_STATISTIC_INPUT_ASSEMBLY_PRIMITIVES_BIT,
3157 VK_QUERY_PIPELINE_STATISTIC_CLIPPING_PRIMITIVES_BIT,
3158 VK_QUERY_PIPELINE_STATISTIC_CLIPPING_INVOCATIONS_BIT,
3159 };
3160
3161 bool ia_stats = false;
3162
3163 for (unsigned i = 0; i < ARRAY_SIZE(ia_statistics); ++i) {
3164 ia_stats |= hk_pipeline_stat_addr(cmd, ia_statistics[i]) != 0;
3165 }
3166
3167 needs_gs |= ia_stats;
3168
3169 /* If we already have a matching GS configuration, we're done */
3170 if ((gs != NULL) == needs_gs)
3171 return;
3172
3173 /* If we don't need a GS but we do have a passthrough, unbind it */
3174 if (gs) {
3175 assert(!needs_gs && gs->is_passthrough);
3176 hk_cmd_bind_graphics_shader(cmd, MESA_SHADER_GEOMETRY, NULL);
3177 return;
3178 }
3179
3180 /* Otherwise, we need to bind a passthrough GS */
3181 size_t key_size =
3182 sizeof(struct hk_passthrough_gs_key) + nir_xfb_info_size(xfb_outputs);
3183 struct hk_passthrough_gs_key *key = alloca(key_size);
3184
3185 *key = (struct hk_passthrough_gs_key){
3186 .prim = u_decomposed_prim(hk_gs_in_prim(cmd)),
3187 .outputs = last_sw->b.info.outputs,
3188 .clip_distance_array_size = last_sw->info.clip_distance_array_size,
3189 .cull_distance_array_size = last_sw->info.cull_distance_array_size,
3190 };
3191
3192 if (xfb_outputs) {
3193 typed_memcpy(key->xfb_stride, last_sw->info.xfb_stride,
3194 ARRAY_SIZE(key->xfb_stride));
3195
3196 memcpy(&key->xfb_info, &last_sw->info.xfb_info,
3197 nir_xfb_info_size(xfb_outputs));
3198 }
3199
3200 struct hk_device *dev = hk_cmd_buffer_device(cmd);
3201 perf_debug(dev, "Binding passthrough GS for%s%s%s%s\n",
3202 xfb_outputs ? " XFB" : "", adjacency ? " adjacency" : "",
3203 index_robustness ? " robustness" : "",
3204 ia_stats ? " statistics" : "");
3205
3206 gs = hk_meta_shader(dev, hk_nir_passthrough_gs, key, key_size);
3207 gs->is_passthrough = true;
3208 hk_cmd_bind_graphics_shader(cmd, MESA_SHADER_GEOMETRY, gs);
3209 }
3210
3211 static struct hk_cs *
3212 hk_flush_gfx_state(struct hk_cmd_buffer *cmd, uint32_t draw_id,
3213 struct hk_draw draw)
3214 {
3215 struct hk_cs *cs = hk_cmd_buffer_get_cs(cmd, false /* compute */);
3216 if (!cs)
3217 return NULL;
3218
3219 hk_ensure_cs_has_space(cmd, cs, 0x2000 /* TODO */);
3220
3221 struct hk_graphics_state *gfx = &cmd->state.gfx;
3222 struct hk_descriptor_state *desc = &gfx->descriptors;
3223 struct hk_device *dev = hk_cmd_buffer_device(cmd);
3224
3225 #ifndef NDEBUG
3226 if (unlikely(dev->dev.debug & AGX_DBG_DIRTY)) {
3227 hk_cmd_buffer_dirty_all(cmd);
3228 }
3229 #endif
3230
3231 /* TODO: Try to reduce draw overhead of this */
3232 hk_handle_passthrough_gs(cmd, draw);
3233
3234 hk_flush_shaders(cmd);
3235
3236 if (desc->push_dirty)
3237 hk_cmd_buffer_flush_push_descriptors(cmd, desc);
3238
3239 if ((gfx->dirty & HK_DIRTY_INDEX) &&
3240 (draw.restart || gfx->shaders[MESA_SHADER_GEOMETRY]))
3241 hk_flush_index(cmd, cs);
3242
3243 hk_flush_dynamic_state(cmd, cs, draw_id, draw);
3244 return cs;
3245 }
3246
3247 VKAPI_ATTR void VKAPI_CALL
3248 hk_CmdBindIndexBuffer2KHR(VkCommandBuffer commandBuffer, VkBuffer _buffer,
3249 VkDeviceSize offset, VkDeviceSize size,
3250 VkIndexType indexType)
3251 {
3252 VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer);
3253 VK_FROM_HANDLE(hk_buffer, buffer, _buffer);
3254
3255 cmd->state.gfx.index = (struct hk_index_buffer_state){
3256 .buffer = hk_buffer_addr_range(buffer, offset, size),
3257 .size = agx_translate_index_size(vk_index_type_to_bytes(indexType)),
3258 .restart = vk_index_to_restart(indexType),
3259 };
3260
3261 /* TODO: check if necessary, blob does this */
3262 cmd->state.gfx.index.buffer.range =
3263 align(cmd->state.gfx.index.buffer.range, 4);
3264
3265 cmd->state.gfx.dirty |= HK_DIRTY_INDEX;
3266 }
3267
3268 void
3269 hk_cmd_bind_vertex_buffer(struct hk_cmd_buffer *cmd, uint32_t vb_idx,
3270 struct hk_addr_range addr_range)
3271 {
3272 cmd->state.gfx.vb[vb_idx] = addr_range;
3273 cmd->state.gfx.dirty |= HK_DIRTY_VB;
3274 }
3275
3276 VKAPI_ATTR void VKAPI_CALL
3277 hk_CmdBindVertexBuffers2(VkCommandBuffer commandBuffer, uint32_t firstBinding,
3278 uint32_t bindingCount, const VkBuffer *pBuffers,
3279 const VkDeviceSize *pOffsets,
3280 const VkDeviceSize *pSizes,
3281 const VkDeviceSize *pStrides)
3282 {
3283 VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer);
3284
3285 if (pStrides) {
3286 vk_cmd_set_vertex_binding_strides(&cmd->vk, firstBinding, bindingCount,
3287 pStrides);
3288 }
3289
3290 for (uint32_t i = 0; i < bindingCount; i++) {
3291 VK_FROM_HANDLE(hk_buffer, buffer, pBuffers[i]);
3292 uint32_t idx = firstBinding + i;
3293
3294 uint64_t size = pSizes ? pSizes[i] : VK_WHOLE_SIZE;
3295 const struct hk_addr_range addr_range =
3296 hk_buffer_addr_range(buffer, pOffsets[i], size);
3297
3298 hk_cmd_bind_vertex_buffer(cmd, idx, addr_range);
3299 }
3300 }
3301
3302 static bool
3303 hk_set_view_index(struct hk_cmd_buffer *cmd, uint32_t view_idx)
3304 {
3305 if (cmd->state.gfx.render.view_mask) {
3306 cmd->state.gfx.descriptors.root.draw.view_index = view_idx;
3307 cmd->state.gfx.descriptors.root_dirty = true;
3308 }
3309
3310 return true;
3311 }
3312
3313 /*
3314 * Iterator macro to duplicate a draw for each enabled view (when multiview is
3315 * enabled, else always view 0). Along with hk_lower_multiview, this forms the
3316 * world's worst multiview lowering.
3317 */
3318 #define hk_foreach_view(cmd) \
3319 u_foreach_bit(view_idx, cmd->state.gfx.render.view_mask ?: 1) \
3320 if (hk_set_view_index(cmd, view_idx))
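/* Example (sketch): with view_mask = 0b0101 the body runs twice, with
 * view_idx 0 and then 2; with multiview disabled (view_mask = 0), the ?:
 * fallback of 1 makes it run exactly once for view 0.
 */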
3321
3322 static void
3323 hk_ia_update(struct hk_cmd_buffer *cmd, struct hk_cs *cs, struct hk_draw draw,
3324 uint64_t ia_vertices, uint64_t vs_invocations)
3325 {
3326 /* XXX: stream link needed? */
3327 struct hk_device *dev = hk_cmd_buffer_device(cmd);
3328 perf_debug(dev, "Input assembly counters");
3329
3330 struct agx_increment_ia_counters_key key = {
3331 .index_size_B = draw.restart ? agx_index_size_to_B(draw.index_size) : 0,
3332 };
3333
3334 uint64_t draw_ptr;
3335 if (draw.b.indirect) {
3336 draw_ptr = draw.b.ptr;
3337 } else {
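/* Direct draws have no in-memory descriptor, so upload a small
 * {count, instance_count, 0} stand-in and point the kernel at it, the same
 * way the indirect path passes its descriptor address.
 */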
3338 uint32_t desc[] = {draw.b.count[0], draw.b.count[1], 0};
3339 draw_ptr = hk_pool_upload(cmd, &desc, sizeof(desc), 4);
3340 }
3341
3342 struct libagx_increment_ia_counters args = {
3343 .ia_vertices = ia_vertices,
3344 .vs_invocations = vs_invocations,
3345 .restart_index = cmd->state.gfx.index.restart,
3346 .draw = draw_ptr,
3347 .index_buffer = draw.index.addr,
3348 .index_buffer_range_el =
3349 key.index_size_B ? (draw.index.range / key.index_size_B) : 0,
3350 };
3351
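/* When restart is enabled the kernel is given the index buffer and so
 * presumably walks it to count primitives, hence the 1024-wide workgroup;
 * without restart a single invocation is enough.
 */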
3352 uint64_t wg_size = key.index_size_B ? 1024 : 1;
3353
3354 struct hk_shader *s =
3355 hk_meta_kernel(dev, agx_nir_increment_ia_counters, &key, sizeof(key));
3356
3357 uint64_t push = hk_pool_upload(cmd, &args, sizeof(args), 8);
3358 uint32_t usc = hk_upload_usc_words_kernel(cmd, s, &push, sizeof(push));
3359
3360 hk_dispatch_with_usc(dev, cs, s, usc, hk_grid(wg_size, 1, 1),
3361 hk_grid(wg_size, 1, 1));
3362 }
3363
3364 static void
3365 hk_draw(struct hk_cmd_buffer *cmd, uint16_t draw_id, struct hk_draw draw_)
3366 {
3367 const struct vk_dynamic_graphics_state *dyn =
3368 &cmd->vk.dynamic_graphics_state;
3369
3370 /* Filter trivial draws so we don't need to worry about null index buffers */
3371 if (!draw_.b.indirect && (draw_.b.count[0] == 0 || draw_.b.count[1] == 0))
3372 return;
3373
3374 draw_.restart = dyn->ia.primitive_restart_enable;
3375 draw_.index_size = cmd->state.gfx.index.size;
3376
3377 uint64_t stat_ia_verts = hk_pipeline_stat_addr(
3378 cmd, VK_QUERY_PIPELINE_STATISTIC_INPUT_ASSEMBLY_VERTICES_BIT);
3379
3380 uint64_t stat_vs_inv = hk_pipeline_stat_addr(
3381 cmd, VK_QUERY_PIPELINE_STATISTIC_VERTEX_SHADER_INVOCATIONS_BIT);
3382
3383 bool ia_stats = stat_ia_verts || stat_vs_inv;
3384
3385 hk_foreach_view(cmd) {
3386 struct hk_draw draw = draw_;
3387 struct hk_cs *cs = hk_flush_gfx_state(cmd, draw_id, draw);
3388 /* If we failed to allocate a control stream, we've already lost the
3389 * device. Just drop the draw so we don't crash.
3390 */
3391 if (!cs)
3392 return;
3393
3394 bool geom = cmd->state.gfx.shaders[MESA_SHADER_GEOMETRY];
3395 bool tess = cmd->state.gfx.shaders[MESA_SHADER_TESS_EVAL];
3396 struct hk_cs *ccs = NULL;
3397 uint8_t *out = cs->current;
3398 assert(cs->current + 0x1000 < cs->end);
3399
3400 if (geom || tess || ia_stats) {
3401 ccs =
3402 hk_cmd_buffer_get_cs_general(cmd, &cmd->current_cs.pre_gfx, true);
3403 if (!ccs)
3404 return;
3405 }
3406
3407 if (ia_stats) {
3408 hk_ia_update(cmd, ccs, draw, stat_ia_verts, stat_vs_inv);
3409 }
3410
3411 if (tess) {
3412 draw = hk_launch_tess(cmd, ccs, draw);
3413
3414 if (draw.raw) {
3415 assert(!geom);
3416 assert(draw.b.indirect);
3417
3418 agx_push(out, VDM_STREAM_LINK, cfg) {
3419 cfg.target_lo = draw.b.ptr & BITFIELD_MASK(32);
3420 cfg.target_hi = draw.b.ptr >> 32;
3421 cfg.with_return = true;
3422 }
3423
3424 cs->current = out;
3425 continue;
3426 }
3427 }
3428
3429 if (geom) {
3430 draw = hk_launch_gs_prerast(cmd, ccs, draw);
3431
3432 /* We must not draw if the app specified rasterizer discard. This is
3433 * required both for performance (rasterizing would be pointless and
3434 * has no side effects) and for correctness (no indirect draw
3435 * descriptor will be filled out).
3436 */
3437 if (dyn->rs.rasterizer_discard_enable)
3438 continue;
3439 }
3440
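/* For direct indexed draws, fold firstIndex into the index buffer address.
 * The index size enum encodes log2 of the index size in bytes (as
 * agx_index_size_to_B suggests), so the shift converts elements to a byte
 * offset.
 */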
3441 uint64_t ib = draw.index.addr;
3442 if (draw.indexed && !draw.b.indirect)
3443 ib += (draw.start << draw.index_size);
3444
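/* The INDEX_LIST header records which optional words follow: the index
 * buffer address/size for indexed draws, and either an indirect buffer
 * pointer or inline count/instance/start words. The selected words are
 * then emitted below.
 */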
3445 agx_push(out, INDEX_LIST, cfg) {
3446 cfg.primitive = cmd->state.gfx.topology;
3447
3448 if (draw.b.indirect) {
3449 cfg.indirect_buffer_present = true;
3450 } else {
3451 cfg.instance_count_present = true;
3452 cfg.index_count_present = true;
3453 cfg.start_present = true;
3454 }
3455
3456 if (draw.indexed) {
3457 cfg.restart_enable = draw.restart;
3458 cfg.index_buffer_hi = ib >> 32;
3459 cfg.index_size = draw.index_size;
3460
3461 cfg.index_buffer_present = true;
3462 cfg.index_buffer_size_present = true;
3463 }
3464 }
3465
3466 if (draw.indexed) {
3467 agx_push(out, INDEX_LIST_BUFFER_LO, cfg) {
3468 cfg.buffer_lo = ib;
3469 }
3470 }
3471
3472 if (draw.b.indirect) {
3473 agx_push(out, INDEX_LIST_INDIRECT_BUFFER, cfg) {
3474 cfg.address_hi = draw.b.ptr >> 32;
3475 cfg.address_lo = draw.b.ptr & BITFIELD_MASK(32);
3476 }
3477 } else {
3478 agx_push(out, INDEX_LIST_COUNT, cfg) {
3479 cfg.count = draw.b.count[0];
3480 }
3481
3482 agx_push(out, INDEX_LIST_INSTANCES, cfg) {
3483 cfg.count = draw.b.count[1];
3484 }
3485
3486 agx_push(out, INDEX_LIST_START, cfg) {
3487 cfg.start = draw.indexed ? draw.index_bias : draw.start;
3488 }
3489 }
3490
3491 if (draw.indexed) {
3492 agx_push(out, INDEX_LIST_BUFFER_SIZE, cfg) {
3493 cfg.size = draw.index.range;
3494 }
3495 }
3496
3497 cs->current = out;
3498 }
3499 }
3500
3501 VKAPI_ATTR void VKAPI_CALL
3502 hk_CmdDraw(VkCommandBuffer commandBuffer, uint32_t vertexCount,
3503 uint32_t instanceCount, uint32_t firstVertex, uint32_t firstInstance)
3504 {
3505 VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer);
3506
3507 struct hk_draw draw = {
3508 .b = hk_grid(vertexCount, instanceCount, 1),
3509 .start = firstVertex,
3510 .start_instance = firstInstance,
3511 };
3512
3513 hk_draw(cmd, 0, draw);
3514 }
3515
3516 VKAPI_ATTR void VKAPI_CALL
3517 hk_CmdDrawMultiEXT(VkCommandBuffer commandBuffer, uint32_t drawCount,
3518 const VkMultiDrawInfoEXT *pVertexInfo,
3519 uint32_t instanceCount, uint32_t firstInstance,
3520 uint32_t stride)
3521 {
3522 VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer);
3523
3524 for (unsigned i = 0; i < drawCount; ++i) {
3525 struct hk_draw draw = {
3526 .b = hk_grid(pVertexInfo->vertexCount, instanceCount, 1),
3527 .start = pVertexInfo->firstVertex,
3528 .start_instance = firstInstance,
3529 };
3530
3531 hk_draw(cmd, i, draw);
3532 pVertexInfo = ((void *)pVertexInfo) + stride;
3533 }
3534 }
3535
3536 static void
3537 hk_draw_indexed(VkCommandBuffer commandBuffer, uint16_t draw_id,
3538 uint32_t indexCount, uint32_t instanceCount,
3539 uint32_t firstIndex, int32_t vertexOffset,
3540 uint32_t firstInstance)
3541 {
3542 VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer);
3543
3544 struct hk_draw draw = {
3545 .b = hk_grid(indexCount, instanceCount, 1),
3546 .indexed = true,
3547 .index = cmd->state.gfx.index.buffer,
3548 .start = firstIndex,
3549 .index_bias = vertexOffset,
3550 .start_instance = firstInstance,
3551 };
3552
3553 hk_draw(cmd, draw_id, draw);
3554 }
3555
3556 VKAPI_ATTR void VKAPI_CALL
3557 hk_CmdDrawIndexed(VkCommandBuffer commandBuffer, uint32_t indexCount,
3558 uint32_t instanceCount, uint32_t firstIndex,
3559 int32_t vertexOffset, uint32_t firstInstance)
3560 {
3561 hk_draw_indexed(commandBuffer, 0, indexCount, instanceCount, firstIndex,
3562 vertexOffset, firstInstance);
3563 }
3564
3565 VKAPI_ATTR void VKAPI_CALL
3566 hk_CmdDrawMultiIndexedEXT(VkCommandBuffer commandBuffer, uint32_t drawCount,
3567 const VkMultiDrawIndexedInfoEXT *pIndexInfo,
3568 uint32_t instanceCount, uint32_t firstInstance,
3569 uint32_t stride, const int32_t *pVertexOffset)
3570 {
3571 for (unsigned i = 0; i < drawCount; ++i) {
3572 const uint32_t vertex_offset =
3573 pVertexOffset != NULL ? *pVertexOffset : pIndexInfo->vertexOffset;
3574
3575 hk_draw_indexed(commandBuffer, i, pIndexInfo->indexCount, instanceCount,
3576 pIndexInfo->firstIndex, vertex_offset, firstInstance);
3577
3578 pIndexInfo = ((void *)pIndexInfo) + stride;
3579 }
3580 }
3581
3582 static void
3583 hk_draw_indirect_inner(VkCommandBuffer commandBuffer, uint64_t base,
3584 uint32_t drawCount, uint32_t stride)
3585 {
3586 VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer);
3587
3588 /* From the Vulkan 1.3.238 spec:
3589 *
3590 * VUID-vkCmdDrawIndirect-drawCount-00476
3591 *
3592 * "If drawCount is greater than 1, stride must be a multiple of 4 and
3593 * must be greater than or equal to sizeof(VkDrawIndirectCommand)"
3594 *
3595 * and
3596 *
3597 * "If drawCount is less than or equal to one, stride is ignored."
3598 */
3599 if (drawCount > 1) {
3600 assert(stride % 4 == 0);
3601 assert(stride >= sizeof(VkDrawIndirectCommand));
3602 }
3603
3604 for (unsigned draw_id = 0; draw_id < drawCount; ++draw_id) {
3605 uint64_t addr = base + stride * draw_id;
3606 hk_draw(cmd, draw_id, hk_draw_indirect(addr));
3607 }
3608 }
3609
3610 VKAPI_ATTR void VKAPI_CALL
3611 hk_CmdDrawIndirect(VkCommandBuffer commandBuffer, VkBuffer _buffer,
3612 VkDeviceSize offset, uint32_t drawCount, uint32_t stride)
3613 {
3614 VK_FROM_HANDLE(hk_buffer, buffer, _buffer);
3615
3616 hk_draw_indirect_inner(commandBuffer, hk_buffer_address(buffer, offset),
3617 drawCount, stride);
3618 }
3619
3620 static void
3621 hk_draw_indexed_indirect_inner(VkCommandBuffer commandBuffer, uint64_t buffer,
3622 uint32_t drawCount, uint32_t stride)
3623 {
3624 VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer);
3625
3626 /* From the Vulkan 1.3.238 spec:
3627 *
3628 * VUID-vkCmdDrawIndexedIndirect-drawCount-00528
3629 *
3630 * "If drawCount is greater than 1, stride must be a multiple of 4 and
3631 * must be greater than or equal to
3632 * sizeof(VkDrawIndexedIndirectCommand)"
3633 *
3634 * and
3635 *
3636 * "If drawCount is less than or equal to one, stride is ignored."
3637 */
3638 if (drawCount > 1) {
3639 assert(stride % 4 == 0);
3640 assert(stride >= sizeof(VkDrawIndexedIndirectCommand));
3641 }
3642
3643 for (unsigned draw_id = 0; draw_id < drawCount; ++draw_id) {
3644 uint64_t addr = buffer + stride * draw_id;
3645
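/* index_size/restart passed as 0 here are placeholders; hk_draw()
 * overrides both from the dynamic state before recording the draw.
 */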
3646 hk_draw(
3647 cmd, draw_id,
3648 hk_draw_indexed_indirect(addr, cmd->state.gfx.index.buffer, 0, 0));
3649 }
3650 }
3651
3652 VKAPI_ATTR void VKAPI_CALL
3653 hk_CmdDrawIndexedIndirect(VkCommandBuffer commandBuffer, VkBuffer _buffer,
3654 VkDeviceSize offset, uint32_t drawCount,
3655 uint32_t stride)
3656 {
3657 VK_FROM_HANDLE(hk_buffer, buffer, _buffer);
3658
3659 hk_draw_indexed_indirect_inner(
3660 commandBuffer, hk_buffer_address(buffer, offset), drawCount, stride);
3661 }
3662
3663 /*
3664 * To implement drawIndirectCount generically, we dispatch a compute kernel to
3665 * patch the indirect buffer and then record maxDrawCount predicated
3666 * indirect draws.
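 *
 * The patched buffer holds maxDrawCount commands; the predication kernel is
 * expected to copy the first *countBuffer commands and emit zero-count
 * commands for the remainder, so the extra pre-recorded draws become no-ops.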
3667 */
3668 static void
3669 hk_draw_indirect_count(VkCommandBuffer commandBuffer, VkBuffer _buffer,
3670 VkDeviceSize offset, VkBuffer countBuffer,
3671 VkDeviceSize countBufferOffset, uint32_t maxDrawCount,
3672 uint32_t stride, bool indexed)
3673 {
3674 VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer);
3675 VK_FROM_HANDLE(hk_buffer, buffer, _buffer);
3676 VK_FROM_HANDLE(hk_buffer, count_buffer, countBuffer);
3677
3678 struct hk_device *dev = hk_cmd_buffer_device(cmd);
3679 struct agx_predicate_indirect_key key = {.indexed = indexed};
3680 struct hk_shader *s =
3681 hk_meta_kernel(dev, agx_nir_predicate_indirect, &key, sizeof(key));
3682
3683 perf_debug(dev, "Draw indirect count");
3684
3685 struct hk_cs *cs =
3686 hk_cmd_buffer_get_cs_general(cmd, &cmd->current_cs.pre_gfx, true);
3687 if (!cs)
3688 return;
3689
3690 hk_ensure_cs_has_space(cmd, cs, 0x2000 /* TODO */);
3691
3692 assert((stride % 4) == 0 && "aligned");
3693
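/* The compacted commands use the tightly packed Vulkan layouts:
 * VkDrawIndexedIndirectCommand is 5 uint32_ts, VkDrawIndirectCommand is 4.
 */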
3694 size_t out_stride = sizeof(uint32_t) * (indexed ? 5 : 4);
3695 uint64_t patched = hk_pool_alloc(cmd, out_stride * maxDrawCount, 4).gpu;
3696
3697 struct libagx_predicate_indirect_push push = {
3698 .in = hk_buffer_address(buffer, offset),
3699 .out = patched,
3700 .draw_count = hk_buffer_address(count_buffer, countBufferOffset),
3701 .stride_el = stride / 4,
3702 };
3703
3704 uint64_t push_ = hk_pool_upload(cmd, &push, sizeof(push), 8);
3705 uint32_t usc = hk_upload_usc_words_kernel(cmd, s, &push_, sizeof(push_));
3706
3707 hk_dispatch_with_usc(dev, cs, s, usc, hk_grid(maxDrawCount, 1, 1),
3708 hk_grid(1, 1, 1));
3709
3710 if (indexed) {
3711 hk_draw_indexed_indirect_inner(commandBuffer, patched, maxDrawCount,
3712 out_stride);
3713 } else {
3714 hk_draw_indirect_inner(commandBuffer, patched, maxDrawCount, out_stride);
3715 }
3716 }
3717
3718 VKAPI_ATTR void VKAPI_CALL
3719 hk_CmdDrawIndirectCount(VkCommandBuffer commandBuffer, VkBuffer _buffer,
3720 VkDeviceSize offset, VkBuffer countBuffer,
3721 VkDeviceSize countBufferOffset, uint32_t maxDrawCount,
3722 uint32_t stride)
3723 {
3724 hk_draw_indirect_count(commandBuffer, _buffer, offset, countBuffer,
3725 countBufferOffset, maxDrawCount, stride, false);
3726 }
3727
3728 VKAPI_ATTR void VKAPI_CALL
3729 hk_CmdDrawIndexedIndirectCount(VkCommandBuffer commandBuffer, VkBuffer _buffer,
3730 VkDeviceSize offset, VkBuffer countBuffer,
3731 VkDeviceSize countBufferOffset,
3732 uint32_t maxDrawCount, uint32_t stride)
3733 {
3734 hk_draw_indirect_count(commandBuffer, _buffer, offset, countBuffer,
3735 countBufferOffset, maxDrawCount, stride, true);
3736 }
3737
3738 VKAPI_ATTR void VKAPI_CALL
3739 hk_CmdDrawIndirectByteCountEXT(VkCommandBuffer commandBuffer,
3740 uint32_t instanceCount, uint32_t firstInstance,
3741 VkBuffer counterBuffer,
3742 VkDeviceSize counterBufferOffset,
3743 uint32_t counterOffset, uint32_t vertexStride)
3744 {
3745 unreachable("TODO");
3746 }
3747
3748 VKAPI_ATTR void VKAPI_CALL
3749 hk_CmdBindTransformFeedbackBuffersEXT(VkCommandBuffer commandBuffer,
3750 uint32_t firstBinding,
3751 uint32_t bindingCount,
3752 const VkBuffer *pBuffers,
3753 const VkDeviceSize *pOffsets,
3754 const VkDeviceSize *pSizes)
3755 {
3756 VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer);
3757 struct hk_graphics_state *gfx = &cmd->state.gfx;
3758
3759 for (uint32_t i = 0; i < bindingCount; i++) {
3760 VK_FROM_HANDLE(hk_buffer, buffer, pBuffers[i]);
3761 uint32_t idx = firstBinding + i;
3762 uint64_t size = pSizes ? pSizes[i] : VK_WHOLE_SIZE;
3763
3764 gfx->xfb[idx] = hk_buffer_addr_range(buffer, pOffsets[i], size);
3765 }
3766 }
3767
3768 static void
3769 hk_libagx_copy_xfb_counters(nir_builder *b, const void *key)
3770 {
3771 b->shader->info.workgroup_size_variable = true;
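/* The workgroup size is chosen at dispatch time: one invocation per counter
 * being copied (see the hk_dispatch_with_usc call in hk_begin_end_xfb).
 */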
3772
3773 libagx_copy_xfb_counters(b, nir_load_preamble(b, 1, 64));
3774 }
3775
3776 static void
3777 hk_begin_end_xfb(VkCommandBuffer commandBuffer, uint32_t firstCounterBuffer,
3778 uint32_t counterBufferCount, const VkBuffer *pCounterBuffers,
3779 const VkDeviceSize *pCounterBufferOffsets, bool begin)
3780
3781 {
3782 VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer);
3783 struct hk_device *dev = hk_cmd_buffer_device(cmd);
3784 struct hk_graphics_state *gfx = &cmd->state.gfx;
3785
3786 gfx->xfb_enabled = begin;
3787
3788 /* If we haven't reserved XFB offsets yet for the command buffer, do so. */
3789 if (!gfx->xfb_offsets) {
3790 gfx->xfb_offsets = hk_pool_alloc(cmd, 4 * sizeof(uint32_t), 4).gpu;
3791 }
3792
3793 struct hk_cs *cs =
3794 hk_cmd_buffer_get_cs_general(cmd, &cmd->current_cs.pre_gfx, true);
3795 if (!cs)
3796 return;
3797 hk_ensure_cs_has_space(cmd, cs, 0x2000 /* TODO */);
3798
3799 struct libagx_xfb_counter_copy params = {};
3800 unsigned copies = 0;
3801
3802 /* For CmdBeginTransformFeedbackEXT, we need to initialize everything */
3803 if (begin) {
3804 for (copies = 0; copies < 4; ++copies) {
3805 params.dest[copies] = gfx->xfb_offsets + copies * sizeof(uint32_t);
3806 }
3807 }
3808
3809 for (unsigned i = 0; i < counterBufferCount; ++i) {
3810 if (pCounterBuffers[i] == VK_NULL_HANDLE)
3811 continue;
3812
3813 VK_FROM_HANDLE(hk_buffer, buffer, pCounterBuffers[i]);
3814
3815 uint64_t offset = pCounterBufferOffsets ? pCounterBufferOffsets[i] : 0;
3816 uint64_t cb_addr = hk_buffer_address(buffer, offset);
3817 uint32_t cmd_idx = firstCounterBuffer + i;
3818
3819 if (begin) {
3820 params.src[cmd_idx] = cb_addr;
3821 } else {
3822 params.dest[copies] = cb_addr;
3823 params.src[copies] = gfx->xfb_offsets + cmd_idx * sizeof(uint32_t);
3824 ++copies;
3825 }
3826 }
3827
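/* copies is forced to 4 on begin so every internal offset slot gets
 * written: counters the app supplied are copied in, while slots with a
 * null src are presumably zero-initialized, matching Vulkan's rule that
 * missing counter buffers start the counter at 0. On end, only the
 * requested counters are copied back out to the app's buffers.
 */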
3828 if (begin)
3829 copies = 4;
3830
3831 if (copies > 0) {
3832 perf_debug(dev, "XFB counter copy");
3833
3834 struct hk_shader *s =
3835 hk_meta_kernel(dev, hk_libagx_copy_xfb_counters, NULL, 0);
3836
3837 uint64_t push = hk_pool_upload(cmd, &params, sizeof(params), 8);
3838 uint32_t usc = hk_upload_usc_words_kernel(cmd, s, &push, sizeof(push));
3839
3840 hk_dispatch_with_usc(dev, cs, s, usc, hk_grid(copies, 1, 1),
3841 hk_grid(copies, 1, 1));
3842 }
3843 }
3844
3845 VKAPI_ATTR void VKAPI_CALL
3846 hk_CmdBeginTransformFeedbackEXT(VkCommandBuffer commandBuffer,
3847 uint32_t firstCounterBuffer,
3848 uint32_t counterBufferCount,
3849 const VkBuffer *pCounterBuffers,
3850 const VkDeviceSize *pCounterBufferOffsets)
3851 {
3852 hk_begin_end_xfb(commandBuffer, firstCounterBuffer, counterBufferCount,
3853 pCounterBuffers, pCounterBufferOffsets, true);
3854 }
3855
3856 VKAPI_ATTR void VKAPI_CALL
3857 hk_CmdEndTransformFeedbackEXT(VkCommandBuffer commandBuffer,
3858 uint32_t firstCounterBuffer,
3859 uint32_t counterBufferCount,
3860 const VkBuffer *pCounterBuffers,
3861 const VkDeviceSize *pCounterBufferOffsets)
3862 {
3863 hk_begin_end_xfb(commandBuffer, firstCounterBuffer, counterBufferCount,
3864 pCounterBuffers, pCounterBufferOffsets, false);
3865 }
3866
3867 VKAPI_ATTR void VKAPI_CALL
3868 hk_CmdBeginConditionalRenderingEXT(
3869 VkCommandBuffer commandBuffer,
3870 const VkConditionalRenderingBeginInfoEXT *pConditionalRenderingBegin)
3871 {
3872 unreachable("stub");
3873 }
3874
3875 VKAPI_ATTR void VKAPI_CALL
3876 hk_CmdEndConditionalRenderingEXT(VkCommandBuffer commandBuffer)
3877 {
3878 unreachable("stub");
3879 }
3880