/*
 * Copyright © 2024 Collabora Ltd.
 *
 * Derived from tu_cmd_buffer.c which is:
 * Copyright © 2016 Red Hat.
 * Copyright © 2016 Bas Nieuwenhuizen
 * Copyright © 2015 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 */

#include "genxml/gen_macros.h"

#include "panvk_buffer.h"
#include "panvk_cmd_alloc.h"
#include "panvk_cmd_buffer.h"
#include "panvk_cmd_desc_state.h"
#include "panvk_cmd_meta.h"
#include "panvk_device.h"
#include "panvk_entrypoints.h"
#include "panvk_image.h"
#include "panvk_image_view.h"
#include "panvk_instance.h"
#include "panvk_priv_bo.h"
#include "panvk_shader.h"

#include "pan_desc.h"
#include "pan_earlyzs.h"
#include "pan_encoder.h"
#include "pan_format.h"
#include "pan_jc.h"
#include "pan_props.h"
#include "pan_samples.h"
#include "pan_shader.h"

#include "vk_format.h"
#include "vk_meta.h"
#include "vk_pipeline_layout.h"

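/* Per-draw parameters collected by the vkCmdDraw*() entrypoints.
 * index.size is zero for non-indexed draws. */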
struct panvk_draw_info {
   struct {
      uint32_t size;
      uint32_t offset;
      int32_t vertex_offset;
   } index;

   struct {
      uint32_t base;
      uint32_t count;
   } vertex;

   struct {
      uint32_t base;
      uint32_t count;
   } instance;
};

#define is_dirty(__cmdbuf, __name)                                             \
   BITSET_TEST((__cmdbuf)->vk.dynamic_graphics_state.dirty,                    \
               MESA_VK_DYNAMIC_##__name)

static void
emit_vs_attrib(const struct panvk_draw_info *draw,
               const struct vk_vertex_attribute_state *attrib_info,
               const struct vk_vertex_binding_state *buf_info,
               const struct panvk_attrib_buf *buf, uint32_t vb_desc_offset,
               struct mali_attribute_packed *desc)
{
   bool per_instance = buf_info->input_rate == VK_VERTEX_INPUT_RATE_INSTANCE;
   enum pipe_format f = vk_format_to_pipe_format(attrib_info->format);
   unsigned buf_idx = vb_desc_offset + attrib_info->binding;
   unsigned divisor = draw->vertex.count * buf_info->divisor;

   pan_pack(desc, ATTRIBUTE, cfg) {
      cfg.offset = attrib_info->offset;
      cfg.format = GENX(panfrost_format_from_pipe_format)(f)->hw;
      cfg.table = 0;
      cfg.buffer_index = buf_idx;
      cfg.stride = buf_info->stride;
      if (!per_instance) {
         /* Per-vertex */
         cfg.attribute_type = MALI_ATTRIBUTE_TYPE_1D;
         cfg.frequency = MALI_ATTRIBUTE_FREQUENCY_VERTEX;
         cfg.offset_enable = true;
      } else if (util_is_power_of_two_or_zero(divisor)) {
         /* Per-instance, POT divisor */
         cfg.attribute_type = MALI_ATTRIBUTE_TYPE_1D_POT_DIVISOR;
         cfg.frequency = MALI_ATTRIBUTE_FREQUENCY_INSTANCE;
         cfg.divisor_r = __builtin_ctz(divisor);
      } else {
         /* Per-instance, NPOT divisor */
         cfg.attribute_type = MALI_ATTRIBUTE_TYPE_1D_NPOT_DIVISOR;
         cfg.frequency = MALI_ATTRIBUTE_FREQUENCY_INSTANCE;
         cfg.divisor_d = panfrost_compute_magic_divisor(divisor, &cfg.divisor_r,
                                                        &cfg.divisor_e);
      }
   }
}

static VkResult
prepare_vs_driver_set(struct panvk_cmd_buffer *cmdbuf,
                      struct panvk_draw_info *draw)
{
   struct panvk_shader_desc_state *vs_desc_state = &cmdbuf->state.gfx.vs.desc;
   bool dirty = is_dirty(cmdbuf, VI) || is_dirty(cmdbuf, VI_BINDINGS_VALID) ||
                is_dirty(cmdbuf, VI_BINDING_STRIDES) ||
                cmdbuf->state.gfx.vb.dirty ||
                !vs_desc_state->driver_set.dev_addr;

   if (!dirty)
      return VK_SUCCESS;

   const struct panvk_shader *vs = cmdbuf->state.gfx.vs.shader;
   const struct vk_vertex_input_state *vi =
      cmdbuf->vk.dynamic_graphics_state.vi;
   unsigned num_vs_attribs = util_last_bit(vi->attributes_valid);
   uint32_t vb_count = 0;

   for (unsigned i = 0; i < num_vs_attribs; i++) {
      if (vi->attributes_valid & BITFIELD_BIT(i))
         vb_count = MAX2(vi->attributes[i].binding + 1, vb_count);
   }

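   /* The driver set is laid out as: MAX_VS_ATTRIBS attribute descriptors,
    * one dummy sampler, the shader's dynamic buffer descriptors, and finally
    * one buffer descriptor per vertex buffer binding. */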
   uint32_t vb_offset = vs->desc_info.dyn_bufs.count + MAX_VS_ATTRIBS + 1;
   uint32_t desc_count = vb_offset + vb_count;
   const struct panvk_descriptor_state *desc_state =
      &cmdbuf->state.gfx.desc_state;
   struct panfrost_ptr driver_set = panvk_cmd_alloc_dev_mem(
      cmdbuf, desc, desc_count * PANVK_DESCRIPTOR_SIZE, PANVK_DESCRIPTOR_SIZE);
   struct panvk_opaque_desc *descs = driver_set.cpu;

   if (!driver_set.gpu)
      return VK_ERROR_OUT_OF_DEVICE_MEMORY;

   for (uint32_t i = 0; i < MAX_VS_ATTRIBS; i++) {
      if (vi->attributes_valid & BITFIELD_BIT(i)) {
         unsigned binding = vi->attributes[i].binding;

         emit_vs_attrib(draw, &vi->attributes[i], &vi->bindings[binding],
                        &cmdbuf->state.gfx.vb.bufs[binding], vb_offset,
                        (struct mali_attribute_packed *)(&descs[i]));
      } else {
         memset(&descs[i], 0, sizeof(descs[0]));
      }
   }

   /* Dummy sampler always comes right after the vertex attribs. */
   pan_pack(&descs[MAX_VS_ATTRIBS], SAMPLER, _) {
   }

   panvk_per_arch(cmd_fill_dyn_bufs)(
      desc_state, vs,
      (struct mali_buffer_packed *)(&descs[MAX_VS_ATTRIBS + 1]));

   for (uint32_t i = 0; i < vb_count; i++) {
      const struct panvk_attrib_buf *vb = &cmdbuf->state.gfx.vb.bufs[i];

      pan_pack(&descs[vb_offset + i], BUFFER, cfg) {
         if (vi->bindings_valid & BITFIELD_BIT(i)) {
            cfg.address = vb->address;
            cfg.size = vb->size;
         } else {
            cfg.address = 0;
            cfg.size = 0;
         }
      }
   }

   vs_desc_state->driver_set.dev_addr = driver_set.gpu;
   vs_desc_state->driver_set.size = desc_count * PANVK_DESCRIPTOR_SIZE;
   return VK_SUCCESS;
}

static VkResult
prepare_fs_driver_set(struct panvk_cmd_buffer *cmdbuf)
{
   struct panvk_shader_desc_state *fs_desc_state = &cmdbuf->state.gfx.fs.desc;

   if (fs_desc_state->driver_set.dev_addr)
      return VK_SUCCESS;

   const struct panvk_descriptor_state *desc_state =
      &cmdbuf->state.gfx.desc_state;
   const struct panvk_shader *fs = cmdbuf->state.gfx.fs.shader;
   uint32_t desc_count = fs->desc_info.dyn_bufs.count + 1;
   struct panfrost_ptr driver_set = panvk_cmd_alloc_dev_mem(
      cmdbuf, desc, desc_count * PANVK_DESCRIPTOR_SIZE, PANVK_DESCRIPTOR_SIZE);
   struct panvk_opaque_desc *descs = driver_set.cpu;

   if (desc_count && !driver_set.gpu)
      return VK_ERROR_OUT_OF_DEVICE_MEMORY;

   /* Dummy sampler always comes first. */
   pan_pack(&descs[0], SAMPLER, _) {
   }

   panvk_per_arch(cmd_fill_dyn_bufs)(desc_state, fs,
                                     (struct mali_buffer_packed *)(&descs[1]));

   fs_desc_state->driver_set.dev_addr = driver_set.gpu;
   fs_desc_state->driver_set.size = desc_count * PANVK_DESCRIPTOR_SIZE;
   return VK_SUCCESS;
}

static void
prepare_sysvals(struct panvk_cmd_buffer *cmdbuf)
{
   struct panvk_graphics_sysvals *sysvals = &cmdbuf->state.gfx.sysvals;
   struct vk_color_blend_state *cb = &cmdbuf->vk.dynamic_graphics_state.cb;

   if (is_dirty(cmdbuf, CB_BLEND_CONSTANTS)) {
      for (unsigned i = 0; i < ARRAY_SIZE(cb->blend_constants); i++)
         sysvals->blend.constants[i] =
            CLAMP(cb->blend_constants[i], 0.0f, 1.0f);
      cmdbuf->state.gfx.push_uniforms = 0;
   }

   if (is_dirty(cmdbuf, VP_VIEWPORTS)) {
      VkViewport *viewport = &cmdbuf->vk.dynamic_graphics_state.vp.viewports[0];

      /* Upload the viewport scale. Defined as (px/2, py/2, pz) at the start of
       * section 24.5 ("Controlling the Viewport") of the Vulkan spec. At the
       * end of the section, the spec defines:
       *
       * px = width
       * py = height
       * pz = maxDepth - minDepth
       */
      sysvals->viewport.scale.x = 0.5f * viewport->width;
      sysvals->viewport.scale.y = 0.5f * viewport->height;
      sysvals->viewport.scale.z = (viewport->maxDepth - viewport->minDepth);

      /* Upload the viewport offset. Defined as (ox, oy, oz) at the start of
       * section 24.5 ("Controlling the Viewport") of the Vulkan spec. At the
       * end of the section, the spec defines:
       *
       * ox = x + width/2
       * oy = y + height/2
       * oz = minDepth
       */
      sysvals->viewport.offset.x = (0.5f * viewport->width) + viewport->x;
      sysvals->viewport.offset.y = (0.5f * viewport->height) + viewport->y;
      sysvals->viewport.offset.z = viewport->minDepth;
      cmdbuf->state.gfx.push_uniforms = 0;
   }
}

static bool
has_depth_att(struct panvk_cmd_buffer *cmdbuf)
{
   return (cmdbuf->state.gfx.render.bound_attachments &
           MESA_VK_RP_ATTACHMENT_DEPTH_BIT) != 0;
}

static bool
has_stencil_att(struct panvk_cmd_buffer *cmdbuf)
{
   return (cmdbuf->state.gfx.render.bound_attachments &
           MESA_VK_RP_ATTACHMENT_STENCIL_BIT) != 0;
}

static bool
writes_depth(struct panvk_cmd_buffer *cmdbuf)
{
   const struct vk_depth_stencil_state *ds =
      &cmdbuf->vk.dynamic_graphics_state.ds;

   return has_depth_att(cmdbuf) && ds->depth.test_enable &&
          ds->depth.write_enable && ds->depth.compare_op != VK_COMPARE_OP_NEVER;
}

static bool
writes_stencil(struct panvk_cmd_buffer *cmdbuf)
{
   const struct vk_depth_stencil_state *ds =
      &cmdbuf->vk.dynamic_graphics_state.ds;

   return has_stencil_att(cmdbuf) && ds->stencil.test_enable &&
          ((ds->stencil.front.write_mask &&
            (ds->stencil.front.op.fail != VK_STENCIL_OP_KEEP ||
             ds->stencil.front.op.pass != VK_STENCIL_OP_KEEP ||
             ds->stencil.front.op.depth_fail != VK_STENCIL_OP_KEEP)) ||
           (ds->stencil.back.write_mask &&
            (ds->stencil.back.op.fail != VK_STENCIL_OP_KEEP ||
             ds->stencil.back.op.pass != VK_STENCIL_OP_KEEP ||
             ds->stencil.back.op.depth_fail != VK_STENCIL_OP_KEEP)));
}

static bool
ds_test_always_passes(struct panvk_cmd_buffer *cmdbuf)
{
   const struct vk_depth_stencil_state *ds =
      &cmdbuf->vk.dynamic_graphics_state.ds;

   if (!has_depth_att(cmdbuf))
      return true;

   if (ds->depth.test_enable && ds->depth.compare_op != VK_COMPARE_OP_ALWAYS)
      return false;

   if (ds->stencil.test_enable &&
       (ds->stencil.front.op.compare != VK_COMPARE_OP_ALWAYS ||
        ds->stencil.back.op.compare != VK_COMPARE_OP_ALWAYS))
      return false;

   return true;
}

static inline enum mali_func
translate_compare_func(VkCompareOp comp)
{
   STATIC_ASSERT(VK_COMPARE_OP_NEVER == (VkCompareOp)MALI_FUNC_NEVER);
   STATIC_ASSERT(VK_COMPARE_OP_LESS == (VkCompareOp)MALI_FUNC_LESS);
   STATIC_ASSERT(VK_COMPARE_OP_EQUAL == (VkCompareOp)MALI_FUNC_EQUAL);
   STATIC_ASSERT(VK_COMPARE_OP_LESS_OR_EQUAL == (VkCompareOp)MALI_FUNC_LEQUAL);
   STATIC_ASSERT(VK_COMPARE_OP_GREATER == (VkCompareOp)MALI_FUNC_GREATER);
   STATIC_ASSERT(VK_COMPARE_OP_NOT_EQUAL == (VkCompareOp)MALI_FUNC_NOT_EQUAL);
   STATIC_ASSERT(VK_COMPARE_OP_GREATER_OR_EQUAL ==
                 (VkCompareOp)MALI_FUNC_GEQUAL);
   STATIC_ASSERT(VK_COMPARE_OP_ALWAYS == (VkCompareOp)MALI_FUNC_ALWAYS);

   return (enum mali_func)comp;
}

static enum mali_stencil_op
translate_stencil_op(VkStencilOp in)
{
   switch (in) {
   case VK_STENCIL_OP_KEEP:
      return MALI_STENCIL_OP_KEEP;
   case VK_STENCIL_OP_ZERO:
      return MALI_STENCIL_OP_ZERO;
   case VK_STENCIL_OP_REPLACE:
      return MALI_STENCIL_OP_REPLACE;
   case VK_STENCIL_OP_INCREMENT_AND_CLAMP:
      return MALI_STENCIL_OP_INCR_SAT;
   case VK_STENCIL_OP_DECREMENT_AND_CLAMP:
      return MALI_STENCIL_OP_DECR_SAT;
   case VK_STENCIL_OP_INCREMENT_AND_WRAP:
      return MALI_STENCIL_OP_INCR_WRAP;
   case VK_STENCIL_OP_DECREMENT_AND_WRAP:
      return MALI_STENCIL_OP_DECR_WRAP;
   case VK_STENCIL_OP_INVERT:
      return MALI_STENCIL_OP_INVERT;
   default:
      unreachable("Invalid stencil op");
   }
}

static bool
fs_required(struct panvk_cmd_buffer *cmdbuf)
{
   const struct pan_shader_info *fs_info =
      cmdbuf->state.gfx.fs.shader ? &cmdbuf->state.gfx.fs.shader->info : NULL;
   const struct vk_dynamic_graphics_state *dyns =
      &cmdbuf->vk.dynamic_graphics_state;
   const struct vk_color_blend_state *cb = &dyns->cb;

   if (!fs_info)
      return false;

   /* If we generally have side effects */
   if (fs_info->fs.sidefx)
      return true;

   /* If colour is written we need to execute */
   for (unsigned i = 0; i < cb->attachment_count; ++i) {
      if ((cb->color_write_enables & BITFIELD_BIT(i)) &&
          cb->attachments[i].write_mask)
         return true;
   }

   /* If alpha-to-coverage is enabled, we need to run the fragment shader even
    * if we don't have a color attachment, so depth/stencil updates can be
    * discarded if alpha, and thus coverage, is 0. */
   if (dyns->ms.alpha_to_coverage_enable)
      return true;

   /* If depth is written and not implied we need to execute.
    * TODO: Predicate on Z/S writes being enabled */
   return (fs_info->fs.writes_depth || fs_info->fs.writes_stencil);
}

static enum mali_draw_mode
translate_prim_topology(VkPrimitiveTopology in)
{
   /* Test VK_PRIMITIVE_TOPOLOGY_META_RECT_LIST_MESA separately, as it's not
    * part of the VkPrimitiveTopology enum.
    */
   if (in == VK_PRIMITIVE_TOPOLOGY_META_RECT_LIST_MESA)
      return MALI_DRAW_MODE_TRIANGLES;

   switch (in) {
   case VK_PRIMITIVE_TOPOLOGY_POINT_LIST:
      return MALI_DRAW_MODE_POINTS;
   case VK_PRIMITIVE_TOPOLOGY_LINE_LIST:
      return MALI_DRAW_MODE_LINES;
   case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP:
      return MALI_DRAW_MODE_LINE_STRIP;
   case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST:
      return MALI_DRAW_MODE_TRIANGLES;
   case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP:
      return MALI_DRAW_MODE_TRIANGLE_STRIP;
   case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN:
      return MALI_DRAW_MODE_TRIANGLE_FAN;
   case VK_PRIMITIVE_TOPOLOGY_LINE_LIST_WITH_ADJACENCY:
   case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP_WITH_ADJACENCY:
   case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST_WITH_ADJACENCY:
   case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP_WITH_ADJACENCY:
   case VK_PRIMITIVE_TOPOLOGY_PATCH_LIST:
   default:
      unreachable("Invalid primitive type");
   }
}

static void
force_fb_preload(struct panvk_cmd_buffer *cmdbuf)
{
   for (unsigned i = 0; i < cmdbuf->state.gfx.render.fb.info.rt_count; i++) {
      if (cmdbuf->state.gfx.render.fb.info.rts[i].view) {
         cmdbuf->state.gfx.render.fb.info.rts[i].clear = false;
         cmdbuf->state.gfx.render.fb.info.rts[i].preload = true;
      }
   }

   if (cmdbuf->state.gfx.render.fb.info.zs.view.zs) {
      cmdbuf->state.gfx.render.fb.info.zs.clear.z = false;
      cmdbuf->state.gfx.render.fb.info.zs.preload.z = true;
   }

   if (cmdbuf->state.gfx.render.fb.info.zs.view.s ||
       (cmdbuf->state.gfx.render.fb.info.zs.view.zs &&
        util_format_is_depth_and_stencil(
           cmdbuf->state.gfx.render.fb.info.zs.view.zs->format))) {
      cmdbuf->state.gfx.render.fb.info.zs.clear.s = false;
      cmdbuf->state.gfx.render.fb.info.zs.preload.s = true;
   }
}

static VkResult
update_tls(struct panvk_cmd_buffer *cmdbuf)
{
   struct panvk_tls_state *state = &cmdbuf->state.tls;
   const struct panvk_shader *vs = cmdbuf->state.gfx.vs.shader;
   const struct panvk_shader *fs = cmdbuf->state.gfx.fs.shader;
   struct cs_builder *b =
      panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_VERTEX_TILER);

   if (!cmdbuf->state.gfx.tsd) {
      if (!state->desc.gpu) {
         state->desc = panvk_cmd_alloc_desc(cmdbuf, LOCAL_STORAGE);
         if (!state->desc.gpu)
            return VK_ERROR_OUT_OF_DEVICE_MEMORY;
      }

      cmdbuf->state.gfx.tsd = state->desc.gpu;

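      /* Make the vertex/tiler subqueue use this local-storage (TSD)
       * descriptor. */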
      cs_update_vt_ctx(b)
         cs_move64_to(b, cs_sr_reg64(b, 24), state->desc.gpu);
   }

   state->info.tls.size =
      MAX3(vs->info.tls_size, fs ? fs->info.tls_size : 0, state->info.tls.size);
   return VK_SUCCESS;
}

static enum mali_index_type
index_size_to_index_type(uint32_t size)
{
   switch (size) {
   case 0:
      return MALI_INDEX_TYPE_NONE;
   case 1:
      return MALI_INDEX_TYPE_UINT8;
   case 2:
      return MALI_INDEX_TYPE_UINT16;
   case 4:
      return MALI_INDEX_TYPE_UINT32;
   default:
      assert(!"Invalid index size");
      return MALI_INDEX_TYPE_NONE;
   }
}

static VkResult
prepare_blend(struct panvk_cmd_buffer *cmdbuf)
{
   bool dirty =
      is_dirty(cmdbuf, CB_LOGIC_OP_ENABLE) || is_dirty(cmdbuf, CB_LOGIC_OP) ||
      is_dirty(cmdbuf, CB_ATTACHMENT_COUNT) ||
      is_dirty(cmdbuf, CB_COLOR_WRITE_ENABLES) ||
      is_dirty(cmdbuf, CB_BLEND_ENABLES) ||
      is_dirty(cmdbuf, CB_BLEND_EQUATIONS) ||
      is_dirty(cmdbuf, CB_WRITE_MASKS) || is_dirty(cmdbuf, CB_BLEND_CONSTANTS);

   if (!dirty)
      return VK_SUCCESS;

   struct panvk_device *dev = to_panvk_device(cmdbuf->vk.base.device);
   const struct vk_dynamic_graphics_state *dyns =
      &cmdbuf->vk.dynamic_graphics_state;
   const struct vk_color_blend_state *cb = &dyns->cb;
   unsigned bd_count = MAX2(cb->attachment_count, 1);
   struct cs_builder *b =
      panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_VERTEX_TILER);
   const struct panvk_shader *fs = cmdbuf->state.gfx.fs.shader;
   const struct pan_shader_info *fs_info = fs ? &fs->info : NULL;
   mali_ptr fs_code = panvk_shader_get_dev_addr(fs);
   struct panfrost_ptr ptr =
      panvk_cmd_alloc_desc_array(cmdbuf, bd_count, BLEND);
   struct mali_blend_packed *bds = ptr.cpu;

   if (bd_count && !ptr.gpu)
      return VK_ERROR_OUT_OF_DEVICE_MEMORY;

   panvk_per_arch(blend_emit_descs)(
      dev, cb, cmdbuf->state.gfx.render.color_attachments.fmts,
      cmdbuf->state.gfx.render.color_attachments.samples, fs_info, fs_code, bds,
      &cmdbuf->state.gfx.cb.info);

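   /* The blend descriptor array pointer and the descriptor count share one
    * register: the count is OR'ed into the low bits of the aligned address. */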
   cs_move64_to(b, cs_sr_reg64(b, 50), ptr.gpu | bd_count);
   return VK_SUCCESS;
}

static void
prepare_vp(struct panvk_cmd_buffer *cmdbuf)
{
   struct cs_builder *b =
      panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_VERTEX_TILER);
   const VkViewport *viewport =
      &cmdbuf->vk.dynamic_graphics_state.vp.viewports[0];
   const VkRect2D *scissor = &cmdbuf->vk.dynamic_graphics_state.vp.scissors[0];

   if (is_dirty(cmdbuf, VP_VIEWPORTS) || is_dirty(cmdbuf, VP_SCISSORS)) {
      uint64_t scissor_box;
      pan_pack(&scissor_box, SCISSOR, cfg) {

         /* The spec says "width must be greater than 0.0" */
         assert(viewport->x >= 0);
         int minx = (int)viewport->x;
         int maxx = (int)(viewport->x + viewport->width);

         /* Viewport height can be negative */
         int miny =
            MIN2((int)viewport->y, (int)(viewport->y + viewport->height));
         int maxy =
            MAX2((int)viewport->y, (int)(viewport->y + viewport->height));

         assert(scissor->offset.x >= 0 && scissor->offset.y >= 0);
         minx = MAX2(scissor->offset.x, minx);
         miny = MAX2(scissor->offset.y, miny);
         maxx = MIN2(scissor->offset.x + scissor->extent.width, maxx);
         maxy = MIN2(scissor->offset.y + scissor->extent.height, maxy);

         /* Make sure we don't end up with a max < min when width/height is 0 */
         maxx = maxx > minx ? maxx - 1 : maxx;
         maxy = maxy > miny ? maxy - 1 : maxy;

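         /* The scissor maxima are inclusive, hence the -1 above. */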
         cfg.scissor_minimum_x = minx;
         cfg.scissor_minimum_y = miny;
         cfg.scissor_maximum_x = maxx;
         cfg.scissor_maximum_y = maxy;
      }

      cs_move64_to(b, cs_sr_reg64(b, 42), scissor_box);
   }

   if (is_dirty(cmdbuf, VP_VIEWPORTS)) {
      cs_move32_to(b, cs_sr_reg32(b, 44),
                   fui(MIN2(viewport->minDepth, viewport->maxDepth)));
      cs_move32_to(b, cs_sr_reg32(b, 45),
                   fui(MAX2(viewport->minDepth, viewport->maxDepth)));
   }
}

static uint32_t
calc_fbd_size(struct panvk_cmd_buffer *cmdbuf)
{
   const struct pan_fb_info *fb = &cmdbuf->state.gfx.render.fb.info;
   bool has_zs_ext = fb->zs.view.zs || fb->zs.view.s;
   uint32_t fbd_size = pan_size(FRAMEBUFFER);

   if (has_zs_ext)
      fbd_size += pan_size(ZS_CRC_EXTENSION);

   fbd_size += pan_size(RENDER_TARGET) * MAX2(fb->rt_count, 1);
   return fbd_size;
}

static uint32_t
calc_render_descs_size(struct panvk_cmd_buffer *cmdbuf)
{
   return (calc_fbd_size(cmdbuf) * cmdbuf->state.gfx.render.layer_count) +
          pan_size(TILER_CONTEXT);
}

static void
cs_render_desc_ringbuf_reserve(struct cs_builder *b, uint32_t size)
{
   /* Make sure we don't allocate more than the ringbuf size. */
   assert(size <= RENDER_DESC_RINGBUF_SIZE);

   /* Make sure the allocation is 64-byte aligned. */
   assert(ALIGN_POT(size, 64) == size);

   struct cs_index ringbuf_sync = cs_scratch_reg64(b, 0);
   struct cs_index sz_reg = cs_scratch_reg32(b, 2);

   cs_load64_to(
      b, ringbuf_sync, cs_subqueue_ctx_reg(b),
      offsetof(struct panvk_cs_subqueue_context, render.desc_ringbuf.syncobj));
   cs_wait_slot(b, SB_ID(LS), false);

   /* Wait for the other end to release memory. */
   cs_move32_to(b, sz_reg, size - 1);
   cs_sync32_wait(b, false, MALI_CS_CONDITION_GREATER, sz_reg, ringbuf_sync);

   /* Decrement the syncobj to reflect the fact we're reserving memory. */
   cs_move32_to(b, sz_reg, -size);
   cs_sync32_add(b, false, MALI_CS_SYNC_SCOPE_CSG, sz_reg, ringbuf_sync,
                 cs_now());
}

static void
cs_render_desc_ringbuf_move_ptr(struct cs_builder *b, uint32_t size)
{
   struct cs_index scratch_reg = cs_scratch_reg32(b, 0);
   struct cs_index ptr_lo = cs_scratch_reg32(b, 2);
   struct cs_index pos = cs_scratch_reg32(b, 4);

   cs_load_to(
      b, cs_scratch_reg_tuple(b, 2, 3), cs_subqueue_ctx_reg(b),
      BITFIELD_MASK(3),
      offsetof(struct panvk_cs_subqueue_context, render.desc_ringbuf.ptr));
   cs_wait_slot(b, SB_ID(LS), false);

   /* Update the relative position and absolute address. */
   cs_add32(b, ptr_lo, ptr_lo, size);
   cs_add32(b, pos, pos, size);
   cs_add32(b, scratch_reg, pos, -RENDER_DESC_RINGBUF_SIZE);

   /* Wrap-around. */
   cs_while(b, MALI_CS_CONDITION_GEQUAL, scratch_reg) {
      cs_add32(b, ptr_lo, ptr_lo, -RENDER_DESC_RINGBUF_SIZE);
      cs_add32(b, pos, pos, -RENDER_DESC_RINGBUF_SIZE);
      cs_loop_break(b, MALI_CS_CONDITION_ALWAYS, cs_undef());
   }

   cs_store(
      b, cs_scratch_reg_tuple(b, 2, 3), cs_subqueue_ctx_reg(b),
      BITFIELD_MASK(3),
      offsetof(struct panvk_cs_subqueue_context, render.desc_ringbuf.ptr));
   cs_wait_slot(b, SB_ID(LS), false);
}

static VkResult
get_tiler_desc(struct panvk_cmd_buffer *cmdbuf)
{
   if (cmdbuf->state.gfx.render.tiler)
      return VK_SUCCESS;

   struct cs_builder *b =
      panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_VERTEX_TILER);
   struct panvk_physical_device *phys_dev =
      to_panvk_physical_device(cmdbuf->vk.base.device->physical);
   struct panfrost_tiler_features tiler_features =
      panfrost_query_tiler_features(&phys_dev->kmod.props);
   bool simul_use =
      cmdbuf->flags & VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT;
   struct panfrost_ptr tiler_desc = {0};
   struct mali_tiler_context_packed tiler_tmpl;

   if (!simul_use) {
      tiler_desc = panvk_cmd_alloc_desc(cmdbuf, TILER_CONTEXT);
      if (!tiler_desc.gpu)
         return VK_ERROR_OUT_OF_DEVICE_MEMORY;
   } else {
      /* If the tiler descriptor is allocated from the ring buffer, we set a
       * dumb non-zero address to allow the is-tiler-acquired test to pass. */
      tiler_desc.cpu = &tiler_tmpl;
      tiler_desc.gpu = 0xdeadbeefdeadbeefull;
   }

   pan_pack(tiler_desc.cpu, TILER_CONTEXT, cfg) {
      unsigned max_levels = tiler_features.max_levels;
      assert(max_levels >= 2);

      /* TODO: Select hierarchy mask more effectively */
      cfg.hierarchy_mask = (max_levels >= 8) ? 0xFF : 0x28;

      /* For large framebuffers, disable the smallest bin size to
       * avoid pathological tiler memory usage.
       */
      cfg.fb_width = cmdbuf->state.gfx.render.fb.info.width;
      cfg.fb_height = cmdbuf->state.gfx.render.fb.info.height;
      if (MAX2(cfg.fb_width, cfg.fb_height) >= 4096)
         cfg.hierarchy_mask &= ~1;

      cfg.sample_pattern =
         pan_sample_pattern(cmdbuf->state.gfx.render.fb.info.nr_samples);

      /* TODO: revisit for VK_EXT_provoking_vertex. */
      cfg.first_provoking_vertex = true;

      cfg.layer_count = cmdbuf->state.gfx.render.layer_count;
      cfg.layer_offset = 0;
   }

   cmdbuf->state.gfx.render.tiler = tiler_desc.gpu;

   struct cs_index tiler_ctx_addr = cs_sr_reg64(b, 40);

   if (simul_use) {
      uint32_t descs_sz = calc_render_descs_size(cmdbuf);

      cs_render_desc_ringbuf_reserve(b, descs_sz);

      /* Reserve ringbuf mem. */
      cs_update_vt_ctx(b) {
         cs_load64_to(b, tiler_ctx_addr, cs_subqueue_ctx_reg(b),
                      offsetof(struct panvk_cs_subqueue_context,
                               render.desc_ringbuf.ptr));
      }

      cs_render_desc_ringbuf_move_ptr(b, descs_sz);

      /* Lay out words 2:5, so they can be stored along the other updates. */
      cs_move64_to(b, cs_scratch_reg64(b, 2),
                   tiler_tmpl.opaque[2] | (uint64_t)tiler_tmpl.opaque[3] << 32);
      cs_move64_to(b, cs_scratch_reg64(b, 4),
                   tiler_tmpl.opaque[4] | (uint64_t)tiler_tmpl.opaque[5] << 32);
   } else {
      cs_update_vt_ctx(b) {
         cs_move64_to(b, tiler_ctx_addr, tiler_desc.gpu);
      }
   }

   /* Reset the polygon list. */
   cs_move64_to(b, cs_scratch_reg64(b, 0), 0);

   /* Load the tiler_heap and geom_buf from the context. */
   cs_load_to(b, cs_scratch_reg_tuple(b, 6, 4), cs_subqueue_ctx_reg(b),
              BITFIELD_MASK(4),
              offsetof(struct panvk_cs_subqueue_context, render.tiler_heap));

   /* Reset the completed chain. */
   cs_move64_to(b, cs_scratch_reg64(b, 10), 0);
   cs_move64_to(b, cs_scratch_reg64(b, 12), 0);

   cs_wait_slot(b, SB_ID(LS), false);

   /* Update the first half of the tiler desc. */
   if (simul_use) {
      cs_store(b, cs_scratch_reg_tuple(b, 0, 14), tiler_ctx_addr,
               BITFIELD_MASK(14), 0);
   } else {
      cs_store(b, cs_scratch_reg_tuple(b, 0, 2), tiler_ctx_addr,
               BITFIELD_MASK(2), 0);
      cs_store(b, cs_scratch_reg_tuple(b, 6, 8), tiler_ctx_addr,
               BITFIELD_MASK(8), 24);
   }

   cs_wait_slot(b, SB_ID(LS), false);

   /* r10:13 are already zero, fill r8:9 and r14:15 with zeros so we can reset
    * the private state in one store. */
   cs_move64_to(b, cs_scratch_reg64(b, 8), 0);
   cs_move64_to(b, cs_scratch_reg64(b, 14), 0);

   /* Update the second half of the tiler descriptor. */
   cs_store(b, cs_scratch_reg_tuple(b, 8, 8), tiler_ctx_addr, BITFIELD_MASK(8),
            96);
   cs_wait_slot(b, SB_ID(LS), false);

   /* Then we change the scoreboard slot used for iterators. */
   panvk_per_arch(cs_pick_iter_sb)(cmdbuf, PANVK_SUBQUEUE_VERTEX_TILER);

   cs_heap_operation(b, MALI_CS_HEAP_OPERATION_VERTEX_TILER_STARTED, cs_now());
   return VK_SUCCESS;
}

static VkResult
get_fb_descs(struct panvk_cmd_buffer *cmdbuf)
{
   if (cmdbuf->state.gfx.render.fbds.gpu ||
       !cmdbuf->state.gfx.render.layer_count)
      return VK_SUCCESS;

   uint32_t fbds_sz =
      calc_fbd_size(cmdbuf) * cmdbuf->state.gfx.render.layer_count;

   memset(&cmdbuf->state.gfx.render.fb.info.bifrost.pre_post.dcds, 0,
          sizeof(cmdbuf->state.gfx.render.fb.info.bifrost.pre_post.dcds));

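   /* Allocate room for one framebuffer descriptor (plus the optional ZS/CRC
    * extension and render-target descriptors) per layer. */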
   cmdbuf->state.gfx.render.fbds = panvk_cmd_alloc_dev_mem(
      cmdbuf, desc, fbds_sz, pan_alignment(FRAMEBUFFER));
   if (!cmdbuf->state.gfx.render.fbds.gpu)
      return VK_ERROR_OUT_OF_DEVICE_MEMORY;

   return VK_SUCCESS;
}

static VkResult
prepare_vs(struct panvk_cmd_buffer *cmdbuf, struct panvk_draw_info *draw)
{
   struct panvk_descriptor_state *desc_state = &cmdbuf->state.gfx.desc_state;
   struct panvk_shader_desc_state *vs_desc_state = &cmdbuf->state.gfx.vs.desc;
   const struct panvk_shader *vs = cmdbuf->state.gfx.vs.shader;
   struct cs_builder *b =
      panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_VERTEX_TILER);
   const struct vk_input_assembly_state *ia =
      &cmdbuf->vk.dynamic_graphics_state.ia;
   mali_ptr pos_spd = ia->primitive_topology == VK_PRIMITIVE_TOPOLOGY_POINT_LIST
                         ? panvk_priv_mem_dev_addr(vs->spds.pos_points)
                         : panvk_priv_mem_dev_addr(vs->spds.pos_triangles);
   mali_ptr var_spd = panvk_priv_mem_dev_addr(vs->spds.var);
   bool upd_res_table = false;

   if (!vs_desc_state->res_table) {
      VkResult result = prepare_vs_driver_set(cmdbuf, draw);
      if (result != VK_SUCCESS)
         return result;

      result = panvk_per_arch(cmd_prepare_shader_res_table)(cmdbuf, desc_state,
                                                            vs, vs_desc_state);
      if (result != VK_SUCCESS)
         return result;

      upd_res_table = true;
   }

   cs_update_vt_ctx(b) {
      if (upd_res_table)
         cs_move64_to(b, cs_sr_reg64(b, 0), vs_desc_state->res_table);

      if (pos_spd != cmdbuf->state.gfx.vs.spds.pos)
         cs_move64_to(b, cs_sr_reg64(b, 16), pos_spd);

      if (var_spd != cmdbuf->state.gfx.vs.spds.var)
         cs_move64_to(b, cs_sr_reg64(b, 18), var_spd);
   }

   return VK_SUCCESS;
}

static VkResult
prepare_fs(struct panvk_cmd_buffer *cmdbuf)
{
   const struct panvk_shader *fs = cmdbuf->state.gfx.fs.shader;
   struct panvk_shader_desc_state *fs_desc_state = &cmdbuf->state.gfx.fs.desc;
   struct panvk_descriptor_state *desc_state = &cmdbuf->state.gfx.desc_state;
   struct cs_builder *b =
      panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_VERTEX_TILER);
   mali_ptr frag_spd = panvk_priv_mem_dev_addr(fs->spd);
   bool upd_res_table = false;

   if (!fs_desc_state->res_table) {
      VkResult result = prepare_fs_driver_set(cmdbuf);
      if (result != VK_SUCCESS)
         return result;

      result = panvk_per_arch(cmd_prepare_shader_res_table)(cmdbuf, desc_state,
                                                            fs, fs_desc_state);
      if (result != VK_SUCCESS)
         return result;

      upd_res_table = true;
   }

   cs_update_vt_ctx(b) {
      if (upd_res_table)
         cs_move64_to(b, cs_sr_reg64(b, 4), fs_desc_state->res_table);

      if (cmdbuf->state.gfx.fs.spd != frag_spd)
         cs_move64_to(b, cs_sr_reg64(b, 20), frag_spd);
   }

   return VK_SUCCESS;
}

static VkResult
prepare_push_uniforms(struct panvk_cmd_buffer *cmdbuf)
{
   struct cs_builder *b =
      panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_VERTEX_TILER);

   if (!cmdbuf->state.gfx.push_uniforms) {
      cmdbuf->state.gfx.push_uniforms = panvk_per_arch(
         cmd_prepare_push_uniforms)(cmdbuf, &cmdbuf->state.gfx.sysvals,
                                    sizeof(cmdbuf->state.gfx.sysvals));
      if (!cmdbuf->state.gfx.push_uniforms)
         return VK_ERROR_OUT_OF_DEVICE_MEMORY;

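      /* The FAU pointer encodes the number of 64-bit uniform words in its
       * upper byte. The buffer is assumed to hold the API push constants
       * (256 bytes) followed by the graphics sysvals. */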
      uint32_t push_size = 256 + sizeof(struct panvk_graphics_sysvals);
      uint64_t fau_count = DIV_ROUND_UP(push_size, 8);
      mali_ptr fau_ptr = cmdbuf->state.gfx.push_uniforms | (fau_count << 56);

      cs_update_vt_ctx(b) {
         cs_move64_to(b, cs_sr_reg64(b, 8), fau_ptr);
         cs_move64_to(b, cs_sr_reg64(b, 12), fau_ptr);
      }
   }

   return VK_SUCCESS;
}

static VkResult
prepare_ds(struct panvk_cmd_buffer *cmdbuf)
{
   bool dirty = is_dirty(cmdbuf, DS_DEPTH_TEST_ENABLE) ||
                is_dirty(cmdbuf, DS_DEPTH_WRITE_ENABLE) ||
                is_dirty(cmdbuf, DS_DEPTH_COMPARE_OP) ||
                is_dirty(cmdbuf, DS_STENCIL_TEST_ENABLE) ||
                is_dirty(cmdbuf, DS_STENCIL_OP) ||
                is_dirty(cmdbuf, DS_STENCIL_COMPARE_MASK) ||
                is_dirty(cmdbuf, DS_STENCIL_WRITE_MASK) ||
                is_dirty(cmdbuf, DS_STENCIL_REFERENCE) ||
                is_dirty(cmdbuf, RS_DEPTH_CLAMP_ENABLE) ||
                is_dirty(cmdbuf, RS_DEPTH_BIAS_ENABLE) ||
                is_dirty(cmdbuf, RS_DEPTH_BIAS_FACTORS) ||
                /* fs_required() uses ms.alpha_to_coverage_enable
                 * and vk_color_blend_state
                 */
                is_dirty(cmdbuf, MS_ALPHA_TO_COVERAGE_ENABLE) ||
                is_dirty(cmdbuf, CB_ATTACHMENT_COUNT) ||
                is_dirty(cmdbuf, CB_COLOR_WRITE_ENABLES) ||
                is_dirty(cmdbuf, CB_WRITE_MASKS);

   if (!dirty)
      return VK_SUCCESS;

   struct cs_builder *b =
      panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_VERTEX_TILER);
   const struct panvk_shader *fs = cmdbuf->state.gfx.fs.shader;
   const struct vk_dynamic_graphics_state *dyns =
      &cmdbuf->vk.dynamic_graphics_state;
   const struct vk_depth_stencil_state *ds = &dyns->ds;
   const struct vk_rasterization_state *rs = &dyns->rs;
   bool test_s = has_stencil_att(cmdbuf) && ds->stencil.test_enable;
   bool test_z = has_depth_att(cmdbuf) && ds->depth.test_enable;
   bool needs_fs = fs_required(cmdbuf);

   struct panfrost_ptr zsd = panvk_cmd_alloc_desc(cmdbuf, DEPTH_STENCIL);
   if (!zsd.gpu)
      return VK_ERROR_OUT_OF_DEVICE_MEMORY;

   pan_pack(zsd.cpu, DEPTH_STENCIL, cfg) {
      cfg.stencil_test_enable = test_s;
      if (test_s) {
         cfg.front_compare_function =
            translate_compare_func(ds->stencil.front.op.compare);
         cfg.front_stencil_fail =
            translate_stencil_op(ds->stencil.front.op.fail);
         cfg.front_depth_fail =
            translate_stencil_op(ds->stencil.front.op.depth_fail);
         cfg.front_depth_pass = translate_stencil_op(ds->stencil.front.op.pass);
         cfg.back_compare_function =
            translate_compare_func(ds->stencil.back.op.compare);
         cfg.back_stencil_fail = translate_stencil_op(ds->stencil.back.op.fail);
         cfg.back_depth_fail =
            translate_stencil_op(ds->stencil.back.op.depth_fail);
         cfg.back_depth_pass = translate_stencil_op(ds->stencil.back.op.pass);
      }

      cfg.stencil_from_shader = needs_fs ? fs->info.fs.writes_stencil : 0;
      cfg.front_write_mask = ds->stencil.front.write_mask;
      cfg.back_write_mask = ds->stencil.back.write_mask;
      cfg.front_value_mask = ds->stencil.front.compare_mask;
      cfg.back_value_mask = ds->stencil.back.compare_mask;
      cfg.front_reference_value = ds->stencil.front.reference;
      cfg.back_reference_value = ds->stencil.back.reference;

      if (rs->depth_clamp_enable)
         cfg.depth_clamp_mode = MALI_DEPTH_CLAMP_MODE_BOUNDS;

      if (fs)
         cfg.depth_source = pan_depth_source(&fs->info);
      cfg.depth_write_enable = ds->depth.write_enable;
      cfg.depth_bias_enable = rs->depth_bias.enable;
      cfg.depth_function = test_z ? translate_compare_func(ds->depth.compare_op)
                                  : MALI_FUNC_ALWAYS;
      cfg.depth_units = rs->depth_bias.constant * 2.0f;
      cfg.depth_factor = rs->depth_bias.slope;
      cfg.depth_bias_clamp = rs->depth_bias.clamp;
   }

   cs_update_vt_ctx(b)
      cs_move64_to(b, cs_sr_reg64(b, 52), zsd.gpu);

   return VK_SUCCESS;
}

static void
prepare_dcd(struct panvk_cmd_buffer *cmdbuf)
{
   struct cs_builder *b =
      panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_VERTEX_TILER);
   const struct panvk_shader *fs = cmdbuf->state.gfx.fs.shader;
   bool fs_is_dirty =
      cmdbuf->state.gfx.fs.spd != (fs ? panvk_priv_mem_dev_addr(fs->spd) : 0);
   bool dcd0_dirty = is_dirty(cmdbuf, RS_RASTERIZER_DISCARD_ENABLE) ||
                     is_dirty(cmdbuf, RS_CULL_MODE) ||
                     is_dirty(cmdbuf, RS_FRONT_FACE) ||
                     is_dirty(cmdbuf, MS_RASTERIZATION_SAMPLES) ||
                     is_dirty(cmdbuf, MS_SAMPLE_MASK) ||
                     is_dirty(cmdbuf, MS_ALPHA_TO_COVERAGE_ENABLE) ||
                     is_dirty(cmdbuf, MS_ALPHA_TO_ONE_ENABLE) ||
                     /* writes_depth() uses vk_depth_stencil_state */
                     is_dirty(cmdbuf, DS_DEPTH_TEST_ENABLE) ||
                     is_dirty(cmdbuf, DS_DEPTH_WRITE_ENABLE) ||
                     is_dirty(cmdbuf, DS_DEPTH_COMPARE_OP) ||
                     /* writes_stencil() uses vk_depth_stencil_state */
                     is_dirty(cmdbuf, DS_STENCIL_TEST_ENABLE) ||
                     is_dirty(cmdbuf, DS_STENCIL_OP) ||
                     is_dirty(cmdbuf, DS_STENCIL_WRITE_MASK) ||
                     /* fs_required() uses vk_color_blend_state */
                     is_dirty(cmdbuf, CB_ATTACHMENT_COUNT) ||
                     is_dirty(cmdbuf, CB_COLOR_WRITE_ENABLES) ||
                     is_dirty(cmdbuf, CB_WRITE_MASKS) || fs_is_dirty ||
                     cmdbuf->state.gfx.render.dirty;
   bool dcd1_dirty = is_dirty(cmdbuf, MS_RASTERIZATION_SAMPLES) ||
                     is_dirty(cmdbuf, MS_SAMPLE_MASK) ||
                     /* fs_required() uses ms.alpha_to_coverage_enable
                      * and vk_color_blend_state
                      */
                     is_dirty(cmdbuf, MS_ALPHA_TO_COVERAGE_ENABLE) ||
                     is_dirty(cmdbuf, CB_ATTACHMENT_COUNT) ||
                     is_dirty(cmdbuf, CB_COLOR_WRITE_ENABLES) ||
                     is_dirty(cmdbuf, CB_WRITE_MASKS) || fs_is_dirty ||
                     cmdbuf->state.gfx.render.dirty;

   bool needs_fs = fs_required(cmdbuf);

   const struct vk_dynamic_graphics_state *dyns =
      &cmdbuf->vk.dynamic_graphics_state;
   const struct vk_rasterization_state *rs =
      &cmdbuf->vk.dynamic_graphics_state.rs;
   bool alpha_to_coverage = dyns->ms.alpha_to_coverage_enable;
   bool writes_z = writes_depth(cmdbuf);
   bool writes_s = writes_stencil(cmdbuf);

   if (dcd0_dirty) {
      struct mali_dcd_flags_0_packed dcd0;
      pan_pack(&dcd0, DCD_FLAGS_0, cfg) {
         if (needs_fs) {
            uint8_t rt_written = fs->info.outputs_written >> FRAG_RESULT_DATA0;
            uint8_t rt_mask = cmdbuf->state.gfx.render.bound_attachments &
                              MESA_VK_RP_ATTACHMENT_ANY_COLOR_BITS;

            cfg.allow_forward_pixel_to_kill =
               fs->info.fs.can_fpk && !(rt_mask & ~rt_written) &&
               !alpha_to_coverage && !cmdbuf->state.gfx.cb.info.any_dest_read;

            bool writes_zs = writes_z || writes_s;
            bool zs_always_passes = ds_test_always_passes(cmdbuf);
            bool oq = false; /* TODO: Occlusion queries */

            struct pan_earlyzs_state earlyzs =
               pan_earlyzs_get(pan_earlyzs_analyze(&fs->info), writes_zs || oq,
                               alpha_to_coverage, zs_always_passes);

            cfg.pixel_kill_operation = earlyzs.kill;
            cfg.zs_update_operation = earlyzs.update;
         } else {
            cfg.allow_forward_pixel_to_kill = true;
            cfg.allow_forward_pixel_to_be_killed = true;
            cfg.pixel_kill_operation = MALI_PIXEL_KILL_FORCE_EARLY;
            cfg.zs_update_operation = MALI_PIXEL_KILL_STRONG_EARLY;
            cfg.overdraw_alpha0 = true;
            cfg.overdraw_alpha1 = true;
         }

         cfg.front_face_ccw = rs->front_face == VK_FRONT_FACE_COUNTER_CLOCKWISE;
         cfg.cull_front_face = (rs->cull_mode & VK_CULL_MODE_FRONT_BIT) != 0;
         cfg.cull_back_face = (rs->cull_mode & VK_CULL_MODE_BACK_BIT) != 0;

         cfg.multisample_enable = dyns->ms.rasterization_samples > 1;
      }

      cs_update_vt_ctx(b)
         cs_move32_to(b, cs_sr_reg32(b, 57), dcd0.opaque[0]);
   }

   if (dcd1_dirty) {
      struct mali_dcd_flags_1_packed dcd1;
      pan_pack(&dcd1, DCD_FLAGS_1, cfg) {
         cfg.sample_mask = dyns->ms.rasterization_samples > 1
                              ? dyns->ms.sample_mask
                              : UINT16_MAX;

         if (needs_fs) {
            cfg.render_target_mask =
               (fs->info.outputs_written >> FRAG_RESULT_DATA0) &
               cmdbuf->state.gfx.render.bound_attachments;
         }
      }

      cs_update_vt_ctx(b)
         cs_move32_to(b, cs_sr_reg32(b, 58), dcd1.opaque[0]);
   }
}

static void
clear_dirty(struct panvk_cmd_buffer *cmdbuf, struct panvk_draw_info *draw)
{
   const struct panvk_shader *vs = cmdbuf->state.gfx.vs.shader;
   const struct panvk_shader *fs = cmdbuf->state.gfx.fs.shader;

   if (vs) {
      const struct vk_input_assembly_state *ia =
         &cmdbuf->vk.dynamic_graphics_state.ia;

      cmdbuf->state.gfx.vs.spds.pos =
         ia->primitive_topology == VK_PRIMITIVE_TOPOLOGY_POINT_LIST
            ? panvk_priv_mem_dev_addr(vs->spds.pos_points)
            : panvk_priv_mem_dev_addr(vs->spds.pos_triangles);
      cmdbuf->state.gfx.vs.spds.var = panvk_priv_mem_dev_addr(vs->spds.var);
   }

   cmdbuf->state.gfx.fs.spd = fs ? panvk_priv_mem_dev_addr(fs->spd) : 0;

   if (draw->index.size)
      cmdbuf->state.gfx.ib.dirty = false;

   cmdbuf->state.gfx.render.dirty = false;
   vk_dynamic_graphics_state_clear_dirty(&cmdbuf->vk.dynamic_graphics_state);
}

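/* Emit a single draw on the vertex/tiler subqueue: set up shaders,
 * descriptors, push uniforms and fixed-function state, then issue an IDVS
 * job. */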
static void
panvk_cmd_draw(struct panvk_cmd_buffer *cmdbuf, struct panvk_draw_info *draw)
{
   const struct panvk_shader *vs = cmdbuf->state.gfx.vs.shader;
   const struct panvk_shader *fs = cmdbuf->state.gfx.fs.shader;
   struct panvk_descriptor_state *desc_state = &cmdbuf->state.gfx.desc_state;
   const struct vk_rasterization_state *rs =
      &cmdbuf->vk.dynamic_graphics_state.rs;
   const struct vk_input_assembly_state *ia =
      &cmdbuf->vk.dynamic_graphics_state.ia;
   bool idvs = vs->info.vs.idvs;
   VkResult result;

   /* If there's no vertex shader, we can skip the draw. */
   if (!panvk_priv_mem_dev_addr(vs->spds.pos_points))
      return;

   /* FIXME: support non-IDVS. */
   assert(idvs);

   if (!cmdbuf->state.gfx.linked) {
      result = panvk_per_arch(link_shaders)(&cmdbuf->desc_pool, vs, fs,
                                            &cmdbuf->state.gfx.link);
      if (result != VK_SUCCESS) {
         vk_command_buffer_set_error(&cmdbuf->vk, result);
         return;
      }
      cmdbuf->state.gfx.linked = true;
   }

   result = update_tls(cmdbuf);
   if (result != VK_SUCCESS)
      return;

   bool needs_tiling = !rs->rasterizer_discard_enable;

   if (needs_tiling) {
      result = get_tiler_desc(cmdbuf);
      if (result != VK_SUCCESS)
         return;

      result = get_fb_descs(cmdbuf);
      if (result != VK_SUCCESS)
         return;
   }

   struct cs_builder *b =
      panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_VERTEX_TILER);

   uint32_t used_set_mask =
      vs->desc_info.used_set_mask | (fs ? fs->desc_info.used_set_mask : 0);

   result =
      panvk_per_arch(cmd_prepare_push_descs)(cmdbuf, desc_state, used_set_mask);
   if (result != VK_SUCCESS)
      return;

   prepare_sysvals(cmdbuf);

   result = prepare_push_uniforms(cmdbuf);
   if (result != VK_SUCCESS)
      return;

   result = prepare_vs(cmdbuf, draw);
   if (result != VK_SUCCESS)
      return;

   /* No need to setup the FS desc tables if the FS is not executed. */
   if (needs_tiling && fs_required(cmdbuf)) {
      result = prepare_fs(cmdbuf);
      if (result != VK_SUCCESS)
         return;
   }

   struct mali_primitive_flags_packed tiler_idvs_flags;
   bool writes_point_size =
      vs->info.vs.writes_point_size &&
      ia->primitive_topology == VK_PRIMITIVE_TOPOLOGY_POINT_LIST;

   pan_pack(&tiler_idvs_flags, PRIMITIVE_FLAGS, cfg) {
      cfg.draw_mode = translate_prim_topology(ia->primitive_topology);
      cfg.index_type = index_size_to_index_type(draw->index.size);

      if (writes_point_size) {
         cfg.point_size_array_format = MALI_POINT_SIZE_ARRAY_FORMAT_FP16;
         cfg.position_fifo_format = MALI_FIFO_FORMAT_EXTENDED;
      } else {
         cfg.point_size_array_format = MALI_POINT_SIZE_ARRAY_FORMAT_NONE;
         cfg.position_fifo_format = MALI_FIFO_FORMAT_BASIC;
      }

      if (vs->info.outputs_written & VARYING_BIT_LAYER) {
         cfg.layer_index_enable = true;
         cfg.position_fifo_format = MALI_FIFO_FORMAT_EXTENDED;
      }

      cfg.secondary_shader =
         vs->info.vs.secondary_enable && fs_required(cmdbuf);
      cfg.primitive_restart = ia->primitive_restart_enable;
   }

   uint32_t varying_size = 0;

   if (vs && fs) {
      unsigned vs_vars = vs->info.varyings.output_count;
      unsigned fs_vars = fs->info.varyings.input_count;
      unsigned var_slots = MAX2(vs_vars, fs_vars);

      /* Assumes 16 byte slots. We could do better. */
      varying_size = var_slots * 16;
   }

   cs_update_vt_ctx(b) {
      cs_move32_to(b, cs_sr_reg32(b, 32), draw->vertex.base);
      cs_move32_to(b, cs_sr_reg32(b, 33), draw->vertex.count);
      cs_move32_to(b, cs_sr_reg32(b, 34), draw->instance.count);
      cs_move32_to(b, cs_sr_reg32(b, 35), draw->index.offset);
      cs_move32_to(b, cs_sr_reg32(b, 36), draw->index.vertex_offset);

      /* Instance ID is assumed to be zero-based for now. See if we can
       * extend nir_lower_system_values() and the lower options to make
       * instance-ID non-zero based, or if it's fine to always return
       * zero for the instance base. */
      cs_move32_to(b, cs_sr_reg32(b, 37), 0);

      /* We don't use the resource dep system yet. */
      cs_move32_to(b, cs_sr_reg32(b, 38), 0);

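      /* Index range (in bytes) accessed by this draw, measured from the
       * start of the bound index buffer. */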
      cs_move32_to(
         b, cs_sr_reg32(b, 39),
         (draw->index.offset + draw->vertex.count) * draw->index.size);

      if (draw->index.size && cmdbuf->state.gfx.ib.dirty) {
         cs_move64_to(b, cs_sr_reg64(b, 54),
                      panvk_buffer_gpu_ptr(cmdbuf->state.gfx.ib.buffer,
                                           cmdbuf->state.gfx.ib.offset));
      }

      /* TODO: Revisit to avoid passing everything through the override flags
       * (likely needed for state preservation in secondary command buffers). */
      cs_move32_to(b, cs_sr_reg32(b, 56), 0);

      cs_move32_to(b, cs_sr_reg32(b, 48), varying_size);

      result = prepare_blend(cmdbuf);
      if (result != VK_SUCCESS)
         return;

      result = prepare_ds(cmdbuf);
      if (result != VK_SUCCESS)
         return;

      prepare_dcd(cmdbuf);
      prepare_vp(cmdbuf);
   }

   clear_dirty(cmdbuf, draw);

   cs_req_res(b, CS_IDVS_RES);
   cs_run_idvs(b, tiler_idvs_flags.opaque[0], false, true,
               cs_shader_res_sel(0, 0, 1, 0), cs_shader_res_sel(2, 2, 2, 0),
               cs_undef());
   cs_req_res(b, 0);
}

VKAPI_ATTR void VKAPI_CALL
panvk_per_arch(CmdDraw)(VkCommandBuffer commandBuffer, uint32_t vertexCount,
                        uint32_t instanceCount, uint32_t firstVertex,
                        uint32_t firstInstance)
{
   VK_FROM_HANDLE(panvk_cmd_buffer, cmdbuf, commandBuffer);

   if (instanceCount == 0 || vertexCount == 0)
      return;

   struct panvk_draw_info draw = {
      .vertex.base = firstVertex,
      .vertex.count = vertexCount,
      .instance.base = firstInstance,
      .instance.count = instanceCount,
   };

   panvk_cmd_draw(cmdbuf, &draw);
}

VKAPI_ATTR void VKAPI_CALL
panvk_per_arch(CmdDrawIndexed)(VkCommandBuffer commandBuffer,
                               uint32_t indexCount, uint32_t instanceCount,
                               uint32_t firstIndex, int32_t vertexOffset,
                               uint32_t firstInstance)
{
   VK_FROM_HANDLE(panvk_cmd_buffer, cmdbuf, commandBuffer);

   if (instanceCount == 0 || indexCount == 0)
      return;

   struct panvk_draw_info draw = {
      .index.size = cmdbuf->state.gfx.ib.index_size,
      .index.offset = firstIndex,
      .index.vertex_offset = vertexOffset,
      .vertex.count = indexCount,
      .instance.count = instanceCount,
      .instance.base = firstInstance,
   };

   panvk_cmd_draw(cmdbuf, &draw);
}

VKAPI_ATTR void VKAPI_CALL
panvk_per_arch(CmdDrawIndirect)(VkCommandBuffer commandBuffer, VkBuffer _buffer,
                                VkDeviceSize offset, uint32_t drawCount,
                                uint32_t stride)
{
   panvk_stub();
}

VKAPI_ATTR void VKAPI_CALL
panvk_per_arch(CmdDrawIndexedIndirect)(VkCommandBuffer commandBuffer,
                                       VkBuffer _buffer, VkDeviceSize offset,
                                       uint32_t drawCount, uint32_t stride)
{
   panvk_stub();
}

static void
panvk_cmd_begin_rendering_init_state(struct panvk_cmd_buffer *cmdbuf,
                                     const VkRenderingInfo *pRenderingInfo)
{
   struct panvk_device *dev = to_panvk_device(cmdbuf->vk.base.device);
   struct panvk_physical_device *phys_dev =
      to_panvk_physical_device(dev->vk.physical);
   struct pan_fb_info *fbinfo = &cmdbuf->state.gfx.render.fb.info;
   uint32_t att_width = 0, att_height = 0;

   cmdbuf->state.gfx.render.flags = pRenderingInfo->flags;

   /* Resuming from a suspended pass, the state should be unchanged. */
   if (cmdbuf->state.gfx.render.flags & VK_RENDERING_RESUMING_BIT)
      return;

   cmdbuf->state.gfx.render.dirty = true;
   memset(cmdbuf->state.gfx.render.fb.crc_valid, 0,
          sizeof(cmdbuf->state.gfx.render.fb.crc_valid));
   memset(&cmdbuf->state.gfx.render.color_attachments, 0,
          sizeof(cmdbuf->state.gfx.render.color_attachments));
   memset(&cmdbuf->state.gfx.render.z_attachment, 0,
          sizeof(cmdbuf->state.gfx.render.z_attachment));
   memset(&cmdbuf->state.gfx.render.s_attachment, 0,
          sizeof(cmdbuf->state.gfx.render.s_attachment));
   cmdbuf->state.gfx.render.bound_attachments = 0;

   cmdbuf->state.gfx.render.layer_count = pRenderingInfo->layerCount;
   *fbinfo = (struct pan_fb_info){
      .tile_buf_budget = panfrost_query_optimal_tib_size(phys_dev->model),
      .nr_samples = 1,
      .rt_count = pRenderingInfo->colorAttachmentCount,
   };

   assert(pRenderingInfo->colorAttachmentCount <= ARRAY_SIZE(fbinfo->rts));

   for (uint32_t i = 0; i < pRenderingInfo->colorAttachmentCount; i++) {
      const VkRenderingAttachmentInfo *att =
         &pRenderingInfo->pColorAttachments[i];
      VK_FROM_HANDLE(panvk_image_view, iview, att->imageView);

      if (!iview)
         continue;

      struct panvk_image *img =
         container_of(iview->vk.image, struct panvk_image, vk);
      const VkExtent3D iview_size =
         vk_image_mip_level_extent(&img->vk, iview->vk.base_mip_level);

      cmdbuf->state.gfx.render.bound_attachments |=
         MESA_VK_RP_ATTACHMENT_COLOR_BIT(i);
      cmdbuf->state.gfx.render.color_attachments.fmts[i] = iview->vk.format;
      cmdbuf->state.gfx.render.color_attachments.samples[i] = img->vk.samples;
      att_width = MAX2(iview_size.width, att_width);
      att_height = MAX2(iview_size.height, att_height);

      fbinfo->rts[i].view = &iview->pview;
      fbinfo->rts[i].crc_valid = &cmdbuf->state.gfx.render.fb.crc_valid[i];
      fbinfo->nr_samples =
         MAX2(fbinfo->nr_samples, pan_image_view_get_nr_samples(&iview->pview));

      if (att->loadOp == VK_ATTACHMENT_LOAD_OP_CLEAR) {
         enum pipe_format fmt = vk_format_to_pipe_format(iview->vk.format);
         union pipe_color_union *col =
            (union pipe_color_union *)&att->clearValue.color;

         fbinfo->rts[i].clear = true;
         pan_pack_color(phys_dev->formats.blendable, fbinfo->rts[i].clear_value,
                        col, fmt, false);
      } else if (att->loadOp == VK_ATTACHMENT_LOAD_OP_LOAD) {
         fbinfo->rts[i].preload = true;
      }

      if (att->resolveMode != VK_RESOLVE_MODE_NONE) {
         struct panvk_resolve_attachment *resolve_info =
            &cmdbuf->state.gfx.render.color_attachments.resolve[i];
         VK_FROM_HANDLE(panvk_image_view, resolve_iview, att->resolveImageView);

         resolve_info->mode = att->resolveMode;
         resolve_info->src_iview = iview;
         resolve_info->dst_iview = resolve_iview;
      }
   }

   if (pRenderingInfo->pDepthAttachment &&
       pRenderingInfo->pDepthAttachment->imageView != VK_NULL_HANDLE) {
      const VkRenderingAttachmentInfo *att = pRenderingInfo->pDepthAttachment;
      VK_FROM_HANDLE(panvk_image_view, iview, att->imageView);
      struct panvk_image *img =
         container_of(iview->vk.image, struct panvk_image, vk);
      const VkExtent3D iview_size =
         vk_image_mip_level_extent(&img->vk, iview->vk.base_mip_level);

      if (iview->vk.aspects & VK_IMAGE_ASPECT_DEPTH_BIT) {
         cmdbuf->state.gfx.render.bound_attachments |=
            MESA_VK_RP_ATTACHMENT_DEPTH_BIT;
         att_width = MAX2(iview_size.width, att_width);
         att_height = MAX2(iview_size.height, att_height);

         fbinfo->zs.view.zs = &iview->pview;
         fbinfo->nr_samples = MAX2(
            fbinfo->nr_samples, pan_image_view_get_nr_samples(&iview->pview));

         if (vk_format_has_stencil(img->vk.format))
1458             fbinfo->zs.preload.s = true;
1459 
1460          if (att->loadOp == VK_ATTACHMENT_LOAD_OP_CLEAR) {
1461             fbinfo->zs.clear.z = true;
1462             fbinfo->zs.clear_value.depth = att->clearValue.depthStencil.depth;
1463          } else if (att->loadOp == VK_ATTACHMENT_LOAD_OP_LOAD) {
1464             fbinfo->zs.preload.z = true;
1465          }
1466 
1467          if (att->resolveMode != VK_RESOLVE_MODE_NONE) {
1468             struct panvk_resolve_attachment *resolve_info =
1469                &cmdbuf->state.gfx.render.z_attachment.resolve;
1470             VK_FROM_HANDLE(panvk_image_view, resolve_iview,
1471                            att->resolveImageView);
1472 
1473             resolve_info->mode = att->resolveMode;
1474             resolve_info->src_iview = iview;
1475             resolve_info->dst_iview = resolve_iview;
1476          }
1477       }
1478    }
1479 
1480    if (pRenderingInfo->pStencilAttachment &&
1481        pRenderingInfo->pStencilAttachment->imageView != VK_NULL_HANDLE) {
1482       const VkRenderingAttachmentInfo *att = pRenderingInfo->pStencilAttachment;
1483       VK_FROM_HANDLE(panvk_image_view, iview, att->imageView);
1484       struct panvk_image *img =
1485          container_of(iview->vk.image, struct panvk_image, vk);
1486       const VkExtent3D iview_size =
1487          vk_image_mip_level_extent(&img->vk, iview->vk.base_mip_level);
1488 
1489       if (iview->vk.aspects & VK_IMAGE_ASPECT_STENCIL_BIT) {
1490          cmdbuf->state.gfx.render.bound_attachments |=
1491             MESA_VK_RP_ATTACHMENT_STENCIL_BIT;
1492          att_width = MAX2(iview_size.width, att_width);
1493          att_height = MAX2(iview_size.height, att_height);
1494 
1495          if (drm_is_afbc(img->pimage.layout.modifier)) {
1496             assert(fbinfo->zs.view.zs == &iview->pview || !fbinfo->zs.view.zs);
1497             fbinfo->zs.view.zs = &iview->pview;
1498          } else {
1499             fbinfo->zs.view.s =
1500                &iview->pview != fbinfo->zs.view.zs ? &iview->pview : NULL;
1501          }
1502 
1505          fbinfo->nr_samples = MAX2(
1506             fbinfo->nr_samples, pan_image_view_get_nr_samples(&iview->pview));
1507 
1508          if (vk_format_has_depth(img->vk.format)) {
1509             assert(fbinfo->zs.view.zs == NULL ||
1510                    &iview->pview == fbinfo->zs.view.zs);
1511             fbinfo->zs.view.zs = &iview->pview;
1512 
1513             fbinfo->zs.preload.s = false;
1514             fbinfo->zs.clear.s = false;
1515             if (!fbinfo->zs.clear.z)
1516                fbinfo->zs.preload.z = true;
1517          }
1518 
1519          if (att->loadOp == VK_ATTACHMENT_LOAD_OP_CLEAR) {
1520             fbinfo->zs.clear.s = true;
1521             fbinfo->zs.clear_value.stencil =
1522                att->clearValue.depthStencil.stencil;
1523          } else if (att->loadOp == VK_ATTACHMENT_LOAD_OP_LOAD) {
1524             fbinfo->zs.preload.s = true;
1525          }
1526 
1527          if (att->resolveMode != VK_RESOLVE_MODE_NONE) {
1528             struct panvk_resolve_attachment *resolve_info =
1529                &cmdbuf->state.gfx.render.s_attachment.resolve;
1530             VK_FROM_HANDLE(panvk_image_view, resolve_iview,
1531                            att->resolveImageView);
1532 
1533             resolve_info->mode = att->resolveMode;
1534             resolve_info->src_iview = iview;
1535             resolve_info->dst_iview = resolve_iview;
1536          }
1537       }
1538    }
1539 
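   /* If both depth and stencil are needed but share a single ZS view, promote
    * the view format to a combined depth/stencil format of the same block
    * size (Z24_UNORM_S8_UINT for 32-bit blocks, Z32_FLOAT_S8X24_UINT
    * otherwise) so one view covers both aspects. */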
1540    if (fbinfo->zs.view.zs) {
1541       const struct util_format_description *fdesc =
1542          util_format_description(fbinfo->zs.view.zs->format);
1543       bool needs_depth = fbinfo->zs.clear.z | fbinfo->zs.preload.z |
1544                          util_format_has_depth(fdesc);
1545       bool needs_stencil = fbinfo->zs.clear.s | fbinfo->zs.preload.s |
1546                            util_format_has_stencil(fdesc);
1547       enum pipe_format new_fmt =
1548          util_format_get_blocksize(fbinfo->zs.view.zs->format) == 4
1549             ? PIPE_FORMAT_Z24_UNORM_S8_UINT
1550             : PIPE_FORMAT_Z32_FLOAT_S8X24_UINT;
1551 
1552       if (needs_depth && needs_stencil &&
1553           fbinfo->zs.view.zs->format != new_fmt) {
1554          cmdbuf->state.gfx.render.zs_pview = *fbinfo->zs.view.zs;
1555          cmdbuf->state.gfx.render.zs_pview.format = new_fmt;
1556          fbinfo->zs.view.zs = &cmdbuf->state.gfx.render.zs_pview;
1557       }
1558    }
1559 
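   /* The framebuffer extent is inclusive: maxx/maxy designate the last
    * covered pixel, hence the '- 1'. */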
1560    fbinfo->extent.minx = pRenderingInfo->renderArea.offset.x;
1561    fbinfo->extent.maxx = pRenderingInfo->renderArea.offset.x +
1562                          pRenderingInfo->renderArea.extent.width - 1;
1563    fbinfo->extent.miny = pRenderingInfo->renderArea.offset.y;
1564    fbinfo->extent.maxy = pRenderingInfo->renderArea.offset.y +
1565                          pRenderingInfo->renderArea.extent.height - 1;
1566 
1567    if (cmdbuf->state.gfx.render.bound_attachments) {
1568       fbinfo->width = att_width;
1569       fbinfo->height = att_height;
1570    } else {
1571       fbinfo->width = fbinfo->extent.maxx + 1;
1572       fbinfo->height = fbinfo->extent.maxy + 1;
1573    }
1574 
1575    assert(fbinfo->width && fbinfo->height);
1576 }
1577 
1578 static void
1579 preload_render_area_border(struct panvk_cmd_buffer *cmdbuf,
1580                            const VkRenderingInfo *render_info)
1581 {
1582    struct pan_fb_info *fbinfo = &cmdbuf->state.gfx.render.fb.info;
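   /* The render area counts as aligned if it starts on a 32x32 tile boundary
    * and ends either on a tile boundary or at the framebuffer edge. */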
1583    bool render_area_is_32x32_aligned =
1584       ((fbinfo->extent.minx | fbinfo->extent.miny) % 32) == 0 &&
1585       (fbinfo->extent.maxx + 1 == fbinfo->width ||
1586        (fbinfo->extent.maxx % 32) == 31) &&
1587       (fbinfo->extent.maxy + 1 == fbinfo->height ||
1588        (fbinfo->extent.maxy % 32) == 31);
1589 
1590    /* If the render area is aligned on a 32x32 section, we're good. */
1591    if (render_area_is_32x32_aligned)
1592       return;
1593 
1594    /* We force preloading for all active attachments to preserve content falling
1595     * outside the render area, but we need to compensate with attachment clears
1596     * for attachments that were initially cleared.
1597     */
1598    uint32_t bound_atts = cmdbuf->state.gfx.render.bound_attachments;
1599    VkClearAttachment clear_atts[MAX_RTS + 2];
1600    uint32_t clear_att_count = 0;
1601 
1602    for (uint32_t i = 0; i < render_info->colorAttachmentCount; i++) {
1603       if (bound_atts & MESA_VK_RP_ATTACHMENT_COLOR_BIT(i)) {
1604          if (fbinfo->rts[i].clear) {
1605             const VkRenderingAttachmentInfo *att =
1606                &render_info->pColorAttachments[i];
1607 
1608             clear_atts[clear_att_count++] = (VkClearAttachment){
1609                .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT,
1610                .colorAttachment = i,
1611                .clearValue = att->clearValue,
1612             };
1613          }
1614 
1615          fbinfo->rts[i].preload = true;
1616          fbinfo->rts[i].clear = false;
1617       }
1618    }
1619 
1620    if (bound_atts & MESA_VK_RP_ATTACHMENT_DEPTH_BIT) {
1621       if (fbinfo->zs.clear.z) {
1622          const VkRenderingAttachmentInfo *att = render_info->pDepthAttachment;
1623 
1624          clear_atts[clear_att_count++] = (VkClearAttachment){
1625             .aspectMask = VK_IMAGE_ASPECT_DEPTH_BIT,
1626             .clearValue = att->clearValue,
1627          };
1628       }
1629 
1630       fbinfo->zs.preload.z = true;
1631       fbinfo->zs.clear.z = false;
1632    }
1633 
1634    if (bound_atts & MESA_VK_RP_ATTACHMENT_STENCIL_BIT) {
1635       if (fbinfo->zs.clear.s) {
1636          const VkRenderingAttachmentInfo *att = render_info->pStencilAttachment;
1637 
1638          clear_atts[clear_att_count++] = (VkClearAttachment){
1639             .aspectMask = VK_IMAGE_ASPECT_STENCIL_BIT,
1640             .clearValue = att->clearValue,
1641          };
1642       }
1643 
1644       fbinfo->zs.preload.s = true;
1645       fbinfo->zs.clear.s = false;
1646    }
1647 
1648    if (clear_att_count) {
1649       VkClearRect clear_rect = {
1650          .rect = render_info->renderArea,
1651          .baseArrayLayer = 0,
1652          .layerCount = render_info->layerCount,
1653       };
1654 
1655       panvk_per_arch(CmdClearAttachments)(panvk_cmd_buffer_to_handle(cmdbuf),
1656                                           clear_att_count, clear_atts, 1,
1657                                           &clear_rect);
1658    }
1659 }
1660 
1661 VKAPI_ATTR void VKAPI_CALL
1662 panvk_per_arch(CmdBeginRendering)(VkCommandBuffer commandBuffer,
1663                                   const VkRenderingInfo *pRenderingInfo)
1664 {
1665    VK_FROM_HANDLE(panvk_cmd_buffer, cmdbuf, commandBuffer);
1666    struct panvk_cmd_graphics_state *state = &cmdbuf->state.gfx;
1667 
1668    panvk_cmd_begin_rendering_init_state(cmdbuf, pRenderingInfo);
1669 
1670    bool resuming = state->render.flags & VK_RENDERING_RESUMING_BIT;
1671 
1672    /* If we're not resuming, the FBD should be NULL. */
1673    assert(!state->render.fbds.gpu || resuming);
1674 
1675    if (!resuming)
1676       preload_render_area_border(cmdbuf, pRenderingInfo);
1677 }
1678 
1679 static void
1680 resolve_attachments(struct panvk_cmd_buffer *cmdbuf)
1681 {
1682    struct pan_fb_info *fbinfo = &cmdbuf->state.gfx.render.fb.info;
1683    bool needs_resolve = false;
1684 
1685    unsigned bound_atts = cmdbuf->state.gfx.render.bound_attachments;
1686    unsigned color_att_count =
1687       util_last_bit(bound_atts & MESA_VK_RP_ATTACHMENT_ANY_COLOR_BITS);
1688    VkRenderingAttachmentInfo color_atts[MAX_RTS];
1689    for (uint32_t i = 0; i < color_att_count; i++) {
1690       const struct panvk_resolve_attachment *resolve_info =
1691          &cmdbuf->state.gfx.render.color_attachments.resolve[i];
1692 
1693       color_atts[i] = (VkRenderingAttachmentInfo){
1694          .sType = VK_STRUCTURE_TYPE_RENDERING_ATTACHMENT_INFO,
1695          .imageView = panvk_image_view_to_handle(resolve_info->src_iview),
1696          .imageLayout = VK_IMAGE_LAYOUT_GENERAL,
1697          .resolveMode = resolve_info->mode,
1698          .resolveImageView =
1699             panvk_image_view_to_handle(resolve_info->dst_iview),
1700          .resolveImageLayout = VK_IMAGE_LAYOUT_GENERAL,
1701       };
1702 
1703       if (resolve_info->mode != VK_RESOLVE_MODE_NONE)
1704          needs_resolve = true;
1705    }
1706 
1707    const struct panvk_resolve_attachment *resolve_info =
1708       &cmdbuf->state.gfx.render.z_attachment.resolve;
1709    VkRenderingAttachmentInfo z_att = {
1710       .sType = VK_STRUCTURE_TYPE_RENDERING_ATTACHMENT_INFO,
1711       .imageView = panvk_image_view_to_handle(resolve_info->src_iview),
1712       .imageLayout = VK_IMAGE_LAYOUT_GENERAL,
1713       .resolveMode = resolve_info->mode,
1714       .resolveImageView = panvk_image_view_to_handle(resolve_info->dst_iview),
1715       .resolveImageLayout = VK_IMAGE_LAYOUT_GENERAL,
1716    };
1717 
1718    if (resolve_info->mode != VK_RESOLVE_MODE_NONE)
1719       needs_resolve = true;
1720 
1721    resolve_info = &cmdbuf->state.gfx.render.s_attachment.resolve;
1722 
1723    VkRenderingAttachmentInfo s_att = {
1724       .sType = VK_STRUCTURE_TYPE_RENDERING_ATTACHMENT_INFO,
1725       .imageView = panvk_image_view_to_handle(resolve_info->src_iview),
1726       .imageLayout = VK_IMAGE_LAYOUT_GENERAL,
1727       .resolveMode = resolve_info->mode,
1728       .resolveImageView = panvk_image_view_to_handle(resolve_info->dst_iview),
1729       .resolveImageLayout = VK_IMAGE_LAYOUT_GENERAL,
1730    };
1731 
1732    if (resolve_info->mode != VK_RESOLVE_MODE_NONE)
1733       needs_resolve = true;
1734 
1735    if (!needs_resolve)
1736       return;
1737 
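   /* Hand the resolves off to vk_meta, which renders over the same render
    * area and layer count using the attachment/resolve information collected
    * above. */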
1738    const VkRenderingInfo render_info = {
1739       .sType = VK_STRUCTURE_TYPE_RENDERING_INFO,
1740       .renderArea =
1741          {
1742             .offset.x = fbinfo->extent.minx,
1743             .offset.y = fbinfo->extent.miny,
1744             .extent.width = fbinfo->extent.maxx - fbinfo->extent.minx + 1,
1745             .extent.height = fbinfo->extent.maxy - fbinfo->extent.miny + 1,
1746          },
1747       .layerCount = cmdbuf->state.gfx.render.layer_count,
1748       .viewMask = 0,
1749       .colorAttachmentCount = color_att_count,
1750       .pColorAttachments = color_atts,
1751       .pDepthAttachment = &z_att,
1752       .pStencilAttachment = &s_att,
1753    };
1754 
1755    struct panvk_device *dev = to_panvk_device(cmdbuf->vk.base.device);
1756    struct panvk_cmd_meta_graphics_save_ctx save = {0};
1757 
1758    panvk_per_arch(cmd_meta_gfx_start)(cmdbuf, &save);
1759    vk_meta_resolve_rendering(&cmdbuf->vk, &dev->meta, &render_info);
1760    panvk_per_arch(cmd_meta_gfx_end)(cmdbuf, &save);
1761 }
1762 
1763 static uint8_t
1764 prepare_fb_desc(struct panvk_cmd_buffer *cmdbuf, uint32_t layer, void *fbd)
1765 {
1766    struct panvk_device *dev = to_panvk_device(cmdbuf->vk.base.device);
1767    bool simul_use =
1768       cmdbuf->flags & VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT;
1769 
1770    if (cmdbuf->state.tls.desc.gpu) {
1771       ASSERTED unsigned num_preload_jobs =
1772          GENX(pan_preload_fb)(&dev->blitter.cache, &cmdbuf->desc_pool.base,
1773                               &cmdbuf->state.gfx.render.fb.info, layer,
1774                               cmdbuf->state.tls.desc.gpu, NULL);
1775 
1776       /* Valhall GPUs use pre-frame DCDs to preload the FB content. We
1777        * thus expect num_preload_jobs to be zero.
1778        */
1779       assert(!num_preload_jobs);
1780    }
1781 
1782    struct pan_tiler_context tiler_ctx = {
1783       .valhall.desc = !simul_use ? cmdbuf->state.gfx.render.tiler : 0,
1784    };
1785 
1786    return GENX(pan_emit_fbd)(&cmdbuf->state.gfx.render.fb.info, layer, NULL,
1787                              &tiler_ctx, fbd);
1788 }
1789 
1790 static void
1791 flush_tiling(struct panvk_cmd_buffer *cmdbuf)
1792 {
1793    if (!cmdbuf->state.gfx.render.fbds.gpu)
1794       return;
1795 
1796    struct cs_builder *b =
1797       panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_VERTEX_TILER);
1798 
1799    struct cs_index render_ctx = cs_scratch_reg64(b, 2);
1800 
1801    if (cmdbuf->state.gfx.render.tiler) {
1802       /* Flush the tiling operations and signal the internal sync object. */
1803       cs_req_res(b, CS_TILER_RES);
1804       cs_finish_tiling(b, false);
1805       cs_req_res(b, 0);
1806 
1807       struct cs_index sync_addr = cs_scratch_reg64(b, 0);
1808       struct cs_index iter_sb = cs_scratch_reg32(b, 2);
1809       struct cs_index cmp_scratch = cs_scratch_reg32(b, 3);
1810       struct cs_index add_val = cs_scratch_reg64(b, 4);
1811 
1812       cs_load_to(b, cs_scratch_reg_tuple(b, 0, 3), cs_subqueue_ctx_reg(b),
1813                  BITFIELD_MASK(3),
1814                  offsetof(struct panvk_cs_subqueue_context, syncobjs));
1815       cs_wait_slot(b, SB_ID(LS), false);
1816 
1817       /* We're relying on PANVK_SUBQUEUE_VERTEX_TILER being the first queue to
1818        * skip an ADD operation on the syncobjs pointer. */
1819       STATIC_ASSERT(PANVK_SUBQUEUE_VERTEX_TILER == 0);
1820 
1821       cs_move64_to(b, add_val, 1);
1822 
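      /* Dispatch on the current iteration scoreboard slot: the heap operation
       * and the syncobj increment are deferred until that slot's operations
       * have completed, then we advance to the next slot. */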
1823       cs_match(b, iter_sb, cmp_scratch) {
1824 #define CASE(x)                                                                \
1825          cs_case(b, x) {                                                       \
1826             cs_heap_operation(b,                                               \
1827                               MALI_CS_HEAP_OPERATION_VERTEX_TILER_COMPLETED,   \
1828                               cs_defer(SB_WAIT_ITER(x),                        \
1829                                        SB_ID(DEFERRED_SYNC)));                 \
1830             cs_sync64_add(b, true, MALI_CS_SYNC_SCOPE_CSG,                     \
1831                           add_val, sync_addr,                                  \
1832                           cs_defer(SB_WAIT_ITER(x), SB_ID(DEFERRED_SYNC)));    \
1833             cs_move32_to(b, iter_sb, next_iter_sb(x));                         \
1834          }
1835 
1836          CASE(0)
1837          CASE(1)
1838          CASE(2)
1839          CASE(3)
1840          CASE(4)
1841 #undef CASE
1842       }
1843 
1844       cs_store32(b, iter_sb, cs_subqueue_ctx_reg(b),
1845                  offsetof(struct panvk_cs_subqueue_context, iter_sb));
1846       cs_wait_slot(b, SB_ID(LS), false);
1847 
1848       /* Update the vertex seqno. */
1849       ++cmdbuf->state.cs[PANVK_SUBQUEUE_VERTEX_TILER].relative_sync_point;
1850    } else {
1851       cs_load64_to(b, render_ctx, cs_subqueue_ctx_reg(b),
1852                    offsetof(struct panvk_cs_subqueue_context, render));
1853       cs_wait_slot(b, SB_ID(LS), false);
1854    }
1855 }
1856 
1857 static void
1858 wait_finish_tiling(struct panvk_cmd_buffer *cmdbuf)
1859 {
1860    if (!cmdbuf->state.gfx.render.tiler)
1861       return;
1862 
1863    struct cs_builder *b = panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_FRAGMENT);
1864    struct cs_index vt_sync_addr = cs_scratch_reg64(b, 0);
1865    struct cs_index vt_sync_point = cs_scratch_reg64(b, 2);
1866    uint64_t rel_vt_sync_point =
1867       cmdbuf->state.cs[PANVK_SUBQUEUE_VERTEX_TILER].relative_sync_point;
1868 
1869    cs_load64_to(b, vt_sync_addr, cs_subqueue_ctx_reg(b),
1870                 offsetof(struct panvk_cs_subqueue_context, syncobjs));
1871    cs_wait_slot(b, SB_ID(LS), false);
1872 
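   /* The absolute sync point is the subqueue's current progress seqno plus
    * the sync points accumulated by this command buffer. */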
1873    cs_add64(b, vt_sync_point,
1874             cs_progress_seqno_reg(b, PANVK_SUBQUEUE_VERTEX_TILER),
1875             rel_vt_sync_point);
1876    cs_sync64_wait(b, false, MALI_CS_CONDITION_GREATER, vt_sync_point,
1877                   vt_sync_addr);
1878 }
1879 
1880 static void
1881 issue_fragment_jobs(struct panvk_cmd_buffer *cmdbuf)
1882 {
1883    if (!cmdbuf->state.gfx.render.fbds.gpu)
1884       return;
1885 
1886    struct panvk_device *dev = to_panvk_device(cmdbuf->vk.base.device);
1887    struct pan_fb_info *fbinfo = &cmdbuf->state.gfx.render.fb.info;
1888    struct cs_builder *b = panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_FRAGMENT);
1889 
1890    /* Wait for the tiling to be done before submitting the fragment job. */
1891    wait_finish_tiling(cmdbuf);
1892 
1893    /* Reserve a scoreboard for the fragment job. */
1894    panvk_per_arch(cs_pick_iter_sb)(cmdbuf, PANVK_SUBQUEUE_FRAGMENT);
1895 
1896    /* Now initialize the fragment bits. */
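   /* Registers 42/43 hold the render area bounding box, packed as
    * (miny << 16) | minx and (maxy << 16) | maxx. */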
1897    cs_update_frag_ctx(b) {
1898       cs_move32_to(b, cs_sr_reg32(b, 42),
1899                    (fbinfo->extent.miny << 16) | fbinfo->extent.minx);
1900       cs_move32_to(b, cs_sr_reg32(b, 43),
1901                    (fbinfo->extent.maxy << 16) | fbinfo->extent.maxx);
1902    }
1903 
1904    fbinfo->sample_positions =
1905       dev->sample_positions->addr.dev +
1906       panfrost_sample_positions_offset(pan_sample_pattern(fbinfo->nr_samples));
1907 
1908    bool simul_use =
1909       cmdbuf->flags & VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT;
1910 
1911    /* The only bit we patch in FBDs is the tiler pointer. If tiler is not
1912     * involved (clear job) or if the update can happen in place (not
1913     * simultaneous use of the command buffer), we can avoid the
1914     * copy. */
1915    bool copy_fbds = simul_use && cmdbuf->state.gfx.render.tiler;
1916    uint32_t fbd_sz = calc_fbd_size(cmdbuf);
1917    struct panfrost_ptr fbds = cmdbuf->state.gfx.render.fbds;
1918    uint8_t fbd_flags = 0;
1919 
1920    /* We prepare all FB descriptors upfront. */
1921    for (uint32_t i = 0; i < cmdbuf->state.gfx.render.layer_count; i++) {
1922       uint32_t new_fbd_flags =
1923          prepare_fb_desc(cmdbuf, i, fbds.cpu + (fbd_sz * i));
1924 
1925       /* Make sure all FBDs have the same flags. */
1926       assert(i == 0 || new_fbd_flags == fbd_flags);
1927       fbd_flags = new_fbd_flags;
1928    }
1929 
1930    struct cs_index layer_count = cs_sr_reg32(b, 47);
1931    struct cs_index fbd_ptr = cs_sr_reg64(b, 48);
1932    struct cs_index tiler_ptr = cs_sr_reg64(b, 50);
1933    struct cs_index src_fbd_ptr = cs_undef();
1934 
1935    if (copy_fbds) {
1936       src_fbd_ptr = cs_sr_reg64(b, 52);
1937 
1938       cs_move32_to(b, layer_count, cmdbuf->state.gfx.render.layer_count);
1939       cs_load64_to(
1940          b, tiler_ptr, cs_subqueue_ctx_reg(b),
1941          offsetof(struct panvk_cs_subqueue_context, render.desc_ringbuf.ptr));
1942       cs_wait_slot(b, SB_ID(LS), false);
1943 
1944       cs_add64(b, fbd_ptr, tiler_ptr, pan_size(TILER_CONTEXT));
1945       cs_move64_to(b, src_fbd_ptr, fbds.gpu);
1946    } else if (cmdbuf->state.gfx.render.tiler) {
1947       cs_move64_to(b, fbd_ptr, fbds.gpu);
1948       cs_move64_to(b, tiler_ptr, cmdbuf->state.gfx.render.tiler);
1949    }
1950 
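   /* Emit one fragment job per layer, advancing the FBD pointer by one
    * descriptor each iteration. */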
1951    cs_move32_to(b, layer_count, cmdbuf->state.gfx.render.layer_count);
1952    cs_while(b, MALI_CS_CONDITION_GREATER, layer_count) {
1953       if (copy_fbds) {
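         /* Copy the FBD into the descriptor ring buffer, 64 bytes (16 scratch
          * registers) at a time. */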
1954          for (uint32_t fbd_off = 0; fbd_off < fbd_sz; fbd_off += 64) {
1955             cs_load_to(b, cs_scratch_reg_tuple(b, 0, 16), src_fbd_ptr,
1956                        BITFIELD_MASK(16), fbd_off);
1957             cs_wait_slot(b, SB_ID(LS), false);
1958             cs_store(b, cs_scratch_reg_tuple(b, 0, 16), fbd_ptr,
1959                      BITFIELD_MASK(16), fbd_off);
1960             cs_wait_slot(b, SB_ID(LS), false);
1961          }
1962 
1963          cs_add64(b, src_fbd_ptr, src_fbd_ptr, fbd_sz);
1964       }
1965 
1966       if (cmdbuf->state.gfx.render.tiler) {
1967          cs_store64(b, tiler_ptr, fbd_ptr, 56);
1968          cs_wait_slot(b, SB_ID(LS), false);
1969       }
1970 
1971       cs_update_frag_ctx(b)
1972          cs_add64(b, cs_sr_reg64(b, 40), fbd_ptr, fbd_flags);
1973 
1974       cs_req_res(b, CS_FRAG_RES);
1975       cs_run_fragment(b, false, MALI_TILE_RENDER_ORDER_Z_ORDER, false);
1976       cs_req_res(b, 0);
1977       cs_add64(b, fbd_ptr, fbd_ptr, fbd_sz);
1978       cs_add32(b, layer_count, layer_count, -1);
1979    }
1980 
1981    struct cs_index sync_addr = cs_scratch_reg64(b, 0);
1982    struct cs_index iter_sb = cs_scratch_reg32(b, 2);
1983    struct cs_index cmp_scratch = cs_scratch_reg32(b, 3);
1984    struct cs_index add_val = cs_scratch_reg64(b, 4);
1985    struct cs_index release_sz = cs_scratch_reg32(b, 5);
1986    struct cs_index ringbuf_sync_addr = cs_scratch_reg64(b, 6);
1987    struct cs_index completed = cs_scratch_reg_tuple(b, 10, 4);
1988    struct cs_index completed_top = cs_scratch_reg64(b, 10);
1989    struct cs_index completed_bottom = cs_scratch_reg64(b, 12);
1990 
1991    cs_move64_to(b, add_val, 1);
1992    cs_load_to(b, cs_scratch_reg_tuple(b, 0, 3), cs_subqueue_ctx_reg(b),
1993               BITFIELD_MASK(3),
1994               offsetof(struct panvk_cs_subqueue_context, syncobjs));
1995 
1996    if (copy_fbds) {
1997       cs_move32_to(b, release_sz, calc_render_descs_size(cmdbuf));
1998       cs_load64_to(b, ringbuf_sync_addr, cs_subqueue_ctx_reg(b),
1999                    offsetof(struct panvk_cs_subqueue_context,
2000                             render.desc_ringbuf.syncobj));
2001    }
2002 
2003    if (cmdbuf->state.gfx.render.tiler)
2004       cs_load_to(b, completed, tiler_ptr, BITFIELD_MASK(4), 40);
2005 
2006    cs_wait_slot(b, SB_ID(LS), false);
2007 
2008    cs_add64(b, sync_addr, sync_addr,
2009             PANVK_SUBQUEUE_FRAGMENT * sizeof(struct panvk_cs_sync64));
2010 
2011    cs_match(b, iter_sb, cmp_scratch) {
2012 #define CASE(x)                                                                \
2013       cs_case(b, x) {                                                          \
2014          if (cmdbuf->state.gfx.render.tiler) {                                 \
2015             cs_finish_fragment(b, true, completed_top, completed_bottom,       \
2016                                cs_defer(SB_WAIT_ITER(x),                       \
2017                                         SB_ID(DEFERRED_SYNC)));                \
2018          }                                                                     \
2019          if (copy_fbds) {                                                      \
2020             cs_sync32_add(b, true, MALI_CS_SYNC_SCOPE_CSG,                     \
2021                           release_sz, ringbuf_sync_addr,                       \
2022                           cs_defer(SB_WAIT_ITER(x), SB_ID(DEFERRED_SYNC)));    \
2023          }                                                                     \
2024          cs_sync64_add(b, true, MALI_CS_SYNC_SCOPE_CSG,                        \
2025                        add_val, sync_addr,                                     \
2026                        cs_defer(SB_WAIT_ITER(x), SB_ID(DEFERRED_SYNC)));       \
2027          cs_move32_to(b, iter_sb, next_iter_sb(x));                            \
2028       }
2029 
2030       CASE(0)
2031       CASE(1)
2032       CASE(2)
2033       CASE(3)
2034       CASE(4)
2035 #undef CASE
2036    }
2037 
2038    cs_store32(b, iter_sb, cs_subqueue_ctx_reg(b),
2039               offsetof(struct panvk_cs_subqueue_context, iter_sb));
2040    cs_wait_slot(b, SB_ID(LS), false);
2041 
2042    /* Update the ring buffer position. */
2043    if (copy_fbds)
2044       cs_render_desc_ringbuf_move_ptr(b, calc_render_descs_size(cmdbuf));
2045 
2046    /* Update the frag seqno. */
2047    ++cmdbuf->state.cs[PANVK_SUBQUEUE_FRAGMENT].relative_sync_point;
2048 
2049    memset(&cmdbuf->state.gfx.render.fbds, 0,
2050           sizeof(cmdbuf->state.gfx.render.fbds));
2051    cmdbuf->state.gfx.render.tiler = 0;
2052 }
2053 
2054 void
2055 panvk_per_arch(cmd_flush_draws)(struct panvk_cmd_buffer *cmdbuf)
2056 {
2057    /* If there was no draw queued, we don't need to force a preload. */
2058    if (!cmdbuf->state.gfx.render.fbds.gpu)
2059       return;
2060 
2061    flush_tiling(cmdbuf);
2062    issue_fragment_jobs(cmdbuf);
2063    force_fb_preload(cmdbuf);
2064    memset(&cmdbuf->state.gfx.render.fbds, 0,
2065           sizeof(cmdbuf->state.gfx.render.fbds));
2066    cmdbuf->state.gfx.render.tiler = 0;
2067 }
2068 
2069 VKAPI_ATTR void VKAPI_CALL
2070 panvk_per_arch(CmdEndRendering)(VkCommandBuffer commandBuffer)
2071 {
2072    VK_FROM_HANDLE(panvk_cmd_buffer, cmdbuf, commandBuffer);
2073 
2074    if (!(cmdbuf->state.gfx.render.flags & VK_RENDERING_SUSPENDING_BIT)) {
2075       struct pan_fb_info *fbinfo = &cmdbuf->state.gfx.render.fb.info;
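      /* Even if no draw was recorded, pending clears still need FB
       * descriptors so a fragment job can be issued to perform them. */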
2076       bool clear = fbinfo->zs.clear.z | fbinfo->zs.clear.s;
2077       for (unsigned i = 0; i < fbinfo->rt_count; i++)
2078          clear |= fbinfo->rts[i].clear;
2079 
2080       if (clear) {
2081          VkResult result = get_fb_descs(cmdbuf);
2082          if (result != VK_SUCCESS)
2083             return;
2084       }
2085 
2086       flush_tiling(cmdbuf);
2087       issue_fragment_jobs(cmdbuf);
2088       resolve_attachments(cmdbuf);
2089    }
2090 }
2091 
2092 VKAPI_ATTR void VKAPI_CALL
2093 panvk_per_arch(CmdBindVertexBuffers)(VkCommandBuffer commandBuffer,
2094                                      uint32_t firstBinding,
2095                                      uint32_t bindingCount,
2096                                      const VkBuffer *pBuffers,
2097                                      const VkDeviceSize *pOffsets)
2098 {
2099    VK_FROM_HANDLE(panvk_cmd_buffer, cmdbuf, commandBuffer);
2100 
2101    assert(firstBinding + bindingCount <= MAX_VBS);
2102 
2103    for (uint32_t i = 0; i < bindingCount; i++) {
2104       VK_FROM_HANDLE(panvk_buffer, buffer, pBuffers[i]);
2105 
2106       cmdbuf->state.gfx.vb.bufs[firstBinding + i].address =
2107          panvk_buffer_gpu_ptr(buffer, pOffsets[i]);
2108       cmdbuf->state.gfx.vb.bufs[firstBinding + i].size =
2109          panvk_buffer_range(buffer, pOffsets[i], VK_WHOLE_SIZE);
2110    }
2111 
2112    cmdbuf->state.gfx.vb.count =
2113       MAX2(cmdbuf->state.gfx.vb.count, firstBinding + bindingCount);
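   /* The vertex attribute descriptors emitted in the VS driver set are
    * derived from the bound vertex buffers, so drop the cached set to force
    * it to be re-emitted with the new bindings. */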
2114    memset(&cmdbuf->state.gfx.vs.desc.driver_set, 0,
2115           sizeof(cmdbuf->state.gfx.vs.desc.driver_set));
2116 }
2117 
2118 VKAPI_ATTR void VKAPI_CALL
2119 panvk_per_arch(CmdBindIndexBuffer)(VkCommandBuffer commandBuffer,
2120                                    VkBuffer buffer, VkDeviceSize offset,
2121                                    VkIndexType indexType)
2122 {
2123    VK_FROM_HANDLE(panvk_cmd_buffer, cmdbuf, commandBuffer);
2124    VK_FROM_HANDLE(panvk_buffer, buf, buffer);
2125 
2126    cmdbuf->state.gfx.ib.buffer = buf;
2127    cmdbuf->state.gfx.ib.offset = offset;
2128    cmdbuf->state.gfx.ib.index_size = vk_index_type_to_bytes(indexType);
2129    cmdbuf->state.gfx.ib.dirty = true;
2130 }
2131