/*
 * Copyright 2021 Alyssa Rosenzweig
 * Copyright 2019-2021 Collabora, Ltd.
 * SPDX-License-Identifier: MIT
 */

#pragma once

#include <xf86drm.h>
#include "asahi/compiler/agx_compile.h"
#include "asahi/genxml/agx_pack.h"
#include "asahi/layout/layout.h"
#include "asahi/lib/agx_bo.h"
#include "asahi/lib/agx_device.h"
#include "asahi/lib/agx_linker.h"
#include "asahi/lib/agx_nir_lower_vbo.h"
#include "asahi/lib/agx_scratch.h"
#include "asahi/lib/agx_tilebuffer.h"
#include "asahi/lib/agx_uvs.h"
#include "asahi/lib/pool.h"
#include "asahi/lib/shaders/geometry.h"
#include "asahi/lib/unstable_asahi_drm.h"
#include "compiler/nir/nir_lower_blend.h"
#include "compiler/shader_enums.h"
#include "gallium/auxiliary/util/u_blitter.h"
#include "gallium/include/pipe/p_context.h"
#include "gallium/include/pipe/p_screen.h"
#include "gallium/include/pipe/p_state.h"
#include "pipe/p_defines.h"
#include "util/bitset.h"
#include "util/disk_cache.h"
#include "util/hash_table.h"
#include "util/rwlock.h"
#include "util/u_range.h"
#include "agx_bg_eot.h"
#include "agx_helpers.h"
#include "agx_nir_passes.h"

#ifdef __GLIBC__
#include <errno.h>
#define agx_msg(fmt, ...)                                                      \
   fprintf(stderr, "[%s] " fmt, program_invocation_short_name, ##__VA_ARGS__)
#else
#define agx_msg(...) fprintf(stderr, __VA_ARGS__)
#endif

#define AGX_NUM_TEXTURE_STATE_REGS 16

struct agx_streamout_target {
   struct pipe_stream_output_target base;
   struct pipe_resource *offset;

   /* Current stride (bytes per vertex) */
   uint32_t stride;
};

static inline struct agx_streamout_target *
agx_so_target(struct pipe_stream_output_target *target)
{
   return (struct agx_streamout_target *)target;
}

struct agx_streamout {
   struct pipe_stream_output_target *targets[PIPE_MAX_SO_BUFFERS];
   unsigned num_targets;
};

/* Shaders can access fixed-function state through system values.
 * It is convenient to stash all of this information into a single "root"
 * descriptor, then push individual parts as needed.
 *
 * In the future, we could optimize this to reduce CPU overhead, e.g. splitting
 * into multiple descriptors for finer dirty tracking. This is not ABI with the
 * compiler. The layout is up to us and handled by our code lowering system
 * values to uniforms.
 */
enum agx_sysval_table {
   AGX_SYSVAL_TABLE_ROOT,
   AGX_SYSVAL_TABLE_PARAMS,
   AGX_SYSVAL_TABLE_GRID,
   AGX_SYSVAL_TABLE_VS,
   AGX_SYSVAL_TABLE_TCS,
   AGX_SYSVAL_TABLE_TES,
   AGX_SYSVAL_TABLE_GS,
   AGX_SYSVAL_TABLE_FS,
   AGX_SYSVAL_TABLE_CS,
   AGX_NUM_SYSVAL_TABLES
};

#define AGX_SYSVAL_STAGE(stage) (AGX_SYSVAL_TABLE_VS + (stage))

static_assert(AGX_SYSVAL_STAGE(PIPE_SHADER_VERTEX) == AGX_SYSVAL_TABLE_VS,
              "fixed enum orderings");
static_assert(AGX_SYSVAL_STAGE(PIPE_SHADER_TESS_CTRL) == AGX_SYSVAL_TABLE_TCS,
              "fixed enum orderings");
static_assert(AGX_SYSVAL_STAGE(PIPE_SHADER_TESS_EVAL) == AGX_SYSVAL_TABLE_TES,
              "fixed enum orderings");
static_assert(AGX_SYSVAL_STAGE(PIPE_SHADER_GEOMETRY) == AGX_SYSVAL_TABLE_GS,
              "fixed enum orderings");
static_assert(AGX_SYSVAL_STAGE(PIPE_SHADER_FRAGMENT) == AGX_SYSVAL_TABLE_FS,
              "fixed enum orderings");
static_assert(AGX_SYSVAL_STAGE(PIPE_SHADER_COMPUTE) == AGX_SYSVAL_TABLE_CS,
              "fixed enum orderings");
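
/* Illustrative note (not part of this header): the per-stage sysval table for
 * a given pipe shader stage is reached by indexing agx_draw_uniforms::tables
 * (defined below) with this macro; "batch" here is a hypothetical agx_batch.
 *
 *    uint64_t vs_table =
 *       batch->uniforms.tables[AGX_SYSVAL_STAGE(PIPE_SHADER_VERTEX)];
 */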

/* Root system value table */
struct PACKED agx_draw_uniforms {
   /* Pointers to the system value tables themselves (for indirection) */
   uint64_t tables[AGX_NUM_SYSVAL_TABLES];

   /* Vertex buffer object bases, if present. If vertex robustness is disabled,
    * attrib_base maps VBOs directly and attrib_clamp is undefined. If vertex
    * robustness is enabled, attrib_base maps attributes and attrib_clamp is an
    * inclusive clamp on vertex/divided instance indices.
    */
   uint64_t attrib_base[PIPE_MAX_ATTRIBS];
   uint32_t attrib_clamp[PIPE_MAX_ATTRIBS];

   /* Addresses for the results of pipeline statistics queries */
   uint64_t pipeline_statistics[PIPE_STAT_QUERY_MS_INVOCATIONS];

   /* Pointer to base address of the VS->TCS, VS->GS, or TES->GS buffer.
    * Indirected so it can be written to in an indirect setup kernel. G13
    * appears to prefetch uniforms across dispatches, but does not pre-run
    * preambles, so this indirection saves us from splitting the batch.
    */
   uint64_t vertex_output_buffer_ptr;

   /* Mask of outputs flowing VS->TCS, VS->GS, or TES->GS. */
   uint64_t vertex_outputs;

   /* Address of input assembly buffer if geom/tess is used, else 0 */
   uint64_t input_assembly;

   /* Address of tessellation param buffer if tessellation is used, else 0 */
   uint64_t tess_params;

   /* Address of geometry param buffer if geometry shaders are used, else 0 */
   uint64_t geometry_params;

   /* Address of polygon stipple mask if used */
   uint64_t polygon_stipple;

   /* Blend constant if any */
   float blend_constant[4];

   /* glPointSize value */
   float fixed_point_size;

   /* Value of the multisample control register, containing sample positions in
    * each byte (x in low nibble, y in high nibble).
    */
   uint32_t ppp_multisamplectl;

   /* gl_DrawID for a direct multidraw */
   uint32_t draw_id;

   /* Sprite coord replacement mask */
   uint16_t sprite_mask;

   /* glSampleMask */
   uint16_t sample_mask;

   /* Nonzero for indexed draws, zero otherwise */
   uint16_t is_indexed_draw;

   /* Zero for [0, 1] clipping, 0.5 for [-1, 1] clipping. */
   uint16_t clip_z_coeff;

   /* ~0/0 boolean: whether the epilog lacks any discard instruction */
   uint16_t no_epilog_discard;

   /* Provoking vertex: 0, 1, 2 */
   uint16_t provoking_vertex;

   /* Mapping from varying slots written by the last vertex stage to UVS
    * indices. This mapping must be compatible with the fragment shader.
    */
   uint16_t uvs_index[VARYING_SLOT_MAX];
};

struct PACKED agx_stage_uniforms {
   /* Pointer to binding table for texture descriptor, or 0 if none. This must
    * be first so that u0_u1 is always available for lowering binding
    * tables to bindless access.
    */
   uint64_t texture_base;

   /* Uniform buffer objects */
   uint64_t ubo_base[PIPE_MAX_CONSTANT_BUFFERS];
   uint32_t ubo_size[PIPE_MAX_CONSTANT_BUFFERS];

   /* Shader storage buffer objects */
   uint64_t ssbo_base[PIPE_MAX_SHADER_BUFFERS];
   uint32_t ssbo_size[PIPE_MAX_SHADER_BUFFERS];

   /* If lowered to bindless, sampler index in the heap */
   uint16_t sampler_handle[PIPE_MAX_SAMPLERS];

   /* LOD bias as float16 */
   uint16_t lod_bias[PIPE_MAX_SAMPLERS];
};

/* In the architecture, there are 512 uniform registers, each 16 bits. In a
 * theoretical worst case, we could push to all of them. We use this worst-case
 * maximum because the expression for a tight upper bound is too messy and too
 * easy to let fall out of sync with the code.
 */
#define AGX_MAX_PUSH_RANGES (512)

struct agx_push_range {
   /* Base 16-bit uniform to push to */
   uint16_t uniform;

   /* Offset into the table to push in bytes */
   uint16_t offset;

   /* Which table to push from */
   uint8_t table;

   /* Number of consecutive 16-bit uniforms to push */
   uint8_t length;
};
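
/* Illustrative sketch (not part of this header): how a compiled shader's push
 * ranges (see struct agx_compiled_shader below) could be resolved at draw
 * time. Each range names a sysval table, a byte offset into that table, and a
 * base 16-bit uniform register; emit_push() is a hypothetical stand-in for the
 * real uniform packing code.
 *
 *    for (unsigned i = 0; i < cs->push_range_count; ++i) {
 *       const struct agx_push_range *r = &cs->push[i];
 *       uint64_t table = batch->uniforms.tables[r->table];
 *
 *       emit_push(batch, r->uniform, table + r->offset, r->length);
 *    }
 */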

struct agx_compiled_shader {
   /* Base struct */
   struct agx_shader_part b;

   /* Uncompiled shader that we belong to */
   const struct agx_uncompiled_shader *so;

   /* Mapped executable memory */
   struct agx_bo *bo;

   /* Uniforms the driver must push */
   unsigned push_range_count;
   struct agx_push_range push[AGX_MAX_PUSH_RANGES];

   /* UVS layout for the last vertex stage */
   struct agx_unlinked_uvs_layout uvs;

   /* For a vertex shader, the mask of vertex attributes read. Used to key the
    * prolog so the prolog doesn't write components not actually read.
    */
   BITSET_DECLARE(attrib_components_read, AGX_MAX_ATTRIBS * 4);

   struct agx_fs_epilog_link_info epilog_key;

   /* Auxiliary programs, or NULL if not used */
   struct agx_compiled_shader *gs_count, *pre_gs;
   struct agx_compiled_shader *gs_copy;

   /* Output primitive mode for geometry shaders */
   enum mesa_prim gs_output_mode;

   /* Number of words per primitive in the count buffer */
   unsigned gs_count_words;

   /* Logical shader stage used for descriptor access. This may differ from the
    * physical shader stage of the compiled shader, for example when executing a
    * tessellation eval shader as a vertex shader.
    */
   enum pipe_shader_type stage;
};

struct agx_fast_link_key {
   union {
      struct agx_vs_prolog_key vs;
      struct agx_fs_prolog_key fs;
   } prolog;

   struct agx_compiled_shader *main;

   union {
      struct agx_fs_epilog_key fs;
   } epilog;

   unsigned nr_samples_shaded;
};

struct agx_uncompiled_shader {
   struct pipe_shader_state base;
   enum pipe_shader_type type;
   struct blob early_serialized_nir;
   struct blob serialized_nir;
   uint8_t nir_sha1[20];

   struct {
      uint64_t inputs_flat_shaded;
      uint64_t inputs_linear_shaded;
      uint8_t cull_distance_size;
      bool has_edgeflags;
      bool uses_fbfetch;

      /* Number of bindful textures, images used */
      unsigned nr_bindful_textures, nr_bindful_images;
   } info;

   struct hash_table *variants;
   struct agx_uncompiled_shader *passthrough_progs[MESA_PRIM_COUNT][3][2];
   struct agx_uncompiled_shader *passthrough_tcs[32];

   /* agx_fast_link_key -> agx_linked_shader */
   struct hash_table *linked_shaders;

   uint32_t xfb_strides[4];
   bool has_xfb_info;
   bool is_xfb_passthrough;

   enum mesa_prim gs_mode;

   /* Whether the shader accesses indexed samplers via the bindless heap */
   bool uses_bindless_samplers;

   /* Set on VS, passed to FS for linkage */
   unsigned base_varying;

   /* Tessellation info */
   struct {
      uint64_t per_vertex_outputs;
      uint32_t output_stride;
      enum gl_tess_spacing spacing;
      enum tess_primitive_mode primitive;
      uint8_t output_patch_size;
      uint8_t nr_patch_outputs;
      bool ccw;
      bool point_mode;
   } tess;
};

enum agx_stage_dirty {
   AGX_STAGE_DIRTY_CONST = BITFIELD_BIT(0),
   AGX_STAGE_DIRTY_SSBO = BITFIELD_BIT(1),
   AGX_STAGE_DIRTY_IMAGE = BITFIELD_BIT(2),
   AGX_STAGE_DIRTY_SAMPLER = BITFIELD_BIT(3),
};

struct agx_stage {
   struct agx_uncompiled_shader *shader;
   uint32_t dirty;

   struct pipe_constant_buffer cb[PIPE_MAX_CONSTANT_BUFFERS];
   uint32_t cb_mask;

   struct pipe_shader_buffer ssbo[PIPE_MAX_SHADER_BUFFERS];
   uint32_t ssbo_writable_mask;
   uint32_t ssbo_mask;

   struct pipe_image_view images[PIPE_MAX_SHADER_IMAGES];
   uint32_t image_mask;

   /* Need full CSOs for u_blitter */
   struct agx_sampler_state *samplers[PIPE_MAX_SAMPLERS];
   struct agx_sampler_view *textures[PIPE_MAX_SHADER_SAMPLER_VIEWS];

   /* Does any bound sampler require custom border colours? */
   bool custom_borders;

   unsigned sampler_count, texture_count;
   uint32_t valid_samplers;
};

union agx_batch_result {
   struct drm_asahi_result_render render;
   struct drm_asahi_result_compute compute;
};

/* This is a firmware limit. It should be possible to raise to 2048 in the
 * future... still not good enough for VK though :-(
 */
#define AGX_SAMPLER_HEAP_SIZE (1024)

struct agx_sampler_heap {
   struct agx_bo *bo;
   uint16_t count;
};

uint16_t agx_sampler_heap_add(struct agx_device *dev,
                              struct agx_sampler_heap *heap,
                              struct agx_sampler_packed *sampler);
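
/* Illustrative sketch (not part of this header): when a shader accesses
 * samplers bindlessly, each bound sampler descriptor is added to the batch's
 * heap and the returned index is pushed through
 * agx_stage_uniforms::sampler_handle. The surrounding code below is
 * hypothetical; only agx_sampler_heap_add is real.
 *
 *    struct agx_sampler_state *st = ctx->stage[stage].samplers[i];
 *    uint16_t idx =
 *       agx_sampler_heap_add(dev, &batch->sampler_heap, &st->desc);
 *    batch->stage_uniforms[stage].sampler_handle[i] = idx;
 */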

struct agx_encoder {
   struct agx_bo *bo;
   uint8_t *current;
   uint8_t *end;
};

struct agx_batch {
   struct agx_context *ctx;
   struct pipe_framebuffer_state key;
   uint64_t seqnum;
   uint32_t syncobj;
   uint32_t draws;

   struct agx_tilebuffer_layout tilebuffer_layout;

   /* PIPE_CLEAR_* bitmask */
   uint32_t clear, draw, load, resolve;
   bool initialized;

   uint64_t uploaded_clear_color[PIPE_MAX_COLOR_BUFS];
   double clear_depth;
   unsigned clear_stencil;

   /* Whether we're drawing points, lines, or triangles */
   enum mesa_prim reduced_prim;

   /* Whether the bound FS needs a primitive ID that is not supplied by the
    * bound hardware VS (software GS)
    */
   bool generate_primitive_id;

   /* Current varyings linkage structures */
   uint32_t varyings;
   struct agx_varyings_vs linked_varyings;

   struct agx_draw_uniforms uniforms;
   struct agx_stage_uniforms stage_uniforms[PIPE_SHADER_TYPES];

   /* Indirect buffer allocated for geometry shader */
   uint64_t geom_indirect;
   struct agx_bo *geom_indirect_bo;

   /* Geometry state buffer if geometry/etc shaders are used */
   uint64_t geometry_state;

   /* Uploaded descriptors */
   uint32_t texture_count[PIPE_SHADER_TYPES];

   uint64_t samplers[PIPE_SHADER_TYPES];
   uint32_t sampler_count[PIPE_SHADER_TYPES];

   struct agx_sampler_heap sampler_heap;

   /* Resource list requirements, represented as a bit set indexed by BO
    * handles (GEM handles on Linux, or IOGPU's equivalent on macOS)
    */
   struct {
      BITSET_WORD *set;
      unsigned bit_count;
   } bo_list;

   /* If true, this batch contains a shader with a potentially incoherent write
    * (e.g. image_write), needing a barrier later to access.
    */
   bool incoherent_writes;

   struct agx_pool pool, pipeline_pool;

   /* We may enqueue both CDM and VDM work, possibly to the same batch for
    * geometry/tessellation.
    */
   struct agx_encoder vdm;
   struct agx_encoder cdm;

   /* Scissor and depth-bias descriptors, uploaded at GPU time */
   struct util_dynarray scissor, depth_bias;

   /* Arrays of GPU pointers that should be written with the batch timestamps */
   struct util_dynarray timestamps;

   /* Result buffer where the kernel places command execution information */
   union agx_batch_result *result;
   size_t result_off;

   /* Actual pointer in a uniform */
   struct agx_bo *geom_params_bo;

   /* Whether each stage uses scratch */
   bool vs_scratch;
   bool fs_scratch;
   bool cs_scratch;

   /* Whether each stage has preambles using scratch, and if so which bucket.
    * This just needs to be zero/nonzero for correctness, the magnitude in
    * buckets is for statistics.
    */
   unsigned vs_preamble_scratch;
   unsigned fs_preamble_scratch;
   unsigned cs_preamble_scratch;
};

struct agx_zsa {
   struct pipe_depth_stencil_alpha_state base;
   struct agx_fragment_face_packed depth;
   struct agx_fragment_stencil_packed front_stencil, back_stencil;

   /* PIPE_CLEAR_* bitmask corresponding to this depth/stencil state */
   uint32_t load, store;
};

struct agx_blend {
   struct agx_blend_key key;

   /* PIPE_CLEAR_* bitmask corresponding to this blend state */
   uint32_t store;
};

struct asahi_vs_shader_key {
   /* If true, this is running as a hardware vertex shader. If false, this is a
    * compute job used to feed a TCS or GS.
    */
   bool hw;
};

struct agx_vertex_elements {
   unsigned num_attribs;
   struct agx_velem_key key[PIPE_MAX_ATTRIBS];

   /* These parts do not affect the generated code so are not in the key */
   uint16_t src_offsets[PIPE_MAX_ATTRIBS];
   uint16_t buffers[PIPE_MAX_ATTRIBS];
};

struct asahi_fs_shader_key {
   enum pipe_format rt_formats[PIPE_MAX_COLOR_BUFS];
   uint8_t nr_samples;
   bool padding[7];
};
static_assert(sizeof(struct asahi_fs_shader_key) == 40, "no holes");

struct asahi_gs_shader_key {
   /* If true, this GS is run only for its side effects (including XFB) */
   bool rasterizer_discard;
   bool padding[7];
};
static_assert(sizeof(struct asahi_gs_shader_key) == 8, "no holes");

union asahi_shader_key {
   struct asahi_vs_shader_key vs;
   struct asahi_gs_shader_key gs;
   struct asahi_fs_shader_key fs;
};

enum agx_dirty {
   AGX_DIRTY_VERTEX = BITFIELD_BIT(0),
   AGX_DIRTY_VIEWPORT = BITFIELD_BIT(1),
   AGX_DIRTY_SCISSOR_ZBIAS = BITFIELD_BIT(2),
   AGX_DIRTY_ZS = BITFIELD_BIT(3),
   AGX_DIRTY_STENCIL_REF = BITFIELD_BIT(4),
   AGX_DIRTY_RS = BITFIELD_BIT(5),
   AGX_DIRTY_SPRITE_COORD_MODE = BITFIELD_BIT(6),
   AGX_DIRTY_PRIM = BITFIELD_BIT(7),

   /* Vertex/fragment pipelines, including uniforms and textures */
   AGX_DIRTY_VS = BITFIELD_BIT(8),
   AGX_DIRTY_FS = BITFIELD_BIT(9),

   /* Just the progs themselves */
   AGX_DIRTY_VS_PROG = BITFIELD_BIT(10),
   AGX_DIRTY_FS_PROG = BITFIELD_BIT(11),

   AGX_DIRTY_BLEND = BITFIELD_BIT(12),
   AGX_DIRTY_QUERY = BITFIELD_BIT(13),
   AGX_DIRTY_XFB = BITFIELD_BIT(14),
   AGX_DIRTY_SAMPLE_MASK = BITFIELD_BIT(15),
   AGX_DIRTY_BLEND_COLOR = BITFIELD_BIT(16),
   AGX_DIRTY_POLY_STIPPLE = BITFIELD_BIT(17),
};

/* Maximum number of in-progress + under-construction GPU batches.
 * Must be large enough for silly workloads that do things like
 * glGenerateMipmap on every frame, otherwise we end up losing performance.
 */
#define AGX_MAX_BATCHES (128)

static_assert(PIPE_TEX_FILTER_NEAREST < 2, "known order");
static_assert(PIPE_TEX_FILTER_LINEAR < 2, "known order");

enum asahi_blit_clamp {
   ASAHI_BLIT_CLAMP_NONE,
   ASAHI_BLIT_CLAMP_UINT_TO_SINT,
   ASAHI_BLIT_CLAMP_SINT_TO_UINT,

   /* keep last */
   ASAHI_BLIT_CLAMP_COUNT,
};

struct asahi_blit_key {
   enum pipe_format src_format, dst_format;
   bool array;
   bool aligned;
};

DERIVE_HASH_TABLE(asahi_blit_key);

struct asahi_blitter {
   bool active;
   struct hash_table *blit_cs;

   /* [filter] */
   void *sampler[2];

   struct pipe_constant_buffer saved_cb;

   bool has_saved_image;
   struct pipe_image_view saved_image;

   unsigned saved_num_sampler_states;
   void *saved_sampler_states[PIPE_MAX_SAMPLERS];

   struct pipe_sampler_view *saved_sampler_view;

   void *saved_cs;
};

struct agx_oq_heap;

struct agx_context {
   struct pipe_context base;
   struct agx_compiled_shader *vs, *fs, *gs, *tcs, *tes;
   struct {
      struct agx_linked_shader *vs, *tcs, *tes, *gs, *fs;
   } linked;
   uint32_t dirty;

   /* Heap for dynamic memory allocation for geometry/tessellation shaders */
   struct pipe_resource *heap;

   /* Occlusion query heap */
   struct agx_oq_heap *oq;

   /* Acts as a context-level shader key */
   bool support_lod_bias;
   bool robust;

   /* Set of batches. When full, the LRU entry (the batch with the smallest
    * seqnum) is flushed to free a slot.
    */
   struct {
      uint64_t seqnum;
      struct agx_batch slots[AGX_MAX_BATCHES];

      /** Set of active batches for faster traversal */
      BITSET_DECLARE(active, AGX_MAX_BATCHES);

      /** Set of submitted batches for faster traversal */
      BITSET_DECLARE(submitted, AGX_MAX_BATCHES);

      /* Monotonic counter for each batch incremented when resetting a batch to
       * invalidate all associated queries. Compared to
       * agx_query::writer_generation.
       */
      uint64_t generation[AGX_MAX_BATCHES];
   } batches;

   /* Queue handle */
   uint32_t queue_id;

   struct agx_batch *batch;
   struct agx_bo *result_buf;

   struct pipe_vertex_buffer vertex_buffers[PIPE_MAX_ATTRIBS];
   uint32_t vb_mask;

   unsigned patch_vertices;
   float default_outer_level[4];
   float default_inner_level[2];

   struct agx_stage stage[PIPE_SHADER_TYPES];
   struct agx_vertex_elements *attributes;
   struct agx_rasterizer *rast;
   struct agx_zsa *zs;
   struct agx_blend *blend;
   struct pipe_blend_color blend_color;
   struct pipe_viewport_state viewport[AGX_MAX_VIEWPORTS];
   struct pipe_scissor_state scissor[AGX_MAX_VIEWPORTS];
   struct pipe_stencil_ref stencil_ref;
   struct agx_streamout streamout;
   uint16_t sample_mask;
   struct pipe_framebuffer_state framebuffer;

   uint32_t poly_stipple[32];

   struct pipe_query *cond_query;
   bool cond_cond;
   enum pipe_render_cond_flag cond_mode;

   struct agx_query *occlusion_query;
   struct agx_query *prims_generated[4];
   struct agx_query *tf_prims_generated[4];
   struct agx_query *tf_overflow[4];
   struct agx_query *tf_any_overflow;
   struct agx_query *pipeline_statistics[PIPE_STAT_QUERY_TS_INVOCATIONS];
   struct agx_query *time_elapsed;
   bool active_queries;
   bool active_draw_without_restart;

   struct util_debug_callback debug;
   bool is_noop;

   bool in_tess;
   bool in_generated_vdm;

   struct blitter_context *blitter;
   struct asahi_blitter compute_blitter;

   /* Map of GEM handle to (batch index + 1) that (conservatively) writes that
    * BO, or 0 if no writer.
    */
   struct util_dynarray writer;

   /* Bound CL global buffers */
   struct util_dynarray global_buffers;

   struct hash_table *generic_meta;
   struct agx_bg_eot_cache bg_eot;

   bool any_faults;

   uint32_t syncobj;
   uint32_t dummy_syncobj;
   int in_sync_fd;
   uint32_t in_sync_obj;
   uint64_t flush_last_seqid;
   uint64_t flush_my_seqid;
   uint64_t flush_other_seqid;

   struct agx_scratch scratch_vs;
   struct agx_scratch scratch_fs;
   struct agx_scratch scratch_cs;
};

static inline unsigned
agx_batch_idx(struct agx_batch *batch)
{
   return batch - batch->ctx->batches.slots;
}

static void
agx_writer_add(struct agx_context *ctx, uint8_t batch_index, unsigned handle)
{
   assert(batch_index < AGX_MAX_BATCHES && "invariant");
   static_assert(AGX_MAX_BATCHES < 0xFF, "no overflow on addition");

   /* If we need to grow, double the capacity so insertion is amortized O(1). */
   if (unlikely(handle >= ctx->writer.size)) {
      unsigned new_size =
         MAX2(ctx->writer.capacity * 2, util_next_power_of_two(handle + 1));
      unsigned grow = new_size - ctx->writer.size;

      memset(util_dynarray_grow(&ctx->writer, uint8_t, grow), 0,
             grow * sizeof(uint8_t));
   }

   /* There is now room */
   uint8_t *value = util_dynarray_element(&ctx->writer, uint8_t, handle);
   assert((*value) == 0 && "there should be no existing writer");
   *value = batch_index + 1;
}

static struct agx_batch *
agx_writer_get(struct agx_context *ctx, unsigned handle)
{
   if (handle >= ctx->writer.size)
      return NULL;

   uint8_t value = *util_dynarray_element(&ctx->writer, uint8_t, handle);

   if (value > 0)
      return &ctx->batches.slots[value - 1];
   else
      return NULL;
}

static void
agx_writer_remove(struct agx_context *ctx, unsigned handle)
{
   if (handle >= ctx->writer.size)
      return;

   uint8_t *value = util_dynarray_element(&ctx->writer, uint8_t, handle);
   *value = 0;
}

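/* Illustrative sketch (not part of this header): how the writer map is meant
 * to be used when a batch is about to write a BO. The real logic lives in the
 * batch tracking code (e.g. behind agx_batch_writes); this only shows the
 * intended interplay of the helpers above and is hypothetical in its details.
 *
 *    struct agx_batch *writer = agx_writer_get(ctx, bo->handle);
 *
 *    if (writer && writer != batch) {
 *       agx_flush_batch_for_reason(ctx, writer, "Write-after-write hazard");
 *       writer = NULL;
 *    }
 *
 *    if (!writer)
 *       agx_writer_add(ctx, agx_batch_idx(batch), bo->handle);
 */
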
static inline struct agx_context *
agx_context(struct pipe_context *pctx)
{
   return (struct agx_context *)pctx;
}

struct agx_linked_shader;

typedef void (*meta_shader_builder_t)(struct nir_builder *b, const void *key);

void agx_init_meta_shaders(struct agx_context *ctx);

void agx_destroy_meta_shaders(struct agx_context *ctx);

struct agx_compiled_shader *agx_build_meta_shader(struct agx_context *ctx,
                                                  meta_shader_builder_t builder,
                                                  void *data, size_t data_size);

struct agx_grid {
   /* Tag for the union */
   enum agx_cdm_mode mode;

   /* If mode != INDIRECT_LOCAL, the local size */
   uint32_t local[3];

   union {
      /* If mode == DIRECT, the global size. This is *not* multiplied by the
       * local size, differing from the API definition but matching AGX.
       */
      uint32_t global[3];

      /* Address of the indirect buffer if mode != DIRECT */
      uint64_t indirect;
   };
};

static inline const struct agx_grid
agx_grid_direct(uint32_t global_x, uint32_t global_y, uint32_t global_z,
                uint32_t local_x, uint32_t local_y, uint32_t local_z)
{
   return (struct agx_grid){
      .mode = AGX_CDM_MODE_DIRECT,
      .global = {global_x, global_y, global_z},
      .local = {local_x, local_y, local_z},
   };
}

static inline const struct agx_grid
agx_grid_indirect(uint64_t indirect, uint32_t local_x, uint32_t local_y,
                  uint32_t local_z)
{
   return (struct agx_grid){
      .mode = AGX_CDM_MODE_INDIRECT_GLOBAL,
      .local = {local_x, local_y, local_z},
      .indirect = indirect,
   };
}

static inline const struct agx_grid
agx_grid_indirect_local(uint64_t indirect)
{
   return (struct agx_grid){
      .mode = AGX_CDM_MODE_INDIRECT_LOCAL,
      .indirect = indirect,
   };
}

void agx_launch_with_data(struct agx_batch *batch, const struct agx_grid *grid,
                          meta_shader_builder_t builder, void *key,
                          size_t key_size, void *data, size_t data_size);

void agx_launch_internal(struct agx_batch *batch, const struct agx_grid *grid,
                         struct agx_compiled_shader *cs,
                         enum pipe_shader_type stage, uint32_t usc);

void agx_launch(struct agx_batch *batch, const struct agx_grid *grid,
                struct agx_compiled_shader *cs,
                struct agx_linked_shader *linked, enum pipe_shader_type stage,
                unsigned variable_shared_mem);
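
/* Illustrative sketch (not part of this header): launching an internal compute
 * kernel with a direct grid, sized per the convention documented on struct
 * agx_grid above. The batch and compiled shader are hypothetical placeholders,
 * and passing NULL for the linked shader is assumed to be acceptable here.
 *
 *    struct agx_grid grid = agx_grid_direct(1024, 1, 1, 64, 1, 1);
 *    agx_launch(batch, &grid, cs, NULL, PIPE_SHADER_COMPUTE, 0);
 */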

void agx_init_query_functions(struct pipe_context *ctx);

void
agx_primitives_update_direct(struct agx_context *ctx,
                             const struct pipe_draw_info *info,
                             const struct pipe_draw_start_count_bias *draw);

void agx_draw_vbo_from_xfb(struct pipe_context *pctx,
                           const struct pipe_draw_info *info,
                           unsigned drawid_offset,
                           const struct pipe_draw_indirect_info *indirect);

uint64_t agx_batch_get_so_address(struct agx_batch *batch, unsigned buffer,
                                  uint32_t *size);

void agx_init_streamout_functions(struct pipe_context *ctx);

static inline void
agx_dirty_all(struct agx_context *ctx)
{
   ctx->dirty = ~0;

   for (unsigned i = 0; i < ARRAY_SIZE(ctx->stage); ++i)
      ctx->stage[i].dirty = ~0;
}

static inline void
agx_dirty_reset_graphics(struct agx_context *ctx)
{
   ctx->dirty = 0;

   for (unsigned i = 0; i < ARRAY_SIZE(ctx->stage); ++i) {
      if (i != PIPE_SHADER_COMPUTE)
         ctx->stage[i].dirty = 0;
   }
}

struct agx_rasterizer {
   struct pipe_rasterizer_state base;
   uint8_t cull[AGX_CULL_LENGTH];
   uint8_t line_width;
   uint8_t polygon_mode;
   bool depth_bias;
};

struct agx_query {
   unsigned type;
   unsigned index;

   uint64_t writer_generation[AGX_MAX_BATCHES];
   struct agx_bo *bo;
   struct agx_ptr ptr;
};

struct agx_sampler_state {
   struct pipe_sampler_state base;

   /* Prepared descriptor */
   struct agx_sampler_packed desc, desc_without_custom_border;

   /* Whether a custom border colour is required */
   bool uses_custom_border;

   /* Packed custom border colour, or zero if none is required */
   struct agx_border_packed border;

   /* LOD bias packed as fp16, the form we'll pass to the shader */
   uint16_t lod_bias_as_fp16;
};

struct agx_sampler_view {
   struct pipe_sampler_view base;

   /* Resource/format, may differ from base in case of separate stencil */
   struct agx_resource *rsrc;
   enum pipe_format format;

   /* Prepared descriptor */
   struct agx_texture_packed desc;
};

struct agx_screen {
   struct pipe_screen pscreen;
   struct agx_device dev;
   struct disk_cache *disk_cache;

   /* Shared timeline syncobj and value to serialize flushes across contexts */
   uint32_t flush_syncobj;
   uint64_t flush_cur_seqid;
   uint64_t flush_wait_seqid;
   /* Lock to protect flush_wait_seqid updates (reads are just atomic) */
   simple_mtx_t flush_seqid_lock;

   /* Lock to protect syncobj usage vs. destruction in context destroy */
   struct u_rwlock destroy_lock;
};

static inline struct agx_screen *
agx_screen(struct pipe_screen *p)
{
   return (struct agx_screen *)p;
}

static inline struct agx_device *
agx_device(struct pipe_screen *p)
{
   return &(agx_screen(p)->dev);
}

#define perf_debug(dev, ...)                                                   \
   do {                                                                        \
      if (unlikely((dev)->debug & AGX_DBG_PERF))                               \
         mesa_logw(__VA_ARGS__);                                               \
   } while (0)

#define perf_debug_ctx(ctx, ...)                                               \
   perf_debug(agx_device((ctx)->base.screen), __VA_ARGS__)
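
/* Illustrative usage (not part of this header): emit a performance warning
 * only when the AGX_DBG_PERF debug flag is set, e.g. from a flush path. The
 * call site below is hypothetical.
 *
 *    perf_debug_ctx(ctx, "Flushing due to: %s", reason);
 */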

struct agx_resource {
   struct pipe_resource base;
   uint64_t modifier;

   /* Should probably be part of the modifier. Affects the tiling algorithm, or
    * something like that.
    */
   bool mipmapped;

   /* Hardware backing */
   struct agx_bo *bo;

   struct renderonly_scanout *scanout;

   BITSET_DECLARE(data_valid, PIPE_MAX_TEXTURE_LEVELS);

   struct ail_layout layout;

   /* Metal does not support packed depth/stencil formats; presumably AGX does
    * not either. Instead, we create separate depth and stencil resources,
    * managed by u_transfer_helper. We provide the illusion of packed
    * resources.
    */
   struct agx_resource *separate_stencil;

   /* Valid buffer range tracking, to optimize buffer appends */
   struct util_range valid_buffer_range;

   /* Cumulative shadowed byte count for this resource, i.e. the number of
    * times the resource has been shadowed multiplied by the resource size.
    */
   size_t shadowed_bytes;
};

static inline struct agx_resource *
agx_resource(struct pipe_resource *pctx)
{
   return (struct agx_resource *)pctx;
}

static inline bool
agx_resource_valid(struct agx_resource *rsrc, int level)
{
   /* Shared BOs can always be potentially valid */
   if (rsrc->bo && rsrc->bo->flags & AGX_BO_SHARED) {
      assert(level == 0);
      return true;
   }

   return BITSET_TEST(rsrc->data_valid, level);
}

static inline void *
agx_map_texture_cpu(struct agx_resource *rsrc, unsigned level, unsigned z)
{
   return ((uint8_t *)rsrc->bo->map) +
          ail_get_layer_level_B(&rsrc->layout, z, level);
}

static inline uint64_t
agx_map_texture_gpu(struct agx_resource *rsrc, unsigned z)
{
   return rsrc->bo->va->addr +
          (uint64_t)ail_get_layer_offset_B(&rsrc->layout, z);
}

void agx_decompress(struct agx_context *ctx, struct agx_resource *rsrc,
                    const char *reason);

void agx_legalize_compression(struct agx_context *ctx,
                              struct agx_resource *rsrc,
                              enum pipe_format format);

struct agx_transfer {
   struct pipe_transfer base;
   void *map;
   struct {
      struct pipe_resource *rsrc;
      struct pipe_box box;
   } staging;
};

static inline struct agx_transfer *
agx_transfer(struct pipe_transfer *p)
{
   return (struct agx_transfer *)p;
}

void agx_upload_vbos(struct agx_batch *batch);
void agx_upload_uniforms(struct agx_batch *batch);

void agx_set_sampler_uniforms(struct agx_batch *batch,
                              enum pipe_shader_type stage);

void agx_set_cbuf_uniforms(struct agx_batch *batch,
                           enum pipe_shader_type stage);

void agx_set_ssbo_uniforms(struct agx_batch *batch,
                           enum pipe_shader_type stage);

bool agx_nir_lower_point_size(nir_shader *nir, bool insert_write);

bool agx_nir_lower_sysvals(nir_shader *shader, enum pipe_shader_type desc_stage,
                           bool lower_draw_params);

bool agx_nir_layout_uniforms(nir_shader *shader,
                             struct agx_compiled_shader *compiled,
                             unsigned *push_size);

bool agx_nir_lower_bindings(nir_shader *shader, bool *uses_bindless_samplers);

bool agx_batch_is_active(struct agx_batch *batch);
bool agx_batch_is_submitted(struct agx_batch *batch);

/* Add a BO to a batch. This needs to be amortized O(1) since it's called in
 * hot paths. To achieve this we model BO lists by bit sets */

static bool
agx_batch_uses_bo(struct agx_batch *batch, struct agx_bo *bo)
{
   if (bo->handle < batch->bo_list.bit_count)
      return BITSET_TEST(batch->bo_list.set, bo->handle);
   else
      return false;
}

static inline void
agx_batch_add_bo(struct agx_batch *batch, struct agx_bo *bo)
{
   /* Double the size of the BO list if we run out, this is amortized O(1) */
   if (unlikely(bo->handle >= batch->bo_list.bit_count)) {
      const unsigned bits_per_word = sizeof(BITSET_WORD) * 8;

      unsigned bit_count =
         MAX2(batch->bo_list.bit_count * 2,
              util_next_power_of_two(ALIGN_POT(bo->handle + 1, bits_per_word)));

      batch->bo_list.set = rerzalloc(
         batch->ctx, batch->bo_list.set, BITSET_WORD,
         batch->bo_list.bit_count / bits_per_word, bit_count / bits_per_word);
      batch->bo_list.bit_count = bit_count;
   }

   if (BITSET_TEST(batch->bo_list.set, bo->handle))
      return;

   /* The batch holds a single reference to each BO in the batch, released when
    * the batch finishes execution.
    */
   agx_bo_reference(bo);
   BITSET_SET(batch->bo_list.set, bo->handle);
}

#define AGX_BATCH_FOREACH_BO_HANDLE(batch, handle)                             \
   BITSET_FOREACH_SET(handle, (batch)->bo_list.set, batch->bo_list.bit_count)

struct drm_asahi_cmd_compute;
struct drm_asahi_cmd_render;

void agx_batch_submit(struct agx_context *ctx, struct agx_batch *batch,
                      struct drm_asahi_cmd_compute *compute,
                      struct drm_asahi_cmd_render *render);

void agx_flush_batch(struct agx_context *ctx, struct agx_batch *batch);
void agx_flush_batch_for_reason(struct agx_context *ctx,
                                struct agx_batch *batch, const char *reason);
void agx_flush_all(struct agx_context *ctx, const char *reason);
void agx_flush_readers(struct agx_context *ctx, struct agx_resource *rsrc,
                       const char *reason);
void agx_flush_writer(struct agx_context *ctx, struct agx_resource *rsrc,
                      const char *reason);

void agx_sync_writer(struct agx_context *ctx, struct agx_resource *rsrc,
                     const char *reason);
void agx_sync_readers(struct agx_context *ctx, struct agx_resource *rsrc,
                      const char *reason);
void agx_sync_batch(struct agx_context *ctx, struct agx_batch *batch);
void agx_sync_all(struct agx_context *ctx, const char *reason);
void agx_sync_batch_for_reason(struct agx_context *ctx, struct agx_batch *batch,
                               const char *reason);
void agx_memory_barrier(struct pipe_context *pctx, unsigned flags);

/* Use these instead of batch_add_bo for proper resource tracking */
void agx_batch_reads(struct agx_batch *batch, struct agx_resource *rsrc);
void agx_batch_writes(struct agx_batch *batch, struct agx_resource *rsrc,
                      unsigned level);
void agx_batch_writes_range(struct agx_batch *batch, struct agx_resource *rsrc,
                            unsigned offset, unsigned size);
void agx_batch_track_image(struct agx_batch *batch,
                           struct pipe_image_view *image);

bool agx_any_batch_uses_resource(struct agx_context *ctx,
                                 struct agx_resource *rsrc);

/* 16384 is the maximum framebuffer dimension, so we use a larger width (the
 * maximum uint16_t) as a sentinel to identify the compute batch. This ensures
 * compute batches don't mix with graphics. This is a bit of a hack but it
 * works.
 */
#define AGX_COMPUTE_BATCH_WIDTH 0xFFFF

static inline bool
agx_batch_is_compute(struct agx_batch *batch)
{
   return batch->key.width == AGX_COMPUTE_BATCH_WIDTH;
}

struct agx_batch *agx_get_batch(struct agx_context *ctx);
struct agx_batch *agx_get_compute_batch(struct agx_context *ctx);
void agx_batch_reset(struct agx_context *ctx, struct agx_batch *batch);
int agx_cleanup_batches(struct agx_context *ctx);

void agx_batch_add_timestamp_query(struct agx_batch *batch,
                                   struct agx_query *q);
void agx_add_timestamp_end_query(struct agx_context *ctx, struct agx_query *q);

void agx_query_increment_cpu(struct agx_context *ctx, struct agx_query *query,
                             uint64_t increment);

/* Blit shaders */
void agx_blitter_save(struct agx_context *ctx, struct blitter_context *blitter,
                      bool render_cond);

void agx_blit(struct pipe_context *pipe, const struct pipe_blit_info *info);

void agx_resource_copy_region(struct pipe_context *pctx,
                              struct pipe_resource *dst, unsigned dst_level,
                              unsigned dstx, unsigned dsty, unsigned dstz,
                              struct pipe_resource *src, unsigned src_level,
                              const struct pipe_box *src_box);

/* Batch logic */

struct agx_encoder agx_encoder_allocate(struct agx_batch *batch,
                                        struct agx_device *dev);

void agx_batch_init_state(struct agx_batch *batch);

struct asahi_bg_eot {
   uint64_t usc;
   struct agx_counts_packed counts;
};

struct asahi_bg_eot agx_build_bg_eot(struct agx_batch *batch, bool store,
                                     bool partial_render);

/* Query management */
uint16_t agx_get_oq_index(struct agx_batch *batch, struct agx_query *query);
uint64_t agx_get_query_address(struct agx_batch *batch,
                               struct agx_query *query);
uint64_t agx_get_occlusion_heap(struct agx_batch *batch);

void agx_finish_batch_queries(struct agx_batch *batch, uint64_t begin_ts,
                              uint64_t end_ts);

bool agx_render_condition_check_inner(struct agx_context *ctx);

static inline bool
agx_render_condition_check(struct agx_context *ctx)
{
   if (likely(!ctx->cond_query))
      return true;
   else
      return agx_render_condition_check_inner(ctx);
}

static inline uint32_t
agx_texture_buffer_size_el(enum pipe_format format, uint32_t size)
{
   unsigned blocksize = util_format_get_blocksize(format);

   return MIN2(AGX_TEXTURE_BUFFER_MAX_SIZE, size / blocksize);
}
1246