/*
 * Copyright 2021 Alyssa Rosenzweig
 * Copyright 2019-2021 Collabora, Ltd.
 * SPDX-License-Identifier: MIT
 */

#pragma once

#include
#include "asahi/compiler/agx_compile.h"
#include "asahi/genxml/agx_pack.h"
#include "asahi/layout/layout.h"
#include "asahi/lib/agx_bo.h"
#include "asahi/lib/agx_device.h"
#include "asahi/lib/agx_linker.h"
#include "asahi/lib/agx_nir_lower_vbo.h"
#include "asahi/lib/agx_scratch.h"
#include "asahi/lib/agx_tilebuffer.h"
#include "asahi/lib/agx_uvs.h"
#include "asahi/lib/pool.h"
#include "asahi/lib/shaders/geometry.h"
#include "asahi/lib/unstable_asahi_drm.h"
#include "compiler/nir/nir_lower_blend.h"
#include "compiler/shader_enums.h"
#include "gallium/auxiliary/util/u_blitter.h"
#include "gallium/include/pipe/p_context.h"
#include "gallium/include/pipe/p_screen.h"
#include "gallium/include/pipe/p_state.h"
#include "pipe/p_defines.h"
#include "util/bitset.h"
#include "util/disk_cache.h"
#include "util/hash_table.h"
#include "util/rwlock.h"
#include "util/u_range.h"
#include "agx_bg_eot.h"
#include "agx_helpers.h"
#include "agx_nir_passes.h"

#ifdef __GLIBC__
#include <errno.h>
#define agx_msg(fmt, ...)                                                      \
   fprintf(stderr, "[%s] " fmt, program_invocation_short_name, ##__VA_ARGS__)
#else
#define agx_msg(...) fprintf(stderr, __VA_ARGS__)
#endif

#define AGX_NUM_TEXTURE_STATE_REGS 16

struct agx_streamout_target {
   struct pipe_stream_output_target base;
   struct pipe_resource *offset;

   /* Current stride (bytes per vertex) */
   uint32_t stride;
};

static inline struct agx_streamout_target *
agx_so_target(struct pipe_stream_output_target *target)
{
   return (struct agx_streamout_target *)target;
}

struct agx_streamout {
   struct pipe_stream_output_target *targets[PIPE_MAX_SO_BUFFERS];
   unsigned num_targets;
};

/* Shaders can access fixed-function state through system values.
 * It is convenient to stash all of this information into a single "root"
 * descriptor, then push individual parts as needed.
 *
 * In the future, we could optimize this to reduce CPU overhead, e.g. splitting
 * into multiple descriptors for finer dirty tracking. This is not ABI with the
 * compiler. The layout is up to us and handled by our code lowering system
 * values to uniforms.
 */
enum agx_sysval_table {
   AGX_SYSVAL_TABLE_ROOT,
   AGX_SYSVAL_TABLE_PARAMS,
   AGX_SYSVAL_TABLE_GRID,
   AGX_SYSVAL_TABLE_VS,
   AGX_SYSVAL_TABLE_TCS,
   AGX_SYSVAL_TABLE_TES,
   AGX_SYSVAL_TABLE_GS,
   AGX_SYSVAL_TABLE_FS,
   AGX_SYSVAL_TABLE_CS,
   AGX_NUM_SYSVAL_TABLES
};

#define AGX_SYSVAL_STAGE(stage) (AGX_SYSVAL_TABLE_VS + (stage))

static_assert(AGX_SYSVAL_STAGE(PIPE_SHADER_VERTEX) == AGX_SYSVAL_TABLE_VS,
              "fixed enum orderings");
static_assert(AGX_SYSVAL_STAGE(PIPE_SHADER_TESS_CTRL) == AGX_SYSVAL_TABLE_TCS,
              "fixed enum orderings");
static_assert(AGX_SYSVAL_STAGE(PIPE_SHADER_TESS_EVAL) == AGX_SYSVAL_TABLE_TES,
              "fixed enum orderings");
static_assert(AGX_SYSVAL_STAGE(PIPE_SHADER_GEOMETRY) == AGX_SYSVAL_TABLE_GS,
              "fixed enum orderings");
static_assert(AGX_SYSVAL_STAGE(PIPE_SHADER_FRAGMENT) == AGX_SYSVAL_TABLE_FS,
              "fixed enum orderings");
static_assert(AGX_SYSVAL_STAGE(PIPE_SHADER_COMPUTE) == AGX_SYSVAL_TABLE_CS,
              "fixed enum orderings");

/* Root system value table */
struct PACKED agx_draw_uniforms {
   /* Pointers to the system value tables themselves (for indirection) */
   uint64_t tables[AGX_NUM_SYSVAL_TABLES];

   /* Vertex buffer object bases, if present. If vertex robustness is disabled,
    * attrib_base maps VBOs directly and attrib_clamp is undefined. If
    * vertex robustness is enabled, attrib_base maps attributes and
    * attrib_clamp is an inclusive clamp on vertex/divided instance indices.
    */
   uint64_t attrib_base[PIPE_MAX_ATTRIBS];
   uint32_t attrib_clamp[PIPE_MAX_ATTRIBS];

   /* Addresses for the results of pipeline statistics queries */
   uint64_t pipeline_statistics[PIPE_STAT_QUERY_MS_INVOCATIONS];

   /* Pointer to base address of the VS->TCS, VS->GS, or TES->GS buffer.
    * Indirected so it can be written to in an indirect setup kernel. G13
    * appears to prefetch uniforms across dispatches, but does not pre-run
    * preambles, so this indirection saves us from splitting the batch.
    */
   uint64_t vertex_output_buffer_ptr;

   /* Mask of outputs flowing VS->TCS, VS->GS, or TES->GS. */
   uint64_t vertex_outputs;

   /* Address of input assembly buffer if geom/tess is used, else 0 */
   uint64_t input_assembly;

   /* Address of tessellation param buffer if tessellation is used, else 0 */
   uint64_t tess_params;

   /* Address of geometry param buffer if geometry shaders are used, else 0 */
   uint64_t geometry_params;

   /* Address of polygon stipple mask if used */
   uint64_t polygon_stipple;

   /* Blend constant if any */
   float blend_constant[4];

   /* glPointSize value */
   float fixed_point_size;

   /* Value of the multisample control register, containing sample positions in
    * each byte (x in low nibble, y in high nibble).
    */
   uint32_t ppp_multisamplectl;

   /* gl_DrawID for a direct multidraw */
   uint32_t draw_id;

   /* Sprite coord replacement mask */
   uint16_t sprite_mask;

   /* glSampleMask */
   uint16_t sample_mask;

   /* Nonzero for indexed draws, zero otherwise */
   uint16_t is_indexed_draw;

   /* Zero for [0, 1] clipping, 0.5 for [-1, 1] clipping. */
   uint16_t clip_z_coeff;

   /* ~0/0 boolean whether the epilog lacks any discard instruction */
   uint16_t no_epilog_discard;

   /* Provoking vertex: 0, 1, 2 */
   uint16_t provoking_vertex;

   /* Mapping from varying slots written by the last vertex stage to UVS
    * indices. This mapping must be compatible with the fragment shader.
    */
   uint16_t uvs_index[VARYING_SLOT_MAX];
};

struct PACKED agx_stage_uniforms {
   /* Pointer to binding table for texture descriptor, or 0 if none. This must
    * be first so that u0_u1 is always available for lowering binding
    * tables to bindless access.
    */
   uint64_t texture_base;

   /* Uniform buffer objects */
   uint64_t ubo_base[PIPE_MAX_CONSTANT_BUFFERS];
   uint32_t ubo_size[PIPE_MAX_CONSTANT_BUFFERS];

   /* Shader storage buffer objects */
   uint64_t ssbo_base[PIPE_MAX_SHADER_BUFFERS];
   uint32_t ssbo_size[PIPE_MAX_SHADER_BUFFERS];

   /* If lowered to bindless, sampler index in the heap */
   uint16_t sampler_handle[PIPE_MAX_SAMPLERS];

   /* LOD bias as float16 */
   uint16_t lod_bias[PIPE_MAX_SAMPLERS];
};

/* In the architecture, there are 512 uniform registers, each 16 bits. In a
 * theoretical worst case, we could push to all of them. We use a worst-case
 * maximum because the expression for a tight upper bound is too messy and easy
 * to go out of sync with the code.
 */
#define AGX_MAX_PUSH_RANGES (512)

struct agx_push_range {
   /* Base 16-bit uniform to push to */
   uint16_t uniform;

   /* Offset into the table to push in bytes */
   uint16_t offset;

   /* Which table to push from */
   uint8_t table;

   /* Number of consecutive 16-bit uniforms to push */
   uint8_t length;
};
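/* Illustrative sketch, not part of the driver: a push range tells the driver
 * to source `length` consecutive 16-bit uniforms, starting at register
 * `uniform`, from byte `offset` of system value table `table`. Because the
 * root descriptor's tables[] array holds the uploaded address of each table,
 * resolving a range's source is a simple base-plus-offset computation. The
 * helper name below is hypothetical.
 */
static inline uint64_t
agx_example_push_range_source(const struct agx_draw_uniforms *root,
                              const struct agx_push_range *range)
{
   /* tables[] provides the indirection described for the root descriptor */
   return root->tables[range->table] + range->offset;
}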
struct agx_compiled_shader {
   /* Base struct */
   struct agx_shader_part b;

   /* Uncompiled shader that we belong to */
   const struct agx_uncompiled_shader *so;

   /* Mapped executable memory */
   struct agx_bo *bo;

   /* Uniforms the driver must push */
   unsigned push_range_count;
   struct agx_push_range push[AGX_MAX_PUSH_RANGES];

   /* UVS layout for the last vertex stage */
   struct agx_unlinked_uvs_layout uvs;

   /* For a vertex shader, the mask of vertex attributes read. Used to key the
    * prolog so the prolog doesn't write components not actually read.
    */
   BITSET_DECLARE(attrib_components_read, AGX_MAX_ATTRIBS * 4);

   struct agx_fs_epilog_link_info epilog_key;

   /* Auxiliary programs, or NULL if not used */
   struct agx_compiled_shader *gs_count, *pre_gs;
   struct agx_compiled_shader *gs_copy;

   /* Output primitive mode for geometry shaders */
   enum mesa_prim gs_output_mode;

   /* Number of words per primitive in the count buffer */
   unsigned gs_count_words;

   /* Logical shader stage used for descriptor access. This may differ from the
    * physical shader stage of the compiled shader, for example when executing
    * a tessellation eval shader as a vertex shader.
    */
   enum pipe_shader_type stage;
};

struct agx_fast_link_key {
   union {
      struct agx_vs_prolog_key vs;
      struct agx_fs_prolog_key fs;
   } prolog;

   struct agx_compiled_shader *main;

   union {
      struct agx_fs_epilog_key fs;
   } epilog;

   unsigned nr_samples_shaded;
};

struct agx_uncompiled_shader {
   struct pipe_shader_state base;
   enum pipe_shader_type type;
   struct blob early_serialized_nir;
   struct blob serialized_nir;
   uint8_t nir_sha1[20];

   struct {
      uint64_t inputs_flat_shaded;
      uint64_t inputs_linear_shaded;
      uint8_t cull_distance_size;
      bool has_edgeflags;
      bool uses_fbfetch;

      /* Number of bindful textures, images used */
      unsigned nr_bindful_textures, nr_bindful_images;
   } info;

   struct hash_table *variants;
   struct agx_uncompiled_shader *passthrough_progs[MESA_PRIM_COUNT][3][2];
   struct agx_uncompiled_shader *passthrough_tcs[32];

   /* agx_fast_link_key -> agx_linked_shader */
   struct hash_table *linked_shaders;

   uint32_t xfb_strides[4];
   bool has_xfb_info;
   bool is_xfb_passthrough;
   enum mesa_prim gs_mode;

   /* Whether the shader accesses indexed samplers via the bindless heap */
   bool uses_bindless_samplers;

   /* Set on VS, passed to FS for linkage */
   unsigned base_varying;

   /* Tessellation info */
   struct {
      uint64_t per_vertex_outputs;
      uint32_t output_stride;
      enum gl_tess_spacing spacing;
      enum tess_primitive_mode primitive;
      uint8_t output_patch_size;
      uint8_t nr_patch_outputs;
      bool ccw;
      bool point_mode;
   } tess;
};

enum agx_stage_dirty {
   AGX_STAGE_DIRTY_CONST = BITFIELD_BIT(0),
   AGX_STAGE_DIRTY_SSBO = BITFIELD_BIT(1),
   AGX_STAGE_DIRTY_IMAGE = BITFIELD_BIT(2),
   AGX_STAGE_DIRTY_SAMPLER = BITFIELD_BIT(3),
};
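/* Illustrative sketch, not part of the driver: fast-linked variants are cached
 * on the agx_uncompiled_shader above, in its linked_shaders table keyed by the
 * full agx_fast_link_key. A lookup might look like the following; the helper
 * name is hypothetical and the value type follows the comment on
 * linked_shaders.
 */
static inline struct agx_linked_shader *
agx_example_find_linked(struct agx_uncompiled_shader *so,
                        const struct agx_fast_link_key *key)
{
   struct hash_entry *ent = _mesa_hash_table_search(so->linked_shaders, key);

   return ent ? ent->data : NULL;
}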
struct agx_stage {
   struct agx_uncompiled_shader *shader;
   uint32_t dirty;

   struct pipe_constant_buffer cb[PIPE_MAX_CONSTANT_BUFFERS];
   uint32_t cb_mask;

   struct pipe_shader_buffer ssbo[PIPE_MAX_SHADER_BUFFERS];
   uint32_t ssbo_writable_mask;
   uint32_t ssbo_mask;

   struct pipe_image_view images[PIPE_MAX_SHADER_IMAGES];
   uint32_t image_mask;

   /* Need full CSOs for u_blitter */
   struct agx_sampler_state *samplers[PIPE_MAX_SAMPLERS];
   struct agx_sampler_view *textures[PIPE_MAX_SHADER_SAMPLER_VIEWS];

   /* Does any bound sampler require custom border colours? */
   bool custom_borders;

   unsigned sampler_count, texture_count;
   uint32_t valid_samplers;
};

union agx_batch_result {
   struct drm_asahi_result_render render;
   struct drm_asahi_result_compute compute;
};

/* This is a firmware limit. It should be possible to raise to 2048 in the
 * future... still not good enough for VK though :-(
 */
#define AGX_SAMPLER_HEAP_SIZE (1024)

struct agx_sampler_heap {
   struct agx_bo *bo;
   uint16_t count;
};

uint16_t agx_sampler_heap_add(struct agx_device *dev,
                              struct agx_sampler_heap *heap,
                              struct agx_sampler_packed *sampler);

struct agx_encoder {
   struct agx_bo *bo;
   uint8_t *current;
   uint8_t *end;
};

struct agx_batch {
   struct agx_context *ctx;
   struct pipe_framebuffer_state key;
   uint64_t seqnum;
   uint32_t syncobj;
   uint32_t draws;

   struct agx_tilebuffer_layout tilebuffer_layout;

   /* PIPE_CLEAR_* bitmask */
   uint32_t clear, draw, load, resolve;
   bool initialized;

   uint64_t uploaded_clear_color[PIPE_MAX_COLOR_BUFS];
   double clear_depth;
   unsigned clear_stencil;

   /* Whether we're drawing points, lines, or triangles */
   enum mesa_prim reduced_prim;

   /* Whether the bound FS needs a primitive ID that is not supplied by the
    * bound hardware VS (software GS)
    */
   bool generate_primitive_id;

   /* Current varyings linkage structures */
   uint32_t varyings;
   struct agx_varyings_vs linked_varyings;

   struct agx_draw_uniforms uniforms;
   struct agx_stage_uniforms stage_uniforms[PIPE_SHADER_TYPES];

   /* Indirect buffer allocated for geometry shader */
   uint64_t geom_indirect;
   struct agx_bo *geom_indirect_bo;

   /* Geometry state buffer if geometry/etc shaders are used */
   uint64_t geometry_state;

   /* Uploaded descriptors */
   uint32_t texture_count[PIPE_SHADER_TYPES];

   uint64_t samplers[PIPE_SHADER_TYPES];
   uint32_t sampler_count[PIPE_SHADER_TYPES];

   struct agx_sampler_heap sampler_heap;

   /* Resource list requirements, represented as a bit set indexed by BO
    * handles (GEM handles on Linux, or IOGPU's equivalent on macOS)
    */
   struct {
      BITSET_WORD *set;
      unsigned bit_count;
   } bo_list;

   /* If true, this batch contains a shader with a potentially incoherent write
    * (e.g. image_write), needing a barrier later to access.
    */
   bool incoherent_writes;

   struct agx_pool pool, pipeline_pool;

   /* We may enqueue both CDM and VDM work, possibly to the same batch for
    * geometry/tessellation.
    */
   struct agx_encoder vdm;
   struct agx_encoder cdm;

   /* Scissor and depth-bias descriptors, uploaded at GPU time */
   struct util_dynarray scissor, depth_bias;

   /* Arrays of GPU pointers that should be written with the batch timestamps */
   struct util_dynarray timestamps;

   /* Result buffer where the kernel places command execution information */
   union agx_batch_result *result;
   size_t result_off;

   /* Actual pointer in a uniform */
   struct agx_bo *geom_params_bo;

   /* Whether each stage uses scratch */
   bool vs_scratch;
   bool fs_scratch;
   bool cs_scratch;

   /* Whether each stage has preambles using scratch, and if so which bucket.
    * This just needs to be zero/nonzero for correctness, the magnitude in
    * buckets is for statistics.
    */
   unsigned vs_preamble_scratch;
   unsigned fs_preamble_scratch;
   unsigned cs_preamble_scratch;
};

struct agx_zsa {
   struct pipe_depth_stencil_alpha_state base;
   struct agx_fragment_face_packed depth;
   struct agx_fragment_stencil_packed front_stencil, back_stencil;

   /* PIPE_CLEAR_* bitmask corresponding to this depth/stencil state */
   uint32_t load, store;
};

struct agx_blend {
   struct agx_blend_key key;

   /* PIPE_CLEAR_* bitmask corresponding to this blend state */
   uint32_t store;
};

struct asahi_vs_shader_key {
   /* If true, this is running as a hardware vertex shader. If false, this is a
    * compute job used to feed a TCS or GS.
    */
   bool hw;
};

struct agx_vertex_elements {
   unsigned num_attribs;
   struct agx_velem_key key[PIPE_MAX_ATTRIBS];

   /* These parts do not affect the generated code so are not in the key */
   uint16_t src_offsets[PIPE_MAX_ATTRIBS];
   uint16_t buffers[PIPE_MAX_ATTRIBS];
};

struct asahi_fs_shader_key {
   enum pipe_format rt_formats[PIPE_MAX_COLOR_BUFS];
   uint8_t nr_samples;
   bool padding[7];
};
static_assert(sizeof(struct asahi_fs_shader_key) == 40, "no holes");

struct asahi_gs_shader_key {
   /* If true, this GS is run only for its side effects (including XFB) */
   bool rasterizer_discard;
   bool padding[7];
};
static_assert(sizeof(struct asahi_gs_shader_key) == 8, "no holes");

union asahi_shader_key {
   struct asahi_vs_shader_key vs;
   struct asahi_gs_shader_key gs;
   struct asahi_fs_shader_key fs;
};

enum agx_dirty {
   AGX_DIRTY_VERTEX = BITFIELD_BIT(0),
   AGX_DIRTY_VIEWPORT = BITFIELD_BIT(1),
   AGX_DIRTY_SCISSOR_ZBIAS = BITFIELD_BIT(2),
   AGX_DIRTY_ZS = BITFIELD_BIT(3),
   AGX_DIRTY_STENCIL_REF = BITFIELD_BIT(4),
   AGX_DIRTY_RS = BITFIELD_BIT(5),
   AGX_DIRTY_SPRITE_COORD_MODE = BITFIELD_BIT(6),
   AGX_DIRTY_PRIM = BITFIELD_BIT(7),

   /* Vertex/fragment pipelines, including uniforms and textures */
   AGX_DIRTY_VS = BITFIELD_BIT(8),
   AGX_DIRTY_FS = BITFIELD_BIT(9),

   /* Just the progs themselves */
   AGX_DIRTY_VS_PROG = BITFIELD_BIT(10),
   AGX_DIRTY_FS_PROG = BITFIELD_BIT(11),

   AGX_DIRTY_BLEND = BITFIELD_BIT(12),
   AGX_DIRTY_QUERY = BITFIELD_BIT(13),
   AGX_DIRTY_XFB = BITFIELD_BIT(14),
   AGX_DIRTY_SAMPLE_MASK = BITFIELD_BIT(15),
   AGX_DIRTY_BLEND_COLOR = BITFIELD_BIT(16),
   AGX_DIRTY_POLY_STIPPLE = BITFIELD_BIT(17),
};

/* Maximum number of in-progress + under-construction GPU batches.
 * Must be large enough for silly workloads that do things like
 * glGenerateMipmap on every frame, otherwise we end up losing performance.
 */
#define AGX_MAX_BATCHES (128)

static_assert(PIPE_TEX_FILTER_NEAREST < 2, "known order");
static_assert(PIPE_TEX_FILTER_LINEAR < 2, "known order");

enum asahi_blit_clamp {
   ASAHI_BLIT_CLAMP_NONE,
   ASAHI_BLIT_CLAMP_UINT_TO_SINT,
   ASAHI_BLIT_CLAMP_SINT_TO_UINT,

   /* keep last */
   ASAHI_BLIT_CLAMP_COUNT,
};

struct asahi_blit_key {
   enum pipe_format src_format, dst_format;
   bool array;
   bool aligned;
};

DERIVE_HASH_TABLE(asahi_blit_key);

struct asahi_blitter {
   bool active;
   struct hash_table *blit_cs;

   /* [filter] */
   void *sampler[2];

   struct pipe_constant_buffer saved_cb;

   bool has_saved_image;
   struct pipe_image_view saved_image;

   unsigned saved_num_sampler_states;
   void *saved_sampler_states[PIPE_MAX_SAMPLERS];

   struct pipe_sampler_view *saved_sampler_view;

   void *saved_cs;
};

struct agx_oq_heap;

struct agx_context {
   struct pipe_context base;
   struct agx_compiled_shader *vs, *fs, *gs, *tcs, *tes;
   struct {
      struct agx_linked_shader *vs, *tcs, *tes, *gs, *fs;
   } linked;
   uint32_t dirty;

   /* Heap for dynamic memory allocation for geometry/tessellation shaders */
   struct pipe_resource *heap;

   /* Occlusion query heap */
   struct agx_oq_heap *oq;

   /* Acts as a context-level shader key */
   bool support_lod_bias;
   bool robust;

   /* Set of batches. When full, the LRU entry (the batch with the smallest
    * seqnum) is flushed to free a slot.
    */
   struct {
      uint64_t seqnum;
      struct agx_batch slots[AGX_MAX_BATCHES];

      /** Set of active batches for faster traversal */
      BITSET_DECLARE(active, AGX_MAX_BATCHES);

      /** Set of submitted batches for faster traversal */
      BITSET_DECLARE(submitted, AGX_MAX_BATCHES);

      /* Monotonic counter for each batch incremented when resetting a batch to
       * invalidate all associated queries. Compared to
       * agx_query::writer_generation.
       */
      uint64_t generation[AGX_MAX_BATCHES];
   } batches;

   /* Queue handle */
   uint32_t queue_id;

   struct agx_batch *batch;
   struct agx_bo *result_buf;

   struct pipe_vertex_buffer vertex_buffers[PIPE_MAX_ATTRIBS];
   uint32_t vb_mask;

   unsigned patch_vertices;
   float default_outer_level[4];
   float default_inner_level[2];

   struct agx_stage stage[PIPE_SHADER_TYPES];

   struct agx_vertex_elements *attributes;
   struct agx_rasterizer *rast;
   struct agx_zsa *zs;
   struct agx_blend *blend;
   struct pipe_blend_color blend_color;
   struct pipe_viewport_state viewport[AGX_MAX_VIEWPORTS];
   struct pipe_scissor_state scissor[AGX_MAX_VIEWPORTS];
   struct pipe_stencil_ref stencil_ref;
   struct agx_streamout streamout;
   uint16_t sample_mask;
   struct pipe_framebuffer_state framebuffer;

   uint32_t poly_stipple[32];

   struct pipe_query *cond_query;
   bool cond_cond;
   enum pipe_render_cond_flag cond_mode;

   struct agx_query *occlusion_query;
   struct agx_query *prims_generated[4];
   struct agx_query *tf_prims_generated[4];
   struct agx_query *tf_overflow[4];
   struct agx_query *tf_any_overflow;

   struct agx_query *pipeline_statistics[PIPE_STAT_QUERY_TS_INVOCATIONS];

   struct agx_query *time_elapsed;

   bool active_queries;
   bool active_draw_without_restart;

   struct util_debug_callback debug;
   bool is_noop;
   bool in_tess;
   bool in_generated_vdm;

   struct blitter_context *blitter;
   struct asahi_blitter compute_blitter;

   /* Map of GEM handle to (batch index + 1) that (conservatively) writes that
    * BO, or 0 if no writer.
    */
   struct util_dynarray writer;

   /* Bound CL global buffers */
   struct util_dynarray global_buffers;

   struct hash_table *generic_meta;
   struct agx_bg_eot_cache bg_eot;

   bool any_faults;

   uint32_t syncobj;
   uint32_t dummy_syncobj;
   int in_sync_fd;
   uint32_t in_sync_obj;
   uint64_t flush_last_seqid;
   uint64_t flush_my_seqid;
   uint64_t flush_other_seqid;

   struct agx_scratch scratch_vs;
   struct agx_scratch scratch_fs;
   struct agx_scratch scratch_cs;
};

static inline unsigned
agx_batch_idx(struct agx_batch *batch)
{
   return batch - batch->ctx->batches.slots;
}

static void
agx_writer_add(struct agx_context *ctx, uint8_t batch_index, unsigned handle)
{
   assert(batch_index < AGX_MAX_BATCHES && "invariant");
   static_assert(AGX_MAX_BATCHES < 0xFF, "no overflow on addition");

   /* If we need to grow, double the capacity so insertion is amortized O(1). */
   if (unlikely(handle >= ctx->writer.size)) {
      unsigned new_size =
         MAX2(ctx->writer.capacity * 2, util_next_power_of_two(handle + 1));
      unsigned grow = new_size - ctx->writer.size;

      memset(util_dynarray_grow(&ctx->writer, uint8_t, grow), 0,
             grow * sizeof(uint8_t));
   }

   /* There is now room */
   uint8_t *value = util_dynarray_element(&ctx->writer, uint8_t, handle);
   assert((*value) == 0 && "there should be no existing writer");
   *value = batch_index + 1;
}

static struct agx_batch *
agx_writer_get(struct agx_context *ctx, unsigned handle)
{
   if (handle >= ctx->writer.size)
      return NULL;

   uint8_t value = *util_dynarray_element(&ctx->writer, uint8_t, handle);

   if (value > 0)
      return &ctx->batches.slots[value - 1];
   else
      return NULL;
}

static void
agx_writer_remove(struct agx_context *ctx, unsigned handle)
{
   if (handle >= ctx->writer.size)
      return;

   uint8_t *value = util_dynarray_element(&ctx->writer, uint8_t, handle);
   *value = 0;
}

static inline struct agx_context *
agx_context(struct pipe_context *pctx)
{
   return (struct agx_context *)pctx;
}

struct agx_linked_shader;

typedef void (*meta_shader_builder_t)(struct nir_builder *b, const void *key);

void agx_init_meta_shaders(struct agx_context *ctx);

void agx_destroy_meta_shaders(struct agx_context *ctx);

struct agx_compiled_shader *agx_build_meta_shader(struct agx_context *ctx,
                                                  meta_shader_builder_t builder,
                                                  void *data, size_t data_size);

struct agx_grid {
   /* Tag for the union */
   enum agx_cdm_mode mode;

   /* If mode != INDIRECT_LOCAL, the local size */
   uint32_t local[3];

   union {
      /* If mode == DIRECT, the global size. This is *not* multiplied by the
       * local size, differing from the API definition but matching AGX.
       */
      uint32_t global[3];

      /* Address of the indirect buffer if mode != DIRECT */
      uint64_t indirect;
   };
};

static inline const struct agx_grid
agx_grid_direct(uint32_t global_x, uint32_t global_y, uint32_t global_z,
                uint32_t local_x, uint32_t local_y, uint32_t local_z)
{
   return (struct agx_grid){
      .mode = AGX_CDM_MODE_DIRECT,
      .global = {global_x, global_y, global_z},
      .local = {local_x, local_y, local_z},
   };
}

static inline const struct agx_grid
agx_grid_indirect(uint64_t indirect, uint32_t local_x, uint32_t local_y,
                  uint32_t local_z)
{
   return (struct agx_grid){
      .mode = AGX_CDM_MODE_INDIRECT_GLOBAL,
      .local = {local_x, local_y, local_z},
      .indirect = indirect,
   };
}

static inline const struct agx_grid
agx_grid_indirect_local(uint64_t indirect)
{
   return (struct agx_grid){
      .mode = AGX_CDM_MODE_INDIRECT_LOCAL,
      .indirect = indirect,
   };
}

void agx_launch_with_data(struct agx_batch *batch, const struct agx_grid *grid,
                          meta_shader_builder_t builder, void *key,
                          size_t key_size, void *data, size_t data_size);

void agx_launch_internal(struct agx_batch *batch, const struct agx_grid *grid,
                         struct agx_compiled_shader *cs,
                         enum pipe_shader_type stage, uint32_t usc);

void agx_launch(struct agx_batch *batch, const struct agx_grid *grid,
                struct agx_compiled_shader *cs,
                struct agx_linked_shader *linked, enum pipe_shader_type stage,
                unsigned variable_shared_mem);

void agx_init_query_functions(struct pipe_context *ctx);

void agx_primitives_update_direct(struct agx_context *ctx,
                                  const struct pipe_draw_info *info,
                                  const struct pipe_draw_start_count_bias *draw);

void agx_draw_vbo_from_xfb(struct pipe_context *pctx,
                           const struct pipe_draw_info *info,
                           unsigned drawid_offset,
                           const struct pipe_draw_indirect_info *indirect);

uint64_t agx_batch_get_so_address(struct agx_batch *batch, unsigned buffer,
                                  uint32_t *size);

void agx_init_streamout_functions(struct pipe_context *ctx);

static inline void
agx_dirty_all(struct agx_context *ctx)
{
   ctx->dirty = ~0;

   for (unsigned i = 0; i < ARRAY_SIZE(ctx->stage); ++i)
      ctx->stage[i].dirty = ~0;
}

static inline void
agx_dirty_reset_graphics(struct agx_context *ctx)
{
   ctx->dirty = 0;

   for (unsigned i = 0; i < ARRAY_SIZE(ctx->stage); ++i) {
      if (i != PIPE_SHADER_COMPUTE)
         ctx->stage[i].dirty = 0;
   }
}

struct agx_rasterizer {
   struct pipe_rasterizer_state base;
   uint8_t cull[AGX_CULL_LENGTH];
   uint8_t line_width;
   uint8_t polygon_mode;
   bool depth_bias;
};

struct agx_query {
   unsigned type;
   unsigned index;

   uint64_t writer_generation[AGX_MAX_BATCHES];
   struct agx_bo *bo;
   struct agx_ptr ptr;
};

struct agx_sampler_state {
   struct pipe_sampler_state base;

   /* Prepared descriptor */
   struct agx_sampler_packed desc, desc_without_custom_border;

   /* Whether a custom border colour is required */
   bool uses_custom_border;

   /* Packed custom border colour, or zero if none is required */
   struct agx_border_packed border;

   /* LOD bias packed as fp16, the form we'll pass to the shader */
   uint16_t lod_bias_as_fp16;
};

struct agx_sampler_view {
   struct pipe_sampler_view base;

   /* Resource/format, may differ from base in case of separate stencil */
   struct agx_resource *rsrc;
   enum pipe_format format;

   /* Prepared descriptor */
   struct agx_texture_packed desc;
};
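/* Illustrative sketch, not part of the driver: the agx_grid helpers above tag
 * the union for the three dispatch modes. A direct dispatch carries an
 * explicit global size, while an indirect one reads it from a GPU address at
 * execution time. The wrapper names and sizes below are hypothetical.
 */
static inline struct agx_grid
agx_example_direct_grid(void)
{
   /* Direct: both global and local sizes known on the CPU */
   return agx_grid_direct(1024, 1, 1, 64, 1, 1);
}

static inline struct agx_grid
agx_example_indirect_grid(uint64_t size_gpu_va)
{
   /* Indirect: global size read from a GPU buffer, local size still fixed */
   return agx_grid_indirect(size_gpu_va, 64, 1, 1);
}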
struct agx_screen {
   struct pipe_screen pscreen;
   struct agx_device dev;
   struct disk_cache *disk_cache;

   /* Shared timeline syncobj and value to serialize flushes across contexts */
   uint32_t flush_syncobj;
   uint64_t flush_cur_seqid;
   uint64_t flush_wait_seqid;

   /* Lock to protect flush_wait_seqid updates (reads are just atomic) */
   simple_mtx_t flush_seqid_lock;

   /* Lock to protect syncobj usage vs. destruction in context destroy */
   struct u_rwlock destroy_lock;
};

static inline struct agx_screen *
agx_screen(struct pipe_screen *p)
{
   return (struct agx_screen *)p;
}

static inline struct agx_device *
agx_device(struct pipe_screen *p)
{
   return &(agx_screen(p)->dev);
}

#define perf_debug(dev, ...)                                                   \
   do {                                                                        \
      if (unlikely((dev)->debug & AGX_DBG_PERF))                               \
         mesa_logw(__VA_ARGS__);                                               \
   } while (0)

#define perf_debug_ctx(ctx, ...)                                               \
   perf_debug(agx_device((ctx)->base.screen), __VA_ARGS__)

struct agx_resource {
   struct pipe_resource base;
   uint64_t modifier;

   /* Should probably be part of the modifier. Affects the tiling algorithm, or
    * something like that.
    */
   bool mipmapped;

   /* Hardware backing */
   struct agx_bo *bo;

   struct renderonly_scanout *scanout;

   BITSET_DECLARE(data_valid, PIPE_MAX_TEXTURE_LEVELS);

   struct ail_layout layout;

   /* Metal does not support packed depth/stencil formats; presumably AGX does
    * not either. Instead, we create separate depth and stencil resources,
    * managed by u_transfer_helper. We provide the illusion of packed
    * resources.
    */
   struct agx_resource *separate_stencil;

   /* Valid buffer range tracking, to optimize buffer appends */
   struct util_range valid_buffer_range;

   /* Cumulative shadowed byte count for this resource, that is, the number of
    * times shadowed multiplied by the resource size.
    */
   size_t shadowed_bytes;
};

static inline struct agx_resource *
agx_resource(struct pipe_resource *pctx)
{
   return (struct agx_resource *)pctx;
}

static inline bool
agx_resource_valid(struct agx_resource *rsrc, int level)
{
   /* Shared BOs can always be potentially valid */
   if (rsrc->bo && rsrc->bo->flags & AGX_BO_SHARED) {
      assert(level == 0);
      return true;
   }

   return BITSET_TEST(rsrc->data_valid, level);
}

static inline void *
agx_map_texture_cpu(struct agx_resource *rsrc, unsigned level, unsigned z)
{
   return ((uint8_t *)rsrc->bo->map) +
          ail_get_layer_level_B(&rsrc->layout, z, level);
}

static inline uint64_t
agx_map_texture_gpu(struct agx_resource *rsrc, unsigned z)
{
   return rsrc->bo->va->addr +
          (uint64_t)ail_get_layer_offset_B(&rsrc->layout, z);
}

void agx_decompress(struct agx_context *ctx, struct agx_resource *rsrc,
                    const char *reason);

void agx_legalize_compression(struct agx_context *ctx,
                              struct agx_resource *rsrc,
                              enum pipe_format format);

struct agx_transfer {
   struct pipe_transfer base;
   void *map;
   struct {
      struct pipe_resource *rsrc;
      struct pipe_box box;
   } staging;
};

static inline struct agx_transfer *
agx_transfer(struct pipe_transfer *p)
{
   return (struct agx_transfer *)p;
}

void agx_upload_vbos(struct agx_batch *batch);
void agx_upload_uniforms(struct agx_batch *batch);

void agx_set_sampler_uniforms(struct agx_batch *batch,
                              enum pipe_shader_type stage);

void agx_set_cbuf_uniforms(struct agx_batch *batch,
                           enum pipe_shader_type stage);

void agx_set_ssbo_uniforms(struct agx_batch *batch,
                           enum pipe_shader_type stage);

bool agx_nir_lower_point_size(nir_shader *nir, bool insert_write);

bool agx_nir_lower_sysvals(nir_shader *shader,
                           enum pipe_shader_type desc_stage,
                           bool lower_draw_params);

bool agx_nir_layout_uniforms(nir_shader *shader,
                             struct agx_compiled_shader *compiled,
                             unsigned *push_size);

bool agx_nir_lower_bindings(nir_shader *shader, bool *uses_bindless_samplers);

bool agx_batch_is_active(struct agx_batch *batch);
bool agx_batch_is_submitted(struct agx_batch *batch);
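/* Illustrative sketch, not part of the driver: because depth and stencil are
 * stored as separate resources (see agx_resource::separate_stencil above),
 * code operating on the stencil aspect of a nominally packed Z/S resource
 * first selects the stencil sibling. The helper name is hypothetical.
 */
static inline struct agx_resource *
agx_example_stencil_resource(struct agx_resource *zs)
{
   return zs->separate_stencil ? zs->separate_stencil : zs;
}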
/* Add a BO to a batch. This needs to be amortized O(1) since it's called in
 * hot paths. To achieve this we model BO lists by bit sets.
 */
static bool
agx_batch_uses_bo(struct agx_batch *batch, struct agx_bo *bo)
{
   if (bo->handle < batch->bo_list.bit_count)
      return BITSET_TEST(batch->bo_list.set, bo->handle);
   else
      return false;
}

static inline void
agx_batch_add_bo(struct agx_batch *batch, struct agx_bo *bo)
{
   /* Double the size of the BO list if we run out, this is amortized O(1) */
   if (unlikely(bo->handle >= batch->bo_list.bit_count)) {
      const unsigned bits_per_word = sizeof(BITSET_WORD) * 8;
      unsigned bit_count =
         MAX2(batch->bo_list.bit_count * 2,
              util_next_power_of_two(ALIGN_POT(bo->handle + 1, bits_per_word)));

      batch->bo_list.set =
         rerzalloc(batch->ctx, batch->bo_list.set, BITSET_WORD,
                   batch->bo_list.bit_count / bits_per_word,
                   bit_count / bits_per_word);
      batch->bo_list.bit_count = bit_count;
   }

   if (BITSET_TEST(batch->bo_list.set, bo->handle))
      return;

   /* The batch holds a single reference to each BO in the batch, released when
    * the batch finishes execution.
    */
   agx_bo_reference(bo);
   BITSET_SET(batch->bo_list.set, bo->handle);
}

#define AGX_BATCH_FOREACH_BO_HANDLE(batch, handle)                             \
   BITSET_FOREACH_SET(handle, (batch)->bo_list.set, batch->bo_list.bit_count)

struct drm_asahi_cmd_compute;
struct drm_asahi_cmd_render;

void agx_batch_submit(struct agx_context *ctx, struct agx_batch *batch,
                      struct drm_asahi_cmd_compute *compute,
                      struct drm_asahi_cmd_render *render);

void agx_flush_batch(struct agx_context *ctx, struct agx_batch *batch);
void agx_flush_batch_for_reason(struct agx_context *ctx,
                                struct agx_batch *batch, const char *reason);
void agx_flush_all(struct agx_context *ctx, const char *reason);
void agx_flush_readers(struct agx_context *ctx, struct agx_resource *rsrc,
                       const char *reason);
void agx_flush_writer(struct agx_context *ctx, struct agx_resource *rsrc,
                      const char *reason);

void agx_sync_writer(struct agx_context *ctx, struct agx_resource *rsrc,
                     const char *reason);
void agx_sync_readers(struct agx_context *ctx, struct agx_resource *rsrc,
                      const char *reason);
void agx_sync_batch(struct agx_context *ctx, struct agx_batch *batch);
void agx_sync_all(struct agx_context *ctx, const char *reason);
void agx_sync_batch_for_reason(struct agx_context *ctx,
                               struct agx_batch *batch, const char *reason);
void agx_memory_barrier(struct pipe_context *pctx, unsigned flags);

/* Use these instead of batch_add_bo for proper resource tracking */
void agx_batch_reads(struct agx_batch *batch, struct agx_resource *rsrc);
void agx_batch_writes(struct agx_batch *batch, struct agx_resource *rsrc,
                      unsigned level);
void agx_batch_writes_range(struct agx_batch *batch, struct agx_resource *rsrc,
                            unsigned offset, unsigned size);
void agx_batch_track_image(struct agx_batch *batch,
                           struct pipe_image_view *image);

bool agx_any_batch_uses_resource(struct agx_context *ctx,
                                 struct agx_resource *rsrc);

/* 16384 is the maximum framebuffer dimension, so we use a larger width (the
 * maximum uint16_t) as a sentinel to identify the compute batch. This ensures
 * compute batches don't mix with graphics. This is a bit of a hack but it
 * works.
 */
#define AGX_COMPUTE_BATCH_WIDTH 0xFFFF

static inline bool
agx_batch_is_compute(struct agx_batch *batch)
{
   return batch->key.width == AGX_COMPUTE_BATCH_WIDTH;
}

struct agx_batch *agx_get_batch(struct agx_context *ctx);
struct agx_batch *agx_get_compute_batch(struct agx_context *ctx);
void agx_batch_reset(struct agx_context *ctx, struct agx_batch *batch);
int agx_cleanup_batches(struct agx_context *ctx);

void agx_batch_add_timestamp_query(struct agx_batch *batch,
                                   struct agx_query *q);
void agx_add_timestamp_end_query(struct agx_context *ctx, struct agx_query *q);

void agx_query_increment_cpu(struct agx_context *ctx, struct agx_query *query,
                             uint64_t increment);

/* Blit shaders */
void agx_blitter_save(struct agx_context *ctx, struct blitter_context *blitter,
                      bool render_cond);

void agx_blit(struct pipe_context *pipe, const struct pipe_blit_info *info);

void agx_resource_copy_region(struct pipe_context *pctx,
                              struct pipe_resource *dst, unsigned dst_level,
                              unsigned dstx, unsigned dsty, unsigned dstz,
                              struct pipe_resource *src, unsigned src_level,
                              const struct pipe_box *src_box);

/* Batch logic */

struct agx_encoder agx_encoder_allocate(struct agx_batch *batch,
                                        struct agx_device *dev);

void agx_batch_init_state(struct agx_batch *batch);

struct asahi_bg_eot {
   uint64_t usc;
   struct agx_counts_packed counts;
};

struct asahi_bg_eot agx_build_bg_eot(struct agx_batch *batch, bool store,
                                     bool partial_render);

/* Query management */
uint16_t agx_get_oq_index(struct agx_batch *batch, struct agx_query *query);

uint64_t agx_get_query_address(struct agx_batch *batch,
                               struct agx_query *query);

uint64_t agx_get_occlusion_heap(struct agx_batch *batch);

void agx_finish_batch_queries(struct agx_batch *batch, uint64_t begin_ts,
                              uint64_t end_ts);

bool agx_render_condition_check_inner(struct agx_context *ctx);

static inline bool
agx_render_condition_check(struct agx_context *ctx)
{
   if (likely(!ctx->cond_query))
      return true;
   else
      return agx_render_condition_check_inner(ctx);
}

static inline uint32_t
agx_texture_buffer_size_el(enum pipe_format format, uint32_t size)
{
   unsigned blocksize = util_format_get_blocksize(format);

   return MIN2(AGX_TEXTURE_BUFFER_MAX_SIZE, size / blocksize);
}
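/* For example, a 1024-byte buffer viewed as PIPE_FORMAT_R32G32B32A32_FLOAT
 * (16-byte texels) yields 1024 / 16 = 64 elements, before the
 * AGX_TEXTURE_BUFFER_MAX_SIZE clamp applied above.
 */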