1 /*
2  * Copyright 2021 Alyssa Rosenzweig
3  * Copyright 2019-2020 Collabora, Ltd.
4  * Copyright 2014-2017 Broadcom
5  * Copyright 2010 Red Hat Inc.
6  * SPDX-License-Identifier: MIT
7  */
8 #include "agx_state.h"
9 #include <errno.h>
10 #include <stdio.h>
11 #include "asahi/compiler/agx_compile.h"
12 #include "asahi/genxml/agx_pack.h"
13 #include "asahi/layout/layout.h"
14 #include "asahi/lib/agx_helpers.h"
15 #include "asahi/lib/agx_nir_passes.h"
16 #include "asahi/lib/agx_ppp.h"
17 #include "asahi/lib/agx_usc.h"
18 #include "asahi/lib/shaders/compression.h"
19 #include "asahi/lib/shaders/tessellator.h"
20 #include "compiler/nir/nir.h"
21 #include "compiler/nir/nir_serialize.h"
22 #include "compiler/shader_enums.h"
23 #include "gallium/auxiliary/nir/pipe_nir.h"
24 #include "gallium/auxiliary/nir/tgsi_to_nir.h"
25 #include "gallium/auxiliary/tgsi/tgsi_from_mesa.h"
26 #include "gallium/auxiliary/util/u_blend.h"
27 #include "gallium/auxiliary/util/u_draw.h"
28 #include "gallium/auxiliary/util/u_framebuffer.h"
29 #include "gallium/auxiliary/util/u_helpers.h"
30 #include "gallium/auxiliary/util/u_prim_restart.h"
31 #include "gallium/auxiliary/util/u_viewport.h"
32 #include "pipe/p_context.h"
33 #include "pipe/p_defines.h"
34 #include "pipe/p_screen.h"
35 #include "pipe/p_state.h"
36 #include "shaders/query.h"
37 #include "util/bitscan.h"
38 #include "util/bitset.h"
39 #include "util/blend.h"
40 #include "util/blob.h"
41 #include "util/compiler.h"
42 #include "util/format/u_format.h"
43 #include "util/format/u_formats.h"
44 #include "util/format_srgb.h"
45 #include "util/half_float.h"
46 #include "util/hash_table.h"
47 #include "util/macros.h"
48 #include "util/ralloc.h"
49 #include "util/u_dump.h"
50 #include "util/u_inlines.h"
51 #include "util/u_math.h"
52 #include "util/u_memory.h"
53 #include "util/u_prim.h"
54 #include "util/u_resource.h"
55 #include "util/u_transfer.h"
56 #include "util/u_upload_mgr.h"
57 #include "agx_bg_eot.h"
58 #include "agx_bo.h"
59 #include "agx_device.h"
60 #include "agx_disk_cache.h"
61 #include "agx_linker.h"
62 #include "agx_nir.h"
63 #include "agx_nir_lower_gs.h"
64 #include "agx_nir_lower_vbo.h"
65 #include "agx_tilebuffer.h"
66 #include "nir_builder.h"
67 #include "nir_builder_opcodes.h"
68 #include "nir_intrinsics.h"
69 #include "nir_intrinsics_indices.h"
70 #include "nir_lower_blend.h"
71 #include "nir_xfb_info.h"
72 #include "pool.h"
73 
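/* If the resource's compressed layout cannot legally be viewed with the
 * requested format, decompress it so the view is safe to create.
 */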
74 void
75 agx_legalize_compression(struct agx_context *ctx, struct agx_resource *rsrc,
76                          enum pipe_format format)
77 {
78    if (!ail_is_view_compatible(&rsrc->layout, format)) {
79       agx_decompress(ctx, rsrc, "Incompatible formats");
80    }
81 }
82 
83 static void
84 agx_set_shader_images(struct pipe_context *pctx, enum pipe_shader_type shader,
85                       unsigned start_slot, unsigned count,
86                       unsigned unbind_num_trailing_slots,
87                       const struct pipe_image_view *iviews)
88 {
89    struct agx_context *ctx = agx_context(pctx);
90    ctx->stage[shader].dirty |= AGX_STAGE_DIRTY_IMAGE;
91 
92    /* Unbind start_slot...start_slot+count+unbind_num_trailing_slots */
93    if (!iviews) {
94       for (int i = start_slot;
95            i < start_slot + count + unbind_num_trailing_slots; i++) {
96          pipe_resource_reference(&ctx->stage[shader].images[i].resource, NULL);
97       }
98 
99       ctx->stage[shader].image_mask &=
100          ~BITFIELD64_MASK(count + unbind_num_trailing_slots) << start_slot;
101       return;
102    }
103 
104    /* Images writeable with pixel granularity are incompatible with
105     * compression. Decompress if necessary.
106     *
107     * Driver-internal images are used by the compute blitter and are exempt
108     * from these transitions, as it only uses compressed images when safe.
109     *
110     * We do this upfront because agx_decompress and agx_legalize_compression can
111     * call set_shader_images internally.
112     */
113    for (int i = 0; i < count; i++) {
114       const struct pipe_image_view *image = &iviews[i];
115       struct agx_resource *rsrc = agx_resource(image->resource);
116 
117       if (rsrc && !(image->access & PIPE_IMAGE_ACCESS_DRIVER_INTERNAL)) {
118          if (!rsrc->layout.writeable_image &&
119              (image->shader_access & PIPE_IMAGE_ACCESS_WRITE)) {
120 
121             agx_decompress(ctx, rsrc, "Shader image");
122          }
123 
124          /* Readable images may be compressed but are still subject to format
125           * reinterpretation rules.
126           */
127          agx_legalize_compression(ctx, rsrc, image->format);
128 
129          if (image->shader_access & PIPE_IMAGE_ACCESS_WRITE)
130             assert(rsrc->layout.writeable_image);
131       }
132    }
133 
134    /* Bind start_slot...start_slot+count */
135    for (int i = 0; i < count; i++) {
136       const struct pipe_image_view *image = &iviews[i];
137 
138       if (!image->resource) {
139          util_copy_image_view(&ctx->stage[shader].images[start_slot + i], NULL);
140          ctx->stage[shader].image_mask &= ~BITFIELD_BIT(start_slot + i);
141       } else {
142          util_copy_image_view(&ctx->stage[shader].images[start_slot + i],
143                               image);
144          ctx->stage[shader].image_mask |= BITFIELD_BIT(start_slot + i);
145       }
146    }
147 
148    /* Unbind start_slot+count...start_slot+count+unbind_num_trailing_slots */
149    for (int i = 0; i < unbind_num_trailing_slots; i++) {
150       ctx->stage[shader].image_mask &= ~BITFIELD_BIT(start_slot + count + i);
151       util_copy_image_view(&ctx->stage[shader].images[start_slot + count + i],
152                            NULL);
153    }
154 }
155 
156 static void
157 agx_set_shader_buffers(struct pipe_context *pctx, enum pipe_shader_type shader,
158                        unsigned start, unsigned count,
159                        const struct pipe_shader_buffer *buffers,
160                        unsigned writable_bitmask)
161 {
162    struct agx_context *ctx = agx_context(pctx);
163 
164    util_set_shader_buffers_mask(ctx->stage[shader].ssbo,
165                                 &ctx->stage[shader].ssbo_mask, buffers, start,
166                                 count);
167 
168    ctx->stage[shader].dirty |= AGX_STAGE_DIRTY_SSBO;
169    ctx->stage[shader].ssbo_writable_mask &= ~(BITFIELD_MASK(count) << start);
170    ctx->stage[shader].ssbo_writable_mask |= writable_bitmask << start;
171 }
172 
173 static void
174 agx_set_blend_color(struct pipe_context *pctx,
175                     const struct pipe_blend_color *state)
176 {
177    struct agx_context *ctx = agx_context(pctx);
178 
179    if (state)
180       memcpy(&ctx->blend_color, state, sizeof(*state));
181 
182    ctx->dirty |= AGX_DIRTY_BLEND_COLOR;
183 }
184 
185 static void
186 agx_set_patch_vertices(struct pipe_context *pctx, unsigned char n)
187 {
188    struct agx_context *ctx = agx_context(pctx);
189    ctx->patch_vertices = n;
190 }
191 
192 static void
193 agx_set_tess_state(struct pipe_context *pctx,
194                    const float default_outer_level[4],
195                    const float default_inner_level[2])
196 {
197    struct agx_context *ctx = agx_context(pctx);
198 
199    memcpy(ctx->default_outer_level, default_outer_level, 4 * sizeof(float));
200    memcpy(ctx->default_inner_level, default_inner_level, 2 * sizeof(float));
201 }
202 
203 static void *
204 agx_create_blend_state(struct pipe_context *ctx,
205                        const struct pipe_blend_state *state)
206 {
207    struct agx_blend *so = CALLOC_STRUCT(agx_blend);
208    struct agx_blend_key *key = &so->key;
209 
210    key->alpha_to_coverage = state->alpha_to_coverage;
211    key->alpha_to_one = state->alpha_to_one;
212 
213    key->logicop_func =
214       state->logicop_enable ? state->logicop_func : PIPE_LOGICOP_COPY;
215 
216    for (unsigned i = 0; i < PIPE_MAX_COLOR_BUFS; ++i) {
217       unsigned rti = state->independent_blend_enable ? i : 0;
218       struct pipe_rt_blend_state rt = state->rt[rti];
219 
220       if (state->logicop_enable || !rt.blend_enable) {
221          /* No blending, but we get the colour mask below */
222          key->rt[i] = (struct agx_blend_rt_key){
223             .rgb_func = PIPE_BLEND_ADD,
224             .rgb_src_factor = PIPE_BLENDFACTOR_ONE,
225             .rgb_dst_factor = PIPE_BLENDFACTOR_ZERO,
226 
227             .alpha_func = PIPE_BLEND_ADD,
228             .alpha_src_factor = PIPE_BLENDFACTOR_ONE,
229             .alpha_dst_factor = PIPE_BLENDFACTOR_ZERO,
230          };
231       } else {
232          key->rt[i].rgb_func = rt.rgb_func;
233          key->rt[i].rgb_src_factor = rt.rgb_src_factor;
234          key->rt[i].rgb_dst_factor = rt.rgb_dst_factor;
235 
236          key->rt[i].alpha_func = rt.alpha_func;
237          key->rt[i].alpha_src_factor = rt.alpha_src_factor;
238          key->rt[i].alpha_dst_factor = rt.alpha_dst_factor;
239       }
240 
241       key->rt[i].colormask = rt.colormask;
242 
243       if (rt.colormask)
244          so->store |= (PIPE_CLEAR_COLOR0 << i);
245    }
246 
247    return so;
248 }
249 
250 static void
251 agx_bind_blend_state(struct pipe_context *pctx, void *cso)
252 {
253    struct agx_context *ctx = agx_context(pctx);
254    ctx->blend = cso;
255    ctx->dirty |= AGX_DIRTY_BLEND;
256 }
257 
258 static const enum agx_stencil_op agx_stencil_ops[PIPE_STENCIL_OP_INVERT + 1] = {
259    [PIPE_STENCIL_OP_KEEP] = AGX_STENCIL_OP_KEEP,
260    [PIPE_STENCIL_OP_ZERO] = AGX_STENCIL_OP_ZERO,
261    [PIPE_STENCIL_OP_REPLACE] = AGX_STENCIL_OP_REPLACE,
262    [PIPE_STENCIL_OP_INCR] = AGX_STENCIL_OP_INCR_SAT,
263    [PIPE_STENCIL_OP_DECR] = AGX_STENCIL_OP_DECR_SAT,
264    [PIPE_STENCIL_OP_INCR_WRAP] = AGX_STENCIL_OP_INCR_WRAP,
265    [PIPE_STENCIL_OP_DECR_WRAP] = AGX_STENCIL_OP_DECR_WRAP,
266    [PIPE_STENCIL_OP_INVERT] = AGX_STENCIL_OP_INVERT,
267 };
268 
269 static void
270 agx_pack_stencil(struct agx_fragment_stencil_packed *out,
271                  struct pipe_stencil_state st)
272 {
273    if (st.enabled) {
274       agx_pack(out, FRAGMENT_STENCIL, cfg) {
275          cfg.compare = (enum agx_zs_func)st.func;
276          cfg.write_mask = st.writemask;
277          cfg.read_mask = st.valuemask;
278 
279          cfg.depth_pass = agx_stencil_ops[st.zpass_op];
280          cfg.depth_fail = agx_stencil_ops[st.zfail_op];
281          cfg.stencil_fail = agx_stencil_ops[st.fail_op];
282       }
283    } else {
284       agx_pack(out, FRAGMENT_STENCIL, cfg) {
285          cfg.compare = AGX_ZS_FUNC_ALWAYS;
286          cfg.write_mask = 0xFF;
287          cfg.read_mask = 0xFF;
288 
289          cfg.depth_pass = AGX_STENCIL_OP_KEEP;
290          cfg.depth_fail = AGX_STENCIL_OP_KEEP;
291          cfg.stencil_fail = AGX_STENCIL_OP_KEEP;
292       }
293    }
294 }
295 
296 static void *
297 agx_create_zsa_state(struct pipe_context *ctx,
298                      const struct pipe_depth_stencil_alpha_state *state)
299 {
300    struct agx_zsa *so = CALLOC_STRUCT(agx_zsa);
301    assert(!state->depth_bounds_test && "todo");
302 
303    so->base = *state;
304 
305    /* Handle the enable flag */
306    enum pipe_compare_func depth_func =
307       state->depth_enabled ? state->depth_func : PIPE_FUNC_ALWAYS;
308 
309    /* Z func can otherwise be used as-is */
310    STATIC_ASSERT((enum agx_zs_func)PIPE_FUNC_NEVER == AGX_ZS_FUNC_NEVER);
311    STATIC_ASSERT((enum agx_zs_func)PIPE_FUNC_LESS == AGX_ZS_FUNC_LESS);
312    STATIC_ASSERT((enum agx_zs_func)PIPE_FUNC_EQUAL == AGX_ZS_FUNC_EQUAL);
313    STATIC_ASSERT((enum agx_zs_func)PIPE_FUNC_LEQUAL == AGX_ZS_FUNC_LEQUAL);
314    STATIC_ASSERT((enum agx_zs_func)PIPE_FUNC_GREATER == AGX_ZS_FUNC_GREATER);
315    STATIC_ASSERT((enum agx_zs_func)PIPE_FUNC_NOTEQUAL == AGX_ZS_FUNC_NOT_EQUAL);
316    STATIC_ASSERT((enum agx_zs_func)PIPE_FUNC_GEQUAL == AGX_ZS_FUNC_GEQUAL);
317    STATIC_ASSERT((enum agx_zs_func)PIPE_FUNC_ALWAYS == AGX_ZS_FUNC_ALWAYS);
318 
319    agx_pack(&so->depth, FRAGMENT_FACE, cfg) {
320       cfg.depth_function = (enum agx_zs_func)depth_func;
321       cfg.disable_depth_write = !state->depth_writemask;
322    }
323 
324    agx_pack_stencil(&so->front_stencil, state->stencil[0]);
325 
326    if (state->stencil[1].enabled) {
327       agx_pack_stencil(&so->back_stencil, state->stencil[1]);
328    } else {
329       /* One sided stencil */
330       so->back_stencil = so->front_stencil;
331    }
332 
333    if (depth_func != PIPE_FUNC_NEVER && depth_func != PIPE_FUNC_ALWAYS)
334       so->load |= PIPE_CLEAR_DEPTH;
335 
336    if (state->depth_writemask) {
337       so->load |= PIPE_CLEAR_DEPTH;
338       so->store |= PIPE_CLEAR_DEPTH;
339    }
340 
341    if (state->stencil[0].enabled) {
342       so->load |= PIPE_CLEAR_STENCIL; /* TODO: Optimize */
343       so->store |= PIPE_CLEAR_STENCIL;
344    }
345 
346    return so;
347 }
348 
349 static void
350 agx_bind_zsa_state(struct pipe_context *pctx, void *cso)
351 {
352    struct agx_context *ctx = agx_context(pctx);
353    ctx->zs = cso;
354    ctx->dirty |= AGX_DIRTY_ZS;
355 }
356 
357 static enum agx_polygon_mode
358 agx_translate_polygon_mode(unsigned mode)
359 {
360    switch (mode) {
361    case PIPE_POLYGON_MODE_FILL:
362       return AGX_POLYGON_MODE_FILL;
363    case PIPE_POLYGON_MODE_POINT:
364       return AGX_POLYGON_MODE_POINT;
365    case PIPE_POLYGON_MODE_LINE:
366       return AGX_POLYGON_MODE_LINE;
367    default:
368       unreachable("Unsupported polygon mode");
369    }
370 }
371 
372 static void *
373 agx_create_rs_state(struct pipe_context *ctx,
374                     const struct pipe_rasterizer_state *cso)
375 {
376    struct agx_rasterizer *so = CALLOC_STRUCT(agx_rasterizer);
377    so->base = *cso;
378 
379    agx_pack(so->cull, CULL, cfg) {
380       cfg.cull_front = cso->cull_face & PIPE_FACE_FRONT;
381       cfg.cull_back = cso->cull_face & PIPE_FACE_BACK;
382       cfg.front_face_ccw = cso->front_ccw;
383       cfg.depth_clip = cso->depth_clip_near;
384       cfg.depth_clamp = !cso->depth_clip_near;
385       cfg.flat_shading_vertex =
386          cso->flatshade_first ? AGX_PPP_VERTEX_0 : AGX_PPP_VERTEX_2;
387       cfg.rasterizer_discard = cso->rasterizer_discard;
388    };
389 
390    /* Two-sided polygon mode doesn't seem to work on G13. Apple's OpenGL
391     * implementation lowers to multiple draws with culling. Warn.
392     */
393    if (unlikely(cso->fill_front != cso->fill_back)) {
394       agx_msg("Warning: Two-sided fill modes are unsupported, "
395               "rendering may be incorrect.\n");
396    }
397 
398    so->polygon_mode = agx_translate_polygon_mode(cso->fill_front);
399    so->line_width = agx_pack_line_width(cso->line_width);
400    so->depth_bias = util_get_offset(cso, cso->fill_front);
401 
402    return so;
403 }
404 
405 static void
406 agx_bind_rasterizer_state(struct pipe_context *pctx, void *cso)
407 {
408    struct agx_context *ctx = agx_context(pctx);
409    struct agx_rasterizer *so = cso;
410 
411    bool base_cso_changed = (cso == NULL) || (ctx->rast == NULL);
412 
413    /* Check if scissor or depth bias state has changed, since scissor/depth bias
414     * enable is part of the rasterizer state but everything else needed for
415     * scissors and depth bias is part of the scissor/depth bias arrays */
416    bool scissor_zbias_changed = base_cso_changed ||
417                                 (ctx->rast->base.scissor != so->base.scissor) ||
418                                 (ctx->rast->depth_bias != so->depth_bias);
419 
420    ctx->dirty |= AGX_DIRTY_RS;
421 
422    if (scissor_zbias_changed)
423       ctx->dirty |= AGX_DIRTY_SCISSOR_ZBIAS;
424 
425    if (base_cso_changed ||
426        (ctx->rast->base.sprite_coord_mode != so->base.sprite_coord_mode))
427       ctx->dirty |= AGX_DIRTY_SPRITE_COORD_MODE;
428 
429    ctx->rast = so;
430 }
431 
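/* Edge flags only matter when drawing triangles with a non-FILL polygon mode
 * and the bound vertex shader actually writes them.
 */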
432 static bool
433 has_edgeflags(struct agx_context *ctx, enum mesa_prim mode)
434 {
435    return ctx->stage[PIPE_SHADER_VERTEX].shader->info.has_edgeflags &&
436           mode == MESA_PRIM_TRIANGLES &&
437           (ctx->rast->base.fill_front != PIPE_POLYGON_MODE_FILL);
438 }
439 
440 static enum agx_wrap
441 agx_wrap_from_pipe(enum pipe_tex_wrap in)
442 {
443    switch (in) {
444    case PIPE_TEX_WRAP_REPEAT:
445       return AGX_WRAP_REPEAT;
446    case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
447       return AGX_WRAP_CLAMP_TO_EDGE;
448    case PIPE_TEX_WRAP_MIRROR_REPEAT:
449       return AGX_WRAP_MIRRORED_REPEAT;
450    case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
451       return AGX_WRAP_CLAMP_TO_BORDER;
452    case PIPE_TEX_WRAP_CLAMP:
453       return AGX_WRAP_CLAMP_GL;
454    case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
455       return AGX_WRAP_MIRRORED_CLAMP_TO_EDGE;
456    default:
457       unreachable("Invalid wrap mode");
458    }
459 }
460 
461 static enum agx_mip_filter
462 agx_mip_filter_from_pipe(enum pipe_tex_mipfilter in)
463 {
464    switch (in) {
465    case PIPE_TEX_MIPFILTER_NEAREST:
466       return AGX_MIP_FILTER_NEAREST;
467    case PIPE_TEX_MIPFILTER_LINEAR:
468       return AGX_MIP_FILTER_LINEAR;
469    case PIPE_TEX_MIPFILTER_NONE:
470       return AGX_MIP_FILTER_NONE;
471    }
472 
473    unreachable("Invalid mip filter");
474 }
475 
476 static const enum agx_compare_func agx_compare_funcs[PIPE_FUNC_ALWAYS + 1] = {
477    [PIPE_FUNC_NEVER] = AGX_COMPARE_FUNC_NEVER,
478    [PIPE_FUNC_LESS] = AGX_COMPARE_FUNC_LESS,
479    [PIPE_FUNC_EQUAL] = AGX_COMPARE_FUNC_EQUAL,
480    [PIPE_FUNC_LEQUAL] = AGX_COMPARE_FUNC_LEQUAL,
481    [PIPE_FUNC_GREATER] = AGX_COMPARE_FUNC_GREATER,
482    [PIPE_FUNC_NOTEQUAL] = AGX_COMPARE_FUNC_NOT_EQUAL,
483    [PIPE_FUNC_GEQUAL] = AGX_COMPARE_FUNC_GEQUAL,
484    [PIPE_FUNC_ALWAYS] = AGX_COMPARE_FUNC_ALWAYS,
485 };
486 
487 static const enum agx_filter agx_filters[] = {
488    [PIPE_TEX_FILTER_LINEAR] = AGX_FILTER_LINEAR,
489    [PIPE_TEX_FILTER_NEAREST] = AGX_FILTER_NEAREST,
490 };
491 
492 static enum pipe_format
493 fixup_border_zs(enum pipe_format orig, union pipe_color_union *c)
494 {
495    switch (orig) {
496    case PIPE_FORMAT_Z24_UNORM_S8_UINT:
497    case PIPE_FORMAT_Z24X8_UNORM:
498       /* Z24 is internally promoted to Z32F via transfer_helper. These formats
499        * are normalized so should get clamped, but Z32F does not get clamped, so
500        * we clamp here.
501        */
502       c->f[0] = SATURATE(c->f[0]);
503       return PIPE_FORMAT_Z32_FLOAT;
504 
505    case PIPE_FORMAT_X24S8_UINT:
506    case PIPE_FORMAT_X32_S8X24_UINT:
507       /* Separate stencil is internally promoted */
508       return PIPE_FORMAT_S8_UINT;
509 
510    default:
511       return orig;
512    }
513 }
514 
515 static void *
516 agx_create_sampler_state(struct pipe_context *pctx,
517                          const struct pipe_sampler_state *state)
518 {
519    struct agx_sampler_state *so = CALLOC_STRUCT(agx_sampler_state);
520    so->base = *state;
521 
522    /* We report a max texture LOD bias of 16, so clamp appropriately */
523    float lod_bias = CLAMP(state->lod_bias, -16.0, 16.0);
524    so->lod_bias_as_fp16 = _mesa_float_to_half(lod_bias);
525 
526    agx_pack(&so->desc, SAMPLER, cfg) {
527       cfg.minimum_lod = state->min_lod;
528       cfg.maximum_lod = state->max_lod;
529       cfg.maximum_anisotropy =
530          util_next_power_of_two(MAX2(state->max_anisotropy, 1));
531       cfg.magnify = agx_filters[state->mag_img_filter];
532       cfg.minify = agx_filters[state->min_img_filter];
533       cfg.mip_filter = agx_mip_filter_from_pipe(state->min_mip_filter);
534       cfg.wrap_s = agx_wrap_from_pipe(state->wrap_s);
535       cfg.wrap_t = agx_wrap_from_pipe(state->wrap_t);
536       cfg.wrap_r = agx_wrap_from_pipe(state->wrap_r);
537       cfg.pixel_coordinates = state->unnormalized_coords;
538       cfg.compare_func = agx_compare_funcs[state->compare_func];
539       cfg.compare_enable = state->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE;
540       cfg.seamful_cube_maps = !state->seamless_cube_map;
541 
542       if (state->border_color_format != PIPE_FORMAT_NONE) {
543          /* TODO: Optimize to use compact descriptors for black/white borders */
544          so->uses_custom_border = true;
545          cfg.border_colour = AGX_BORDER_COLOUR_CUSTOM;
546       }
547    }
548 
549    memcpy(&so->desc_without_custom_border, &so->desc, sizeof(so->desc));
550 
551    if (so->uses_custom_border) {
552       union pipe_color_union border = state->border_color;
553       enum pipe_format format =
554          fixup_border_zs(state->border_color_format, &border);
555 
556       agx_pack_border(&so->border, border.ui, format);
557 
558       /* Neutralize the bindless-safe descriptor. XXX: This is a hack. */
559       so->desc_without_custom_border.opaque[1] &= ~(1u << 23);
560    }
561 
562    return so;
563 }
564 
565 static void
566 agx_delete_sampler_state(struct pipe_context *ctx, void *state)
567 {
568    struct agx_sampler_state *so = state;
569    FREE(so);
570 }
571 
572 static void
573 agx_bind_sampler_states(struct pipe_context *pctx, enum pipe_shader_type shader,
574                         unsigned start, unsigned count, void **states)
575 {
576    struct agx_context *ctx = agx_context(pctx);
577 
578    ctx->stage[shader].dirty |= AGX_STAGE_DIRTY_SAMPLER;
579 
580    for (unsigned i = 0; i < count; i++) {
581       unsigned p = start + i;
582       ctx->stage[shader].samplers[p] = states ? states[i] : NULL;
583       if (ctx->stage[shader].samplers[p])
584          ctx->stage[shader].valid_samplers |= BITFIELD_BIT(p);
585       else
586          ctx->stage[shader].valid_samplers &= ~BITFIELD_BIT(p);
587    }
588 
589    ctx->stage[shader].sampler_count =
590       util_last_bit(ctx->stage[shader].valid_samplers);
591 
592    /* Recalculate whether we need custom borders */
593    ctx->stage[shader].custom_borders = false;
594 
595    u_foreach_bit(i, ctx->stage[shader].valid_samplers) {
596       if (ctx->stage[shader].samplers[i]->uses_custom_border)
597          ctx->stage[shader].custom_borders = true;
598    }
599 }
600 
601 static enum agx_texture_dimension
602 agx_translate_tex_dim(enum pipe_texture_target dim, unsigned samples)
603 {
604    assert(samples >= 1);
605 
606    switch (dim) {
607    case PIPE_BUFFER:
608    case PIPE_TEXTURE_1D:
609       /* Lowered to 2D */
610       assert(samples == 1);
611       return AGX_TEXTURE_DIMENSION_2D;
612 
613    case PIPE_TEXTURE_RECT:
614    case PIPE_TEXTURE_2D:
615       return samples > 1 ? AGX_TEXTURE_DIMENSION_2D_MULTISAMPLED
616                          : AGX_TEXTURE_DIMENSION_2D;
617 
618    case PIPE_TEXTURE_1D_ARRAY:
619       assert(samples == 1);
620       /* Lowered to 2D */
621       FALLTHROUGH;
622    case PIPE_TEXTURE_2D_ARRAY:
623       return samples > 1 ? AGX_TEXTURE_DIMENSION_2D_ARRAY_MULTISAMPLED
624                          : AGX_TEXTURE_DIMENSION_2D_ARRAY;
625 
626    case PIPE_TEXTURE_3D:
627       assert(samples == 1);
628       return AGX_TEXTURE_DIMENSION_3D;
629 
630    case PIPE_TEXTURE_CUBE:
631       assert(samples == 1);
632       return AGX_TEXTURE_DIMENSION_CUBE;
633 
634    case PIPE_TEXTURE_CUBE_ARRAY:
635       assert(samples == 1);
636       return AGX_TEXTURE_DIMENSION_CUBE_ARRAY;
637 
638    default:
639       unreachable("Unsupported texture dimension");
640    }
641 }
642 
643 static bool
644 target_is_cube(enum pipe_texture_target target)
645 {
646    return target == PIPE_TEXTURE_CUBE || target == PIPE_TEXTURE_CUBE_ARRAY;
647 }
648 
649 static void
650 agx_pack_texture(void *out, struct agx_resource *rsrc,
651                  enum pipe_format format /* override */,
652                  const struct pipe_sampler_view *state)
653 {
654    const struct util_format_description *desc = util_format_description(format);
655 
656    assert(ail_is_valid_pixel_format(format));
657 
658    uint8_t format_swizzle[4] = {
659       desc->swizzle[0],
660       desc->swizzle[1],
661       desc->swizzle[2],
662       desc->swizzle[3],
663    };
664 
665    if (util_format_is_depth_or_stencil(format)) {
666       assert(!util_format_is_depth_and_stencil(format) &&
667              "separate stencil always used");
668 
669       /* Broadcast depth and stencil */
670       format_swizzle[0] = 0;
671       format_swizzle[1] = 0;
672       format_swizzle[2] = 0;
673       format_swizzle[3] = 0;
674    }
675 
676    /* We only have a single swizzle for the user swizzle and the format fixup,
677     * so compose them now. */
678    uint8_t out_swizzle[4];
679    uint8_t view_swizzle[4] = {state->swizzle_r, state->swizzle_g,
680                               state->swizzle_b, state->swizzle_a};
681 
682    util_format_compose_swizzles(format_swizzle, view_swizzle, out_swizzle);
683 
684    unsigned first_layer =
685       (state->target == PIPE_BUFFER) ? 0 : state->u.tex.first_layer;
686 
687    /* Pack the descriptor into GPU memory */
688    agx_pack(out, TEXTURE, cfg) {
689       cfg.dimension = agx_translate_tex_dim(state->target,
690                                             util_res_sample_count(&rsrc->base));
691       cfg.layout = agx_translate_layout(rsrc->layout.tiling);
692       cfg.channels = ail_pixel_format[format].channels;
693       cfg.type = ail_pixel_format[format].type;
694       cfg.swizzle_r = agx_channel_from_pipe(out_swizzle[0]);
695       cfg.swizzle_g = agx_channel_from_pipe(out_swizzle[1]);
696       cfg.swizzle_b = agx_channel_from_pipe(out_swizzle[2]);
697       cfg.swizzle_a = agx_channel_from_pipe(out_swizzle[3]);
698 
699       if (state->target == PIPE_BUFFER) {
700          unsigned size_el =
701             agx_texture_buffer_size_el(format, state->u.buf.size);
702 
703          /* Use a 2D texture to increase the maximum size */
704          cfg.width = AGX_TEXTURE_BUFFER_WIDTH;
705          cfg.height = DIV_ROUND_UP(size_el, cfg.width);
706          cfg.first_level = cfg.last_level = 0;
707          cfg.buffer_size_sw = size_el;
708          cfg.buffer_offset_sw = 0;
709       } else {
710          cfg.width = rsrc->base.width0;
711          cfg.height = rsrc->base.height0;
712          cfg.first_level = state->u.tex.first_level;
713          cfg.last_level = state->u.tex.last_level;
714       }
715 
716       cfg.srgb = (desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB);
717       cfg.unk_mipmapped = rsrc->mipmapped;
718       cfg.srgb_2_channel = cfg.srgb && util_format_colormask(desc) == 0x3;
719 
720       if (ail_is_compressed(&rsrc->layout)) {
721          cfg.compressed_1 = true;
722          cfg.extended = true;
723       }
724 
725       cfg.address = agx_map_texture_gpu(rsrc, first_layer);
726 
727       if (state->target == PIPE_BUFFER)
728          cfg.address += state->u.buf.offset;
729 
730       if (ail_is_compressed(&rsrc->layout)) {
731          cfg.acceleration_buffer =
732             agx_map_texture_gpu(rsrc, 0) + rsrc->layout.metadata_offset_B +
733             (first_layer * rsrc->layout.compression_layer_stride_B);
734       }
735 
736       if (state->target == PIPE_TEXTURE_3D) {
737          cfg.depth = rsrc->base.depth0;
738       } else if (state->target == PIPE_BUFFER) {
739          cfg.depth = 1;
740       } else {
741          unsigned layers =
742             state->u.tex.last_layer - state->u.tex.first_layer + 1;
743 
744          if (target_is_cube(state->target))
745             layers /= 6;
746 
747          if (rsrc->layout.tiling == AIL_TILING_LINEAR &&
748              (state->target == PIPE_TEXTURE_1D_ARRAY ||
749               state->target == PIPE_TEXTURE_2D_ARRAY)) {
750 
751             cfg.depth_linear = layers;
752             cfg.layer_stride_linear = (rsrc->layout.layer_stride_B - 0x80);
753             cfg.extended = true;
754          } else {
755             assert((rsrc->layout.tiling != AIL_TILING_LINEAR) || (layers == 1));
756             cfg.depth = layers;
757          }
758       }
759 
760       if (rsrc->base.nr_samples > 1)
761          cfg.samples = agx_translate_sample_count(rsrc->base.nr_samples);
762 
763       if (state->target == PIPE_BUFFER) {
764          cfg.stride = (cfg.width * util_format_get_blocksize(format)) - 16;
765       } else if (rsrc->layout.tiling == AIL_TILING_LINEAR) {
766          cfg.stride = ail_get_linear_stride_B(&rsrc->layout, 0) - 16;
767       } else {
768          assert(rsrc->layout.tiling == AIL_TILING_TWIDDLED ||
769                 rsrc->layout.tiling == AIL_TILING_TWIDDLED_COMPRESSED);
770 
771          cfg.page_aligned_layers = rsrc->layout.page_aligned_layers;
772       }
773    }
774 }
775 
776 static struct pipe_sampler_view *
777 agx_create_sampler_view(struct pipe_context *pctx,
778                         struct pipe_resource *orig_texture,
779                         const struct pipe_sampler_view *state)
780 {
781    struct agx_resource *rsrc = agx_resource(orig_texture);
782    struct agx_sampler_view *so = CALLOC_STRUCT(agx_sampler_view);
783 
784    if (!so)
785       return NULL;
786 
787    struct pipe_resource *texture = orig_texture;
788    enum pipe_format format = state->format;
789 
790    const struct util_format_description *desc = util_format_description(format);
791 
792    /* Separate stencil always used on G13, so we need to fix up for Z32S8 */
793    if (util_format_has_stencil(desc) && rsrc->separate_stencil) {
794       if (util_format_has_depth(desc)) {
795          /* Reinterpret as the depth-only part */
796          format = util_format_get_depth_only(format);
797       } else {
798          /* Use the stencil-only part */
799          rsrc = rsrc->separate_stencil;
800          texture = &rsrc->base;
801          format = texture->format;
802       }
803    }
804 
805    agx_legalize_compression(agx_context(pctx), rsrc, format);
806 
807    /* Save off the resource that we actually use, with the stencil fixed up */
808    so->rsrc = rsrc;
809    so->format = format;
810 
811    so->base = *state;
812    so->base.texture = NULL;
813    pipe_resource_reference(&so->base.texture, orig_texture);
814    pipe_reference_init(&so->base.reference, 1);
815    so->base.context = pctx;
816    return &so->base;
817 }
818 
819 static void
820 agx_set_sampler_views(struct pipe_context *pctx, enum pipe_shader_type shader,
821                       unsigned start, unsigned count,
822                       unsigned unbind_num_trailing_slots, bool take_ownership,
823                       struct pipe_sampler_view **views)
824 {
825    struct agx_context *ctx = agx_context(pctx);
826    unsigned new_nr = 0;
827    unsigned i;
828 
829    assert(start == 0);
830 
831    if (!views)
832       count = 0;
833 
834    for (i = 0; i < count; ++i) {
835       if (take_ownership) {
836          pipe_sampler_view_reference(
837             (struct pipe_sampler_view **)&ctx->stage[shader].textures[i], NULL);
838          ctx->stage[shader].textures[i] = (struct agx_sampler_view *)views[i];
839       } else {
840          pipe_sampler_view_reference(
841             (struct pipe_sampler_view **)&ctx->stage[shader].textures[i],
842             views[i]);
843       }
844    }
845 
846    for (; i < count + unbind_num_trailing_slots; i++) {
847       pipe_sampler_view_reference(
848          (struct pipe_sampler_view **)&ctx->stage[shader].textures[i], NULL);
849    }
850 
851    for (unsigned t = 0; t < MAX2(ctx->stage[shader].texture_count, count);
852         ++t) {
853       if (ctx->stage[shader].textures[t])
854          new_nr = t + 1;
855    }
856 
857    ctx->stage[shader].texture_count = new_nr;
858    ctx->stage[shader].dirty |= AGX_STAGE_DIRTY_IMAGE;
859 }
860 
861 static void
862 agx_sampler_view_destroy(struct pipe_context *ctx,
863                          struct pipe_sampler_view *pview)
864 {
865    struct agx_sampler_view *view = (struct agx_sampler_view *)pview;
866    pipe_resource_reference(&view->base.texture, NULL);
867    FREE(view);
868 }
869 
870 static struct pipe_surface *
871 agx_create_surface(struct pipe_context *ctx, struct pipe_resource *texture,
872                    const struct pipe_surface *surf_tmpl)
873 {
874    agx_legalize_compression(agx_context(ctx), agx_resource(texture),
875                             surf_tmpl->format);
876 
877    struct pipe_surface *surface = CALLOC_STRUCT(pipe_surface);
878 
879    if (!surface)
880       return NULL;
881 
882    unsigned level = surf_tmpl->u.tex.level;
883 
884    pipe_reference_init(&surface->reference, 1);
885    pipe_resource_reference(&surface->texture, texture);
886 
887    assert(texture->target != PIPE_BUFFER && "buffers are not renderable");
888 
889    surface->context = ctx;
890    surface->format = surf_tmpl->format;
891    surface->nr_samples = surf_tmpl->nr_samples;
892    surface->width = u_minify(texture->width0, level);
893    surface->height = u_minify(texture->height0, level);
894    surface->texture = texture;
895    surface->u.tex.first_layer = surf_tmpl->u.tex.first_layer;
896    surface->u.tex.last_layer = surf_tmpl->u.tex.last_layer;
897    surface->u.tex.level = level;
898 
899    return surface;
900 }
901 
902 static void
903 agx_set_clip_state(struct pipe_context *ctx,
904                    const struct pipe_clip_state *state)
905 {
906 }
907 
908 static void
909 agx_set_polygon_stipple(struct pipe_context *pctx,
910                         const struct pipe_poly_stipple *state)
911 {
912    struct agx_context *ctx = agx_context(pctx);
913 
914    memcpy(ctx->poly_stipple, state->stipple, sizeof(ctx->poly_stipple));
915    ctx->dirty |= AGX_DIRTY_POLY_STIPPLE;
916 }
917 
918 static void
919 agx_set_sample_mask(struct pipe_context *pipe, unsigned sample_mask)
920 {
921    struct agx_context *ctx = agx_context(pipe);
922 
923    /* Optimization: At most MSAA 4x supported, so normalize to avoid pointless
924     * dirtying when switching between e.g. 0xFFFF and 0xFFFFFFFF masks.
925     */
926    unsigned new_mask = sample_mask & BITFIELD_MASK(4);
927 
928    if (ctx->sample_mask != new_mask) {
929       ctx->sample_mask = new_mask;
930       ctx->dirty |= AGX_DIRTY_SAMPLE_MASK;
931    }
932 }
933 
934 static void
935 agx_set_scissor_states(struct pipe_context *pctx, unsigned start_slot,
936                        unsigned num_scissors,
937                        const struct pipe_scissor_state *scissor)
938 {
939    struct agx_context *ctx = agx_context(pctx);
940 
941    STATIC_ASSERT(sizeof(ctx->scissor[0]) == sizeof(*scissor));
942    assert(start_slot + num_scissors <= AGX_MAX_VIEWPORTS);
943 
944    memcpy(&ctx->scissor[start_slot], scissor, sizeof(*scissor) * num_scissors);
945    ctx->dirty |= AGX_DIRTY_SCISSOR_ZBIAS;
946 }
947 
948 static void
949 agx_set_stencil_ref(struct pipe_context *pctx,
950                     const struct pipe_stencil_ref state)
951 {
952    struct agx_context *ctx = agx_context(pctx);
953    ctx->stencil_ref = state;
954    ctx->dirty |= AGX_DIRTY_STENCIL_REF;
955 }
956 
957 static void
958 agx_set_viewport_states(struct pipe_context *pctx, unsigned start_slot,
959                         unsigned num_viewports,
960                         const struct pipe_viewport_state *vp)
961 {
962    struct agx_context *ctx = agx_context(pctx);
963 
964    STATIC_ASSERT(sizeof(ctx->viewport[0]) == sizeof(*vp));
965    assert(start_slot + num_viewports <= AGX_MAX_VIEWPORTS);
966 
967    memcpy(&ctx->viewport[start_slot], vp, sizeof(*vp) * num_viewports);
968    ctx->dirty |= AGX_DIRTY_VIEWPORT;
969 }
970 
971 static void
972 agx_get_scissor_extents(const struct pipe_viewport_state *vp,
973                         const struct pipe_scissor_state *ss,
974                         const struct pipe_framebuffer_state *fb, unsigned *minx,
975                         unsigned *miny, unsigned *maxx, unsigned *maxy)
976 {
977    float trans_x = vp->translate[0], trans_y = vp->translate[1];
978    float abs_scale_x = fabsf(vp->scale[0]), abs_scale_y = fabsf(vp->scale[1]);
979 
980    /* Calculate the extent of the viewport. Note if a particular dimension of
981     * the viewport is an odd number of pixels, both the translate and the scale
982     * will have a fractional part of 0.5, so adding and subtracting them yields
983     * an integer. Therefore we don't need to round explicitly */
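   /* For example, a viewport 5 pixels wide starting at x = 10 has
    * abs_scale_x = 2.5 and trans_x = 12.5, giving the extent [10, 15]. */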
984    *minx = CLAMP((int)(trans_x - abs_scale_x), 0, fb->width);
985    *miny = CLAMP((int)(trans_y - abs_scale_y), 0, fb->height);
986    *maxx = CLAMP((int)(trans_x + abs_scale_x), 0, fb->width);
987    *maxy = CLAMP((int)(trans_y + abs_scale_y), 0, fb->height);
988 
989    if (ss) {
990       *minx = MAX2(ss->minx, *minx);
991       *miny = MAX2(ss->miny, *miny);
992       *maxx = MIN2(ss->maxx, *maxx);
993       *maxy = MIN2(ss->maxy, *maxy);
994    }
995 }
996 
997 static void
998 agx_upload_viewport_scissor(struct agx_pool *pool, struct agx_batch *batch,
999                             uint8_t **out, const struct pipe_viewport_state *vp,
1000                             const struct pipe_scissor_state *ss,
1001                             bool clip_halfz, bool multi_viewport)
1002 {
1003    /* Number of viewports/scissors isn't precisely determinable in Gallium, so
1004     * just key off whether we can write to anything other than viewport 0. This
1005     * could be tuned in the future.
1006     */
1007    unsigned count = multi_viewport ? AGX_MAX_VIEWPORTS : 1;
1008 
1009    /* Allocate scissor descriptors */
1010    unsigned index = batch->scissor.size / AGX_SCISSOR_LENGTH;
1011    struct agx_scissor_packed *scissors =
1012       util_dynarray_grow_bytes(&batch->scissor, count, AGX_SCISSOR_LENGTH);
1013 
1014    unsigned minx[AGX_MAX_VIEWPORTS], miny[AGX_MAX_VIEWPORTS];
1015    unsigned maxx[AGX_MAX_VIEWPORTS], maxy[AGX_MAX_VIEWPORTS];
1016 
1017    /* Upload each scissor */
1018    for (unsigned i = 0; i < count; ++i) {
1019       agx_get_scissor_extents(&vp[i], ss ? &ss[i] : NULL, &batch->key, &minx[i],
1020                               &miny[i], &maxx[i], &maxy[i]);
1021 
1022       float minz, maxz;
1023       util_viewport_zmin_zmax(vp, clip_halfz, &minz, &maxz);
1024 
1025       agx_pack(scissors + i, SCISSOR, cfg) {
1026          cfg.min_x = minx[i];
1027          cfg.min_y = miny[i];
1028          cfg.min_z = minz;
1029          cfg.max_x = maxx[i];
1030          cfg.max_y = maxy[i];
1031          cfg.max_z = maxz;
1032       }
1033    }
1034 
1035    /* Upload state */
1036    struct AGX_PPP_HEADER present = {
1037       .depth_bias_scissor = true,
1038       .region_clip = true,
1039       .viewport = true,
1040       .viewport_count = count,
1041    };
1042 
1043    size_t size = agx_ppp_update_size(&present);
1044    struct agx_ptr T = agx_pool_alloc_aligned(&batch->pool, size, 64);
1045    struct agx_ppp_update ppp = agx_new_ppp_update(T, size, &present);
1046 
1047    agx_ppp_push(&ppp, DEPTH_BIAS_SCISSOR, cfg) {
1048       cfg.scissor = index;
1049 
1050       /* Use the current depth bias, we allocate linearly */
1051       unsigned count = batch->depth_bias.size / AGX_DEPTH_BIAS_LENGTH;
1052       cfg.depth_bias = count ? count - 1 : 0;
1053    };
1054 
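   /* Region clip appears to operate at 32x32-pixel tile granularity, hence the
    * divisions by 32 below, with the max bounds rounded up. */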
1055    for (unsigned i = 0; i < count; ++i) {
1056       agx_ppp_push(&ppp, REGION_CLIP, cfg) {
1057          cfg.enable = true;
1058          cfg.min_x = minx[i] / 32;
1059          cfg.min_y = miny[i] / 32;
1060          cfg.max_x = DIV_ROUND_UP(MAX2(maxx[i], 1), 32);
1061          cfg.max_y = DIV_ROUND_UP(MAX2(maxy[i], 1), 32);
1062       }
1063    }
1064 
1065    agx_ppp_push(&ppp, VIEWPORT_CONTROL, cfg)
1066       ;
1067 
1068    /* Upload viewports */
1069    for (unsigned i = 0; i < count; ++i) {
1070       agx_ppp_push(&ppp, VIEWPORT, cfg) {
1071          cfg.translate_x = vp[i].translate[0];
1072          cfg.translate_y = vp[i].translate[1];
1073          cfg.translate_z = vp[i].translate[2];
1074          cfg.scale_x = vp[i].scale[0];
1075          cfg.scale_y = vp[i].scale[1];
1076          cfg.scale_z = vp[i].scale[2];
1077 
1078          if (!clip_halfz) {
1079             cfg.translate_z -= cfg.scale_z;
1080             cfg.scale_z *= 2;
1081          }
1082       }
1083    }
1084 
1085    agx_ppp_fini(out, &ppp);
1086 }
1087 
1088 static void
1089 agx_upload_depth_bias(struct agx_batch *batch,
1090                       const struct pipe_rasterizer_state *rast)
1091 {
1092    void *ptr =
1093       util_dynarray_grow_bytes(&batch->depth_bias, 1, AGX_DEPTH_BIAS_LENGTH);
1094 
1095    agx_pack(ptr, DEPTH_BIAS, cfg) {
1096       cfg.depth_bias = rast->offset_units * 2.0f;
1097       cfg.slope_scale = rast->offset_scale;
1098       cfg.clamp = rast->offset_clamp;
1099    }
1100 }
1101 
1102 /* A framebuffer state can be reused across batches, so it doesn't make sense
1103  * to add surfaces to the BO list here. Instead we add them when flushing.
1104  */
1105 
1106 static void
1107 agx_set_framebuffer_state(struct pipe_context *pctx,
1108                           const struct pipe_framebuffer_state *state)
1109 {
1110    struct agx_context *ctx = agx_context(pctx);
1111 
1112    if (!state)
1113       return;
1114 
1115    util_copy_framebuffer_state(&ctx->framebuffer, state);
1116    ctx->batch = NULL;
1117    agx_dirty_all(ctx);
1118 }
1119 
1120 /*
1121  * To write out render targets, each render target surface is bound as a
1122  * writable shader image, written with the end-of-tile program. This helper
1123  * constructs the internal pipe_image_view used.
1124  */
1125 static struct pipe_image_view
1126 image_view_for_surface(struct pipe_surface *surf)
1127 {
1128    return (struct pipe_image_view){
1129       .resource = surf->texture,
1130       .format = surf->format,
1131       .access = PIPE_IMAGE_ACCESS_READ_WRITE,
1132       .shader_access = PIPE_IMAGE_ACCESS_READ_WRITE,
1133       .u.tex.single_layer_view =
1134          surf->u.tex.first_layer == surf->u.tex.last_layer,
1135       .u.tex.first_layer = surf->u.tex.first_layer,
1136       .u.tex.last_layer = surf->u.tex.last_layer,
1137       .u.tex.level = surf->u.tex.level,
1138    };
1139 }
1140 
1141 /* Similarly, to read render targets, surfaces are bound as textures */
1142 static struct pipe_sampler_view
1143 sampler_view_for_surface(struct pipe_surface *surf)
1144 {
1145    bool layered = surf->u.tex.last_layer > surf->u.tex.first_layer;
1146 
1147    return (struct pipe_sampler_view){
1148       /* To reduce shader variants, we always use a 2D texture. For reloads of
1149        * arrays and cube maps, we map a single layer as a 2D image.
1150        */
1151       .target = layered ? PIPE_TEXTURE_2D_ARRAY : PIPE_TEXTURE_2D,
1152       .swizzle_r = PIPE_SWIZZLE_X,
1153       .swizzle_g = PIPE_SWIZZLE_Y,
1154       .swizzle_b = PIPE_SWIZZLE_Z,
1155       .swizzle_a = PIPE_SWIZZLE_W,
1156       .u.tex =
1157          {
1158             .first_layer = surf->u.tex.first_layer,
1159             .last_layer = surf->u.tex.last_layer,
1160             .first_level = surf->u.tex.level,
1161             .last_level = surf->u.tex.level,
1162          },
1163    };
1164 }
1165 
1166 static bool
1167 target_is_array(enum pipe_texture_target target)
1168 {
1169    switch (target) {
1170    case PIPE_TEXTURE_3D:
1171    case PIPE_TEXTURE_CUBE:
1172    case PIPE_TEXTURE_1D_ARRAY:
1173    case PIPE_TEXTURE_2D_ARRAY:
1174    case PIPE_TEXTURE_CUBE_ARRAY:
1175       return true;
1176    default:
1177       return false;
1178    }
1179 }
1180 
1181 static void
1182 agx_batch_upload_pbe(struct agx_batch *batch, struct agx_pbe_packed *out,
1183                      struct pipe_image_view *view, bool block_access,
1184                      bool arrays_as_2d, bool force_2d_array, bool emrt)
1185 {
1186    struct agx_resource *tex = agx_resource(view->resource);
1187    const struct util_format_description *desc =
1188       util_format_description(view->format);
1189    enum pipe_texture_target target = tex->base.target;
1190    bool is_buffer = (target == PIPE_BUFFER);
1191 
1192    if (!is_buffer && view->u.tex.single_layer_view)
1193       target = PIPE_TEXTURE_2D;
1194 
1195    arrays_as_2d |= (view->access & PIPE_IMAGE_ACCESS_DRIVER_INTERNAL);
1196 
1197    /* To reduce shader variants, spilled layered render targets are accessed as
1198     * 2D Arrays regardless of the actual target, so force in that case.
1199     *
1200     * Likewise, cubes are accessed as arrays for consistency with NIR.
1201     */
1202    if ((arrays_as_2d && target_is_array(target)) || target_is_cube(target) ||
1203        force_2d_array)
1204       target = PIPE_TEXTURE_2D_ARRAY;
1205 
1206    unsigned level = is_buffer ? 0 : view->u.tex.level;
1207    unsigned layer = is_buffer ? 0 : view->u.tex.first_layer;
1208 
1209    agx_pack(out, PBE, cfg) {
1210       cfg.dimension =
1211          agx_translate_tex_dim(target, util_res_sample_count(&tex->base));
1212       cfg.layout = agx_translate_layout(tex->layout.tiling);
1213       cfg.channels = ail_pixel_format[view->format].channels;
1214       cfg.type = ail_pixel_format[view->format].type;
1215       cfg.srgb = util_format_is_srgb(view->format);
1216 
1217       assert(desc->nr_channels >= 1 && desc->nr_channels <= 4);
1218 
1219       for (unsigned i = 0; i < desc->nr_channels; ++i) {
1220          if (desc->swizzle[i] == 0)
1221             cfg.swizzle_r = i;
1222          else if (desc->swizzle[i] == 1)
1223             cfg.swizzle_g = i;
1224          else if (desc->swizzle[i] == 2)
1225             cfg.swizzle_b = i;
1226          else if (desc->swizzle[i] == 3)
1227             cfg.swizzle_a = i;
1228       }
1229 
1230       cfg.buffer = agx_map_texture_gpu(tex, layer);
1231       cfg.unk_mipmapped = tex->mipmapped;
1232 
1233       if (is_buffer) {
1234          unsigned size_el =
1235             agx_texture_buffer_size_el(view->format, view->u.buf.size);
1236 
1237          /* Buffers uniquely have offsets (in bytes, not texels) */
1238          cfg.buffer += view->u.buf.offset;
1239 
1240          /* Use a 2D texture to increase the maximum size */
1241          cfg.width = AGX_TEXTURE_BUFFER_WIDTH;
1242          cfg.height = DIV_ROUND_UP(size_el, cfg.width);
1243          cfg.level = 0;
1244          cfg.stride = (cfg.width * util_format_get_blocksize(view->format)) - 4;
1245          cfg.layers = 1;
1246          cfg.levels = 1;
1247       } else if (util_res_sample_count(&tex->base) > 1 && !block_access) {
1248          /* Multisampled images are bound like buffer textures, with
1249           * addressing arithmetic to determine the texel to write.
1250           *
1251           * Note that the end-of-tile program uses real multisample images with
1252           * image_write_block instructions.
1253           */
1254          unsigned blocksize_B = util_format_get_blocksize(view->format);
1255          unsigned size_px =
1256             (tex->layout.size_B - tex->layout.layer_stride_B * layer) /
1257             blocksize_B;
1258 
1259          cfg.dimension = AGX_TEXTURE_DIMENSION_2D;
1260          cfg.layout = AGX_LAYOUT_LINEAR;
1261          cfg.width = AGX_TEXTURE_BUFFER_WIDTH;
1262          cfg.height = DIV_ROUND_UP(size_px, cfg.width);
1263          cfg.stride = (cfg.width * blocksize_B) - 4;
1264          cfg.layers = 1;
1265          cfg.levels = 1;
1266 
1267          cfg.buffer += tex->layout.level_offsets_B[level];
1268          cfg.level = 0;
1269       } else {
1270          cfg.width = view->resource->width0;
1271          cfg.height = view->resource->height0;
1272          cfg.level = level;
1273 
1274          unsigned layers = view->u.tex.last_layer - layer + 1;
1275 
1276          if (tex->layout.tiling == AIL_TILING_LINEAR &&
1277              (target == PIPE_TEXTURE_1D_ARRAY ||
1278               target == PIPE_TEXTURE_2D_ARRAY)) {
1279 
1280             cfg.depth_linear = layers;
1281             cfg.layer_stride_linear = (tex->layout.layer_stride_B - 0x80);
1282             cfg.extended = true;
1283          } else {
1284             assert((tex->layout.tiling != AIL_TILING_LINEAR) || (layers == 1));
1285             cfg.layers = layers;
1286          }
1287 
1288          if (tex->layout.tiling == AIL_TILING_LINEAR) {
1289             cfg.stride = ail_get_linear_stride_B(&tex->layout, level) - 4;
1290             cfg.levels = 1;
1291          } else {
1292             cfg.page_aligned_layers = tex->layout.page_aligned_layers;
1293             cfg.levels = tex->base.last_level + 1;
1294          }
1295 
1296          if (tex->base.nr_samples > 1)
1297             cfg.samples = agx_translate_sample_count(tex->base.nr_samples);
1298       }
1299 
1300       if (ail_is_compressed(&tex->layout) && !emrt) {
1301          cfg.compressed_1 = true;
1302          cfg.extended = true;
1303 
1304          cfg.acceleration_buffer =
1305             agx_map_texture_gpu(tex, 0) + tex->layout.metadata_offset_B +
1306             (layer * tex->layout.compression_layer_stride_B);
1307       }
1308 
1309       /* When the descriptor isn't extended architecturally, we can use the last
1310        * 8 bytes as a sideband. We use it to provide metadata for image atomics.
1311        */
1312       if (!cfg.extended && (tex->layout.writeable_image || emrt) &&
1313           tex->base.target != PIPE_BUFFER) {
1314 
1315          if (util_res_sample_count(&tex->base) > 1) {
1316             cfg.aligned_width_msaa_sw =
1317                align(u_minify(view->resource->width0, level),
1318                      tex->layout.tilesize_el[level].width_el);
1319          } else {
1320             cfg.level_offset_sw =
1321                ail_get_level_offset_B(&tex->layout, cfg.level);
1322          }
1323 
1324          cfg.sample_count_log2_sw = util_logbase2(tex->base.nr_samples);
1325 
1326          if (tex->layout.tiling == AIL_TILING_TWIDDLED || emrt) {
1327             struct ail_tile tile_size = tex->layout.tilesize_el[level];
1328             cfg.tile_width_sw = tile_size.width_el;
1329             cfg.tile_height_sw = tile_size.height_el;
1330 
1331             cfg.layer_stride_sw = tex->layout.layer_stride_B;
1332          }
1333       }
1334    };
1335 }
1336 
1337 /* Likewise constant buffers, textures, and samplers are handled in a common
1338  * per-draw path, with dirty tracking to reduce the costs involved.
1339  */
1340 
1341 static void
1342 agx_set_constant_buffer(struct pipe_context *pctx, enum pipe_shader_type shader,
1343                         uint index, bool take_ownership,
1344                         const struct pipe_constant_buffer *cb)
1345 {
1346    struct agx_context *ctx = agx_context(pctx);
1347    struct agx_stage *s = &ctx->stage[shader];
1348    struct pipe_constant_buffer *constants = &s->cb[index];
1349 
1350    util_copy_constant_buffer(&s->cb[index], cb, take_ownership);
1351 
1352    /* Upload user buffer immediately */
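   /* u_upload_data copies the user data into a GPU buffer and fills in
    * buffer/buffer_offset, so later draws never dereference user memory. */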
1353    if (constants->user_buffer && !constants->buffer) {
1354       u_upload_data(ctx->base.const_uploader, 0, constants->buffer_size, 64,
1355                     constants->user_buffer, &constants->buffer_offset,
1356                     &constants->buffer);
1357    }
1358 
1359    unsigned mask = (1 << index);
1360 
1361    if (cb)
1362       s->cb_mask |= mask;
1363    else
1364       s->cb_mask &= ~mask;
1365 
1366    ctx->stage[shader].dirty |= AGX_STAGE_DIRTY_CONST;
1367 }
1368 
1369 static void
1370 agx_surface_destroy(struct pipe_context *ctx, struct pipe_surface *surface)
1371 {
1372    pipe_resource_reference(&surface->texture, NULL);
1373    FREE(surface);
1374 }
1375 
1376 static void
1377 agx_delete_state(struct pipe_context *ctx, void *state)
1378 {
1379    FREE(state);
1380 }
1381 
1382 /* BOs added to the batch in the uniform upload path */
1383 
1384 static void
1385 agx_set_vertex_buffers(struct pipe_context *pctx, unsigned count,
1386                        const struct pipe_vertex_buffer *buffers)
1387 {
1388    struct agx_context *ctx = agx_context(pctx);
1389 
1390    util_set_vertex_buffers_mask(ctx->vertex_buffers, &ctx->vb_mask, buffers,
1391                                 count, true);
1392 
1393    ctx->dirty |= AGX_DIRTY_VERTEX;
1394 }
1395 
1396 static void *
1397 agx_create_vertex_elements(struct pipe_context *ctx, unsigned count,
1398                            const struct pipe_vertex_element *state)
1399 {
1400    assert(count <= AGX_MAX_ATTRIBS);
1401 
1402    struct agx_vertex_elements *so = calloc(1, sizeof(*so));
1403 
1404    for (unsigned i = 0; i < count; ++i) {
1405       const struct pipe_vertex_element ve = state[i];
1406 
1407       const struct util_format_description *desc =
1408          util_format_description(ve.src_format);
1409       unsigned chan_size = desc->channel[0].size / 8;
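      /* Vertex fetch requires the source offset to be aligned to the channel
       * size, enforced below.
       */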
1410       assert((ve.src_offset & (chan_size - 1)) == 0);
1411 
1412       so->buffers[i] = ve.vertex_buffer_index;
1413       so->src_offsets[i] = ve.src_offset;
1414 
1415       so->key[i] = (struct agx_velem_key){
1416          .stride = ve.src_stride,
1417          .format = ve.src_format,
1418          .divisor = ve.instance_divisor,
1419          .instanced = ve.instance_divisor > 0,
1420       };
1421    }
1422 
1423    return so;
1424 }
1425 
1426 static void
1427 agx_bind_vertex_elements_state(struct pipe_context *pctx, void *cso)
1428 {
1429    struct agx_context *ctx = agx_context(pctx);
1430    ctx->attributes = cso;
1431    ctx->dirty |= AGX_DIRTY_VERTEX;
1432 }
1433 
1434 DERIVE_HASH_TABLE(asahi_vs_shader_key);
1435 DERIVE_HASH_TABLE(asahi_gs_shader_key);
1436 DERIVE_HASH_TABLE(asahi_fs_shader_key);
1437 DERIVE_HASH_TABLE(agx_fast_link_key);
1438 
1439 /* No compute variants */
1440 static uint32_t
1441 asahi_cs_shader_key_hash(const void *key)
1442 {
1443    return 0;
1444 }
1445 
1446 static bool
1447 asahi_cs_shader_key_equal(const void *a, const void *b)
1448 {
1449    return true;
1450 }
1451 
1452 /* Dynamic lowered I/O version of nir_lower_clip_halfz */
1453 static bool
1454 agx_nir_lower_clip_m1_1(nir_builder *b, nir_intrinsic_instr *intr,
1455                         UNUSED void *data)
1456 {
1457    if (intr->intrinsic != nir_intrinsic_store_output)
1458       return false;
1459    if (nir_intrinsic_io_semantics(intr).location != VARYING_SLOT_POS)
1460       return false;
1461 
1462    assert(nir_intrinsic_component(intr) == 0 && "not yet scalarized");
1463    b->cursor = nir_before_instr(&intr->instr);
1464 
1465    nir_def *pos = intr->src[0].ssa;
1466    nir_def *z = nir_channel(b, pos, 2);
1467    nir_def *w = nir_channel(b, pos, 3);
1468    nir_def *c = nir_load_clip_z_coeff_agx(b);
1469 
1470    /* Lerp. If c = 0, reduces to z. If c = 1/2, reduces to (z + w)/2 */
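   /* Written as -z*c + (w*c + z) = z*(1 - c) + w*c, using two fused multiply-adds */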
1471    nir_def *new_z = nir_ffma(b, nir_fneg(b, z), c, nir_ffma(b, w, c, z));
1472    nir_src_rewrite(&intr->src[0], nir_vector_insert_imm(b, pos, new_z, 2));
1473    return true;
1474 }
1475 
1476 static nir_def *
1477 nir_channel_or_undef(nir_builder *b, nir_def *def, signed int channel)
1478 {
1479    if (channel >= 0 && channel < def->num_components)
1480       return nir_channel(b, def, channel);
1481    else
1482       return nir_undef(b, 1, def->bit_size);
1483 }
1484 
1485 /*
1486  * To implement point sprites, we'll replace TEX0...7 with point coordinate
1487  * reads as required. However, the .zw needs to read back 0.0/1.0. This pass
1488  * fixes up TEX loads of Z and W according to a uniform passed in a sideband,
1489  * eliminating shader variants.
1490  */
1491 static bool
1492 agx_nir_lower_point_sprite_zw(nir_builder *b, nir_intrinsic_instr *intr,
1493                               UNUSED void *data)
1494 {
1495    if (intr->intrinsic != nir_intrinsic_load_input &&
1496        intr->intrinsic != nir_intrinsic_load_interpolated_input)
1497       return false;
1498 
1499    gl_varying_slot loc = nir_intrinsic_io_semantics(intr).location;
1500    if (!(loc >= VARYING_SLOT_TEX0 && loc <= VARYING_SLOT_TEX7))
1501       return false;
1502 
1503    b->cursor = nir_after_instr(&intr->instr);
1504    unsigned component = nir_intrinsic_component(intr);
1505 
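   /* The sideband mask has one bit per TEX slot; a set bit means the slot is
    * replaced with point coordinates, so its .zw must read back 0.0/1.0.
    */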
1506    nir_def *mask = nir_load_tex_sprite_mask_agx(b);
1507    nir_def *location = nir_iadd_imm(b, nir_get_io_offset_src(intr)->ssa,
1508                                     loc - VARYING_SLOT_TEX0);
1509    nir_def *bit = nir_ishl(b, nir_imm_intN_t(b, 1, 16), location);
1510    nir_def *replace = nir_i2b(b, nir_iand(b, mask, bit));
1511 
1512    nir_def *vec = nir_pad_vec4(b, &intr->def);
1513    nir_def *chans[4] = {NULL, NULL, nir_imm_floatN_t(b, 0.0, vec->bit_size),
1514                         nir_imm_floatN_t(b, 1.0, vec->bit_size)};
1515 
1516    for (unsigned i = 0; i < 4; ++i) {
1517       nir_def *chan = nir_channel_or_undef(b, vec, i - component);
1518       chans[i] = chans[i] ? nir_bcsel(b, replace, chans[i], chan) : chan;
1519    }
1520 
1521    nir_def *new_vec = nir_vec(b, &chans[component], intr->def.num_components);
1522    nir_def_rewrite_uses_after(&intr->def, new_vec, new_vec->parent_instr);
1523    return true;
1524 }
1525 
1526 /*
1527  * Compile a NIR shader. The only lowering left at this point is sysvals. The
1528  * shader key should have already been applied. agx_compile_variant may call
1529  * this multiple times if there are auxiliary shaders.
1530  */
1531 static struct agx_compiled_shader *
1532 agx_compile_nir(struct agx_device *dev, nir_shader *nir,
1533                 struct util_debug_callback *debug, enum pipe_shader_type stage,
1534                 bool internal_kernel, bool terminal, bool secondary,
1535                 unsigned cf_base, BITSET_WORD *attrib_components_read)
1536 {
1537    struct agx_compiled_shader *compiled = CALLOC_STRUCT(agx_compiled_shader);
1538    compiled->stage = stage;
1539    if (attrib_components_read)
1540       BITSET_COPY(compiled->attrib_components_read, attrib_components_read);
1541 
1542    struct agx_shader_key key = {
1543       .dev = agx_gather_device_key(dev),
1544       .libagx = dev->libagx,
1545       .has_scratch = !secondary,
1546       .promote_constants = true,
1547       .no_stop = !terminal,
1548       .secondary = secondary,
1549    };
1550 
1551    if (nir->info.stage == MESA_SHADER_FRAGMENT) {
1552       NIR_PASS(_, nir, agx_nir_lower_interpolation);
1553    }
1554 
1555    /* We always use dynamic sample shading in the GL driver. Indicate that. */
1556    if (nir->info.stage == MESA_SHADER_FRAGMENT &&
1557        nir->info.fs.uses_sample_shading)
1558       key.fs.inside_sample_loop = true;
1559 
1560    if (internal_kernel) {
1561       key.reserved_preamble = 8;
1562    } else if (!secondary) {
1563       NIR_PASS(_, nir, agx_nir_lower_sysvals, stage, true);
1564       NIR_PASS(_, nir, agx_nir_layout_uniforms, compiled,
1565                &key.reserved_preamble);
1566    }
1567 
1568    if (nir->info.stage == MESA_SHADER_FRAGMENT) {
1569       key.fs.cf_base = cf_base;
1570    }
1571 
1572    agx_compile_shader_nir(nir, &key, debug, &compiled->b);
1573 
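   /* Upload the machine code into an executable BO in the low VA range.
    * Secondary (prolog/epilog) binaries are not uploaded here.
    */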
1574    if (compiled->b.binary_size && !secondary) {
1575       compiled->bo = agx_bo_create(dev, compiled->b.binary_size, 0,
1576                                    AGX_BO_EXEC | AGX_BO_LOW_VA, "Executable");
1577 
1578       memcpy(compiled->bo->map, compiled->b.binary, compiled->b.binary_size);
1579    }
1580 
1581    return compiled;
1582 }
1583 
1584 static struct agx_compiled_shader *
1585 agx_build_meta_shader_internal(struct agx_context *ctx,
1586                                meta_shader_builder_t builder, void *data,
1587                                size_t data_size, bool prolog, bool epilog,
1588                                unsigned cf_base, bool internal_kernel);
1589 
1590 /* Does not take ownership of key. Clones if necessary. */
1591 static struct agx_compiled_shader *
1592 agx_compile_variant(struct agx_device *dev, struct pipe_context *pctx,
1593                     struct agx_uncompiled_shader *so,
1594                     struct util_debug_callback *debug,
1595                     union asahi_shader_key *key_)
1596 {
1597    struct blob_reader reader;
1598    blob_reader_init(&reader, so->serialized_nir.data, so->serialized_nir.size);
1599    nir_shader *nir = nir_deserialize(NULL, &agx_nir_options, &reader);
1600 
1601    /* Auxiliary programs */
1602    enum mesa_prim gs_out_prim = MESA_PRIM_MAX;
1603    uint64_t outputs = 0;
1604    struct agx_fs_epilog_link_info epilog_key = {false};
1605    unsigned gs_out_count_words = 0;
1606    nir_shader *gs_count = NULL;
1607    nir_shader *gs_copy = NULL;
1608    nir_shader *pre_gs = NULL;
1609    BITSET_DECLARE(attrib_components_read, VERT_ATTRIB_MAX * 4) = {0};
1610 
1611    /* This can happen at inopportune times and cause jank, so log it */
1612    perf_debug(dev, "Compiling %s shader variant #%u",
1613               _mesa_shader_stage_to_abbrev(so->type),
1614               _mesa_hash_table_num_entries(so->variants));
1615 
1616    struct agx_unlinked_uvs_layout uvs = {0};
1617    bool translucent = false;
1618 
1619    if (nir->info.stage == MESA_SHADER_VERTEX) {
1620       struct asahi_vs_shader_key *key = &key_->vs;
1621 
1622       NIR_PASS(_, nir, agx_nir_lower_vs_input_to_prolog,
1623                attrib_components_read);
1624 
1625       if (key->hw) {
1626          NIR_PASS(_, nir, agx_nir_lower_point_size, true);
1627          NIR_PASS(_, nir, nir_shader_intrinsics_pass, agx_nir_lower_clip_m1_1,
1628                   nir_metadata_control_flow, NULL);
1629 
1630          NIR_PASS(_, nir, nir_lower_io_to_scalar, nir_var_shader_out, NULL,
1631                   NULL);
1632          NIR_PASS(_, nir, agx_nir_lower_cull_distance_vs);
1633          NIR_PASS(_, nir, agx_nir_lower_uvs, &uvs);
1634       } else {
1635          NIR_PASS(_, nir, agx_nir_lower_vs_before_gs, dev->libagx);
1636 
1637          /* Turn into a compute shader now that we're free of vertexisms */
1638          nir->info.stage = MESA_SHADER_COMPUTE;
1639          memset(&nir->info.cs, 0, sizeof(nir->info.cs));
1640          nir->xfb_info = NULL;
1641          outputs = nir->info.outputs_written;
1642       }
1643    } else if (nir->info.stage == MESA_SHADER_TESS_CTRL) {
1644       NIR_PASS_V(nir, agx_nir_lower_tcs, dev->libagx);
1645    } else if (nir->info.stage == MESA_SHADER_GEOMETRY) {
1646       struct asahi_gs_shader_key *key = &key_->gs;
1647 
1648       NIR_PASS(_, nir, agx_nir_lower_gs, dev->libagx, key->rasterizer_discard,
1649                &gs_count, &gs_copy, &pre_gs, &gs_out_prim, &gs_out_count_words);
1650    } else if (nir->info.stage == MESA_SHADER_FRAGMENT) {
1651       struct asahi_fs_shader_key *key = &key_->fs;
1652 
1653       /* Discards must be lowered before lowering MSAA so they are handled correctly */
1654       NIR_PASS(_, nir, agx_nir_lower_discard_zs_emit);
1655       NIR_PASS(_, nir, agx_nir_lower_fs_output_to_epilog, &epilog_key);
1656 
1657       if (nir->info.fs.uses_fbfetch_output) {
1658          struct agx_tilebuffer_layout tib = agx_build_tilebuffer_layout(
1659             key->rt_formats, ARRAY_SIZE(key->rt_formats), key->nr_samples,
1660             true);
1661 
1662          if (dev->debug & AGX_DBG_SMALLTILE)
1663             tib.tile_size = (struct agx_tile_size){16, 16};
1664 
1665          /* XXX: don't replicate this all over the driver */
1666          unsigned rt_spill_base = BITSET_LAST_BIT(nir->info.textures_used) +
1667                                   (2 * BITSET_LAST_BIT(nir->info.images_used));
1668          unsigned rt_spill = rt_spill_base;
1669          NIR_PASS(_, nir, agx_nir_lower_tilebuffer, &tib, NULL, &rt_spill, NULL,
1670                   &translucent);
1671       }
1672 
1673       if (nir->info.fs.uses_sample_shading) {
1674          /* Ensure the sample ID is preserved in register */
1675          nir_builder b =
1676             nir_builder_at(nir_after_impl(nir_shader_get_entrypoint(nir)));
1677          nir_export_agx(&b, nir_load_exported_agx(&b, 1, 16, .base = 1),
1678                         .base = 1);
1679 
1680          NIR_PASS(_, nir, agx_nir_lower_to_per_sample);
1681       }
1682 
1683       NIR_PASS(_, nir, agx_nir_lower_sample_mask);
1684       NIR_PASS(_, nir, agx_nir_lower_fs_active_samples_to_register);
1685    }
1686 
1687    NIR_PASS(_, nir, agx_nir_lower_multisampled_image_store);
1688 
1689    struct agx_compiled_shader *compiled = agx_compile_nir(
1690       dev, nir, debug, so->type, false, so->type != PIPE_SHADER_FRAGMENT, false,
1691       0, attrib_components_read);
1692 
1693    if (so->type == PIPE_SHADER_FRAGMENT) {
1694       /* XXX: don't replicate this all over the driver */
1695       epilog_key.rt_spill_base = BITSET_LAST_BIT(nir->info.textures_used) +
1696                                  (2 * BITSET_LAST_BIT(nir->info.images_used));
1697 
1698       compiled->epilog_key = epilog_key;
1699       compiled->b.info.reads_tib |= translucent;
1700    }
1701 
1702    compiled->so = so;
1703    compiled->uvs = uvs;
1704 
1705    /* Compile auxiliary programs */
1706    if (gs_count) {
1707       compiled->gs_count = agx_compile_nir(dev, gs_count, debug, so->type,
1708                                            false, true, false, 0, NULL);
1709       compiled->gs_count->so = so;
1710    }
1711 
1712    if (pre_gs) {
1713       compiled->pre_gs = agx_compile_nir(
1714          dev, pre_gs, debug, PIPE_SHADER_COMPUTE, false, true, false, 0, NULL);
1715    }
1716 
1717    if (gs_copy) {
1718       /* Replace the point size write if present, but do not insert a write:
1719        * the GS rast program writes point size iff we have points.
1720        */
1721       NIR_PASS(_, gs_copy, agx_nir_lower_point_size, false);
1722 
1723       NIR_PASS(_, gs_copy, nir_shader_intrinsics_pass, agx_nir_lower_clip_m1_1,
1724                nir_metadata_control_flow, NULL);
1725 
1726       NIR_PASS(_, gs_copy, nir_lower_io_to_scalar, nir_var_shader_out, NULL,
1727                NULL);
1728       NIR_PASS(_, gs_copy, agx_nir_lower_cull_distance_vs);
1729 
1730       struct agx_unlinked_uvs_layout uvs = {0};
1731       NIR_PASS(_, gs_copy, agx_nir_lower_uvs, &uvs);
1732 
1733       compiled->gs_copy =
1734          agx_compile_nir(dev, gs_copy, debug, PIPE_SHADER_GEOMETRY, false, true,
1735                          false, 0, NULL);
1736       compiled->gs_copy->so = so;
1737       compiled->gs_copy->stage = so->type;
1738       compiled->gs_copy->uvs = uvs;
1739    }
1740 
1741    compiled->gs_output_mode = gs_out_prim;
1742    compiled->gs_count_words = gs_out_count_words;
1743    compiled->b.info.outputs = outputs;
1744 
1745    ralloc_free(nir);
1746    ralloc_free(pre_gs);
1747    ralloc_free(gs_count);
1748    return compiled;
1749 }
1750 
1751 static struct agx_compiled_shader *
1752 agx_get_shader_variant(struct agx_screen *screen, struct pipe_context *pctx,
1753                        struct agx_uncompiled_shader *so,
1754                        struct util_debug_callback *debug,
1755                        union asahi_shader_key *key)
1756 {
1757    struct agx_compiled_shader *compiled =
1758       agx_disk_cache_retrieve(screen, so, key);
1759 
1760    if (!compiled) {
1761       compiled = agx_compile_variant(&screen->dev, pctx, so, debug, key);
1762       agx_disk_cache_store(screen->disk_cache, so, key, compiled);
1763    }
1764 
1765    /* key may be destroyed after we return, so clone it before using it as a
1766     * hash table key. The clone is logically owned by the hash table.
1767     */
1768    union asahi_shader_key *cloned_key =
1769       rzalloc(so->variants, union asahi_shader_key);
1770 
1771    if (so->type == PIPE_SHADER_FRAGMENT) {
1772       memcpy(cloned_key, key, sizeof(struct asahi_fs_shader_key));
1773    } else if (so->type == PIPE_SHADER_VERTEX ||
1774               so->type == PIPE_SHADER_TESS_EVAL) {
1775       memcpy(cloned_key, key, sizeof(struct asahi_vs_shader_key));
1776    } else if (so->type == PIPE_SHADER_GEOMETRY) {
1777       memcpy(cloned_key, key, sizeof(struct asahi_gs_shader_key));
1778    } else {
1779       assert(gl_shader_stage_is_compute(so->type) ||
1780              so->type == PIPE_SHADER_TESS_CTRL);
1781       /* No key */
1782    }
1783 
1784    _mesa_hash_table_insert(so->variants, cloned_key, compiled);
1785 
1786    return compiled;
1787 }
1788 
1789 static int
1790 glsl_type_size(const struct glsl_type *type, bool bindless)
1791 {
1792    return glsl_count_attribute_slots(type, false);
1793 }
1794 
1795 static void
1796 agx_shader_initialize(struct agx_device *dev, struct agx_uncompiled_shader *so,
1797                       nir_shader *nir, bool support_lod_bias, bool robust)
1798 {
1799    if (nir->info.stage == MESA_SHADER_KERNEL)
1800       nir->info.stage = MESA_SHADER_COMPUTE;
1801 
1802    blob_init(&so->early_serialized_nir);
1803    nir_serialize(&so->early_serialized_nir, nir, true);
1804 
1805    nir_lower_robust_access_options robustness = {
1806       /* Images accessed through the texture or PBE hardware are robust, so we
1807        * don't set lower_image. However, buffer images and image atomics are
1808        * lowered so require robustness lowering.
1809        */
1810       .lower_buffer_image = true,
1811       .lower_image_atomic = true,
1812 
1813       /* Buffer access is based on raw pointers and hence needs lowering to be
1814          robust */
1815       .lower_ubo = robust,
1816       .lower_ssbo = robust,
1817    };
1818 
1819    /* We need to lower robustness before bindings, since robustness lowering
1820     * affects the bindings used.
1821     */
1822    NIR_PASS(_, nir, nir_lower_robust_access, &robustness);
1823 
1824    /* Similarly, we need to do early texture lowering before bindings */
1825    NIR_PASS(_, nir, agx_nir_lower_texture_early, support_lod_bias);
1826 
1827    /* We need to lower binding tables before calling agx_preprocess_nir, since
1828     * that does texture lowering that needs to know the binding model.
1829     */
1830    NIR_PASS(_, nir, agx_nir_lower_bindings, &so->uses_bindless_samplers);
1831 
1832    /* We need to do some I/O lowering before lowering textures */
1833    so->info.nr_bindful_textures = BITSET_LAST_BIT(nir->info.textures_used);
1834    so->info.nr_bindful_images = BITSET_LAST_BIT(nir->info.images_used);
1835 
1836    NIR_PASS(_, nir, nir_lower_io, nir_var_shader_in | nir_var_shader_out,
1837             glsl_type_size, nir_lower_io_lower_64bit_to_32);
1838 
1839    if (nir->info.stage == MESA_SHADER_FRAGMENT) {
1840       struct agx_interp_info interp = agx_gather_interp_info(nir);
1841 
1842       /* Interpolate varyings at fp16 and write to the tilebuffer at fp16. As an
1843        * exception, interpolate flat shaded at fp32. This works around a
1844        * hardware limitation. The resulting code (with an extra f2f16 at the end
1845        * if needed) matches what Metal produces.
1846        */
1847       if (likely(!(dev->debug & AGX_DBG_NO16))) {
1848          uint64_t texcoord = agx_gather_texcoords(nir);
1849 
1850          NIR_PASS(_, nir, nir_lower_mediump_io,
1851                   nir_var_shader_in | nir_var_shader_out,
1852                   ~(interp.flat | texcoord), false);
1853       }
1854 
1855       so->info.inputs_flat_shaded = interp.flat;
1856       so->info.inputs_linear_shaded = interp.linear;
1857       so->info.uses_fbfetch = nir->info.fs.uses_fbfetch_output;
1858    } else if (nir->info.stage == MESA_SHADER_VERTEX ||
1859               nir->info.stage == MESA_SHADER_TESS_EVAL) {
1860       so->info.has_edgeflags = nir->info.outputs_written & VARYING_BIT_EDGE;
1861       so->info.cull_distance_size = nir->info.cull_distance_array_size;
1862    }
1863 
1864    NIR_PASS(_, nir, agx_nir_lower_texture);
1865    NIR_PASS(_, nir, nir_lower_ssbo, NULL);
1866 
1867    agx_preprocess_nir(nir, dev->libagx);
1868 
1869    if (nir->info.stage == MESA_SHADER_FRAGMENT &&
1870        (nir->info.inputs_read & VARYING_BITS_TEX_ANY)) {
1871 
1872       NIR_PASS(_, nir, nir_shader_intrinsics_pass,
1873                agx_nir_lower_point_sprite_zw, nir_metadata_control_flow, NULL);
1874    }
1875 
1876    if (nir->info.stage == MESA_SHADER_FRAGMENT) {
1877       NIR_PASS(_, nir, agx_nir_lower_sample_intrinsics, true);
1878    }
1879 
1880    so->type = pipe_shader_type_from_mesa(nir->info.stage);
1881 
1882    if (nir->info.stage == MESA_SHADER_TESS_EVAL) {
1883       NIR_PASS(_, nir, agx_nir_lower_tes, dev->libagx, true);
1884    }
1885 
1886    blob_init(&so->serialized_nir);
1887    nir_serialize(&so->serialized_nir, nir, true);
1888    _mesa_sha1_compute(so->serialized_nir.data, so->serialized_nir.size,
1889                       so->nir_sha1);
1890 
1891    so->has_xfb_info = (nir->xfb_info != NULL);
1892 
1893    static_assert(
1894       ARRAY_SIZE(so->xfb_strides) == ARRAY_SIZE(nir->info.xfb_stride),
1895       "known target count");
1896 
1897    if (so->has_xfb_info) {
1898       struct nir_xfb_info *xfb = nir->xfb_info;
1899 
1900       for (unsigned i = 0; i < ARRAY_SIZE(so->xfb_strides); ++i) {
1901          so->xfb_strides[i] = xfb->buffers[i].stride;
1902       }
1903    }
1904 }
1905 
1906 static void *
1907 agx_create_shader_state(struct pipe_context *pctx,
1908                         const struct pipe_shader_state *cso)
1909 {
1910    struct agx_context *ctx = agx_context(pctx);
1911    struct agx_uncompiled_shader *so =
1912       rzalloc(NULL, struct agx_uncompiled_shader);
1913    struct agx_device *dev = agx_device(pctx->screen);
1914 
1915    if (!so)
1916       return NULL;
1917 
1918    so->base = *cso;
1919 
1920    nir_shader *nir = cso->type == PIPE_SHADER_IR_NIR
1921                         ? cso->ir.nir
1922                         : tgsi_to_nir(cso->tokens, pctx->screen, false);
1923 
1924    if (nir->info.stage == MESA_SHADER_VERTEX ||
1925        nir->info.stage == MESA_SHADER_TESS_EVAL) {
1926       so->variants = asahi_vs_shader_key_table_create(so);
1927       so->linked_shaders = agx_fast_link_key_table_create(so);
1928    } else if (nir->info.stage == MESA_SHADER_GEOMETRY) {
1929       so->variants = asahi_gs_shader_key_table_create(so);
1930    } else if (nir->info.stage == MESA_SHADER_TESS_CTRL) {
1931       /* No variants */
1932       so->variants = _mesa_hash_table_create(NULL, asahi_cs_shader_key_hash,
1933                                              asahi_cs_shader_key_equal);
1934    } else {
1935       so->variants = asahi_fs_shader_key_table_create(so);
1936       so->linked_shaders = agx_fast_link_key_table_create(so);
1937    }
1938 
1939    if (nir->info.stage == MESA_SHADER_TESS_EVAL ||
1940        nir->info.stage == MESA_SHADER_TESS_CTRL) {
1941 
1942       so->tess.ccw = nir->info.tess.ccw;
1943       so->tess.point_mode = nir->info.tess.point_mode;
1944       so->tess.spacing = nir->info.tess.spacing;
1945       so->tess.output_patch_size = nir->info.tess.tcs_vertices_out;
1946       so->tess.primitive = nir->info.tess._primitive_mode;
1947       so->tess.per_vertex_outputs = agx_tcs_per_vertex_outputs(nir);
1948       so->tess.nr_patch_outputs =
1949          util_last_bit(nir->info.patch_outputs_written);
1950       if (nir->info.stage == MESA_SHADER_TESS_CTRL)
1951          so->tess.output_stride = agx_tcs_output_stride(nir);
1952    } else if (nir->info.stage == MESA_SHADER_GEOMETRY) {
1953       so->gs_mode = nir->info.gs.output_primitive;
1954    }
1955 
1956    agx_shader_initialize(dev, so, nir, ctx->support_lod_bias, ctx->robust);
1957    gl_shader_stage next_stage = nir->info.next_stage;
1958 
1959    /* We're done with the NIR, throw it away */
1960    ralloc_free(nir);
1961    nir = NULL;
1962 
1963    /* Precompile shaders that have a small key. For shader-db, precompile a
1964     * shader with a default key. This could be improved but hopefully this is
1965     * acceptable for now.
1966     */
1967    if ((so->type == PIPE_SHADER_TESS_CTRL) ||
1968        (so->type == PIPE_SHADER_FRAGMENT && !so->info.uses_fbfetch)) {
1969       union asahi_shader_key key = {0};
1970       agx_get_shader_variant(agx_screen(pctx->screen), pctx, so, &pctx->debug,
1971                              &key);
1972    } else if (so->type == PIPE_SHADER_VERTEX) {
1973       union asahi_shader_key key = {
1974          .vs.hw = next_stage == MESA_SHADER_FRAGMENT,
1975       };
1976       agx_get_shader_variant(agx_screen(pctx->screen), pctx, so, &pctx->debug,
1977                              &key);
1978 
1979       if (!next_stage) {
1980          key.vs.hw = true;
1981          agx_get_shader_variant(agx_screen(pctx->screen), pctx, so,
1982                                 &pctx->debug, &key);
1983       }
1984    } else if (dev->debug & AGX_DBG_PRECOMPILE) {
1985       union asahi_shader_key key = {0};
1986 
1987       switch (so->type) {
1988       case PIPE_SHADER_GEOMETRY:
1989          break;
1990 
1991       case PIPE_SHADER_TESS_EVAL:
1992          /* TODO: Tessellation shaders with shader-db */
1993          return so;
1994 
1995       case PIPE_SHADER_FRAGMENT:
1996          key.fs.nr_samples = 1;
1997          break;
1998       default:
1999          unreachable("Unknown shader stage in shader-db precompile");
2000       }
2001 
2002       agx_compile_variant(dev, pctx, so, &pctx->debug, &key);
2003    }
2004 
2005    return so;
2006 }
2007 
2008 static void *
2009 agx_create_compute_state(struct pipe_context *pctx,
2010                          const struct pipe_compute_state *cso)
2011 {
2012    struct agx_context *ctx = agx_context(pctx);
2013    struct agx_device *dev = agx_device(pctx->screen);
2014    struct agx_uncompiled_shader *so =
2015       rzalloc(NULL, struct agx_uncompiled_shader);
2016 
2017    if (!so)
2018       return NULL;
2019 
2020    so->variants = _mesa_hash_table_create(so, asahi_cs_shader_key_hash,
2021                                           asahi_cs_shader_key_equal);
2022 
2023    union asahi_shader_key key = {0};
2024 
2025    assert(cso->ir_type == PIPE_SHADER_IR_NIR && "TGSI kernels unsupported");
2026    nir_shader *nir = (void *)cso->prog;
2027 
2028    agx_shader_initialize(dev, so, nir, ctx->support_lod_bias, ctx->robust);
2029    agx_get_shader_variant(agx_screen(pctx->screen), pctx, so, &pctx->debug,
2030                           &key);
2031 
2032    /* We're done with the NIR, throw it away */
2033    ralloc_free(nir);
2034    return so;
2035 }
2036 
2037 static void
2038 agx_get_compute_state_info(struct pipe_context *pctx, void *cso,
2039                            struct pipe_compute_state_object_info *info)
2040 {
2041    union asahi_shader_key key = {0};
2042    struct agx_compiled_shader *so = agx_get_shader_variant(
2043       agx_screen(pctx->screen), pctx, cso, &pctx->debug, &key);
2044 
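   /* Thread occupancy is limited by the register count of the compiled variant */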
2045    info->max_threads =
2046       agx_occupancy_for_register_count(so->b.info.nr_gprs).max_threads;
2047    info->private_memory = 0;
2048    info->preferred_simd_size = 32;
2049    info->simd_sizes = 32;
2050 }
2051 
2052 /* Does not take ownership of key. Clones if necessary. */
2053 static bool
2054 agx_update_shader(struct agx_context *ctx, struct agx_compiled_shader **out,
2055                   enum pipe_shader_type stage, union asahi_shader_key *key)
2056 {
2057    struct agx_uncompiled_shader *so = ctx->stage[stage].shader;
2058    assert(so != NULL);
2059 
2060    struct hash_entry *he = _mesa_hash_table_search(so->variants, key);
2061 
2062    if (he) {
2063       if ((*out) == he->data)
2064          return false;
2065 
2066       *out = he->data;
2067       return true;
2068    }
2069 
2070    struct agx_screen *screen = agx_screen(ctx->base.screen);
2071    *out = agx_get_shader_variant(screen, &ctx->base, so, &ctx->base.debug, key);
2072    return true;
2073 }
2074 
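/* Determine the primitive actually rasterized: polygon fill modes can demote
 * triangles to points or lines.
 */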
2075 static enum mesa_prim
2076 rast_prim(enum mesa_prim mode, unsigned fill_mode)
2077 {
2078    if (u_reduced_prim(mode) == MESA_PRIM_TRIANGLES) {
2079       if (fill_mode == PIPE_POLYGON_MODE_POINT)
2080          return MESA_PRIM_POINTS;
2081       else if (fill_mode == PIPE_POLYGON_MODE_LINE)
2082          return MESA_PRIM_LINES;
2083    }
2084 
2085    return mode;
2086 }
2087 
2088 static bool
2089 lower_fs_prolog_abi(nir_builder *b, nir_intrinsic_instr *intr, UNUSED void *_)
2090 {
2091    if (intr->intrinsic == nir_intrinsic_load_polygon_stipple_agx) {
2092       b->cursor = nir_instr_remove(&intr->instr);
2093 
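      /* Chase the root descriptor to the stipple pattern pointer, then load
       * the requested 32-bit row of the pattern.
       */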
2094       nir_def *root = nir_load_preamble(b, 1, 64, .base = 12);
2095       off_t stipple_offs = offsetof(struct agx_draw_uniforms, polygon_stipple);
2096       nir_def *stipple_ptr_ptr = nir_iadd_imm(b, root, stipple_offs);
2097       nir_def *base = nir_load_global_constant(b, stipple_ptr_ptr, 4, 1, 64);
2098 
2099       nir_def *row = intr->src[0].ssa;
2100       nir_def *addr = nir_iadd(b, base, nir_u2u64(b, nir_imul_imm(b, row, 4)));
2101 
2102       nir_def *pattern = nir_load_global_constant(b, addr, 4, 1, 32);
2103       nir_def_rewrite_uses(&intr->def, pattern);
2104       return true;
2105    } else if (intr->intrinsic == nir_intrinsic_load_stat_query_address_agx) {
2106       b->cursor = nir_instr_remove(&intr->instr);
2107 
2108       /* ABI: root descriptor address in u6_u7 */
2109       nir_def *root = nir_load_preamble(b, 1, intr->def.bit_size, .base = 12);
2110 
2111       off_t offs = offsetof(struct agx_draw_uniforms,
2112                             pipeline_statistics[nir_intrinsic_base(intr)]);
2113 
2114       nir_def *ptr = nir_iadd_imm(b, root, offs);
2115       nir_def *load = nir_load_global_constant(b, ptr, 4, 1, 64);
2116       nir_def_rewrite_uses(&intr->def, load);
2117       return true;
2118    } else {
2119       return false;
2120    }
2121 }
2122 
2123 static void
2124 build_fs_prolog(nir_builder *b, const void *key)
2125 {
2126    agx_nir_fs_prolog(b, key);
2127 
2128    NIR_PASS(_, b->shader, nir_shader_intrinsics_pass, lower_fs_prolog_abi,
2129             nir_metadata_control_flow, NULL);
2130 }
2131 
2132 static struct agx_linked_shader *
2133 asahi_fast_link(struct agx_context *ctx, struct agx_uncompiled_shader *so,
2134                 struct agx_fast_link_key *key)
2135 {
2136    /* Try the cache */
2137    struct hash_entry *ent = _mesa_hash_table_search(so->linked_shaders, key);
2138    if (ent)
2139       return ent->data;
2140 
2141    struct agx_compiled_shader *prolog = NULL, *epilog = NULL;
2142 
2143    /* Build the prolog/epilog now */
2144    if (so->type == MESA_SHADER_FRAGMENT) {
2145       prolog = agx_build_meta_shader_internal(
2146          ctx, build_fs_prolog, &key->prolog.fs, sizeof(key->prolog.fs), true,
2147          false, key->prolog.fs.cf_base, false);
2148 
2149       epilog = agx_build_meta_shader_internal(
2150          ctx, agx_nir_fs_epilog, &key->epilog.fs, sizeof(key->epilog.fs), false,
2151          true, 0, false);
2152 
2153    } else {
2154       assert(so->type == MESA_SHADER_VERTEX ||
2155              so->type == MESA_SHADER_TESS_EVAL);
2156 
2157       prolog = agx_build_meta_shader_internal(
2158          ctx, agx_nir_vs_prolog, &key->prolog.vs, sizeof(key->prolog.vs), true,
2159          false, 0, false);
2160    }
2161 
2162    /* Fast-link it all together */
2163    struct agx_device *dev = agx_device(ctx->base.screen);
2164 
2165    struct agx_linked_shader *linked =
2166       rzalloc(so->linked_shaders, struct agx_linked_shader);
2167    agx_fast_link(linked, dev, so->type == PIPE_SHADER_FRAGMENT, &key->main->b,
2168                  &prolog->b, &epilog->b, key->nr_samples_shaded);
2169 
2170    /* Cache the fast linked program */
2171    union asahi_shader_key *cloned_key =
2172       ralloc_memdup(so->linked_shaders, key, sizeof(*key));
2173    _mesa_hash_table_insert(so->linked_shaders, cloned_key, linked);
2174    return linked;
2175 }
2176 
2177 static bool
2178 agx_update_vs(struct agx_context *ctx, unsigned index_size_B)
2179 {
2180    /* Only proceed if the shader or anything the key depends on changes
2181     *
2182     * vb_mask, attributes, vertex_buffers: VERTEX
2183     */
2184    if (!((ctx->dirty & (AGX_DIRTY_VS_PROG | AGX_DIRTY_VERTEX | AGX_DIRTY_XFB)) ||
2185          ctx->stage[PIPE_SHADER_TESS_EVAL].dirty ||
2186          ctx->stage[PIPE_SHADER_GEOMETRY].dirty ||
2187          ctx->stage[PIPE_SHADER_TESS_EVAL].shader ||
2188          ctx->stage[PIPE_SHADER_GEOMETRY].shader || ctx->in_tess))
2189       return false;
2190 
2191    struct asahi_vs_shader_key key = {
2192       .hw = !((ctx->stage[PIPE_SHADER_TESS_EVAL].shader && !ctx->in_tess) ||
2193               ctx->stage[PIPE_SHADER_GEOMETRY].shader),
2194    };
2195 
2196    agx_update_shader(ctx, &ctx->vs, PIPE_SHADER_VERTEX,
2197                      (union asahi_shader_key *)&key);
2198 
2199    struct agx_device *dev = agx_device(ctx->base.screen);
2200    struct agx_fast_link_key link_key = {
2201       .prolog.vs.hw = key.hw,
2202       .prolog.vs.sw_index_size_B = key.hw ? 0 : index_size_B,
2203 
2204       /* TODO: We could optimize this */
2205       .prolog.vs.robustness.level = AGX_ROBUSTNESS_GL,
2206       .prolog.vs.robustness.soft_fault = agx_has_soft_fault(dev),
2207       .main = ctx->vs,
2208    };
2209 
2210    STATIC_ASSERT(sizeof(link_key.prolog.vs.component_mask) ==
2211                  sizeof(ctx->vs->attrib_components_read));
2212    BITSET_COPY(link_key.prolog.vs.component_mask,
2213                ctx->vs->attrib_components_read);
2214 
2215    memcpy(link_key.prolog.vs.attribs, &ctx->attributes->key,
2216           sizeof(link_key.prolog.vs.attribs));
2217 
2218    void *old = ctx->linked.vs;
2219 
2220    ctx->linked.vs =
2221       asahi_fast_link(ctx, ctx->stage[PIPE_SHADER_VERTEX].shader, &link_key);
2222 
2223    return old != ctx->linked.vs;
2224 }
2225 
2226 static bool
2227 agx_update_tcs(struct agx_context *ctx, const struct pipe_draw_info *info)
2228 {
2229    assert(info->mode == MESA_PRIM_PATCHES);
2230 
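   /* TCS has no shader key, so the variant table holds a single entry */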
2231    ctx->tcs = _mesa_hash_table_next_entry(
2232                  ctx->stage[PIPE_SHADER_TESS_CTRL].shader->variants, NULL)
2233                  ->data;
2234    return true;
2235 }
2236 
2237 static bool
2238 agx_update_gs(struct agx_context *ctx, const struct pipe_draw_info *info,
2239               const struct pipe_draw_indirect_info *indirect)
2240 {
2241    /* Only proceed if there is a geometry shader. Due to input assembly
2242     * dependence, we don't bother to dirty track right now.
2243     */
2244    if (!ctx->stage[PIPE_SHADER_GEOMETRY].shader) {
2245       ctx->gs = NULL;
2246       return false;
2247    }
2248 
2249    /* Transform feedback always happens via the geometry shader, so look there
2250     * to get the XFB strides.
2251     */
2252    struct agx_uncompiled_shader *gs = ctx->stage[PIPE_SHADER_GEOMETRY].shader;
2253 
2254    for (unsigned i = 0; i < ctx->streamout.num_targets; ++i) {
2255       struct agx_streamout_target *tgt =
2256          agx_so_target(ctx->streamout.targets[i]);
2257 
2258       if (tgt != NULL)
2259          tgt->stride = gs->xfb_strides[i];
2260    }
2261 
2262    struct asahi_gs_shader_key key = {
2263       .rasterizer_discard = ctx->rast->base.rasterizer_discard,
2264    };
2265 
2266    return agx_update_shader(ctx, &ctx->gs, PIPE_SHADER_GEOMETRY,
2267                             (union asahi_shader_key *)&key);
2268 }
2269 
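/* When the fragment shader is known to write 1.0 to alpha of RT0, source-alpha
 * blend factors collapse to constants.
 */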
2270 static enum pipe_blendfactor
2271 optimize_blend_factor_w_1(enum pipe_blendfactor f)
2272 {
2273    if (f == PIPE_BLENDFACTOR_SRC_ALPHA)
2274       return PIPE_BLENDFACTOR_ONE;
2275    else if (f == PIPE_BLENDFACTOR_INV_SRC_ALPHA)
2276       return PIPE_BLENDFACTOR_ZERO;
2277    else
2278       return f;
2279 }
2280 
2281 static bool
2282 agx_update_fs(struct agx_batch *batch)
2283 {
2284    struct agx_context *ctx = batch->ctx;
2285 
2286    /* Only proceed if the shader or anything the key depends on changes
2287     *
2288     * batch->key: implicitly dirties everything, no explicit check
2289     * rast: RS
2290     * blend: BLEND
2291     * sample_mask: SAMPLE_MASK
2292     * reduced_prim: PRIM
2293     */
2294    if (!(ctx->dirty & (AGX_DIRTY_VS_PROG | AGX_DIRTY_FS_PROG | AGX_DIRTY_RS |
2295                        AGX_DIRTY_BLEND | AGX_DIRTY_SAMPLE_MASK |
2296                        AGX_DIRTY_PRIM | AGX_DIRTY_QUERY)))
2297       return false;
2298 
2299    struct agx_device *dev = agx_device(ctx->base.screen);
2300    unsigned nr_samples = util_framebuffer_get_num_samples(&batch->key);
2301 
2302    /* Get main shader */
2303    struct asahi_fs_shader_key key = {0};
2304 
2305    if (ctx->stage[PIPE_SHADER_FRAGMENT].shader->info.uses_fbfetch) {
2306       key.nr_samples = nr_samples;
2307 
2308       for (unsigned i = 0; i < batch->key.nr_cbufs; ++i) {
2309          struct pipe_surface *surf = batch->key.cbufs[i];
2310 
2311          key.rt_formats[i] = surf ? surf->format : PIPE_FORMAT_NONE;
2312       }
2313    }
2314 
2315    agx_update_shader(ctx, &ctx->fs, PIPE_SHADER_FRAGMENT,
2316                      (union asahi_shader_key *)&key);
2317 
2318    /* Fast link with prolog/epilog */
2319    bool msaa = ctx->rast->base.multisample;
2320    unsigned sample_mask = ctx->sample_mask & BITFIELD_MASK(nr_samples);
2321 
2322    struct agx_fast_link_key link_key = {
2323       .prolog.fs.statistics =
2324          ctx->pipeline_statistics[PIPE_STAT_QUERY_PS_INVOCATIONS],
2325 
2326       .prolog.fs.cull_distance_size =
2327          ctx->stage[MESA_SHADER_VERTEX].shader->info.cull_distance_size,
2328 
2329       .prolog.fs.polygon_stipple =
2330          ctx->rast->base.poly_stipple_enable &&
2331          rast_prim(batch->reduced_prim, ctx->rast->base.fill_front) ==
2332             MESA_PRIM_TRIANGLES,
2333 
2334       .prolog.fs.api_sample_mask =
2335          (msaa && nr_samples > 1 && sample_mask != BITFIELD_MASK(nr_samples))
2336             ? sample_mask
2337             : 0xff,
2338 
2339       .epilog.fs.nr_samples = nr_samples,
2340       .epilog.fs.link = ctx->fs->epilog_key,
2341       .epilog.fs.force_small_tile = dev->debug & AGX_DBG_SMALLTILE,
2342 
2343       .main = ctx->fs,
2344       .nr_samples_shaded = ctx->fs->epilog_key.sample_shading ? nr_samples : 0,
2345    };
2346 
2347    for (unsigned i = 0; i < PIPE_MAX_COLOR_BUFS; ++i) {
2348       struct pipe_surface *surf = batch->key.cbufs[i];
2349 
2350       link_key.epilog.fs.rt_formats[i] = surf ? surf->format : PIPE_FORMAT_NONE;
2351    }
2352 
2353    memcpy(&link_key.epilog.fs.blend, &ctx->blend->key,
2354           sizeof(link_key.epilog.fs.blend));
2355 
2356    /* Normalize */
2357    if (!agx_tilebuffer_spills(&batch->tilebuffer_layout))
2358       link_key.epilog.fs.link.rt_spill_base = 0;
2359 
2360    /* Try to disable blending to get rid of some fsats */
2361    if (link_key.epilog.fs.link.rt0_w_1) {
2362       struct agx_blend_rt_key *k = &link_key.epilog.fs.blend.rt[0];
2363 
2364       k->rgb_src_factor = optimize_blend_factor_w_1(k->rgb_src_factor);
2365       k->rgb_dst_factor = optimize_blend_factor_w_1(k->rgb_dst_factor);
2366 
2367       k->alpha_src_factor = optimize_blend_factor_w_1(k->alpha_src_factor);
2368       k->alpha_dst_factor = optimize_blend_factor_w_1(k->alpha_dst_factor);
2369    }
2370 
2371    link_key.epilog.fs.blend.alpha_to_coverage &= msaa;
2372 
2373    /* The main shader must not run tests if the epilog will */
2374    bool epilog_discards = link_key.epilog.fs.blend.alpha_to_coverage;
2375    batch->uniforms.no_epilog_discard = !epilog_discards ? ~0 : 0;
2376 
2377    bool prolog_discards = (link_key.prolog.fs.api_sample_mask != 0xff ||
2378                            link_key.prolog.fs.cull_distance_size ||
2379                            link_key.prolog.fs.polygon_stipple);
2380 
2381    /* The prolog runs tests if neither the main shader nor epilog will */
2382    link_key.prolog.fs.run_zs_tests = !ctx->fs->b.info.writes_sample_mask &&
2383                                      !epilog_discards && prolog_discards;
2384 
2385    if (link_key.prolog.fs.cull_distance_size)
2386       link_key.prolog.fs.cf_base = ctx->fs->b.info.varyings.fs.nr_cf;
2387 
2388    void *old = ctx->linked.fs;
2389 
2390    ctx->linked.fs =
2391       asahi_fast_link(ctx, ctx->stage[PIPE_SHADER_FRAGMENT].shader, &link_key);
2392 
2393    return old != ctx->linked.fs;
2394 }
2395 
2396 static void
2397 agx_bind_shader_state(struct pipe_context *pctx, void *cso,
2398                       enum pipe_shader_type stage)
2399 {
2400    struct agx_context *ctx = agx_context(pctx);
2401 
2402    if (stage == PIPE_SHADER_VERTEX)
2403       ctx->dirty |= AGX_DIRTY_VS_PROG;
2404    else if (stage == PIPE_SHADER_FRAGMENT)
2405       ctx->dirty |= AGX_DIRTY_FS_PROG;
2406    else
2407       ctx->stage[stage].dirty = ~0;
2408 
2409    ctx->stage[stage].shader = cso;
2410 }
2411 
2412 static void
2413 agx_bind_vs_state(struct pipe_context *pctx, void *cso)
2414 {
2415    agx_bind_shader_state(pctx, cso, PIPE_SHADER_VERTEX);
2416 }
2417 
2418 static void
2419 agx_bind_fs_state(struct pipe_context *pctx, void *cso)
2420 {
2421    agx_bind_shader_state(pctx, cso, PIPE_SHADER_FRAGMENT);
2422 }
2423 
2424 static void
2425 agx_bind_gs_state(struct pipe_context *pctx, void *cso)
2426 {
2427    agx_bind_shader_state(pctx, cso, PIPE_SHADER_GEOMETRY);
2428 }
2429 
2430 static void
2431 agx_bind_tcs_state(struct pipe_context *pctx, void *cso)
2432 {
2433    agx_bind_shader_state(pctx, cso, PIPE_SHADER_TESS_CTRL);
2434 }
2435 
2436 static void
2437 agx_bind_tes_state(struct pipe_context *pctx, void *cso)
2438 {
2439    agx_bind_shader_state(pctx, cso, PIPE_SHADER_TESS_EVAL);
2440 }
2441 
2442 static void
2443 agx_bind_cs_state(struct pipe_context *pctx, void *cso)
2444 {
2445    agx_bind_shader_state(pctx, cso, PIPE_SHADER_COMPUTE);
2446 }
2447 
2448 /* Forward declare because of the recursion hit with geometry shaders */
2449 static void agx_delete_uncompiled_shader(struct agx_device *dev,
2450                                          struct agx_uncompiled_shader *so);
2451 
2452 static void
2453 agx_delete_compiled_shader(struct agx_device *dev,
2454                            struct agx_compiled_shader *so)
2455 {
2456    if (so->gs_count)
2457       agx_delete_compiled_shader(dev, so->gs_count);
2458 
2459    if (so->pre_gs)
2460       agx_delete_compiled_shader(dev, so->pre_gs);
2461 
2462    if (so->gs_copy)
2463       agx_delete_compiled_shader(dev, so->gs_copy);
2464 
2465    agx_bo_unreference(dev, so->bo);
2466    FREE(so);
2467 }
2468 
2469 static void
2470 agx_delete_uncompiled_shader(struct agx_device *dev,
2471                              struct agx_uncompiled_shader *so)
2472 {
2473    hash_table_foreach(so->variants, ent) {
2474       agx_delete_compiled_shader(dev, ent->data);
2475    }
2476 
2477    _mesa_hash_table_destroy(so->variants, NULL);
2478    blob_finish(&so->serialized_nir);
2479    blob_finish(&so->early_serialized_nir);
2480 
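   /* Free any internally generated passthrough GS/TCS programs attached to
    * this shader.
    */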
2481    for (unsigned i = 0; i < MESA_PRIM_COUNT; ++i) {
2482       for (unsigned j = 0; j < 3; ++j) {
2483          for (unsigned k = 0; k < 2; ++k) {
2484             if (so->passthrough_progs[i][j][k])
2485                agx_delete_uncompiled_shader(dev,
2486                                             so->passthrough_progs[i][j][k]);
2487          }
2488       }
2489    }
2490 
2491    for (unsigned i = 0; i < ARRAY_SIZE(so->passthrough_tcs); ++i) {
2492       if (so->passthrough_tcs[i])
2493          agx_delete_uncompiled_shader(dev, so->passthrough_tcs[i]);
2494    }
2495 
2496    ralloc_free(so);
2497 }
2498 
2499 static void
2500 agx_delete_shader_state(struct pipe_context *ctx, void *cso)
2501 {
2502    struct agx_device *dev = agx_device(ctx->screen);
2503    agx_delete_uncompiled_shader(dev, cso);
2504 }
2505 
2506 struct agx_generic_meta_key {
2507    meta_shader_builder_t builder;
2508    size_t key_size;
2509    uint8_t key[];
2510 };
2511 
2512 static uint32_t
2513 meta_key_hash(const void *key_)
2514 {
2515    const struct agx_generic_meta_key *key = key_;
2516 
2517    return _mesa_hash_data(key,
2518                           sizeof(struct agx_generic_meta_key) + key->key_size);
2519 }
2520 
2521 static bool
2522 meta_key_equal(const void *a_, const void *b_)
2523 {
2524    const struct agx_generic_meta_key *a = a_;
2525    const struct agx_generic_meta_key *b = b_;
2526 
2527    return a->builder == b->builder && a->key_size == b->key_size &&
2528           memcmp(a->key, b->key, a->key_size) == 0;
2529 }
2530 
2531 void
2532 agx_init_meta_shaders(struct agx_context *ctx)
2533 {
2534    ctx->generic_meta =
2535       _mesa_hash_table_create(ctx, meta_key_hash, meta_key_equal);
2536 }
2537 
2538 void
2539 agx_destroy_meta_shaders(struct agx_context *ctx)
2540 {
2541    struct agx_device *dev = agx_device(ctx->base.screen);
2542    hash_table_foreach(ctx->generic_meta, ent) {
2543       agx_delete_compiled_shader(dev, ent->data);
2544    }
2545 
2546    _mesa_hash_table_destroy(ctx->generic_meta, NULL);
2547 }
2548 
2549 static struct agx_compiled_shader *
2550 agx_build_meta_shader_internal(struct agx_context *ctx,
2551                                meta_shader_builder_t builder, void *data,
2552                                size_t data_size, bool prolog, bool epilog,
2553                                unsigned cf_base, bool internal_kernel)
2554 {
2555    /* Build the meta shader key */
2556    size_t total_key_size = sizeof(struct agx_generic_meta_key) + data_size;
2557    struct agx_generic_meta_key *key = alloca(total_key_size);
2558 
2559    *key = (struct agx_generic_meta_key){
2560       .builder = builder,
2561       .key_size = data_size,
2562    };
2563 
2564    if (data_size)
2565       memcpy(key->key, data, data_size);
2566 
2567    /* Try to get the cached shader */
2568    struct hash_entry *ent = _mesa_hash_table_search(ctx->generic_meta, key);
2569    if (ent)
2570       return ent->data;
2571 
2572    /* Otherwise, compile the shader fresh */
2573    nir_builder b = nir_builder_init_simple_shader(
2574       MESA_SHADER_COMPUTE, &agx_nir_options, "AGX meta shader");
2575 
2576    builder(&b, data);
2577 
2578    struct agx_device *dev = agx_device(ctx->base.screen);
2579    if (!prolog) {
2580       /* We need to link libagx and assign shared before preprocessing, matching
2581        * what the driver would otherwise produce.
2582        */
2583       agx_link_libagx(b.shader, dev->libagx);
2584 
2585       NIR_PASS(_, b.shader, nir_lower_vars_to_explicit_types,
2586                nir_var_mem_shared, glsl_get_cl_type_size_align);
2587 
2588       NIR_PASS(_, b.shader, nir_lower_explicit_io, nir_var_mem_shared,
2589                nir_address_format_62bit_generic);
2590 
2591       agx_preprocess_nir(b.shader, NULL);
2592       NIR_PASS(_, b.shader, agx_nir_lower_texture);
2593       NIR_PASS(_, b.shader, agx_nir_lower_multisampled_image_store);
2594    }
2595 
2596    struct agx_compiled_shader *shader = agx_compile_nir(
2597       dev, b.shader, NULL, PIPE_SHADER_COMPUTE, internal_kernel,
2598       !prolog && !(b.shader->info.stage == MESA_SHADER_FRAGMENT &&
2599                    b.shader->info.fs.uses_sample_shading),
2600       prolog || epilog, cf_base, NULL);
2601 
2602    ralloc_free(b.shader);
2603 
2604    /* ..and cache it before we return. The key is on the stack right now, so
2605     * clone it before using it as a hash table key. The clone is logically owned
2606     * by the hash table.
2607     */
2608    void *cloned_key = rzalloc_size(ctx->generic_meta, total_key_size);
2609    memcpy(cloned_key, key, total_key_size);
2610 
2611    _mesa_hash_table_insert(ctx->generic_meta, cloned_key, shader);
2612    return shader;
2613 }
2614 
2615 struct agx_compiled_shader *
2616 agx_build_meta_shader(struct agx_context *ctx, meta_shader_builder_t builder,
2617                       void *data, size_t data_size)
2618 {
2619    return agx_build_meta_shader_internal(ctx, builder, data, data_size, false,
2620                                          false, 0, false);
2621 }
2622 
2623 static unsigned
2624 sampler_count(struct agx_context *ctx, enum pipe_shader_type stage)
2625 {
2626    /* We reserve sampler #0 for txf so add 1 to the API count */
2627    return ctx->stage[stage].sampler_count + 1;
2628 }
2629 
2630 static inline enum agx_sampler_states
2631 translate_sampler_state_count(struct agx_context *ctx,
2632                               struct agx_compiled_shader *cs,
2633                               enum pipe_shader_type stage)
2634 {
2635    /* Clamp to the binding table maximum; anything larger will be bindless */
2636    return agx_translate_sampler_state_count(MIN2(sampler_count(ctx, stage), 16),
2637                                             ctx->stage[stage].custom_borders);
2638 }
2639 
2640 static uint32_t
2641 agx_nr_tex_descriptors_without_spilled_rts(const struct agx_compiled_shader *cs)
2642 {
2643    if (!cs || !cs->so)
2644       return 0;
2645 
2646    /* 2 descriptors per image, 1 descriptor per texture */
2647    return cs->so->info.nr_bindful_textures +
2648           (2 * cs->so->info.nr_bindful_images);
2649 }
2650 
2651 static uint32_t
2652 agx_nr_tex_descriptors(struct agx_batch *batch, struct agx_compiled_shader *cs)
2653 {
2654    unsigned n = agx_nr_tex_descriptors_without_spilled_rts(cs);
2655 
2656    /* We add on texture/PBE descriptors for spilled render targets */
2657    bool spilled_rt = cs->stage == PIPE_SHADER_FRAGMENT &&
2658                      agx_tilebuffer_spills(&batch->tilebuffer_layout);
2659    if (spilled_rt)
2660       n += (batch->key.nr_cbufs * 2);
2661 
2662    return n;
2663 }
2664 
2665 /*
2666  * For spilled render targets, upload a texture/PBE pair for each surface to
2667  * allow loading/storing to the render target from the shader.
2668  */
2669 static void
2670 agx_upload_spilled_rt_descriptors(struct agx_texture_packed *out,
2671                                   struct agx_batch *batch)
2672 {
2673    for (unsigned rt = 0; rt < batch->key.nr_cbufs; ++rt) {
2674       struct agx_texture_packed *texture = out + (2 * rt);
2675       struct agx_pbe_packed *pbe = (struct agx_pbe_packed *)(texture + 1);
2676 
2677       struct pipe_surface *surf = batch->key.cbufs[rt];
2678       if (!surf)
2679          continue;
2680 
2681       struct agx_resource *rsrc = agx_resource(surf->texture);
2682       struct pipe_image_view view = image_view_for_surface(surf);
2683       struct pipe_sampler_view sampler_view = sampler_view_for_surface(surf);
2684       sampler_view.target = PIPE_TEXTURE_2D_ARRAY;
2685 
2686       agx_pack_texture(texture, rsrc, surf->format, &sampler_view);
2687       agx_batch_upload_pbe(batch, pbe, &view, false, false, true, true);
2688    }
2689 }
2690 
2691 static void
2692 agx_upload_textures(struct agx_batch *batch, struct agx_compiled_shader *cs,
2693                     enum pipe_shader_type stage)
2694 {
2695    struct agx_context *ctx = batch->ctx;
2696 
2697    /* This can occur for meta shaders */
2698    if (!cs->so) {
2699       batch->texture_count[stage] = 0;
2700       batch->stage_uniforms[stage].texture_base = 0;
2701       return;
2702    }
2703 
2704    unsigned nr_textures = cs->so->info.nr_bindful_textures;
2705 
2706    unsigned nr_active_textures = ctx->stage[stage].texture_count;
2707    unsigned nr_tex_descriptors = agx_nr_tex_descriptors(batch, cs);
2708    unsigned nr_images = cs->so->info.nr_bindful_images;
2709 
2710    struct agx_ptr T_tex = agx_pool_alloc_aligned(
2711       &batch->pool, AGX_TEXTURE_LENGTH * nr_tex_descriptors, 64);
2712 
2713    struct agx_texture_packed *textures = T_tex.cpu;
2714 
2715    for (unsigned i = 0; i < MIN2(nr_textures, nr_active_textures); ++i) {
2716       struct agx_sampler_view *tex = ctx->stage[stage].textures[i];
2717 
2718       if (tex == NULL) {
2719          agx_set_null_texture(&textures[i], T_tex.gpu);
2720          continue;
2721       }
2722 
2723       struct agx_resource *rsrc = tex->rsrc;
2724       agx_batch_reads(batch, tex->rsrc);
2725 
2726       /* Re-emit state because the layout might have changed from under us.
2727        * TODO: optimize this somehow?
2728        */
2729       agx_pack_texture(&tex->desc, rsrc, tex->format, &tex->base);
2730 
2731       textures[i] = tex->desc;
2732    }
2733 
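   /* Fill the remaining bindful texture slots with null descriptors */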
2734    for (unsigned i = nr_active_textures; i < nr_textures; ++i)
2735       agx_set_null_texture(&textures[i], T_tex.gpu);
2736 
2737    for (unsigned i = 0; i < nr_images; ++i) {
2738       /* Image descriptors come in pairs after the textures */
2739       struct agx_texture_packed *texture =
2740          ((struct agx_texture_packed *)T_tex.cpu) + nr_textures + (2 * i);
2741 
2742       struct agx_pbe_packed *pbe = (struct agx_pbe_packed *)(texture + 1);
2743 
2744       if (!(ctx->stage[stage].image_mask & BITFIELD_BIT(i))) {
2745          agx_set_null_texture(texture, T_tex.gpu);
2746          agx_set_null_pbe(pbe, agx_pool_alloc_aligned(&batch->pool, 1, 64).gpu);
2747          continue;
2748       }
2749 
2750       struct pipe_image_view *view = &ctx->stage[stage].images[i];
2751       agx_batch_track_image(batch, view);
2752 
2753       struct pipe_sampler_view sampler_view = util_image_to_sampler_view(view);
2754 
2755       /* For the texture descriptor, lower cubes to 2D arrays. This matches the
2756        * transform done in the compiler. Also, force 2D arrays for internal
2757        * blitter images, which helps reduce shader variants.
2758        */
2759       bool internal = (view->access & PIPE_IMAGE_ACCESS_DRIVER_INTERNAL);
2760 
2761       if (target_is_cube(sampler_view.target) ||
2762           (sampler_view.target == PIPE_TEXTURE_3D && internal))
2763          sampler_view.target = PIPE_TEXTURE_2D_ARRAY;
2764 
2765       agx_pack_texture(texture, agx_resource(view->resource), view->format,
2766                        &sampler_view);
2767       agx_batch_upload_pbe(batch, pbe, view, false, false, false, false);
2768    }
2769 
2770    if (stage == PIPE_SHADER_FRAGMENT &&
2771        agx_tilebuffer_spills(&batch->tilebuffer_layout)) {
2772 
2773       struct agx_texture_packed *out =
2774          ((struct agx_texture_packed *)T_tex.cpu) +
2775          agx_nr_tex_descriptors_without_spilled_rts(cs);
2776 
2777       agx_upload_spilled_rt_descriptors(out, batch);
2778    }
2779 
2780    batch->texture_count[stage] = nr_tex_descriptors;
2781    batch->stage_uniforms[stage].texture_base = T_tex.gpu;
2782 }
2783 
2784 uint16_t
2785 agx_sampler_heap_add(struct agx_device *dev, struct agx_sampler_heap *heap,
2786                      struct agx_sampler_packed *sampler)
2787 {
2788    /* Allocate (maximally sized) BO if we haven't already */
2789    if (!heap->bo) {
2790       heap->bo = agx_bo_create(dev, AGX_SAMPLER_HEAP_SIZE * AGX_SAMPLER_LENGTH,
2791                                0, AGX_BO_WRITEBACK, "Sampler heap");
2792 
2793       assert(heap->count == 0);
2794    }
2795 
2796    /* TODO search */
2797 
2798    /* Precondition: there is room in the heap */
2799    assert(heap->count < AGX_SAMPLER_HEAP_SIZE);
2800    struct agx_sampler_packed *samplers = heap->bo->map;
2801    memcpy(samplers + heap->count, sampler, sizeof(*sampler));
2802 
2803    return heap->count++;
2804 }
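
/*
 * Hedged usage sketch (the caller shown is illustrative, not from this file):
 *
 *    struct agx_sampler_packed desc;
 *    // ... pack the sampler into desc ...
 *    uint16_t index = agx_sampler_heap_add(dev, heap, &desc);
 *
 * The returned index identifies the slot within the heap BO; entries are
 * currently appended without deduplication (see the TODO above).
 */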
2805 
2806 static void
2807 agx_upload_samplers(struct agx_batch *batch, struct agx_compiled_shader *cs,
2808                     enum pipe_shader_type stage)
2809 {
2810    struct agx_context *ctx = batch->ctx;
2811 
2812    unsigned nr_samplers = sampler_count(ctx, stage);
2813    bool custom_borders = ctx->stage[stage].custom_borders;
2814 
2815    size_t sampler_length =
2816       AGX_SAMPLER_LENGTH + (custom_borders ? AGX_BORDER_LENGTH : 0);
2817 
2818    struct agx_ptr T =
2819       agx_pool_alloc_aligned(&batch->pool, sampler_length * nr_samplers, 64);
2820 
2821    /* Sampler #0 is reserved for txf */
2822    agx_pack_txf_sampler(T.cpu);
2823 
2824    /* Remaining samplers are API samplers */
2825    uint8_t *out_sampler = (uint8_t *)T.cpu + sampler_length;
2826    for (unsigned i = 0; i < ctx->stage[stage].sampler_count; ++i) {
2827       struct agx_sampler_state *sampler = ctx->stage[stage].samplers[i];
2828       struct agx_sampler_packed *out = (struct agx_sampler_packed *)out_sampler;
2829 
2830       if (sampler) {
2831          *out = sampler->desc;
2832 
2833          if (custom_borders) {
2834             STATIC_ASSERT(sizeof(sampler->border) == AGX_BORDER_LENGTH);
2835 
2836             memcpy(out_sampler + AGX_SAMPLER_LENGTH, &sampler->border,
2837                    AGX_BORDER_LENGTH);
2838          } else {
2839             assert(!sampler->uses_custom_border && "invalid combination");
2840          }
2841       } else {
2842          memset(out, 0, sampler_length);
2843       }
2844 
2845       out_sampler += sampler_length;
2846    }
2847 
2848    batch->sampler_count[stage] = nr_samplers;
2849    batch->samplers[stage] = T.gpu;
2850 }
2851 
2852 static void
2853 agx_update_descriptors(struct agx_batch *batch, struct agx_compiled_shader *cs)
2854 {
2855    struct agx_context *ctx = batch->ctx;
2856    if (!cs)
2857       return;
2858 
2859    enum pipe_shader_type stage = cs->stage;
2860    if (!ctx->stage[stage].dirty)
2861       return;
2862 
2863    if (ctx->stage[stage].dirty & AGX_STAGE_DIRTY_CONST)
2864       agx_set_cbuf_uniforms(batch, stage);
2865 
2866    if (ctx->stage[stage].dirty & AGX_STAGE_DIRTY_SSBO)
2867       agx_set_ssbo_uniforms(batch, stage);
2868 
2869    if (ctx->stage[stage].dirty & AGX_STAGE_DIRTY_IMAGE)
2870       agx_upload_textures(batch, cs, stage);
2871 
2872    if (ctx->stage[stage].dirty & AGX_STAGE_DIRTY_SAMPLER)
2873       agx_set_sampler_uniforms(batch, stage);
2874 
2875    if (ctx->stage[stage].dirty & AGX_STAGE_DIRTY_SAMPLER)
2876       agx_upload_samplers(batch, cs, stage);
2877 
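   /* The per-stage uniform table aggregates what the setters above produced
    * (e.g. texture_base); re-upload it whenever any descriptor class for this
    * stage was dirty.
    */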
2878    struct agx_stage_uniforms *unif = &batch->stage_uniforms[stage];
2879 
2880    batch->uniforms.tables[AGX_SYSVAL_STAGE(stage)] =
2881       agx_pool_upload_aligned(&batch->pool, unif, sizeof(*unif), 16);
2882 }
2883 
2884 static void
2885 agx_usc_immediates(struct agx_usc_builder *b, struct agx_batch *batch,
2886                    struct agx_compiled_shader *cs)
2887 {
2888    unsigned constant_push_ranges =
2889       DIV_ROUND_UP(cs->b.info.immediate_size_16, 64);
2890 
2891    if (cs->b.info.immediate_size_16) {
2892       /* XXX: do ahead of time */
2893       uint64_t ptr =
2894          agx_pool_upload_aligned(&batch->pool, cs->b.info.immediates,
2895                                  cs->b.info.immediate_size_16 * 2, 64);
2896 
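      /* Worked example (illustrative): immediate_size_16 == 100 gives two
       * ranges: 64 halfwords at the base uniform, then 36 halfwords 64
       * uniforms later, reading from ptr + 128 bytes.
       */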
2897       for (unsigned range = 0; range < constant_push_ranges; ++range) {
2898          unsigned offset = 64 * range;
2899          assert(offset < cs->b.info.immediate_size_16);
2900 
2901          agx_usc_uniform(b, cs->b.info.immediate_base_uniform + offset,
2902                          MIN2(64, cs->b.info.immediate_size_16 - offset),
2903                          ptr + (offset * 2));
2904       }
2905    }
2906 }
2907 
2908 static uint32_t
2909 agx_build_pipeline(struct agx_batch *batch, struct agx_compiled_shader *cs,
2910                    struct agx_linked_shader *linked,
2911                    enum pipe_shader_type phys_stage,
2912                    unsigned variable_shared_mem, size_t max_subgroups)
2913 {
2914    struct agx_context *ctx = batch->ctx;
2915    struct agx_device *dev = agx_device(ctx->base.screen);
2916    unsigned constant_push_ranges =
2917       DIV_ROUND_UP(cs->b.info.immediate_size_16, 64);
2918 
2919    size_t usc_size =
2920       agx_usc_size(constant_push_ranges + cs->push_range_count + 2);
2921 
2922    struct agx_ptr t =
2923       agx_pool_alloc_aligned(&batch->pipeline_pool, usc_size, 64);
2924 
2925    struct agx_usc_builder b = agx_usc_builder(t.cpu, usc_size);
2926 
2927    enum pipe_shader_type stage = cs->stage;
2928 
2929    if (batch->texture_count[stage]) {
2930       agx_usc_pack(&b, TEXTURE, cfg) {
2931          cfg.start = 0;
2932          cfg.count =
2933             MIN2(batch->texture_count[stage], AGX_NUM_TEXTURE_STATE_REGS);
2934          cfg.buffer = batch->stage_uniforms[stage].texture_base;
2935       }
2936    }
2937 
2938    if (batch->sampler_count[stage]) {
2939       agx_usc_pack(&b, SAMPLER, cfg) {
2940          cfg.start = 0;
2941          cfg.count = batch->sampler_count[stage];
2942          cfg.buffer = batch->samplers[stage];
2943       }
2944    }
2945 
2946    for (unsigned i = 0; i < cs->push_range_count; ++i) {
2947       unsigned table = cs->push[i].table;
2948       uint64_t table_ptr = batch->uniforms.tables[table];
2949 
2950       /* Params may be omitted if the VS prolog does not read them, but the
2951        * reservation is always there in the API shader just in case.
2952        */
2953       if (table == AGX_SYSVAL_TABLE_PARAMS && !table_ptr)
2954          continue;
2955 
2956       assert(table_ptr);
2957 
2958       agx_usc_uniform(&b, cs->push[i].uniform, cs->push[i].length,
2959                       table_ptr + cs->push[i].offset);
2960    }
2961 
2962    agx_usc_immediates(&b, batch, cs);
2963 
2964    uint32_t max_scratch_size =
2965       MAX2(cs->b.info.scratch_size, cs->b.info.preamble_scratch_size);
2966 
2967    if (max_scratch_size > 0) {
2968       unsigned preamble_size = (cs->b.info.preamble_scratch_size > 0) ? 1 : 0;
2969 
2970       switch (phys_stage) {
2971       case PIPE_SHADER_FRAGMENT:
2972          agx_scratch_alloc(&ctx->scratch_fs, max_scratch_size, max_subgroups);
2973          batch->fs_scratch = true;
2974          batch->fs_preamble_scratch =
2975             MAX2(batch->fs_preamble_scratch, preamble_size);
2976          break;
2977       case PIPE_SHADER_VERTEX:
2978          agx_scratch_alloc(&ctx->scratch_vs, max_scratch_size, max_subgroups);
2979          batch->vs_scratch = true;
2980          batch->vs_preamble_scratch =
2981             MAX2(batch->vs_preamble_scratch, preamble_size);
2982          break;
2983       default:
2984          agx_scratch_alloc(&ctx->scratch_cs, max_scratch_size, max_subgroups);
2985          batch->cs_scratch = true;
2986          batch->cs_preamble_scratch =
2987             MAX2(batch->cs_preamble_scratch, preamble_size);
2988          break;
2989       }
2990    }
2991 
2992    if (stage == PIPE_SHADER_FRAGMENT) {
2993       agx_usc_push_packed(&b, SHARED, &batch->tilebuffer_layout.usc);
2994    } else {
2995       agx_usc_shared_non_fragment(&b, &cs->b.info, variable_shared_mem);
2996    }
2997 
2998    if (linked) {
2999       agx_usc_push_packed(&b, SHADER, linked->shader);
3000       agx_usc_push_packed(&b, REGISTERS, linked->regs);
3001 
3002       if (stage == PIPE_SHADER_FRAGMENT)
3003          agx_usc_push_packed(&b, FRAGMENT_PROPERTIES, linked->fragment_props);
3004    } else {
3005       agx_usc_pack(&b, SHADER, cfg) {
3006          cfg.code =
3007             agx_usc_addr(dev, cs->bo->va->addr + cs->b.info.main_offset);
3008          cfg.unk_2 = 3;
3009       }
3010 
3011       agx_usc_pack(&b, REGISTERS, cfg) {
3012          cfg.register_count = cs->b.info.nr_gprs;
3013          cfg.spill_size = cs->b.info.scratch_size
3014                              ? agx_scratch_get_bucket(cs->b.info.scratch_size)
3015                              : 0;
3016       }
3017    }
3018 
3019    if (cs->b.info.has_preamble) {
3020       agx_usc_pack(&b, PRESHADER, cfg) {
3021          cfg.code =
3022             agx_usc_addr(dev, cs->bo->va->addr + cs->b.info.preamble_offset);
3023       }
3024    } else {
3025       agx_usc_pack(&b, NO_PRESHADER, cfg)
3026          ;
3027    }
3028 
3029    return agx_usc_addr(dev, t.gpu);
3030 }
3031 
3032 static uint32_t
3033 agx_build_internal_usc(struct agx_batch *batch, struct agx_compiled_shader *cs,
3034                        uint64_t data)
3035 {
3036    struct agx_device *dev = agx_device(batch->ctx->base.screen);
3037    bool needs_sampler = cs->b.info.uses_txf;
3038    size_t usc_size = agx_usc_size(12 + (needs_sampler ? 1 : 0));
3039 
3040    struct agx_ptr t =
3041       agx_pool_alloc_aligned(&batch->pipeline_pool, usc_size, 64);
3042 
3043    struct agx_usc_builder b = agx_usc_builder(t.cpu, usc_size);
3044 
3045    agx_usc_uniform(&b, 0, 4, agx_pool_upload(&batch->pool, &data, 8));
3046    agx_usc_immediates(&b, batch, cs);
3047 
3048    if (needs_sampler) {
3049       /* TODO: deduplicate */
3050       struct agx_ptr t = agx_pool_alloc_aligned(
3051          &batch->pool, sizeof(struct agx_sampler_packed), 64);
3052 
3053       agx_pack_txf_sampler((struct agx_sampler_packed *)t.cpu);
3054 
3055       agx_usc_pack(&b, SAMPLER, cfg) {
3056          cfg.start = 0;
3057          cfg.count = 1;
3058          cfg.buffer = t.gpu;
3059       }
3060    }
3061 
3062    assert(cs->b.info.scratch_size == 0 && "internal kernels don't spill");
3063    assert(cs->b.info.preamble_scratch_size == 0 && "internal doesn't spill");
3064 
3065    unsigned local_size = cs->b.info.local_size;
3066 
3067    agx_usc_pack(&b, SHARED, cfg) {
3068       cfg.layout = AGX_SHARED_LAYOUT_VERTEX_COMPUTE;
3069       cfg.bytes_per_threadgroup = local_size > 0 ? local_size : 65536;
3070       cfg.uses_shared_memory = local_size > 0;
3071    }
3072 
3073    agx_usc_pack(&b, SHADER, cfg) {
3074       cfg.code = agx_usc_addr(dev, cs->bo->va->addr + cs->b.info.main_offset);
3075       cfg.unk_2 = 3;
3076    }
3077 
3078    agx_usc_pack(&b, REGISTERS, cfg) {
3079       cfg.register_count = cs->b.info.nr_gprs;
3080       cfg.spill_size = 0;
3081    }
3082 
3083    if (cs->b.info.has_preamble) {
3084       agx_usc_pack(&b, PRESHADER, cfg) {
3085          cfg.code =
3086             agx_usc_addr(dev, cs->bo->va->addr + cs->b.info.preamble_offset);
3087       }
3088    } else {
3089       agx_usc_pack(&b, NO_PRESHADER, cfg)
3090          ;
3091    }
3092 
3093    return agx_usc_addr(dev, t.gpu);
3094 }
3095 
3096 static void
3097 agx_launch_with_uploaded_data(struct agx_batch *batch,
3098                               const struct agx_grid *grid,
3099                               meta_shader_builder_t builder, void *key,
3100                               size_t key_size, uint64_t data)
3101 {
3102    struct agx_compiled_shader *cs = agx_build_meta_shader_internal(
3103       batch->ctx, builder, key, key_size, false, false, 0, true);
3104 
3105    uint32_t usc = agx_build_internal_usc(batch, cs, data);
3106    agx_launch_internal(batch, grid, cs, PIPE_SHADER_COMPUTE, usc);
3107 }
3108 
3109 void
3110 agx_launch_with_data(struct agx_batch *batch, const struct agx_grid *grid,
3111                      meta_shader_builder_t builder, void *key, size_t key_size,
3112                      void *data, size_t data_size)
3113 {
3114    uint64_t upload = agx_pool_upload_aligned(&batch->pool, data, data_size, 4);
3115    agx_launch_with_uploaded_data(batch, grid, builder, key, key_size, upload);
3116 }
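
/*
 * A concrete caller appears below in agx_ia_update(), which uploads a
 * libagx_increment_ia_counters argument struct and launches the
 * agx_nir_increment_ia_counters meta shader over a single workgroup.
 */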
3117 
3118 struct asahi_bg_eot
3119 agx_build_bg_eot(struct agx_batch *batch, bool store, bool partial_render)
3120 {
3121    struct agx_context *ctx = batch->ctx;
3122 
3123    /* Construct the key */
3124    struct agx_bg_eot_key key = {.tib = batch->tilebuffer_layout};
3125 
3126    bool needs_textures_for_spilled_rts =
3127       agx_tilebuffer_spills(&batch->tilebuffer_layout) && !partial_render &&
3128       !store;
3129 
3130    for (unsigned rt = 0; rt < PIPE_MAX_COLOR_BUFS; ++rt) {
3131       struct pipe_surface *surf = batch->key.cbufs[rt];
3132 
3133       if (surf == NULL)
3134          continue;
3135 
3136       if (store) {
3137          /* TODO: Suppress stores to discarded render targets */
3138          key.op[rt] = AGX_EOT_STORE;
3139       } else if (batch->tilebuffer_layout.spilled[rt] && partial_render) {
3140          /* Partial render programs exist only to store/load the tilebuffer to
3141           * main memory. When render targets are already spilled to main memory,
3142           * there's nothing to do.
3143           */
3144          key.op[rt] = AGX_BG_EOT_NONE;
3145       } else {
3146          bool valid = (batch->load & (PIPE_CLEAR_COLOR0 << rt));
3147          bool clear = (batch->clear & (PIPE_CLEAR_COLOR0 << rt));
3148          bool load = valid && !clear;
3149 
3150          /* Don't read back spilled render targets; they're already in memory */
3151          load &= !batch->tilebuffer_layout.spilled[rt];
3152 
3153          /* The background program used for partial renders must always load
3154           * whatever was stored in the mid-frame end-of-tile program.
3155           */
3156          load |= partial_render;
3157 
3158          key.op[rt] = load    ? AGX_BG_LOAD
3159                       : clear ? AGX_BG_CLEAR
3160                               : AGX_BG_EOT_NONE;
3161       }
3162    }
3163 
3164    /* Begin building the pipeline */
3165    size_t usc_size = agx_usc_size(3 + PIPE_MAX_COLOR_BUFS);
3166    struct agx_ptr t =
3167       agx_pool_alloc_aligned(&batch->pipeline_pool, usc_size, 64);
3168    struct agx_usc_builder b = agx_usc_builder(t.cpu, usc_size);
3169 
3170    bool needs_sampler = false;
3171    unsigned uniforms = 0;
3172    unsigned nr_tex = 0;
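   /* nr_tex tracks one past the highest texture state register bound below:
    * background loads bind at slot rt * 2 (matching eMRT texture/PBE pair
    * indexing) while end-of-tile stores bind their PBE at slot rt.
    */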
3173 
3174    for (unsigned rt = 0; rt < PIPE_MAX_COLOR_BUFS; ++rt) {
3175       if (key.op[rt] == AGX_BG_LOAD) {
3176          /* Each reloaded render target is textured */
3177          needs_sampler = true;
3178 
3179          /* Uploaded later by the spilled-RT path; would be clobbered here */
3180          if (needs_textures_for_spilled_rts)
3181             continue;
3182 
3183          struct agx_ptr texture =
3184             agx_pool_alloc_aligned(&batch->pool, AGX_TEXTURE_LENGTH, 64);
3185          struct pipe_surface *surf = batch->key.cbufs[rt];
3186          assert(surf != NULL && "cannot load nonexistent attachment");
3187 
3188          struct agx_resource *rsrc = agx_resource(surf->texture);
3189          struct pipe_sampler_view sampler_view = sampler_view_for_surface(surf);
3190 
3191          agx_pack_texture(texture.cpu, rsrc, surf->format, &sampler_view);
3192 
3193          agx_usc_pack(&b, TEXTURE, cfg) {
3194             /* Shifted to match eMRT indexing, could be optimized */
3195             cfg.start = rt * 2;
3196             cfg.count = 1;
3197             cfg.buffer = texture.gpu;
3198          }
3199 
3200          nr_tex = (rt * 2) + 1;
3201       } else if (key.op[rt] == AGX_BG_CLEAR) {
3202          assert(batch->uploaded_clear_color[rt] && "set when cleared");
3203          agx_usc_uniform(&b, 4 + (8 * rt), 8, batch->uploaded_clear_color[rt]);
3204          uniforms = MAX2(uniforms, 4 + (8 * rt) + 8);
3205       } else if (key.op[rt] == AGX_EOT_STORE) {
3206          struct pipe_image_view view =
3207             image_view_for_surface(batch->key.cbufs[rt]);
3208          struct agx_ptr pbe =
3209             agx_pool_alloc_aligned(&batch->pool, AGX_PBE_LENGTH, 256);
3210 
3211          /* The tilebuffer is already in sRGB space if needed. Do not convert */
3212          view.format = util_format_linear(view.format);
3213 
3214          agx_batch_upload_pbe(batch, pbe.cpu, &view, true, true, false, false);
3215 
3216          agx_usc_pack(&b, TEXTURE, cfg) {
3217             cfg.start = rt;
3218             cfg.count = 1;
3219             cfg.buffer = pbe.gpu;
3220          }
3221 
3222          nr_tex = rt + 1;
3223       }
3224    }
3225 
3226    if (needs_textures_for_spilled_rts) {
3227       /* Upload texture/PBE descriptors for each render target so we can clear
3228        * spilled render targets.
3229        */
3230       struct agx_ptr descs = agx_pool_alloc_aligned(
3231          &batch->pool, AGX_TEXTURE_LENGTH * 2 * batch->key.nr_cbufs, 64);
3232       agx_upload_spilled_rt_descriptors(descs.cpu, batch);
3233 
3234       agx_usc_pack(&b, TEXTURE, cfg) {
3235          cfg.start = 0;
3236          cfg.count = 2 * batch->key.nr_cbufs;
3237          cfg.buffer = descs.gpu;
3238       }
3239 
3240       nr_tex = MAX2(nr_tex, 2 * batch->key.nr_cbufs);
3241 
3242       /* Bind the base as u0_u1 for bindless access */
3243       agx_usc_uniform(&b, 0, 4,
3244                       agx_pool_upload_aligned(&batch->pool, &descs.gpu, 8, 8));
3245       uniforms = MAX2(uniforms, 4);
3246    }
3247 
3248    /* All render targets share a sampler */
3249    if (needs_sampler) {
3250       struct agx_ptr sampler =
3251          agx_pool_alloc_aligned(&batch->pool, AGX_SAMPLER_LENGTH, 64);
3252 
3253       agx_pack(sampler.cpu, SAMPLER, cfg) {
3254          cfg.magnify = AGX_FILTER_LINEAR;
3255          cfg.minify = AGX_FILTER_NEAREST;
3256          cfg.mip_filter = AGX_MIP_FILTER_NONE;
3257          cfg.wrap_s = AGX_WRAP_CLAMP_TO_EDGE;
3258          cfg.wrap_t = AGX_WRAP_CLAMP_TO_EDGE;
3259          cfg.wrap_r = AGX_WRAP_CLAMP_TO_EDGE;
3260          cfg.pixel_coordinates = true;
3261          cfg.compare_func = AGX_COMPARE_FUNC_ALWAYS;
3262       }
3263 
3264       agx_usc_pack(&b, SAMPLER, cfg) {
3265          cfg.start = 0;
3266          cfg.count = 1;
3267          cfg.buffer = sampler.gpu;
3268       }
3269    }
3270 
3271    agx_usc_push_packed(&b, SHARED, &batch->tilebuffer_layout.usc);
3272 
3273    /* Get the shader */
3274    key.reserved_preamble = uniforms;
3275    struct agx_device *dev = agx_device(ctx->base.screen);
3276    struct agx_bg_eot_shader *shader = agx_get_bg_eot_shader(&ctx->bg_eot, &key);
3277    agx_batch_add_bo(batch, shader->bo);
3278 
3279    agx_usc_pack(&b, SHADER, cfg) {
3280       cfg.code = agx_usc_addr(dev, shader->ptr);
3281       cfg.unk_2 = 0;
3282    }
3283 
3284    agx_usc_pack(&b, REGISTERS, cfg)
3285       cfg.register_count = shader->info.nr_gprs;
3286 
3287    if (shader->info.has_preamble) {
3288       agx_usc_pack(&b, PRESHADER, cfg) {
3289          cfg.code =
3290             agx_usc_addr(dev, shader->ptr + shader->info.preamble_offset);
3291       }
3292    } else {
3293       agx_usc_pack(&b, NO_PRESHADER, cfg)
3294          ;
3295    }
3296 
3297    struct asahi_bg_eot ret = {.usc = t.gpu};
3298 
3299    agx_pack(&ret.counts, COUNTS, cfg) {
3300       cfg.uniform_register_count = shader->info.push_count;
3301       cfg.preshader_register_count = shader->info.nr_preamble_gprs;
3302       cfg.texture_state_register_count = nr_tex;
3303       cfg.sampler_state_register_count =
3304          agx_translate_sampler_state_count(needs_sampler ? 1 : 0, false);
3305 
3306       if (!store)
3307          cfg.unknown = 0xFFFF;
3308    }
3309 
3310    return ret;
3311 }
3312 
3313 /*
3314  * Return the standard sample positions, packed into a 32-bit word with fixed
3315  * point nibbles for each x/y component of the (at most 4) samples. This is
3316  * suitable for programming the PPP_MULTISAMPLECTL control register.
3317  */
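/*
 * Decoding example (hedging the exact nibble order, assumed here to be x in
 * the low nibble of each pair): the single-sample word 0x88 sits at the pixel
 * centre (8/16, 8/16) = (0.5, 0.5), and 0x44cc yields two samples at
 * (0.75, 0.75) and (0.25, 0.25).
 */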
3318 static uint32_t
3319 agx_default_sample_positions(unsigned nr_samples)
3320 {
3321    switch (nr_samples) {
3322    case 1:
3323       return 0x88;
3324    case 2:
3325       return 0x44cc;
3326    case 4:
3327       return 0xeaa26e26;
3328    default:
3329       unreachable("Invalid sample count");
3330    }
3331 }
3332 
3333 void
3334 agx_batch_init_state(struct agx_batch *batch)
3335 {
3336    if (batch->initialized)
3337       return;
3338 
3339    if (agx_batch_is_compute(batch)) {
3340       batch->initialized = true;
3341 
3342       struct agx_context *ctx = batch->ctx;
3343       struct agx_device *dev = agx_device(ctx->base.screen);
3344       uint8_t *out = batch->cdm.current;
3345 
3346       /* See below */
3347       agx_push(out, CDM_BARRIER, cfg) {
3348          cfg.usc_cache_inval = true;
3349          cfg.unk_5 = true;
3350          cfg.unk_6 = true;
3351          cfg.unk_8 = true;
3352          // cfg.unk_11 = true;
3353          // cfg.unk_20 = true;
3354          if (dev->params.num_clusters_total > 1) {
3355             // cfg.unk_24 = true;
3356             if (dev->params.gpu_generation == 13) {
3357                cfg.unk_4 = true;
3358                // cfg.unk_26 = true;
3359             }
3360          }
3361       }
3362 
3363       return;
3364    }
3365 
3366    /* Emit batch state that never changes and so is not dirty-tracked */
3367    uint8_t *out = batch->vdm.current;
3368 
3369    /* Barrier to enforce GPU-CPU coherency, in case this batch is back to back
3370     * with another that caused stale data to be cached and the CPU wrote to it
3371     * in the meantime.
3372     */
3373    agx_push(out, VDM_BARRIER, cfg) {
3374       cfg.usc_cache_inval = true;
3375    }
3376 
3377    struct AGX_PPP_HEADER present = {
3378       .w_clamp = true,
3379       .occlusion_query_2 = true,
3380       .output_unknown = true,
3381       .varying_word_2 = true,
3382       .viewport_count = 1, /* irrelevant */
3383    };
3384 
3385    size_t size = agx_ppp_update_size(&present);
3386    struct agx_ptr T = agx_pool_alloc_aligned(&batch->pool, size, 64);
3387    struct agx_ppp_update ppp = agx_new_ppp_update(T, size, &present);
3388 
3389    /* clang-format off */
3390    agx_ppp_push(&ppp, W_CLAMP, cfg) cfg.w_clamp = 1e-10;
3391    agx_ppp_push(&ppp, FRAGMENT_OCCLUSION_QUERY_2, cfg);
3392    agx_ppp_push(&ppp, OUTPUT_UNKNOWN, cfg);
3393    agx_ppp_push(&ppp, VARYING_2, cfg);
3394    /* clang-format on */
3395 
3396    agx_ppp_fini(&out, &ppp);
3397    batch->vdm.current = out;
3398 
3399    /* Mark it as initialized now, since agx_batch_writes() will check this. */
3400    batch->initialized = true;
3401 
3402    /* Choose a tilebuffer layout given the framebuffer key */
3403    enum pipe_format formats[PIPE_MAX_COLOR_BUFS] = {0};
3404    for (unsigned i = 0; i < batch->key.nr_cbufs; ++i) {
3405       struct pipe_surface *surf = batch->key.cbufs[i];
3406       if (surf)
3407          formats[i] = surf->format;
3408    }
3409 
3410    batch->tilebuffer_layout = agx_build_tilebuffer_layout(
3411       formats, batch->key.nr_cbufs,
3412       util_framebuffer_get_num_samples(&batch->key),
3413       util_framebuffer_get_num_layers(&batch->key) > 1);
3414 
3415    if (agx_device(batch->ctx->base.screen)->debug & AGX_DBG_SMALLTILE)
3416       batch->tilebuffer_layout.tile_size = (struct agx_tile_size){16, 16};
3417 
3418    /* If the layout spilled render targets, we need to decompress those render
3419     * targets to ensure we can write to them.
3420     */
3421    if (agx_tilebuffer_spills(&batch->tilebuffer_layout)) {
3422       for (unsigned i = 0; i < batch->key.nr_cbufs; ++i) {
3423          if (!batch->tilebuffer_layout.spilled[i])
3424             continue;
3425 
3426          struct pipe_surface *surf = batch->key.cbufs[i];
3427          if (!surf)
3428             continue;
3429 
3430          struct agx_resource *rsrc = agx_resource(surf->texture);
3431          struct ail_layout *layout = &rsrc->layout;
3432          unsigned level = surf->u.tex.level;
3433 
3434          if (!ail_is_level_compressed(layout, level))
3435             continue;
3436 
3437          if (true || (rsrc->base.bind & PIPE_BIND_SHARED)) {
3438             struct agx_context *ctx = batch->ctx;
3439             struct agx_device *dev = agx_device(ctx->base.screen);
3440 
3441             perf_debug(dev, "Decompressing in-place");
3442 
3443             if (!batch->cdm.bo)
3444                batch->cdm = agx_encoder_allocate(batch, dev);
3445 
3446             struct agx_ptr data = agx_pool_alloc_aligned(
3447                &batch->pool, sizeof(struct libagx_decompress_push), 64);
3448             struct libagx_decompress_push *push = data.cpu;
3449             agx_fill_decompress_push(push, layout, surf->u.tex.first_layer,
3450                                      level, agx_map_texture_gpu(rsrc, 0));
3451 
3452             struct pipe_sampler_view sampler_view =
3453                sampler_view_for_surface(surf);
3454             sampler_view.target = PIPE_TEXTURE_2D_ARRAY;
3455             struct pipe_image_view view = image_view_for_surface(surf);
3456             agx_pack_texture(&push->compressed, rsrc, surf->format,
3457                              &sampler_view);
3458             agx_batch_upload_pbe(batch, &push->uncompressed, &view, false, true,
3459                                  true, true);
3460 
3461             struct agx_grid grid = agx_grid_direct(
3462                ail_metadata_width_tl(layout, level) * 32,
3463                ail_metadata_height_tl(layout, level),
3464                surf->u.tex.last_layer - surf->u.tex.first_layer + 1, 32, 1, 1);
3465 
3466             struct agx_decompress_key key = {
3467                .nr_samples = layout->sample_count_sa,
3468             };
3469 
3470             agx_launch_with_uploaded_data(batch, &grid, agx_nir_decompress,
3471                                           &key, sizeof(key), data.gpu);
3472          } else {
3473             agx_decompress(batch->ctx, rsrc, "Render target spilled");
3474          }
3475       }
3476    }
3477 
3478    if (batch->key.zsbuf) {
3479       unsigned level = batch->key.zsbuf->u.tex.level;
3480       struct agx_resource *rsrc = agx_resource(batch->key.zsbuf->texture);
3481 
3482       agx_batch_writes(batch, rsrc, level);
3483 
3484       if (rsrc->separate_stencil)
3485          agx_batch_writes(batch, rsrc->separate_stencil, level);
3486    }
3487 
3488    for (unsigned i = 0; i < batch->key.nr_cbufs; ++i) {
3489       if (batch->key.cbufs[i]) {
3490          struct agx_resource *rsrc = agx_resource(batch->key.cbufs[i]->texture);
3491          unsigned level = batch->key.cbufs[i]->u.tex.level;
3492 
3493          if (agx_resource_valid(rsrc, level))
3494             batch->load |= PIPE_CLEAR_COLOR0 << i;
3495 
3496          agx_batch_writes(batch, rsrc, batch->key.cbufs[i]->u.tex.level);
3497       }
3498    }
3499 
3500    /* Set up standard sample positions */
3501    batch->uniforms.ppp_multisamplectl =
3502       agx_default_sample_positions(batch->tilebuffer_layout.nr_samples);
3503 }
3504 
3505 static enum agx_object_type
3506 agx_point_object_type(struct agx_rasterizer *rast)
3507 {
3508    return (rast->base.sprite_coord_mode == PIPE_SPRITE_COORD_UPPER_LEFT)
3509              ? AGX_OBJECT_TYPE_POINT_SPRITE_UV01
3510              : AGX_OBJECT_TYPE_POINT_SPRITE_UV10;
3511 }
3512 
3513 #define MAX_PPP_UPDATES 2
3514 #define IS_DIRTY(ST)    !!(ctx->dirty & AGX_DIRTY_##ST)
3515 
3516 static uint8_t *
3517 agx_encode_state(struct agx_batch *batch, uint8_t *out)
3518 {
3519    struct agx_context *ctx = batch->ctx;
3520    struct agx_device *dev = agx_device(ctx->base.screen);
3521 
3522    /* If nothing is dirty, encode nothing */
3523    if (!ctx->dirty)
3524       return out;
3525 
3526    struct agx_rasterizer *rast = ctx->rast;
3527    unsigned ppp_updates = 0;
3528 
3529    struct agx_compiled_shader *vs = ctx->vs;
3530    if (ctx->gs)
3531       vs = ctx->gs->gs_copy;
3532 
3533    bool varyings_dirty = false;
3534 
3535    if (IS_DIRTY(VS_PROG) || IS_DIRTY(FS_PROG) || IS_DIRTY(RS) ||
3536        IS_DIRTY(PRIM)) {
3537 
3538       unsigned bindings = ctx->linked.fs->cf.nr_bindings;
3539       if (bindings) {
3540          size_t linkage_size =
3541             AGX_CF_BINDING_HEADER_LENGTH + (bindings * AGX_CF_BINDING_LENGTH);
3542 
3543          struct agx_ptr t =
3544             agx_pool_alloc_aligned(&batch->pipeline_pool, linkage_size, 16);
3545 
3546          agx_link_varyings_vs_fs(t.cpu, &batch->linked_varyings,
3547                                  vs->uvs.user_size, &ctx->linked.fs->cf,
3548                                  ctx->rast->base.flatshade_first ? 0 : 2,
3549                                  (batch->reduced_prim == MESA_PRIM_POINTS)
3550                                     ? ctx->rast->base.sprite_coord_enable
3551                                     : 0,
3552                                  &batch->generate_primitive_id);
3553 
3554          batch->varyings = agx_usc_addr(dev, t.gpu);
3555       } else {
3556          batch->varyings = 0;
3557       }
3558 
3559       varyings_dirty = true;
3560       ppp_updates++;
3561    }
3562 
3563    if (IS_DIRTY(VS) || varyings_dirty) {
3564       agx_push(out, VDM_STATE, cfg) {
3565          cfg.vertex_shader_word_0_present = true;
3566          cfg.vertex_shader_word_1_present = true;
3567          cfg.vertex_outputs_present = true;
3568          cfg.vertex_unknown_present = true;
3569       }
3570 
3571       agx_push(out, VDM_STATE_VERTEX_SHADER_WORD_0, cfg) {
3572          cfg.uniform_register_count = vs->b.info.push_count;
3573          cfg.preshader_register_count = vs->b.info.nr_preamble_gprs;
3574          cfg.texture_state_register_count = agx_nr_tex_descriptors(batch, vs);
3575          cfg.sampler_state_register_count =
3576             translate_sampler_state_count(ctx, vs, vs->stage);
3577       }
3578 
3579       agx_push(out, VDM_STATE_VERTEX_SHADER_WORD_1, cfg) {
3580          cfg.pipeline =
3581             agx_build_pipeline(batch, vs, ctx->gs ? NULL : ctx->linked.vs,
3582                                PIPE_SHADER_VERTEX, 0, 0);
3583       }
3584 
3585       agx_push_packed(out, vs->uvs.vdm, VDM_STATE_VERTEX_OUTPUTS);
3586 
3587       agx_push(out, VDM_STATE_VERTEX_UNKNOWN, cfg) {
3588          cfg.flat_shading_control = ctx->rast->base.flatshade_first
3589                                        ? AGX_VDM_VERTEX_0
3590                                        : AGX_VDM_VERTEX_2;
3591          cfg.unknown_4 = cfg.unknown_5 = ctx->rast->base.rasterizer_discard;
3592 
3593          cfg.generate_primitive_id = batch->generate_primitive_id;
3594       }
3595 
3596       /* Pad up to a multiple of 8 bytes */
3597       memset(out, 0, 4);
3598       out += 4;
3599    }
3600 
3601    struct agx_pool *pool = &batch->pool;
3602 
3603    if ((ctx->dirty & AGX_DIRTY_RS) && ctx->rast->depth_bias) {
3604       agx_upload_depth_bias(batch, &ctx->rast->base);
3605       ctx->dirty |= AGX_DIRTY_SCISSOR_ZBIAS;
3606    }
3607 
3608    if (ctx->dirty & (AGX_DIRTY_VIEWPORT | AGX_DIRTY_SCISSOR_ZBIAS |
3609                      AGX_DIRTY_RS | AGX_DIRTY_VS)) {
3610 
3611       agx_upload_viewport_scissor(pool, batch, &out, ctx->viewport,
3612                                   ctx->rast->base.scissor ? ctx->scissor : NULL,
3613                                   ctx->rast->base.clip_halfz,
3614                                   vs->b.info.nonzero_viewport);
3615    }
3616 
3617    bool is_points = batch->reduced_prim == MESA_PRIM_POINTS;
3618    bool is_lines = batch->reduced_prim == MESA_PRIM_LINES;
3619 
3620    bool object_type_dirty =
3621       IS_DIRTY(PRIM) || (is_points && IS_DIRTY(SPRITE_COORD_MODE));
3622 
3623    bool fragment_face_dirty =
3624       IS_DIRTY(ZS) || IS_DIRTY(STENCIL_REF) || IS_DIRTY(RS);
3625 
3626    enum agx_object_type object_type = is_points  ? agx_point_object_type(rast)
3627                                       : is_lines ? AGX_OBJECT_TYPE_LINE
3628                                                  : AGX_OBJECT_TYPE_TRIANGLE;
3629 
3630    struct AGX_PPP_HEADER dirty = {
3631       .fragment_control =
3632          IS_DIRTY(ZS) || IS_DIRTY(RS) || IS_DIRTY(PRIM) || IS_DIRTY(QUERY),
3633       .fragment_control_2 = IS_DIRTY(FS_PROG) || IS_DIRTY(RS),
3634       .fragment_front_face = fragment_face_dirty,
3635       .fragment_front_face_2 = object_type_dirty || IS_DIRTY(FS_PROG),
3636       .fragment_front_stencil = IS_DIRTY(ZS),
3637       .fragment_back_face = fragment_face_dirty,
3638       .fragment_back_face_2 = object_type_dirty || IS_DIRTY(FS_PROG),
3639       .fragment_back_stencil = IS_DIRTY(ZS),
3640       .output_select = varyings_dirty,
3641       .varying_counts_32 = varyings_dirty,
3642       .varying_counts_16 = varyings_dirty,
3643       .cull = IS_DIRTY(RS),
3644       .cull_2 = varyings_dirty,
3645       .fragment_shader =
3646          IS_DIRTY(FS) || varyings_dirty || IS_DIRTY(SAMPLE_MASK),
3647       .occlusion_query = IS_DIRTY(QUERY),
3648       .output_size = IS_DIRTY(VS_PROG),
3649       .viewport_count = 1, /* irrelevant */
3650    };
3651 
3652    size_t size = agx_ppp_update_size(&dirty);
3653    struct agx_ptr T = agx_pool_alloc_aligned(&batch->pool, size, 64);
3654    struct agx_ppp_update ppp = agx_new_ppp_update(T, size, &dirty);
3655 
3656    if (dirty.fragment_control) {
3657       agx_ppp_push(&ppp, FRAGMENT_CONTROL, cfg) {
3658          if (ctx->active_queries && ctx->occlusion_query) {
3659             if (ctx->occlusion_query->type == PIPE_QUERY_OCCLUSION_COUNTER)
3660                cfg.visibility_mode = AGX_VISIBILITY_MODE_COUNTING;
3661             else
3662                cfg.visibility_mode = AGX_VISIBILITY_MODE_BOOLEAN;
3663          }
3664 
3665          cfg.stencil_test_enable = ctx->zs->base.stencil[0].enabled;
3666          cfg.two_sided_stencil = ctx->zs->base.stencil[1].enabled;
3667          cfg.depth_bias_enable =
3668             rast->depth_bias && object_type == AGX_OBJECT_TYPE_TRIANGLE;
3669 
3670          /* Always enable scissoring so we may scissor to the viewport (TODO:
3671           * optimize this out if the viewport is the default and the app does
3672           * not use the scissor test)
3673           */
3674          cfg.scissor_enable = true;
3675 
3676          /* This avoids broken derivatives along primitive edges */
3677          cfg.disable_tri_merging = is_lines || is_points;
3678       }
3679    }
3680 
3681    if (dirty.fragment_control_2) {
3682       /* Annoying, rasterizer_discard seems to be ignored (sometimes?) in the
3683        * main fragment control word and has to be combined into the secondary
3684        * word for reliable behaviour.
3685        */
3686       agx_ppp_push_merged(&ppp, FRAGMENT_CONTROL, cfg,
3687                           ctx->linked.fs->fragment_control) {
3688          cfg.tag_write_disable = rast->base.rasterizer_discard;
3689       }
3690    }
3691 
3692    if (dirty.fragment_front_face) {
3693       agx_ppp_push_merged(&ppp, FRAGMENT_FACE, cfg, ctx->zs->depth) {
3694          cfg.stencil_reference = ctx->stencil_ref.ref_value[0];
3695          cfg.line_width = rast->line_width;
3696          cfg.polygon_mode = rast->polygon_mode;
3697       }
3698    }
3699 
3700    if (dirty.fragment_front_face_2)
3701       agx_ppp_fragment_face_2(&ppp, object_type, &ctx->fs->b.info);
3702 
3703    if (dirty.fragment_front_stencil) {
3704       agx_ppp_push_packed(&ppp, ctx->zs->front_stencil.opaque,
3705                           FRAGMENT_STENCIL);
3706    }
3707 
3708    if (dirty.fragment_back_face) {
3709       agx_ppp_push_merged(&ppp, FRAGMENT_FACE, cfg, ctx->zs->depth) {
3710          bool twosided = ctx->zs->base.stencil[1].enabled;
3711          cfg.stencil_reference = ctx->stencil_ref.ref_value[twosided ? 1 : 0];
3712          cfg.line_width = rast->line_width;
3713          cfg.polygon_mode = rast->polygon_mode;
3714       }
3715    }
3716 
3717    if (dirty.fragment_back_face_2)
3718       agx_ppp_fragment_face_2(&ppp, object_type, &ctx->fs->b.info);
3719 
3720    if (dirty.fragment_back_stencil)
3721       agx_ppp_push_packed(&ppp, ctx->zs->back_stencil.opaque, FRAGMENT_STENCIL);
3722 
3723    assert(dirty.varying_counts_32 == dirty.varying_counts_16);
3724    assert(dirty.varying_counts_32 == dirty.output_select);
3725 
3726    if (dirty.output_select) {
3727       agx_ppp_push_merged_blobs(&ppp, AGX_OUTPUT_SELECT_LENGTH, &vs->uvs.osel,
3728                                 &ctx->linked.fs->osel);
3729 
3730       agx_ppp_push_packed(&ppp, &batch->linked_varyings.counts_32,
3731                           VARYING_COUNTS);
3732 
3733       agx_ppp_push_packed(&ppp, &batch->linked_varyings.counts_16,
3734                           VARYING_COUNTS);
3735    }
3736 
3737    if (dirty.cull)
3738       agx_ppp_push_packed(&ppp, ctx->rast->cull, CULL);
3739 
3740    if (dirty.cull_2) {
3741       agx_ppp_push(&ppp, CULL_2, cfg) {
3742          cfg.needs_primitive_id = batch->generate_primitive_id;
3743       }
3744    }
3745 
3746    if (dirty.fragment_shader) {
3747       unsigned frag_tex_count = ctx->stage[PIPE_SHADER_FRAGMENT].texture_count;
3748 
3749       agx_ppp_push(&ppp, FRAGMENT_SHADER_WORD_0, cfg) {
3750          cfg.uniform_register_count = ctx->fs->b.info.push_count;
3751          cfg.preshader_register_count = ctx->fs->b.info.nr_preamble_gprs;
3752          cfg.texture_state_register_count =
3753             agx_nr_tex_descriptors(batch, ctx->fs);
3754          cfg.sampler_state_register_count =
3755             translate_sampler_state_count(ctx, ctx->fs, PIPE_SHADER_FRAGMENT);
3756          cfg.cf_binding_count = ctx->linked.fs->cf.nr_bindings;
3757       }
3758 
3759       agx_ppp_push(&ppp, FRAGMENT_SHADER_WORD_1, cfg) {
3760          cfg.pipeline = agx_build_pipeline(batch, ctx->fs, ctx->linked.fs,
3761                                            PIPE_SHADER_FRAGMENT, 0, 0);
3762       }
3763 
3764       agx_ppp_push(&ppp, FRAGMENT_SHADER_WORD_2, cfg) {
3765          cfg.cf_bindings = batch->varyings;
3766       }
3767 
3768       agx_ppp_push(&ppp, FRAGMENT_SHADER_WORD_3, cfg) {
3769          /* XXX: This is wrong */
3770          cfg.unknown = frag_tex_count >= 4;
3771       }
3772    }
3773 
3774    if (dirty.occlusion_query) {
3775       agx_ppp_push(&ppp, FRAGMENT_OCCLUSION_QUERY, cfg) {
3776          if (ctx->active_queries && ctx->occlusion_query) {
3777             cfg.index = agx_get_oq_index(batch, ctx->occlusion_query);
3778          }
3779       }
3780    }
3781 
3782    if (dirty.output_size) {
3783       agx_ppp_push(&ppp, OUTPUT_SIZE, cfg)
3784          cfg.count = vs->uvs.size;
3785    }
3786 
3787    agx_ppp_fini(&out, &ppp);
3788    ppp_updates++;
3789 
3790    assert(ppp_updates <= MAX_PPP_UPDATES);
3791    return out;
3792 }
3793 
3794 static enum agx_primitive
3795 agx_primitive_for_pipe(enum mesa_prim mode)
3796 {
3797    switch (mode) {
3798    case MESA_PRIM_POINTS:
3799       return AGX_PRIMITIVE_POINTS;
3800    case MESA_PRIM_LINES:
3801       return AGX_PRIMITIVE_LINES;
3802    case MESA_PRIM_LINE_STRIP:
3803       return AGX_PRIMITIVE_LINE_STRIP;
3804    case MESA_PRIM_LINE_LOOP:
3805       return AGX_PRIMITIVE_LINE_LOOP;
3806    case MESA_PRIM_TRIANGLES:
3807       return AGX_PRIMITIVE_TRIANGLES;
3808    case MESA_PRIM_TRIANGLE_STRIP:
3809       return AGX_PRIMITIVE_TRIANGLE_STRIP;
3810    case MESA_PRIM_TRIANGLE_FAN:
3811       return AGX_PRIMITIVE_TRIANGLE_FAN;
3812    case MESA_PRIM_QUADS:
3813       return AGX_PRIMITIVE_QUADS;
3814    case MESA_PRIM_QUAD_STRIP:
3815       return AGX_PRIMITIVE_QUAD_STRIP;
3816    default:
3817       unreachable("todo: other primitive types");
3818    }
3819 }
3820 
3821 static uint64_t
3822 agx_index_buffer_rsrc_ptr(struct agx_batch *batch,
3823                           const struct pipe_draw_info *info, size_t *extent)
3824 {
3825    assert(!info->has_user_indices && "cannot use user pointers with indirect");
3826 
3827    struct agx_resource *rsrc = agx_resource(info->index.resource);
3828    agx_batch_reads(batch, rsrc);
3829 
3830    *extent = ALIGN_POT(rsrc->layout.size_B, 4);
3831    return rsrc->bo->va->addr;
3832 }
3833 
3834 static uint64_t
3835 agx_index_buffer_direct_ptr(struct agx_batch *batch,
3836                             const struct pipe_draw_start_count_bias *draw,
3837                             const struct pipe_draw_info *info, size_t *extent)
3838 {
3839    off_t offset = draw->start * info->index_size;
3840    uint32_t max_extent = draw->count * info->index_size;
3841 
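   /* Example (illustrative): with 16-bit indices, start = 100 and count = 50,
    * offset is 200 bytes and max_extent is 100 bytes; the resource path
    * returns MIN2(buffer extent - 200, 100) aligned up to 4, while the
    * user-pointer path just uses 100.
    */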
3842    if (!info->has_user_indices) {
3843       uint64_t base = agx_index_buffer_rsrc_ptr(batch, info, extent);
3844 
3845       *extent = ALIGN_POT(MIN2(*extent - offset, max_extent), 4);
3846       return base + offset;
3847    } else {
3848       *extent = ALIGN_POT(max_extent, 4);
3849 
3850       return agx_pool_upload_aligned(&batch->pool,
3851                                      ((uint8_t *)info->index.user) + offset,
3852                                      draw->count * info->index_size, 64);
3853    }
3854 }
3855 
3856 static uint64_t
3857 agx_index_buffer_ptr(struct agx_batch *batch, const struct pipe_draw_info *info,
3858                      const struct pipe_draw_start_count_bias *draw,
3859                      size_t *extent)
3860 {
3861    if (draw)
3862       return agx_index_buffer_direct_ptr(batch, draw, info, extent);
3863    else
3864       return agx_index_buffer_rsrc_ptr(batch, info, extent);
3865 }
3866 
3867 static void
3868 agx_ensure_cmdbuf_has_space(struct agx_batch *batch, struct agx_encoder *enc,
3869                             size_t space)
3870 {
3871    bool vdm = enc == &batch->vdm;
3872    assert(vdm || (enc == &batch->cdm));
3873 
3874    size_t link_length =
3875       vdm ? AGX_VDM_STREAM_LINK_LENGTH : AGX_CDM_STREAM_LINK_LENGTH;
3876 
3877    /* Assert that we have space for a link tag */
3878    assert((enc->current + link_length) <= enc->end && "Encoder overflowed");
3879 
3880    /* Always leave room for a link tag, in case we run out of space later,
3881     * plus padding because VDM apparently overreads?
3882     *
3883     * 0x200 is not enough. 0x400 seems to work. 0x800 for safety.
3884     */
3885    space += link_length + 0x800;
3886 
3887    /* If there is room in the command buffer, we're done */
3888    if (likely((enc->end - enc->current) >= space))
3889       return;
3890 
3891    /* Otherwise, we need to allocate a new command buffer. We use memory owned
3892     * by the batch to simplify lifetime management for the BO.
3893     */
3894    size_t size = 65536;
3895    struct agx_ptr T = agx_pool_alloc_aligned(&batch->pool, size, 256);
3896 
3897    /* Jump from the old command buffer to the new command buffer */
3898    if (vdm) {
3899       agx_pack(enc->current, VDM_STREAM_LINK, cfg) {
3900          cfg.target_lo = T.gpu & BITFIELD_MASK(32);
3901          cfg.target_hi = T.gpu >> 32;
3902       }
3903    } else {
3904       agx_pack(enc->current, CDM_STREAM_LINK, cfg) {
3905          cfg.target_lo = T.gpu & BITFIELD_MASK(32);
3906          cfg.target_hi = T.gpu >> 32;
3907       }
3908    }
3909 
3910    /* Swap out the command buffer */
3911    enc->current = T.cpu;
3912    enc->end = enc->current + size;
3913 }
3914 
3915 static void
3916 agx_ia_update(struct agx_batch *batch, const struct pipe_draw_info *info,
3917               uint64_t draw, uint64_t ib, uint64_t ib_range_el)
3918 {
3919    struct agx_context *ctx = batch->ctx;
3920    struct agx_device *dev = agx_device(ctx->base.screen);
3921 
3922    struct agx_increment_ia_counters_key key = {
3923       .index_size_B = info->primitive_restart ? info->index_size : 0,
3924    };
3925 
3926    struct libagx_increment_ia_counters args = {
3927       .ia_vertices = agx_get_query_address(
3928          batch, ctx->pipeline_statistics[PIPE_STAT_QUERY_IA_VERTICES]),
3929 
3930       .vs_invocations = agx_get_query_address(
3931          batch, ctx->pipeline_statistics[PIPE_STAT_QUERY_VS_INVOCATIONS]),
3932 
3933       .restart_index = info->restart_index,
3934       .index_buffer = ib,
3935       .index_buffer_range_el = ib_range_el,
3936       .draw = draw,
3937    };
3938 
3939    uint64_t wg_size = key.index_size_B ? 1024 : 1;
3940    struct agx_grid grid = agx_grid_direct(wg_size, 1, 1, wg_size, 1, 1);
3941 
3942    if (!batch->cdm.bo) {
3943       batch->cdm = agx_encoder_allocate(batch, dev);
3944    }
3945 
3946    perf_debug(dev, "Input assembly counters");
3947    agx_launch_with_data(batch, &grid, agx_nir_increment_ia_counters, &key,
3948                         sizeof(key), &args, sizeof(args));
3949 }
3950 
3951 static uint64_t
3952 agx_batch_geometry_state(struct agx_batch *batch)
3953 {
3954    struct agx_context *ctx = batch->ctx;
3955 
3956    if (!batch->geometry_state) {
3957       uint32_t size = 128 * 1024 * 1024;
3958 
3959       if (!ctx->heap) {
3960          ctx->heap = pipe_buffer_create(ctx->base.screen, PIPE_BIND_GLOBAL,
3961                                         PIPE_USAGE_DEFAULT, size);
3962       }
3963 
3964       struct agx_geometry_state state = {
3965          .heap = agx_resource(ctx->heap)->bo->va->addr,
3966          .heap_size = size,
3967       };
3968 
3969       agx_batch_writes(batch, agx_resource(ctx->heap), 0);
3970 
3971       batch->geometry_state =
3972          agx_pool_upload_aligned(&batch->pool, &state, sizeof(state), 8);
3973    }
3974 
3975    return batch->geometry_state;
3976 }
3977 
3978 static uint64_t
3979 agx_batch_geometry_params(struct agx_batch *batch, uint64_t input_index_buffer,
3980                           size_t index_buffer_size_B,
3981                           const struct pipe_draw_info *info,
3982                           const struct pipe_draw_start_count_bias *draw,
3983                           const struct pipe_draw_indirect_info *indirect)
3984 {
3985    struct agx_ia_state ia = {
3986       .index_buffer = input_index_buffer,
3987       .index_buffer_range_el = index_buffer_size_B / info->index_size,
3988       .verts_per_instance = draw ? draw->count : 0,
3989    };
3990 
3991    batch->uniforms.input_assembly =
3992       agx_pool_upload_aligned(&batch->pool, &ia, sizeof(ia), 8);
3993 
3994    struct agx_geometry_params params = {
3995       .state = agx_batch_geometry_state(batch),
3996       .indirect_desc = batch->geom_indirect,
3997       .flat_outputs =
3998          batch->ctx->stage[PIPE_SHADER_FRAGMENT].shader->info.inputs_flat_shaded,
3999       .input_topology = info->mode,
4000    };
4001 
4002    for (unsigned i = 0; i < ARRAY_SIZE(batch->ctx->streamout.targets); ++i) {
4003       struct agx_streamout_target *so =
4004          agx_so_target(batch->ctx->streamout.targets[i]);
4005       struct agx_resource *rsrc = so ? agx_resource(so->offset) : NULL;
4006 
4007       uint32_t size;
4008       params.xfb_base_original[i] = agx_batch_get_so_address(batch, i, &size);
4009       params.xfb_size[i] = size;
4010 
4011       if (rsrc) {
4012          params.xfb_offs_ptrs[i] = rsrc->bo->va->addr;
4013          agx_batch_writes(batch, rsrc, 0);
4014          batch->incoherent_writes = true;
4015       } else {
4016          params.xfb_offs_ptrs[i] = 0;
4017       }
4018    }
4019 
4020    for (unsigned i = 0; i < ARRAY_SIZE(batch->ctx->prims_generated); ++i) {
4021       params.prims_generated_counter[i] =
4022          agx_get_query_address(batch, batch->ctx->prims_generated[i]);
4023    }
4024 
4025    for (unsigned i = 0; i < ARRAY_SIZE(batch->ctx->tf_prims_generated); ++i) {
4026       params.xfb_prims_generated_counter[i] =
4027          agx_get_query_address(batch, batch->ctx->tf_prims_generated[i]);
4028    }
4029 
4030    if (batch->ctx->active_queries && batch->ctx->streamout.num_targets > 0) {
4031       for (unsigned i = 0; i < ARRAY_SIZE(batch->ctx->tf_overflow); ++i) {
4032          params.xfb_overflow[i] =
4033             agx_get_query_address(batch, batch->ctx->tf_overflow[i]);
4034       }
4035 
4036       params.xfb_any_overflow =
4037          agx_get_query_address(batch, batch->ctx->tf_any_overflow);
4038    }
4039 
4040    /* Calculate input primitive count for direct draws, and allocate the vertex
4041     * & count buffers. GPU calculates and allocates for indirect draws.
4042     */
4043    unsigned count_buffer_stride = batch->ctx->gs->gs_count_words * 4;
4044    batch->uniforms.vertex_outputs = batch->ctx->vs->b.info.outputs;
4045    params.input_mask = batch->uniforms.vertex_outputs;
4046 
4047    if (indirect) {
4048       params.count_buffer_stride = count_buffer_stride;
4049       batch->uniforms.vertex_output_buffer_ptr =
4050          agx_pool_alloc_aligned(&batch->pool, 8, 8).gpu;
4051 
4052       params.vs_grid[2] = params.gs_grid[2] = 1;
4053    } else {
4054       params.vs_grid[0] = draw->count;
4055       params.gs_grid[0] =
4056          u_decomposed_prims_for_vertices(info->mode, draw->count);
4057 
4058       params.primitives_log2 = util_logbase2_ceil(params.gs_grid[0]);
4059 
4060       params.input_primitives = params.gs_grid[0] * info->instance_count;
4061 
4062       unsigned vb_size = libagx_tcs_in_size(draw->count * info->instance_count,
4063                                             batch->uniforms.vertex_outputs);
4064       unsigned size = params.input_primitives * count_buffer_stride;
4065 
4066       if (size) {
4067          params.count_buffer =
4068             agx_pool_alloc_aligned(&batch->pool, size, 4).gpu;
4069       }
4070 
4071       if (vb_size) {
4072          uint64_t addr = agx_pool_alloc_aligned(&batch->pool, vb_size, 4).gpu;
4073          batch->uniforms.vertex_output_buffer_ptr =
4074             agx_pool_upload(&batch->pool, &addr, 8);
4075 
4076          params.input_buffer = addr;
4077       }
4078    }
4079 
4080    return agx_pool_upload_aligned_with_bo(&batch->pool, &params, sizeof(params),
4081                                           8, &batch->geom_params_bo);
4082 }
4083 
4084 static uint64_t
4085 agx_indirect_buffer_ptr(struct agx_batch *batch,
4086                         const struct pipe_draw_indirect_info *indirect)
4087 {
4088    assert(indirect->buffer && "drawauto already handled");
4089 
4090    struct agx_resource *rsrc = agx_resource(indirect->buffer);
4091    agx_batch_reads(batch, rsrc);
4092    return rsrc->bo->va->addr + indirect->offset;
4093 }
4094 
4095 static void
4096 agx_launch_gs_prerast(struct agx_batch *batch,
4097                       const struct pipe_draw_info *info,
4098                       const struct pipe_draw_start_count_bias *draws,
4099                       const struct pipe_draw_indirect_info *indirect)
4100 {
4101    struct agx_context *ctx = batch->ctx;
4102    struct agx_device *dev = agx_device(ctx->base.screen);
4103    struct agx_compiled_shader *gs = ctx->gs;
4104 
4105    if (ctx->stage[PIPE_SHADER_GEOMETRY].shader->is_xfb_passthrough)
4106       perf_debug(dev, "Transform feedback");
4107    else
4108       perf_debug(dev, "Geometry shader");
4109 
4110    /* This is a graphics batch, so it may not have had a CDM encoder allocated
4111     * yet. Allocate that so we can start enqueueing compute work.
4112     */
4113    if (!batch->cdm.bo) {
4114       batch->cdm = agx_encoder_allocate(batch, dev);
4115    }
4116 
4117    agx_ensure_cmdbuf_has_space(
4118       batch, &batch->cdm,
4119       8 * (AGX_CDM_LAUNCH_WORD_0_LENGTH + AGX_CDM_LAUNCH_WORD_1_LENGTH +
4120            AGX_CDM_UNK_G14X_LENGTH + AGX_CDM_INDIRECT_LENGTH +
4121            AGX_CDM_GLOBAL_SIZE_LENGTH + AGX_CDM_LOCAL_SIZE_LENGTH +
4122            AGX_CDM_BARRIER_LENGTH));
4123 
4124    assert(!info->primitive_restart && "should have been lowered");
4125 
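   /* The sequence below: run the vertex shader as compute, optionally run the
    * GS count shader and prefix-sum its output, run the pre-GS kernel, then
    * run the pre-rasterization geometry shader itself.
    */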
4126    struct agx_grid grid_vs, grid_gs;
4127 
4128    /* Setup grids */
4129    if (indirect) {
4130       struct agx_gs_setup_indirect_key key = {
4131          .prim = info->mode,
4132       };
4133 
4134       uint64_t ib = 0;
4135       size_t ib_extent = 0;
4136 
4137       if (info->index_size) {
4138          ib = agx_index_buffer_ptr(batch, info, indirect ? NULL : draws,
4139                                    &ib_extent);
4140       }
4141 
4142       struct agx_gs_setup_indirect_params gsi = {
4143          .index_buffer = ib,
4144          .index_buffer_range_el = ib_extent / info->index_size,
4145          .draw = agx_indirect_buffer_ptr(batch, indirect),
4146          .vertex_buffer = batch->uniforms.vertex_output_buffer_ptr,
4147          .ia = batch->uniforms.input_assembly,
4148          .geom = batch->uniforms.geometry_params,
4149          .vs_outputs = batch->uniforms.vertex_outputs,
4150          .index_size_B = info->index_size,
4151       };
4152 
4153       const struct agx_grid grid_setup = agx_grid_direct(1, 1, 1, 1, 1, 1);
4154       agx_launch_with_data(batch, &grid_setup, agx_nir_gs_setup_indirect, &key,
4155                            sizeof(key), &gsi, sizeof(gsi));
4156 
4157       uint64_t gp = batch->uniforms.geometry_params;
4158 
4159       grid_vs = agx_grid_indirect(
4160          gp + offsetof(struct agx_geometry_params, vs_grid), 1, 1, 1);
4161 
4162       grid_gs = agx_grid_indirect(
4163          gp + offsetof(struct agx_geometry_params, gs_grid), 1, 1, 1);
4164    } else {
4165       grid_vs =
4166          agx_grid_direct(draws->count, info->instance_count, 1, 64, 1, 1);
4167 
4168       grid_gs = agx_grid_direct(
4169          u_decomposed_prims_for_vertices(info->mode, draws->count),
4170          info->instance_count, 1, 64, 1, 1);
4171    }
4172 
4173    /* Launch the vertex shader first */
4174    agx_launch(batch, &grid_vs, ctx->vs, ctx->linked.vs, ctx->vs->stage, 0);
4175 
4176    /* If there is a count shader, launch it and prefix sum the results. */
4177    if (gs->gs_count) {
4178       perf_debug(dev, "Geometry shader count");
4179       agx_launch(batch, &grid_gs, gs->gs_count, NULL, PIPE_SHADER_GEOMETRY, 0);
4180 
4181       unsigned words = gs->gs_count_words;
4182       struct agx_grid grid =
4183          agx_grid_direct(1024 * gs->gs_count_words, 1, 1, 1024, 1, 1);
4184 
4185       agx_launch(batch, &grid,
4186                  agx_build_meta_shader(ctx, agx_nir_prefix_sum_gs, &words,
4187                                        sizeof(words)),
4188                  NULL, PIPE_SHADER_COMPUTE, 0);
4189    }
4190 
4191    /* Pre-GS shader */
4192    struct agx_grid grid = agx_grid_direct(1, 1, 1, 1, 1, 1);
4193    agx_launch(batch, &grid, gs->pre_gs, NULL, PIPE_SHADER_COMPUTE, 0);
4194 
4195    /* Pre-rast geometry shader */
4196    agx_launch(batch, &grid_gs, gs, NULL, PIPE_SHADER_GEOMETRY, 0);
4197 }
4198 
4199 static void
4200 agx_draw_without_restart(struct agx_batch *batch,
4201                          const struct pipe_draw_info *info,
4202                          unsigned drawid_offset,
4203                          const struct pipe_draw_indirect_info *indirect,
4204                          const struct pipe_draw_start_count_bias *draw)
4205 {
4206    struct agx_context *ctx = batch->ctx;
4207    struct agx_device *dev = agx_device(ctx->base.screen);
4208 
4209    perf_debug(dev, "Unrolling primitive restart due to GS/XFB");
4210 
4211    agx_batch_init_state(batch);
4212 
4213    size_t ib_extent = 0;
4214    uint64_t ib;
4215 
4216    /* The rest of this function handles only the general case of indirect
4217     * multidraws, so synthesize an indexed indirect draw now if we need one for
4218     * a direct draw (necessarily only one). This unifies the code paths.
4219     */
4220    struct pipe_draw_indirect_info indirect_synthesized = {.draw_count = 1};
4221 
4222    if (!indirect) {
4223       /* The pointer already adds in the offset, so it is 0 in the desc */
4224       ib = agx_index_buffer_direct_ptr(batch, draw, info, &ib_extent);
4225 
4226       uint32_t desc[5] = {draw->count, info->instance_count, 0,
4227                           draw->index_bias, info->start_instance};
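      /* A sketch of what those five words mean, matching the conventional
       * GL-style indexed indirect draw command (field names are illustrative,
       * not taken from a driver header):
       *
       *    uint32_t count;           // draw->count
       *    uint32_t instance_count;  // info->instance_count
       *    uint32_t first_index;     // 0, already folded into ib above
       *    int32_t  base_vertex;     // draw->index_bias
       *    uint32_t base_instance;   // info->start_instance
       */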
4228 
4229       u_upload_data(ctx->base.const_uploader, 0, sizeof(desc), 4, &desc,
4230                     &indirect_synthesized.offset, &indirect_synthesized.buffer);
4231 
4232       indirect = &indirect_synthesized;
4233    } else {
4234       /* Does not add in offset, the unroll kernel uses the desc's offset */
4235       ib = agx_index_buffer_rsrc_ptr(batch, info, &ib_extent);
4236    }
4237 
4238    /* Next, we unroll the index buffer used by the indirect draw */
4239    if (!batch->cdm.bo)
4240       batch->cdm = agx_encoder_allocate(batch, dev);
4241 
4242    struct agx_unroll_restart_key key = {
4243       .prim = info->mode,
4244       .index_size_B = info->index_size,
4245    };
4246 
4247    /* Allocate output indirect draw descriptors. This is exact. */
4248    struct agx_resource out_draws_rsrc = {0};
4249    struct agx_ptr out_draws = agx_pool_alloc_aligned_with_bo(
4250       &batch->pool, 5 * sizeof(uint32_t) * indirect->draw_count, 4,
4251       &out_draws_rsrc.bo);
4252 
4253    struct agx_resource *indirect_rsrc = agx_resource(indirect->buffer);
4254    agx_batch_reads(batch, indirect_rsrc);
4255 
4256    struct agx_restart_unroll_params unroll = {
4257       .heap = agx_batch_geometry_state(batch),
4258       .index_buffer = ib,
4259       .out_draws = out_draws.gpu,
4260       .restart_index = info->restart_index,
4261       .index_buffer_size_el = ib_extent / info->index_size,
4262       .flatshade_first = batch->ctx->rast->base.flatshade_first,
4263       .draws = indirect_rsrc->bo->va->addr + indirect->offset,
4264    };
4265 
4266    /* Unroll the index buffer for each draw */
4267    const struct agx_grid grid_setup =
4268       agx_grid_direct(1024 * indirect->draw_count, 1, 1, 1024, 1, 1);
4269 
4270    agx_launch_with_data(batch, &grid_setup, agx_nir_unroll_restart, &key,
4271                         sizeof(key), &unroll, sizeof(unroll));
4272 
4273    /* Now draw the results without restart */
4274    struct pipe_draw_info new_info = {
4275       .mode = u_decomposed_prim(info->mode),
4276       .index_size = info->index_size,
4277       .index.resource = ctx->heap,
4278       .view_mask = info->view_mask,
4279       .increment_draw_id = info->increment_draw_id,
4280       .index_bias_varies = info->index_bias_varies,
4281    };
4282 
4283    struct pipe_draw_indirect_info new_indirect = *indirect;
4284    new_indirect.buffer = &out_draws_rsrc.base;
4285    new_indirect.offset = out_draws.gpu - out_draws_rsrc.bo->va->addr;
4286    new_indirect.stride = 5 * sizeof(uint32_t);
4287 
4288    ctx->active_draw_without_restart = true;
4289    ctx->base.draw_vbo(&ctx->base, &new_info, drawid_offset, &new_indirect, NULL,
4290                       1);
4291    ctx->active_draw_without_restart = false;
4292 }
4293 
4294 static bool
4295 agx_needs_passthrough_gs(struct agx_context *ctx,
4296                          const struct pipe_draw_info *info,
4297                          const struct pipe_draw_indirect_info *indirect,
4298                          bool *xfb_only)
4299 {
4300    /* If there is already a geometry shader in the pipeline, we do not need to
4301     * apply a passthrough GS of our own.
4302     */
4303    if (ctx->stage[PIPE_SHADER_GEOMETRY].shader)
4304       return false;
4305 
4306    /* Rendering adjacency requires a GS, add a passthrough since we don't have
4307     * one.
4308     */
4309    if (info->mode == MESA_PRIM_LINES_ADJACENCY ||
4310        info->mode == MESA_PRIM_TRIANGLES_ADJACENCY ||
4311        info->mode == MESA_PRIM_TRIANGLE_STRIP_ADJACENCY ||
4312        info->mode == MESA_PRIM_LINE_STRIP_ADJACENCY) {
4313       perf_debug_ctx(ctx, "Using passthrough GS due to adjacency primitives");
4314       return true;
4315    }
4316 
4317    /* TODO: Handle fans properly, we need to plumb a sysval. */
4318    if (info->mode == MESA_PRIM_TRIANGLE_FAN &&
4319        ctx->rast->base.flatshade_first &&
4320        ctx->stage[MESA_SHADER_FRAGMENT].shader->info.inputs_flat_shaded) {
4321 
4322       perf_debug_ctx(ctx, "Using passthrough GS due to first tri fans");
4323       return true;
4324    }
4325 
4326    /* TODO: this is really sloppy, we should add a VDM kernel for this. */
4327    if ((indirect || info->mode == MESA_PRIM_PATCHES) && ctx->active_queries &&
4328        ctx->prims_generated[0]) {
4329       perf_debug_ctx(ctx, "Using passthrough GS due to indirect prim query");
4330       return true;
4331    }
4332 
4333    /* Edge flags are emulated with a geometry shader */
4334    if (has_edgeflags(ctx, info->mode)) {
4335       perf_debug_ctx(ctx, "Using passthrough GS due to edge flags");
4336       return true;
4337    }
4338 
4339    /* Various pipeline statistics are implemented in the pre-GS shader. */
4340    if (ctx->pipeline_statistics[PIPE_STAT_QUERY_IA_PRIMITIVES] ||
4341        ctx->pipeline_statistics[PIPE_STAT_QUERY_C_PRIMITIVES] ||
4342        ctx->pipeline_statistics[PIPE_STAT_QUERY_C_INVOCATIONS]) {
4343       perf_debug_ctx(ctx, "Using passthrough GS due to pipeline statistics");
4344       return true;
4345    }
4346 
4347    /* Transform feedback is layered on geometry shaders, so if transform
4348     * feedback is used, we need a GS.
4349     */
4350    struct agx_uncompiled_shader *last_vtx =
4351       ctx->stage[PIPE_SHADER_TESS_EVAL].shader
4352          ?: ctx->stage[PIPE_SHADER_VERTEX].shader;
4353 
4354    if (last_vtx->has_xfb_info && ctx->streamout.num_targets) {
4355       *xfb_only = true;
4356       return true;
4357    }
4358 
4359    /* Otherwise, we don't need one */
4360    return false;
4361 }
4362 
4363 static enum mesa_prim
4364 agx_tess_output_prim(struct agx_uncompiled_shader *tcs,
4365                      struct agx_uncompiled_shader *tes)
4366 {
4367    if ((tcs && tcs->tess.point_mode) || tes->tess.point_mode) {
4368       return MESA_PRIM_POINTS;
4369    } else if (TESS_PRIMITIVE_ISOLINES ==
4370               MAX2(tcs ? tcs->tess.primitive : 0, tes->tess.primitive)) {
4371       return MESA_PRIM_LINES;
4372    } else {
4373       return MESA_PRIM_TRIANGLES;
4374    }
4375 }
4376 
4377 static struct agx_uncompiled_shader *
4378 agx_get_passthrough_gs(struct agx_context *ctx,
4379                        struct agx_uncompiled_shader *prev_cso,
4380                        enum mesa_prim mode, bool xfb_passthrough)
4381 {
4382    bool edgeflags = has_edgeflags(ctx, mode);
4383 
4384    if (mode == MESA_PRIM_PATCHES) {
4385       mode = agx_tess_output_prim(ctx->stage[MESA_SHADER_TESS_CTRL].shader,
4386                                   ctx->stage[MESA_SHADER_TESS_EVAL].shader);
4387    }
4388 
4389    /* Only handle the polygon mode when edge flags are in use, because
4390     * nir_passthrough_gs doesn't handle transform feedback + polygon mode
4391     * properly. Technically this can break edge flags + transform feedback
4392     * but that's firmly in "doctor, it hurts when I do this" territory, and
4393     * I'm not sure that's even possible to hit. TODO: Reevaluate.
4394     */
4395    unsigned poly_mode =
4396       edgeflags ? ctx->rast->base.fill_front : PIPE_POLYGON_MODE_FILL;
4397 
4398    if (prev_cso->passthrough_progs[mode][poly_mode][edgeflags])
4399       return prev_cso->passthrough_progs[mode][poly_mode][edgeflags];
4400 
4401    struct blob_reader reader;
4402    blob_reader_init(&reader, prev_cso->early_serialized_nir.data,
4403                     prev_cso->early_serialized_nir.size);
4404    nir_shader *prev = nir_deserialize(NULL, &agx_nir_options, &reader);
4405 
4406    nir_shader *gs = nir_create_passthrough_gs(
4407       &agx_nir_options, prev, mode, rast_prim(mode, poly_mode), edgeflags,
4408       false /* force line strip out */);
4409 
4410    ralloc_free(prev);
4411 
4412    struct agx_uncompiled_shader *cso = pipe_shader_from_nir(&ctx->base, gs);
4413    cso->is_xfb_passthrough = xfb_passthrough;
4414    prev_cso->passthrough_progs[mode][poly_mode][edgeflags] = cso;
4415    return cso;
4416 }
4417 
4418 static void
4419 agx_apply_passthrough_gs(struct agx_context *ctx,
4420                          const struct pipe_draw_info *info,
4421                          unsigned drawid_offset,
4422                          const struct pipe_draw_indirect_info *indirect,
4423                          const struct pipe_draw_start_count_bias *draws,
4424                          unsigned num_draws, bool xfb_passthrough)
4425 {
4426    enum pipe_shader_type prev_stage = ctx->stage[PIPE_SHADER_TESS_EVAL].shader
4427                                          ? PIPE_SHADER_TESS_EVAL
4428                                          : PIPE_SHADER_VERTEX;
4429    struct agx_uncompiled_shader *prev_cso = ctx->stage[prev_stage].shader;
4430 
4431    assert(ctx->stage[PIPE_SHADER_GEOMETRY].shader == NULL);
4432 
4433    /* Draw with passthrough */
4434    ctx->base.bind_gs_state(
4435       &ctx->base,
4436       agx_get_passthrough_gs(ctx, prev_cso, info->mode, xfb_passthrough));
4437    ctx->base.draw_vbo(&ctx->base, info, drawid_offset, indirect, draws,
4438                       num_draws);
4439    ctx->base.bind_gs_state(&ctx->base, NULL);
4440 }
4441 
4442 static void
4443 util_draw_multi_unroll_indirect(struct pipe_context *pctx,
4444                                 const struct pipe_draw_info *info,
4445                                 const struct pipe_draw_indirect_info *indirect,
4446                                 const struct pipe_draw_start_count_bias *draws)
4447 {
4448    for (unsigned i = 0; i < indirect->draw_count; ++i) {
4449       const struct pipe_draw_indirect_info subindirect = {
4450          .buffer = indirect->buffer,
4451          .count_from_stream_output = indirect->count_from_stream_output,
4452          .offset = indirect->offset + (i * indirect->stride),
4453          .draw_count = 1,
4454       };
4455 
4456       pctx->draw_vbo(pctx, info, i, &subindirect, draws, 1);
4457    }
4458 }
4459 
4460 static void
4461 util_draw_multi_upload_indirect(struct pipe_context *pctx,
4462                                 const struct pipe_draw_info *info,
4463                                 const struct pipe_draw_indirect_info *indirect,
4464                                 const struct pipe_draw_start_count_bias *draws)
4465 {
4466    struct pipe_draw_indirect_info indirect_ = *indirect;
4467    u_upload_data(pctx->const_uploader, 0, 4, 4, &indirect->draw_count,
4468                  &indirect_.indirect_draw_count_offset,
4469                  &indirect_.indirect_draw_count);
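   /* Uploading draw_count as a one-word GPU buffer turns this plain
    * multi-draw into an indirect_draw_count draw, so the draw_vbo path that
    * already handles count buffers consumes it.
    */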
4470 
4471    pctx->draw_vbo(pctx, info, 0, &indirect_, draws, 1);
4472 }
4473 
4474 static void
4475 agx_upload_draw_params(struct agx_batch *batch,
4476                        const struct pipe_draw_indirect_info *indirect,
4477                        const struct pipe_draw_start_count_bias *draws,
4478                        const struct pipe_draw_info *info)
4479 {
4480    if (indirect) {
4481       struct agx_resource *indirect_rsrc = agx_resource(indirect->buffer);
4482       uint64_t address = indirect_rsrc->bo->va->addr + indirect->offset;
4483       agx_batch_reads(batch, indirect_rsrc);
4484 
4485       /* To implement draw parameters, we use the last 2 words of the
4486        * indirect draw descriptor. Offset by 3 words for indexed draw (5
4487        * total) and 2 words for non-indexed (4 total).  See the layouts of
4488        * indexed vs non-indexed draw descriptors.
4489        *
4490        * This gives us a consistent layout
4491        *
4492        *    uint32_t first_vertex;
4493        *    uint32_t base_instance;
4494        *
4495        * and we can implement load_first_vertex & load_base_instance without
4496        * checking for indexing.
4497        */
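      /* For reference, a sketch of the two layouts in question (field names
       * are illustrative; only the word positions matter here):
       *
       *    non-indexed (4 words)        indexed (5 words)
       *    [0] count                    [0] count
       *    [1] instance_count           [1] instance_count
       *    [2] first_vertex             [2] first_index
       *    [3] base_instance            [3] base_vertex (first_vertex)
       *                                 [4] base_instance
       *
       * so an offset of 2 or 3 words respectively lands on the common
       * { first_vertex, base_instance } tail.
       */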
4498       uint32_t offset = info->index_size ? 3 : 2;
4499       batch->uniforms.tables[AGX_SYSVAL_TABLE_PARAMS] = address + offset * 4;
4500    } else {
4501       /* Upload just those two words. */
4502       uint32_t params[2] = {
4503          info->index_size ? draws->index_bias : draws->start,
4504          info->start_instance,
4505       };
4506 
4507       batch->uniforms.tables[AGX_SYSVAL_TABLE_PARAMS] =
4508          agx_pool_upload_aligned(&batch->pool, params, sizeof(params), 4);
4509    }
4510 }
4511 
4512 static void
4513 agx_draw_patches(struct agx_context *ctx, const struct pipe_draw_info *info,
4514                  unsigned drawid_offset,
4515                  const struct pipe_draw_indirect_info *indirect,
4516                  const struct pipe_draw_start_count_bias *draws,
4517                  unsigned num_draws)
4518 {
4519    struct agx_device *dev = agx_device(ctx->base.screen);
4520    perf_debug(dev, "Tessellation");
4521 
4522    struct agx_uncompiled_shader *tcs = ctx->stage[MESA_SHADER_TESS_CTRL].shader;
4523    struct agx_uncompiled_shader *tes = ctx->stage[MESA_SHADER_TESS_EVAL].shader;
4524 
4525    assert(tes != NULL && "required with patches");
4526 
4527    unsigned patch_vertices = ctx->patch_vertices;
4528 
4529    /* OpenGL allows omitting the tcs, fill in a passthrough program if needed.
4530     * In principle, we could optimize this case, but I don't think it matters.
4531     */
4532    bool unbind_tcs_when_done = false;
4533    if (!tcs) {
4534       struct agx_uncompiled_shader *vs = ctx->stage[MESA_SHADER_VERTEX].shader;
4535 
4536       assert(patch_vertices >= 1 &&
4537              patch_vertices <= ARRAY_SIZE(vs->passthrough_tcs));
4538 
4539       if (!vs->passthrough_tcs[patch_vertices - 1]) {
4540          struct blob_reader reader;
4541          blob_reader_init(&reader, vs->early_serialized_nir.data,
4542                           vs->early_serialized_nir.size);
4543          nir_shader *vs_nir = nir_deserialize(NULL, &agx_nir_options, &reader);
4544          nir_shader *nir = nir_create_passthrough_tcs(&agx_nir_options, vs_nir,
4545                                                       patch_vertices);
4546          ralloc_free(vs_nir);
4547 
4548          /* Lower the tess level sysvals and gather info, since mesa/st won't do
4549           * either for us.
4550           */
4551          NIR_PASS(_, nir, nir_lower_system_values);
4552 
4553          nir_shader_gather_info(nir, nir_shader_get_entrypoint(nir));
4554 
4555          vs->passthrough_tcs[patch_vertices - 1] =
4556             pipe_shader_from_nir(&ctx->base, nir);
4557       }
4558 
4559       tcs = vs->passthrough_tcs[patch_vertices - 1];
4560       ctx->base.bind_tcs_state(&ctx->base, tcs);
4561       unbind_tcs_when_done = true;
4562    }
4563 
4564    enum tess_primitive_mode mode =
4565       MAX2(tcs->tess.primitive, tes->tess.primitive);
4566    enum gl_tess_spacing spacing = MAX2(tcs->tess.spacing, tes->tess.spacing);
4567 
4568    enum pipe_tess_spacing pspacing = spacing == TESS_SPACING_EQUAL
4569                                         ? PIPE_TESS_SPACING_EQUAL
4570                                      : spacing == TESS_SPACING_FRACTIONAL_ODD
4571                                         ? PIPE_TESS_SPACING_FRACTIONAL_ODD
4572                                         : PIPE_TESS_SPACING_FRACTIONAL_EVEN;
4573 
4574    bool point_mode = MAX2(tcs->tess.point_mode, tes->tess.point_mode);
4575    enum mesa_prim out_prim = agx_tess_output_prim(tcs, tes);
4576 
4577    enum libagx_tess_partitioning partitioning =
4578       (enum libagx_tess_partitioning)pspacing;
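   /* The cast assumes enum libagx_tess_partitioning uses the same numerical
    * values as pipe_tess_spacing.
    */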
4579 
4580    enum libagx_tess_output_primitive prim =
4581       point_mode       ? LIBAGX_TESS_OUTPUT_POINT
4582       : !tes->tess.ccw ? LIBAGX_TESS_OUTPUT_TRIANGLE_CCW
4583                        : LIBAGX_TESS_OUTPUT_TRIANGLE_CW;
4584 
4585    struct agx_bo *draw_bo = NULL;
4586    bool with_counts =
4587       indirect || ctx->stage[MESA_SHADER_GEOMETRY].shader != NULL;
4588    size_t draw_stride =
4589       ((!with_counts && point_mode) ? 4 : 6) * sizeof(uint32_t);
4590 
4591    struct agx_batch *batch = agx_get_batch(ctx);
4592    agx_batch_init_state(batch);
4593 
4594    if (!batch->cdm.bo) {
4595       batch->cdm = agx_encoder_allocate(batch, dev);
4596    }
4597 
4598    uint64_t ib = 0;
4599    size_t ib_extent = 0;
4600 
4601    if (info->index_size)
4602       ib = agx_index_buffer_ptr(batch, info, draws, &ib_extent);
4603 
4604    struct agx_ia_state ia = {
4605       .index_buffer = ib,
4606       .index_buffer_range_el = ib_extent,
4607       .verts_per_instance = draws ? draws->count : 0,
4608    };
4609 
4610    batch->uniforms.input_assembly =
4611       agx_pool_upload_aligned(&batch->pool, &ia, sizeof(ia), 8);
4612 
4613    agx_upload_draw_params(batch, indirect, draws, info);
4614 
4615    /* Setup parameters */
4616    uint64_t geom_state = agx_batch_geometry_state(batch);
4617    assert((tcs->tess.output_stride & 3) == 0 && "must be aligned");
4618 
4619    struct libagx_tess_args args = {
4620       .heap = geom_state,
4621       .tcs_stride_el = tcs->tess.output_stride / 4,
4622       .statistic = agx_get_query_address(
4623          batch, ctx->pipeline_statistics[PIPE_STAT_QUERY_DS_INVOCATIONS]),
4624       .input_patch_size = patch_vertices,
4625       .output_patch_size = tcs->tess.output_patch_size,
4626       .tcs_patch_constants = tcs->tess.nr_patch_outputs,
4627       .tcs_per_vertex_outputs = tcs->tess.per_vertex_outputs,
4628       .patch_coord_buffer = agx_resource(ctx->heap)->bo->va->addr,
4629    };
4630 
4631    memcpy(&args.tess_level_outer_default, ctx->default_outer_level,
4632           sizeof(ctx->default_outer_level));
4633    memcpy(&args.tess_level_inner_default, ctx->default_inner_level,
4634           sizeof(ctx->default_inner_level));
4635 
4636    struct agx_grid vs_grid, tcs_grid, tess_grid;
4637    unsigned tess_wg_size = 64;
4638 
4639    agx_upload_vbos(batch);
4640    agx_update_vs(ctx, info->index_size);
4641    agx_update_tcs(ctx, info);
4642    /* XXX */
4643    ctx->stage[PIPE_SHADER_TESS_CTRL].dirty = ~0;
4644    ctx->stage[PIPE_SHADER_TESS_EVAL].dirty = ~0;
4645    agx_update_descriptors(batch, ctx->vs);
4646    agx_update_descriptors(batch, ctx->tcs);
4647    agx_batch_add_bo(batch, ctx->vs->bo);
4648    agx_batch_add_bo(batch, ctx->linked.vs->bo);
4649 
4650    batch->uniforms.vertex_outputs = ctx->vs->b.info.outputs;
4651 
4652    if (indirect == NULL) {
4653       unsigned in_patches = draws->count / patch_vertices;
4654       if (in_patches == 0)
4655          return;
4656 
4657       /* The TCS invocation counter increments once per patch */
4658       agx_query_increment_cpu(
4659          ctx, ctx->pipeline_statistics[PIPE_STAT_QUERY_HS_INVOCATIONS],
4660          in_patches);
4661 
4662       unsigned unrolled_patches = in_patches * info->instance_count;
4663 
4664       uint32_t alloc = 0;
4665       uint32_t tcs_out_offs = alloc;
4666       alloc += unrolled_patches * tcs->tess.output_stride;
4667 
4668       uint32_t patch_coord_offs = alloc;
4669       alloc += unrolled_patches * 4;
4670 
4671       uint32_t count_offs = alloc;
4672       if (with_counts)
4673          alloc += unrolled_patches * sizeof(uint32_t);
4674 
4675       uint32_t draw_offs = alloc;
4676 
4677       if (with_counts) {
4678          alloc += draw_stride;
4679       } else {
4680          /* Padding added because VDM overreads */
4681          alloc +=
4682             (draw_stride * unrolled_patches) + (AGX_VDM_BARRIER_LENGTH + 0x800);
4683       }
4684 
4685       struct agx_ptr blob =
4686          agx_pool_alloc_aligned_with_bo(&batch->pool, alloc, 4, &draw_bo);
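      /* Rough layout of that single allocation, using the offsets computed
       * above:
       *
       *    tcs_out_offs:     TCS outputs, output_stride per unrolled patch
       *    patch_coord_offs: one 32-bit coord allocation per unrolled patch
       *    count_offs:       per-patch index counts (with_counts only)
       *    draw_offs:        generated draw(s), plus VDM padding otherwise
       */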
4687 
4688       args.tcs_buffer = blob.gpu + tcs_out_offs;
4689       args.patches_per_instance = in_patches;
4690       args.coord_allocs = blob.gpu + patch_coord_offs;
4691       args.nr_patches = unrolled_patches;
4692       args.out_draws = blob.gpu + draw_offs;
4693 
4694       if (with_counts) {
4695          args.counts = blob.gpu + count_offs;
4696       } else {
4697          /* Arrange so we return after all generated draws */
4698          uint8_t *ret =
4699             (uint8_t *)blob.cpu + draw_offs + (draw_stride * unrolled_patches);
4700 
4701          agx_pack(ret, VDM_BARRIER, cfg) {
4702             cfg.returns = true;
4703          }
4704       }
4705 
4706       unsigned vb_size = libagx_tcs_in_size(draws->count * info->instance_count,
4707                                             batch->uniforms.vertex_outputs);
4708       uint64_t addr = agx_pool_alloc_aligned(&batch->pool, vb_size, 4).gpu;
4709       batch->uniforms.vertex_output_buffer_ptr =
4710          agx_pool_upload(&batch->pool, &addr, 8);
4711 
4712       vs_grid =
4713          agx_grid_direct(draws->count, info->instance_count, 1, 64, 1, 1);
4714 
4715       tcs_grid = agx_grid_direct(in_patches * tcs->tess.output_patch_size,
4716                                  info->instance_count, 1,
4717                                  tcs->tess.output_patch_size, 1, 1);
4718 
4719       tess_grid = agx_grid_direct(unrolled_patches, 1, 1, tess_wg_size, 1, 1);
4720    } else if (indirect) {
4721       args.tcs_statistic = agx_get_query_address(
4722          batch, ctx->pipeline_statistics[PIPE_STAT_QUERY_HS_INVOCATIONS]);
4723 
4724       args.indirect = agx_indirect_buffer_ptr(batch, indirect);
4725 
4726       /* Allocate 3x indirect global+local grids for VS/TCS/tess */
4727       uint32_t grid_stride = sizeof(uint32_t) * 6;
4728       args.grids = agx_pool_alloc_aligned(&batch->pool, grid_stride * 3, 4).gpu;
4729 
4730       vs_grid = agx_grid_indirect_local(args.grids + 0 * grid_stride);
4731       tcs_grid = agx_grid_indirect_local(args.grids + 1 * grid_stride);
4732       tess_grid = agx_grid_indirect_local(args.grids + 2 * grid_stride);
4733 
4734       args.vertex_outputs = ctx->vs->b.info.outputs;
4735       args.vertex_output_buffer_ptr =
4736          agx_pool_alloc_aligned(&batch->pool, 8, 8).gpu;
4737 
4738       batch->uniforms.vertex_output_buffer_ptr = args.vertex_output_buffer_ptr;
4739 
4740       if (with_counts) {
4741          args.out_draws = agx_pool_alloc_aligned_with_bo(
4742                              &batch->pool, draw_stride, 4, &draw_bo)
4743                              .gpu;
4744       } else {
4745          unreachable("need an extra indirection...");
4746       }
4747    }
4748 
4749    uint64_t state =
4750       agx_pool_upload_aligned(&batch->pool, &args, sizeof(args), 4);
4751 
4752    if (indirect) {
4753       const struct agx_grid indirect_grid = agx_grid_direct(1, 1, 1, 1, 1, 1);
4754       struct agx_tess_setup_indirect_key indirect_key = {
4755          .point_mode = point_mode,
4756          .with_counts = with_counts,
4757       };
4758 
4759       agx_launch_with_uploaded_data(batch, &indirect_grid,
4760                                     agx_nir_tess_setup_indirect, &indirect_key,
4761                                     sizeof(indirect_key), state);
4762    }
4763 
4764    batch->uniforms.tess_params = state;
4765 
4766    agx_launch(batch, &vs_grid, ctx->vs, ctx->linked.vs, PIPE_SHADER_VERTEX, 0);
4767    agx_launch(batch, &tcs_grid, ctx->tcs, NULL, PIPE_SHADER_TESS_CTRL, 0);
4768    batch->uniforms.vertex_output_buffer_ptr = 0;
4769 
4770    struct agx_tessellator_key key = {
4771       .prim = mode,
4772       .output_primitive = prim,
4773       .partitioning = partitioning,
4774    };
4775 
4776    if (with_counts) {
4777       /* Generate counts */
4778       key.mode = LIBAGX_TESS_MODE_COUNT;
4779       agx_launch_with_uploaded_data(batch, &tess_grid, agx_nir_tessellate, &key,
4780                                     sizeof(key), state);
4781 
4782       /* Prefix sum counts, allocating index buffer space. */
4783       const struct agx_grid prefix_sum_grid =
4784          agx_grid_direct(1024, 1, 1, 1024, 1, 1);
4785 
4786       agx_launch_with_uploaded_data(batch, &prefix_sum_grid,
4787                                     agx_nir_prefix_sum_tess, NULL, 0, state);
4788 
4789       key.mode = LIBAGX_TESS_MODE_WITH_COUNTS;
4790    } else {
4791       key.mode = LIBAGX_TESS_MODE_VDM;
4792    }
4793 
4794    /* Now we can tessellate */
4795    agx_launch_with_uploaded_data(batch, &tess_grid, agx_nir_tessellate, &key,
4796                                  sizeof(key), state);
4797 
4798    /* Run TES as VS */
4799    void *vs_cso = ctx->stage[PIPE_SHADER_VERTEX].shader;
4800    void *tes_cso = ctx->stage[PIPE_SHADER_TESS_EVAL].shader;
4801    ctx->base.bind_vs_state(&ctx->base, tes_cso);
4802    ctx->in_tess = true;
4803    ctx->in_generated_vdm = !with_counts;
4804 
4805    struct pipe_draw_info draw_info = {
4806       .mode = out_prim,
4807       .index_size = with_counts ? 4 : (point_mode ? 0 : 2),
4808       .index.resource = (!with_counts && point_mode) ? NULL : ctx->heap,
4809       .instance_count = 1,
4810       .view_mask = info->view_mask,
4811    };
4812 
4813    /* Wrap the pool allocation in a fake resource for meta-Gallium use */
4814    struct agx_resource indirect_rsrc = {.bo = draw_bo};
4815 
4816    struct pipe_draw_indirect_info copy_indirect = {
4817       .buffer = &indirect_rsrc.base,
4818       .offset = args.out_draws - draw_bo->va->addr,
4819       .stride = draw_stride,
4820       .draw_count = 1,
4821    };
4822 
4823    ctx->base.draw_vbo(&ctx->base, &draw_info, 0, &copy_indirect, NULL, 1);
4824 
4825    /* Restore vertex state */
4826    ctx->base.bind_vs_state(&ctx->base, vs_cso);
4827    ctx->in_generated_vdm = false;
4828    ctx->in_tess = false;
4829 
4830    if (unbind_tcs_when_done) {
4831       ctx->base.bind_tcs_state(&ctx->base, NULL);
4832    }
4833 }
4834 
4835 /*
4836  * From the ARB_texture_barrier spec:
4837  *
4838  *  Specifically, the values of rendered fragments are undefined if any
4839  *  shader stage fetches texels and the same texels are written via fragment
4840  *  shader outputs, even if the reads and writes are not in the same Draw
4841  *  call, unless any of the following exceptions apply:
4842  *
4843  *  - The reads and writes are from/to disjoint sets of texels (after
4844  *    accounting for texture filtering rules).
4845  *
4846  *  - There is only a single read and write of each texel, and the read is in
4847  *    the fragment shader invocation that writes the same texel (e.g. using
4848  *    "texelFetch2D(sampler, ivec2(gl_FragCoord.xy), 0);").
4849  *
4850  *  - If a texel has been written, then in order to safely read the result
4851  *    a texel fetch must be in a subsequent Draw separated by the command
4852  *
4853  *      void TextureBarrier(void);
4854  *
4855  *    TextureBarrier() will guarantee that writes have completed and caches
4856  *    have been invalidated before subsequent Draws are executed."
4857  *
4858  * The wording is subtle, but we are not required to flush implicitly for
4859  * feedback loops, even though we're a tiler. What we are required to do is
4860  * decompress framebuffers involved in feedback loops, because otherwise
4861  * the hardware will race itself with exception #1, where we have a disjoint
4862  * group texels that intersects a compressed tile being written out.
4863  */
4864 static void
4865 agx_legalize_feedback_loops(struct agx_context *ctx)
4866 {
4867    /* Trust that u_blitter knows what it's doing */
4868    if (ctx->blitter->running)
4869       return;
4870 
4871    for (unsigned stage = 0; stage < ARRAY_SIZE(ctx->stage); ++stage) {
4872       if (!(ctx->stage[stage].dirty & AGX_STAGE_DIRTY_IMAGE))
4873          continue;
4874 
4875       for (unsigned i = 0; i < ctx->stage[stage].texture_count; ++i) {
4876          if (!ctx->stage[stage].textures[i])
4877             continue;
4878 
4879          struct agx_resource *rsrc = ctx->stage[stage].textures[i]->rsrc;
4880 
4881          for (unsigned cb = 0; cb < ctx->framebuffer.nr_cbufs; ++cb) {
4882             if (ctx->framebuffer.cbufs[cb] &&
4883                 agx_resource(ctx->framebuffer.cbufs[cb]->texture) == rsrc) {
4884 
4885                if (rsrc->layout.tiling == AIL_TILING_TWIDDLED_COMPRESSED) {
4886                   /* Decompress if we can and shadow if we can't. */
4887                   if (rsrc->base.bind & PIPE_BIND_SHARED)
4888                      unreachable("TODO");
4889                   else
4890                      agx_decompress(ctx, rsrc, "Texture feedback loop");
4891                }
4892 
4893                /* Not required by the spec, just for debug */
4894                if (agx_device(ctx->base.screen)->debug & AGX_DBG_FEEDBACK)
4895                   agx_flush_writer(ctx, rsrc, "Feedback loop");
4896             }
4897          }
4898       }
4899    }
4900 }
4901 
4902 static void
4903 agx_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info,
4904              unsigned drawid_offset,
4905              const struct pipe_draw_indirect_info *indirect,
4906              const struct pipe_draw_start_count_bias *draws, unsigned num_draws)
4907 {
4908    struct agx_context *ctx = agx_context(pctx);
4909 
4910    if (unlikely(!agx_render_condition_check(ctx)))
4911       return;
4912 
4913    if (num_draws > 1) {
4914       util_draw_multi(pctx, info, drawid_offset, indirect, draws, num_draws);
4915       return;
4916    }
4917 
4918    if (indirect && indirect->draw_count > 1 && !indirect->indirect_draw_count) {
4919       assert(drawid_offset == 0);
4920       assert(num_draws == 1);
4921 
4922       util_draw_multi_unroll_indirect(pctx, info, indirect, draws);
4923       return;
4924    }
4925 
4926    if (indirect && indirect->count_from_stream_output) {
4927       agx_draw_vbo_from_xfb(pctx, info, drawid_offset, indirect);
4928       return;
4929    }
4930 
4931    /* TODO: stop cheating */
4932    if (indirect && indirect->indirect_draw_count) {
4933       perf_debug_ctx(ctx, "multi-draw indirect");
4934       util_draw_indirect(pctx, info, drawid_offset, indirect);
4935       return;
4936    }
4937 
4938    bool xfb_passthrough = false;
4939    if (agx_needs_passthrough_gs(ctx, info, indirect, &xfb_passthrough)) {
4940       agx_apply_passthrough_gs(ctx, info, drawid_offset, indirect, draws,
4941                                num_draws, xfb_passthrough);
4942       return;
4943    }
4944 
4945    if (info->mode == MESA_PRIM_PATCHES) {
4946       agx_draw_patches(ctx, info, drawid_offset, indirect, draws, num_draws);
4947       return;
4948    }
4949 
4950    agx_legalize_feedback_loops(ctx);
4951 
4952    /* Only the rasterization stream counts */
4953    if (ctx->active_queries && ctx->prims_generated[0] &&
4954        !ctx->stage[PIPE_SHADER_GEOMETRY].shader) {
4955 
4956       assert(!indirect && "we force a passthrough GS for this");
4957       agx_primitives_update_direct(ctx, info, draws);
4958    }
4959 
4960    struct agx_batch *batch = agx_get_batch(ctx);
4961 
4962    uint64_t ib = 0;
4963    size_t ib_extent = 0;
4964 
4965    if (info->index_size) {
4966       ib =
4967          agx_index_buffer_ptr(batch, info, indirect ? NULL : draws, &ib_extent);
4968    }
4969 
4970    if (ctx->active_queries && !ctx->active_draw_without_restart &&
4971        (ctx->pipeline_statistics[PIPE_STAT_QUERY_IA_VERTICES] ||
4972         ctx->pipeline_statistics[PIPE_STAT_QUERY_VS_INVOCATIONS])) {
4973 
4974       uint64_t ptr;
4975       if (indirect) {
4976          ptr = agx_indirect_buffer_ptr(batch, indirect);
4977       } else {
4978          uint32_t desc[] = {draws->count, info->instance_count, 0};
4979          ptr = agx_pool_upload(&batch->pool, &desc, sizeof(desc));
4980       }
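      /* Either way, the first words at ptr form a (count, instance count)
       * pair, so agx_ia_update treats direct and indirect draws uniformly.
       */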
4981 
4982       agx_ia_update(batch, info, ptr, ib,
4983                     info->index_size ? ib_extent / info->index_size : 1);
4984    }
4985 
4986    if (ctx->stage[PIPE_SHADER_GEOMETRY].shader && info->primitive_restart &&
4987        info->index_size) {
4988 
4989       agx_draw_without_restart(batch, info, drawid_offset, indirect, draws);
4990       return;
4991    }
4992 
4993    agx_batch_add_timestamp_query(batch, ctx->time_elapsed);
4994 
4995 #ifndef NDEBUG
4996    if (unlikely(agx_device(pctx->screen)->debug & AGX_DBG_DIRTY))
4997       agx_dirty_all(ctx);
4998 #endif
4999 
5000    agx_batch_init_state(batch);
5001 
5002    /* Dirty track the reduced prim: lines vs points vs triangles. Happens before
5003     * agx_update_vs/agx_update_fs, which specialize based on primitive.
5004     */
5005    enum mesa_prim reduced_prim = u_reduced_prim(info->mode);
5006    if (reduced_prim != batch->reduced_prim)
5007       ctx->dirty |= AGX_DIRTY_PRIM;
5008    batch->reduced_prim = reduced_prim;
5009 
5010    /* Update shaders first so we can use them after */
5011    if (agx_update_vs(ctx, info->index_size)) {
5012       ctx->dirty |= AGX_DIRTY_VS | AGX_DIRTY_VS_PROG;
5013       ctx->stage[PIPE_SHADER_VERTEX].dirty = ~0;
5014 
5015       agx_batch_add_bo(batch, ctx->vs->bo);
5016       if (ctx->linked.vs)
5017          agx_batch_add_bo(batch, ctx->linked.vs->bo);
5018    } else if (ctx->stage[PIPE_SHADER_VERTEX].dirty ||
5019               (ctx->dirty & AGX_DIRTY_VERTEX))
5020       ctx->dirty |= AGX_DIRTY_VS;
5021 
5022    agx_update_gs(ctx, info, indirect);
5023 
5024    if (ctx->gs) {
5025       batch->geom_indirect = agx_pool_alloc_aligned_with_bo(
5026                                 &batch->pool, 64, 4, &batch->geom_indirect_bo)
5027                                 .gpu;
5028 
5029       batch->uniforms.geometry_params =
5030          agx_batch_geometry_params(batch, ib, ib_extent, info, draws, indirect);
5031 
5032       agx_batch_add_bo(batch, ctx->gs->bo);
5033       agx_batch_add_bo(batch, ctx->gs->gs_copy->bo);
5034    }
5035 
5036    if (ctx->dirty & (AGX_DIRTY_VS_PROG | AGX_DIRTY_FS_PROG)) {
5037       struct agx_compiled_shader *vs = ctx->vs;
5038       if (ctx->gs)
5039          vs = ctx->gs->gs_copy;
5040 
5041       agx_assign_uvs(
5042          &batch->linked_varyings, &vs->uvs,
5043          ctx->stage[PIPE_SHADER_FRAGMENT].shader->info.inputs_flat_shaded,
5044          ctx->stage[PIPE_SHADER_FRAGMENT].shader->info.inputs_linear_shaded);
5045 
5046       for (unsigned i = 0; i < VARYING_SLOT_MAX; ++i) {
5047          batch->uniforms.uvs_index[i] = batch->linked_varyings.slots[i];
5048       }
5049    }
5050 
5051    /* Set draw ID */
5052    if (ctx->vs->b.info.uses_draw_id) {
5053       batch->uniforms.draw_id = drawid_offset;
5054 
5055       ctx->dirty |= AGX_DIRTY_VS;
5056    }
5057 
5058    if (agx_update_fs(batch)) {
5059       ctx->dirty |= AGX_DIRTY_FS | AGX_DIRTY_FS_PROG;
5060       ctx->stage[PIPE_SHADER_FRAGMENT].dirty = ~0;
5061 
5062       if (ctx->fs->bo)
5063          agx_batch_add_bo(batch, ctx->fs->bo);
5064 
5065       agx_batch_add_bo(batch, ctx->linked.fs->bo);
5066    } else if ((ctx->stage[PIPE_SHADER_FRAGMENT].dirty) ||
5067               (ctx->dirty & (AGX_DIRTY_BLEND_COLOR | AGX_DIRTY_SAMPLE_MASK))) {
5068       ctx->dirty |= AGX_DIRTY_FS;
5069    }
5070 
5071    if (ctx->linked.vs->uses_base_param || ctx->gs) {
5072       agx_upload_draw_params(batch, indirect, draws, info);
5073 
5074       batch->uniforms.is_indexed_draw = (info->index_size > 0);
5075       ctx->dirty |= AGX_DIRTY_VS;
5076    }
5077 
5078    agx_update_descriptors(batch, ctx->vs);
5079    agx_update_descriptors(batch, ctx->gs);
5080    agx_update_descriptors(batch, ctx->fs);
5081 
5082    if (IS_DIRTY(VS) || IS_DIRTY(FS) || ctx->gs || IS_DIRTY(VERTEX) ||
5083        IS_DIRTY(BLEND_COLOR) || IS_DIRTY(QUERY) || IS_DIRTY(POLY_STIPPLE) ||
5084        IS_DIRTY(RS) || IS_DIRTY(PRIM) || ctx->in_tess) {
5085 
5086       if (IS_DIRTY(VERTEX)) {
5087          agx_upload_vbos(batch);
5088       }
5089 
5090       if (IS_DIRTY(BLEND_COLOR)) {
5091          memcpy(batch->uniforms.blend_constant, &ctx->blend_color,
5092                 sizeof(ctx->blend_color));
5093       }
5094 
5095       if (IS_DIRTY(RS)) {
5096          struct pipe_rasterizer_state *rs = &ctx->rast->base;
5097 
5098          batch->uniforms.fixed_point_size =
5099             rs->point_size_per_vertex ? 0.0 : rs->point_size;
5100 
5101          /* TODO: tri fans */
5102          batch->uniforms.provoking_vertex = !rs->flatshade_first ? 2 : 0;
5103       }
5104 
5105       if (IS_DIRTY(QUERY)) {
5106          for (unsigned i = 0; i < ARRAY_SIZE(ctx->pipeline_statistics); ++i) {
5107             struct agx_query *query = ctx->pipeline_statistics[i];
5108             batch->uniforms.pipeline_statistics[i] =
5109                agx_get_query_address(batch, query);
5110          }
5111       }
5112 
5113       if (IS_DIRTY(POLY_STIPPLE)) {
5114          STATIC_ASSERT(sizeof(ctx->poly_stipple) == 32 * 4);
5115 
5116          batch->uniforms.polygon_stipple = agx_pool_upload_aligned(
5117             &batch->pool, ctx->poly_stipple, sizeof(ctx->poly_stipple), 4);
5118       }
5119 
5120       agx_upload_uniforms(batch);
5121    }
5122 
5123    struct pipe_draw_info info_gs;
5124    struct pipe_draw_indirect_info indirect_gs;
5125 
5126    /* Wrap the pool allocation in a fake resource for meta-Gallium use */
5127    struct agx_resource indirect_rsrc = {.bo = batch->geom_indirect_bo};
5128 
5129    if (ctx->gs) {
5130       /* Launch the pre-rasterization parts of the geometry shader */
5131       agx_launch_gs_prerast(batch, info, draws, indirect);
5132 
5133       if (ctx->rast->base.rasterizer_discard)
5134          return;
5135 
5136       /* Setup to rasterize the GS results */
5137       info_gs = (struct pipe_draw_info){
5138          .mode = ctx->gs->gs_output_mode,
5139          .index_size = 4,
5140          .primitive_restart = ctx->gs->gs_output_mode != MESA_PRIM_POINTS,
5141          .restart_index = ~0,
5142          .index.resource = ctx->heap,
5143          .instance_count = 1,
5144          .view_mask = info->view_mask,
5145       };
5146 
5147       indirect_gs = (struct pipe_draw_indirect_info){
5148          .draw_count = 1,
5149          .buffer = &indirect_rsrc.base,
5150          .offset = batch->geom_indirect - indirect_rsrc.bo->va->addr,
5151       };
5152 
5153       info = &info_gs;
5154       indirect = &indirect_gs;
5155 
5156       /* TODO: Deduplicate? */
5157       batch->reduced_prim = u_reduced_prim(info->mode);
5158       ctx->dirty |= AGX_DIRTY_PRIM;
5159 
5160       if (info_gs.index_size) {
5161          ib = agx_resource(ctx->heap)->bo->va->addr;
5162          ib_extent = agx_resource(ctx->heap)->bo->size;
5163       } else {
5164          ib = 0;
5165          ib_extent = 0;
5166       }
5167 
5168       /* We need to reemit geometry descriptors since the txf sampler may change
5169        * between the GS prepass and the GS rast program.
5170        */
5171       agx_update_descriptors(batch, ctx->gs->gs_copy);
5172    }
5173 
5174    assert((!indirect || !indirect->indirect_draw_count) && "multidraw handled");
5175 
5176    /* Update batch masks based on current state */
5177    if (ctx->dirty & AGX_DIRTY_BLEND) {
5178       /* TODO: Any point to tracking load? */
5179       batch->draw |= ctx->blend->store;
5180       batch->resolve |= ctx->blend->store;
5181    }
5182 
5183    if (ctx->dirty & AGX_DIRTY_ZS) {
5184       batch->load |= ctx->zs->load;
5185       batch->draw |= ctx->zs->store;
5186       batch->resolve |= ctx->zs->store;
5187    }
5188 
5189    /* When we approach the end of a command buffer, cycle it out for a new one.
5190     * We only need to do this once per draw as long as we conservatively
5191     * estimate the maximum bytes of VDM commands that this draw will emit.
5192     */
5193    agx_ensure_cmdbuf_has_space(
5194       batch, &batch->vdm,
5195       (AGX_VDM_STATE_LENGTH * 2) + (AGX_PPP_STATE_LENGTH * MAX_PPP_UPDATES) +
5196          AGX_VDM_STATE_RESTART_INDEX_LENGTH +
5197          AGX_VDM_STATE_VERTEX_SHADER_WORD_0_LENGTH +
5198          AGX_VDM_STATE_VERTEX_SHADER_WORD_1_LENGTH +
5199          AGX_VDM_STATE_VERTEX_OUTPUTS_LENGTH +
5200          AGX_VDM_STATE_VERTEX_UNKNOWN_LENGTH + 4 /* padding */ +
5201          AGX_INDEX_LIST_LENGTH + AGX_INDEX_LIST_BUFFER_LO_LENGTH +
5202          AGX_INDEX_LIST_COUNT_LENGTH + AGX_INDEX_LIST_INSTANCES_LENGTH +
5203          AGX_INDEX_LIST_START_LENGTH + AGX_INDEX_LIST_BUFFER_SIZE_LENGTH);
5204 
5205    uint8_t *out = agx_encode_state(batch, batch->vdm.current);
5206 
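   /* If the tessellator generated the VDM commands on the GPU (see
    * agx_draw_patches), link out to them and return, rather than encoding an
    * index list here.
    */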
5207    if (ctx->in_generated_vdm) {
5208       struct agx_resource *indirect_rsrc = agx_resource(indirect->buffer);
5209       uint64_t address = indirect_rsrc->bo->va->addr + indirect->offset;
5210 
5211       agx_push(out, VDM_STREAM_LINK, cfg) {
5212          cfg.target_lo = address & BITFIELD_MASK(32);
5213          cfg.target_hi = address >> 32;
5214          cfg.with_return = true;
5215       }
5216    } else {
5217 
5218       if (info->index_size && info->primitive_restart) {
5219          agx_push(out, VDM_STATE, cfg)
5220             cfg.restart_index_present = true;
5221 
5222          agx_push(out, VDM_STATE_RESTART_INDEX, cfg)
5223             cfg.value = info->restart_index;
5224       }
5225 
5226       agx_push(out, INDEX_LIST, cfg) {
5227          cfg.primitive = agx_primitive_for_pipe(info->mode);
5228 
5229          if (indirect != NULL) {
5230             cfg.indirect_buffer_present = true;
5231          } else {
5232             cfg.instance_count_present = true;
5233             cfg.index_count_present = true;
5234             cfg.start_present = true;
5235          }
5236 
5237          if (info->index_size) {
5238             cfg.restart_enable = info->primitive_restart;
5239             cfg.index_buffer_hi = (ib >> 32);
5240             cfg.index_size = agx_translate_index_size(info->index_size);
5241             cfg.index_buffer_present = true;
5242             cfg.index_buffer_size_present = true;
5243          }
5244       }
5245 
5246       if (info->index_size) {
5247          agx_push(out, INDEX_LIST_BUFFER_LO, cfg) {
5248             cfg.buffer_lo = ib & BITFIELD_MASK(32);
5249          }
5250       }
5251 
5252       if (indirect) {
5253          struct agx_resource *indirect_rsrc = agx_resource(indirect->buffer);
5254          uint64_t address = indirect_rsrc->bo->va->addr + indirect->offset;
5255 
5256          agx_push(out, INDEX_LIST_INDIRECT_BUFFER, cfg) {
5257             cfg.address_hi = address >> 32;
5258             cfg.address_lo = address & BITFIELD_MASK(32);
5259          }
5260       } else {
5261          agx_push(out, INDEX_LIST_COUNT, cfg)
5262             cfg.count = draws->count;
5263 
5264          agx_push(out, INDEX_LIST_INSTANCES, cfg)
5265             cfg.count = info->instance_count;
5266 
5267          agx_push(out, INDEX_LIST_START, cfg) {
5268             cfg.start = info->index_size ? draws->index_bias : draws->start;
5269          }
5270       }
5271 
5272       if (info->index_size) {
5273          agx_push(out, INDEX_LIST_BUFFER_SIZE, cfg) {
5274             cfg.size = ib_extent;
5275          }
5276       }
5277    }
5278 
5279    batch->vdm.current = out;
5280    assert((batch->vdm.current + AGX_VDM_STREAM_LINK_LENGTH) <= batch->vdm.end &&
5281           "Failed to reserve sufficient space in encoder");
5282    agx_dirty_reset_graphics(ctx);
5283 
5284    assert(batch == agx_get_batch(ctx) && "batch should not change under us");
5285 
5286    batch->draws++;
5287 
5288    /* The scissor/zbias arrays are indexed with 16-bit integers, imposing a
5289     * maximum of UINT16_MAX descriptors. Flush if the next draw would overflow.
5290     */
5291    if (unlikely(
5292           (((batch->scissor.size / AGX_SCISSOR_LENGTH) + AGX_MAX_VIEWPORTS) >
5293            UINT16_MAX) ||
5294           (batch->depth_bias.size / AGX_DEPTH_BIAS_LENGTH) >= UINT16_MAX)) {
5295       agx_flush_batch_for_reason(ctx, batch, "Scissor/depth bias overflow");
5296    } else if (unlikely(batch->draws > 100000)) {
5297       /* Mostly so drawoverhead doesn't OOM */
5298       agx_flush_batch_for_reason(ctx, batch, "Absurd number of draws");
5299    } else if (unlikely(batch->sampler_heap.count >
5300                        (AGX_SAMPLER_HEAP_SIZE - (PIPE_MAX_SAMPLERS * 6)))) {
5301       agx_flush_batch_for_reason(ctx, batch, "Sampler heap overflow");
5302    }
5303 }
5304 
5305 static void
5306 agx_texture_barrier(struct pipe_context *pipe, unsigned flags)
5307 {
5308    struct agx_context *ctx = agx_context(pipe);
5309 
5310    /* Framebuffer fetch is coherent, so barriers are a no-op. */
5311    if (flags == PIPE_TEXTURE_BARRIER_FRAMEBUFFER)
5312       return;
5313 
5314    agx_flush_all(ctx, "Texture barrier");
5315 }
5316 
5317 void
5318 agx_launch_internal(struct agx_batch *batch, const struct agx_grid *grid,
5319                     struct agx_compiled_shader *cs, enum pipe_shader_type stage,
5320                     uint32_t usc)
5321 {
5322    struct agx_context *ctx = batch->ctx;
5323    struct agx_device *dev = agx_device(ctx->base.screen);
5324 
5325    /* TODO: Ensure space if we allow multiple kernels in a batch */
5326    uint8_t *out = batch->cdm.current;
5327 
5328    agx_push(out, CDM_LAUNCH_WORD_0, cfg) {
5329       cfg.mode = grid->mode;
5330       cfg.uniform_register_count = cs->b.info.push_count;
5331       cfg.preshader_register_count = cs->b.info.nr_preamble_gprs;
5332       cfg.texture_state_register_count = agx_nr_tex_descriptors(batch, cs);
5333       cfg.sampler_state_register_count =
5334          translate_sampler_state_count(ctx, cs, stage);
5335    }
5336 
5337    agx_push(out, CDM_LAUNCH_WORD_1, cfg) {
5338       cfg.pipeline = usc;
5339    }
5340 
5341    /* Added in G14X */
5342    if (dev->params.gpu_generation >= 14 && dev->params.num_clusters_total > 1) {
5343       agx_push(out, CDM_UNK_G14X, cfg)
5344          ;
5345    }
5346 
5347    if (grid->mode == AGX_CDM_MODE_DIRECT) {
5348       agx_push(out, CDM_GLOBAL_SIZE, cfg) {
5349          cfg.x = grid->global[0];
5350          cfg.y = grid->global[1];
5351          cfg.z = grid->global[2];
5352       }
5353    } else {
5354       agx_push(out, CDM_INDIRECT, cfg) {
5355          cfg.address_hi = grid->indirect >> 32;
5356          cfg.address_lo = grid->indirect & BITFIELD64_MASK(32);
5357       }
5358    }
5359 
5360    if (grid->mode != AGX_CDM_MODE_INDIRECT_LOCAL) {
5361       agx_push(out, CDM_LOCAL_SIZE, cfg) {
5362          cfg.x = grid->local[0];
5363          cfg.y = grid->local[1];
5364          cfg.z = grid->local[2];
5365       }
5366    }
5367 
5368    agx_push(out, CDM_BARRIER, cfg) {
5369       cfg.unk_5 = true;
5370       cfg.unk_6 = true;
5371       cfg.unk_8 = true;
5372       // cfg.unk_11 = true;
5373       // cfg.unk_20 = true;
5374       if (dev->params.num_clusters_total > 1) {
5375          // cfg.unk_24 = true;
5376          if (dev->params.gpu_generation == 13) {
5377             cfg.unk_4 = true;
5378             // cfg.unk_26 = true;
5379          }
5380       }
5381 
5382       /* With multiple launches in the same CDM stream, we can get cache
5383        * coherency (? or sync?) issues. We hit this with blits, which, in between
5384        * dispatches, need the PBE cache to be flushed and the texture
5385        * cache to be invalidated. Until we know what bits mean what exactly,
5386        * let's just set these after every launch to be safe. We can revisit in
5387        * the future when we figure out what the bits mean.
5388        */
5389       cfg.unk_0 = true;
5390       cfg.unk_1 = true;
5391       cfg.unk_2 = true;
5392       cfg.usc_cache_inval = true;
5393       cfg.unk_4 = true;
5394       cfg.unk_5 = true;
5395       cfg.unk_6 = true;
5396       cfg.unk_7 = true;
5397       cfg.unk_8 = true;
5398       cfg.unk_9 = true;
5399       cfg.unk_10 = true;
5400       cfg.unk_11 = true;
5401       cfg.unk_12 = true;
5402       cfg.unk_13 = true;
5403       cfg.unk_14 = true;
5404       cfg.unk_15 = true;
5405       cfg.unk_16 = true;
5406       cfg.unk_17 = true;
5407       cfg.unk_18 = true;
5408       cfg.unk_19 = true;
5409    }
5410 
5411    batch->cdm.current = out;
5412    assert(batch->cdm.current <= batch->cdm.end &&
5413           "Failed to reserve sufficient space in encoder");
5414 }
5415 
5416 void
5417 agx_launch(struct agx_batch *batch, const struct agx_grid *grid,
5418            struct agx_compiled_shader *cs, struct agx_linked_shader *linked,
5419            enum pipe_shader_type stage, unsigned variable_shared_mem)
5420 {
5421    struct agx_context *ctx = batch->ctx;
5422 
5423    /* To implement load_num_workgroups, the number of workgroups needs to be
5424     * available in GPU memory. This is either the indirect buffer, or just a
5425     * buffer we upload ourselves if not indirect.
5426     */
5427    if (grid->mode == AGX_CDM_MODE_DIRECT) {
5428       uint32_t groups[3] = {
5429          grid->global[0] / grid->local[0],
5430          grid->global[1] / grid->local[1],
5431          grid->global[2] / grid->local[2],
5432       };
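      /* For example, a direct launch with global = (256, 1, 1) and
       * local = (64, 1, 1) uploads {4, 1, 1} for load_num_workgroups.
       */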
5433 
5434       batch->uniforms.tables[AGX_SYSVAL_TABLE_GRID] =
5435          agx_pool_upload_aligned(&batch->pool, groups, sizeof(groups), 4);
5436    } else {
5437       batch->uniforms.tables[AGX_SYSVAL_TABLE_GRID] = grid->indirect;
5438    }
5439 
5440    util_dynarray_foreach(&ctx->global_buffers, struct pipe_resource *, res) {
5441       if (!*res)
5442          continue;
5443 
5444       struct agx_resource *buffer = agx_resource(*res);
5445       agx_batch_writes(batch, buffer, 0);
5446       batch->incoherent_writes = true;
5447    }
5448 
5449    agx_batch_add_bo(batch, cs->bo);
5450 
5451    agx_update_descriptors(batch, cs);
5452    agx_upload_uniforms(batch);
5453 
5454    // TODO: This is broken.
5455    size_t subgroups_per_core = 0;
5456 #if 0
5457    if (!info->indirect) {
5458       size_t subgroups_per_workgroup =
5459          DIV_ROUND_UP(info->block[0] * info->block[1] * info->block[2], 32);
5460       subgroups_per_core =
5461          local_workgroups *
5462          DIV_ROUND_UP(info->grid[0] * info->grid[1] * info->grid[2],
5463                      ctx->scratch_cs.num_cores);
5464    }
5465 #endif
5466 
5467    uint32_t usc = agx_build_pipeline(batch, cs, linked, PIPE_SHADER_COMPUTE,
5468                                      variable_shared_mem, subgroups_per_core);
5469 
5470    agx_launch_internal(batch, grid, cs, stage, usc);
5471 }
5472 
5473 static void
5474 agx_launch_grid(struct pipe_context *pipe, const struct pipe_grid_info *info)
5475 {
5476    struct agx_context *ctx = agx_context(pipe);
5477    if (unlikely(!ctx->compute_blitter.active &&
5478                 !agx_render_condition_check(ctx)))
5479       return;
5480 
5481    struct agx_batch *batch = agx_get_compute_batch(ctx);
5482 
5483    uint64_t indirect = 0;
5484    if (info->indirect) {
5485       struct agx_resource *rsrc = agx_resource(info->indirect);
5486       agx_batch_reads(batch, rsrc);
5487       indirect = rsrc->bo->va->addr + info->indirect_offset;
5488    }
5489 
5490    /* Increment the pipeline stats query.
5491     *
5492     * TODO: Can we use the hardware counter for this?
5493     */
5494    if (ctx->pipeline_statistics[PIPE_STAT_QUERY_CS_INVOCATIONS]) {
5495       unsigned blocksize = info->block[0] * info->block[1] * info->block[2];
5496 
5497       if (info->indirect) {
5498          struct libagx_cs_invocation_params p = {
5499             .grid = indirect,
5500             .local_size_threads = blocksize,
5501             .statistic = agx_get_query_address(
5502                batch, ctx->pipeline_statistics[PIPE_STAT_QUERY_CS_INVOCATIONS]),
5503          };
5504 
5505          const struct agx_grid g = agx_grid_direct(1, 1, 1, 1, 1, 1);
5506          agx_launch_with_data(batch, &g, agx_nir_increment_cs_invocations, NULL,
5507                               0, &p, sizeof(p));
5508       } else {
5509          agx_query_increment_cpu(
5510             ctx, ctx->pipeline_statistics[PIPE_STAT_QUERY_CS_INVOCATIONS],
5511             libagx_cs_invocations(blocksize, info->grid[0], info->grid[1],
5512                                   info->grid[2]));
5513       }
5514    }
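   /* Illustrative numbers: a direct dispatch of a 4x2x1 grid of 8x8x1
    * workgroups has blocksize = 64 and, assuming libagx_cs_invocations
    * multiplies the block size by the workgroup count, bumps the query by
    * 64 * 4 * 2 * 1 = 512 on the CPU. The indirect path defers the same
    * computation to a small GPU launch because the grid is only known on
    * the GPU timeline.
    */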
5515 
5516    agx_batch_add_timestamp_query(batch, ctx->time_elapsed);
5517 
5518    agx_batch_init_state(batch);
5519 
5520    struct agx_uncompiled_shader *uncompiled =
5521       ctx->stage[PIPE_SHADER_COMPUTE].shader;
5522 
5523    /* There is exactly one variant, get it */
5524    struct agx_compiled_shader *cs =
5525       _mesa_hash_table_next_entry(uncompiled->variants, NULL)->data;
5526 
5527    struct agx_grid grid = {
5528       .local[0] = info->block[0],
5529       .local[1] = info->block[1],
5530       .local[2] = info->block[2],
5531    };
5532 
5533    if (info->indirect) {
5534       struct agx_resource *indirect = agx_resource(info->indirect);
5535       agx_batch_reads(batch, indirect);
5536 
5537       grid.mode = AGX_CDM_MODE_INDIRECT_GLOBAL;
5538       grid.indirect = indirect->bo->va->addr + info->indirect_offset;
5539    } else {
5540       grid.mode = AGX_CDM_MODE_DIRECT;
5541 
5542       for (unsigned d = 0; d < 3; ++d) {
5543          grid.global[d] = ((info->grid[d] - 1) * info->block[d]) +
5544                           (info->last_block[d] ?: info->block[d]);
5545       }
5546    }
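   /* Example for the direct path (illustrative numbers): grid = {10, 1, 1}
    * workgroups of block = {64, 1, 1} threads with a partial last block of
    * {25, 1, 1} gives global[0] = (10 - 1) * 64 + 25 = 601 threads; with no
    * partial last block it is simply 10 * 64 = 640.
    */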
5547 
5548    agx_launch(batch, &grid, cs, NULL, PIPE_SHADER_COMPUTE,
5549               info->variable_shared_mem);
5550 
5551    /* TODO: Dirty tracking? */
5552    agx_dirty_all(ctx);
5553 
5554    batch->uniforms.tables[AGX_SYSVAL_TABLE_GRID] = 0;
5555 
5556    /* If the next dispatch might overflow, flush now. TODO: If this is ever hit
5557     * in practice, we can use CDM stream links.
5558     */
5559    size_t dispatch_upper_bound =
5560       AGX_CDM_LAUNCH_WORD_0_LENGTH + AGX_CDM_LAUNCH_WORD_1_LENGTH +
5561       AGX_CDM_UNK_G14X_LENGTH + AGX_CDM_INDIRECT_LENGTH +
5562       AGX_CDM_GLOBAL_SIZE_LENGTH + AGX_CDM_LOCAL_SIZE_LENGTH +
5563       AGX_CDM_BARRIER_LENGTH;
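   /* This sums the maximum encoded size of every CDM record a single
    * dispatch could emit (both direct and indirect size words included),
    * so it is a conservative bound for either dispatch mode.
    */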
5564 
5565    if (batch->cdm.current + dispatch_upper_bound >= batch->cdm.end)
5566       agx_flush_batch_for_reason(ctx, batch, "CDM overfull");
5567 }
5568 
5569 static void
5570 agx_set_global_binding(struct pipe_context *pipe, unsigned first,
5571                        unsigned count, struct pipe_resource **resources,
5572                        uint32_t **handles)
5573 {
5574    struct agx_context *ctx = agx_context(pipe);
5575    unsigned old_size =
5576       util_dynarray_num_elements(&ctx->global_buffers, *resources);
5577 
5578    if (old_size < first + count) {
5579       /* If growing the array fails here, there is no reasonable way to
5579        * recover.
5579        */
5580       if (!util_dynarray_grow(&ctx->global_buffers, *resources,
5581                               (first + count) - old_size))
5582          unreachable("out of memory");
5583 
5584       for (unsigned i = old_size; i < first + count; i++)
5585          *util_dynarray_element(&ctx->global_buffers, struct pipe_resource *,
5586                                 i) = NULL;
5587    }
5588 
5589    for (unsigned i = 0; i < count; ++i) {
5590       struct pipe_resource **res = util_dynarray_element(
5591          &ctx->global_buffers, struct pipe_resource *, first + i);
5592       if (resources && resources[i]) {
5593          pipe_resource_reference(res, resources[i]);
5594 
5595          /* The handle points to a uint32_t, but space is allocated for 64
5596           * bits behind it. The caller pre-stores a buffer offset there,
5597           * which we must add to the buffer's GPU address and write back.
5598           */
5599          uint64_t addr = 0;
5600          struct agx_resource *rsrc = agx_resource(resources[i]);
5601 
5602          memcpy(&addr, handles[i], sizeof(addr));
5603          addr += rsrc->bo->va->addr;
5604          memcpy(handles[i], &addr, sizeof(addr));
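         /* Illustrative numbers: if the caller pre-stored offset 0x40 in
          * *handles[i] and the buffer's GPU VA is 0x1000000, the handle now
          * reads back 0x1000040.
          */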
5605       } else {
5606          pipe_resource_reference(res, NULL);
5607       }
5608    }
5609 }
5610 
5611 void agx_init_state_functions(struct pipe_context *ctx);
5612 
5613 void
5614 agx_init_state_functions(struct pipe_context *ctx)
5615 {
5616    ctx->create_blend_state = agx_create_blend_state;
5617    ctx->create_depth_stencil_alpha_state = agx_create_zsa_state;
5618    ctx->create_fs_state = agx_create_shader_state;
5619    ctx->create_rasterizer_state = agx_create_rs_state;
5620    ctx->create_sampler_state = agx_create_sampler_state;
5621    ctx->create_sampler_view = agx_create_sampler_view;
5622    ctx->create_surface = agx_create_surface;
5623    ctx->create_vertex_elements_state = agx_create_vertex_elements;
5624    ctx->create_vs_state = agx_create_shader_state;
5625    ctx->create_gs_state = agx_create_shader_state;
5626    ctx->create_tcs_state = agx_create_shader_state;
5627    ctx->create_tes_state = agx_create_shader_state;
5628    ctx->create_compute_state = agx_create_compute_state;
5629    ctx->bind_blend_state = agx_bind_blend_state;
5630    ctx->bind_depth_stencil_alpha_state = agx_bind_zsa_state;
5631    ctx->bind_sampler_states = agx_bind_sampler_states;
5632    ctx->bind_fs_state = agx_bind_fs_state;
5633    ctx->bind_rasterizer_state = agx_bind_rasterizer_state;
5634    ctx->bind_vertex_elements_state = agx_bind_vertex_elements_state;
5635    ctx->bind_vs_state = agx_bind_vs_state;
5636    ctx->bind_gs_state = agx_bind_gs_state;
5637    ctx->bind_tcs_state = agx_bind_tcs_state;
5638    ctx->bind_tes_state = agx_bind_tes_state;
5639    ctx->bind_compute_state = agx_bind_cs_state;
5640    ctx->delete_blend_state = agx_delete_state;
5641    ctx->delete_depth_stencil_alpha_state = agx_delete_state;
5642    ctx->delete_fs_state = agx_delete_shader_state;
5643    ctx->delete_compute_state = agx_delete_shader_state;
5644    ctx->delete_rasterizer_state = agx_delete_state;
5645    ctx->delete_sampler_state = agx_delete_sampler_state;
5646    ctx->delete_vertex_elements_state = agx_delete_state;
5647    ctx->delete_vs_state = agx_delete_shader_state;
5648    ctx->delete_gs_state = agx_delete_shader_state;
5649    ctx->delete_tcs_state = agx_delete_shader_state;
5650    ctx->delete_tes_state = agx_delete_shader_state;
5651    ctx->set_blend_color = agx_set_blend_color;
5652    ctx->set_clip_state = agx_set_clip_state;
5653    ctx->set_constant_buffer = agx_set_constant_buffer;
5654    ctx->set_shader_buffers = agx_set_shader_buffers;
5655    ctx->set_shader_images = agx_set_shader_images;
5656    ctx->set_sampler_views = agx_set_sampler_views;
5657    ctx->set_framebuffer_state = agx_set_framebuffer_state;
5658    ctx->set_polygon_stipple = agx_set_polygon_stipple;
5659    ctx->set_patch_vertices = agx_set_patch_vertices;
5660    ctx->set_sample_mask = agx_set_sample_mask;
5661    ctx->set_scissor_states = agx_set_scissor_states;
5662    ctx->set_stencil_ref = agx_set_stencil_ref;
5663    ctx->set_vertex_buffers = agx_set_vertex_buffers;
5664    ctx->set_viewport_states = agx_set_viewport_states;
5665    ctx->sampler_view_destroy = agx_sampler_view_destroy;
5666    ctx->surface_destroy = agx_surface_destroy;
5667    ctx->draw_vbo = agx_draw_vbo;
5668    ctx->launch_grid = agx_launch_grid;
5669    ctx->set_global_binding = agx_set_global_binding;
5670    ctx->texture_barrier = agx_texture_barrier;
5671    ctx->get_compute_state_info = agx_get_compute_state_info;
5672    ctx->set_tess_state = agx_set_tess_state;
5673 }
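
/* Illustrative usage (not part of this file): once the vtable above is
 * populated, Gallium invokes these hooks through the generic pipe_context
 * interface. For example, a compute dispatch would look like:
 *
 *    struct pipe_grid_info info = {
 *       .block = {64, 1, 1},
 *       .grid = {16, 1, 1},
 *    };
 *    pipe->launch_grid(pipe, &info);   // lands in agx_launch_grid above
 */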
5674