xref: /aosp_15_r20/external/mesa3d/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c (revision 6104692788411f58d303aa86923a9ff6ecaded22)
1 /*
2  * Copyright 2017 Advanced Micro Devices, Inc.
3  *
4  * SPDX-License-Identifier: MIT
5  */
6 
7 #include "si_pipe.h"
8 #include "si_query.h"
9 #include "si_shader_internal.h"
10 
gfx10_ngg_get_vertices_per_prim(struct si_shader * shader)11 unsigned gfx10_ngg_get_vertices_per_prim(struct si_shader *shader)
12 {
13    const struct si_shader_info *info = &shader->selector->info;
14 
15    if (shader->selector->stage == MESA_SHADER_GEOMETRY)
16       return mesa_vertices_per_prim(info->base.gs.output_primitive);
17    else if (shader->selector->stage == MESA_SHADER_VERTEX) {
18       if (info->base.vs.blit_sgprs_amd) {
19          /* Blits always use axis-aligned rectangles with 3 vertices. */
20          return 3;
21       } else if (shader->key.ge.opt.ngg_culling & SI_NGG_CULL_LINES)
22          return 2;
23       else {
24          /* The shader compiler replaces 0 with 3. The generated code will be correct regardless
25           * of the draw primitive type, but it's less efficient.
26           *
27           * Computing prim export values for non-existent vertices has no effect.
28           */
29          return 0; /* unknown */
30       }
31    } else {
32       assert(shader->selector->stage == MESA_SHADER_TESS_EVAL);
33 
34       if (info->base.tess.point_mode)
35          return 1;
36       else if (info->base.tess._primitive_mode == TESS_PRIMITIVE_ISOLINES)
37          return 2;
38       else
39          return 3;
40    }
41 }
42 
gfx10_ngg_export_prim_early(struct si_shader * shader)43 bool gfx10_ngg_export_prim_early(struct si_shader *shader)
44 {
45    struct si_shader_selector *sel = shader->selector;
46 
47    assert(shader->key.ge.as_ngg && !shader->key.ge.as_es);
48 
49    return sel->stage != MESA_SHADER_GEOMETRY &&
50           !gfx10_ngg_writes_user_edgeflags(shader);
51 }
52 
clamp_gsprims_to_esverts(unsigned * max_gsprims,unsigned max_esverts,unsigned min_verts_per_prim,bool use_adjacency)53 static void clamp_gsprims_to_esverts(unsigned *max_gsprims, unsigned max_esverts,
54                                      unsigned min_verts_per_prim, bool use_adjacency)
55 {
56    unsigned max_reuse = max_esverts - min_verts_per_prim;
57    if (use_adjacency)
58       max_reuse /= 2;
59    *max_gsprims = MIN2(*max_gsprims, 1 + max_reuse);
60 }
61 
gfx10_ngg_get_scratch_dw_size(struct si_shader * shader)62 unsigned gfx10_ngg_get_scratch_dw_size(struct si_shader *shader)
63 {
64    const struct si_shader_selector *sel = shader->selector;
65 
66    return ac_ngg_get_scratch_lds_size(sel->stage,
67                                       si_get_max_workgroup_size(shader),
68                                       shader->wave_size,
69                                       si_shader_uses_streamout(shader),
70                                       shader->key.ge.opt.ngg_culling) / 4;
71 }
72 
73 /**
74  * Determine subgroup information like maximum number of vertices and prims.
75  *
76  * This happens before the shader is uploaded, since LDS relocations during
77  * upload depend on the subgroup size.
78  */
gfx10_ngg_calculate_subgroup_info(struct si_shader * shader)79 bool gfx10_ngg_calculate_subgroup_info(struct si_shader *shader)
80 {
81    const struct si_shader_selector *gs_sel = shader->selector;
82    const struct si_shader_selector *es_sel =
83       shader->previous_stage_sel ? shader->previous_stage_sel : gs_sel;
84    const gl_shader_stage gs_stage = gs_sel->stage;
85    const unsigned gs_num_invocations = MAX2(gs_sel->info.base.gs.invocations, 1);
86    const unsigned input_prim = si_get_input_prim(gs_sel, &shader->key);
87    const bool use_adjacency =
88       input_prim >= MESA_PRIM_LINES_ADJACENCY && input_prim <= MESA_PRIM_TRIANGLE_STRIP_ADJACENCY;
89    const unsigned max_verts_per_prim = mesa_vertices_per_prim(input_prim);
90    const unsigned min_verts_per_prim = gs_stage == MESA_SHADER_GEOMETRY ? max_verts_per_prim : 1;
91 
92    /* All these are in dwords. The maximum is 16K dwords (64KB) of LDS per workgroup. */
93    const unsigned scratch_lds_size = gfx10_ngg_get_scratch_dw_size(shader);
94    /* Scratch is at last of LDS space and 2 dwords aligned, so it may cost more for alignment. */
95    const unsigned max_lds_size = 16 * 1024 - ALIGN(scratch_lds_size, 2);
96    const unsigned target_lds_size = max_lds_size;
97    unsigned esvert_lds_size = 0;
98    unsigned gsprim_lds_size = 0;
99 
100    /* All these are per subgroup: */
101    const unsigned min_esverts =
102       gs_sel->screen->info.gfx_level >= GFX11 ? 3 : /* gfx11 requires at least 1 primitive per TG */
103       gs_sel->screen->info.gfx_level >= GFX10_3 ? 29 : (24 - 1 + max_verts_per_prim);
104    bool max_vert_out_per_gs_instance = false;
105    unsigned max_gsprims_base, max_esverts_base;
106 
107    max_gsprims_base = max_esverts_base = si_get_max_workgroup_size(shader);
108 
109    if (gs_stage == MESA_SHADER_GEOMETRY) {
110       bool force_multi_cycling = false;
111       unsigned max_out_verts_per_gsprim = gs_sel->info.base.gs.vertices_out * gs_num_invocations;
112 
113 retry_select_mode:
114       if (max_out_verts_per_gsprim <= 256 && !force_multi_cycling) {
115          if (max_out_verts_per_gsprim) {
116             max_gsprims_base = MIN2(max_gsprims_base, 256 / max_out_verts_per_gsprim);
117          }
118       } else {
119          /* Use special multi-cycling mode in which each GS
120           * instance gets its own subgroup. Does not work with
121           * tessellation. */
122          max_vert_out_per_gs_instance = true;
123          max_gsprims_base = 1;
124          max_out_verts_per_gsprim = gs_sel->info.base.gs.vertices_out;
125       }
126 
127       esvert_lds_size = es_sel->info.esgs_vertex_stride / 4;
128       gsprim_lds_size = (gs_sel->info.gsvs_vertex_size / 4 + 1) * max_out_verts_per_gsprim;
129 
130       if (gsprim_lds_size > target_lds_size && !force_multi_cycling) {
131          if (gs_sel->tess_turns_off_ngg || es_sel->stage != MESA_SHADER_TESS_EVAL) {
132             force_multi_cycling = true;
133             goto retry_select_mode;
134          }
135       }
136    } else {
137       /* VS and TES. */
138 
139       bool uses_instance_id = gs_sel->info.uses_instanceid;
140       bool uses_primitive_id = gs_sel->info.uses_primid;
141       if (gs_stage == MESA_SHADER_VERTEX) {
142          uses_instance_id |=
143             shader->key.ge.mono.instance_divisor_is_one ||
144             shader->key.ge.mono.instance_divisor_is_fetched;
145       } else {
146          uses_primitive_id |= shader->key.ge.mono.u.vs_export_prim_id;
147       }
148 
149       esvert_lds_size = ac_ngg_nogs_get_pervertex_lds_size(
150          gs_stage, gs_sel->info.num_outputs,
151          si_shader_uses_streamout(shader),
152          shader->key.ge.mono.u.vs_export_prim_id,
153          gfx10_ngg_writes_user_edgeflags(shader),
154          shader->key.ge.opt.ngg_culling,
155          uses_instance_id,
156          uses_primitive_id) / 4;
157    }
158 
159    unsigned max_gsprims = max_gsprims_base;
160    unsigned max_esverts = max_esverts_base;
161 
162    if (esvert_lds_size)
163       max_esverts = MIN2(max_esverts, target_lds_size / esvert_lds_size);
164    if (gsprim_lds_size)
165       max_gsprims = MIN2(max_gsprims, target_lds_size / gsprim_lds_size);
166 
167    max_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim);
168    clamp_gsprims_to_esverts(&max_gsprims, max_esverts, min_verts_per_prim, use_adjacency);
169    assert(max_esverts >= max_verts_per_prim && max_gsprims >= 1);
170 
171    if (esvert_lds_size || gsprim_lds_size) {
172       /* Now that we have a rough proportionality between esverts
173        * and gsprims based on the primitive type, scale both of them
174        * down simultaneously based on required LDS space.
175        *
176        * We could be smarter about this if we knew how much vertex
177        * reuse to expect.
178        */
179       unsigned lds_total = max_esverts * esvert_lds_size + max_gsprims * gsprim_lds_size;
180       if (lds_total > target_lds_size) {
181          max_esverts = max_esverts * target_lds_size / lds_total;
182          max_gsprims = max_gsprims * target_lds_size / lds_total;
183 
184          max_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim);
185          clamp_gsprims_to_esverts(&max_gsprims, max_esverts, min_verts_per_prim, use_adjacency);
186          assert(max_esverts >= max_verts_per_prim && max_gsprims >= 1);
187       }
188    }
189 
190    /* Round up towards full wave sizes for better ALU utilization. */
191    if (!max_vert_out_per_gs_instance) {
192       unsigned orig_max_esverts;
193       unsigned orig_max_gsprims;
194       do {
195          orig_max_esverts = max_esverts;
196          orig_max_gsprims = max_gsprims;
197 
198          max_esverts = align(max_esverts, shader->wave_size);
199          max_esverts = MIN2(max_esverts, max_esverts_base);
200          if (esvert_lds_size)
201             max_esverts =
202                MIN2(max_esverts, (max_lds_size - max_gsprims * gsprim_lds_size) / esvert_lds_size);
203          max_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim);
204 
205          /* Hardware restriction: minimum value of max_esverts */
206          max_esverts = MAX2(max_esverts, min_esverts);
207 
208          max_gsprims = align(max_gsprims, shader->wave_size);
209          max_gsprims = MIN2(max_gsprims, max_gsprims_base);
210          if (gsprim_lds_size) {
211             /* Don't count unusable vertices to the LDS size. Those are vertices above
212              * the maximum number of vertices that can occur in the workgroup,
213              * which is e.g. max_gsprims * 3 for triangles.
214              */
215             unsigned usable_esverts = MIN2(max_esverts, max_gsprims * max_verts_per_prim);
216             max_gsprims =
217                MIN2(max_gsprims, (max_lds_size - usable_esverts * esvert_lds_size) / gsprim_lds_size);
218          }
219          clamp_gsprims_to_esverts(&max_gsprims, max_esverts, min_verts_per_prim, use_adjacency);
220          assert(max_esverts >= max_verts_per_prim && max_gsprims >= 1);
221       } while (orig_max_esverts != max_esverts || orig_max_gsprims != max_gsprims);
222 
223       /* Verify the restriction. */
224       assert(max_esverts >= min_esverts);
225    } else {
226       max_esverts = MAX2(max_esverts, min_esverts);
227    }
228 
229    unsigned max_out_vertices =
230       max_vert_out_per_gs_instance
231          ? gs_sel->info.base.gs.vertices_out
232          : gs_stage == MESA_SHADER_GEOMETRY
233               ? max_gsprims * gs_num_invocations * gs_sel->info.base.gs.vertices_out
234               : max_esverts;
235    assert(max_out_vertices <= 256);
236 
237    shader->ngg.hw_max_esverts = max_esverts;
238    shader->ngg.max_gsprims = max_gsprims;
239    shader->ngg.max_out_verts = max_out_vertices;
240    shader->ngg.max_vert_out_per_gs_instance = max_vert_out_per_gs_instance;
241 
242    /* Don't count unusable vertices. */
243    shader->gs_info.esgs_ring_size = MIN2(max_esverts, max_gsprims * max_verts_per_prim) *
244                                     esvert_lds_size;
245    shader->ngg.ngg_emit_size = max_gsprims * gsprim_lds_size;
246 
247    assert(shader->ngg.hw_max_esverts >= min_esverts); /* HW limitation */
248 
249    /* If asserts are disabled, we use the same conditions to return false */
250    return max_esverts >= max_verts_per_prim && max_gsprims >= 1 &&
251           max_out_vertices <= 256 &&
252           shader->ngg.hw_max_esverts >= min_esverts;
253 }
254