xref: /aosp_15_r20/external/mesa3d/src/amd/common/ac_gather_context_rolls.c (revision 6104692788411f58d303aa86923a9ff6ecaded22)
1 /*
2  * Copyright 2023 Advanced Micro Devices, Inc.
3  *
4  * SPDX-License-Identifier: MIT
5  */
6 
7 /* Utility for gathering context rolls for performance bottleneck analysis.
8  *
9  * Usage for radeonsi:
10  *    AMD_ROLLS=filename app1
11  *    AMD_ROLLS=filename app2
12  *    ...
13  *    AMD_ROLLS=filename appN
14  *
15  *    sort filename | uniq -c | sort -n > rolls_sorted.txt
16  *
17  *    Then try to reduce the most frequent context rolls.
18  */
19 
20 #include "ac_debug.h"
21 #include "sid.h"
22 #include "sid_tables.h"
23 
24 #include "util/bitset.h"
25 #include "util/hash_table.h"
26 #include "util/u_dynarray.h"
27 #include "util/u_memory.h"
28 
/* ANSI escape sequences used to colorize the register names in the printed
 * context rolls.
 */
#define COLOR_RESET  "\033[0m"
#define COLOR_RED    "\033[31m"
#define COLOR_GREEN  "\033[1;32m"
32 
/* Register changes recorded for one context state. The whole struct is reset
 * to zero when a wait-for-idle is recorded.
 */
struct ac_context_reg_deltas {
   uint32_t changed_masks[1024];    /* per register: OR of all bits that differed from the previous value */
   BITSET_DECLARE(changed, 1024);   /* which context registers were written, even with an unchanged value */
   bool acquire_mem;                /* whether ACQUIRE_MEM rolled the context */
};
38 
/* One context state: the register values seen so far plus the changes
 * recorded while this state was the current one.
 */
struct ac_context_reg_state {
   uint32_t regs[1024];                 /* last recorded value of each context register, indexed in dwords */
   struct ac_context_reg_deltas deltas; /* what was written/changed in this state */
   const char *annotation;              /* optional label taken from the annotations table; printed when set */
};
44 
/* Bookkeeping shared by all IBs parsed during one gathering pass. */
struct ac_context_roll_ctx {
   struct ac_context_reg_state *cur;   /* state that register writes are currently recorded into */
   bool context_busy;                  /* set by draw packets; the next roll request starts a new state */

   unsigned num_busy_contexts;         /* number of rolls since the start or the last wait-for-idle */
   struct util_dynarray rolls;         /* recorded states, stored as struct ac_context_reg_state pointers */

   const struct radeon_info *info;     /* chip description (gfx_level and family are used) */
};
54 
ac_roll_context(struct ac_context_roll_ctx * ctx)55 static void ac_roll_context(struct ac_context_roll_ctx *ctx)
56 {
57    if (!ctx->context_busy)
58       return;
59 
60    struct ac_context_reg_state *last = ctx->cur;
61    ctx->cur = CALLOC_STRUCT(ac_context_reg_state);
62    memcpy(ctx->cur->regs, last->regs, sizeof(last->regs));
63    ctx->context_busy = false;
64    ctx->num_busy_contexts++;
65 
66    /* Ignore the first context at the beginning or after waiting for idle. */
67    if (ctx->num_busy_contexts > 1) {
68       util_dynarray_append(&ctx->rolls, struct ac_context_reg_state *, last);
69    } else {
70       FREE(last);
71    }
72 }
73 
ac_record_wait_idle(struct ac_context_roll_ctx * ctx)74 static void ac_record_wait_idle(struct ac_context_roll_ctx *ctx)
75 {
76    ctx->num_busy_contexts = 0;
77    ctx->context_busy = false;
78    memset(&ctx->cur->deltas, 0, sizeof(ctx->cur->deltas));
79 }
80 
ac_record_set_context_reg(struct ac_context_roll_ctx * ctx,unsigned reg_rel_dw_offset,unsigned value)81 static void ac_record_set_context_reg(struct ac_context_roll_ctx *ctx,
82                                       unsigned reg_rel_dw_offset, unsigned value)
83 {
84    if (!ac_register_exists(ctx->info->gfx_level, ctx->info->family,
85                            SI_CONTEXT_REG_OFFSET + reg_rel_dw_offset * 4)) {
86       fprintf(stderr, "This register is not supported by this chip: 0x%X\n",
87               SI_CONTEXT_REG_OFFSET + reg_rel_dw_offset * 4);
88       abort();
89    }
90 
91    assert(reg_rel_dw_offset < 1024);
92    BITSET_SET(ctx->cur->deltas.changed, reg_rel_dw_offset);
93    ctx->cur->deltas.changed_masks[reg_rel_dw_offset] |= ctx->cur->regs[reg_rel_dw_offset] ^ value;
94    ctx->cur->regs[reg_rel_dw_offset] = value;
95 }
96 
get_reg_index(unsigned reg)97 static unsigned get_reg_index(unsigned reg)
98 {
99    return (reg - SI_CONTEXT_REG_OFFSET) / 4;
100 }
101 
ac_ib_gather_context_rolls(struct ac_context_roll_ctx * ctx,uint32_t * ib,int num_dw,struct hash_table * annotations)102 static void ac_ib_gather_context_rolls(struct ac_context_roll_ctx *ctx, uint32_t *ib, int num_dw,
103                                        struct hash_table *annotations)
104 {
105    for (unsigned cur_dw = 0; cur_dw < num_dw;) {
106       if (annotations) {
107          struct hash_entry *marker = _mesa_hash_table_search(annotations, ib + cur_dw);
108          if (marker)
109             ctx->cur->annotation = marker->data;
110       }
111 
112       uint32_t header = ib[cur_dw++];
113       unsigned type = PKT_TYPE_G(header);
114 
115       if (type != 3) {
116          fprintf(stderr, "Unexpected type %u packet\n", type);
117          abort();
118       }
119 
120       int count = PKT_COUNT_G(header);
121       unsigned op = PKT3_IT_OPCODE_G(header);
122 
123       switch (op) {
124       /* Record context register changes. */
125       case PKT3_SET_CONTEXT_REG: {
126          ac_roll_context(ctx);
127 
128          unsigned reg_dw = ib[cur_dw++];
129          unsigned reg_rel_dw_offset = reg_dw & 0xFFFF;
130 
131          for (int i = 0; i < count; i++)
132             ac_record_set_context_reg(ctx, reg_rel_dw_offset + i, ib[cur_dw++]);
133          continue;
134       }
135 
136       case PKT3_SET_CONTEXT_REG_PAIRS:
137          ac_roll_context(ctx);
138 
139          for (int i = 0; i < (count + 1) / 2; i++) {
140             unsigned reg_rel_dw_offset = ib[cur_dw++];
141             ac_record_set_context_reg(ctx, reg_rel_dw_offset, ib[cur_dw++]);
142          }
143          continue;
144 
145       case PKT3_SET_CONTEXT_REG_PAIRS_PACKED: {
146          ac_roll_context(ctx);
147 
148          unsigned reg_rel_dw_offset0 = 0, reg_rel_dw_offset1 = 0;
149          cur_dw++;
150 
151          for (int i = 0; i < count; i++) {
152             if (i % 3 == 0) {
153                unsigned tmp = ib[cur_dw++];
154                reg_rel_dw_offset0 = tmp & 0xffff;
155                reg_rel_dw_offset1 = tmp >> 16;
156             } else if (i % 3 == 1) {
157                ac_record_set_context_reg(ctx, reg_rel_dw_offset0, ib[cur_dw++]);
158             } else {
159                ac_record_set_context_reg(ctx, reg_rel_dw_offset1, ib[cur_dw++]);
160             }
161          }
162          continue;
163       }
164 
165       case PKT3_CLEAR_STATE:
166          ac_roll_context(ctx);
167 
168          ac_record_set_context_reg(ctx, get_reg_index(R_028000_DB_RENDER_CONTROL), 0);
169          ac_record_set_context_reg(ctx, get_reg_index(R_028004_DB_COUNT_CONTROL), 0);
170 
171          ac_record_set_context_reg(ctx, get_reg_index(R_028BDC_PA_SC_LINE_CNTL), 0x1000);
172          ac_record_set_context_reg(ctx, get_reg_index(R_028BE0_PA_SC_AA_CONFIG), 0);
173 
174          ac_record_set_context_reg(ctx, get_reg_index(R_028BE4_PA_SU_VTX_CNTL), 0x5);
175          ac_record_set_context_reg(ctx, get_reg_index(R_028BE8_PA_CL_GB_VERT_CLIP_ADJ), 0x3f800000);
176          ac_record_set_context_reg(ctx, get_reg_index(R_028BEC_PA_CL_GB_VERT_DISC_ADJ), 0x3f800000);
177          ac_record_set_context_reg(ctx, get_reg_index(R_028BF0_PA_CL_GB_HORZ_CLIP_ADJ), 0x3f800000);
178          ac_record_set_context_reg(ctx, get_reg_index(R_028BF4_PA_CL_GB_HORZ_DISC_ADJ), 0x3f800000);
179 
180          ac_record_set_context_reg(ctx, get_reg_index(R_02870C_SPI_SHADER_POS_FORMAT), 0);
181 
182          ac_record_set_context_reg(ctx, get_reg_index(R_028710_SPI_SHADER_Z_FORMAT), 0);
183          ac_record_set_context_reg(ctx, get_reg_index(R_028714_SPI_SHADER_COL_FORMAT), 0);
184          ac_record_set_context_reg(ctx, get_reg_index(R_0286E0_SPI_BARYC_CNTL), 0);
185          ac_record_set_context_reg(ctx, get_reg_index(R_0286CC_SPI_PS_INPUT_ENA), 0);
186          ac_record_set_context_reg(ctx, get_reg_index(R_0286D0_SPI_PS_INPUT_ADDR), 0);
187 
188          ac_record_set_context_reg(ctx, get_reg_index(R_028804_DB_EQAA), 0);
189          ac_record_set_context_reg(ctx, get_reg_index(R_02880C_DB_SHADER_CONTROL), 0);
190          ac_record_set_context_reg(ctx, get_reg_index(R_02823C_CB_SHADER_MASK), 0xffffffff);
191          ac_record_set_context_reg(ctx, get_reg_index(R_028238_CB_TARGET_MASK), 0xffffffff);
192          ac_record_set_context_reg(ctx, get_reg_index(R_028810_PA_CL_CLIP_CNTL), 0x90000);
193          ac_record_set_context_reg(ctx, get_reg_index(R_02881C_PA_CL_VS_OUT_CNTL), 0);
194          ac_record_set_context_reg(ctx, get_reg_index(R_028818_PA_CL_VTE_CNTL), 0);
195          ac_record_set_context_reg(ctx, get_reg_index(R_02820C_PA_SC_CLIPRECT_RULE), 0xffff);
196          ac_record_set_context_reg(ctx, get_reg_index(R_028A0C_PA_SC_LINE_STIPPLE), 0);
197          ac_record_set_context_reg(ctx, get_reg_index(R_028A4C_PA_SC_MODE_CNTL_1), 0);
198          ac_record_set_context_reg(ctx, get_reg_index(R_028234_PA_SU_HARDWARE_SCREEN_OFFSET), 0);
199          ac_record_set_context_reg(ctx, get_reg_index(R_0286D8_SPI_PS_IN_CONTROL), 0x2);
200          ac_record_set_context_reg(ctx, get_reg_index(R_028B90_VGT_GS_INSTANCE_CNT), 0);
201          ac_record_set_context_reg(ctx, get_reg_index(R_028B38_VGT_GS_MAX_VERT_OUT), 0);
202          ac_record_set_context_reg(ctx, get_reg_index(R_028B54_VGT_SHADER_STAGES_EN), 0);
203          ac_record_set_context_reg(ctx, get_reg_index(R_028B58_VGT_LS_HS_CONFIG), 0);
204          ac_record_set_context_reg(ctx, get_reg_index(R_028B6C_VGT_TF_PARAM), 0);
205          ac_record_set_context_reg(ctx, get_reg_index(R_028830_PA_SU_SMALL_PRIM_FILTER_CNTL), 0);
206          ac_record_set_context_reg(ctx, get_reg_index(R_028C44_PA_SC_BINNER_CNTL_0), 0x3);
207          if (ctx->info->gfx_level >= GFX10) {
208             ac_record_set_context_reg(ctx, get_reg_index(R_0287FC_GE_MAX_OUTPUT_PER_SUBGROUP), 0);
209             ac_record_set_context_reg(ctx, get_reg_index(R_028B4C_GE_NGG_SUBGRP_CNTL), 0);
210          }
211          if (ctx->info->gfx_level >= GFX11)
212             ac_record_set_context_reg(ctx, get_reg_index(R_0283D0_PA_SC_VRS_OVERRIDE_CNTL), 0);
213          else if (ctx->info->gfx_level == GFX10_3)
214             ac_record_set_context_reg(ctx, get_reg_index(R_028064_DB_VRS_OVERRIDE_CNTL), 0);
215 
216          ac_record_set_context_reg(ctx, get_reg_index(R_028754_SX_PS_DOWNCONVERT), 0);
217          ac_record_set_context_reg(ctx, get_reg_index(R_028758_SX_BLEND_OPT_EPSILON), 0);
218          ac_record_set_context_reg(ctx, get_reg_index(R_02875C_SX_BLEND_OPT_CONTROL), 0);
219 
220          ac_record_set_context_reg(ctx, get_reg_index(R_028AAC_VGT_ESGS_RING_ITEMSIZE), 0);
221          ac_record_set_context_reg(ctx, get_reg_index(R_028AB4_VGT_REUSE_OFF), 0);
222          if (ctx->info->gfx_level <= GFX9)
223             ac_record_set_context_reg(ctx, get_reg_index(R_028AA8_IA_MULTI_VGT_PARAM), 0xff);
224 
225          if (ctx->info->gfx_level == GFX9)
226             ac_record_set_context_reg(ctx, get_reg_index(R_028A94_VGT_GS_MAX_PRIMS_PER_SUBGROUP), 0);
227          if (ctx->info->gfx_level <= GFX10_3) {
228             ac_record_set_context_reg(ctx, get_reg_index(R_028A44_VGT_GS_ONCHIP_CNTL), 0);
229             ac_record_set_context_reg(ctx, get_reg_index(R_028AB0_VGT_GSVS_RING_ITEMSIZE), 0);
230             ac_record_set_context_reg(ctx, get_reg_index(R_028A40_VGT_GS_MODE), 0);
231             ac_record_set_context_reg(ctx, get_reg_index(R_028C58_VGT_VERTEX_REUSE_BLOCK_CNTL), 0x1e);
232             ac_record_set_context_reg(ctx, get_reg_index(R_028A6C_VGT_GS_OUT_PRIM_TYPE), 0);
233 
234             ac_record_set_context_reg(ctx, get_reg_index(R_028A60_VGT_GSVS_RING_OFFSET_1), 0);
235             ac_record_set_context_reg(ctx, get_reg_index(R_028A64_VGT_GSVS_RING_OFFSET_2), 0);
236             ac_record_set_context_reg(ctx, get_reg_index(R_028A68_VGT_GSVS_RING_OFFSET_3), 0);
237 
238             ac_record_set_context_reg(ctx, get_reg_index(R_028B5C_VGT_GS_VERT_ITEMSIZE), 0);
239             ac_record_set_context_reg(ctx, get_reg_index(R_028B60_VGT_GS_VERT_ITEMSIZE_1), 0);
240             ac_record_set_context_reg(ctx, get_reg_index(R_028B64_VGT_GS_VERT_ITEMSIZE_2), 0);
241             ac_record_set_context_reg(ctx, get_reg_index(R_028B68_VGT_GS_VERT_ITEMSIZE_3), 0);
242          }
243 
244          ac_record_set_context_reg(ctx, get_reg_index(R_028010_DB_RENDER_OVERRIDE2), 0);
245          ac_record_set_context_reg(ctx, get_reg_index(R_0286C4_SPI_VS_OUT_CONFIG), 0);
246          ac_record_set_context_reg(ctx, get_reg_index(R_028A84_VGT_PRIMITIVEID_EN), 0);
247          ac_record_set_context_reg(ctx, get_reg_index(R_028424_CB_DCC_CONTROL), 0);
248          break;
249 
250       case PKT3_LOAD_CONTEXT_REG_INDEX:
251       case PKT3_COPY_DATA:
252          /* TODO */
253          break;
254 
255       case PKT3_ACQUIRE_MEM:
256          if (G_580_PWS_ENA2(ib[cur_dw])) {
257             ac_record_wait_idle(ctx);
258          } else {
259             ac_roll_context(ctx);
260             ctx->cur->deltas.acquire_mem = true;
261          }
262          break;
263 
264       case PKT3_WAIT_REG_MEM:
265          ac_record_wait_idle(ctx);
266          break;
267 
268       case PKT3_EVENT_WRITE:
269          if (G_490_EVENT_TYPE(ib[cur_dw]) == V_028A90_PS_PARTIAL_FLUSH)
270             ac_record_wait_idle(ctx);
271          break;
272 
273       /* Record draws. */
274       case PKT3_DRAW_INDEX_AUTO:
275       case PKT3_DRAW_INDEX_IMMD:
276       case PKT3_DRAW_INDEX_MULTI_AUTO:
277       case PKT3_DRAW_INDEX_2:
278       case PKT3_DRAW_INDEX_OFFSET_2:
279       case PKT3_DRAW_INDIRECT:
280       case PKT3_DRAW_INDEX_INDIRECT:
281       case PKT3_DRAW_INDIRECT_MULTI:
282       case PKT3_DRAW_INDEX_INDIRECT_MULTI:
283       case PKT3_DISPATCH_MESH_DIRECT:
284       case PKT3_DISPATCH_MESH_INDIRECT_MULTI:
285       case PKT3_DISPATCH_TASKMESH_GFX:
286          ctx->context_busy = true;
287          break;
288 
289       case PKT3_INDIRECT_BUFFER:
290          /* Chaining. Note that the CHAIN bit is not set at this point, so we can't distinguish
291           * between chaining and IB2.
292           */
293          return;
294 
295       case PKT3_CONTEXT_REG_RMW:
296       case PKT3_INDIRECT_BUFFER_SI:
297       case PKT3_SURFACE_SYNC:
298          fprintf(stderr, "Unhandled packet: 0x%x\n", op);
299          abort();
300          break;
301       }
302 
303       cur_dw += count + 1;
304    }
305 }
306 
ac_gather_context_rolls(FILE * f,uint32_t ** ibs,uint32_t * ib_dw_sizes,unsigned num_ibs,struct hash_table * annotations,const struct radeon_info * info)307 void ac_gather_context_rolls(FILE *f, uint32_t **ibs, uint32_t *ib_dw_sizes, unsigned num_ibs,
308                              struct hash_table *annotations, const struct radeon_info *info)
309 {
310    struct ac_context_roll_ctx ctx;
311 
312    /* Initialize. */
313    memset(&ctx, 0, sizeof(ctx));
314    ctx.info = info;
315    ctx.cur = CALLOC_STRUCT(ac_context_reg_state);
316    util_dynarray_init(&ctx.rolls, NULL);
317 
318    /* Parse the IBs. */
319    for (unsigned i = 0; i < num_ibs; i++)
320       ac_ib_gather_context_rolls(&ctx, ibs[i], ib_dw_sizes[i], annotations);
321 
322    /* Roll the last context to add it to the list. */
323    ac_roll_context(&ctx);
324 
325    /* Print context rolls. */
326    if (util_dynarray_num_elements(&ctx.rolls, struct ac_context_reg_state *)) {
327       /* Print the context rolls starting with the most frequent one. */
328       util_dynarray_foreach(&ctx.rolls, struct ac_context_reg_state *, iter) {
329          struct ac_context_reg_state *state = *iter;
330 
331          if (state->annotation)
332             fprintf(f, "%s: ", state->annotation);
333 
334          unsigned i;
335          BITSET_FOREACH_SET(i, state->deltas.changed, 1024) {
336             unsigned reg_offset = SI_CONTEXT_REG_OFFSET + i * 4;
337             const struct si_reg *reg = ac_find_register(info->gfx_level, info->family,
338                                                         reg_offset);
339 
340             if (!state->deltas.changed_masks[i])
341                fprintf(f, COLOR_RED);
342             else
343                fprintf(f, COLOR_GREEN);
344 
345             if (!reg) {
346                fprintf(f, "0x%X(0x%x) ", reg_offset, state->deltas.changed_masks[i]);
347             } else {
348                fprintf(f, "%s(0x%x) ", sid_strings + reg->name_offset,
349                        state->deltas.changed_masks[i]);
350             }
351 
352             fprintf(f, COLOR_RESET);
353          }
354 
355          if (state->deltas.acquire_mem)
356             fprintf(f, "ACQUIRE_MEM");
357 
358          fprintf(f, "\n\n");
359       }
360    }
361 
362    /* Free. */
363    FREE(ctx.cur);
364    util_dynarray_foreach(&ctx.rolls, struct ac_context_reg_state *, iter) {
365       FREE(*iter);
366    }
367    util_dynarray_fini(&ctx.rolls);
368 }
369