/*
 * Copyright © 2020 Valve Corporation
 * SPDX-License-Identifier: MIT
 */

#include "crashdec.h"

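/* Decode and print a single register write recovered from a mem pool chunk.
 * Pipe registers are decoded against the pipe register database; normal
 * registers additionally print which context the write targets on a6xx.
 */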
static void
dump_mem_pool_reg_write(unsigned reg, uint32_t data, unsigned context,
                        bool pipe)
{
   /* TODO deal better somehow w/ 64b regs: */
   struct regacc r = {
         .rnn = pipe ? rnn_pipe : NULL,
         .regbase = reg,
         .value = data,
   };
   if (pipe) {
      struct rnndecaddrinfo *info = rnn_reginfo(rnn_pipe, reg);
      printf("\t\twrite %s (%02x) pipe\n", info->name, reg);

      if (!strcmp(info->typeinfo->name, "void")) {
         /* registers that ignore their payload */
      } else {
         printf("\t\t\t");
         dump_register(&r);
      }
      rnn_reginfo_free(info);
   } else {
      printf("\t\twrite %s (%05x)", regname(reg, 1), reg);

      if (is_a6xx()) {
         printf(" context %d", context);
      }

      printf("\n");
      dump_register_val(&r, 2);
   }
}

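/* Each 128-bit chunk encodes up to two register writes: an enable bit,
 * payload, register offset, pipe flag, and context bit for each, unpacked
 * via the packed bitfield struct below.
 */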
static void
dump_mem_pool_chunk(const uint32_t *chunk)
{
   struct __attribute__((packed)) {
      bool reg0_enabled : 1;
      bool reg1_enabled : 1;
      uint32_t data0 : 32;
      uint32_t data1 : 32;
      uint32_t reg0 : 18;
      uint32_t reg1 : 18;
      bool reg0_pipe : 1;
      bool reg1_pipe : 1;
      uint32_t reg0_context : 1;
      uint32_t reg1_context : 1;
      uint32_t padding : 22;
   } fields;

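   /* The bitfields above are expected to add up to exactly one 128-bit
    * chunk (4 dwords), which is copied out of the queue wholesale here.
    */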
   memcpy(&fields, chunk, 4 * sizeof(uint32_t));

   if (fields.reg0_enabled) {
      dump_mem_pool_reg_write(fields.reg0, fields.data0, fields.reg0_context,
                              fields.reg0_pipe);
   }

   if (fields.reg1_enabled) {
      dump_mem_pool_reg_write(fields.reg1, fields.data1, fields.reg1_context,
                              fields.reg1_pipe);
   }
}

void
dump_cp_mem_pool(uint32_t *mempool)
{
   /* The mem pool is a shared pool of memory used for storing in-flight
    * register writes. There are 6 (a6xx) or 7 (a7xx) different queues, one
    * for each cluster. Writing to $data (or for some special registers,
    * $addr) pushes data onto the appropriate queue, and each queue is
    * pulled from by the appropriate cluster. The queues are thus written
    * to in-order, but may be read out-of-order.
    *
    * The queues are conceptually divided into 128-bit "chunks", and the
    * read and write pointers are in units of chunks.  These chunks are
    * organized internally into 8-chunk "blocks", and memory is allocated
    * dynamically in terms of blocks. Each queue is represented as a
    * singly-linked list of blocks, as well as 3-bit start/end chunk
    * pointers that point within the first/last block.  The next pointers
    * are located in a separate array, rather than inline.
    */

   /* TODO: The firmware CP_MEM_POOL save/restore routines do something
    * like:
    *
    * cread $02, [ $00 + 0 ]
    * and $02, $02, 0x118
    * ...
    * brne $02, 0, #label
    * mov $03, 0x2000
    * mov $03, 0x1000
    * label:
    * ...
    *
    * I think that control register 0 is the GPU version, and some
    * versions have a smaller mem pool. It seems some models have a mem
    * pool that's half the size, and a bunch of offsets are shifted
    * accordingly. Unfortunately the kernel driver's dumping code doesn't
    * seem to take this into account, even the downstream android driver,
    * and we don't know which versions 0x8, 0x10, or 0x100 correspond
    * to. Or maybe we can use CP_DBG_MEM_POOL_SIZE to figure this out?
    */
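   /* Hardcoded for now: all of the offsets below are selected based on
    * this, so it would need to be flipped by hand to decode a dump from a
    * GPU with the smaller mem pool.
    */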
   bool small_mem_pool = false;

   /* The array of next pointers for each block. */
   const uint32_t *next_pointers =
      small_mem_pool ? &mempool[0x800] : &mempool[0x1000];

   /* Maximum number of blocks in the pool, also the size of the pointers
    * array.
    */
   const int num_blocks = small_mem_pool ? 0x30 : 0x80;

   /* Number of queues */
   const unsigned num_queues = is_a6xx() ? 6 : 7;

   /* Unfortunately the per-queue state is a little more complicated than
    * a simple pair of begin/end pointers. Instead of a single beginning
    * block, there are *two*, with the property that either the two are
    * equal or the second is the "next" of the first. Similarly there are
    * two end blocks. Thus the queue either looks like this:
    *
    * A -> B -> ... -> C -> D
    *
    * Or like this, or some combination:
    *
    * A/B -> ... -> C/D
    *
    * However, there's only one beginning/end chunk offset. Now the
    * question is, which of A or B is the actual start? I.e. is the chunk
    * offset an offset inside A or B? It depends. I'll show a typical read
    * cycle, starting here (read pointer marked with a *) with a chunk
    * offset of 0:
    *
    *     A                    B
    *  _ _ _ _ _ _ _ _      _ _ _ _ _ _ _ _      _ _ _ _ _ _ _ _
    * |_|_|_|_|_|_|_|_| -> |*|_|_|_|_|_|_|_| -> |_|_|_|_|_|_|_|_|
    *
    * Once the pointer advances far enough, the hardware decides to free
    * A, after which the read-side state looks like:
    *
    *   (free)                A/B
    *  _ _ _ _ _ _ _ _      _ _ _ _ _ _ _ _      _ _ _ _ _ _ _ _
    * |_|_|_|_|_|_|_|_|    |_|_|_|*|_|_|_|_| -> |_|_|_|_|_|_|_|_|
    *
    * Then after advancing the pointer a bit more, the hardware fetches
    * the "next" pointer for A and stores it in B:
    *
    *   (free)                 A                     B
    *  _ _ _ _ _ _ _ _      _ _ _ _ _ _ _ _      _ _ _ _ _ _ _ _
    * |_|_|_|_|_|_|_|_|    |_|_|_|_|_|_|_|*| -> |_|_|_|_|_|_|_|_|
    *
    * Then the read pointer advances into B, at which point we've come
    * back to the first state having advanced a whole block:
    *
    *   (free)                 A                     B
    *  _ _ _ _ _ _ _ _      _ _ _ _ _ _ _ _      _ _ _ _ _ _ _ _
    * |_|_|_|_|_|_|_|_|    |_|_|_|_|_|_|_|_| -> |*|_|_|_|_|_|_|_|
    *
    *
    * There is a similar cycle for the write pointer. Now, the question
    * is, how do we know which state we're in? We need to know this to
    * know whether the pointer (*) is in A or B if they're different. It
    * seems like there should be some bit somewhere describing this, but
    * after lots of experimentation I've come up empty-handed. For now we
    * assume that if the pointer is in the first half, then we're in
    * either the first or second state and use B, and otherwise we're in
    * the second or third state and use A. So far I haven't seen anything
    * that violates this assumption.
    */

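   /* Per-queue read/write state as it appears in the dump. This layout is
    * reverse-engineered: the unkN fields are not understood (a few of them
    * are printed in verbose mode below) and the paddingN slots appear to
    * be mirrors of neighboring values.
    */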
   struct {
      uint32_t unk0;
      uint32_t padding0[7]; /* Mirrors of unk0 */

      struct {
         uint32_t chunk : 3;
         uint32_t first_block : 32 - 3;
      } writer[7];
      uint32_t padding1[1]; /* Mirror of writer[6] */

      uint32_t unk1;
      uint32_t padding2[7]; /* Mirrors of unk1 */

      uint32_t writer_second_block[7];
      uint32_t padding3[1];

      uint32_t unk2[7];
      uint32_t padding4[1];

      struct {
         uint32_t chunk : 3;
         uint32_t first_block : 32 - 3;
      } reader[7];
      uint32_t padding5[1]; /* Mirror of reader[6] */

      uint32_t unk3;
      uint32_t padding6[7]; /* Mirrors of unk3 */

      uint32_t reader_second_block[7];
      uint32_t padding7[1];

      uint32_t block_count[7];
      uint32_t padding8[1];

      uint32_t unk4;
      uint32_t padding9[7]; /* Mirrors of unk4 */
   } data1;

   const uint32_t *data1_ptr =
      small_mem_pool ? &mempool[0xc00] : &mempool[0x1800];
   memcpy(&data1, data1_ptr, sizeof(data1));

   /* Based on the kernel, the first dword is the mem pool size (in
    * blocks?) and mirrors CP_MEM_POOL_DBG_SIZE.
    */
   const uint32_t *data2_ptr =
      small_mem_pool ? &mempool[0x1000] : &mempool[0x2000];
   const int data2_size = 0x60;

   /* This seems to be the size of each queue in chunks. */
   const uint32_t *queue_sizes = &data2_ptr[0x18];

   printf("\tdata2:\n");
   dump_hex_ascii(data2_ptr, 4 * data2_size, 1);

   /* These seem to be counters of some kind for allocated/deallocated blocks */
   if (verbose) {
      printf("\tunk0: %x\n", data1.unk0);
      printf("\tunk1: %x\n", data1.unk1);
      printf("\tunk3: %x\n", data1.unk3);
      printf("\tunk4: %x\n\n", data1.unk4);
   }

   for (int queue = 0; queue < num_queues; queue++) {
      const char *cluster_names_a6xx[6] = {"FE",   "SP_VS", "PC_VS",
                                           "GRAS", "SP_PS", "PS"};
      const char *cluster_names_a7xx[7] = {"FE",   "SP_VS", "PC_VS",
                                           "GRAS", "SP_PS", "VPC_PS", "PS"};
      printf("\tCLUSTER_%s:\n\n",
             is_a6xx() ? cluster_names_a6xx[queue] : cluster_names_a7xx[queue]);

      if (verbose) {
         printf("\t\twriter_first_block: 0x%x\n",
                data1.writer[queue].first_block);
         printf("\t\twriter_second_block: 0x%x\n",
                data1.writer_second_block[queue]);
         printf("\t\twriter_chunk: %d\n", data1.writer[queue].chunk);
         printf("\t\treader_first_block: 0x%x\n",
                data1.reader[queue].first_block);
         printf("\t\treader_second_block: 0x%x\n",
                data1.reader_second_block[queue]);
         printf("\t\treader_chunk: %d\n", data1.reader[queue].chunk);
         printf("\t\tblock_count: %d\n", data1.block_count[queue]);
         printf("\t\tunk2: 0x%x\n", data1.unk2[queue]);
         printf("\t\tqueue_size: %d\n\n", queue_sizes[queue]);
      }

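      /* Apply the heuristic from the comment above: a chunk offset in the
       * second half of a block (> 3) is assumed to still be in the first
       * block, while an offset in the first half is assumed to have
       * already advanced into the second block.
       */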
      uint32_t cur_chunk = data1.reader[queue].chunk;
      uint32_t cur_block = cur_chunk > 3 ? data1.reader[queue].first_block
                                         : data1.reader_second_block[queue];
      uint32_t last_chunk = data1.writer[queue].chunk;
      uint32_t last_block = last_chunk > 3 ? data1.writer[queue].first_block
                                           : data1.writer_second_block[queue];

      if (verbose)
         printf("\tblock %x\n", cur_block);
      if (cur_block >= num_blocks) {
         fprintf(stderr, "block %x too large\n", cur_block);
         exit(1);
      }
      unsigned calculated_queue_size = 0;
      while (cur_block != last_block || cur_chunk != last_chunk) {
         calculated_queue_size++;
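         /* Each block is 8 chunks of 4 dwords (0x20 dwords total), so
          * chunk N of block M starts at dword M * 0x20 + N * 4, e.g.
          * block 2, chunk 5 is at dword 0x54 (byte offset 0x150).
          */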
         uint32_t *chunk_ptr = &mempool[cur_block * 0x20 + cur_chunk * 4];

         dump_mem_pool_chunk(chunk_ptr);

         printf("\t%05x: %08x %08x %08x %08x\n",
                4 * (cur_block * 0x20 + cur_chunk * 4), chunk_ptr[0],
                chunk_ptr[1], chunk_ptr[2], chunk_ptr[3]);

         cur_chunk++;
         if (cur_chunk == 8) {
            cur_block = next_pointers[cur_block];
            if (verbose)
               printf("\tblock %x\n", cur_block);
            if (cur_block >= num_blocks) {
               fprintf(stderr, "block %x too large\n", cur_block);
               exit(1);
            }
            cur_chunk = 0;
         }
      }
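      /* A mismatch here may mean the A/B heuristic above picked the wrong
       * starting block, or that the queue_sizes interpretation is off.
       */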
      if (calculated_queue_size != queue_sizes[queue]) {
         printf("\t\tCALCULATED SIZE %d DOES NOT MATCH!\n",
                calculated_queue_size);
      }
      printf("\n");
   }
}