1 /*
2 * Copyright © 2020 Valve Corporation
3 * SPDX-License-Identifier: MIT
4 */
5
6
7 #include "crashdec.h"
8
9
10 static void
dump_mem_pool_reg_write(unsigned reg,uint32_t data,unsigned context,bool pipe)11 dump_mem_pool_reg_write(unsigned reg, uint32_t data, unsigned context,
12 bool pipe)
13 {
14 /* TODO deal better somehow w/ 64b regs: */
15 struct regacc r = {
16 .rnn = pipe ? rnn_pipe : NULL,
17 .regbase = reg,
18 .value = data,
19 };
20 if (pipe) {
21 struct rnndecaddrinfo *info = rnn_reginfo(rnn_pipe, reg);
22 printf("\t\twrite %s (%02x) pipe\n", info->name, reg);
23
24 if (!strcmp(info->typeinfo->name, "void")) {
25 /* registers that ignore their payload */
26 } else {
27 printf("\t\t\t");
28 dump_register(&r);
29 }
30 rnn_reginfo_free(info);
31 } else {
32 printf("\t\twrite %s (%05x)", regname(reg, 1), reg);
33
34 if (is_a6xx()) {
35 printf(" context %d", context);
36 }
37
38 printf("\n");
39 dump_register_val(&r, 2);
40 }
41 }
42
/* Decode one 128-bit mem-pool chunk, which encodes up to two pending
 * register writes.  The packed bit layout below mirrors the hardware
 * format exactly and must not be rearranged.
 */
static void
dump_mem_pool_chunk(const uint32_t *chunk)
{
   struct __attribute__((packed)) {
      bool reg0_enabled : 1;
      bool reg1_enabled : 1;
      uint32_t data0 : 32;
      uint32_t data1 : 32;
      uint32_t reg0 : 18;
      uint32_t reg1 : 18;
      bool reg0_pipe : 1;
      bool reg1_pipe : 1;
      uint32_t reg0_context : 1;
      uint32_t reg1_context : 1;
      uint32_t padding : 22;
   } decoded;

   /* One chunk == four dwords. */
   memcpy(&decoded, chunk, 4 * sizeof(uint32_t));

   if (decoded.reg0_enabled) {
      dump_mem_pool_reg_write(decoded.reg0, decoded.data0,
                              decoded.reg0_context, decoded.reg0_pipe);
   }

   if (decoded.reg1_enabled) {
      dump_mem_pool_reg_write(decoded.reg1, decoded.data1,
                              decoded.reg1_context, decoded.reg1_pipe);
   }
}
72
73 void
dump_cp_mem_pool(uint32_t * mempool)74 dump_cp_mem_pool(uint32_t *mempool)
75 {
76 /* The mem pool is a shared pool of memory used for storing in-flight
77 * register writes. There are 6 different queues, one for each
78 * cluster. Writing to $data (or for some special registers, $addr)
79 * pushes data onto the appropriate queue, and each queue is pulled
80 * from by the appropriate cluster. The queues are thus written to
81 * in-order, but may be read out-of-order.
82 *
83 * The queues are conceptually divided into 128-bit "chunks", and the
84 * read and write pointers are in units of chunks. These chunks are
85 * organized internally into 8-chunk "blocks", and memory is allocated
86 * dynamically in terms of blocks. Each queue is represented as a
87 * singly-linked list of blocks, as well as 3-bit start/end chunk
88 * pointers that point within the first/last block. The next pointers
89 * are located in a separate array, rather than inline.
90 */
91
92 /* TODO: The firmware CP_MEM_POOL save/restore routines do something
93 * like:
94 *
95 * cread $02, [ $00 + 0 ]
96 * and $02, $02, 0x118
97 * ...
98 * brne $02, 0, #label
99 * mov $03, 0x2000
100 * mov $03, 0x1000
101 * label:
102 * ...
103 *
104 * I think that control register 0 is the GPU version, and some
105 * versions have a smaller mem pool. It seems some models have a mem
106 * pool that's half the size, and a bunch of offsets are shifted
107 * accordingly. Unfortunately the kernel driver's dumping code doesn't
108 * seem to take this into account, even the downstream android driver,
109 * and we don't know which versions 0x8, 0x10, or 0x100 correspond
110 * to. Or maybe we can use CP_DBG_MEM_POOL_SIZE to figure this out?
111 */
112 bool small_mem_pool = false;
113
114 /* The array of next pointers for each block. */
115 const uint32_t *next_pointers =
116 small_mem_pool ? &mempool[0x800] : &mempool[0x1000];
117
118 /* Maximum number of blocks in the pool, also the size of the pointers
119 * array.
120 */
121 const int num_blocks = small_mem_pool ? 0x30 : 0x80;
122
123 /* Number of queues */
124 const unsigned num_queues = is_a6xx() ? 6 : 7;
125
126 /* Unfortunately the per-queue state is a little more complicated than
127 * a simple pair of begin/end pointers. Instead of a single beginning
128 * block, there are *two*, with the property that either the two are
129 * equal or the second is the "next" of the first. Similarly there are
130 * two end blocks. Thus the queue either looks like this:
131 *
132 * A -> B -> ... -> C -> D
133 *
134 * Or like this, or some combination:
135 *
136 * A/B -> ... -> C/D
137 *
138 * However, there's only one beginning/end chunk offset. Now the
139 * question is, which of A or B is the actual start? I.e. is the chunk
140 * offset an offset inside A or B? It depends. I'll show a typical read
141 * cycle, starting here (read pointer marked with a *) with a chunk
142 * offset of 0:
143 *
144 * A B
145 * _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
146 * |_|_|_|_|_|_|_|_| -> |*|_|_|_|_|_|_|_| -> |_|_|_|_|_|_|_|_|
147 *
148 * Once the pointer advances far enough, the hardware decides to free
149 * A, after which the read-side state looks like:
150 *
151 * (free) A/B
152 * _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
153 * |_|_|_|_|_|_|_|_| |_|_|_|*|_|_|_|_| -> |_|_|_|_|_|_|_|_|
154 *
155 * Then after advancing the pointer a bit more, the hardware fetches
156 * the "next" pointer for A and stores it in B:
157 *
158 * (free) A B
159 * _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
160 * |_|_|_|_|_|_|_|_| |_|_|_|_|_|_|_|*| -> |_|_|_|_|_|_|_|_|
161 *
162 * Then the read pointer advances into B, at which point we've come
163 * back to the first state having advanced a whole block:
164 *
165 * (free) A B
166 * _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
167 * |_|_|_|_|_|_|_|_| |_|_|_|_|_|_|_|_| -> |*|_|_|_|_|_|_|_|
168 *
169 *
170 * There is a similar cycle for the write pointer. Now, the question
171 * is, how do we know which state we're in? We need to know this to
172 * know whether the pointer (*) is in A or B if they're different. It
173 * seems like there should be some bit somewhere describing this, but
174 * after lots of experimentation I've come up empty-handed. For now we
175 * assume that if the pointer is in the first half, then we're in
176 * either the first or second state and use B, and otherwise we're in
177 * the second or third state and use A. So far I haven't seen anything
178 * that violates this assumption.
179 */
180
181 struct {
182 uint32_t unk0;
183 uint32_t padding0[7]; /* Mirrors of unk0 */
184
185 struct {
186 uint32_t chunk : 3;
187 uint32_t first_block : 32 - 3;
188 } writer[6];
189 uint32_t padding1[2]; /* Mirror of writer[5] */
190
191 uint32_t unk1;
192 uint32_t padding2[7]; /* Mirrors of unk1 */
193
194 uint32_t writer_second_block[7];
195 uint32_t padding3[1];
196
197 uint32_t unk2[7];
198 uint32_t padding4[1];
199
200 struct {
201 uint32_t chunk : 3;
202 uint32_t first_block : 32 - 3;
203 } reader[7];
204 uint32_t padding5[1]; /* Mirror of reader[5] */
205
206 uint32_t unk3;
207 uint32_t padding6[7]; /* Mirrors of unk3 */
208
209 uint32_t reader_second_block[7];
210 uint32_t padding7[1];
211
212 uint32_t block_count[7];
213 uint32_t padding[1];
214
215 uint32_t unk4;
216 uint32_t padding9[7]; /* Mirrors of unk4 */
217 } data1;
218
219 const uint32_t *data1_ptr =
220 small_mem_pool ? &mempool[0xc00] : &mempool[0x1800];
221 memcpy(&data1, data1_ptr, sizeof(data1));
222
223 /* Based on the kernel, the first dword is the mem pool size (in
224 * blocks?) and mirrors CP_MEM_POOL_DBG_SIZE.
225 */
226 const uint32_t *data2_ptr =
227 small_mem_pool ? &mempool[0x1000] : &mempool[0x2000];
228 const int data2_size = 0x60;
229
230 /* This seems to be the size of each queue in chunks. */
231 const uint32_t *queue_sizes = &data2_ptr[0x18];
232
233 printf("\tdata2:\n");
234 dump_hex_ascii(data2_ptr, 4 * data2_size, 1);
235
236 /* These seem to be some kind of counter of allocated/deallocated blocks */
237 if (verbose) {
238 printf("\tunk0: %x\n", data1.unk0);
239 printf("\tunk1: %x\n", data1.unk1);
240 printf("\tunk3: %x\n", data1.unk3);
241 printf("\tunk4: %x\n\n", data1.unk4);
242 }
243
244 for (int queue = 0; queue < num_queues; queue++) {
245 const char *cluster_names_a6xx[6] = {"FE", "SP_VS", "PC_VS",
246 "GRAS", "SP_PS", "PS"};
247 const char *cluster_names_a7xx[7] = {"FE", "SP_VS", "PC_VS",
248 "GRAS", "SP_PS", "VPC_PS", "PS"};
249 printf("\tCLUSTER_%s:\n\n",
250 is_a6xx() ? cluster_names_a6xx[queue] : cluster_names_a7xx[queue]);
251
252 if (verbose) {
253 printf("\t\twriter_first_block: 0x%x\n",
254 data1.writer[queue].first_block);
255 printf("\t\twriter_second_block: 0x%x\n",
256 data1.writer_second_block[queue]);
257 printf("\t\twriter_chunk: %d\n", data1.writer[queue].chunk);
258 printf("\t\treader_first_block: 0x%x\n",
259 data1.reader[queue].first_block);
260 printf("\t\treader_second_block: 0x%x\n",
261 data1.reader_second_block[queue]);
262 printf("\t\treader_chunk: %d\n", data1.reader[queue].chunk);
263 printf("\t\tblock_count: %d\n", data1.block_count[queue]);
264 printf("\t\tunk2: 0x%x\n", data1.unk2[queue]);
265 printf("\t\tqueue_size: %d\n\n", queue_sizes[queue]);
266 }
267
268 uint32_t cur_chunk = data1.reader[queue].chunk;
269 uint32_t cur_block = cur_chunk > 3 ? data1.reader[queue].first_block
270 : data1.reader_second_block[queue];
271 uint32_t last_chunk = data1.writer[queue].chunk;
272 uint32_t last_block = last_chunk > 3 ? data1.writer[queue].first_block
273 : data1.writer_second_block[queue];
274
275 if (verbose)
276 printf("\tblock %x\n", cur_block);
277 if (cur_block >= num_blocks) {
278 fprintf(stderr, "block %x too large\n", cur_block);
279 exit(1);
280 }
281 unsigned calculated_queue_size = 0;
282 while (cur_block != last_block || cur_chunk != last_chunk) {
283 calculated_queue_size++;
284 uint32_t *chunk_ptr = &mempool[cur_block * 0x20 + cur_chunk * 4];
285
286 dump_mem_pool_chunk(chunk_ptr);
287
288 printf("\t%05x: %08x %08x %08x %08x\n",
289 4 * (cur_block * 0x20 + cur_chunk + 4), chunk_ptr[0],
290 chunk_ptr[1], chunk_ptr[2], chunk_ptr[3]);
291
292 cur_chunk++;
293 if (cur_chunk == 8) {
294 cur_block = next_pointers[cur_block];
295 if (verbose)
296 printf("\tblock %x\n", cur_block);
297 if (cur_block >= num_blocks) {
298 fprintf(stderr, "block %x too large\n", cur_block);
299 exit(1);
300 }
301 cur_chunk = 0;
302 }
303 }
304 if (calculated_queue_size != queue_sizes[queue]) {
305 printf("\t\tCALCULATED SIZE %d DOES NOT MATCH!\n",
306 calculated_queue_size);
307 }
308 printf("\n");
309 }
310 }
311
312