1 /*
2 * Copyright © 2021 Google, Inc.
3 * SPDX-License-Identifier: MIT
4 */
5
6 #include <assert.h>
7 #include <ctype.h>
8 #include <errno.h>
9 #include <stdio.h>
10 #include <stdlib.h>
11 #include <string.h>
12 #include <sys/mman.h>
13 #include <unistd.h>
14
15 #include "util/u_math.h"
16
17 #include "freedreno_pm4.h"
18
19 #include "afuc-isa.h"
20
21 #include "emu.h"
22 #include "util.h"
23
/* Rotate-left helpers.
 *
 * NOTE(review): for r == 0 the right-shift count equals the operand
 * width, which is undefined behavior in C (C11 6.5.7); likewise rotl64
 * applied to a 32-bit operand shifts by >= 32 after promotion.  The
 * emulator currently relies on typical compiler/x86 behavior here --
 * TODO confirm against actual afuc rotate semantics before "fixing".
 */
#define rotl32(x,r) (((x) << (r)) | ((x) >> (32 - (r))))
#define rotl64(x,r) (((x) << (r)) | ((x) >> (64 - (r))))

/* Register accessors used from multiple functions below: */
EMU_SQE_REG(SP);
EMU_SQE_REG(STACK0);
EMU_CONTROL_REG(DRAW_STATE_SET_HDR);
30
31 /**
32 * AFUC emulator. Currently only supports a6xx
33 *
34 * TODO to add a5xx it might be easier to compile this multiple times
35 * with conditional compile to deal with differences between generations.
36 */
37
/**
 * Execute a single ALU opcode on 32-bit operands and return the result.
 *
 * ADD/SUB additionally update emu->carry so that the ADDHI/SUBHI
 * variants can implement 64-bit arithmetic across two instructions.
 * Exits the emulator on an unhandled opcode.
 */
static uint32_t
emu_alu(struct emu *emu, afuc_opc opc, uint32_t src1, uint32_t src2)
{
   uint64_t tmp;
   switch (opc) {
   case OPC_ADD:
      /* Do the add in 64b so the carry-out lands in bit 32: */
      tmp = (uint64_t)src1 + (uint64_t)src2;
      emu->carry = tmp >> 32;
      return (uint32_t)tmp;
   case OPC_ADDHI:
      /* High half of a 64b add, consumes carry from a preceding OPC_ADD: */
      return src1 + src2 + emu->carry;
   case OPC_SUB:
      /* On borrow the upper 32b of tmp are all-ones, so the saved carry
       * makes OPC_SUBHI's "+ carry" act as a borrow (i.e. -1):
       */
      tmp = (uint64_t)src1 - (uint64_t)src2;
      emu->carry = tmp >> 32;
      return (uint32_t)tmp;
   case OPC_SUBHI:
      return src1 - src2 + emu->carry;
   case OPC_AND:
      return src1 & src2;
   case OPC_OR:
      return src1 | src2;
   case OPC_XOR:
      return src1 ^ src2;
   case OPC_NOT:
      return ~src1;
   case OPC_SHL:
      return src1 << src2;
   case OPC_USHR:
      return src1 >> src2;
   case OPC_ISHR:
      /* Arithmetic (sign-extending) shift right: */
      return (int32_t)src1 >> src2;
   case OPC_ROT:
      /* Negative rotate amounts rotate through a 64-bit intermediate.
       * NOTE(review): a rotate amount of 0 (or the 64b macro on a 32b
       * operand) hits a shift >= type width, which is UB in C -- see
       * the note on the rotl macros; TODO confirm hw semantics.
       */
      if (src2 & 0x80000000)
         return rotl64(src1, -*(int32_t *)&src2);
      else
         return rotl32(src1, src2);
   case OPC_MUL8:
      /* 8-bit x 8-bit multiply of the low bytes: */
      return (src1 & 0xff) * (src2 & 0xff);
   case OPC_MIN:
      return MIN2(src1, src2);
   case OPC_MAX:
      return MAX2(src1, src2);
   case OPC_CMP:
      /* Returns distinct bit patterns for gt/eq/lt, presumably tested
       * by later branch-on-bit instructions -- TODO confirm which bits
       * the fw actually consumes:
       */
      if (src1 > src2)
         return 0x00;
      else if (src1 == src2)
         return 0x2b;
      return 0x1e;
   case OPC_BIC:
      /* Bit-clear: src1 AND NOT src2 */
      return src1 & ~src2;
   case OPC_MSB:
      /* Index of the most-significant set bit (0 if no bits set): */
      if (!src2)
         return 0;
      return util_last_bit(src2) - 1;
   case OPC_SETBIT: {
      /* src2 packs the bit index in the upper bits and the value to
       * write in bit 0:
       */
      unsigned bit = src2 >> 1;
      unsigned val = src2 & 1;
      return (src1 & ~(1u << bit)) | (val << bit);
   }
   default:
      printf("unhandled alu opc: 0x%02x\n", opc);
      exit(1);
   }
}
102
103 /**
104 * Helper to calculate load/store address based on LOAD_STORE_HI
105 */
106 static uintptr_t
load_store_addr(struct emu * emu,unsigned gpr)107 load_store_addr(struct emu *emu, unsigned gpr)
108 {
109 EMU_CONTROL_REG(LOAD_STORE_HI);
110
111 uintptr_t addr = emu_get_reg32(emu, &LOAD_STORE_HI);
112 addr <<= 32;
113
114 return addr + emu_get_gpr_reg(emu, gpr);
115 }
116
/**
 * Execute a single decoded instruction, updating emulator state.
 *
 * Branch/call/ret do not modify the pc directly; they record the
 * destination in emu->branch_target, which emu_step() applies after
 * the following (delay-slot) instruction has executed.
 */
static void
emu_instr(struct emu *emu, struct afuc_instr *instr)
{
   /* Snapshot $rem up front; used by the (xmovN) and (rep) handling: */
   uint32_t rem = emu_get_gpr_reg(emu, REG_REM);

   switch (instr->opc) {
   case OPC_NOP:
      break;
   case OPC_MSB:
   case OPC_ADD ... OPC_BIC: {
      /* Generic two-src ALU op; src2 may be an immediate.  The "peek"
       * flag controls whether reading a FIFO-backed src reg consumes it.
       */
      uint32_t val = emu_alu(emu, instr->opc,
                             emu_get_gpr_reg(emu, instr->src1),
                             instr->has_immed ? instr->immed :
                             emu_get_gpr_reg_alu(emu, instr->src2, instr->peek));
      emu_set_gpr_reg(emu, instr->dst, val);

      if (instr->xmov) {
         /* (xmovN) modifier: perform up to N extra moves from src2 to
          * $data, clamped by the remaining dword count in $rem, with a
          * state-change dump after each $rem decrement.
          */
         unsigned m = MIN2(instr->xmov, rem);

         assert(m <= 3);

         if (m == 1) {
            emu_set_gpr_reg(emu, REG_REM, --rem);
            emu_dump_state_change(emu);
            emu_set_gpr_reg(emu, REG_DATA,
                            emu_get_gpr_reg(emu, instr->src2));
         } else if (m == 2) {
            emu_set_gpr_reg(emu, REG_REM, --rem);
            emu_dump_state_change(emu);
            emu_set_gpr_reg(emu, REG_DATA,
                            emu_get_gpr_reg(emu, instr->src2));
            emu_set_gpr_reg(emu, REG_REM, --rem);
            emu_dump_state_change(emu);
            emu_set_gpr_reg(emu, REG_DATA,
                            emu_get_gpr_reg(emu, instr->src2));
         } else if (m == 3) {
            /* NOTE(review): the middle move targets instr->dst rather
             * than $data -- looks intentional (mirrors hw xmov quirk)
             * but worth confirming against the hw/docs.
             */
            emu_set_gpr_reg(emu, REG_REM, --rem);
            emu_dump_state_change(emu);
            emu_set_gpr_reg(emu, REG_DATA,
                            emu_get_gpr_reg(emu, instr->src2));
            emu_set_gpr_reg(emu, REG_REM, --rem);
            emu_dump_state_change(emu);
            emu_set_gpr_reg(emu, instr->dst,
                            emu_get_gpr_reg(emu, instr->src2));
            emu_set_gpr_reg(emu, REG_REM, --rem);
            emu_dump_state_change(emu);
            emu_set_gpr_reg(emu, REG_DATA,
                            emu_get_gpr_reg(emu, instr->src2));
         }
      }
      break;
   }
   case OPC_MOVI: {
      /* Move shifted immediate into dst: */
      uint32_t val = instr->immed << instr->shift;
      emu_set_gpr_reg(emu, instr->dst, val);
      break;
   }
   case OPC_SETBITI: {
      /* Set a single bit (immediate bit index): */
      uint32_t src = emu_get_gpr_reg(emu, instr->src1);
      emu_set_gpr_reg(emu, instr->dst, src | (1u << instr->bit));
      break;
   }
   case OPC_CLRBIT: {
      /* Clear a single bit (immediate bit index): */
      uint32_t src = emu_get_gpr_reg(emu, instr->src1);
      emu_set_gpr_reg(emu, instr->dst, src & ~(1u << instr->bit));
      break;
   }
   case OPC_UBFX: {
      /* Unsigned bitfield extract, bits [lo..hi] inclusive: */
      uint32_t src = emu_get_gpr_reg(emu, instr->src1);
      unsigned lo = instr->bit, hi = instr->immed;
      uint32_t dst = (src >> lo) & BITFIELD_MASK(hi - lo + 1);
      emu_set_gpr_reg(emu, instr->dst, dst);
      break;
   }
   case OPC_BFI: {
      /* Bitfield insert: OR the low (hi-lo+1) bits of src into dst at
       * position lo (existing dst bits are not cleared first):
       */
      uint32_t src = emu_get_gpr_reg(emu, instr->src1);
      unsigned lo = instr->bit, hi = instr->immed;
      src = (src & BITFIELD_MASK(hi - lo + 1)) << lo;
      emu_set_gpr_reg(emu, instr->dst, emu_get_gpr_reg(emu, instr->dst) | src);
      break;
   }
   case OPC_CWRITE: {
      /* Write src1 to control reg (src2 + immed), optionally
       * pre-incrementing the src2 addressing register:
       */
      uint32_t src1 = emu_get_gpr_reg(emu, instr->src1);
      uint32_t src2 = emu_get_gpr_reg(emu, instr->src2);
      uint32_t reg = src2 + instr->immed;

      if (instr->preincrement) {
         emu_set_gpr_reg(emu, instr->src2, reg);
      }

      emu_set_control_reg(emu, reg, src1);

      /* (sdsN) modifier: consume N further values from src1 (which may
       * be FIFO-backed, hence the re-read each iteration):
       */
      for (unsigned i = 0; i < instr->sds; i++) {
         uint32_t src1 = emu_get_gpr_reg(emu, instr->src1);

         /* TODO: There is likely a DRAW_STATE_SET_BASE register on a6xx, as
          * there is on a7xx, and we should be writing that instead of setting
          * the base directly.
          */
         if (reg == emu_reg_offset(&DRAW_STATE_SET_HDR))
            emu_set_draw_state_base(emu, i, src1);
      }
      break;
   }
   case OPC_CREAD: {
      /* Read control reg (src1 + immed) into dst: */
      uint32_t src1 = emu_get_gpr_reg(emu, instr->src1);

      if (instr->preincrement) {
         emu_set_gpr_reg(emu, instr->src1, src1 + instr->immed);
      }

      emu_set_gpr_reg(emu, instr->dst,
                      emu_get_control_reg(emu, src1 + instr->immed));
      break;
   }
   case OPC_SWRITE: {
      /* Write src1 to SQE reg (src2 + immed): */
      uint32_t src1 = emu_get_gpr_reg(emu, instr->src1);
      uint32_t src2 = emu_get_gpr_reg(emu, instr->src2);

      if (instr->preincrement) {
         emu_set_gpr_reg(emu, instr->src2, src2 + instr->immed);
      }

      emu_set_sqe_reg(emu, src2 + instr->immed, src1);
      break;
   }
   case OPC_SREAD: {
      /* Read SQE reg (src1 + immed) into dst: */
      uint32_t src1 = emu_get_gpr_reg(emu, instr->src1);

      if (instr->preincrement) {
         emu_set_gpr_reg(emu, instr->src1, src1 + instr->immed);
      }

      emu_set_gpr_reg(emu, instr->dst,
                      emu_get_sqe_reg(emu, src1 + instr->immed));
      break;
   }
   case OPC_LOAD: {
      /* Load a dword from memory at (LOAD_STORE_HI:src1 + immed).
       * Note the address is computed before the pre-increment.
       */
      uintptr_t addr = load_store_addr(emu, instr->src1) +
            instr->immed;

      if (instr->preincrement) {
         uint32_t src1 = emu_get_gpr_reg(emu, instr->src1);
         emu_set_gpr_reg(emu, instr->src1, src1 + instr->immed);
      }

      uint32_t val = emu_mem_read_dword(emu, addr);

      emu_set_gpr_reg(emu, instr->dst, val);

      break;
   }
   case OPC_STORE: {
      /* Store src1 to memory at (LOAD_STORE_HI:src2 + immed): */
      uintptr_t addr = load_store_addr(emu, instr->src2) +
            instr->immed;

      if (instr->preincrement) {
         uint32_t src2 = emu_get_gpr_reg(emu, instr->src2);
         emu_set_gpr_reg(emu, instr->src2, src2 + instr->immed);
      }

      uint32_t val = emu_get_gpr_reg(emu, instr->src1);

      emu_mem_write_dword(emu, addr, val);

      break;
   }
   case OPC_BRNEI ... OPC_BREQB: {
      /* Conditional branches: *I compare against an immediate, *B test
       * a single bit.  Target is pc-relative; taken branches only set
       * branch_target (delay-slot semantics, applied in emu_step()).
       */
      uint32_t off = emu->gpr_regs.pc + instr->offset;
      uint32_t src = emu_get_gpr_reg(emu, instr->src1);

      if (instr->opc == OPC_BRNEI) {
         if (src != instr->immed)
            emu->branch_target = off;
      } else if (instr->opc == OPC_BREQI) {
         if (src == instr->immed)
            emu->branch_target = off;
      } else if (instr->opc == OPC_BRNEB) {
         if (!(src & (1 << instr->bit)))
            emu->branch_target = off;
      } else if (instr->opc == OPC_BREQB) {
         if (src & (1 << instr->bit))
            emu->branch_target = off;
      } else {
         assert(0);
      }
      break;
   }
   case OPC_RET: {
      /* Pop the return address off the SQE-reg stack: */
      unsigned sp = emu_get_reg32(emu, &SP);
      assert(sp > 0);

      /* counter-part to 'call' instruction, also has a delay slot: */
      emu->branch_target = emu_get_sqe_reg(emu, emu_reg_offset(&STACK0) + sp - 1);
      emu_set_reg32(emu, &SP, sp - 1);

      break;
   }
   case OPC_CALL: {
      /* Push return address and branch to the literal target: */
      unsigned sp = emu_get_reg32(emu, &SP);
      assert(sp + emu_reg_offset(&STACK0) < ARRAY_SIZE(emu->sqe_regs.val));

      /* call looks to have same delay-slot behavior as branch/etc, so
       * presumably the return PC is two instructions later:
       */
      emu_set_sqe_reg(emu, emu_reg_offset(&STACK0) + sp, emu->gpr_regs.pc + 2);
      emu_set_reg32(emu, &SP, sp + 1);
      emu->branch_target = instr->literal;

      break;
   }
   case OPC_WAITIN: {
      /* Wait for next input packet; emu_step() handles dispatching the
       * next packet through the jump table.
       */
      assert(!emu->branch_target);
      emu->run_mode = false;
      emu->waitin = true;
      break;
   }
   case OPC_BL: {
      /* Branch-and-link: return address (pc+2, see OPC_CALL note) goes
       * to $lr rather than the stack:
       */
      emu_set_gpr_reg(emu, REG_LR, emu->gpr_regs.pc + 2);
      emu->branch_target = instr->literal;
      break;
   }
   case OPC_JUMPR: {
      /* Indirect jump through a GPR: */
      emu->branch_target = emu_get_gpr_reg(emu, instr->src1);
      break;
   }
   case OPC_SRET: {
      /* Return via $lr (counter-part to OPC_BL): */
      emu->branch_target = emu_get_gpr_reg(emu, REG_LR);
      /* TODO: read $sp and check for stack overflow? */
      break;
   }
   case OPC_SETSECURE: {
      // TODO this acts like a conditional branch, but in which case
      // does it branch?
      break;
   }
   default:
      printf("unhandled opc: 0x%02x\n", instr->opc);
      exit(1);
   }

   /* (rep) instructions decrement $rem once per iteration; emu_step()
    * re-executes the instruction until $rem reaches zero:
    */
   if (instr->rep) {
      assert(rem > 0);
      emu_set_gpr_reg(emu, REG_REM, --rem);
   }
}
363
/**
 * Decode and execute a single instruction (including all iterations of
 * a (rep) instruction), handling delay-slot branches and WAITIN packet
 * dispatch.
 */
void
emu_step(struct emu *emu)
{
   struct afuc_instr *instr;
   bool decoded =
      afuc_isa_decode((void *)&instr, (void *)&emu->instrs[emu->gpr_regs.pc],
                      &(struct isa_decode_options){
                         .gpu_id = gpuver,
                      });

   if (!decoded) {
      uint32_t instr_val = emu->instrs[emu->gpr_regs.pc];
      if ((instr_val >> 27) == 0) {
         /* This is printed as an undecoded literal to show the immediate
          * payload, but when executing it's just a NOP.
          */
         instr = calloc(1, sizeof(struct afuc_instr));
         instr->opc = OPC_NOP;
      } else {
         printf("unmatched instruction: 0x%08x\n", instr_val);
         exit(1);
      }
   }

   emu_main_prompt(emu);

   /* Consume any branch target recorded by the *previous* instruction:
    * the current instruction is that branch's delay slot, so it runs
    * first and the pc is redirected afterwards.  A target of 0 means
    * "no branch pending" (presumably fw never branches to pc 0).
    */
   uint32_t branch_target = emu->branch_target;
   emu->branch_target = 0;

   /* Likewise a WAITIN from the previous instruction takes effect after
    * this (delay slot) instruction:
    */
   bool waitin = emu->waitin;
   emu->waitin = false;

   if (instr->rep) {
      /* (rep): re-execute until $rem hits zero.  emu_instr() decrements
       * $rem once per iteration.
       */
      do {
         if (!emu_get_gpr_reg(emu, REG_REM))
            break;

         emu_clear_state_change(emu);
         emu_instr(emu, instr);

         /* defer last state-change dump until after any
          * post-delay-slot handling below:
          */
         if (emu_get_gpr_reg(emu, REG_REM))
            emu_dump_state_change(emu);
      } while (true);
   } else {
      emu_clear_state_change(emu);
      emu_instr(emu, instr);
   }

   emu->gpr_regs.pc++;

   if (branch_target) {
      emu->gpr_regs.pc = branch_target;
   }

   if (waitin) {
      /* Dispatch the next packet: the header has been loaded into $1
       * by the firmware's packet-fetch code, and selects the handler
       * via the jump table.
       */
      uint32_t hdr = emu_get_gpr_reg(emu, 1);
      uint32_t id, count;

      if (pkt_is_type4(hdr)) {
         id = afuc_pm4_id("PKT4");
         count = type4_pkt_size(hdr);

         /* Possibly a hack, not sure what the hw actually
          * does here, but we want to mask out the pkt
          * type field from the hdr, so that PKT4 handler
          * doesn't see it and interpret it as part as the
          * register offset:
          */
         emu->gpr_regs.val[1] &= 0x0fffffff;
      } else if (pkt_is_type7(hdr)) {
         id = cp_type7_opcode(hdr);
         count = type7_pkt_size(hdr);
      } else {
         printf("Invalid opcode: 0x%08x\n", hdr);
         exit(1); /* GPU goes *boom* */
      }

      assert(id < ARRAY_SIZE(emu->jmptbl));

      emu_set_gpr_reg(emu, REG_REM, count);
      emu->gpr_regs.pc = emu->jmptbl[id];
   }

   emu_dump_state_change(emu);

   /* instr was heap-allocated either by afuc_isa_decode() or by the
    * undecoded-literal fallback above -- presumably decode uses a
    * free()-compatible allocator (TODO confirm):
    */
   free(instr);
}
454
455 void
emu_run_bootstrap(struct emu * emu)456 emu_run_bootstrap(struct emu *emu)
457 {
458 EMU_CONTROL_REG(THREAD_SYNC);
459
460 emu->quiet = true;
461 emu->run_mode = true;
462 emu->bootstrap_mode = true;
463 emu->bootstrap_finished = false;
464
465 if (gpuver == 6 && emu->processor == EMU_PROC_LPAC) {
466 /* Emulate what the SQE bootstrap routine does after launching LPAC */
467 emu_set_reg32(emu, &THREAD_SYNC, 1u << 0);
468 }
469
470 while (!emu->bootstrap_finished && !emu->waitin) {
471 emu_step(emu);
472 }
473
474 emu->bootstrap_mode = false;
475 }
476
477
478 static void
check_access(struct emu * emu,uintptr_t gpuaddr,unsigned sz)479 check_access(struct emu *emu, uintptr_t gpuaddr, unsigned sz)
480 {
481 if ((gpuaddr % sz) != 0) {
482 printf("unaligned access fault: %p\n", (void *)gpuaddr);
483 exit(1);
484 }
485
486 if ((gpuaddr + sz) >= EMU_MEMORY_SIZE) {
487 printf("iova fault: %p\n", (void *)gpuaddr);
488 exit(1);
489 }
490 }
491
492 uint32_t
emu_mem_read_dword(struct emu * emu,uintptr_t gpuaddr)493 emu_mem_read_dword(struct emu *emu, uintptr_t gpuaddr)
494 {
495 check_access(emu, gpuaddr, 4);
496 return *(uint32_t *)(emu->gpumem + gpuaddr);
497 }
498
499 static void
mem_write_dword(struct emu * emu,uintptr_t gpuaddr,uint32_t val)500 mem_write_dword(struct emu *emu, uintptr_t gpuaddr, uint32_t val)
501 {
502 check_access(emu, gpuaddr, 4);
503 *(uint32_t *)(emu->gpumem + gpuaddr) = val;
504 }
505
/* External interface for GPU memory writes: besides updating the
 * backing store, records the address so the state-change dump can show
 * the memory write.  The assert catches a second write before the
 * previous one was consumed -- presumably the dump code resets
 * gpumem_written back to ~0 (TODO confirm against emu.h).
 */
void
emu_mem_write_dword(struct emu *emu, uintptr_t gpuaddr, uint32_t val)
{
   mem_write_dword(emu, gpuaddr, val);
   assert(emu->gpumem_written == ~0);
   emu->gpumem_written = gpuaddr;
}
513
514 void
emu_init(struct emu * emu)515 emu_init(struct emu *emu)
516 {
517 emu->gpumem = mmap(NULL, EMU_MEMORY_SIZE,
518 PROT_READ | PROT_WRITE,
519 MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE,
520 0, 0);
521 if (emu->gpumem == MAP_FAILED) {
522 printf("Could not allocate GPU memory: %s\n", strerror(errno));
523 exit(1);
524 }
525
526 /* Copy the instructions into GPU memory: */
527 for (unsigned i = 0; i < emu->sizedwords; i++) {
528 mem_write_dword(emu, EMU_INSTR_BASE + (4 * i), emu->instrs[i]);
529 }
530
531 EMU_GPU_REG(CP_SQE_INSTR_BASE);
532 EMU_GPU_REG(CP_LPAC_SQE_INSTR_BASE);
533 EMU_CONTROL_REG(BV_INSTR_BASE);
534 EMU_CONTROL_REG(LPAC_INSTR_BASE);
535
536 /* Setup the address of the SQE fw, just use the normal CPU ptr address: */
537 switch (emu->processor) {
538 case EMU_PROC_SQE:
539 emu_set_reg64(emu, &CP_SQE_INSTR_BASE, EMU_INSTR_BASE);
540 break;
541 case EMU_PROC_BV:
542 emu_set_reg64(emu, &BV_INSTR_BASE, EMU_INSTR_BASE);
543 break;
544 case EMU_PROC_LPAC:
545 if (gpuver >= 7)
546 emu_set_reg64(emu, &LPAC_INSTR_BASE, EMU_INSTR_BASE);
547 else
548 emu_set_reg64(emu, &CP_LPAC_SQE_INSTR_BASE, EMU_INSTR_BASE);
549 break;
550 }
551
552 if (emu->fw_id == AFUC_A750) {
553 emu_set_control_reg(emu, 0, 7 << 28);
554 emu_set_control_reg(emu, 2, 0x40 << 8);
555 } else if (emu->fw_id == AFUC_A730 || emu->fw_id == AFUC_A740) {
556 emu_set_control_reg(emu, 0xef, 1 << 21);
557 emu_set_control_reg(emu, 0, 7 << 28);
558 } else if (emu->fw_id == AFUC_A660) {
559 emu_set_control_reg(emu, 0, 3 << 28);
560 } else if (emu->fw_id == AFUC_A650) {
561 emu_set_control_reg(emu, 0, 1 << 28);
562 }
563 }
564
565 void
emu_fini(struct emu * emu)566 emu_fini(struct emu *emu)
567 {
568 uint32_t *instrs = emu->instrs;
569 unsigned sizedwords = emu->sizedwords;
570 unsigned fw_id = emu->fw_id;
571
572 munmap(emu->gpumem, EMU_MEMORY_SIZE);
573 memset(emu, 0, sizeof(*emu));
574
575 emu->instrs = instrs;
576 emu->sizedwords = sizedwords;
577 emu->fw_id = fw_id;
578 }
579