/*
 * Copyright © 2014 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include <inttypes.h>

#include "vc4_context.h"
#include "vc4_qir.h"
#include "vc4_qpu.h"
#include "util/ralloc.h"
#include "util/u_debug_cb.h"

static void
vc4_dump_program(struct vc4_compile *c)
{
        fprintf(stderr, "%s prog %d/%d QPU:\n",
                qir_get_stage_name(c->stage),
                c->program_id, c->variant_id);

        for (int i = 0; i < c->qpu_inst_count; i++) {
                fprintf(stderr, "0x%016"PRIx64" ", c->qpu_insts[i]);
                vc4_qpu_disasm(&c->qpu_insts[i], 1);
                fprintf(stderr, "\n");
        }
        fprintf(stderr, "\n");
}

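/* Appends a raw 64-bit QPU instruction to the block's pending instruction
 * list, which is consumed later by QPU scheduling.
 */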
static void
queue(struct qblock *block, uint64_t inst)
{
        struct queued_qpu_inst *q = rzalloc(block, struct queued_qpu_inst);
        q->inst = inst;
        list_addtail(&q->link, &block->qpu_inst_list);
}

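/* Returns a pointer to the most recently queued instruction, so callers can
 * patch in condition codes, pack modes, or signals after emission.
 */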
static uint64_t *
last_inst(struct qblock *block)
{
        struct queued_qpu_inst *q =
                (struct queued_qpu_inst *)block->qpu_inst_list.prev;
        return &q->inst;
}

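/* Helpers for applying a condition code to the add or mul half of the last
 * queued instruction.
 */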
static void
set_last_cond_add(struct qblock *block, uint32_t cond)
{
        *last_inst(block) = qpu_set_cond_add(*last_inst(block), cond);
}

static void
set_last_cond_mul(struct qblock *block, uint32_t cond)
{
        *last_inst(block) = qpu_set_cond_mul(*last_inst(block), cond);
}

/**
 * Some special registers can be read from either file, which lets us resolve
 * raddr conflicts without extra MOVs.
 */
static bool
swap_file(struct qpu_reg *src)
{
        switch (src->addr) {
        case QPU_R_UNIF:
        case QPU_R_VARY:
                if (src->mux == QPU_MUX_SMALL_IMM) {
                        return false;
                } else {
                        if (src->mux == QPU_MUX_A)
                                src->mux = QPU_MUX_B;
                        else
                                src->mux = QPU_MUX_A;
                        return true;
                }

        default:
                return false;
        }
}

/**
 * Sets up the VPM read FIFO before we do any VPM read.
 *
 * VPM reads (vertex attribute input) and VPM writes (varyings output) from
 * the QPU reuse the VRI (varying interpolation) block's FIFOs to talk to the
 * VPM block. In the VS/CS (unlike in the FS), the block starts out
 * uninitialized, and you need to emit setup to the block before any VPM
 * reads/writes.
 *
 * VRI has a FIFO in each direction, with each FIFO able to hold four
 * 32-bit-per-vertex values. VPM reads come through the read FIFO and VPM
 * writes go through the write FIFO. The read/write setup values from QPU go
 * through the write FIFO as well, with a sideband signal indicating that
 * they're setup values. Once a read setup reaches the other side of the
 * FIFO, the VPM block will start asynchronously reading vertex attributes and
 * filling the read FIFO -- that way hopefully the QPU doesn't have to block
 * on reads later.
 *
 * VPM read setup can configure 16 32-bit-per-vertex values to be read at a
 * time, which is 4 vec4s. If more than that is being read (since we support
 * 8 vec4 vertex attributes), then multiple read setup writes need to be done.
 *
 * The existence of the FIFO makes it seem like you should be able to emit
 * both setups for the 5-8 attribute cases and then do all the attribute
 * reads. However, once the setup value makes it to the other end of the
 * write FIFO, it will immediately update the VPM block's setup register.
 * That updated setup register would be used for read FIFO fills from then on,
 * breaking whatever remaining VPM values were supposed to be read into the
 * read FIFO from the previous attribute set.
 *
 * As a result, we need to emit the read setup, pull every VPM read value from
 * that setup, and only then emit the second setup if applicable.
 */
static void
setup_for_vpm_read(struct vc4_compile *c, struct qblock *block)
{
        if (c->num_inputs_in_fifo) {
                c->num_inputs_in_fifo--;
                return;
        }

        c->num_inputs_in_fifo = MIN2(c->num_inputs_remaining, 16);

        queue(block,
              qpu_load_imm_ui(qpu_vrsetup(),
                              c->vpm_read_offset |
                              0x00001a00 |
                              ((c->num_inputs_in_fifo & 0xf) << 20)));
        c->num_inputs_remaining -= c->num_inputs_in_fifo;
        c->vpm_read_offset += c->num_inputs_in_fifo;

        c->num_inputs_in_fifo--;
}

/**
 * This is used to resolve the fact that we might register-allocate two
 * different operands of an instruction to the same physical register file
 * even though instructions have only one field for the register file source
 * address.
 *
 * In that case, we need to move one to a temporary that can be used in the
 * instruction, instead. We reserve ra14/rb14 for this purpose.
 */
static void
fixup_raddr_conflict(struct qblock *block,
                     struct qpu_reg dst,
                     struct qpu_reg *src0, struct qpu_reg *src1,
                     struct qinst *inst, uint64_t *unpack)
{
        uint32_t mux0 = src0->mux == QPU_MUX_SMALL_IMM ? QPU_MUX_B : src0->mux;
        uint32_t mux1 = src1->mux == QPU_MUX_SMALL_IMM ? QPU_MUX_B : src1->mux;

        if (mux0 <= QPU_MUX_R5 ||
            mux0 != mux1 ||
            (src0->addr == src1->addr &&
             src0->mux == src1->mux)) {
                return;
        }

        if (swap_file(src0) || swap_file(src1))
                return;

        if (mux0 == QPU_MUX_A) {
                /* Make sure we use the same type of MOV as the instruction,
                 * in case of unpacks.
                 */
                if (qir_is_float_input(inst))
                        queue(block, qpu_a_FMAX(qpu_rb(14), *src0, *src0));
                else
                        queue(block, qpu_a_MOV(qpu_rb(14), *src0));

                /* If we had an unpack on this A-file source, we need to put
                 * it into this MOV, not into the later move from regfile B.
                 */
                if (inst->src[0].pack) {
                        *last_inst(block) |= *unpack;
                        *unpack = 0;
                }
                *src0 = qpu_rb(14);
        } else {
                queue(block, qpu_a_MOV(qpu_ra(14), *src0));
                *src0 = qpu_ra(14);
        }
}

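/* Applies the destination's pack mode to the last queued instruction, using
 * MUL-unit (PM) packing for multiplies and A-file packing otherwise, and
 * asserts that it doesn't clash with an already-set unpack mode.
 */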
static void
set_last_dst_pack(struct qblock *block, struct qinst *inst)
{
        ASSERTED bool had_pm = *last_inst(block) & QPU_PM;
        ASSERTED bool had_ws = *last_inst(block) & QPU_WS;
        ASSERTED uint32_t unpack = QPU_GET_FIELD(*last_inst(block), QPU_UNPACK);

        if (!inst->dst.pack)
                return;

        *last_inst(block) |= QPU_SET_FIELD(inst->dst.pack, QPU_PACK);

        if (qir_is_mul(inst)) {
                assert(!unpack || had_pm);
                *last_inst(block) |= QPU_PM;
        } else {
                assert(!unpack || !had_pm);
                assert(!had_ws); /* dst must be a-file to pack. */
        }
}

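/* SFU, TMU, and TLB color results land in the r4 accumulator. If the
 * destination isn't r4 itself, emit a conditional MOV out of r4; otherwise
 * only emit a NOP-destination MOV when the instruction needs to set flags.
 */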
static void
handle_r4_qpu_write(struct qblock *block, struct qinst *qinst,
                    struct qpu_reg dst)
{
        if (dst.mux != QPU_MUX_R4) {
                queue(block, qpu_a_MOV(dst, qpu_r4()));
                set_last_cond_add(block, qinst->cond);
        } else {
                assert(qinst->cond == QPU_COND_ALWAYS);
                if (qinst->sf)
                        queue(block, qpu_a_MOV(qpu_ra(QPU_W_NOP), qpu_r4()));
        }
}

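/* Translates one block of QIR instructions into queued QPU instructions,
 * using the temp assignments produced by the register allocator.
 */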
static void
vc4_generate_code_block(struct vc4_compile *c,
                        struct qblock *block,
                        struct qpu_reg *temp_registers)
{
        int last_vpm_read_index = -1;

        qir_for_each_inst(qinst, block) {
#if 0
                fprintf(stderr, "translating qinst to qpu: ");
                qir_dump_inst(qinst);
                fprintf(stderr, "\n");
#endif

                static const struct {
                        uint32_t op;
                } translate[] = {
#define A(name) [QOP_##name] = {QPU_A_##name}
#define M(name) [QOP_##name] = {QPU_M_##name}
                        A(FADD),
                        A(FSUB),
                        A(FMIN),
                        A(FMAX),
                        A(FMINABS),
                        A(FMAXABS),
                        A(FTOI),
                        A(ITOF),
                        A(ADD),
                        A(SUB),
                        A(SHL),
                        A(SHR),
                        A(ASR),
                        A(MIN),
                        A(MAX),
                        A(AND),
                        A(OR),
                        A(XOR),
                        A(NOT),

                        M(FMUL),
                        M(V8MULD),
                        M(V8MIN),
                        M(V8MAX),
                        M(V8ADDS),
                        M(V8SUBS),
                        M(MUL24),

                        /* If we replicate src[0] out to src[1], this works
                         * out the same as a MOV.
                         */
                        [QOP_MOV] = { QPU_A_OR },
                        [QOP_FMOV] = { QPU_A_FMAX },
                        [QOP_MMOV] = { QPU_M_V8MIN },

                        [QOP_MIN_NOIMM] = { QPU_A_MIN },
                };

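                /* Translate the QIR sources into QPU register/mux encodings,
                 * capturing any unpack mode that has to be folded into the
                 * generated instruction.
                 */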
                uint64_t unpack = 0;
                struct qpu_reg src[ARRAY_SIZE(qinst->src)];
                for (int i = 0; i < qir_get_nsrc(qinst); i++) {
                        int index = qinst->src[i].index;
                        switch (qinst->src[i].file) {
                        case QFILE_NULL:
                        case QFILE_LOAD_IMM:
                                src[i] = qpu_rn(0);
                                break;
                        case QFILE_TEMP:
                                src[i] = temp_registers[index];
                                if (qinst->src[i].pack) {
                                        assert(!unpack ||
                                               unpack == qinst->src[i].pack);
                                        unpack = QPU_SET_FIELD(qinst->src[i].pack,
                                                               QPU_UNPACK);
                                        if (src[i].mux == QPU_MUX_R4)
                                                unpack |= QPU_PM;
                                }
                                break;
                        case QFILE_UNIF:
                                src[i] = qpu_unif();
                                break;
                        case QFILE_VARY:
                                src[i] = qpu_vary();
                                break;
                        case QFILE_SMALL_IMM:
                                src[i].mux = QPU_MUX_SMALL_IMM;
                                src[i].addr = qpu_encode_small_immediate(qinst->src[i].index);
                                /* This should only have returned a valid
                                 * small immediate field, not ~0 for failure.
                                 */
                                assert(src[i].addr <= 47);
                                break;
                        case QFILE_VPM:
                                setup_for_vpm_read(c, block);
                                assert((int)qinst->src[i].index >=
                                       last_vpm_read_index);
                                (void)last_vpm_read_index;
                                last_vpm_read_index = qinst->src[i].index;
                                src[i] = qpu_ra(QPU_R_VPM);
                                break;

                        case QFILE_FRAG_X:
                                src[i] = qpu_ra(QPU_R_XY_PIXEL_COORD);
                                break;
                        case QFILE_FRAG_Y:
                                src[i] = qpu_rb(QPU_R_XY_PIXEL_COORD);
                                break;
                        case QFILE_FRAG_REV_FLAG:
                                src[i] = qpu_rb(QPU_R_MS_REV_FLAGS);
                                break;
                        case QFILE_QPU_ELEMENT:
                                src[i] = qpu_ra(QPU_R_ELEM_QPU);
                                break;

                        case QFILE_TLB_COLOR_WRITE:
                        case QFILE_TLB_COLOR_WRITE_MS:
                        case QFILE_TLB_Z_WRITE:
                        case QFILE_TLB_STENCIL_SETUP:
                        case QFILE_TEX_S:
                        case QFILE_TEX_S_DIRECT:
                        case QFILE_TEX_T:
                        case QFILE_TEX_R:
                        case QFILE_TEX_B:
                                unreachable("bad qir src file");
                        }
                }

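                /* Pick the QPU write address corresponding to the QIR
                 * destination.
                 */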
                struct qpu_reg dst;
                switch (qinst->dst.file) {
                case QFILE_NULL:
                        dst = qpu_ra(QPU_W_NOP);
                        break;
                case QFILE_TEMP:
                        dst = temp_registers[qinst->dst.index];
                        break;
                case QFILE_VPM:
                        dst = qpu_ra(QPU_W_VPM);
                        break;

                case QFILE_TLB_COLOR_WRITE:
                        dst = qpu_tlbc();
                        break;

                case QFILE_TLB_COLOR_WRITE_MS:
                        dst = qpu_tlbc_ms();
                        break;

                case QFILE_TLB_Z_WRITE:
                        dst = qpu_ra(QPU_W_TLB_Z);
                        break;

                case QFILE_TLB_STENCIL_SETUP:
                        dst = qpu_ra(QPU_W_TLB_STENCIL_SETUP);
                        break;

                case QFILE_TEX_S:
                case QFILE_TEX_S_DIRECT:
                        dst = qpu_rb(QPU_W_TMU0_S);
                        break;

                case QFILE_TEX_T:
                        dst = qpu_rb(QPU_W_TMU0_T);
                        break;

                case QFILE_TEX_R:
                        dst = qpu_rb(QPU_W_TMU0_R);
                        break;

                case QFILE_TEX_B:
                        dst = qpu_rb(QPU_W_TMU0_B);
                        break;

                case QFILE_VARY:
                case QFILE_UNIF:
                case QFILE_SMALL_IMM:
                case QFILE_LOAD_IMM:
                case QFILE_FRAG_X:
                case QFILE_FRAG_Y:
                case QFILE_FRAG_REV_FLAG:
                case QFILE_QPU_ELEMENT:
                        assert(!"not reached");
                        break;
                }

                ASSERTED bool handled_qinst_cond = false;

                switch (qinst->op) {
                case QOP_RCP:
                case QOP_RSQ:
                case QOP_EXP2:
                case QOP_LOG2:
                        switch (qinst->op) {
                        case QOP_RCP:
                                queue(block, qpu_a_MOV(qpu_rb(QPU_W_SFU_RECIP),
                                                       src[0]) | unpack);
                                break;
                        case QOP_RSQ:
                                queue(block, qpu_a_MOV(qpu_rb(QPU_W_SFU_RECIPSQRT),
                                                       src[0]) | unpack);
                                break;
                        case QOP_EXP2:
                                queue(block, qpu_a_MOV(qpu_rb(QPU_W_SFU_EXP),
                                                       src[0]) | unpack);
                                break;
                        case QOP_LOG2:
                                queue(block, qpu_a_MOV(qpu_rb(QPU_W_SFU_LOG),
                                                       src[0]) | unpack);
                                break;
                        default:
                                abort();
                        }

                        handle_r4_qpu_write(block, qinst, dst);
                        handled_qinst_cond = true;

                        break;

                case QOP_LOAD_IMM:
                        assert(qinst->src[0].file == QFILE_LOAD_IMM);
                        queue(block, qpu_load_imm_ui(dst, qinst->src[0].index));
                        break;

                case QOP_LOAD_IMM_U2:
                        queue(block, qpu_load_imm_u2(dst, qinst->src[0].index));
                        break;

                case QOP_LOAD_IMM_I2:
                        queue(block, qpu_load_imm_i2(dst, qinst->src[0].index));
                        break;

                case QOP_ROT_MUL:
                        /* Rotation at the hardware level occurs on the inputs
                         * to the MUL unit, and they must be accumulators in
                         * order to have the time necessary to move things.
                         */
                        assert(src[0].mux <= QPU_MUX_R3);

                        queue(block,
                              qpu_m_rot(dst, src[0], qinst->src[1].index -
                                        QPU_SMALL_IMM_MUL_ROT) | unpack);
                        set_last_cond_mul(block, qinst->cond);
                        handled_qinst_cond = true;
                        set_last_dst_pack(block, qinst);
                        break;

                case QOP_MS_MASK:
                        src[1] = qpu_ra(QPU_R_MS_REV_FLAGS);
                        fixup_raddr_conflict(block, dst, &src[0], &src[1],
                                             qinst, &unpack);
                        queue(block, qpu_a_AND(qpu_ra(QPU_W_MS_FLAGS),
                                               src[0], src[1]) | unpack);
                        break;

                case QOP_FRAG_Z:
                case QOP_FRAG_W:
                        /* QOP_FRAG_Z/W don't emit instructions, just allocate
                         * the register to the Z/W payload.
                         */
                        break;

                case QOP_TLB_COLOR_READ:
                        queue(block, qpu_NOP());
                        *last_inst(block) = qpu_set_sig(*last_inst(block),
                                                        QPU_SIG_COLOR_LOAD);
                        handle_r4_qpu_write(block, qinst, dst);
                        handled_qinst_cond = true;
                        break;

                case QOP_VARY_ADD_C:
                        queue(block, qpu_a_FADD(dst, src[0], qpu_r5()) | unpack);
                        break;

                case QOP_TEX_RESULT:
                        queue(block, qpu_NOP());
                        *last_inst(block) = qpu_set_sig(*last_inst(block),
                                                        QPU_SIG_LOAD_TMU0);
                        handle_r4_qpu_write(block, qinst, dst);
                        handled_qinst_cond = true;
                        break;

                case QOP_THRSW:
                        queue(block, qpu_NOP());
                        *last_inst(block) = qpu_set_sig(*last_inst(block),
                                                        QPU_SIG_THREAD_SWITCH);
                        c->last_thrsw = last_inst(block);
                        break;

                case QOP_BRANCH:
                        /* The branch target will be updated at QPU scheduling
                         * time.
                         */
                        queue(block, (qpu_branch(qinst->cond, 0) |
                                      QPU_BRANCH_REL));
                        handled_qinst_cond = true;
                        break;

                case QOP_UNIFORMS_RESET:
                        fixup_raddr_conflict(block, dst, &src[0], &src[1],
                                             qinst, &unpack);

                        queue(block, qpu_a_ADD(qpu_ra(QPU_W_UNIFORMS_ADDRESS),
                                               src[0], src[1]));
                        break;

                default:
                        assert(qinst->op < ARRAY_SIZE(translate));
                        assert(translate[qinst->op].op != 0); /* NOPs */

                        /* Skip emitting the MOV if it's a no-op. */
                        if (qir_is_raw_mov(qinst) &&
                            dst.mux == src[0].mux && dst.addr == src[0].addr) {
                                break;
                        }

                        /* If we have only one source, put it in the second
                         * argument slot as well so that we don't take up
                         * another raddr just to get unused data.
                         */
                        if (qir_get_non_sideband_nsrc(qinst) == 1)
                                src[1] = src[0];

                        fixup_raddr_conflict(block, dst, &src[0], &src[1],
                                             qinst, &unpack);

                        if (qir_is_mul(qinst)) {
                                queue(block, qpu_m_alu2(translate[qinst->op].op,
                                                        dst,
                                                        src[0], src[1]) | unpack);
                                set_last_cond_mul(block, qinst->cond);
                        } else {
                                queue(block, qpu_a_alu2(translate[qinst->op].op,
                                                        dst,
                                                        src[0], src[1]) | unpack);
                                set_last_cond_add(block, qinst->cond);
                        }
                        handled_qinst_cond = true;
                        set_last_dst_pack(block, qinst);

                        break;
                }

                assert(qinst->cond == QPU_COND_ALWAYS ||
                       handled_qinst_cond);

                if (qinst->sf)
                        *last_inst(block) |= QPU_SF;
        }
}

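/* Top-level code generation: allocates registers, emits QPU code for each
 * block, schedules the instructions, and patches up the thread-switch and
 * program-end signals.
 */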
void
vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
{
        struct qblock *start_block = list_first_entry(&c->blocks,
                                                      struct qblock, link);

        struct qpu_reg *temp_registers = vc4_register_allocate(vc4, c);
        if (!temp_registers)
                return;

        switch (c->stage) {
        case QSTAGE_VERT:
        case QSTAGE_COORD:
                c->num_inputs_remaining = c->num_inputs;
                queue(start_block, qpu_load_imm_ui(qpu_vwsetup(), 0x00001a00));
                break;
        case QSTAGE_FRAG:
                break;
        }

        qir_for_each_block(block, c)
                vc4_generate_code_block(c, block, temp_registers);

        /* Switch the last SIG_THRSW instruction to SIG_LAST_THRSW.
         *
         * LAST_THRSW is a new signal in BCM2708B0 (including Raspberry Pi)
         * that ensures that a later thread doesn't try to lock the scoreboard
         * and terminate before an earlier-spawned thread on the same QPU, by
         * delaying switching back to the later shader until earlier has
         * finished. Otherwise, if the earlier thread was hitting the same
         * quad, the scoreboard would deadlock.
         */
        if (c->last_thrsw) {
                assert(QPU_GET_FIELD(*c->last_thrsw, QPU_SIG) ==
                       QPU_SIG_THREAD_SWITCH);
                *c->last_thrsw = ((*c->last_thrsw & ~QPU_SIG_MASK) |
                                  QPU_SET_FIELD(QPU_SIG_LAST_THREAD_SWITCH,
                                                QPU_SIG));
        }

        uint32_t cycles = qpu_schedule_instructions(c);
        uint32_t inst_count_at_schedule_time = c->qpu_inst_count;

        /* thread end can't have VPM write or read */
        if (QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_WADDR_ADD) == QPU_W_VPM ||
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_WADDR_MUL) == QPU_W_VPM ||
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_RADDR_A) == QPU_R_VPM ||
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_RADDR_B) == QPU_R_VPM) {
                qpu_serialize_one_inst(c, qpu_NOP());
        }

        /* thread end can't have uniform read */
        if (QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_RADDR_A) == QPU_R_UNIF ||
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_RADDR_B) == QPU_R_UNIF) {
                qpu_serialize_one_inst(c, qpu_NOP());
        }

        /* thread end can't have TLB operations */
        if (qpu_inst_is_tlb(c->qpu_insts[c->qpu_inst_count - 1]))
                qpu_serialize_one_inst(c, qpu_NOP());

        /* Make sure there's no existing signal set (like for a small
         * immediate)
         */
        if (QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_SIG) != QPU_SIG_NONE) {
                qpu_serialize_one_inst(c, qpu_NOP());
        }

        c->qpu_insts[c->qpu_inst_count - 1] =
                qpu_set_sig(c->qpu_insts[c->qpu_inst_count - 1],
                            QPU_SIG_PROG_END);
        qpu_serialize_one_inst(c, qpu_NOP());
        qpu_serialize_one_inst(c, qpu_NOP());

        switch (c->stage) {
        case QSTAGE_VERT:
        case QSTAGE_COORD:
                break;
        case QSTAGE_FRAG:
                c->qpu_insts[c->qpu_inst_count - 1] =
                        qpu_set_sig(c->qpu_insts[c->qpu_inst_count - 1],
                                    QPU_SIG_SCOREBOARD_UNLOCK);
                break;
        }

        cycles += c->qpu_inst_count - inst_count_at_schedule_time;

        if (VC4_DBG(SHADERDB)) {
                util_debug_message(&vc4->base.debug, SHADER_INFO,
                                   "%s shader: %d inst, %d threads, %d uniforms, %d max-temps, %d estimated-cycles",
                                   qir_get_stage_name(c->stage),
                                   c->qpu_inst_count,
                                   1 + c->fs_threaded,
                                   c->num_uniforms,
                                   c->max_reg_pressure,
                                   cycles);
        }

        if (VC4_DBG(QPU))
                vc4_dump_program(c);

        vc4_qpu_validate(c->qpu_insts, c->qpu_inst_count);

        free(temp_registers);
}