/*
 * Copyright © 2014 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include <inttypes.h>

#include "vc4_context.h"
#include "vc4_qir.h"
#include "vc4_qpu.h"
#include "util/ralloc.h"
#include "util/u_debug_cb.h"

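/* Prints the raw encoding and disassembly of each QPU instruction in the
 * compiled program, for shader debug dumps.
 */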
static void
vc4_dump_program(struct vc4_compile *c)
{
        fprintf(stderr, "%s prog %d/%d QPU:\n",
                qir_get_stage_name(c->stage),
                c->program_id, c->variant_id);

        for (int i = 0; i < c->qpu_inst_count; i++) {
                fprintf(stderr, "0x%016"PRIx64" ", c->qpu_insts[i]);
                vc4_qpu_disasm(&c->qpu_insts[i], 1);
                fprintf(stderr, "\n");
        }
        fprintf(stderr, "\n");
}

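/* Appends a 64-bit QPU instruction to the block's list of pending
 * instructions, to be scheduled and serialized later.
 */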
static void
queue(struct qblock *block, uint64_t inst)
{
        struct queued_qpu_inst *q = rzalloc(block, struct queued_qpu_inst);
        q->inst = inst;
        list_addtail(&q->link, &block->qpu_inst_list);
}

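/* Returns a pointer to the most recently queued instruction, so that its
 * condition, signal, or pack fields can be patched in place.
 */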
static uint64_t *
last_inst(struct qblock *block)
{
        struct queued_qpu_inst *q =
                (struct queued_qpu_inst *)block->qpu_inst_list.prev;
        return &q->inst;
}

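/* Patches the condition code of the add-pipe operation in the last queued
 * instruction.
 */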
static void
set_last_cond_add(struct qblock *block, uint32_t cond)
{
        *last_inst(block) = qpu_set_cond_add(*last_inst(block), cond);
}

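/* Patches the condition code of the mul-pipe operation in the last queued
 * instruction.
 */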
static void
set_last_cond_mul(struct qblock *block, uint32_t cond)
{
        *last_inst(block) = qpu_set_cond_mul(*last_inst(block), cond);
}

/**
 * Some special registers can be read from either file, which lets us resolve
 * raddr conflicts without extra MOVs.
 */
static bool
swap_file(struct qpu_reg *src)
{
        switch (src->addr) {
        case QPU_R_UNIF:
        case QPU_R_VARY:
                if (src->mux == QPU_MUX_SMALL_IMM) {
                        return false;
                } else {
                        if (src->mux == QPU_MUX_A)
                                src->mux = QPU_MUX_B;
                        else
                                src->mux = QPU_MUX_A;
                        return true;
                }

        default:
                return false;
        }
}

/**
 * Sets up the VPM read FIFO before we do any VPM read.
 *
 * VPM reads (vertex attribute input) and VPM writes (varyings output) from
 * the QPU reuse the VRI (varying interpolation) block's FIFOs to talk to the
 * VPM block.  In the VS/CS (unlike in the FS), the block starts out
 * uninitialized, and you need to emit setup to the block before any VPM
 * reads/writes.
 *
 * VRI has a FIFO in each direction, with each FIFO able to hold four
 * 32-bit-per-vertex values.  VPM reads come through the read FIFO and VPM
 * writes go through the write FIFO.  The read/write setup values from QPU go
 * through the write FIFO as well, with a sideband signal indicating that
 * they're setup values.  Once a read setup reaches the other side of the
 * FIFO, the VPM block will start asynchronously reading vertex attributes and
 * filling the read FIFO -- that way hopefully the QPU doesn't have to block
 * on reads later.
 *
 * VPM read setup can configure 16 32-bit-per-vertex values to be read at a
 * time, which is 4 vec4s.  If more than that is being read (since we support
 * 8 vec4 vertex attributes), then multiple read setup writes need to be done.
 *
 * The existence of the FIFO makes it seem like you should be able to emit
 * both setups for the 5-8 attribute cases and then do all the attribute
 * reads.  However, once the setup value makes it to the other end of the
 * write FIFO, it will immediately update the VPM block's setup register.
 * That updated setup register would be used for read FIFO fills from then on,
 * breaking whatever remaining VPM values were supposed to be read into the
 * read FIFO from the previous attribute set.
 *
 * As a result, we need to emit the read setup, pull every VPM read value from
 * that setup, and only then emit the second setup if applicable.
 */
static void
setup_for_vpm_read(struct vc4_compile *c, struct qblock *block)
{
        if (c->num_inputs_in_fifo) {
                c->num_inputs_in_fifo--;
                return;
        }

        c->num_inputs_in_fifo = MIN2(c->num_inputs_remaining, 16);

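        /* Emit the read setup.  The 0x00001a00 constant appears to select
         * horizontal, 32-bit-wide, stride-1 accesses in the VPM generic
         * block read setup word described in the VideoCore IV architecture
         * reference; treat that field breakdown as informational rather
         * than authoritative.
         */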
        queue(block,
              qpu_load_imm_ui(qpu_vrsetup(),
                              c->vpm_read_offset |
                              0x00001a00 |
                              ((c->num_inputs_in_fifo & 0xf) << 20)));
        c->num_inputs_remaining -= c->num_inputs_in_fifo;
        c->vpm_read_offset += c->num_inputs_in_fifo;

        c->num_inputs_in_fifo--;
}

/**
 * This is used to resolve the fact that we might register-allocate two
 * different operands of an instruction to the same physical register file
 * even though instructions have only one field for the register file source
 * address.
 *
 * In that case, we need to move one to a temporary that can be used in the
 * instruction, instead.  We reserve ra14/rb14 for this purpose.
 */
static void
fixup_raddr_conflict(struct qblock *block,
                     struct qpu_reg dst,
                     struct qpu_reg *src0, struct qpu_reg *src1,
                     struct qinst *inst, uint64_t *unpack)
{
        uint32_t mux0 = src0->mux == QPU_MUX_SMALL_IMM ? QPU_MUX_B : src0->mux;
        uint32_t mux1 = src1->mux == QPU_MUX_SMALL_IMM ? QPU_MUX_B : src1->mux;

        if (mux0 <= QPU_MUX_R5 ||
            mux0 != mux1 ||
            (src0->addr == src1->addr &&
             src0->mux == src1->mux)) {
                return;
        }

        if (swap_file(src0) || swap_file(src1))
                return;

        if (mux0 == QPU_MUX_A) {
                /* Make sure we use the same type of MOV as the instruction,
                 * in case of unpacks.
                 */
                if (qir_is_float_input(inst))
                        queue(block, qpu_a_FMAX(qpu_rb(14), *src0, *src0));
                else
                        queue(block, qpu_a_MOV(qpu_rb(14), *src0));

                /* If we had an unpack on this A-file source, we need to put
                 * it into this MOV, not into the later move from regfile B.
                 */
                if (inst->src[0].pack) {
                        *last_inst(block) |= *unpack;
                        *unpack = 0;
                }
                *src0 = qpu_rb(14);
        } else {
                queue(block, qpu_a_MOV(qpu_ra(14), *src0));
                *src0 = qpu_ra(14);
        }
}

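/* Applies the instruction's destination pack mode to the last queued
 * instruction, setting QPU_PM when the pack happens in the mul pipe.
 */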
static void
set_last_dst_pack(struct qblock *block, struct qinst *inst)
{
        ASSERTED bool had_pm = *last_inst(block) & QPU_PM;
        ASSERTED bool had_ws = *last_inst(block) & QPU_WS;
        ASSERTED uint32_t unpack = QPU_GET_FIELD(*last_inst(block), QPU_UNPACK);

        if (!inst->dst.pack)
                return;

        *last_inst(block) |= QPU_SET_FIELD(inst->dst.pack, QPU_PACK);

        if (qir_is_mul(inst)) {
                assert(!unpack || had_pm);
                *last_inst(block) |= QPU_PM;
        } else {
                assert(!unpack || !had_pm);
                assert(!had_ws); /* dst must be a-file to pack. */
        }
}

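/* Copies a result that lands in the r4 accumulator (SFU, TMU, or TLB color
 * loads) into the destination register, honoring the instruction's
 * condition; if the result is consumed from r4 directly, only a flag-setting
 * NOP write is emitted when needed.
 */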
static void
handle_r4_qpu_write(struct qblock *block, struct qinst *qinst,
                    struct qpu_reg dst)
{
        if (dst.mux != QPU_MUX_R4) {
                queue(block, qpu_a_MOV(dst, qpu_r4()));
                set_last_cond_add(block, qinst->cond);
        } else {
                assert(qinst->cond == QPU_COND_ALWAYS);
                if (qinst->sf)
                        queue(block, qpu_a_MOV(qpu_ra(QPU_W_NOP), qpu_r4()));
        }
}

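/* Emits the QPU equivalent of each QIR instruction in a block onto the
 * block's instruction list.
 */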
static void
vc4_generate_code_block(struct vc4_compile *c,
                        struct qblock *block,
                        struct qpu_reg *temp_registers)
{
        int last_vpm_read_index = -1;

        qir_for_each_inst(qinst, block) {
#if 0
                fprintf(stderr, "translating qinst to qpu: ");
                qir_dump_inst(qinst);
                fprintf(stderr, "\n");
#endif

                static const struct {
                        uint32_t op;
                } translate[] = {
#define A(name) [QOP_##name] = {QPU_A_##name}
#define M(name) [QOP_##name] = {QPU_M_##name}
                        A(FADD),
                        A(FSUB),
                        A(FMIN),
                        A(FMAX),
                        A(FMINABS),
                        A(FMAXABS),
                        A(FTOI),
                        A(ITOF),
                        A(ADD),
                        A(SUB),
                        A(SHL),
                        A(SHR),
                        A(ASR),
                        A(MIN),
                        A(MAX),
                        A(AND),
                        A(OR),
                        A(XOR),
                        A(NOT),

                        M(FMUL),
                        M(V8MULD),
                        M(V8MIN),
                        M(V8MAX),
                        M(V8ADDS),
                        M(V8SUBS),
                        M(MUL24),

                        /* If we replicate src[0] out to src[1], this works
                         * out the same as a MOV.
                         */
                        [QOP_MOV] = { QPU_A_OR },
                        [QOP_FMOV] = { QPU_A_FMAX },
                        [QOP_MMOV] = { QPU_M_V8MIN },

                        [QOP_MIN_NOIMM] = { QPU_A_MIN },
                };

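                /* Translate each QIR source into a QPU mux/register pair,
                 * gathering any unpack bits that need to be folded into the
                 * generated instruction.
                 */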
                uint64_t unpack = 0;
                struct qpu_reg src[ARRAY_SIZE(qinst->src)];
                for (int i = 0; i < qir_get_nsrc(qinst); i++) {
                        int index = qinst->src[i].index;
                        switch (qinst->src[i].file) {
                        case QFILE_NULL:
                        case QFILE_LOAD_IMM:
                                src[i] = qpu_rn(0);
                                break;
                        case QFILE_TEMP:
                                src[i] = temp_registers[index];
                                if (qinst->src[i].pack) {
                                        assert(!unpack ||
                                               unpack == qinst->src[i].pack);
                                        unpack = QPU_SET_FIELD(qinst->src[i].pack,
                                                               QPU_UNPACK);
                                        if (src[i].mux == QPU_MUX_R4)
                                                unpack |= QPU_PM;
                                }
                                break;
                        case QFILE_UNIF:
                                src[i] = qpu_unif();
                                break;
                        case QFILE_VARY:
                                src[i] = qpu_vary();
                                break;
                        case QFILE_SMALL_IMM:
                                src[i].mux = QPU_MUX_SMALL_IMM;
                                src[i].addr = qpu_encode_small_immediate(qinst->src[i].index);
                                /* This should only have returned a valid
                                 * small immediate field, not ~0 for failure.
                                 */
                                assert(src[i].addr <= 47);
                                break;
                        case QFILE_VPM:
                                setup_for_vpm_read(c, block);
                                assert((int)qinst->src[i].index >=
                                       last_vpm_read_index);
                                (void)last_vpm_read_index;
                                last_vpm_read_index = qinst->src[i].index;
                                src[i] = qpu_ra(QPU_R_VPM);
                                break;

                        case QFILE_FRAG_X:
                                src[i] = qpu_ra(QPU_R_XY_PIXEL_COORD);
                                break;
                        case QFILE_FRAG_Y:
                                src[i] = qpu_rb(QPU_R_XY_PIXEL_COORD);
                                break;
                        case QFILE_FRAG_REV_FLAG:
                                src[i] = qpu_rb(QPU_R_MS_REV_FLAGS);
                                break;
                        case QFILE_QPU_ELEMENT:
                                src[i] = qpu_ra(QPU_R_ELEM_QPU);
                                break;

                        case QFILE_TLB_COLOR_WRITE:
                        case QFILE_TLB_COLOR_WRITE_MS:
                        case QFILE_TLB_Z_WRITE:
                        case QFILE_TLB_STENCIL_SETUP:
                        case QFILE_TEX_S:
                        case QFILE_TEX_S_DIRECT:
                        case QFILE_TEX_T:
                        case QFILE_TEX_R:
                        case QFILE_TEX_B:
                                unreachable("bad qir src file");
                        }
                }

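                /* Translate the QIR destination into a QPU write address. */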
                struct qpu_reg dst;
                switch (qinst->dst.file) {
                case QFILE_NULL:
                        dst = qpu_ra(QPU_W_NOP);
                        break;
                case QFILE_TEMP:
                        dst = temp_registers[qinst->dst.index];
                        break;
                case QFILE_VPM:
                        dst = qpu_ra(QPU_W_VPM);
                        break;

                case QFILE_TLB_COLOR_WRITE:
                        dst = qpu_tlbc();
                        break;

                case QFILE_TLB_COLOR_WRITE_MS:
                        dst = qpu_tlbc_ms();
                        break;

                case QFILE_TLB_Z_WRITE:
                        dst = qpu_ra(QPU_W_TLB_Z);
                        break;

                case QFILE_TLB_STENCIL_SETUP:
                        dst = qpu_ra(QPU_W_TLB_STENCIL_SETUP);
                        break;

                case QFILE_TEX_S:
                case QFILE_TEX_S_DIRECT:
                        dst = qpu_rb(QPU_W_TMU0_S);
                        break;

                case QFILE_TEX_T:
                        dst = qpu_rb(QPU_W_TMU0_T);
                        break;

                case QFILE_TEX_R:
                        dst = qpu_rb(QPU_W_TMU0_R);
                        break;

                case QFILE_TEX_B:
                        dst = qpu_rb(QPU_W_TMU0_B);
                        break;

                case QFILE_VARY:
                case QFILE_UNIF:
                case QFILE_SMALL_IMM:
                case QFILE_LOAD_IMM:
                case QFILE_FRAG_X:
                case QFILE_FRAG_Y:
                case QFILE_FRAG_REV_FLAG:
                case QFILE_QPU_ELEMENT:
                        assert(!"not reached");
                        break;
                }

                ASSERTED bool handled_qinst_cond = false;

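                /* Emit the instruction(s) for this op.  Ops with special
                 * hardware behavior (SFU writes, TMU/TLB loads, thread
                 * switches, branches) are handled individually; everything
                 * else maps through the translate[] table above.
                 */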
                switch (qinst->op) {
                case QOP_RCP:
                case QOP_RSQ:
                case QOP_EXP2:
                case QOP_LOG2:
                        switch (qinst->op) {
                        case QOP_RCP:
                                queue(block, qpu_a_MOV(qpu_rb(QPU_W_SFU_RECIP),
                                                       src[0]) | unpack);
                                break;
                        case QOP_RSQ:
                                queue(block, qpu_a_MOV(qpu_rb(QPU_W_SFU_RECIPSQRT),
                                                       src[0]) | unpack);
                                break;
                        case QOP_EXP2:
                                queue(block, qpu_a_MOV(qpu_rb(QPU_W_SFU_EXP),
                                                       src[0]) | unpack);
                                break;
                        case QOP_LOG2:
                                queue(block, qpu_a_MOV(qpu_rb(QPU_W_SFU_LOG),
                                                       src[0]) | unpack);
                                break;
                        default:
                                abort();
                        }

                        handle_r4_qpu_write(block, qinst, dst);
                        handled_qinst_cond = true;

                        break;

                case QOP_LOAD_IMM:
                        assert(qinst->src[0].file == QFILE_LOAD_IMM);
                        queue(block, qpu_load_imm_ui(dst, qinst->src[0].index));
                        break;

                case QOP_LOAD_IMM_U2:
                        queue(block, qpu_load_imm_u2(dst, qinst->src[0].index));
                        break;

                case QOP_LOAD_IMM_I2:
                        queue(block, qpu_load_imm_i2(dst, qinst->src[0].index));
                        break;

                case QOP_ROT_MUL:
                        /* Rotation at the hardware level occurs on the inputs
                         * to the MUL unit, and they must be accumulators in
                         * order to have the time necessary to move things.
                         */
                        assert(src[0].mux <= QPU_MUX_R3);

                        queue(block,
                              qpu_m_rot(dst, src[0], qinst->src[1].index -
                                        QPU_SMALL_IMM_MUL_ROT) | unpack);
                        set_last_cond_mul(block, qinst->cond);
                        handled_qinst_cond = true;
                        set_last_dst_pack(block, qinst);
                        break;

                case QOP_MS_MASK:
                        src[1] = qpu_ra(QPU_R_MS_REV_FLAGS);
                        fixup_raddr_conflict(block, dst, &src[0], &src[1],
                                             qinst, &unpack);
                        queue(block, qpu_a_AND(qpu_ra(QPU_W_MS_FLAGS),
                                               src[0], src[1]) | unpack);
                        break;

                case QOP_FRAG_Z:
                case QOP_FRAG_W:
                        /* QOP_FRAG_Z/W don't emit instructions, just allocate
                         * the register to the Z/W payload.
                         */
                        break;

                case QOP_TLB_COLOR_READ:
                        queue(block, qpu_NOP());
                        *last_inst(block) = qpu_set_sig(*last_inst(block),
                                                        QPU_SIG_COLOR_LOAD);
                        handle_r4_qpu_write(block, qinst, dst);
                        handled_qinst_cond = true;
                        break;

                case QOP_VARY_ADD_C:
                        queue(block, qpu_a_FADD(dst, src[0], qpu_r5()) | unpack);
                        break;

                case QOP_TEX_RESULT:
                        queue(block, qpu_NOP());
                        *last_inst(block) = qpu_set_sig(*last_inst(block),
                                                        QPU_SIG_LOAD_TMU0);
                        handle_r4_qpu_write(block, qinst, dst);
                        handled_qinst_cond = true;
                        break;

                case QOP_THRSW:
                        queue(block, qpu_NOP());
                        *last_inst(block) = qpu_set_sig(*last_inst(block),
                                                        QPU_SIG_THREAD_SWITCH);
                        c->last_thrsw = last_inst(block);
                        break;

                case QOP_BRANCH:
                        /* The branch target will be updated at QPU scheduling
                         * time.
                         */
                        queue(block, (qpu_branch(qinst->cond, 0) |
                                      QPU_BRANCH_REL));
                        handled_qinst_cond = true;
                        break;

                case QOP_UNIFORMS_RESET:
                        fixup_raddr_conflict(block, dst, &src[0], &src[1],
                                             qinst, &unpack);

                        queue(block, qpu_a_ADD(qpu_ra(QPU_W_UNIFORMS_ADDRESS),
                                               src[0], src[1]));
                        break;

                default:
                        assert(qinst->op < ARRAY_SIZE(translate));
                        assert(translate[qinst->op].op != 0); /* NOPs */

                        /* Skip emitting the MOV if it's a no-op. */
                        if (qir_is_raw_mov(qinst) &&
                            dst.mux == src[0].mux && dst.addr == src[0].addr) {
                                break;
                        }

                        /* If we have only one source, put it in the second
                         * argument slot as well so that we don't take up
                         * another raddr just to get unused data.
                         */
                        if (qir_get_non_sideband_nsrc(qinst) == 1)
                                src[1] = src[0];

                        fixup_raddr_conflict(block, dst, &src[0], &src[1],
                                             qinst, &unpack);

                        if (qir_is_mul(qinst)) {
                                queue(block, qpu_m_alu2(translate[qinst->op].op,
                                                        dst,
                                                        src[0], src[1]) | unpack);
                                set_last_cond_mul(block, qinst->cond);
                        } else {
                                queue(block, qpu_a_alu2(translate[qinst->op].op,
                                                        dst,
                                                        src[0], src[1]) | unpack);
                                set_last_cond_add(block, qinst->cond);
                        }
                        handled_qinst_cond = true;
                        set_last_dst_pack(block, qinst);

                        break;
                }

                assert(qinst->cond == QPU_COND_ALWAYS ||
                       handled_qinst_cond);

                if (qinst->sf)
                        *last_inst(block) |= QPU_SF;
        }
}

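/* Top level of QPU code generation: register-allocates temporaries, emits
 * each block, schedules the result, and patches the final instructions with
 * the signals required at the end of a thread.
 */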
void
vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
{
        struct qblock *start_block = list_first_entry(&c->blocks,
                                                      struct qblock, link);

        struct qpu_reg *temp_registers = vc4_register_allocate(vc4, c);
        if (!temp_registers)
                return;

        switch (c->stage) {
        case QSTAGE_VERT:
        case QSTAGE_COORD:
                c->num_inputs_remaining = c->num_inputs;
                queue(start_block, qpu_load_imm_ui(qpu_vwsetup(), 0x00001a00));
                break;
        case QSTAGE_FRAG:
                break;
        }

        qir_for_each_block(block, c)
                vc4_generate_code_block(c, block, temp_registers);

        /* Switch the last SIG_THRSW instruction to SIG_LAST_THRSW.
         *
         * LAST_THRSW is a new signal in BCM2708B0 (including Raspberry Pi)
         * that ensures that a later thread doesn't try to lock the scoreboard
         * and terminate before an earlier-spawned thread on the same QPU, by
         * delaying switching back to the later shader until the earlier one
         * has finished.  Otherwise, if the earlier thread was hitting the
         * same quad, the scoreboard would deadlock.
         */
        if (c->last_thrsw) {
                assert(QPU_GET_FIELD(*c->last_thrsw, QPU_SIG) ==
                       QPU_SIG_THREAD_SWITCH);
                *c->last_thrsw = ((*c->last_thrsw & ~QPU_SIG_MASK) |
                                  QPU_SET_FIELD(QPU_SIG_LAST_THREAD_SWITCH,
                                                QPU_SIG));
        }

        uint32_t cycles = qpu_schedule_instructions(c);
        uint32_t inst_count_at_schedule_time = c->qpu_inst_count;

        /* The thread-end instruction can't have a VPM write or read. */
        if (QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_WADDR_ADD) == QPU_W_VPM ||
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_WADDR_MUL) == QPU_W_VPM ||
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_RADDR_A) == QPU_R_VPM ||
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_RADDR_B) == QPU_R_VPM) {
                qpu_serialize_one_inst(c, qpu_NOP());
        }

        /* The thread-end instruction can't have a uniform read. */
        if (QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_RADDR_A) == QPU_R_UNIF ||
            QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_RADDR_B) == QPU_R_UNIF) {
                qpu_serialize_one_inst(c, qpu_NOP());
        }

        /* The thread-end instruction can't have TLB operations. */
        if (qpu_inst_is_tlb(c->qpu_insts[c->qpu_inst_count - 1]))
                qpu_serialize_one_inst(c, qpu_NOP());

        /* Make sure there's no existing signal set (like for a small
         * immediate).
         */
        if (QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
                          QPU_SIG) != QPU_SIG_NONE) {
                qpu_serialize_one_inst(c, qpu_NOP());
        }

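        /* Set the program-end signal on what is now the last instruction.
         * The thread end appears to take effect only after two delay slots,
         * which is why two NOPs are serialized behind it below.
         */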
        c->qpu_insts[c->qpu_inst_count - 1] =
                qpu_set_sig(c->qpu_insts[c->qpu_inst_count - 1],
                            QPU_SIG_PROG_END);
        qpu_serialize_one_inst(c, qpu_NOP());
        qpu_serialize_one_inst(c, qpu_NOP());

        switch (c->stage) {
        case QSTAGE_VERT:
        case QSTAGE_COORD:
                break;
        case QSTAGE_FRAG:
                c->qpu_insts[c->qpu_inst_count - 1] =
                        qpu_set_sig(c->qpu_insts[c->qpu_inst_count - 1],
                                    QPU_SIG_SCOREBOARD_UNLOCK);
                break;
        }

        cycles += c->qpu_inst_count - inst_count_at_schedule_time;

        if (VC4_DBG(SHADERDB)) {
                util_debug_message(&vc4->base.debug, SHADER_INFO,
                                   "%s shader: %d inst, %d threads, %d uniforms, %d max-temps, %d estimated-cycles",
                                   qir_get_stage_name(c->stage),
                                   c->qpu_inst_count,
                                   1 + c->fs_threaded,
                                   c->num_uniforms,
                                   c->max_reg_pressure,
                                   cycles);
        }

        if (VC4_DBG(QPU))
                vc4_dump_program(c);

        vc4_qpu_validate(c->qpu_insts, c->qpu_inst_count);

        free(temp_registers);
}
701