xref: /aosp_15_r20/external/mesa3d/src/freedreno/ir3/ir3_legalize.c (revision 6104692788411f58d303aa86923a9ff6ecaded22)
1 /*
2  * Copyright © 2014 Rob Clark <[email protected]>
3  * SPDX-License-Identifier: MIT
4  *
5  * Authors:
6  *    Rob Clark <[email protected]>
7  */
8 
9 #include "util/ralloc.h"
10 #include "util/u_math.h"
11 
12 #include "ir3.h"
13 #include "ir3_shader.h"
14 
15 /*
16  * Legalize:
17  *
18  * The legalize pass handles ensuring sufficient nop's and sync flags for
19  * correct execution.
20  *
21  * 1) Iteratively determine where sync ((sy)/(ss)) flags are needed,
22  *    based on state flowing out of predecessor blocks until there is
23  *    no further change.  In some cases this requires inserting nops.
24  * 2) Mark (ei) on last varying input
25  * 3) Final nop scheduling for instruction latency
26  * 4) Resolve jumps and schedule blocks, marking potential convergence
27  *    points with (jp)
28  */
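/* Illustrative example (a hand-written sketch, not actual compiler output) of
 * the kind of thing this pass adds to already-scheduled code:
 *
 *    sam (f32)(xyzw)r0.x, r2.x, s#0, t#0   ; texture fetch -> consumers need (sy)
 *    rcp r1.x, r3.x                        ; SFU -> consumers need (ss)
 *    ...
 *    (sy)add.f r4.x, r0.x, c0.x            ; first consumer of the sam result
 *    (ss)mul.f r4.y, r1.x, c0.y            ; first consumer of the rcp result
 *
 * along with any nop's needed to cover ALU latency, (ei) on the last varying
 * input, and (jp) on re-convergence points.
 */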
29 
30 struct ir3_legalize_ctx {
31    struct ir3_compiler *compiler;
32    struct ir3_shader_variant *so;
33    gl_shader_stage type;
34    int max_bary;
35    bool early_input_release;
36    bool has_inputs;
37    bool has_tex_prefetch;
38 };
39 
40 struct ir3_nop_state {
41    unsigned full_ready[GPR_REG_SIZE];
42    unsigned half_ready[GPR_REG_SIZE];
43 };
44 
45 struct ir3_legalize_state {
46    regmask_t needs_ss;
47    regmask_t needs_ss_scalar_full; /* half scalar ALU producer -> full scalar ALU consumer */
48    regmask_t needs_ss_scalar_half; /* full scalar ALU producer -> half scalar ALU consumer */
49    regmask_t needs_ss_war; /* write after read */
50    regmask_t needs_ss_or_sy_war;  /* WAR for sy-producer sources */
51    regmask_t needs_ss_scalar_war; /* scalar ALU write -> ALU write */
52    regmask_t needs_ss_or_sy_scalar_war;
53    regmask_t needs_sy;
54    bool needs_ss_for_const;
55 
56    /* Each of these arrays contains the cycle when the corresponding register
57     * becomes "ready" i.e. does not require any more nops. There is a special
58     * mechanism to let ALU instructions read compatible (i.e. same halfness)
59     * destinations of another ALU instruction with less delay, so this can
60     * depend on what type the consuming instruction is, which is why there are
61     * multiple arrays. The cycle is counted relative to the start of the block.
62     */
63 
64    /* When ALU instructions reading the given full/half register will be ready.
65     */
66    struct ir3_nop_state alu_nop;
67 
68    /* When non-ALU (e.g. cat5) instructions reading the given full/half register
69     * will be ready.
70     */
71    struct ir3_nop_state non_alu_nop;
72 
73    /* When p0.x-w, a0.x, and a1.x are ready. */
74    unsigned pred_ready[4];
75    unsigned addr_ready[2];
76 };
77 
78 struct ir3_legalize_block_data {
79    bool valid;
80    struct ir3_legalize_state begin_state;
81    struct ir3_legalize_state state;
82 };
83 
84 static inline bool
85 needs_ss_war(struct ir3_legalize_state *state, struct ir3_register *dst,
86              bool is_scalar_alu)
87 {
88    if (regmask_get(&state->needs_ss_war, dst))
89       return true;
90    if (regmask_get(&state->needs_ss_or_sy_war, dst))
91       return true;
92 
93    if (!is_scalar_alu) {
94       if (regmask_get(&state->needs_ss_scalar_war, dst))
95          return true;
96       if (regmask_get(&state->needs_ss_or_sy_scalar_war, dst))
97          return true;
98    }
99 
100    return false;
101 }
102 
103 static inline void
104 apply_ss(struct ir3_instruction *instr,
105          struct ir3_legalize_state *state,
106          bool mergedregs)
107 {
108    instr->flags |= IR3_INSTR_SS;
109    regmask_init(&state->needs_ss_war, mergedregs);
110    regmask_init(&state->needs_ss_or_sy_war, mergedregs);
111    regmask_init(&state->needs_ss, mergedregs);
112    regmask_init(&state->needs_ss_scalar_war, mergedregs);
113    regmask_init(&state->needs_ss_or_sy_scalar_war, mergedregs);
114    regmask_init(&state->needs_ss_scalar_full, mergedregs);
115    regmask_init(&state->needs_ss_scalar_half, mergedregs);
116    state->needs_ss_for_const = false;
117 }
118 
119 static inline void
120 apply_sy(struct ir3_instruction *instr,
121          struct ir3_legalize_state *state,
122          bool mergedregs)
123 {
124    instr->flags |= IR3_INSTR_SY;
125    regmask_init(&state->needs_sy, mergedregs);
126    regmask_init(&state->needs_ss_or_sy_war, mergedregs);
127    regmask_init(&state->needs_ss_or_sy_scalar_war, mergedregs);
128 }
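/* Note that apply_ss()/apply_sy() both set the sync flag on the instruction
 * *and* clear the corresponding pending-sync regmasks: a single (ss) or (sy)
 * waits for all outstanding producers of that kind, so once an (ss) is
 * emitted nothing needs another (ss) until a new ss-producer writes a
 * register again (and likewise for (sy)).
 */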
129 
130 static bool
131 count_instruction(struct ir3_instruction *n, struct ir3_compiler *compiler)
132 {
133    /* NOTE: don't count branch/jump since we don't know yet if they will
134     * be eliminated later in resolve_jumps().. really should do that
135     * earlier so we don't have this constraint.
136     */
137    return (is_alu(n) && !is_scalar_alu(n, compiler)) ||
138       (is_flow(n) && (n->opc != OPC_JUMP) && (n->opc != OPC_BR) &&
139            (n->opc != OPC_BRAA) && (n->opc != OPC_BRAO));
140 }
141 
142 static unsigned *
143 get_ready_slot(struct ir3_legalize_state *state,
144                struct ir3_register *reg, unsigned num,
145                bool consumer_alu, bool matching_size)
146 {
147    if (reg->flags & IR3_REG_PREDICATE) {
148       assert(num == reg->num);
149       assert(reg_num(reg) == REG_P0);
150       return &state->pred_ready[reg_comp(reg)];
151    }
152    if (reg->num == regid(REG_A0, 0))
153       return &state->addr_ready[0];
154    if (reg->num == regid(REG_A0, 1))
155       return &state->addr_ready[1];
156    struct ir3_nop_state *nop =
157       consumer_alu ? &state->alu_nop : &state->non_alu_nop;
158    assert(!(reg->flags & IR3_REG_SHARED));
159    if (reg->flags & IR3_REG_HALF) {
160       if (matching_size)
161          return &nop->half_ready[num];
162       else
163          return &nop->full_ready[num / 2];
164    } else {
165       if (matching_size)
166          return &nop->full_ready[num];
167       /* If "num" is large enough, then it can't alias a half-reg because only
168        * the first half of the full reg space aliases half regs. Return NULL in
169        * this case.
170        */
171       else if (num * 2 < ARRAY_SIZE(nop->half_ready))
172          return &nop->half_ready[num * 2];
173       else
174          return NULL;
175    }
176 }
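/* Aliasing convention used by get_ready_slot(): the full-register component
 * at regid "num" overlaps the two half components at 2*num and 2*num+1 (so
 * r0.x overlaps hr0.x/hr0.y), and conversely a half component at "num" maps
 * to the full component at num/2. Half regs only alias the first half of the
 * full register space, hence the NULL return above for large "num".
 */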
177 
178 static unsigned
179 delay_calc(struct ir3_legalize_state *state,
180            struct ir3_instruction *instr,
181            unsigned cycle)
182 {
183    /* As far as we know, shader outputs don't need any delay. */
184    if (instr->opc == OPC_END || instr->opc == OPC_CHMASK)
185       return 0;
186 
187    unsigned delay = 0;
188    foreach_src_n (src, n, instr) {
189       if (src->flags & (IR3_REG_CONST | IR3_REG_IMMED | IR3_REG_SHARED))
190          continue;
191 
192       unsigned elems = post_ra_reg_elems(src);
193       unsigned num = post_ra_reg_num(src);
194       unsigned src_cycle = cycle;
195 
196       /* gat and swz have scalar sources and each source is read in a
197        * subsequent cycle.
198        */
199       if (instr->opc == OPC_GAT || instr->opc == OPC_SWZ)
200          src_cycle += n;
201 
202       /* cat3 instructions consume their last source two cycles later, so they
203        * only need a delay of 1.
204        */
205       if ((is_mad(instr->opc) || is_madsh(instr->opc)) && n == 2)
206          src_cycle += 2;
207 
208       for (unsigned elem = 0; elem < elems; elem++, num++) {
209          unsigned ready_cycle =
210             *get_ready_slot(state, src, num, is_alu(instr), true);
211          delay = MAX2(delay, MAX2(ready_cycle, src_cycle) - src_cycle);
212 
213          /* Increment cycle for ALU instructions with (rptN) where sources are
214           * read each subsequent cycle.
215           */
216          if (instr->repeat && !(src->flags & IR3_REG_RELATIV))
217             src_cycle++;
218       }
219    }
220 
221    return delay;
222 }
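/* Worked example: if r0.x was written by a non-repeated ALU instruction at
 * cycle 4, delay_update() records its alu_nop ready slot as 4 + 3 = 7. An ALU
 * consumer of r0.x reaching cycle 5 then gets delay = MAX2(7, 5) - 5 = 2,
 * i.e. two cycles of nop (possibly folded into a previous (nopN) or nop
 * repeat) must be inserted before it.
 */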
223 
224 static void
225 delay_update(struct ir3_legalize_state *state,
226              struct ir3_instruction *instr,
227              unsigned cycle,
228              bool mergedregs)
229 {
230    if (writes_addr1(instr) && instr->block->in_early_preamble)
231       return;
232 
233    foreach_dst_n (dst, n, instr) {
234       unsigned elems = post_ra_reg_elems(dst);
235       unsigned num = post_ra_reg_num(dst);
236       unsigned dst_cycle = cycle;
237 
238       /* sct and swz have scalar destinations and each destination is written in
239        * a subsequent cycle.
240        */
241       if (instr->opc == OPC_SCT || instr->opc == OPC_SWZ)
242          dst_cycle += n;
243 
244       /* For relative accesses with (rptN), we have no way of knowing which
245        * component is accessed when, so we have to assume the worst and mark
246        * every array member as being written at the end.
247        */
248       if (dst->flags & IR3_REG_RELATIV)
249          dst_cycle += instr->repeat;
250 
251       if (dst->flags & IR3_REG_SHARED)
252          continue;
253 
254       for (unsigned elem = 0; elem < elems; elem++, num++) {
255          for (unsigned consumer_alu = 0; consumer_alu < 2; consumer_alu++) {
256             for (unsigned matching_size = 0; matching_size < 2; matching_size++) {
257                unsigned *ready_slot =
258                   get_ready_slot(state, dst, num, consumer_alu, matching_size);
259 
260                if (!ready_slot)
261                   continue;
262 
263                bool reset_ready_slot = false;
264                unsigned delay = 0;
265                if (!is_alu(instr)) {
266                   /* Apparently writes that require (ss) or (sy) are
267                    * synchronized against previous writes, so consumers don't
268                    * have to wait for any previous overlapping ALU instructions
269                    * to complete.
270                    */
271                   reset_ready_slot = true;
272                } else if ((dst->flags & IR3_REG_PREDICATE) ||
273                           reg_num(dst) == REG_A0) {
274                   delay = 6;
275                   if (!matching_size)
276                      continue;
277                } else {
278                   delay = (consumer_alu && matching_size) ? 3 : 6;
279                }
280 
281                if (!matching_size) {
282                   for (unsigned i = 0; i < reg_elem_size(dst); i++) {
283                      ready_slot[i] =
284                         reset_ready_slot ? 0 :
285                         MAX2(ready_slot[i], dst_cycle + delay);
286                   }
287                } else {
288                   *ready_slot =
289                      reset_ready_slot ? 0 :
290                      MAX2(*ready_slot, dst_cycle + delay);
291                }
292             }
293          }
294 
295          /* Increment cycle for ALU instructions with (rptN) where destinations
296           * are written each subsequent cycle.
297           */
298          if (instr->repeat && !(dst->flags & IR3_REG_RELATIV))
299             dst_cycle++;
300       }
301    }
302 }
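/* Summary of the latencies recorded above: 3 cycles when an ALU write is read
 * by an ALU instruction of the same register size, 6 cycles for mismatched
 * sizes or non-ALU consumers, 6 cycles for predicate/address writes, and 0
 * (slot reset) for non-ALU writes, since their consumers synchronize with
 * (ss)/(sy) instead of nop's.
 */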
303 
304 /* We want to evaluate each block from the position of each of its
305  * predecessor blocks, so that the flags set are the union of
306  * all possible program paths.
307  *
308  * To do this, we need to know the output state (needs_ss/ss_war/sy)
309  * of all predecessor blocks.  The tricky thing is loops, which mean
310  * that we can't simply recursively process each predecessor block
311  * before legalizing the current block.
312  *
313  * How we handle that is by looping over all the blocks until the
314  * results converge.  If the output state of a given block changes
315  * in a given pass, this means that all successor blocks are not
316  * yet fully legalized.
317  */
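/* The caller repeatedly runs legalize_block() over all blocks until no
 * block's output state changes any more, roughly (a sketch of the idea, not
 * the exact implementation):
 *
 *    do {
 *       progress = false;
 *       foreach_block (block, &ir->block_list)
 *          progress |= legalize_block(ctx, block);
 *    } while (progress);
 */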
318 
319 static bool
320 legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block)
321 {
322    struct ir3_legalize_block_data *bd = block->data;
323 
324    if (bd->valid)
325       return false;
326 
327    struct ir3_instruction *last_n = NULL;
328    struct list_head instr_list;
329    struct ir3_legalize_state prev_state = bd->state;
330    struct ir3_legalize_state *state = &bd->begin_state;
331    bool last_input_needs_ss = false;
332    bool mergedregs = ctx->so->mergedregs;
333 
334    /* Our input state is the OR of all predecessor blocks' state.
335     *
336     * Why don't we just zero the state at the beginning before merging in the
337     * predecessors? Because otherwise updates may not be a "lattice refinement",
338     * i.e. needs_ss may go from true to false for some register due to a (ss) we
339     * inserted the second time around (and the same for (sy)). This means that
340     * there's no solid guarantee the algorithm will converge, and in theory
341     * there may be infinite loops where we fight over the placement of an (ss).
342     */
343    for (unsigned i = 0; i < block->predecessors_count; i++) {
344       struct ir3_block *predecessor = block->predecessors[i];
345       struct ir3_legalize_block_data *pbd = predecessor->data;
346       struct ir3_legalize_state *pstate = &pbd->state;
347 
348       /* Our input (ss)/(sy) state is based on OR'ing the output
349        * state of all our predecessor blocks
350        */
351       regmask_or(&state->needs_ss, &state->needs_ss, &pstate->needs_ss);
352       regmask_or(&state->needs_ss_war, &state->needs_ss_war,
353                  &pstate->needs_ss_war);
354       regmask_or(&state->needs_ss_or_sy_war, &state->needs_ss_or_sy_war,
355                  &pstate->needs_ss_or_sy_war);
356       regmask_or(&state->needs_sy, &state->needs_sy, &pstate->needs_sy);
357       state->needs_ss_for_const |= pstate->needs_ss_for_const;
358 
359       /* Our nop state is the max of the predecessor blocks */
360       for (unsigned i = 0; i < ARRAY_SIZE(state->pred_ready); i++)
361          state->pred_ready[i] = MAX2(state->pred_ready[i],
362                                      pstate->pred_ready[i]);
363       for (unsigned i = 0; i < ARRAY_SIZE(state->alu_nop.full_ready); i++) {
364          state->alu_nop.full_ready[i] = MAX2(state->alu_nop.full_ready[i],
365                                              pstate->alu_nop.full_ready[i]);
366          state->alu_nop.half_ready[i] = MAX2(state->alu_nop.half_ready[i],
367                                              pstate->alu_nop.half_ready[i]);
368          state->non_alu_nop.full_ready[i] = MAX2(state->non_alu_nop.full_ready[i],
369                                                  pstate->non_alu_nop.full_ready[i]);
370          state->non_alu_nop.half_ready[i] = MAX2(state->non_alu_nop.half_ready[i],
371                                                  pstate->non_alu_nop.half_ready[i]);
372       }
373    }
374 
375    /* We need to take physical-only edges into account when tracking shared
376     * registers.
377     */
378    for (unsigned i = 0; i < block->physical_predecessors_count; i++) {
379       struct ir3_block *predecessor = block->physical_predecessors[i];
380       struct ir3_legalize_block_data *pbd = predecessor->data;
381       struct ir3_legalize_state *pstate = &pbd->state;
382 
383       regmask_or_shared(&state->needs_ss, &state->needs_ss, &pstate->needs_ss);
384       regmask_or_shared(&state->needs_ss_scalar_full,
385                         &state->needs_ss_scalar_full,
386                         &pstate->needs_ss_scalar_full);
387       regmask_or_shared(&state->needs_ss_scalar_half,
388                         &state->needs_ss_scalar_half,
389                         &pstate->needs_ss_scalar_half);
390       regmask_or_shared(&state->needs_ss_scalar_war, &state->needs_ss_scalar_war,
391                         &pstate->needs_ss_scalar_war);
392       regmask_or_shared(&state->needs_ss_or_sy_scalar_war,
393                         &state->needs_ss_or_sy_scalar_war,
394                         &pstate->needs_ss_or_sy_scalar_war);
395    }
396 
397    memcpy(&bd->state, state, sizeof(*state));
398    state = &bd->state;
399 
400    unsigned input_count = 0;
401 
402    foreach_instr (n, &block->instr_list) {
403       if (is_input(n)) {
404          input_count++;
405       }
406    }
407 
408    unsigned inputs_remaining = input_count;
409 
410    /* Either inputs are in the first block or we expect inputs to be released
411     * with the end of the program.
412     */
413    assert(input_count == 0 || !ctx->early_input_release ||
414           block == ir3_after_preamble(block->shader));
415 
416    /* remove all the instructions from the list, we'll be adding
417     * them back in as we go
418     */
419    list_replace(&block->instr_list, &instr_list);
420    list_inithead(&block->instr_list);
421 
422    unsigned cycle = 0;
423 
424    foreach_instr_safe (n, &instr_list) {
425       unsigned i;
426 
427       n->flags &= ~(IR3_INSTR_SS | IR3_INSTR_SY);
428 
429       /* _meta::tex_prefetch instructions removed later in
430        * collect_tex_prefetches()
431        */
432       if (is_meta(n) && (n->opc != OPC_META_TEX_PREFETCH))
433          continue;
434 
435       if (is_input(n)) {
436          struct ir3_register *inloc = n->srcs[0];
437          assert(inloc->flags & IR3_REG_IMMED);
438 
439          int last_inloc =
440             inloc->iim_val + ((inloc->flags & IR3_REG_R) ? n->repeat : 0);
441          ctx->max_bary = MAX2(ctx->max_bary, last_inloc);
442       }
443 
444       if ((last_n && is_barrier(last_n)) || n->opc == OPC_SHPE) {
445          apply_ss(n, state, mergedregs);
446          apply_sy(n, state, mergedregs);
447          last_input_needs_ss = false;
448       }
449 
450       if (last_n && (last_n->opc == OPC_PREDT)) {
451          apply_ss(n, state, mergedregs);
452       }
453 
454       bool n_is_scalar_alu = is_scalar_alu(n, ctx->compiler);
455 
456       /* NOTE: consider dst register too.. it could happen that
457        * texture sample instruction (for example) writes some
458        * components which are unused.  A subsequent instruction
459        * that writes the same register can race w/ the sam instr
460        * resulting in undefined results:
461        */
462       for (i = 0; i < n->dsts_count + n->srcs_count; i++) {
463          struct ir3_register *reg;
464          if (i < n->dsts_count)
465             reg = n->dsts[i];
466          else
467             reg = n->srcs[i - n->dsts_count];
468 
469          if (reg_gpr(reg)) {
470 
471             /* TODO: we probably only need (ss) for alu
472              * instr consuming sfu result.. need to make
473              * some tests for both this and (sy)..
474              */
475             if (regmask_get(&state->needs_ss, reg)) {
476                apply_ss(n, state, mergedregs);
477                last_input_needs_ss = false;
478             }
479 
480             /* There is a fast feedback path for scalar ALU instructions which
481              * only takes 1 cycle of latency, similar to the normal 3 cycle
482              * latency path for ALU instructions. For this fast path the
483              * producer and consumer must use the same register size (i.e. no
484              * writing a full register and then reading half of it or vice
485              * versa). If we don't hit this path, either because of a mismatched
486              * size or a read via the regular ALU, then the write latency is
487              * variable and we must use (ss) to wait for the scalar ALU. This is
488              * different from the fixed 6 cycle latency for mismatched vector
489              * ALU accesses.
490              */
491             if (n_is_scalar_alu) {
492                /* Check if we have a mismatched size RaW dependency */
493                if (regmask_get((reg->flags & IR3_REG_HALF) ?
494                                &state->needs_ss_scalar_half :
495                                &state->needs_ss_scalar_full, reg)) {
496                   apply_ss(n, state, mergedregs);
497                   last_input_needs_ss = false;
498                }
499             } else {
500                /* check if we have a scalar -> vector RaW dependency */
501                if (regmask_get(&state->needs_ss_scalar_half, reg) ||
502                    regmask_get(&state->needs_ss_scalar_full, reg)) {
503                   apply_ss(n, state, mergedregs);
504                   last_input_needs_ss = false;
505                }
506             }
507 
508             if (regmask_get(&state->needs_sy, reg)) {
509                apply_sy(n, state, mergedregs);
510             }
511          } else if ((reg->flags & IR3_REG_CONST)) {
512             if (state->needs_ss_for_const) {
513                apply_ss(n, state, mergedregs);
514                last_input_needs_ss = false;
515             }
516          } else if (reg_is_addr1(reg) && block->in_early_preamble) {
517             if (regmask_get(&state->needs_ss, reg)) {
518                apply_ss(n, state, mergedregs);
519                last_input_needs_ss = false;
520             }
521          }
522       }
523 
524       foreach_dst (reg, n) {
525          if (needs_ss_war(state, reg, n_is_scalar_alu)) {
526             apply_ss(n, state, mergedregs);
527             last_input_needs_ss = false;
528          }
529       }
530 
531       /* I'm not exactly sure what this is for, but it seems we need this on every
532        * mova1 in early preambles.
533        */
534       if (writes_addr1(n) && block->in_early_preamble)
535          n->srcs[0]->flags |= IR3_REG_R;
536 
537       /* cat5+ does not have an (ss) bit, if needed we need to
538        * insert a nop to carry the sync flag.  Would be kinda
539        * clever if we were aware of this during scheduling, but
540        * this should be a pretty rare case:
541        */
542       if ((n->flags & IR3_INSTR_SS) && (opc_cat(n->opc) >= 5)) {
543          struct ir3_instruction *nop;
544          nop = ir3_NOP(block);
545          nop->flags |= IR3_INSTR_SS;
546          n->flags &= ~IR3_INSTR_SS;
547          last_n = nop;
548          cycle++;
549       }
550 
551       unsigned delay = delay_calc(state, n, cycle);
552 
553       /* NOTE: I think the nopN encoding works for a5xx and
554        * probably a4xx, but not a3xx.  So far only tested on
555        * a6xx.
556        */
557 
558       if ((delay > 0) && (ctx->compiler->gen >= 6) && last_n &&
559           !n_is_scalar_alu &&
560           ((opc_cat(last_n->opc) == 2) || (opc_cat(last_n->opc) == 3)) &&
561           (last_n->repeat == 0)) {
562          /* the previous cat2/cat3 instruction can encode at most 3 nop's: */
563          unsigned transfer = MIN2(delay, 3 - last_n->nop);
564          last_n->nop += transfer;
565          delay -= transfer;
566          cycle += transfer;
567       }
568 
569       if ((delay > 0) && last_n && (last_n->opc == OPC_NOP)) {
570          /* the previous nop can encode at most 5 repeats: */
571          unsigned transfer = MIN2(delay, 5 - last_n->repeat);
572          last_n->repeat += transfer;
573          delay -= transfer;
574          cycle += transfer;
575       }
576 
577       if (delay > 0) {
578          assert(delay <= 6);
579          ir3_NOP(block)->repeat = delay - 1;
580          cycle += delay;
581       }
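      /* Example: with delay == 4 following an unrepeated cat2/cat3 instruction
       * that already has nop == 1, two more cycles get folded into its (nopN)
       * field (3 - 1), leaving delay == 2 to be emitted as an explicit nop
       * with repeat == 1 (a nop covers 1 + repeat cycles).
       */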
582 
583       if (ctx->compiler->samgq_workaround &&
584           ctx->type != MESA_SHADER_FRAGMENT &&
585           ctx->type != MESA_SHADER_COMPUTE && n->opc == OPC_SAMGQ) {
586          struct ir3_instruction *samgp;
587 
588          list_delinit(&n->node);
589 
590          for (i = 0; i < 4; i++) {
591             samgp = ir3_instr_clone(n);
592             samgp->opc = OPC_SAMGP0 + i;
593             if (i > 1)
594                samgp->flags |= IR3_INSTR_SY;
595          }
596       } else {
597          list_delinit(&n->node);
598          list_addtail(&n->node, &block->instr_list);
599       }
600 
601       if (is_sfu(n))
602          regmask_set(&state->needs_ss, n->dsts[0]);
603 
604       foreach_dst (dst, n) {
605          if (dst->flags & IR3_REG_SHARED) {
606             if (n_is_scalar_alu) {
607                if (dst->flags & IR3_REG_HALF)
608                   regmask_set(&state->needs_ss_scalar_full, dst);
609                else
610                   regmask_set(&state->needs_ss_scalar_half, dst);
611             } else {
612                regmask_set(&state->needs_ss, dst);
613             }
614          } else if (reg_is_addr1(dst) && block->in_early_preamble) {
615             regmask_set(&state->needs_ss, dst);
616          }
617       }
618 
619       if (is_tex_or_prefetch(n) && n->dsts_count > 0) {
620          regmask_set(&state->needs_sy, n->dsts[0]);
621          if (n->opc == OPC_META_TEX_PREFETCH)
622             ctx->has_tex_prefetch = true;
623       } else if (n->opc == OPC_RESINFO && n->dsts_count > 0) {
624          regmask_set(&state->needs_ss, n->dsts[0]);
625          ir3_NOP(block)->flags |= IR3_INSTR_SS;
626          last_input_needs_ss = false;
627       } else if (is_load(n)) {
628          if (is_local_mem_load(n))
629             regmask_set(&state->needs_ss, n->dsts[0]);
630          else
631             regmask_set(&state->needs_sy, n->dsts[0]);
632       } else if (is_atomic(n->opc)) {
633          if (is_bindless_atomic(n->opc)) {
634             regmask_set(&state->needs_sy, n->srcs[2]);
635          } else if (is_global_a3xx_atomic(n->opc) ||
636                     is_global_a6xx_atomic(n->opc)) {
637             regmask_set(&state->needs_sy, n->dsts[0]);
638          } else {
639             regmask_set(&state->needs_ss, n->dsts[0]);
640          }
641       } else if (n->opc == OPC_PUSH_CONSTS_LOAD_MACRO) {
642          state->needs_ss_for_const = true;
643       }
644 
645       if (is_ssbo(n->opc) || is_global_a3xx_atomic(n->opc) ||
646           is_bindless_atomic(n->opc))
647          ctx->so->has_ssbo = true;
648 
649       /* both tex/sfu appear to not always immediately consume
650        * their src register(s):
651        */
652       if (is_war_hazard_producer(n)) {
653          /* These WAR hazards can always be resolved with (ss). However, when
654           * the reader is a sy-producer, they can also be resolved using (sy)
655           * because once we have synced the reader's results using (sy), its
656           * sources have definitely been consumed. We track the two cases
657           * separately so that we don't add an unnecessary (ss) if a (sy) sync
658           * already happened.
659           * For example, this prevents adding the unnecessary (ss) in the
660           * following sequence:
661           * sam rd, rs, ...
662           * (sy)... ; sam synced so consumed its sources
663           * (ss)write rs ; (ss) unnecessary since rs has been consumed already
664           */
665          bool needs_ss = is_ss_producer(n) || is_store(n) || n->opc == OPC_STC;
666 
667          if (n_is_scalar_alu) {
668             /* Scalar ALU also does not immediately read its source because it
669              * is not executed right away, but scalar ALU instructions are
670              * executed in-order so subsequent scalar ALU instructions don't
671              * need to wait for previous ones.
672              */
673             regmask_t *mask = needs_ss ? &state->needs_ss_scalar_war
674                                        : &state->needs_ss_or_sy_scalar_war;
675 
676             foreach_src (reg, n) {
677                if ((reg->flags & IR3_REG_SHARED) || is_reg_a0(reg)) {
678                   regmask_set(mask, reg);
679                }
680             }
681          } else {
682             regmask_t *mask =
683                needs_ss ? &state->needs_ss_war : &state->needs_ss_or_sy_war;
684 
685             foreach_src (reg, n) {
686                if (!(reg->flags & (IR3_REG_IMMED | IR3_REG_CONST))) {
687                   regmask_set(mask, reg);
688                }
689             }
690          }
691       }
692 
693       bool count = count_instruction(n, ctx->compiler);
694       if (count)
695          cycle += 1;
696 
697       delay_update(state, n, cycle, mergedregs);
698 
699       if (count)
700          cycle += n->repeat;
701 
702       if (ctx->early_input_release && is_input(n)) {
703          last_input_needs_ss |= (n->opc == OPC_LDLV);
704 
705          assert(inputs_remaining > 0);
706          inputs_remaining--;
707          if (inputs_remaining == 0) {
708             /* This is the last input. We add the (ei) flag to release
709              * varying memory after this executes. If it's an ldlv,
710              * however, we need to insert a dummy bary.f on which we can
711              * set the (ei) flag. We may also need to insert an (ss) to
712              * guarantee that all ldlv's have finished fetching their
713              * results before releasing the varying memory.
714              */
715             struct ir3_instruction *last_input = n;
716             if (n->opc == OPC_LDLV) {
717                struct ir3_instruction *baryf;
718 
719                /* (ss)bary.f (ei)r63.x, 0, r0.x */
720                baryf = ir3_instr_create(block, OPC_BARY_F, 1, 2);
721                ir3_dst_create(baryf, regid(63, 0), 0);
722                ir3_src_create(baryf, 0, IR3_REG_IMMED)->iim_val = 0;
723                ir3_src_create(baryf, regid(0, 0), 0);
724 
725                last_input = baryf;
726             }
727 
728             last_input->dsts[0]->flags |= IR3_REG_EI;
729             if (last_input_needs_ss) {
730                apply_ss(last_input, state, mergedregs);
731             }
732          }
733       }
734 
735       last_n = n;
736    }
737 
738    assert(inputs_remaining == 0 || !ctx->early_input_release);
739 
740    if (block == ir3_after_preamble(ctx->so->ir) &&
741        ctx->has_tex_prefetch && !ctx->has_inputs) {
742       /* texture prefetch, but *no* inputs.. we need to insert a
743        * dummy bary.f at the top of the shader to unblock varying
744        * storage:
745        */
746       struct ir3_instruction *baryf;
747 
748       /* (ss)bary.f (ei)r63.x, 0, r0.x */
749       baryf = ir3_instr_create(block, OPC_BARY_F, 1, 2);
750       ir3_dst_create(baryf, regid(63, 0), 0)->flags |= IR3_REG_EI;
751       ir3_src_create(baryf, 0, IR3_REG_IMMED)->iim_val = 0;
752       ir3_src_create(baryf, regid(0, 0), 0);
753 
754       /* insert the dummy bary.f at head: */
755       list_delinit(&baryf->node);
756       list_add(&baryf->node, &block->instr_list);
757    }
758 
759    /* Currently our nop state contains the cycle offset from the start of this
760     * block when each register becomes ready. But successor blocks need the
761     * cycle offset from their start, which is this block's end. Translate the
762     * cycle offset.
763     */
764    for (unsigned i = 0; i < ARRAY_SIZE(state->pred_ready); i++)
765       state->pred_ready[i] = MAX2(state->pred_ready[i], cycle) - cycle;
766    for (unsigned i = 0; i < ARRAY_SIZE(state->alu_nop.full_ready); i++) {
767       state->alu_nop.full_ready[i] =
768          MAX2(state->alu_nop.full_ready[i], cycle) - cycle;
769       state->alu_nop.half_ready[i] =
770          MAX2(state->alu_nop.half_ready[i], cycle) - cycle;
771       state->non_alu_nop.full_ready[i] =
772          MAX2(state->non_alu_nop.full_ready[i], cycle) - cycle;
773       state->non_alu_nop.half_ready[i] =
774          MAX2(state->non_alu_nop.half_ready[i], cycle) - cycle;
775    }
776 
777    bd->valid = true;
778 
779    if (memcmp(&prev_state, state, sizeof(*state))) {
780       /* our output state changed, this invalidates all of our
781        * successors:
782        */
783       for (unsigned i = 0; i < ARRAY_SIZE(block->successors); i++) {
784          if (!block->successors[i])
785             break;
786          struct ir3_legalize_block_data *pbd = block->successors[i]->data;
787          pbd->valid = false;
788       }
789    }
790 
791    return true;
792 }
793 
794 /* Expands dsxpp and dsypp macros to:
795  *
796  * dsxpp.1 dst, src
797  * dsxpp.1.p dst, src
798  *
799  * We apply this after flags syncing, as we don't want to sync in between the
800  * two (which might happen if dst == src).
801  */
802 static bool
803 apply_fine_deriv_macro(struct ir3_legalize_ctx *ctx, struct ir3_block *block)
804 {
805    struct list_head instr_list;
806 
807    /* remove all the instructions from the list, we'll be adding
808     * them back in as we go
809     */
810    list_replace(&block->instr_list, &instr_list);
811    list_inithead(&block->instr_list);
812 
813    foreach_instr_safe (n, &instr_list) {
814       list_addtail(&n->node, &block->instr_list);
815 
816       if (n->opc == OPC_DSXPP_MACRO || n->opc == OPC_DSYPP_MACRO) {
817          n->opc = (n->opc == OPC_DSXPP_MACRO) ? OPC_DSXPP_1 : OPC_DSYPP_1;
818 
819          struct ir3_instruction *op_p = ir3_instr_clone(n);
820          op_p->flags = IR3_INSTR_P;
821 
822          ctx->so->need_full_quad = true;
823       }
824    }
825 
826    return true;
827 }
828 
829 /* Some instructions can take a dummy destination of r63.x, which we model as
830  * the instruction having no destination in the IR to avoid special code for
831  * handling this. Insert the dummy destination after everything else is done.
832  */
833 static bool
834 expand_dummy_dests(struct ir3_block *block)
835 {
836    foreach_instr (n, &block->instr_list) {
837       if ((n->opc == OPC_SAM || n->opc == OPC_LDC || n->opc == OPC_RESINFO) &&
838           n->dsts_count == 0) {
839          struct ir3_register *dst = ir3_dst_create(n, INVALID_REG, 0);
840          /* Copy the blob's writemask */
841          if (n->opc == OPC_SAM)
842             dst->wrmask = 0b1111;
843       }
844    }
845    return true;
846 }
847 
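/* If the block starts with a PUSH_CONSTS_LOAD_MACRO (the scan stops at the
 * first non-meta instruction), expand it into an stsc whose two immediate
 * sources are push_consts.dst_base and push_consts.src_base and whose iim_val
 * is push_consts.src_size. On GPUs with stsc_duplication_quirk the stsc is
 * followed by an (ss) nop and a duplicate stsc.
 */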
848 static void
849 apply_push_consts_load_macro(struct ir3_legalize_ctx *ctx,
850                              struct ir3_block *block)
851 {
852    foreach_instr (n, &block->instr_list) {
853       if (n->opc == OPC_PUSH_CONSTS_LOAD_MACRO) {
854          struct ir3_instruction *stsc = ir3_instr_create(block, OPC_STSC, 0, 2);
855          ir3_instr_move_after(stsc, n);
856          ir3_src_create(stsc, 0, IR3_REG_IMMED)->iim_val =
857             n->push_consts.dst_base;
858          ir3_src_create(stsc, 0, IR3_REG_IMMED)->iim_val =
859             n->push_consts.src_base;
860          stsc->cat6.iim_val = n->push_consts.src_size;
861          stsc->cat6.type = TYPE_U32;
862 
863          if (ctx->compiler->stsc_duplication_quirk) {
864             struct ir3_instruction *nop = ir3_NOP(block);
865             ir3_instr_move_after(nop, stsc);
866             nop->flags |= IR3_INSTR_SS;
867             ir3_instr_move_after(ir3_instr_clone(stsc), nop);
868          }
869 
870          list_delinit(&n->node);
871          break;
872       } else if (!is_meta(n)) {
873          break;
874       }
875    }
876 }
877 
878 /* NOTE: branch instructions are always the last instruction(s)
879  * in the block.  We take advantage of this as we resolve the
880  * branches, since "if (foo) break;" constructs turn into
881  * something like:
882  *
883  *   block3 {
884  *   	...
885  *   	0029:021: mov.s32s32 r62.x, r1.y
886  *   	0082:022: br !p0.x, target=block5
887  *   	0083:023: br p0.x, target=block4
888  *   	// succs: if _[0029:021: mov.s32s32] block4; else block5;
889  *   }
890  *   block4 {
891  *   	0084:024: jump, target=block6
892  *   	// succs: block6;
893  *   }
894  *   block5 {
895  *   	0085:025: jump, target=block7
896  *   	// succs: block7;
897  *   }
898  *
899  * ie. only instruction in block4/block5 is a jump, so when
900  * resolving branches we can easily detect this by checking
901  * that the first instruction in the target block is itself
902  * a jump, and setup the br directly to the jump's target
903  * (and strip back out the now unreached jump)
904  *
905  * TODO sometimes we end up with things like:
906  *
907  *    br !p0.x, #2
908  *    br p0.x, #12
909  *    add.u r0.y, r0.y, 1
910  *
911  * If we swapped the order of the branches, we could drop one.
912  */
913 static struct ir3_block *
914 resolve_dest_block(struct ir3_block *block)
915 {
916    /* special case for last block: */
917    if (!block->successors[0])
918       return block;
919 
920    /* NOTE that we may or may not have inserted the jump
921     * in the target block yet, so conditions to resolve
922     * the dest to the dest block's successor are:
923     *
924     *   (1) successor[1] == NULL &&
925     *   (2) (block-is-empty || only-instr-is-jump)
926     */
927    if (block->successors[1] == NULL) {
928       if (list_is_empty(&block->instr_list)) {
929          return block->successors[0];
930       } else if (list_length(&block->instr_list) == 1) {
931          struct ir3_instruction *instr =
932             list_first_entry(&block->instr_list, struct ir3_instruction, node);
933          if (instr->opc == OPC_JUMP) {
934             /* If this jump is backwards, then we will probably convert
935              * the jump being resolved to a backwards jump, which will
936              * change a loop-with-continue or loop-with-if into a
937              * doubly-nested loop and change the convergence behavior.
938              * Disallow this here.
939              */
940             if (block->successors[0]->index <= block->index)
941                return block;
942             return block->successors[0];
943          }
944       }
945    }
946    return block;
947 }
948 
949 static void
950 remove_unused_block(struct ir3_block *old_target)
951 {
952    list_delinit(&old_target->node);
953 
954    /* cleanup dangling predecessors: */
955    for (unsigned i = 0; i < ARRAY_SIZE(old_target->successors); i++) {
956       if (old_target->successors[i]) {
957          struct ir3_block *succ = old_target->successors[i];
958          ir3_block_remove_predecessor(succ, old_target);
959       }
960    }
961 }
962 
963 static bool
964 retarget_jump(struct ir3_instruction *instr, struct ir3_block *new_target)
965 {
966    struct ir3_block *old_target = instr->cat0.target;
967    struct ir3_block *cur_block = instr->block;
968 
969    /* update the current block's successors to reflect the retargeting: */
970    if (cur_block->successors[0] == old_target) {
971       cur_block->successors[0] = new_target;
972    } else {
973       assert(cur_block->successors[1] == old_target);
974       cur_block->successors[1] = new_target;
975    }
976 
977    /* update new target's predecessors: */
978    ir3_block_add_predecessor(new_target, cur_block);
979 
980    /* and remove old_target's predecessor: */
981    ir3_block_remove_predecessor(old_target, cur_block);
982 
983    instr->cat0.target = new_target;
984 
985    if (old_target->predecessors_count == 0) {
986       remove_unused_block(old_target);
987       return true;
988    }
989 
990    return false;
991 }
992 
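/* Conditional branches whose sense can be flipped by invert_branch().
 * Unconditional jumps and getone/shps/predt-style terminators are not
 * invertible and are handled separately in block_sched()/prede_sched().
 */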
993 static bool
994 is_invertible_branch(struct ir3_instruction *instr)
995 {
996    switch (instr->opc) {
997    case OPC_BR:
998    case OPC_BRAA:
999    case OPC_BRAO:
1000    case OPC_BANY:
1001    case OPC_BALL:
1002       return true;
1003    default:
1004       return false;
1005    }
1006 }
1007 
1008 static bool
1009 opt_jump(struct ir3 *ir)
1010 {
1011    bool progress = false;
1012 
1013    unsigned index = 0;
1014    foreach_block (block, &ir->block_list)
1015       block->index = index++;
1016 
1017    foreach_block (block, &ir->block_list) {
1018       /* This pass destroys the physical CFG so don't keep it around to avoid
1019        * validation errors.
1020        */
1021       block->physical_successors_count = 0;
1022       block->physical_predecessors_count = 0;
1023 
1024       foreach_instr (instr, &block->instr_list) {
1025          if (!is_flow(instr) || !instr->cat0.target)
1026             continue;
1027 
1028          struct ir3_block *tblock = resolve_dest_block(instr->cat0.target);
1029          if (tblock != instr->cat0.target) {
1030             progress = true;
1031 
1032             /* Exit early if we deleted a block to avoid iterator
1033              * weirdness/assert fails
1034              */
1035             if (retarget_jump(instr, tblock))
1036                return true;
1037          }
1038       }
1039 
1040       /* Detect the case where the block ends either with:
1041        * - A single unconditional jump to the next block.
1042        * - Two jump instructions with opposite conditions, and one of the
1043        *   them jumps to the next block.
1044        * We can remove the one that jumps to the next block in either case.
1045        */
1046       if (list_is_empty(&block->instr_list))
1047          continue;
1048 
1049       struct ir3_instruction *jumps[2] = {NULL, NULL};
1050       jumps[0] =
1051          list_last_entry(&block->instr_list, struct ir3_instruction, node);
1052       if (!list_is_singular(&block->instr_list))
1053          jumps[1] =
1054             list_last_entry(&jumps[0]->node, struct ir3_instruction, node);
1055 
1056       if (jumps[0]->opc == OPC_JUMP)
1057          jumps[1] = NULL;
1058       else if (!is_invertible_branch(jumps[0]) || !jumps[1] ||
1059                !is_invertible_branch(jumps[1])) {
1060          continue;
1061       }
1062 
1063       for (unsigned i = 0; i < 2; i++) {
1064          if (!jumps[i])
1065             continue;
1066          struct ir3_block *tblock = jumps[i]->cat0.target;
1067          if (&tblock->node == block->node.next) {
1068             list_delinit(&jumps[i]->node);
1069             progress = true;
1070             break;
1071          }
1072       }
1073    }
1074 
1075    return progress;
1076 }
1077 
1078 static void
1079 resolve_jumps(struct ir3 *ir)
1080 {
1081    foreach_block (block, &ir->block_list)
1082       foreach_instr (instr, &block->instr_list)
1083          if (is_flow(instr) && instr->cat0.target) {
1084             struct ir3_instruction *target = list_first_entry(
1085                &instr->cat0.target->instr_list, struct ir3_instruction, node);
1086 
1087             instr->cat0.immed = (int)target->ip - (int)instr->ip;
1088          }
1089 }
1090 
1091 static void
1092 mark_jp(struct ir3_block *block)
1093 {
1094    /* We only call this on the end block (in kill_sched) or after retargeting
1095     * all jumps to empty blocks (in mark_xvergence_points) so there's no need to
1096     * worry about empty blocks.
1097     */
1098    assert(!list_is_empty(&block->instr_list));
1099 
1100    struct ir3_instruction *target =
1101       list_first_entry(&block->instr_list, struct ir3_instruction, node);
1102    target->flags |= IR3_INSTR_JP;
1103 }
1104 
1105 /* Mark points where control flow reconverges.
1106  *
1107  * Re-convergence points are where "parked" threads are reconverged with threads
1108  * that took the opposite path last time around. We already calculated them, we
1109  * just need to mark them with (jp).
1110  */
1111 static void
1112 mark_xvergence_points(struct ir3 *ir)
1113 {
1114    foreach_block (block, &ir->block_list) {
1115       if (block->reconvergence_point)
1116          mark_jp(block);
1117    }
1118 }
1119 
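/* Flip the sense of a conditional branch so that it targets
 * block->successors[1] instead of successors[0]: ball <-> bany and
 * braa <-> brao swap opcodes, plain br keeps its opcode, and in all cases
 * the inv1/inv2 condition bits are toggled.
 */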
1120 static void
1121 invert_branch(struct ir3_instruction *branch)
1122 {
1123    switch (branch->opc) {
1124    case OPC_BR:
1125       break;
1126    case OPC_BALL:
1127       branch->opc = OPC_BANY;
1128       break;
1129    case OPC_BANY:
1130       branch->opc = OPC_BALL;
1131       break;
1132    case OPC_BRAA:
1133       branch->opc = OPC_BRAO;
1134       break;
1135    case OPC_BRAO:
1136       branch->opc = OPC_BRAA;
1137       break;
1138    default:
1139       unreachable("can't get here");
1140    }
1141 
1142    branch->cat0.inv1 = !branch->cat0.inv1;
1143    branch->cat0.inv2 = !branch->cat0.inv2;
1144    branch->cat0.target = branch->block->successors[1];
1145 }
1146 
1147 /* Insert the branch/jump instructions for flow control between blocks.
1148  * Initially this is done naively, without considering if the successor
1149  * block immediately follows the current block (ie. so no jump required),
1150  * but that is cleaned up in opt_jump().
1151  */
1152 static void
1153 block_sched(struct ir3 *ir)
1154 {
1155    foreach_block (block, &ir->block_list) {
1156       struct ir3_instruction *terminator = ir3_block_get_terminator(block);
1157 
1158       if (block->successors[1]) {
1159          /* if/else, conditional branches to "then" or "else": */
1160          struct ir3_instruction *br1, *br2;
1161 
1162          assert(terminator);
1163          unsigned opc = terminator->opc;
1164 
1165          if (opc == OPC_GETONE || opc == OPC_SHPS || opc == OPC_GETLAST) {
1166             /* getone/shps can't be inverted, and it wouldn't even make sense
1167              * to follow it with an inverted branch, so follow it by an
1168              * unconditional branch.
1169              */
1170             assert(terminator->srcs_count == 0);
1171             br1 = terminator;
1172             br1->cat0.target = block->successors[1];
1173 
1174             br2 = ir3_JUMP(block);
1175             br2->cat0.target = block->successors[0];
1176          } else if (opc == OPC_BR || opc == OPC_BRAA || opc == OPC_BRAO ||
1177                     opc == OPC_BALL || opc == OPC_BANY) {
1178             /* create "else" branch first (since "then" block should
1179              * frequently/always end up being a fall-thru):
1180              */
1181             br1 = terminator;
1182             br2 = ir3_instr_clone(br1);
1183             invert_branch(br1);
1184             br2->cat0.target = block->successors[0];
1185          } else {
1186             assert(opc == OPC_PREDT || opc == OPC_PREDF);
1187 
1188             /* Handled by prede_sched. */
1189             terminator->cat0.target = block->successors[0];
1190             continue;
1191          }
1192 
1193          /* Creating br2 caused it to be moved before the terminator br1; move it
1194           * back.
1195           */
1196          ir3_instr_move_after(br2, br1);
1197       } else if (block->successors[0]) {
1198          /* otherwise unconditional jump or predt/predf to next block which
1199           * should already have been inserted.
1200           */
1201          assert(terminator);
1202          assert(terminator->opc == OPC_JUMP || terminator->opc == OPC_PREDT ||
1203                 terminator->opc == OPC_PREDF);
1204          terminator->cat0.target = block->successors[0];
1205       }
1206    }
1207 }
1208 
1209 static void
1210 prede_sched(struct ir3 *ir)
1211 {
1212    unsigned index = 0;
1213    foreach_block (block, &ir->block_list)
1214       block->index = index++;
1215 
1216    foreach_block (block, &ir->block_list) {
1217       /* Look for the following pattern generated by NIR lowering. The numbers
1218        * at the top of blocks are their index.
1219        *        |--- i ----|
1220        *        |   ...    |
1221        *        | pred[tf] |
1222        *        |----------|
1223        *      succ0 /   \ succ1
1224        * |-- i+1 ---| |-- i+2 ---|
1225        * |    ...   | |   ...    |
1226        * | pred[ft] | |   ...    |
1227        * |----------| |----------|
1228        *     succ0 \   / succ0
1229        *        |--- j ----|
1230        *        |   ...    |
1231        *        |----------|
1232        */
1233       struct ir3_block *succ0 = block->successors[0];
1234       struct ir3_block *succ1 = block->successors[1];
1235 
1236       if (!succ1)
1237          continue;
1238 
1239       struct ir3_instruction *terminator = ir3_block_get_terminator(block);
1240       if (!terminator)
1241          continue;
1242       if (terminator->opc != OPC_PREDT && terminator->opc != OPC_PREDF)
1243          continue;
1244 
1245       assert(!succ0->successors[1] && !succ1->successors[1]);
1246       assert(succ0->successors[0] == succ1->successors[0]);
1247       assert(succ0->predecessors_count == 1 && succ1->predecessors_count == 1);
1248       assert(succ0->index == (block->index + 1));
1249       assert(succ1->index == (block->index + 2));
1250 
1251       struct ir3_instruction *succ0_terminator =
1252          ir3_block_get_terminator(succ0);
1253       assert(succ0_terminator);
1254       assert(succ0_terminator->opc ==
1255              (terminator->opc == OPC_PREDT ? OPC_PREDF : OPC_PREDT));
1256 
1257       ASSERTED struct ir3_instruction *succ1_terminator =
1258          ir3_block_get_terminator(succ1);
1259       assert(!succ1_terminator || (succ1_terminator->opc == OPC_JUMP));
1260 
1261       /* Simple case: both successors contain instructions. Keep both blocks and
1262        * insert prede before the second successor's terminator:
1263        *        |--- i ----|
1264        *        |   ...    |
1265        *        | pred[tf] |
1266        *        |----------|
1267        *      succ0 /   \ succ1
1268        * |-- i+1 ---| |-- i+2 ---|
1269        * |    ...   | |   ...    |
1270        * | pred[ft] | | prede    |
1271        * |----------| |----------|
1272        *     succ0 \   / succ0
1273        *        |--- j ----|
1274        *        |   ...    |
1275        *        |----------|
1276        */
1277       if (!list_is_empty(&succ1->instr_list)) {
1278          ir3_PREDE(succ1);
1279          continue;
1280       }
1281 
1282       /* Second successor is empty so we can remove it:
1283        *        |--- i ----|
1284        *        |   ...    |
1285        *        | pred[tf] |
1286        *        |----------|
1287        *      succ0 /   \ succ1
1288        * |-- i+1 ---|   |
1289        * |    ...   |   |
1290        * |   prede  |   |
1291        * |----------|   |
1292        *     succ0 \    /
1293        *        |--- j ----|
1294        *        |   ...    |
1295        *        |----------|
1296        */
1297       list_delinit(&succ0_terminator->node);
1298       ir3_PREDE(succ0);
1299       remove_unused_block(succ1);
1300       block->successors[1] = succ0->successors[0];
1301       ir3_block_add_predecessor(succ0->successors[0], block);
1302    }
1303 }
1304 
1305 /* Here we workaround the fact that kill doesn't actually kill the thread as
1306  * GL expects. The last instruction always needs to be an end instruction,
1307  * which means that if we're stuck in a loop where kill is the only way out,
1308  * then we may have to jump out to the end. kill may also have the d3d
1309  * semantics of converting the thread to a helper thread, rather than setting
1310  * the exec mask to 0, in which case the helper thread could get stuck in an
1311  * infinite loop.
1312  *
1313  * We do this late, both to give the scheduler the opportunity to reschedule
1314  * kill instructions earlier and to avoid having to create a separate basic
1315  * block.
1316  *
1317  * TODO: Assuming that the wavefront doesn't stop as soon as all threads are
1318  * killed, we might benefit by doing this more aggressively when the remaining
1319  * part of the program after the kill is large, since that would let us
1320  * skip over the instructions when there are no non-killed threads left.
1321  */
1322 static void
1323 kill_sched(struct ir3 *ir, struct ir3_shader_variant *so)
1324 {
1325    ir3_count_instructions(ir);
1326 
1327    /* True if we know that this block will always eventually lead to the end
1328     * block:
1329     */
1330    bool always_ends = true;
1331    bool added = false;
1332    struct ir3_block *last_block =
1333       list_last_entry(&ir->block_list, struct ir3_block, node);
1334 
1335    foreach_block_rev (block, &ir->block_list) {
1336       for (unsigned i = 0; i < 2 && block->successors[i]; i++) {
1337          if (block->successors[i]->start_ip <= block->end_ip)
1338             always_ends = false;
1339       }
1340 
1341       if (always_ends)
1342          continue;
1343 
1344       foreach_instr_safe (instr, &block->instr_list) {
1345          if (instr->opc != OPC_KILL)
1346             continue;
1347 
1348          struct ir3_instruction *br = ir3_instr_create(block, OPC_BR, 0, 1);
1349          ir3_src_create(br, instr->srcs[0]->num, instr->srcs[0]->flags)->wrmask =
1350             1;
1351          br->cat0.target =
1352             list_last_entry(&ir->block_list, struct ir3_block, node);
1353 
1354          list_del(&br->node);
1355          list_add(&br->node, &instr->node);
1356 
1357          added = true;
1358       }
1359    }
1360 
1361    if (added) {
1362       /* I'm not entirely sure how the branchstack works, but we probably
1363        * need to add at least one entry for the divergence which is resolved
1364        * at the end:
1365        */
1366       so->branchstack++;
1367 
1368       /* We don't update predecessors/successors, so we have to do this
1369        * manually:
1370        */
1371       mark_jp(last_block);
1372    }
1373 }
1374 
1375 static void
1376 dbg_sync_sched(struct ir3 *ir, struct ir3_shader_variant *so)
1377 {
1378    foreach_block (block, &ir->block_list) {
1379       foreach_instr_safe (instr, &block->instr_list) {
1380          if (is_ss_producer(instr) || is_sy_producer(instr)) {
1381             struct ir3_instruction *nop = ir3_NOP(block);
1382             nop->flags |= IR3_INSTR_SS | IR3_INSTR_SY;
1383             ir3_instr_move_after(nop, instr);
1384          }
1385       }
1386    }
1387 }
1388 
1389 static void
1390 dbg_nop_sched(struct ir3 *ir, struct ir3_shader_variant *so)
1391 {
1392    foreach_block (block, &ir->block_list) {
1393       foreach_instr_safe (instr, &block->instr_list) {
1394          struct ir3_instruction *nop = ir3_NOP(block);
1395          nop->repeat = 5;
1396          ir3_instr_move_before(nop, instr);
1397       }
1398    }
1399 }
1400 
1401 static void
1402 dbg_expand_rpt(struct ir3 *ir)
1403 {
1404    foreach_block (block, &ir->block_list) {
1405       foreach_instr_safe (instr, &block->instr_list) {
1406          if (instr->repeat == 0 || instr->opc == OPC_NOP ||
1407              instr->opc == OPC_SWZ || instr->opc == OPC_GAT ||
1408              instr->opc == OPC_SCT) {
1409             continue;
1410          }
1411 
1412          for (unsigned i = 0; i <= instr->repeat; ++i) {
1413             struct ir3_instruction *rpt = ir3_instr_clone(instr);
1414             ir3_instr_move_before(rpt, instr);
1415             rpt->repeat = 0;
1416 
1417             foreach_dst (dst, rpt) {
1418                dst->num += i;
1419                dst->wrmask = 1;
1420             }
1421 
1422             foreach_src (src, rpt) {
1423                if (!(src->flags & IR3_REG_R))
1424                   continue;
1425 
1426                src->num += i;
1427                src->uim_val += i;
1428                src->wrmask = 1;
1429                src->flags &= ~IR3_REG_R;
1430             }
1431          }
1432 
1433          list_delinit(&instr->node);
1434       }
1435    }
1436 }
1437 
1438 struct ir3_helper_block_data {
1439    /* Whether helper invocations may be used on any path starting at the
1440     * beginning of the block.
1441     */
1442    bool uses_helpers_beginning;
1443 
1444    /* Whether helper invocations may be used by the end of the block. Branch
1445     * instructions are considered to be "between" blocks, because (eq) has to be
1446     * inserted after them in the successor blocks, so branch instructions using
1447     * helpers will result in uses_helpers_end = true for their block.
1448     */
1449    bool uses_helpers_end;
1450 };
1451 
1452 /* Insert (eq) after the last instruction using the results of helper
1453  * invocations. Use a backwards dataflow analysis to determine at which points
1454  * in the program helper invocations are definitely never used, and then insert
1455  * (eq) at the point where we cross from a point where they may be used to a
1456  * point where they are never used.
1457  */
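/* Roughly, the backwards propagation below computes, over physical CFG edges
 * and iterated to a fixed point:
 *
 *    uses_helpers_end(P)       |= uses_helpers_beginning(B)
 *    uses_helpers_beginning(P) |= uses_helpers_beginning(B)
 *
 * for every physical predecessor P of a block B, where
 * uses_helpers_beginning(B) is seeded by the initial per-block scan.
 */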
1458 static void
1459 helper_sched(struct ir3_legalize_ctx *ctx, struct ir3 *ir,
1460              struct ir3_shader_variant *so)
1461 {
1462    bool non_prefetch_helpers = false;
1463 
1464    foreach_block (block, &ir->block_list) {
1465       struct ir3_helper_block_data *bd =
1466          rzalloc(ctx, struct ir3_helper_block_data);
1467       foreach_instr (instr, &block->instr_list) {
1468          if (uses_helpers(instr)) {
1469             bd->uses_helpers_beginning = true;
1470             if (instr->opc != OPC_META_TEX_PREFETCH) {
1471                non_prefetch_helpers = true;
1472             }
1473          }
1474 
1475          if (instr->opc == OPC_SHPE) {
1476             /* (eq) is not allowed in preambles, so mark the whole preamble as
1477              * requiring helpers to avoid putting it there.
1478              */
1479             bd->uses_helpers_beginning = true;
1480             bd->uses_helpers_end = true;
1481          }
1482       }
1483 
1484       struct ir3_instruction *terminator = ir3_block_get_terminator(block);
1485       if (terminator) {
1486          if (terminator->opc == OPC_BALL || terminator->opc == OPC_BANY ||
1487              (terminator->opc == OPC_GETONE &&
1488               (terminator->flags & IR3_INSTR_NEEDS_HELPERS))) {
1489             bd->uses_helpers_beginning = true;
1490             bd->uses_helpers_end = true;
1491             non_prefetch_helpers = true;
1492          }
1493       }
1494 
1495       block->data = bd;
1496    }
1497 
1498    /* If only prefetches use helpers then we can disable them in the shader via
1499     * a register setting.
1500     */
1501    if (!non_prefetch_helpers) {
1502       so->prefetch_end_of_quad = true;
1503       return;
1504    }
1505 
1506    bool progress;
1507    do {
1508       progress = false;
1509       foreach_block_rev (block, &ir->block_list) {
1510          struct ir3_helper_block_data *bd = block->data;
1511 
1512          if (!bd->uses_helpers_beginning)
1513             continue;
1514 
1515          for (unsigned i = 0; i < block->physical_predecessors_count; i++) {
1516             struct ir3_block *pred = block->physical_predecessors[i];
1517             struct ir3_helper_block_data *pred_bd = pred->data;
1518             if (!pred_bd->uses_helpers_end) {
1519                pred_bd->uses_helpers_end = true;
1520             }
1521             if (!pred_bd->uses_helpers_beginning) {
1522                pred_bd->uses_helpers_beginning = true;
1523                progress = true;
1524             }
1525          }
1526       }
1527    } while (progress);
1528 
1529    /* Now, we need to determine the points where helper invocations become
1530     * unused.
1531     */
1532    foreach_block (block, &ir->block_list) {
1533       struct ir3_helper_block_data *bd = block->data;
1534       if (bd->uses_helpers_end)
1535          continue;
1536 
1537       /* We need to check the predecessors because of situations with critical
1538        * edges like the following, which can occur after optimizing jumps:
1539        *
1540        *    br p0.x, #endif
1541        *    ...
1542        *    sam ...
1543        *    ...
1544        *    endif:
1545        *    ...
1546        *    end
1547        *
1548        * The endif block will have uses_helpers_beginning = false and
1549        * uses_helpers_end = false, but because we jump there from the
1550        * beginning of the if where uses_helpers_end = true, we still want to
1551        * add an (eq) at the beginning of the block:
1552        *
1553        *    br p0.x, #endif
1554        *    ...
1555        *    sam ...
1556        *    (eq)nop
1557        *    ...
1558        *    endif:
1559        *    (eq)nop
1560        *    ...
1561        *    end
1562        *
1563        * This adds an extra nop in the case where the branch isn't taken, but that's
1564        * probably preferable to adding an extra jump instruction which is what
1565        * would happen if we ran this pass before optimizing jumps:
1566        *
1567        *    br p0.x, #else
1568        *    ...
1569        *    sam ...
1570        *    (eq)nop
1571        *    ...
1572        *    jump #endif
1573        *    else:
1574        *    (eq)nop
1575        *    endif:
1576        *    ...
1577        *    end
1578        *
1579        * We also need this to make sure we insert (eq) after branches which use
1580        * helper invocations.
1581        */
1582       bool pred_uses_helpers = bd->uses_helpers_beginning;
1583       for (unsigned i = 0; i < block->physical_predecessors_count; i++) {
1584          struct ir3_block *pred = block->physical_predecessors[i];
1585          struct ir3_helper_block_data *pred_bd = pred->data;
1586          if (pred_bd->uses_helpers_end) {
1587             pred_uses_helpers = true;
1588             break;
1589          }
1590       }
1591 
1592       if (!pred_uses_helpers)
1593          continue;
1594 
1595       /* The last use of helpers is somewhere between the beginning and the
1596        * end. first_instr will be the first instruction where helpers are no
1597        * longer required, or NULL if helpers are still needed at the end of the block.
1598        */
1599       struct ir3_instruction *first_instr = NULL;
1600       foreach_instr_rev (instr, &block->instr_list) {
1601          /* Skip prefetches because they actually execute before the block
1602           * starts and at this stage they aren't guaranteed to be at the start
1603           * of the block.
1604           */
1605          if (uses_helpers(instr) && instr->opc != OPC_META_TEX_PREFETCH)
1606             break;
1607          first_instr = instr;
1608       }
1609 
1610       bool killed = false;
1611       bool expensive_instruction_in_block = false;
1612       if (first_instr) {
1613          foreach_instr_from (instr, first_instr, &block->instr_list) {
1614             /* If there's already a nop, we don't have to worry about whether to
1615              * insert one.
1616              */
1617             if (instr->opc == OPC_NOP) {
1618                instr->flags |= IR3_INSTR_EQ;
1619                killed = true;
1620                break;
1621             }
1622 
1623             /* ALU and SFU instructions probably aren't going to benefit much
1624              * from killing helper invocations, because they complete at least
1625              * an entire quad in a cycle and don't access any quad-divergent
1626              * memory, so delay emitting (eq) in the hopes that we find a nop
1627              * afterwards.
1628              */
1629             if (is_alu(instr) || is_sfu(instr))
1630                continue;
1631             if (instr->opc == OPC_PREDE)
1632                continue;
1633 
1634             expensive_instruction_in_block = true;
1635             break;
1636          }
1637       }
1638 
1639       /* If this block isn't the last block before the end instruction, assume
1640        * that there may be expensive instructions in later blocks so it's worth
1641        * it to insert a nop.
1642        */
1643       if (!killed && (expensive_instruction_in_block ||
1644                       block->successors[0] != ir3_end_block(ir))) {
1645          struct ir3_instruction *nop = ir3_NOP(block);
1646          nop->flags |= IR3_INSTR_EQ;
1647          if (first_instr)
1648             ir3_instr_move_before(nop, first_instr);
1649       }
1650    }
1651 }
1652 
1653 bool
1654 ir3_legalize(struct ir3 *ir, struct ir3_shader_variant *so, int *max_bary)
1655 {
1656    struct ir3_legalize_ctx *ctx = rzalloc(ir, struct ir3_legalize_ctx);
1657    bool mergedregs = so->mergedregs;
1658    bool progress;
1659 
1660    ctx->so = so;
1661    ctx->max_bary = -1;
1662    ctx->compiler = ir->compiler;
1663    ctx->type = ir->type;
1664 
1665    /* allocate per-block data: */
1666    foreach_block (block, &ir->block_list) {
1667       struct ir3_legalize_block_data *bd =
1668          rzalloc(ctx, struct ir3_legalize_block_data);
1669 
1670       regmask_init(&bd->state.needs_ss_war, mergedregs);
1671       regmask_init(&bd->state.needs_ss_or_sy_war, mergedregs);
1672       regmask_init(&bd->state.needs_ss_scalar_war, mergedregs);
1673       regmask_init(&bd->state.needs_ss_or_sy_scalar_war, mergedregs);
1674       regmask_init(&bd->state.needs_ss_scalar_full, mergedregs);
1675       regmask_init(&bd->state.needs_ss_scalar_half, mergedregs);
1676       regmask_init(&bd->state.needs_ss, mergedregs);
1677       regmask_init(&bd->state.needs_sy, mergedregs);
1678       regmask_init(&bd->begin_state.needs_ss_war, mergedregs);
1679       regmask_init(&bd->begin_state.needs_ss_or_sy_war, mergedregs);
1680       regmask_init(&bd->begin_state.needs_ss_scalar_war, mergedregs);
1681       regmask_init(&bd->begin_state.needs_ss_or_sy_scalar_war, mergedregs);
1682       regmask_init(&bd->begin_state.needs_ss_scalar_full, mergedregs);
1683       regmask_init(&bd->begin_state.needs_ss_scalar_half, mergedregs);
1684       regmask_init(&bd->begin_state.needs_ss, mergedregs);
1685       regmask_init(&bd->begin_state.needs_sy, mergedregs);
1686 
1687       block->data = bd;
1688    }
1689 
1690    /* We may have failed to pull all input loads into the first block.
1691     * In such a case we currently aren't able to find a better place
1692     * for (ei) than the end of the program.
1693     * a5xx and a6xx do automatically release varying storage at the end.
1694     */
1695    ctx->early_input_release = true;
1696 
1697    struct ir3_block *start_block = ir3_after_preamble(ir);
1698 
1699    /* Gather information to determine whether we can enable early preamble.
1700     */
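   /* The early preamble apparently executes before normal per-fiber register
    * state is set up, so it is only enabled when the preamble writes no
    * non-shared GPRs or predicates and uses no relative addressing (checked
    * below).
    */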
1701    bool gpr_in_preamble = false;
1702    bool pred_in_preamble = false;
1703    bool relative_in_preamble = false;
1704    bool in_preamble = start_block != ir3_start_block(ir);
1705    bool has_preamble = start_block != ir3_start_block(ir);
1706 
1707    foreach_block (block, &ir->block_list) {
1708       if (block == start_block)
1709          in_preamble = false;
1710 
1711       foreach_instr (instr, &block->instr_list) {
1712          if (is_input(instr)) {
1713             ctx->has_inputs = true;
1714             if (block != start_block) {
1715                ctx->early_input_release = false;
1716             }
1717          }
1718 
1719          if (is_meta(instr))
1720             continue;
1721 
1722          foreach_src (reg, instr) {
1723             if (in_preamble) {
1724                if (!(reg->flags & (IR3_REG_IMMED | IR3_REG_CONST | IR3_REG_SHARED)) &&
1725                    is_reg_gpr(reg))
1726                   gpr_in_preamble = true;
1727                if (reg->flags & IR3_REG_RELATIV)
1728                   relative_in_preamble = true;
1729             }
1730          }
1731 
1732          foreach_dst (reg, instr) {
1733             if (is_dest_gpr(reg)) {
1734                if (in_preamble) {
1735                   if (!(reg->flags & IR3_REG_SHARED))
1736                      gpr_in_preamble = true;
1737                   if (reg->flags & IR3_REG_RELATIV)
1738                      relative_in_preamble = true;
1739                }
1740             }
1741          }
1742 
1743          if (in_preamble && writes_pred(instr)) {
1744             pred_in_preamble = true;
1745          }
1746       }
1747    }
1748 
1749    so->early_preamble = has_preamble && !gpr_in_preamble &&
1750       !pred_in_preamble && !relative_in_preamble &&
1751       ir->compiler->has_early_preamble &&
1752       !(ir3_shader_debug & IR3_DBG_NOEARLYPREAMBLE);
1753 
1754    /* On a7xx, sync behavior for a1.x is different in the early preamble. RaW
1755     * dependencies must be synchronized with (ss), and there must be an extra
1756     * (r) on the source of the mova1 instruction.
1757     */
1758    if (so->early_preamble && ir->compiler->gen >= 7) {
1759       foreach_block (block, &ir->block_list) {
1760          if (block == start_block)
1761             break;
1762          block->in_early_preamble = true;
1763       }
1764    }
1765 
1766    assert(ctx->early_input_release || ctx->compiler->gen >= 5);
1767 
1768    if (ir3_shader_debug & IR3_DBG_EXPANDRPT) {
1769       dbg_expand_rpt(ir);
1770    }
1771 
1772    /* process each block: */
1773    do {
1774       progress = false;
1775       foreach_block (block, &ir->block_list) {
1776          progress |= legalize_block(ctx, block);
1777       }
1778    } while (progress);
1779 
1780    *max_bary = ctx->max_bary;
1781 
1782    foreach_block (block, &ir->block_list) {
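   /* A (getone) terminator is presumably the preamble guard; the push-consts
    * load macro is expanded in its successor block, and only the first such
    * block is handled.
    */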
1783       struct ir3_instruction *terminator = ir3_block_get_terminator(block);
1784       if (terminator && terminator->opc == OPC_GETONE) {
1785          apply_push_consts_load_macro(ctx, block->successors[0]);
1786          break;
1787       }
1788    }
1789 
1790    block_sched(ir);
1791 
1792    foreach_block (block, &ir->block_list) {
1793       progress |= apply_fine_deriv_macro(ctx, block);
1794    }
1795 
1796    if (ir3_shader_debug & IR3_DBG_FULLSYNC) {
1797       dbg_sync_sched(ir, so);
1798    }
1799 
1800    if (ir3_shader_debug & IR3_DBG_FULLNOP) {
1801       dbg_nop_sched(ir, so);
1802    }
1803 
1804    bool cfg_changed = false;
1805    while (opt_jump(ir))
1806       cfg_changed = true;
1807 
1808    prede_sched(ir);
1809 
1810    if (cfg_changed)
1811       ir3_calc_reconvergence(so);
1812 
1813    if (so->type == MESA_SHADER_FRAGMENT)
1814       kill_sched(ir, so);
1815 
1816    /* TODO: does (eq) exist before a6xx? */
1817    if (so->type == MESA_SHADER_FRAGMENT && so->need_pixlod &&
1818        so->compiler->gen >= 6)
1819       helper_sched(ctx, ir, so);
1820 
1821    foreach_block (block, &ir->block_list) {
1822       progress |= expand_dummy_dests(block);
1823    }
1824 
1825    ir3_count_instructions(ir);
1826    resolve_jumps(ir);
1827 
1828    mark_xvergence_points(ir);
1829 
1830    ralloc_free(ctx);
1831 
1832    return true;
1833 }
1834