1 /*
2  * Copyright © 2014 Rob Clark <[email protected]>
3  * SPDX-License-Identifier: MIT
4  *
5  * Authors:
6  *    Rob Clark <[email protected]>
7  */
8 
9 #include "util/dag.h"
10 #include "util/u_math.h"
11 
12 #include "ir3.h"
13 #include "ir3_compiler.h"
14 
15 #if MESA_DEBUG
16 #define SCHED_DEBUG (ir3_shader_debug & IR3_DBG_SCHEDMSGS)
17 #else
18 #define SCHED_DEBUG 0
19 #endif
20 #define d(fmt, ...)                                                            \
21    do {                                                                        \
22       if (SCHED_DEBUG) {                                                       \
23          mesa_logi("SCHED: " fmt, ##__VA_ARGS__);                              \
24       }                                                                        \
25    } while (0)
26 
27 #define di(instr, fmt, ...)                                                    \
28    do {                                                                        \
29       if (SCHED_DEBUG) {                                                       \
30          struct log_stream *stream = mesa_log_streami();                       \
31          mesa_log_stream_printf(stream, "SCHED: " fmt ": ", ##__VA_ARGS__);    \
32          ir3_print_instr_stream(stream, instr);                                \
33          mesa_log_stream_destroy(stream);                                      \
34       }                                                                        \
35    } while (0)
36 
37 /*
38  * Instruction Scheduling:
39  *
40  * A block-level pre-RA scheduler, which works by creating a DAG of
41  * instruction dependencies, and heuristically picking a DAG head
42  * (instruction with no unscheduled dependencies).
43  *
44  * Where possible, it tries to pick instructions that avoid nop delay
45  * slots, but it will prefer to pick instructions that reduce (or do
46  * not increase) the number of live values.
47  *
48  * If the only possible choices are instructions that increase the
49  * number of live values, it will try to pick the one with the earliest
50  * consumer (based on pre-sched program order).
51  *
52  * There are a few special cases that need to be handled, since sched
53  * is currently independent of register allocation.  Usages of address
54  * register (a0.x) or predicate register (p0.x) must be serialized.  Ie.
55  * if you have two pairs of instructions that write the same special
56  * register and then read it, then those pairs cannot be interleaved.
57  * To solve this, when we are in such a scheduling "critical section",
58  * and we encounter a conflicting write to a special register, we try
59  * to schedule any remaining instructions that use that value first.
60  *
61  * TODO we can detect too-large live_values here.. would be a good place
62  * to "spill" cheap things, like move from uniform/immed.  (Constructing
63  * list of ssa def consumers before the sched pass would make this easier.)
64  * Also, in general it might be best not to re-use load_immed
65  * across blocks.
66  *
67  * TODO we can use (abs)/(neg) src modifiers in a lot of cases to reduce
68  * the # of immediates in play (or at least that would help with
69  * dEQP-GLES31.functional.ubo.random.all_per_block_buffers.*).. probably
70  * do this in a nir pass that inserts fneg/etc?  The cp pass should fold
71  * these into src modifiers..
72  */
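/* Illustrative sketch of the a0.x "critical section" rule above.  The IR
 * below is schematic (register numbers and syntax are made up, not taken
 * from a real shader):
 *
 *    mov a0.x, r1.y            ; pair A writes a0.x
 *    mov r2.x, r<a0.x + 4>     ; pair A reads via a0.x
 *    mov a0.x, r1.z            ; pair B writes a0.x
 *    mov r3.x, r<a0.x + 4>     ; pair B reads via a0.x
 *
 * Pair B's write cannot be scheduled between pair A's write and read, so once
 * A's write is scheduled we try to schedule A's remaining users before
 * allowing B's conflicting write.
 */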
73 
74 struct ir3_sched_ctx {
75    struct ir3_compiler *compiler;
76    struct ir3_block *block; /* the current block */
77    struct dag *dag;
78 
79    struct list_head unscheduled_list; /* unscheduled instructions */
80    struct ir3_instruction *scheduled; /* last scheduled instr */
81    struct ir3_instruction *addr0;     /* current a0.x user, if any */
82    struct ir3_instruction *addr1;     /* current a1.x user, if any */
83 
84    struct ir3_instruction *split; /* most-recently-split a0/a1 producer */
85 
86    int remaining_kills;
87    int remaining_tex;
88 
89    bool error;
90 
91    unsigned ip;
92 
93    int sy_delay;
94    int ss_delay;
95 
96    /* We order the scheduled (sy)/(ss) producers, and keep track of the
97     * index of the last waited on instruction, so we can know which
98     * instructions are still outstanding (and therefore would require us to
99     * wait for all outstanding instructions before scheduling a use).
100     */
101    int sy_index, first_outstanding_sy_index;
102    int ss_index, first_outstanding_ss_index;
103 };
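/* Example of the index bookkeeping above (numbers made up): if sy_index is 5
 * and first_outstanding_sy_index is 2, the producers that got indices 2..4
 * have not been waited on yet.  Scheduling a use of any of them implies a
 * (sy) sync, after which first_outstanding_sy_index is advanced to 5 and
 * nothing is outstanding anymore.  The (ss) indices work the same way.
 */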
104 
105 struct ir3_sched_node {
106    struct dag_node dag; /* must be first for util_dynarray_foreach */
107    struct ir3_instruction *instr;
108 
109    unsigned delay;
110    unsigned max_delay;
111 
112    unsigned sy_index;
113    unsigned ss_index;
114 
115    /* For ready instructions, the earliest possible ip at which it could be
116     * scheduled.
117     */
118    unsigned earliest_ip;
119 
120    /* For instructions that are a meta:collect src, once we schedule
121     * the first src of the collect, the entire vecN is live (at least
122     * from the PoV of the first RA pass.. the 2nd scalar pass can fill
123     * in some of the gaps, but often not all).  So we want to help out
124     * RA, and realize that as soon as we schedule the first collect
125     * src, there is no penalty to schedule the remainder (ie. they
126     * don't make additional values live).  In fact we'd prefer to
127     * schedule the rest ASAP to minimize the live range of the vecN.
128     *
129     * For instructions that are the src of a collect, we track the
130     * corresponding collect, and mark them as partially live as soon
131     * as any one of the src's is scheduled.
132     */
133    struct ir3_instruction *collect;
134    bool partially_live;
135 
136    /* Is this instruction a direct or indirect dependency for a kill?
137     * If so, we should prioritize it when possible
138     */
139    bool kill_path;
140 
141    /* This node represents a shader output.  A semi-common pattern in
142     * shaders is something along the lines of:
143     *
144     *    fragcolor.w = 1.0
145     *
146     * Which we'd prefer to schedule as late as possible, since it
147     * produces a live value that is never killed/consumed.  So detect
148     * outputs up-front, and avoid scheduling them unless they reduce
149     * register pressure (or at least are neutral)
150     */
151    bool output;
152 };
153 
154 #define foreach_sched_node(__n, __list)                                        \
155    list_for_each_entry (struct ir3_sched_node, __n, __list, dag.link)
156 
157 static void sched_node_init(struct ir3_sched_ctx *ctx,
158                             struct ir3_instruction *instr);
159 static void sched_node_add_dep(struct ir3_sched_ctx *ctx,
160                                struct ir3_instruction *instr,
161                                struct ir3_instruction *src, int i);
162 
163 static bool
164 is_scheduled(struct ir3_instruction *instr)
165 {
166    return !!(instr->flags & IR3_INSTR_MARK);
167 }
168 
169 /* check_src_cond() passing the user and ir3_sched_ctx. */
170 static bool
171 sched_check_src_cond(struct ir3_instruction *instr,
172                      bool (*cond)(struct ir3_instruction *,
173                                   struct ir3_instruction *,
174                                   struct ir3_sched_ctx *),
175                      struct ir3_sched_ctx *ctx)
176 {
177    foreach_ssa_src (src, instr) {
178       /* meta:split/collect aren't real instructions, the thing that
179        * we actually care about is *their* srcs
180        */
181       if ((src->opc == OPC_META_SPLIT) || (src->opc == OPC_META_COLLECT)) {
182          if (sched_check_src_cond(src, cond, ctx))
183             return true;
184       } else {
185          if (cond(src, instr, ctx))
186             return true;
187       }
188    }
189 
190    return false;
191 }
192 
193 /* Is this a sy producer that hasn't been waited on yet? */
194 
195 static bool
196 is_outstanding_sy(struct ir3_instruction *instr, struct ir3_instruction *use,
197                   struct ir3_sched_ctx *ctx)
198 {
199    if (!is_sy_producer(instr))
200       return false;
201 
202    /* The sched node is only valid within the same block, we cannot
203     * really say anything about src's from other blocks
204     */
205    if (instr->block != ctx->block)
206       return true;
207 
208    struct ir3_sched_node *n = instr->data;
209    return n->sy_index >= ctx->first_outstanding_sy_index;
210 }
211 
212 static bool
213 is_outstanding_ss(struct ir3_instruction *instr, struct ir3_instruction *use,
214                   struct ir3_sched_ctx *ctx)
215 {
216    if (!needs_ss(ctx->compiler, instr, use))
217       return false;
218 
219    /* The sched node is only valid within the same block, we cannot
220     * really say anything about src's from other blocks
221     */
222    if (instr->block != ctx->block)
223       return true;
224 
225    struct ir3_sched_node *n = instr->data;
226    return n->ss_index >= ctx->first_outstanding_ss_index;
227 }
228 
229 static unsigned
230 cycle_count(struct ir3_instruction *instr)
231 {
232    if (instr->opc == OPC_META_COLLECT) {
233       /* Assume that only immed/const sources produce moves */
234       unsigned n = 0;
235       foreach_src (src, instr) {
236          if (src->flags & (IR3_REG_IMMED | IR3_REG_CONST))
237             n++;
238       }
239       return n;
240    } else if (is_meta(instr)) {
241       return 0;
242    } else {
243       return 1;
244    }
245 }
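/* For example (hypothetical): a meta:collect with srcs { r0.x, an immediate,
 * a const, r0.y } counts as 2 cycles, since only the immed/const srcs are
 * expected to turn into moves, while a regular ALU instruction counts as 1
 * and other meta instructions count as 0.
 */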
246 
247 static void
248 schedule(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
249 {
250    assert(ctx->block == instr->block);
251 
252    /* remove from the unscheduled list:
253     */
254    list_delinit(&instr->node);
255 
256    if (writes_addr0(instr)) {
257       assert(ctx->addr0 == NULL);
258       ctx->addr0 = instr;
259    }
260 
261    if (writes_addr1(instr)) {
262       assert(ctx->addr1 == NULL);
263       ctx->addr1 = instr;
264    }
265 
266    instr->flags |= IR3_INSTR_MARK;
267 
268    di(instr, "schedule");
269 
270    list_addtail(&instr->node, &instr->block->instr_list);
271    ctx->scheduled = instr;
272 
273    if (is_kill_or_demote(instr)) {
274       assert(ctx->remaining_kills > 0);
275       ctx->remaining_kills--;
276    }
277 
278    struct ir3_sched_node *n = instr->data;
279 
280    /* If this instruction is a meta:collect src, mark the remaining
281     * collect srcs as partially live.
282     */
283    if (n->collect) {
284       foreach_ssa_src (src, n->collect) {
285          if (src->block != instr->block)
286             continue;
287          struct ir3_sched_node *sn = src->data;
288          sn->partially_live = true;
289       }
290    }
291 
292    bool counts_for_delay = is_alu(instr) || is_flow(instr);
293 
294    /* TODO: switch to "cycles". For now try to match ir3_delay. */
295    unsigned delay_cycles = counts_for_delay ? 1 + instr->repeat : 0;
296 
297    /* We insert any nop's needed to get to earliest_ip, then advance
298     * delay_cycles by scheduling the instruction.
299     */
300    ctx->ip = MAX2(ctx->ip, n->earliest_ip) + delay_cycles;
301 
302    util_dynarray_foreach (&n->dag.edges, struct dag_edge, edge) {
303       unsigned delay = (unsigned)(uintptr_t)edge->data;
304       struct ir3_sched_node *child =
305          container_of(edge->child, struct ir3_sched_node, dag);
306       child->earliest_ip = MAX2(child->earliest_ip, ctx->ip + delay);
307    }
308 
309    dag_prune_head(ctx->dag, &n->dag);
310 
311    unsigned cycles = cycle_count(instr);
312 
313    if (is_ss_producer(instr)) {
314       ctx->ss_delay = soft_ss_delay(instr);
315       n->ss_index = ctx->ss_index++;
316    } else if (!is_meta(instr) &&
317               sched_check_src_cond(instr, is_outstanding_ss, ctx)) {
318       ctx->ss_delay = 0;
319       ctx->first_outstanding_ss_index = ctx->ss_index;
320    } else if (ctx->ss_delay > 0) {
321       ctx->ss_delay -= MIN2(cycles, ctx->ss_delay);
322    }
323 
324    if (is_sy_producer(instr)) {
325       /* NOTE that this isn't an attempt to hide texture fetch latency,
326        * but an attempt to hide the cost of switching to another warp.
327        * If we can, we'd like to try to schedule another texture fetch
328        * before scheduling something that would sync.
329        */
330       ctx->sy_delay = soft_sy_delay(instr, ctx->block->shader);
331       assert(ctx->remaining_tex > 0);
332       ctx->remaining_tex--;
333       n->sy_index = ctx->sy_index++;
334    } else if (!is_meta(instr) &&
335               sched_check_src_cond(instr, is_outstanding_sy, ctx)) {
336       ctx->sy_delay = 0;
337       ctx->first_outstanding_sy_index = ctx->sy_index;
338    } else if (ctx->sy_delay > 0) {
339       ctx->sy_delay -= MIN2(cycles, ctx->sy_delay);
340    }
341 
342 }
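/* Worked example of the ip bookkeeping in schedule() (numbers made up): with
 * ctx->ip at 10 and a node whose earliest_ip is 12, scheduling an ALU instr
 * with repeat=0 accounts for the 2 nops plus 1 cycle, leaving ctx->ip at 13.
 * A child reachable over a DAG edge with delay 3 then has its earliest_ip
 * raised to at least 16.
 */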
343 
344 struct ir3_sched_notes {
345    /* there is at least one kill which could be scheduled, except
346     * for unscheduled bary.f's:
347     */
348    bool blocked_kill;
349    /* there is at least one instruction that could be scheduled,
350     * except for conflicting address register usage:
351     */
352    bool addr0_conflict, addr1_conflict;
353 };
354 
355 static bool
356 should_skip(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
357 {
358    if (ctx->remaining_kills && (is_tex(instr) || is_mem(instr))) {
359       /* avoid texture/memory access if we have unscheduled kills
360        * that could make the expensive operation unnecessary.  By
361        * definition, if there are remaining kills, and this instr
362        * is not a dependency of a kill, there are other instructions
363        * that we can choose from.
364        */
365       struct ir3_sched_node *n = instr->data;
366       if (!n->kill_path)
367          return true;
368    }
369 
370    return false;
371 }
372 
373 /* could an instruction be scheduled if specified ssa src was scheduled? */
374 static bool
375 could_sched(struct ir3_sched_ctx *ctx,
376             struct ir3_instruction *instr, struct ir3_instruction *src)
377 {
378    foreach_ssa_src (other_src, instr) {
379       /* if dependency not scheduled, we aren't ready yet: */
380       if ((src != other_src) && !is_scheduled(other_src)) {
381          return false;
382       }
383    }
384 
385    /* Instructions not in the current block can never be scheduled.
386     */
387    if (instr->block != src->block)
388       return false;
389 
390    return !should_skip(ctx, instr);
391 }
392 
393 /* Check if instruction is ok to schedule.  Make sure it is not blocked
394  * by use of addr/predicate register, etc.
395  */
396 static bool
397 check_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
398             struct ir3_instruction *instr)
399 {
400    assert(!is_scheduled(instr));
401 
402    if (instr == ctx->split) {
403       /* Don't schedule instructions created by splitting an a0.x/a1.x/p0.x
404        * write until another "normal" instruction has been scheduled.
405        */
406       return false;
407    }
408 
409    if (should_skip(ctx, instr))
410       return false;
411 
412    /* For instructions that write address register we need to
413     * make sure there is at least one instruction that uses the
414     * addr value which is otherwise ready.
415     *
416     * NOTE if any instructions use pred register and have other
417     * src args, we would need to do the same for writes_pred()..
418     */
419    if (writes_addr0(instr)) {
420       struct ir3 *ir = instr->block->shader;
421       bool ready = false;
422       for (unsigned i = 0; (i < ir->a0_users_count) && !ready; i++) {
423          struct ir3_instruction *indirect = ir->a0_users[i];
424          if (!indirect)
425             continue;
426          if (indirect->address->def != instr->dsts[0])
427             continue;
428          ready = could_sched(ctx, indirect, instr);
429       }
430 
431       /* nothing could be scheduled, so keep looking: */
432       if (!ready)
433          return false;
434    }
435 
436    if (writes_addr1(instr)) {
437       struct ir3 *ir = instr->block->shader;
438       bool ready = false;
439       for (unsigned i = 0; (i < ir->a1_users_count) && !ready; i++) {
440          struct ir3_instruction *indirect = ir->a1_users[i];
441          if (!indirect)
442             continue;
443          if (indirect->address->def != instr->dsts[0])
444             continue;
445          ready = could_sched(ctx, indirect, instr);
446       }
447 
448       /* nothing could be scheduled, so keep looking: */
449       if (!ready)
450          return false;
451    }
452 
453    /* if this is a write to address/predicate register, and that
454     * register is currently in use, we need to defer until it is
455     * free:
456     */
457    if (writes_addr0(instr) && ctx->addr0) {
458       assert(ctx->addr0 != instr);
459       notes->addr0_conflict = true;
460       return false;
461    }
462 
463    if (writes_addr1(instr) && ctx->addr1) {
464       assert(ctx->addr1 != instr);
465       notes->addr1_conflict = true;
466       return false;
467    }
468 
469    /* if the instruction is a kill, we need to ensure *every*
470     * bary.f is scheduled.  The hw seems unhappy if the thread
471     * gets killed before the end-input (ei) flag is hit.
472     *
473     * We could do this by adding each bary.f instruction as
474     * virtual ssa src for the kill instruction.  But we have
475     * fixed length instr->srcs[].
476     *
477     * TODO we could handle this by false-deps now, probably.
478     */
479    if (is_kill_or_demote(instr)) {
480       struct ir3 *ir = instr->block->shader;
481 
482       for (unsigned i = 0; i < ir->baryfs_count; i++) {
483          struct ir3_instruction *baryf = ir->baryfs[i];
484          if (baryf->flags & IR3_INSTR_UNUSED)
485             continue;
486          if (!is_scheduled(baryf)) {
487             notes->blocked_kill = true;
488             return false;
489          }
490       }
491    }
492 
493    return true;
494 }
495 
496 /* Find the instr->ip of the closest use of an instruction, in
497  * pre-sched order.  This isn't going to be the same as post-sched
498  * order, but it is a reasonable approximation to limit scheduling
499  * instructions *too* early.  This is mostly to prevent bad behavior
500  * in cases where we have a large number of possible instructions
501  * to choose, to avoid creating too much parallelism (ie. blowing
502  * up register pressure)
503  *
504  * See
505  * dEQP-GLES31.functional.atomic_counter.layout.reverse_offset.inc_dec.8_counters_5_calls_1_thread
506  */
507 static int
508 nearest_use(struct ir3_instruction *instr)
509 {
510    unsigned nearest = ~0;
511    foreach_ssa_use (use, instr)
512       if (!is_scheduled(use))
513          nearest = MIN2(nearest, use->ip);
514 
515    /* slight hack.. this heuristic tends to push bary.f's to later
516     * in the shader, closer to their uses.  But we actually would
517     * prefer to get these scheduled earlier, to unlock varying
518     * storage for more VS jobs:
519     */
520    if (is_input(instr))
521       nearest /= 2;
522 
523    return nearest;
524 }
525 
526 static bool
527 is_only_nonscheduled_use(struct ir3_instruction *instr,
528                          struct ir3_instruction *use)
529 {
530    foreach_ssa_use (other_use, instr) {
531       if (other_use != use && !is_scheduled(other_use))
532          return false;
533    }
534 
535    return true;
536 }
537 
538 static unsigned
539 new_regs(struct ir3_instruction *instr)
540 {
541    unsigned regs = 0;
542 
543    foreach_dst (dst, instr) {
544       if (!is_dest_gpr(dst))
545          continue;
546       regs += reg_elems(dst);
547    }
548 
549    return regs;
550 }
551 
552 /* find net change to live values if instruction were scheduled: */
553 static int
554 live_effect(struct ir3_instruction *instr)
555 {
556    struct ir3_sched_node *n = instr->data;
557    int new_live =
558       (n->partially_live || !instr->uses || instr->uses->entries == 0)
559          ? 0
560          : new_regs(instr);
561    int freed_live = 0;
562 
563    /* if we schedule something that causes a vecN to be live,
564     * then count all its other components too:
565     */
566    if (n->collect)
567       new_live *= n->collect->srcs_count;
568 
569    foreach_ssa_src_n (src, n, instr) {
570       if (__is_false_dep(instr, n))
571          continue;
572 
573       if (instr->block != src->block)
574          continue;
575 
576       if (is_only_nonscheduled_use(src, instr))
577          freed_live += new_regs(src);
578    }
579 
580    return new_live - freed_live;
581 }
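/* Worked example (hypothetical): an add whose single scalar dst is unscheduled
 * and whose two srcs each have this add as their only unscheduled use gives
 * live_effect = 1 - 2 = -1, ie. a net win.  If instead the dst feeds a
 * meta:collect of 4 components and is not yet partially live, new_live is
 * counted as 4, since scheduling the first collect src effectively makes the
 * whole vec4 live from the point of view of the first RA pass.
 */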
582 
583 /* Determine if this is an instruction that we'd prefer not to schedule
584  * yet, in order to avoid an (ss)/(sy) sync.  This is limited by the
585  * ss_delay/sy_delay counters (ie. the more cycles it has been since the
586  * last SFU/tex, the less costly a sync would be), and by the number of
587  * outstanding SFU/tex instructions, to prevent a blowup in register pressure.
588  */
589 static bool
590 should_defer(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
591 {
592    if (ctx->ss_delay) {
593       if (sched_check_src_cond(instr, is_outstanding_ss, ctx))
594          return true;
595    }
596 
597    /* We mostly just want to try to schedule another texture fetch
598     * before scheduling something that would (sy) sync, so we can
599     * limit this rule to cases where there are remaining texture
600     * fetches
601     */
602    if (ctx->sy_delay && ctx->remaining_tex) {
603       if (sched_check_src_cond(instr, is_outstanding_sy, ctx))
604          return true;
605    }
606 
607    /* Avoid scheduling too many outstanding texture or sfu instructions at
608     * once by deferring further tex/SFU instructions. This both prevents
609     * stalls when the queue of texture/sfu instructions becomes too large,
610     * and prevents unacceptably large increases in register pressure from too
611     * many outstanding texture instructions.
612     */
613    if (ctx->sy_index - ctx->first_outstanding_sy_index >= 8 && is_sy_producer(instr))
614       return true;
615 
616    if (ctx->ss_index - ctx->first_outstanding_ss_index >= 8 && is_ss_producer(instr))
617       return true;
618 
619    return false;
620 }
621 
622 static struct ir3_sched_node *choose_instr_inc(struct ir3_sched_ctx *ctx,
623                                                struct ir3_sched_notes *notes,
624                                                bool defer, bool avoid_output);
625 
626 enum choose_instr_dec_rank {
627    DEC_NEUTRAL,
628    DEC_NEUTRAL_READY,
629    DEC_FREED,
630    DEC_FREED_READY,
631 };
632 
633 static const char *
634 dec_rank_name(enum choose_instr_dec_rank rank)
635 {
636    switch (rank) {
637    case DEC_NEUTRAL:
638       return "neutral";
639    case DEC_NEUTRAL_READY:
640       return "neutral+ready";
641    case DEC_FREED:
642       return "freed";
643    case DEC_FREED_READY:
644       return "freed+ready";
645    default:
646       return NULL;
647    }
648 }
649 
650 static unsigned
651 node_delay(struct ir3_sched_ctx *ctx, struct ir3_sched_node *n)
652 {
653    return MAX2(n->earliest_ip, ctx->ip) - ctx->ip;
654 }
655 
656 /**
657  * Chooses an instruction to schedule using the Goodman/Hsu (1988) CSR (Code
658  * Scheduling for Register pressure) heuristic.
659  *
660  * Only handles the case of choosing instructions that reduce register pressure
661  * or leave it unchanged.
662  */
663 static struct ir3_sched_node *
664 choose_instr_dec(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
665                  bool defer)
666 {
667    const char *mode = defer ? "-d" : "";
668    struct ir3_sched_node *chosen = NULL;
669    enum choose_instr_dec_rank chosen_rank = DEC_NEUTRAL;
670 
671    foreach_sched_node (n, &ctx->dag->heads) {
672       if (defer && should_defer(ctx, n->instr))
673          continue;
674 
675       unsigned d = node_delay(ctx, n);
676 
677       int live = live_effect(n->instr);
678       if (live > 0)
679          continue;
680 
681       if (!check_instr(ctx, notes, n->instr))
682          continue;
683 
684       enum choose_instr_dec_rank rank;
685       if (live < 0) {
686          /* Prioritize instrs which free up regs and can be scheduled with no
687           * delay.
688           */
689          if (d == 0)
690             rank = DEC_FREED_READY;
691          else
692             rank = DEC_FREED;
693       } else {
694          /* Contra the paper, pick a leader with no effect on used regs.  This
695           * may open up new opportunities, as otherwise a single-operand instr
696          * consuming a value will tend to block the freeing of that value.
697           * This had a massive effect on reducing spilling on V3D.
698           *
699           * XXX: Should this prioritize ready?
700           */
701          if (d == 0)
702             rank = DEC_NEUTRAL_READY;
703          else
704             rank = DEC_NEUTRAL;
705       }
706 
707       /* Prefer higher-ranked instructions, or in the case of a rank tie, the
708        * highest latency-to-end-of-program instruction.
709        */
710       if (!chosen || rank > chosen_rank ||
711           (rank == chosen_rank && chosen->max_delay < n->max_delay)) {
712          chosen = n;
713          chosen_rank = rank;
714       }
715    }
716 
717    if (chosen) {
718       di(chosen->instr, "dec%s: chose (%s)", mode, dec_rank_name(chosen_rank));
719       return chosen;
720    }
721 
722    return choose_instr_inc(ctx, notes, defer, true);
723 }
724 
725 enum choose_instr_inc_rank {
726    INC_DISTANCE,
727    INC_DISTANCE_READY,
728 };
729 
730 static const char *
731 inc_rank_name(enum choose_instr_inc_rank rank)
732 {
733    switch (rank) {
734    case INC_DISTANCE:
735       return "distance";
736    case INC_DISTANCE_READY:
737       return "distance+ready";
738    default:
739       return NULL;
740    }
741 }
742 
743 /**
744  * When we can't choose an instruction that reduces register pressure or
745  * is neutral, we end up here to try and pick the least bad option.
746  */
747 static struct ir3_sched_node *
748 choose_instr_inc(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
749                  bool defer, bool avoid_output)
750 {
751    const char *mode = defer ? "-d" : "";
752    struct ir3_sched_node *chosen = NULL;
753    enum choose_instr_inc_rank chosen_rank = INC_DISTANCE;
754 
755    /*
756     * From here on out, we are picking something that increases
757     * register pressure.  So try to pick something which will
758     * be consumed soon:
759     */
760    unsigned chosen_distance = 0;
761 
762    /* Prefer ready instructions; among those, pick the nearest consumer. */
763    foreach_sched_node (n, &ctx->dag->heads) {
764       if (avoid_output && n->output)
765          continue;
766 
767       if (defer && should_defer(ctx, n->instr))
768          continue;
769 
770       if (!check_instr(ctx, notes, n->instr))
771          continue;
772 
773       unsigned d = node_delay(ctx, n);
774 
775       enum choose_instr_inc_rank rank;
776       if (d == 0)
777          rank = INC_DISTANCE_READY;
778       else
779          rank = INC_DISTANCE;
780 
781       unsigned distance = nearest_use(n->instr);
782 
783       if (!chosen || rank > chosen_rank ||
784           (rank == chosen_rank && distance < chosen_distance)) {
785          chosen = n;
786          chosen_distance = distance;
787          chosen_rank = rank;
788       }
789    }
790 
791    if (chosen) {
792       di(chosen->instr, "inc%s: chose (%s)", mode, inc_rank_name(chosen_rank));
793       return chosen;
794    }
795 
796    return NULL;
797 }
798 
799 /* Handles selection of instructions that we want to prioritize
800  * even if csp/csr would not pick them.
801  */
802 static struct ir3_sched_node *
803 choose_instr_prio(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes)
804 {
805    struct ir3_sched_node *chosen = NULL;
806 
807    foreach_sched_node (n, &ctx->dag->heads) {
808       /*
809        * - phi nodes and inputs must be scheduled first
810        * - split should be scheduled first, so that the vector value is
811        *   killed as soon as possible. RA cannot split up the vector and
812        *   reuse components that have been killed until the whole vector is killed.
813        * - collect, on the other hand, should be treated as a "normal"
814        *   instruction, and may add to register pressure if its sources are
815        *   part of another vector or immediates.
816        */
817       if (!is_meta(n->instr) || n->instr->opc == OPC_META_COLLECT)
818          continue;
819 
820       if (!chosen || (chosen->max_delay < n->max_delay))
821          chosen = n;
822    }
823 
824    if (chosen) {
825       di(chosen->instr, "prio: chose (meta)");
826       return chosen;
827    }
828 
829    return NULL;
830 }
831 
832 static void
833 dump_state(struct ir3_sched_ctx *ctx)
834 {
835    if (!SCHED_DEBUG)
836       return;
837 
838    foreach_sched_node (n, &ctx->dag->heads) {
839       di(n->instr, "maxdel=%3d le=%d del=%u ", n->max_delay,
840          live_effect(n->instr), node_delay(ctx, n));
841 
842       util_dynarray_foreach (&n->dag.edges, struct dag_edge, edge) {
843          struct ir3_sched_node *child = (struct ir3_sched_node *)edge->child;
844 
845          di(child->instr, " -> (%d parents) ", child->dag.parent_count);
846       }
847    }
848 }
849 
850 /* find instruction to schedule: */
851 static struct ir3_instruction *
852 choose_instr(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes)
853 {
854    struct ir3_sched_node *chosen;
855 
856    dump_state(ctx);
857 
858    chosen = choose_instr_prio(ctx, notes);
859    if (chosen)
860       return chosen->instr;
861 
862    chosen = choose_instr_dec(ctx, notes, true);
863    if (chosen)
864       return chosen->instr;
865 
866    chosen = choose_instr_dec(ctx, notes, false);
867    if (chosen)
868       return chosen->instr;
869 
870    chosen = choose_instr_inc(ctx, notes, false, false);
871    if (chosen)
872       return chosen->instr;
873 
874    return NULL;
875 }
876 
877 static struct ir3_instruction *
878 split_instr(struct ir3_sched_ctx *ctx, struct ir3_instruction *orig_instr)
879 {
880    struct ir3_instruction *new_instr = ir3_instr_clone(orig_instr);
881    di(new_instr, "split instruction");
882    sched_node_init(ctx, new_instr);
883    return new_instr;
884 }
885 
886 /* "spill" the address registers by remapping any unscheduled
887  * instructions which depend on the current address register
888  * to a clone of the instruction which wrote the address reg.
889  */
890 static struct ir3_instruction *
891 split_addr(struct ir3_sched_ctx *ctx, struct ir3_instruction **addr,
892            struct ir3_instruction **users, unsigned users_count)
893 {
894    struct ir3_instruction *new_addr = NULL;
895    unsigned i;
896 
897    assert(*addr);
898 
899    for (i = 0; i < users_count; i++) {
900       struct ir3_instruction *indirect = users[i];
901 
902       if (!indirect)
903          continue;
904 
905       /* skip instructions already scheduled: */
906       if (is_scheduled(indirect))
907          continue;
908 
909       /* remap remaining instructions using current addr
910        * to new addr:
911        */
912       if (indirect->address->def == (*addr)->dsts[0]) {
913          if (!new_addr) {
914             new_addr = split_instr(ctx, *addr);
915             /* original addr is scheduled, but new one isn't: */
916             new_addr->flags &= ~IR3_INSTR_MARK;
917          }
918          indirect->address->def = new_addr->dsts[0];
919          /* don't need to remove old dag edge since old addr is
920           * already scheduled:
921           */
922          sched_node_add_dep(ctx, indirect, new_addr, 0);
923          di(indirect, "new address");
924       }
925    }
926 
927    /* all remaining indirects remapped to new addr: */
928    *addr = NULL;
929 
930    return new_addr;
931 }
932 
933 static void
934 sched_node_init(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
935 {
936    struct ir3_sched_node *n = rzalloc(ctx->dag, struct ir3_sched_node);
937 
938    dag_init_node(ctx->dag, &n->dag);
939 
940    n->instr = instr;
941    instr->data = n;
942 }
943 
944 static void
945 sched_node_add_dep(struct ir3_sched_ctx *ctx,
946                    struct ir3_instruction *instr, struct ir3_instruction *src,
947                    int i)
948 {
949    /* don't consider dependencies in other blocks: */
950    if (src->block != instr->block)
951       return;
952 
953    /* we could have false-dep's that end up unused: */
954    if (src->flags & IR3_INSTR_UNUSED) {
955       assert(__is_false_dep(instr, i));
956       return;
957    }
958 
959    struct ir3_sched_node *n = instr->data;
960    struct ir3_sched_node *sn = src->data;
961 
962    /* If src is consumed by a collect, track that to realize that once
963     * any of the collect srcs are live, we should hurry up and schedule
964     * the rest.
965     */
966    if (instr->opc == OPC_META_COLLECT)
967       sn->collect = instr;
968 
969    unsigned d_soft = ir3_delayslots(ctx->compiler, src, instr, i, true);
970    unsigned d = ir3_delayslots(ctx->compiler, src, instr, i, false);
971 
972    /* delays from (ss) and (sy) are considered separately and more accurately in
973     * the scheduling heuristic, so ignore them when calculating the ip of
974     * instructions, but do consider them when prioritizing which instructions to
975     * schedule.
976     */
977    dag_add_edge_max_data(&sn->dag, &n->dag, (uintptr_t)d);
978 
979    n->delay = MAX2(n->delay, d_soft);
980 }
981 
982 static void
983 mark_kill_path(struct ir3_instruction *instr)
984 {
985    struct ir3_sched_node *n = instr->data;
986 
987    if (n->kill_path) {
988       return;
989    }
990 
991    n->kill_path = true;
992 
993    foreach_ssa_src (src, instr) {
994       if (src->block != instr->block)
995          continue;
996       mark_kill_path(src);
997    }
998 }
999 
1000 /* Is it an output? */
1001 static bool
1002 is_output_collect(struct ir3_instruction *instr)
1003 {
1004    if (instr->opc != OPC_META_COLLECT)
1005       return false;
1006 
1007    foreach_ssa_use (use, instr) {
1008       if (use->opc != OPC_END && use->opc != OPC_CHMASK)
1009          return false;
1010    }
1011 
1012    return true;
1013 }
1014 
1015 /* Is its only use as an output? */
1016 static bool
1017 is_output_only(struct ir3_instruction *instr)
1018 {
1019    foreach_ssa_use (use, instr)
1020       if (!is_output_collect(use))
1021          return false;
1022 
1023    return true;
1024 }
1025 
1026 static void
1027 sched_node_add_deps(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
1028 {
1029    /* There's nothing to do for phi nodes, since they always go first. And
1030     * phi nodes can reference sources later in the same block, so handling
1031     * sources is not only unnecessary but could cause problems.
1032     */
1033    if (instr->opc == OPC_META_PHI)
1034       return;
1035 
1036    /* Since foreach_ssa_src() already handles false-dep's we can construct
1037     * the DAG easily in a single pass.
1038     */
1039    foreach_ssa_src_n (src, i, instr) {
1040       sched_node_add_dep(ctx, instr, src, i);
1041    }
1042 
1043    /* NOTE that all inputs must be scheduled before a kill, so
1044     * mark these to be prioritized as well:
1045     */
1046    if (is_kill_or_demote(instr) || is_input(instr)) {
1047       mark_kill_path(instr);
1048    }
1049 
1050    if (is_output_only(instr)) {
1051       struct ir3_sched_node *n = instr->data;
1052       n->output = true;
1053    }
1054 }
1055 
1056 static void
1057 sched_dag_max_delay_cb(struct dag_node *node, void *state)
1058 {
1059    struct ir3_sched_node *n = (struct ir3_sched_node *)node;
1060    uint32_t max_delay = 0;
1061 
1062    util_dynarray_foreach (&n->dag.edges, struct dag_edge, edge) {
1063       struct ir3_sched_node *child = (struct ir3_sched_node *)edge->child;
1064       max_delay = MAX2(child->max_delay, max_delay);
1065    }
1066 
1067    n->max_delay = MAX2(n->max_delay, max_delay + n->delay);
1068 }
1069 
1070 #ifndef NDEBUG
1071 static void
1072 sched_dag_validate_cb(const struct dag_node *node, void *data)
1073 {
1074    struct ir3_sched_node *n = (struct ir3_sched_node *)node;
1075 
1076    ir3_print_instr(n->instr);
1077 }
1078 #endif
1079 
1080 static void
1081 sched_dag_init(struct ir3_sched_ctx *ctx)
1082 {
1083    ctx->dag = dag_create(ctx);
1084 
1085    foreach_instr (instr, &ctx->unscheduled_list)
1086       sched_node_init(ctx, instr);
1087 
1088 #ifndef NDEBUG
1089    dag_validate(ctx->dag, sched_dag_validate_cb, NULL);
1090 #endif
1091 
1092    foreach_instr (instr, &ctx->unscheduled_list)
1093       sched_node_add_deps(ctx, instr);
1094 
1095    dag_traverse_bottom_up(ctx->dag, sched_dag_max_delay_cb, NULL);
1096 }
1097 
1098 static void
1099 sched_dag_destroy(struct ir3_sched_ctx *ctx)
1100 {
1101    ralloc_free(ctx->dag);
1102    ctx->dag = NULL;
1103 }
1104 
1105 static void
1106 sched_block(struct ir3_sched_ctx *ctx, struct ir3_block *block)
1107 {
1108    ctx->block = block;
1109 
1110    /* addr/pred writes are per-block: */
1111    ctx->addr0 = NULL;
1112    ctx->addr1 = NULL;
1113    ctx->sy_delay = 0;
1114    ctx->ss_delay = 0;
1115    ctx->sy_index = ctx->first_outstanding_sy_index = 0;
1116    ctx->ss_index = ctx->first_outstanding_ss_index = 0;
1117 
1118    /* The terminator has to stay at the end. Instead of trying to set up
1119     * dependencies to achieve this, it's easier to just remove it now and add it
1120     * back after scheduling.
1121     */
1122    struct ir3_instruction *terminator = ir3_block_take_terminator(block);
1123 
1124    /* move all instructions to the unscheduled list, and
1125     * empty the block's instruction list (to which we will
1126     * be inserting).
1127     */
1128    list_replace(&block->instr_list, &ctx->unscheduled_list);
1129    list_inithead(&block->instr_list);
1130 
1131    sched_dag_init(ctx);
1132 
1133    ctx->remaining_kills = 0;
1134    ctx->remaining_tex = 0;
1135    foreach_instr_safe (instr, &ctx->unscheduled_list) {
1136       if (is_kill_or_demote(instr))
1137          ctx->remaining_kills++;
1138       if (is_sy_producer(instr))
1139          ctx->remaining_tex++;
1140    }
1141 
1142    /* First schedule all meta:input and meta:phi instructions, followed by
1143     * tex-prefetch.  We want all of the instructions that load values into
1144     * registers before the shader starts to go before any other instructions.
1145     * But in particular we want inputs to come before prefetches.  This is
1146     * because an FS's bary_ij input may not actually be live in the shader,
1147     * but it should not be scheduled on top of any other input (but can be
1148     * overwritten by a tex prefetch)
1149     *
1150     * Note: Because the first block cannot have predecessors, meta:input and
1151     * meta:phi cannot exist in the same block.
1152     */
1153    foreach_instr_safe (instr, &ctx->unscheduled_list)
1154       if (instr->opc == OPC_META_INPUT || instr->opc == OPC_META_PHI)
1155          schedule(ctx, instr);
1156 
1157    foreach_instr_safe (instr, &ctx->unscheduled_list)
1158       if (instr->opc == OPC_META_TEX_PREFETCH)
1159          schedule(ctx, instr);
1160 
1161    foreach_instr_safe (instr, &ctx->unscheduled_list)
1162       if (instr->opc == OPC_PUSH_CONSTS_LOAD_MACRO)
1163          schedule(ctx, instr);
1164 
1165    while (!list_is_empty(&ctx->unscheduled_list)) {
1166       struct ir3_sched_notes notes = {0};
1167       struct ir3_instruction *instr;
1168 
1169       instr = choose_instr(ctx, &notes);
1170       if (instr) {
1171          unsigned delay = node_delay(ctx, instr->data);
1172          d("delay=%u", delay);
1173 
1174          assert(delay <= 6);
1175 
1176          schedule(ctx, instr);
1177 
1178          /* Since we've scheduled a "real" instruction, we can now
1179           * schedule any split instruction created by the scheduler again.
1180           */
1181          ctx->split = NULL;
1182       } else {
1183          struct ir3_instruction *new_instr = NULL;
1184          struct ir3 *ir = block->shader;
1185 
1186          /* nothing available to schedule.. if we are blocked on
1187           * address/predicate register conflict, then break the
1188           * deadlock by cloning the instruction that wrote that
1189           * reg:
1190           */
1191          if (notes.addr0_conflict) {
1192             new_instr =
1193                split_addr(ctx, &ctx->addr0, ir->a0_users, ir->a0_users_count);
1194          } else if (notes.addr1_conflict) {
1195             new_instr =
1196                split_addr(ctx, &ctx->addr1, ir->a1_users, ir->a1_users_count);
1197          } else {
1198             d("unscheduled_list:");
1199             foreach_instr (instr, &ctx->unscheduled_list)
1200                di(instr, "unscheduled: ");
1201             assert(0);
1202             ctx->error = true;
1203             return;
1204          }
1205 
1206          if (new_instr) {
1207             list_delinit(&new_instr->node);
1208             list_addtail(&new_instr->node, &ctx->unscheduled_list);
1209          }
1210 
1211          /* If we produced a new instruction, do not schedule it next, to
1212           * guarantee forward progress.
1213           */
1214          ctx->split = new_instr;
1215       }
1216    }
1217 
1218    sched_dag_destroy(ctx);
1219 
1220    if (terminator)
1221       list_addtail(&terminator->node, &block->instr_list);
1222 }
1223 
1224 int
1225 ir3_sched(struct ir3 *ir)
1226 {
1227    struct ir3_sched_ctx *ctx = rzalloc(NULL, struct ir3_sched_ctx);
1228 
1229    ctx->compiler = ir->compiler;
1230 
1231    foreach_block (block, &ir->block_list) {
1232       foreach_instr (instr, &block->instr_list) {
1233          instr->data = NULL;
1234       }
1235    }
1236 
1237    ir3_count_instructions_sched(ir);
1238    ir3_clear_mark(ir);
1239    ir3_find_ssa_uses(ir, ctx, false);
1240 
1241    foreach_block (block, &ir->block_list) {
1242       sched_block(ctx, block);
1243    }
1244 
1245    int ret = ctx->error ? -1 : 0;
1246 
1247    ralloc_free(ctx);
1248 
1249    return ret;
1250 }
1251 
1252 static unsigned
1253 get_array_id(struct ir3_instruction *instr)
1254 {
1255    /* The expectation is that there is only a single array
1256     * src or dst; ir3_cp should enforce this.
1257     */
1258 
1259    foreach_dst (dst, instr)
1260       if (dst->flags & IR3_REG_ARRAY)
1261          return dst->array.id;
1262    foreach_src (src, instr)
1263       if (src->flags & IR3_REG_ARRAY)
1264          return src->array.id;
1265 
1266    unreachable("this was unexpected");
1267 }
1268 
1269 /* does instruction 'prior' need to be scheduled before 'instr'? */
1270 static bool
1271 depends_on(struct ir3_instruction *instr, struct ir3_instruction *prior)
1272 {
1273    /* TODO for dependencies that are related to a specific object, ie
1274     * a specific SSBO/image/array, we could relax this constraint to
1275     * make accesses to unrelated objects not depend on each other (at
1276     * least as long as not declared coherent)
1277     */
1278    if (((instr->barrier_class & IR3_BARRIER_EVERYTHING) &&
1279         prior->barrier_class) ||
1280        ((prior->barrier_class & IR3_BARRIER_EVERYTHING) &&
1281         instr->barrier_class))
1282       return true;
1283 
1284    if (instr->barrier_class & prior->barrier_conflict) {
1285       if (!(instr->barrier_class &
1286             ~(IR3_BARRIER_ARRAY_R | IR3_BARRIER_ARRAY_W))) {
1287          /* if only array barrier, then we can further limit false-deps
1288           * by considering the array-id, ie reads/writes to different
1289           * arrays do not depend on each other (no aliasing)
1290           */
1291          if (get_array_id(instr) != get_array_id(prior)) {
1292             return false;
1293          }
1294       }
1295 
1296       return true;
1297    }
1298 
1299    return false;
1300 }
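/* Example (schematic): an array write typically carries barrier_class
 * IR3_BARRIER_ARRAY_W and a barrier_conflict containing IR3_BARRIER_ARRAY_R |
 * IR3_BARRIER_ARRAY_W, so a later access to the *same* array id picks up a
 * false dep on it, while an access to a different array id is filtered out by
 * the array-id check above.  Instructions marked IR3_BARRIER_EVERYTHING order
 * against all other barrier-classed instructions regardless.
 */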
1301 
1302 static void
1303 add_barrier_deps(struct ir3_block *block, struct ir3_instruction *instr)
1304 {
1305    struct list_head *prev = instr->node.prev;
1306    struct list_head *next = instr->node.next;
1307 
1308    /* add dependencies on previous instructions that must be scheduled
1309     * prior to the current instruction
1310     */
1311    while (prev != &block->instr_list) {
1312       struct ir3_instruction *pi =
1313          list_entry(prev, struct ir3_instruction, node);
1314 
1315       prev = prev->prev;
1316 
1317       if (is_meta(pi))
1318          continue;
1319 
1320       if (instr->barrier_class == pi->barrier_class) {
1321          ir3_instr_add_dep(instr, pi);
1322          break;
1323       }
1324 
1325       if (depends_on(instr, pi))
1326          ir3_instr_add_dep(instr, pi);
1327    }
1328 
1329    /* add dependencies on this instruction to following instructions
1330     * that must be scheduled after the current instruction:
1331     */
1332    while (next != &block->instr_list) {
1333       struct ir3_instruction *ni =
1334          list_entry(next, struct ir3_instruction, node);
1335 
1336       next = next->next;
1337 
1338       if (is_meta(ni))
1339          continue;
1340 
1341       if (instr->barrier_class == ni->barrier_class) {
1342          ir3_instr_add_dep(ni, instr);
1343          break;
1344       }
1345 
1346       if (depends_on(ni, instr))
1347          ir3_instr_add_dep(ni, instr);
1348    }
1349 }
1350 
1351 /* before scheduling a block, we need to add any necessary false-dependencies
1352  * to ensure that:
1353  *
1354  *  (1) barriers are scheduled in the right order wrt instructions related
1355  *      to the barrier
1356  *
1357  *  (2) reads that come before a write actually get scheduled before the
1358  *      write
1359  */
1360 bool
1361 ir3_sched_add_deps(struct ir3 *ir)
1362 {
1363    bool progress = false;
1364 
1365    foreach_block (block, &ir->block_list) {
1366       foreach_instr (instr, &block->instr_list) {
1367          if (instr->barrier_class) {
1368             add_barrier_deps(block, instr);
1369             progress = true;
1370          }
1371       }
1372    }
1373 
1374    return progress;
1375 }
1376