/*
 * Copyright © 2019 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "nir_schedule.h"
#include "util/dag.h"
#include "util/u_dynarray.h"

/** @file
 *
 * Implements basic-block-level prepass instruction scheduling in NIR to
 * manage register pressure.
 *
 * This is based on the Goodman/Hsu paper (1988, cached copy at
 * https://people.freedesktop.org/~anholt/scheduling-goodman-hsu.pdf).  We
 * make up the DDG for NIR (which can be mostly done using the NIR def/use
 * chains for SSA instructions, plus some edges for ordering register writes
 * vs reads, and some more for ordering intrinsics).  Then we pick heads off
 * of the DDG using their heuristic to emit the NIR instructions back into the
 * block in their new order.
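 *
 * As a rough illustration (the SSA names below are made up, not taken from
 * any real shader), for a block like
 *
 *    ssa_1 = tex ...
 *    ssa_2 = fmul ssa_1.x, ssa_0
 *    ssa_3 = fadd ssa_2, ssa_1.y
 *
 * the def/use chains give both ALU ops a dependency on the tex and the fadd
 * a dependency on the fmul, so the tex has to be emitted before either ALU
 * op and the fmul before the fadd, while the scheduler is free to reorder
 * anything else around them.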
 *
 * The hard case for prepass scheduling on GPUs seems to always be consuming
 * texture/ubo results.  The register pressure heuristic doesn't want to pick
 * an instr that starts consuming texture results because it usually won't be
 * the only usage, so that instruction will increase pressure.
 *
 * If you try to force consumption of tex results always, then in a case where
 * a single sample is used for many outputs, you'll end up picking every other
 * user and expanding register pressure.  The partially_evaluated_path flag
 * helps tremendously, in that if you happen for whatever reason to pick a
 * texture sample's output, then you'll try to finish off that sample.  Future
 * work may include doing some local search before locking in a choice, to try
 * to more reliably find the case where just a few choices going against the
 * heuristic can manage to free the whole vector.
 */

static bool debug;

/**
 * Represents a node in the DDG for a NIR instruction.
 */
typedef struct {
   struct dag_node dag; /* must be first for our u_dynarray_foreach */
   nir_instr *instr;
   bool partially_evaluated_path;

   /* Approximate estimate of the delay between starting this instruction and
    * its results being available.
    *
    * Accuracy is not too important, given that we're prepass scheduling here
    * and just trying to reduce excess dependencies introduced by a register
    * allocator by stretching out the live intervals of expensive
    * instructions.
    */
   uint32_t delay;

   /* Cost of the maximum-delay path from this node to the leaves. */
   uint32_t max_delay;

   /* scoreboard->time value when this instruction can be scheduled without
    * any stalls expected.
    */
   uint32_t ready_time;
} nir_schedule_node;

typedef struct {
   struct dag *dag;

   nir_shader *shader;

   /* Mapping from nir_def * to a struct set of
    * instructions remaining to be scheduled using the register.
    */
   struct hash_table *remaining_uses;

   /* Map from nir_instr to nir_schedule_node * */
   struct hash_table *instr_map;

   /* Set of nir_def * that have had any instruction scheduled on them. */
   struct set *live_values;

   /* An abstract approximation of the number of nir_schedule_node->delay
    * units since the start of the shader.
    */
   uint32_t time;

   /* Number of channels currently used by the NIR instructions that have been
    * scheduled.
    */
   int pressure;

   /* Options specified by the backend */
   const nir_schedule_options *options;
} nir_schedule_scoreboard;

/* When walking the instructions in reverse, we use this flag to swap
 * before/after in add_dep().
 */
enum direction { F,
                 R };

struct nir_schedule_class_dep {
   int klass;
   nir_schedule_node *node;
   struct nir_schedule_class_dep *next;
};

typedef struct {
   nir_schedule_scoreboard *scoreboard;

   /* Map from registers to nir_schedule_node * */
   struct hash_table *reg_map;

   /* Scheduler nodes for the last instruction involved in some class of
    * dependency.
    */
   nir_schedule_node *load_input;
   nir_schedule_node *store_shared;
   nir_schedule_node *unknown_intrinsic;
   nir_schedule_node *discard;
   nir_schedule_node *jump;

   struct nir_schedule_class_dep *class_deps;

   enum direction dir;
} nir_deps_state;

static void *
_mesa_hash_table_search_data(struct hash_table *ht, void *key)
{
   struct hash_entry *entry = _mesa_hash_table_search(ht, key);
   if (!entry)
      return NULL;
   return entry->data;
}

static nir_schedule_node *
nir_schedule_get_node(struct hash_table *instr_map, nir_instr *instr)
{
   return _mesa_hash_table_search_data(instr_map, instr);
}

static struct set *
nir_schedule_scoreboard_get_reg(nir_schedule_scoreboard *scoreboard,
                                nir_def *reg)
{
   return _mesa_hash_table_search_data(scoreboard->remaining_uses, reg);
}

static struct set *
nir_schedule_scoreboard_get_src(nir_schedule_scoreboard *scoreboard, nir_src *src)
{
   return _mesa_hash_table_search_data(scoreboard->remaining_uses, src->ssa);
}

static int
nir_schedule_reg_pressure(nir_def *reg)
{
   nir_intrinsic_instr *decl = nir_reg_get_decl(reg);
   return nir_intrinsic_num_components(decl);
}

static int
nir_schedule_def_pressure(nir_def *def)
{
   return def->num_components;
}

static int
nir_schedule_src_pressure(nir_src *src)
{
   return nir_schedule_def_pressure(src->ssa);
}

/**
 * Adds a dependency such that @after must appear in the final program after
 * @before.
 *
 * We add @before as a child of @after, so that DAG heads are the outputs of
 * the program and we make our scheduling decisions bottom to top.
 */
static void
add_dep(nir_deps_state *state,
        nir_schedule_node *before,
        nir_schedule_node *after)
{
   if (!before || !after)
      return;

   assert(before != after);

   if (state->dir == F)
      dag_add_edge(&before->dag, &after->dag, 0);
   else
      dag_add_edge(&after->dag, &before->dag, 0);
}

static void
add_read_dep(nir_deps_state *state,
             nir_schedule_node *before,
             nir_schedule_node *after)
{
   add_dep(state, before, after);
}

static void
add_write_dep(nir_deps_state *state,
              nir_schedule_node **before,
              nir_schedule_node *after)
{
   add_dep(state, *before, after);
   *before = after;
}

static void
nir_schedule_load_reg_deps(nir_intrinsic_instr *load,
                           nir_deps_state *state)
{
   nir_def *reg = load->src[0].ssa;
   (void)nir_reg_get_decl(reg);

   struct hash_entry *entry = _mesa_hash_table_search(state->reg_map, reg);
   if (!entry)
      return;
   nir_schedule_node *dst_n = entry->data;

   nir_schedule_node *src_n =
      nir_schedule_get_node(state->scoreboard->instr_map, &load->instr);

   add_dep(state, dst_n, src_n);
}

static void
nir_schedule_store_reg_deps(nir_intrinsic_instr *store,
                            nir_deps_state *state)
{
   nir_def *reg = store->src[1].ssa;
   (void)nir_reg_get_decl(reg);

   nir_schedule_node *dest_n =
      nir_schedule_get_node(state->scoreboard->instr_map, &store->instr);

   struct hash_entry *entry = _mesa_hash_table_search(state->reg_map, reg);
   if (!entry) {
      _mesa_hash_table_insert(state->reg_map, reg, dest_n);
      return;
   }
   nir_schedule_node **before = (nir_schedule_node **)&entry->data;

   add_write_dep(state, before, dest_n);
}

static bool
nir_schedule_ssa_deps(nir_def *def, void *in_state)
{
   nir_deps_state *state = in_state;
   struct hash_table *instr_map = state->scoreboard->instr_map;
   nir_schedule_node *def_n = nir_schedule_get_node(instr_map, def->parent_instr);

   nir_foreach_use(src, def) {
      nir_schedule_node *use_n = nir_schedule_get_node(instr_map,
                                                       nir_src_parent_instr(src));

      add_read_dep(state, def_n, use_n);
   }

   return true;
}

static struct nir_schedule_class_dep *
nir_schedule_get_class_dep(nir_deps_state *state,
                           int klass)
{
   for (struct nir_schedule_class_dep *class_dep = state->class_deps;
        class_dep != NULL;
        class_dep = class_dep->next) {
      if (class_dep->klass == klass)
         return class_dep;
   }

   struct nir_schedule_class_dep *class_dep =
      ralloc(state->reg_map, struct nir_schedule_class_dep);

   class_dep->klass = klass;
   class_dep->node = NULL;
   class_dep->next = state->class_deps;

   state->class_deps = class_dep;

   return class_dep;
}

static void
nir_schedule_intrinsic_deps(nir_deps_state *state,
                            nir_intrinsic_instr *instr)
{
   nir_schedule_node *n = nir_schedule_get_node(state->scoreboard->instr_map,
                                                &instr->instr);
   const nir_schedule_options *options = state->scoreboard->options;
   nir_schedule_dependency dep;

   if (options->intrinsic_cb &&
       options->intrinsic_cb(instr, &dep, options->intrinsic_cb_data)) {
      struct nir_schedule_class_dep *class_dep =
         nir_schedule_get_class_dep(state, dep.klass);

      switch (dep.type) {
      case NIR_SCHEDULE_READ_DEPENDENCY:
         add_read_dep(state, class_dep->node, n);
         break;
      case NIR_SCHEDULE_WRITE_DEPENDENCY:
         add_write_dep(state, &class_dep->node, n);
         break;
      }
   }

   switch (instr->intrinsic) {
   case nir_intrinsic_decl_reg:
      break; /* Nothing to do */

   case nir_intrinsic_load_reg:
      nir_schedule_load_reg_deps(instr, state);
      break;

   case nir_intrinsic_store_reg:
      nir_schedule_store_reg_deps(instr, state);
      break;

   case nir_intrinsic_load_uniform:
   case nir_intrinsic_load_ubo:
   case nir_intrinsic_load_front_face:
      break;

   case nir_intrinsic_demote:
   case nir_intrinsic_demote_if:
   case nir_intrinsic_terminate:
   case nir_intrinsic_terminate_if:
      /* We are adding two dependencies:
       *
       * * An individual one that we could use to add a read_dep while
       *   handling nir_instr_type_tex
       *
       * * One on the unknown intrinsic set, as we want discards to be
       *   serialized in the same order relative to intervening stores or
       *   atomic accesses to SSBOs and images
       */
      add_write_dep(state, &state->discard, n);
      add_write_dep(state, &state->unknown_intrinsic, n);
      break;

   case nir_intrinsic_store_output:
      /* For some hardware and stages, output stores affect the same shared
       * memory as input loads.
       */
      if ((state->scoreboard->options->stages_with_shared_io_memory &
           (1 << state->scoreboard->shader->info.stage)))
         add_write_dep(state, &state->load_input, n);

      /* Make sure that preceding discards stay before the store_output */
      add_read_dep(state, state->discard, n);

      break;

   case nir_intrinsic_load_input:
   case nir_intrinsic_load_per_primitive_input:
   case nir_intrinsic_load_per_vertex_input:
      add_read_dep(state, state->load_input, n);
      break;

   case nir_intrinsic_load_shared:
   case nir_intrinsic_load_shared2_amd:
      /* Don't move a load_shared beyond a following store_shared, as the
       * store could change the loaded value.
       */
      add_read_dep(state, state->store_shared, n);
      break;

   case nir_intrinsic_shared_atomic:
   case nir_intrinsic_shared_atomic_swap:
   case nir_intrinsic_shared_append_amd:
   case nir_intrinsic_shared_consume_amd:
   case nir_intrinsic_store_shared:
   case nir_intrinsic_store_shared2_amd:
      add_write_dep(state, &state->store_shared, n);
      break;

   case nir_intrinsic_barrier: {
      const nir_variable_mode modes = nir_intrinsic_memory_modes(instr);

      if (modes & nir_var_mem_shared)
         add_write_dep(state, &state->store_shared, n);

      /* Serialize against other categories. */
      add_write_dep(state, &state->unknown_intrinsic, n);

      break;
   }

   case nir_intrinsic_ddx:
   case nir_intrinsic_ddx_fine:
   case nir_intrinsic_ddx_coarse:
   case nir_intrinsic_ddy:
   case nir_intrinsic_ddy_fine:
   case nir_intrinsic_ddy_coarse:
      /* Match the old behaviour. TODO: Is this correct with discards? */
      break;

   default:
      /* Attempt to handle other intrinsics that we haven't individually
       * categorized by serializing them in the same order relative to each
       * other.
       */
      add_write_dep(state, &state->unknown_intrinsic, n);
      break;
   }
}

/**
 * Common code for dependencies that need to be tracked both forward and
 * backward.
 *
 * This is for things like "all reads of r4 have to happen between the r4
 * writes that surround them".
 */
static void
nir_schedule_calculate_deps(nir_deps_state *state, nir_schedule_node *n)
{
   nir_instr *instr = n->instr;

   /* For NIR SSA defs, we only need to do a single pass of making the uses
    * depend on the def.
    */
   if (state->dir == F)
      nir_foreach_def(instr, nir_schedule_ssa_deps, state);

   /* Make sure any other instructions keep their positions relative to
    * jumps.
    */
   if (instr->type != nir_instr_type_jump)
      add_read_dep(state, state->jump, n);

   switch (instr->type) {
   case nir_instr_type_undef:
   case nir_instr_type_load_const:
   case nir_instr_type_alu:
   case nir_instr_type_deref:
   case nir_instr_type_debug_info:
      break;

   case nir_instr_type_tex:
      /* Don't move texture ops before a discard, as that could increase
       * memory bandwidth for reading the discarded samples.
       */
      add_read_dep(state, state->discard, n);
      break;

   case nir_instr_type_jump:
      add_write_dep(state, &state->jump, n);
      break;

   case nir_instr_type_call:
      unreachable("Calls should have been lowered");
      break;

   case nir_instr_type_parallel_copy:
      unreachable("Parallel copies should have been lowered");
      break;

   case nir_instr_type_phi:
      unreachable("nir_schedule() should be called after lowering from SSA");
      break;

   case nir_instr_type_intrinsic:
      nir_schedule_intrinsic_deps(state, nir_instr_as_intrinsic(instr));
      break;
   }
}

static void
calculate_forward_deps(nir_schedule_scoreboard *scoreboard, nir_block *block)
{
   nir_deps_state state = {
      .scoreboard = scoreboard,
      .dir = F,
      .reg_map = _mesa_pointer_hash_table_create(NULL),
   };

   nir_foreach_instr(instr, block) {
      nir_schedule_node *node = nir_schedule_get_node(scoreboard->instr_map,
                                                      instr);
      nir_schedule_calculate_deps(&state, node);
   }

   ralloc_free(state.reg_map);
}

static void
calculate_reverse_deps(nir_schedule_scoreboard *scoreboard, nir_block *block)
{
   nir_deps_state state = {
      .scoreboard = scoreboard,
      .dir = R,
      .reg_map = _mesa_pointer_hash_table_create(NULL),
   };

   nir_foreach_instr_reverse(instr, block) {
      nir_schedule_node *node = nir_schedule_get_node(scoreboard->instr_map,
                                                      instr);
      nir_schedule_calculate_deps(&state, node);
   }

   ralloc_free(state.reg_map);
}

typedef struct {
   nir_schedule_scoreboard *scoreboard;
   int regs_freed;
} nir_schedule_regs_freed_state;

static bool
nir_schedule_regs_freed_src_cb(nir_src *src, void *in_state)
{
   nir_schedule_regs_freed_state *state = in_state;
   nir_schedule_scoreboard *scoreboard = state->scoreboard;
   struct set *remaining_uses = nir_schedule_scoreboard_get_src(scoreboard, src);

   if (remaining_uses->entries == 1 &&
       _mesa_set_search(remaining_uses, nir_src_parent_instr(src))) {
      state->regs_freed += nir_schedule_src_pressure(src);
   }

   return true;
}

static bool
nir_schedule_regs_freed_def_cb(nir_def *def, void *in_state)
{
   nir_schedule_regs_freed_state *state = in_state;

   state->regs_freed -= nir_schedule_def_pressure(def);

   return true;
}

static void
nir_schedule_regs_freed_load_reg(nir_intrinsic_instr *load,
                                 nir_schedule_regs_freed_state *state)
{
   assert(nir_is_load_reg(load));

   if (load->intrinsic == nir_intrinsic_load_reg_indirect)
      nir_schedule_regs_freed_src_cb(&load->src[1], state);

   nir_schedule_scoreboard *scoreboard = state->scoreboard;
   nir_def *reg = load->src[0].ssa;
   struct set *remaining_uses = nir_schedule_scoreboard_get_reg(scoreboard, reg);

   if (remaining_uses->entries == 1 &&
       _mesa_set_search(remaining_uses, &load->instr)) {
      state->regs_freed += nir_schedule_reg_pressure(reg);
   }

   nir_schedule_regs_freed_def_cb(&load->def, state);
}

static void
nir_schedule_regs_freed_store_reg(nir_intrinsic_instr *store,
                                  nir_schedule_regs_freed_state *state)
{
   assert(nir_is_store_reg(store));

   nir_schedule_regs_freed_src_cb(&store->src[0], state);
   if (store->intrinsic == nir_intrinsic_store_reg_indirect)
      nir_schedule_regs_freed_src_cb(&store->src[2], state);

   nir_schedule_scoreboard *scoreboard = state->scoreboard;
   nir_def *reg = store->src[1].ssa;

   /* Only the first def of a reg counts against register pressure. */
   if (!_mesa_set_search(scoreboard->live_values, reg))
      state->regs_freed -= nir_schedule_reg_pressure(reg);
}

static bool
nir_schedule_regs_freed_reg_intrin(nir_instr *instr,
                                   nir_schedule_regs_freed_state *state)
{
   if (instr->type != nir_instr_type_intrinsic)
      return false;

   nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
   switch (intrin->intrinsic) {
   case nir_intrinsic_decl_reg:
      return true; /* Handled but nothing to do */

   case nir_intrinsic_load_reg:
   case nir_intrinsic_load_reg_indirect:
      nir_schedule_regs_freed_load_reg(intrin, state);
      return true;

   case nir_intrinsic_store_reg:
   case nir_intrinsic_store_reg_indirect:
      nir_schedule_regs_freed_store_reg(intrin, state);
      return true;

   default:
      return false;
   }
}

static int
nir_schedule_regs_freed(nir_schedule_scoreboard *scoreboard, nir_schedule_node *n)
{
   nir_schedule_regs_freed_state state = {
      .scoreboard = scoreboard,
   };

   if (!nir_schedule_regs_freed_reg_intrin(n->instr, &state)) {
      nir_foreach_src(n->instr, nir_schedule_regs_freed_src_cb, &state);
      nir_foreach_def(n->instr, nir_schedule_regs_freed_def_cb, &state);
   }

   return state.regs_freed;
}

/**
 * Chooses an instruction that will minimise the register pressure as much as
 * possible. This should only be used as a fallback when the regular scheduling
 * generates a shader whose register allocation fails.
 */
static nir_schedule_node *
nir_schedule_choose_instruction_fallback(nir_schedule_scoreboard *scoreboard)
{
   nir_schedule_node *chosen = NULL;

   /* Find the leader in the ready (shouldn't-stall) set with the minimum
    * cost.
    */
   list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
      if (scoreboard->time < n->ready_time)
         continue;

      if (!chosen || chosen->max_delay > n->max_delay)
         chosen = n;
   }
   if (chosen) {
      if (debug) {
         fprintf(stderr, "chose (ready fallback):          ");
         nir_print_instr(chosen->instr, stderr);
         fprintf(stderr, "\n");
      }

      return chosen;
   }

   /* Otherwise, choose the leader with the minimum cost. */
   list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
      if (!chosen || chosen->max_delay > n->max_delay)
         chosen = n;
   }
   if (debug) {
      fprintf(stderr, "chose (leader fallback):         ");
      nir_print_instr(chosen->instr, stderr);
      fprintf(stderr, "\n");
   }

   return chosen;
}

/**
 * Chooses an instruction to schedule using the Goodman/Hsu (1988) CSP (Code
 * Scheduling for Parallelism) heuristic.
 *
 * Picks an instruction on the critical path that's ready to execute without
 * stalls, if possible, otherwise picks the instruction on the critical path.
 */
static nir_schedule_node *
nir_schedule_choose_instruction_csp(nir_schedule_scoreboard *scoreboard)
{
   nir_schedule_node *chosen = NULL;

   /* Find the leader in the ready (shouldn't-stall) set with the maximum
    * cost.
    */
   list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
      if (scoreboard->time < n->ready_time)
         continue;

      if (!chosen || chosen->max_delay < n->max_delay)
         chosen = n;
   }
   if (chosen) {
      if (debug) {
         fprintf(stderr, "chose (ready):          ");
         nir_print_instr(chosen->instr, stderr);
         fprintf(stderr, "\n");
      }

      return chosen;
   }

   /* Otherwise, choose the leader with the maximum cost. */
   list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
      if (!chosen || chosen->max_delay < n->max_delay)
         chosen = n;
   }
   if (debug) {
      fprintf(stderr, "chose (leader):         ");
      nir_print_instr(chosen->instr, stderr);
      fprintf(stderr, "\n");
   }

   return chosen;
}

/**
 * Chooses an instruction to schedule using the Goodman/Hsu (1988) CSR (Code
 * Scheduling for Register pressure) heuristic.
 */
static nir_schedule_node *
nir_schedule_choose_instruction_csr(nir_schedule_scoreboard *scoreboard)
{
   nir_schedule_node *chosen = NULL;

   /* Find a ready inst with regs freed and pick the one with max cost. */
   list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
      if (n->ready_time > scoreboard->time)
         continue;

      int regs_freed = nir_schedule_regs_freed(scoreboard, n);

      if (regs_freed > 0 && (!chosen || chosen->max_delay < n->max_delay)) {
         chosen = n;
      }
   }
   if (chosen) {
      if (debug) {
         fprintf(stderr, "chose (freed+ready):    ");
         nir_print_instr(chosen->instr, stderr);
         fprintf(stderr, "\n");
      }

      return chosen;
   }

   /* Find a leader with regs freed and pick the one with max cost. */
   list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
      int regs_freed = nir_schedule_regs_freed(scoreboard, n);

      if (regs_freed > 0 && (!chosen || chosen->max_delay < n->max_delay)) {
         chosen = n;
      }
   }
   if (chosen) {
      if (debug) {
         fprintf(stderr, "chose (regs freed):     ");
         nir_print_instr(chosen->instr, stderr);
         fprintf(stderr, "\n");
      }

      return chosen;
   }

   /* Find a partially evaluated path and try to finish it off */
   list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
      if (n->partially_evaluated_path &&
          (!chosen || chosen->max_delay < n->max_delay)) {
         chosen = n;
      }
   }
   if (chosen) {
      if (debug) {
         fprintf(stderr, "chose (partial path):   ");
         nir_print_instr(chosen->instr, stderr);
         fprintf(stderr, "\n");
      }

      return chosen;
   }

   /* Contra the paper, pick a leader with no effect on used regs.  This may
    * open up new opportunities, as otherwise a single-operand instr consuming
    * a value will tend to block us from ever freeing that value.  This had a
    * massive effect on reducing spilling on V3D.
    *
    * XXX: Should this prioritize ready?
    */
   list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
      if (nir_schedule_regs_freed(scoreboard, n) != 0)
         continue;

      if (!chosen || chosen->max_delay < n->max_delay)
         chosen = n;
   }
   if (chosen) {
      if (debug) {
         fprintf(stderr, "chose (regs no-op):     ");
         nir_print_instr(chosen->instr, stderr);
         fprintf(stderr, "\n");
      }

      return chosen;
   }

   /* Pick the max delay of the remaining ready set. */
   list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
      if (n->ready_time > scoreboard->time)
         continue;
      if (!chosen || chosen->max_delay < n->max_delay)
         chosen = n;
   }
   if (chosen) {
      if (debug) {
         fprintf(stderr, "chose (ready max delay):   ");
         nir_print_instr(chosen->instr, stderr);
         fprintf(stderr, "\n");
      }
      return chosen;
   }

   /* Pick the max delay of the remaining leaders. */
   list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
      if (!chosen || chosen->max_delay < n->max_delay)
         chosen = n;
   }

   if (debug) {
      fprintf(stderr, "chose (max delay):         ");
      nir_print_instr(chosen->instr, stderr);
      fprintf(stderr, "\n");
   }

   return chosen;
}

static void
dump_state(nir_schedule_scoreboard *scoreboard)
{
   list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
      fprintf(stderr, "maxdel %5d ", n->max_delay);
      nir_print_instr(n->instr, stderr);
      fprintf(stderr, "\n");

      util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
         nir_schedule_node *child = (nir_schedule_node *)edge->child;

         fprintf(stderr, " -> (%d parents) ", child->dag.parent_count);
         nir_print_instr(child->instr, stderr);
         fprintf(stderr, "\n");
      }
   }
}

static void
nir_schedule_mark_use(nir_schedule_scoreboard *scoreboard,
                      void *reg_or_def,
                      nir_instr *reg_or_def_parent,
                      int pressure)
{
   /* Make the value live if it's the first time it's been used. */
   if (!_mesa_set_search(scoreboard->live_values, reg_or_def)) {
      _mesa_set_add(scoreboard->live_values, reg_or_def);
      scoreboard->pressure += pressure;
   }

   /* Make the value dead if this is its last remaining use.  Be careful not
    * to decrement pressure twice when one instruction uses a value twice.
    */
   struct set *remaining_uses =
      _mesa_hash_table_search_data(scoreboard->remaining_uses, reg_or_def);
   struct set_entry *entry = _mesa_set_search(remaining_uses, reg_or_def_parent);
   if (entry) {
      _mesa_set_remove(remaining_uses, entry);

      if (remaining_uses->entries == 0)
         scoreboard->pressure -= pressure;
   }
}

static bool
nir_schedule_mark_src_scheduled(nir_src *src, void *state)
{
   nir_schedule_scoreboard *scoreboard = state;
   struct set *remaining_uses = nir_schedule_scoreboard_get_src(scoreboard, src);

   struct set_entry *entry = _mesa_set_search(remaining_uses,
                                              nir_src_parent_instr(src));
   if (entry) {
      /* Once we've used an SSA value in one instruction, bump the priority of
       * the other uses so the SSA value can get fully consumed.
       *
       * We don't do this for registers; it would be a hassle and it's unclear
       * whether it would help.  Also, skip it for constants, as they're often
       * folded as immediates into backend instructions and have many unrelated
       * instructions all referencing the same value (0).
       */
      if (src->ssa->parent_instr->type != nir_instr_type_load_const) {
         nir_foreach_use(other_src, src->ssa) {
            if (nir_src_parent_instr(other_src) == nir_src_parent_instr(src))
               continue;

            nir_schedule_node *n =
               nir_schedule_get_node(scoreboard->instr_map,
                                     nir_src_parent_instr(other_src));

            if (n && !n->partially_evaluated_path) {
               if (debug) {
                  fprintf(stderr, "  New partially evaluated path: ");
                  nir_print_instr(n->instr, stderr);
                  fprintf(stderr, "\n");
               }

               n->partially_evaluated_path = true;
            }
         }
      }
   }

   nir_schedule_mark_use(scoreboard,
                         (void *)src->ssa,
                         nir_src_parent_instr(src),
                         nir_schedule_src_pressure(src));

   return true;
}

static bool
nir_schedule_mark_def_scheduled(nir_def *def, void *state)
{
   nir_schedule_scoreboard *scoreboard = state;

   nir_schedule_mark_use(scoreboard, def, def->parent_instr,
                         nir_schedule_def_pressure(def));

   return true;
}

static void
nir_schedule_mark_load_reg_scheduled(nir_intrinsic_instr *load,
                                     nir_schedule_scoreboard *scoreboard)
{
   assert(nir_is_load_reg(load));
   nir_def *reg = load->src[0].ssa;

   if (load->intrinsic == nir_intrinsic_load_reg_indirect)
      nir_schedule_mark_src_scheduled(&load->src[1], scoreboard);

   nir_schedule_mark_use(scoreboard, reg, &load->instr,
                         nir_schedule_reg_pressure(reg));

   nir_schedule_mark_def_scheduled(&load->def, scoreboard);
}

static void
nir_schedule_mark_store_reg_scheduled(nir_intrinsic_instr *store,
                                      nir_schedule_scoreboard *scoreboard)
{
   assert(nir_is_store_reg(store));
   nir_def *reg = store->src[1].ssa;

   nir_schedule_mark_src_scheduled(&store->src[0], scoreboard);
   if (store->intrinsic == nir_intrinsic_store_reg_indirect)
      nir_schedule_mark_src_scheduled(&store->src[2], scoreboard);

   /* XXX: This is not actually accurate for regs -- the last use of a reg may
    * have a live interval that extends across control flow.  We should
    * calculate the live ranges of regs, and have scheduler nodes for the CF
    * nodes that also "use" the reg.
    */
   nir_schedule_mark_use(scoreboard, reg, &store->instr,
                         nir_schedule_reg_pressure(reg));
}

static bool
nir_schedule_mark_reg_intrin_scheduled(nir_instr *instr,
                                       nir_schedule_scoreboard *scoreboard)
{
   if (instr->type != nir_instr_type_intrinsic)
      return false;

   nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
   switch (intrin->intrinsic) {
   case nir_intrinsic_decl_reg:
      return true; /* Handled but nothing to do */

   case nir_intrinsic_load_reg:
   case nir_intrinsic_load_reg_indirect:
      nir_schedule_mark_load_reg_scheduled(intrin, scoreboard);
      return true;

   case nir_intrinsic_store_reg:
   case nir_intrinsic_store_reg_indirect:
      nir_schedule_mark_store_reg_scheduled(intrin, scoreboard);
      return true;

   default:
      return false;
   }
}

static void
nir_schedule_mark_node_scheduled(nir_schedule_scoreboard *scoreboard,
                                 nir_schedule_node *n)
{
   if (!nir_schedule_mark_reg_intrin_scheduled(n->instr, scoreboard)) {
      nir_foreach_src(n->instr, nir_schedule_mark_src_scheduled, scoreboard);
      nir_foreach_def(n->instr, nir_schedule_mark_def_scheduled, scoreboard);
   }

   util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
      nir_schedule_node *child = (nir_schedule_node *)edge->child;

      child->ready_time = MAX2(child->ready_time,
                               scoreboard->time + n->delay);

      if (child->dag.parent_count == 1) {
         if (debug) {
            fprintf(stderr, "  New DAG head: ");
            nir_print_instr(child->instr, stderr);
            fprintf(stderr, "\n");
         }
      }
   }

   dag_prune_head(scoreboard->dag, &n->dag);

   scoreboard->time = MAX2(n->ready_time, scoreboard->time);
   scoreboard->time++;
}

static void
nir_schedule_instructions(nir_schedule_scoreboard *scoreboard, nir_block *block)
{
   while (!list_is_empty(&scoreboard->dag->heads)) {
      if (debug) {
         fprintf(stderr, "current list:\n");
         dump_state(scoreboard);
      }

      nir_schedule_node *chosen;
      if (scoreboard->options->fallback)
         chosen = nir_schedule_choose_instruction_fallback(scoreboard);
      else if (scoreboard->pressure < scoreboard->options->threshold)
         chosen = nir_schedule_choose_instruction_csp(scoreboard);
      else
         chosen = nir_schedule_choose_instruction_csr(scoreboard);

      /* Now that we've scheduled a new instruction, some of its children may
       * be promoted to the list of instructions ready to be scheduled.
       */
      nir_schedule_mark_node_scheduled(scoreboard, chosen);

      /* Move the instruction to the end (so our first chosen instructions are
       * the start of the program).
       */
      exec_node_remove(&chosen->instr->node);
      exec_list_push_tail(&block->instr_list, &chosen->instr->node);

      if (debug)
         fprintf(stderr, "\n");
   }
}

static uint32_t
nir_schedule_get_delay(nir_schedule_scoreboard *scoreboard, nir_instr *instr)
{
   if (scoreboard->options->instr_delay_cb) {
      void *cb_data = scoreboard->options->instr_delay_cb_data;
      return scoreboard->options->instr_delay_cb(instr, cb_data);
   }

   switch (instr->type) {
   case nir_instr_type_undef:
   case nir_instr_type_load_const:
   case nir_instr_type_alu:
   case nir_instr_type_deref:
   case nir_instr_type_jump:
   case nir_instr_type_parallel_copy:
   case nir_instr_type_call:
   case nir_instr_type_phi:
   case nir_instr_type_debug_info:
      return 1;

   case nir_instr_type_intrinsic:
      switch (nir_instr_as_intrinsic(instr)->intrinsic) {
      case nir_intrinsic_decl_reg:
      case nir_intrinsic_load_reg:
      case nir_intrinsic_store_reg:
         return 0;
      case nir_intrinsic_load_ubo:
      case nir_intrinsic_load_ssbo:
      case nir_intrinsic_load_scratch:
      case nir_intrinsic_load_shared:
      case nir_intrinsic_image_load:
         return 50;
      default:
         return 1;
      }
      break;

   case nir_instr_type_tex:
      /* Pick some large number to try to fetch textures early and sample them
       * late.
       */
      return 100;
   }

   return 0;
}

static void
nir_schedule_dag_max_delay_cb(struct dag_node *node, void *state)
{
   nir_schedule_node *n = (nir_schedule_node *)node;
   uint32_t max_delay = 0;

   util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
      nir_schedule_node *child = (nir_schedule_node *)edge->child;
      max_delay = MAX2(child->max_delay, max_delay);
   }

   n->max_delay = MAX2(n->max_delay, max_delay + n->delay);
}

static void
nir_schedule_block(nir_schedule_scoreboard *scoreboard, nir_block *block)
{
   void *mem_ctx = ralloc_context(NULL);
   scoreboard->instr_map = _mesa_pointer_hash_table_create(mem_ctx);

   scoreboard->dag = dag_create(mem_ctx);

   nir_foreach_instr(instr, block) {
      nir_schedule_node *n =
         rzalloc(mem_ctx, nir_schedule_node);

      n->instr = instr;
      n->delay = nir_schedule_get_delay(scoreboard, instr);
      dag_init_node(scoreboard->dag, &n->dag);

      _mesa_hash_table_insert(scoreboard->instr_map, instr, n);
   }

   calculate_forward_deps(scoreboard, block);
   calculate_reverse_deps(scoreboard, block);

   dag_traverse_bottom_up(scoreboard->dag, nir_schedule_dag_max_delay_cb, NULL);

   nir_schedule_instructions(scoreboard, block);

   ralloc_free(mem_ctx);
   scoreboard->instr_map = NULL;
}

static bool
is_decl_reg(nir_instr *instr)
{
   if (instr->type != nir_instr_type_intrinsic)
      return false;

   nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
   return intrin->intrinsic == nir_intrinsic_decl_reg;
}

static bool
nir_schedule_ssa_def_init_scoreboard(nir_def *def, void *state)
{
   nir_schedule_scoreboard *scoreboard = state;
   struct set *def_uses = _mesa_pointer_set_create(scoreboard);

   _mesa_hash_table_insert(scoreboard->remaining_uses, def, def_uses);

   /* We don't consider decl_reg to be a use to avoid extending register live
    * ranges any further than needed.
    */
   if (!is_decl_reg(def->parent_instr))
      _mesa_set_add(def_uses, def->parent_instr);

   nir_foreach_use(src, def) {
      _mesa_set_add(def_uses, nir_src_parent_instr(src));
   }

   /* XXX: Handle if uses */

   return true;
}

static nir_schedule_scoreboard *
nir_schedule_get_scoreboard(nir_shader *shader,
                            const nir_schedule_options *options)
{
   nir_schedule_scoreboard *scoreboard = rzalloc(NULL, nir_schedule_scoreboard);

   scoreboard->shader = shader;
   scoreboard->live_values = _mesa_pointer_set_create(scoreboard);
   scoreboard->remaining_uses = _mesa_pointer_hash_table_create(scoreboard);
   scoreboard->options = options;
   scoreboard->pressure = 0;

   nir_foreach_function_impl(impl, shader) {
      nir_foreach_block(block, impl) {
         nir_foreach_instr(instr, block) {
            nir_foreach_def(instr, nir_schedule_ssa_def_init_scoreboard,
                            scoreboard);
         }

         /* XXX: We're ignoring if uses, which may prioritize scheduling other
          * uses of the if src even when it doesn't help.  That's not many
          * values, though, so meh.
          */
      }
   }

   return scoreboard;
}

static void
nir_schedule_validate_uses(nir_schedule_scoreboard *scoreboard)
{
#ifdef NDEBUG
   return;
#endif

   bool any_uses = false;

   hash_table_foreach(scoreboard->remaining_uses, entry) {
      struct set *remaining_uses = entry->data;

      set_foreach(remaining_uses, instr_entry) {
         if (!any_uses) {
            fprintf(stderr, "Tracked uses remain after scheduling.  "
                            "Affected instructions: \n");
            any_uses = true;
         }
         nir_print_instr(instr_entry->key, stderr);
         fprintf(stderr, "\n");
      }
   }

   assert(!any_uses);
}

/**
 * Schedules the NIR instructions to try to decrease stalls (for example,
 * delaying texture reads) while managing register pressure.
 *
 * The threshold represents "number of NIR register/SSA def channels live
 * before switching the scheduling heuristic to reduce register pressure",
 * since most of our GPU architectures are scalar (extending to vector with a
 * flag wouldn't be hard).  This number should be a bit below the number of
 * registers available (counting any that may be occupied by system value
 * payload values, for example), since the heuristic may not always be able to
 * free a register immediately.  The amount below the limit is up to you to
 * tune.
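 *
 * As a rough usage sketch (not taken from any particular driver, and the
 * threshold value below is made up -- it needs per-backend tuning), a
 * backend would typically do something like:
 *
 *    nir_schedule_options options = {
 *       .threshold = 64,
 *    };
 *    nir_schedule(shader, &options);
 *
 * Backends with extra ordering or latency constraints can also fill in the
 * optional callbacks used above (intrinsic_cb, instr_delay_cb).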
 */
void
nir_schedule(nir_shader *shader,
             const nir_schedule_options *options)
{
   nir_schedule_scoreboard *scoreboard = nir_schedule_get_scoreboard(shader,
                                                                     options);

   if (debug) {
      fprintf(stderr, "NIR shader before scheduling:\n");
      nir_print_shader(shader, stderr);
   }

   nir_foreach_function_impl(impl, shader) {
      nir_foreach_block(block, impl) {
         nir_schedule_block(scoreboard, block);
      }
   }

   nir_schedule_validate_uses(scoreboard);

   ralloc_free(scoreboard);
}