/*
 * Copyright © 2019 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "nir_schedule.h"
#include "util/dag.h"
#include "util/u_dynarray.h"

/** @file
 *
 * Implements basic-block-level prepass instruction scheduling in NIR to
 * manage register pressure.
 *
 * This is based on the Goodman/Hsu paper (1988, cached copy at
 * https://people.freedesktop.org/~anholt/scheduling-goodman-hsu.pdf). We
 * make up the DDG for NIR (which can be mostly done using the NIR def/use
 * chains for SSA instructions, plus some edges for ordering register writes
 * vs reads, and some more for ordering intrinsics). Then we pick heads off
 * of the DDG using their heuristic to emit the NIR instructions back into the
 * block in their new order.
 *
 * The hard case for prepass scheduling on GPUs seems to always be consuming
 * texture/ubo results. The register pressure heuristic doesn't want to pick
 * an instr that starts consuming texture results, because it usually won't be
 * the only use, so that instruction will increase pressure.
 *
 * If you try to force consumption of tex results always, then in a case where
 * a single sample is used for many outputs, you'll end up picking every other
 * user and expanding register pressure. The partially_evaluated_path flag
 * helps tremendously, in that if you happen for whatever reason to pick a
 * texture sample's output, then you'll try to finish off that sample. Future
 * work may include doing some local search before locking in a choice, to try
 * to more reliably find the case where just a few choices going against the
 * heuristic can manage to free the whole vector.
 */
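
/* A rough sketch of how a backend might invoke this pass; the option values
 * here are illustrative only, not recommendations:
 *
 *    nir_schedule_options options = {
 *       .threshold = 24,
 *       .stages_with_shared_io_memory = 0,
 *       .fallback = false,
 *    };
 *    nir_schedule(shader, &options);
 *
 * All of these fields are consumed further down in this file: .threshold is
 * the register-pressure point at which the heuristic switches from CSP to
 * CSR, and .fallback selects the minimise-pressure-at-all-costs mode.
 */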

static bool debug;

/**
 * Represents a node in the DDG for a NIR instruction.
 */
typedef struct {
   struct dag_node dag; /* must be first for our u_dynarray_foreach */
   nir_instr *instr;
   bool partially_evaluated_path;

   /* Approximate estimate of the delay between starting this instruction and
    * its results being available.
    *
    * Accuracy is not too important, given that we're prepass scheduling here
    * and just trying to reduce excess dependencies introduced by a register
    * allocator by stretching out the live intervals of expensive
    * instructions.
    */
   uint32_t delay;

   /* Cost of the maximum-delay path from this node to the leaves. */
   uint32_t max_delay;

   /* scoreboard->time value when this instruction can be scheduled without
    * any stalls expected.
    */
   uint32_t ready_time;
} nir_schedule_node;

typedef struct {
   struct dag *dag;

   nir_shader *shader;

   /* Mapping from nir_def * to a struct set of instructions remaining to be
    * scheduled using the register.
    */
   struct hash_table *remaining_uses;

   /* Map from nir_instr to nir_schedule_node * */
   struct hash_table *instr_map;

   /* Set of nir_def * that have had any instruction scheduled on them. */
   struct set *live_values;

   /* An abstract approximation of the number of nir_schedule_node->delay
    * units since the start of the shader.
    */
   uint32_t time;

   /* Number of channels currently used by the NIR instructions that have been
    * scheduled.
    */
   int pressure;

   /* Options specified by the backend */
   const nir_schedule_options *options;
} nir_schedule_scoreboard;

/* When walking the instructions in reverse, we use this flag to swap
 * before/after in add_dep().
 */
enum direction { F, R };

struct nir_schedule_class_dep {
   int klass;
   nir_schedule_node *node;
   struct nir_schedule_class_dep *next;
};

typedef struct {
   nir_schedule_scoreboard *scoreboard;

   /* Map from registers to nir_schedule_node * */
   struct hash_table *reg_map;

   /* Scheduler nodes for the last instruction involved in some class of
    * dependency.
    */
   nir_schedule_node *load_input;
   nir_schedule_node *store_shared;
   nir_schedule_node *unknown_intrinsic;
   nir_schedule_node *discard;
   nir_schedule_node *jump;

   struct nir_schedule_class_dep *class_deps;

   enum direction dir;
} nir_deps_state;

static void *
_mesa_hash_table_search_data(struct hash_table *ht, void *key)
{
   struct hash_entry *entry = _mesa_hash_table_search(ht, key);
   if (!entry)
      return NULL;
   return entry->data;
}

static nir_schedule_node *
nir_schedule_get_node(struct hash_table *instr_map, nir_instr *instr)
{
   return _mesa_hash_table_search_data(instr_map, instr);
}

static struct set *
nir_schedule_scoreboard_get_reg(nir_schedule_scoreboard *scoreboard,
                                nir_def *reg)
{
   return _mesa_hash_table_search_data(scoreboard->remaining_uses, reg);
}

static struct set *
nir_schedule_scoreboard_get_src(nir_schedule_scoreboard *scoreboard, nir_src *src)
{
   return _mesa_hash_table_search_data(scoreboard->remaining_uses, src->ssa);
}

static int
nir_schedule_reg_pressure(nir_def *reg)
{
   nir_intrinsic_instr *decl = nir_reg_get_decl(reg);
   return nir_intrinsic_num_components(decl);
}

static int
nir_schedule_def_pressure(nir_def *def)
{
   return def->num_components;
}

static int
nir_schedule_src_pressure(nir_src *src)
{
   return nir_schedule_def_pressure(src->ssa);
}

/**
 * Adds a dependency such that @after must appear in the final program after
 * @before.
 *
 * Heads of the DAG are picked off one at a time in
 * nir_schedule_instructions() and re-emitted at the end of the block, so the
 * first instructions chosen end up at the start of the program.
 */
static void
add_dep(nir_deps_state *state,
        nir_schedule_node *before,
        nir_schedule_node *after)
{
   if (!before || !after)
      return;

   assert(before != after);

   if (state->dir == F)
      dag_add_edge(&before->dag, &after->dag, 0);
   else
      dag_add_edge(&after->dag, &before->dag, 0);
}
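
/* For example (purely illustrative), given
 *
 *    ssa_1 = fmul ...
 *    ssa_2 = fadd ssa_1, ...
 *
 * the forward pass calls add_dep() with the fmul's node as @before and the
 * fadd's node as @after (see nir_schedule_ssa_deps()), which guarantees that
 * the fmul is picked, and therefore re-emitted, before the fadd.
 */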

static void
add_read_dep(nir_deps_state *state,
             nir_schedule_node *before,
             nir_schedule_node *after)
{
   add_dep(state, before, after);
}

static void
add_write_dep(nir_deps_state *state,
              nir_schedule_node **before,
              nir_schedule_node *after)
{
   add_dep(state, *before, after);
   *before = after;
}

static void
nir_schedule_load_reg_deps(nir_intrinsic_instr *load,
                           nir_deps_state *state)
{
   nir_def *reg = load->src[0].ssa;
   (void)nir_reg_get_decl(reg);

   struct hash_entry *entry = _mesa_hash_table_search(state->reg_map, reg);
   if (!entry)
      return;
   nir_schedule_node *dst_n = entry->data;

   nir_schedule_node *src_n =
      nir_schedule_get_node(state->scoreboard->instr_map, &load->instr);

   add_dep(state, dst_n, src_n);
}

static void
nir_schedule_store_reg_deps(nir_intrinsic_instr *store,
                            nir_deps_state *state)
{
   nir_def *reg = store->src[1].ssa;
   (void)nir_reg_get_decl(reg);

   nir_schedule_node *dest_n =
      nir_schedule_get_node(state->scoreboard->instr_map, &store->instr);

   struct hash_entry *entry = _mesa_hash_table_search(state->reg_map, reg);
   if (!entry) {
      _mesa_hash_table_insert(state->reg_map, reg, dest_n);
      return;
   }
   nir_schedule_node **before = (nir_schedule_node **)&entry->data;

   add_write_dep(state, before, dest_n);
}

static bool
nir_schedule_ssa_deps(nir_def *def, void *in_state)
{
   nir_deps_state *state = in_state;
   struct hash_table *instr_map = state->scoreboard->instr_map;
   nir_schedule_node *def_n = nir_schedule_get_node(instr_map, def->parent_instr);

   nir_foreach_use(src, def) {
      nir_schedule_node *use_n = nir_schedule_get_node(instr_map,
                                                       nir_src_parent_instr(src));

      add_read_dep(state, def_n, use_n);
   }

   return true;
}

static struct nir_schedule_class_dep *
nir_schedule_get_class_dep(nir_deps_state *state,
                           int klass)
{
   for (struct nir_schedule_class_dep *class_dep = state->class_deps;
        class_dep != NULL;
        class_dep = class_dep->next) {
      if (class_dep->klass == klass)
         return class_dep;
   }

   struct nir_schedule_class_dep *class_dep =
      ralloc(state->reg_map, struct nir_schedule_class_dep);

   class_dep->klass = klass;
   class_dep->node = NULL;
   class_dep->next = state->class_deps;

   state->class_deps = class_dep;

   return class_dep;
}

static void
nir_schedule_intrinsic_deps(nir_deps_state *state,
                            nir_intrinsic_instr *instr)
{
   nir_schedule_node *n = nir_schedule_get_node(state->scoreboard->instr_map,
                                                &instr->instr);
   const nir_schedule_options *options = state->scoreboard->options;
   nir_schedule_dependency dep;

   if (options->intrinsic_cb &&
       options->intrinsic_cb(instr, &dep, options->intrinsic_cb_data)) {
      struct nir_schedule_class_dep *class_dep =
         nir_schedule_get_class_dep(state, dep.klass);

      switch (dep.type) {
      case NIR_SCHEDULE_READ_DEPENDENCY:
         add_read_dep(state, class_dep->node, n);
         break;
      case NIR_SCHEDULE_WRITE_DEPENDENCY:
         add_write_dep(state, &class_dep->node, n);
         break;
      }
   }

   switch (instr->intrinsic) {
   case nir_intrinsic_decl_reg:
      break; /* Nothing to do */

   case nir_intrinsic_load_reg:
      nir_schedule_load_reg_deps(instr, state);
      break;

   case nir_intrinsic_store_reg:
      nir_schedule_store_reg_deps(instr, state);
      break;

   case nir_intrinsic_load_uniform:
   case nir_intrinsic_load_ubo:
   case nir_intrinsic_load_front_face:
      break;

   case nir_intrinsic_demote:
   case nir_intrinsic_demote_if:
   case nir_intrinsic_terminate:
   case nir_intrinsic_terminate_if:
      /* We are adding two dependencies:
       *
       * * An individual one that we could use to add a read_dep while
       *   handling nir_instr_type_tex
       *
       * * One on the unknown-intrinsic class, as we want discards to be
       *   serialized in the same order relative to intervening stores or
       *   atomic accesses to SSBOs and images
       */
      add_write_dep(state, &state->discard, n);
      add_write_dep(state, &state->unknown_intrinsic, n);
      break;

   case nir_intrinsic_store_output:
      /* For some hardware and stages, output stores affect the same shared
       * memory as input loads.
       */
      if ((state->scoreboard->options->stages_with_shared_io_memory &
           (1 << state->scoreboard->shader->info.stage)))
         add_write_dep(state, &state->load_input, n);

      /* Make sure that preceding discards stay before the store_output */
      add_read_dep(state, state->discard, n);

      break;

   case nir_intrinsic_load_input:
   case nir_intrinsic_load_per_primitive_input:
   case nir_intrinsic_load_per_vertex_input:
      add_read_dep(state, state->load_input, n);
      break;

   case nir_intrinsic_load_shared:
   case nir_intrinsic_load_shared2_amd:
      /* Don't move a load_shared past a following store_shared, as that could
       * change the loaded value.
       */
      add_read_dep(state, state->store_shared, n);
      break;

   case nir_intrinsic_shared_atomic:
   case nir_intrinsic_shared_atomic_swap:
   case nir_intrinsic_shared_append_amd:
   case nir_intrinsic_shared_consume_amd:
   case nir_intrinsic_store_shared:
   case nir_intrinsic_store_shared2_amd:
      add_write_dep(state, &state->store_shared, n);
      break;

   case nir_intrinsic_barrier: {
      const nir_variable_mode modes = nir_intrinsic_memory_modes(instr);

      if (modes & nir_var_mem_shared)
         add_write_dep(state, &state->store_shared, n);

      /* Serialize against other categories. */
      add_write_dep(state, &state->unknown_intrinsic, n);

      break;
   }

   case nir_intrinsic_ddx:
   case nir_intrinsic_ddx_fine:
   case nir_intrinsic_ddx_coarse:
   case nir_intrinsic_ddy:
   case nir_intrinsic_ddy_fine:
   case nir_intrinsic_ddy_coarse:
      /* Match the old behaviour. TODO: Is this correct with discards? */
      break;

   default:
      /* Attempt to handle other intrinsics that we haven't individually
       * categorized by serializing them in the same order relative to each
       * other.
       */
      add_write_dep(state, &state->unknown_intrinsic, n);
      break;
   }
}
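
/* As an illustration only (this callback does not live in this file), a
 * backend that wants its image accesses kept in order with respect to each
 * other could provide an intrinsic_cb along these lines, using a class
 * number of its own choosing:
 *
 *    static bool
 *    my_backend_intrinsic_cb(nir_intrinsic_instr *instr,
 *                            nir_schedule_dependency *dep,
 *                            void *user_data)
 *    {
 *       switch (instr->intrinsic) {
 *       case nir_intrinsic_image_load:
 *          dep->klass = 0;
 *          dep->type = NIR_SCHEDULE_READ_DEPENDENCY;
 *          return true;
 *       case nir_intrinsic_image_store:
 *          dep->klass = 0;
 *          dep->type = NIR_SCHEDULE_WRITE_DEPENDENCY;
 *          return true;
 *       default:
 *          return false;
 *       }
 *    }
 *
 * Returning false leaves the intrinsic to the generic handling above.
 */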

/**
 * Common code for dependencies that need to be tracked both forward and
 * backward.
 *
 * This is for things like "all reads of r4 have to happen between the r4
 * writes that surround them".
 */
static void
nir_schedule_calculate_deps(nir_deps_state *state, nir_schedule_node *n)
{
   nir_instr *instr = n->instr;

   /* For NIR SSA defs, we only need to do a single pass of making the uses
    * depend on the def.
    */
   if (state->dir == F)
      nir_foreach_def(instr, nir_schedule_ssa_deps, state);

   /* Make sure any other instructions keep their positions relative to
    * jumps.
    */
   if (instr->type != nir_instr_type_jump)
      add_read_dep(state, state->jump, n);

   switch (instr->type) {
   case nir_instr_type_undef:
   case nir_instr_type_load_const:
   case nir_instr_type_alu:
   case nir_instr_type_deref:
   case nir_instr_type_debug_info:
      break;

   case nir_instr_type_tex:
      /* Don't move texture ops before a discard, as that could increase
       * memory bandwidth for reading the discarded samples.
       */
      add_read_dep(state, state->discard, n);
      break;

   case nir_instr_type_jump:
      add_write_dep(state, &state->jump, n);
      break;

   case nir_instr_type_call:
      unreachable("Calls should have been lowered");
      break;

   case nir_instr_type_parallel_copy:
      unreachable("Parallel copies should have been lowered");
      break;

   case nir_instr_type_phi:
      unreachable("nir_schedule() should be called after lowering from SSA");
      break;

   case nir_instr_type_intrinsic:
      nir_schedule_intrinsic_deps(state, nir_instr_as_intrinsic(instr));
      break;
   }
}

static void
calculate_forward_deps(nir_schedule_scoreboard *scoreboard, nir_block *block)
{
   nir_deps_state state = {
      .scoreboard = scoreboard,
      .dir = F,
      .reg_map = _mesa_pointer_hash_table_create(NULL),
   };

   nir_foreach_instr(instr, block) {
      nir_schedule_node *node = nir_schedule_get_node(scoreboard->instr_map,
                                                      instr);
      nir_schedule_calculate_deps(&state, node);
   }

   ralloc_free(state.reg_map);
}

static void
calculate_reverse_deps(nir_schedule_scoreboard *scoreboard, nir_block *block)
{
   nir_deps_state state = {
      .scoreboard = scoreboard,
      .dir = R,
      .reg_map = _mesa_pointer_hash_table_create(NULL),
   };

   nir_foreach_instr_reverse(instr, block) {
      nir_schedule_node *node = nir_schedule_get_node(scoreboard->instr_map,
                                                      instr);
      nir_schedule_calculate_deps(&state, node);
   }

   ralloc_free(state.reg_map);
}

typedef struct {
   nir_schedule_scoreboard *scoreboard;
   int regs_freed;
} nir_schedule_regs_freed_state;

static bool
nir_schedule_regs_freed_src_cb(nir_src *src, void *in_state)
{
   nir_schedule_regs_freed_state *state = in_state;
   nir_schedule_scoreboard *scoreboard = state->scoreboard;
   struct set *remaining_uses = nir_schedule_scoreboard_get_src(scoreboard, src);

   if (remaining_uses->entries == 1 &&
       _mesa_set_search(remaining_uses, nir_src_parent_instr(src))) {
      state->regs_freed += nir_schedule_src_pressure(src);
   }

   return true;
}

static bool
nir_schedule_regs_freed_def_cb(nir_def *def, void *in_state)
{
   nir_schedule_regs_freed_state *state = in_state;

   state->regs_freed -= nir_schedule_def_pressure(def);

   return true;
}

static void
nir_schedule_regs_freed_load_reg(nir_intrinsic_instr *load,
                                 nir_schedule_regs_freed_state *state)
{
   assert(nir_is_load_reg(load));

   if (load->intrinsic == nir_intrinsic_load_reg_indirect)
      nir_schedule_regs_freed_src_cb(&load->src[1], state);

   nir_schedule_scoreboard *scoreboard = state->scoreboard;
   nir_def *reg = load->src[0].ssa;
   struct set *remaining_uses = nir_schedule_scoreboard_get_reg(scoreboard, reg);

   if (remaining_uses->entries == 1 &&
       _mesa_set_search(remaining_uses, &load->instr)) {
      state->regs_freed += nir_schedule_reg_pressure(reg);
   }

   nir_schedule_regs_freed_def_cb(&load->def, state);
}

static void
nir_schedule_regs_freed_store_reg(nir_intrinsic_instr *store,
                                  nir_schedule_regs_freed_state *state)
{
   assert(nir_is_store_reg(store));

   nir_schedule_regs_freed_src_cb(&store->src[0], state);
   if (store->intrinsic == nir_intrinsic_store_reg_indirect)
      nir_schedule_regs_freed_src_cb(&store->src[2], state);

   nir_schedule_scoreboard *scoreboard = state->scoreboard;
   nir_def *reg = store->src[1].ssa;

   /* Only the first def of a reg counts against register pressure. */
   if (!_mesa_set_search(scoreboard->live_values, reg))
      state->regs_freed -= nir_schedule_reg_pressure(reg);
}

static bool
nir_schedule_regs_freed_reg_intrin(nir_instr *instr,
                                   nir_schedule_regs_freed_state *state)
{
   if (instr->type != nir_instr_type_intrinsic)
      return false;

   nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
   switch (intrin->intrinsic) {
   case nir_intrinsic_decl_reg:
      return true; /* Handled but nothing to do */

   case nir_intrinsic_load_reg:
   case nir_intrinsic_load_reg_indirect:
      nir_schedule_regs_freed_load_reg(intrin, state);
      return true;

   case nir_intrinsic_store_reg:
   case nir_intrinsic_store_reg_indirect:
      nir_schedule_regs_freed_store_reg(intrin, state);
      return true;

   default:
      return false;
   }
}

static int
nir_schedule_regs_freed(nir_schedule_scoreboard *scoreboard, nir_schedule_node *n)
{
   nir_schedule_regs_freed_state state = {
      .scoreboard = scoreboard,
   };

   if (!nir_schedule_regs_freed_reg_intrin(n->instr, &state)) {
      nir_foreach_src(n->instr, nir_schedule_regs_freed_src_cb, &state);
      nir_foreach_def(n->instr, nir_schedule_regs_freed_def_cb, &state);
   }

   return state.regs_freed;
}
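
/* A worked example (numbers only, not tied to any particular shader): an ALU
 * instruction whose single source is the last remaining use of a vec4 and
 * which writes a one-component def would report 4 - 1 = 3 channels freed,
 * while the first of several readers of a vec4 texture result would report
 * 0 - N for its own N-channel def, i.e. a negative number, since the vec4
 * stays live and the new def adds pressure.
 */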

/**
 * Chooses an instruction that will minimise the register pressure as much as
 * possible. This should only be used as a fallback when the regular scheduling
 * generates a shader whose register allocation fails.
 */
static nir_schedule_node *
nir_schedule_choose_instruction_fallback(nir_schedule_scoreboard *scoreboard)
{
   nir_schedule_node *chosen = NULL;

   /* Find the leader in the ready (shouldn't-stall) set with the minimum
    * cost.
    */
   list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
      if (scoreboard->time < n->ready_time)
         continue;

      if (!chosen || chosen->max_delay > n->max_delay)
         chosen = n;
   }
   if (chosen) {
      if (debug) {
         fprintf(stderr, "chose (ready fallback): ");
         nir_print_instr(chosen->instr, stderr);
         fprintf(stderr, "\n");
      }

      return chosen;
   }

   /* Otherwise, choose the leader with the minimum cost. */
   list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
      if (!chosen || chosen->max_delay > n->max_delay)
         chosen = n;
   }
   if (debug) {
      fprintf(stderr, "chose (leader fallback): ");
      nir_print_instr(chosen->instr, stderr);
      fprintf(stderr, "\n");
   }

   return chosen;
}

/**
 * Chooses an instruction to schedule using the Goodman/Hsu (1988) CSP (Code
 * Scheduling for Parallelism) heuristic.
 *
 * Picks an instruction on the critical path that's ready to execute without
 * stalls, if possible, otherwise picks the instruction on the critical path.
 */
static nir_schedule_node *
nir_schedule_choose_instruction_csp(nir_schedule_scoreboard *scoreboard)
{
   nir_schedule_node *chosen = NULL;

   /* Find the leader in the ready (shouldn't-stall) set with the maximum
    * cost.
    */
   list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
      if (scoreboard->time < n->ready_time)
         continue;

      if (!chosen || chosen->max_delay < n->max_delay)
         chosen = n;
   }
   if (chosen) {
      if (debug) {
         fprintf(stderr, "chose (ready): ");
         nir_print_instr(chosen->instr, stderr);
         fprintf(stderr, "\n");
      }

      return chosen;
   }

   /* Otherwise, choose the leader with the maximum cost. */
   list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
      if (!chosen || chosen->max_delay < n->max_delay)
         chosen = n;
   }
   if (debug) {
      fprintf(stderr, "chose (leader): ");
      nir_print_instr(chosen->instr, stderr);
      fprintf(stderr, "\n");
   }

   return chosen;
}

/**
 * Chooses an instruction to schedule using the Goodman/Hsu (1988) CSR (Code
 * Scheduling for Register pressure) heuristic.
 */
static nir_schedule_node *
nir_schedule_choose_instruction_csr(nir_schedule_scoreboard *scoreboard)
{
   nir_schedule_node *chosen = NULL;

   /* Find a ready inst with regs freed and pick the one with max cost. */
   list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
      if (n->ready_time > scoreboard->time)
         continue;

      int regs_freed = nir_schedule_regs_freed(scoreboard, n);

      if (regs_freed > 0 && (!chosen || chosen->max_delay < n->max_delay)) {
         chosen = n;
      }
   }
   if (chosen) {
      if (debug) {
         fprintf(stderr, "chose (freed+ready): ");
         nir_print_instr(chosen->instr, stderr);
         fprintf(stderr, "\n");
      }

      return chosen;
   }

   /* Find a leader with regs freed and pick the one with max cost. */
   list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
      int regs_freed = nir_schedule_regs_freed(scoreboard, n);

      if (regs_freed > 0 && (!chosen || chosen->max_delay < n->max_delay)) {
         chosen = n;
      }
   }
   if (chosen) {
      if (debug) {
         fprintf(stderr, "chose (regs freed): ");
         nir_print_instr(chosen->instr, stderr);
         fprintf(stderr, "\n");
      }

      return chosen;
   }

   /* Find a partially evaluated path and try to finish it off */
   list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
      if (n->partially_evaluated_path &&
          (!chosen || chosen->max_delay < n->max_delay)) {
         chosen = n;
      }
   }
   if (chosen) {
      if (debug) {
         fprintf(stderr, "chose (partial path): ");
         nir_print_instr(chosen->instr, stderr);
         fprintf(stderr, "\n");
      }

      return chosen;
   }

   /* Contra the paper, pick a leader with no effect on used regs. This may
    * open up new opportunities, as otherwise a single-operand instr consuming
    * a value (a net-zero pressure change, so never chosen above) will tend to
    * block us from ever freeing that value. This had a massive effect on
    * reducing spilling on V3D.
    *
    * XXX: Should this prioritize ready?
    */
   list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
      if (nir_schedule_regs_freed(scoreboard, n) != 0)
         continue;

      if (!chosen || chosen->max_delay < n->max_delay)
         chosen = n;
   }
   if (chosen) {
      if (debug) {
         fprintf(stderr, "chose (regs no-op): ");
         nir_print_instr(chosen->instr, stderr);
         fprintf(stderr, "\n");
      }

      return chosen;
   }

   /* Pick the max delay of the remaining ready set. */
   list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
      if (n->ready_time > scoreboard->time)
         continue;
      if (!chosen || chosen->max_delay < n->max_delay)
         chosen = n;
   }
   if (chosen) {
      if (debug) {
         fprintf(stderr, "chose (ready max delay): ");
         nir_print_instr(chosen->instr, stderr);
         fprintf(stderr, "\n");
      }
      return chosen;
   }

   /* Pick the max delay of the remaining leaders. */
   list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
      if (!chosen || chosen->max_delay < n->max_delay)
         chosen = n;
   }

   if (debug) {
      fprintf(stderr, "chose (max delay): ");
      nir_print_instr(chosen->instr, stderr);
      fprintf(stderr, "\n");
   }

   return chosen;
}

static void
dump_state(nir_schedule_scoreboard *scoreboard)
{
   list_for_each_entry(nir_schedule_node, n, &scoreboard->dag->heads, dag.link) {
      fprintf(stderr, "maxdel %5d ", n->max_delay);
      nir_print_instr(n->instr, stderr);
      fprintf(stderr, "\n");

      util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
         nir_schedule_node *child = (nir_schedule_node *)edge->child;

         fprintf(stderr, " -> (%d parents) ", child->dag.parent_count);
         nir_print_instr(child->instr, stderr);
         fprintf(stderr, "\n");
      }
   }
}

static void
nir_schedule_mark_use(nir_schedule_scoreboard *scoreboard,
                      void *reg_or_def,
                      nir_instr *reg_or_def_parent,
                      int pressure)
{
   /* Make the value live if it's the first time it's been used. */
   if (!_mesa_set_search(scoreboard->live_values, reg_or_def)) {
      _mesa_set_add(scoreboard->live_values, reg_or_def);
      scoreboard->pressure += pressure;
   }

   /* Make the value dead if it's the last remaining use. Be careful when one
    * instruction uses a value twice to not decrement pressure twice.
    */
   struct set *remaining_uses =
      _mesa_hash_table_search_data(scoreboard->remaining_uses, reg_or_def);
   struct set_entry *entry = _mesa_set_search(remaining_uses, reg_or_def_parent);
   if (entry) {
      _mesa_set_remove(remaining_uses, entry);

      if (remaining_uses->entries == 0)
         scoreboard->pressure -= pressure;
   }
}

static bool
nir_schedule_mark_src_scheduled(nir_src *src, void *state)
{
   nir_schedule_scoreboard *scoreboard = state;
   struct set *remaining_uses = nir_schedule_scoreboard_get_src(scoreboard, src);

   struct set_entry *entry = _mesa_set_search(remaining_uses,
                                              nir_src_parent_instr(src));
   if (entry) {
      /* Once we've used an SSA value in one instruction, bump the priority of
       * the other uses so the SSA value can get fully consumed.
       *
       * We don't do this for registers, as it would be a hassle and it's
       * unclear whether it would help or not. Also, skip it for constants, as
       * they're often folded as immediates into backend instructions and have
       * many unrelated instructions all referencing the same value (0).
       */
      if (src->ssa->parent_instr->type != nir_instr_type_load_const) {
         nir_foreach_use(other_src, src->ssa) {
            if (nir_src_parent_instr(other_src) == nir_src_parent_instr(src))
               continue;

            nir_schedule_node *n =
               nir_schedule_get_node(scoreboard->instr_map,
                                     nir_src_parent_instr(other_src));

            if (n && !n->partially_evaluated_path) {
               if (debug) {
                  fprintf(stderr, " New partially evaluated path: ");
                  nir_print_instr(n->instr, stderr);
                  fprintf(stderr, "\n");
               }

               n->partially_evaluated_path = true;
            }
         }
      }
   }

   nir_schedule_mark_use(scoreboard,
                         (void *)src->ssa,
                         nir_src_parent_instr(src),
                         nir_schedule_src_pressure(src));

   return true;
}

static bool
nir_schedule_mark_def_scheduled(nir_def *def, void *state)
{
   nir_schedule_scoreboard *scoreboard = state;

   nir_schedule_mark_use(scoreboard, def, def->parent_instr,
                         nir_schedule_def_pressure(def));

   return true;
}

static void
nir_schedule_mark_load_reg_scheduled(nir_intrinsic_instr *load,
                                     nir_schedule_scoreboard *scoreboard)
{
   assert(nir_is_load_reg(load));
   nir_def *reg = load->src[0].ssa;

   if (load->intrinsic == nir_intrinsic_load_reg_indirect)
      nir_schedule_mark_src_scheduled(&load->src[1], scoreboard);

   nir_schedule_mark_use(scoreboard, reg, &load->instr,
                         nir_schedule_reg_pressure(reg));

   nir_schedule_mark_def_scheduled(&load->def, scoreboard);
}

static void
nir_schedule_mark_store_reg_scheduled(nir_intrinsic_instr *store,
                                      nir_schedule_scoreboard *scoreboard)
{
   assert(nir_is_store_reg(store));
   nir_def *reg = store->src[1].ssa;

   nir_schedule_mark_src_scheduled(&store->src[0], scoreboard);
   if (store->intrinsic == nir_intrinsic_store_reg_indirect)
      nir_schedule_mark_src_scheduled(&store->src[2], scoreboard);

   /* XXX: This is not actually accurate for regs -- the last use of a reg may
    * have a live interval that extends across control flow. We should
    * calculate the live ranges of regs, and have scheduler nodes for the CF
    * nodes that also "use" the reg.
    */
   nir_schedule_mark_use(scoreboard, reg, &store->instr,
                         nir_schedule_reg_pressure(reg));
}

static bool
nir_schedule_mark_reg_intrin_scheduled(nir_instr *instr,
                                       nir_schedule_scoreboard *scoreboard)
{
   if (instr->type != nir_instr_type_intrinsic)
      return false;

   nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
   switch (intrin->intrinsic) {
   case nir_intrinsic_decl_reg:
      return true; /* Handled but nothing to do */

   case nir_intrinsic_load_reg:
   case nir_intrinsic_load_reg_indirect:
      nir_schedule_mark_load_reg_scheduled(intrin, scoreboard);
      return true;

   case nir_intrinsic_store_reg:
   case nir_intrinsic_store_reg_indirect:
      nir_schedule_mark_store_reg_scheduled(intrin, scoreboard);
      return true;

   default:
      return false;
   }
}

static void
nir_schedule_mark_node_scheduled(nir_schedule_scoreboard *scoreboard,
                                 nir_schedule_node *n)
{
   if (!nir_schedule_mark_reg_intrin_scheduled(n->instr, scoreboard)) {
      nir_foreach_src(n->instr, nir_schedule_mark_src_scheduled, scoreboard);
      nir_foreach_def(n->instr, nir_schedule_mark_def_scheduled, scoreboard);
   }

   util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
      nir_schedule_node *child = (nir_schedule_node *)edge->child;

      child->ready_time = MAX2(child->ready_time,
                               scoreboard->time + n->delay);

      if (child->dag.parent_count == 1) {
         if (debug) {
            fprintf(stderr, " New DAG head: ");
            nir_print_instr(child->instr, stderr);
            fprintf(stderr, "\n");
         }
      }
   }

   dag_prune_head(scoreboard->dag, &n->dag);

   scoreboard->time = MAX2(n->ready_time, scoreboard->time);
   scoreboard->time++;
}

static void
nir_schedule_instructions(nir_schedule_scoreboard *scoreboard, nir_block *block)
{
   while (!list_is_empty(&scoreboard->dag->heads)) {
      if (debug) {
         fprintf(stderr, "current list:\n");
         dump_state(scoreboard);
      }

      nir_schedule_node *chosen;
      if (scoreboard->options->fallback)
         chosen = nir_schedule_choose_instruction_fallback(scoreboard);
      else if (scoreboard->pressure < scoreboard->options->threshold)
         chosen = nir_schedule_choose_instruction_csp(scoreboard);
      else
         chosen = nir_schedule_choose_instruction_csr(scoreboard);

      /* Now that we've scheduled a new instruction, some of its children may
       * be promoted to the list of instructions ready to be scheduled.
       */
      nir_schedule_mark_node_scheduled(scoreboard, chosen);

      /* Move the instruction to the end (so our first chosen instructions are
       * the start of the program).
       */
      exec_node_remove(&chosen->instr->node);
      exec_list_push_tail(&block->instr_list, &chosen->instr->node);

      if (debug)
         fprintf(stderr, "\n");
   }
}

static uint32_t
nir_schedule_get_delay(nir_schedule_scoreboard *scoreboard, nir_instr *instr)
{
   if (scoreboard->options->instr_delay_cb) {
      void *cb_data = scoreboard->options->instr_delay_cb_data;
      return scoreboard->options->instr_delay_cb(instr, cb_data);
   }

   switch (instr->type) {
   case nir_instr_type_undef:
   case nir_instr_type_load_const:
   case nir_instr_type_alu:
   case nir_instr_type_deref:
   case nir_instr_type_jump:
   case nir_instr_type_parallel_copy:
   case nir_instr_type_call:
   case nir_instr_type_phi:
   case nir_instr_type_debug_info:
      return 1;

   case nir_instr_type_intrinsic:
      switch (nir_instr_as_intrinsic(instr)->intrinsic) {
      case nir_intrinsic_decl_reg:
      case nir_intrinsic_load_reg:
      case nir_intrinsic_store_reg:
         return 0;
      case nir_intrinsic_load_ubo:
      case nir_intrinsic_load_ssbo:
      case nir_intrinsic_load_scratch:
      case nir_intrinsic_load_shared:
      case nir_intrinsic_image_load:
         return 50;
      default:
         return 1;
      }
      break;

   case nir_instr_type_tex:
      /* Pick some large number to try to fetch textures early and sample them
       * late.
       */
      return 100;
   }

   return 0;
}
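
/* Backends with better latency information can override the table above via
 * options->instr_delay_cb. A minimal sketch of such a callback (illustrative
 * only, with made-up numbers):
 *
 *    static unsigned
 *    my_backend_instr_delay_cb(nir_instr *instr, void *data)
 *    {
 *       return instr->type == nir_instr_type_tex ? 200 : 1;
 *    }
 */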

static void
nir_schedule_dag_max_delay_cb(struct dag_node *node, void *state)
{
   nir_schedule_node *n = (nir_schedule_node *)node;
   uint32_t max_delay = 0;

   util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
      nir_schedule_node *child = (nir_schedule_node *)edge->child;
      max_delay = MAX2(child->max_delay, max_delay);
   }

   n->max_delay = MAX2(n->max_delay, max_delay + n->delay);
}
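
/* With the default delays, a chain of tex -> alu -> store_output therefore
 * ends up with max_delay values of roughly 102, 2 and 1 respectively, which
 * is what pulls the texture fetch toward the front of the block under the
 * CSP heuristic.
 */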

static void
nir_schedule_block(nir_schedule_scoreboard *scoreboard, nir_block *block)
{
   void *mem_ctx = ralloc_context(NULL);
   scoreboard->instr_map = _mesa_pointer_hash_table_create(mem_ctx);

   scoreboard->dag = dag_create(mem_ctx);

   nir_foreach_instr(instr, block) {
      nir_schedule_node *n =
         rzalloc(mem_ctx, nir_schedule_node);

      n->instr = instr;
      n->delay = nir_schedule_get_delay(scoreboard, instr);
      dag_init_node(scoreboard->dag, &n->dag);

      _mesa_hash_table_insert(scoreboard->instr_map, instr, n);
   }

   calculate_forward_deps(scoreboard, block);
   calculate_reverse_deps(scoreboard, block);

   dag_traverse_bottom_up(scoreboard->dag, nir_schedule_dag_max_delay_cb, NULL);

   nir_schedule_instructions(scoreboard, block);

   ralloc_free(mem_ctx);
   scoreboard->instr_map = NULL;
}

static bool
is_decl_reg(nir_instr *instr)
{
   if (instr->type != nir_instr_type_intrinsic)
      return false;

   nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
   return intrin->intrinsic == nir_intrinsic_decl_reg;
}

static bool
nir_schedule_ssa_def_init_scoreboard(nir_def *def, void *state)
{
   nir_schedule_scoreboard *scoreboard = state;
   struct set *def_uses = _mesa_pointer_set_create(scoreboard);

   _mesa_hash_table_insert(scoreboard->remaining_uses, def, def_uses);

   /* We don't consider decl_reg to be a use to avoid extending register live
    * ranges any further than needed.
    */
   if (!is_decl_reg(def->parent_instr))
      _mesa_set_add(def_uses, def->parent_instr);

   nir_foreach_use(src, def) {
      _mesa_set_add(def_uses, nir_src_parent_instr(src));
   }

   /* XXX: Handle if uses */

   return true;
}

static nir_schedule_scoreboard *
nir_schedule_get_scoreboard(nir_shader *shader,
                            const nir_schedule_options *options)
{
   nir_schedule_scoreboard *scoreboard = rzalloc(NULL, nir_schedule_scoreboard);

   scoreboard->shader = shader;
   scoreboard->live_values = _mesa_pointer_set_create(scoreboard);
   scoreboard->remaining_uses = _mesa_pointer_hash_table_create(scoreboard);
   scoreboard->options = options;
   scoreboard->pressure = 0;

   nir_foreach_function_impl(impl, shader) {
      nir_foreach_block(block, impl) {
         nir_foreach_instr(instr, block) {
            nir_foreach_def(instr, nir_schedule_ssa_def_init_scoreboard,
                            scoreboard);
         }

         /* XXX: We're ignoring if uses, which may prioritize scheduling other
          * uses of the if src even when it doesn't help. That's not many
          * values, though, so meh.
          */
      }
   }

   return scoreboard;
}

static void
nir_schedule_validate_uses(nir_schedule_scoreboard *scoreboard)
{
#ifdef NDEBUG
   return;
#endif

   bool any_uses = false;

   hash_table_foreach(scoreboard->remaining_uses, entry) {
      struct set *remaining_uses = entry->data;

      set_foreach(remaining_uses, instr_entry) {
         if (!any_uses) {
            fprintf(stderr, "Tracked uses remain after scheduling. "
                            "Affected instructions: \n");
            any_uses = true;
         }
         nir_print_instr(instr_entry->key, stderr);
         fprintf(stderr, "\n");
      }
   }

   assert(!any_uses);
}

/**
 * Schedules the NIR instructions to try to decrease stalls (for example,
 * delaying texture reads) while managing register pressure.
 *
 * The threshold represents "number of NIR register/SSA def channels live
 * before switching the scheduling heuristic to reduce register pressure",
 * since most of our GPU architectures are scalar (extending to vector with a
 * flag wouldn't be hard). This number should be a bit below the number of
 * registers available (counting any that may be occupied by system value
 * payload values, for example), since the heuristic may not always be able to
 * free a register immediately. The amount below the limit is up to you to
 * tune.
 */
void
nir_schedule(nir_shader *shader,
             const nir_schedule_options *options)
{
   nir_schedule_scoreboard *scoreboard = nir_schedule_get_scoreboard(shader,
                                                                     options);

   if (debug) {
      fprintf(stderr, "NIR shader before scheduling:\n");
      nir_print_shader(shader, stderr);
   }

   nir_foreach_function_impl(impl, shader) {
      nir_foreach_block(block, impl) {
         nir_schedule_block(scoreboard, block);
      }
   }

   nir_schedule_validate_uses(scoreboard);

   ralloc_free(scoreboard);
}