/*
 * Copyright (C) 2020 Collabora Ltd.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors (Collabora):
 *    Alyssa Rosenzweig <[email protected]>
 */

#include "util/u_memory.h"
#include "bi_builder.h"
#include "compiler.h"
#include "nodearray.h"

struct lcra_state {
   unsigned node_count;
   uint64_t *affinity;

   /* Linear constraints imposed. For each node there is a 'nodearray'
    * structure, which changes between a sparse and a dense array depending
    * on the number of elements.
    *
    * Each element is itself a bit field denoting whether a (c_j - c_i) bias
    * is present or not, including negative biases.
    *
    * We support up to 8 components, so the bias is in the range [-7, 7],
    * encoded by a 16-bit field.
    */
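   /*
    * Concretely, if bit (d + 7) is set in the entry for node j within
    * linear[i], then placements with c_i - c_j == d are forbidden;
    * lcra_test_linear() below performs exactly this check.
    */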
   nodearray *linear;

   /* Before solving, forced registers; after solving, solutions. */
   unsigned *solutions;

   /** Node which caused register allocation to fail */
   unsigned spill_node;
};

/* This module is an implementation of "Linearly Constrained
 * Register Allocation". The paper is available in PDF form
 * (https://people.collabora.com/~alyssa/LCRA.pdf) as well as Markdown+LaTeX
 * (https://gitlab.freedesktop.org/alyssa/lcra/blob/master/LCRA.md)
 */

static struct lcra_state *
lcra_alloc_equations(unsigned node_count)
{
   struct lcra_state *l = calloc(1, sizeof(*l));

   l->node_count = node_count;

   l->linear = calloc(sizeof(l->linear[0]), node_count);
   l->solutions = calloc(sizeof(l->solutions[0]), node_count);
   l->affinity = calloc(sizeof(l->affinity[0]), node_count);

   memset(l->solutions, ~0, sizeof(l->solutions[0]) * node_count);

   return l;
}

static void
lcra_free(struct lcra_state *l)
{
   for (unsigned i = 0; i < l->node_count; ++i)
      nodearray_reset(&l->linear[i]);

   free(l->linear);
   free(l->affinity);
   free(l->solutions);
   free(l);
}

static void
lcra_add_node_interference(struct lcra_state *l, unsigned i, unsigned cmask_i,
                           unsigned j, unsigned cmask_j)
{
   if (i == j)
      return;

   nodearray_value constraint_fw = 0;
   nodearray_value constraint_bw = 0;

   /* The constraint bits are reversed from lcra.c so that register
    * allocation can be done in parallel for every possible solution,
    * with lower-order bits representing smaller registers. */
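
   /* Worked example: if node i writes components {0, 1} (cmask_i == 0x3) and
    * node j writes component {0} (cmask_j == 0x1), the loop below sets bits 7
    * and 8 in constraint_fw and bits 6 and 7 in constraint_bw, forbidding
    * c_j - c_i in {0, 1} and, equivalently, c_i - c_j in {-1, 0}. */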

   for (unsigned D = 0; D < 8; ++D) {
      if (cmask_i & (cmask_j << D)) {
         constraint_fw |= (1 << (7 + D));
         constraint_bw |= (1 << (7 - D));
      }

      if (cmask_i & (cmask_j >> D)) {
         constraint_bw |= (1 << (7 + D));
         constraint_fw |= (1 << (7 - D));
      }
   }

   /* Use dense arrays after adding 256 elements */
   nodearray_orr(&l->linear[j], i, constraint_fw, 256, l->node_count);
   nodearray_orr(&l->linear[i], j, constraint_bw, 256, l->node_count);
}

static bool
lcra_test_linear(struct lcra_state *l, unsigned *solutions, unsigned i)
{
   signed constant = solutions[i];

   if (nodearray_is_sparse(&l->linear[i])) {
      nodearray_sparse_foreach(&l->linear[i], elem) {
         unsigned j = nodearray_sparse_key(elem);
         nodearray_value constraint = nodearray_sparse_value(elem);

         if (solutions[j] == ~0)
            continue;

         signed lhs = constant - solutions[j];

         if (lhs < -7 || lhs > 7)
            continue;

         if (constraint & (1 << (lhs + 7)))
            return false;
      }

      return true;
   }

   nodearray_value *row = l->linear[i].dense;

   for (unsigned j = 0; j < l->node_count; ++j) {
      if (solutions[j] == ~0)
         continue;

      signed lhs = constant - solutions[j];

      if (lhs < -7 || lhs > 7)
         continue;

      if (row[j] & (1 << (lhs + 7)))
         return false;
   }

   return true;
}

static bool
lcra_solve(struct lcra_state *l)
{
   for (unsigned step = 0; step < l->node_count; ++step) {
      if (l->solutions[step] != ~0)
         continue;
      if (l->affinity[step] == 0)
         continue;

      bool succ = false;

      u_foreach_bit64(r, l->affinity[step]) {
         l->solutions[step] = r;

         if (lcra_test_linear(l, l->solutions, step)) {
            succ = true;
            break;
         }
      }

      /* Out of registers - prepare to spill */
      if (!succ) {
         l->spill_node = step;
         return false;
      }
   }

   return true;
}

/* Register spilling is implemented with a cost-benefit system. Costs are set
 * by the user. Benefits are calculated from the constraints. */
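
/* Specifically, the benefit of a node is the number of constraint bits it
 * imposes, as counted by lcra_count_constraints() below: spilling a heavily
 * constrained node frees the most conflicting placements. */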

static unsigned
lcra_count_constraints(struct lcra_state *l, unsigned i)
{
   unsigned count = 0;
   nodearray *constraints = &l->linear[i];

   if (nodearray_is_sparse(constraints)) {
      nodearray_sparse_foreach(constraints, elem)
         count += util_bitcount(nodearray_sparse_value(elem));
   } else {
      nodearray_dense_foreach_64(constraints, elem)
         count += util_bitcount64(*elem);
   }

   return count;
}

/* Liveness analysis is a backwards-may dataflow analysis pass. Within a block,
 * we compute live_in from live_out. The intrablock pass is linear-time. The
 * per-block update returns whether progress was made. */

static void
bi_liveness_ins_update_ra(uint8_t *live, bi_instr *ins)
{
   /* live_in[s] = GEN[s] + (live_out[s] - KILL[s]) */

   bi_foreach_dest(ins, d) {
      live[ins->dest[d].value] &= ~bi_writemask(ins, d);
   }

   bi_foreach_ssa_src(ins, src) {
      unsigned count = bi_count_read_registers(ins, src);
      unsigned rmask = BITFIELD_MASK(count);

      live[ins->src[src].value] |= (rmask << ins->src[src].offset);
   }
}

static bool
liveness_block_update(bi_block *blk, unsigned temp_count)
{
   bool progress = false;

   /* live_out[s] = sum { p in succ[s] } ( live_in[p] ) */
   bi_foreach_successor(blk, succ) {
      for (unsigned i = 0; i < temp_count; ++i)
         blk->live_out[i] |= succ->live_in[i];
   }

   uint8_t *live = ralloc_array(blk, uint8_t, temp_count);
   memcpy(live, blk->live_out, temp_count);

   bi_foreach_instr_in_block_rev(blk, ins)
      bi_liveness_ins_update_ra(live, ins);

   /* To figure out progress, diff live_in */

   for (unsigned i = 0; (i < temp_count) && !progress; ++i)
      progress |= (blk->live_in[i] != live[i]);

   ralloc_free(blk->live_in);
   blk->live_in = live;

   return progress;
}

/* Globally, liveness analysis uses a fixed-point algorithm based on a
 * worklist. We initialize the work list with every block. We iterate the work
 * list to compute live_in from live_out for each block on the work list,
 * adding the predecessors of the block to the work list if we made progress.
 */

static void
bi_compute_liveness_ra(bi_context *ctx)
{
   u_worklist worklist;
   bi_worklist_init(ctx, &worklist);

   bi_foreach_block(ctx, block) {
      if (block->live_in)
         ralloc_free(block->live_in);

      if (block->live_out)
         ralloc_free(block->live_out);

      block->live_in = rzalloc_array(block, uint8_t, ctx->ssa_alloc);
      block->live_out = rzalloc_array(block, uint8_t, ctx->ssa_alloc);

      bi_worklist_push_tail(&worklist, block);
   }

   while (!u_worklist_is_empty(&worklist)) {
      /* Pop off in reverse order since liveness is backwards */
      bi_block *blk = bi_worklist_pop_tail(&worklist);

      /* Update liveness information. If we made progress, we need to
       * reprocess the predecessors
       */
      if (liveness_block_update(blk, ctx->ssa_alloc)) {
         bi_foreach_predecessor(blk, pred)
            bi_worklist_push_head(&worklist, *pred);
      }
   }

   u_worklist_fini(&worklist);
}

/* Construct an affinity mask such that a vector with `count` elements does
 * not intersect any of the registers in the bitset `clobber`. In other words,
 * the allocated base register a must satisfy a + i != b for each i < count
 * and each b in the clobber set B. Equivalently a != b - i, so a must avoid
 * the set { b - i : i < count } for every b in B, i.e. the union over b in B
 * of { b - i : i < count }. That union may be written equivalently as the
 * union over i < count of (B - i), where subtraction is defined elementwise
 * and corresponds to a shift of the entire bitset.
 *
 * EVEN_BITS_MASK is an affinity mask for aligned register pairs. Interpreted
 * as a bit set, it is { x : 0 <= x < 64 if x is even }
 */
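
/* For example, if `clobber` has only bit 4 set (r4) and count = 2, the loop in
 * bi_make_affinity() marks bits 3 and 4 as clobbered: a two-register vector
 * based at r3 or r4 would cover r4, so neither base may be used. */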

#define EVEN_BITS_MASK (0x5555555555555555ull)

static uint64_t
bi_make_affinity(uint64_t clobber, unsigned count, bool split_file)
{
   uint64_t clobbered = 0;

   for (unsigned i = 0; i < count; ++i)
      clobbered |= (clobber >> i);

   /* Don't allocate past the end of the register file */
   if (count > 1) {
      unsigned excess = count - 1;
      uint64_t mask = BITFIELD_MASK(excess);
      clobbered |= mask << (64 - excess);

      if (split_file)
         clobbered |= mask << (16 - excess);
   }

   /* Don't allocate the middle if we split out the middle */
   if (split_file)
      clobbered |= BITFIELD64_MASK(32) << 16;

   /* We can use a register iff it's not clobbered */
   return ~clobbered;
}

static void
bi_mark_interference(bi_block *block, struct lcra_state *l, uint8_t *live,
                     uint64_t preload_live, unsigned node_count, bool is_blend,
                     bool split_file, bool aligned_sr)
{
   bi_foreach_instr_in_block_rev(block, ins) {
      /* Mark all registers live after the instruction as
       * interfering with the destination */

      bi_foreach_dest(ins, d) {
         unsigned node = ins->dest[d].value;

         /* Don't allocate to anything that's read later as a
          * preloaded register. The affinity is the intersection
          * of affinity masks for each write. Since writes have
          * offsets, but the affinity is for the whole node, we
          * need to offset the affinity opposite the write
          * offset, so we shift right. */
         unsigned count = bi_count_write_registers(ins, d);
         unsigned offset = ins->dest[d].offset;
         uint64_t affinity =
            bi_make_affinity(preload_live, count, split_file) >> offset;
         /* Valhall needs >= 64-bit staging writes to be pair-aligned */
         if (aligned_sr && (count >= 2 || offset))
            affinity &= EVEN_BITS_MASK;

         l->affinity[node] &= affinity;

         for (unsigned i = 0; i < node_count; ++i) {
            uint8_t r = live[i];

            /* Nodes only interfere if they occupy
             * /different values/ at the same time
             * (Boissinot). In particular, sources of
             * moves do not interfere with their
             * destinations. This enables a limited form of
             * coalescing.
             */
            if (ins->op == BI_OPCODE_MOV_I32 && bi_is_ssa(ins->src[0]) &&
                i == ins->src[0].value) {

               r &= ~BITFIELD_BIT(ins->src[0].offset);
            }

            if (r) {
               lcra_add_node_interference(l, node, bi_writemask(ins, d), i, r);
            }
         }

         unsigned node_first = ins->dest[0].value;
         if (d == 1) {
            lcra_add_node_interference(l, node, bi_writemask(ins, 1),
                                       node_first, bi_writemask(ins, 0));
         }
      }

      /* Valhall needs >= 64-bit reads to be pair-aligned */
      if (aligned_sr) {
         bi_foreach_ssa_src(ins, s) {
            if (bi_count_read_registers(ins, s) >= 2)
               l->affinity[ins->src[s].value] &= EVEN_BITS_MASK;
         }
      }

      if (!is_blend && ins->op == BI_OPCODE_BLEND) {
         /* Blend shaders might clobber r0-r15, r48. */
         uint64_t clobber = BITFIELD64_MASK(16) | BITFIELD64_BIT(48);

         for (unsigned i = 0; i < node_count; ++i) {
            if (live[i])
               l->affinity[i] &= ~clobber;
         }
      }

      /* Update live_in */
      preload_live = bi_postra_liveness_ins(preload_live, ins);
      bi_liveness_ins_update_ra(live, ins);
   }

   block->reg_live_in = preload_live;
}

static void
bi_compute_interference(bi_context *ctx, struct lcra_state *l, bool full_regs)
{
   bi_compute_liveness_ra(ctx);
   bi_postra_liveness(ctx);

   bi_foreach_block_rev(ctx, blk) {
      uint8_t *live = mem_dup(blk->live_out, ctx->ssa_alloc);

      bi_mark_interference(blk, l, live, blk->reg_live_out, ctx->ssa_alloc,
                           ctx->inputs->is_blend, !full_regs, ctx->arch >= 9);

      free(live);
   }
}

static struct lcra_state *
bi_allocate_registers(bi_context *ctx, bool *success, bool full_regs)
{
   struct lcra_state *l = lcra_alloc_equations(ctx->ssa_alloc);

   /* Blend shaders are restricted to R0-R15. Other shaders at full
    * occupancy can also access R48-R63. At half occupancy they can access
    * the whole file. */

   uint64_t default_affinity =
      ctx->inputs->is_blend ? BITFIELD64_MASK(16)
      : full_regs           ? BITFIELD64_MASK(64)
                            : (BITFIELD64_MASK(16) | (BITFIELD64_MASK(16) << 48));

   /* To test spilling, mimic a small register file */
   if (bifrost_debug & BIFROST_DBG_SPILL && !ctx->inputs->is_blend)
      default_affinity &= BITFIELD64_MASK(48) << 8;

   bi_foreach_instr_global(ctx, ins) {
      bi_foreach_dest(ins, d)
         l->affinity[ins->dest[d].value] = default_affinity;

      /* Blend shaders expect the src colour to be in r0-r3 */
      if (ins->op == BI_OPCODE_BLEND && !ctx->inputs->is_blend) {
         assert(bi_is_ssa(ins->src[0]));
         l->solutions[ins->src[0].value] = 0;

         /* Dual source blend input in r4-r7 */
         if (bi_is_ssa(ins->src[4]))
            l->solutions[ins->src[4].value] = 4;

         /* Writes to R48 */
         if (!bi_is_null(ins->dest[0]))
            l->solutions[ins->dest[0].value] = 48;
      }

      /* Coverage mask writes stay in R60 */
      if ((ins->op == BI_OPCODE_ATEST || ins->op == BI_OPCODE_ZS_EMIT) &&
          !bi_is_null(ins->dest[0])) {
         l->solutions[ins->dest[0].value] = 60;
      }

      /* Experimentally, it seems coverage mask inputs to ATEST must
       * be in R60. Otherwise coverage mask writes do not work with
       * early-ZS with pixel-frequency-shading (this combination of
       * settings is legal if depth/stencil writes are disabled).
       */
      if (ins->op == BI_OPCODE_ATEST) {
         assert(bi_is_ssa(ins->src[0]));
         l->solutions[ins->src[0].value] = 60;
      }
   }

   bi_compute_interference(ctx, l, full_regs);

   /* Coalesce register moves if we're allowed. We need to be careful due
    * to the restricted affinity induced by the blend shader ABI.
    */
   bi_foreach_instr_global(ctx, I) {
      if (I->op != BI_OPCODE_MOV_I32)
         continue;
      if (I->src[0].type != BI_INDEX_REGISTER)
         continue;

      unsigned reg = I->src[0].value;
      unsigned node = I->dest[0].value;

      if (l->solutions[node] != ~0)
         continue;

      uint64_t affinity = l->affinity[node];

      if (ctx->inputs->is_blend) {
         /* We're allowed to coalesce the moves to these */
         affinity |= BITFIELD64_BIT(48);
         affinity |= BITFIELD64_BIT(60);
      }

      /* Try to coalesce */
      if (affinity & BITFIELD64_BIT(reg)) {
         l->solutions[node] = reg;

         if (!lcra_test_linear(l, l->solutions, node))
            l->solutions[node] = ~0;
      }
   }

   *success = lcra_solve(l);

   return l;
}

static bi_index
bi_reg_from_index(bi_context *ctx, struct lcra_state *l, bi_index index)
{
   /* Offsets can only be applied when we register allocated an index, or
    * alternatively for FAU's encoding */

   ASSERTED bool is_offset = (index.offset > 0) && (index.type != BI_INDEX_FAU);

   /* Did we run RA for this index at all */
   if (!bi_is_ssa(index)) {
      assert(!is_offset);
      return index;
   }

   /* LCRA didn't bother solving this index (how lazy!) */
   signed solution = l->solutions[index.value];
   if (solution < 0) {
      assert(!is_offset);
      return index;
   }

   /* todo: do we want to compose with the subword swizzle? */
   bi_index new_index = bi_register(solution + index.offset);
   new_index.swizzle = index.swizzle;
   new_index.abs = index.abs;
   new_index.neg = index.neg;
   return new_index;
}

/* Dual texture instructions write to two sets of staging registers, modeled as
 * two destinations in the IR. The first set is communicated with the usual
 * staging register mechanism. The second set is encoded in the texture
 * operation descriptor. This is quite unusual, and requires the following late
 * fixup.
 */
static void
bi_fixup_dual_tex_register(bi_instr *I)
{
   assert(I->dest[1].type == BI_INDEX_REGISTER);
   assert(I->src[3].type == BI_INDEX_CONSTANT);

   struct bifrost_dual_texture_operation desc = {
      .secondary_register = I->dest[1].value,
   };

   I->src[3].value |= bi_dual_tex_as_u32(desc);
}

static void
bi_install_registers(bi_context *ctx, struct lcra_state *l)
{
   bi_foreach_instr_global(ctx, ins) {
      bi_foreach_dest(ins, d)
         ins->dest[d] = bi_reg_from_index(ctx, l, ins->dest[d]);

      bi_foreach_src(ins, s)
         ins->src[s] = bi_reg_from_index(ctx, l, ins->src[s]);

      if (ins->op == BI_OPCODE_TEXC_DUAL)
         bi_fixup_dual_tex_register(ins);
   }
}

static void
bi_rewrite_index_src_single(bi_instr *ins, bi_index old, bi_index new)
{
   bi_foreach_src(ins, i) {
      if (bi_is_equiv(ins->src[i], old)) {
         ins->src[i].type = new.type;
         ins->src[i].value = new.value;
      }
   }
}

/* If register allocation fails, find the best spill node */

static signed
bi_choose_spill_node(bi_context *ctx, struct lcra_state *l)
{
   /* Pick a node satisfying bi_spill_register's preconditions */
   BITSET_WORD *no_spill =
      calloc(sizeof(BITSET_WORD), BITSET_WORDS(l->node_count));

   bi_foreach_instr_global(ctx, ins) {
      bi_foreach_dest(ins, d) {
         /* Don't allow spilling coverage mask writes because the
          * register preload logic assumes it will stay in R60.
          * This could be optimized.
          */
         if (ins->no_spill || ins->op == BI_OPCODE_ATEST ||
             ins->op == BI_OPCODE_ZS_EMIT ||
             (ins->op == BI_OPCODE_MOV_I32 &&
              ins->src[0].type == BI_INDEX_REGISTER &&
              ins->src[0].value == 60)) {
            BITSET_SET(no_spill, ins->dest[d].value);
         }
      }
   }

   unsigned best_benefit = 0;
   signed best_node = -1;

   if (nodearray_is_sparse(&l->linear[l->spill_node])) {
      nodearray_sparse_foreach(&l->linear[l->spill_node], elem) {
         unsigned i = nodearray_sparse_key(elem);
         unsigned constraint = nodearray_sparse_value(elem);

         /* Only spill nodes that interfere with the node failing
          * register allocation. It's pointless to spill anything else */
         if (!constraint)
            continue;

         if (BITSET_TEST(no_spill, i))
            continue;

         unsigned benefit = lcra_count_constraints(l, i);

         if (benefit > best_benefit) {
            best_benefit = benefit;
            best_node = i;
         }
      }
   } else {
      nodearray_value *row = l->linear[l->spill_node].dense;

      for (unsigned i = 0; i < l->node_count; ++i) {
         /* Only spill nodes that interfere with the node failing
          * register allocation. It's pointless to spill anything else */
         if (!row[i])
            continue;

         if (BITSET_TEST(no_spill, i))
            continue;

         unsigned benefit = lcra_count_constraints(l, i);

         if (benefit > best_benefit) {
            best_benefit = benefit;
            best_node = i;
         }
      }
   }

   free(no_spill);
   return best_node;
}

static unsigned
bi_count_read_index(bi_instr *I, bi_index index)
{
   unsigned max = 0;

   bi_foreach_src(I, s) {
      if (bi_is_equiv(I->src[s], index)) {
         unsigned count = bi_count_read_registers(I, s);
         max = MAX2(max, count + I->src[s].offset);
      }
   }

   return max;
}

/*
 * Wrappers to emit loads/stores to thread-local storage in an appropriate way
 * for the target, so the spill/fill code becomes architecture-independent.
 */
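
/* On Valhall (arch >= 9) these wrappers address TLS through the FAU-provided
 * pointer pair plus an instruction offset; on Bifrost the byte offset is
 * instead passed as an immediate 32-bit source. */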

static bi_index
bi_tls_ptr(bool hi)
{
   return bi_fau(BIR_FAU_TLS_PTR, hi);
}

static bi_instr *
bi_load_tl(bi_builder *b, unsigned bits, bi_index src, unsigned offset)
{
   if (b->shader->arch >= 9) {
      return bi_load_to(b, bits, src, bi_tls_ptr(false), bi_tls_ptr(true),
                        BI_SEG_TL, offset);
   } else {
      return bi_load_to(b, bits, src, bi_imm_u32(offset), bi_zero(), BI_SEG_TL,
                        0);
   }
}

static void
bi_store_tl(bi_builder *b, unsigned bits, bi_index src, unsigned offset)
{
   if (b->shader->arch >= 9) {
      bi_store(b, bits, src, bi_tls_ptr(false), bi_tls_ptr(true), BI_SEG_TL,
               offset);
   } else {
      bi_store(b, bits, src, bi_imm_u32(offset), bi_zero(), BI_SEG_TL, 0);
   }
}

/* Once we've chosen a spill node, spill it and return the number of bytes
 * spilled */

static unsigned
bi_spill_register(bi_context *ctx, bi_index index, uint32_t offset)
{
   bi_builder b = {.shader = ctx};
   unsigned channels = 0;

   /* Spill after every store, fill before every load */
   bi_foreach_instr_global_safe(ctx, I) {
      bi_foreach_dest(I, d) {
         if (!bi_is_equiv(I->dest[d], index))
            continue;

         unsigned extra = I->dest[d].offset;
         bi_index tmp = bi_temp(ctx);

         I->dest[d] = bi_replace_index(I->dest[d], tmp);
         I->no_spill = true;

         unsigned count = bi_count_write_registers(I, d);
         unsigned bits = count * 32;

         b.cursor = bi_after_instr(I);
         bi_store_tl(&b, bits, tmp, offset + 4 * extra);

         ctx->spills++;
         channels = MAX2(channels, extra + count);
      }

      if (bi_has_arg(I, index)) {
         b.cursor = bi_before_instr(I);
         bi_index tmp = bi_temp(ctx);

         unsigned bits = bi_count_read_index(I, index) * 32;
         bi_rewrite_index_src_single(I, index, tmp);

         bi_instr *ld = bi_load_tl(&b, bits, tmp, offset);
         ld->no_spill = true;
         ctx->fills++;
      }
   }

   return (channels * 4);
}

/*
 * For transition, lower collects and splits before RA, rather than after RA.
 * LCRA knows how to deal with offsets (broken SSA), but not how to coalesce
 * these vector moves.
 */
static void
bi_lower_vector(bi_context *ctx, unsigned first_reg)
{
   bi_index *remap = calloc(ctx->ssa_alloc, sizeof(bi_index));

   bi_foreach_instr_global_safe(ctx, I) {
      bi_builder b = bi_init_builder(ctx, bi_after_instr(I));

      if (I->op == BI_OPCODE_SPLIT_I32) {
         bi_index src = I->src[0];
         assert(src.offset == 0);

         bi_foreach_dest(I, i) {
            src.offset = i;
            bi_mov_i32_to(&b, I->dest[i], src);

            if (I->dest[i].value < first_reg)
               remap[I->dest[i].value] = src;
         }

         bi_remove_instruction(I);
      } else if (I->op == BI_OPCODE_COLLECT_I32) {
         bi_index dest = I->dest[0];
         assert(dest.offset == 0);
         assert(((dest.value < first_reg) || I->nr_srcs == 1) &&
                "nir_lower_phis_to_scalar");

         bi_foreach_src(I, i) {
            if (bi_is_null(I->src[i]))
               continue;

            dest.offset = i;
            bi_mov_i32_to(&b, dest, I->src[i]);
         }

         bi_remove_instruction(I);
      }
   }

   bi_foreach_instr_global(ctx, I) {
      bi_foreach_ssa_src(I, s) {
         if (I->src[s].value < first_reg && !bi_is_null(remap[I->src[s].value]))
            bi_replace_src(I, s, remap[I->src[s].value]);
      }
   }

   free(remap);

   /* After generating a pile of moves, clean up */
   bi_compute_liveness_ra(ctx);

   bi_foreach_block_rev(ctx, block) {
      uint8_t *live = rzalloc_array(block, uint8_t, ctx->ssa_alloc);

      bi_foreach_successor(block, succ) {
         for (unsigned i = 0; i < ctx->ssa_alloc; ++i)
            live[i] |= succ->live_in[i];
      }

      bi_foreach_instr_in_block_safe_rev(block, ins) {
         bool all_null = true;

         bi_foreach_dest(ins, d) {
            if (live[ins->dest[d].value] & bi_writemask(ins, d))
               all_null = false;
         }

         if (all_null && !bi_side_effects(ins))
            bi_remove_instruction(ins);
         else
            bi_liveness_ins_update_ra(live, ins);
      }

      ralloc_free(block->live_in);
      block->live_in = live;
   }
}

/*
 * Check if the instruction requires a "tied" operand. Such instructions MUST
 * allocate their source and destination to the same register. This is a
 * constraint on RA, and may require extra moves.
 *
 * In particular, this is the case for Bifrost instructions that both read and
 * write with the staging register mechanism.
 */
static bool
bi_is_tied(const bi_instr *I)
{
   return (I->op == BI_OPCODE_TEXC || I->op == BI_OPCODE_TEXC_DUAL ||
           I->op == BI_OPCODE_ATOM_RETURN_I32 || I->op == BI_OPCODE_AXCHG_I32 ||
           I->op == BI_OPCODE_ACMPXCHG_I32) &&
          !bi_is_null(I->src[0]);
}

/*
 * For transition, coalesce tied operands together, as LCRA knows how to handle
 * non-SSA operands but doesn't know about tied operands.
 *
 * This breaks the SSA form of the program, but that doesn't matter for LCRA.
 */
static void
bi_coalesce_tied(bi_context *ctx)
{
   bi_foreach_instr_global(ctx, I) {
      if (!bi_is_tied(I))
         continue;

      bi_builder b = bi_init_builder(ctx, bi_before_instr(I));
      unsigned n = bi_count_read_registers(I, 0);

      for (unsigned i = 0; i < n; ++i) {
         bi_index dst = I->dest[0], src = I->src[0];

         assert(dst.offset == 0 && src.offset == 0);
         dst.offset = src.offset = i;

         bi_mov_i32_to(&b, dst, src);
      }

      bi_replace_src(I, 0, I->dest[0]);
   }
}

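/* The map stores index + 1 internally so that zero can mean "not yet
 * assigned"; callers see zero-based indices. */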
static unsigned
find_or_allocate_temp(unsigned *map, unsigned value, unsigned *alloc)
{
   if (!map[value])
      map[value] = ++(*alloc);

   assert(map[value]);
   return map[value] - 1;
}

/* Reassigns numbering to get rid of gaps in the indices and to prioritize
 * smaller register classes */

static void
squeeze_index(bi_context *ctx)
{
   unsigned *map = rzalloc_array(ctx, unsigned, ctx->ssa_alloc);
   ctx->ssa_alloc = 0;

   bi_foreach_instr_global(ctx, I) {
      bi_foreach_dest(I, d)
         I->dest[d].value =
            find_or_allocate_temp(map, I->dest[d].value, &ctx->ssa_alloc);

      bi_foreach_ssa_src(I, s)
         I->src[s].value =
            find_or_allocate_temp(map, I->src[s].value, &ctx->ssa_alloc);
   }

   ralloc_free(map);
}

/*
 * Brainless out-of-SSA pass. The eventual goal is to go out-of-SSA after RA and
 * coalesce implicitly with biased colouring in a tree scan allocator. For now,
 * this should be good enough for LCRA.
 */
static unsigned
bi_out_of_ssa(bi_context *ctx)
{
   bi_index zero = bi_fau(BIR_FAU_IMMEDIATE | 0, false);
   unsigned first_reg = ctx->ssa_alloc;

   /* Trivially lower phis */
   bi_foreach_block(ctx, block) {
      bi_foreach_instr_in_block_safe(block, I) {
         if (I->op != BI_OPCODE_PHI)
            break;

         /* Assign a register for the phi */
         bi_index reg = bi_temp(ctx);
         assert(reg.value >= first_reg);

         /* Lower to a move in each predecessor. The destinations
          * cannot interfere so these can be sequentialized
          * in arbitrary order.
          */
         bi_foreach_predecessor(block, pred) {
            bi_builder b = bi_init_builder(ctx, bi_after_block_logical(*pred));
            unsigned i = bi_predecessor_index(block, *pred);

            assert(!I->src[i].abs);
            assert(!I->src[i].neg);
            assert(I->src[i].swizzle == BI_SWIZZLE_H01);

            /* MOV of immediate needs lowering on Valhall */
            if (ctx->arch >= 9 && I->src[i].type == BI_INDEX_CONSTANT)
               bi_iadd_imm_i32_to(&b, reg, zero, I->src[i].value);
            else
               bi_mov_i32_to(&b, reg, I->src[i]);
         }

         /* Replace the phi with a move */
         bi_builder b = bi_init_builder(ctx, bi_before_instr(I));
         bi_mov_i32_to(&b, I->dest[0], reg);
         bi_remove_instruction(I);

         /* Propagate that move within the block. The destination
          * is SSA and the source is not written in this block,
          * so this is legal. The move itself will be DCE'd if
          * possible in the next pass.
          */
         bi_foreach_instr_in_block_rev(block, prop) {
            if (prop->op == BI_OPCODE_PHI)
               break;

            bi_foreach_src(prop, s) {
               if (bi_is_equiv(prop->src[s], I->dest[0])) {
                  bi_replace_src(prop, s, reg);
               }
            }
         }
      }
   }

   /* Try to locally propagate the moves we created. We need to be extra
    * careful because we're not in SSA at this point; as such, this
    * algorithm is quadratic. This will go away when we go out of SSA after
    * RA.
    */
   BITSET_WORD *used =
      calloc(sizeof(BITSET_WORD), BITSET_WORDS(ctx->ssa_alloc));
   BITSET_WORD *multiple_uses =
      calloc(sizeof(BITSET_WORD), BITSET_WORDS(ctx->ssa_alloc));

   bi_foreach_instr_global(ctx, I) {
      bi_foreach_ssa_src(I, s) {
         if (BITSET_TEST(used, I->src[s].value))
            BITSET_SET(multiple_uses, I->src[s].value);
         else
            BITSET_SET(used, I->src[s].value);
      }
   }

   bi_foreach_block(ctx, block) {
      bi_foreach_instr_in_block_safe_rev(block, mov) {
         /* Match "reg = ssa" */
         if (mov->op != BI_OPCODE_MOV_I32)
            continue;
         if (mov->dest[0].type != BI_INDEX_NORMAL)
            continue;
         if (mov->dest[0].value < first_reg)
            continue;
         if (!bi_is_ssa(mov->src[0]))
            continue;
         if (mov->src[0].value >= first_reg)
            continue;
         if (BITSET_TEST(multiple_uses, mov->src[0].value))
            continue;

         bool found = false;

         /* Look locally for the write of the SSA */
         bi_foreach_instr_in_block_rev(block, I) {
            bool bail = false;

            bi_foreach_src(I, s) {
               /* Bail: write-after-read */
               if (bi_is_equiv(I->src[s], mov->dest[0]))
                  bail = true;
            }

            if (bail)
               break;

            bi_foreach_dest(I, d) {
               /* Bail: write-after-write */
               if (bi_is_equiv(I->dest[d], mov->dest[0]))
                  break;

               if (!bi_is_equiv(I->dest[d], mov->src[0]))
                  continue;

               /* We found it, replace */
               I->dest[d] = bi_replace_index(I->dest[d], mov->dest[0]);
               found = true;
               break;
            }

            if (found)
               break;
         }

         if (found)
            bi_remove_instruction(mov);
      }
   }

   free(used);
   free(multiple_uses);
   return first_reg;
}

void
bi_register_allocate(bi_context *ctx)
{
   struct lcra_state *l = NULL;
   bool success = false;

   unsigned iter_count = 1000; /* max iterations */

   /* Number of bytes of memory we've spilled into */
   unsigned spill_count = ctx->info.tls_size;

   if (ctx->arch >= 9)
      va_lower_split_64bit(ctx);

   /* Lower tied operands. SSA is broken from here on. */
   unsigned first_reg = bi_out_of_ssa(ctx);
   bi_lower_vector(ctx, first_reg);
   bi_coalesce_tied(ctx);
   squeeze_index(ctx);

   /* Try with reduced register pressure to improve thread count */
   if (ctx->arch >= 7) {
      l = bi_allocate_registers(ctx, &success, false);

      if (success) {
         ctx->info.work_reg_count = 32;
      } else {
         lcra_free(l);
         l = NULL;
      }
   }

   /* Otherwise, use the full register file and spill until we succeed */
   while (!success && ((iter_count--) > 0)) {
      l = bi_allocate_registers(ctx, &success, true);

      if (success) {
         ctx->info.work_reg_count = 64;
      } else {
         signed spill_node = bi_choose_spill_node(ctx, l);
         lcra_free(l);
         l = NULL;

         if (spill_node == -1)
            unreachable("Failed to choose spill node\n");

         if (ctx->inputs->is_blend)
            unreachable("Blend shaders may not spill");

         /* By default, we use packed TLS addressing on Valhall.
          * We cannot cross 16 byte boundaries with packed TLS
          * addressing. Align to ensure this doesn't happen. This
          * could be optimized a bit.
          */
         if (ctx->arch >= 9)
            spill_count = ALIGN_POT(spill_count, 16);

         spill_count +=
            bi_spill_register(ctx, bi_get_index(spill_node), spill_count);

         /* In case the spill affected an instruction with tied
          * operands, we need to fix up.
          */
         bi_coalesce_tied(ctx);
      }
   }

   assert(success);
   assert(l != NULL);

   ctx->info.tls_size = spill_count;
   bi_install_registers(ctx, l);

   lcra_free(l);
}