/*
 * Copyright © 2021 Valve Corporation
 * SPDX-License-Identifier: MIT
 */

#include "ir3_compiler.h"
#include "ir3_ra.h"
#include "util/ralloc.h"

/* This pass "merges" compatible phi-web SSA values. First, we insert a bunch
 * of parallelcopy's to trivially turn the program into CSSA form. Then we
 * try to "merge" SSA def's into "merge sets" which could be allocated to a
 * single register in order to eliminate copies. First we merge phi nodes,
 * which should always succeed because of the parallelcopy's we inserted, and
 * then we try to coalesce the copies we introduced.
 *
 * The merged registers are used for three purposes:
 *
 * 1. We always use the same pvtmem slot for spilling all SSA defs in each
 * merge set. This prevents us from having to insert memory-to-memory copies
 * in the spiller and makes sure we don't insert unnecessary copies.
 * 2. When two values are live at the same time, part of the same merge
 * set, and they overlap each other in the merge set, they always occupy
 * overlapping physical registers in RA. This reduces register pressure and
 * copies in several important scenarios:
 *    - When sources of a collect are used later by something else, we don't
 *      have to introduce copies.
 *    - We can handle sequences of extracts that "explode" a vector into its
 *      components without any additional copying.
 * 3. We use the merge sets for affinities in register allocation: That is, we
 * try to allocate all the definitions in the same merge set to the
 * same/compatible registers. This helps us e.g. allocate sources of a collect
 * to contiguous registers without too much special code in RA.
 *
 * In a "normal" register allocator, or when spilling, we'd just merge
 * registers in the same merge set to the same register, but with SSA-based
 * register allocation we may have to split the live interval.
 *
 * The implementation is based on "Revisiting Out-of-SSA Translation for
 * Correctness, Code Quality, and Efficiency," and is broadly similar to the
 * implementation in nir_from_ssa, with the twist that we also try to coalesce
 * META_SPLIT and META_COLLECT. This makes this pass more complicated but
 * prevents us from needing to handle these specially in RA and the spiller,
 * which are already complicated enough. This also forces us to implement the
 * value-comparison optimization described in the paper, as without it we
 * wouldn't be able to coalesce META_SPLIT even in the simplest of cases.
 */
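
/* As a schematic illustration of case 2 above (not real IR syntax): given
 *
 *    vec = collect(x, y, z, w)
 *    ...
 *    foo = use(y)
 *
 * coalescing puts x, y, z, w, and vec into a single merge set, with each
 * source placed at the offset of the component it supplies. RA can then give
 * the sources and the collect overlapping registers, so the later use of y
 * reads it in place and no extra mov is needed.
 */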

/* In order to dynamically reconstruct the dominance forest, we need the
 * instructions ordered by a preorder traversal of the dominance tree:
 */

static unsigned
index_instrs(struct ir3_block *block, unsigned index)
{
   foreach_instr (instr, &block->instr_list)
      instr->ip = index++;

   for (unsigned i = 0; i < block->dom_children_count; i++)
      index = index_instrs(block->dom_children[i], index);

   return index;
}

void
ir3_index_instrs_for_merge_sets(struct ir3 *ir)
{
   index_instrs(ir3_start_block(ir), 0);
}

/* Definitions within a merge set are ordered by instr->ip as set above: */

static bool
def_after(struct ir3_register *a, struct ir3_register *b)
{
   return a->instr->ip > b->instr->ip;
}

static bool
def_dominates(struct ir3_register *a, struct ir3_register *b)
{
   if (def_after(a, b)) {
      return false;
   } else if (a->instr->block == b->instr->block) {
      return def_after(b, a);
   } else {
      return ir3_block_dominates(a->instr->block, b->instr->block);
   }
}

/* This represents a region inside a register. The offset is relative to the
 * start of the register, and offset + size <= size(reg).
 */
struct def_value {
   struct ir3_register *reg;
   unsigned offset, size;
};

/* Chase any copies to get the source of a region inside a register. This is
 * Value(a) in the paper.
 */
static struct def_value
chase_copies(struct def_value value)
{
   while (true) {
      struct ir3_instruction *instr = value.reg->instr;
      if (instr->opc == OPC_META_SPLIT) {
         value.offset += instr->split.off * reg_elem_size(value.reg);
         value.reg = instr->srcs[0]->def;
      } else if (instr->opc == OPC_META_COLLECT) {
         if (value.offset % reg_elem_size(value.reg) != 0 ||
             value.size > reg_elem_size(value.reg) ||
             value.offset + value.size > reg_size(value.reg))
            break;
         struct ir3_register *src =
            instr->srcs[value.offset / reg_elem_size(value.reg)];
         if (!src->def)
            break;
         value.offset = 0;
         value.reg = src->def;
      } else {
         /* TODO: parallelcopy */
         break;
      }
   }

   return value;
}

/* This represents an entry in the merge set, and consists of a register +
 * offset from the merge set base.
 */
struct merge_def {
   struct ir3_register *reg;
   unsigned offset;
};

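/* Check whether we can skip the full interference test for two entries of a
 * candidate merged set: either their regions don't overlap at all, or one
 * contains the other and the overlapping region chases back (through
 * split/collect copies) to the same underlying value.
 */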
static bool
can_skip_interference(const struct merge_def *a, const struct merge_def *b)
{
   unsigned a_start = a->offset;
   unsigned b_start = b->offset;
   unsigned a_end = a_start + reg_size(a->reg);
   unsigned b_end = b_start + reg_size(b->reg);

   /* Registers that don't overlap never interfere */
   if (a_end <= b_start || b_end <= a_start)
      return true;

   /* Disallow skipping interference unless one definition contains the
    * other. This restriction is important for register allocation, because
    * it means that at any given point in the program, the live values in a
    * given merge set will form a tree. If they didn't, then one live value
    * would partially overlap another, and they would have overlapping live
    * ranges because they're live at the same point. This simplifies register
    * allocation and spilling.
    */
   if (!((a_start <= b_start && a_end >= b_end) ||
         (b_start <= a_start && b_end >= a_end)))
      return false;

   /* For each register, chase the intersection of a and b to find the
    * ultimate source.
    */
   unsigned start = MAX2(a_start, b_start);
   unsigned end = MIN2(a_end, b_end);
   struct def_value a_value = chase_copies((struct def_value){
      .reg = a->reg,
      .offset = start - a_start,
      .size = end - start,
   });
   struct def_value b_value = chase_copies((struct def_value){
      .reg = b->reg,
      .offset = start - b_start,
      .size = end - start,
   });
   return a_value.reg == b_value.reg && a_value.offset == b_value.offset;
}

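/* Return the merge set containing def, creating a fresh singleton set for it
 * if it doesn't belong to one yet.
 */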
static struct ir3_merge_set *
get_merge_set(struct ir3_register *def)
{
   if (def->merge_set)
      return def->merge_set;

   struct ir3_merge_set *set = ralloc(def, struct ir3_merge_set);
   set->preferred_reg = ~0;
   set->interval_start = ~0;
   set->spill_slot = ~0;
   set->size = reg_size(def);
   set->alignment = (def->flags & IR3_REG_HALF) ? 1 : 2;
   set->regs_count = 1;
   set->regs = ralloc(set, struct ir3_register *);
   set->regs[0] = def;

   return set;
}

/* Merges b into a */
static struct ir3_merge_set *
merge_merge_sets(struct ir3_merge_set *a, struct ir3_merge_set *b, int b_offset)
{
   if (b_offset < 0)
      return merge_merge_sets(b, a, -b_offset);

   struct ir3_register **new_regs =
      rzalloc_array(a, struct ir3_register *, a->regs_count + b->regs_count);

   unsigned a_index = 0, b_index = 0, new_index = 0;
   for (; a_index < a->regs_count || b_index < b->regs_count; new_index++) {
      if (b_index < b->regs_count &&
          (a_index == a->regs_count ||
           def_after(a->regs[a_index], b->regs[b_index]))) {
         new_regs[new_index] = b->regs[b_index++];
         new_regs[new_index]->merge_set_offset += b_offset;
      } else {
         new_regs[new_index] = a->regs[a_index++];
      }
      new_regs[new_index]->merge_set = a;
   }

   assert(new_index == a->regs_count + b->regs_count);

   /* Technically this should be the LCM, but since the only alignments so
    * far are 1 and 2, taking the MAX2 is equivalent.
    */
   a->alignment = MAX2(a->alignment, b->alignment);
   a->regs_count += b->regs_count;
   ralloc_free(a->regs);
   a->regs = new_regs;
   a->size = MAX2(a->size, b->size + b_offset);

   return a;
}

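/* Check whether merging set b into set a, with b's defs placed b_offset past
 * a's base, would make two simultaneously-live defs conflict. This walks the
 * defs of both sets in dominance-tree preorder while maintaining a stack of
 * dominating defs (the dominance forest from the paper), and does a liveness
 * check for every pair that can_skip_interference() can't rule out.
 */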
static bool
merge_sets_interfere(struct ir3_liveness *live, struct ir3_merge_set *a,
                     struct ir3_merge_set *b, int b_offset)
{
   if (b_offset < 0)
      return merge_sets_interfere(live, b, a, -b_offset);

   struct merge_def dom[a->regs_count + b->regs_count];
   unsigned a_index = 0, b_index = 0;
   int dom_index = -1;

   /* Reject trying to merge the sets if the alignment doesn't work out */
   if (b_offset % a->alignment != 0)
      return true;

   while (a_index < a->regs_count || b_index < b->regs_count) {
      struct merge_def current;
      if (a_index == a->regs_count) {
         current.reg = b->regs[b_index];
         current.offset = current.reg->merge_set_offset + b_offset;
         b_index++;
      } else if (b_index == b->regs_count) {
         current.reg = a->regs[a_index];
         current.offset = current.reg->merge_set_offset;
         a_index++;
      } else {
         if (def_after(b->regs[b_index], a->regs[a_index])) {
            current.reg = a->regs[a_index];
            current.offset = current.reg->merge_set_offset;
            a_index++;
         } else {
            current.reg = b->regs[b_index];
            current.offset = current.reg->merge_set_offset + b_offset;
            b_index++;
         }
      }

      while (dom_index >= 0 &&
             !def_dominates(dom[dom_index].reg, current.reg)) {
         dom_index--;
      }

      /* TODO: in the original paper, just dom[dom_index] needs to be
       * checked for interference. We implement the value-chasing extension
       * as well as support for sub-registers, which complicates this
       * significantly because it's no longer the case that if a dominates b
       * dominates c and a and b don't interfere then we only need to check
       * interference between b and c to be sure a and c don't interfere --
       * this means we may have to check for interference against values
       * higher in the stack than dom[dom_index]. In the paper there's a
       * description of a way to do fewer interference tests with the
       * value-chasing extension, but we'd have to come up with something
       * ourselves for handling the similar problems that come up with
       * allowing values to contain subregisters. For now we just test
       * everything in the stack.
       */
      for (int i = 0; i <= dom_index; i++) {
         if (can_skip_interference(&current, &dom[i]))
            continue;

         /* Ok, now we actually have to check interference. Since we know
          * that dom[i] dominates current, this boils down to checking
          * whether dom[i] is live after current.
          */
         if (ir3_def_live_after(live, dom[i].reg, current.reg->instr))
            return true;
      }

      dom[++dom_index] = current;
   }

   return false;
}

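/* Try to place def b at b_offset (in the same units as reg_size() and
 * merge_set_offset) relative to def a by merging their sets, backing off if
 * the merged sets would interfere.
 */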
static void
try_merge_defs(struct ir3_liveness *live, struct ir3_register *a,
               struct ir3_register *b, unsigned b_offset)
{
   struct ir3_merge_set *a_set = get_merge_set(a);
   struct ir3_merge_set *b_set = get_merge_set(b);

   if (a_set == b_set) {
      /* Note: Even in this case we may not be able to coalesce the copy if
       * the offsets don't line up, but either way there's nothing more we
       * can do here.
       */
      return;
   }

   int b_set_offset = a->merge_set_offset + b_offset - b->merge_set_offset;

   if (!merge_sets_interfere(live, a_set, b_set, b_set_offset))
      merge_merge_sets(a_set, b_set, b_set_offset);
}

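/* Merge a's and b's sets unconditionally, without an interference check. The
 * caller is responsible for guaranteeing that the merge is safe.
 */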
void
ir3_force_merge(struct ir3_register *a, struct ir3_register *b, int b_offset)
{
   struct ir3_merge_set *a_set = get_merge_set(a);
   struct ir3_merge_set *b_set = get_merge_set(b);

   if (a_set == b_set)
      return;

   int b_set_offset = a->merge_set_offset + b_offset - b->merge_set_offset;
   merge_merge_sets(a_set, b_set, b_set_offset);
}

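/* Merge a phi's dst with each of its defined sources, which after
 * ir3_create_parallel_copies() are the dsts of the inserted parallel copies.
 */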
static void
coalesce_phi(struct ir3_liveness *live, struct ir3_instruction *phi)
{
   for (unsigned i = 0; i < phi->srcs_count; i++) {
      if (phi->srcs[i]->def)
         try_merge_defs(live, phi->dsts[0], phi->srcs[i]->def, 0);
   }
}

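/* Try to merge each parallel copy dst with its corresponding SSA src, so the
 * copy can later be eliminated.
 */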
static void
aggressive_coalesce_parallel_copy(struct ir3_liveness *live,
                                  struct ir3_instruction *pcopy)
{
   for (unsigned i = 0; i < pcopy->dsts_count; i++) {
      if (!(pcopy->srcs[i]->flags & IR3_REG_SSA))
         continue;
      try_merge_defs(live, pcopy->dsts[i], pcopy->srcs[i]->def, 0);
   }
}

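/* Try to merge a split dst into its source vector at the offset of the
 * component it extracts.
 */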
static void
aggressive_coalesce_split(struct ir3_liveness *live,
                          struct ir3_instruction *split)
{
   if (!(split->dsts[0]->flags & IR3_REG_SSA))
      return;
   try_merge_defs(live, split->srcs[0]->def, split->dsts[0],
                  split->split.off * reg_elem_size(split->dsts[0]));
}

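/* Try to merge each collect source into the collect dst at the offset of the
 * component it provides.
 */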
static void
aggressive_coalesce_collect(struct ir3_liveness *live,
                            struct ir3_instruction *collect)
{
   for (unsigned i = 0, offset = 0; i < collect->srcs_count;
        offset += reg_elem_size(collect->srcs[i]), i++) {
      if (!(collect->srcs[i]->flags & IR3_REG_SSA))
         continue;
      try_merge_defs(live, collect->dsts[0], collect->srcs[i]->def, offset);
   }
}

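/* For a chain of repeated ((rpt)) instructions, try to merge the dsts (and
 * the corresponding srcs) of all instructions in the chain at consecutive
 * offsets, so that RA can allocate them to consecutive registers.
 */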
static void
aggressive_coalesce_rpt(struct ir3_liveness *live,
                        struct ir3_instruction *instr)
{
   if (!ir3_instr_is_first_rpt(instr))
      return;

   struct ir3_register *def = instr->dsts[0];
   unsigned def_offset = 0;
   unsigned src_offsets[instr->srcs_count];
   memset(src_offsets, 0, sizeof(unsigned) * instr->srcs_count);

   foreach_instr_rpt_excl (rpt, instr) {
      if (!(rpt->dsts[0]->flags & IR3_REG_SSA))
         continue;

      def_offset += reg_elem_size(def);
      try_merge_defs(live, def, rpt->dsts[0], def_offset);

      foreach_src_n (src, src_n, instr) {
         struct ir3_register *rpt_src = rpt->srcs[src_n];

         if (!(src->flags & IR3_REG_SSA) || !(rpt_src->flags & IR3_REG_SSA))
            continue;
         if (src->def == rpt_src->def)
            continue;

         src_offsets[src_n] += reg_elem_size(src->def);
         try_merge_defs(live, src->def, rpt_src->def, src_offsets[src_n]);
      }
   }
}

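/* Insert a parallel copy at the end of a block for all phis in its successor,
 * and rewrite each phi source to read the corresponding parallel copy dst.
 * This is what puts the program into CSSA form.
 */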
static void
create_parallel_copy(struct ir3_block *block)
{
   for (unsigned i = 0; i < 2; i++) {
      if (!block->successors[i])
         continue;

      struct ir3_block *succ = block->successors[i];

      unsigned pred_idx = ir3_block_get_pred_index(succ, block);

      unsigned phi_count = 0;
      foreach_instr (phi, &succ->instr_list) {
         if (phi->opc != OPC_META_PHI)
            break;

         /* Avoid phis we've already colored */
         if (!(phi->dsts[0]->flags & IR3_REG_SSA))
            continue;

         /* Avoid undef */
         if ((phi->srcs[pred_idx]->flags & IR3_REG_SSA) &&
             !phi->srcs[pred_idx]->def)
            continue;

         /* We don't support critical edges. If we were to support them,
          * we'd need to insert parallel copies after the phi node to solve
          * the lost-copy problem.
          */
         assert(i == 0 && !block->successors[1]);
         phi_count++;
      }

      if (phi_count == 0)
         continue;

      struct ir3_register *src[phi_count];
      unsigned j = 0;
      foreach_instr (phi, &succ->instr_list) {
         if (phi->opc != OPC_META_PHI)
            break;
         if (!(phi->dsts[0]->flags & IR3_REG_SSA))
            continue;
         if ((phi->srcs[pred_idx]->flags & IR3_REG_SSA) &&
             !phi->srcs[pred_idx]->def)
            continue;
         src[j++] = phi->srcs[pred_idx];
      }
      assert(j == phi_count);

      struct ir3_instruction *pcopy =
         ir3_instr_create(block, OPC_META_PARALLEL_COPY, phi_count, phi_count);

      for (j = 0; j < phi_count; j++) {
         struct ir3_register *reg = __ssa_dst(pcopy);
         reg->flags |= src[j]->flags & (IR3_REG_HALF | IR3_REG_ARRAY);
         reg->size = src[j]->size;
         reg->wrmask = src[j]->wrmask;
      }

      for (j = 0; j < phi_count; j++) {
         pcopy->srcs[pcopy->srcs_count++] =
            ir3_reg_clone(block->shader, src[j]);
      }

      j = 0;
      foreach_instr (phi, &succ->instr_list) {
         if (phi->opc != OPC_META_PHI)
            break;
         if (!(phi->dsts[0]->flags & IR3_REG_SSA))
            continue;
         if ((phi->srcs[pred_idx]->flags & IR3_REG_SSA) &&
             !phi->srcs[pred_idx]->def)
            continue;
         phi->srcs[pred_idx]->def = pcopy->dsts[j];
         pcopy->dsts[j]->flags |= phi->dsts[0]->flags & IR3_REG_SHARED;
         phi->srcs[pred_idx]->flags = pcopy->dsts[j]->flags;
         phi->srcs[pred_idx]->num = INVALID_REG;
         j++;
      }
      assert(j == phi_count);
   }
}

void
ir3_create_parallel_copies(struct ir3 *ir)
{
   foreach_block (block, &ir->block_list) {
      create_parallel_copy(block);
   }
}

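/* Assign each def an interval [interval_start, interval_end) in one global
 * index space. Defs in the same merge set get intervals laid out relative to
 * a single interval_start for the set, using their merge_set_offset, so that
 * defs which overlap in the set also have overlapping intervals.
 */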
static void
index_merge_sets(struct ir3_liveness *live, struct ir3 *ir)
{
   unsigned offset = 0;
   foreach_block (block, &ir->block_list) {
      foreach_instr (instr, &block->instr_list) {
         for (unsigned i = 0; i < instr->dsts_count; i++) {
            struct ir3_register *dst = instr->dsts[i];

            unsigned dst_offset;
            struct ir3_merge_set *merge_set = dst->merge_set;
            unsigned size = reg_size(dst);
            if (merge_set) {
               if (merge_set->interval_start == ~0) {
                  merge_set->interval_start = offset;
                  offset += merge_set->size;
               }
               dst_offset = merge_set->interval_start + dst->merge_set_offset;
            } else {
               dst_offset = offset;
               offset += size;
            }

            dst->interval_start = dst_offset;
            dst->interval_end = dst_offset + size;
         }
      }
   }

   live->interval_offset = offset;
}

#define RESET "\x1b[0m"
#define BLUE  "\x1b[0;34m"
#define SYN_SSA(x) BLUE x RESET

static void
dump_merge_sets(struct ir3 *ir)
{
   d("merge sets:");
   struct set *merge_sets = _mesa_pointer_set_create(NULL);
   foreach_block (block, &ir->block_list) {
      foreach_instr (instr, &block->instr_list) {
         for (unsigned i = 0; i < instr->dsts_count; i++) {
            struct ir3_register *dst = instr->dsts[i];

            struct ir3_merge_set *merge_set = dst->merge_set;
            if (!merge_set || _mesa_set_search(merge_sets, merge_set))
               continue;

            d("merge set, size %u, align %u, interval start %u:",
              merge_set->size, merge_set->alignment, merge_set->interval_start);
            for (unsigned j = 0; j < merge_set->regs_count; j++) {
               struct ir3_register *reg = merge_set->regs[j];
               const char *s = (reg->flags & IR3_REG_SHARED) ? "s" : "";
               const char *h = (reg->flags & IR3_REG_HALF) ? "h" : "";
               d("\t%s%s" SYN_SSA("ssa_%u") ":%u, offset %u, interval: %u-%u",
                 s, h, reg->instr->serialno, reg->name, reg->merge_set_offset,
                 reg->interval_start, reg->interval_end);
            }

            _mesa_set_add(merge_sets, merge_set);
         }
      }
   }

   ralloc_free(merge_sets);
}

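/* Entry point: build the merge sets for the whole shader by coalescing phis
 * first, then parallel copies, splits, collects, and repeated-instruction
 * groups, and finally assign the interval indices used by RA.
 */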
void
ir3_merge_regs(struct ir3_liveness *live, struct ir3 *ir)
{
   /* First pass: coalesce phis, which must be together. */
   foreach_block (block, &ir->block_list) {
      foreach_instr (instr, &block->instr_list) {
         if (instr->opc != OPC_META_PHI)
            break;

         coalesce_phi(live, instr);
      }
   }

   /* Second pass: aggressively coalesce parallelcopy, split, collect */
   foreach_block (block, &ir->block_list) {
      foreach_instr (instr, &block->instr_list) {
         switch (instr->opc) {
         case OPC_META_SPLIT:
            aggressive_coalesce_split(live, instr);
            break;
         case OPC_META_COLLECT:
            aggressive_coalesce_collect(live, instr);
            break;
         case OPC_META_PARALLEL_COPY:
            aggressive_coalesce_parallel_copy(live, instr);
            break;
         default:
            break;
         }
      }
   }

   foreach_block (block, &ir->block_list) {
      foreach_instr (instr, &block->instr_list) {
         aggressive_coalesce_rpt(live, instr);
      }
   }

   index_merge_sets(live, ir);

   if (ir3_shader_debug & IR3_DBG_RAMSGS)
      dump_merge_sets(ir);
}