/*
 * Copyright © 2021 Valve Corporation
 * SPDX-License-Identifier: MIT
 */

#include "ir3_compiler.h"
#include "ir3_ra.h"
#include "util/ralloc.h"

/* This pass "merges" compatible phi-web SSA values. First, we insert a bunch
 * of parallelcopy's to trivially turn the program into CSSA form. Then we
 * try to "merge" SSA def's into "merge sets" which could be allocated to a
 * single register in order to eliminate copies. First we merge phi nodes,
 * which should always succeed because of the parallelcopy's we inserted, and
 * then we try to coalesce the copies we introduced.
 *
 * The merged registers are used for three purposes:
 *
 * 1. We always use the same pvtmem slot for spilling all SSA defs in each
 * merge set. This prevents us from having to insert memory-to-memory copies
 * in the spiller and makes sure we don't insert unnecessary copies.
 * 2. When two values are live at the same time, part of the same merge
 * set, and they overlap each other in the merge set, they always occupy
 * overlapping physical registers in RA. This reduces register pressure and
 * copies in several important scenarios:
 *    - When sources of a collect are used later by something else, we don't
 *    have to introduce copies.
 *    - We can handle sequences of extracts that "explode" a vector into its
 *    components without any additional copying.
 * 3. We use the merge sets for affinities in register allocation: that is, we
 * try to allocate all the definitions in the same merge set to the
 * same/compatible registers. This helps us e.g. allocate sources of a collect
 * to contiguous registers without too much special code in RA.
 *
 * In a "normal" register allocator, or when spilling, we'd just merge
 * registers in the same merge set to the same register, but with SSA-based
 * register allocation we may have to split the live interval.
 *
 * The implementation is based on "Revisiting Out-of-SSA Translation for
 * Correctness, Code Quality, and Efficiency," and is broadly similar to the
 * implementation in nir_from_ssa, with the twist that we also try to coalesce
 * META_SPLIT and META_COLLECT. This makes this pass more complicated but
 * prevents us from needing to handle these specially in RA and the spiller,
 * which are already complicated enough. This also forces us to implement the
 * value-comparison optimization described in the paper, as without it we
 * wouldn't be able to coalesce META_SPLIT even in the simplest of cases.
 */
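
/* As a (hypothetical) illustration of the CSSA construction, a phi like
 *
 *    ssa_3 = phi(ssa_1, ssa_2)
 *
 * first gets a parallelcopy at the end of each predecessor:
 *
 *    ssa_4 = parallelcopy(ssa_1)   (in predecessor 0)
 *    ssa_5 = parallelcopy(ssa_2)   (in predecessor 1)
 *    ssa_3 = phi(ssa_4, ssa_5)
 *
 * The copy destinations are only live across the phi, so merging
 * {ssa_3, ssa_4, ssa_5} into one merge set always succeeds, and the
 * aggressive coalescing passes below then try to pull the copy sources
 * (ssa_1, ssa_2) into the same set to make the copies no-ops.
 */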

/* In order to dynamically reconstruct the dominance forest, we need the
 * instructions ordered by a preorder traversal of the dominance tree:
 */
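
/* E.g. if block A dominates blocks B and C (its children in the dominance
 * tree), A's instructions get the lowest ip values, followed by B's
 * subtree and then C's, so a def's dominators always have a smaller ip
 * than the def itself. def_after()/def_dominates() below rely on exactly
 * this property.
 */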

static unsigned
index_instrs(struct ir3_block *block, unsigned index)
{
   foreach_instr (instr, &block->instr_list)
      instr->ip = index++;

   for (unsigned i = 0; i < block->dom_children_count; i++)
      index = index_instrs(block->dom_children[i], index);

   return index;
}

void
ir3_index_instrs_for_merge_sets(struct ir3 *ir)
{
   index_instrs(ir3_start_block(ir), 0);
}

/* Definitions within a merge set are ordered by instr->ip as set above: */

static bool
def_after(struct ir3_register *a, struct ir3_register *b)
{
   return a->instr->ip > b->instr->ip;
}

static bool
def_dominates(struct ir3_register *a, struct ir3_register *b)
{
   if (def_after(a, b)) {
      return false;
   } else if (a->instr->block == b->instr->block) {
      return def_after(b, a);
   } else {
      return ir3_block_dominates(a->instr->block, b->instr->block);
   }
}
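
/* The preorder indexing makes the early-out above safe: an instruction
 * with a larger ip can never dominate one with a smaller ip. Within a
 * block, plain program order decides dominance; across blocks, we defer
 * to block-level dominance.
 */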

/* This represents a region inside a register. The offset is relative to the
 * start of the register, and offset + size <= size(reg).
 */
struct def_value {
   struct ir3_register *reg;
   unsigned offset, size;
};

/* Chase any copies to get the source of a region inside a register. This is
 * Value(a) in the paper.
 */
static struct def_value
chase_copies(struct def_value value)
{
   while (true) {
      struct ir3_instruction *instr = value.reg->instr;
      if (instr->opc == OPC_META_SPLIT) {
         value.offset += instr->split.off * reg_elem_size(value.reg);
         value.reg = instr->srcs[0]->def;
      } else if (instr->opc == OPC_META_COLLECT) {
         if (value.offset % reg_elem_size(value.reg) != 0 ||
             value.size > reg_elem_size(value.reg) ||
             value.offset + value.size > reg_size(value.reg))
            break;
         struct ir3_register *src =
            instr->srcs[value.offset / reg_elem_size(value.reg)];
         if (!src->def)
            break;
         value.offset = 0;
         value.reg = src->def;
      } else {
         /* TODO: parallelcopy */
         break;
      }
   }

   return value;
}
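
/* A worked (hypothetical) example of the chasing above:
 *
 *    ssa_1 = collect(ssa_a, ssa_b)   (two single-element sources)
 *    ssa_2 = split(ssa_1, off=1)
 *
 * Chasing {reg = ssa_2, offset = 0, size = 1 elem} first follows the
 * split, giving {reg = ssa_1, offset = 1 elem}, and then the collect maps
 * that slot to its second source, giving {reg = ssa_b, offset = 0}. Two
 * regions that chase to the same (reg, offset) hold the same value.
 */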

/* This represents an entry in the merge set, and consists of a register +
 * offset from the merge set base.
 */
struct merge_def {
   struct ir3_register *reg;
   unsigned offset;
};

static bool
can_skip_interference(const struct merge_def *a, const struct merge_def *b)
{
   unsigned a_start = a->offset;
   unsigned b_start = b->offset;
   unsigned a_end = a_start + reg_size(a->reg);
   unsigned b_end = b_start + reg_size(b->reg);

   /* Registers that don't overlap never interfere */
   if (a_end <= b_start || b_end <= a_start)
      return true;

   /* Disallow skipping interference unless one definition contains the
    * other. This restriction is important for register allocation, because
    * it means that at any given point in the program, the live values in a
    * given merge set will form a tree. If they didn't, then one live value
    * would partially overlap another, and they would have overlapping live
    * ranges because they're live at the same point. This simplifies register
    * allocation and spilling.
    */
   if (!((a_start <= b_start && a_end >= b_end) ||
         (b_start <= a_start && b_end >= a_end)))
      return false;

   /* For each register, chase the intersection of a and b to find the
    * ultimate source.
    */
   unsigned start = MAX2(a_start, b_start);
   unsigned end = MIN2(a_end, b_end);
   struct def_value a_value = chase_copies((struct def_value){
      .reg = a->reg,
      .offset = start - a_start,
      .size = end - start,
   });
   struct def_value b_value = chase_copies((struct def_value){
      .reg = b->reg,
      .offset = start - b_start,
      .size = end - start,
   });
   return a_value.reg == b_value.reg && a_value.offset == b_value.offset;
}
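
/* For example (hypothetical), if ssa_2 = split(ssa_1, off=1) was coalesced
 * so that ssa_2 lives one element past the start of ssa_1's region, the
 * regions overlap but ssa_1 contains ssa_2, and the overlapping element
 * chases to the same value, so the interference check can be skipped even
 * if both are live. Two regions that only partially overlap each other
 * are always rejected above, preserving the tree property.
 */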

static struct ir3_merge_set *
get_merge_set(struct ir3_register *def)
{
   if (def->merge_set)
      return def->merge_set;

   struct ir3_merge_set *set = ralloc(def, struct ir3_merge_set);
   set->preferred_reg = ~0;
   set->interval_start = ~0;
   set->spill_slot = ~0;
   set->size = reg_size(def);
   set->alignment = (def->flags & IR3_REG_HALF) ? 1 : 2;
   set->regs_count = 1;
   set->regs = ralloc(set, struct ir3_register *);
   set->regs[0] = def;

   return set;
}

/* Merges b into a */
static struct ir3_merge_set *
merge_merge_sets(struct ir3_merge_set *a, struct ir3_merge_set *b, int b_offset)
{
   if (b_offset < 0)
      return merge_merge_sets(b, a, -b_offset);

   struct ir3_register **new_regs =
      rzalloc_array(a, struct ir3_register *, a->regs_count + b->regs_count);

   unsigned a_index = 0, b_index = 0, new_index = 0;
   for (; a_index < a->regs_count || b_index < b->regs_count; new_index++) {
      if (b_index < b->regs_count &&
          (a_index == a->regs_count ||
           def_after(a->regs[a_index], b->regs[b_index]))) {
         new_regs[new_index] = b->regs[b_index++];
         new_regs[new_index]->merge_set_offset += b_offset;
      } else {
         new_regs[new_index] = a->regs[a_index++];
      }
      new_regs[new_index]->merge_set = a;
   }

   assert(new_index == a->regs_count + b->regs_count);

   /* Technically this should be the lcm, but since the alignment is only
    * ever 1 or 2 so far, taking the max works out the same.
    */
   a->alignment = MAX2(a->alignment, b->alignment);
   a->regs_count += b->regs_count;
   ralloc_free(a->regs);
   a->regs = new_regs;
   a->size = MAX2(a->size, b->size + b_offset);

   return a;
}
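
/* E.g. merging a set b of size 2 into a set a of size 4 at b_offset = 2
 * leaves a->size = MAX2(4, 2 + 2) = 4, and every def of b has its
 * merge_set_offset shifted up by 2. The merged regs[] array stays sorted
 * by ip, which merge_sets_interfere() below depends on.
 */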

static bool
merge_sets_interfere(struct ir3_liveness *live, struct ir3_merge_set *a,
                     struct ir3_merge_set *b, int b_offset)
{
   if (b_offset < 0)
      return merge_sets_interfere(live, b, a, -b_offset);

   struct merge_def dom[a->regs_count + b->regs_count];
   unsigned a_index = 0, b_index = 0;
   int dom_index = -1;

   /* Reject trying to merge the sets if the alignment doesn't work out */
   if (b_offset % a->alignment != 0)
      return true;

   while (a_index < a->regs_count || b_index < b->regs_count) {
      struct merge_def current;
      if (a_index == a->regs_count) {
         current.reg = b->regs[b_index];
         current.offset = current.reg->merge_set_offset + b_offset;
         b_index++;
      } else if (b_index == b->regs_count) {
         current.reg = a->regs[a_index];
         current.offset = current.reg->merge_set_offset;
         a_index++;
      } else {
         if (def_after(b->regs[b_index], a->regs[a_index])) {
            current.reg = a->regs[a_index];
            current.offset = current.reg->merge_set_offset;
            a_index++;
         } else {
            current.reg = b->regs[b_index];
            current.offset = current.reg->merge_set_offset + b_offset;
            b_index++;
         }
      }

      while (dom_index >= 0 &&
             !def_dominates(dom[dom_index].reg, current.reg)) {
         dom_index--;
      }

      /* TODO: in the original paper, only dom[dom_index] needs to be
       * checked for interference. We implement the value-chasing extension
       * as well as support for sub-registers, which complicates this
       * significantly: it's no longer the case that if a dominates b, b
       * dominates c, and a and b don't interfere, then we only need to
       * check interference between b and c to be sure a and c don't
       * interfere. This means we may have to check for interference against
       * values higher in the stack than dom[dom_index]. The paper describes
       * a way to do fewer interference tests with the value-chasing
       * extension, but we'd have to come up with something ourselves to
       * handle the similar problems that come up when allowing values to
       * contain subregisters. For now we just test everything in the stack.
       */
      for (int i = 0; i <= dom_index; i++) {
         if (can_skip_interference(&current, &dom[i]))
            continue;

         /* Ok, now we actually have to check interference. Since we know
          * that dom[i] dominates current, this boils down to checking
          * whether dom[i] is live after current.
          */
         if (ir3_def_live_after(live, dom[i].reg, current.reg->instr))
            return true;
      }

      dom[++dom_index] = current;
   }

   return false;
}
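
/* The loop above is the linear scan from the paper: defs from both sets
 * are visited in preorder (ip) order while dom[] maintains the stack of
 * defs dominating the current one, i.e. the dominance forest rebuilt on
 * the fly. Popping stops at the nearest dominating def, and everything
 * still on the stack is a candidate interference.
 */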

static void
try_merge_defs(struct ir3_liveness *live, struct ir3_register *a,
               struct ir3_register *b, unsigned b_offset)
{
   struct ir3_merge_set *a_set = get_merge_set(a);
   struct ir3_merge_set *b_set = get_merge_set(b);

   if (a_set == b_set) {
      /* Note: even in this case the copy may not actually end up
       * coalesced, if the offsets don't line up. But either way, there's
       * nothing more we can do here.
       */
      return;
   }

   int b_set_offset = a->merge_set_offset + b_offset - b->merge_set_offset;

   if (!merge_sets_interfere(live, a_set, b_set, b_set_offset))
      merge_merge_sets(a_set, b_set, b_set_offset);
}
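
/* The b_set_offset computed above places b exactly b_offset past a in the
 * merged set: a sits at a->merge_set_offset within its set and b at
 * b->merge_set_offset within its own, so shifting b's whole set by
 * a->merge_set_offset + b_offset - b->merge_set_offset gives b a final
 * offset of a->merge_set_offset + b_offset, as desired.
 */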

void
ir3_force_merge(struct ir3_register *a, struct ir3_register *b, int b_offset)
{
   struct ir3_merge_set *a_set = get_merge_set(a);
   struct ir3_merge_set *b_set = get_merge_set(b);

   if (a_set == b_set)
      return;

   int b_set_offset = a->merge_set_offset + b_offset - b->merge_set_offset;
   merge_merge_sets(a_set, b_set, b_set_offset);
}

static void
coalesce_phi(struct ir3_liveness *live, struct ir3_instruction *phi)
{
   for (unsigned i = 0; i < phi->srcs_count; i++) {
      if (phi->srcs[i]->def)
         try_merge_defs(live, phi->dsts[0], phi->srcs[i]->def, 0);
   }
}

static void
aggressive_coalesce_parallel_copy(struct ir3_liveness *live,
                                  struct ir3_instruction *pcopy)
{
   for (unsigned i = 0; i < pcopy->dsts_count; i++) {
      if (!(pcopy->srcs[i]->flags & IR3_REG_SSA))
         continue;
      try_merge_defs(live, pcopy->dsts[i], pcopy->srcs[i]->def, 0);
   }
}

static void
aggressive_coalesce_split(struct ir3_liveness *live,
                          struct ir3_instruction *split)
{
   if (!(split->dsts[0]->flags & IR3_REG_SSA))
      return;
   try_merge_defs(live, split->srcs[0]->def, split->dsts[0],
                  split->split.off * reg_elem_size(split->dsts[0]));
}
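
/* E.g. for ssa_2 = split(ssa_1, off=1), this tries to place ssa_2 one
 * element past the start of ssa_1's region, which is exactly the spot
 * where chase_copies() proves the values coincide, so a successful merge
 * turns the split into a no-op.
 */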

static void
aggressive_coalesce_collect(struct ir3_liveness *live,
                            struct ir3_instruction *collect)
{
   for (unsigned i = 0, offset = 0; i < collect->srcs_count;
        offset += reg_elem_size(collect->srcs[i]), i++) {
      if (!(collect->srcs[i]->flags & IR3_REG_SSA))
         continue;
      try_merge_defs(live, collect->dsts[0], collect->srcs[i]->def, offset);
   }
}
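
/* The mirror image of the split case: each SSA source of a collect is
 * nudged toward the slot it fills in the destination, i.e. source i at
 * offset i * elem_size, so a successful merge makes the gather free.
 */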

static void
aggressive_coalesce_rpt(struct ir3_liveness *live,
                        struct ir3_instruction *instr)
{
   if (!ir3_instr_is_first_rpt(instr))
      return;

   struct ir3_register *def = instr->dsts[0];
   unsigned def_offset = 0;
   unsigned src_offsets[instr->srcs_count];
   memset(src_offsets, 0, sizeof(unsigned) * instr->srcs_count);

   foreach_instr_rpt_excl (rpt, instr) {
      if (!(rpt->dsts[0]->flags & IR3_REG_SSA))
         continue;

      def_offset += reg_elem_size(def);
      try_merge_defs(live, def, rpt->dsts[0], def_offset);

      foreach_src_n (src, src_n, instr) {
         struct ir3_register *rpt_src = rpt->srcs[src_n];

         if (!(src->flags & IR3_REG_SSA) || !(rpt_src->flags & IR3_REG_SSA))
            continue;
         if (src->def == rpt_src->def)
            continue;

         src_offsets[src_n] += reg_elem_size(src->def);
         try_merge_defs(live, src->def, rpt_src->def, src_offsets[src_n]);
      }
   }
}
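
/* For a group of repeated instructions (rptN), this tries to give the
 * destinations of all repetitions one contiguous region, and likewise the
 * sources in each source slot, so that the group can later be emitted as
 * a single repeated instruction over consecutive registers.
 */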

static void
create_parallel_copy(struct ir3_block *block)
{
   for (unsigned i = 0; i < 2; i++) {
      if (!block->successors[i])
         continue;

      struct ir3_block *succ = block->successors[i];

      unsigned pred_idx = ir3_block_get_pred_index(succ, block);

      unsigned phi_count = 0;
      foreach_instr (phi, &succ->instr_list) {
         if (phi->opc != OPC_META_PHI)
            break;

         /* Avoid phis we've already colored */
         if (!(phi->dsts[0]->flags & IR3_REG_SSA))
            continue;

         /* Avoid undef */
         if ((phi->srcs[pred_idx]->flags & IR3_REG_SSA) &&
             !phi->srcs[pred_idx]->def)
            continue;

         /* We don't support critical edges. If we were to support them,
          * we'd need to insert parallel copies after the phi node to solve
          * the lost-copy problem.
          */
         assert(i == 0 && !block->successors[1]);
         phi_count++;
      }

      if (phi_count == 0)
         continue;

      struct ir3_register *src[phi_count];
      unsigned j = 0;
      foreach_instr (phi, &succ->instr_list) {
         if (phi->opc != OPC_META_PHI)
            break;
         if (!(phi->dsts[0]->flags & IR3_REG_SSA))
            continue;
         if ((phi->srcs[pred_idx]->flags & IR3_REG_SSA) &&
             !phi->srcs[pred_idx]->def)
            continue;
         src[j++] = phi->srcs[pred_idx];
      }
      assert(j == phi_count);

      struct ir3_instruction *pcopy =
         ir3_instr_create(block, OPC_META_PARALLEL_COPY, phi_count, phi_count);

      for (j = 0; j < phi_count; j++) {
         struct ir3_register *reg = __ssa_dst(pcopy);
         reg->flags |= src[j]->flags & (IR3_REG_HALF | IR3_REG_ARRAY);
         reg->size = src[j]->size;
         reg->wrmask = src[j]->wrmask;
      }

      for (j = 0; j < phi_count; j++) {
         pcopy->srcs[pcopy->srcs_count++] =
            ir3_reg_clone(block->shader, src[j]);
      }

      j = 0;
      foreach_instr (phi, &succ->instr_list) {
         if (phi->opc != OPC_META_PHI)
            break;
         if (!(phi->dsts[0]->flags & IR3_REG_SSA))
            continue;
         if ((phi->srcs[pred_idx]->flags & IR3_REG_SSA) &&
             !phi->srcs[pred_idx]->def)
            continue;
         phi->srcs[pred_idx]->def = pcopy->dsts[j];
         pcopy->dsts[j]->flags |= phi->dsts[0]->flags & IR3_REG_SHARED;
         phi->srcs[pred_idx]->flags = pcopy->dsts[j]->flags;
         phi->srcs[pred_idx]->num = INVALID_REG;
         j++;
      }
      assert(j == phi_count);
   }
}
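
/* After this runs, each predecessor of a block with phis ends in a single
 * parallel copy feeding those phis, e.g. (hypothetical):
 *
 *    block0:
 *       ...
 *       ssa_4, ssa_5 = parallelcopy(ssa_1, ssa_2)
 *    block1:
 *       ssa_6 = phi(ssa_4, ...)
 *       ssa_7 = phi(ssa_5, ...)
 *
 * Grouping the moves into one parallel copy keeps them semantically
 * simultaneous, which is what allows swaps and cycles between phi values
 * to be expressed and lowered correctly.
 */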

void
ir3_create_parallel_copies(struct ir3 *ir)
{
   foreach_block (block, &ir->block_list) {
      create_parallel_copy(block);
   }
}

static void
index_merge_sets(struct ir3_liveness *live, struct ir3 *ir)
{
   unsigned offset = 0;
   foreach_block (block, &ir->block_list) {
      foreach_instr (instr, &block->instr_list) {
         for (unsigned i = 0; i < instr->dsts_count; i++) {
            struct ir3_register *dst = instr->dsts[i];

            unsigned dst_offset;
            struct ir3_merge_set *merge_set = dst->merge_set;
            unsigned size = reg_size(dst);
            if (merge_set) {
               if (merge_set->interval_start == ~0) {
                  merge_set->interval_start = offset;
                  offset += merge_set->size;
               }
               dst_offset = merge_set->interval_start + dst->merge_set_offset;
            } else {
               dst_offset = offset;
               offset += size;
            }

            dst->interval_start = dst_offset;
            dst->interval_end = dst_offset + size;
         }
      }
   }

   live->interval_offset = offset;
}
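
/* E.g. if the first def visited belongs to a merge set of size 4, the set
 * gets interval_start = 0 and offset becomes 4; each def in the set then
 * occupies its merge_set_offset-relative slice of [0, 4). The next
 * unmerged def of size 2 gets [4, 6), and so on, so every def ends up
 * with a fixed interval in one flat index space, with the total recorded
 * in live->interval_offset.
 */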

#define RESET      "\x1b[0m"
#define BLUE       "\x1b[0;34m"
#define SYN_SSA(x) BLUE x RESET

static void
dump_merge_sets(struct ir3 *ir)
{
   d("merge sets:");
   struct set *merge_sets = _mesa_pointer_set_create(NULL);
   foreach_block (block, &ir->block_list) {
      foreach_instr (instr, &block->instr_list) {
         for (unsigned i = 0; i < instr->dsts_count; i++) {
            struct ir3_register *dst = instr->dsts[i];

            struct ir3_merge_set *merge_set = dst->merge_set;
            if (!merge_set || _mesa_set_search(merge_sets, merge_set))
               continue;

            d("merge set, size %u, align %u, interval start %u:",
              merge_set->size, merge_set->alignment, merge_set->interval_start);
            for (unsigned j = 0; j < merge_set->regs_count; j++) {
               struct ir3_register *reg = merge_set->regs[j];
               const char *s = (reg->flags & IR3_REG_SHARED) ? "s" : "";
               const char *h = (reg->flags & IR3_REG_HALF) ? "h" : "";
               d("\t%s%s" SYN_SSA("ssa_%u") ":%u, offset %u, interval: %u-%u",
                 s, h, reg->instr->serialno, reg->name, reg->merge_set_offset,
                 reg->interval_start, reg->interval_end);
            }

            _mesa_set_add(merge_sets, merge_set);
         }
      }
   }

   ralloc_free(merge_sets);
}

void
ir3_merge_regs(struct ir3_liveness *live, struct ir3 *ir)
{
   /* First pass: coalesce phis, which must be together. */
   foreach_block (block, &ir->block_list) {
      foreach_instr (instr, &block->instr_list) {
         if (instr->opc != OPC_META_PHI)
            break;

         coalesce_phi(live, instr);
      }
   }

   /* Second pass: aggressively coalesce parallelcopy, split, collect */
   foreach_block (block, &ir->block_list) {
      foreach_instr (instr, &block->instr_list) {
         switch (instr->opc) {
         case OPC_META_SPLIT:
            aggressive_coalesce_split(live, instr);
            break;
         case OPC_META_COLLECT:
            aggressive_coalesce_collect(live, instr);
            break;
         case OPC_META_PARALLEL_COPY:
            aggressive_coalesce_parallel_copy(live, instr);
            break;
         default:
            break;
         }
      }
   }

   foreach_block (block, &ir->block_list) {
      foreach_instr (instr, &block->instr_list) {
         aggressive_coalesce_rpt(live, instr);
      }
   }

   index_merge_sets(live, ir);

   if (ir3_shader_debug & IR3_DBG_RAMSGS)
      dump_merge_sets(ir);
}
623