1 /*
2  * Copyright © 2021 Valve Corporation
3  * SPDX-License-Identifier: MIT
4  */
5 
6 #include "util/rb_tree.h"
7 #include "ir3_ra.h"
8 #include "ir3_shader.h"
9 
10 /*
11  * This pass does two things:
12  *
13  * 1. Calculates the maximum register pressure. To do this, we need to use the
14  *    exact same technique that RA uses for combining meta_split instructions
15  *    with their sources, so that our calculation agrees with RA.
16  * 2. Spills when the register pressure exceeds a limit calculated by RA.
17  *    The implementation is based on "Register Spilling and Live-Range Splitting
18  *    for SSA-Form Programs" by Braun and Hack, although again care has to be
19  *    taken to handle combining split/collect instructions.
20  */
21 
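/* A value as the spiller sees it: either the SSA def currently holding it, or
 * the immediate/const it has been folded to. Which union member is valid is
 * indicated by IR3_REG_IMMED/IR3_REG_CONST in flags (see set_src_val()).
 */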
22 struct reg_or_immed {
23    unsigned flags;
24    union {
25       struct ir3_register *def;
26       uint32_t uimm;
27       unsigned const_num;
28    };
29 };
30 
31 struct ra_spill_interval {
32    struct ir3_reg_interval interval;
33 
34    struct rb_node node;
35    struct rb_node half_node;
36 
37    /* The current SSA value/const/immed this source is mapped to. */
38    struct reg_or_immed dst;
39 
40    /* When computing use distances we use the distance relative to the start
41     * of the block. So, for example, a value that's defined in cycle 5 of the
42     * block and used 6 cycles later will always have a next_use_distance of 11
43     * until we reach that use.
44     */
45    unsigned next_use_distance;
46 
47    /* Whether this value was reloaded and therefore doesn't need to be
48     * spilled again. Corresponds to the S set in the paper.
49     */
50    bool already_spilled;
51 
52    /* We need to add sources early for accounting purposes, but we have to
53     * insert the reload code for them last. Keep track of whether this interval
54     * needs to be reloaded later.
55     */
56    bool needs_reload;
57 
58    /* Keep track of whether this interval currently can't be spilled because:
59     * - It or one of its children is a source and we're making space for
60     *   sources.
61     * - It is a destination and we're making space for destinations.
62     */
63    bool cant_spill;
64 
65    /* Whether this interval can be rematerialized. */
66    bool can_rematerialize;
67 };
68 
69 struct ra_spill_block_state {
70    unsigned *next_use_end;
71    unsigned *next_use_start;
72 
73    unsigned cycles;
74 
75    /* Map from SSA def to reg_or_immed it is mapped to at the end of the block.
76     * This map only contains values which we didn't spill, so it also serves as
77     * a record of the new live-out set for this block.
78     */
79    struct hash_table *remap;
80 
81    /* For blocks whose successors are visited first (i.e. loop backedges), which
82     * values should be live at the end.
83     */
84    BITSET_WORD *live_out;
85 
86    bool visited;
87 };
88 
89 struct ra_spill_ctx {
90    struct ir3_reg_ctx reg_ctx;
91 
92    struct ra_spill_interval **intervals;
93    unsigned intervals_count;
94 
95    /* rb tree of live intervals that we can spill, ordered by next-use distance.
96     * full_live_intervals contains the full+half intervals in the merged_regs
97     * case. We use this list to determine what to spill.
98     */
99    struct rb_tree full_live_intervals;
100    struct rb_tree half_live_intervals;
101 
102    struct ir3_pressure cur_pressure, max_pressure;
103 
104    struct ir3_pressure limit_pressure;
105 
106    /* When spilling, we need to reserve a register to serve as the zero'd
107     * "base". For simplicity we reserve a register at the beginning so that it's
108     * always available.
109     */
110    struct ir3_register *base_reg;
111 
112    /* Current pvtmem offset in bytes. */
113    unsigned spill_slot;
114 
115    struct ir3_liveness *live;
116 
117    const struct ir3_compiler *compiler;
118 
119    struct ra_spill_block_state *blocks;
120 
121    bool spilling;
122 
123    bool merged_regs;
124 };
125 
126 static void
127 add_base_reg(struct ra_spill_ctx *ctx, struct ir3 *ir)
128 {
129    struct ir3_block *start = ir3_start_block(ir);
130 
131    /* We need to stick it after any meta instructions which need to be first. */
132    struct ir3_instruction *after = NULL;
133    foreach_instr (instr, &start->instr_list) {
134       if (instr->opc != OPC_META_INPUT &&
135           instr->opc != OPC_META_TEX_PREFETCH) {
136          after = instr;
137          break;
138       }
139    }
140 
141    struct ir3_instruction *mov = create_immed(start, 0);
142 
143    if (after)
144       ir3_instr_move_before(mov, after);
145 
146    ctx->base_reg = mov->dsts[0];
147 
148    /* We don't create an interval, etc. for the base reg, so just lower the
149     * register pressure limit to account for it. We assume it's always
150     * available for simplicity.
151     */
152    ctx->limit_pressure.full -= reg_size(ctx->base_reg);
153 }
154 
155 
156 /* Compute the number of cycles per instruction used for next-use-distance
157  * analysis. This is just approximate, obviously.
158  */
159 static unsigned
160 instr_cycles(struct ir3_instruction *instr)
161 {
162    if (instr->opc == OPC_META_PARALLEL_COPY) {
163       unsigned cycles = 0;
164       for (unsigned i = 0; i < instr->dsts_count; i++) {
165          if (!instr->srcs[i]->def ||
166              instr->srcs[i]->def->merge_set != instr->dsts[i]->merge_set) {
167             cycles += reg_elems(instr->srcs[i]);
168          }
169       }
170 
171       return cycles;
172    }
173 
174    if (instr->opc == OPC_META_COLLECT) {
175       unsigned cycles = 0;
176       for (unsigned i = 0; i < instr->srcs_count; i++) {
177          if (!instr->srcs[i]->def ||
178              instr->srcs[i]->def->merge_set != instr->dsts[0]->merge_set) {
179             cycles++;
180          }
181       }
182 
183       return cycles;
184    }
185 
186    if (is_meta(instr))
187       return 0;
188 
189    return 1 + instr->repeat;
190 }
191 
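/* Backwards walk over a single block: seed the per-def distances from the
 * block's live-out distances (next_use_end), record next_use on each src/dst
 * as we go, and store the resulting live-in distances in next_use_start.
 * Then propagate the live-in distances, plus the predecessor's cycle count
 * (and a large penalty on loop exits), into each predecessor's next_use_end.
 * Returns true if any predecessor changed, so the caller can iterate to a
 * fixed point.
 */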
192 static bool
193 compute_block_next_distance(struct ra_spill_ctx *ctx, struct ir3_block *block,
194                             unsigned *tmp_next_use)
195 {
196    struct ra_spill_block_state *state = &ctx->blocks[block->index];
197    memcpy(tmp_next_use, state->next_use_end,
198           ctx->live->definitions_count * sizeof(*tmp_next_use));
199 
200    unsigned cycle = state->cycles;
201    foreach_instr_rev (instr, &block->instr_list) {
202       ra_foreach_dst (dst, instr) {
203          dst->next_use = tmp_next_use[dst->name];
204       }
205 
206       ra_foreach_src (src, instr) {
207          src->next_use = tmp_next_use[src->def->name];
208       }
209 
210       cycle -= instr_cycles(instr);
211 
212       if (instr->opc == OPC_META_PARALLEL_COPY) {
213          ra_foreach_src_n (src, i, instr) {
214             if (src->def->merge_set == instr->dsts[i]->merge_set &&
215                 src->def->merge_set_offset == instr->dsts[i]->merge_set_offset) {
216                tmp_next_use[src->def->name] =
217                   tmp_next_use[instr->dsts[i]->name];
218             } else {
219                tmp_next_use[src->def->name] = cycle;
220             }
221          }
222       } else if (instr->opc != OPC_META_PHI) {
223          ra_foreach_src (src, instr) {
224             tmp_next_use[src->def->name] = cycle;
225          }
226       }
227 
228       ra_foreach_dst (dst, instr) {
229          tmp_next_use[dst->name] = UINT_MAX;
230       }
231    }
232 
233    memcpy(state->next_use_start, tmp_next_use,
234           ctx->live->definitions_count * sizeof(*tmp_next_use));
235 
236    bool progress = false;
237    for (unsigned i = 0; i < block->predecessors_count; i++) {
238       const struct ir3_block *pred = block->predecessors[i];
239       struct ra_spill_block_state *pred_state = &ctx->blocks[pred->index];
240 
241       /* Add a large-enough distance in front of edges exiting the loop so that
242        * variables that are live-through the loop but not used inside it are
243        * prioritized for spilling, as per the paper. This just needs to be
244        * larger than the longest path through the loop.
245        */
246       bool loop_exit = pred->loop_depth < block->loop_depth;
247       unsigned block_distance = pred_state->cycles + (loop_exit ? 100000 : 0);
248 
249       for (unsigned j = 0; j < ctx->live->definitions_count; j++) {
250          if (state->next_use_start[j] < UINT_MAX &&
251              state->next_use_start[j] + block_distance <
252              pred_state->next_use_end[j]) {
253             pred_state->next_use_end[j] = state->next_use_start[j] +
254                block_distance;
255             progress = true;
256          }
257       }
258 
259       foreach_instr (phi, &block->instr_list) {
260          if (phi->opc != OPC_META_PHI)
261             break;
262          if (!phi->srcs[i]->def)
263             continue;
264          unsigned src = phi->srcs[i]->def->name;
265          if (phi->dsts[0]->next_use < UINT_MAX &&
266              phi->dsts[0]->next_use + block_distance <
267              pred_state->next_use_end[src]) {
268             pred_state->next_use_end[src] = phi->dsts[0]->next_use +
269                block_distance;
270             progress = true;
271          }
272       }
273    }
274 
275    return progress;
276 }
277 
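/* Global next-use-distance analysis: initialize every distance to UINT_MAX
 * ("never used again"), count the cycles in each block, then run the
 * per-block backwards walk over the blocks in reverse order until nothing
 * changes, i.e. a simple fixed-point iteration to handle loops.
 */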
278 static void
279 compute_next_distance(struct ra_spill_ctx *ctx, struct ir3 *ir)
280 {
281    for (unsigned i = 0; i < ctx->live->block_count; i++) {
282       ctx->blocks[i].next_use_start =
283          ralloc_array(ctx, unsigned, ctx->live->definitions_count);
284       ctx->blocks[i].next_use_end =
285          ralloc_array(ctx, unsigned, ctx->live->definitions_count);
286 
287       for (unsigned j = 0; j < ctx->live->definitions_count; j++) {
288          ctx->blocks[i].next_use_start[j] = UINT_MAX;
289          ctx->blocks[i].next_use_end[j] = UINT_MAX;
290       }
291    }
292 
293    foreach_block (block, &ir->block_list) {
294       struct ra_spill_block_state *state = &ctx->blocks[block->index];
295       state->cycles = 0;
296       foreach_instr (instr, &block->instr_list) {
297          state->cycles += instr_cycles(instr);
298          foreach_dst (dst, instr) {
299             dst->spill_slot = ~0;
300          }
301       }
302    }
303 
304    unsigned *tmp_next_use =
305       ralloc_array(ctx, unsigned, ctx->live->definitions_count);
306 
307    bool progress = true;
308    while (progress) {
309       progress = false;
310       foreach_block_rev (block, &ir->block_list) {
311          progress |= compute_block_next_distance(ctx, block, tmp_next_use);
312       }
313    }
314 }
315 
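/* A value can be rematerialized if it is a plain (non-array, non-relative)
 * mov of an immediate or constant. Reloading such a value can simply re-emit
 * the mov instead of storing it to pvtmem and loading it back.
 */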
316 static bool
317 can_rematerialize(struct ir3_register *reg)
318 {
319    if (reg->flags & IR3_REG_ARRAY)
320       return false;
321    if (reg->instr->opc != OPC_MOV)
322       return false;
323    if (!(reg->instr->srcs[0]->flags & (IR3_REG_IMMED | IR3_REG_CONST)))
324       return false;
325    if (reg->instr->srcs[0]->flags & IR3_REG_RELATIV)
326       return false;
327    return true;
328 }
329 
330 static struct ir3_register *
331 rematerialize(struct ir3_register *reg, struct ir3_cursor cursor)
332 {
333    d("rematerializing ssa_%u:%u", reg->instr->serialno, reg->name);
334 
335    struct ir3_instruction *remat =
336       ir3_instr_create_at(cursor, reg->instr->opc, 1, reg->instr->srcs_count);
337    struct ir3_register *dst = __ssa_dst(remat);
338    dst->flags |= reg->flags & (IR3_REG_HALF | IR3_REG_ARRAY);
339    for (unsigned i = 0; i < reg->instr->srcs_count; i++) {
340       struct ir3_register *src =
341          ir3_src_create(remat, INVALID_REG, reg->instr->srcs[i]->flags);
342       *src = *reg->instr->srcs[i];
343    }
344 
345    remat->cat1 = reg->instr->cat1;
346 
347    dst->merge_set = reg->merge_set;
348    dst->merge_set_offset = reg->merge_set_offset;
349    dst->interval_start = reg->interval_start;
350    dst->interval_end = reg->interval_end;
351    return dst;
352 }
353 
354 static void
355 ra_spill_interval_init(struct ra_spill_interval *interval,
356                        struct ir3_register *reg)
357 {
358    ir3_reg_interval_init(&interval->interval, reg);
359    interval->dst.flags = reg->flags;
360    interval->dst.def = reg;
361    interval->already_spilled = false;
362    interval->needs_reload = false;
363    interval->cant_spill = false;
364    interval->can_rematerialize = can_rematerialize(reg);
365 }
366 
367 static struct ra_spill_interval *
368 ir3_reg_interval_to_interval(struct ir3_reg_interval *interval)
369 {
370    return rb_node_data(struct ra_spill_interval, interval, interval);
371 }
372 
373 static struct ra_spill_interval *
374 ra_spill_interval_root(struct ra_spill_interval *interval)
375 {
376    struct ir3_reg_interval *ir3_interval = &interval->interval;
377    while (ir3_interval->parent)
378       ir3_interval = ir3_interval->parent;
379    return ir3_reg_interval_to_interval(ir3_interval);
380 }
381 
382 static struct ra_spill_ctx *
383 ir3_reg_ctx_to_ctx(struct ir3_reg_ctx *ctx)
384 {
385    return rb_node_data(struct ra_spill_ctx, ctx, reg_ctx);
386 }
387 
388 static int
389 spill_interval_cmp(const struct ra_spill_interval *a,
390                    const struct ra_spill_interval *b)
391 {
392    /* Prioritize intervals that we can rematerialize. */
393    if (a->can_rematerialize && !b->can_rematerialize)
394       return 1;
395    if (!a->can_rematerialize && b->can_rematerialize)
396       return -1;
397 
398    return a->next_use_distance - b->next_use_distance;
399 }
400 
401 static int
402 ra_spill_interval_cmp(const struct rb_node *_a, const struct rb_node *_b)
403 {
404    const struct ra_spill_interval *a =
405       rb_node_data(const struct ra_spill_interval, _a, node);
406    const struct ra_spill_interval *b =
407       rb_node_data(const struct ra_spill_interval, _b, node);
408    return spill_interval_cmp(a, b);
409 }
410 
411 static int
412 ra_spill_interval_half_cmp(const struct rb_node *_a, const struct rb_node *_b)
413 {
414    const struct ra_spill_interval *a =
415       rb_node_data(const struct ra_spill_interval, _a, half_node);
416    const struct ra_spill_interval *b =
417       rb_node_data(const struct ra_spill_interval, _b, half_node);
418    return spill_interval_cmp(a, b);
419 }
420 
421 static void
422 interval_add(struct ir3_reg_ctx *_ctx, struct ir3_reg_interval *_interval)
423 {
424    struct ra_spill_interval *interval = ir3_reg_interval_to_interval(_interval);
425    struct ra_spill_ctx *ctx = ir3_reg_ctx_to_ctx(_ctx);
426 
427    unsigned size = reg_size(interval->interval.reg);
428    if (interval->interval.reg->flags & IR3_REG_SHARED) {
429       ctx->cur_pressure.shared += size;
430       if (interval->interval.reg->flags & IR3_REG_HALF)
431          ctx->cur_pressure.shared_half += size;
432    } else {
433       if (interval->interval.reg->flags & IR3_REG_HALF) {
434          ctx->cur_pressure.half += size;
435          if (ctx->spilling) {
436             rb_tree_insert(&ctx->half_live_intervals, &interval->half_node,
437                            ra_spill_interval_half_cmp);
438          }
439       }
440       if (ctx->merged_regs || !(interval->interval.reg->flags & IR3_REG_HALF)) {
441          ctx->cur_pressure.full += size;
442          if (ctx->spilling) {
443             rb_tree_insert(&ctx->full_live_intervals, &interval->node,
444                            ra_spill_interval_cmp);
445          }
446       }
447    }
448 }
449 
450 static void
451 interval_delete(struct ir3_reg_ctx *_ctx, struct ir3_reg_interval *_interval)
452 {
453    struct ra_spill_interval *interval = ir3_reg_interval_to_interval(_interval);
454    struct ra_spill_ctx *ctx = ir3_reg_ctx_to_ctx(_ctx);
455 
456    unsigned size = reg_size(interval->interval.reg);
457    if (interval->interval.reg->flags & IR3_REG_SHARED) {
458       ctx->cur_pressure.shared -= size;
459       if (interval->interval.reg->flags & IR3_REG_HALF)
460          ctx->cur_pressure.shared_half -= size;
461    } else {
462       if (interval->interval.reg->flags & IR3_REG_HALF) {
463          ctx->cur_pressure.half -= size;
464          if (ctx->spilling) {
465             rb_tree_remove(&ctx->half_live_intervals, &interval->half_node);
466          }
467       }
468       if (ctx->merged_regs || !(interval->interval.reg->flags & IR3_REG_HALF)) {
469          ctx->cur_pressure.full -= size;
470          if (ctx->spilling) {
471             rb_tree_remove(&ctx->full_live_intervals, &interval->node);
472          }
473       }
474    }
475 }
476 
477 static void
478 interval_readd(struct ir3_reg_ctx *_ctx, struct ir3_reg_interval *_parent,
479                struct ir3_reg_interval *_child)
480 {
481    interval_add(_ctx, _child);
482 }
483 
484 static void
485 spill_ctx_init(struct ra_spill_ctx *ctx, struct ir3_shader_variant *v,
486                struct ir3_liveness *live)
487 {
488    ctx->live = live;
489    ctx->intervals = ralloc_array(ctx, struct ra_spill_interval *,
490                                  ctx->live->definitions_count);
491    struct ra_spill_interval *intervals =
492       rzalloc_array(ctx, struct ra_spill_interval,
493                     ctx->live->definitions_count);
494    for (unsigned i = 0; i < ctx->live->definitions_count; i++)
495       ctx->intervals[i] = &intervals[i];
496 
497    ctx->intervals_count = ctx->live->definitions_count;
498    ctx->compiler = v->compiler;
499    ctx->merged_regs = v->mergedregs;
500 
501    rb_tree_init(&ctx->reg_ctx.intervals);
502    ctx->reg_ctx.interval_add = interval_add;
503    ctx->reg_ctx.interval_delete = interval_delete;
504    ctx->reg_ctx.interval_readd = interval_readd;
505 }
506 
507 static void
508 ra_spill_ctx_insert(struct ra_spill_ctx *ctx,
509                     struct ra_spill_interval *interval)
510 {
511    ir3_reg_interval_insert(&ctx->reg_ctx, &interval->interval);
512 }
513 
514 static void
515 ra_spill_ctx_remove(struct ra_spill_ctx *ctx,
516                     struct ra_spill_interval *interval)
517 {
518    ir3_reg_interval_remove(&ctx->reg_ctx, &interval->interval);
519 }
520 
521 static void
522 init_dst(struct ra_spill_ctx *ctx, struct ir3_register *dst)
523 {
524    struct ra_spill_interval *interval = ctx->intervals[dst->name];
525    ra_spill_interval_init(interval, dst);
526    if (ctx->spilling) {
527       interval->next_use_distance = dst->next_use;
528 
529       /* We only need to keep track of used-ness if this value may be
530        * rematerialized. This also keeps us from nuking things that may be
531        * in the keeps list (e.g. atomics, input splits).
532        */
533       if (interval->can_rematerialize)
534          dst->instr->flags |= IR3_INSTR_UNUSED;
535    }
536 }
537 
538 static void
539 insert_dst(struct ra_spill_ctx *ctx, struct ir3_register *dst)
540 {
541    struct ra_spill_interval *interval = ctx->intervals[dst->name];
542    if (interval->interval.inserted)
543       return;
544 
545    ra_spill_ctx_insert(ctx, interval);
546    interval->cant_spill = true;
547 
548    /* For precolored inputs, make sure we leave enough registers to allow for
549     * holes in the inputs. It can happen that the binning shader has a lower
550     * register pressure than the main shader, but the main shader decided to
551     * add holes between the inputs which means that the binning shader has a
552     * higher register demand.
553     */
554    if (dst->instr->opc == OPC_META_INPUT && dst->num != INVALID_REG) {
555       physreg_t physreg = ra_reg_get_physreg(dst);
556       physreg_t max = physreg + reg_size(dst);
557 
558       if (interval->interval.reg->flags & IR3_REG_SHARED) {
559          ctx->max_pressure.shared = MAX2(ctx->max_pressure.shared, max);
560          if (interval->interval.reg->flags & IR3_REG_HALF) {
561             ctx->max_pressure.shared_half = MAX2(ctx->max_pressure.shared_half,
562                                                  max);
563          }
564       } else if (interval->interval.reg->flags & IR3_REG_HALF) {
565          ctx->max_pressure.half = MAX2(ctx->max_pressure.half, max);
566       } else {
567          ctx->max_pressure.full = MAX2(ctx->max_pressure.full, max);
568       }
569    }
570 }
571 
572 static void
573 insert_src(struct ra_spill_ctx *ctx, struct ir3_register *src)
574 {
575    struct ra_spill_interval *interval = ctx->intervals[src->def->name];
576 
577    if (!interval->interval.inserted) {
578       ra_spill_ctx_insert(ctx, interval);
579       interval->needs_reload = true;
580       interval->already_spilled = true;
581    }
582 
583    ra_spill_interval_root(interval)->cant_spill = true;
584 
585 }
586 
587 static void
588 remove_src_early(struct ra_spill_ctx *ctx, struct ir3_instruction *instr,
589                  struct ir3_register *src)
590 {
591    struct ra_spill_interval *interval = ctx->intervals[src->def->name];
592 
593    if (ctx->spilling) {
594       /* It might happen that a collect that cannot be coalesced with one of its
595        * sources while spilling can be coalesced with it afterwards. In this
596        * case, we might be able to remove it here during spilling but not
597        * afterwards (because it may have a child interval). If this happens, we
598        * could end up with a register pressure that is higher after spilling
599        * than before. Prevent this by never removing collects early while
600        * spilling.
601        */
602       if (src->def->instr->opc == OPC_META_COLLECT)
603          return;
604    }
605 
606    if (!interval->interval.inserted || interval->interval.parent ||
607        !rb_tree_is_empty(&interval->interval.children))
608       return;
609 
610    ra_spill_ctx_remove(ctx, interval);
611 }
612 
613 static void
614 remove_src(struct ra_spill_ctx *ctx, struct ir3_instruction *instr,
615            struct ir3_register *src)
616 {
617    struct ra_spill_interval *interval = ctx->intervals[src->def->name];
618 
619    if (!interval->interval.inserted)
620       return;
621 
622    ra_spill_ctx_remove(ctx, interval);
623 }
624 
625 static void
626 finish_dst(struct ra_spill_ctx *ctx, struct ir3_register *dst)
627 {
628    struct ra_spill_interval *interval = ctx->intervals[dst->name];
629    interval->cant_spill = false;
630 }
631 
632 static void
633 remove_dst(struct ra_spill_ctx *ctx, struct ir3_register *dst)
634 {
635    struct ra_spill_interval *interval = ctx->intervals[dst->name];
636 
637    if (!interval->interval.inserted)
638       return;
639 
640    ra_spill_ctx_remove(ctx, interval);
641 }
642 
643 static void
644 update_src_next_use(struct ra_spill_ctx *ctx, struct ir3_register *src)
645 {
646    struct ra_spill_interval *interval = ctx->intervals[src->def->name];
647 
648    assert(interval->interval.inserted);
649 
650    interval->next_use_distance = src->next_use;
651 
652    /* If this node is inserted in one of the trees, then it needs to be resorted
653     * as its key has changed.
654     */
655    if (!interval->interval.parent && !(src->flags & IR3_REG_SHARED)) {
656       if (src->flags & IR3_REG_HALF) {
657          rb_tree_remove(&ctx->half_live_intervals, &interval->half_node);
658          rb_tree_insert(&ctx->half_live_intervals, &interval->half_node,
659                         ra_spill_interval_half_cmp);
660       }
661       if (ctx->merged_regs || !(src->flags & IR3_REG_HALF)) {
662          rb_tree_remove(&ctx->full_live_intervals, &interval->node);
663          rb_tree_insert(&ctx->full_live_intervals, &interval->node,
664                         ra_spill_interval_cmp);
665       }
666    }
667 }
668 
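/* Assign (or look up) the pvtmem byte offset for a value. Values in the same
 * merge set share a single slot region so that split/collect'd components end
 * up at consistent offsets. reg_size()/merge-set offsets count 16-bit
 * registers, hence the "* 2" conversions to bytes; e.g. a full-precision vec4
 * (reg_size() == 8) would take 16 bytes of pvtmem.
 */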
669 static unsigned
670 get_spill_slot(struct ra_spill_ctx *ctx, struct ir3_register *reg)
671 {
672    if (reg->merge_set) {
673       if (reg->merge_set->spill_slot == ~0) {
674          reg->merge_set->spill_slot = ALIGN_POT(ctx->spill_slot,
675                                                 reg->merge_set->alignment * 2);
676          ctx->spill_slot = reg->merge_set->spill_slot + reg->merge_set->size * 2;
677       }
678       return reg->merge_set->spill_slot + reg->merge_set_offset * 2;
679    } else {
680       if (reg->spill_slot == ~0) {
681          reg->spill_slot = ALIGN_POT(ctx->spill_slot, reg_elem_size(reg) * 2);
682          ctx->spill_slot = reg->spill_slot + reg_size(reg) * 2;
683       }
684       return reg->spill_slot;
685    }
686 }
687 
688 static void
689 set_src_val(struct ir3_register *src, const struct reg_or_immed *val)
690 {
691    if (val->flags & IR3_REG_IMMED) {
692       src->flags = IR3_REG_IMMED | (val->flags & IR3_REG_HALF);
693       src->uim_val = val->uimm;
694       src->def = NULL;
695    } else if (val->flags & IR3_REG_CONST) {
696       src->flags = IR3_REG_CONST | (val->flags & IR3_REG_HALF);
697       src->num = val->const_num;
698       src->def = NULL;
699    } else {
700       src->def = val->def;
701       val->def->instr->flags &= ~IR3_INSTR_UNUSED;
702    }
703 }
704 
705 static struct ir3_register *
706 materialize_pcopy_src(const struct reg_or_immed *src,
707                       struct ir3_builder *builder)
708 {
709    struct ir3_instruction *mov = ir3_build_instr(builder, OPC_MOV, 1, 1);
710    struct ir3_register *dst = __ssa_dst(mov);
711    dst->flags |= src->flags & IR3_REG_HALF;
712    struct ir3_register *mov_src = ir3_src_create(mov, INVALID_REG, src->flags);
713    set_src_val(mov_src, src);
714    mov->cat1.src_type = mov->cat1.dst_type =
715       (src->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;
716    return dst;
717 }
718 
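/* Emit a store of "val" to pvtmem at byte offset spill_slot, using
 * OPC_SPILL_MACRO relative to the reserved zero "base" register. An
 * immediate/const pcopy source is first materialized into a register with a
 * mov, since otherwise there is nothing to store.
 */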
719 static void
720 spill(struct ra_spill_ctx *ctx, const struct reg_or_immed *val,
721       unsigned spill_slot, struct ir3_cursor cursor)
722 {
723    struct ir3_register *reg;
724    struct ir3_builder builder = ir3_builder_at(cursor);
725 
726    /* If spilling an immed/const pcopy src, we need to actually materialize it
727     * first with a mov.
728     */
729    if (val->flags & (IR3_REG_CONST | IR3_REG_IMMED)) {
730       reg = materialize_pcopy_src(val, &builder);
731    } else {
732       reg = val->def;
733       reg->instr->flags &= ~IR3_INSTR_UNUSED;
734    }
735 
736    d("spilling ssa_%u:%u to %u", reg->instr->serialno, reg->name,
737      spill_slot);
738 
739    unsigned elems = reg_elems(reg);
740    struct ir3_instruction *spill =
741       ir3_build_instr(&builder, OPC_SPILL_MACRO, 0, 3);
742    ir3_src_create(spill, INVALID_REG, ctx->base_reg->flags)->def = ctx->base_reg;
743    unsigned src_flags = reg->flags & (IR3_REG_HALF | IR3_REG_IMMED |
744                                       IR3_REG_CONST | IR3_REG_SSA |
745                                       IR3_REG_ARRAY);
746    struct ir3_register *src = ir3_src_create(spill, INVALID_REG, src_flags);
747    ir3_src_create(spill, INVALID_REG, IR3_REG_IMMED)->uim_val = elems;
748    spill->cat6.dst_offset = spill_slot;
749    spill->cat6.type = (reg->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;
750 
751    src->def = reg;
752    if (reg->flags & IR3_REG_ARRAY) {
753       src->size = reg->size;
754       src->array.id = reg->array.id;
755       src->array.offset = 0;
756    } else {
757       src->wrmask = reg->wrmask;
758    }
759 }
760 
761 static void
762 spill_interval(struct ra_spill_ctx *ctx, struct ra_spill_interval *interval,
763                struct ir3_cursor cursor)
764 {
765    if (interval->can_rematerialize && !interval->interval.reg->merge_set)
766       return;
767 
768    spill(ctx, &interval->dst, get_spill_slot(ctx, interval->interval.reg),
769          cursor);
770 }
771 
772 /* This is similar to "limit" in the paper. */
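/* While the current half/full pressure exceeds the limit computed by RA, walk
 * the corresponding live-interval tree in its spill-priority order and, for
 * each interval not marked cant_spill, spill it (unless it was already
 * spilled or can be cheaply rematerialized) and remove it from the live set,
 * until the pressure fits again.
 */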
773 static void
774 limit(struct ra_spill_ctx *ctx, struct ir3_cursor cursor)
775 {
776    if (ctx->cur_pressure.half > ctx->limit_pressure.half) {
777       d("cur half pressure %u exceeds %u", ctx->cur_pressure.half,
778         ctx->limit_pressure.half);
779       rb_tree_foreach_safe (struct ra_spill_interval, interval,
780                             &ctx->half_live_intervals, half_node) {
781          d("trying ssa_%u:%u", interval->interval.reg->instr->serialno,
782            interval->interval.reg->name);
783          if (!interval->cant_spill) {
784             if (!interval->already_spilled)
785                spill_interval(ctx, interval, cursor);
786             ir3_reg_interval_remove_all(&ctx->reg_ctx, &interval->interval);
787             if (ctx->cur_pressure.half <= ctx->limit_pressure.half)
788                break;
789          }
790       }
791 
792       assert(ctx->cur_pressure.half <= ctx->limit_pressure.half);
793    }
794 
795    if (ctx->cur_pressure.full > ctx->limit_pressure.full) {
796       d("cur full pressure %u exceeds %u", ctx->cur_pressure.full,
797         ctx->limit_pressure.full);
798       rb_tree_foreach_safe (struct ra_spill_interval, interval,
799                             &ctx->full_live_intervals, node) {
800          d("trying ssa_%u:%u", interval->interval.reg->instr->serialno,
801            interval->interval.reg->name);
802          if (!interval->cant_spill) {
803             if (!interval->already_spilled)
804                spill_interval(ctx, interval, cursor);
805             ir3_reg_interval_remove_all(&ctx->reg_ctx, &interval->interval);
806             if (ctx->cur_pressure.full <= ctx->limit_pressure.full)
807                break;
808          } else {
809             d("can't spill");
810          }
811       }
812 
813       assert(ctx->cur_pressure.full <= ctx->limit_pressure.full);
814    }
815 }
816 
817 /* There's a corner case where we reload a value which has overlapping live
818  * values already reloaded, either because it's the child of some other interval
819  * that was already reloaded or some of its children have already been
820  * reloaded. Because RA only expects overlapping source/dest intervals for meta
821  * instructions (split/collect), and we don't want to add register pressure by
822  * creating an entirely separate value, we need to add splits and collects to
823  * deal with this case. These splits/collects have to also have correct merge
824  * set information, so that it doesn't result in any actual code or register
825  * pressure in practice.
826  */
827 
828 static void
829 add_to_merge_set(struct ir3_merge_set *set, struct ir3_register *def,
830                  unsigned offset)
831 {
832    def->merge_set = set;
833    def->merge_set_offset = offset;
834    def->interval_start = set->interval_start + offset;
835    def->interval_end = set->interval_start + offset + reg_size(def);
836 }
837 
838 static struct ir3_register *
839 split(struct ir3_register *def, unsigned offset, struct ir3_builder *builder)
840 {
841    if (reg_elems(def) == 1) {
842       assert(offset == 0);
843       return def;
844    }
845 
846    assert(!(def->flags & IR3_REG_ARRAY));
847    assert(def->merge_set);
848    struct ir3_instruction *split =
849       ir3_build_instr(builder, OPC_META_SPLIT, 1, 1);
850    split->split.off = offset;
851    struct ir3_register *dst = __ssa_dst(split);
852    dst->flags |= def->flags & IR3_REG_HALF;
853    struct ir3_register *src = ir3_src_create(split, INVALID_REG, def->flags);
854    src->wrmask = def->wrmask;
855    src->def = def;
856    add_to_merge_set(def->merge_set, dst,
857                     def->merge_set_offset + offset * reg_elem_size(def));
858    return dst;
859 }
860 
861 static struct ir3_register *
862 extract(struct ir3_register *parent_def, unsigned offset, unsigned elems,
863         struct ir3_cursor cursor)
864 {
865    if (offset == 0 && elems == reg_elems(parent_def))
866       return parent_def;
867 
868    struct ir3_builder builder = ir3_builder_at(cursor);
869    struct ir3_register *srcs[elems];
870    for (unsigned i = 0; i < elems; i++) {
871       srcs[i] = split(parent_def, offset + i, &builder);
872    }
873 
874    struct ir3_instruction *collect =
875       ir3_build_instr(&builder, OPC_META_COLLECT, 1, elems);
876    struct ir3_register *dst = __ssa_dst(collect);
877    dst->flags |= parent_def->flags & IR3_REG_HALF;
878    dst->wrmask = MASK(elems);
879    add_to_merge_set(parent_def->merge_set, dst, parent_def->merge_set_offset);
880 
881    for (unsigned i = 0; i < elems; i++) {
882       ir3_src_create(collect, INVALID_REG, parent_def->flags)->def = srcs[i];
883    }
884 
885    return dst;
886 }
887 
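/* Emit an OPC_RELOAD_MACRO that loads the value back from pvtmem at its spill
 * slot into a fresh SSA def. The new def inherits the original's merge set
 * and interval information so that RA can still coalesce it with overlapping
 * splits/collects.
 */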
888 static struct ir3_register *
889 reload(struct ra_spill_ctx *ctx, struct ir3_register *reg,
890        struct ir3_cursor cursor)
891 {
892    unsigned spill_slot = get_spill_slot(ctx, reg);
893 
894    d("reloading ssa_%u:%u from %u", reg->instr->serialno, reg->name,
895      spill_slot);
896 
897    unsigned elems = reg_elems(reg);
898    struct ir3_instruction *reload =
899       ir3_instr_create_at(cursor, OPC_RELOAD_MACRO, 1, 3);
900    struct ir3_register *dst = __ssa_dst(reload);
901    dst->flags |= reg->flags & (IR3_REG_HALF | IR3_REG_ARRAY);
902    /* The reload may be split into multiple pieces, and if the destination
903     * overlaps with the base register then it could get clobbered before the
904     * last ldp in the sequence. Note that we always reserve space for the base
905     * register throughout the whole program, so effectively extending its live
906     * range past the end of the instruction isn't a problem for our pressure
907     * accounting.
908     */
909    dst->flags |= IR3_REG_EARLY_CLOBBER;
910    ir3_src_create(reload, INVALID_REG, ctx->base_reg->flags)->def = ctx->base_reg;
911    struct ir3_register *offset_reg =
912       ir3_src_create(reload, INVALID_REG, IR3_REG_IMMED);
913    offset_reg->uim_val = spill_slot;
914    ir3_src_create(reload, INVALID_REG, IR3_REG_IMMED)->uim_val = elems;
915    reload->cat6.type = (reg->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;
916 
917    if (reg->flags & IR3_REG_ARRAY) {
918       dst->array.offset = 0;
919       dst->array.id = reg->array.id;
920       dst->size = reg->size;
921    } else {
922       dst->wrmask = reg->wrmask;
923    }
924 
925    dst->merge_set = reg->merge_set;
926    dst->merge_set_offset = reg->merge_set_offset;
927    dst->interval_start = reg->interval_start;
928    dst->interval_end = reg->interval_end;
929    return dst;
930 }
931 
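/* After a value has been reloaded (or rematerialized), point its interval and
 * all already-inserted child intervals at the new def, creating extracts so
 * that each child gets a def of the right size and offset. Later uses are
 * then rewritten from these via rewrite_src().
 */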
932 static void
933 rewrite_src_interval(struct ra_spill_ctx *ctx,
934                     struct ra_spill_interval *interval,
935                     struct ir3_register *def,
936                     struct ir3_cursor cursor)
937 {
938    interval->dst.flags = def->flags;
939    interval->dst.def = def;
940    interval->needs_reload = false;
941 
942    rb_tree_foreach (struct ra_spill_interval, child,
943                     &interval->interval.children, interval.node) {
944       struct ir3_register *child_reg = child->interval.reg;
945       struct ir3_register *child_def =
946          extract(def, (child_reg->interval_start -
947                        interval->interval.reg->interval_start) / reg_elem_size(def),
948                  reg_elems(child_reg), cursor);
949       rewrite_src_interval(ctx, child, child_def, cursor);
950    }
951 }
952 
953 static void
954 reload_def(struct ra_spill_ctx *ctx, struct ir3_register *def,
955            struct ir3_cursor cursor)
956 {
957    unsigned elems = reg_elems(def);
958    struct ra_spill_interval *interval = ctx->intervals[def->name];
959 
960    struct ir3_reg_interval *ir3_parent = interval->interval.parent;
961 
962    if (ir3_parent) {
963       struct ra_spill_interval *parent =
964          ir3_reg_interval_to_interval(ir3_parent);
965       if (!parent->needs_reload) {
966          interval->dst.flags = def->flags;
967          interval->dst.def = extract(
968             parent->dst.def, (def->interval_start - parent->dst.def->interval_start) /
969             reg_elem_size(def), elems, cursor);
970          return;
971       }
972    }
973 
974    struct ir3_register *dst;
975    if (interval->can_rematerialize)
976       dst = rematerialize(def, cursor);
977    else
978       dst = reload(ctx, def, cursor);
979 
980    rewrite_src_interval(ctx, interval, dst, cursor);
981 }
982 
983 static void
984 reload_src(struct ra_spill_ctx *ctx, struct ir3_cursor cursor,
985             struct ir3_register *src)
986 {
987    struct ra_spill_interval *interval = ctx->intervals[src->def->name];
988 
989    if (interval->needs_reload) {
990       reload_def(ctx, src->def, cursor);
991    }
992 
993    ra_spill_interval_root(interval)->cant_spill = false;
994 }
995 
996 static void
997 rewrite_src(struct ra_spill_ctx *ctx, struct ir3_instruction *instr,
998             struct ir3_register *src)
999 {
1000    struct ra_spill_interval *interval = ctx->intervals[src->def->name];
1001 
1002    set_src_val(src, &interval->dst);
1003 }
1004 
1005 static void
1006 update_max_pressure(struct ra_spill_ctx *ctx)
1007 {
1008    d("pressure:");
1009    d("\tfull: %u", ctx->cur_pressure.full);
1010    d("\thalf: %u", ctx->cur_pressure.half);
1011    d("\tshared: %u", ctx->cur_pressure.shared);
1012    d("\tshared half: %u", ctx->cur_pressure.shared_half);
1013 
1014    ctx->max_pressure.full =
1015       MAX2(ctx->max_pressure.full, ctx->cur_pressure.full);
1016    ctx->max_pressure.half =
1017       MAX2(ctx->max_pressure.half, ctx->cur_pressure.half);
1018    ctx->max_pressure.shared =
1019       MAX2(ctx->max_pressure.shared, ctx->cur_pressure.shared);
1020    ctx->max_pressure.shared_half =
1021       MAX2(ctx->max_pressure.shared_half, ctx->cur_pressure.shared_half);
1022 }
1023 
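/* Process one ordinary instruction. The ordering matters: sources are added
 * to the live set (and marked unspillable) before limit() makes room, then
 * reloaded and their next-use distances updated; killed sources are removed
 * early so destinations can reuse their space, destinations are inserted and
 * limit() runs again, and only then are sources rewritten to whatever value
 * they are currently mapped to. When not spilling, the same walk just records
 * the maximum register pressure.
 */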
1024 static void
1025 handle_instr(struct ra_spill_ctx *ctx, struct ir3_instruction *instr)
1026 {
1027    ra_foreach_dst (dst, instr) {
1028       /* No need to handle instructions inserted while spilling. Most are
1029        * ignored automatically by virtue of being inserted before the current
1030        * instruction. However, for live-ins, we may insert extracts after the
1031        * phis. Trying to handle them ends badly as they don't have intervals
1032        * allocated.
1033        * Note: since liveness is calculated before spilling, original
1034        * instructions have a name while new ones don't.
1035        */
1036       if (!dst->name)
1037          return;
1038 
1039       init_dst(ctx, dst);
1040    }
1041 
1042    if (ctx->spilling) {
1043       ra_foreach_src (src, instr)
1044          insert_src(ctx, src);
1045    }
1046 
1047    /* Handle tied and early-kill destinations. If a destination is tied to a
1048     * source and that source is live-through, then we need to allocate a new
1049     * register for the destination which is live-through itself and cannot
1050     * overlap the sources. Similarly early-kill destinations cannot overlap
1051     * sources.
1052     */
1053 
1054    ra_foreach_dst (dst, instr) {
1055       struct ir3_register *tied_src = dst->tied;
1056       if ((tied_src && !(tied_src->flags & IR3_REG_FIRST_KILL)) ||
1057           (dst->flags & IR3_REG_EARLY_CLOBBER))
1058          insert_dst(ctx, dst);
1059    }
1060 
1061    if (ctx->spilling)
1062       limit(ctx, ir3_before_instr(instr));
1063    else
1064       update_max_pressure(ctx);
1065 
1066    if (ctx->spilling) {
1067       ra_foreach_src (src, instr) {
1068          reload_src(ctx, ir3_before_instr(instr), src);
1069          update_src_next_use(ctx, src);
1070       }
1071    }
1072 
1073    ra_foreach_src (src, instr) {
1074       if (src->flags & IR3_REG_FIRST_KILL)
1075          remove_src_early(ctx, instr, src);
1076    }
1077 
1078    ra_foreach_dst (dst, instr) {
1079       insert_dst(ctx, dst);
1080    }
1081 
1082    if (ctx->spilling)
1083       limit(ctx, ir3_before_instr(instr));
1084    else
1085       update_max_pressure(ctx);
1086 
1087    /* We have to remove sources before rewriting them so that we can lookup the
1088     * interval to remove before the source itself is changed.
1089     */
1090    ra_foreach_src (src, instr) {
1091       if (src->flags & IR3_REG_FIRST_KILL)
1092          remove_src(ctx, instr, src);
1093    }
1094 
1095    if (ctx->spilling) {
1096       ra_foreach_src (src, instr) {
1097          rewrite_src(ctx, instr, src);
1098       }
1099    }
1100 
1101    ra_foreach_dst (dst, instr) {
1102       finish_dst(ctx, dst);
1103    }
1104 
1105    for (unsigned i = 0; i < instr->dsts_count; i++) {
1106       if (ra_reg_is_dst(instr->dsts[i]) &&
1107           (instr->dsts[i]->flags & IR3_REG_UNUSED))
1108          remove_dst(ctx, instr->dsts[i]);
1109    }
1110 }
1111 
1112 static struct ra_spill_interval *
1113 create_temp_interval(struct ra_spill_ctx *ctx, struct ir3_register *def)
1114 {
1115    unsigned name = ctx->intervals_count++;
1116    unsigned offset = ctx->live->interval_offset;
1117 
1118    /* This is kinda hacky, but we need to create a fake SSA def here that is
1119     * only used as part of the pcopy accounting. See below.
1120     */
1121    struct ir3_register *reg = rzalloc(ctx, struct ir3_register);
1122    *reg = *def;
1123    reg->name = name;
1124    reg->interval_start = offset;
1125    reg->interval_end = offset + reg_size(def);
1126    reg->merge_set = NULL;
1127 
1128    ctx->intervals = reralloc(ctx, ctx->intervals, struct ra_spill_interval *,
1129                              ctx->intervals_count);
1130    struct ra_spill_interval *interval = rzalloc(ctx, struct ra_spill_interval);
1131    ra_spill_interval_init(interval, reg);
1132    ctx->intervals[name] = interval;
1133    ctx->live->interval_offset += reg_size(def);
1134    return interval;
1135 }
1136 
1137 /* In the sequence of copies generated (see below), would this source be killed?
1138  */
1139 static bool
1140 is_last_pcopy_src(struct ir3_instruction *pcopy, unsigned src_n)
1141 {
1142    struct ir3_register *src = pcopy->srcs[src_n];
1143    if (!(src->flags & IR3_REG_KILL))
1144       return false;
1145    for (unsigned j = src_n + 1; j < pcopy->srcs_count; j++) {
1146       if (pcopy->srcs[j]->def == src->def)
1147          return false;
1148    }
1149    return true;
1150 }
1151 
1152 /* Parallel copies are different from normal instructions. The sources together
1153  * may be larger than the entire register file, so we cannot just reload every
1154  * source like normal, and indeed that probably wouldn't be a great idea.
1155  * Instead we essentially need to lower the parallel copy to "copies," just like
1156  * in the normal CSSA construction, although we implement the copies by
1157  * reloading and then possibly spilling values. We essentially just shuffle
1158  * around the sources until each source either (a) is live or (b) has the same
1159  * spill slot as its corresponding destination. We do this by decomposing the
1160  * copy into a series of copies, so:
1161  *
1162  * a, b, c = d, e, f
1163  *
1164  * becomes:
1165  *
1166  * d' = d
1167  * e' = e
1168  * f' = f
1169  * a = d'
1170  * b = e'
1171  * c = f'
1172  *
1173  * the temporary SSA values d', e', and f' never actually show up in the result.
1174  * They are only used for our internal accounting. They may, however, have their
1175  * own spill slot created for them. Similarly, we don't actually emit any copy
1176  * instructions, although we emit the spills/reloads that *would've* been
1177  * required if those copies were there.
1178  *
1179  * TODO: in order to reduce the number of temporaries and therefore spill slots,
1180  * we could instead do a more complicated analysis that considers the location
1181  * transfer graph.
1182  *
1183  * In addition, we actually remove the parallel copy and rewrite all its uses
1184  * (in the phi nodes) rather than rewrite its sources at the end. Recreating it
1185  * later turns out to be easier than keeping it up-to-date throughout this pass,
1186  * since we may have to remove entries for phi sources that are spilled and add
1187  * entries for live-outs that are spilled and reloaded, which can happen here
1188  * and then possibly be undone or done again when processing live-ins of the
1189  * successor block.
1190  */
1191 
1192 static void
1193 handle_pcopy(struct ra_spill_ctx *ctx, struct ir3_instruction *pcopy)
1194 {
1195    ra_foreach_dst (dst, pcopy) {
1196       struct ra_spill_interval *dst_interval = ctx->intervals[dst->name];
1197       ra_spill_interval_init(dst_interval, dst);
1198    }
1199 
1200    foreach_src_n (src, i, pcopy) {
1201       struct ir3_register *dst = pcopy->dsts[i];
1202       if (!(dst->flags & IR3_REG_SSA))
1203          continue;
1204 
1205       d("processing src %u", i);
1206 
1207       /* Skip the intermediate copy for cases where the source is merged with
1208        * the destination. Crucially this means that we also don't reload/spill
1209        * it if it's been spilled, because it shares the same spill slot.
1210        */
1211       if ((src->flags & IR3_REG_SSA) && src->def->merge_set &&
1212           src->def->merge_set == dst->merge_set &&
1213           src->def->merge_set_offset == dst->merge_set_offset) {
1214          struct ra_spill_interval *src_interval = ctx->intervals[src->def->name];
1215          struct ra_spill_interval *dst_interval = ctx->intervals[dst->name];
1216          if (src_interval->interval.inserted) {
1217             update_src_next_use(ctx, src);
1218             if (is_last_pcopy_src(pcopy, i))
1219                ra_spill_ctx_remove(ctx, src_interval);
1220             dst_interval->cant_spill = true;
1221             ra_spill_ctx_insert(ctx, dst_interval);
1222             limit(ctx, ir3_before_instr(pcopy));
1223             dst_interval->cant_spill = false;
1224             dst_interval->dst = src_interval->dst;
1225          }
1226       } else if (src->flags & IR3_REG_SSA) {
1227          struct ra_spill_interval *temp_interval =
1228             create_temp_interval(ctx, dst);
1229          struct ir3_register *temp = temp_interval->interval.reg;
1230          temp_interval->next_use_distance = src->next_use;
1231 
1232          insert_src(ctx, src);
1233          limit(ctx, ir3_before_instr(pcopy));
1234          reload_src(ctx, ir3_before_instr(pcopy), src);
1235          update_src_next_use(ctx, src);
1236          if (is_last_pcopy_src(pcopy, i))
1237             remove_src(ctx, pcopy, src);
1238          struct ra_spill_interval *src_interval =
1239             ctx->intervals[src->def->name];
1240          temp_interval->dst = src_interval->dst;
1241 
1242          temp_interval->cant_spill = true;
1243          ra_spill_ctx_insert(ctx, temp_interval);
1244          limit(ctx, ir3_before_instr(pcopy));
1245          temp_interval->cant_spill = false;
1246 
1247          src->flags = temp->flags;
1248          src->def = temp;
1249       }
1250    }
1251 
1252    d("done with pcopy srcs");
1253 
1254    foreach_src_n (src, i, pcopy) {
1255       struct ir3_register *dst = pcopy->dsts[i];
1256       if (!(dst->flags & IR3_REG_SSA))
1257          continue;
1258 
1259       if ((src->flags & IR3_REG_SSA) && src->def->merge_set &&
1260           src->def->merge_set == dst->merge_set &&
1261           src->def->merge_set_offset == dst->merge_set_offset)
1262          continue;
1263 
1264       struct ra_spill_interval *dst_interval = ctx->intervals[dst->name];
1265 
1266       if (!(src->flags & IR3_REG_SSA)) {
1267          dst_interval->cant_spill = true;
1268          ra_spill_ctx_insert(ctx, dst_interval);
1269          limit(ctx, ir3_before_instr(pcopy));
1270          dst_interval->cant_spill = false;
1271 
1272          assert(src->flags & (IR3_REG_CONST | IR3_REG_IMMED));
1273          if (src->flags & IR3_REG_CONST) {
1274             dst_interval->dst.flags = src->flags;
1275             dst_interval->dst.const_num = src->num;
1276          } else {
1277             dst_interval->dst.flags = src->flags;
1278             dst_interval->dst.uimm = src->uim_val;
1279          }
1280       } else {
1281          struct ra_spill_interval *temp_interval = ctx->intervals[src->def->name];
1282 
1283          insert_src(ctx, src);
1284          limit(ctx, ir3_before_instr(pcopy));
1285          reload_src(ctx, ir3_before_instr(pcopy), src);
1286          remove_src(ctx, pcopy, src);
1287 
1288          dst_interval->dst = temp_interval->dst;
1289          ra_spill_ctx_insert(ctx, dst_interval);
1290       }
1291    }
1292 
1293    pcopy->flags |= IR3_INSTR_UNUSED;
1294 }
1295 
1296 static void
1297 handle_input_phi(struct ra_spill_ctx *ctx, struct ir3_instruction *instr)
1298 {
1299    if (!(instr->dsts[0]->flags & IR3_REG_SSA))
1300       return;
1301 
1302    init_dst(ctx, instr->dsts[0]);
1303    insert_dst(ctx, instr->dsts[0]);
1304    finish_dst(ctx, instr->dsts[0]);
1305 }
1306 
1307 static void
1308 remove_input_phi(struct ra_spill_ctx *ctx, struct ir3_instruction *instr)
1309 {
1310    if (!(instr->dsts[0]->flags & IR3_REG_SSA))
1311       return;
1312 
1313    if (instr->opc == OPC_META_TEX_PREFETCH) {
1314       ra_foreach_src (src, instr) {
1315          if (src->flags & IR3_REG_FIRST_KILL)
1316             remove_src(ctx, instr, src);
1317       }
1318    }
1319    if (instr->dsts[0]->flags & IR3_REG_UNUSED)
1320       remove_dst(ctx, instr->dsts[0]);
1321 }
1322 
1323 static void
1324 handle_live_in(struct ra_spill_ctx *ctx, struct ir3_block *block,
1325                struct ir3_register *def)
1326 {
1327    struct ra_spill_interval *interval = ctx->intervals[def->name];
1328    ra_spill_interval_init(interval, def);
1329    if (ctx->spilling) {
1330       interval->next_use_distance =
1331          ctx->blocks[block->index].next_use_start[def->name];
1332    }
1333 
1334    ra_spill_ctx_insert(ctx, interval);
1335 }
1336 
1337 static bool
1338 is_live_in_phi(struct ir3_register *def, struct ir3_block *block)
1339 {
1340    return def->instr->opc == OPC_META_PHI && def->instr->block == block;
1341 }
1342 
1343 static bool
1344 is_live_in_pred(struct ra_spill_ctx *ctx, struct ir3_register *def,
1345                 struct ir3_block *block, unsigned pred_idx)
1346 {
1347    struct ir3_block *pred = block->predecessors[pred_idx];
1348    struct ra_spill_block_state *state = &ctx->blocks[pred->index];
1349    if (is_live_in_phi(def, block)) {
1350       def = def->instr->srcs[pred_idx]->def;
1351       if (!def)
1352          return false;
1353    }
1354 
1355    return _mesa_hash_table_search(state->remap, def);
1356 }
1357 
1358 static bool
1359 is_live_in_undef(struct ir3_register *def,
1360                  struct ir3_block *block, unsigned pred_idx)
1361 {
1362    if (!is_live_in_phi(def, block))
1363       return false;
1364 
1365    return !def->instr->srcs[pred_idx]->def;
1366 }
1367 
1368 static struct reg_or_immed *
1369 read_live_in(struct ra_spill_ctx *ctx, struct ir3_register *def,
1370              struct ir3_block *block, unsigned pred_idx)
1371 {
1372    struct ir3_block *pred = block->predecessors[pred_idx];
1373    struct ra_spill_block_state *state = &ctx->blocks[pred->index];
1374 
1375    if (is_live_in_phi(def, block)) {
1376       def = def->instr->srcs[pred_idx]->def;
1377       if (!def)
1378          return NULL;
1379    }
1380 
1381    struct hash_entry *entry = _mesa_hash_table_search(state->remap, def);
1382    if (entry)
1383       return entry->data;
1384    else
1385       return NULL;
1386 }
1387 
1388 static bool
1389 is_live_in_all_preds(struct ra_spill_ctx *ctx, struct ir3_register *def,
1390                      struct ir3_block *block)
1391 {
1392    for (unsigned i = 0; i < block->predecessors_count; i++) {
1393       if (!is_live_in_pred(ctx, def, block, i))
1394          return false;
1395    }
1396 
1397    return true;
1398 }
1399 
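/* Store a live-in value at the end of every already-visited predecessor,
 * using whatever def/immediate the predecessor's remap table says the value
 * ended up in. Predecessors that haven't been visited yet (loop backedges)
 * are skipped here; their recorded live-out set is reconciled when they are
 * processed.
 */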
1400 static void
1401 spill_live_in(struct ra_spill_ctx *ctx, struct ir3_register *def,
1402               struct ir3_block *block)
1403 {
1404    for (unsigned i = 0; i < block->predecessors_count; i++) {
1405       struct ir3_block *pred = block->predecessors[i];
1406       struct ra_spill_block_state *state = &ctx->blocks[pred->index];
1407 
1408       if (!state->visited)
1409          continue;
1410 
1411       struct reg_or_immed *pred_def = read_live_in(ctx, def, block, i);
1412       if (pred_def) {
1413          spill(ctx, pred_def, get_spill_slot(ctx, def),
1414                ir3_before_terminator(pred));
1415       }
1416    }
1417 }
1418 
1419 static void
1420 spill_live_ins(struct ra_spill_ctx *ctx, struct ir3_block *block)
1421 {
1422    bool all_preds_visited = true;
1423    for (unsigned i = 0; i < block->predecessors_count; i++) {
1424       struct ir3_block *pred = block->predecessors[i];
1425       struct ra_spill_block_state *state = &ctx->blocks[pred->index];
1426       if (!state->visited) {
1427          all_preds_visited = false;
1428          break;
1429       }
1430    }
1431 
1432    /* Note: in the paper they explicitly spill live-through values first, but we
1433     * should get that automatically by picking the values with the largest
1434     * next-use distance, thanks to the extra distance added to edges out of loops.
1435     *
1436     * TODO: Keep track of pressure in each block and preemptively spill
1437     * live-through values as described in the paper to avoid spilling them
1438     * inside the loop.
1439     */
1440 
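   /* Evict live-ins until both the half and full pressure fit under the limit.
    * When every predecessor has been visited, values live at the end of all of
    * them are kept, preferring to evict values that would need a reload on
    * some incoming edge anyway. Rematerializable half values outside a merge
    * set skip the store and are simply recreated at their reload points.
    */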
1441    if (ctx->cur_pressure.half > ctx->limit_pressure.half) {
1442       rb_tree_foreach_safe (struct ra_spill_interval, interval,
1443                             &ctx->half_live_intervals, half_node) {
1444          if (all_preds_visited &&
1445              is_live_in_all_preds(ctx, interval->interval.reg, block))
1446             continue;
1447          if (interval->interval.reg->merge_set ||
1448              !interval->can_rematerialize)
1449             spill_live_in(ctx, interval->interval.reg, block);
1450          ir3_reg_interval_remove_all(&ctx->reg_ctx, &interval->interval);
1451          if (ctx->cur_pressure.half <= ctx->limit_pressure.half)
1452             break;
1453       }
1454    }
1455 
1456    if (ctx->cur_pressure.full > ctx->limit_pressure.full) {
1457       rb_tree_foreach_safe (struct ra_spill_interval, interval,
1458                             &ctx->full_live_intervals, node) {
1459          if (all_preds_visited &&
1460              is_live_in_all_preds(ctx, interval->interval.reg, block))
1461             continue;
1462          spill_live_in(ctx, interval->interval.reg, block);
1463          ir3_reg_interval_remove_all(&ctx->reg_ctx, &interval->interval);
1464          if (ctx->cur_pressure.full <= ctx->limit_pressure.full)
1465             break;
1466       }
1467    }
1468 }
1469 
1470 static void
1471 live_in_rewrite(struct ra_spill_ctx *ctx,
1472                 struct ra_spill_interval *interval,
1473                 struct reg_or_immed *new_val,
1474                 struct ir3_block *block, unsigned pred_idx)
1475 {
1476    struct ir3_block *pred = block->predecessors[pred_idx];
1477    struct ra_spill_block_state *state = &ctx->blocks[pred->index];
1478    struct ir3_register *def = interval->interval.reg;
1479    if (is_live_in_phi(def, block)) {
1480       def = def->instr->srcs[pred_idx]->def;
1481    }
1482 
1483    if (def)
1484       _mesa_hash_table_insert(state->remap, def, new_val);
1485 }
1486 
1487 static void
1488 reload_live_in(struct ra_spill_ctx *ctx, struct ir3_register *def,
1489                struct ir3_block *block)
1490 {
1491    struct ra_spill_interval *interval = ctx->intervals[def->name];
1492    for (unsigned i = 0; i < block->predecessors_count; i++) {
1493       struct ir3_block *pred = block->predecessors[i];
1494       struct ra_spill_block_state *state = &ctx->blocks[pred->index];
1495       if (!state->visited)
1496          continue;
1497 
1498       if (is_live_in_undef(def, block, i))
1499          continue;
1500 
1501       struct reg_or_immed *new_val = read_live_in(ctx, def, block, i);
1502 
1503       if (!new_val) {
1504          new_val = ralloc(ctx, struct reg_or_immed);
1505          if (interval->can_rematerialize)
1506             new_val->def = rematerialize(def, ir3_before_terminator(pred));
1507          else
1508             new_val->def = reload(ctx, def, ir3_before_terminator(pred));
1509          new_val->flags = new_val->def->flags;
1510       }
1511       live_in_rewrite(ctx, interval, new_val, block, i);
1512    }
1513 }
1514 
1515 static void
1516 reload_live_ins(struct ra_spill_ctx *ctx, struct ir3_block *block)
1517 {
1518    rb_tree_foreach (struct ra_spill_interval, interval, &ctx->reg_ctx.intervals,
1519                     interval.node) {
1520       reload_live_in(ctx, interval->interval.reg, block);
1521    }
1522 }
1523 
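/* A live-in that stayed in registers may be represented by different SSA
 * values (or immediates/consts) in different predecessors after spilling and
 * reloading. If every visited predecessor agrees on a single def, reuse it and
 * recurse into the child intervals; otherwise build a new phi from the
 * per-predecessor remapped values. Sources for unvisited predecessors (loop
 * back edges) keep the original def as a placeholder.
 */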
1524 static void
1525 add_live_in_phi(struct ra_spill_ctx *ctx, struct ir3_register *def,
1526                 struct ir3_register *parent_def, struct ir3_block *block)
1527 {
1528    struct ra_spill_interval *interval = ctx->intervals[def->name];
1529    if (!interval->interval.inserted)
1530       return;
1531 
1532    bool needs_phi = false;
1533    struct ir3_register *cur_def = NULL;
1534    for (unsigned i = 0; i < block->predecessors_count; i++) {
1535       struct ir3_block *pred = block->predecessors[i];
1536       struct ra_spill_block_state *state = &ctx->blocks[pred->index];
1537 
1538       if (!state->visited) {
1539          needs_phi = true;
1540          break;
1541       }
1542 
1543       struct hash_entry *entry =
1544          _mesa_hash_table_search(state->remap, def);
1545       assert(entry);
1546       struct reg_or_immed *pred_val = entry->data;
1547       if ((pred_val->flags & (IR3_REG_IMMED | IR3_REG_CONST)) ||
1548           !pred_val->def ||
1549           (cur_def && cur_def != pred_val->def)) {
1550          needs_phi = true;
1551          break;
1552       }
1553       cur_def = pred_val->def;
1554    }
1555 
1556    if (!needs_phi) {
1557       interval->dst.def = cur_def;
1558       interval->dst.flags = cur_def->flags;
1559 
1560       rb_tree_foreach (struct ra_spill_interval, child,
1561                        &interval->interval.children, interval.node) {
1562          add_live_in_phi(ctx, child->interval.reg, cur_def, block);
1563       }
1564 
1565       return;
1566    }
1567 
1568    if (parent_def) {
1569       /* We have a child interval that needs a phi but whose parent does not
1570        * need one (otherwise parent_def would be NULL). Just extract the child
1571        * from the parent without creating a phi for the child.
1572        */
1573       unsigned offset = (def->interval_start - parent_def->interval_start) /
1574                         reg_elem_size(def);
1575       struct ir3_register *extracted =
1576          extract(parent_def, offset, reg_elems(def), ir3_after_phis(block));
1577       rewrite_src_interval(ctx, interval, extracted,
1578                            ir3_after_instr(extracted->instr));
1579       return;
1580    }
1581 
1582    struct ir3_instruction *phi = ir3_instr_create_at(
1583       ir3_before_block(block), OPC_META_PHI, 1, block->predecessors_count);
1584    struct ir3_register *dst = __ssa_dst(phi);
1585    dst->flags |= def->flags & (IR3_REG_HALF | IR3_REG_ARRAY);
1586    dst->size = def->size;
1587    dst->wrmask = def->wrmask;
1588 
1589    dst->interval_start = def->interval_start;
1590    dst->interval_end = def->interval_end;
1591    dst->merge_set = def->merge_set;
1592    dst->merge_set_offset = def->merge_set_offset;
1593 
1594    for (unsigned i = 0; i < block->predecessors_count; i++) {
1595       struct ir3_block *pred = block->predecessors[i];
1596       struct ra_spill_block_state *state = &ctx->blocks[pred->index];
1597       struct ir3_register *src = ir3_src_create(phi, INVALID_REG, dst->flags);
1598       src->size = def->size;
1599       src->wrmask = def->wrmask;
1600 
1601       if (state->visited) {
1602          struct hash_entry *entry =
1603             _mesa_hash_table_search(state->remap, def);
1604          assert(entry);
1605          struct reg_or_immed *new_val = entry->data;
1606          set_src_val(src, new_val);
1607       } else {
1608          src->def = def;
1609       }
1610    }
1611 
1612    interval->dst.def = dst;
1613    interval->dst.flags = dst->flags;
1614    rewrite_src_interval(ctx, interval, dst, ir3_after_phis(block));
1615 }
1616 
1617 static void
1618 add_live_in_phis(struct ra_spill_ctx *ctx, struct ir3_block *block)
1619 {
1620    rb_tree_foreach (struct ra_spill_interval, interval, &ctx->reg_ctx.intervals,
1621                     interval.node) {
1622       if (BITSET_TEST(ctx->live->live_in[block->index],
1623                       interval->interval.reg->name)) {
1624          add_live_in_phi(ctx, interval->interval.reg, NULL, block);
1625       }
1626    }
1627 }
1628 
1629 /* When spilling a block with a single predecessor, the pred may have other
1630  * successors so we can't choose what's live in and we can't spill/restore
1631  * anything. Just make the inserted intervals exactly match the predecessor. If
1632  * it wasn't live in the predecessor then it must've already been spilled. Also,
1633  * there are no phi nodes and no live-ins.
1634  */
1635 static void
1636 spill_single_pred_live_in(struct ra_spill_ctx *ctx,
1637                           struct ir3_block *block)
1638 {
1639    unsigned name;
1640    BITSET_FOREACH_SET (name, ctx->live->live_in[block->index],
1641                        ctx->live->definitions_count) {
1642       struct ir3_register *reg = ctx->live->definitions[name];
1643       struct ra_spill_interval *interval = ctx->intervals[reg->name];
1644       struct reg_or_immed *val = read_live_in(ctx, reg, block, 0);
1645       if (val)
1646          interval->dst = *val;
1647       else
1648          ra_spill_ctx_remove(ctx, interval);
1649    }
1650 }
1651 
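/* Point each phi source at whatever the corresponding visited predecessor
 * remapped the value to. A phi whose destination interval is no longer live
 * here (it was spilled) is marked unused instead.
 */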
1652 static void
1653 rewrite_phi(struct ra_spill_ctx *ctx, struct ir3_instruction *phi,
1654             struct ir3_block *block)
1655 {
1656    if (!(phi->dsts[0]->flags & IR3_REG_SSA))
1657       return;
1658 
1659    if (!ctx->intervals[phi->dsts[0]->name]->interval.inserted) {
1660       phi->flags |= IR3_INSTR_UNUSED;
1661       return;
1662    }
1663 
1664    for (unsigned i = 0; i < block->predecessors_count; i++) {
1665       struct ir3_block *pred = block->predecessors[i];
1666       struct ra_spill_block_state *state = &ctx->blocks[pred->index];
1667 
1668       if (!state->visited)
1669          continue;
1670 
1671       struct ir3_register *src = phi->srcs[i];
1672       if (!src->def)
1673          continue;
1674 
1675       struct hash_entry *entry =
1676          _mesa_hash_table_search(state->remap, src->def);
1677       assert(entry);
1678       struct reg_or_immed *new_val = entry->data;
1679       set_src_val(src, new_val);
1680    }
1681 }
1682 
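/* Evict a value that the already-visited successor does not expect to be live
 * across the back edge, storing it unless it can be rematerialized and isn't
 * part of a merge set.
 */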
1683 static void
1684 spill_live_out(struct ra_spill_ctx *ctx, struct ra_spill_interval *interval,
1685                struct ir3_block *block)
1686 {
1687    struct ir3_register *def = interval->interval.reg;
1688 
1689    if (interval->interval.reg->merge_set ||
1690        !interval->can_rematerialize) {
1691       spill(ctx, &interval->dst, get_spill_slot(ctx, def),
1692             ir3_before_terminator(block));
1693    }
1694    ir3_reg_interval_remove_all(&ctx->reg_ctx, &interval->interval);
1695 }
1696 
1697 static void
1698 spill_live_outs(struct ra_spill_ctx *ctx, struct ir3_block *block)
1699 {
1700    struct ra_spill_block_state *state = &ctx->blocks[block->index];
1701    rb_tree_foreach_safe (struct ra_spill_interval, interval,
1702                          &ctx->reg_ctx.intervals, interval.node) {
1703       if (!BITSET_TEST(state->live_out, interval->interval.reg->name)) {
1704          spill_live_out(ctx, interval, block);
1705       }
1706    }
1707 }
1708 
1709 static void
1710 reload_live_out(struct ra_spill_ctx *ctx, struct ir3_register *def,
1711                 struct ir3_block *block)
1712 {
1713    struct ra_spill_interval *interval = ctx->intervals[def->name];
1714    ir3_reg_interval_insert(&ctx->reg_ctx, &interval->interval);
1715 
1716    reload_def(ctx, def, ir3_before_terminator(block));
1717 }
1718 
1719 static void
1720 reload_live_outs(struct ra_spill_ctx *ctx, struct ir3_block *block)
1721 {
1722    struct ra_spill_block_state *state = &ctx->blocks[block->index];
1723    unsigned name;
1724    BITSET_FOREACH_SET (name, state->live_out, ctx->live->definitions_count) {
1725       struct ir3_register *reg = ctx->live->definitions[name];
1726       struct ra_spill_interval *interval = ctx->intervals[name];
1727       if (!interval->interval.inserted)
1728          reload_live_out(ctx, reg, block);
1729    }
1730 }
1731 
1732 static void
1733 update_live_out_phis(struct ra_spill_ctx *ctx, struct ir3_block *block)
1734 {
1735    assert(!block->successors[1]);
1736    struct ir3_block *succ = block->successors[0];
1737    unsigned pred_idx = ir3_block_get_pred_index(succ, block);
1738 
1739    foreach_instr (instr, &succ->instr_list) {
1740       if (instr->opc != OPC_META_PHI)
1741          break;
1742 
1743       struct ir3_register *def = instr->srcs[pred_idx]->def;
1744       if (!def)
1745          continue;
1746 
1747       struct ra_spill_interval *interval = ctx->intervals[def->name];
1748       if (!interval->interval.inserted)
1749          continue;
1750       set_src_val(instr->srcs[pred_idx], &interval->dst);
1751    }
1752 }
1753 
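/* For predecessors that haven't been visited yet (loop back edges), record
 * which values we've decided should be live across the edge. When that
 * predecessor is eventually processed, spill_live_outs()/reload_live_outs()
 * use this set to make its end-of-block state match what we expect here.
 */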
1754 static void
1755 record_pred_live_out(struct ra_spill_ctx *ctx,
1756                      struct ra_spill_interval *interval,
1757                      struct ir3_block *block, unsigned pred_idx)
1758 {
1759    struct ir3_block *pred = block->predecessors[pred_idx];
1760    struct ra_spill_block_state *state = &ctx->blocks[pred->index];
1761 
1762    struct ir3_register *def = interval->interval.reg;
1763    if (is_live_in_phi(def, block)) {
1764       def = def->instr->srcs[pred_idx]->def;
1765    }
1766    BITSET_SET(state->live_out, def->name);
1767 
1768    rb_tree_foreach (struct ra_spill_interval, child,
1769                     &interval->interval.children, interval.node) {
1770       record_pred_live_out(ctx, child, block, pred_idx);
1771    }
1772 }
1773 
1774 static void
1775 record_pred_live_outs(struct ra_spill_ctx *ctx, struct ir3_block *block)
1776 {
1777    for (unsigned i = 0; i < block->predecessors_count; i++) {
1778       struct ir3_block *pred = block->predecessors[i];
1779       struct ra_spill_block_state *state = &ctx->blocks[pred->index];
1780       if (state->visited)
1781          continue;
1782 
1783       state->live_out = rzalloc_array(ctx, BITSET_WORD,
1784                                       BITSET_WORDS(ctx->live->definitions_count));
1785 
1786 
1787       rb_tree_foreach (struct ra_spill_interval, interval,
1788                        &ctx->reg_ctx.intervals, interval.node) {
1789          record_pred_live_out(ctx, interval, block, i);
1790       }
1791    }
1792 }
1793 
1794 static void
1795 record_live_out(struct ra_spill_ctx *ctx,
1796                 struct ra_spill_block_state *state,
1797                 struct ra_spill_interval *interval)
1798 {
1799    if (!(interval->dst.flags & IR3_REG_SSA) ||
1800        interval->dst.def) {
1801       struct reg_or_immed *val = ralloc(ctx, struct reg_or_immed);
1802       *val = interval->dst;
1803       _mesa_hash_table_insert(state->remap, interval->interval.reg, val);
1804    }
1805    rb_tree_foreach (struct ra_spill_interval, child,
1806                     &interval->interval.children, interval.node) {
1807       record_live_out(ctx, state, child);
1808    }
1809 }
1810 
1811 static void
1812 record_live_outs(struct ra_spill_ctx *ctx, struct ir3_block *block)
1813 {
1814    struct ra_spill_block_state *state = &ctx->blocks[block->index];
1815    state->remap = _mesa_pointer_hash_table_create(ctx);
1816 
1817    rb_tree_foreach (struct ra_spill_interval, interval, &ctx->reg_ctx.intervals,
1818                     interval.node) {
1819       record_live_out(ctx, state, interval);
1820    }
1821 }
1822 
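/* Process one block: reset the pressure and interval state, add the live-ins
 * and the leading phi/input/prefetch definitions, reconcile the block boundary
 * with the predecessors (or inherit the state directly on the
 * single-predecessor fast path), then walk the instructions. When only
 * calculating pressure, this is where the maximum gets updated instead.
 */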
1823 static void
1824 handle_block(struct ra_spill_ctx *ctx, struct ir3_block *block)
1825 {
1826    memset(&ctx->cur_pressure, 0, sizeof(ctx->cur_pressure));
1827    rb_tree_init(&ctx->reg_ctx.intervals);
1828    rb_tree_init(&ctx->full_live_intervals);
1829    rb_tree_init(&ctx->half_live_intervals);
1830 
1831    unsigned name;
1832    BITSET_FOREACH_SET (name, ctx->live->live_in[block->index],
1833                        ctx->live->definitions_count) {
1834       struct ir3_register *reg = ctx->live->definitions[name];
1835       handle_live_in(ctx, block, reg);
1836    }
1837 
1838    foreach_instr (instr, &block->instr_list) {
1839       if (instr->opc != OPC_META_PHI && instr->opc != OPC_META_INPUT &&
1840           instr->opc != OPC_META_TEX_PREFETCH)
1841          break;
1842       handle_input_phi(ctx, instr);
1843    }
1844 
1845    if (ctx->spilling) {
1846       if (block->predecessors_count == 1 &&
1847           block->predecessors[0]->successors[1]) {
1848          spill_single_pred_live_in(ctx, block);
1849       } else {
1850          spill_live_ins(ctx, block);
1851          reload_live_ins(ctx, block);
1852          record_pred_live_outs(ctx, block);
1853          foreach_instr (instr, &block->instr_list) {
1854             if (instr->opc != OPC_META_PHI)
1855                break;
1856             rewrite_phi(ctx, instr, block);
1857          }
1858          add_live_in_phis(ctx, block);
1859       }
1860    } else {
1861       update_max_pressure(ctx);
1862    }
1863 
1864    foreach_instr (instr, &block->instr_list) {
1865       di(instr, "processing");
1866 
1867       if (instr->opc == OPC_META_PHI || instr->opc == OPC_META_INPUT ||
1868           instr->opc == OPC_META_TEX_PREFETCH)
1869          remove_input_phi(ctx, instr);
1870       else if (ctx->spilling && instr->opc == OPC_META_PARALLEL_COPY)
1871          handle_pcopy(ctx, instr);
1872       else if (ctx->spilling && instr->opc == OPC_MOV &&
1873                instr->dsts[0] == ctx->base_reg)
1874          /* skip */;
1875       else
1876          handle_instr(ctx, instr);
1877    }
1878 
1879    if (ctx->spilling && block->successors[0]) {
1880       struct ra_spill_block_state *state =
1881          &ctx->blocks[block->successors[0]->index];
1882       if (state->visited) {
1883          assert(!block->successors[1]);
1884 
1885          spill_live_outs(ctx, block);
1886          reload_live_outs(ctx, block);
1887          update_live_out_phis(ctx, block);
1888       }
1889    }
1890 
1891    if (ctx->spilling) {
1892       record_live_outs(ctx, block);
1893       ctx->blocks[block->index].visited = true;
1894    }
1895 }
1896 
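/* A phi is trivial when all of its sources, ignoring self-references, resolve
 * to the same def. Record that def in phi->data and mark the phi unused so
 * its uses can be redirected to the def directly.
 */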
1897 static bool
1898 simplify_phi_node(struct ir3_instruction *phi)
1899 {
1900    struct ir3_register *def = NULL;
1901    foreach_src (src, phi) {
1902       /* Ignore phi sources which point to the phi itself. */
1903       if (src->def == phi->dsts[0])
1904          continue;
1905       /* If it's undef or it doesn't match the previous sources, bail */
1906       if (!src->def || (def && def != src->def))
1907          return false;
1908       def = src->def;
1909    }
1910 
1911    phi->data = def;
1912    phi->flags |= IR3_INSTR_UNUSED;
1913    return true;
1914 }
1915 
1916 static struct ir3_register *
1917 simplify_phi_def(struct ir3_register *def)
1918 {
1919    if (def->instr->opc == OPC_META_PHI) {
1920       struct ir3_instruction *phi = def->instr;
1921 
1922       /* Note: this function is always called at least once after visiting the
1923        * phi, so either there has been a simplified phi in the meantime, in
1924        * which case we will set progress=true and visit the definition again, or
1925        * phi->data already has the most up-to-date value. Therefore we don't
1926        * have to recursively check phi->data.
1927        */
1928       if (phi->data)
1929          return phi->data;
1930    }
1931 
1932    return def;
1933 }
1934 
1935 static void
1936 simplify_phi_srcs(struct ir3_instruction *instr)
1937 {
1938    foreach_src (src, instr) {
1939       if (src->def)
1940          src->def = simplify_phi_def(src->def);
1941    }
1942 }
1943 
1944 /* We insert phi nodes for all live-ins of loops in case we need to split the
1945  * live range. This pass cleans that up for the case where the live range didn't
1946  * actually need to be split.
1947  */
1948 static void
1949 simplify_phi_nodes(struct ir3 *ir)
1950 {
1951    foreach_block (block, &ir->block_list) {
1952       foreach_instr (instr, &block->instr_list) {
1953          if (instr->opc != OPC_META_PHI)
1954             break;
1955          instr->data = NULL;
1956       }
1957    }
1958 
1959    bool progress;
1960    do {
1961       progress = false;
1962       foreach_block (block, &ir->block_list) {
1963          foreach_instr (instr, &block->instr_list) {
1964             if (instr->opc == OPC_META_PHI || (instr->flags & IR3_INSTR_UNUSED))
1965                continue;
1966 
1967             simplify_phi_srcs(instr);
1968          }
1969 
1970          /* Visit phi nodes in the successors to make sure that phi sources are
1971           * always visited at least once after visiting the definition they
1972           * point to. See note in simplify_phi_def() for why this is necessary.
1973           */
1974          for (unsigned i = 0; i < 2; i++) {
1975             struct ir3_block *succ = block->successors[i];
1976             if (!succ)
1977                continue;
1978             foreach_instr (instr, &succ->instr_list) {
1979                if (instr->opc != OPC_META_PHI)
1980                   break;
1981                if (instr->flags & IR3_INSTR_UNUSED) {
1982                   if (instr->data)
1983                      instr->data = simplify_phi_def(instr->data);
1984                } else {
1985                   simplify_phi_srcs(instr);
1986                   progress |= simplify_phi_node(instr);
1987                }
1988             }
1989          }
1990       }
1991    } while (progress);
1992 }
1993 
1994 static void
1995 unmark_dead(struct ir3 *ir)
1996 {
1997    foreach_block (block, &ir->block_list) {
1998       foreach_instr (instr, &block->instr_list) {
1999          instr->flags &= ~IR3_INSTR_UNUSED;
2000       }
2001    }
2002 }
2003 
2004 /* Simple pass to remove now-dead phi nodes and pcopy instructions. We mark
2005  * which ones are dead along the way, so there's nothing to compute here.
2006  */
2007 static void
2008 cleanup_dead(struct ir3 *ir)
2009 {
2010    foreach_block (block, &ir->block_list) {
2011       foreach_instr_safe (instr, &block->instr_list) {
2012          if (instr->flags & IR3_INSTR_UNUSED) {
2013             if (instr->opc == OPC_META_PARALLEL_COPY) {
2014                /* There may be non-SSA shared copies; we need to preserve these.
2015                 */
2016                for (unsigned i = 0; i < instr->dsts_count;) {
2017                   if (instr->dsts[i]->flags & IR3_REG_SSA) {
2018                      instr->dsts[i] = instr->dsts[--instr->dsts_count];
2019                      instr->srcs[i] = instr->srcs[--instr->srcs_count];
2020                   } else {
2021                      i++;
2022                   }
2023                }
2024 
2025                if (instr->dsts_count == 0)
2026                   list_delinit(&instr->node);
2027             } else {
2028                list_delinit(&instr->node);
2029             }
2030          }
2031       }
2032    }
2033 }
2034 
2035 /* Deal with merge sets after spilling. Spilling generally leaves the merge sets
2036  * in a mess, and even if we properly cleaned up after ourselves, we would want
2037  * to recompute the merge sets afterward anyway. That's because
2038  * spilling/reloading can "break up" phi webs and split/collect webs so that
2039  * allocating them to the same register no longer gives any benefit. For
2040  * example, imagine we have this:
2041  *
2042  * if (...) {
2043  *    foo = ...
2044  * } else {
2045  *    bar = ...
2046  * }
2047  * baz = phi(foo, bar)
2048  *
2049  * and we spill "baz":
2050  *
2051  * if (...) {
2052  *    foo = ...
2053  *    spill(foo)
2054  * } else {
2055  *    bar = ...
2056  *    spill(bar)
2057  * }
2058  * baz = reload()
2059  *
2060  * now foo, bar, and baz don't have to be allocated to the same register. How
2061  * exactly the merge sets change can be complicated, so it's easier just to
2062  * recompute them.
2063  *
2064  * However, there's a wrinkle in this: those same merge sets determine the
2065  * register pressure, due to multiple values inhabiting the same register! And
2066  * we assume that this sharing happens when spilling. Therefore we need a
2067  * three-step procedure:
2068  *
2069  * 1. Drop the original merge sets.
2070  * 2. Calculate which values *must* be merged, being careful to only use the
2071  *    interval information which isn't trashed by spilling, and forcibly merge
2072  *    them.
2073  * 3. Let ir3_merge_regs() finish the job, including recalculating the
2074  *    intervals.
2075  */
2076 
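/* The split/collect walk below is step 2: any source whose interval range
 * still overlaps its split/collect destination is forcibly merged with it,
 * since the pressure accounting assumed those values share a register.
 */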
2077 static void
2078 fixup_merge_sets(struct ir3_liveness *live, struct ir3 *ir)
2079 {
2080    foreach_block (block, &ir->block_list) {
2081       foreach_instr (instr, &block->instr_list) {
2082          foreach_dst (dst, instr) {
2083             dst->merge_set = NULL;
2084             dst->merge_set_offset = 0;
2085          }
2086       }
2087    }
2088 
2089    ir3_index_instrs_for_merge_sets(ir);
2090 
2091    foreach_block (block, &ir->block_list) {
2092       foreach_instr (instr, &block->instr_list) {
2093          if (instr->opc != OPC_META_SPLIT &&
2094              instr->opc != OPC_META_COLLECT)
2095             continue;
2096 
2097          struct ir3_register *dst = instr->dsts[0];
2098          ra_foreach_src (src, instr) {
2099             if (!(src->flags & IR3_REG_KILL) &&
2100                 src->def->interval_start < dst->interval_end &&
2101                 dst->interval_start < src->def->interval_end) {
2102                ir3_force_merge(dst, src->def,
2103                                src->def->interval_start - dst->interval_start);
2104             }
2105          }
2106       }
2107    }
2108 
2109    ir3_merge_regs(live, ir);
2110 }
2111 
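/* Pressure-only mode: run the same per-block walk with spilling disabled and
 * report the maximum pressure recorded along the way.
 */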
2112 void
2113 ir3_calc_pressure(struct ir3_shader_variant *v, struct ir3_liveness *live,
2114                   struct ir3_pressure *max_pressure)
2115 {
2116    struct ra_spill_ctx *ctx = rzalloc(NULL, struct ra_spill_ctx);
2117    spill_ctx_init(ctx, v, live);
2118 
2119    foreach_block (block, &v->ir->block_list) {
2120       handle_block(ctx, block);
2121    }
2122 
2123    assert(ctx->cur_pressure.full == 0);
2124    assert(ctx->cur_pressure.half == 0);
2125    assert(ctx->cur_pressure.shared == 0);
2126    assert(ctx->cur_pressure.shared_half == 0);
2127 
2128    *max_pressure = ctx->max_pressure;
2129    ralloc_free(ctx);
2130 }
2131 
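/* Top-level spilling entry point: walk the blocks with spilling enabled, clean
 * up the phi nodes and dead instructions that leaves behind, re-create the
 * parallel copies, then recompute liveness and the merge sets for the modified
 * IR. Spill slots are carved out of the variant's private memory, so
 * pvtmem_size grows by the amount of scratch used.
 */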
2132 bool
2133 ir3_spill(struct ir3 *ir, struct ir3_shader_variant *v,
2134           struct ir3_liveness **live,
2135           const struct ir3_pressure *limit_pressure)
2136 {
2137    void *mem_ctx = ralloc_parent(*live);
2138    struct ra_spill_ctx *ctx = rzalloc(mem_ctx, struct ra_spill_ctx);
2139    spill_ctx_init(ctx, v, *live);
2140 
2141    ctx->spilling = true;
2142 
2143    ctx->blocks = rzalloc_array(ctx, struct ra_spill_block_state,
2144                                ctx->live->block_count);
2145    rb_tree_init(&ctx->full_live_intervals);
2146    rb_tree_init(&ctx->half_live_intervals);
2147 
2148    ctx->limit_pressure = *limit_pressure;
2149    ctx->spill_slot = v->pvtmem_size;
2150 
2151    add_base_reg(ctx, ir);
2152    compute_next_distance(ctx, ir);
2153 
2154    unmark_dead(ir);
2155 
2156    foreach_block (block, &ir->block_list) {
2157       handle_block(ctx, block);
2158    }
2159 
2160    simplify_phi_nodes(ir);
2161 
2162    cleanup_dead(ir);
2163 
2164    ir3_create_parallel_copies(ir);
2165 
2166    /* After this point, we're done mutating the IR. Liveness has been trashed,
2167     * so recalculate it. We'll need it for recalculating the merge sets.
2168     */
2169    ralloc_free(ctx->live);
2170    *live = ir3_calc_liveness(mem_ctx, ir);
2171 
2172    fixup_merge_sets(*live, ir);
2173 
2174    v->pvtmem_size = ctx->spill_slot;
2175    ralloc_free(ctx);
2176 
2177    return true;
2178 }
2179