/*
 * Copyright © 2021 Valve Corporation
 * SPDX-License-Identifier: MIT
 */

#include "util/rb_tree.h"
#include "ir3_ra.h"
#include "ir3_shader.h"

/*
 * This pass does two things:
 *
 * 1. Calculates the maximum register pressure. To do this, we need to use the
 *    exact same technique that RA uses for combining meta_split instructions
 *    with their sources, so that our calculation agrees with RA.
 * 2. Spills when the register pressure exceeds the limit calculated by RA.
 *    The implementation is based on "Register Spilling and Live-Range Splitting
 *    for SSA-Form Programs" by Braun and Hack, although again care has to be
 *    taken to handle combining split/collect instructions.
 */

struct reg_or_immed {
   unsigned flags;
   union {
      struct ir3_register *def;
      uint32_t uimm;
      unsigned const_num;
   };
};

struct ra_spill_interval {
   struct ir3_reg_interval interval;

   struct rb_node node;
   struct rb_node half_node;

   /* The current SSA value/const/immed this source is mapped to. */
   struct reg_or_immed dst;

   /* When computing use distances we use the distance relative to the start
    * of the block. So, for example, a value that's defined in cycle 5 of the
    * block and used 6 cycles later will always have a next_use_distance of 11
    * until we reach that use.
    */
   unsigned next_use_distance;

   /* Whether this value was reloaded and therefore doesn't need to be
    * spilled again. Corresponds to the S set in the paper.
    */
   bool already_spilled;

   /* We need to add sources early for accounting purposes, but we have to
    * insert the reload code for them last. Keep track of whether this interval
    * needs to be reloaded later.
    */
   bool needs_reload;

   /* Keep track of whether this interval currently can't be spilled because:
    * - It or one of its children is a source and we're making space for
    *   sources.
    * - It is a destination and we're making space for destinations.
    */
   bool cant_spill;

   /* Whether this interval can be rematerialized. */
   bool can_rematerialize;
};

struct ra_spill_block_state {
   unsigned *next_use_end;
   unsigned *next_use_start;

   unsigned cycles;

   /* Map from SSA def to reg_or_immed it is mapped to at the end of the block.
    * This map only contains values which we didn't spill, so it also serves as
    * a record of the new live-out set for this block.
    */
   struct hash_table *remap;

   /* For blocks whose successors are visited first (i.e. loop backedges), which
    * values should be live at the end.
    */
   BITSET_WORD *live_out;

   bool visited;
};

struct ra_spill_ctx {
   struct ir3_reg_ctx reg_ctx;

   struct ra_spill_interval **intervals;
   unsigned intervals_count;

   /* rb tree of live intervals that we can spill, ordered by next-use distance.
    * full_live_intervals contains the full+shared intervals in the merged_regs
    * case. We use this list to determine what to spill.
    */
   struct rb_tree full_live_intervals;
   struct rb_tree half_live_intervals;

   struct ir3_pressure cur_pressure, max_pressure;

   struct ir3_pressure limit_pressure;

   /* When spilling, we need to reserve a register to serve as the zero'd
    * "base". For simplicity we reserve a register at the beginning so that it's
    * always available.
    */
   struct ir3_register *base_reg;

   /* Current pvtmem offset in bytes. */
   unsigned spill_slot;

   struct ir3_liveness *live;

   const struct ir3_compiler *compiler;

   struct ra_spill_block_state *blocks;

   bool spilling;

   bool merged_regs;
};

static void
add_base_reg(struct ra_spill_ctx *ctx, struct ir3 *ir)
{
   struct ir3_block *start = ir3_start_block(ir);

   /* We need to stick it after any meta instructions which need to be first. */
   struct ir3_instruction *after = NULL;
   foreach_instr (instr, &start->instr_list) {
      if (instr->opc != OPC_META_INPUT &&
          instr->opc != OPC_META_TEX_PREFETCH) {
         after = instr;
         break;
      }
   }

   struct ir3_instruction *mov = create_immed(start, 0);

   if (after)
      ir3_instr_move_before(mov, after);

   ctx->base_reg = mov->dsts[0];

   /* We don't create an interval, etc. for the base reg, so just lower the
    * register pressure limit to account for it. We assume it's always
    * available for simplicity.
    */
   ctx->limit_pressure.full -= reg_size(ctx->base_reg);
}


/* Compute the number of cycles per instruction used for next-use-distance
 * analysis. This is just approximate, obviously.
 */
static unsigned
instr_cycles(struct ir3_instruction *instr)
{
   if (instr->opc == OPC_META_PARALLEL_COPY) {
      unsigned cycles = 0;
      for (unsigned i = 0; i < instr->dsts_count; i++) {
         if (!instr->srcs[i]->def ||
             instr->srcs[i]->def->merge_set != instr->dsts[i]->merge_set) {
            cycles += reg_elems(instr->srcs[i]);
         }
      }

      return cycles;
   }

   if (instr->opc == OPC_META_COLLECT) {
      unsigned cycles = 0;
      for (unsigned i = 0; i < instr->srcs_count; i++) {
         if (!instr->srcs[i]->def ||
             instr->srcs[i]->def->merge_set != instr->dsts[0]->merge_set) {
            cycles++;
         }
      }

      return cycles;
   }

   if (is_meta(instr))
      return 0;

   return 1 + instr->repeat;
}

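/* Do one backwards pass over a block: record the next-use distance of each
 * def/use relative to the start of the block (next_use_start) and propagate
 * the result to the end of each predecessor (next_use_end). Returns true if
 * any predecessor's next_use_end changed, so that the global analysis knows
 * it has to iterate again.
 */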
static bool
compute_block_next_distance(struct ra_spill_ctx *ctx, struct ir3_block *block,
                            unsigned *tmp_next_use)
{
   struct ra_spill_block_state *state = &ctx->blocks[block->index];
   memcpy(tmp_next_use, state->next_use_end,
          ctx->live->definitions_count * sizeof(*tmp_next_use));

   unsigned cycle = state->cycles;
   foreach_instr_rev (instr, &block->instr_list) {
      ra_foreach_dst (dst, instr) {
         dst->next_use = tmp_next_use[dst->name];
      }

      ra_foreach_src (src, instr) {
         src->next_use = tmp_next_use[src->def->name];
      }

      cycle -= instr_cycles(instr);

      if (instr->opc == OPC_META_PARALLEL_COPY) {
         ra_foreach_src_n (src, i, instr) {
            if (src->def->merge_set == instr->dsts[i]->merge_set &&
                src->def->merge_set_offset == instr->dsts[i]->merge_set_offset) {
               tmp_next_use[src->def->name] =
                  tmp_next_use[instr->dsts[i]->name];
            } else {
               tmp_next_use[src->def->name] = cycle;
            }
         }
      } else if (instr->opc != OPC_META_PHI) {
         ra_foreach_src (src, instr) {
            tmp_next_use[src->def->name] = cycle;
         }
      }

      ra_foreach_dst (dst, instr) {
         tmp_next_use[dst->name] = UINT_MAX;
      }
   }

   memcpy(state->next_use_start, tmp_next_use,
          ctx->live->definitions_count * sizeof(*tmp_next_use));

   bool progress = false;
   for (unsigned i = 0; i < block->predecessors_count; i++) {
      const struct ir3_block *pred = block->predecessors[i];
      struct ra_spill_block_state *pred_state = &ctx->blocks[pred->index];

      /* Add a large-enough distance in front of edges exiting the loop so that
       * variables that are live-through the loop but not used inside it are
       * prioritized for spilling, as per the paper. This just needs to be
       * larger than the longest path through the loop.
       */
      bool loop_exit = pred->loop_depth < block->loop_depth;
      unsigned block_distance = pred_state->cycles + (loop_exit ? 100000 : 0);

      for (unsigned j = 0; j < ctx->live->definitions_count; j++) {
         if (state->next_use_start[j] < UINT_MAX &&
             state->next_use_start[j] + block_distance <
             pred_state->next_use_end[j]) {
            pred_state->next_use_end[j] = state->next_use_start[j] +
               block_distance;
            progress = true;
         }
      }

      foreach_instr (phi, &block->instr_list) {
         if (phi->opc != OPC_META_PHI)
            break;
         if (!phi->srcs[i]->def)
            continue;
         unsigned src = phi->srcs[i]->def->name;
         if (phi->dsts[0]->next_use < UINT_MAX &&
             phi->dsts[0]->next_use + block_distance <
             pred_state->next_use_end[src]) {
            pred_state->next_use_end[src] = phi->dsts[0]->next_use +
               block_distance;
            progress = true;
         }
      }
   }

   return progress;
}

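/* Global next-use-distance analysis: initialize the per-block next-use arrays
 * and per-def spill slots, then iterate the per-block backwards pass over the
 * whole program until it reaches a fixed point.
 */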
static void
compute_next_distance(struct ra_spill_ctx *ctx, struct ir3 *ir)
{
   for (unsigned i = 0; i < ctx->live->block_count; i++) {
      ctx->blocks[i].next_use_start =
         ralloc_array(ctx, unsigned, ctx->live->definitions_count);
      ctx->blocks[i].next_use_end =
         ralloc_array(ctx, unsigned, ctx->live->definitions_count);

      for (unsigned j = 0; j < ctx->live->definitions_count; j++) {
         ctx->blocks[i].next_use_start[j] = UINT_MAX;
         ctx->blocks[i].next_use_end[j] = UINT_MAX;
      }
   }

   foreach_block (block, &ir->block_list) {
      struct ra_spill_block_state *state = &ctx->blocks[block->index];
      state->cycles = 0;
      foreach_instr (instr, &block->instr_list) {
         state->cycles += instr_cycles(instr);
         foreach_dst (dst, instr) {
            dst->spill_slot = ~0;
         }
      }
   }

   unsigned *tmp_next_use =
      ralloc_array(ctx, unsigned, ctx->live->definitions_count);

   bool progress = true;
   while (progress) {
      progress = false;
      foreach_block_rev (block, &ir->block_list) {
         progress |= compute_block_next_distance(ctx, block, tmp_next_use);
      }
   }
}

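/* A value can be rematerialized instead of spilled if it is a non-array mov of
 * an immediate or a (non-relative) constant, since re-emitting the mov at the
 * reload point is cheaper than a round-trip through private memory.
 */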
static bool
can_rematerialize(struct ir3_register *reg)
{
   if (reg->flags & IR3_REG_ARRAY)
      return false;
   if (reg->instr->opc != OPC_MOV)
      return false;
   if (!(reg->instr->srcs[0]->flags & (IR3_REG_IMMED | IR3_REG_CONST)))
      return false;
   if (reg->instr->srcs[0]->flags & IR3_REG_RELATIV)
      return false;
   return true;
}

static struct ir3_register *
rematerialize(struct ir3_register *reg, struct ir3_cursor cursor)
{
   d("rematerializing ssa_%u:%u", reg->instr->serialno, reg->name);

   struct ir3_instruction *remat =
      ir3_instr_create_at(cursor, reg->instr->opc, 1, reg->instr->srcs_count);
   struct ir3_register *dst = __ssa_dst(remat);
   dst->flags |= reg->flags & (IR3_REG_HALF | IR3_REG_ARRAY);
   for (unsigned i = 0; i < reg->instr->srcs_count; i++) {
      struct ir3_register *src =
         ir3_src_create(remat, INVALID_REG, reg->instr->srcs[i]->flags);
      *src = *reg->instr->srcs[i];
   }

   remat->cat1 = reg->instr->cat1;

   dst->merge_set = reg->merge_set;
   dst->merge_set_offset = reg->merge_set_offset;
   dst->interval_start = reg->interval_start;
   dst->interval_end = reg->interval_end;
   return dst;
}

static void
ra_spill_interval_init(struct ra_spill_interval *interval,
                       struct ir3_register *reg)
{
   ir3_reg_interval_init(&interval->interval, reg);
   interval->dst.flags = reg->flags;
   interval->dst.def = reg;
   interval->already_spilled = false;
   interval->needs_reload = false;
   interval->cant_spill = false;
   interval->can_rematerialize = can_rematerialize(reg);
}

static struct ra_spill_interval *
ir3_reg_interval_to_interval(struct ir3_reg_interval *interval)
{
   return rb_node_data(struct ra_spill_interval, interval, interval);
}

static struct ra_spill_interval *
ra_spill_interval_root(struct ra_spill_interval *interval)
{
   struct ir3_reg_interval *ir3_interval = &interval->interval;
   while (ir3_interval->parent)
      ir3_interval = ir3_interval->parent;
   return ir3_reg_interval_to_interval(ir3_interval);
}

static struct ra_spill_ctx *
ir3_reg_ctx_to_ctx(struct ir3_reg_ctx *ctx)
{
   return rb_node_data(struct ra_spill_ctx, ctx, reg_ctx);
}

static int
spill_interval_cmp(const struct ra_spill_interval *a,
                   const struct ra_spill_interval *b)
{
   /* Prioritize intervals that we can rematerialize. */
   if (a->can_rematerialize && !b->can_rematerialize)
      return 1;
   if (!a->can_rematerialize && b->can_rematerialize)
      return -1;

   return a->next_use_distance - b->next_use_distance;
}

static int
ra_spill_interval_cmp(const struct rb_node *_a, const struct rb_node *_b)
{
   const struct ra_spill_interval *a =
      rb_node_data(const struct ra_spill_interval, _a, node);
   const struct ra_spill_interval *b =
      rb_node_data(const struct ra_spill_interval, _b, node);
   return spill_interval_cmp(a, b);
}

static int
ra_spill_interval_half_cmp(const struct rb_node *_a, const struct rb_node *_b)
{
   const struct ra_spill_interval *a =
      rb_node_data(const struct ra_spill_interval, _a, half_node);
   const struct ra_spill_interval *b =
      rb_node_data(const struct ra_spill_interval, _b, half_node);
   return spill_interval_cmp(a, b);
}

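/* interval_add/interval_delete are the callbacks invoked by the generic
 * ir3_reg_interval machinery whenever a top-level interval becomes live or
 * dies. They keep cur_pressure up to date and, when we're actually spilling,
 * also keep the live intervals sorted by spill priority in the half/full
 * rb-trees.
 */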
static void
interval_add(struct ir3_reg_ctx *_ctx, struct ir3_reg_interval *_interval)
{
   struct ra_spill_interval *interval = ir3_reg_interval_to_interval(_interval);
   struct ra_spill_ctx *ctx = ir3_reg_ctx_to_ctx(_ctx);

   unsigned size = reg_size(interval->interval.reg);
   if (interval->interval.reg->flags & IR3_REG_SHARED) {
      ctx->cur_pressure.shared += size;
      if (interval->interval.reg->flags & IR3_REG_HALF)
         ctx->cur_pressure.shared_half += size;
   } else {
      if (interval->interval.reg->flags & IR3_REG_HALF) {
         ctx->cur_pressure.half += size;
         if (ctx->spilling) {
            rb_tree_insert(&ctx->half_live_intervals, &interval->half_node,
                           ra_spill_interval_half_cmp);
         }
      }
      if (ctx->merged_regs || !(interval->interval.reg->flags & IR3_REG_HALF)) {
         ctx->cur_pressure.full += size;
         if (ctx->spilling) {
            rb_tree_insert(&ctx->full_live_intervals, &interval->node,
                           ra_spill_interval_cmp);
         }
      }
   }
}

static void
interval_delete(struct ir3_reg_ctx *_ctx, struct ir3_reg_interval *_interval)
{
   struct ra_spill_interval *interval = ir3_reg_interval_to_interval(_interval);
   struct ra_spill_ctx *ctx = ir3_reg_ctx_to_ctx(_ctx);

   unsigned size = reg_size(interval->interval.reg);
   if (interval->interval.reg->flags & IR3_REG_SHARED) {
      ctx->cur_pressure.shared -= size;
      if (interval->interval.reg->flags & IR3_REG_HALF)
         ctx->cur_pressure.shared_half -= size;
   } else {
      if (interval->interval.reg->flags & IR3_REG_HALF) {
         ctx->cur_pressure.half -= size;
         if (ctx->spilling) {
            rb_tree_remove(&ctx->half_live_intervals, &interval->half_node);
         }
      }
      if (ctx->merged_regs || !(interval->interval.reg->flags & IR3_REG_HALF)) {
         ctx->cur_pressure.full -= size;
         if (ctx->spilling) {
            rb_tree_remove(&ctx->full_live_intervals, &interval->node);
         }
      }
   }
}

static void
interval_readd(struct ir3_reg_ctx *_ctx, struct ir3_reg_interval *_parent,
               struct ir3_reg_interval *_child)
{
   interval_add(_ctx, _child);
}

static void
spill_ctx_init(struct ra_spill_ctx *ctx, struct ir3_shader_variant *v,
               struct ir3_liveness *live)
{
   ctx->live = live;
   ctx->intervals = ralloc_array(ctx, struct ra_spill_interval *,
                                 ctx->live->definitions_count);
   struct ra_spill_interval *intervals =
      rzalloc_array(ctx, struct ra_spill_interval,
                    ctx->live->definitions_count);
   for (unsigned i = 0; i < ctx->live->definitions_count; i++)
      ctx->intervals[i] = &intervals[i];

   ctx->intervals_count = ctx->live->definitions_count;
   ctx->compiler = v->compiler;
   ctx->merged_regs = v->mergedregs;

   rb_tree_init(&ctx->reg_ctx.intervals);
   ctx->reg_ctx.interval_add = interval_add;
   ctx->reg_ctx.interval_delete = interval_delete;
   ctx->reg_ctx.interval_readd = interval_readd;
}

static void
ra_spill_ctx_insert(struct ra_spill_ctx *ctx,
                    struct ra_spill_interval *interval)
{
   ir3_reg_interval_insert(&ctx->reg_ctx, &interval->interval);
}

static void
ra_spill_ctx_remove(struct ra_spill_ctx *ctx,
                    struct ra_spill_interval *interval)
{
   ir3_reg_interval_remove(&ctx->reg_ctx, &interval->interval);
}

static void
init_dst(struct ra_spill_ctx *ctx, struct ir3_register *dst)
{
   struct ra_spill_interval *interval = ctx->intervals[dst->name];
   ra_spill_interval_init(interval, dst);
   if (ctx->spilling) {
      interval->next_use_distance = dst->next_use;

      /* We only need to keep track of used-ness if this value may be
       * rematerialized. This also keeps us from nuking things that may be
       * in the keeps list (e.g. atomics, input splits).
       */
      if (interval->can_rematerialize)
         dst->instr->flags |= IR3_INSTR_UNUSED;
   }
}

static void
insert_dst(struct ra_spill_ctx *ctx, struct ir3_register *dst)
{
   struct ra_spill_interval *interval = ctx->intervals[dst->name];
   if (interval->interval.inserted)
      return;

   ra_spill_ctx_insert(ctx, interval);
   interval->cant_spill = true;

   /* For precolored inputs, make sure we leave enough registers to allow for
    * holes in the inputs. It can happen that the binning shader has a lower
    * register pressure than the main shader, but the main shader decided to
    * add holes between the inputs which means that the binning shader has a
    * higher register demand.
    */
   if (dst->instr->opc == OPC_META_INPUT && dst->num != INVALID_REG) {
      physreg_t physreg = ra_reg_get_physreg(dst);
      physreg_t max = physreg + reg_size(dst);

      if (interval->interval.reg->flags & IR3_REG_SHARED) {
         ctx->max_pressure.shared = MAX2(ctx->max_pressure.shared, max);
         if (interval->interval.reg->flags & IR3_REG_HALF) {
            ctx->max_pressure.shared_half = MAX2(ctx->max_pressure.shared_half,
                                                 max);
         }
      } else if (interval->interval.reg->flags & IR3_REG_HALF) {
         ctx->max_pressure.half = MAX2(ctx->max_pressure.half, max);
      } else {
         ctx->max_pressure.full = MAX2(ctx->max_pressure.full, max);
      }
   }
}

static void
insert_src(struct ra_spill_ctx *ctx, struct ir3_register *src)
{
   struct ra_spill_interval *interval = ctx->intervals[src->def->name];

   if (!interval->interval.inserted) {
      ra_spill_ctx_insert(ctx, interval);
      interval->needs_reload = true;
      interval->already_spilled = true;
   }

   ra_spill_interval_root(interval)->cant_spill = true;
}

static void
remove_src_early(struct ra_spill_ctx *ctx, struct ir3_instruction *instr,
                 struct ir3_register *src)
{
   struct ra_spill_interval *interval = ctx->intervals[src->def->name];

   if (ctx->spilling) {
      /* It might happen that a collect that cannot be coalesced with one of its
       * sources while spilling can be coalesced with it afterwards. In this
       * case, we might be able to remove it here during spilling but not
       * afterwards (because it may have a child interval). If this happens, we
       * could end up with a register pressure that is higher after spilling
       * than before. Prevent this by never removing collects early while
       * spilling.
       */
      if (src->def->instr->opc == OPC_META_COLLECT)
         return;
   }

   if (!interval->interval.inserted || interval->interval.parent ||
       !rb_tree_is_empty(&interval->interval.children))
      return;

   ra_spill_ctx_remove(ctx, interval);
}

static void
remove_src(struct ra_spill_ctx *ctx, struct ir3_instruction *instr,
           struct ir3_register *src)
{
   struct ra_spill_interval *interval = ctx->intervals[src->def->name];

   if (!interval->interval.inserted)
      return;

   ra_spill_ctx_remove(ctx, interval);
}

static void
finish_dst(struct ra_spill_ctx *ctx, struct ir3_register *dst)
{
   struct ra_spill_interval *interval = ctx->intervals[dst->name];
   interval->cant_spill = false;
}

static void
remove_dst(struct ra_spill_ctx *ctx, struct ir3_register *dst)
{
   struct ra_spill_interval *interval = ctx->intervals[dst->name];

   if (!interval->interval.inserted)
      return;

   ra_spill_ctx_remove(ctx, interval);
}

static void
update_src_next_use(struct ra_spill_ctx *ctx, struct ir3_register *src)
{
   struct ra_spill_interval *interval = ctx->intervals[src->def->name];

   assert(interval->interval.inserted);

   interval->next_use_distance = src->next_use;

   /* If this node is inserted in one of the trees, then it needs to be resorted
    * as its key has changed.
    */
   if (!interval->interval.parent && !(src->flags & IR3_REG_SHARED)) {
      if (src->flags & IR3_REG_HALF) {
         rb_tree_remove(&ctx->half_live_intervals, &interval->half_node);
         rb_tree_insert(&ctx->half_live_intervals, &interval->half_node,
                        ra_spill_interval_half_cmp);
      }
      if (ctx->merged_regs || !(src->flags & IR3_REG_HALF)) {
         rb_tree_remove(&ctx->full_live_intervals, &interval->node);
         rb_tree_insert(&ctx->full_live_intervals, &interval->node,
                        ra_spill_interval_cmp);
      }
   }
}

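/* Allocate (or look up) the pvtmem offset for a def. Defs that belong to a
 * merge set share the set's slot so that split/collect pieces of the same
 * value spill to the same place. The offsets and sizes are scaled by 2,
 * presumably because merge-set offsets/sizes are counted in half-register
 * units while spill_slot is a byte offset.
 */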
static unsigned
get_spill_slot(struct ra_spill_ctx *ctx, struct ir3_register *reg)
{
   if (reg->merge_set) {
      if (reg->merge_set->spill_slot == ~0) {
         reg->merge_set->spill_slot = ALIGN_POT(ctx->spill_slot,
                                                reg->merge_set->alignment * 2);
         ctx->spill_slot = reg->merge_set->spill_slot + reg->merge_set->size * 2;
      }
      return reg->merge_set->spill_slot + reg->merge_set_offset * 2;
   } else {
      if (reg->spill_slot == ~0) {
         reg->spill_slot = ALIGN_POT(ctx->spill_slot, reg_elem_size(reg) * 2);
         ctx->spill_slot = reg->spill_slot + reg_size(reg) * 2;
      }
      return reg->spill_slot;
   }
}

static void
set_src_val(struct ir3_register *src, const struct reg_or_immed *val)
{
   if (val->flags & IR3_REG_IMMED) {
      src->flags = IR3_REG_IMMED | (val->flags & IR3_REG_HALF);
      src->uim_val = val->uimm;
      src->def = NULL;
   } else if (val->flags & IR3_REG_CONST) {
      src->flags = IR3_REG_CONST | (val->flags & IR3_REG_HALF);
      src->num = val->const_num;
      src->def = NULL;
   } else {
      src->def = val->def;
      val->def->instr->flags &= ~IR3_INSTR_UNUSED;
   }
}

static struct ir3_register *
materialize_pcopy_src(const struct reg_or_immed *src,
                      struct ir3_builder *builder)
{
   struct ir3_instruction *mov = ir3_build_instr(builder, OPC_MOV, 1, 1);
   struct ir3_register *dst = __ssa_dst(mov);
   dst->flags |= src->flags & IR3_REG_HALF;
   struct ir3_register *mov_src = ir3_src_create(mov, INVALID_REG, src->flags);
   set_src_val(mov_src, src);
   mov->cat1.src_type = mov->cat1.dst_type =
      (src->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;
   return dst;
}

static void
spill(struct ra_spill_ctx *ctx, const struct reg_or_immed *val,
      unsigned spill_slot, struct ir3_cursor cursor)
{
   struct ir3_register *reg;
   struct ir3_builder builder = ir3_builder_at(cursor);

   /* If spilling an immed/const pcopy src, we need to actually materialize it
    * first with a mov.
    */
   if (val->flags & (IR3_REG_CONST | IR3_REG_IMMED)) {
      reg = materialize_pcopy_src(val, &builder);
   } else {
      reg = val->def;
      reg->instr->flags &= ~IR3_INSTR_UNUSED;
   }

   d("spilling ssa_%u:%u to %u", reg->instr->serialno, reg->name,
     spill_slot);

   unsigned elems = reg_elems(reg);
   struct ir3_instruction *spill =
      ir3_build_instr(&builder, OPC_SPILL_MACRO, 0, 3);
   ir3_src_create(spill, INVALID_REG, ctx->base_reg->flags)->def = ctx->base_reg;
   unsigned src_flags = reg->flags & (IR3_REG_HALF | IR3_REG_IMMED |
                                      IR3_REG_CONST | IR3_REG_SSA |
                                      IR3_REG_ARRAY);
   struct ir3_register *src = ir3_src_create(spill, INVALID_REG, src_flags);
   ir3_src_create(spill, INVALID_REG, IR3_REG_IMMED)->uim_val = elems;
   spill->cat6.dst_offset = spill_slot;
   spill->cat6.type = (reg->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;

   src->def = reg;
   if (reg->flags & IR3_REG_ARRAY) {
      src->size = reg->size;
      src->array.id = reg->array.id;
      src->array.offset = 0;
   } else {
      src->wrmask = reg->wrmask;
   }
}

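/* Spill the value an interval currently maps to. Rematerializable values that
 * aren't part of a merge set don't need to be written to pvtmem at all, since
 * they can simply be re-created at the reload point.
 */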
static void
spill_interval(struct ra_spill_ctx *ctx, struct ra_spill_interval *interval,
               struct ir3_cursor cursor)
{
   if (interval->can_rematerialize && !interval->interval.reg->merge_set)
      return;

   spill(ctx, &interval->dst, get_spill_slot(ctx, interval->interval.reg),
         cursor);
}

/* This is similar to "limit" in the paper. */
static void
limit(struct ra_spill_ctx *ctx, struct ir3_cursor cursor)
{
   if (ctx->cur_pressure.half > ctx->limit_pressure.half) {
      d("cur half pressure %u exceeds %u", ctx->cur_pressure.half,
        ctx->limit_pressure.half);
      rb_tree_foreach_safe (struct ra_spill_interval, interval,
                            &ctx->half_live_intervals, half_node) {
         d("trying ssa_%u:%u", interval->interval.reg->instr->serialno,
           interval->interval.reg->name);
         if (!interval->cant_spill) {
            if (!interval->already_spilled)
               spill_interval(ctx, interval, cursor);
            ir3_reg_interval_remove_all(&ctx->reg_ctx, &interval->interval);
            if (ctx->cur_pressure.half <= ctx->limit_pressure.half)
               break;
         }
      }

      assert(ctx->cur_pressure.half <= ctx->limit_pressure.half);
   }

   if (ctx->cur_pressure.full > ctx->limit_pressure.full) {
      d("cur full pressure %u exceeds %u", ctx->cur_pressure.full,
        ctx->limit_pressure.full);
      rb_tree_foreach_safe (struct ra_spill_interval, interval,
                            &ctx->full_live_intervals, node) {
         d("trying ssa_%u:%u", interval->interval.reg->instr->serialno,
           interval->interval.reg->name);
         if (!interval->cant_spill) {
            if (!interval->already_spilled)
               spill_interval(ctx, interval, cursor);
            ir3_reg_interval_remove_all(&ctx->reg_ctx, &interval->interval);
            if (ctx->cur_pressure.full <= ctx->limit_pressure.full)
               break;
         } else {
            d("can't spill");
         }
      }

      assert(ctx->cur_pressure.full <= ctx->limit_pressure.full);
   }
}

/* There's a corner case where we reload a value which has overlapping live
 * values already reloaded, either because it's the child of some other interval
 * that was already reloaded or some of its children have already been
 * reloaded. Because RA only expects overlapping source/dest intervals for meta
 * instructions (split/collect), and we don't want to add register pressure by
 * creating an entirely separate value, we need to add splits and collects to
 * deal with this case. These splits/collects have to also have correct merge
 * set information, so that it doesn't result in any actual code or register
 * pressure in practice.
 */

static void
add_to_merge_set(struct ir3_merge_set *set, struct ir3_register *def,
                 unsigned offset)
{
   def->merge_set = set;
   def->merge_set_offset = offset;
   def->interval_start = set->interval_start + offset;
   def->interval_end = set->interval_start + offset + reg_size(def);
}

static struct ir3_register *
split(struct ir3_register *def, unsigned offset, struct ir3_builder *builder)
{
   if (reg_elems(def) == 1) {
      assert(offset == 0);
      return def;
   }

   assert(!(def->flags & IR3_REG_ARRAY));
   assert(def->merge_set);
   struct ir3_instruction *split =
      ir3_build_instr(builder, OPC_META_SPLIT, 1, 1);
   split->split.off = offset;
   struct ir3_register *dst = __ssa_dst(split);
   dst->flags |= def->flags & IR3_REG_HALF;
   struct ir3_register *src = ir3_src_create(split, INVALID_REG, def->flags);
   src->wrmask = def->wrmask;
   src->def = def;
   add_to_merge_set(def->merge_set, dst,
                    def->merge_set_offset + offset * reg_elem_size(def));
   return dst;
}

static struct ir3_register *
extract(struct ir3_register *parent_def, unsigned offset, unsigned elems,
        struct ir3_cursor cursor)
{
   if (offset == 0 && elems == reg_elems(parent_def))
      return parent_def;

   struct ir3_builder builder = ir3_builder_at(cursor);
   struct ir3_register *srcs[elems];
   for (unsigned i = 0; i < elems; i++) {
      srcs[i] = split(parent_def, offset + i, &builder);
   }

   struct ir3_instruction *collect =
      ir3_build_instr(&builder, OPC_META_COLLECT, 1, elems);
   struct ir3_register *dst = __ssa_dst(collect);
   dst->flags |= parent_def->flags & IR3_REG_HALF;
   dst->wrmask = MASK(elems);
   add_to_merge_set(parent_def->merge_set, dst, parent_def->merge_set_offset);

   for (unsigned i = 0; i < elems; i++) {
      ir3_src_create(collect, INVALID_REG, parent_def->flags)->def = srcs[i];
   }

   return dst;
}

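/* Emit an OPC_RELOAD_MACRO that reloads a spilled value from its pvtmem slot,
 * using the reserved base register for the address and copying over the
 * merge-set/interval information so that RA still sees a consistent picture.
 */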
static struct ir3_register *
reload(struct ra_spill_ctx *ctx, struct ir3_register *reg,
       struct ir3_cursor cursor)
{
   unsigned spill_slot = get_spill_slot(ctx, reg);

   d("reloading ssa_%u:%u from %u", reg->instr->serialno, reg->name,
     spill_slot);

   unsigned elems = reg_elems(reg);
   struct ir3_instruction *reload =
      ir3_instr_create_at(cursor, OPC_RELOAD_MACRO, 1, 3);
   struct ir3_register *dst = __ssa_dst(reload);
   dst->flags |= reg->flags & (IR3_REG_HALF | IR3_REG_ARRAY);
   /* The reload may be split into multiple pieces, and if the destination
    * overlaps with the base register then it could get clobbered before the
    * last ldp in the sequence. Note that we always reserve space for the base
    * register throughout the whole program, so effectively extending its live
    * range past the end of the instruction isn't a problem for our pressure
    * accounting.
    */
   dst->flags |= IR3_REG_EARLY_CLOBBER;
   ir3_src_create(reload, INVALID_REG, ctx->base_reg->flags)->def = ctx->base_reg;
   struct ir3_register *offset_reg =
      ir3_src_create(reload, INVALID_REG, IR3_REG_IMMED);
   offset_reg->uim_val = spill_slot;
   ir3_src_create(reload, INVALID_REG, IR3_REG_IMMED)->uim_val = elems;
   reload->cat6.type = (reg->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;

   if (reg->flags & IR3_REG_ARRAY) {
      dst->array.offset = 0;
      dst->array.id = reg->array.id;
      dst->size = reg->size;
   } else {
      dst->wrmask = reg->wrmask;
   }

   dst->merge_set = reg->merge_set;
   dst->merge_set_offset = reg->merge_set_offset;
   dst->interval_start = reg->interval_start;
   dst->interval_end = reg->interval_end;
   return dst;
}

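/* Point an interval, and recursively all of its child intervals, at a newly
 * reloaded/rematerialized def, extracting the appropriate sub-range of the new
 * def for each child.
 */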
static void
rewrite_src_interval(struct ra_spill_ctx *ctx,
                     struct ra_spill_interval *interval,
                     struct ir3_register *def,
                     struct ir3_cursor cursor)
{
   interval->dst.flags = def->flags;
   interval->dst.def = def;
   interval->needs_reload = false;

   rb_tree_foreach (struct ra_spill_interval, child,
                    &interval->interval.children, interval.node) {
      struct ir3_register *child_reg = child->interval.reg;
      struct ir3_register *child_def =
         extract(def, (child_reg->interval_start -
                       interval->interval.reg->interval_start) / reg_elem_size(def),
                 reg_elems(child_reg), cursor);
      rewrite_src_interval(ctx, child, child_def, cursor);
   }
}

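/* Make a spilled def available again before a use. If a parent interval has
 * already been reloaded, just extract the needed piece from it; otherwise
 * rematerialize or reload the def and rewrite its interval tree to use the
 * new value.
 */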
static void
reload_def(struct ra_spill_ctx *ctx, struct ir3_register *def,
           struct ir3_cursor cursor)
{
   unsigned elems = reg_elems(def);
   struct ra_spill_interval *interval = ctx->intervals[def->name];

   struct ir3_reg_interval *ir3_parent = interval->interval.parent;

   if (ir3_parent) {
      struct ra_spill_interval *parent =
         ir3_reg_interval_to_interval(ir3_parent);
      if (!parent->needs_reload) {
         interval->dst.flags = def->flags;
         interval->dst.def = extract(
            parent->dst.def, (def->interval_start - parent->dst.def->interval_start) /
            reg_elem_size(def), elems, cursor);
         return;
      }
   }

   struct ir3_register *dst;
   if (interval->can_rematerialize)
      dst = rematerialize(def, cursor);
   else
      dst = reload(ctx, def, cursor);

   rewrite_src_interval(ctx, interval, dst, cursor);
}

static void
reload_src(struct ra_spill_ctx *ctx, struct ir3_cursor cursor,
           struct ir3_register *src)
{
   struct ra_spill_interval *interval = ctx->intervals[src->def->name];

   if (interval->needs_reload) {
      reload_def(ctx, src->def, cursor);
   }

   ra_spill_interval_root(interval)->cant_spill = false;
}

static void
rewrite_src(struct ra_spill_ctx *ctx, struct ir3_instruction *instr,
            struct ir3_register *src)
{
   struct ra_spill_interval *interval = ctx->intervals[src->def->name];

   set_src_val(src, &interval->dst);
}

static void
update_max_pressure(struct ra_spill_ctx *ctx)
{
   d("pressure:");
   d("\tfull: %u", ctx->cur_pressure.full);
   d("\thalf: %u", ctx->cur_pressure.half);
   d("\tshared: %u", ctx->cur_pressure.shared);
   d("\tshared half: %u", ctx->cur_pressure.shared_half);

   ctx->max_pressure.full =
      MAX2(ctx->max_pressure.full, ctx->cur_pressure.full);
   ctx->max_pressure.half =
      MAX2(ctx->max_pressure.half, ctx->cur_pressure.half);
   ctx->max_pressure.shared =
      MAX2(ctx->max_pressure.shared, ctx->cur_pressure.shared);
   ctx->max_pressure.shared_half =
      MAX2(ctx->max_pressure.shared_half, ctx->cur_pressure.shared_half);
}

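/* Process one normal instruction: initialize destination intervals, account
 * for sources and destinations, spill (via limit()) if either step pushes the
 * pressure over the limit, reload spilled sources, and finally rewrite sources
 * and retire killed intervals. When !ctx->spilling this only updates the
 * maximum register pressure.
 */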
static void
handle_instr(struct ra_spill_ctx *ctx, struct ir3_instruction *instr)
{
   ra_foreach_dst (dst, instr) {
      /* No need to handle instructions inserted while spilling. Most are
       * ignored automatically by virtue of being inserted before the current
       * instruction. However, for live-ins, we may insert extracts after the
       * phis. Trying to handle them ends badly as they don't have intervals
       * allocated.
       * Note: since liveness is calculated before spilling, original
       * instructions have a name while new ones don't.
       */
      if (!dst->name)
         return;

      init_dst(ctx, dst);
   }

   if (ctx->spilling) {
      ra_foreach_src (src, instr)
         insert_src(ctx, src);
   }

   /* Handle tied and early-kill destinations. If a destination is tied to a
    * source and that source is live-through, then we need to allocate a new
    * register for the destination which is live-through itself and cannot
    * overlap the sources. Similarly early-kill destinations cannot overlap
    * sources.
    */

   ra_foreach_dst (dst, instr) {
      struct ir3_register *tied_src = dst->tied;
      if ((tied_src && !(tied_src->flags & IR3_REG_FIRST_KILL)) ||
          (dst->flags & IR3_REG_EARLY_CLOBBER))
         insert_dst(ctx, dst);
   }

   if (ctx->spilling)
      limit(ctx, ir3_before_instr(instr));
   else
      update_max_pressure(ctx);

   if (ctx->spilling) {
      ra_foreach_src (src, instr) {
         reload_src(ctx, ir3_before_instr(instr), src);
         update_src_next_use(ctx, src);
      }
   }

   ra_foreach_src (src, instr) {
      if (src->flags & IR3_REG_FIRST_KILL)
         remove_src_early(ctx, instr, src);
   }

   ra_foreach_dst (dst, instr) {
      insert_dst(ctx, dst);
   }

   if (ctx->spilling)
      limit(ctx, ir3_before_instr(instr));
   else
      update_max_pressure(ctx);

   /* We have to remove sources before rewriting them so that we can look up
    * the interval to remove before the source itself is changed.
    */
   ra_foreach_src (src, instr) {
      if (src->flags & IR3_REG_FIRST_KILL)
         remove_src(ctx, instr, src);
   }

   if (ctx->spilling) {
      ra_foreach_src (src, instr) {
         rewrite_src(ctx, instr, src);
      }
   }

   ra_foreach_dst (dst, instr) {
      finish_dst(ctx, dst);
   }

   for (unsigned i = 0; i < instr->dsts_count; i++) {
      if (ra_reg_is_dst(instr->dsts[i]) &&
          (instr->dsts[i]->flags & IR3_REG_UNUSED))
         remove_dst(ctx, instr->dsts[i]);
   }
}

static struct ra_spill_interval *
create_temp_interval(struct ra_spill_ctx *ctx, struct ir3_register *def)
{
   unsigned name = ctx->intervals_count++;
   unsigned offset = ctx->live->interval_offset;

   /* This is kinda hacky, but we need to create a fake SSA def here that is
    * only used as part of the pcopy accounting. See below.
    */
   struct ir3_register *reg = rzalloc(ctx, struct ir3_register);
   *reg = *def;
   reg->name = name;
   reg->interval_start = offset;
   reg->interval_end = offset + reg_size(def);
   reg->merge_set = NULL;

   ctx->intervals = reralloc(ctx, ctx->intervals, struct ra_spill_interval *,
                             ctx->intervals_count);
   struct ra_spill_interval *interval = rzalloc(ctx, struct ra_spill_interval);
   ra_spill_interval_init(interval, reg);
   ctx->intervals[name] = interval;
   ctx->live->interval_offset += reg_size(def);
   return interval;
}

/* In the sequence of copies generated (see below), would this source be killed?
 */
static bool
is_last_pcopy_src(struct ir3_instruction *pcopy, unsigned src_n)
{
   struct ir3_register *src = pcopy->srcs[src_n];
   if (!(src->flags & IR3_REG_KILL))
      return false;
   for (unsigned j = src_n + 1; j < pcopy->srcs_count; j++) {
      if (pcopy->srcs[j]->def == src->def)
         return false;
   }
   return true;
}

/* Parallel copies are different from normal instructions. The sources together
 * may be larger than the entire register file, so we cannot just reload every
 * source like normal, and indeed that probably wouldn't be a great idea.
 * Instead we essentially need to lower the parallel copy to "copies," just like
 * in the normal CSSA construction, although we implement the copies by
 * reloading and then possibly spilling values. We essentially just shuffle
 * around the sources until each source either (a) is live or (b) has the same
 * spill slot as its corresponding destination. We do this by decomposing the
 * copy into a series of copies, so:
 *
 * a, b, c = d, e, f
 *
 * becomes:
 *
 * d' = d
 * e' = e
 * f' = f
 * a = d'
 * b = e'
 * c = f'
 *
 * the temporary SSA values d', e', and f' never actually show up in the result.
 * They are only used for our internal accounting. They may, however, have their
 * own spill slot created for them. Similarly, we don't actually emit any copy
 * instructions, although we emit the spills/reloads that *would've* been
 * required if those copies were there.
 *
 * TODO: in order to reduce the number of temporaries and therefore spill slots,
 * we could instead do a more complicated analysis that considers the location
 * transfer graph.
 *
 * In addition, we actually remove the parallel copy and rewrite all its uses
 * (in the phi nodes) rather than rewrite its sources at the end. Recreating it
 * later turns out to be easier than keeping it up-to-date throughout this pass,
 * since we may have to remove entries for phi sources that are spilled and add
 * entries for live-outs that are spilled and reloaded, which can happen here
 * and then possibly be undone or done again when processing live-ins of the
 * successor block.
 */

static void
handle_pcopy(struct ra_spill_ctx *ctx, struct ir3_instruction *pcopy)
{
   ra_foreach_dst (dst, pcopy) {
      struct ra_spill_interval *dst_interval = ctx->intervals[dst->name];
      ra_spill_interval_init(dst_interval, dst);
   }

   foreach_src_n (src, i, pcopy) {
      struct ir3_register *dst = pcopy->dsts[i];
      if (!(dst->flags & IR3_REG_SSA))
         continue;

      d("processing src %u", i);

      /* Skip the intermediate copy for cases where the source is merged with
       * the destination. Crucially this means that we also don't reload/spill
       * it if it's been spilled, because it shares the same spill slot.
       */
      if ((src->flags & IR3_REG_SSA) && src->def->merge_set &&
          src->def->merge_set == dst->merge_set &&
          src->def->merge_set_offset == dst->merge_set_offset) {
         struct ra_spill_interval *src_interval = ctx->intervals[src->def->name];
         struct ra_spill_interval *dst_interval = ctx->intervals[dst->name];
         if (src_interval->interval.inserted) {
            update_src_next_use(ctx, src);
            if (is_last_pcopy_src(pcopy, i))
               ra_spill_ctx_remove(ctx, src_interval);
            dst_interval->cant_spill = true;
            ra_spill_ctx_insert(ctx, dst_interval);
            limit(ctx, ir3_before_instr(pcopy));
            dst_interval->cant_spill = false;
            dst_interval->dst = src_interval->dst;
         }
      } else if (src->flags & IR3_REG_SSA) {
         struct ra_spill_interval *temp_interval =
            create_temp_interval(ctx, dst);
         struct ir3_register *temp = temp_interval->interval.reg;
         temp_interval->next_use_distance = src->next_use;

         insert_src(ctx, src);
         limit(ctx, ir3_before_instr(pcopy));
         reload_src(ctx, ir3_before_instr(pcopy), src);
         update_src_next_use(ctx, src);
         if (is_last_pcopy_src(pcopy, i))
            remove_src(ctx, pcopy, src);
         struct ra_spill_interval *src_interval =
            ctx->intervals[src->def->name];
         temp_interval->dst = src_interval->dst;

         temp_interval->cant_spill = true;
         ra_spill_ctx_insert(ctx, temp_interval);
         limit(ctx, ir3_before_instr(pcopy));
         temp_interval->cant_spill = false;

         src->flags = temp->flags;
         src->def = temp;
      }
   }

   d("done with pcopy srcs");

   foreach_src_n (src, i, pcopy) {
      struct ir3_register *dst = pcopy->dsts[i];
      if (!(dst->flags & IR3_REG_SSA))
         continue;

      if ((src->flags & IR3_REG_SSA) && src->def->merge_set &&
          src->def->merge_set == dst->merge_set &&
          src->def->merge_set_offset == dst->merge_set_offset)
         continue;

      struct ra_spill_interval *dst_interval = ctx->intervals[dst->name];

      if (!(src->flags & IR3_REG_SSA)) {
         dst_interval->cant_spill = true;
         ra_spill_ctx_insert(ctx, dst_interval);
         limit(ctx, ir3_before_instr(pcopy));
         dst_interval->cant_spill = false;

         assert(src->flags & (IR3_REG_CONST | IR3_REG_IMMED));
         if (src->flags & IR3_REG_CONST) {
            dst_interval->dst.flags = src->flags;
            dst_interval->dst.const_num = src->num;
         } else {
            dst_interval->dst.flags = src->flags;
            dst_interval->dst.uimm = src->uim_val;
         }
      } else {
         struct ra_spill_interval *temp_interval = ctx->intervals[src->def->name];

         insert_src(ctx, src);
         limit(ctx, ir3_before_instr(pcopy));
         reload_src(ctx, ir3_before_instr(pcopy), src);
         remove_src(ctx, pcopy, src);

         dst_interval->dst = temp_interval->dst;
         ra_spill_ctx_insert(ctx, dst_interval);
      }
   }

   pcopy->flags |= IR3_INSTR_UNUSED;
}

static void
handle_input_phi(struct ra_spill_ctx *ctx, struct ir3_instruction *instr)
{
   if (!(instr->dsts[0]->flags & IR3_REG_SSA))
      return;

   init_dst(ctx, instr->dsts[0]);
   insert_dst(ctx, instr->dsts[0]);
   finish_dst(ctx, instr->dsts[0]);
}

static void
remove_input_phi(struct ra_spill_ctx *ctx, struct ir3_instruction *instr)
{
   if (!(instr->dsts[0]->flags & IR3_REG_SSA))
      return;

   if (instr->opc == OPC_META_TEX_PREFETCH) {
      ra_foreach_src (src, instr) {
         if (src->flags & IR3_REG_FIRST_KILL)
            remove_src(ctx, instr, src);
      }
   }
   if (instr->dsts[0]->flags & IR3_REG_UNUSED)
      remove_dst(ctx, instr->dsts[0]);
}

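/* Make a live-in value live at the start of the block by (re)initializing its
 * interval and inserting it into the context, using the next-use distance
 * computed for the start of this block.
 */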
static void
handle_live_in(struct ra_spill_ctx *ctx, struct ir3_block *block,
               struct ir3_register *def)
{
   struct ra_spill_interval *interval = ctx->intervals[def->name];
   ra_spill_interval_init(interval, def);
   if (ctx->spilling) {
      interval->next_use_distance =
         ctx->blocks[block->index].next_use_start[def->name];
   }

   ra_spill_ctx_insert(ctx, interval);
}

static bool
is_live_in_phi(struct ir3_register *def, struct ir3_block *block)
{
   return def->instr->opc == OPC_META_PHI && def->instr->block == block;
}

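/* Check whether a live-in def (or, for phis, the corresponding phi source) is
 * still available at the end of the given predecessor, i.e. it appears in that
 * predecessor's remap table and therefore wasn't spilled there.
 */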
static bool
is_live_in_pred(struct ra_spill_ctx *ctx, struct ir3_register *def,
                struct ir3_block *block, unsigned pred_idx)
{
   struct ir3_block *pred = block->predecessors[pred_idx];
   struct ra_spill_block_state *state = &ctx->blocks[pred->index];
   if (is_live_in_phi(def, block)) {
      def = def->instr->srcs[pred_idx]->def;
      if (!def)
         return false;
   }

   return _mesa_hash_table_search(state->remap, def);
}

static bool
is_live_in_undef(struct ir3_register *def,
                 struct ir3_block *block, unsigned pred_idx)
{
   if (!is_live_in_phi(def, block))
      return false;

   return !def->instr->srcs[pred_idx]->def;
}

static struct reg_or_immed *
read_live_in(struct ra_spill_ctx *ctx, struct ir3_register *def,
             struct ir3_block *block, unsigned pred_idx)
{
   struct ir3_block *pred = block->predecessors[pred_idx];
   struct ra_spill_block_state *state = &ctx->blocks[pred->index];

   if (is_live_in_phi(def, block)) {
      def = def->instr->srcs[pred_idx]->def;
      if (!def)
         return NULL;
   }

   struct hash_entry *entry = _mesa_hash_table_search(state->remap, def);
   if (entry)
      return entry->data;
   else
      return NULL;
}

static bool
is_live_in_all_preds(struct ra_spill_ctx *ctx, struct ir3_register *def,
                     struct ir3_block *block)
{
   for (unsigned i = 0; i < block->predecessors_count; i++) {
      if (!is_live_in_pred(ctx, def, block, i))
         return false;
   }

   return true;
}

static void
spill_live_in(struct ra_spill_ctx *ctx, struct ir3_register *def,
              struct ir3_block *block)
{
   for (unsigned i = 0; i < block->predecessors_count; i++) {
      struct ir3_block *pred = block->predecessors[i];
      struct ra_spill_block_state *state = &ctx->blocks[pred->index];

      if (!state->visited)
         continue;

      struct reg_or_immed *pred_def = read_live_in(ctx, def, block, i);
      if (pred_def) {
         spill(ctx, pred_def, get_spill_slot(ctx, def),
               ir3_before_terminator(pred));
      }
   }
}

static void
spill_live_ins(struct ra_spill_ctx *ctx, struct ir3_block *block)
{
   bool all_preds_visited = true;
   for (unsigned i = 0; i < block->predecessors_count; i++) {
      struct ir3_block *pred = block->predecessors[i];
      struct ra_spill_block_state *state = &ctx->blocks[pred->index];
      if (!state->visited) {
         all_preds_visited = false;
         break;
      }
   }

   /* Note: in the paper they explicitly spill live-through values first, but we
    * should be doing that automatically by virtue of picking the largest
    * distance due to the extra distance added to edges out of loops.
    *
    * TODO: Keep track of pressure in each block and preemptively spill
    * live-through values as described in the paper to avoid spilling them
    * inside the loop.
    */

   if (ctx->cur_pressure.half > ctx->limit_pressure.half) {
      rb_tree_foreach_safe (struct ra_spill_interval, interval,
                            &ctx->half_live_intervals, half_node) {
         if (all_preds_visited &&
             is_live_in_all_preds(ctx, interval->interval.reg, block))
            continue;
         if (interval->interval.reg->merge_set ||
             !interval->can_rematerialize)
            spill_live_in(ctx, interval->interval.reg, block);
         ir3_reg_interval_remove_all(&ctx->reg_ctx, &interval->interval);
         if (ctx->cur_pressure.half <= ctx->limit_pressure.half)
            break;
      }
   }

   if (ctx->cur_pressure.full > ctx->limit_pressure.full) {
      rb_tree_foreach_safe (struct ra_spill_interval, interval,
                            &ctx->full_live_intervals, node) {
         if (all_preds_visited &&
             is_live_in_all_preds(ctx, interval->interval.reg, block))
            continue;
         spill_live_in(ctx, interval->interval.reg, block);
         ir3_reg_interval_remove_all(&ctx->reg_ctx, &interval->interval);
         if (ctx->cur_pressure.full <= ctx->limit_pressure.full)
            break;
      }
   }
}

static void
live_in_rewrite(struct ra_spill_ctx *ctx,
                struct ra_spill_interval *interval,
                struct reg_or_immed *new_val,
                struct ir3_block *block, unsigned pred_idx)
{
   struct ir3_block *pred = block->predecessors[pred_idx];
   struct ra_spill_block_state *state = &ctx->blocks[pred->index];
   struct ir3_register *def = interval->interval.reg;
   if (is_live_in_phi(def, block)) {
      def = def->instr->srcs[pred_idx]->def;
   }

   if (def)
      _mesa_hash_table_insert(state->remap, def, new_val);
}

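/* Make sure a value we decided to keep live into this block is actually
 * available at the end of every already-visited predecessor, reloading or
 * rematerializing it at the end of the predecessor if it isn't, and record
 * the result in that predecessor's remap table.
 */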
static void
reload_live_in(struct ra_spill_ctx *ctx, struct ir3_register *def,
               struct ir3_block *block)
{
   struct ra_spill_interval *interval = ctx->intervals[def->name];
   for (unsigned i = 0; i < block->predecessors_count; i++) {
      struct ir3_block *pred = block->predecessors[i];
      struct ra_spill_block_state *state = &ctx->blocks[pred->index];
      if (!state->visited)
         continue;

      if (is_live_in_undef(def, block, i))
         continue;

      struct reg_or_immed *new_val = read_live_in(ctx, def, block, i);

      if (!new_val) {
         new_val = ralloc(ctx, struct reg_or_immed);
         if (interval->can_rematerialize)
            new_val->def = rematerialize(def, ir3_before_terminator(pred));
         else
            new_val->def = reload(ctx, def, ir3_before_terminator(pred));
         new_val->flags = new_val->def->flags;
      }
      live_in_rewrite(ctx, interval, new_val, block, i);
   }
}

static void
reload_live_ins(struct ra_spill_ctx *ctx, struct ir3_block *block)
{
   rb_tree_foreach (struct ra_spill_interval, interval, &ctx->reg_ctx.intervals,
                    interval.node) {
      reload_live_in(ctx, interval->interval.reg, block);
   }
}

static void
add_live_in_phi(struct ra_spill_ctx *ctx, struct ir3_register *def,
                struct ir3_register *parent_def, struct ir3_block *block)
{
   struct ra_spill_interval *interval = ctx->intervals[def->name];
   if (!interval->interval.inserted)
      return;

   bool needs_phi = false;
   struct ir3_register *cur_def = NULL;
   for (unsigned i = 0; i < block->predecessors_count; i++) {
      struct ir3_block *pred = block->predecessors[i];
      struct ra_spill_block_state *state = &ctx->blocks[pred->index];

      if (!state->visited) {
         needs_phi = true;
         break;
      }

      struct hash_entry *entry =
         _mesa_hash_table_search(state->remap, def);
      assert(entry);
      struct reg_or_immed *pred_val = entry->data;
      if ((pred_val->flags & (IR3_REG_IMMED | IR3_REG_CONST)) ||
          !pred_val->def ||
          (cur_def && cur_def != pred_val->def)) {
         needs_phi = true;
         break;
      }
      cur_def = pred_val->def;
   }

   if (!needs_phi) {
      interval->dst.def = cur_def;
      interval->dst.flags = cur_def->flags;

      rb_tree_foreach (struct ra_spill_interval, child,
                       &interval->interval.children, interval.node) {
         add_live_in_phi(ctx, child->interval.reg, cur_def, block);
      }

      return;
   }

   if (parent_def) {
      /* We have a child interval that needs a phi but whose parent does not
       * need one (otherwise parent_def would be NULL). Just extract the child
       * from the parent without creating a phi for the child.
       */
      unsigned offset = (def->interval_start - parent_def->interval_start) /
         reg_elem_size(def);
      struct ir3_register *extracted =
         extract(parent_def, offset, reg_elems(def), ir3_after_phis(block));
      rewrite_src_interval(ctx, interval, extracted,
                           ir3_after_instr(extracted->instr));
      return;
   }

   struct ir3_instruction *phi = ir3_instr_create_at(
      ir3_before_block(block), OPC_META_PHI, 1, block->predecessors_count);
   struct ir3_register *dst = __ssa_dst(phi);
   dst->flags |= def->flags & (IR3_REG_HALF | IR3_REG_ARRAY);
   dst->size = def->size;
   dst->wrmask = def->wrmask;

   dst->interval_start = def->interval_start;
   dst->interval_end = def->interval_end;
   dst->merge_set = def->merge_set;
   dst->merge_set_offset = def->merge_set_offset;

   for (unsigned i = 0; i < block->predecessors_count; i++) {
      struct ir3_block *pred = block->predecessors[i];
      struct ra_spill_block_state *state = &ctx->blocks[pred->index];
      struct ir3_register *src = ir3_src_create(phi, INVALID_REG, dst->flags);
      src->size = def->size;
      src->wrmask = def->wrmask;

      if (state->visited) {
         struct hash_entry *entry =
            _mesa_hash_table_search(state->remap, def);
         assert(entry);
         struct reg_or_immed *new_val = entry->data;
         set_src_val(src, new_val);
      } else {
         src->def = def;
      }
   }

   interval->dst.def = dst;
   interval->dst.flags = dst->flags;
   rewrite_src_interval(ctx, interval, dst, ir3_after_phis(block));
}

static void
add_live_in_phis(struct ra_spill_ctx *ctx, struct ir3_block *block)
{
   rb_tree_foreach (struct ra_spill_interval, interval, &ctx->reg_ctx.intervals,
                    interval.node) {
      if (BITSET_TEST(ctx->live->live_in[block->index],
                      interval->interval.reg->name)) {
         add_live_in_phi(ctx, interval->interval.reg, NULL, block);
      }
   }
}

1629 /* When spilling a block with a single predecessor, the pred may have other
1630 * successors so we can't choose what's live in and we can't spill/restore
1631 * anything. Just make the inserted intervals exactly match the predecessor. If
1632 * it wasn't live in the predecessor then it must've already been spilled. Also,
1633 * there are no phi nodes and no live-ins.
1634 */
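/* For example, with a CFG shaped roughly like:
 *
 *    pred ----> other
 *      |
 *    block
 *
 * whatever was in a register at the end of "pred" is what "block" starts
 * with; anything without an entry in pred's remap table stays spilled.
 */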
1635 static void
1636 spill_single_pred_live_in(struct ra_spill_ctx *ctx,
1637 struct ir3_block *block)
1638 {
1639 unsigned name;
1640 BITSET_FOREACH_SET (name, ctx->live->live_in[block->index],
1641 ctx->live->definitions_count) {
1642 struct ir3_register *reg = ctx->live->definitions[name];
1643 struct ra_spill_interval *interval = ctx->intervals[reg->name];
1644 struct reg_or_immed *val = read_live_in(ctx, reg, block, 0);
1645 if (val)
1646 interval->dst = *val;
1647 else
1648 ra_spill_ctx_remove(ctx, interval);
1649 }
1650 }
1651
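/* Rewrite the sources of a pre-existing phi so that each source coming from
 * an already-visited predecessor points at whatever now holds that value
 * along the edge (a reload, a rematerialized def, a const, or an immediate).
 * If the phi's destination was spilled entirely, the phi itself is dead.
 */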
1652 static void
1653 rewrite_phi(struct ra_spill_ctx *ctx, struct ir3_instruction *phi,
1654 struct ir3_block *block)
1655 {
1656 if (!(phi->dsts[0]->flags & IR3_REG_SSA))
1657 return;
1658
1659 if (!ctx->intervals[phi->dsts[0]->name]->interval.inserted) {
1660 phi->flags |= IR3_INSTR_UNUSED;
1661 return;
1662 }
1663
1664 for (unsigned i = 0; i < block->predecessors_count; i++) {
1665 struct ir3_block *pred = block->predecessors[i];
1666 struct ra_spill_block_state *state = &ctx->blocks[pred->index];
1667
1668 if (!state->visited)
1669 continue;
1670
1671 struct ir3_register *src = phi->srcs[i];
1672 if (!src->def)
1673 continue;
1674
1675 struct hash_entry *entry =
1676 _mesa_hash_table_search(state->remap, src->def);
1677 assert(entry);
1678 struct reg_or_immed *new_val = entry->data;
1679 set_src_val(src, new_val);
1680 }
1681 }
1682
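/* The following helpers handle blocks whose single successor has already been
 * visited, i.e. the bottom of a loop jumping back to its header. The header's
 * live-in set is already decided, so rather than choosing what to keep we
 * force our end-of-block state to match it: spill what the header doesn't
 * expect, reload what it expects but we no longer have, and finally patch the
 * backedge sources of its phis.
 */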
1683 static void
1684 spill_live_out(struct ra_spill_ctx *ctx, struct ra_spill_interval *interval,
1685 struct ir3_block *block)
1686 {
1687 struct ir3_register *def = interval->interval.reg;
1688
1689 if (interval->interval.reg->merge_set ||
1690 !interval->can_rematerialize) {
1691 spill(ctx, &interval->dst, get_spill_slot(ctx, def),
1692 ir3_before_terminator(block));
1693 }
1694 ir3_reg_interval_remove_all(&ctx->reg_ctx, &interval->interval);
1695 }
1696
1697 static void
1698 spill_live_outs(struct ra_spill_ctx *ctx, struct ir3_block *block)
1699 {
1700 struct ra_spill_block_state *state = &ctx->blocks[block->index];
1701 rb_tree_foreach_safe (struct ra_spill_interval, interval,
1702 &ctx->reg_ctx.intervals, interval.node) {
1703 if (!BITSET_TEST(state->live_out, interval->interval.reg->name)) {
1704 spill_live_out(ctx, interval, block);
1705 }
1706 }
1707 }
1708
1709 static void
1710 reload_live_out(struct ra_spill_ctx *ctx, struct ir3_register *def,
1711 struct ir3_block *block)
1712 {
1713 struct ra_spill_interval *interval = ctx->intervals[def->name];
1714 ir3_reg_interval_insert(&ctx->reg_ctx, &interval->interval);
1715
1716 reload_def(ctx, def, ir3_before_terminator(block));
1717 }
1718
1719 static void
1720 reload_live_outs(struct ra_spill_ctx *ctx, struct ir3_block *block)
1721 {
1722 struct ra_spill_block_state *state = &ctx->blocks[block->index];
1723 unsigned name;
1724 BITSET_FOREACH_SET (name, state->live_out, ctx->live->definitions_count) {
1725 struct ir3_register *reg = ctx->live->definitions[name];
1726 struct ra_spill_interval *interval = ctx->intervals[name];
1727 if (!interval->interval.inserted)
1728 reload_live_out(ctx, reg, block);
1729 }
1730 }
1731
1732 static void
1733 update_live_out_phis(struct ra_spill_ctx *ctx, struct ir3_block *block)
1734 {
1735 assert(!block->successors[1]);
1736 struct ir3_block *succ = block->successors[0];
1737 unsigned pred_idx = ir3_block_get_pred_index(succ, block);
1738
1739 foreach_instr (instr, &succ->instr_list) {
1740 if (instr->opc != OPC_META_PHI)
1741 break;
1742
1743 struct ir3_register *def = instr->srcs[pred_idx]->def;
1744 if (!def)
1745 continue;
1746
1747 struct ra_spill_interval *interval = ctx->intervals[def->name];
1748 if (!interval->interval.inserted)
1749 continue;
1750 set_src_val(instr->srcs[pred_idx], &interval->dst);
1751 }
1752 }
1753
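/* When one of our predecessors hasn't been visited yet (a loop backedge), we
 * can't rewrite anything in it now. Instead we record which values we expect
 * to be live at its end (mapping live-in phis back to their corresponding
 * backedge source), so that spill_live_outs()/reload_live_outs() can make the
 * predecessor match once it is finally processed.
 */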
1754 static void
1755 record_pred_live_out(struct ra_spill_ctx *ctx,
1756 struct ra_spill_interval *interval,
1757 struct ir3_block *block, unsigned pred_idx)
1758 {
1759 struct ir3_block *pred = block->predecessors[pred_idx];
1760 struct ra_spill_block_state *state = &ctx->blocks[pred->index];
1761
1762 struct ir3_register *def = interval->interval.reg;
1763 if (is_live_in_phi(def, block)) {
1764 def = def->instr->srcs[pred_idx]->def;
1765 }
1766 BITSET_SET(state->live_out, def->name);
1767
1768 rb_tree_foreach (struct ra_spill_interval, child,
1769 &interval->interval.children, interval.node) {
1770 record_pred_live_out(ctx, child, block, pred_idx);
1771 }
1772 }
1773
1774 static void
1775 record_pred_live_outs(struct ra_spill_ctx *ctx, struct ir3_block *block)
1776 {
1777 for (unsigned i = 0; i < block->predecessors_count; i++) {
1778 struct ir3_block *pred = block->predecessors[i];
1779 struct ra_spill_block_state *state = &ctx->blocks[pred->index];
1780 if (state->visited)
1781 continue;
1782
1783 state->live_out = rzalloc_array(ctx, BITSET_WORD,
1784 BITSET_WORDS(ctx->live->definitions_count));
1785
1786
1787 rb_tree_foreach (struct ra_spill_interval, interval,
1788 &ctx->reg_ctx.intervals, interval.node) {
1789 record_pred_live_out(ctx, interval, block, i);
1790 }
1791 }
1792 }
1793
1794 static void
1795 record_live_out(struct ra_spill_ctx *ctx,
1796 struct ra_spill_block_state *state,
1797 struct ra_spill_interval *interval)
1798 {
1799 if (!(interval->dst.flags & IR3_REG_SSA) ||
1800 interval->dst.def) {
1801 struct reg_or_immed *val = ralloc(ctx, struct reg_or_immed);
1802 *val = interval->dst;
1803 _mesa_hash_table_insert(state->remap, interval->interval.reg, val);
1804 }
1805 rb_tree_foreach (struct ra_spill_interval, child,
1806 &interval->interval.children, interval.node) {
1807 record_live_out(ctx, state, child);
1808 }
1809 }
1810
1811 static void
1812 record_live_outs(struct ra_spill_ctx *ctx, struct ir3_block *block)
1813 {
1814 struct ra_spill_block_state *state = &ctx->blocks[block->index];
1815 state->remap = _mesa_pointer_hash_table_create(ctx);
1816
1817 rb_tree_foreach (struct ra_spill_interval, interval, &ctx->reg_ctx.intervals,
1818 interval.node) {
1819 record_live_out(ctx, state, interval);
1820 }
1821 }
1822
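/* Per-block driver. The ordering matters: live-ins and input phis are
 * accounted for first, then (when spilling) the live-in set is trimmed,
 * reloaded, and turned into phis as needed, then each instruction is
 * processed in turn, and finally the live-outs are fixed up to match an
 * already-visited successor (loop header) and recorded for our successors.
 */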
1823 static void
1824 handle_block(struct ra_spill_ctx *ctx, struct ir3_block *block)
1825 {
1826 memset(&ctx->cur_pressure, 0, sizeof(ctx->cur_pressure));
1827 rb_tree_init(&ctx->reg_ctx.intervals);
1828 rb_tree_init(&ctx->full_live_intervals);
1829 rb_tree_init(&ctx->half_live_intervals);
1830
1831 unsigned name;
1832 BITSET_FOREACH_SET (name, ctx->live->live_in[block->index],
1833 ctx->live->definitions_count) {
1834 struct ir3_register *reg = ctx->live->definitions[name];
1835 handle_live_in(ctx, block, reg);
1836 }
1837
1838 foreach_instr (instr, &block->instr_list) {
1839 if (instr->opc != OPC_META_PHI && instr->opc != OPC_META_INPUT &&
1840 instr->opc != OPC_META_TEX_PREFETCH)
1841 break;
1842 handle_input_phi(ctx, instr);
1843 }
1844
1845 if (ctx->spilling) {
1846 if (block->predecessors_count == 1 &&
1847 block->predecessors[0]->successors[1]) {
1848 spill_single_pred_live_in(ctx, block);
1849 } else {
1850 spill_live_ins(ctx, block);
1851 reload_live_ins(ctx, block);
1852 record_pred_live_outs(ctx, block);
1853 foreach_instr (instr, &block->instr_list) {
1854 if (instr->opc != OPC_META_PHI)
1855 break;
1856 rewrite_phi(ctx, instr, block);
1857 }
1858 add_live_in_phis(ctx, block);
1859 }
1860 } else {
1861 update_max_pressure(ctx);
1862 }
1863
1864 foreach_instr (instr, &block->instr_list) {
1865 di(instr, "processing");
1866
1867 if (instr->opc == OPC_META_PHI || instr->opc == OPC_META_INPUT ||
1868 instr->opc == OPC_META_TEX_PREFETCH)
1869 remove_input_phi(ctx, instr);
1870 else if (ctx->spilling && instr->opc == OPC_META_PARALLEL_COPY)
1871 handle_pcopy(ctx, instr);
1872 else if (ctx->spilling && instr->opc == OPC_MOV &&
1873 instr->dsts[0] == ctx->base_reg)
1874 /* skip */;
1875 else
1876 handle_instr(ctx, instr);
1877 }
1878
1879 if (ctx->spilling && block->successors[0]) {
1880 struct ra_spill_block_state *state =
1881 &ctx->blocks[block->successors[0]->index];
1882 if (state->visited) {
1883 assert(!block->successors[1]);
1884
1885 spill_live_outs(ctx, block);
1886 reload_live_outs(ctx, block);
1887 update_live_out_phis(ctx, block);
1888 }
1889 }
1890
1891 if (ctx->spilling) {
1892 record_live_outs(ctx, block);
1893 ctx->blocks[block->index].visited = true;
1894 }
1895 }
1896
1897 static bool
1898 simplify_phi_node(struct ir3_instruction *phi)
1899 {
1900 struct ir3_register *def = NULL;
1901 foreach_src (src, phi) {
1902 /* Ignore phi sources which point to the phi itself. */
1903 if (src->def == phi->dsts[0])
1904 continue;
1905 /* If it's undef or it doesn't match the previous sources, bail */
1906 if (!src->def || (def && def != src->def))
1907 return false;
1908 def = src->def;
1909 }
1910
1911 phi->data = def;
1912 phi->flags |= IR3_INSTR_UNUSED;
1913 return true;
1914 }
1915
1916 static struct ir3_register *
1917 simplify_phi_def(struct ir3_register *def)
1918 {
1919 if (def->instr->opc == OPC_META_PHI) {
1920 struct ir3_instruction *phi = def->instr;
1921
1922 /* Note: this function is always called at least once after visiting the
1923 * phi, so either there has been a simplified phi in the meantime, in
1924 * which case we will set progress=true and visit the definition again, or
1925 * phi->data already has the most up-to-date value. Therefore we don't
1926 * have to recursively check phi->data.
1927 */
1928 if (phi->data)
1929 return phi->data;
1930 }
1931
1932 return def;
1933 }
1934
1935 static void
1936 simplify_phi_srcs(struct ir3_instruction *instr)
1937 {
1938 foreach_src (src, instr) {
1939 if (src->def)
1940 src->def = simplify_phi_def(src->def);
1941 }
1942 }
1943
1944 /* We insert phi nodes for all live-ins of loops in case we need to split the
1945 * live range. This pass cleans that up for the case where the live range didn't
1946 * actually need to be split.
1947 */
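/* For example, a loop live-in that never actually got spilled typically ends
 * up as roughly:
 *
 *    header: x1 = phi(x0, x1)
 *
 * where the backedge source is the phi itself. Every other source is x0, so
 * the phi gets marked unused and its uses are folded back to x0.
 */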
1948 static void
1949 simplify_phi_nodes(struct ir3 *ir)
1950 {
1951 foreach_block (block, &ir->block_list) {
1952 foreach_instr (instr, &block->instr_list) {
1953 if (instr->opc != OPC_META_PHI)
1954 break;
1955 instr->data = NULL;
1956 }
1957 }
1958
1959 bool progress;
1960 do {
1961 progress = false;
1962 foreach_block (block, &ir->block_list) {
1963 foreach_instr (instr, &block->instr_list) {
1964 if (instr->opc == OPC_META_PHI || (instr->flags & IR3_INSTR_UNUSED))
1965 continue;
1966
1967 simplify_phi_srcs(instr);
1968 }
1969
1970 /* Visit phi nodes in the successors to make sure that phi sources are
1971 * always visited at least once after visiting the definition they
1972 * point to. See note in simplify_phi_def() for why this is necessary.
1973 */
1974 for (unsigned i = 0; i < 2; i++) {
1975 struct ir3_block *succ = block->successors[i];
1976 if (!succ)
1977 continue;
1978 foreach_instr (instr, &succ->instr_list) {
1979 if (instr->opc != OPC_META_PHI)
1980 break;
1981 if (instr->flags & IR3_INSTR_UNUSED) {
1982 if (instr->data)
1983 instr->data = simplify_phi_def(instr->data);
1984 } else {
1985 simplify_phi_srcs(instr);
1986 progress |= simplify_phi_node(instr);
1987 }
1988 }
1989 }
1990 }
1991 } while (progress);
1992 }
1993
1994 static void
1995 unmark_dead(struct ir3 *ir)
1996 {
1997 foreach_block (block, &ir->block_list) {
1998 foreach_instr (instr, &block->instr_list) {
1999 instr->flags &= ~IR3_INSTR_UNUSED;
2000 }
2001 }
2002 }
2003
2004 /* Simple pass to remove now-dead phi nodes and pcopy instructions. We mark
2005 * which ones are dead along the way, so there's nothing to compute here.
2006 */
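/* For example, a dead parallel copy that still writes a non-SSA (shared)
 * register, roughly:
 *
 *    ssa_1, rN.x = parallel_copy ssa_0, rM.x
 *
 * only has its SSA entry dropped; the rN.x = rM.x copy survives. Parallel
 * copies left with no destinations, and dead phis, are removed outright.
 */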
2007 static void
2008 cleanup_dead(struct ir3 *ir)
2009 {
2010 foreach_block (block, &ir->block_list) {
2011 foreach_instr_safe (instr, &block->instr_list) {
2012 if (instr->flags & IR3_INSTR_UNUSED) {
2013 if (instr->opc == OPC_META_PARALLEL_COPY) {
2014 /* There may be non-SSA shared copies; we need to preserve these.
2015 */
2016 for (unsigned i = 0; i < instr->dsts_count;) {
2017 if (instr->dsts[i]->flags & IR3_REG_SSA) {
2018 instr->dsts[i] = instr->dsts[--instr->dsts_count];
2019 instr->srcs[i] = instr->srcs[--instr->srcs_count];
2020 } else {
2021 i++;
2022 }
2023 }
2024
2025 if (instr->dsts_count == 0)
2026 list_delinit(&instr->node);
2027 } else {
2028 list_delinit(&instr->node);
2029 }
2030 }
2031 }
2032 }
2033 }
2034
2035 /* Deal with merge sets after spilling. Spilling generally leaves the merge sets
2036 * in a mess, and even if we properly cleaned up after ourselves, we would want
2037 * to recompute the merge sets afterward anyway. That's because
2038 * spilling/reloading can "break up" phi webs and split/collect webs so that
2039 * allocating them to the same register no longer gives any benefit. For
2040 * example, imagine we have this:
2041 *
2042 * if (...) {
2043 * foo = ...
2044 * } else {
2045 * bar = ...
2046 * }
2047 * baz = phi(foo, bar)
2048 *
2049 * and we spill "baz":
2050 *
2051 * if (...) {
2052 * foo = ...
2053 * spill(foo)
2054 * } else {
2055 * bar = ...
2056 * spill(bar)
2057 * }
2058 * baz = reload()
2059 *
2060 * now foo, bar, and baz don't have to be allocated to the same register. How
2061 * exactly the merge sets change can be complicated, so it's easier just to
2062 * recompute them.
2063 *
2064 * However, there's a wrinkle in this: those same merge sets determine the
2065 * register pressure, due to multiple values inhabiting the same register! And
2066 * we assume that this sharing happens when spilling. Therefore we need a
2067 * three-step procedure:
2068 *
2069 * 1. Drop the original merge sets.
2070 * 2. Calculate which values *must* be merged, being careful to only use the
2071 * interval information which isn't trashed by spilling, and forcibly merge
2072 * them.
2073 * 3. Let ir3_merge_regs() finish the job, including recalculating the
2074 * intervals.
2075 */
2076
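/* For step 2, the merges we must redo are roughly the ones the pressure
 * accounting already assumed: a split/collect source that stays live past the
 * instruction and whose live interval overlaps the destination's, e.g.
 *
 *    dst = collect a, b      (where "a" is used again later)
 *
 * has to share a register with dst, so "a" is force-merged into dst's set at
 * its interval offset.
 */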
2077 static void
2078 fixup_merge_sets(struct ir3_liveness *live, struct ir3 *ir)
2079 {
2080 foreach_block (block, &ir->block_list) {
2081 foreach_instr (instr, &block->instr_list) {
2082 foreach_dst (dst, instr) {
2083 dst->merge_set = NULL;
2084 dst->merge_set_offset = 0;
2085 }
2086 }
2087 }
2088
2089 ir3_index_instrs_for_merge_sets(ir);
2090
2091 foreach_block (block, &ir->block_list) {
2092 foreach_instr (instr, &block->instr_list) {
2093 if (instr->opc != OPC_META_SPLIT &&
2094 instr->opc != OPC_META_COLLECT)
2095 continue;
2096
2097 struct ir3_register *dst = instr->dsts[0];
2098 ra_foreach_src (src, instr) {
2099 if (!(src->flags & IR3_REG_KILL) &&
2100 src->def->interval_start < dst->interval_end &&
2101 dst->interval_start < src->def->interval_end) {
2102 ir3_force_merge(dst, src->def,
2103 src->def->interval_start - dst->interval_start);
2104 }
2105 }
2106 }
2107 }
2108
2109 ir3_merge_regs(live, ir);
2110 }
2111
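/* Entry points. A caller is expected to do something along these lines
 * (illustrative only; the real driver logic lives in the RA code, and
 * "limit" would come from the register file budget for the variant):
 *
 *    struct ir3_pressure max, limit;
 *    ir3_calc_pressure(v, live, &max);
 *    ...
 *    if (max.full > limit.full || max.half > limit.half)
 *       ir3_spill(ir, v, &live, &limit);
 *
 * ir3_calc_pressure() only computes the maximum pressure; ir3_spill()
 * rewrites the IR and recomputes liveness, returning it through *live.
 */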
2112 void
2113 ir3_calc_pressure(struct ir3_shader_variant *v, struct ir3_liveness *live,
2114 struct ir3_pressure *max_pressure)
2115 {
2116 struct ra_spill_ctx *ctx = rzalloc(NULL, struct ra_spill_ctx);
2117 spill_ctx_init(ctx, v, live);
2118
2119 foreach_block (block, &v->ir->block_list) {
2120 handle_block(ctx, block);
2121 }
2122
2123 assert(ctx->cur_pressure.full == 0);
2124 assert(ctx->cur_pressure.half == 0);
2125 assert(ctx->cur_pressure.shared == 0);
2126 assert(ctx->cur_pressure.shared_half == 0);
2127
2128 *max_pressure = ctx->max_pressure;
2129 ralloc_free(ctx);
2130 }
2131
2132 bool
2133 ir3_spill(struct ir3 *ir, struct ir3_shader_variant *v,
2134 struct ir3_liveness **live,
2135 const struct ir3_pressure *limit_pressure)
2136 {
2137 void *mem_ctx = ralloc_parent(*live);
2138 struct ra_spill_ctx *ctx = rzalloc(mem_ctx, struct ra_spill_ctx);
2139 spill_ctx_init(ctx, v, *live);
2140
2141 ctx->spilling = true;
2142
2143 ctx->blocks = rzalloc_array(ctx, struct ra_spill_block_state,
2144 ctx->live->block_count);
2145 rb_tree_init(&ctx->full_live_intervals);
2146 rb_tree_init(&ctx->half_live_intervals);
2147
2148 ctx->limit_pressure = *limit_pressure;
2149 ctx->spill_slot = v->pvtmem_size;
2150
2151 add_base_reg(ctx, ir);
2152 compute_next_distance(ctx, ir);
2153
2154 unmark_dead(ir);
2155
2156 foreach_block (block, &ir->block_list) {
2157 handle_block(ctx, block);
2158 }
2159
2160 simplify_phi_nodes(ir);
2161
2162 cleanup_dead(ir);
2163
2164 ir3_create_parallel_copies(ir);
2165
2166 /* After this point, we're done mutating the IR. Liveness has been trashed,
2167 * so recalculate it. We'll need it for recalculating the merge sets.
2168 */
2169 ralloc_free(ctx->live);
2170 *live = ir3_calc_liveness(mem_ctx, ir);
2171
2172 fixup_merge_sets(*live, ir);
2173
2174 v->pvtmem_size = ctx->spill_slot;
2175 ralloc_free(ctx);
2176
2177 return true;
2178 }
2179