xref: /aosp_15_r20/external/mesa3d/src/freedreno/ir3/ir3_shared_ra.c (revision 6104692788411f58d303aa86923a9ff6ecaded22)
1 /*
2  * Copyright © 2021 Valve Corporation
3  * Copyright © 2014 Rob Clark <[email protected]>
4  * SPDX-License-Identifier: MIT
5  */
6 
7 #include "ir3_ra.h"
8 #include "ir3_shader.h"
9 
10 #include "util/u_math.h"
11 
12 /* Allocating shared registers can pose a challenge, because their live
13  * intervals use the physical CFG which has extra edges inserted that are
14  * pretty much always critical edges. This causes problems with phi nodes,
15  * because copies for phi nodes have to happen "along the edge," and similarly
16  * causes problems when reunifying values that have had their live range split.
17  * Problematic phi nodes should be relatively rare, so we ban them for now.
18  * The solution we choose for live-range splitting is to integrate spilling and
19  * register allocation and spill to vector registers rather than split a live
20  * range, which negates some of the advantages of SSA-based RA, but it isn't as
21  * bad as it seems because the conditions needed (vector shared registers, which
22  * only movmsk currently produces, or fixed registers which we don't do) are
23  * relatively rare. Spilling is also much cheaper than spilling vector registers
24  * to private memory.
25  */
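/* As a rough sketch (pseudo-IR; register names are hypothetical), spilling a
 * shared value and reloading it later amounts to a pair of movs through an
 * ordinary GPR:
 *
 *    mov  r0.x, shared.x     ; spill: copy the shared reg to an ordinary reg
 *    ...                     ; the shared reg may be reused in the meantime
 *    mov  shared'.x, r0.x    ; reload: copy back into a (possibly different)
 *                            ;         shared reg
 *
 * which is far cheaper than spilling an ordinary vector register through
 * private memory.
 */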
26 
27 struct ra_interval {
28    struct ir3_reg_interval interval;
29 
30    struct rb_node physreg_node;
31    physreg_t physreg_start, physreg_end;
32 
33    /* Where the shared register is spilled to. If it had no uses when it was
34     * spilled, this may simply be the original defining instruction.
35     */
36    struct ir3_register *spill_def;
37 
38    /* Whether this contains a source of the current instruction that can't be
39     * spilled.
40     */
41    bool src;
42 
43    bool needs_reload;
44 };
45 
46 struct ra_block_state {
47    bool visited;
48 
49    /* For blocks whose successors are visited first (i.e. loop backedges), which
50     * values should be live at the end.
51     */
52    BITSET_WORD *live_out;
53 };
54 
55 struct ra_ctx {
56    struct ir3_reg_ctx reg_ctx;
57 
58    BITSET_DECLARE(available, RA_MAX_FILE_SIZE);
59 
60    struct rb_tree physreg_intervals;
61 
62    struct ra_interval *intervals;
63 
64    struct ir3_liveness *live;
65 
66    struct hash_table *pcopy_src_map;
67 
68    struct ra_block_state *blocks;
69 
70    unsigned start;
71 };
72 
73 static struct ra_interval *
74 ir3_reg_interval_to_ra_interval(struct ir3_reg_interval *interval)
75 {
76    return rb_node_data(struct ra_interval, interval, interval);
77 }
78 
79 static struct ra_interval *
80 rb_node_to_interval(struct rb_node *node)
81 {
82    return rb_node_data(struct ra_interval, node, physreg_node);
83 }
84 
85 static const struct ra_interval *
86 rb_node_to_interval_const(const struct rb_node *node)
87 {
88    return rb_node_data(struct ra_interval, node, physreg_node);
89 }
90 
91 static struct ra_interval *
92 ra_interval_next(struct ra_interval *interval)
93 {
94    struct rb_node *next = rb_node_next(&interval->physreg_node);
95    return next ? rb_node_to_interval(next) : NULL;
96 }
97 
98 static struct ra_interval *
99 ra_interval_next_or_null(struct ra_interval *interval)
100 {
101    return interval ? ra_interval_next(interval) : NULL;
102 }
103 
104 static int
105 ra_interval_insert_cmp(const struct rb_node *_a, const struct rb_node *_b)
106 {
107    const struct ra_interval *a = rb_node_to_interval_const(_a);
108    const struct ra_interval *b = rb_node_to_interval_const(_b);
109    return b->physreg_start - a->physreg_start;
110 }
111 
112 static int
113 ra_interval_cmp(const struct rb_node *node, const void *data)
114 {
115    physreg_t reg = *(const physreg_t *)data;
116    const struct ra_interval *interval = rb_node_to_interval_const(node);
117    if (interval->physreg_start > reg)
118       return -1;
119    else if (interval->physreg_end <= reg)
120       return 1;
121    else
122       return 0;
123 }
124 
125 static struct ra_ctx *
126 ir3_reg_ctx_to_ctx(struct ir3_reg_ctx *ctx)
127 {
128    return rb_node_data(struct ra_ctx, ctx, reg_ctx);
129 }
130 
131 static struct ra_interval *
132 ra_interval_search_sloppy(struct rb_tree *tree, physreg_t reg)
133 {
134    struct rb_node *node = rb_tree_search_sloppy(tree, &reg, ra_interval_cmp);
135    return node ? rb_node_to_interval(node) : NULL;
136 }
137 
138 /* Get the interval covering the reg, or the closest to the right if it
139  * doesn't exist.
140  */
141 static struct ra_interval *
142 ra_interval_search_right(struct rb_tree *tree, physreg_t reg)
143 {
144    struct ra_interval *interval = ra_interval_search_sloppy(tree, reg);
145    if (!interval) {
146       return NULL;
147    } else if (interval->physreg_end > reg) {
148       return interval;
149    } else {
150        /* There is no interval covering reg, and ra_interval_search_sloppy()
151        * returned the closest range to the left, so the next interval to the
152        * right should be the closest to the right.
153        */
154       return ra_interval_next_or_null(interval);
155    }
156 }
157 
158 static struct ra_interval *
159 ra_ctx_search_right(struct ra_ctx *ctx, physreg_t reg)
160 {
161    return ra_interval_search_right(&ctx->physreg_intervals, reg);
162 }
163 
164 static void
165 interval_add(struct ir3_reg_ctx *reg_ctx, struct ir3_reg_interval *_interval)
166 {
167    struct ra_interval *interval = ir3_reg_interval_to_ra_interval(_interval);
168    struct ra_ctx *ctx = ir3_reg_ctx_to_ctx(reg_ctx);
169 
170    /* We can assume in this case that physreg_start/physreg_end is already
171     * initialized.
172     */
173    for (physreg_t i = interval->physreg_start; i < interval->physreg_end; i++) {
174       BITSET_CLEAR(ctx->available, i);
175    }
176 
177    rb_tree_insert(&ctx->physreg_intervals, &interval->physreg_node,
178                   ra_interval_insert_cmp);
179 }
180 
181 static void
182 interval_delete(struct ir3_reg_ctx *reg_ctx, struct ir3_reg_interval *_interval)
183 {
184    struct ra_interval *interval = ir3_reg_interval_to_ra_interval(_interval);
185    struct ra_ctx *ctx = ir3_reg_ctx_to_ctx(reg_ctx);
186 
187    for (physreg_t i = interval->physreg_start; i < interval->physreg_end; i++) {
188       BITSET_SET(ctx->available, i);
189    }
190 
191    rb_tree_remove(&ctx->physreg_intervals, &interval->physreg_node);
192 }
193 
194 static void
195 interval_readd(struct ir3_reg_ctx *ctx, struct ir3_reg_interval *_parent,
196                struct ir3_reg_interval *_child)
197 {
198    struct ra_interval *parent = ir3_reg_interval_to_ra_interval(_parent);
199    struct ra_interval *child = ir3_reg_interval_to_ra_interval(_child);
200 
201    child->physreg_start =
202       parent->physreg_start + (child->interval.reg->interval_start -
203                                parent->interval.reg->interval_start);
204    child->physreg_end =
205       child->physreg_start +
206       (child->interval.reg->interval_end - child->interval.reg->interval_start);
207 
208    interval_add(ctx, _child);
209 }
210 
211 static void
212 ra_ctx_init(struct ra_ctx *ctx)
213 {
214    ctx->reg_ctx.interval_add = interval_add;
215    ctx->reg_ctx.interval_delete = interval_delete;
216    ctx->reg_ctx.interval_readd = interval_readd;
217 }
218 
219 static void
220 ra_ctx_reset_block(struct ra_ctx *ctx)
221 {
222    for (unsigned i = 0; i < RA_SHARED_SIZE; i++) {
223       BITSET_SET(ctx->available, i);
224    }
225 
226    rb_tree_init(&ctx->reg_ctx.intervals);
227    rb_tree_init(&ctx->physreg_intervals);
228 }
229 
230 static void
231 ra_interval_init(struct ra_interval *interval, struct ir3_register *reg)
232 {
233    ir3_reg_interval_init(&interval->interval, reg);
234 }
235 
236 static physreg_t
237 ra_interval_get_physreg(const struct ra_interval *interval)
238 {
239    unsigned child_start = interval->interval.reg->interval_start;
240 
241    while (interval->interval.parent) {
242       interval = ir3_reg_interval_to_ra_interval(interval->interval.parent);
243    }
244 
245    return interval->physreg_start +
246           (child_start - interval->interval.reg->interval_start);
247 }
248 
249 static unsigned
250 ra_interval_get_num(const struct ra_interval *interval)
251 {
252    return ra_physreg_to_num(ra_interval_get_physreg(interval),
253                             interval->interval.reg->flags);
254 }
255 
256 static void
257 ra_interval_dump(struct log_stream *stream, struct ra_interval *interval)
258 {
259    mesa_log_stream_printf(stream, "physreg %u ", interval->physreg_start);
260 
261    ir3_reg_interval_dump(stream, &interval->interval);
262 }
263 
264 static void
265 ra_ctx_dump(struct ra_ctx *ctx)
266 {
267    struct log_stream *stream = mesa_log_streami();
268 
269    mesa_log_stream_printf(stream, "shared:\n");
270    rb_tree_foreach (struct ra_interval, interval, &ctx->physreg_intervals,
271                     physreg_node) {
272       ra_interval_dump(stream, interval);
273    }
274 
275    unsigned start, end;
276    mesa_log_stream_printf(stream, "available:\n");
277    BITSET_FOREACH_RANGE (start, end, ctx->available, RA_SHARED_SIZE) {
278       mesa_log_stream_printf(stream, "%u-%u ", start, end);
279    }
280    mesa_log_stream_printf(stream, "\n");
281    mesa_log_stream_printf(stream, "start: %u\n", ctx->start);
282 }
283 
284 static bool
285 get_reg_specified(struct ra_ctx *ctx, struct ir3_register *reg, physreg_t physreg)
286 {
287    for (unsigned i = 0; i < reg_size(reg); i++) {
288       if (!BITSET_TEST(ctx->available, physreg + i))
289          return false;
290    }
291 
292    return true;
293 }
294 
295 static unsigned
296 reg_file_size(struct ir3_register *reg)
297 {
298    if (reg->flags & IR3_REG_HALF)
299       return RA_SHARED_HALF_SIZE;
300    else
301       return RA_SHARED_SIZE;
302 }
303 
304 static physreg_t
305 find_best_gap(struct ra_ctx *ctx, struct ir3_register *dst, unsigned size,
306               unsigned align)
307 {
308    unsigned file_size = reg_file_size(dst);
309 
310    /* This can happen if we create a very large merge set. Just bail out in that
311     * case.
312     */
313    if (size > file_size)
314       return (physreg_t) ~0;
315 
316    unsigned start = ALIGN(ctx->start, align) % (file_size - size + align);
317    unsigned candidate = start;
318    do {
319       bool is_available = true;
320       for (unsigned i = 0; i < size; i++) {
321          if (!BITSET_TEST(ctx->available, candidate + i)) {
322             is_available = false;
323             break;
324          }
325       }
326 
327       if (is_available) {
328          ctx->start = (candidate + size) % file_size;
329          return candidate;
330       }
331 
332       candidate += align;
333       if (candidate + size > file_size)
334          candidate = 0;
335    } while (candidate != start);
336 
337    return (physreg_t)~0;
338 }
339 
340 static physreg_t
341 find_best_spill_reg(struct ra_ctx *ctx, struct ir3_register *reg,
342                     unsigned size, unsigned align)
343 {
344    unsigned file_size = reg_file_size(reg);
345    unsigned min_cost = UINT_MAX;
346 
347    unsigned start = ALIGN(ctx->start, align) % (file_size - size + align);
348    physreg_t candidate = start;
349    physreg_t best_reg = (physreg_t)~0;
350    do {
351       unsigned cost = 0;
352 
353       /* Iterate through intervals we'd need to spill to use this reg. */
354       for (struct ra_interval *interval = ra_ctx_search_right(ctx, candidate);
355            interval && interval->physreg_start < candidate + size;
356            interval = ra_interval_next_or_null(interval)) {
357          /* We can't spill sources of the current instruction when reloading
358           * sources.
359           */
360          if (interval->src) {
361             cost = UINT_MAX;
362             break;
363          }
364 
365          /* We prefer spilling intervals that already have been spilled, so we
366           * don't have to emit another mov.
367           */
368          if (!interval->spill_def)
369             cost += (interval->physreg_end - interval->physreg_start);
370       }
371 
372       if (cost < min_cost) {
373          min_cost = cost;
374          best_reg = candidate;
375       }
376 
377       candidate += align;
378       if (candidate + size > file_size)
379          candidate = 0;
380    } while (candidate != start);
381 
382    return best_reg;
383 }
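/* The cost model in find_best_spill_reg() is intentionally simple: a candidate
 * position is charged the size of every overlapping interval that has not
 * already been spilled (intervals with a spill_def can be evicted for free),
 * and any position overlapping a pinned source of the current instruction is
 * ruled out entirely. For example (hypothetical), evicting one unspilled
 * 2-component interval costs 2, while evicting two already-spilled intervals
 * costs 0.
 */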
384 
385 static struct ir3_register *
386 split(struct ir3_register *def, unsigned offset, struct ir3_instruction *before)
387 {
388    if (reg_elems(def) == 1) {
389       assert(offset == 0);
390       return def;
391    }
392 
393    struct ir3_instruction *split =
394       ir3_instr_create(before->block, OPC_META_SPLIT, 1, 1);
395    split->split.off = offset;
396    struct ir3_register *dst = __ssa_dst(split);
397    struct ir3_register *src =
398       ir3_src_create(split, INVALID_REG, def->flags & (IR3_REG_HALF | IR3_REG_SSA));
399    src->wrmask = def->wrmask;
400    src->def = def;
401    ir3_instr_move_after(split, before);
402    return dst;
403 }
404 
405 static struct ir3_register *
406 extract(struct ir3_register *parent_def, unsigned offset, unsigned elems,
407         struct ir3_instruction *before)
408 {
409    if (offset == 0 && elems == reg_elems(parent_def))
410       return parent_def;
411 
412    if (elems == 1)
413       return split(parent_def, offset, before);
414 
415    struct ir3_instruction *collect =
416       ir3_instr_create(before->block, OPC_META_COLLECT, 1, elems);
417    struct ir3_register *dst = __ssa_dst(collect);
418    dst->flags |= parent_def->flags & IR3_REG_HALF;
419    dst->wrmask = MASK(elems);
420 
421    ir3_instr_move_after(collect, before);
422 
423    for (unsigned i = 0; i < elems; i++) {
424       ir3_src_create(collect, INVALID_REG,
425                      parent_def->flags & (IR3_REG_HALF | IR3_REG_SSA))->def =
426          split(parent_def, offset + i, before);
427    }
428 
429    return dst;
430 }
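/* For example (pseudo-IR; SSA names are made up), extracting elements 1..2 of
 * a 4-component value ssa_1 becomes a collect of two single-element splits:
 *
 *    ssa_2 = split(off=1) ssa_1
 *    ssa_3 = split(off=2) ssa_1
 *    ssa_4 = collect ssa_2, ssa_3
 *
 * Extracting a single element uses just one split, and extracting the whole
 * value returns ssa_1 unchanged.
 */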
431 
432 static void
433 spill_interval_children(struct ra_interval *interval,
434                         struct ir3_instruction *before)
435 {
436    rb_tree_foreach (struct ra_interval, child, &interval->interval.children,
437                     interval.node) {
438       if (!child->spill_def) {
439          child->spill_def = extract(interval->spill_def,
440                                     (child->interval.reg->interval_start -
441                                      interval->interval.reg->interval_start) /
442                                     reg_elem_size(interval->interval.reg),
443                                     reg_elems(child->interval.reg), before);
444       }
445       spill_interval_children(child, before);
446    }
447 }
448 
449 static void
450 spill_interval(struct ra_ctx *ctx, struct ra_interval *interval)
451 {
452    struct ir3_instruction *before = interval->interval.reg->instr;
453 
454    d("spilling ssa_%u:%u", before->serialno, interval->interval.reg->name);
455 
456    if (!interval->spill_def) {
457       /* If this is a phi node or input, we need to insert the demotion to a
458        * regular register after the last phi or input in the block.
459        */
460       if (before->opc == OPC_META_PHI ||
461           before->opc == OPC_META_INPUT) {
462          struct ir3_block *block = before->block;
463          struct ir3_instruction *last_phi_input = NULL;
464          foreach_instr_from (instr, before, &block->instr_list) {
465             if (instr->opc != before->opc)
466                break;
467             last_phi_input = instr;
468          }
469          before = last_phi_input;
470       }
471 
472       struct ir3_instruction *mov = ir3_instr_create(before->block, OPC_MOV, 1, 1);
473       mov->flags |= IR3_INSTR_SHARED_SPILL;
474       struct ir3_register *dst = __ssa_dst(mov);
475       dst->flags |= (interval->interval.reg->flags & IR3_REG_HALF);
476       dst->wrmask = interval->interval.reg->wrmask;
477       mov->repeat = reg_elems(dst) - 1;
478       ir3_src_create(mov, interval->interval.reg->num,
479                      IR3_REG_SHARED | (mov->repeat ? IR3_REG_R : 0) |
480                      (interval->interval.reg->flags & IR3_REG_HALF))->wrmask =
481                      interval->interval.reg->wrmask;
482       mov->cat1.src_type = mov->cat1.dst_type =
483          (interval->interval.reg->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;
484 
485       ir3_instr_move_after(mov, before);
486       interval->spill_def = dst;
487    }
488 
489    spill_interval_children(interval, interval->spill_def->instr);
490 
491    ir3_reg_interval_remove_all(&ctx->reg_ctx, &interval->interval);
492 }
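/* Roughly (pseudo-IR; names are made up), for a 4-component movmsk result that
 * lives in shared registers, the spill above produces:
 *
 *    ssa_5 = movmsk                 ; original def, in shared regs
 *    ssa_6 = mov (rpt3) <shared>    ; spill_def: copy into ordinary regs
 *
 * The interval is then removed from the shared file; later uses either read
 * ssa_6 directly (demotion) or reload it back into a shared register.
 */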
493 
494 /* Try to demote a scalar ALU instruction to a normal ALU instruction, using the
495  * spilled sources. We have to take into account restrictions on the number of
496  * shared sources that only exist for normal ALU instructions.
497  */
498 static bool
499 try_demote_instruction(struct ra_ctx *ctx, struct ir3_instruction *instr)
500 {
501    /* First, check restrictions. */
502    switch (opc_cat(instr->opc)) {
503    case 1:
504       /* MOVMSK is special and can't be demoted. It also has no sources so must
505        * go before the check below.
506        */
507       if (instr->opc == OPC_MOVMSK)
508          return false;
509 
510       assert(instr->srcs_count >= 1);
511       if (!(instr->srcs[0]->flags & (IR3_REG_CONST | IR3_REG_IMMED)))
512          return false;
513       break;
514    case 2: {
515       /* We need one source to either be demotable or an immediate. */
516       if (instr->srcs_count > 1) {
517          struct ra_interval *src0_interval =
518             (instr->srcs[0]->flags & IR3_REG_SSA) ? &ctx->intervals[instr->srcs[0]->def->name] : NULL;
519          struct ra_interval *src1_interval =
520             (instr->srcs[1]->flags & IR3_REG_SSA) ? &ctx->intervals[instr->srcs[1]->def->name] : NULL;
521          if (!(src0_interval && src0_interval->spill_def) &&
522              !(src1_interval && src1_interval->spill_def) &&
523              !(instr->srcs[0]->flags & IR3_REG_IMMED) &&
524              !(instr->srcs[1]->flags & IR3_REG_IMMED))
525             return false;
526       }
527       break;
528    }
529    case 3: {
530       struct ra_interval *src0_interval =
531          (instr->srcs[0]->flags & IR3_REG_SSA) ? &ctx->intervals[instr->srcs[0]->def->name] : NULL;
532       struct ra_interval *src1_interval =
533          (instr->srcs[1]->flags & IR3_REG_SSA) ? &ctx->intervals[instr->srcs[1]->def->name] : NULL;
534 
535       /* src1 cannot be shared */
536       if (src1_interval && !src1_interval->spill_def) {
537          /* Try to swap src0 and src1, similar to what copy prop does. */
538          if (!is_mad(instr->opc))
539             return false;
540 
541          if ((src0_interval && src0_interval->spill_def) ||
542              (instr->srcs[0]->flags & IR3_REG_IMMED)) {
543             struct ir3_register *src0 = instr->srcs[0];
544             instr->srcs[0] = instr->srcs[1];
545             instr->srcs[1] = src0;
546          } else {
547             return false;
548          }
549       }
550       break;
551    }
552    case 4: {
553       assert(instr->srcs[0]->flags & IR3_REG_SSA);
554       struct ra_interval *src_interval = &ctx->intervals[instr->srcs[0]->def->name];
555       if (!src_interval->spill_def)
556          return false;
557       break;
558    }
559 
560    default:
561       return false;
562    }
563 
564    d("demoting instruction");
565 
566    /* If the instruction is already not a scalar ALU instruction, we should've
567     * skipped reloading and just demoted sources directly, so we should never
568     * get here.
569     */
570    assert(instr->dsts[0]->flags & IR3_REG_SHARED);
571 
572    /* Now we actually demote the instruction */
573    ra_foreach_src (src, instr) {
574       assert(src->flags & IR3_REG_SHARED);
575       struct ra_interval *interval = &ctx->intervals[src->def->name];
576       if (interval->spill_def) {
577          src->def = interval->spill_def;
578          src->flags &= ~IR3_REG_SHARED;
579          interval->needs_reload = false;
580          if (interval->interval.inserted)
581             ir3_reg_interval_remove(&ctx->reg_ctx, &interval->interval);
582          while (interval->interval.parent)
583             interval = ir3_reg_interval_to_ra_interval(interval->interval.parent);
584          interval->src = false;
585       }
586    }
587 
588    struct ra_interval *dst_interval = &ctx->intervals[instr->dsts[0]->name];
589    instr->dsts[0]->flags &= ~IR3_REG_SHARED;
590    ra_interval_init(dst_interval, instr->dsts[0]);
591    dst_interval->spill_def = instr->dsts[0];
592 
593    instr->flags |= IR3_INSTR_SHARED_SPILL;
594 
595    return true;
596 }
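/* For instance (pseudo-IR, hypothetical), if both sources of a scalar add
 * already have spill defs, then
 *
 *    ssa_3 (shared) = add ssa_1 (shared), ssa_2 (shared)
 *
 * is rewritten in place to read the non-shared spill defs and to produce a
 * non-shared destination, so no shared register has to be freed for it at all.
 */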
597 
598 /* Free up [start, start + size) by spilling live intervals.
599  */
600 static void
601 free_space(struct ra_ctx *ctx, physreg_t start, unsigned size)
602 {
603    struct ra_interval *interval = ra_ctx_search_right(ctx, start);
604    while (interval && interval->physreg_start < start + size) {
605       struct ra_interval *next = ra_interval_next_or_null(interval);
606       spill_interval(ctx, interval);
607       interval = next;
608    }
609 }
610 
611 static physreg_t
612 get_reg(struct ra_ctx *ctx, struct ir3_register *reg, bool src)
613 {
614    if (reg->merge_set && reg->merge_set->preferred_reg != (physreg_t)~0) {
615       physreg_t preferred_reg =
616          reg->merge_set->preferred_reg + reg->merge_set_offset;
617       if (preferred_reg < reg_file_size(reg) &&
618           preferred_reg % reg_elem_size(reg) == 0 &&
619           get_reg_specified(ctx, reg, preferred_reg))
620          return preferred_reg;
621    }
622 
623    /* If this register is a subset of a merge set which we have not picked a
624     * register for, first try to allocate enough space for the entire merge
625     * set.
626     */
627    unsigned size = reg_size(reg);
628    if (reg->merge_set && reg->merge_set->preferred_reg == (physreg_t)~0 &&
629        size < reg->merge_set->size) {
630       physreg_t best_reg = find_best_gap(ctx, reg, reg->merge_set->size,
631                                          reg->merge_set->alignment);
632       if (best_reg != (physreg_t)~0u) {
633          best_reg += reg->merge_set_offset;
634          return best_reg;
635       }
636    }
637 
638    /* For ALU and SFU instructions, if a source's register is available to
639     * pick, reuse it: this doesn't introduce unnecessary dependencies, and it
640     * potentially avoids needing (ss) syncs for write-after-read hazards on
641     * SFU instructions:
642     */
643    if (!src && (is_sfu(reg->instr) || is_alu(reg->instr))) {
644       for (unsigned i = 0; i < reg->instr->srcs_count; i++) {
645          struct ir3_register *src = reg->instr->srcs[i];
646          if (!ra_reg_is_src(src))
647             continue;
648          if ((src->flags & IR3_REG_SHARED) && reg_size(src) >= size) {
649             struct ra_interval *src_interval = &ctx->intervals[src->def->name];
650             physreg_t src_physreg = ra_interval_get_physreg(src_interval);
651             if (src_physreg % reg_elem_size(reg) == 0 &&
652                 src_physreg + size <= reg_file_size(reg) &&
653                 get_reg_specified(ctx, reg, src_physreg))
654                return src_physreg;
655          }
656       }
657    }
658 
659    return find_best_gap(ctx, reg, size, reg_elem_size(reg));
660 }
661 
662 /* The reload process is split in two, first we allocate a register to reload to
663  * for all sources that need a reload and then we actually execute the reload.
664  * This is to allow us to demote shared ALU instructions to non-shared whenever
665  * we would otherwise need to spill to reload, without leaving dangling unused
666  * reload movs from previously processed sources. So, for example, we could
667  * need to reload both sources of an add, but after reloading the first source
668  * we realize that we would need to spill to reload the second source and we
669  * should demote the add instead, which means cancelling the first reload.
670  */
671 static void
672 reload_src(struct ra_ctx *ctx, struct ir3_instruction *instr,
673            struct ir3_register *src)
674 {
675    struct ir3_register *reg = src->def;
676    struct ra_interval *interval = &ctx->intervals[reg->name];
677    unsigned size = reg_size(reg);
678 
679    physreg_t best_reg = get_reg(ctx, reg, true);
680 
681    if (best_reg == (physreg_t)~0u) {
682       if (try_demote_instruction(ctx, instr))
683          return;
684 
685       best_reg = find_best_spill_reg(ctx, reg, size, reg_elem_size(reg));
686       assert(best_reg != (physreg_t)~0u);
687 
688       free_space(ctx, best_reg, size);
689    }
690 
691    d("reload src %u physreg %u", reg->name, best_reg);
692    interval->physreg_start = best_reg;
693    interval->physreg_end = best_reg + size;
694    interval->needs_reload = true;
695    ir3_reg_interval_insert(&ctx->reg_ctx, &interval->interval);
696    interval->src = true;
697 }
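/* Note that reload_src() only reserves the register and marks the interval
 * with needs_reload; the actual reload mov is emitted later by
 * reload_src_finalize(), once all sources have been processed, so a subsequent
 * decision to demote the instruction can still cancel the reload cheaply.
 */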
698 
699 static void
700 reload_interval(struct ra_ctx *ctx, struct ir3_instruction *instr,
701                 struct ir3_block *block, struct ra_interval *interval)
702 {
703    struct ir3_register *def = interval->interval.reg;
704    struct ir3_instruction *mov = ir3_instr_create(block, OPC_MOV, 1, 1);
705    mov->flags |= IR3_INSTR_SHARED_SPILL;
706    unsigned flags = IR3_REG_SHARED | (def->flags & IR3_REG_HALF);
707    ir3_dst_create(mov, ra_physreg_to_num(interval->physreg_start, flags),
708                   flags)->wrmask = def->wrmask;
709    mov->repeat = reg_elems(def) - 1;
710    struct ir3_register *mov_src =
711       ir3_src_create(mov, INVALID_REG, IR3_REG_SSA | (def->flags & IR3_REG_HALF) |
712                      (mov->repeat ? IR3_REG_R : 0));
713    assert(interval->spill_def);
714    mov_src->def = interval->spill_def;
715    mov_src->wrmask = def->wrmask;
716    mov->cat1.src_type = mov->cat1.dst_type =
717       (def->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;
718 
719    if (instr)
720       ir3_instr_move_before(mov, instr);
721 }
722 
723 static void
724 reload_src_finalize(struct ra_ctx *ctx, struct ir3_instruction *instr,
725                     struct ir3_register *src)
726 {
727    struct ir3_register *reg = src->def;
728    struct ra_interval *interval = &ctx->intervals[reg->name];
729 
730    if (!interval->needs_reload)
731       return;
732 
733    reload_interval(ctx, instr, instr->block, interval);
734 
735    interval->needs_reload = false;
736 }
737 
738 static bool
739 can_demote_src(struct ir3_instruction *instr)
740 {
741    switch (instr->opc) {
742    case OPC_SCAN_MACRO:
743    case OPC_META_COLLECT:
744       return false;
745    case OPC_MOV:
746       /* non-shared -> shared floating-point conversions and
747        * 8-bit sign extension don't work.
748        */
749       return (!(instr->dsts[0]->flags & IR3_REG_SHARED) ||
750               !((full_type(instr->cat1.src_type) == TYPE_F32 ||
751                  full_type(instr->cat1.dst_type) == TYPE_F32) ||
752                 (instr->cat1.src_type == TYPE_U8 &&
753                  full_type(instr->cat1.dst_type) == TYPE_S32)));
754    default:
755       return (!is_alu(instr) && !is_sfu(instr)) ||
756          !(instr->dsts[0]->flags & IR3_REG_SHARED);
757    }
758 }
759 
760 /* Ensure that this source is never spilled while reloading other sources.
761  */
762 static void
763 mark_src(struct ra_ctx *ctx, struct ir3_register *src)
764 {
765    if (!(src->flags & IR3_REG_SHARED))
766       return;
767 
768    struct ra_interval *interval = &ctx->intervals[src->def->name];
769 
770    if (interval->interval.inserted) {
771       while (interval->interval.parent)
772          interval = ir3_reg_interval_to_ra_interval(interval->interval.parent);
773 
774       interval->src = true;
775    }
776 }
777 
778 static void
779 ensure_src_live(struct ra_ctx *ctx, struct ir3_instruction *instr,
780                 struct ir3_register *src)
781 {
782    if (!(src->flags & IR3_REG_SHARED))
783       return;
784 
785    struct ra_interval *interval = &ctx->intervals[src->def->name];
786 
787    if (!interval->interval.inserted) {
788       /* In some cases we cannot demote a shared reg source to a non-shared
789        * reg, in which case we have to reload it.
790        */
791       assert(interval->spill_def);
792       if (!can_demote_src(instr)) {
793          reload_src(ctx, instr, src);
794       } else {
795          if (instr->opc == OPC_META_PARALLEL_COPY) {
796             /* Stash away the original def to use later in case we actually have
797              * to insert a reload.
798              */
799             _mesa_hash_table_insert(ctx->pcopy_src_map, src, src->def);
800          }
801          src->def = interval->spill_def;
802          src->flags &= ~IR3_REG_SHARED;
803       }
804    }
805 }
806 
807 static void
808 assign_src(struct ra_ctx *ctx, struct ir3_register *src)
809 {
810    if (!(src->flags & IR3_REG_SHARED))
811       return;
812 
813    struct ra_interval *interval = &ctx->intervals[src->def->name];
814    assert(interval->interval.inserted);
815    src->num = ra_physreg_to_num(ra_interval_get_physreg(interval), src->flags);
816 
817    if ((src->flags & IR3_REG_FIRST_KILL) &&
818        !interval->interval.parent &&
819        rb_tree_is_empty(&interval->interval.children))
820       ir3_reg_interval_remove(&ctx->reg_ctx, &interval->interval);
821 
822    while (interval->interval.parent)
823       interval = ir3_reg_interval_to_ra_interval(interval->interval.parent);
824 
825    interval->src = false;
826 }
827 
828 static void
829 handle_dst(struct ra_ctx *ctx, struct ir3_instruction *instr,
830            struct ir3_register *dst)
831 {
832    if (!(dst->flags & IR3_REG_SHARED))
833       return;
834 
835    struct ra_interval *interval = &ctx->intervals[dst->name];
836    ra_interval_init(interval, dst);
837    interval->spill_def = NULL;
838 
839    if (dst->tied) {
840       struct ir3_register *tied_def = dst->tied->def;
841       struct ra_interval *tied_interval = &ctx->intervals[tied_def->name];
842       if ((dst->tied->flags & IR3_REG_KILL) &&
843           !tied_interval->interval.parent &&
844           rb_tree_is_empty(&tied_interval->interval.children)) {
845          dst->num = dst->tied->num;
846          interval->physreg_start = tied_interval->physreg_start;
847          interval->physreg_end = tied_interval->physreg_end;
848          ir3_reg_interval_insert(&ctx->reg_ctx, &interval->interval);
849          return;
850       }
851    }
852 
853    physreg_t physreg = get_reg(ctx, dst, false);
854    if (physreg == (physreg_t) ~0u) {
855       if (try_demote_instruction(ctx, instr))
856          return;
857 
858       unsigned size = reg_size(dst);
859       physreg = find_best_spill_reg(ctx, dst, size, reg_elem_size(dst));
860       assert(physreg != (physreg_t)~0u);
861       free_space(ctx, physreg, size);
862    }
863 
864    ra_update_affinity(reg_file_size(dst), dst, physreg);
865    interval->physreg_start = physreg;
866    interval->physreg_end = physreg + reg_size(dst);
867    dst->num = ra_physreg_to_num(physreg, dst->flags);
868    ir3_reg_interval_insert(&ctx->reg_ctx, &interval->interval);
869    d("insert dst %u physreg %u", dst->name, physreg);
870 
871    if (dst->tied) {
872       struct ir3_instruction *mov = ir3_instr_create(instr->block, OPC_META_PARALLEL_COPY, 1, 1);
873       unsigned flags = IR3_REG_SHARED | (dst->flags & IR3_REG_HALF);
874       ir3_dst_create(mov, dst->num, flags)->wrmask = dst->wrmask;
875       ir3_src_create(mov, dst->tied->num, flags)->wrmask = dst->wrmask;
876       mov->cat1.src_type = mov->cat1.dst_type =
876          (dst->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;
878       ir3_instr_move_before(mov, instr);
879       dst->tied->num = dst->num;
880    }
881 }
882 
883 static void
884 handle_src_late(struct ra_ctx *ctx, struct ir3_instruction *instr,
885                 struct ir3_register *src)
886 {
887    if (!(src->flags & IR3_REG_SHARED))
888       return;
889 
890    struct ra_interval *interval = &ctx->intervals[src->def->name];
891    reload_src_finalize(ctx, instr, src);
892 
893    /* Remove killed sources that have to be killed late due to being merged with
894     * other defs.
895     */
896    if (!(src->flags & IR3_REG_KILL))
897       return;
898 
899    if (interval->interval.inserted)
900       ir3_reg_interval_remove(&ctx->reg_ctx, &interval->interval);
901 }
902 
903 static void
904 handle_normal_instr(struct ra_ctx *ctx, struct ir3_instruction *instr)
905 {
906    ra_foreach_src (src, instr)
907       mark_src(ctx, src);
908 
909    ra_foreach_src (src, instr)
910       ensure_src_live(ctx, instr, src);
911 
912    ra_foreach_src_rev (src, instr)
913       assign_src(ctx, src);
914 
915    ra_foreach_dst (dst, instr)
916       handle_dst(ctx, instr, dst);
917 
918    ra_foreach_src (src, instr)
919       handle_src_late(ctx, instr, src);
920 }
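/* Each instruction is thus processed in five phases: pin its shared sources so
 * they cannot be evicted, make spilled sources available again (by reloading
 * them or rewriting the use to the non-shared spill def), assign physical
 * numbers and free killed sources, allocate destinations (possibly spilling or
 * demoting the instruction), and finally emit any pending reload movs and
 * remove late-killed sources.
 */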
921 
922 static void
923 handle_split(struct ra_ctx *ctx, struct ir3_instruction *split)
924 {
925    struct ir3_register *src = split->srcs[0];
926    struct ir3_register *dst = split->dsts[0];
927 
928    if (!(dst->flags & IR3_REG_SHARED))
929       return;
930 
931    if (dst->merge_set == NULL || src->def->merge_set != dst->merge_set) {
932       handle_normal_instr(ctx, split);
933       return;
934    }
935 
936    struct ra_interval *src_interval = &ctx->intervals[src->def->name];
937    struct ra_interval *dst_interval = &ctx->intervals[dst->name];
938 
939    ra_interval_init(dst_interval, dst);
940    dst_interval->spill_def = NULL;
941 
942    if (src_interval->spill_def) {
943       struct ir3_instruction *spill_split =
944          ir3_instr_create(split->block, OPC_META_SPLIT, 1, 1);
945       struct ir3_register *dst = __ssa_dst(spill_split);
946       struct ir3_register *src =
947          ir3_src_create(spill_split, INVALID_REG, IR3_REG_SSA);
948       src->def = src_interval->spill_def;
949       src->wrmask = src_interval->spill_def->wrmask;
950       spill_split->split.off = split->split.off;
951       ir3_instr_move_after(spill_split, split);
952       dst_interval->spill_def = dst;
953       list_del(&split->node);
954       return;
955    }
956 
957    dst_interval->physreg_start =
958       src_interval->physreg_start + dst->merge_set_offset -
959       src->def->merge_set_offset;
960    dst_interval->physreg_end = dst_interval->physreg_start + reg_size(dst);
961    ir3_reg_interval_insert(&ctx->reg_ctx, &dst_interval->interval);
962    src->num = ra_interval_get_num(src_interval);
963    dst->num = ra_interval_get_num(dst_interval);
964    d("insert dst %u physreg %u", dst->name, dst_interval->physreg_start);
965 
966    if (src->flags & IR3_REG_KILL)
967       ir3_reg_interval_remove(&ctx->reg_ctx, &src_interval->interval);
968 }
969 
970 static void
971 handle_phi(struct ra_ctx *ctx, struct ir3_instruction *phi)
972 {
973    struct ir3_register *dst = phi->dsts[0];
974 
975    if (!(dst->flags & IR3_REG_SHARED))
976       return;
977 
978    struct ra_interval *dst_interval = &ctx->intervals[dst->name];
979    ra_interval_init(dst_interval, dst);
980 
981    /* In some rare cases, it's possible to have a phi node with a physical-only
982     * source. Here's a contrived example:
983     *
984     * loop {
985     *    if non-uniform {
986     *       if uniform {
987     *          x_1 = ...;
988     *          continue;
989     *       }
990     *       x_2 = ...;
991     *    } else {
992     *       break;
993     *    }
994     *    // continue block
995     *    x_3 = phi(x_1, x_2)
996     * }
997     *
998     * Assuming x_1 and x_2 are uniform, x_3 will also be uniform, because all
999     * threads that stay in the loop take the same branch to the continue block,
1000     * however execution may fall through from the assignment to x_2 to the
1001     * break statement because the outer if is non-uniform, and then it will fall
1002     * through again to the continue block, so if x_3 is to be in a shared reg
1003     * then the phi needs an extra source pointing to the break statement, which
1004     * itself needs a phi node:
1005     *
1006     * loop {
1007     *    if non-uniform {
1008     *       if uniform {
1009     *          x_1 = ...;
1010     *          continue;
1011     *       }
1012     *       x_2 = ...;
1013     *    } else {
1014     *       x_4 = phi(undef, x_2)
1015     *       break;
1016     *    }
1017     *    // continue block
1018     *    x_3 = phi(x_1, x_2, x_4)
1019     * }
1020     */
1021 
1022    /* phi nodes are special because we cannot spill them normally; instead we
1023     * have to spill the parallel copies that their sources point to and make the
1024     * entire phi not shared anymore.
1025     */
1026 
1027    physreg_t physreg = get_reg(ctx, dst, false);
1028    if (physreg == (physreg_t) ~0u) {
1029       d("spilling phi destination");
1030       dst->flags &= ~IR3_REG_SHARED;
1031       dst_interval->spill_def = dst;
1032       phi->flags |= IR3_INSTR_SHARED_SPILL;
1033 
1034       foreach_src (src, phi) {
1035          src->flags &= ~IR3_REG_SHARED;
1036          if (src->def)
1037             src->def->flags &= ~IR3_REG_SHARED;
1038       }
1039 
1040       return;
1041    }
1042 
1043    dst->num = ra_physreg_to_num(physreg, dst->flags);
1044    dst_interval->spill_def = NULL;
1045    dst_interval->physreg_start = physreg;
1046    dst_interval->physreg_end = physreg + reg_size(dst);
1047    ir3_reg_interval_insert(&ctx->reg_ctx, &dst_interval->interval);
1048 
1049    ra_foreach_src_n (src, i, phi) {
1050       /* We assume that any phis with non-logical sources aren't promoted. */
1051       assert(i < phi->block->predecessors_count);
1052       src->num = dst->num;
1053       src->def->num = dst->num;
1054    }
1055 }
1056 
1057 static void
1058 handle_pcopy(struct ra_ctx *ctx, struct ir3_instruction *pcopy)
1059 {
1060    /* For parallel copies, we only handle the source. The destination is handled
1061     * later when processing phi nodes.
1062     */
1063 
1064    ra_foreach_src (src, pcopy)
1065       mark_src(ctx, src);
1066 
1067    ra_foreach_src (src, pcopy)
1068       ensure_src_live(ctx, pcopy, src);
1069 
1070    ra_foreach_src_rev (src, pcopy)
1071       assign_src(ctx, src);
1072 
1073    ra_foreach_src (src, pcopy)
1074       handle_src_late(ctx, pcopy, src);
1075 }
1076 
1077 static void
1078 handle_instr(struct ra_ctx *ctx, struct ir3_instruction *instr)
1079 {
1080    instr->flags &= ~IR3_INSTR_SHARED_SPILL;
1081 
1082    switch (instr->opc) {
1083    case OPC_META_SPLIT:
1084       handle_split(ctx, instr);
1085       break;
1086    case OPC_META_PHI:
1087       handle_phi(ctx, instr);
1088       break;
1089    case OPC_META_PARALLEL_COPY:
1090       handle_pcopy(ctx, instr);
1091       break;
1092    default:
1093       handle_normal_instr(ctx, instr);
1094    }
1095 }
1096 
1097 /* If we define a value outside a loop, use it inside the loop, and then spill
1098  * it later inside that same loop, we could lose the value, so we have to
1099  * reload it. The reload must come after any parallel copy instruction, once
1100  * the live shared registers equal the live-in of the backedge. lower_pcopy()
1101  * will then move any non-shared parallel copies down past the reload.
1102  */
1103 static void
1104 reload_live_outs(struct ra_ctx *ctx, struct ir3_block *block)
1105 {
1106    struct ra_block_state *state = &ctx->blocks[block->index];
1107    unsigned name;
1108    BITSET_FOREACH_SET (name, state->live_out, ctx->live->definitions_count) {
1109       struct ir3_register *reg = ctx->live->definitions[name];
1110 
1111       struct ra_interval *interval = &ctx->intervals[name];
1112       if (!interval->interval.inserted) {
1113          d("reloading %d at end of backedge", reg->name);
1114          reload_interval(ctx, NULL, block, interval);
1115       }
1116    }
1117 }
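/* For example (hypothetical), if a value is defined before a loop, kept in a
 * shared register on entry, and then spilled inside the loop to make room, it
 * would no longer be in a shared register at the bottom of the backedge block
 * even though the loop header still expects it there; the reload above
 * restores it before the backedge is taken.
 */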
1118 
1119 static void
1120 record_pred_live_out(struct ra_ctx *ctx,
1121                      struct ra_interval *interval,
1122                      struct ir3_block *pred)
1123 {
1124    struct ra_block_state *state = &ctx->blocks[pred->index];
1125 
1126    struct ir3_register *def = interval->interval.reg;
1127    BITSET_SET(state->live_out, def->name);
1128 
1129    rb_tree_foreach (struct ra_interval, child,
1130                     &interval->interval.children, interval.node) {
1131       record_pred_live_out(ctx, child, pred);
1132    }
1133 }
1134 
1135 static void
1136 record_pred_live_outs(struct ra_ctx *ctx, struct ir3_block *block)
1137 {
1138    for (unsigned i = 0; i < block->predecessors_count; i++) {
1139       struct ir3_block *pred = block->predecessors[i];
1140       struct ra_block_state *state = &ctx->blocks[pred->index];
1141       if (state->visited)
1142          continue;
1143 
1144       state->live_out = rzalloc_array(NULL, BITSET_WORD,
1145                                       BITSET_WORDS(ctx->live->definitions_count));
1146 
1147 
1148       rb_tree_foreach (struct ra_interval, interval,
1149                        &ctx->reg_ctx.intervals, interval.node) {
1150          record_pred_live_out(ctx, interval, pred);
1151       }
1152    }
1153 }
1154 
1155 static void
1156 handle_block(struct ra_ctx *ctx, struct ir3_block *block)
1157 {
1158    ra_ctx_reset_block(ctx);
1159 
1160    unsigned name;
1161    BITSET_FOREACH_SET (name, ctx->live->live_in[block->index],
1162                        ctx->live->definitions_count) {
1163       struct ir3_register *def = ctx->live->definitions[name];
1164       struct ra_interval *interval = &ctx->intervals[name];
1165 
1166       /* Non-shared definitions may still be definitions we spilled by demoting
1167        * them, so we still need to initialize the interval. But we shouldn't
1168        * make these intervals live.
1169        */
1170       ra_interval_init(interval, def);
1171 
1172       if ((def->flags & IR3_REG_SHARED) && !interval->spill_def) {
1173          ir3_reg_interval_insert(&ctx->reg_ctx, &interval->interval);
1174       }
1175    }
1176 
1177    if (RA_DEBUG) {
1178       d("after live-in block %u:\n", block->index);
1179       ra_ctx_dump(ctx);
1180    }
1181 
1182    if (block->predecessors_count > 1)
1183       record_pred_live_outs(ctx, block);
1184 
1185    foreach_instr_safe (instr, &block->instr_list) {
1186       di(instr, "processing");
1187 
1188       handle_instr(ctx, instr);
1189 
1190       if (RA_DEBUG)
1191          ra_ctx_dump(ctx);
1192    }
1193 
1194    if (block->successors[0]) {
1195       struct ra_block_state *state = &ctx->blocks[block->successors[0]->index];
1196 
1197       if (state->visited) {
1198          assert(!block->successors[1]);
1199 
1200          reload_live_outs(ctx, block);
1201       }
1202    }
1203 
1204    ctx->blocks[block->index].visited = true;
1205 }
1206 
1207 static void
1208 lower_pcopy(struct ir3 *ir, struct ra_ctx *ctx)
1209 {
1210    foreach_block (block, &ir->block_list) {
1211       foreach_instr_safe (instr, &block->instr_list) {
1212          /* At this point due to spilling there may be parallel copies from
1213           * shared to non-shared registers and vice versa. Lowering these after
1214           * RA may produce cycles involving shared and non-shared registers,
1215           * which would need to be resolved by swapping a shared and non-shared
1216           * register which is something we can't handle. However by lowering
1217           * these to moves now, we can make sure that cycles only involve
1218           * non-shared registers. To avoid illegally moving a shared register
1219           * read or write across the parallel copy, which may have other
1220           * conflicting reads/writes if there's a cycle, we need to move copies
1221           * from non-shared to shared below the shared copies, and we need to
1222           * move copies from shared to non-shared above them. So, we have the
1223           * following order:
1224           *
1225           * 1. shared->non-shared copies (spills)
1226           * 2. shared->shared copies (one parallel copy as there may be cycles)
1227           * 3. non-shared->shared copies (reloads)
1228           * 4. non-shared->non-shared copies
1229           *
1230           * We split out the non-shared->non-shared copies as a separate step.
1231           */
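         /* As a concrete sketch (hypothetical), for a parallel copy containing
          * both a shared->non-shared copy and a non-shared->shared copy, the
          * former becomes a plain mov emitted above the parallel copy and the
          * latter a mov emitted below it, so any copy cycle left in the
          * parallel copy itself involves registers of a single class.
          */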
1232          if (instr->opc == OPC_META_PARALLEL_COPY) {
1233             for (unsigned i = 0; i < instr->srcs_count; i++) {
1234                if ((instr->srcs[i]->flags & IR3_REG_SHARED) &&
1235                    !(instr->dsts[i]->flags & IR3_REG_SHARED)) {
1236                   /* shared->non-shared. Create a spill move and rewrite the
1237                    * source to be the destination of the move (so that the
1238                    * original shared->non-shared copy becomes a
1239                    * non-shared->non-shared copy).
1240                    */
1241                   struct ir3_instruction *mov =
1242                      ir3_instr_create(block, OPC_MOV, 1, 1);
1243                   mov->flags |= IR3_INSTR_SHARED_SPILL;
1244                   struct ir3_register *dst =
1245                      ir3_dst_create(mov, INVALID_REG, instr->dsts[i]->flags);
1246                   dst->wrmask = instr->dsts[i]->wrmask;
1247                   dst->instr = mov;
1248                   mov->repeat = reg_elems(mov->dsts[0]) - 1;
1249                   struct ir3_register *src =
1250                      ir3_src_create(mov, instr->srcs[i]->num,
1251                                     instr->srcs[i]->flags |
1252                                     (mov->repeat ? IR3_REG_R : 0));
1253                   src->wrmask = instr->srcs[i]->wrmask;
1254                   mov->cat1.dst_type = mov->cat1.src_type =
1255                      (mov->dsts[0]->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;
1256                   instr->srcs[i]->flags = mov->dsts[0]->flags;
1257                   instr->srcs[i]->def = mov->dsts[0];
1258                   ir3_instr_move_before(mov, instr);
1259                }
1260             }
1261 
1262             for (unsigned i = 0; i < instr->dsts_count;) {
1263                if ((instr->dsts[i]->flags & IR3_REG_SHARED) &&
1264                    (instr->srcs[i]->flags & IR3_REG_SSA) &&
1265                    !(instr->srcs[i]->flags & IR3_REG_SHARED)) {
1266                   /* non-shared->shared. Create a reload move.
1267                    */
1268                   struct ir3_instruction *mov =
1269                      ir3_instr_create(block, OPC_MOV, 1, 1);
1270                   mov->flags |= IR3_INSTR_SHARED_SPILL;
1271                   struct ir3_register *dst =
1272                      ir3_dst_create(mov, instr->dsts[i]->num,
1273                                     instr->dsts[i]->flags);
1274                   dst->instr = mov;
1275                   dst->wrmask = instr->dsts[i]->wrmask;
1276                   mov->repeat = reg_elems(mov->dsts[0]) - 1;
1277                   struct ir3_register *src =
1278                      ir3_src_create(mov, INVALID_REG, instr->srcs[i]->flags |
1279                                     (mov->repeat ? IR3_REG_R : 0));
1280                   src->def = instr->srcs[i]->def;
1281                   src->wrmask = instr->srcs[i]->wrmask;
1282                   mov->cat1.dst_type = mov->cat1.src_type =
1283                      (mov->dsts[0]->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32;
1284 
1285                   /* When we spill a parallel copy source, we lose the
1286                    * information of where it originally points to since we make
1287                    * it point to the spill def. If we later decide not to also
1288                    * spill the phi associated with it, we have to restore it
1289                    * here using the stashed original source so that RA
1290                    * validation can check that we did the correct thing.
1291                    *
1292                    * Because SSA-ness goes away after validation, this is really
1293                    * just about validation.
1294                    */
1295                   struct ir3_block *succ = block->successors[0];
1296                   unsigned pred_idx = ir3_block_get_pred_index(succ, block);
1297                   foreach_instr (phi, &succ->instr_list) {
1298                      if (phi->opc != OPC_META_PHI)
1299                         break;
1300 
1301                      if (phi->srcs[pred_idx]->def == instr->dsts[i]) {
1302                         struct ir3_register *def =
1303                            _mesa_hash_table_search(ctx->pcopy_src_map,
1304                                                    instr->srcs[i])->data;
1305                         phi->srcs[pred_idx]->def = def;
1306                         break;
1307                      }
1308                   }
1309 
1310                   instr->srcs[i] = instr->srcs[instr->srcs_count - 1];
1311                   instr->dsts[i] = instr->dsts[instr->dsts_count - 1];
1312                   instr->srcs_count--;
1313                   instr->dsts_count--;
1314                   ir3_instr_move_after(mov, instr);
1315                   continue;
1316                }
1317 
1318                i++;
1319             }
1320 
1321             /* Move any non-shared copies to a separate parallel copy
1322              * instruction right at the end of the block, after any reloads. At
1323              * this point all copies should be {shared,immediate}->shared or
1324              * {non-shared,immediate}->non-shared.
1325              */
1326             unsigned non_shared_copies = 0;
1327             for (unsigned i = 0; i < instr->dsts_count; i++) {
1328                if (!(instr->dsts[i]->flags & IR3_REG_SHARED))
1329                   non_shared_copies++;
1330             }
1331 
1332             if (non_shared_copies != 0) {
1333                struct ir3_instruction *pcopy =
1334                   ir3_instr_create(block, OPC_META_PARALLEL_COPY,
1335                                    non_shared_copies, non_shared_copies);
1336 
1337                unsigned j = 0;
1338                for (unsigned i = 0; i < instr->dsts_count;) {
1339                   if (!(instr->dsts[i]->flags & IR3_REG_SHARED)) {
1340                      pcopy->dsts[j] = instr->dsts[i];
1341                      pcopy->srcs[j] = instr->srcs[i];
1342                      pcopy->dsts[j]->instr = pcopy;
1343                      instr->srcs[i] = instr->srcs[instr->srcs_count - 1];
1344                      instr->dsts[i] = instr->dsts[instr->dsts_count - 1];
1345                      instr->srcs_count--;
1346                      instr->dsts_count--;
1347                      j++;
1348                      continue;
1349                   }
1350                   i++;
1351                }
1352 
1353                pcopy->srcs_count = pcopy->dsts_count = j;
1354                if (instr->dsts_count == 0)
1355                   list_del(&instr->node);
1356             }
1357          }
1358       }
1359    }
1360 }
1361 
1362 static void
1363 finalize(struct ir3 *ir)
1364 {
1365    foreach_block (block, &ir->block_list) {
1366       foreach_instr (instr, &block->instr_list) {
1367          for (unsigned i = 0; i < instr->dsts_count; i++) {
1368             if (instr->dsts[i]->flags & IR3_REG_SHARED) {
1369                instr->dsts[i]->flags &= ~IR3_REG_SSA;
1370             }
1371          }
1372 
1373          for (unsigned i = 0; i < instr->srcs_count; i++) {
1374             if (instr->srcs[i]->flags & IR3_REG_SHARED) {
1375                instr->srcs[i]->flags &= ~IR3_REG_SSA;
1376                instr->srcs[i]->def = NULL;
1377             }
1378          }
1379       }
1380    }
1381 }
1382 
1383 void
1384 ir3_ra_shared(struct ir3_shader_variant *v, struct ir3_liveness **live_ptr)
1385 {
1386    struct ra_ctx ctx;
1387    struct ir3_liveness *live = *live_ptr;
1388 
1389    ra_ctx_init(&ctx);
1390    ctx.intervals = rzalloc_array(NULL, struct ra_interval,
1391                                  live->definitions_count);
1392    ctx.blocks = rzalloc_array(NULL, struct ra_block_state,
1393                               live->block_count);
1394    ctx.start = 0;
1395    ctx.live = live;
1396    ctx.pcopy_src_map = _mesa_pointer_hash_table_create(NULL);
1397 
1398    foreach_block (block, &v->ir->block_list) {
1399       handle_block(&ctx, block);
1400    }
1401 
1402    lower_pcopy(v->ir, &ctx);
1403 
1404    for (unsigned i = 0; i < live->block_count; i++) {
1405       if (ctx.blocks[i].live_out)
1406          ralloc_free(ctx.blocks[i].live_out);
1407    }
1408 
1409    ralloc_free(ctx.intervals);
1410    ralloc_free(ctx.pcopy_src_map);
1411    ralloc_free(ctx.blocks);
1412 
1413    ir3_ra_validate(v, RA_FULL_SIZE, RA_HALF_SIZE, live->block_count, true);
1414    finalize(v->ir);
1415 
1416    /* Recalculate liveness and register pressure now that additional values have
1417     * been added.
1418     * TODO we should only do this if any values have been spilled/reloaded.
1419     * Note: since we don't have to recreate merge sets, we have to manually copy
1420     * interval_offset to the new liveness struct.
1421     */
1422    unsigned interval_offset = live->interval_offset;
1423    void *live_mem_ctx = ralloc_parent(live);
1424    ralloc_free(live);
1425    *live_ptr = ir3_calc_liveness(live_mem_ctx, v->ir);
1426    (*live_ptr)->interval_offset = interval_offset;
1427 }
1428 
1429