/*
 * Copyright © 2021 Valve Corporation
 * Copyright © 2014 Rob Clark <[email protected]>
 * SPDX-License-Identifier: MIT
 */

#include "ir3_ra.h"
#include "util/rb_tree.h"
#include "util/u_math.h"
#include "ir3_shader.h"

/* This file implements an SSA-based register allocator. Unlike other
 * SSA-based allocators, it handles vector split/collect "smartly," meaning
 * that multiple values may share the same register interval. From the
 * perspective of the allocator itself, only the top-level intervals matter,
 * and the allocator is only concerned with allocating top-level intervals,
 * which may mean moving other top-level intervals around. Other intervals,
 * like the destination of a split instruction or the source of a collect
 * instruction, are "locked" to their parent interval. The details of this are
 * mostly handled by ir3_merge_regs and ir3_reg_ctx.
 *
 * We currently don't do any backtracking, but we do use the merge sets as a
 * form of affinity to try to avoid moves from phis/splits/collects. Each
 * merge set is what a more "classic" graph-coloring or live-range based
 * allocator would consider a single register, but here we use it as merely a
 * hint, except when multiple overlapping values are live at the same time.
 * Each merge set has a "preferred" register, and we try to honor that when
 * allocating values in the merge set.
 */
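
/* As a rough illustration (example IR, not taken from a real shader): in
 *
 *    ssa_3 = collect ssa_1, ssa_2
 *
 * ssa_1 and ssa_2 become child intervals of ssa_3, so only ssa_3 is placed by
 * the allocator and the children pick up fixed offsets inside it. Likewise a
 * split destination is locked to its offset within the source's interval. The
 * merge set containing all of these supplies the "preferred" register used as
 * an affinity hint below.
 */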

/* ir3_reg_ctx implementation. */

static int
ir3_reg_interval_cmp(const struct rb_node *node, const void *data)
{
   unsigned reg = *(const unsigned *)data;
   const struct ir3_reg_interval *interval =
      ir3_rb_node_to_interval_const(node);
   if (interval->reg->interval_start > reg)
      return -1;
   else if (interval->reg->interval_end <= reg)
      return 1;
   else
      return 0;
}

static struct ir3_reg_interval *
ir3_reg_interval_search(struct rb_tree *tree, unsigned offset)
{
   struct rb_node *node = rb_tree_search(tree, &offset, ir3_reg_interval_cmp);
   return node ? ir3_rb_node_to_interval(node) : NULL;
}

static struct ir3_reg_interval *
ir3_reg_interval_search_sloppy(struct rb_tree *tree, unsigned offset)
{
   struct rb_node *node =
      rb_tree_search_sloppy(tree, &offset, ir3_reg_interval_cmp);
   return node ? ir3_rb_node_to_interval(node) : NULL;
}

/* Get the interval covering the reg, or the closest to the right if it
 * doesn't exist.
 */
static struct ir3_reg_interval *
ir3_reg_interval_search_right(struct rb_tree *tree, unsigned offset)
{
   struct ir3_reg_interval *interval =
      ir3_reg_interval_search_sloppy(tree, offset);
   if (!interval) {
      return NULL;
   } else if (interval->reg->interval_end > offset) {
      return interval;
   } else {
      /* There is no interval covering reg, and ir3_reg_interval_search_sloppy()
       * returned the closest range to the left, so the next interval to the
       * right should be the closest to the right.
       */
      return ir3_reg_interval_next_or_null(interval);
   }
}
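
/* For example (hypothetical contents): with intervals covering [0, 4) and
 * [8, 12), searching right from offset 5 returns the [8, 12) interval,
 * searching from offset 9 returns the same interval (it covers the offset),
 * and searching from offset 12 or above returns NULL.
 */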

static int
ir3_reg_interval_insert_cmp(const struct rb_node *_a, const struct rb_node *_b)
{
   const struct ir3_reg_interval *a = ir3_rb_node_to_interval_const(_a);
   const struct ir3_reg_interval *b = ir3_rb_node_to_interval_const(_b);
   return b->reg->interval_start - a->reg->interval_start;
}

static void
interval_insert(struct ir3_reg_ctx *ctx, struct rb_tree *tree,
                struct ir3_reg_interval *interval)
{
   struct ir3_reg_interval *right =
      ir3_reg_interval_search_right(tree, interval->reg->interval_start);
   if (right && right->reg->interval_start < interval->reg->interval_end) {
      /* We disallow trees where different members have different half-ness.
       * This means that we can't treat bitcasts as copies like normal
       * split/collect, so something like this would require an extra copy
       * in mergedregs mode, and count as 4 half-units of register pressure
       * instead of 2:
       *
       * f16vec2 foo = unpackFloat2x16(bar)
       * ... = foo.x
       * ... = bar
       *
       * However, relaxing this rule would open a huge can of worms. What
       * happens when there's a vector of 16 things, and the fifth element
       * has been bitcasted as a half-reg? Would that element alone have to
       * be small enough to be used as a half-reg source? Let's keep that
       * can of worms firmly shut for now.
       */
      assert((interval->reg->flags & IR3_REG_HALF) ==
             (right->reg->flags & IR3_REG_HALF));

      if (right->reg->interval_end <= interval->reg->interval_end &&
          right->reg->interval_start >= interval->reg->interval_start) {
         /* Check if we're inserting something that's already inserted */
         assert(interval != right);

         /* "right" is contained in "interval" and must become a child of
          * it. There may be further children too.
          */
         for (struct ir3_reg_interval *next = ir3_reg_interval_next(right);
              right && right->reg->interval_start < interval->reg->interval_end;
              right = next, next = ir3_reg_interval_next_or_null(next)) {
            /* "right" must be contained in "interval." */
            assert(right->reg->interval_end <= interval->reg->interval_end);
            assert((interval->reg->flags & IR3_REG_HALF) ==
                   (right->reg->flags & IR3_REG_HALF));
            if (!right->parent)
               ctx->interval_delete(ctx, right);
            right->parent = interval;
            rb_tree_remove(tree, &right->node);
            rb_tree_insert(&interval->children, &right->node,
                           ir3_reg_interval_insert_cmp);
         }
      } else {
         /* "right" must contain "interval," since intervals must form a
          * tree.
          */
         assert(right->reg->interval_start <= interval->reg->interval_start);
         interval->parent = right;
         interval_insert(ctx, &right->children, interval);
         return;
      }
   }

   if (!interval->parent)
      ctx->interval_add(ctx, interval);
   rb_tree_insert(tree, &interval->node, ir3_reg_interval_insert_cmp);
   interval->inserted = true;
}

void
ir3_reg_interval_insert(struct ir3_reg_ctx *ctx,
                        struct ir3_reg_interval *interval)
{
   rb_tree_init(&interval->children);
   interval->parent = NULL;
   interval_insert(ctx, &ctx->intervals, interval);
}

/* Call after ir3_reg_interval_remove_temp() to reinsert the interval */
static void
ir3_reg_interval_reinsert(struct ir3_reg_ctx *ctx,
                          struct ir3_reg_interval *interval)
{
   interval->parent = NULL;
   interval_insert(ctx, &ctx->intervals, interval);
}

void
ir3_reg_interval_remove(struct ir3_reg_ctx *ctx,
                        struct ir3_reg_interval *interval)
{
   assert(interval->inserted);

   if (interval->parent) {
      rb_tree_remove(&interval->parent->children, &interval->node);
   } else {
      ctx->interval_delete(ctx, interval);
      rb_tree_remove(&ctx->intervals, &interval->node);
   }

   rb_tree_foreach_safe (struct ir3_reg_interval, child, &interval->children,
                         node) {
      rb_tree_remove(&interval->children, &child->node);
      child->parent = interval->parent;

      if (interval->parent) {
         rb_tree_insert(&child->parent->children, &child->node,
                        ir3_reg_interval_insert_cmp);
      } else {
         ctx->interval_readd(ctx, interval, child);
         rb_tree_insert(&ctx->intervals, &child->node,
                        ir3_reg_interval_insert_cmp);
      }
   }

   interval->inserted = false;
}

static void
_mark_free(struct ir3_reg_interval *interval)
{
   interval->inserted = false;
   rb_tree_foreach (struct ir3_reg_interval, child, &interval->children, node) {
      _mark_free(child);
   }
}

/* Remove an interval and all its children from the tree. */
void
ir3_reg_interval_remove_all(struct ir3_reg_ctx *ctx,
                            struct ir3_reg_interval *interval)
{
   assert(!interval->parent);

   ctx->interval_delete(ctx, interval);
   rb_tree_remove(&ctx->intervals, &interval->node);
   _mark_free(interval);
}

/* Used when popping an interval to be shuffled around. Don't disturb children
 * so that it can be later reinserted.
 */
static void
ir3_reg_interval_remove_temp(struct ir3_reg_ctx *ctx,
                             struct ir3_reg_interval *interval)
{
   assert(!interval->parent);

   ctx->interval_delete(ctx, interval);
   rb_tree_remove(&ctx->intervals, &interval->node);
}

static void
interval_dump(struct log_stream *stream, struct ir3_reg_interval *interval,
              unsigned indent)
{
   for (unsigned i = 0; i < indent; i++)
      mesa_log_stream_printf(stream, "\t");
   mesa_log_stream_printf(stream, "reg %u start %u\n", interval->reg->name,
                          interval->reg->interval_start);

   rb_tree_foreach (struct ir3_reg_interval, child, &interval->children, node) {
      interval_dump(stream, child, indent + 1);
   }

   for (unsigned i = 0; i < indent; i++)
      mesa_log_stream_printf(stream, "\t");
   mesa_log_stream_printf(stream, "reg %u end %u\n", interval->reg->name,
                          interval->reg->interval_end);
}

void
ir3_reg_interval_dump(struct log_stream *stream, struct ir3_reg_interval *interval)
{
   interval_dump(stream, interval, 0);
}

/* These are the core data structures used by the register allocator. First
 * come ra_interval and ra_file, which are used for intra-block tracking and
 * build on the ir3_reg_ctx infrastructure:
 */

struct ra_interval {
   struct ir3_reg_interval interval;

   struct rb_node physreg_node;
   physreg_t physreg_start, physreg_end;

   /* True if this is a source of the current instruction which is entirely
    * killed. This means we can allocate the dest over it, but we can't break
    * it up.
    */
   bool is_killed;

   /* True if this interval cannot be moved from its position. This is only
    * used for precolored inputs to ensure that other inputs don't get
    * allocated on top of them.
    */
   bool frozen;
};

struct ra_file {
   struct ir3_reg_ctx reg_ctx;

   BITSET_DECLARE(available, RA_MAX_FILE_SIZE);
   BITSET_DECLARE(available_to_evict, RA_MAX_FILE_SIZE);

   struct rb_tree physreg_intervals;

   unsigned size;
   unsigned start;
};

/* State for inter-block tracking. When we split a live range to make space
 * for a vector, we may need to insert fixup code when a block has multiple
 * predecessors that have moved the same live value to different registers.
 * This keeps track of state required to do that.
 */

struct ra_block_state {
   /* Map of defining ir3_register -> physreg it was allocated to at the end
    * of the block.
    */
   struct hash_table *renames;

   /* For loops, we need to process a block before all its predecessors have
    * been processed. In particular, we need to pick registers for values
    * without knowing if all the predecessors have been renamed. This keeps
    * track of the registers we chose so that when we visit the back-edge we
    * can move them appropriately. If all predecessors have been visited
    * before this block is visited then we don't need to fill this out. This
    * is a map from ir3_register -> physreg.
    */
   struct hash_table *entry_regs;

   /* True if the block has been visited and "renames" is complete.
    */
   bool visited;
};
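
/* For instance (hypothetical register numbers): if a value enters a loop in
 * r2.x, the loop header may be visited before its back-edge predecessor, so
 * r2.x is recorded in entry_regs. If that predecessor later ends up with the
 * value renamed to r3.x, the recorded entry register tells us to insert a
 * move back into r2.x on the back-edge; "renames" provides the same
 * information for predecessors that were already visited.
 */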

struct ra_parallel_copy {
   struct ra_interval *interval;
   physreg_t src;
};

/* The main context: */

struct ra_ctx {
   /* r0.x - r47.w. On a6xx with merged-regs, hr0.x-hr47.w go into the bottom
    * half of this file too.
    */
   struct ra_file full;

   /* hr0.x - hr63.w, only used without merged-regs. */
   struct ra_file half;

   /* Shared regs. */
   struct ra_file shared;

   struct ir3_liveness *live;

   struct ir3_block *block;

   const struct ir3_compiler *compiler;
   gl_shader_stage stage;

   /* Pending moves of top-level intervals that will be emitted once we're
    * finished:
    */
   DECLARE_ARRAY(struct ra_parallel_copy, parallel_copies);

   struct ra_interval *intervals;
   struct ra_block_state *blocks;

   bool merged_regs;
};

#define foreach_interval(interval, file)                                       \
   rb_tree_foreach (struct ra_interval, interval, &(file)->physreg_intervals,  \
                    physreg_node)
#define foreach_interval_rev(interval, file)                                   \
   rb_tree_foreach_rev (struct ra_interval, interval,                          \
                        &(file)->physreg_intervals, physreg_node)
#define foreach_interval_safe(interval, file)                                  \
   rb_tree_foreach_safe (struct ra_interval, interval,                         \
                         &(file)->physreg_intervals, physreg_node)
#define foreach_interval_rev_safe(interval, file)                              \
   rb_tree_foreach_rev_safe(struct ra_interval, interval,                      \
                            &(file)->physreg_intervals, physreg_node)

static struct ra_interval *
rb_node_to_interval(struct rb_node *node)
{
   return rb_node_data(struct ra_interval, node, physreg_node);
}

static const struct ra_interval *
rb_node_to_interval_const(const struct rb_node *node)
{
   return rb_node_data(struct ra_interval, node, physreg_node);
}

static struct ra_interval *
ra_interval_next(struct ra_interval *interval)
{
   struct rb_node *next = rb_node_next(&interval->physreg_node);
   return next ? rb_node_to_interval(next) : NULL;
}

static struct ra_interval *
ra_interval_next_or_null(struct ra_interval *interval)
{
   return interval ? ra_interval_next(interval) : NULL;
}

static int
ra_interval_cmp(const struct rb_node *node, const void *data)
{
   physreg_t reg = *(const physreg_t *)data;
   const struct ra_interval *interval = rb_node_to_interval_const(node);
   if (interval->physreg_start > reg)
      return -1;
   else if (interval->physreg_end <= reg)
      return 1;
   else
      return 0;
}

static struct ra_interval *
ra_interval_search_sloppy(struct rb_tree *tree, physreg_t reg)
{
   struct rb_node *node = rb_tree_search_sloppy(tree, &reg, ra_interval_cmp);
   return node ? rb_node_to_interval(node) : NULL;
}

/* Get the interval covering the reg, or the closest to the right if it
 * doesn't exist.
 */
static struct ra_interval *
ra_interval_search_right(struct rb_tree *tree, physreg_t reg)
{
   struct ra_interval *interval = ra_interval_search_sloppy(tree, reg);
   if (!interval) {
      return NULL;
   } else if (interval->physreg_end > reg) {
      return interval;
   } else {
      /* There is no interval covering reg, and ra_interval_search_sloppy()
       * returned the closest range to the left, so the next interval to the
       * right should be the closest to the right.
       */
      return ra_interval_next_or_null(interval);
   }
}

static struct ra_interval *
ra_file_search_right(struct ra_file *file, physreg_t reg)
{
   return ra_interval_search_right(&file->physreg_intervals, reg);
}

static int
ra_interval_insert_cmp(const struct rb_node *_a, const struct rb_node *_b)
{
   const struct ra_interval *a = rb_node_to_interval_const(_a);
   const struct ra_interval *b = rb_node_to_interval_const(_b);
   return b->physreg_start - a->physreg_start;
}

static struct ra_interval *
ir3_reg_interval_to_ra_interval(struct ir3_reg_interval *interval)
{
   return rb_node_data(struct ra_interval, interval, interval);
}

static struct ra_file *
ir3_reg_ctx_to_file(struct ir3_reg_ctx *ctx)
{
   return rb_node_data(struct ra_file, ctx, reg_ctx);
}

static void
interval_add(struct ir3_reg_ctx *ctx, struct ir3_reg_interval *_interval)
{
   struct ra_interval *interval = ir3_reg_interval_to_ra_interval(_interval);
   struct ra_file *file = ir3_reg_ctx_to_file(ctx);

   /* We can assume in this case that physreg_start/physreg_end is already
    * initialized.
    */
   for (physreg_t i = interval->physreg_start; i < interval->physreg_end; i++) {
      BITSET_CLEAR(file->available, i);
      BITSET_CLEAR(file->available_to_evict, i);
   }

   rb_tree_insert(&file->physreg_intervals, &interval->physreg_node,
                  ra_interval_insert_cmp);
}

static void
interval_delete(struct ir3_reg_ctx *ctx, struct ir3_reg_interval *_interval)
{
   struct ra_interval *interval = ir3_reg_interval_to_ra_interval(_interval);
   struct ra_file *file = ir3_reg_ctx_to_file(ctx);

   for (physreg_t i = interval->physreg_start; i < interval->physreg_end; i++) {
      BITSET_SET(file->available, i);
      BITSET_SET(file->available_to_evict, i);
   }

   rb_tree_remove(&file->physreg_intervals, &interval->physreg_node);
}

static void
interval_readd(struct ir3_reg_ctx *ctx, struct ir3_reg_interval *_parent,
               struct ir3_reg_interval *_child)
{
   struct ra_interval *parent = ir3_reg_interval_to_ra_interval(_parent);
   struct ra_interval *child = ir3_reg_interval_to_ra_interval(_child);

   child->physreg_start =
      parent->physreg_start + (child->interval.reg->interval_start -
                               parent->interval.reg->interval_start);
   child->physreg_end =
      child->physreg_start +
      (child->interval.reg->interval_end - child->interval.reg->interval_start);

   interval_add(ctx, _child);
}

static void
ra_file_init(struct ra_file *file)
{
   for (unsigned i = 0; i < file->size; i++) {
      BITSET_SET(file->available, i);
      BITSET_SET(file->available_to_evict, i);
   }

   rb_tree_init(&file->reg_ctx.intervals);
   rb_tree_init(&file->physreg_intervals);

   file->reg_ctx.interval_add = interval_add;
   file->reg_ctx.interval_delete = interval_delete;
   file->reg_ctx.interval_readd = interval_readd;
}

static void
ra_file_insert(struct ra_file *file, struct ra_interval *interval)
{
   assert(interval->physreg_start < interval->physreg_end);
   assert(interval->physreg_end <= file->size);
   if (interval->interval.reg->flags & IR3_REG_HALF)
      assert(interval->physreg_end <= RA_HALF_SIZE);

   ir3_reg_interval_insert(&file->reg_ctx, &interval->interval);
}

static void
ra_file_remove(struct ra_file *file, struct ra_interval *interval)
{
   ir3_reg_interval_remove(&file->reg_ctx, &interval->interval);
}

static void
ra_file_mark_killed(struct ra_file *file, struct ra_interval *interval)
{
   assert(!interval->interval.parent);

   for (physreg_t i = interval->physreg_start; i < interval->physreg_end; i++) {
      BITSET_SET(file->available, i);
   }

   interval->is_killed = true;
}

static void
ra_file_unmark_killed(struct ra_file *file, struct ra_interval *interval)
{
   assert(!interval->interval.parent);

   for (physreg_t i = interval->physreg_start; i < interval->physreg_end; i++) {
      BITSET_CLEAR(file->available, i);
   }

   interval->is_killed = false;
}

static physreg_t
ra_interval_get_physreg(const struct ra_interval *interval)
{
   unsigned child_start = interval->interval.reg->interval_start;

   while (interval->interval.parent) {
      interval = ir3_reg_interval_to_ra_interval(interval->interval.parent);
   }

   return interval->physreg_start +
          (child_start - interval->interval.reg->interval_start);
}

static unsigned
ra_interval_get_num(const struct ra_interval *interval)
{
   return ra_physreg_to_num(ra_interval_get_physreg(interval),
                            interval->interval.reg->flags);
}

static void
ra_interval_init(struct ra_interval *interval, struct ir3_register *reg)
{
   ir3_reg_interval_init(&interval->interval, reg);
   interval->is_killed = false;
   interval->frozen = false;
}

static void
ra_interval_dump(struct log_stream *stream, struct ra_interval *interval)
{
   mesa_log_stream_printf(stream, "physreg %u ", interval->physreg_start);

   ir3_reg_interval_dump(stream, &interval->interval);
}

static void
ra_file_dump(struct log_stream *stream, struct ra_file *file)
{
   rb_tree_foreach (struct ra_interval, interval, &file->physreg_intervals,
                    physreg_node) {
      ra_interval_dump(stream, interval);
   }

   unsigned start, end;
   mesa_log_stream_printf(stream, "available:\n");
   BITSET_FOREACH_RANGE (start, end, file->available, file->size) {
      mesa_log_stream_printf(stream, "%u-%u ", start, end);
   }
   mesa_log_stream_printf(stream, "\n");

   mesa_log_stream_printf(stream, "available to evict:\n");
   BITSET_FOREACH_RANGE (start, end, file->available_to_evict, file->size) {
      mesa_log_stream_printf(stream, "%u-%u ", start, end);
   }
   mesa_log_stream_printf(stream, "\n");
   mesa_log_stream_printf(stream, "start: %u\n", file->start);
}

static void
ra_ctx_dump(struct ra_ctx *ctx)
{
   struct log_stream *stream = mesa_log_streami();
   mesa_log_stream_printf(stream, "full:\n");
   ra_file_dump(stream, &ctx->full);
   mesa_log_stream_printf(stream, "half:\n");
   ra_file_dump(stream, &ctx->half);
   mesa_log_stream_printf(stream, "shared:\n");
   ra_file_dump(stream, &ctx->shared);
   mesa_log_stream_destroy(stream);
}

static unsigned
reg_file_size(struct ra_file *file, struct ir3_register *reg)
{
   /* Half-regs can only take up the first half of the combined regfile */
   if (reg->flags & IR3_REG_HALF) {
      if (reg->flags & IR3_REG_SHARED)
         return RA_SHARED_HALF_SIZE;
      else
         return MIN2(file->size, RA_HALF_SIZE);
   } else {
      return file->size;
   }
}

/* ra_pop_interval/ra_push_interval provide an API to shuffle around multiple
 * top-level intervals at once. Pop multiple intervals, then push them back in
 * any order.
 */
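
/* A minimal usage sketch (assuming "ctx", "file" and two conflicting
 * top-level intervals "a" and "b" are in scope), e.g. swapping them the way
 * try_evict_regs() does below:
 *
 *    physreg_t a_start = a->physreg_start, b_start = b->physreg_start;
 *    struct ra_removed_interval ra = ra_pop_interval(ctx, file, a);
 *    struct ra_removed_interval rb = ra_pop_interval(ctx, file, b);
 *    ra_push_interval(ctx, file, &ra, b_start);
 *    ra_push_interval(ctx, file, &rb, a_start);
 *
 * ra_pop_interval() also records a parallel copy, so the actual moves/swaps
 * get emitted once allocation of the instruction is finished.
 */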

struct ra_removed_interval {
   struct ra_interval *interval;
   unsigned size;
};

static struct ra_removed_interval
ra_pop_interval(struct ra_ctx *ctx, struct ra_file *file,
                struct ra_interval *interval)
{
   assert(!interval->interval.parent);
   /* shared live splitting is not allowed! */
   assert(!(interval->interval.reg->flags & IR3_REG_SHARED));

   /* Check if we've already moved this reg before */
   unsigned pcopy_index;
   for (pcopy_index = 0; pcopy_index < ctx->parallel_copies_count;
        pcopy_index++) {
      if (ctx->parallel_copies[pcopy_index].interval == interval)
         break;
   }

   if (pcopy_index == ctx->parallel_copies_count) {
      array_insert(ctx, ctx->parallel_copies,
                   (struct ra_parallel_copy){
                      .interval = interval,
                      .src = interval->physreg_start,
                   });
   }

   ir3_reg_interval_remove_temp(&file->reg_ctx, &interval->interval);

   return (struct ra_removed_interval){
      .interval = interval,
      .size = interval->physreg_end - interval->physreg_start,
   };
}

static void
ra_push_interval(struct ra_ctx *ctx, struct ra_file *file,
                 const struct ra_removed_interval *removed, physreg_t dst)
{
   struct ra_interval *interval = removed->interval;

   interval->physreg_start = dst;
   interval->physreg_end = dst + removed->size;

   assert(interval->physreg_end <= file->size);
   if (interval->interval.reg->flags & IR3_REG_HALF)
      assert(interval->physreg_end <= RA_HALF_SIZE);

   ir3_reg_interval_reinsert(&file->reg_ctx, &interval->interval);
}

/* Pick up the interval and place it at "dst". */
static void
ra_move_interval(struct ra_ctx *ctx, struct ra_file *file,
                 struct ra_interval *interval, physreg_t dst)
{
   struct ra_removed_interval temp = ra_pop_interval(ctx, file, interval);
   ra_push_interval(ctx, file, &temp, dst);
}

static struct ra_file *
ra_get_file(struct ra_ctx *ctx, struct ir3_register *reg)
{
   if (reg->flags & IR3_REG_SHARED)
      return &ctx->shared;
   else if (ctx->merged_regs || !(reg->flags & IR3_REG_HALF))
      return &ctx->full;
   else
      return &ctx->half;
}


/* Returns true if the proposed spot for "dst" or a killed source overlaps a
 * destination that's been allocated.
 */
static bool
check_dst_overlap(struct ra_ctx *ctx, struct ra_file *file,
                  struct ir3_register *dst, physreg_t start,
                  physreg_t end)
{
   struct ir3_instruction *instr = dst->instr;

   ra_foreach_dst (other_dst, instr) {
      /* We assume only destinations before the current one have been allocated.
       */
      if (other_dst == dst)
         break;

      if (ra_get_file(ctx, other_dst) != file)
         continue;

      struct ra_interval *other_interval = &ctx->intervals[other_dst->name];
      assert(!other_interval->interval.parent);
      physreg_t other_start = other_interval->physreg_start;
      physreg_t other_end = other_interval->physreg_end;

      if (other_end > start && end > other_start)
         return true;
   }

   return false;
}

/* True if the destination is "early-clobber," meaning that it cannot be
 * allocated over killed sources. Some destinations always require it, but it
 * also is implicitly true for tied destinations whose source is live-through.
 * If the source is killed, then we skip allocating a register for the
 * destination altogether so we don't need to worry about that case here.
 */
static bool
is_early_clobber(struct ir3_register *reg)
{
   return (reg->flags & IR3_REG_EARLY_CLOBBER) || reg->tied;
}

static bool
get_reg_specified(struct ra_ctx *ctx, struct ra_file *file,
                  struct ir3_register *reg, physreg_t physreg, bool is_source)
{
   for (unsigned i = 0; i < reg_size(reg); i++) {
      if (!BITSET_TEST(is_early_clobber(reg) || is_source ?
                           file->available_to_evict : file->available,
                       physreg + i))
         return false;
   }

   if (!is_source &&
       check_dst_overlap(ctx, file, reg, physreg, physreg + reg_size(reg)))
      return false;

   return true;
}

/* Try to evict any registers conflicting with the proposed spot "physreg" for
 * "reg". That is, move them to other places so that we can allocate "physreg"
 * here.
 */

static bool
try_evict_regs(struct ra_ctx *ctx, struct ra_file *file,
               struct ir3_register *reg, physreg_t physreg,
               unsigned *_eviction_count, bool is_source, bool speculative)
{
   BITSET_DECLARE(available_to_evict, RA_MAX_FILE_SIZE);
   memcpy(available_to_evict, file->available_to_evict,
          sizeof(available_to_evict));

   BITSET_DECLARE(available, RA_MAX_FILE_SIZE);
   memcpy(available, file->available, sizeof(available));

   for (unsigned i = 0; i < reg_size(reg); i++) {
      BITSET_CLEAR(available_to_evict, physreg + i);
      BITSET_CLEAR(available, physreg + i);
   }

   unsigned eviction_count = 0;
   /* Iterate over each range conflicting with physreg */
   for (struct ra_interval *conflicting = ra_file_search_right(file, physreg),
                           *next = ra_interval_next_or_null(conflicting);
        conflicting != NULL &&
        conflicting->physreg_start < physreg + reg_size(reg);
        conflicting = next, next = ra_interval_next_or_null(next)) {
      if (!is_early_clobber(reg) && !is_source && conflicting->is_killed)
         continue;

      if (conflicting->frozen) {
         assert(speculative);
         return false;
      }

      unsigned conflicting_file_size =
         reg_file_size(file, conflicting->interval.reg);
      unsigned avail_start, avail_end;
      bool evicted = false;
      BITSET_FOREACH_RANGE (avail_start, avail_end, available_to_evict,
                            conflicting_file_size) {
         unsigned size = avail_end - avail_start;

         /* non-half registers must be aligned */
         if (!(conflicting->interval.reg->flags & IR3_REG_HALF) &&
             avail_start % 2 == 1) {
            avail_start++;
            size--;
         }

         unsigned conflicting_size =
            conflicting->physreg_end - conflicting->physreg_start;
         if (size >= conflicting_size &&
             (is_source ||
              !check_dst_overlap(ctx, file, reg, avail_start,
                                 avail_start + conflicting_size))) {
            for (unsigned i = 0;
                 i < conflicting->physreg_end - conflicting->physreg_start; i++)
               BITSET_CLEAR(available_to_evict, avail_start + i);
            eviction_count +=
               conflicting->physreg_end - conflicting->physreg_start;
            if (!speculative)
               ra_move_interval(ctx, file, conflicting, avail_start);
            evicted = true;
            break;
         }
      }

      if (evicted)
         continue;

      /* If we couldn't evict this range, but the register we're allocating is
       * allowed to overlap with a killed range, then we may be able to swap it
       * with a killed range to achieve the same effect.
       */
      if (is_early_clobber(reg) || is_source)
         return false;

      foreach_interval (killed, file) {
         if (!killed->is_killed)
            continue;

         if (killed->physreg_end - killed->physreg_start !=
             conflicting->physreg_end - conflicting->physreg_start)
            continue;

         if (killed->physreg_end > conflicting_file_size ||
             conflicting->physreg_end > reg_file_size(file, killed->interval.reg))
            continue;

         /* We can't swap the killed range if it partially/fully overlaps the
          * space we're trying to allocate or (in speculative mode) if it's
          * already been swapped and will overlap when we actually evict.
          */
         bool killed_available = true;
         for (unsigned i = killed->physreg_start; i < killed->physreg_end; i++) {
            if (!BITSET_TEST(available, i)) {
               killed_available = false;
               break;
            }
         }

         if (!killed_available)
            continue;

         if (check_dst_overlap(ctx, file, reg, killed->physreg_start,
                               killed->physreg_end))
            continue;

         /* Check for alignment if one is a full reg */
         if ((!(killed->interval.reg->flags & IR3_REG_HALF) ||
              !(conflicting->interval.reg->flags & IR3_REG_HALF)) &&
             (killed->physreg_start % 2 != 0 ||
              conflicting->physreg_start % 2 != 0))
            continue;

         for (unsigned i = killed->physreg_start; i < killed->physreg_end; i++) {
            BITSET_CLEAR(available, i);
         }
         /* Because this will generate swaps instead of moves, multiply the
          * cost by 2.
          */
         eviction_count += (killed->physreg_end - killed->physreg_start) * 2;
         if (!speculative) {
            physreg_t killed_start = killed->physreg_start,
                      conflicting_start = conflicting->physreg_start;
            struct ra_removed_interval killed_removed =
               ra_pop_interval(ctx, file, killed);
            struct ra_removed_interval conflicting_removed =
               ra_pop_interval(ctx, file, conflicting);
            ra_push_interval(ctx, file, &killed_removed, conflicting_start);
            ra_push_interval(ctx, file, &conflicting_removed, killed_start);
         }

         evicted = true;
         break;
      }

      if (!evicted)
         return false;
   }

   *_eviction_count = eviction_count;
   return true;
}

static int
removed_interval_cmp(const void *_i1, const void *_i2)
{
   const struct ra_removed_interval *i1 = _i1;
   const struct ra_removed_interval *i2 = _i2;

   /* We sort the registers as follows:
    *
    * |------------------------------------------------------------------------------------------|
    * |               |                    |        |        |                    |              |
    * |  Half         | Half early-clobber | Half   | Full   | Full early-clobber | Full         |
    * |  live-through | destination        | killed | killed | destination        | live-through |
    * |               |                    |        |        |                    |              |
    * |------------------------------------------------------------------------------------------|
    *                                      |                 |
    *                                      |   Destination   |
    *                                      |                 |
    *                                      |-----------------|
    *
    * Half-registers have to be first so that they stay in the low half of
    * the register file. Then half and full killed must stay together so that
    * there's a contiguous range where we can put the register. With this
    * structure we should be able to accommodate any collection of intervals
    * such that the total number of half components is within the half limit
    * and the combined components are within the full limit.
    */

   unsigned i1_align = reg_elem_size(i1->interval->interval.reg);
   unsigned i2_align = reg_elem_size(i2->interval->interval.reg);
   if (i1_align > i2_align)
      return 1;
   if (i1_align < i2_align)
      return -1;

   if (i1_align == 1) {
      if (i2->interval->is_killed)
         return -1;
      if (i1->interval->is_killed)
         return 1;
   } else {
      if (i2->interval->is_killed)
         return 1;
      if (i1->interval->is_killed)
         return -1;
   }

   return 0;
}

static int
dsts_cmp(const void *_i1, const void *_i2)
{
   struct ir3_register *i1 = *(struct ir3_register *const *) _i1;
   struct ir3_register *i2 = *(struct ir3_register *const *) _i2;

   /* Treat tied destinations as-if they are live-through sources, and normal
    * destinations as killed sources.
    */
   unsigned i1_align = reg_elem_size(i1);
   unsigned i2_align = reg_elem_size(i2);
   if (i1_align > i2_align)
      return 1;
   if (i1_align < i2_align)
      return -1;

   if (i1_align == 1) {
      if (!is_early_clobber(i2))
         return -1;
      if (!is_early_clobber(i1))
         return 1;
   } else {
      if (!is_early_clobber(i2))
         return 1;
      if (!is_early_clobber(i1))
         return -1;
   }

   return 0;
}

/* "Compress" all the live intervals so that there is enough space for the
 * destination register. As there can be gaps when a more-aligned interval
 * follows a less-aligned interval, this also sorts them to remove such
 * "padding", which may be required when space is very tight.  This isn't
 * amazing, but should be used only as a last resort in case the register file
 * is almost full and badly fragmented.
 *
 * Return the physreg to use.
 */
static physreg_t
compress_regs_left(struct ra_ctx *ctx, struct ra_file *file,
                   struct ir3_register *reg)
{
   unsigned reg_align = reg_elem_size(reg);
   DECLARE_ARRAY(struct ra_removed_interval, intervals);
   intervals_count = intervals_sz = 0;
   intervals = NULL;

   DECLARE_ARRAY(struct ir3_register *, dsts);
   dsts_count = dsts_sz = 0;
   dsts = NULL;
   array_insert(ctx, dsts, reg);
   bool dst_inserted[reg->instr->dsts_count];
   /* Zero-initialize so the "already inserted" checks below start out false. */
   memset(dst_inserted, 0, sizeof(dst_inserted));

   unsigned dst_size = reg->tied ? 0 : reg_size(reg);
   unsigned ec_dst_size = is_early_clobber(reg) ? reg_size(reg) : 0;
   unsigned half_dst_size = 0, ec_half_dst_size = 0;
   if (reg_align == 1) {
      half_dst_size = dst_size;
      ec_half_dst_size = ec_dst_size;
   }

   unsigned removed_size = 0, removed_half_size = 0;
   unsigned removed_killed_size = 0, removed_killed_half_size = 0;
   unsigned file_size = reg_file_size(file, reg);
   physreg_t start_reg = 0;

   foreach_interval_rev_safe (interval, file) {
      /* We'll check if we can compact the intervals starting here. */
      physreg_t candidate_start = interval->physreg_end;

      /* Check if there are any other destinations we need to compact. */
      ra_foreach_dst_n (other_dst, n, reg->instr) {
         if (other_dst == reg)
            break;
         if (ra_get_file(ctx, other_dst) != file)
            continue;
         if (dst_inserted[n])
            continue;

         struct ra_interval *other_interval = &ctx->intervals[other_dst->name];
         /* if the destination partially overlaps this interval, we need to
          * extend candidate_start to the end.
          */
         if (other_interval->physreg_start < candidate_start) {
            candidate_start = MAX2(candidate_start,
                                   other_interval->physreg_end);
            continue;
         }

         dst_inserted[n] = true;

         /* dst intervals with a tied killed source are considered attached to
          * that source. Don't actually insert them. This means we have to
          * update them below if their tied source moves.
          */
         if (other_dst->tied) {
            struct ra_interval *tied_interval =
               &ctx->intervals[other_dst->tied->def->name];
            if (tied_interval->is_killed)
               continue;
         }

         d("popping destination %u physreg %u\n",
           other_interval->interval.reg->name,
           other_interval->physreg_start);

         array_insert(ctx, dsts, other_dst);
         unsigned interval_size = reg_size(other_dst);
         if (is_early_clobber(other_dst)) {
            ec_dst_size += interval_size;
            if (other_interval->interval.reg->flags & IR3_REG_HALF)
               ec_half_dst_size += interval_size;
         } else {
            dst_size += interval_size;
            if (other_interval->interval.reg->flags & IR3_REG_HALF)
               half_dst_size += interval_size;
         }
      }

      /* Check if we can sort the intervals *after* this one and have enough
       * space leftover to accommodate all intervals, keeping in mind that killed
       * sources overlap non-tied destinations. Also check that we have enough
       * space leftover for half-registers, if we're inserting a half-register
       * (otherwise we only shift any half-registers down so they should be
       * safe).
       */
      if (candidate_start + removed_size + ec_dst_size +
          MAX2(removed_killed_size, dst_size) <= file->size &&
          (reg_align != 1 ||
           candidate_start + removed_half_size + ec_half_dst_size +
           MAX2(removed_killed_half_size, half_dst_size) <= file_size)) {
         start_reg = candidate_start;
         break;
      }

      /* We assume that all frozen intervals are at the start and that we
       * can avoid popping them.
       */
      assert(!interval->frozen);

      /* Killed sources are different because they go at the end and can
       * overlap the register we're trying to add.
       */
      unsigned interval_size = interval->physreg_end - interval->physreg_start;
      if (interval->is_killed) {
         removed_killed_size += interval_size;
         if (interval->interval.reg->flags & IR3_REG_HALF)
            removed_killed_half_size += interval_size;
      } else {
         removed_size += interval_size;
         if (interval->interval.reg->flags & IR3_REG_HALF)
            removed_half_size += interval_size;
      }

      /* Now that we've done the accounting, pop this off */
      d("popping interval %u physreg %u%s\n", interval->interval.reg->name,
        interval->physreg_start, interval->is_killed ? ", killed" : "");
      array_insert(ctx, intervals, ra_pop_interval(ctx, file, interval));
   }

   /* TODO: In addition to skipping registers at the beginning that are
    * well-packed, we should try to skip registers at the end.
    */

   qsort(intervals, intervals_count, sizeof(*intervals), removed_interval_cmp);
   qsort(dsts, dsts_count, sizeof(*dsts), dsts_cmp);

   physreg_t live_reg = start_reg;
   physreg_t dst_reg = (physreg_t)~0;
   physreg_t ret_reg = (physreg_t)~0;
   unsigned dst_index = 0;
   unsigned live_index = 0;

   /* We have two lists of intervals to process, live intervals and destination
    * intervals. Process them in the order of the diagram in
    * removed_interval_cmp().
    */
   while (live_index < intervals_count || dst_index < dsts_count) {
      bool process_dst;
      if (live_index == intervals_count) {
         process_dst = true;
      } else if (dst_index == dsts_count) {
         process_dst = false;
      } else {
         struct ir3_register *dst = dsts[dst_index];
         struct ra_interval *live_interval = intervals[live_index].interval;

         bool live_half = live_interval->interval.reg->flags & IR3_REG_HALF;
         bool live_killed = live_interval->is_killed;
         bool dst_half = dst->flags & IR3_REG_HALF;
         bool dst_early_clobber = is_early_clobber(dst);

         if (live_half && !live_killed) {
            /* far-left of diagram. */
            process_dst = false;
         } else if (dst_half && dst_early_clobber) {
            /* mid-left of diagram. */
            process_dst = true;
         } else if (!dst_early_clobber) {
            /* bottom of diagram. */
            process_dst = true;
         } else if (live_killed) {
            /* middle of diagram. */
            process_dst = false;
         } else if (!dst_half && dst_early_clobber) {
            /* mid-right of diagram. */
            process_dst = true;
         } else {
            /* far right of diagram. */
            assert(!live_killed && !live_half);
            process_dst = false;
         }
      }

      struct ir3_register *cur_reg =
         process_dst ? dsts[dst_index] :
         intervals[live_index].interval->interval.reg;

      physreg_t physreg;
      if (process_dst && !is_early_clobber(cur_reg)) {
         if (dst_reg == (physreg_t)~0)
            dst_reg = live_reg;
         physreg = dst_reg;
      } else {
         physreg = live_reg;
         struct ra_interval *live_interval = intervals[live_index].interval;
         bool live_killed = live_interval->is_killed;
         /* If this is live-through and we've processed the destinations, we
          * need to make sure we take into account any overlapping destinations.
          */
         if (!live_killed && dst_reg != (physreg_t)~0)
            physreg = MAX2(physreg, dst_reg);
      }

      if (!(cur_reg->flags & IR3_REG_HALF))
         physreg = ALIGN(physreg, 2);

      d("pushing reg %u physreg %u\n", cur_reg->name, physreg);

      unsigned interval_size = reg_size(cur_reg);
      if (physreg + interval_size >
          reg_file_size(file, cur_reg)) {
         d("ran out of room for interval %u!\n",
           cur_reg->name);
         unreachable("reg pressure calculation was wrong!");
         return 0;
      }

      if (process_dst) {
         if (cur_reg == reg) {
            ret_reg = physreg;
         } else {
            struct ra_interval *interval = &ctx->intervals[cur_reg->name];
            interval->physreg_start = physreg;
            interval->physreg_end = physreg + interval_size;
         }
         dst_index++;
      } else {
         ra_push_interval(ctx, file, &intervals[live_index], physreg);
         live_index++;
      }

      physreg += interval_size;

      if (process_dst && !is_early_clobber(cur_reg)) {
         dst_reg = physreg;
      } else {
         live_reg = physreg;
      }
   }

   /* If we shuffled around a tied source that is killed, we may have to update
    * its corresponding destination since we didn't insert it above.
    */
   ra_foreach_dst (dst, reg->instr) {
      if (dst == reg)
         break;

      struct ir3_register *tied = dst->tied;
      if (!tied)
         continue;

      struct ra_interval *tied_interval = &ctx->intervals[tied->def->name];
      if (!tied_interval->is_killed)
         continue;

      struct ra_interval *dst_interval = &ctx->intervals[dst->name];
      unsigned dst_size = reg_size(dst);
      dst_interval->physreg_start = ra_interval_get_physreg(tied_interval);
      dst_interval->physreg_end = dst_interval->physreg_start + dst_size;
   }

   return ret_reg;
}

void
ra_update_affinity(unsigned file_size, struct ir3_register *reg,
                   physreg_t physreg)
{
   if (!reg->merge_set || reg->merge_set->preferred_reg != (physreg_t)~0)
      return;

   if (physreg < reg->merge_set_offset)
      return;

   if ((physreg - reg->merge_set_offset + reg->merge_set->size) > file_size)
      return;

   reg->merge_set->preferred_reg = physreg - reg->merge_set_offset;
}
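
/* Worked example (made-up numbers): if a register with merge_set_offset == 2
 * ends up at physreg 10, the whole merge set would start at physreg 8, so
 * preferred_reg becomes 8, provided no preference was recorded yet and the
 * set still fits below file_size from there.
 */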

/* Try to find free space for a register without shuffling anything. This uses
 * a round-robin algorithm to reduce false dependencies.
 */
static physreg_t
find_best_gap(struct ra_ctx *ctx, struct ra_file *file,
              struct ir3_register *dst, unsigned file_size, unsigned size,
              unsigned alignment)
{
   /* This can happen if we create a very large merge set. Just bail out in that
    * case.
    */
   if (size > file_size)
      return (physreg_t) ~0;

   BITSET_WORD *available =
      is_early_clobber(dst) ? file->available_to_evict : file->available;

   unsigned start = ALIGN(file->start, alignment) % (file_size - size + alignment);
   unsigned candidate = start;
   do {
      bool is_available = true;
      for (unsigned i = 0; i < size; i++) {
         if (!BITSET_TEST(available, candidate + i)) {
            is_available = false;
            break;
         }
      }

      if (is_available) {
         is_available =
            !check_dst_overlap(ctx, file, dst, candidate, candidate + size);
      }

      if (is_available) {
         file->start = (candidate + size) % file_size;
         return candidate;
      }

      candidate += alignment;
      if (candidate + size > file_size)
         candidate = 0;
   } while (candidate != start);

   return (physreg_t)~0;
}
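
/* For example (illustrative numbers): with file->start == 6, size == 2 and
 * alignment == 2, the scan starts at candidate 6, wraps to 0 once
 * candidate + size would run past the file, and gives up only after coming
 * back around to 6. Advancing file->start past each successful allocation is
 * what spreads consecutive defs across the file and reduces false (WAR)
 * dependencies on recently freed registers.
 */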
1354 
1355 static physreg_t
try_allocate_src(struct ra_ctx * ctx,struct ra_file * file,struct ir3_register * reg)1356 try_allocate_src(struct ra_ctx *ctx, struct ra_file *file,
1357                  struct ir3_register *reg)
1358 {
1359    unsigned file_size = reg_file_size(file, reg);
1360    unsigned size = reg_size(reg);
1361    for (unsigned i = 0; i < reg->instr->srcs_count; i++) {
1362       struct ir3_register *src = reg->instr->srcs[i];
1363       if (!ra_reg_is_src(src))
1364          continue;
1365       if (ra_get_file(ctx, src) == file && reg_size(src) >= size) {
1366          struct ra_interval *src_interval = &ctx->intervals[src->def->name];
1367          physreg_t src_physreg = ra_interval_get_physreg(src_interval);
1368          if (src_physreg % reg_elem_size(reg) == 0 &&
1369              src_physreg + size <= file_size &&
1370              get_reg_specified(ctx, file, reg, src_physreg, false))
1371             return src_physreg;
1372       }
1373    }
1374 
1375    return ~0;
1376 }
1377 
1378 static bool
rpt_has_unique_merge_set(struct ir3_instruction * instr)1379 rpt_has_unique_merge_set(struct ir3_instruction *instr)
1380 {
1381    assert(ir3_instr_is_rpt(instr));
1382 
1383    if (!instr->dsts[0]->merge_set)
1384       return false;
1385 
1386    struct ir3_instruction *first = ir3_instr_first_rpt(instr);
1387    struct ir3_register *def = first->dsts[0];
1388 
1389    if (def->merge_set != instr->dsts[0]->merge_set ||
1390        def->merge_set->regs_count != ir3_instr_rpt_length(first)) {
1391       return false;
1392    }
1393 
1394    unsigned i = 0;
1395 
1396    foreach_instr_rpt (rpt, first) {
1397       if (rpt->dsts[0] != def->merge_set->regs[i++])
1398          return false;
1399    }
1400 
1401    return true;
1402 }
1403 
1404 /* This is the main entrypoint for picking a register. Pick a free register
1405  * for "reg", shuffling around sources if necessary. In the normal case where
1406  * "is_source" is false, this register can overlap with killed sources
1407  * (intervals with "is_killed == true"). If "is_source" is true, then
1408  * is_killed is ignored and the register returned must not overlap with killed
1409  * sources. This must be used for tied registers, because we're actually
1410  * allocating the destination and the tied source at the same time.
1411  */
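/* Rough outline of the strategy below (an added summary, not extra logic):
 *
 *   1. If the merge set already has a preferred register, try that first.
 *   2. For repeated (rpt) instructions with a unique merge set, try to
 *      reuse a source register.
 *   3. If the merge set has no register yet, try to find a gap big enough
 *      for the whole set and return this register's slice of it.
 *   4. For ALU/SFU instructions, try to reuse a source register.
 *   5. Otherwise look for any free gap, then fall back to evicting other
 *      intervals, and finally to compress_regs_left().
 */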
1412 
1413 static physreg_t
1414 get_reg(struct ra_ctx *ctx, struct ra_file *file, struct ir3_register *reg)
1415 {
1416    unsigned file_size = reg_file_size(file, reg);
1417    if (reg->merge_set && reg->merge_set->preferred_reg != (physreg_t)~0) {
1418       physreg_t preferred_reg =
1419          reg->merge_set->preferred_reg + reg->merge_set_offset;
1420       if (preferred_reg + reg_size(reg) <= file_size &&
1421           preferred_reg % reg_elem_size(reg) == 0 &&
1422           get_reg_specified(ctx, file, reg, preferred_reg, false))
1423          return preferred_reg;
1424    }
1425 
1426    /* For repeated instructions whose merge set is unique (i.e., only used for
1427     * these repeated instructions), first try to allocate one of their sources
1428     * (for the same reason as for ALU/SFU instructions explained below). This
1429     * also prevents us from allocating a new register range for this merge set
1430     * when the one from a source could be reused.
1431     */
1432    if (ir3_instr_is_rpt(reg->instr) && rpt_has_unique_merge_set(reg->instr)) {
1433       physreg_t src_reg = try_allocate_src(ctx, file, reg);
1434       if (src_reg != (physreg_t)~0)
1435          return src_reg;
1436    }
1437 
1438    /* If this register is a subset of a merge set which we have not picked a
1439     * register for, first try to allocate enough space for the entire merge
1440     * set.
1441     */
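   /* For example (hypothetical numbers): if this is a 2-slot register at
    * merge_set_offset == 4 inside an 8-slot merge set, we look for an 8-slot
    * gap and, if one is found at physreg N, return N + 4, i.e. this
    * register's slice of the gap.
    */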
1442    unsigned size = reg_size(reg);
1443    if (reg->merge_set && reg->merge_set->preferred_reg == (physreg_t)~0 &&
1444        size < reg->merge_set->size) {
1445       physreg_t best_reg = find_best_gap(ctx, file, reg, file_size,
1446                                          reg->merge_set->size,
1447                                          reg->merge_set->alignment);
1448       if (best_reg != (physreg_t)~0u) {
1449          best_reg += reg->merge_set_offset;
1450          return best_reg;
1451       }
1452    }
1453 
1454    /* For ALU and SFU instructions, if a src reg is available to pick, use it:
1455     * this doesn't introduce unnecessary dependencies, and it potentially
1456     * avoids needing (ss) syncs for write-after-read hazards on SFU
1457     * instructions.
1458     */
1459    if (is_sfu(reg->instr) || is_alu(reg->instr)) {
1460       physreg_t src_reg = try_allocate_src(ctx, file, reg);
1461       if (src_reg != (physreg_t)~0)
1462          return src_reg;
1463    }
1464 
1465    physreg_t best_reg =
1466       find_best_gap(ctx, file, reg, file_size, size, reg_elem_size(reg));
1467    if (best_reg != (physreg_t)~0u) {
1468       return best_reg;
1469    }
1470 
1471    /* Ok, we couldn't find anything that fits. Here is where we have to start
1472     * moving things around to make stuff fit. First try solely evicting
1473     * registers in the way.
1474     */
1475    unsigned best_eviction_count = ~0;
1476    for (physreg_t i = 0; i + size <= file_size; i += reg_elem_size(reg)) {
1477       unsigned eviction_count;
1478       if (try_evict_regs(ctx, file, reg, i, &eviction_count, false, true)) {
1479          if (eviction_count < best_eviction_count) {
1480             best_eviction_count = eviction_count;
1481             best_reg = i;
1482          }
1483       }
1484    }
1485 
1486    if (best_eviction_count != ~0) {
1487       ASSERTED bool result = try_evict_regs(
1488          ctx, file, reg, best_reg, &best_eviction_count, false, false);
1489       assert(result);
1490       return best_reg;
1491    }
1492 
1493    /* Use the dumb fallback only if try_evict_regs() fails. */
1494    return compress_regs_left(ctx, file, reg);
1495 }
1496 
1497 static void
1498 assign_reg(struct ir3_instruction *instr, struct ir3_register *reg,
1499            unsigned num)
1500 {
1501    if (reg->flags & IR3_REG_ARRAY) {
1502       reg->array.base = num;
1503       if (reg->flags & IR3_REG_RELATIV)
1504          reg->array.offset += num;
1505       else
1506          reg->num = num + reg->array.offset;
1507    } else {
1508       reg->num = num;
1509    }
1510 }
1511 
1512 static void
1513 mark_src_killed(struct ra_ctx *ctx, struct ir3_register *src)
1514 {
1515    struct ra_interval *interval = &ctx->intervals[src->def->name];
1516 
1517    if (!(src->flags & IR3_REG_FIRST_KILL) || interval->is_killed ||
1518        interval->interval.parent ||
1519        !rb_tree_is_empty(&interval->interval.children))
1520       return;
1521 
1522    ra_file_mark_killed(ra_get_file(ctx, src), interval);
1523 }
1524 
1525 static void
1526 insert_dst(struct ra_ctx *ctx, struct ir3_register *dst)
1527 {
1528    struct ra_file *file = ra_get_file(ctx, dst);
1529    struct ra_interval *interval = &ctx->intervals[dst->name];
1530 
1531    d("insert dst %u physreg %u", dst->name, ra_interval_get_physreg(interval));
1532 
1533    if (!(dst->flags & IR3_REG_UNUSED))
1534       ra_file_insert(file, interval);
1535 
1536    assign_reg(dst->instr, dst, ra_interval_get_num(interval));
1537 }
1538 
1539 static void
1540 allocate_dst_fixed(struct ra_ctx *ctx, struct ir3_register *dst,
1541                    physreg_t physreg)
1542 {
1543    struct ra_file *file = ra_get_file(ctx, dst);
1544    struct ra_interval *interval = &ctx->intervals[dst->name];
1545    ra_update_affinity(file->size, dst, physreg);
1546 
1547    ra_interval_init(interval, dst);
1548    interval->physreg_start = physreg;
1549    interval->physreg_end = physreg + reg_size(dst);
1550 }
1551 
1552 /* If a tied destination interferes with its source register, we have to insert
1553  * a copy beforehand to copy the source to the destination. Because we are using
1554  * the parallel_copies array and not creating a separate copy, this copy will
1555  * happen in parallel with any shuffling around of the tied source, so we have
1556  * to copy the source *as it exists before it is shuffled around*. We do this by
1557  * inserting the copy early, before any other copies are inserted. We don't
1558  * actually know the destination of the copy, but that's ok because the
1559  * dst_interval will be filled out later.
1560  */
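/* Illustrative sketch (hypothetical values): if dst is tied to a source whose
 * interval currently lives at physreg 10 and is not killed, we queue
 *
 *    (struct ra_parallel_copy){ .interval = dst_interval, .src = 10 }
 *
 * now, before any shuffling happens. The copy's destination becomes whatever
 * physreg dst_interval ends up with once allocate_dst() has run.
 */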
1561 static void
1562 insert_tied_dst_copy(struct ra_ctx *ctx, struct ir3_register *dst)
1563 {
1564    struct ir3_register *tied = dst->tied;
1565 
1566    if (!tied)
1567       return;
1568 
1569    struct ra_interval *tied_interval = &ctx->intervals[tied->def->name];
1570    struct ra_interval *dst_interval = &ctx->intervals[dst->name];
1571 
1572    if (tied_interval->is_killed)
1573       return;
1574 
1575    physreg_t tied_physreg = ra_interval_get_physreg(tied_interval);
1576 
1577    array_insert(ctx, ctx->parallel_copies,
1578                 (struct ra_parallel_copy){
1579                    .interval = dst_interval,
1580                    .src = tied_physreg,
1581                 });
1582 }
1583 
1584 static void
1585 allocate_dst(struct ra_ctx *ctx, struct ir3_register *dst)
1586 {
1587    struct ra_file *file = ra_get_file(ctx, dst);
1588 
1589    struct ir3_register *tied = dst->tied;
1590    if (tied) {
1591       struct ra_interval *tied_interval = &ctx->intervals[tied->def->name];
1592       if (tied_interval->is_killed) {
1593          /* The easy case: the source is killed, so we can just reuse it
1594           * for the destination.
1595           */
1596          allocate_dst_fixed(ctx, dst, ra_interval_get_physreg(tied_interval));
1597          return;
1598       }
1599    }
1600 
1601    /* All the hard work is done by get_reg here. */
1602    physreg_t physreg = get_reg(ctx, file, dst);
1603 
1604    allocate_dst_fixed(ctx, dst, physreg);
1605 }
1606 
1607 static void
1608 assign_src(struct ra_ctx *ctx, struct ir3_instruction *instr,
1609            struct ir3_register *src)
1610 {
1611    struct ra_interval *interval = &ctx->intervals[src->def->name];
1612    struct ra_file *file = ra_get_file(ctx, src);
1613 
1614    struct ir3_register *tied = src->tied;
1615    physreg_t physreg;
1616    if (tied) {
1617       struct ra_interval *tied_interval = &ctx->intervals[tied->name];
1618       physreg = ra_interval_get_physreg(tied_interval);
1619    } else {
1620       physreg = ra_interval_get_physreg(interval);
1621    }
1622 
1623    assign_reg(instr, src, ra_physreg_to_num(physreg, src->flags));
1624 
1625    if (src->flags & IR3_REG_FIRST_KILL)
1626       ra_file_remove(file, interval);
1627 }
1628 
1629 /* Insert a parallel copy instruction, built from the parallel copy entries
1630  * we've accumulated, right before the given instruction.
1631  */
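/* Shape of the generated instruction (illustrative, with hypothetical
 * registers): for two pending entries whose intervals end up at r2.x and r5.y
 * but whose values currently sit at r0.x and r1.z, we emit
 *
 *    parallel_copy r2.x, r5.y, r0.x, r1.z
 *
 * i.e. all destinations first (the intervals' final locations), then the
 * matching sources (their recorded physregs), copied "in parallel".
 */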
1632 static void
1633 insert_parallel_copy_instr(struct ra_ctx *ctx, struct ir3_instruction *instr)
1634 {
1635    if (ctx->parallel_copies_count == 0)
1636       return;
1637 
1638    struct ir3_instruction *pcopy =
1639       ir3_instr_create(instr->block, OPC_META_PARALLEL_COPY,
1640                        ctx->parallel_copies_count, ctx->parallel_copies_count);
1641 
1642    for (unsigned i = 0; i < ctx->parallel_copies_count; i++) {
1643       struct ra_parallel_copy *entry = &ctx->parallel_copies[i];
1644       struct ir3_register *reg =
1645          ir3_dst_create(pcopy, INVALID_REG,
1646                         entry->interval->interval.reg->flags &
1647                         (IR3_REG_HALF | IR3_REG_ARRAY | IR3_REG_SHARED));
1648       reg->size = entry->interval->interval.reg->size;
1649       reg->wrmask = entry->interval->interval.reg->wrmask;
1650       assign_reg(pcopy, reg, ra_interval_get_num(entry->interval));
1651    }
1652 
1653    for (unsigned i = 0; i < ctx->parallel_copies_count; i++) {
1654       struct ra_parallel_copy *entry = &ctx->parallel_copies[i];
1655       struct ir3_register *reg =
1656          ir3_src_create(pcopy, INVALID_REG,
1657                         entry->interval->interval.reg->flags &
1658                         (IR3_REG_HALF | IR3_REG_ARRAY | IR3_REG_SHARED));
1659       reg->size = entry->interval->interval.reg->size;
1660       reg->wrmask = entry->interval->interval.reg->wrmask;
1661       assign_reg(pcopy, reg, ra_physreg_to_num(entry->src, reg->flags));
1662    }
1663 
1664    list_del(&pcopy->node);
1665    list_addtail(&pcopy->node, &instr->node);
1666    ctx->parallel_copies_count = 0;
1667 }
1668 
1669 static void
1670 handle_normal_instr(struct ra_ctx *ctx, struct ir3_instruction *instr)
1671 {
1672    /* First, mark sources as going-to-be-killed while allocating the dest. */
1673    ra_foreach_src (src, instr) {
1674       mark_src_killed(ctx, src);
1675    }
1676 
1677    /* Pre-insert tied dst copies. */
1678    ra_foreach_dst (dst, instr) {
1679       insert_tied_dst_copy(ctx, dst);
1680    }
1681 
1682    /* Allocate the destination. */
1683    ra_foreach_dst (dst, instr) {
1684       allocate_dst(ctx, dst);
1685    }
1686 
1687    /* Now handle sources. Go backward so that, if there are multiple sources
1688     * with the same def and that def is killed, we only remove it at the
1689     * end.
1690     */
1691    ra_foreach_src_rev (src, instr) {
1692       assign_src(ctx, instr, src);
1693    }
1694 
1695    /* Now finally insert the destination into the map. */
1696    ra_foreach_dst (dst, instr) {
1697       insert_dst(ctx, dst);
1698    }
1699 
1700    insert_parallel_copy_instr(ctx, instr);
1701 }
1702 
1703 static void
1704 handle_split(struct ra_ctx *ctx, struct ir3_instruction *instr)
1705 {
1706    struct ir3_register *dst = instr->dsts[0];
1707    struct ir3_register *src = instr->srcs[0];
1708 
1709    if (!(dst->flags & IR3_REG_SSA))
1710       return;
1711 
1712    if (dst->merge_set == NULL || src->def->merge_set != dst->merge_set) {
1713       handle_normal_instr(ctx, instr);
1714       return;
1715    }
1716 
1717    struct ra_interval *src_interval = &ctx->intervals[src->def->name];
1718 
1719    physreg_t physreg = ra_interval_get_physreg(src_interval);
1720    assign_src(ctx, instr, src);
1721 
1722    allocate_dst_fixed(
1723       ctx, dst, physreg - src->def->merge_set_offset + dst->merge_set_offset);
1724    insert_dst(ctx, dst);
1725 }
1726 
1727 static void
1728 handle_collect(struct ra_ctx *ctx, struct ir3_instruction *instr)
1729 {
1730    if (!(instr->dsts[0]->flags & IR3_REG_SSA))
1731       return;
1732 
1733    struct ir3_merge_set *dst_set = instr->dsts[0]->merge_set;
1734    unsigned dst_offset = instr->dsts[0]->merge_set_offset;
1735 
1736    if (!dst_set || dst_set->regs_count == 1) {
1737       handle_normal_instr(ctx, instr);
1738       return;
1739    }
1740 
1741    /* We need to check if any of the sources are contained in an interval
1742     * that is at least as large as the vector. In this case, we should put
1743     * the vector inside that larger interval. (There should be one
1744     * unambiguous place to put it, because values sharing the same merge set
1745     * should be allocated together.) This can happen in a case like:
1746     *
1747     * ssa_1 (wrmask=0xf) = ...
1748     * ssa_2 = split ssa_1 off:0
1749     * ssa_3 = split ssa_1 off:1
1750     * ssa_4 (wrmask=0x3) = collect (kill)ssa_2, (kill)ssa_3
1751     * ... = (kill)ssa_1
1752     * ... = (kill)ssa_4
1753     *
1754     * ssa_4 will be coalesced with ssa_1 and needs to be allocated inside it.
1755     */
1756    physreg_t dst_fixed = (physreg_t)~0u;
1757 
1758    ra_foreach_src (src, instr) {
1759       if (src->flags & IR3_REG_FIRST_KILL) {
1760          mark_src_killed(ctx, src);
1761       }
1762 
1763       struct ra_interval *interval = &ctx->intervals[src->def->name];
1764 
1765       /* We only need special handling if the source's interval overlaps with
1766        * the destination's interval.
1767        */
1768       if (src->def->interval_start >= instr->dsts[0]->interval_end ||
1769           instr->dsts[0]->interval_start >= src->def->interval_end ||
1770           interval->is_killed)
1771          continue;
1772 
1773       while (interval->interval.parent != NULL) {
1774          interval = ir3_reg_interval_to_ra_interval(interval->interval.parent);
1775       }
1776       if (reg_size(interval->interval.reg) >= reg_size(instr->dsts[0])) {
1777          dst_fixed = interval->physreg_start -
1778                      interval->interval.reg->merge_set_offset + dst_offset;
1779       } else {
1780          /* For sources whose root interval is smaller than the
1781           * destination (i.e. the normal case), we will shuffle them
1782           * around after allocating the destination. Mark them killed so
1783           * that the destination can be allocated over them, even if they
1784           * aren't actually killed.
1785           */
1786          ra_file_mark_killed(ra_get_file(ctx, src), interval);
1787       }
1788    }
1789 
1790    if (dst_fixed != (physreg_t)~0u)
1791       allocate_dst_fixed(ctx, instr->dsts[0], dst_fixed);
1792    else
1793       allocate_dst(ctx, instr->dsts[0]);
1794 
1795    /* Remove the temporary is_killed we added */
1796    ra_foreach_src (src, instr) {
1797       struct ra_interval *interval = &ctx->intervals[src->def->name];
1798       while (interval->interval.parent != NULL) {
1799          interval = ir3_reg_interval_to_ra_interval(interval->interval.parent);
1800       }
1801 
1802       /* Filter out cases where it actually should be killed */
1803       if (interval != &ctx->intervals[src->def->name] ||
1804           !(src->flags & IR3_REG_KILL)) {
1805          ra_file_unmark_killed(ra_get_file(ctx, src), interval);
1806       }
1807    }
1808 
1809    ra_foreach_src_rev (src, instr) {
1810       assign_src(ctx, instr, src);
1811    }
1812 
1813    /* We need to do this before insert_dst(), so that children of the
1814     * destination which got marked as killed and then shuffled around to make
1815     * space for the destination have the correct pcopy destination that
1816     * matches what we assign the source of the collect to in assign_src().
1817     *
1818     * TODO: In this case we'll wind up copying the value in the pcopy and
1819     * then again in the collect. We could avoid one of those by updating the
1820     * pcopy destination to match up with the final location of the source
1821     * after the collect and making the collect a no-op. However this doesn't
1822     * seem to happen often.
1823     */
1824    insert_parallel_copy_instr(ctx, instr);
1825 
1826    /* Note: insert_dst will automatically shuffle around any intervals that
1827     * overlap the collect's destination by making them children of the collect.
1828     */
1829 
1830    insert_dst(ctx, instr->dsts[0]);
1831 }
1832 
1833 /* Parallel copies before RA should only be at the end of the block, for
1834  * phi's. For these we only need to fill in the sources, and then we fill in
1835  * the destinations in the successor block.
1836  */
1837 static void
1838 handle_pcopy(struct ra_ctx *ctx, struct ir3_instruction *instr)
1839 {
1840    ra_foreach_src_rev (src, instr) {
1841       assign_src(ctx, instr, src);
1842    }
1843 }
1844 
1845 /* Some inputs may need to be precolored. We need to handle those first, so
1846  * that other non-precolored inputs don't accidentally get allocated over
1847  * them. Inputs are the very first thing in the shader, so it shouldn't be a
1848  * problem to allocate them to a specific physreg.
1849  */
1850 
1851 static void
1852 handle_precolored_input(struct ra_ctx *ctx, struct ir3_instruction *instr)
1853 {
1854    if (instr->dsts[0]->num == INVALID_REG ||
1855        !(instr->dsts[0]->flags & IR3_REG_SSA))
1856       return;
1857 
1858    struct ra_file *file = ra_get_file(ctx, instr->dsts[0]);
1859    struct ra_interval *interval = &ctx->intervals[instr->dsts[0]->name];
1860    physreg_t physreg = ra_reg_get_physreg(instr->dsts[0]);
1861    allocate_dst_fixed(ctx, instr->dsts[0], physreg);
1862 
1863    d("insert precolored dst %u physreg %u", instr->dsts[0]->name,
1864      ra_interval_get_physreg(interval));
1865 
1866    ra_file_insert(file, interval);
1867    interval->frozen = true;
1868 }
1869 
1870 static void
1871 handle_input(struct ra_ctx *ctx, struct ir3_instruction *instr)
1872 {
1873    if (instr->dsts[0]->num != INVALID_REG)
1874       return;
1875 
1876    allocate_dst(ctx, instr->dsts[0]);
1877 
1878    struct ra_file *file = ra_get_file(ctx, instr->dsts[0]);
1879    struct ra_interval *interval = &ctx->intervals[instr->dsts[0]->name];
1880    ra_file_insert(file, interval);
1881 }
1882 
1883 static void
1884 assign_input(struct ra_ctx *ctx, struct ir3_instruction *instr)
1885 {
1886    if (!(instr->dsts[0]->flags & IR3_REG_SSA))
1887       return;
1888 
1889    struct ra_interval *interval = &ctx->intervals[instr->dsts[0]->name];
1890    struct ra_file *file = ra_get_file(ctx, instr->dsts[0]);
1891 
1892    if (instr->dsts[0]->num == INVALID_REG) {
1893       assign_reg(instr, instr->dsts[0], ra_interval_get_num(interval));
1894    } else {
1895       interval->frozen = false;
1896    }
1897 
1898    if (instr->dsts[0]->flags & IR3_REG_UNUSED)
1899       ra_file_remove(file, interval);
1900 
1901    ra_foreach_src_rev (src, instr)
1902       assign_src(ctx, instr, src);
1903 }
1904 
1905 /* chmask is a bit weird, because it has pre-colored sources due to the need
1906  * to pass some registers to the next stage. Fortunately there are only at
1907  * to pass some registers to the next stage. Fortunately there are at most
1908  * two, and there should be no other live values by the time we get to
1909  * fancy fallbacks.
1910  *
1911  * TODO: Add more complete handling of precolored sources, e.g. for function
1912  * argument handling. We'd need a way to mark sources as fixed so that they
1913  * don't get moved around when placing other sources in the fallback case, and
1914  * a duplication of much of the logic in get_reg(). This also opens another
1915  * can of worms, e.g. what if the precolored source is a split of a vector
1916  * which is still live -- this breaks our assumption that splits don't incur
1917  * any "extra" register requirements and we'd have to break it out of the
1918  * parent ra_interval.
1919  */
1920 
1921 static void
1922 handle_precolored_source(struct ra_ctx *ctx, struct ir3_register *src)
1923 {
1924    struct ra_file *file = ra_get_file(ctx, src);
1925    struct ra_interval *interval = &ctx->intervals[src->def->name];
1926    physreg_t physreg = ra_reg_get_physreg(src);
1927 
1928    if (ra_interval_get_num(interval) == src->num)
1929       return;
1930 
1931    /* Try evicting stuff in our way if it isn't free. This won't move
1932     * anything unless it overlaps with our precolored physreg, so we don't
1933     * have to worry about evicting other precolored sources.
1934     */
1935    if (!get_reg_specified(ctx, file, src, physreg, true)) {
1936       unsigned eviction_count;
1937       if (!try_evict_regs(ctx, file, src, physreg, &eviction_count, true,
1938                           false)) {
1939          unreachable("failed to evict for precolored source!");
1940          return;
1941       }
1942    }
1943 
1944    ra_move_interval(ctx, file, interval, physreg);
1945 }
1946 
1947 static void
1948 handle_chmask(struct ra_ctx *ctx, struct ir3_instruction *instr)
1949 {
1950    /* Note: we purposely don't mark sources as killed, so that we can reuse
1951     * some of the get_reg() machinery as if the source were a destination.
1952     * Marking it as killed would mean that e.g. get_reg_specified() wouldn't
1953     * work correctly.
1954     */
1955    ra_foreach_src (src, instr) {
1956       assert(src->num != INVALID_REG);
1957       handle_precolored_source(ctx, src);
1958    }
1959 
1960    ra_foreach_src (src, instr) {
1961       struct ra_file *file = ra_get_file(ctx, src);
1962       struct ra_interval *interval = &ctx->intervals[src->def->name];
1963       if (src->flags & IR3_REG_FIRST_KILL)
1964          ra_file_remove(file, interval);
1965    }
1966 
1967    insert_parallel_copy_instr(ctx, instr);
1968 }
1969 
1970 static physreg_t
1971 read_register(struct ra_ctx *ctx, struct ir3_block *block,
1972               struct ir3_register *def)
1973 {
1974    struct ra_block_state *state = &ctx->blocks[block->index];
1975    if (state->renames) {
1976       struct hash_entry *entry = _mesa_hash_table_search(state->renames, def);
1977       if (entry) {
1978          return (physreg_t)(uintptr_t)entry->data;
1979       }
1980    }
1981 
1982    return ra_reg_get_physreg(def);
1983 }
1984 
1985 static void
1986 handle_live_in(struct ra_ctx *ctx, struct ir3_register *def)
1987 {
1988    physreg_t physreg = ~0;
1989    for (unsigned i = 0; i < ctx->block->predecessors_count; i++) {
1990       struct ir3_block *pred = ctx->block->predecessors[i];
1991       struct ra_block_state *pred_state = &ctx->blocks[pred->index];
1992 
1993       if (!pred_state->visited)
1994          continue;
1995 
1996       physreg = read_register(ctx, pred, def);
1997       break;
1998    }
1999 
2000    assert(physreg != (physreg_t)~0);
2001 
2002    struct ra_interval *interval = &ctx->intervals[def->name];
2003    struct ra_file *file = ra_get_file(ctx, def);
2004    ra_interval_init(interval, def);
2005    interval->physreg_start = physreg;
2006    interval->physreg_end = physreg + reg_size(def);
2007    ra_file_insert(file, interval);
2008 }
2009 
2010 static void
2011 handle_live_out(struct ra_ctx *ctx, struct ir3_register *def)
2012 {
2013    /* Skip parallelcopy's which in the original program are only used as phi
2014     * arguments. Even though phi arguments are live out, they are only
2015     * assigned when the phi is.
2016     */
2017    if (def->instr->opc == OPC_META_PARALLEL_COPY)
2018       return;
2019 
2020    struct ra_block_state *state = &ctx->blocks[ctx->block->index];
2021    struct ra_interval *interval = &ctx->intervals[def->name];
2022    physreg_t physreg = ra_interval_get_physreg(interval);
2023    if (physreg != ra_reg_get_physreg(def)) {
2024       if (!state->renames)
2025          state->renames = _mesa_pointer_hash_table_create(ctx);
2026       _mesa_hash_table_insert(state->renames, def, (void *)(uintptr_t)physreg);
2027    }
2028 }
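/* Example of the rename mechanism (hypothetical registers): suppose a def was
 * assigned r3.x, but a live-range split left its value in r5.x by the end of
 * this block. handle_live_out() records { def -> r5.x } in state->renames;
 * read_register() called while processing a successor then returns r5.x
 * instead of r3.x, and insert_live_in_move() adds a copy at the end of this
 * block if the successor placed the live-in somewhere else.
 */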
2029 
2030 static void
2031 handle_phi(struct ra_ctx *ctx, struct ir3_register *def)
2032 {
2033    if (!(def->flags & IR3_REG_SSA))
2034       return;
2035 
2036    struct ra_file *file = ra_get_file(ctx, def);
2037    struct ra_interval *interval = &ctx->intervals[def->name];
2038 
2039    /* phis are always scalar, so they should already be the smallest possible
2040     * size. However they may be coalesced with other live-in values/phi
2041     * nodes, so check for that here.
2042     */
2043    struct ir3_reg_interval *parent_ir3 =
2044       ir3_reg_interval_search(&file->reg_ctx.intervals, def->interval_start);
2045    physreg_t physreg;
2046    if (parent_ir3) {
2047       struct ra_interval *parent = ir3_reg_interval_to_ra_interval(parent_ir3);
2048       physreg = ra_interval_get_physreg(parent) +
2049                 (def->interval_start - parent_ir3->reg->interval_start);
2050    } else {
2051       physreg = get_reg(ctx, file, def);
2052    }
2053 
2054    allocate_dst_fixed(ctx, def, physreg);
2055 
2056    ra_file_insert(file, interval);
2057 }
2058 
2059 static void
2060 assign_phi(struct ra_ctx *ctx, struct ir3_instruction *phi)
2061 {
2062    if (!(phi->dsts[0]->flags & IR3_REG_SSA))
2063       return;
2064 
2065    struct ra_file *file = ra_get_file(ctx, phi->dsts[0]);
2066    struct ra_interval *interval = &ctx->intervals[phi->dsts[0]->name];
2067    assert(!interval->interval.parent);
2068    unsigned num = ra_interval_get_num(interval);
2069    assign_reg(phi, phi->dsts[0], num);
2070 
2071    /* Assign the parallelcopy sources of this phi */
2072    for (unsigned i = 0; i < phi->srcs_count; i++) {
2073       if (phi->srcs[i]->def) {
2074          assign_reg(phi, phi->srcs[i], num);
2075          assign_reg(phi, phi->srcs[i]->def, num);
2076       }
2077    }
2078 
2079    if (phi->dsts[0]->flags & IR3_REG_UNUSED)
2080       ra_file_remove(file, interval);
2081 }
2082 
2083 /* When we split a live range, we sometimes need to emit fixup code at the end
2084  * of a block. For example, something like:
2085  *
2086  * a = ...
2087  * if (...) {
2088  *    ...
2089  *    a' = a
2090  *    b = ... // a evicted to make room for b
2091  *    ...
2092  * }
2093  * ... = a
2094  *
2095  * When we insert the copy to a' in insert_parallel_copy_instr(), this forces
2096  * us to insert another copy "a = a'" at the end of the if. Normally this would
2097  * also entail adding a phi node, but since we're about to go out of SSA
2098  * anyway we just insert an extra move. Note, however, that "b" might be used
2099  * in a phi node at the end of the if and share registers with "a", so we
2100  * have to be careful to extend any preexisting parallelcopy instruction
2101  * instead of creating our own in order to guarantee that they properly get
2102  * swapped.
2103  */
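/* Sketch of how the extension works (a summary of the code below, not extra
 * behavior): if the block already ends in a parallel copy with N entries, we
 * create a new parallel copy with N+1 dst/src slots, move the old entries
 * over, append the new dst/src pair, and delete the old instruction, so that
 * all N+1 copies still happen as a single parallel group.
 */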
2104 
2105 static void
2106 insert_liveout_copy(struct ir3_block *block, physreg_t dst, physreg_t src,
2107                     struct ir3_register *reg)
2108 {
2109    struct ir3_instruction *old_pcopy = NULL;
2110    struct ir3_instruction *last = ir3_block_get_last_non_terminator(block);
2111 
2112    if (last && last->opc == OPC_META_PARALLEL_COPY)
2113       old_pcopy = last;
2114 
2115    unsigned old_pcopy_srcs = old_pcopy ? old_pcopy->srcs_count : 0;
2116    struct ir3_instruction *pcopy = ir3_instr_create(
2117       block, OPC_META_PARALLEL_COPY, old_pcopy_srcs + 1, old_pcopy_srcs + 1);
2118 
2119    for (unsigned i = 0; i < old_pcopy_srcs; i++) {
2120       old_pcopy->dsts[i]->instr = pcopy;
2121       pcopy->dsts[pcopy->dsts_count++] = old_pcopy->dsts[i];
2122    }
2123 
2124    unsigned flags = reg->flags & (IR3_REG_HALF | IR3_REG_ARRAY);
2125 
2126    struct ir3_register *dst_reg = ir3_dst_create(pcopy, INVALID_REG, flags);
2127    dst_reg->wrmask = reg->wrmask;
2128    dst_reg->size = reg->size;
2129    assign_reg(pcopy, dst_reg, ra_physreg_to_num(dst, reg->flags));
2130 
2131    for (unsigned i = 0; i < old_pcopy_srcs; i++) {
2132       pcopy->srcs[pcopy->srcs_count++] = old_pcopy->srcs[i];
2133    }
2134 
2135    struct ir3_register *src_reg = ir3_src_create(pcopy, INVALID_REG, flags);
2136    src_reg->wrmask = reg->wrmask;
2137    src_reg->size = reg->size;
2138    assign_reg(pcopy, src_reg, ra_physreg_to_num(src, reg->flags));
2139 
2140    if (old_pcopy)
2141       list_del(&old_pcopy->node);
2142 }
2143 
2144 static void
2145 insert_live_in_move(struct ra_ctx *ctx, struct ra_interval *interval)
2146 {
2147    physreg_t physreg = ra_interval_get_physreg(interval);
2148 
2149    for (unsigned i = 0; i < ctx->block->predecessors_count; i++) {
2150       struct ir3_block *pred = ctx->block->predecessors[i];
2151       struct ra_block_state *pred_state = &ctx->blocks[pred->index];
2152 
2153       if (!pred_state->visited)
2154          continue;
2155 
2156       physreg_t pred_reg = read_register(ctx, pred, interval->interval.reg);
2157       if (pred_reg != physreg) {
2158          assert(!(interval->interval.reg->flags & IR3_REG_SHARED));
2159          insert_liveout_copy(pred, physreg, pred_reg, interval->interval.reg);
2160       }
2161    }
2162 }
2163 
2164 static void
2165 insert_file_live_in_moves(struct ra_ctx *ctx, struct ra_file *file)
2166 {
2167    BITSET_WORD *live_in = ctx->live->live_in[ctx->block->index];
2168    rb_tree_foreach (struct ra_interval, interval, &file->physreg_intervals,
2169                     physreg_node) {
2170       /* Skip phi nodes. This needs to happen after phi nodes are allocated,
2171        * because we may have to move live-ins around to make space for phi
2172        * nodes, but we shouldn't be handling phi nodes here.
2173        */
2174       if (BITSET_TEST(live_in, interval->interval.reg->name))
2175          insert_live_in_move(ctx, interval);
2176    }
2177 }
2178 
2179 static void
2180 insert_entry_regs(struct ra_block_state *state, struct ra_file *file)
2181 {
2182    rb_tree_foreach (struct ra_interval, interval, &file->physreg_intervals,
2183                     physreg_node) {
2184       _mesa_hash_table_insert(state->entry_regs, interval->interval.reg,
2185                               (void *)(uintptr_t)interval->physreg_start);
2186    }
2187 }
2188 
2189 static void
2190 insert_live_in_moves(struct ra_ctx *ctx)
2191 {
2192    insert_file_live_in_moves(ctx, &ctx->full);
2193    insert_file_live_in_moves(ctx, &ctx->half);
2194    insert_file_live_in_moves(ctx, &ctx->shared);
2195 
2196    /* If not all predecessors are visited, record the entry registers so that
2197     * insert_live_out_moves() in the not-yet-visited predecessors will work.
2198     */
2199    bool all_preds_visited = true;
2200    for (unsigned i = 0; i < ctx->block->predecessors_count; i++) {
2201       if (!ctx->blocks[ctx->block->predecessors[i]->index].visited) {
2202          all_preds_visited = false;
2203          break;
2204       }
2205    }
2206 
2207    if (!all_preds_visited) {
2208       struct ra_block_state *state = &ctx->blocks[ctx->block->index];
2209       state->entry_regs = _mesa_pointer_hash_table_create(ctx);
2210 
2211       insert_entry_regs(state, &ctx->full);
2212       insert_entry_regs(state, &ctx->half);
2213       insert_entry_regs(state, &ctx->shared);
2214    }
2215 }
2216 
2217 static void
2218 insert_live_out_move(struct ra_ctx *ctx, struct ra_interval *interval)
2219 {
2220    for (unsigned i = 0; i < 2; i++) {
2221       if (!ctx->block->successors[i])
2222          continue;
2223 
2224       struct ir3_block *succ = ctx->block->successors[i];
2225       struct ra_block_state *succ_state = &ctx->blocks[succ->index];
2226 
2227       if (!succ_state->visited)
2228          continue;
2229 
2230       struct hash_entry *entry = _mesa_hash_table_search(
2231          succ_state->entry_regs, interval->interval.reg);
2232       if (!entry)
2233          continue;
2234 
2235       physreg_t new_reg = (physreg_t)(uintptr_t)entry->data;
2236       if (new_reg != interval->physreg_start) {
2237          insert_liveout_copy(ctx->block, new_reg, interval->physreg_start,
2238                              interval->interval.reg);
2239       }
2240    }
2241 }
2242 
2243 static void
2244 insert_file_live_out_moves(struct ra_ctx *ctx, struct ra_file *file)
2245 {
2246    rb_tree_foreach (struct ra_interval, interval, &file->physreg_intervals,
2247                     physreg_node) {
2248       insert_live_out_move(ctx, interval);
2249    }
2250 }
2251 
2252 static void
2253 insert_live_out_moves(struct ra_ctx *ctx)
2254 {
2255    insert_file_live_out_moves(ctx, &ctx->full);
2256    insert_file_live_out_moves(ctx, &ctx->half);
2257    insert_file_live_out_moves(ctx, &ctx->shared);
2258 }
2259 
2260 static void
2261 handle_block(struct ra_ctx *ctx, struct ir3_block *block)
2262 {
2263    ctx->block = block;
2264 
2265    /* Reset the register files from the last block */
2266    ra_file_init(&ctx->full);
2267    ra_file_init(&ctx->half);
2268    ra_file_init(&ctx->shared);
2269 
2270    /* Handle live-ins, phis, and input meta-instructions. These all appear
2271     * live at the beginning of the block, interfere with each other, and
2272     * therefore need to be allocated "in parallel". This means that we
2273     * have to allocate all of them, inserting them into the file, and then
2274     * delay updating the IR until all of them are allocated.
2275     *
2276     * Handle precolored inputs first, because we need to make sure that other
2277     * inputs don't overwrite them. We shouldn't have both live-ins/phi nodes
2278     * and inputs at the same time, because the first block doesn't have
2279     * predecessors. Therefore handle_live_in doesn't have to worry about
2280     * them.
2281     */
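   /* In outline, the steps below are (a summary of this function, not extra
    * logic):
    *
    *   1. allocate precolored inputs at their fixed registers
    *   2. place live-in values at the location a visited predecessor left
    *      them in
    *   3. allocate phi destinations and the remaining inputs/tex prefetches
    *   4. insert live-in moves for values whose location differs between
    *      predecessors
    *   5. walk the body, allocating each instruction in turn
    *   6. insert live-out moves toward already-visited successors and record
    *      where each live-out value ended up
    */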
2282 
2283    foreach_instr (instr, &block->instr_list) {
2284       if (instr->opc == OPC_META_INPUT)
2285          handle_precolored_input(ctx, instr);
2286       else
2287          break;
2288    }
2289 
2290    unsigned name;
2291    BITSET_FOREACH_SET (name, ctx->live->live_in[block->index],
2292                        ctx->live->definitions_count) {
2293       struct ir3_register *reg = ctx->live->definitions[name];
2294       handle_live_in(ctx, reg);
2295    }
2296 
2297    foreach_instr (instr, &block->instr_list) {
2298       if (instr->opc == OPC_META_PHI)
2299          handle_phi(ctx, instr->dsts[0]);
2300       else if (instr->opc == OPC_META_INPUT ||
2301                instr->opc == OPC_META_TEX_PREFETCH)
2302          handle_input(ctx, instr);
2303       else
2304          break;
2305    }
2306 
2307    /* After this point, every live-in/phi/input has an interval assigned to
2308     * it. We delay actually assigning values until everything has been
2309     * allocated, so we can simply ignore any parallel copy entries created
2310     * when shuffling them around.
2311     */
2312    ctx->parallel_copies_count = 0;
2313 
2314    insert_live_in_moves(ctx);
2315 
2316    if (RA_DEBUG) {
2317       d("after live-in block %u:\n", block->index);
2318       ra_ctx_dump(ctx);
2319    }
2320 
2321    /* Now we're done with processing live-ins, and can handle the body of the
2322     * block.
2323     */
2324    foreach_instr (instr, &block->instr_list) {
2325       di(instr, "processing");
2326 
2327       if (instr->opc == OPC_META_PHI)
2328          assign_phi(ctx, instr);
2329       else if (instr->opc == OPC_META_INPUT ||
2330                instr->opc == OPC_META_TEX_PREFETCH)
2331          assign_input(ctx, instr);
2332       else if (instr->opc == OPC_META_SPLIT)
2333          handle_split(ctx, instr);
2334       else if (instr->opc == OPC_META_COLLECT)
2335          handle_collect(ctx, instr);
2336       else if (instr->opc == OPC_META_PARALLEL_COPY)
2337          handle_pcopy(ctx, instr);
2338       else if (instr->opc == OPC_CHMASK)
2339          handle_chmask(ctx, instr);
2340       else
2341          handle_normal_instr(ctx, instr);
2342 
2343       if (RA_DEBUG)
2344          ra_ctx_dump(ctx);
2345    }
2346 
2347    insert_live_out_moves(ctx);
2348 
2349    BITSET_FOREACH_SET (name, ctx->live->live_out[block->index],
2350                        ctx->live->definitions_count) {
2351       struct ir3_register *reg = ctx->live->definitions[name];
2352       handle_live_out(ctx, reg);
2353    }
2354 
2355    ctx->blocks[block->index].visited = true;
2356 }
2357 
2358 static unsigned
2359 calc_target_full_pressure(struct ir3_shader_variant *v, unsigned pressure)
2360 {
2361    /* Registers are allocated in units of vec4, so switch from units of
2362     * half-regs to vec4.
2363     */
2364    unsigned reg_count = DIV_ROUND_UP(pressure, 2 * 4);
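   /* For example (hypothetical numbers): a pressure of 52 half-regs needs
    * DIV_ROUND_UP(52, 8) == 7 vec4 registers. The loop below then raises the
    * target as long as that neither changes the threadsize decision nor drops
    * the achievable wave count below the current target, and finally converts
    * back to half-reg units.
    */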
2365 
2366    bool double_threadsize = ir3_should_double_threadsize(v, reg_count);
2367 
2368    unsigned target = reg_count;
2369    unsigned reg_independent_max_waves =
2370       ir3_get_reg_independent_max_waves(v, double_threadsize);
2371    unsigned reg_dependent_max_waves = ir3_get_reg_dependent_max_waves(
2372       v->compiler, reg_count, double_threadsize);
2373    unsigned target_waves =
2374       MIN2(reg_independent_max_waves, reg_dependent_max_waves);
2375 
2376    while (target <= RA_FULL_SIZE / (2 * 4) &&
2377           ir3_should_double_threadsize(v, target) == double_threadsize &&
2378           ir3_get_reg_dependent_max_waves(v->compiler, target,
2379                                           double_threadsize) >= target_waves)
2380       target++;
2381 
2382    return (target - 1) * 2 * 4;
2383 }
2384 
2385 static void
2386 add_pressure(struct ir3_pressure *pressure, struct ir3_register *reg,
2387              bool merged_regs)
2388 {
2389    unsigned size = reg_size(reg);
2390    if (reg->flags & IR3_REG_HALF)
2391       pressure->half += size;
2392    if (!(reg->flags & IR3_REG_HALF) || merged_regs)
2393       pressure->full += size;
2394 }
2395 
2396 static void
2397 dummy_interval_add(struct ir3_reg_ctx *ctx, struct ir3_reg_interval *interval)
2398 {
2399 }
2400 
2401 static void
2402 dummy_interval_delete(struct ir3_reg_ctx *ctx, struct ir3_reg_interval *interval)
2403 {
2404 }
2405 
2406 static void
2407 dummy_interval_readd(struct ir3_reg_ctx *ctx, struct ir3_reg_interval *parent,
2408                      struct ir3_reg_interval *child)
2409 {
2410 }
2411 
2412 /* Calculate the minimum possible limit on register pressure so that spilling
2413  * still succeeds. Used to implement IR3_SHADER_DEBUG=spillall.
2414  */
2415 
2416 static void
2417 calc_min_limit_pressure(struct ir3_shader_variant *v,
2418                         struct ir3_liveness *live,
2419                         struct ir3_pressure *limit)
2420 {
2421    struct ir3_block *start = ir3_start_block(v->ir);
2422    struct ir3_reg_ctx *ctx = ralloc(NULL, struct ir3_reg_ctx);
2423    struct ir3_reg_interval *intervals =
2424       rzalloc_array(ctx, struct ir3_reg_interval, live->definitions_count);
2425 
2426    ctx->interval_add = dummy_interval_add;
2427    ctx->interval_delete = dummy_interval_delete;
2428    ctx->interval_readd = dummy_interval_readd;
2429 
2430    limit->full = limit->half = 0;
2431 
2432    struct ir3_pressure cur_pressure = {0};
2433    foreach_instr (input, &start->instr_list) {
2434       if (input->opc != OPC_META_INPUT &&
2435           input->opc != OPC_META_TEX_PREFETCH)
2436          break;
2437 
2438       add_pressure(&cur_pressure, input->dsts[0], v->mergedregs);
2439    }
2440 
2441    limit->full = MAX2(limit->full, cur_pressure.full);
2442    limit->half = MAX2(limit->half, cur_pressure.half);
2443 
2444    foreach_instr (input, &start->instr_list) {
2445       if (input->opc != OPC_META_INPUT &&
2446           input->opc != OPC_META_TEX_PREFETCH)
2447          break;
2448 
2449          /* pre-colored inputs may have holes, which increase the pressure. */
2450       struct ir3_register *dst = input->dsts[0];
2451       if (dst->num != INVALID_REG) {
2452          unsigned physreg = ra_reg_get_physreg(dst) + reg_size(dst);
2453          if (dst->flags & IR3_REG_HALF)
2454             limit->half = MAX2(limit->half, physreg);
2455          if (!(dst->flags & IR3_REG_HALF) || v->mergedregs)
2456             limit->full = MAX2(limit->full, physreg);
2457       }
2458    }
2459 
2460    foreach_block (block, &v->ir->block_list) {
2461       rb_tree_init(&ctx->intervals);
2462 
2463       unsigned name;
2464       BITSET_FOREACH_SET (name, live->live_in[block->index],
2465                           live->definitions_count) {
2466          struct ir3_register *reg = live->definitions[name];
2467          ir3_reg_interval_init(&intervals[reg->name], reg);
2468          ir3_reg_interval_insert(ctx, &intervals[reg->name]);
2469       }
2470 
2471       foreach_instr (instr, &block->instr_list) {
2472          ra_foreach_dst (dst, instr) {
2473             ir3_reg_interval_init(&intervals[dst->name], dst);
2474          }
2475          /* phis and parallel copies can be deleted via spilling */
2476 
2477          if (instr->opc == OPC_META_PHI) {
2478             ir3_reg_interval_insert(ctx, &intervals[instr->dsts[0]->name]);
2479             continue;
2480          }
2481 
2482          if (instr->opc == OPC_META_PARALLEL_COPY)
2483             continue;
2484 
2485          cur_pressure = (struct ir3_pressure) {0};
2486 
2487          ra_foreach_dst (dst, instr) {
2488             if ((dst->tied && !(dst->tied->flags & IR3_REG_KILL)) ||
2489                 (dst->flags & IR3_REG_EARLY_CLOBBER))
2490                add_pressure(&cur_pressure, dst, v->mergedregs);
2491          }
2492 
2493          ra_foreach_src_rev (src, instr) {
2494             /* We currently don't support spilling the parent of a source when
2495              * making space for sources, so we have to keep track of the
2496              * intervals and figure out the root of the tree to figure out how
2497              * much space we need.
2498              *
2499              * TODO: We should probably support this in the spiller.
2500              */
2501             struct ir3_reg_interval *interval = &intervals[src->def->name];
2502             while (interval->parent)
2503                interval = interval->parent;
2504             add_pressure(&cur_pressure, interval->reg, v->mergedregs);
2505 
2506             if (src->flags & IR3_REG_FIRST_KILL)
2507                ir3_reg_interval_remove(ctx, &intervals[src->def->name]);
2508          }
2509 
2510          limit->full = MAX2(limit->full, cur_pressure.full);
2511          limit->half = MAX2(limit->half, cur_pressure.half);
2512 
2513          cur_pressure = (struct ir3_pressure) {0};
2514 
2515          ra_foreach_dst (dst, instr) {
2516             ir3_reg_interval_init(&intervals[dst->name], dst);
2517             ir3_reg_interval_insert(ctx, &intervals[dst->name]);
2518             add_pressure(&cur_pressure, dst, v->mergedregs);
2519          }
2520 
2521          limit->full = MAX2(limit->full, cur_pressure.full);
2522          limit->half = MAX2(limit->half, cur_pressure.half);
2523       }
2524    }
2525 
2526    /* Account for the base register, which needs to be available everywhere. */
2527    limit->full += 2;
2528 
2529    ralloc_free(ctx);
2530 }
2531 
2532 /*
2533  * If barriers are used, it must be possible for all waves in the workgroup
2534  * to execute concurrently. Thus we may have to reduce the registers limit.
2535  */
2536 static void
2537 calc_limit_pressure_for_cs_with_barrier(struct ir3_shader_variant *v,
2538                                         struct ir3_pressure *limit_pressure)
2539 {
2540    const struct ir3_compiler *compiler = v->compiler;
2541 
2542    unsigned threads_per_wg;
2543    if (v->local_size_variable) {
2544       /* We have to expect the worst case. */
2545       threads_per_wg = compiler->max_variable_workgroup_size;
2546    } else {
2547       threads_per_wg = v->local_size[0] * v->local_size[1] * v->local_size[2];
2548    }
2549 
2550    /* The register file is grouped into reg_size_vec4 parts. Each part has
2551     * enough registers to give a single vec4 register to each thread of a
2552     * single-sized wave-pair. With double threadsize, each wave-pair consumes
2553     * two parts of the register file to get a single vec4 for each thread.
2554     * The more active wave-pairs there are, the fewer parts each one can
2555     * get.
2556     */
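   /* Worked example (hypothetical values): with threads_per_wg == 1024,
    * threadsize_base == 64, wave_granularity == 2, and single threadsize,
    * waves_per_wg == DIV_ROUND_UP(1024, 64 * 1 * 2) == 8, so each thread can
    * use at most reg_size_vec4 / 8 vec4 registers, i.e.
    * (reg_size_vec4 / 8) * 4 * 2 half-regs available as the per-thread limit.
    */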
2557 
2558    bool double_threadsize = ir3_should_double_threadsize(v, 0);
2559    unsigned waves_per_wg = DIV_ROUND_UP(
2560       threads_per_wg, compiler->threadsize_base * (double_threadsize ? 2 : 1) *
2561                          compiler->wave_granularity);
2562 
2563    uint32_t vec4_regs_per_thread =
2564       compiler->reg_size_vec4 / (waves_per_wg * (double_threadsize ? 2 : 1));
2565    assert(vec4_regs_per_thread > 0);
2566 
2567    uint32_t half_regs_per_thread = vec4_regs_per_thread * 4 * 2;
2568 
2569    if (limit_pressure->full > half_regs_per_thread) {
2570       if (v->mergedregs) {
2571          limit_pressure->full = half_regs_per_thread;
2572       } else {
2573          /* TODO: Handle !mergedregs case, probably we would have to do this
2574           * after the first register pressure pass.
2575           */
2576       }
2577    }
2578 }
2579 
2580 int
2581 ir3_ra(struct ir3_shader_variant *v)
2582 {
2583    ir3_calc_dominance(v->ir);
2584 
2585    /* Predicate RA needs dominance. */
2586    ir3_ra_predicates(v);
2587 
2588    ir3_create_parallel_copies(v->ir);
2589 
2590    struct ra_ctx *ctx = rzalloc(NULL, struct ra_ctx);
2591 
2592    ctx->merged_regs = v->mergedregs;
2593    ctx->compiler = v->compiler;
2594    ctx->stage = v->type;
2595 
2596    struct ir3_liveness *live = ir3_calc_liveness(ctx, v->ir);
2597 
2598    ir3_debug_print(v->ir, "AFTER: create_parallel_copies");
2599 
2600    ir3_index_instrs_for_merge_sets(v->ir);
2601    ir3_merge_regs(live, v->ir);
2602 
2603    bool has_shared_vectors = false;
2604    foreach_block (block, &v->ir->block_list) {
2605       foreach_instr (instr, &block->instr_list) {
2606          ra_foreach_dst (dst, instr) {
2607             if ((dst->flags & IR3_REG_SHARED) && reg_elems(dst) > 1) {
2608                has_shared_vectors = true;
2609                break;
2610             }
2611          }
2612       }
2613    }
2614 
2615    struct ir3_pressure max_pressure;
2616    ir3_calc_pressure(v, live, &max_pressure);
2617    d("max pressure:");
2618    d("\tfull: %u", max_pressure.full);
2619    d("\thalf: %u", max_pressure.half);
2620    d("\tshared: %u", max_pressure.shared);
2621 
2622    struct ir3_pressure limit_pressure;
2623    limit_pressure.full = RA_FULL_SIZE;
2624    limit_pressure.half = RA_HALF_SIZE;
2625    limit_pressure.shared = RA_SHARED_SIZE;
2626    limit_pressure.shared_half = RA_SHARED_HALF_SIZE;
2627 
2628    if (gl_shader_stage_is_compute(v->type) && v->has_barrier) {
2629       calc_limit_pressure_for_cs_with_barrier(v, &limit_pressure);
2630    }
2631 
2632    /* If the user forces a doubled threadsize, we may have to lower the limit
2633     * because on some gens the register file is not big enough to hold a
2634     * double-size wave with all 48 registers in use.
2635     */
2636    if (v->shader_options.real_wavesize == IR3_DOUBLE_ONLY) {
2637       limit_pressure.full =
2638          MAX2(limit_pressure.full, ctx->compiler->reg_size_vec4 / 2 * 16);
2639    }
2640 
2641    /* If requested, lower the limit so that spilling happens more often. */
2642    if (ir3_shader_debug & IR3_DBG_SPILLALL)
2643       calc_min_limit_pressure(v, live, &limit_pressure);
2644 
2645    d("limit pressure:");
2646    d("\tfull: %u", limit_pressure.full);
2647    d("\thalf: %u", limit_pressure.half);
2648    d("\tshared: %u", limit_pressure.shared);
2649 
2650    /* In the worst case, each half register could block one full register, so
2651     * add shared_half in case of fragmentation. In addition, full registers can
2652     * block half registers so we have to consider the total pressure against the
2653     * half limit to prevent live range splitting when we run out of space for
2654     * half registers in the bottom half.
2655     */
2656    if (max_pressure.shared + max_pressure.shared_half > limit_pressure.shared ||
2657        (max_pressure.shared_half > 0 && max_pressure.shared > limit_pressure.shared_half) ||
2658        has_shared_vectors) {
2659       ir3_ra_shared(v, &live);
2660       ir3_calc_pressure(v, live, &max_pressure);
2661 
2662       ir3_debug_print(v->ir, "AFTER: shared register allocation");
2663    }
2664 
2665    bool spilled = false;
2666    if (max_pressure.full > limit_pressure.full ||
2667        max_pressure.half > limit_pressure.half) {
2668       if (!v->compiler->has_pvtmem) {
2669          d("max pressure exceeded!");
2670          goto fail;
2671       }
2672       d("max pressure exceeded, spilling!");
2673       IR3_PASS(v->ir, ir3_spill, v, &live, &limit_pressure);
2674       ir3_calc_pressure(v, live, &max_pressure);
2675 
2676       d("max pressure after spilling:");
2677       d("\tfull: %u", max_pressure.full);
2678       d("\thalf: %u", max_pressure.half);
2679       d("\tshared: %u", max_pressure.shared);
2680 
2681       assert(max_pressure.full <= limit_pressure.full &&
2682              max_pressure.half <= limit_pressure.half);
2683       spilled = true;
2684    }
2685 
2686    ctx->live = live;
2687    ctx->intervals =
2688       rzalloc_array(ctx, struct ra_interval, live->definitions_count);
2689    ctx->blocks = rzalloc_array(ctx, struct ra_block_state, live->block_count);
2690 
2691    ctx->full.size = calc_target_full_pressure(v, max_pressure.full);
2692    d("full size: %u", ctx->full.size);
2693 
2694    if (!v->mergedregs)
2695       ctx->half.size = RA_HALF_SIZE;
2696 
2697    ctx->shared.size = RA_SHARED_SIZE;
2698 
2699    ctx->full.start = ctx->half.start = ctx->shared.start = 0;
2700 
2701    foreach_block (block, &v->ir->block_list)
2702       handle_block(ctx, block);
2703 
2704    ir3_ra_validate(v, ctx->full.size, ctx->half.size, live->block_count, false);
2705 
2706    /* Strip array-ness and SSA-ness only at the end, because various helpers
2707     * still need to work even on definitions that have already been assigned.
2708     * For example, we need to preserve array-ness so that array live-ins have
2709     * the right size.
2710     */
2711    foreach_block (block, &v->ir->block_list) {
2712       foreach_instr (instr, &block->instr_list) {
2713          for (unsigned i = 0; i < instr->dsts_count; i++) {
2714             instr->dsts[i]->flags &= ~IR3_REG_SSA;
2715 
2716             /* Parallel copies of array registers copy the whole register, and
2717              * we need some way to let the parallel copy code know that this was
2718              * an array whose size is determined by reg->size. So keep the array
2719              * flag on those. spill/reload also need to work on the entire
2720              * array.
2721              */
2722             if (!is_meta(instr) && instr->opc != OPC_RELOAD_MACRO)
2723                instr->dsts[i]->flags &= ~IR3_REG_ARRAY;
2724          }
2725 
2726          for (unsigned i = 0; i < instr->srcs_count; i++) {
2727             instr->srcs[i]->flags &= ~IR3_REG_SSA;
2728 
2729             if (!is_meta(instr) && instr->opc != OPC_SPILL_MACRO)
2730                instr->srcs[i]->flags &= ~IR3_REG_ARRAY;
2731          }
2732       }
2733    }
2734 
2735    ir3_debug_print(v->ir, "AFTER: register allocation");
2736 
2737    if (spilled) {
2738       IR3_PASS(v->ir, ir3_lower_spill);
2739    }
2740 
2741    ir3_lower_copies(v);
2742 
2743    ir3_debug_print(v->ir, "AFTER: ir3_lower_copies");
2744 
2745    ralloc_free(ctx);
2746 
2747    return 0;
2748 fail:
2749    ralloc_free(ctx);
2750    return -1;
2751 }
2752