1 /*
2 * Copyright © 2021 Valve Corporation
3 * Copyright © 2014 Rob Clark <[email protected]>
4 * SPDX-License-Identifier: MIT
5 */
6
7 #include "ir3_ra.h"
8 #include "util/rb_tree.h"
9 #include "util/u_math.h"
10 #include "ir3_shader.h"
11
12 /* This file implements an SSA-based register allocator. Unlike other
13 * SSA-based allocators, it handles vector split/collect "smartly," meaning
14 * that multiple values may share the same register interval. From the
15 * perspective of the allocator itself, only the top-level intervals matter,
16 * and the allocator is only concerned with allocating top-level intervals,
17 * which may mean moving other top-level intervals around. Other intervals,
18 * like the destination of a split instruction or the source of a collect
19 * instruction, are "locked" to their parent interval. The details of this are
20 * mostly handled by ir3_merge_regs and ir3_reg_ctx.
21 *
22 * We currently don't do any backtracking, but we do use the merge sets as a
23 * form of affinity to try to avoid moves from phis/splits/collects. Each
24 * merge set is what a more "classic" graph-coloring or live-range based
25 * allocator would consider a single register, but here we use it as merely a
26 * hint, except when multiple overlapping values are live at the same time.
27 * Each merge set has a "preferred" register, and we try to honor that when
28 * allocating values in the merge set.
29 */
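/* Illustrative sketch (not taken from a real shader): for hypothetical SSA
 * values like
 *
 *    ssa_1 = ...
 *    ssa_2 = ...
 *    ssa_3 = collect ssa_1, ssa_2
 *
 * ir3_merge_regs tries to place ssa_1 and ssa_2 at consecutive offsets inside
 * ssa_3's merge set, so the collect becomes a no-op. The allocator then only
 * deals with the top-level interval for ssa_3; the intervals for ssa_1 and
 * ssa_2 become children locked to it in the ir3_reg_ctx tree.
 */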
30
31 /* ir3_reg_ctx implementation. */
32
33 static int
34 ir3_reg_interval_cmp(const struct rb_node *node, const void *data)
35 {
36 unsigned reg = *(const unsigned *)data;
37 const struct ir3_reg_interval *interval =
38 ir3_rb_node_to_interval_const(node);
39 if (interval->reg->interval_start > reg)
40 return -1;
41 else if (interval->reg->interval_end <= reg)
42 return 1;
43 else
44 return 0;
45 }
46
47 static struct ir3_reg_interval *
48 ir3_reg_interval_search(struct rb_tree *tree, unsigned offset)
49 {
50 struct rb_node *node = rb_tree_search(tree, &offset, ir3_reg_interval_cmp);
51 return node ? ir3_rb_node_to_interval(node) : NULL;
52 }
53
54 static struct ir3_reg_interval *
55 ir3_reg_interval_search_sloppy(struct rb_tree *tree, unsigned offset)
56 {
57 struct rb_node *node =
58 rb_tree_search_sloppy(tree, &offset, ir3_reg_interval_cmp);
59 return node ? ir3_rb_node_to_interval(node) : NULL;
60 }
61
62 /* Get the interval covering the reg, or the closest to the right if it
63 * doesn't exist.
64 */
65 static struct ir3_reg_interval *
66 ir3_reg_interval_search_right(struct rb_tree *tree, unsigned offset)
67 {
68 struct ir3_reg_interval *interval =
69 ir3_reg_interval_search_sloppy(tree, offset);
70 if (!interval) {
71 return NULL;
72 } else if (interval->reg->interval_end > offset) {
73 return interval;
74 } else {
75       /* There is no interval covering reg, and ir3_reg_interval_search_sloppy()
76 * returned the closest range to the left, so the next interval to the
77 * right should be the closest to the right.
78 */
79 return ir3_reg_interval_next_or_null(interval);
80 }
81 }
82
83 static int
84 ir3_reg_interval_insert_cmp(const struct rb_node *_a, const struct rb_node *_b)
85 {
86 const struct ir3_reg_interval *a = ir3_rb_node_to_interval_const(_a);
87 const struct ir3_reg_interval *b = ir3_rb_node_to_interval_const(_b);
88 return b->reg->interval_start - a->reg->interval_start;
89 }
90
91 static void
92 interval_insert(struct ir3_reg_ctx *ctx, struct rb_tree *tree,
93 struct ir3_reg_interval *interval)
94 {
95 struct ir3_reg_interval *right =
96 ir3_reg_interval_search_right(tree, interval->reg->interval_start);
97 if (right && right->reg->interval_start < interval->reg->interval_end) {
98 /* We disallow trees where different members have different half-ness.
99 * This means that we can't treat bitcasts as copies like normal
100 * split/collect, so something like this would require an extra copy
101 * in mergedregs mode, and count as 4 half-units of register pressure
102 * instead of 2:
103 *
104 * f16vec2 foo = unpackFloat2x16(bar)
105 * ... = foo.x
106 * ... = bar
107 *
108 * However, relaxing this rule would open a huge can of worms. What
109 * happens when there's a vector of 16 things, and the fifth element
110 * has been bitcasted as a half-reg? Would that element alone have to
111 * be small enough to be used as a half-reg source? Let's keep that
112 * can of worms firmly shut for now.
113 */
114 assert((interval->reg->flags & IR3_REG_HALF) ==
115 (right->reg->flags & IR3_REG_HALF));
116
117 if (right->reg->interval_end <= interval->reg->interval_end &&
118 right->reg->interval_start >= interval->reg->interval_start) {
119 /* Check if we're inserting something that's already inserted */
120 assert(interval != right);
121
122 /* "right" is contained in "interval" and must become a child of
123 * it. There may be further children too.
124 */
125 for (struct ir3_reg_interval *next = ir3_reg_interval_next(right);
126 right && right->reg->interval_start < interval->reg->interval_end;
127 right = next, next = ir3_reg_interval_next_or_null(next)) {
128 /* "right" must be contained in "interval." */
129 assert(right->reg->interval_end <= interval->reg->interval_end);
130 assert((interval->reg->flags & IR3_REG_HALF) ==
131 (right->reg->flags & IR3_REG_HALF));
132 if (!right->parent)
133 ctx->interval_delete(ctx, right);
134 right->parent = interval;
135 rb_tree_remove(tree, &right->node);
136 rb_tree_insert(&interval->children, &right->node,
137 ir3_reg_interval_insert_cmp);
138 }
139 } else {
140 /* "right" must contain "interval," since intervals must form a
141 * tree.
142 */
143 assert(right->reg->interval_start <= interval->reg->interval_start);
144 interval->parent = right;
145 interval_insert(ctx, &right->children, interval);
146 return;
147 }
148 }
149
150 if (!interval->parent)
151 ctx->interval_add(ctx, interval);
152 rb_tree_insert(tree, &interval->node, ir3_reg_interval_insert_cmp);
153 interval->inserted = true;
154 }
155
156 void
157 ir3_reg_interval_insert(struct ir3_reg_ctx *ctx,
158 struct ir3_reg_interval *interval)
159 {
160 rb_tree_init(&interval->children);
161 interval->parent = NULL;
162 interval_insert(ctx, &ctx->intervals, interval);
163 }
164
165 /* Call after ir3_reg_interval_remove_temp() to reinsert the interval */
166 static void
167 ir3_reg_interval_reinsert(struct ir3_reg_ctx *ctx,
168 struct ir3_reg_interval *interval)
169 {
170 interval->parent = NULL;
171 interval_insert(ctx, &ctx->intervals, interval);
172 }
173
174 void
175 ir3_reg_interval_remove(struct ir3_reg_ctx *ctx,
176 struct ir3_reg_interval *interval)
177 {
178 assert(interval->inserted);
179
180 if (interval->parent) {
181 rb_tree_remove(&interval->parent->children, &interval->node);
182 } else {
183 ctx->interval_delete(ctx, interval);
184 rb_tree_remove(&ctx->intervals, &interval->node);
185 }
186
187 rb_tree_foreach_safe (struct ir3_reg_interval, child, &interval->children,
188 node) {
189 rb_tree_remove(&interval->children, &child->node);
190 child->parent = interval->parent;
191
192 if (interval->parent) {
193 rb_tree_insert(&child->parent->children, &child->node,
194 ir3_reg_interval_insert_cmp);
195 } else {
196 ctx->interval_readd(ctx, interval, child);
197 rb_tree_insert(&ctx->intervals, &child->node,
198 ir3_reg_interval_insert_cmp);
199 }
200 }
201
202 interval->inserted = false;
203 }
204
205 static void
206 _mark_free(struct ir3_reg_interval *interval)
207 {
208 interval->inserted = false;
209 rb_tree_foreach (struct ir3_reg_interval, child, &interval->children, node) {
210 _mark_free(child);
211 }
212 }
213
214 /* Remove an interval and all its children from the tree. */
215 void
216 ir3_reg_interval_remove_all(struct ir3_reg_ctx *ctx,
217 struct ir3_reg_interval *interval)
218 {
219 assert(!interval->parent);
220
221 ctx->interval_delete(ctx, interval);
222 rb_tree_remove(&ctx->intervals, &interval->node);
223 _mark_free(interval);
224 }
225
226 /* Used when popping an interval to be shuffled around. Don't disturb children
227 * so that it can be later reinserted.
228 */
229 static void
230 ir3_reg_interval_remove_temp(struct ir3_reg_ctx *ctx,
231 struct ir3_reg_interval *interval)
232 {
233 assert(!interval->parent);
234
235 ctx->interval_delete(ctx, interval);
236 rb_tree_remove(&ctx->intervals, &interval->node);
237 }
238
239 static void
240 interval_dump(struct log_stream *stream, struct ir3_reg_interval *interval,
241 unsigned indent)
242 {
243 for (unsigned i = 0; i < indent; i++)
244 mesa_log_stream_printf(stream, "\t");
245 mesa_log_stream_printf(stream, "reg %u start %u\n", interval->reg->name,
246 interval->reg->interval_start);
247
248 rb_tree_foreach (struct ir3_reg_interval, child, &interval->children, node) {
249 interval_dump(stream, child, indent + 1);
250 }
251
252 for (unsigned i = 0; i < indent; i++)
253 mesa_log_stream_printf(stream, "\t");
254 mesa_log_stream_printf(stream, "reg %u end %u\n", interval->reg->name,
255 interval->reg->interval_end);
256 }
257
258 void
259 ir3_reg_interval_dump(struct log_stream *stream, struct ir3_reg_interval *interval)
260 {
261 interval_dump(stream, interval, 0);
262 }
263
264 /* These are the core data structures used by the register allocator. First
265 * ra_interval and ra_file, which are used for intra-block tracking and use
266 * the ir3_reg_ctx infrastructure:
267 */
268
269 struct ra_interval {
270 struct ir3_reg_interval interval;
271
272 struct rb_node physreg_node;
273 physreg_t physreg_start, physreg_end;
274
275 /* True if this is a source of the current instruction which is entirely
276 * killed. This means we can allocate the dest over it, but we can't break
277 * it up.
278 */
279 bool is_killed;
280
281 /* True if this interval cannot be moved from its position. This is only
282 * used for precolored inputs to ensure that other inputs don't get
283 * allocated on top of them.
284 */
285 bool frozen;
286 };
287
288 struct ra_file {
289 struct ir3_reg_ctx reg_ctx;
290
291 BITSET_DECLARE(available, RA_MAX_FILE_SIZE);
292 BITSET_DECLARE(available_to_evict, RA_MAX_FILE_SIZE);
293
294 struct rb_tree physreg_intervals;
295
296 unsigned size;
297 unsigned start;
298 };
299
300 /* State for inter-block tracking. When we split a live range to make space
301 * for a vector, we may need to insert fixup code when a block has multiple
302 * predecessors that have moved the same live value to different registers.
303 * This keeps track of state required to do that.
304 */
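/* For example (illustrative, with made-up registers): if predecessor A leaves
 * a live value in r0.x but predecessor B shuffled it to r2.x while making room
 * for a vector, the value must be copied back into a single agreed-upon
 * register on one of the incoming edges. The "renames" map below records where
 * each definition ended up in each predecessor so such mismatches can be
 * detected and fixed up.
 */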
305
306 struct ra_block_state {
307 /* Map of defining ir3_register -> physreg it was allocated to at the end
308 * of the block.
309 */
310 struct hash_table *renames;
311
312 /* For loops, we need to process a block before all its predecessors have
313 * been processed. In particular, we need to pick registers for values
314 * without knowing if all the predecessors have been renamed. This keeps
315 * track of the registers we chose so that when we visit the back-edge we
316 * can move them appropriately. If all predecessors have been visited
317 * before this block is visited then we don't need to fill this out. This
318 * is a map from ir3_register -> physreg.
319 */
320 struct hash_table *entry_regs;
321
322 /* True if the block has been visited and "renames" is complete.
323 */
324 bool visited;
325 };
326
327 struct ra_parallel_copy {
328 struct ra_interval *interval;
329 physreg_t src;
330 };
331
332 /* The main context: */
333
334 struct ra_ctx {
335 /* r0.x - r47.w. On a6xx with merged-regs, hr0.x-hr47.w go into the bottom
336 * half of this file too.
337 */
338 struct ra_file full;
339
340 /* hr0.x - hr63.w, only used without merged-regs. */
341 struct ra_file half;
342
343 /* Shared regs. */
344 struct ra_file shared;
345
346 struct ir3_liveness *live;
347
348 struct ir3_block *block;
349
350 const struct ir3_compiler *compiler;
351 gl_shader_stage stage;
352
353 /* Pending moves of top-level intervals that will be emitted once we're
354 * finished:
355 */
356 DECLARE_ARRAY(struct ra_parallel_copy, parallel_copies);
357
358 struct ra_interval *intervals;
359 struct ra_block_state *blocks;
360
361 bool merged_regs;
362 };
363
364 #define foreach_interval(interval, file) \
365 rb_tree_foreach (struct ra_interval, interval, &(file)->physreg_intervals, \
366 physreg_node)
367 #define foreach_interval_rev(interval, file) \
368    rb_tree_foreach_rev (struct ra_interval, interval, &(file)->physreg_intervals, \
369 physreg_node)
370 #define foreach_interval_safe(interval, file) \
371 rb_tree_foreach_safe (struct ra_interval, interval, \
372 &(file)->physreg_intervals, physreg_node)
373 #define foreach_interval_rev_safe(interval, file) \
374 rb_tree_foreach_rev_safe(struct ra_interval, interval, \
375 &(file)->physreg_intervals, physreg_node)
376
377 static struct ra_interval *
378 rb_node_to_interval(struct rb_node *node)
379 {
380 return rb_node_data(struct ra_interval, node, physreg_node);
381 }
382
383 static const struct ra_interval *
384 rb_node_to_interval_const(const struct rb_node *node)
385 {
386 return rb_node_data(struct ra_interval, node, physreg_node);
387 }
388
389 static struct ra_interval *
390 ra_interval_next(struct ra_interval *interval)
391 {
392 struct rb_node *next = rb_node_next(&interval->physreg_node);
393 return next ? rb_node_to_interval(next) : NULL;
394 }
395
396 static struct ra_interval *
397 ra_interval_next_or_null(struct ra_interval *interval)
398 {
399 return interval ? ra_interval_next(interval) : NULL;
400 }
401
402 static int
403 ra_interval_cmp(const struct rb_node *node, const void *data)
404 {
405 physreg_t reg = *(const physreg_t *)data;
406 const struct ra_interval *interval = rb_node_to_interval_const(node);
407 if (interval->physreg_start > reg)
408 return -1;
409 else if (interval->physreg_end <= reg)
410 return 1;
411 else
412 return 0;
413 }
414
415 static struct ra_interval *
416 ra_interval_search_sloppy(struct rb_tree *tree, physreg_t reg)
417 {
418    struct rb_node *node = rb_tree_search_sloppy(tree, &reg, ra_interval_cmp);
419 return node ? rb_node_to_interval(node) : NULL;
420 }
421
422 /* Get the interval covering the reg, or the closest to the right if it
423 * doesn't exist.
424 */
425 static struct ra_interval *
426 ra_interval_search_right(struct rb_tree *tree, physreg_t reg)
427 {
428 struct ra_interval *interval = ra_interval_search_sloppy(tree, reg);
429 if (!interval) {
430 return NULL;
431 } else if (interval->physreg_end > reg) {
432 return interval;
433 } else {
434       /* There is no interval covering reg, and ra_interval_search_sloppy()
435 * returned the closest range to the left, so the next interval to the
436 * right should be the closest to the right.
437 */
438 return ra_interval_next_or_null(interval);
439 }
440 }
441
442 static struct ra_interval *
443 ra_file_search_right(struct ra_file *file, physreg_t reg)
444 {
445 return ra_interval_search_right(&file->physreg_intervals, reg);
446 }
447
448 static int
449 ra_interval_insert_cmp(const struct rb_node *_a, const struct rb_node *_b)
450 {
451 const struct ra_interval *a = rb_node_to_interval_const(_a);
452 const struct ra_interval *b = rb_node_to_interval_const(_b);
453 return b->physreg_start - a->physreg_start;
454 }
455
456 static struct ra_interval *
457 ir3_reg_interval_to_ra_interval(struct ir3_reg_interval *interval)
458 {
459 return rb_node_data(struct ra_interval, interval, interval);
460 }
461
462 static struct ra_file *
463 ir3_reg_ctx_to_file(struct ir3_reg_ctx *ctx)
464 {
465 return rb_node_data(struct ra_file, ctx, reg_ctx);
466 }
467
468 static void
469 interval_add(struct ir3_reg_ctx *ctx, struct ir3_reg_interval *_interval)
470 {
471 struct ra_interval *interval = ir3_reg_interval_to_ra_interval(_interval);
472 struct ra_file *file = ir3_reg_ctx_to_file(ctx);
473
474 /* We can assume in this case that physreg_start/physreg_end is already
475 * initialized.
476 */
477 for (physreg_t i = interval->physreg_start; i < interval->physreg_end; i++) {
478 BITSET_CLEAR(file->available, i);
479 BITSET_CLEAR(file->available_to_evict, i);
480 }
481
482 rb_tree_insert(&file->physreg_intervals, &interval->physreg_node,
483 ra_interval_insert_cmp);
484 }
485
486 static void
487 interval_delete(struct ir3_reg_ctx *ctx, struct ir3_reg_interval *_interval)
488 {
489 struct ra_interval *interval = ir3_reg_interval_to_ra_interval(_interval);
490 struct ra_file *file = ir3_reg_ctx_to_file(ctx);
491
492 for (physreg_t i = interval->physreg_start; i < interval->physreg_end; i++) {
493 BITSET_SET(file->available, i);
494 BITSET_SET(file->available_to_evict, i);
495 }
496
497 rb_tree_remove(&file->physreg_intervals, &interval->physreg_node);
498 }
499
500 static void
501 interval_readd(struct ir3_reg_ctx *ctx, struct ir3_reg_interval *_parent,
502 struct ir3_reg_interval *_child)
503 {
504 struct ra_interval *parent = ir3_reg_interval_to_ra_interval(_parent);
505 struct ra_interval *child = ir3_reg_interval_to_ra_interval(_child);
506
507 child->physreg_start =
508 parent->physreg_start + (child->interval.reg->interval_start -
509 parent->interval.reg->interval_start);
510 child->physreg_end =
511 child->physreg_start +
512 (child->interval.reg->interval_end - child->interval.reg->interval_start);
513
514 interval_add(ctx, _child);
515 }
516
517 static void
518 ra_file_init(struct ra_file *file)
519 {
520 for (unsigned i = 0; i < file->size; i++) {
521 BITSET_SET(file->available, i);
522 BITSET_SET(file->available_to_evict, i);
523 }
524
525 rb_tree_init(&file->reg_ctx.intervals);
526 rb_tree_init(&file->physreg_intervals);
527
528 file->reg_ctx.interval_add = interval_add;
529 file->reg_ctx.interval_delete = interval_delete;
530 file->reg_ctx.interval_readd = interval_readd;
531 }
532
533 static void
534 ra_file_insert(struct ra_file *file, struct ra_interval *interval)
535 {
536 assert(interval->physreg_start < interval->physreg_end);
537 assert(interval->physreg_end <= file->size);
538 if (interval->interval.reg->flags & IR3_REG_HALF)
539 assert(interval->physreg_end <= RA_HALF_SIZE);
540
541 ir3_reg_interval_insert(&file->reg_ctx, &interval->interval);
542 }
543
544 static void
545 ra_file_remove(struct ra_file *file, struct ra_interval *interval)
546 {
547 ir3_reg_interval_remove(&file->reg_ctx, &interval->interval);
548 }
549
550 static void
551 ra_file_mark_killed(struct ra_file *file, struct ra_interval *interval)
552 {
553 assert(!interval->interval.parent);
554
555 for (physreg_t i = interval->physreg_start; i < interval->physreg_end; i++) {
556 BITSET_SET(file->available, i);
557 }
558
559 interval->is_killed = true;
560 }
561
562 static void
563 ra_file_unmark_killed(struct ra_file *file, struct ra_interval *interval)
564 {
565 assert(!interval->interval.parent);
566
567 for (physreg_t i = interval->physreg_start; i < interval->physreg_end; i++) {
568 BITSET_CLEAR(file->available, i);
569 }
570
571 interval->is_killed = false;
572 }
573
574 static physreg_t
575 ra_interval_get_physreg(const struct ra_interval *interval)
576 {
577 unsigned child_start = interval->interval.reg->interval_start;
578
579 while (interval->interval.parent) {
580 interval = ir3_reg_interval_to_ra_interval(interval->interval.parent);
581 }
582
583 return interval->physreg_start +
584 (child_start - interval->interval.reg->interval_start);
585 }
586
587 static unsigned
588 ra_interval_get_num(const struct ra_interval *interval)
589 {
590 return ra_physreg_to_num(ra_interval_get_physreg(interval),
591 interval->interval.reg->flags);
592 }
593
594 static void
595 ra_interval_init(struct ra_interval *interval, struct ir3_register *reg)
596 {
597 ir3_reg_interval_init(&interval->interval, reg);
598 interval->is_killed = false;
599 interval->frozen = false;
600 }
601
602 static void
603 ra_interval_dump(struct log_stream *stream, struct ra_interval *interval)
604 {
605 mesa_log_stream_printf(stream, "physreg %u ", interval->physreg_start);
606
607 ir3_reg_interval_dump(stream, &interval->interval);
608 }
609
610 static void
611 ra_file_dump(struct log_stream *stream, struct ra_file *file)
612 {
613 rb_tree_foreach (struct ra_interval, interval, &file->physreg_intervals,
614 physreg_node) {
615 ra_interval_dump(stream, interval);
616 }
617
618 unsigned start, end;
619 mesa_log_stream_printf(stream, "available:\n");
620 BITSET_FOREACH_RANGE (start, end, file->available, file->size) {
621 mesa_log_stream_printf(stream, "%u-%u ", start, end);
622 }
623 mesa_log_stream_printf(stream, "\n");
624
625 mesa_log_stream_printf(stream, "available to evict:\n");
626 BITSET_FOREACH_RANGE (start, end, file->available_to_evict, file->size) {
627 mesa_log_stream_printf(stream, "%u-%u ", start, end);
628 }
629 mesa_log_stream_printf(stream, "\n");
630 mesa_log_stream_printf(stream, "start: %u\n", file->start);
631 }
632
633 static void
634 ra_ctx_dump(struct ra_ctx *ctx)
635 {
636 struct log_stream *stream = mesa_log_streami();
637 mesa_log_stream_printf(stream, "full:\n");
638 ra_file_dump(stream, &ctx->full);
639 mesa_log_stream_printf(stream, "half:\n");
640 ra_file_dump(stream, &ctx->half);
641 mesa_log_stream_printf(stream, "shared:\n");
642 ra_file_dump(stream, &ctx->shared);
643 mesa_log_stream_destroy(stream);
644 }
645
646 static unsigned
647 reg_file_size(struct ra_file *file, struct ir3_register *reg)
648 {
649 /* Half-regs can only take up the first half of the combined regfile */
650 if (reg->flags & IR3_REG_HALF) {
651 if (reg->flags & IR3_REG_SHARED)
652 return RA_SHARED_HALF_SIZE;
653 else
654 return MIN2(file->size, RA_HALF_SIZE);
655 } else {
656 return file->size;
657 }
658 }
659
660 /* ra_pop_interval/ra_push_interval provide an API to shuffle around multiple
661 * top-level intervals at once. Pop multiple intervals, then push them back in
662 * any order.
663 */
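/* A hypothetical shuffle of two intervals might look like:
 *
 *    struct ra_removed_interval a = ra_pop_interval(ctx, file, interval_a);
 *    struct ra_removed_interval b = ra_pop_interval(ctx, file, interval_b);
 *    ra_push_interval(ctx, file, &b, new_physreg_for_b);
 *    ra_push_interval(ctx, file, &a, new_physreg_for_a);
 *
 * ra_move_interval() below is the single-interval pop-then-push special case,
 * and compress_regs_left() uses the same pattern for many intervals at once.
 */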
664
665 struct ra_removed_interval {
666 struct ra_interval *interval;
667 unsigned size;
668 };
669
670 static struct ra_removed_interval
671 ra_pop_interval(struct ra_ctx *ctx, struct ra_file *file,
672 struct ra_interval *interval)
673 {
674 assert(!interval->interval.parent);
675 /* shared live splitting is not allowed! */
676 assert(!(interval->interval.reg->flags & IR3_REG_SHARED));
677
678 /* Check if we've already moved this reg before */
679 unsigned pcopy_index;
680 for (pcopy_index = 0; pcopy_index < ctx->parallel_copies_count;
681 pcopy_index++) {
682 if (ctx->parallel_copies[pcopy_index].interval == interval)
683 break;
684 }
685
686 if (pcopy_index == ctx->parallel_copies_count) {
687 array_insert(ctx, ctx->parallel_copies,
688 (struct ra_parallel_copy){
689 .interval = interval,
690 .src = interval->physreg_start,
691 });
692 }
693
694 ir3_reg_interval_remove_temp(&file->reg_ctx, &interval->interval);
695
696 return (struct ra_removed_interval){
697 .interval = interval,
698 .size = interval->physreg_end - interval->physreg_start,
699 };
700 }
701
702 static void
703 ra_push_interval(struct ra_ctx *ctx, struct ra_file *file,
704 const struct ra_removed_interval *removed, physreg_t dst)
705 {
706 struct ra_interval *interval = removed->interval;
707
708 interval->physreg_start = dst;
709 interval->physreg_end = dst + removed->size;
710
711 assert(interval->physreg_end <= file->size);
712 if (interval->interval.reg->flags & IR3_REG_HALF)
713 assert(interval->physreg_end <= RA_HALF_SIZE);
714
715 ir3_reg_interval_reinsert(&file->reg_ctx, &interval->interval);
716 }
717
718 /* Pick up the interval and place it at "dst". */
719 static void
720 ra_move_interval(struct ra_ctx *ctx, struct ra_file *file,
721 struct ra_interval *interval, physreg_t dst)
722 {
723 struct ra_removed_interval temp = ra_pop_interval(ctx, file, interval);
724 ra_push_interval(ctx, file, &temp, dst);
725 }
726
727 static struct ra_file *
728 ra_get_file(struct ra_ctx *ctx, struct ir3_register *reg)
729 {
730 if (reg->flags & IR3_REG_SHARED)
731 return &ctx->shared;
732 else if (ctx->merged_regs || !(reg->flags & IR3_REG_HALF))
733 return &ctx->full;
734 else
735 return &ctx->half;
736 }
737
738
739 /* Returns true if the proposed spot for "dst" or a killed source overlaps a
740 * destination that's been allocated.
741 */
742 static bool
743 check_dst_overlap(struct ra_ctx *ctx, struct ra_file *file,
744 struct ir3_register *dst, physreg_t start,
745 physreg_t end)
746 {
747 struct ir3_instruction *instr = dst->instr;
748
749 ra_foreach_dst (other_dst, instr) {
750 /* We assume only destinations before the current one have been allocated.
751 */
752 if (other_dst == dst)
753 break;
754
755 if (ra_get_file(ctx, other_dst) != file)
756 continue;
757
758 struct ra_interval *other_interval = &ctx->intervals[other_dst->name];
759 assert(!other_interval->interval.parent);
760 physreg_t other_start = other_interval->physreg_start;
761 physreg_t other_end = other_interval->physreg_end;
762
763 if (other_end > start && end > other_start)
764 return true;
765 }
766
767 return false;
768 }
769
770 /* True if the destination is "early-clobber," meaning that it cannot be
771 * allocated over killed sources. Some destinations always require it, but it
772 * also is implicitly true for tied destinations whose source is live-through.
773 * If the source is killed, then we skip allocating a register for the
774 * destination altogether so we don't need to worry about that case here.
775 */
776 static bool
777 is_early_clobber(struct ir3_register *reg)
778 {
779 return (reg->flags & IR3_REG_EARLY_CLOBBER) || reg->tied;
780 }
781
782 static bool
783 get_reg_specified(struct ra_ctx *ctx, struct ra_file *file,
784 struct ir3_register *reg, physreg_t physreg, bool is_source)
785 {
786 for (unsigned i = 0; i < reg_size(reg); i++) {
787 if (!BITSET_TEST(is_early_clobber(reg) || is_source ?
788 file->available_to_evict : file->available,
789 physreg + i))
790 return false;
791 }
792
793 if (!is_source &&
794 check_dst_overlap(ctx, file, reg, physreg, physreg + reg_size(reg)))
795 return false;
796
797 return true;
798 }
799
800 /* Try to evict any registers conflicting with the proposed spot "physreg" for
801 * "reg". That is, move them to other places so that we can allocate "physreg"
802 * here.
803 */
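/* Sketch of the two strategies used below (register numbers are made up): a
 * conflicting live-through interval sitting at, say, r1.y is first moved into
 * any free gap large enough to hold it; if no such gap exists, it may instead
 * be swapped with a killed source of the same size, which generates swaps
 * rather than moves and is therefore counted as twice the eviction cost.
 */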
804
805 static bool
806 try_evict_regs(struct ra_ctx *ctx, struct ra_file *file,
807 struct ir3_register *reg, physreg_t physreg,
808 unsigned *_eviction_count, bool is_source, bool speculative)
809 {
810 BITSET_DECLARE(available_to_evict, RA_MAX_FILE_SIZE);
811 memcpy(available_to_evict, file->available_to_evict,
812 sizeof(available_to_evict));
813
814 BITSET_DECLARE(available, RA_MAX_FILE_SIZE);
815 memcpy(available, file->available, sizeof(available));
816
817 for (unsigned i = 0; i < reg_size(reg); i++) {
818 BITSET_CLEAR(available_to_evict, physreg + i);
819 BITSET_CLEAR(available, physreg + i);
820 }
821
822 unsigned eviction_count = 0;
823 /* Iterate over each range conflicting with physreg */
824 for (struct ra_interval *conflicting = ra_file_search_right(file, physreg),
825 *next = ra_interval_next_or_null(conflicting);
826 conflicting != NULL &&
827 conflicting->physreg_start < physreg + reg_size(reg);
828 conflicting = next, next = ra_interval_next_or_null(next)) {
829 if (!is_early_clobber(reg) && !is_source && conflicting->is_killed)
830 continue;
831
832 if (conflicting->frozen) {
833 assert(speculative);
834 return false;
835 }
836
837 unsigned conflicting_file_size =
838 reg_file_size(file, conflicting->interval.reg);
839 unsigned avail_start, avail_end;
840 bool evicted = false;
841 BITSET_FOREACH_RANGE (avail_start, avail_end, available_to_evict,
842 conflicting_file_size) {
843 unsigned size = avail_end - avail_start;
844
845 /* non-half registers must be aligned */
846 if (!(conflicting->interval.reg->flags & IR3_REG_HALF) &&
847 avail_start % 2 == 1) {
848 avail_start++;
849 size--;
850 }
851
852 unsigned conflicting_size =
853 conflicting->physreg_end - conflicting->physreg_start;
854 if (size >= conflicting_size &&
855 (is_source ||
856 !check_dst_overlap(ctx, file, reg, avail_start,
857 avail_start + conflicting_size))) {
858 for (unsigned i = 0;
859 i < conflicting->physreg_end - conflicting->physreg_start; i++)
860 BITSET_CLEAR(available_to_evict, avail_start + i);
861 eviction_count +=
862 conflicting->physreg_end - conflicting->physreg_start;
863 if (!speculative)
864 ra_move_interval(ctx, file, conflicting, avail_start);
865 evicted = true;
866 break;
867 }
868 }
869
870 if (evicted)
871 continue;
872
873 /* If we couldn't evict this range, but the register we're allocating is
874 * allowed to overlap with a killed range, then we may be able to swap it
875        * with a killed range to achieve the same effect.
876 */
877 if (is_early_clobber(reg) || is_source)
878 return false;
879
880 foreach_interval (killed, file) {
881 if (!killed->is_killed)
882 continue;
883
884 if (killed->physreg_end - killed->physreg_start !=
885 conflicting->physreg_end - conflicting->physreg_start)
886 continue;
887
888 if (killed->physreg_end > conflicting_file_size ||
889 conflicting->physreg_end > reg_file_size(file, killed->interval.reg))
890 continue;
891
892 /* We can't swap the killed range if it partially/fully overlaps the
893 * space we're trying to allocate or (in speculative mode) if it's
894 * already been swapped and will overlap when we actually evict.
895 */
896 bool killed_available = true;
897 for (unsigned i = killed->physreg_start; i < killed->physreg_end; i++) {
898 if (!BITSET_TEST(available, i)) {
899 killed_available = false;
900 break;
901 }
902 }
903
904 if (!killed_available)
905 continue;
906
907 if (check_dst_overlap(ctx, file, reg, killed->physreg_start,
908 killed->physreg_end))
909 continue;
910
911 /* Check for alignment if one is a full reg */
912 if ((!(killed->interval.reg->flags & IR3_REG_HALF) ||
913 !(conflicting->interval.reg->flags & IR3_REG_HALF)) &&
914 (killed->physreg_start % 2 != 0 ||
915 conflicting->physreg_start % 2 != 0))
916 continue;
917
918 for (unsigned i = killed->physreg_start; i < killed->physreg_end; i++) {
919 BITSET_CLEAR(available, i);
920 }
921 /* Because this will generate swaps instead of moves, multiply the
922 * cost by 2.
923 */
924 eviction_count += (killed->physreg_end - killed->physreg_start) * 2;
925 if (!speculative) {
926 physreg_t killed_start = killed->physreg_start,
927 conflicting_start = conflicting->physreg_start;
928 struct ra_removed_interval killed_removed =
929 ra_pop_interval(ctx, file, killed);
930 struct ra_removed_interval conflicting_removed =
931 ra_pop_interval(ctx, file, conflicting);
932 ra_push_interval(ctx, file, &killed_removed, conflicting_start);
933 ra_push_interval(ctx, file, &conflicting_removed, killed_start);
934 }
935
936 evicted = true;
937 break;
938 }
939
940 if (!evicted)
941 return false;
942 }
943
944 *_eviction_count = eviction_count;
945 return true;
946 }
947
948 static int
949 removed_interval_cmp(const void *_i1, const void *_i2)
950 {
951 const struct ra_removed_interval *i1 = _i1;
952 const struct ra_removed_interval *i2 = _i2;
953
954 /* We sort the registers as follows:
955 *
956  * |------------------------------------------------------------------------------------------|
957  * |               |                    |        |        |                    |              |
958  * |  Half         | Half early-clobber | Half   | Full   | Full early-clobber | Full         |
959  * |  live-through | destination        | killed | killed | destination        | live-through |
960  * |               |                    |        |        |                    |              |
961  * |------------------------------------------------------------------------------------------|
962  *                                      |                 |
963  *                                      |   Destination   |
964  *                                      |                 |
965  *                                      |-----------------|
966 *
967 * Half-registers have to be first so that they stay in the low half of
968 * the register file. Then half and full killed must stay together so that
969 * there's a contiguous range where we can put the register. With this
970     * structure we should be able to accommodate any collection of intervals
971 * such that the total number of half components is within the half limit
972 * and the combined components are within the full limit.
973 */
974
975 unsigned i1_align = reg_elem_size(i1->interval->interval.reg);
976 unsigned i2_align = reg_elem_size(i2->interval->interval.reg);
977 if (i1_align > i2_align)
978 return 1;
979 if (i1_align < i2_align)
980 return -1;
981
982 if (i1_align == 1) {
983 if (i2->interval->is_killed)
984 return -1;
985 if (i1->interval->is_killed)
986 return 1;
987 } else {
988 if (i2->interval->is_killed)
989 return 1;
990 if (i1->interval->is_killed)
991 return -1;
992 }
993
994 return 0;
995 }
996
997 static int
998 dsts_cmp(const void *_i1, const void *_i2)
999 {
1000 struct ir3_register *i1 = *(struct ir3_register *const *) _i1;
1001 struct ir3_register *i2 = *(struct ir3_register *const *) _i2;
1002
1003 /* Treat tied destinations as-if they are live-through sources, and normal
1004 * destinations as killed sources.
1005 */
1006 unsigned i1_align = reg_elem_size(i1);
1007 unsigned i2_align = reg_elem_size(i2);
1008 if (i1_align > i2_align)
1009 return 1;
1010 if (i1_align < i2_align)
1011 return -1;
1012
1013 if (i1_align == 1) {
1014 if (!is_early_clobber(i2))
1015 return -1;
1016 if (!is_early_clobber(i1))
1017 return 1;
1018 } else {
1019 if (!is_early_clobber(i2))
1020 return 1;
1021 if (!is_early_clobber(i1))
1022 return -1;
1023 }
1024
1025 return 0;
1026 }
1027
1028 /* "Compress" all the live intervals so that there is enough space for the
1029 * destination register. As there can be gaps when a more-aligned interval
1030 * follows a less-aligned interval, this also sorts them to remove such
1031 * "padding", which may be required when space is very tight. This isn't
1032 * amazing, but should be used only as a last resort in case the register file
1033 * is almost full and badly fragmented.
1034 *
1035 * Return the physreg to use.
1036 */
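/* Rough picture of what happens below (illustrative): every interval to the
 * right of a chosen start point is popped, the popped intervals and the
 * not-yet-allocated destinations are sorted according to removed_interval_cmp()
 * and dsts_cmp(), and everything is pushed back tightly packed, with killed
 * sources and non-early-clobber destinations allowed to overlap. The physreg
 * left for "reg" after this repacking is returned.
 */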
1037 static physreg_t
1038 compress_regs_left(struct ra_ctx *ctx, struct ra_file *file,
1039 struct ir3_register *reg)
1040 {
1041 unsigned reg_align = reg_elem_size(reg);
1042 DECLARE_ARRAY(struct ra_removed_interval, intervals);
1043 intervals_count = intervals_sz = 0;
1044 intervals = NULL;
1045
1046 DECLARE_ARRAY(struct ir3_register *, dsts);
1047 dsts_count = dsts_sz = 0;
1048 dsts = NULL;
1049 array_insert(ctx, dsts, reg);
1050    bool dst_inserted[reg->instr->dsts_count];
    memset(dst_inserted, 0, sizeof(dst_inserted));
1051
1052 unsigned dst_size = reg->tied ? 0 : reg_size(reg);
1053 unsigned ec_dst_size = is_early_clobber(reg) ? reg_size(reg) : 0;
1054 unsigned half_dst_size = 0, ec_half_dst_size = 0;
1055 if (reg_align == 1) {
1056 half_dst_size = dst_size;
1057 ec_half_dst_size = ec_dst_size;
1058 }
1059
1060 unsigned removed_size = 0, removed_half_size = 0;
1061 unsigned removed_killed_size = 0, removed_killed_half_size = 0;
1062 unsigned file_size = reg_file_size(file, reg);
1063 physreg_t start_reg = 0;
1064
1065 foreach_interval_rev_safe (interval, file) {
1066 /* We'll check if we can compact the intervals starting here. */
1067 physreg_t candidate_start = interval->physreg_end;
1068
1069 /* Check if there are any other destinations we need to compact. */
1070 ra_foreach_dst_n (other_dst, n, reg->instr) {
1071 if (other_dst == reg)
1072 break;
1073 if (ra_get_file(ctx, other_dst) != file)
1074 continue;
1075 if (dst_inserted[n])
1076 continue;
1077
1078 struct ra_interval *other_interval = &ctx->intervals[other_dst->name];
1079 /* if the destination partially overlaps this interval, we need to
1080 * extend candidate_start to the end.
1081 */
1082 if (other_interval->physreg_start < candidate_start) {
1083 candidate_start = MAX2(candidate_start,
1084 other_interval->physreg_end);
1085 continue;
1086 }
1087
1088 dst_inserted[n] = true;
1089
1090 /* dst intervals with a tied killed source are considered attached to
1091 * that source. Don't actually insert them. This means we have to
1092 * update them below if their tied source moves.
1093 */
1094 if (other_dst->tied) {
1095 struct ra_interval *tied_interval =
1096 &ctx->intervals[other_dst->tied->def->name];
1097 if (tied_interval->is_killed)
1098 continue;
1099 }
1100
1101 d("popping destination %u physreg %u\n",
1102 other_interval->interval.reg->name,
1103 other_interval->physreg_start);
1104
1105 array_insert(ctx, dsts, other_dst);
1106 unsigned interval_size = reg_size(other_dst);
1107 if (is_early_clobber(other_dst)) {
1108 ec_dst_size += interval_size;
1109 if (other_interval->interval.reg->flags & IR3_REG_HALF)
1110 ec_half_dst_size += interval_size;
1111 } else {
1112 dst_size += interval_size;
1113 if (other_interval->interval.reg->flags & IR3_REG_HALF)
1114 half_dst_size += interval_size;
1115 }
1116 }
1117
1118 /* Check if we can sort the intervals *after* this one and have enough
1119     * space leftover to accommodate all intervals, keeping in mind that killed
1120 * sources overlap non-tied destinations. Also check that we have enough
1121 * space leftover for half-registers, if we're inserting a half-register
1122 * (otherwise we only shift any half-registers down so they should be
1123 * safe).
1124 */
1125 if (candidate_start + removed_size + ec_dst_size +
1126 MAX2(removed_killed_size, dst_size) <= file->size &&
1127 (reg_align != 1 ||
1128 candidate_start + removed_half_size + ec_half_dst_size +
1129 MAX2(removed_killed_half_size, half_dst_size) <= file_size)) {
1130 start_reg = candidate_start;
1131 break;
1132 }
1133
1134 /* We assume that all frozen intervals are at the start and that we
1135 * can avoid popping them.
1136 */
1137 assert(!interval->frozen);
1138
1139 /* Killed sources are different because they go at the end and can
1140 * overlap the register we're trying to add.
1141 */
1142 unsigned interval_size = interval->physreg_end - interval->physreg_start;
1143 if (interval->is_killed) {
1144 removed_killed_size += interval_size;
1145 if (interval->interval.reg->flags & IR3_REG_HALF)
1146 removed_killed_half_size += interval_size;
1147 } else {
1148 removed_size += interval_size;
1149 if (interval->interval.reg->flags & IR3_REG_HALF)
1150 removed_half_size += interval_size;
1151 }
1152
1153 /* Now that we've done the accounting, pop this off */
1154 d("popping interval %u physreg %u%s\n", interval->interval.reg->name,
1155 interval->physreg_start, interval->is_killed ? ", killed" : "");
1156 array_insert(ctx, intervals, ra_pop_interval(ctx, file, interval));
1157 }
1158
1159 /* TODO: In addition to skipping registers at the beginning that are
1160 * well-packed, we should try to skip registers at the end.
1161 */
1162
1163 qsort(intervals, intervals_count, sizeof(*intervals), removed_interval_cmp);
1164 qsort(dsts, dsts_count, sizeof(*dsts), dsts_cmp);
1165
1166 physreg_t live_reg = start_reg;
1167 physreg_t dst_reg = (physreg_t)~0;
1168 physreg_t ret_reg = (physreg_t)~0;
1169 unsigned dst_index = 0;
1170 unsigned live_index = 0;
1171
1172 /* We have two lists of intervals to process, live intervals and destination
1173    * intervals. Process them in the order of the diagram in removed_interval_cmp().
1174 */
1175 while (live_index < intervals_count || dst_index < dsts_count) {
1176 bool process_dst;
1177 if (live_index == intervals_count) {
1178 process_dst = true;
1179 } else if (dst_index == dsts_count) {
1180 process_dst = false;
1181 } else {
1182 struct ir3_register *dst = dsts[dst_index];
1183 struct ra_interval *live_interval = intervals[live_index].interval;
1184
1185 bool live_half = live_interval->interval.reg->flags & IR3_REG_HALF;
1186 bool live_killed = live_interval->is_killed;
1187 bool dst_half = dst->flags & IR3_REG_HALF;
1188 bool dst_early_clobber = is_early_clobber(dst);
1189
1190 if (live_half && !live_killed) {
1191 /* far-left of diagram. */
1192 process_dst = false;
1193 } else if (dst_half && dst_early_clobber) {
1194 /* mid-left of diagram. */
1195 process_dst = true;
1196 } else if (!dst_early_clobber) {
1197             /* bottom of diagram. */
1198 process_dst = true;
1199 } else if (live_killed) {
1200 /* middle of diagram. */
1201 process_dst = false;
1202 } else if (!dst_half && dst_early_clobber) {
1203 /* mid-right of diagram. */
1204 process_dst = true;
1205 } else {
1206 /* far right of diagram. */
1207 assert(!live_killed && !live_half);
1208 process_dst = false;
1209 }
1210 }
1211
1212 struct ir3_register *cur_reg =
1213 process_dst ? dsts[dst_index] :
1214 intervals[live_index].interval->interval.reg;
1215
1216 physreg_t physreg;
1217 if (process_dst && !is_early_clobber(cur_reg)) {
1218 if (dst_reg == (physreg_t)~0)
1219 dst_reg = live_reg;
1220 physreg = dst_reg;
1221 } else {
1222 physreg = live_reg;
1223 struct ra_interval *live_interval = intervals[live_index].interval;
1224 bool live_killed = live_interval->is_killed;
1225 /* If this is live-through and we've processed the destinations, we
1226 * need to make sure we take into account any overlapping destinations.
1227 */
1228 if (!live_killed && dst_reg != (physreg_t)~0)
1229 physreg = MAX2(physreg, dst_reg);
1230 }
1231
1232 if (!(cur_reg->flags & IR3_REG_HALF))
1233 physreg = ALIGN(physreg, 2);
1234
1235 d("pushing reg %u physreg %u\n", cur_reg->name, physreg);
1236
1237 unsigned interval_size = reg_size(cur_reg);
1238 if (physreg + interval_size >
1239 reg_file_size(file, cur_reg)) {
1240 d("ran out of room for interval %u!\n",
1241 cur_reg->name);
1242 unreachable("reg pressure calculation was wrong!");
1243 return 0;
1244 }
1245
1246 if (process_dst) {
1247 if (cur_reg == reg) {
1248 ret_reg = physreg;
1249 } else {
1250 struct ra_interval *interval = &ctx->intervals[cur_reg->name];
1251 interval->physreg_start = physreg;
1252 interval->physreg_end = physreg + interval_size;
1253 }
1254 dst_index++;
1255 } else {
1256 ra_push_interval(ctx, file, &intervals[live_index], physreg);
1257 live_index++;
1258 }
1259
1260 physreg += interval_size;
1261
1262 if (process_dst && !is_early_clobber(cur_reg)) {
1263 dst_reg = physreg;
1264 } else {
1265 live_reg = physreg;
1266 }
1267 }
1268
1269 /* If we shuffled around a tied source that is killed, we may have to update
1270 * its corresponding destination since we didn't insert it above.
1271 */
1272 ra_foreach_dst (dst, reg->instr) {
1273 if (dst == reg)
1274 break;
1275
1276 struct ir3_register *tied = dst->tied;
1277 if (!tied)
1278 continue;
1279
1280 struct ra_interval *tied_interval = &ctx->intervals[tied->def->name];
1281 if (!tied_interval->is_killed)
1282 continue;
1283
1284 struct ra_interval *dst_interval = &ctx->intervals[dst->name];
1285 unsigned dst_size = reg_size(dst);
1286 dst_interval->physreg_start = ra_interval_get_physreg(tied_interval);
1287 dst_interval->physreg_end = dst_interval->physreg_start + dst_size;
1288 }
1289
1290 return ret_reg;
1291 }
1292
1293 void
1294 ra_update_affinity(unsigned file_size, struct ir3_register *reg,
1295 physreg_t physreg)
1296 {
1297 if (!reg->merge_set || reg->merge_set->preferred_reg != (physreg_t)~0)
1298 return;
1299
1300 if (physreg < reg->merge_set_offset)
1301 return;
1302
1303 if ((physreg - reg->merge_set_offset + reg->merge_set->size) > file_size)
1304 return;
1305
1306 reg->merge_set->preferred_reg = physreg - reg->merge_set_offset;
1307 }
1308
1309 /* Try to find free space for a register without shuffling anything. This uses
1310 * a round-robin algorithm to reduce false dependencies.
1311 */
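/* Worked example (made-up numbers): with file_size = 8, size = 2, alignment =
 * 2 and file->start = 6, the loop below tries candidates 6, 0, 2 and 4 in that
 * order, picks the first aligned gap that is fully available and doesn't
 * overlap an already-allocated destination, and then advances file->start past
 * the chosen gap so the next allocation starts searching after it.
 */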
1312 static physreg_t
1313 find_best_gap(struct ra_ctx *ctx, struct ra_file *file,
1314 struct ir3_register *dst, unsigned file_size, unsigned size,
1315 unsigned alignment)
1316 {
1317 /* This can happen if we create a very large merge set. Just bail out in that
1318 * case.
1319 */
1320 if (size > file_size)
1321 return (physreg_t) ~0;
1322
1323 BITSET_WORD *available =
1324 is_early_clobber(dst) ? file->available_to_evict : file->available;
1325
1326 unsigned start = ALIGN(file->start, alignment) % (file_size - size + alignment);
1327 unsigned candidate = start;
1328 do {
1329 bool is_available = true;
1330 for (unsigned i = 0; i < size; i++) {
1331 if (!BITSET_TEST(available, candidate + i)) {
1332 is_available = false;
1333 break;
1334 }
1335 }
1336
1337 if (is_available) {
1338 is_available =
1339 !check_dst_overlap(ctx, file, dst, candidate, candidate + size);
1340 }
1341
1342 if (is_available) {
1343 file->start = (candidate + size) % file_size;
1344 return candidate;
1345 }
1346
1347 candidate += alignment;
1348 if (candidate + size > file_size)
1349 candidate = 0;
1350 } while (candidate != start);
1351
1352 return (physreg_t)~0;
1353 }
1354
1355 static physreg_t
1356 try_allocate_src(struct ra_ctx *ctx, struct ra_file *file,
1357 struct ir3_register *reg)
1358 {
1359 unsigned file_size = reg_file_size(file, reg);
1360 unsigned size = reg_size(reg);
1361 for (unsigned i = 0; i < reg->instr->srcs_count; i++) {
1362 struct ir3_register *src = reg->instr->srcs[i];
1363 if (!ra_reg_is_src(src))
1364 continue;
1365 if (ra_get_file(ctx, src) == file && reg_size(src) >= size) {
1366 struct ra_interval *src_interval = &ctx->intervals[src->def->name];
1367 physreg_t src_physreg = ra_interval_get_physreg(src_interval);
1368 if (src_physreg % reg_elem_size(reg) == 0 &&
1369 src_physreg + size <= file_size &&
1370 get_reg_specified(ctx, file, reg, src_physreg, false))
1371 return src_physreg;
1372 }
1373 }
1374
1375 return ~0;
1376 }
1377
1378 static bool
1379 rpt_has_unique_merge_set(struct ir3_instruction *instr)
1380 {
1381 assert(ir3_instr_is_rpt(instr));
1382
1383 if (!instr->dsts[0]->merge_set)
1384 return false;
1385
1386 struct ir3_instruction *first = ir3_instr_first_rpt(instr);
1387 struct ir3_register *def = first->dsts[0];
1388
1389 if (def->merge_set != instr->dsts[0]->merge_set ||
1390 def->merge_set->regs_count != ir3_instr_rpt_length(first)) {
1391 return false;
1392 }
1393
1394 unsigned i = 0;
1395
1396 foreach_instr_rpt (rpt, first) {
1397 if (rpt->dsts[0] != def->merge_set->regs[i++])
1398 return false;
1399 }
1400
1401 return true;
1402 }
1403
1404 /* This is the main entrypoint for picking a register. Pick a free register
1405 * for "reg", shuffling around sources if necessary. In the normal case where
1406 * "is_source" is false, this register can overlap with killed sources
1407 * (intervals with "is_killed == true"). If "is_source" is true, then
1408 * is_killed is ignored and the register returned must not overlap with killed
1409 * sources. This must be used for tied registers, because we're actually
1410 * allocating the destination and the tied source at the same time.
1411 */
1412
1413 static physreg_t
1414 get_reg(struct ra_ctx *ctx, struct ra_file *file, struct ir3_register *reg)
1415 {
1416 unsigned file_size = reg_file_size(file, reg);
1417 if (reg->merge_set && reg->merge_set->preferred_reg != (physreg_t)~0) {
1418 physreg_t preferred_reg =
1419 reg->merge_set->preferred_reg + reg->merge_set_offset;
1420 if (preferred_reg + reg_size(reg) <= file_size &&
1421 preferred_reg % reg_elem_size(reg) == 0 &&
1422 get_reg_specified(ctx, file, reg, preferred_reg, false))
1423 return preferred_reg;
1424 }
1425
1426 /* For repeated instructions whose merge set is unique (i.e., only used for
1427 * these repeated instructions), try to first allocate one of their sources
1428 * (for the same reason as for ALU/SFU instructions explained below). This
1429 * also prevents us from allocating a new register range for this merge set
1430 * when the one from a source could be reused.
1431 */
1432 if (ir3_instr_is_rpt(reg->instr) && rpt_has_unique_merge_set(reg->instr)) {
1433 physreg_t src_reg = try_allocate_src(ctx, file, reg);
1434 if (src_reg != (physreg_t)~0)
1435 return src_reg;
1436 }
1437
1438 /* If this register is a subset of a merge set which we have not picked a
1439 * register for, first try to allocate enough space for the entire merge
1440 * set.
1441 */
1442 unsigned size = reg_size(reg);
1443 if (reg->merge_set && reg->merge_set->preferred_reg == (physreg_t)~0 &&
1444 size < reg->merge_set->size) {
1445 physreg_t best_reg = find_best_gap(ctx, file, reg, file_size,
1446 reg->merge_set->size,
1447 reg->merge_set->alignment);
1448 if (best_reg != (physreg_t)~0u) {
1449 best_reg += reg->merge_set_offset;
1450 return best_reg;
1451 }
1452 }
1453
1454    /* For ALU and SFU instructions, if a src reg is available to pick, use it:
1455     * this doesn't introduce unnecessary dependencies, and it potentially
1456     * avoids needing (ss) syncs for write-after-read hazards on SFU
1457     * instructions.
1458     */
1459 if (is_sfu(reg->instr) || is_alu(reg->instr)) {
1460 physreg_t src_reg = try_allocate_src(ctx, file, reg);
1461 if (src_reg != (physreg_t)~0)
1462 return src_reg;
1463 }
1464
1465 physreg_t best_reg =
1466 find_best_gap(ctx, file, reg, file_size, size, reg_elem_size(reg));
1467 if (best_reg != (physreg_t)~0u) {
1468 return best_reg;
1469 }
1470
1471 /* Ok, we couldn't find anything that fits. Here is where we have to start
1472 * moving things around to make stuff fit. First try solely evicting
1473 * registers in the way.
1474 */
1475 unsigned best_eviction_count = ~0;
1476 for (physreg_t i = 0; i + size <= file_size; i += reg_elem_size(reg)) {
1477 unsigned eviction_count;
1478 if (try_evict_regs(ctx, file, reg, i, &eviction_count, false, true)) {
1479 if (eviction_count < best_eviction_count) {
1480 best_eviction_count = eviction_count;
1481 best_reg = i;
1482 }
1483 }
1484 }
1485
1486 if (best_eviction_count != ~0) {
1487 ASSERTED bool result = try_evict_regs(
1488 ctx, file, reg, best_reg, &best_eviction_count, false, false);
1489 assert(result);
1490 return best_reg;
1491 }
1492
1493 /* Use the dumb fallback only if try_evict_regs() fails. */
1494 return compress_regs_left(ctx, file, reg);
1495 }
1496
1497 static void
1498 assign_reg(struct ir3_instruction *instr, struct ir3_register *reg,
1499 unsigned num)
1500 {
1501 if (reg->flags & IR3_REG_ARRAY) {
1502 reg->array.base = num;
1503 if (reg->flags & IR3_REG_RELATIV)
1504 reg->array.offset += num;
1505 else
1506 reg->num = num + reg->array.offset;
1507 } else {
1508 reg->num = num;
1509 }
1510 }
1511
1512 static void
1513 mark_src_killed(struct ra_ctx *ctx, struct ir3_register *src)
1514 {
1515 struct ra_interval *interval = &ctx->intervals[src->def->name];
1516
1517 if (!(src->flags & IR3_REG_FIRST_KILL) || interval->is_killed ||
1518 interval->interval.parent ||
1519 !rb_tree_is_empty(&interval->interval.children))
1520 return;
1521
1522 ra_file_mark_killed(ra_get_file(ctx, src), interval);
1523 }
1524
1525 static void
1526 insert_dst(struct ra_ctx *ctx, struct ir3_register *dst)
1527 {
1528 struct ra_file *file = ra_get_file(ctx, dst);
1529 struct ra_interval *interval = &ctx->intervals[dst->name];
1530
1531 d("insert dst %u physreg %u", dst->name, ra_interval_get_physreg(interval));
1532
1533 if (!(dst->flags & IR3_REG_UNUSED))
1534 ra_file_insert(file, interval);
1535
1536 assign_reg(dst->instr, dst, ra_interval_get_num(interval));
1537 }
1538
1539 static void
1540 allocate_dst_fixed(struct ra_ctx *ctx, struct ir3_register *dst,
1541 physreg_t physreg)
1542 {
1543 struct ra_file *file = ra_get_file(ctx, dst);
1544 struct ra_interval *interval = &ctx->intervals[dst->name];
1545 ra_update_affinity(file->size, dst, physreg);
1546
1547 ra_interval_init(interval, dst);
1548 interval->physreg_start = physreg;
1549 interval->physreg_end = physreg + reg_size(dst);
1550 }
1551
1552 /* If a tied destination interferes with its source register, we have to insert
1553 * a copy beforehand to copy the source to the destination. Because we are using
1554 * the parallel_copies array and not creating a separate copy, this copy will
1555 * happen in parallel with any shuffling around of the tied source, so we have
1556 * to copy the source *as it exists before it is shuffled around*. We do this by
1557 * inserting the copy early, before any other copies are inserted. We don't
1558 * actually know the destination of the copy, but that's ok because the
1559 * dst_interval will be filled out later.
1560 */
1561 static void
1562 insert_tied_dst_copy(struct ra_ctx *ctx, struct ir3_register *dst)
1563 {
1564 struct ir3_register *tied = dst->tied;
1565
1566 if (!tied)
1567 return;
1568
1569 struct ra_interval *tied_interval = &ctx->intervals[tied->def->name];
1570 struct ra_interval *dst_interval = &ctx->intervals[dst->name];
1571
1572 if (tied_interval->is_killed)
1573 return;
1574
1575 physreg_t tied_physreg = ra_interval_get_physreg(tied_interval);
1576
1577 array_insert(ctx, ctx->parallel_copies,
1578 (struct ra_parallel_copy){
1579 .interval = dst_interval,
1580 .src = tied_physreg,
1581 });
1582 }
1583
1584 static void
1585 allocate_dst(struct ra_ctx *ctx, struct ir3_register *dst)
1586 {
1587 struct ra_file *file = ra_get_file(ctx, dst);
1588
1589 struct ir3_register *tied = dst->tied;
1590 if (tied) {
1591 struct ra_interval *tied_interval = &ctx->intervals[tied->def->name];
1592 if (tied_interval->is_killed) {
1593 /* The easy case: the source is killed, so we can just reuse it
1594 * for the destination.
1595 */
1596 allocate_dst_fixed(ctx, dst, ra_interval_get_physreg(tied_interval));
1597 return;
1598 }
1599 }
1600
1601 /* All the hard work is done by get_reg here. */
1602 physreg_t physreg = get_reg(ctx, file, dst);
1603
1604 allocate_dst_fixed(ctx, dst, physreg);
1605 }
1606
1607 static void
1608 assign_src(struct ra_ctx *ctx, struct ir3_instruction *instr,
1609 struct ir3_register *src)
1610 {
1611 struct ra_interval *interval = &ctx->intervals[src->def->name];
1612 struct ra_file *file = ra_get_file(ctx, src);
1613
1614 struct ir3_register *tied = src->tied;
1615 physreg_t physreg;
1616 if (tied) {
1617 struct ra_interval *tied_interval = &ctx->intervals[tied->name];
1618 physreg = ra_interval_get_physreg(tied_interval);
1619 } else {
1620 physreg = ra_interval_get_physreg(interval);
1621 }
1622
1623 assign_reg(instr, src, ra_physreg_to_num(physreg, src->flags));
1624
1625 if (src->flags & IR3_REG_FIRST_KILL)
1626 ra_file_remove(file, interval);
1627 }
1628
1629 /* Insert a parallel copy instruction before the instruction with the parallel
1630 * copy entries we've built up.
1631 */
1632 static void
1633 insert_parallel_copy_instr(struct ra_ctx *ctx, struct ir3_instruction *instr)
1634 {
1635 if (ctx->parallel_copies_count == 0)
1636 return;
1637
1638 struct ir3_instruction *pcopy =
1639 ir3_instr_create(instr->block, OPC_META_PARALLEL_COPY,
1640 ctx->parallel_copies_count, ctx->parallel_copies_count);
1641
1642 for (unsigned i = 0; i < ctx->parallel_copies_count; i++) {
1643 struct ra_parallel_copy *entry = &ctx->parallel_copies[i];
1644 struct ir3_register *reg =
1645 ir3_dst_create(pcopy, INVALID_REG,
1646 entry->interval->interval.reg->flags &
1647 (IR3_REG_HALF | IR3_REG_ARRAY | IR3_REG_SHARED));
1648 reg->size = entry->interval->interval.reg->size;
1649 reg->wrmask = entry->interval->interval.reg->wrmask;
1650 assign_reg(pcopy, reg, ra_interval_get_num(entry->interval));
1651 }
1652
1653 for (unsigned i = 0; i < ctx->parallel_copies_count; i++) {
1654 struct ra_parallel_copy *entry = &ctx->parallel_copies[i];
1655 struct ir3_register *reg =
1656 ir3_src_create(pcopy, INVALID_REG,
1657 entry->interval->interval.reg->flags &
1658 (IR3_REG_HALF | IR3_REG_ARRAY | IR3_REG_SHARED));
1659 reg->size = entry->interval->interval.reg->size;
1660 reg->wrmask = entry->interval->interval.reg->wrmask;
1661 assign_reg(pcopy, reg, ra_physreg_to_num(entry->src, reg->flags));
1662 }
1663
1664 list_del(&pcopy->node);
1665 list_addtail(&pcopy->node, &instr->node);
1666 ctx->parallel_copies_count = 0;
1667 }
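/* For illustration (register numbers are hypothetical): with two queued
 * entries, the emitted instruction looks like
 *
 *    r2.x, r3.y = parallel_copy r4.x, r2.x
 *
 * i.e. all destinations first (at the intervals' final locations), then all
 * sources (the recorded entry->src physregs). The copies are semantically
 * simultaneous, so overlaps like r2.x above (read as a source, written as a
 * destination) are handled correctly when the pcopy is later lowered.
 */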
1668
1669 static void
1670 handle_normal_instr(struct ra_ctx *ctx, struct ir3_instruction *instr)
1671 {
1672 /* First, mark sources as going-to-be-killed while allocating the dest. */
1673 ra_foreach_src (src, instr) {
1674 mark_src_killed(ctx, src);
1675 }
1676
1677 /* Pre-insert tied dst copies. */
1678 ra_foreach_dst (dst, instr) {
1679 insert_tied_dst_copy(ctx, dst);
1680 }
1681
1682 /* Allocate the destination. */
1683 ra_foreach_dst (dst, instr) {
1684 allocate_dst(ctx, dst);
1685 }
1686
1687 /* Now handle sources. Go backward so that, if there are multiple sources
1688 * with the same def and that def is killed, we only remove it at
1689 * the end.
1690 */
1691 ra_foreach_src_rev (src, instr) {
1692 assign_src(ctx, instr, src);
1693 }
1694
1695 /* Now finally insert the destination into the map. */
1696 ra_foreach_dst (dst, instr) {
1697 insert_dst(ctx, dst);
1698 }
1699
1700 insert_parallel_copy_instr(ctx, instr);
1701 }
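/* For example (hypothetical IR), for
 *
 *    ssa_3 = add.f (kill)ssa_1, (kill)ssa_2
 *
 * ssa_1 and ssa_2 are marked killed before ssa_3 is allocated, so get_reg()
 * is free to place ssa_3 on top of one of them; only afterwards are the
 * source registers actually assigned and removed from the file.
 */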
1702
1703 static void
1704 handle_split(struct ra_ctx *ctx, struct ir3_instruction *instr)
1705 {
1706 struct ir3_register *dst = instr->dsts[0];
1707 struct ir3_register *src = instr->srcs[0];
1708
1709 if (!(dst->flags & IR3_REG_SSA))
1710 return;
1711
1712 if (dst->merge_set == NULL || src->def->merge_set != dst->merge_set) {
1713 handle_normal_instr(ctx, instr);
1714 return;
1715 }
1716
1717 struct ra_interval *src_interval = &ctx->intervals[src->def->name];
1718
1719 physreg_t physreg = ra_interval_get_physreg(src_interval);
1720 assign_src(ctx, instr, src);
1721
1722 allocate_dst_fixed(
1723 ctx, dst, physreg - src->def->merge_set_offset + dst->merge_set_offset);
1724 insert_dst(ctx, dst);
1725 }
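/* Worked example with made-up numbers: if the vector ssa_1 lives at physreg
 * 16 and has merge_set_offset 0, then "ssa_3 = split ssa_1 off:2" (full
 * regs, so dst->merge_set_offset == 4 in half-reg units) is fixed at
 * 16 - 0 + 4 = 20, i.e. exactly on top of component 2 of ssa_1, and no move
 * is needed for coalesced splits.
 */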
1726
1727 static void
1728 handle_collect(struct ra_ctx *ctx, struct ir3_instruction *instr)
1729 {
1730 if (!(instr->dsts[0]->flags & IR3_REG_SSA))
1731 return;
1732
1733 struct ir3_merge_set *dst_set = instr->dsts[0]->merge_set;
1734 unsigned dst_offset = instr->dsts[0]->merge_set_offset;
1735
1736 if (!dst_set || dst_set->regs_count == 1) {
1737 handle_normal_instr(ctx, instr);
1738 return;
1739 }
1740
1741 /* We need to check if any of the sources are contained in an interval
1742 * that is at least as large as the vector. In this case, we should put
1743 * the vector inside that larger interval. (There should be one
1744 * unambiguous place to put it, because values sharing the same merge set
1745 * should be allocated together.) This can happen in a case like:
1746 *
1747 * ssa_1 (wrmask=0xf) = ...
1748 * ssa_2 = split ssa_1 off:0
1749 * ssa_3 = split ssa_1 off:1
1750 * ssa_4 (wrmask=0x3) = collect (kill)ssa_2, (kill)ssa_3
1751 * ... = (kill)ssa_1
1752 * ... = (kill)ssa_4
1753 *
1754 * ssa_4 will be coalesced with ssa_1 and needs to be allocated inside it.
1755 */
1756 physreg_t dst_fixed = (physreg_t)~0u;
1757
1758 ra_foreach_src (src, instr) {
1759 if (src->flags & IR3_REG_FIRST_KILL) {
1760 mark_src_killed(ctx, src);
1761 }
1762
1763 struct ra_interval *interval = &ctx->intervals[src->def->name];
1764
1765 /* We only need special handling if the source's interval overlaps with
1766 * the destination's interval.
1767 */
1768 if (src->def->interval_start >= instr->dsts[0]->interval_end ||
1769 instr->dsts[0]->interval_start >= src->def->interval_end ||
1770 interval->is_killed)
1771 continue;
1772
1773 while (interval->interval.parent != NULL) {
1774 interval = ir3_reg_interval_to_ra_interval(interval->interval.parent);
1775 }
1776 if (reg_size(interval->interval.reg) >= reg_size(instr->dsts[0])) {
1777 dst_fixed = interval->physreg_start -
1778 interval->interval.reg->merge_set_offset + dst_offset;
1779 } else {
1780 /* For sources whose root interval is smaller than the
1781 * destination (i.e. the normal case), we will shuffle them
1782 * around after allocating the destination. Mark them killed so
1783 * that the destination can be allocated over them, even if they
1784 * aren't actually killed.
1785 */
1786 ra_file_mark_killed(ra_get_file(ctx, src), interval);
1787 }
1788 }
1789
1790 if (dst_fixed != (physreg_t)~0u)
1791 allocate_dst_fixed(ctx, instr->dsts[0], dst_fixed);
1792 else
1793 allocate_dst(ctx, instr->dsts[0]);
1794
1795 /* Remove the temporary is_killed we added */
1796 ra_foreach_src (src, instr) {
1797 struct ra_interval *interval = &ctx->intervals[src->def->name];
1798 while (interval->interval.parent != NULL) {
1799 interval = ir3_reg_interval_to_ra_interval(interval->interval.parent);
1800 }
1801
1802 /* Filter out cases where it actually should be killed */
1803 if (interval != &ctx->intervals[src->def->name] ||
1804 !(src->flags & IR3_REG_KILL)) {
1805 ra_file_unmark_killed(ra_get_file(ctx, src), interval);
1806 }
1807 }
1808
1809 ra_foreach_src_rev (src, instr) {
1810 assign_src(ctx, instr, src);
1811 }
1812
1813 /* We need to do this before insert_dst(), so that children of the
1814 * destination which got marked as killed and then shuffled around to make
1815 * space for the destination have the correct pcopy destination that
1816 * matches what we assign the source of the collect to in assign_src().
1817 *
1818 * TODO: In this case we'll wind up copying the value in the pcopy and
1819 * then again in the collect. We could avoid one of those by updating the
1820 * pcopy destination to match up with the final location of the source
1821 * after the collect and making the collect a no-op. However this doesn't
1822 * seem to happen often.
1823 */
1824 insert_parallel_copy_instr(ctx, instr);
1825
1826 /* Note: insert_dst will automatically shuffle any intervals that overlap
1827 * the collect's destination by making them children of the destination.
1828 */
1829
1830 insert_dst(ctx, instr->dsts[0]);
1831 }
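/* Continuing the ssa_1/ssa_4 example above with made-up numbers: if ssa_1's
 * root interval sits at physreg 20 with merge_set_offset 0, and ssa_4 has
 * merge_set_offset 0 within the same merge set, then
 *
 *    dst_fixed = 20 - 0 + 0 = 20
 *
 * so ssa_4 is allocated directly inside ssa_1 rather than getting a fresh
 * range and an extra copy.
 */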
1832
1833 /* Parallel copies before RA should only be at the end of the block, for
1834 * phis. For these we only need to fill in the sources, and then we fill in
1835 * the destinations in the successor block.
1836 */
1837 static void
1838 handle_pcopy(struct ra_ctx *ctx, struct ir3_instruction *instr)
1839 {
1840 ra_foreach_src_rev (src, instr) {
1841 assign_src(ctx, instr, src);
1842 }
1843 }
1844
1845 /* Some inputs may need to be precolored. We need to handle those first, so
1846 * that other non-precolored inputs don't accidentally get allocated over
1847 * them. Inputs are the very first thing in the shader, so it shouldn't be a
1848 * problem to allocate them to a specific physreg.
1849 */
1850
1851 static void
1852 handle_precolored_input(struct ra_ctx *ctx, struct ir3_instruction *instr)
1853 {
1854 if (instr->dsts[0]->num == INVALID_REG ||
1855 !(instr->dsts[0]->flags & IR3_REG_SSA))
1856 return;
1857
1858 struct ra_file *file = ra_get_file(ctx, instr->dsts[0]);
1859 struct ra_interval *interval = &ctx->intervals[instr->dsts[0]->name];
1860 physreg_t physreg = ra_reg_get_physreg(instr->dsts[0]);
1861 allocate_dst_fixed(ctx, instr->dsts[0], physreg);
1862
1863 d("insert precolored dst %u physreg %u", instr->dsts[0]->name,
1864 ra_interval_get_physreg(interval));
1865
1866 ra_file_insert(file, interval);
1867 interval->frozen = true;
1868 }
1869
1870 static void
1871 handle_input(struct ra_ctx *ctx, struct ir3_instruction *instr)
1872 {
1873 if (instr->dsts[0]->num != INVALID_REG)
1874 return;
1875
1876 allocate_dst(ctx, instr->dsts[0]);
1877
1878 struct ra_file *file = ra_get_file(ctx, instr->dsts[0]);
1879 struct ra_interval *interval = &ctx->intervals[instr->dsts[0]->name];
1880 ra_file_insert(file, interval);
1881 }
1882
1883 static void
1884 assign_input(struct ra_ctx *ctx, struct ir3_instruction *instr)
1885 {
1886 if (!(instr->dsts[0]->flags & IR3_REG_SSA))
1887 return;
1888
1889 struct ra_interval *interval = &ctx->intervals[instr->dsts[0]->name];
1890 struct ra_file *file = ra_get_file(ctx, instr->dsts[0]);
1891
1892 if (instr->dsts[0]->num == INVALID_REG) {
1893 assign_reg(instr, instr->dsts[0], ra_interval_get_num(interval));
1894 } else {
1895 interval->frozen = false;
1896 }
1897
1898 if (instr->dsts[0]->flags & IR3_REG_UNUSED)
1899 ra_file_remove(file, interval);
1900
1901 ra_foreach_src_rev (src, instr)
1902 assign_src(ctx, instr, src);
1903 }
1904
1905 /* chmask is a bit weird, because it has pre-colored sources due to the need
1906 * to pass some registers to the next stage. Fortunately there are at most
1907 * two, and there should be no other live values by the time we get to
1908 * this instruction, so we only have to do the minimum and don't need any
1909 * fancy fallbacks.
1910 *
1911 * TODO: Add more complete handling of precolored sources, e.g. for function
1912 * argument handling. We'd need a way to mark sources as fixed so that they
1913 * don't get moved around when placing other sources in the fallback case, and
1914 * a duplication of much of the logic in get_reg(). This also opens another
1915 * can of worms, e.g. what if the precolored source is a split of a vector
1916 * which is still live -- this breaks our assumption that splits don't incur
1917 * any "extra" register requirements and we'd have to break it out of the
1918 * parent ra_interval.
1919 */
1920
1921 static void
1922 handle_precolored_source(struct ra_ctx *ctx, struct ir3_register *src)
1923 {
1924 struct ra_file *file = ra_get_file(ctx, src);
1925 struct ra_interval *interval = &ctx->intervals[src->def->name];
1926 physreg_t physreg = ra_reg_get_physreg(src);
1927
1928 if (ra_interval_get_num(interval) == src->num)
1929 return;
1930
1931 /* Try evicting stuff in our way if it isn't free. This won't move
1932 * anything unless it overlaps with our precolored physreg, so we don't
1933 * have to worry about evicting other precolored sources.
1934 */
1935 if (!get_reg_specified(ctx, file, src, physreg, true)) {
1936 unsigned eviction_count;
1937 if (!try_evict_regs(ctx, file, src, physreg, &eviction_count, true,
1938 false)) {
1939 unreachable("failed to evict for precolored source!");
1940 return;
1941 }
1942 }
1943
1944 ra_move_interval(ctx, file, interval, physreg);
1945 }
1946
1947 static void
1948 handle_chmask(struct ra_ctx *ctx, struct ir3_instruction *instr)
1949 {
1950 /* Note: we purposely don't mark sources as killed, so that we can reuse
1951 * some of the get_reg() machinery as-if the source is a destination.
1952 * Marking it as killed would mean that e.g. get_reg_specified() wouldn't
1953 * work correctly.
1954 */
1955 ra_foreach_src (src, instr) {
1956 assert(src->num != INVALID_REG);
1957 handle_precolored_source(ctx, src);
1958 }
1959
1960 ra_foreach_src (src, instr) {
1961 struct ra_file *file = ra_get_file(ctx, src);
1962 struct ra_interval *interval = &ctx->intervals[src->def->name];
1963 if (src->flags & IR3_REG_FIRST_KILL)
1964 ra_file_remove(file, interval);
1965 }
1966
1967 insert_parallel_copy_instr(ctx, instr);
1968 }
1969
1970 static physreg_t
1971 read_register(struct ra_ctx *ctx, struct ir3_block *block,
1972 struct ir3_register *def)
1973 {
1974 struct ra_block_state *state = &ctx->blocks[block->index];
1975 if (state->renames) {
1976 struct hash_entry *entry = _mesa_hash_table_search(state->renames, def);
1977 if (entry) {
1978 return (physreg_t)(uintptr_t)entry->data;
1979 }
1980 }
1981
1982 return ra_reg_get_physreg(def);
1983 }
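/* For illustration (names and registers are hypothetical): if ssa_7 was
 * originally assigned r2.x, but a live-range split at the end of block B
 * moved it to r5.x, then B's renames table maps ssa_7 -> physreg(r5.x) and
 * read_register(ctx, B, ssa_7) returns that physreg; without an entry it
 * falls back to the originally assigned r2.x.
 */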
1984
1985 static void
1986 handle_live_in(struct ra_ctx *ctx, struct ir3_register *def)
1987 {
1988 physreg_t physreg = ~0;
1989 for (unsigned i = 0; i < ctx->block->predecessors_count; i++) {
1990 struct ir3_block *pred = ctx->block->predecessors[i];
1991 struct ra_block_state *pred_state = &ctx->blocks[pred->index];
1992
1993 if (!pred_state->visited)
1994 continue;
1995
1996 physreg = read_register(ctx, pred, def);
1997 break;
1998 }
1999
2000 assert(physreg != (physreg_t)~0);
2001
2002 struct ra_interval *interval = &ctx->intervals[def->name];
2003 struct ra_file *file = ra_get_file(ctx, def);
2004 ra_interval_init(interval, def);
2005 interval->physreg_start = physreg;
2006 interval->physreg_end = physreg + reg_size(def);
2007 ra_file_insert(file, interval);
2008 }
2009
2010 static void
2011 handle_live_out(struct ra_ctx *ctx, struct ir3_register *def)
2012 {
2013 /* Skip parallel copies which in the original program are only used as phi
2014 * arguments. Even though phi arguments are live out, they are only
2015 * assigned when the phi is.
2016 */
2017 if (def->instr->opc == OPC_META_PARALLEL_COPY)
2018 return;
2019
2020 struct ra_block_state *state = &ctx->blocks[ctx->block->index];
2021 struct ra_interval *interval = &ctx->intervals[def->name];
2022 physreg_t physreg = ra_interval_get_physreg(interval);
2023 if (physreg != ra_reg_get_physreg(def)) {
2024 if (!state->renames)
2025 state->renames = _mesa_pointer_hash_table_create(ctx);
2026 _mesa_hash_table_insert(state->renames, def, (void *)(uintptr_t)physreg);
2027 }
2028 }
2029
2030 static void
2031 handle_phi(struct ra_ctx *ctx, struct ir3_register *def)
2032 {
2033 if (!(def->flags & IR3_REG_SSA))
2034 return;
2035
2036 struct ra_file *file = ra_get_file(ctx, def);
2037 struct ra_interval *interval = &ctx->intervals[def->name];
2038
2039 /* phis are always scalar, so they should already be the smallest possible
2040 * size. However they may be coalesced with other live-in values/phi
2041 * nodes, so check for that here.
2042 */
2043 struct ir3_reg_interval *parent_ir3 =
2044 ir3_reg_interval_search(&file->reg_ctx.intervals, def->interval_start);
2045 physreg_t physreg;
2046 if (parent_ir3) {
2047 struct ra_interval *parent = ir3_reg_interval_to_ra_interval(parent_ir3);
2048 physreg = ra_interval_get_physreg(parent) +
2049 (def->interval_start - parent_ir3->reg->interval_start);
2050 } else {
2051 physreg = get_reg(ctx, file, def);
2052 }
2053
2054 allocate_dst_fixed(ctx, def, physreg);
2055
2056 ra_file_insert(file, interval);
2057 }
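/* For example (hypothetical numbers): if the phi was coalesced with a live-in
 * vector whose interval covers [40, 48) and is allocated at physreg 16, and
 * the phi's own interval_start is 42, it is placed at 16 + (42 - 40) = 18,
 * i.e. at its component's slot inside the already-allocated parent interval.
 */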
2058
2059 static void
2060 assign_phi(struct ra_ctx *ctx, struct ir3_instruction *phi)
2061 {
2062 if (!(phi->dsts[0]->flags & IR3_REG_SSA))
2063 return;
2064
2065 struct ra_file *file = ra_get_file(ctx, phi->dsts[0]);
2066 struct ra_interval *interval = &ctx->intervals[phi->dsts[0]->name];
2067 assert(!interval->interval.parent);
2068 unsigned num = ra_interval_get_num(interval);
2069 assign_reg(phi, phi->dsts[0], num);
2070
2071 /* Assign the parallelcopy sources of this phi */
2072 for (unsigned i = 0; i < phi->srcs_count; i++) {
2073 if (phi->srcs[i]->def) {
2074 assign_reg(phi, phi->srcs[i], num);
2075 assign_reg(phi, phi->srcs[i]->def, num);
2076 }
2077 }
2078
2079 if (phi->dsts[0]->flags & IR3_REG_UNUSED)
2080 ra_file_remove(file, interval);
2081 }
2082
2083 /* When we split a live range, we sometimes need to emit fixup code at the end
2084 * of a block. For example, something like:
2085 *
2086 * a = ...
2087 * if (...) {
2088 * ...
2089 * a' = a
2090 * b = ... // a evicted to make room for b
2091 * ...
2092 * }
2093 * ... = a
2094 *
2095 * When we insert the copy to a' in insert_parallel_copy_instr(), this forces
2096 * us to insert another copy "a = a'" at the end of the if. Normally this would
2097 * also entail adding a phi node, but since we're about to go out of SSA
2098 * anyway we just insert an extra move. Note, however, that "b" might be used
2099 * in a phi node at the end of the if and share registers with "a", so we
2100 * have to be careful to extend any preexisting parallelcopy instruction
2101 * instead of creating our own in order to guarantee that they properly get
2102 * swapped.
2103 */
2104
2105 static void
2106 insert_liveout_copy(struct ir3_block *block, physreg_t dst, physreg_t src,
2107 struct ir3_register *reg)
2108 {
2109 struct ir3_instruction *old_pcopy = NULL;
2110 struct ir3_instruction *last = ir3_block_get_last_non_terminator(block);
2111
2112 if (last && last->opc == OPC_META_PARALLEL_COPY)
2113 old_pcopy = last;
2114
2115 unsigned old_pcopy_srcs = old_pcopy ? old_pcopy->srcs_count : 0;
2116 struct ir3_instruction *pcopy = ir3_instr_create(
2117 block, OPC_META_PARALLEL_COPY, old_pcopy_srcs + 1, old_pcopy_srcs + 1);
2118
2119 for (unsigned i = 0; i < old_pcopy_srcs; i++) {
2120 old_pcopy->dsts[i]->instr = pcopy;
2121 pcopy->dsts[pcopy->dsts_count++] = old_pcopy->dsts[i];
2122 }
2123
2124 unsigned flags = reg->flags & (IR3_REG_HALF | IR3_REG_ARRAY);
2125
2126 struct ir3_register *dst_reg = ir3_dst_create(pcopy, INVALID_REG, flags);
2127 dst_reg->wrmask = reg->wrmask;
2128 dst_reg->size = reg->size;
2129 assign_reg(pcopy, dst_reg, ra_physreg_to_num(dst, reg->flags));
2130
2131 for (unsigned i = 0; i < old_pcopy_srcs; i++) {
2132 pcopy->srcs[pcopy->srcs_count++] = old_pcopy->srcs[i];
2133 }
2134
2135 struct ir3_register *src_reg = ir3_src_create(pcopy, INVALID_REG, flags);
2136 src_reg->wrmask = reg->wrmask;
2137 src_reg->size = reg->size;
2138 assign_reg(pcopy, src_reg, ra_physreg_to_num(src, reg->flags));
2139
2140 if (old_pcopy)
2141 list_del(&old_pcopy->node);
2142 }
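/* For illustration (hypothetical registers): if the block already ends in
 *
 *    r0.x, r1.y = parallel_copy r2.z, r3.w
 *
 * and we need an extra live-out copy r4.x = r5.y, the old pcopy is replaced
 * by a single
 *
 *    r0.x, r1.y, r4.x = parallel_copy r2.z, r3.w, r5.y
 *
 * so that all of the copies, including any swaps between them, still happen
 * in parallel.
 */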
2143
2144 static void
2145 insert_live_in_move(struct ra_ctx *ctx, struct ra_interval *interval)
2146 {
2147 physreg_t physreg = ra_interval_get_physreg(interval);
2148
2149 for (unsigned i = 0; i < ctx->block->predecessors_count; i++) {
2150 struct ir3_block *pred = ctx->block->predecessors[i];
2151 struct ra_block_state *pred_state = &ctx->blocks[pred->index];
2152
2153 if (!pred_state->visited)
2154 continue;
2155
2156 physreg_t pred_reg = read_register(ctx, pred, interval->interval.reg);
2157 if (pred_reg != physreg) {
2158 assert(!(interval->interval.reg->flags & IR3_REG_SHARED));
2159 insert_liveout_copy(pred, physreg, pred_reg, interval->interval.reg);
2160 }
2161 }
2162 }
2163
2164 static void
2165 insert_file_live_in_moves(struct ra_ctx *ctx, struct ra_file *file)
2166 {
2167 BITSET_WORD *live_in = ctx->live->live_in[ctx->block->index];
2168 rb_tree_foreach (struct ra_interval, interval, &file->physreg_intervals,
2169 physreg_node) {
2170 /* Skip phi nodes. This needs to happen after phi nodes are allocated,
2171 * because we may have to move live-ins around to make space for phi
2172 * nodes, but we shouldn't be handling phi nodes here.
2173 */
2174 if (BITSET_TEST(live_in, interval->interval.reg->name))
2175 insert_live_in_move(ctx, interval);
2176 }
2177 }
2178
2179 static void
2180 insert_entry_regs(struct ra_block_state *state, struct ra_file *file)
2181 {
2182 rb_tree_foreach (struct ra_interval, interval, &file->physreg_intervals,
2183 physreg_node) {
2184 _mesa_hash_table_insert(state->entry_regs, interval->interval.reg,
2185 (void *)(uintptr_t)interval->physreg_start);
2186 }
2187 }
2188
2189 static void
2190 insert_live_in_moves(struct ra_ctx *ctx)
2191 {
2192 insert_file_live_in_moves(ctx, &ctx->full);
2193 insert_file_live_in_moves(ctx, &ctx->half);
2194 insert_file_live_in_moves(ctx, &ctx->shared);
2195
2196 /* If not all predecessors have been visited yet, record this block's entry
2197 * registers so that insert_live_out_moves() in those predecessors will work.
2198 */
2199 bool all_preds_visited = true;
2200 for (unsigned i = 0; i < ctx->block->predecessors_count; i++) {
2201 if (!ctx->blocks[ctx->block->predecessors[i]->index].visited) {
2202 all_preds_visited = false;
2203 break;
2204 }
2205 }
2206
2207 if (!all_preds_visited) {
2208 struct ra_block_state *state = &ctx->blocks[ctx->block->index];
2209 state->entry_regs = _mesa_pointer_hash_table_create(ctx);
2210
2211 insert_entry_regs(state, &ctx->full);
2212 insert_entry_regs(state, &ctx->half);
2213 insert_entry_regs(state, &ctx->shared);
2214 }
2215 }
2216
2217 static void
2218 insert_live_out_move(struct ra_ctx *ctx, struct ra_interval *interval)
2219 {
2220 for (unsigned i = 0; i < 2; i++) {
2221 if (!ctx->block->successors[i])
2222 continue;
2223
2224 struct ir3_block *succ = ctx->block->successors[i];
2225 struct ra_block_state *succ_state = &ctx->blocks[succ->index];
2226
2227 if (!succ_state->visited)
2228 continue;
2229
2230 struct hash_entry *entry = _mesa_hash_table_search(
2231 succ_state->entry_regs, interval->interval.reg);
2232 if (!entry)
2233 continue;
2234
2235 physreg_t new_reg = (physreg_t)(uintptr_t)entry->data;
2236 if (new_reg != interval->physreg_start) {
2237 insert_liveout_copy(ctx->block, new_reg, interval->physreg_start,
2238 interval->interval.reg);
2239 }
2240 }
2241 }
2242
2243 static void
2244 insert_file_live_out_moves(struct ra_ctx *ctx, struct ra_file *file)
2245 {
2246 rb_tree_foreach (struct ra_interval, interval, &file->physreg_intervals,
2247 physreg_node) {
2248 insert_live_out_move(ctx, interval);
2249 }
2250 }
2251
2252 static void
2253 insert_live_out_moves(struct ra_ctx *ctx)
2254 {
2255 insert_file_live_out_moves(ctx, &ctx->full);
2256 insert_file_live_out_moves(ctx, &ctx->half);
2257 insert_file_live_out_moves(ctx, &ctx->shared);
2258 }
2259
2260 static void
2261 handle_block(struct ra_ctx *ctx, struct ir3_block *block)
2262 {
2263 ctx->block = block;
2264
2265 /* Reset the register files from the last block */
2266 ra_file_init(&ctx->full);
2267 ra_file_init(&ctx->half);
2268 ra_file_init(&ctx->shared);
2269
2270 /* Handle live-ins, phis, and input meta-instructions. These all appear
2271 * live at the beginning of the block and interfere with each other, so they
2272 * need to be allocated "in parallel". This means that we
2273 * have to allocate all of them, inserting them into the file, and then
2274 * delay updating the IR until all of them are allocated.
2275 *
2276 * Handle precolored inputs first, because we need to make sure that other
2277 * inputs don't overwrite them. We shouldn't have both live-ins/phi nodes
2278 * and inputs at the same time, because the first block doesn't have
2279 * predecessors. Therefore handle_live_in doesn't have to worry about
2280 * them.
2281 */
2282
2283 foreach_instr (instr, &block->instr_list) {
2284 if (instr->opc == OPC_META_INPUT)
2285 handle_precolored_input(ctx, instr);
2286 else
2287 break;
2288 }
2289
2290 unsigned name;
2291 BITSET_FOREACH_SET (name, ctx->live->live_in[block->index],
2292 ctx->live->definitions_count) {
2293 struct ir3_register *reg = ctx->live->definitions[name];
2294 handle_live_in(ctx, reg);
2295 }
2296
2297 foreach_instr (instr, &block->instr_list) {
2298 if (instr->opc == OPC_META_PHI)
2299 handle_phi(ctx, instr->dsts[0]);
2300 else if (instr->opc == OPC_META_INPUT ||
2301 instr->opc == OPC_META_TEX_PREFETCH)
2302 handle_input(ctx, instr);
2303 else
2304 break;
2305 }
2306
2307 /* After this point, every live-in/phi/input has an interval assigned to
2308 * it. We delay actually assigning values until everything has been
2309 * allocated, so we can simply ignore any parallel copy entries created
2310 * when shuffling them around.
2311 */
2312 ctx->parallel_copies_count = 0;
2313
2314 insert_live_in_moves(ctx);
2315
2316 if (RA_DEBUG) {
2317 d("after live-in block %u:\n", block->index);
2318 ra_ctx_dump(ctx);
2319 }
2320
2321 /* Now we're done with processing live-ins, and can handle the body of the
2322 * block.
2323 */
2324 foreach_instr (instr, &block->instr_list) {
2325 di(instr, "processing");
2326
2327 if (instr->opc == OPC_META_PHI)
2328 assign_phi(ctx, instr);
2329 else if (instr->opc == OPC_META_INPUT ||
2330 instr->opc == OPC_META_TEX_PREFETCH)
2331 assign_input(ctx, instr);
2332 else if (instr->opc == OPC_META_SPLIT)
2333 handle_split(ctx, instr);
2334 else if (instr->opc == OPC_META_COLLECT)
2335 handle_collect(ctx, instr);
2336 else if (instr->opc == OPC_META_PARALLEL_COPY)
2337 handle_pcopy(ctx, instr);
2338 else if (instr->opc == OPC_CHMASK)
2339 handle_chmask(ctx, instr);
2340 else
2341 handle_normal_instr(ctx, instr);
2342
2343 if (RA_DEBUG)
2344 ra_ctx_dump(ctx);
2345 }
2346
2347 insert_live_out_moves(ctx);
2348
2349 BITSET_FOREACH_SET (name, ctx->live->live_out[block->index],
2350 ctx->live->definitions_count) {
2351 struct ir3_register *reg = ctx->live->definitions[name];
2352 handle_live_out(ctx, reg);
2353 }
2354
2355 ctx->blocks[block->index].visited = true;
2356 }
2357
2358 static unsigned
2359 calc_target_full_pressure(struct ir3_shader_variant *v, unsigned pressure)
2360 {
2361 /* Registers are allocated in units of vec4, so switch from units of
2362 * half-regs to vec4.
2363 */
2364 unsigned reg_count = DIV_ROUND_UP(pressure, 2 * 4);
2365
2366 bool double_threadsize = ir3_should_double_threadsize(v, reg_count);
2367
2368 unsigned target = reg_count;
2369 unsigned reg_independent_max_waves =
2370 ir3_get_reg_independent_max_waves(v, double_threadsize);
2371 unsigned reg_dependent_max_waves = ir3_get_reg_dependent_max_waves(
2372 v->compiler, reg_count, double_threadsize);
2373 unsigned target_waves =
2374 MIN2(reg_independent_max_waves, reg_dependent_max_waves);
2375
2376 while (target <= RA_FULL_SIZE / (2 * 4) &&
2377 ir3_should_double_threadsize(v, target) == double_threadsize &&
2378 ir3_get_reg_dependent_max_waves(v->compiler, target,
2379 double_threadsize) >= target_waves)
2380 target++;
2381
2382 return (target - 1) * 2 * 4;
2383 }
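/* Worked example with made-up numbers: a max pressure of 100 half-regs gives
 * reg_count = DIV_ROUND_UP(100, 8) = 13 vec4. If raising the allocation to
 * 15 vec4 still yields the same wave count and threadsize decision, but 16
 * vec4 would reduce the wave count, the loop stops with target == 16 and the
 * function returns (16 - 1) * 2 * 4 = 120 half-regs, i.e. the largest
 * register file slice that does not cost any waves.
 */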
2384
2385 static void
2386 add_pressure(struct ir3_pressure *pressure, struct ir3_register *reg,
2387 bool merged_regs)
2388 {
2389 unsigned size = reg_size(reg);
2390 if (reg->flags & IR3_REG_HALF)
2391 pressure->half += size;
2392 if (!(reg->flags & IR3_REG_HALF) || merged_regs)
2393 pressure->full += size;
2394 }
2395
2396 static void
2397 dummy_interval_add(struct ir3_reg_ctx *ctx, struct ir3_reg_interval *interval)
2398 {
2399 }
2400
2401 static void
2402 dummy_interval_delete(struct ir3_reg_ctx *ctx, struct ir3_reg_interval *interval)
2403 {
2404 }
2405
2406 static void
2407 dummy_interval_readd(struct ir3_reg_ctx *ctx, struct ir3_reg_interval *parent,
2408 struct ir3_reg_interval *child)
2409 {
2410 }
2411
2412 /* Calculate the minimum possible limit on register pressure so that spilling
2413 * still succeeds. Used to implement IR3_SHADER_DEBUG=spillall.
2414 */
2415
2416 static void
2417 calc_min_limit_pressure(struct ir3_shader_variant *v,
2418 struct ir3_liveness *live,
2419 struct ir3_pressure *limit)
2420 {
2421 struct ir3_block *start = ir3_start_block(v->ir);
2422 struct ir3_reg_ctx *ctx = ralloc(NULL, struct ir3_reg_ctx);
2423 struct ir3_reg_interval *intervals =
2424 rzalloc_array(ctx, struct ir3_reg_interval, live->definitions_count);
2425
2426 ctx->interval_add = dummy_interval_add;
2427 ctx->interval_delete = dummy_interval_delete;
2428 ctx->interval_readd = dummy_interval_readd;
2429
2430 limit->full = limit->half = 0;
2431
2432 struct ir3_pressure cur_pressure = {0};
2433 foreach_instr (input, &start->instr_list) {
2434 if (input->opc != OPC_META_INPUT &&
2435 input->opc != OPC_META_TEX_PREFETCH)
2436 break;
2437
2438 add_pressure(&cur_pressure, input->dsts[0], v->mergedregs);
2439 }
2440
2441 limit->full = MAX2(limit->full, cur_pressure.full);
2442 limit->half = MAX2(limit->half, cur_pressure.half);
2443
2444 foreach_instr (input, &start->instr_list) {
2445 if (input->opc != OPC_META_INPUT &&
2446 input->opc != OPC_META_TEX_PREFETCH)
2447 break;
2448
2449 /* pre-colored inputs may have holes, which increases the pressure. */
2450 struct ir3_register *dst = input->dsts[0];
2451 if (dst->num != INVALID_REG) {
2452 unsigned physreg = ra_reg_get_physreg(dst) + reg_size(dst);
2453 if (dst->flags & IR3_REG_HALF)
2454 limit->half = MAX2(limit->half, physreg);
2455 if (!(dst->flags & IR3_REG_HALF) || v->mergedregs)
2456 limit->full = MAX2(limit->full, physreg);
2457 }
2458 }
2459
2460 foreach_block (block, &v->ir->block_list) {
2461 rb_tree_init(&ctx->intervals);
2462
2463 unsigned name;
2464 BITSET_FOREACH_SET (name, live->live_in[block->index],
2465 live->definitions_count) {
2466 struct ir3_register *reg = live->definitions[name];
2467 ir3_reg_interval_init(&intervals[reg->name], reg);
2468 ir3_reg_interval_insert(ctx, &intervals[reg->name]);
2469 }
2470
2471 foreach_instr (instr, &block->instr_list) {
2472 ra_foreach_dst (dst, instr) {
2473 ir3_reg_interval_init(&intervals[dst->name], dst);
2474 }
2475 /* phis and parallel copies can be deleted via spilling */
2476
2477 if (instr->opc == OPC_META_PHI) {
2478 ir3_reg_interval_insert(ctx, &intervals[instr->dsts[0]->name]);
2479 continue;
2480 }
2481
2482 if (instr->opc == OPC_META_PARALLEL_COPY)
2483 continue;
2484
2485 cur_pressure = (struct ir3_pressure) {0};
2486
2487 ra_foreach_dst (dst, instr) {
2488 if ((dst->tied && !(dst->tied->flags & IR3_REG_KILL)) ||
2489 (dst->flags & IR3_REG_EARLY_CLOBBER))
2490 add_pressure(&cur_pressure, dst, v->mergedregs);
2491 }
2492
2493 ra_foreach_src_rev (src, instr) {
2494 /* We currently don't support spilling the parent of a source when
2495 * making space for sources, so we have to keep track of the
2496 * intervals and figure out the root of the tree to figure out how
2497 * much space we need.
2498 *
2499 * TODO: We should probably support this in the spiller.
2500 */
2501 struct ir3_reg_interval *interval = &intervals[src->def->name];
2502 while (interval->parent)
2503 interval = interval->parent;
2504 add_pressure(&cur_pressure, interval->reg, v->mergedregs);
2505
2506 if (src->flags & IR3_REG_FIRST_KILL)
2507 ir3_reg_interval_remove(ctx, &intervals[src->def->name]);
2508 }
2509
2510 limit->full = MAX2(limit->full, cur_pressure.full);
2511 limit->half = MAX2(limit->half, cur_pressure.half);
2512
2513 cur_pressure = (struct ir3_pressure) {0};
2514
2515 ra_foreach_dst (dst, instr) {
2516 ir3_reg_interval_init(&intervals[dst->name], dst);
2517 ir3_reg_interval_insert(ctx, &intervals[dst->name]);
2518 add_pressure(&cur_pressure, dst, v->mergedregs);
2519 }
2520
2521 limit->full = MAX2(limit->full, cur_pressure.full);
2522 limit->half = MAX2(limit->half, cur_pressure.half);
2523 }
2524 }
2525
2526 /* Account for the base register, which needs to be available everywhere. */
2527 limit->full += 2;
2528
2529 ralloc_free(ctx);
2530 }
2531
2532 /*
2533 * If barriers are used, it must be possible for all waves in the workgroup
2534 * to execute concurrently. Thus we may have to reduce the registers limit.
2535 */
2536 static void
2537 calc_limit_pressure_for_cs_with_barrier(struct ir3_shader_variant *v,
2538 struct ir3_pressure *limit_pressure)
2539 {
2540 const struct ir3_compiler *compiler = v->compiler;
2541
2542 unsigned threads_per_wg;
2543 if (v->local_size_variable) {
2544 /* We have to expect the worst case. */
2545 threads_per_wg = compiler->max_variable_workgroup_size;
2546 } else {
2547 threads_per_wg = v->local_size[0] * v->local_size[1] * v->local_size[2];
2548 }
2549
2550 /* The register file is divided into reg_size_vec4 parts. Each part
2551 * has enough registers to give a single vec4 register to each
2552 * thread of a single-sized wave-pair. With double threadsize,
2553 * each wave-pair consumes two parts of the register file to get
2554 * a single vec4 per thread. The more active wave-pairs there are,
2555 * the fewer parts each can get.
2556 */
2557
2558 bool double_threadsize = ir3_should_double_threadsize(v, 0);
2559 unsigned waves_per_wg = DIV_ROUND_UP(
2560 threads_per_wg, compiler->threadsize_base * (double_threadsize ? 2 : 1) *
2561 compiler->wave_granularity);
2562
2563 uint32_t vec4_regs_per_thread =
2564 compiler->reg_size_vec4 / (waves_per_wg * (double_threadsize ? 2 : 1));
2565 assert(vec4_regs_per_thread > 0);
2566
2567 uint32_t half_regs_per_thread = vec4_regs_per_thread * 4 * 2;
2568
2569 if (limit_pressure->full > half_regs_per_thread) {
2570 if (v->mergedregs) {
2571 limit_pressure->full = half_regs_per_thread;
2572 } else {
2573 /* TODO: Handle !mergedregs case, probably we would have to do this
2574 * after the first register pressure pass.
2575 */
2576 }
2577 }
2578 }
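/* Worked example (all numbers are hypothetical, not tied to a particular
 * GPU): threads_per_wg = 1024, threadsize_base = 64, wave_granularity = 2
 * and single threadsize give
 *
 *    waves_per_wg         = DIV_ROUND_UP(1024, 64 * 1 * 2) = 8
 *    vec4_regs_per_thread = reg_size_vec4 / 8   (e.g. 96 / 8 = 12)
 *    half_regs_per_thread = 12 * 4 * 2 = 96
 *
 * so with merged registers the full-register pressure limit would be capped
 * at 96 half-regs to keep all 8 wave-pairs resident across the barrier.
 */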
2579
2580 int
2581 ir3_ra(struct ir3_shader_variant *v)
2582 {
2583 ir3_calc_dominance(v->ir);
2584
2585 /* Predicate RA needs dominance. */
2586 ir3_ra_predicates(v);
2587
2588 ir3_create_parallel_copies(v->ir);
2589
2590 struct ra_ctx *ctx = rzalloc(NULL, struct ra_ctx);
2591
2592 ctx->merged_regs = v->mergedregs;
2593 ctx->compiler = v->compiler;
2594 ctx->stage = v->type;
2595
2596 struct ir3_liveness *live = ir3_calc_liveness(ctx, v->ir);
2597
2598 ir3_debug_print(v->ir, "AFTER: create_parallel_copies");
2599
2600 ir3_index_instrs_for_merge_sets(v->ir);
2601 ir3_merge_regs(live, v->ir);
2602
2603 bool has_shared_vectors = false;
2604 foreach_block (block, &v->ir->block_list) {
2605 foreach_instr (instr, &block->instr_list) {
2606 ra_foreach_dst (dst, instr) {
2607 if ((dst->flags & IR3_REG_SHARED) && reg_elems(dst) > 1) {
2608 has_shared_vectors = true;
2609 break;
2610 }
2611 }
2612 }
2613 }
2614
2615 struct ir3_pressure max_pressure;
2616 ir3_calc_pressure(v, live, &max_pressure);
2617 d("max pressure:");
2618 d("\tfull: %u", max_pressure.full);
2619 d("\thalf: %u", max_pressure.half);
2620 d("\tshared: %u", max_pressure.shared);
2621
2622 struct ir3_pressure limit_pressure;
2623 limit_pressure.full = RA_FULL_SIZE;
2624 limit_pressure.half = RA_HALF_SIZE;
2625 limit_pressure.shared = RA_SHARED_SIZE;
2626 limit_pressure.shared_half = RA_SHARED_HALF_SIZE;
2627
2628 if (gl_shader_stage_is_compute(v->type) && v->has_barrier) {
2629 calc_limit_pressure_for_cs_with_barrier(v, &limit_pressure);
2630 }
2631
2632 /* If the user forces a doubled threadsize, we may have to lower the limit
2633 * because on some gens the register file is not big enough to hold a
2634 * double-size wave with all 48 registers in use.
2635 */
2636 if (v->shader_options.real_wavesize == IR3_DOUBLE_ONLY) {
2637 limit_pressure.full =
2638 MIN2(limit_pressure.full, ctx->compiler->reg_size_vec4 / 2 * 16);
2639 }
2640
2641 /* If requested, lower the limit so that spilling happens more often. */
2642 if (ir3_shader_debug & IR3_DBG_SPILLALL)
2643 calc_min_limit_pressure(v, live, &limit_pressure);
2644
2645 d("limit pressure:");
2646 d("\tfull: %u", limit_pressure.full);
2647 d("\thalf: %u", limit_pressure.half);
2648 d("\tshared: %u", limit_pressure.shared);
2649
2650 /* In the worst case, each half register could block one full register, so
2651 * add shared_half in case of fragmentation. In addition, full registers can
2652 * block half registers so we have to consider the total pressure against the
2653 * half limit to prevent live range splitting when we run out of space for
2654 * half registers in the bottom half.
2655 */
2656 if (max_pressure.shared + max_pressure.shared_half > limit_pressure.shared ||
2657 (max_pressure.shared_half > 0 && max_pressure.shared > limit_pressure.shared_half) ||
2658 has_shared_vectors) {
2659 ir3_ra_shared(v, &live);
2660 ir3_calc_pressure(v, live, &max_pressure);
2661
2662 ir3_debug_print(v->ir, "AFTER: shared register allocation");
2663 }
2664
2665 bool spilled = false;
2666 if (max_pressure.full > limit_pressure.full ||
2667 max_pressure.half > limit_pressure.half) {
2668 if (!v->compiler->has_pvtmem) {
2669 d("max pressure exceeded!");
2670 goto fail;
2671 }
2672 d("max pressure exceeded, spilling!");
2673 IR3_PASS(v->ir, ir3_spill, v, &live, &limit_pressure);
2674 ir3_calc_pressure(v, live, &max_pressure);
2675
2676 d("max pressure after spilling:");
2677 d("\tfull: %u", max_pressure.full);
2678 d("\thalf: %u", max_pressure.half);
2679 d("\tshared: %u", max_pressure.shared);
2680
2681 assert(max_pressure.full <= limit_pressure.full &&
2682 max_pressure.half <= limit_pressure.half);
2683 spilled = true;
2684 }
2685
2686 ctx->live = live;
2687 ctx->intervals =
2688 rzalloc_array(ctx, struct ra_interval, live->definitions_count);
2689 ctx->blocks = rzalloc_array(ctx, struct ra_block_state, live->block_count);
2690
2691 ctx->full.size = calc_target_full_pressure(v, max_pressure.full);
2692 d("full size: %u", ctx->full.size);
2693
2694 if (!v->mergedregs)
2695 ctx->half.size = RA_HALF_SIZE;
2696
2697 ctx->shared.size = RA_SHARED_SIZE;
2698
2699 ctx->full.start = ctx->half.start = ctx->shared.start = 0;
2700
2701 foreach_block (block, &v->ir->block_list)
2702 handle_block(ctx, block);
2703
2704 ir3_ra_validate(v, ctx->full.size, ctx->half.size, live->block_count, false);
2705
2706 /* Strip array-ness and SSA-ness at the end, because various helpers still
2707 * need to work even on definitions that have already been assigned. For
2708 * example, we need to preserve array-ness so that array live-ins have the
2709 * right size.
2710 */
2711 foreach_block (block, &v->ir->block_list) {
2712 foreach_instr (instr, &block->instr_list) {
2713 for (unsigned i = 0; i < instr->dsts_count; i++) {
2714 instr->dsts[i]->flags &= ~IR3_REG_SSA;
2715
2716 /* Parallel copies of array registers copy the whole register, and
2717 * we need some way to let the parallel copy code know that this was
2718 * an array whose size is determined by reg->size. So keep the array
2719 * flag on those. spill/reload also need to work on the entire
2720 * array.
2721 */
2722 if (!is_meta(instr) && instr->opc != OPC_RELOAD_MACRO)
2723 instr->dsts[i]->flags &= ~IR3_REG_ARRAY;
2724 }
2725
2726 for (unsigned i = 0; i < instr->srcs_count; i++) {
2727 instr->srcs[i]->flags &= ~IR3_REG_SSA;
2728
2729 if (!is_meta(instr) && instr->opc != OPC_SPILL_MACRO)
2730 instr->srcs[i]->flags &= ~IR3_REG_ARRAY;
2731 }
2732 }
2733 }
2734
2735 ir3_debug_print(v->ir, "AFTER: register allocation");
2736
2737 if (spilled) {
2738 IR3_PASS(v->ir, ir3_lower_spill);
2739 }
2740
2741 ir3_lower_copies(v);
2742
2743 ir3_debug_print(v->ir, "AFTER: ir3_lower_copies");
2744
2745 ralloc_free(ctx);
2746
2747 return 0;
2748 fail:
2749 ralloc_free(ctx);
2750 return -1;
2751 }
2752