1 /*
2 * Copyright (C) 2018-2019 Alyssa Rosenzweig <[email protected]>
3 * Copyright (C) 2019 Collabora, Ltd.
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24
25 #include "util/u_math.h"
26 #include "util/u_memory.h"
27 #include "compiler.h"
28 #include "midgard_ops.h"
29 #include "midgard_quirks.h"
30
/* Result of register allocation for one virtual register: which physical
 * register it landed in, where inside that 16-byte register it starts, and
 * the component width, expressed as a shift for cheap byte<->component
 * conversion. */
struct phys_reg {
   /* Physical register: 0-31 */
   unsigned reg;

   /* Byte offset into the physical register: 0-15 */
   unsigned offset;

   /* log2(bytes per component) for fast mul/div */
   unsigned shift;
};
41
42 /* Shift up by reg_offset and horizontally by dst_offset. */
43
44 static void
offset_swizzle(unsigned * swizzle,unsigned reg_offset,unsigned srcshift,unsigned dstshift,unsigned dst_offset)45 offset_swizzle(unsigned *swizzle, unsigned reg_offset, unsigned srcshift,
46 unsigned dstshift, unsigned dst_offset)
47 {
48 unsigned out[MIR_VEC_COMPONENTS];
49
50 signed reg_comp = reg_offset >> srcshift;
51 signed dst_comp = dst_offset >> dstshift;
52
53 unsigned max_component = (16 >> srcshift) - 1;
54
55 assert(reg_comp << srcshift == reg_offset);
56 assert(dst_comp << dstshift == dst_offset);
57
58 for (signed c = 0; c < MIR_VEC_COMPONENTS; ++c) {
59 signed comp = MAX2(c - dst_comp, 0);
60 out[c] = MIN2(swizzle[comp] + reg_comp, max_component);
61 }
62
63 memcpy(swizzle, out, sizeof(out));
64 }
65
66 /* Helper to return the default phys_reg for a given register */
67
68 static struct phys_reg
default_phys_reg(int reg,unsigned shift)69 default_phys_reg(int reg, unsigned shift)
70 {
71 struct phys_reg r = {
72 .reg = reg,
73 .offset = 0,
74 .shift = shift,
75 };
76
77 return r;
78 }
79
80 /* Determine which physical register, swizzle, and mask a virtual
81 * register corresponds to */
82
83 static struct phys_reg
index_to_reg(compiler_context * ctx,struct lcra_state * l,unsigned reg,unsigned shift)84 index_to_reg(compiler_context *ctx, struct lcra_state *l, unsigned reg,
85 unsigned shift)
86 {
87 /* Check for special cases */
88 if (reg == ~0)
89 return default_phys_reg(REGISTER_UNUSED, shift);
90 else if (reg >= SSA_FIXED_MINIMUM)
91 return default_phys_reg(SSA_REG_FROM_FIXED(reg), shift);
92 else if (!l)
93 return default_phys_reg(REGISTER_UNUSED, shift);
94
95 struct phys_reg r = {
96 .reg = l->solutions[reg] / 16,
97 .offset = l->solutions[reg] & 0xF,
98 .shift = shift,
99 };
100
101 /* Report that we actually use this register, and return it */
102
103 if (r.reg < 16)
104 ctx->info->work_reg_count = MAX2(ctx->info->work_reg_count, r.reg + 1);
105
106 return r;
107 }
108
109 static void
set_class(unsigned * classes,unsigned node,unsigned class)110 set_class(unsigned *classes, unsigned node, unsigned class)
111 {
112 if (node < SSA_FIXED_MINIMUM && class != classes[node]) {
113 assert(classes[node] == REG_CLASS_WORK);
114 classes[node] = class;
115 }
116 }
117
118 /* Special register classes impose special constraints on who can read their
119 * values, so check that */
120
121 static bool ASSERTED
check_read_class(unsigned * classes,unsigned tag,unsigned node)122 check_read_class(unsigned *classes, unsigned tag, unsigned node)
123 {
124 /* Non-nodes are implicitly ok */
125 if (node >= SSA_FIXED_MINIMUM)
126 return true;
127
128 switch (classes[node]) {
129 case REG_CLASS_LDST:
130 return (tag == TAG_LOAD_STORE_4);
131 case REG_CLASS_TEXR:
132 return (tag == TAG_TEXTURE_4);
133 case REG_CLASS_TEXW:
134 return (tag != TAG_LOAD_STORE_4);
135 case REG_CLASS_WORK:
136 return IS_ALU(tag);
137 default:
138 unreachable("Invalid class");
139 }
140 }
141
142 static bool ASSERTED
check_write_class(unsigned * classes,unsigned tag,unsigned node)143 check_write_class(unsigned *classes, unsigned tag, unsigned node)
144 {
145 /* Non-nodes are implicitly ok */
146 if (node >= SSA_FIXED_MINIMUM)
147 return true;
148
149 switch (classes[node]) {
150 case REG_CLASS_TEXR:
151 return true;
152 case REG_CLASS_TEXW:
153 return (tag == TAG_TEXTURE_4);
154 case REG_CLASS_LDST:
155 case REG_CLASS_WORK:
156 return IS_ALU(tag) || (tag == TAG_LOAD_STORE_4);
157 default:
158 unreachable("Invalid class");
159 }
160 }
161
162 /* Prepass before RA to ensure special class restrictions are met. The idea is
163 * to create a bit field of types of instructions that read a particular index.
164 * Later, we'll add moves as appropriate and rewrite to specialize by type. */
165
166 static void
mark_node_class(unsigned * bitfield,unsigned node)167 mark_node_class(unsigned *bitfield, unsigned node)
168 {
169 if (node < SSA_FIXED_MINIMUM)
170 BITSET_SET(bitfield, node);
171 }
172
/* Lowering pass run before RA: when a single index is used by incompatible
 * instruction classes (ALU vs load/store vs texture), split it by inserting
 * moves so each class gets a compatible copy. Pass #1 analyses uses, pass #2
 * inserts the moves and rewrites the offending uses. */
void
mir_lower_special_reads(compiler_context *ctx)
{
   mir_compute_temp_count(ctx);
   size_t sz = BITSET_WORDS(ctx->temp_count) * sizeof(BITSET_WORD);

   /* Bitfields for the various types of registers we could have. aluw can
    * be written by either ALU or load/store */

   unsigned *alur = calloc(sz, 1);
   unsigned *aluw = calloc(sz, 1);
   unsigned *brar = calloc(sz, 1);
   unsigned *ldst = calloc(sz, 1);
   unsigned *texr = calloc(sz, 1);
   unsigned *texw = calloc(sz, 1);

   /* Pass #1 is analysis, a linear scan to fill out the bitfields */

   mir_foreach_instr_global(ctx, ins) {
      switch (ins->type) {
      case TAG_ALU_4:
         mark_node_class(aluw, ins->dest);
         mark_node_class(alur, ins->src[0]);
         mark_node_class(alur, ins->src[1]);
         mark_node_class(alur, ins->src[2]);

         /* Writeout branches read src[0] specially */
         if (ins->compact_branch && ins->writeout)
            mark_node_class(brar, ins->src[0]);

         break;

      case TAG_LOAD_STORE_4:
         mark_node_class(aluw, ins->dest);
         mark_node_class(ldst, ins->src[0]);
         mark_node_class(ldst, ins->src[1]);
         mark_node_class(ldst, ins->src[2]);
         mark_node_class(ldst, ins->src[3]);
         break;

      case TAG_TEXTURE_4:
         mark_node_class(texr, ins->src[0]);
         mark_node_class(texr, ins->src[1]);
         mark_node_class(texr, ins->src[2]);
         mark_node_class(texw, ins->dest);
         break;

      default:
         break;
      }
   }

   /* Pass #2 is lowering now that we've analyzed all the classes.
    * Conceptually, if an index is only marked for a single type of use,
    * there is nothing to lower. If it is marked for different uses, we
    * split up based on the number of types of uses. To do so, we divide
    * into N distinct classes of use (where N>1 by definition), emit N-1
    * moves from the index to copies of the index, and finally rewrite N-1
    * of the types of uses to use the corresponding move */

   unsigned spill_idx = ctx->temp_count;

   for (unsigned i = 0; i < ctx->temp_count; ++i) {
      bool is_alur = BITSET_TEST(alur, i);
      bool is_aluw = BITSET_TEST(aluw, i);
      bool is_brar = BITSET_TEST(brar, i);
      bool is_ldst = BITSET_TEST(ldst, i);
      bool is_texr = BITSET_TEST(texr, i);
      bool is_texw = BITSET_TEST(texw, i);

      /* Analyse to check how many distinct uses there are. ALU ops
       * (alur) can read the results of the texture pipeline (texw)
       * but not ldst or texr. Load/store ops (ldst) cannot read
       * anything but load/store inputs. Texture pipeline cannot read
       * anything but texture inputs. TODO: Simplify. */

      bool collision = (is_alur && (is_ldst || is_texr)) ||
                       (is_ldst && (is_alur || is_texr || is_texw)) ||
                       (is_texr && (is_alur || is_ldst || is_texw)) ||
                       (is_texw && (is_aluw || is_ldst || is_texr)) ||
                       (is_brar && is_texw);

      if (!collision)
         continue;

      /* Use the index as-is as the work copy. Emit copies for
       * special uses */

      unsigned classes[] = {TAG_LOAD_STORE_4, TAG_TEXTURE_4, TAG_TEXTURE_4,
                            TAG_ALU_4};
      bool collisions[] = {is_ldst, is_texr, is_texw && is_aluw, is_brar};

      for (unsigned j = 0; j < ARRAY_SIZE(collisions); ++j) {
         if (!collisions[j])
            continue;

         /* When the hazard is from reading, we move and rewrite
          * sources (typical case). When it's from writing, we
          * flip the move and rewrite destinations (obscure,
          * only from control flow -- impossible in SSA) */

         bool hazard_write = (j == 2);

         unsigned idx = spill_idx++;

         /* Insert move before each read/write, depending on the
          * hazard we're trying to account for */

         mir_foreach_block(ctx, block_) {
            midgard_block *block = (midgard_block *)block_;
            midgard_instruction *mov = NULL;

            mir_foreach_instr_in_block_safe(block, pre_use) {
               if (pre_use->type != classes[j])
                  continue;

               if (hazard_write) {
                  if (pre_use->dest != i)
                     continue;

                  /* Write hazard: redirect the write to a fresh index
                   * and copy it back to i immediately afterwards */
                  midgard_instruction m = v_mov(idx, i);
                  m.dest_type = pre_use->dest_type;
                  m.src_types[1] = m.dest_type;
                  m.mask = pre_use->mask;

                  midgard_instruction *use = mir_next_op(pre_use);
                  assert(use);
                  mir_insert_instruction_before(ctx, use, m);
                  mir_rewrite_index_dst_single(pre_use, i, idx);
               } else {
                  if (!mir_has_arg(pre_use, i))
                     continue;

                  /* Round the read mask up to 32-bit granularity so the
                   * copy moves whole components */
                  unsigned mask = mir_from_bytemask(
                     mir_round_bytemask_up(
                        mir_bytemask_of_read_components(pre_use, i), 32),
                     32);

                  /* Reuse one move per block when i is SSA (value cannot
                   * change); otherwise emit a fresh copy per use */
                  if (mov == NULL || !mir_is_ssa(i)) {
                     midgard_instruction m = v_mov(i, spill_idx++);
                     m.mask = mask;
                     mov = mir_insert_instruction_before(ctx, pre_use, m);
                  } else {
                     mov->mask |= mask;
                  }

                  mir_rewrite_index_src_single(pre_use, i, mov->dest);
               }
            }
         }
      }
   }

   free(alur);
   free(aluw);
   free(brar);
   free(ldst);
   free(texr);
   free(texw);
}
332
/* Populate the LCRA interference graph from liveness: any two values live at
 * the same time (tracked per byte) must not share register bytes. Also adds
 * blend-shader and parallel-unit specific interferences. */
static void
mir_compute_interference(compiler_context *ctx, struct lcra_state *l)
{
   /* First, we need liveness information to be computed per block */
   mir_compute_liveness(ctx);

   /* We need to force r1.w live throughout a blend shader */

   if (ctx->inputs->is_blend) {
      unsigned r1w = ~0;

      /* Find the destination of the (last) writeout, scanning each block
       * in reverse and stopping at the first block that has one */
      mir_foreach_block(ctx, _block) {
         midgard_block *block = (midgard_block *)_block;
         mir_foreach_instr_in_block_rev(block, ins) {
            if (ins->writeout)
               r1w = ins->dest;
         }

         if (r1w != ~0)
            break;
      }

      /* Make every written node interfere with r1.w's low 4 bytes */
      mir_foreach_instr_global(ctx, ins) {
         if (ins->dest < ctx->temp_count)
            lcra_add_node_interference(l, ins->dest, mir_bytemask(ins), r1w,
                                       0xF);
      }
   }

   /* Now that every block has live_in/live_out computed, we can determine
    * interference by walking each block linearly. Take live_out at the
    * end of each block and walk the block backwards. */

   mir_foreach_block(ctx, _blk) {
      midgard_block *blk = (midgard_block *)_blk;

      /* The scalar and vector units run in parallel. We need to make
       * sure they don't write to same portion of the register file
       * otherwise the result is undefined. Add interferences to
       * avoid this situation.
       */
      util_dynarray_foreach(&blk->bundles, midgard_bundle, bundle) {
         midgard_instruction *instrs[2][4];
         unsigned instr_count[2] = {0, 0};

         /* Partition the bundle into the two parallel stages:
          * VMUL/SADD vs the rest */
         for (unsigned i = 0; i < bundle->instruction_count; i++) {
            if (bundle->instructions[i]->unit == UNIT_VMUL ||
                bundle->instructions[i]->unit == UNIT_SADD)
               instrs[0][instr_count[0]++] = bundle->instructions[i];
            else
               instrs[1][instr_count[1]++] = bundle->instructions[i];
         }

         /* Pairwise interference between destinations in the same stage */
         for (unsigned i = 0; i < ARRAY_SIZE(instr_count); i++) {
            for (unsigned j = 0; j < instr_count[i]; j++) {
               midgard_instruction *ins_a = instrs[i][j];

               if (ins_a->dest >= ctx->temp_count)
                  continue;

               for (unsigned k = j + 1; k < instr_count[i]; k++) {
                  midgard_instruction *ins_b = instrs[i][k];

                  if (ins_b->dest >= ctx->temp_count)
                     continue;

                  lcra_add_node_interference(l, ins_b->dest,
                                             mir_bytemask(ins_b), ins_a->dest,
                                             mir_bytemask(ins_a));
               }
            }
         }
      }

      /* Work on a private copy of live_out; we'll mutate it as we walk
       * backwards towards live_in */
      uint16_t *live =
         mem_dup(_blk->live_out, ctx->temp_count * sizeof(uint16_t));

      mir_foreach_instr_in_block_rev(blk, ins) {
         /* Mark all registers live after the instruction as
          * interfering with the destination */

         unsigned dest = ins->dest;

         if (dest < ctx->temp_count) {
            for (unsigned i = 0; i < ctx->temp_count; ++i) {
               if (live[i]) {
                  unsigned mask = mir_bytemask(ins);
                  lcra_add_node_interference(l, dest, mask, i, live[i]);
               }
            }
         }

         /* Add blend shader interference: blend shaders might
          * clobber r0-r3. */
         if (ins->compact_branch && ins->writeout) {
            for (unsigned i = 0; i < ctx->temp_count; ++i) {
               if (!live[i])
                  continue;

               for (unsigned j = 0; j < 4; j++) {
                  lcra_add_node_interference(l, ctx->temp_count + j, 0xFFFF, i,
                                             live[i]);
               }
            }
         }

         /* Update live_in */
         mir_liveness_ins_update(live, ins, ctx->temp_count);
      }

      free(live);
   }
}
446
447 static bool
mir_is_64(midgard_instruction * ins)448 mir_is_64(midgard_instruction *ins)
449 {
450 if (nir_alu_type_get_type_size(ins->dest_type) == 64)
451 return true;
452
453 mir_foreach_src(ins, v) {
454 if (nir_alu_type_get_type_size(ins->src_types[v]) == 64)
455 return true;
456 }
457
458 return false;
459 }
460
461 /*
462 * Determine if a shader needs a contiguous workgroup. This impacts register
463 * allocation. TODO: Optimize if barriers and local memory are unused.
464 */
465 static bool
needs_contiguous_workgroup(compiler_context * ctx)466 needs_contiguous_workgroup(compiler_context *ctx)
467 {
468 return gl_shader_stage_uses_workgroup(ctx->stage);
469 }
470
471 /*
472 * Determine an upper-bound on the number of threads in a workgroup. The GL
473 * driver reports 128 for the maximum number of threads (the minimum-maximum in
474 * OpenGL ES 3.1), so we pessimistically assume 128 threads for variable
475 * workgroups.
476 */
477 static unsigned
max_threads_per_workgroup(compiler_context * ctx)478 max_threads_per_workgroup(compiler_context *ctx)
479 {
480 if (ctx->nir->info.workgroup_size_variable) {
481 return 128;
482 } else {
483 return ctx->nir->info.workgroup_size[0] *
484 ctx->nir->info.workgroup_size[1] *
485 ctx->nir->info.workgroup_size[2];
486 }
487 }
488
489 /*
490 * Calculate the maximum number of work registers available to the shader.
491 * Architecturally, Midgard shaders may address up to 16 work registers, but
492 * various features impose other limits:
493 *
494 * 1. Blend shaders are limited to 8 registers by ABI.
495 * 2. If there are more than 8 register-mapped uniforms, then additional
496 * register-mapped uniforms use space that otherwise would be used for work
497 * registers.
498 * 3. If more than 4 registers are used, at most 128 threads may be spawned. If
499 * more than 8 registers are used, at most 64 threads may be spawned. These
500 * limits are architecturally visible in compute kernels that require an
501 * entire workgroup to be spawned at once (for barriers or local memory to
502 * work properly).
503 */
504 static unsigned
max_work_registers(compiler_context * ctx)505 max_work_registers(compiler_context *ctx)
506 {
507 if (ctx->inputs->is_blend)
508 return 8;
509
510 unsigned rmu_vec4 = ctx->info->push.count / 4;
511 unsigned max_work_registers = (rmu_vec4 >= 8) ? (24 - rmu_vec4) : 16;
512
513 if (needs_contiguous_workgroup(ctx)) {
514 unsigned threads = max_threads_per_workgroup(ctx);
515 assert(threads <= 128 && "maximum threads in ABI exceeded");
516
517 if (threads > 64)
518 max_work_registers = MIN2(max_work_registers, 8);
519 }
520
521 return max_work_registers;
522 }
523
/* This routine performs the actual register allocation. It should be succeeded
 * by install_registers. Builds LCRA constraints (class layout, alignment,
 * bounds, semantic classes, precoloured nodes, interference) and solves;
 * sets *spilled if the solve failed and a spill round is needed. */

static struct lcra_state *
allocate_registers(compiler_context *ctx, bool *spilled)
{
   int work_count = max_work_registers(ctx);

   /* No register allocation to do with no SSA */
   mir_compute_temp_count(ctx);
   if (!ctx->temp_count)
      return NULL;

   /* Initialize LCRA. Allocate extra node at the end for r1-r3 for
    * interference */

   struct lcra_state *l = lcra_alloc_equations(ctx->temp_count + 4, 5);
   unsigned node_r1 = ctx->temp_count + 1;

   /* Starts of classes, in bytes */
   l->class_start[REG_CLASS_WORK] = 16 * 0;
   l->class_start[REG_CLASS_LDST] = 16 * 26;
   l->class_start[REG_CLASS_TEXR] = 16 * 28;
   l->class_start[REG_CLASS_TEXW] = 16 * 28;

   l->class_size[REG_CLASS_WORK] = 16 * work_count;
   l->class_size[REG_CLASS_LDST] = 16 * 2;
   l->class_size[REG_CLASS_TEXR] = 16 * 2;
   l->class_size[REG_CLASS_TEXW] = 16 * 2;

   lcra_set_disjoint_class(l, REG_CLASS_TEXR, REG_CLASS_TEXW);

   /* To save space on T*20, we don't have real texture registers.
    * Instead, tex inputs reuse the load/store pipeline registers, and
    * tex outputs use work r0/r1. Note we still use TEXR/TEXW classes,
    * noting that this handles interferences and sizes correctly. */

   if (ctx->quirks & MIDGARD_INTERPIPE_REG_ALIASING) {
      l->class_start[REG_CLASS_TEXR] = l->class_start[REG_CLASS_LDST];
      l->class_start[REG_CLASS_TEXW] = l->class_start[REG_CLASS_WORK];
   }

   /* Per-node constraints gathered from the IR: byte size of the widest
    * write, log2 alignment, and bound (max extent in bytes) */
   unsigned *found_class = calloc(sizeof(unsigned), ctx->temp_count);
   unsigned *min_alignment = calloc(sizeof(unsigned), ctx->temp_count);
   unsigned *min_bound = calloc(sizeof(unsigned), ctx->temp_count);

   mir_foreach_instr_global(ctx, ins) {
      /* Swizzles of 32-bit sources on 64-bit instructions need to be
       * aligned to either bottom (xy) or top (zw). More general
       * swizzle lowering should happen prior to scheduling (TODO),
       * but once we get RA we shouldn't disrupt this further. Align
       * sources of 64-bit instructions. */

      if (ins->type == TAG_ALU_4 && mir_is_64(ins)) {
         mir_foreach_src(ins, v) {
            unsigned s = ins->src[v];

            if (s < ctx->temp_count)
               min_alignment[s] = MAX2(3, min_alignment[s]);
         }
      }

      /* Address sources need 4-byte (or 8-byte for 64-bit) alignment */
      if (ins->type == TAG_LOAD_STORE_4 && OP_HAS_ADDRESS(ins->op)) {
         mir_foreach_src(ins, v) {
            unsigned s = ins->src[v];
            unsigned size = nir_alu_type_get_type_size(ins->src_types[v]);

            if (s < ctx->temp_count)
               min_alignment[s] = MAX2((size == 64) ? 3 : 2, min_alignment[s]);
         }
      }

      /* Anything read as 16-bit needs proper alignment to ensure the
       * resulting code can be packed.
       */
      mir_foreach_src(ins, s) {
         unsigned src_size = nir_alu_type_get_type_size(ins->src_types[s]);
         if (src_size == 16 && ins->src[s] < SSA_FIXED_MINIMUM)
            min_bound[ins->src[s]] = MAX2(min_bound[ins->src[s]], 8);
      }

      /* Everything after this concerns only the destination, not the
       * sources.
       */
      if (ins->dest >= SSA_FIXED_MINIMUM)
         continue;

      unsigned size = nir_alu_type_get_type_size(ins->dest_type);

      if (ins->is_pack)
         size = 32;

      /* 0 for x, 1 for xy, 2 for xyz, 3 for xyzw */
      int comps1 = util_logbase2(ins->mask);

      int bytes = (comps1 + 1) * (size / 8);

      /* Use the largest class if there's ambiguity, this
       * handles partial writes */

      int dest = ins->dest;
      found_class[dest] = MAX2(found_class[dest], bytes);

      min_alignment[dest] =
         MAX2(min_alignment[dest], (size == 16) ? 1 : /* (1 << 1) = 2-byte */
                                   (size == 32) ? 2
                                                : /* (1 << 2) = 4-byte */
                                   (size == 64) ? 3
                                                : /* (1 << 3) = 8-byte */
                                   3); /* 8-bit todo */

      /* We can't cross xy/zw boundaries. TODO: vec8 can */
      if (size == 16 && min_alignment[dest] != 4)
         min_bound[dest] = 8;

      /* We don't have a swizzle for the conditional and we don't
       * want to muck with the conditional itself, so just force
       * alignment for now */

      if (ins->type == TAG_ALU_4 && OP_IS_CSEL_V(ins->op)) {
         min_alignment[dest] = 4; /* 1 << 4= 16-byte = vec4 */

         /* LCRA assumes bound >= alignment */
         min_bound[dest] = 16;
      }

      /* Since ld/st swizzles and masks are 32-bit only, we need them
       * aligned to enable final packing */
      if (ins->type == TAG_LOAD_STORE_4)
         min_alignment[dest] = MAX2(min_alignment[dest], 2);
   }

   /* Feed the gathered constraints to LCRA, with defaults of 4-byte
    * alignment and 16-byte bound */
   for (unsigned i = 0; i < ctx->temp_count; ++i) {
      lcra_set_alignment(l, i, min_alignment[i] ? min_alignment[i] : 2,
                         min_bound[i] ? min_bound[i] : 16);
      lcra_restrict_range(l, i, found_class[i]);
   }

   free(found_class);
   free(min_alignment);
   free(min_bound);

   /* Next, we'll determine semantic class. We default to zero (work).
    * But, if we're used with a special operation, that will force us to a
    * particular class. Each node must be assigned to exactly one class; a
    * prepass before RA should have lowered what-would-have-been
    * multiclass nodes into a series of moves to break it up into multiple
    * nodes (TODO) */

   mir_foreach_instr_global(ctx, ins) {
      /* Check if this operation imposes any classes */

      if (ins->type == TAG_LOAD_STORE_4) {
         set_class(l->class, ins->src[0], REG_CLASS_LDST);
         set_class(l->class, ins->src[1], REG_CLASS_LDST);
         set_class(l->class, ins->src[2], REG_CLASS_LDST);
         set_class(l->class, ins->src[3], REG_CLASS_LDST);

         if (OP_IS_VEC4_ONLY(ins->op)) {
            lcra_restrict_range(l, ins->dest, 16);
            lcra_restrict_range(l, ins->src[0], 16);
            lcra_restrict_range(l, ins->src[1], 16);
            lcra_restrict_range(l, ins->src[2], 16);
            lcra_restrict_range(l, ins->src[3], 16);
         }
      } else if (ins->type == TAG_TEXTURE_4) {
         set_class(l->class, ins->dest, REG_CLASS_TEXW);
         set_class(l->class, ins->src[0], REG_CLASS_TEXR);
         set_class(l->class, ins->src[1], REG_CLASS_TEXR);
         set_class(l->class, ins->src[2], REG_CLASS_TEXR);
         set_class(l->class, ins->src[3], REG_CLASS_TEXR);
      }
   }

   /* Check that the semantics of the class are respected */
   mir_foreach_instr_global(ctx, ins) {
      assert(check_write_class(l->class, ins->type, ins->dest));
      assert(check_read_class(l->class, ins->type, ins->src[0]));
      assert(check_read_class(l->class, ins->type, ins->src[1]));
      assert(check_read_class(l->class, ins->type, ins->src[2]));
      assert(check_read_class(l->class, ins->type, ins->src[3]));
   }

   /* Mark writeout to r0, depth to r1.x, stencil to r1.y,
    * render target to r1.z, unknown to r1.w */
   mir_foreach_instr_global(ctx, ins) {
      if (!(ins->compact_branch && ins->writeout))
         continue;

      if (ins->src[0] < ctx->temp_count)
         l->solutions[ins->src[0]] = 0;

      if (ins->src[2] < ctx->temp_count)
         l->solutions[ins->src[2]] = (16 * 1) + COMPONENT_X * 4;

      if (ins->src[3] < ctx->temp_count)
         l->solutions[ins->src[3]] = (16 * 1) + COMPONENT_Y * 4;

      if (ins->src[1] < ctx->temp_count)
         l->solutions[ins->src[1]] = (16 * 1) + COMPONENT_Z * 4;

      if (ins->dest < ctx->temp_count)
         l->solutions[ins->dest] = (16 * 1) + COMPONENT_W * 4;
   }

   /* Destinations of instructions in a writeout block cannot be assigned
    * to r1 unless they are actually used as r1 from the writeout itself,
    * since the writes to r1 are special. A code sequence like:
    *
    *      sadd.fmov r1.x, [...]
    *      vadd.fadd r0, r1, r2
    *      [writeout branch]
    *
    * will misbehave since the r1.x write will be interpreted as a
    * gl_FragDepth write so it won't show up correctly when r1 is read in
    * the following segment. We model this as interference.
    */

   /* Precolour the four extra nodes onto r0-r3 */
   for (unsigned i = 0; i < 4; ++i)
      l->solutions[ctx->temp_count + i] = (16 * i);

   mir_foreach_block(ctx, _blk) {
      midgard_block *blk = (midgard_block *)_blk;

      mir_foreach_bundle_in_block(blk, v) {
         /* We need at least a writeout and nonwriteout instruction */
         if (v->instruction_count < 2)
            continue;

         /* Branches always come at the end */
         midgard_instruction *br = v->instructions[v->instruction_count - 1];

         if (!br->writeout)
            continue;

         for (signed i = v->instruction_count - 2; i >= 0; --i) {
            midgard_instruction *ins = v->instructions[i];

            if (ins->dest >= ctx->temp_count)
               continue;

            bool used_as_r1 = (br->dest == ins->dest);

            mir_foreach_src(br, s)
               used_as_r1 |= (s > 0) && (br->src[s] == ins->dest);

            if (!used_as_r1)
               lcra_add_node_interference(l, ins->dest, mir_bytemask(ins),
                                          node_r1, 0xFFFF);
         }
      }
   }

   /* Precolour blend input to r0. Note writeout is necessarily at the end
    * and blend shaders are single-RT only so there is only a single
    * writeout block, so this cannot conflict with the writeout r0 (there
    * is no need to have an intermediate move) */

   if (ctx->blend_input != ~0) {
      assert(ctx->blend_input < ctx->temp_count);
      l->solutions[ctx->blend_input] = 0;
   }

   /* Same for the dual-source blend input/output, except here we use r2,
    * which is also set in the fragment shader. */

   if (ctx->blend_src1 != ~0) {
      assert(ctx->blend_src1 < ctx->temp_count);
      l->solutions[ctx->blend_src1] = (16 * 2);
      ctx->info->work_reg_count = MAX2(ctx->info->work_reg_count, 3);
   }

   mir_compute_interference(ctx, l);

   *spilled = !lcra_solve(l);
   return l;
}
801
/* Once registers have been decided via register allocation
 * (allocate_registers), we need to rewrite the MIR to use registers instead of
 * indices. This rewrites one instruction: sources/destination become fixed
 * registers and swizzles/masks are adjusted for sub-register packing. */

static void
install_registers_instr(compiler_context *ctx, struct lcra_state *l,
                        midgard_instruction *ins)
{
   /* Per-source component shifts, derived from each source's type size */
   unsigned src_shift[MIR_SRC_COUNT];

   for (unsigned i = 0; i < MIR_SRC_COUNT; ++i) {
      src_shift[i] =
         util_logbase2(nir_alu_type_get_type_size(ins->src_types[i]) / 8);
   }

   unsigned dest_shift =
      util_logbase2(nir_alu_type_get_type_size(ins->dest_type) / 8);

   switch (ins->type) {
   case TAG_ALU_4:
   case TAG_ALU_8:
   case TAG_ALU_12:
   case TAG_ALU_16: {
      /* Compact branches have no registers to install */
      if (ins->compact_branch)
         return;

      struct phys_reg src1 = index_to_reg(ctx, l, ins->src[0], src_shift[0]);
      struct phys_reg src2 = index_to_reg(ctx, l, ins->src[1], src_shift[1]);
      struct phys_reg dest = index_to_reg(ctx, l, ins->dest, dest_shift);

      /* Shift the write mask to where the value was packed */
      mir_set_bytemask(ins, mir_bytemask(ins) << dest.offset);

      /* Ops with a fixed channel count ignore the destination offset */
      unsigned dest_offset =
         GET_CHANNEL_COUNT(alu_opcode_props[ins->op].props) ? 0 : dest.offset;

      offset_swizzle(ins->swizzle[0], src1.offset, src1.shift, dest.shift,
                     dest_offset);
      if (!ins->has_inline_constant)
         offset_swizzle(ins->swizzle[1], src2.offset, src2.shift, dest.shift,
                        dest_offset);
      if (ins->src[0] != ~0)
         ins->src[0] = SSA_FIXED_REGISTER(src1.reg);
      if (ins->src[1] != ~0)
         ins->src[1] = SSA_FIXED_REGISTER(src2.reg);
      if (ins->dest != ~0)
         ins->dest = SSA_FIXED_REGISTER(dest.reg);
      break;
   }

   case TAG_LOAD_STORE_4: {
      /* Which physical register we read off depends on
       * whether we are loading or storing -- think about the
       * logical dataflow */

      bool encodes_src = OP_IS_STORE(ins->op);

      if (encodes_src) {
         struct phys_reg src = index_to_reg(ctx, l, ins->src[0], src_shift[0]);
         assert(src.reg == 26 || src.reg == 27);

         ins->src[0] = SSA_FIXED_REGISTER(src.reg);
         offset_swizzle(ins->swizzle[0], src.offset, src.shift, 0, 0);
      } else {
         struct phys_reg dst = index_to_reg(ctx, l, ins->dest, dest_shift);

         ins->dest = SSA_FIXED_REGISTER(dst.reg);
         offset_swizzle(ins->swizzle[0], 0, 2, dest_shift, dst.offset);
         mir_set_bytemask(ins, mir_bytemask(ins) << dst.offset);
      }

      /* We also follow up by actual arguments */

      for (int i = 1; i <= 3; i++) {
         unsigned src_index = ins->src[i];
         if (src_index != ~0) {
            struct phys_reg src = index_to_reg(ctx, l, src_index, src_shift[i]);
            /* Argument swizzles are scalar: just bump the first component */
            unsigned component = src.offset >> src.shift;
            assert(component << src.shift == src.offset);
            ins->src[i] = SSA_FIXED_REGISTER(src.reg);
            ins->swizzle[i][0] += component;
         }
      }

      break;
   }

   case TAG_TEXTURE_4: {
      /* Barriers carry no operands to rewrite */
      if (ins->op == midgard_tex_op_barrier)
         break;

      /* Grab RA results */
      struct phys_reg dest = index_to_reg(ctx, l, ins->dest, dest_shift);
      struct phys_reg coord = index_to_reg(ctx, l, ins->src[1], src_shift[1]);
      struct phys_reg lod = index_to_reg(ctx, l, ins->src[2], src_shift[2]);
      struct phys_reg offset = index_to_reg(ctx, l, ins->src[3], src_shift[3]);

      /* First, install the texture coordinate */
      if (ins->src[1] != ~0)
         ins->src[1] = SSA_FIXED_REGISTER(coord.reg);
      offset_swizzle(ins->swizzle[1], coord.offset, coord.shift, dest.shift, 0);

      /* Next, install the destination */
      if (ins->dest != ~0)
         ins->dest = SSA_FIXED_REGISTER(dest.reg);
      offset_swizzle(ins->swizzle[0], 0, 2, dest.shift,
                     dest_shift == 1 ? dest.offset % 8 : dest.offset);
      mir_set_bytemask(ins, mir_bytemask(ins) << dest.offset);

      /* If there is a register LOD/bias, use it */
      if (ins->src[2] != ~0) {
         assert(!(lod.offset & 3));
         ins->src[2] = SSA_FIXED_REGISTER(lod.reg);
         ins->swizzle[2][0] = lod.offset / 4;
      }

      /* If there is an offset register, install it */
      if (ins->src[3] != ~0) {
         ins->src[3] = SSA_FIXED_REGISTER(offset.reg);
         ins->swizzle[3][0] = offset.offset / 4;
      }

      break;
   }

   default:
      break;
   }
}
930
931 static void
install_registers(compiler_context * ctx,struct lcra_state * l)932 install_registers(compiler_context *ctx, struct lcra_state *l)
933 {
934 mir_foreach_instr_global(ctx, ins)
935 install_registers_instr(ctx, l, ins);
936 }
937
938 /* If register allocation fails, find the best spill node */
939
940 static signed
mir_choose_spill_node(compiler_context * ctx,struct lcra_state * l)941 mir_choose_spill_node(compiler_context *ctx, struct lcra_state *l)
942 {
943 /* We can't spill a previously spilled value or an unspill */
944
945 mir_foreach_instr_global(ctx, ins) {
946 if (ins->no_spill & (1 << l->spill_class)) {
947 lcra_set_node_spill_cost(l, ins->dest, -1);
948
949 if (l->spill_class != REG_CLASS_WORK) {
950 mir_foreach_src(ins, s)
951 lcra_set_node_spill_cost(l, ins->src[s], -1);
952 }
953 }
954 }
955
956 return lcra_get_best_spill_node(l);
957 }
958
959 /* Once we've chosen a spill node, spill it */
960
static void
mir_spill_register(compiler_context *ctx, unsigned spill_node,
                   unsigned spill_class, unsigned *spill_count)
{
   if (spill_class == REG_CLASS_WORK && ctx->inputs->is_blend)
      unreachable("Blend shader spilling is currently unimplemented");

   /* New temporaries introduced by spilling are numbered from here */
   unsigned spill_index = ctx->temp_count;

   /* We have a spill node, so check the class. Work registers
    * legitimately spill to TLS, but special registers just spill to work
    * registers */

   bool is_special = spill_class != REG_CLASS_WORK;
   bool is_special_w = spill_class == REG_CLASS_TEXW;

   /* Allocate TLS slot (maybe) */
   unsigned spill_slot = !is_special ? (*spill_count)++ : 0;

   /* For special reads, figure out how many bytes we need */
   unsigned read_bytemask = 0;

   /* If multiple instructions write to this destination, we'll have to
    * fill from TLS before writing */
   unsigned write_count = 0;

   /* Single pre-pass: gather the union of read components and count the
    * writers of the spilled node */
   mir_foreach_instr_global_safe(ctx, ins) {
      read_bytemask |= mir_bytemask_of_read_components(ins, spill_node);
      if (ins->dest == spill_node)
         ++write_count;
   }

   /* For TLS, replace all stores to the spilled node. For
    * special reads, just keep as-is; the class will be demoted
    * implicitly. For special writes, spill to a work register */

   if (!is_special || is_special_w) {
      if (is_special_w)
         spill_slot = spill_index++;

      /* Per-bundle state so that multiple writers scheduled into the same
       * bundle share one fill and one store instead of emitting
       * duplicates */
      unsigned last_id = ~0;
      unsigned last_fill = ~0;
      unsigned last_spill_index = ~0;
      midgard_instruction *last_spill = NULL;

      mir_foreach_block(ctx, _block) {
         midgard_block *block = (midgard_block *)_block;
         mir_foreach_instr_in_block_safe(block, ins) {
            /* Only writers of the spilled node are rewritten here */
            if (ins->dest != spill_node)
               continue;

            /* Note: it's important to match the mask of the spill
             * with the mask of the instruction whose destination
             * we're spilling, or otherwise we'll read invalid
             * components and can fail RA in a subsequent iteration
             */

            if (is_special_w) {
               /* Special write: copy the result into a work register
                * right after the producing instruction */
               midgard_instruction st = v_mov(spill_node, spill_slot);
               st.no_spill |= (1 << spill_class);
               st.mask = ins->mask;
               st.dest_type = st.src_types[1] = ins->dest_type;

               /* Hint: don't rewrite this node */
               st.hint = true;

               mir_insert_instruction_after_scheduled(ctx, block, ins, st);
            } else {
               /* TLS spill: redirect the write to a fresh temporary,
                * reusing the temporary of a previous write in the same
                * bundle */
               unsigned bundle = ins->bundle_id;
               unsigned dest =
                  (bundle == last_id) ? last_spill_index : spill_index++;

               unsigned bytemask = mir_bytemask(ins);
               unsigned write_mask =
                  mir_from_bytemask(mir_round_bytemask_up(bytemask, 32), 32);

               /* Partial write with multiple writers: fill the full vec4
                * from TLS first so untouched components stay valid; at
                * most one fill per bundle */
               if (write_count > 1 && bytemask != 0xFFFF &&
                   bundle != last_fill) {
                  midgard_instruction read =
                     v_load_store_scratch(dest, spill_slot, false, 0xF);
                  mir_insert_instruction_before_scheduled(ctx, block, ins,
                                                          read);
                  write_mask = 0xF;
                  last_fill = bundle;
               }

               ins->dest = dest;
               ins->no_spill |= (1 << spill_class);

               bool move = false;

               /* In the same bundle, reads of the destination
                * of the spilt instruction need to be direct */
               midgard_instruction *it = ins;
               while ((it = list_first_entry(&it->link, midgard_instruction,
                                             link)) &&
                      (it->bundle_id == bundle)) {

                  if (!mir_has_arg(it, spill_node))
                     continue;

                  mir_rewrite_index_src_single(it, spill_node, dest);

                  /* The spilt instruction will write to
                   * a work register for `it` to read but
                   * the spill needs an LD/ST register */
                  move = true;
               }

               if (move)
                  dest = spill_index++;

               /* Same bundle as the previous store: widen that store's
                * mask instead of emitting a second one */
               if (last_id == bundle) {
                  last_spill->mask |= write_mask;
                  u_foreach_bit(c, write_mask)
                     last_spill->swizzle[0][c] = c;
               } else {
                  midgard_instruction st =
                     v_load_store_scratch(dest, spill_slot, true, write_mask);
                  last_spill = mir_insert_instruction_after_scheduled(
                     ctx, block, ins, st);
               }

               /* Bridge the work-register result into the LD/ST source */
               if (move) {
                  midgard_instruction mv = v_mov(ins->dest, dest);
                  mv.no_spill |= (1 << spill_class);

                  mir_insert_instruction_after_scheduled(ctx, block, ins, mv);
               }

               last_id = bundle;
               last_spill_index = ins->dest;
            }

            if (!is_special)
               ctx->spills++;
         }
      }
   }

   /* Insert a load from TLS before the first consecutive
    * use of the node, rewriting to use spilled indices to
    * break up the live range. Or, for special, insert a
    * move. Ironically the latter *increases* register
    * pressure, but the two uses of the spilling mechanism
    * are somewhat orthogonal. (special spilling is to use
    * work registers to back special registers; TLS
    * spilling is to use memory to back work registers) */

   mir_foreach_block(ctx, _block) {
      midgard_block *block = (midgard_block *)_block;
      mir_foreach_instr_in_block(block, ins) {
         /* We can't rewrite the moves used to spill in the
          * first place. These moves are hinted. */
         if (ins->hint)
            continue;

         /* If we don't use the spilled value, nothing to do */
         if (!mir_has_arg(ins, spill_node))
            continue;

         unsigned index = 0;

         if (!is_special_w) {
            /* Fill into a fresh temporary so each use range is
             * independent */
            index = ++spill_index;

            midgard_instruction *before = ins;
            midgard_instruction st;

            if (is_special) {
               /* Move */
               st = v_mov(spill_node, index);
               st.no_spill |= (1 << spill_class);
            } else {
               /* TLS load */
               st = v_load_store_scratch(index, spill_slot, false, 0xF);
            }

            /* Mask the load based on the component count
             * actually needed to prevent RA loops */

            st.mask =
               mir_from_bytemask(mir_round_bytemask_up(read_bytemask, 32), 32);

            mir_insert_instruction_before_scheduled(ctx, block, before, st);
         } else {
            /* Special writes already have their move spilled in */
            index = spill_slot;
         }

         /* Rewrite to use */
         mir_rewrite_index_src_single(ins, spill_node, index);

         if (!is_special)
            ctx->fills++;
      }
   }

   /* Reset hints */

   mir_foreach_instr_global(ctx, ins) {
      ins->hint = false;
   }
}
1165
1166 static void
mir_demote_uniforms(compiler_context * ctx,unsigned new_cutoff)1167 mir_demote_uniforms(compiler_context *ctx, unsigned new_cutoff)
1168 {
1169 unsigned uniforms = ctx->info->push.count / 4;
1170 unsigned old_work_count = 16 - MAX2(uniforms - 8, 0);
1171 unsigned work_count = 16 - MAX2((new_cutoff - 8), 0);
1172
1173 unsigned min_demote = SSA_FIXED_REGISTER(old_work_count);
1174 unsigned max_demote = SSA_FIXED_REGISTER(work_count);
1175
1176 mir_foreach_block(ctx, _block) {
1177 midgard_block *block = (midgard_block *)_block;
1178 mir_foreach_instr_in_block(block, ins) {
1179 mir_foreach_src(ins, i) {
1180 if (ins->src[i] < min_demote || ins->src[i] >= max_demote)
1181 continue;
1182
1183 midgard_instruction *before = ins;
1184
1185 unsigned temp = make_compiler_temp(ctx);
1186 unsigned idx = (23 - SSA_REG_FROM_FIXED(ins->src[i])) * 4;
1187 assert(idx < ctx->info->push.count);
1188
1189 ctx->ubo_mask |= BITSET_BIT(ctx->info->push.words[idx].ubo);
1190
1191 midgard_instruction ld = {
1192 .type = TAG_LOAD_STORE_4,
1193 .mask = 0xF,
1194 .dest = temp,
1195 .dest_type = ins->src_types[i],
1196 .src = {~0, ~0, ~0, ~0},
1197 .swizzle = SWIZZLE_IDENTITY_4,
1198 .op = midgard_op_ld_ubo_128,
1199 .load_store =
1200 {
1201 .index_reg = REGISTER_LDST_ZERO,
1202 },
1203 .constants.u32[0] = ctx->info->push.words[idx].offset,
1204 };
1205
1206 midgard_pack_ubo_index_imm(&ld.load_store,
1207 ctx->info->push.words[idx].ubo);
1208
1209 mir_insert_instruction_before_scheduled(ctx, block, before, ld);
1210
1211 mir_rewrite_index_src_single(ins, ins->src[i], temp);
1212 }
1213 }
1214 }
1215
1216 ctx->info->push.count = MIN2(ctx->info->push.count, new_cutoff * 4);
1217 }
1218
1219 /* Run register allocation in a loop, spilling until we succeed */
1220
1221 void
mir_ra(compiler_context * ctx)1222 mir_ra(compiler_context *ctx)
1223 {
1224 struct lcra_state *l = NULL;
1225 bool spilled = false;
1226 int iter_count = 1000; /* max iterations */
1227
1228 /* Number of 128-bit slots in memory we've spilled into */
1229 unsigned spill_count = DIV_ROUND_UP(ctx->info->tls_size, 16);
1230
1231 mir_create_pipeline_registers(ctx);
1232
1233 do {
1234 if (spilled) {
1235 signed spill_node = mir_choose_spill_node(ctx, l);
1236 unsigned uniforms = ctx->info->push.count / 4;
1237
1238 /* It's a lot cheaper to demote uniforms to get more
1239 * work registers than to spill to TLS. */
1240 if (l->spill_class == REG_CLASS_WORK && uniforms > 8) {
1241 mir_demote_uniforms(ctx, MAX2(uniforms - 4, 8));
1242 } else if (spill_node == -1) {
1243 fprintf(stderr, "ERROR: Failed to choose spill node\n");
1244 lcra_free(l);
1245 return;
1246 } else {
1247 mir_spill_register(ctx, spill_node, l->spill_class, &spill_count);
1248 }
1249 }
1250
1251 mir_squeeze_index(ctx);
1252 mir_invalidate_liveness(ctx);
1253
1254 if (l) {
1255 lcra_free(l);
1256 l = NULL;
1257 }
1258
1259 l = allocate_registers(ctx, &spilled);
1260 } while (spilled && ((iter_count--) > 0));
1261
1262 if (iter_count <= 0) {
1263 fprintf(
1264 stderr,
1265 "panfrost: Gave up allocating registers, rendering will be incomplete\n");
1266 assert(0);
1267 }
1268
1269 /* Report spilling information. spill_count is in 128-bit slots (vec4 x
1270 * fp32), but tls_size is in bytes, so multiply by 16 */
1271
1272 ctx->info->tls_size = spill_count * 16;
1273
1274 install_registers(ctx, l);
1275
1276 lcra_free(l);
1277 }
1278