1 /*
2  * Copyright (C) 2018-2019 Alyssa Rosenzweig <[email protected]>
3  * Copyright (C) 2019 Collabora, Ltd.
4  *
5  * Permission is hereby granted, free of charge, to any person obtaining a
6  * copy of this software and associated documentation files (the "Software"),
7  * to deal in the Software without restriction, including without limitation
8  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9  * and/or sell copies of the Software, and to permit persons to whom the
10  * Software is furnished to do so, subject to the following conditions:
11  *
12  * The above copyright notice and this permission notice (including the next
13  * paragraph) shall be included in all copies or substantial portions of the
14  * Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22  * SOFTWARE.
23  */
24 
25 #include "util/u_math.h"
26 #include "util/u_memory.h"
27 #include "compiler.h"
28 #include "midgard_ops.h"
29 #include "midgard_quirks.h"
30 
31 struct phys_reg {
32    /* Physical register: 0-31 */
33    unsigned reg;
34 
35    /* Byte offset into the physical register: 0-15 */
36    unsigned offset;
37 
38    /* log2(bytes per component) for fast mul/div */
39    unsigned shift;
40 };
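
/* Example (illustrative values): a 16-bit value allocated to the top half of
 * r3 would be described as { .reg = 3, .offset = 8, .shift = 1 }. */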
41 
42 /* Shift up by reg_offset and horizontally by dst_offset. */
43 
44 static void
45 offset_swizzle(unsigned *swizzle, unsigned reg_offset, unsigned srcshift,
46                unsigned dstshift, unsigned dst_offset)
47 {
48    unsigned out[MIR_VEC_COMPONENTS];
49 
50    signed reg_comp = reg_offset >> srcshift;
51    signed dst_comp = dst_offset >> dstshift;
52 
53    unsigned max_component = (16 >> srcshift) - 1;
54 
55    assert(reg_comp << srcshift == reg_offset);
56    assert(dst_comp << dstshift == dst_offset);
57 
58    for (signed c = 0; c < MIR_VEC_COMPONENTS; ++c) {
59       signed comp = MAX2(c - dst_comp, 0);
60       out[c] = MIN2(swizzle[comp] + reg_comp, max_component);
61    }
62 
63    memcpy(swizzle, out, sizeof(out));
64 }
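
/* Worked example (illustrative): a 32-bit source (srcshift = 2) that RA placed
 * at byte offset 8 of its register (reg_offset = 8), with dst_offset = 0, gives
 * reg_comp = 2 and max_component = 3, so an identity xyzw swizzle becomes zwww:
 * component reads shift up to the allocated half, clamped at the register end. */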
65 
66 /* Helper to return the default phys_reg for a given register */
67 
68 static struct phys_reg
69 default_phys_reg(int reg, unsigned shift)
70 {
71    struct phys_reg r = {
72       .reg = reg,
73       .offset = 0,
74       .shift = shift,
75    };
76 
77    return r;
78 }
79 
80 /* Determine which physical register, swizzle, and mask a virtual
81  * register corresponds to */
82 
83 static struct phys_reg
84 index_to_reg(compiler_context *ctx, struct lcra_state *l, unsigned reg,
85              unsigned shift)
86 {
87    /* Check for special cases */
88    if (reg == ~0)
89       return default_phys_reg(REGISTER_UNUSED, shift);
90    else if (reg >= SSA_FIXED_MINIMUM)
91       return default_phys_reg(SSA_REG_FROM_FIXED(reg), shift);
92    else if (!l)
93       return default_phys_reg(REGISTER_UNUSED, shift);
94 
95    struct phys_reg r = {
96       .reg = l->solutions[reg] / 16,
97       .offset = l->solutions[reg] & 0xF,
98       .shift = shift,
99    };
100 
101    /* Report that we actually use this register, and return it */
102 
103    if (r.reg < 16)
104       ctx->info->work_reg_count = MAX2(ctx->info->work_reg_count, r.reg + 1);
105 
106    return r;
107 }
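
/* For instance, an LCRA solution of 52 decodes to r3 (52 / 16) at byte offset
 * 4 (52 & 0xF), i.e. the .y component of a 32-bit vector (illustrative). */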
108 
109 static void
110 set_class(unsigned *classes, unsigned node, unsigned class)
111 {
112    if (node < SSA_FIXED_MINIMUM && class != classes[node]) {
113       assert(classes[node] == REG_CLASS_WORK);
114       classes[node] = class;
115    }
116 }
117 
118 /* Special register classes impose special constraints on who can read their
119  * values, so check that */
120 
121 static bool ASSERTED
122 check_read_class(unsigned *classes, unsigned tag, unsigned node)
123 {
124    /* Non-nodes are implicitly ok */
125    if (node >= SSA_FIXED_MINIMUM)
126       return true;
127 
128    switch (classes[node]) {
129    case REG_CLASS_LDST:
130       return (tag == TAG_LOAD_STORE_4);
131    case REG_CLASS_TEXR:
132       return (tag == TAG_TEXTURE_4);
133    case REG_CLASS_TEXW:
134       return (tag != TAG_LOAD_STORE_4);
135    case REG_CLASS_WORK:
136       return IS_ALU(tag);
137    default:
138       unreachable("Invalid class");
139    }
140 }
141 
142 static bool ASSERTED
143 check_write_class(unsigned *classes, unsigned tag, unsigned node)
144 {
145    /* Non-nodes are implicitly ok */
146    if (node >= SSA_FIXED_MINIMUM)
147       return true;
148 
149    switch (classes[node]) {
150    case REG_CLASS_TEXR:
151       return true;
152    case REG_CLASS_TEXW:
153       return (tag == TAG_TEXTURE_4);
154    case REG_CLASS_LDST:
155    case REG_CLASS_WORK:
156       return IS_ALU(tag) || (tag == TAG_LOAD_STORE_4);
157    default:
158       unreachable("Invalid class");
159    }
160 }
161 
162 /* Prepass before RA to ensure special class restrictions are met. The idea is
163  * to create a bit field of types of instructions that read a particular index.
164  * Later, we'll add moves as appropriate and rewrite to specialize by type. */
165 
166 static void
167 mark_node_class(unsigned *bitfield, unsigned node)
168 {
169    if (node < SSA_FIXED_MINIMUM)
170       BITSET_SET(bitfield, node);
171 }
172 
173 void
174 mir_lower_special_reads(compiler_context *ctx)
175 {
176    mir_compute_temp_count(ctx);
177    size_t sz = BITSET_WORDS(ctx->temp_count) * sizeof(BITSET_WORD);
178 
179    /* Bitfields for the various types of registers we could have. aluw can
180     * be written by either ALU or load/store */
181 
182    unsigned *alur = calloc(sz, 1);
183    unsigned *aluw = calloc(sz, 1);
184    unsigned *brar = calloc(sz, 1);
185    unsigned *ldst = calloc(sz, 1);
186    unsigned *texr = calloc(sz, 1);
187    unsigned *texw = calloc(sz, 1);
188 
189    /* Pass #1 is analysis, a linear scan to fill out the bitfields */
190 
191    mir_foreach_instr_global(ctx, ins) {
192       switch (ins->type) {
193       case TAG_ALU_4:
194          mark_node_class(aluw, ins->dest);
195          mark_node_class(alur, ins->src[0]);
196          mark_node_class(alur, ins->src[1]);
197          mark_node_class(alur, ins->src[2]);
198 
199          if (ins->compact_branch && ins->writeout)
200             mark_node_class(brar, ins->src[0]);
201 
202          break;
203 
204       case TAG_LOAD_STORE_4:
205          mark_node_class(aluw, ins->dest);
206          mark_node_class(ldst, ins->src[0]);
207          mark_node_class(ldst, ins->src[1]);
208          mark_node_class(ldst, ins->src[2]);
209          mark_node_class(ldst, ins->src[3]);
210          break;
211 
212       case TAG_TEXTURE_4:
213          mark_node_class(texr, ins->src[0]);
214          mark_node_class(texr, ins->src[1]);
215          mark_node_class(texr, ins->src[2]);
216          mark_node_class(texw, ins->dest);
217          break;
218 
219       default:
220          break;
221       }
222    }
223 
224    /* Pass #2 is lowering now that we've analyzed all the classes.
225     * Conceptually, if an index is only marked for a single type of use,
226     * there is nothing to lower. If it is marked for different uses, we
227     * split up based on the number of types of uses. To do so, we divide
228     * into N distinct classes of use (where N>1 by definition), emit N-1
229     * moves from the index to copies of the index, and finally rewrite N-1
230     * of the types of uses to use the corresponding move */
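   /* Roughly: if an index is read by both ALU and texture instructions, the
    * original index stays as the work (ALU) copy, a mov into a fresh index is
    * inserted before each texture read, and those reads are rewritten to use
    * the copy. */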
231 
232    unsigned spill_idx = ctx->temp_count;
233 
234    for (unsigned i = 0; i < ctx->temp_count; ++i) {
235       bool is_alur = BITSET_TEST(alur, i);
236       bool is_aluw = BITSET_TEST(aluw, i);
237       bool is_brar = BITSET_TEST(brar, i);
238       bool is_ldst = BITSET_TEST(ldst, i);
239       bool is_texr = BITSET_TEST(texr, i);
240       bool is_texw = BITSET_TEST(texw, i);
241 
242       /* Analyse to check how many distinct uses there are. ALU ops
243        * (alur) can read the results of the texture pipeline (texw)
244        * but not ldst or texr. Load/store ops (ldst) cannot read
245        * anything but load/store inputs. Texture pipeline cannot read
246        * anything but texture inputs. TODO: Simplify.  */
247 
248       bool collision = (is_alur && (is_ldst || is_texr)) ||
249                        (is_ldst && (is_alur || is_texr || is_texw)) ||
250                        (is_texr && (is_alur || is_ldst || is_texw)) ||
251                        (is_texw && (is_aluw || is_ldst || is_texr)) ||
252                        (is_brar && is_texw);
253 
254       if (!collision)
255          continue;
256 
257       /* Use the index as-is as the work copy. Emit copies for
258        * special uses */
259 
260       unsigned classes[] = {TAG_LOAD_STORE_4, TAG_TEXTURE_4, TAG_TEXTURE_4,
261                             TAG_ALU_4};
262       bool collisions[] = {is_ldst, is_texr, is_texw && is_aluw, is_brar};
263 
264       for (unsigned j = 0; j < ARRAY_SIZE(collisions); ++j) {
265          if (!collisions[j])
266             continue;
267 
268          /* When the hazard is from reading, we move and rewrite
269           * sources (typical case). When it's from writing, we
270           * flip the move and rewrite destinations (obscure,
271           * only from control flow -- impossible in SSA) */
272 
273          bool hazard_write = (j == 2);
274 
275          unsigned idx = spill_idx++;
276 
277          /* Insert move before each read/write, depending on the
278           * hazard we're trying to account for */
279 
280          mir_foreach_block(ctx, block_) {
281             midgard_block *block = (midgard_block *)block_;
282             midgard_instruction *mov = NULL;
283 
284             mir_foreach_instr_in_block_safe(block, pre_use) {
285                if (pre_use->type != classes[j])
286                   continue;
287 
288                if (hazard_write) {
289                   if (pre_use->dest != i)
290                      continue;
291 
292                   midgard_instruction m = v_mov(idx, i);
293                   m.dest_type = pre_use->dest_type;
294                   m.src_types[1] = m.dest_type;
295                   m.mask = pre_use->mask;
296 
297                   midgard_instruction *use = mir_next_op(pre_use);
298                   assert(use);
299                   mir_insert_instruction_before(ctx, use, m);
300                   mir_rewrite_index_dst_single(pre_use, i, idx);
301                } else {
302                   if (!mir_has_arg(pre_use, i))
303                      continue;
304 
305                   unsigned mask = mir_from_bytemask(
306                      mir_round_bytemask_up(
307                         mir_bytemask_of_read_components(pre_use, i), 32),
308                      32);
309 
310                   if (mov == NULL || !mir_is_ssa(i)) {
311                      midgard_instruction m = v_mov(i, spill_idx++);
312                      m.mask = mask;
313                      mov = mir_insert_instruction_before(ctx, pre_use, m);
314                   } else {
315                      mov->mask |= mask;
316                   }
317 
318                   mir_rewrite_index_src_single(pre_use, i, mov->dest);
319                }
320             }
321          }
322       }
323    }
324 
325    free(alur);
326    free(aluw);
327    free(brar);
328    free(ldst);
329    free(texr);
330    free(texw);
331 }
332 
333 static void
334 mir_compute_interference(compiler_context *ctx, struct lcra_state *l)
335 {
336    /* First, we need liveness information to be computed per block */
337    mir_compute_liveness(ctx);
338 
339    /* We need to force r1.w live throughout a blend shader */
340 
341    if (ctx->inputs->is_blend) {
342       unsigned r1w = ~0;
343 
344       mir_foreach_block(ctx, _block) {
345          midgard_block *block = (midgard_block *)_block;
346          mir_foreach_instr_in_block_rev(block, ins) {
347             if (ins->writeout)
348                r1w = ins->dest;
349          }
350 
351          if (r1w != ~0)
352             break;
353       }
354 
355       mir_foreach_instr_global(ctx, ins) {
356          if (ins->dest < ctx->temp_count)
357             lcra_add_node_interference(l, ins->dest, mir_bytemask(ins), r1w,
358                                        0xF);
359       }
360    }
361 
362    /* Now that every block has live_in/live_out computed, we can determine
363     * interference by walking each block linearly. Take live_out at the
364     * end of each block and walk the block backwards. */
365 
366    mir_foreach_block(ctx, _blk) {
367       midgard_block *blk = (midgard_block *)_blk;
368 
369       /* The scalar and vector units run in parallel. We need to make
370        * sure they don't write to same portion of the register file
371        * otherwise the result is undefined. Add interferences to
372        * avoid this situation.
373        */
374       util_dynarray_foreach(&blk->bundles, midgard_bundle, bundle) {
375          midgard_instruction *instrs[2][4];
376          unsigned instr_count[2] = {0, 0};
377 
378          for (unsigned i = 0; i < bundle->instruction_count; i++) {
379             if (bundle->instructions[i]->unit == UNIT_VMUL ||
380                 bundle->instructions[i]->unit == UNIT_SADD)
381                instrs[0][instr_count[0]++] = bundle->instructions[i];
382             else
383                instrs[1][instr_count[1]++] = bundle->instructions[i];
384          }
385 
386          for (unsigned i = 0; i < ARRAY_SIZE(instr_count); i++) {
387             for (unsigned j = 0; j < instr_count[i]; j++) {
388                midgard_instruction *ins_a = instrs[i][j];
389 
390                if (ins_a->dest >= ctx->temp_count)
391                   continue;
392 
393                for (unsigned k = j + 1; k < instr_count[i]; k++) {
394                   midgard_instruction *ins_b = instrs[i][k];
395 
396                   if (ins_b->dest >= ctx->temp_count)
397                      continue;
398 
399                   lcra_add_node_interference(l, ins_b->dest,
400                                              mir_bytemask(ins_b), ins_a->dest,
401                                              mir_bytemask(ins_a));
402                }
403             }
404          }
405       }
406 
407       uint16_t *live =
408          mem_dup(_blk->live_out, ctx->temp_count * sizeof(uint16_t));
409 
410       mir_foreach_instr_in_block_rev(blk, ins) {
411          /* Mark all registers live after the instruction as
412           * interfering with the destination */
413 
414          unsigned dest = ins->dest;
415 
416          if (dest < ctx->temp_count) {
417             for (unsigned i = 0; i < ctx->temp_count; ++i) {
418                if (live[i]) {
419                   unsigned mask = mir_bytemask(ins);
420                   lcra_add_node_interference(l, dest, mask, i, live[i]);
421                }
422             }
423          }
424 
425          /* Add blend shader interference: blend shaders might
426           * clobber r0-r3. */
427          if (ins->compact_branch && ins->writeout) {
428             for (unsigned i = 0; i < ctx->temp_count; ++i) {
429                if (!live[i])
430                   continue;
431 
432                for (unsigned j = 0; j < 4; j++) {
433                   lcra_add_node_interference(l, ctx->temp_count + j, 0xFFFF, i,
434                                              live[i]);
435                }
436             }
437          }
438 
439          /* Update live_in */
440          mir_liveness_ins_update(live, ins, ctx->temp_count);
441       }
442 
443       free(live);
444    }
445 }
446 
447 static bool
448 mir_is_64(midgard_instruction *ins)
449 {
450    if (nir_alu_type_get_type_size(ins->dest_type) == 64)
451       return true;
452 
453    mir_foreach_src(ins, v) {
454       if (nir_alu_type_get_type_size(ins->src_types[v]) == 64)
455          return true;
456    }
457 
458    return false;
459 }
460 
461 /*
462  * Determine if a shader needs a contiguous workgroup. This impacts register
463  * allocation. TODO: Optimize if barriers and local memory are unused.
464  */
465 static bool
466 needs_contiguous_workgroup(compiler_context *ctx)
467 {
468    return gl_shader_stage_uses_workgroup(ctx->stage);
469 }
470 
471 /*
472  * Determine an upper-bound on the number of threads in a workgroup. The GL
473  * driver reports 128 for the maximum number of threads (the minimum-maximum in
474  * OpenGL ES 3.1), so we pessimistically assume 128 threads for variable
475  * workgroups.
476  */
477 static unsigned
478 max_threads_per_workgroup(compiler_context *ctx)
479 {
480    if (ctx->nir->info.workgroup_size_variable) {
481       return 128;
482    } else {
483       return ctx->nir->info.workgroup_size[0] *
484              ctx->nir->info.workgroup_size[1] *
485              ctx->nir->info.workgroup_size[2];
486    }
487 }
488 
489 /*
490  * Calculate the maximum number of work registers available to the shader.
491  * Architecturally, Midgard shaders may address up to 16 work registers, but
492  * various features impose other limits:
493  *
494  * 1. Blend shaders are limited to 8 registers by ABI.
495  * 2. If there are more than 8 register-mapped uniforms, then additional
496  *    register-mapped uniforms use space that otherwise would be used for work
497  *    registers.
498  * 3. If more than 4 registers are used, at most 128 threads may be spawned. If
499  *    more than 8 registers are used, at most 64 threads may be spawned. These
500  *    limits are architecturally visible in compute kernels that require an
501  *    entire workgroup to be spawned at once (for barriers or local memory to
502  *    work properly).
503  */
504 static unsigned
505 max_work_registers(compiler_context *ctx)
506 {
507    if (ctx->inputs->is_blend)
508       return 8;
509 
510    unsigned rmu_vec4 = ctx->info->push.count / 4;
511    unsigned max_work_registers = (rmu_vec4 >= 8) ? (24 - rmu_vec4) : 16;
512 
513    if (needs_contiguous_workgroup(ctx)) {
514       unsigned threads = max_threads_per_workgroup(ctx);
515       assert(threads <= 128 && "maximum threads in ABI exceeded");
516 
517       if (threads > 64)
518          max_work_registers = MIN2(max_work_registers, 8);
519    }
520 
521    return max_work_registers;
522 }
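
/* Worked example of the limits above (hypothetical numbers): with 10
 * register-mapped uniform vec4s, rmu_vec4 = 10 >= 8, leaving 24 - 10 = 14
 * work registers; if the shader also needs a contiguous workgroup of more
 * than 64 threads, the limit drops to MIN2(14, 8) = 8. */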
523 
524 /* This routine performs the actual register allocation. It should be succeeded
525  * by install_registers */
526 
527 static struct lcra_state *
528 allocate_registers(compiler_context *ctx, bool *spilled)
529 {
530    int work_count = max_work_registers(ctx);
531 
532    /* No register allocation to do with no SSA */
533    mir_compute_temp_count(ctx);
534    if (!ctx->temp_count)
535       return NULL;
536 
537    /* Initialize LCRA. Allocate four extra nodes at the end, pinned to
538     * r0-r3, to model interference with those registers */
539 
540    struct lcra_state *l = lcra_alloc_equations(ctx->temp_count + 4, 5);
541    unsigned node_r1 = ctx->temp_count + 1;
542 
543    /* Starts of classes, in bytes */
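   /* (16 bytes per 128-bit register, so REG_CLASS_LDST starts at r26 and the
    * texture classes at r28; each special class spans two registers.) */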
544    l->class_start[REG_CLASS_WORK] = 16 * 0;
545    l->class_start[REG_CLASS_LDST] = 16 * 26;
546    l->class_start[REG_CLASS_TEXR] = 16 * 28;
547    l->class_start[REG_CLASS_TEXW] = 16 * 28;
548 
549    l->class_size[REG_CLASS_WORK] = 16 * work_count;
550    l->class_size[REG_CLASS_LDST] = 16 * 2;
551    l->class_size[REG_CLASS_TEXR] = 16 * 2;
552    l->class_size[REG_CLASS_TEXW] = 16 * 2;
553 
554    lcra_set_disjoint_class(l, REG_CLASS_TEXR, REG_CLASS_TEXW);
555 
556    /* To save space on T*20, we don't have real texture registers.
557     * Instead, tex inputs reuse the load/store pipeline registers, and
558     * tex outputs use work r0/r1. We still use the TEXR/TEXW classes,
559     * which keeps interferences and sizes handled correctly. */
560 
561    if (ctx->quirks & MIDGARD_INTERPIPE_REG_ALIASING) {
562       l->class_start[REG_CLASS_TEXR] = l->class_start[REG_CLASS_LDST];
563       l->class_start[REG_CLASS_TEXW] = l->class_start[REG_CLASS_WORK];
564    }
565 
566    unsigned *found_class = calloc(sizeof(unsigned), ctx->temp_count);
567    unsigned *min_alignment = calloc(sizeof(unsigned), ctx->temp_count);
568    unsigned *min_bound = calloc(sizeof(unsigned), ctx->temp_count);
569 
570    mir_foreach_instr_global(ctx, ins) {
571       /* Swizzles of 32-bit sources on 64-bit instructions need to be
572        * aligned to either bottom (xy) or top (zw). More general
573        * swizzle lowering should happen prior to scheduling (TODO),
574        * but once we get RA we shouldn't disrupt this further. Align
575        * sources of 64-bit instructions. */
576 
577       if (ins->type == TAG_ALU_4 && mir_is_64(ins)) {
578          mir_foreach_src(ins, v) {
579             unsigned s = ins->src[v];
580 
581             if (s < ctx->temp_count)
582                min_alignment[s] = MAX2(3, min_alignment[s]);
583          }
584       }
585 
586       if (ins->type == TAG_LOAD_STORE_4 && OP_HAS_ADDRESS(ins->op)) {
587          mir_foreach_src(ins, v) {
588             unsigned s = ins->src[v];
589             unsigned size = nir_alu_type_get_type_size(ins->src_types[v]);
590 
591             if (s < ctx->temp_count)
592                min_alignment[s] = MAX2((size == 64) ? 3 : 2, min_alignment[s]);
593          }
594       }
595 
596       /* Anything read as 16-bit needs proper alignment to ensure the
597        * resulting code can be packed.
598        */
599       mir_foreach_src(ins, s) {
600          unsigned src_size = nir_alu_type_get_type_size(ins->src_types[s]);
601          if (src_size == 16 && ins->src[s] < SSA_FIXED_MINIMUM)
602             min_bound[ins->src[s]] = MAX2(min_bound[ins->src[s]], 8);
603       }
604 
605       /* Everything after this concerns only the destination, not the
606        * sources.
607        */
608       if (ins->dest >= SSA_FIXED_MINIMUM)
609          continue;
610 
611       unsigned size = nir_alu_type_get_type_size(ins->dest_type);
612 
613       if (ins->is_pack)
614          size = 32;
615 
616       /* 0 for x, 1 for xy, 2 for xyz, 3 for xyzw */
617       int comps1 = util_logbase2(ins->mask);
618 
619       int bytes = (comps1 + 1) * (size / 8);
620 
621       /* Use the largest class if there's ambiguity, this
622        * handles partial writes */
623 
624       int dest = ins->dest;
625       found_class[dest] = MAX2(found_class[dest], bytes);
626 
627       min_alignment[dest] =
628          MAX2(min_alignment[dest], (size == 16) ? 1 : /* (1 << 1) = 2-byte */
629                                       (size == 32) ? 2
630                                                    : /* (1 << 2) = 4-byte */
631                                       (size == 64) ? 3
632                                                    : /* (1 << 3) = 8-byte */
633                                       3);            /* 8-bit todo */
634 
635       /* We can't cross xy/zw boundaries. TODO: vec8 can */
636       if (size == 16 && min_alignment[dest] != 4)
637          min_bound[dest] = 8;
638 
639       /* We don't have a swizzle for the conditional and we don't
640        * want to muck with the conditional itself, so just force
641        * alignment for now */
642 
643       if (ins->type == TAG_ALU_4 && OP_IS_CSEL_V(ins->op)) {
644          min_alignment[dest] = 4; /* 1 << 4 = 16-byte = vec4 */
645 
646          /* LCRA assumes bound >= alignment */
647          min_bound[dest] = 16;
648       }
649 
650       /* Since ld/st swizzles and masks are 32-bit only, we need them
651        * aligned to enable final packing */
652       if (ins->type == TAG_LOAD_STORE_4)
653          min_alignment[dest] = MAX2(min_alignment[dest], 2);
654    }
655 
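   /* Commit the per-node constraints: min_alignment is log2(bytes); as used
    * above, a bound of 8 keeps a value within an xy/zw half while the default
    * of 16 allows the whole 16-byte register. */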
656    for (unsigned i = 0; i < ctx->temp_count; ++i) {
657       lcra_set_alignment(l, i, min_alignment[i] ? min_alignment[i] : 2,
658                          min_bound[i] ? min_bound[i] : 16);
659       lcra_restrict_range(l, i, found_class[i]);
660    }
661 
662    free(found_class);
663    free(min_alignment);
664    free(min_bound);
665 
666    /* Next, we'll determine semantic class. We default to zero (work).
667     * But, if we're used with a special operation, that will force us to a
668     * particular class. Each node must be assigned to exactly one class; a
669     * prepass before RA should have lowered what-would-have-been
670     * multiclass nodes into a series of moves to break it up into multiple
671     * nodes (TODO) */
672 
673    mir_foreach_instr_global(ctx, ins) {
674       /* Check if this operation imposes any classes */
675 
676       if (ins->type == TAG_LOAD_STORE_4) {
677          set_class(l->class, ins->src[0], REG_CLASS_LDST);
678          set_class(l->class, ins->src[1], REG_CLASS_LDST);
679          set_class(l->class, ins->src[2], REG_CLASS_LDST);
680          set_class(l->class, ins->src[3], REG_CLASS_LDST);
681 
682          if (OP_IS_VEC4_ONLY(ins->op)) {
683             lcra_restrict_range(l, ins->dest, 16);
684             lcra_restrict_range(l, ins->src[0], 16);
685             lcra_restrict_range(l, ins->src[1], 16);
686             lcra_restrict_range(l, ins->src[2], 16);
687             lcra_restrict_range(l, ins->src[3], 16);
688          }
689       } else if (ins->type == TAG_TEXTURE_4) {
690          set_class(l->class, ins->dest, REG_CLASS_TEXW);
691          set_class(l->class, ins->src[0], REG_CLASS_TEXR);
692          set_class(l->class, ins->src[1], REG_CLASS_TEXR);
693          set_class(l->class, ins->src[2], REG_CLASS_TEXR);
694          set_class(l->class, ins->src[3], REG_CLASS_TEXR);
695       }
696    }
697 
698    /* Check that the semantics of the class are respected */
699    mir_foreach_instr_global(ctx, ins) {
700       assert(check_write_class(l->class, ins->type, ins->dest));
701       assert(check_read_class(l->class, ins->type, ins->src[0]));
702       assert(check_read_class(l->class, ins->type, ins->src[1]));
703       assert(check_read_class(l->class, ins->type, ins->src[2]));
704       assert(check_read_class(l->class, ins->type, ins->src[3]));
705    }
706 
707    /* Mark writeout to r0, depth to r1.x, stencil to r1.y,
708     * render target to r1.z, unknown to r1.w */
709    mir_foreach_instr_global(ctx, ins) {
710       if (!(ins->compact_branch && ins->writeout))
711          continue;
712 
713       if (ins->src[0] < ctx->temp_count)
714          l->solutions[ins->src[0]] = 0;
715 
716       if (ins->src[2] < ctx->temp_count)
717          l->solutions[ins->src[2]] = (16 * 1) + COMPONENT_X * 4;
718 
719       if (ins->src[3] < ctx->temp_count)
720          l->solutions[ins->src[3]] = (16 * 1) + COMPONENT_Y * 4;
721 
722       if (ins->src[1] < ctx->temp_count)
723          l->solutions[ins->src[1]] = (16 * 1) + COMPONENT_Z * 4;
724 
725       if (ins->dest < ctx->temp_count)
726          l->solutions[ins->dest] = (16 * 1) + COMPONENT_W * 4;
727    }
728 
729    /* Destinations of instructions in a writeout block cannot be assigned
730     * to r1 unless they are actually used as r1 from the writeout itself,
731     * since the writes to r1 are special. A code sequence like:
732     *
733     *      sadd.fmov r1.x, [...]
734     *      vadd.fadd r0, r1, r2
735     *      [writeout branch]
736     *
737     * will misbehave since the r1.x write will be interpreted as a
738     * gl_FragDepth write so it won't show up correctly when r1 is read in
739     * the following segment. We model this as interference.
740     */
741 
742    for (unsigned i = 0; i < 4; ++i)
743       l->solutions[ctx->temp_count + i] = (16 * i);
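   /* (These are the extra nodes past temp_count, pinned to r0-r3; adding
    * interference against them keeps temporaries out of those registers,
    * e.g. for the writeout-block r1 hazard handled next.) */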
744 
745    mir_foreach_block(ctx, _blk) {
746       midgard_block *blk = (midgard_block *)_blk;
747 
748       mir_foreach_bundle_in_block(blk, v) {
749          /* We need at least a writeout and nonwriteout instruction */
750          if (v->instruction_count < 2)
751             continue;
752 
753          /* Branches always come at the end */
754          midgard_instruction *br = v->instructions[v->instruction_count - 1];
755 
756          if (!br->writeout)
757             continue;
758 
759          for (signed i = v->instruction_count - 2; i >= 0; --i) {
760             midgard_instruction *ins = v->instructions[i];
761 
762             if (ins->dest >= ctx->temp_count)
763                continue;
764 
765             bool used_as_r1 = (br->dest == ins->dest);
766 
767             mir_foreach_src(br, s)
768                used_as_r1 |= (s > 0) && (br->src[s] == ins->dest);
769 
770             if (!used_as_r1)
771                lcra_add_node_interference(l, ins->dest, mir_bytemask(ins),
772                                           node_r1, 0xFFFF);
773          }
774       }
775    }
776 
777    /* Precolour blend input to r0. Note writeout is necessarily at the end
778     * and blend shaders are single-RT only so there is only a single
779     * writeout block, so this cannot conflict with the writeout r0 (there
780     * is no need to have an intermediate move) */
781 
782    if (ctx->blend_input != ~0) {
783       assert(ctx->blend_input < ctx->temp_count);
784       l->solutions[ctx->blend_input] = 0;
785    }
786 
787    /* Same for the dual-source blend input/output, except here we use r2,
788     * which is also set in the fragment shader. */
789 
790    if (ctx->blend_src1 != ~0) {
791       assert(ctx->blend_src1 < ctx->temp_count);
792       l->solutions[ctx->blend_src1] = (16 * 2);
793       ctx->info->work_reg_count = MAX2(ctx->info->work_reg_count, 3);
794    }
795 
796    mir_compute_interference(ctx, l);
797 
798    *spilled = !lcra_solve(l);
799    return l;
800 }
801 
802 /* Once registers have been decided via register allocation
803  * (allocate_registers), we need to rewrite the MIR to use registers instead of
804  * indices */
805 
806 static void
807 install_registers_instr(compiler_context *ctx, struct lcra_state *l,
808                         midgard_instruction *ins)
809 {
810    unsigned src_shift[MIR_SRC_COUNT];
811 
812    for (unsigned i = 0; i < MIR_SRC_COUNT; ++i) {
813       src_shift[i] =
814          util_logbase2(nir_alu_type_get_type_size(ins->src_types[i]) / 8);
815    }
816 
817    unsigned dest_shift =
818       util_logbase2(nir_alu_type_get_type_size(ins->dest_type) / 8);
819 
820    switch (ins->type) {
821    case TAG_ALU_4:
822    case TAG_ALU_8:
823    case TAG_ALU_12:
824    case TAG_ALU_16: {
825       if (ins->compact_branch)
826          return;
827 
828       struct phys_reg src1 = index_to_reg(ctx, l, ins->src[0], src_shift[0]);
829       struct phys_reg src2 = index_to_reg(ctx, l, ins->src[1], src_shift[1]);
830       struct phys_reg dest = index_to_reg(ctx, l, ins->dest, dest_shift);
831 
832       mir_set_bytemask(ins, mir_bytemask(ins) << dest.offset);
833 
834       unsigned dest_offset =
835          GET_CHANNEL_COUNT(alu_opcode_props[ins->op].props) ? 0 : dest.offset;
836 
837       offset_swizzle(ins->swizzle[0], src1.offset, src1.shift, dest.shift,
838                      dest_offset);
839       if (!ins->has_inline_constant)
840          offset_swizzle(ins->swizzle[1], src2.offset, src2.shift, dest.shift,
841                         dest_offset);
842       if (ins->src[0] != ~0)
843          ins->src[0] = SSA_FIXED_REGISTER(src1.reg);
844       if (ins->src[1] != ~0)
845          ins->src[1] = SSA_FIXED_REGISTER(src2.reg);
846       if (ins->dest != ~0)
847          ins->dest = SSA_FIXED_REGISTER(dest.reg);
848       break;
849    }
850 
851    case TAG_LOAD_STORE_4: {
852       /* Which physical register we read off depends on
853        * whether we are loading or storing -- think about the
854        * logical dataflow */
855 
856       bool encodes_src = OP_IS_STORE(ins->op);
857 
858       if (encodes_src) {
859          struct phys_reg src = index_to_reg(ctx, l, ins->src[0], src_shift[0]);
860          assert(src.reg == 26 || src.reg == 27);
861 
862          ins->src[0] = SSA_FIXED_REGISTER(src.reg);
863          offset_swizzle(ins->swizzle[0], src.offset, src.shift, 0, 0);
864       } else {
865          struct phys_reg dst = index_to_reg(ctx, l, ins->dest, dest_shift);
866 
867          ins->dest = SSA_FIXED_REGISTER(dst.reg);
868          offset_swizzle(ins->swizzle[0], 0, 2, dest_shift, dst.offset);
869          mir_set_bytemask(ins, mir_bytemask(ins) << dst.offset);
870       }
871 
872       /* Also install registers for the remaining arguments */
873 
874       for (int i = 1; i <= 3; i++) {
875          unsigned src_index = ins->src[i];
876          if (src_index != ~0) {
877             struct phys_reg src = index_to_reg(ctx, l, src_index, src_shift[i]);
878             unsigned component = src.offset >> src.shift;
879             assert(component << src.shift == src.offset);
880             ins->src[i] = SSA_FIXED_REGISTER(src.reg);
881             ins->swizzle[i][0] += component;
882          }
883       }
884 
885       break;
886    }
887 
888    case TAG_TEXTURE_4: {
889       if (ins->op == midgard_tex_op_barrier)
890          break;
891 
892       /* Grab RA results */
893       struct phys_reg dest = index_to_reg(ctx, l, ins->dest, dest_shift);
894       struct phys_reg coord = index_to_reg(ctx, l, ins->src[1], src_shift[1]);
895       struct phys_reg lod = index_to_reg(ctx, l, ins->src[2], src_shift[2]);
896       struct phys_reg offset = index_to_reg(ctx, l, ins->src[3], src_shift[3]);
897 
898       /* First, install the texture coordinate */
899       if (ins->src[1] != ~0)
900          ins->src[1] = SSA_FIXED_REGISTER(coord.reg);
901       offset_swizzle(ins->swizzle[1], coord.offset, coord.shift, dest.shift, 0);
902 
903       /* Next, install the destination */
904       if (ins->dest != ~0)
905          ins->dest = SSA_FIXED_REGISTER(dest.reg);
906       offset_swizzle(ins->swizzle[0], 0, 2, dest.shift,
907                      dest_shift == 1 ? dest.offset % 8 : dest.offset);
908       mir_set_bytemask(ins, mir_bytemask(ins) << dest.offset);
909 
910       /* If there is a register LOD/bias, use it */
911       if (ins->src[2] != ~0) {
912          assert(!(lod.offset & 3));
913          ins->src[2] = SSA_FIXED_REGISTER(lod.reg);
914          ins->swizzle[2][0] = lod.offset / 4;
915       }
916 
917       /* If there is an offset register, install it */
918       if (ins->src[3] != ~0) {
919          ins->src[3] = SSA_FIXED_REGISTER(offset.reg);
920          ins->swizzle[3][0] = offset.offset / 4;
921       }
922 
923       break;
924    }
925 
926    default:
927       break;
928    }
929 }
930 
931 static void
932 install_registers(compiler_context *ctx, struct lcra_state *l)
933 {
934    mir_foreach_instr_global(ctx, ins)
935       install_registers_instr(ctx, l, ins);
936 }
937 
938 /* If register allocation fails, find the best spill node */
939 
940 static signed
941 mir_choose_spill_node(compiler_context *ctx, struct lcra_state *l)
942 {
943    /* We can't spill a previously spilled value or an unspill */
944 
945    mir_foreach_instr_global(ctx, ins) {
946       if (ins->no_spill & (1 << l->spill_class)) {
947          lcra_set_node_spill_cost(l, ins->dest, -1);
948 
949          if (l->spill_class != REG_CLASS_WORK) {
950             mir_foreach_src(ins, s)
951                lcra_set_node_spill_cost(l, ins->src[s], -1);
952          }
953       }
954    }
955 
956    return lcra_get_best_spill_node(l);
957 }
958 
959 /* Once we've chosen a spill node, spill it */
960 
961 static void
962 mir_spill_register(compiler_context *ctx, unsigned spill_node,
963                    unsigned spill_class, unsigned *spill_count)
964 {
965    if (spill_class == REG_CLASS_WORK && ctx->inputs->is_blend)
966       unreachable("Blend shader spilling is currently unimplemented");
967 
968    unsigned spill_index = ctx->temp_count;
969 
970    /* We have a spill node, so check the class. Work registers
971     * legitimately spill to TLS, but special registers just spill to work
972     * registers */
973 
974    bool is_special = spill_class != REG_CLASS_WORK;
975    bool is_special_w = spill_class == REG_CLASS_TEXW;
976 
977    /* Allocate TLS slot (maybe) */
978    unsigned spill_slot = !is_special ? (*spill_count)++ : 0;
979 
980    /* For special reads, figure out how many bytes we need */
981    unsigned read_bytemask = 0;
982 
983    /* If multiple instructions write to this destination, we'll have to
984     * fill from TLS before writing */
985    unsigned write_count = 0;
986 
987    mir_foreach_instr_global_safe(ctx, ins) {
988       read_bytemask |= mir_bytemask_of_read_components(ins, spill_node);
989       if (ins->dest == spill_node)
990          ++write_count;
991    }
992 
993    /* For TLS, replace all stores to the spilled node. For
994     * special reads, just keep as-is; the class will be demoted
995     * implicitly. For special writes, spill to a work register */
996 
997    if (!is_special || is_special_w) {
998       if (is_special_w)
999          spill_slot = spill_index++;
1000 
1001       unsigned last_id = ~0;
1002       unsigned last_fill = ~0;
1003       unsigned last_spill_index = ~0;
1004       midgard_instruction *last_spill = NULL;
1005 
1006       mir_foreach_block(ctx, _block) {
1007          midgard_block *block = (midgard_block *)_block;
1008          mir_foreach_instr_in_block_safe(block, ins) {
1009             if (ins->dest != spill_node)
1010                continue;
1011 
1012             /* Note: it's important to match the mask of the spill
1013              * with the mask of the instruction whose destination
1014              * we're spilling, or otherwise we'll read invalid
1015              * components and can fail RA in a subsequent iteration
1016              */
1017 
1018             if (is_special_w) {
1019                midgard_instruction st = v_mov(spill_node, spill_slot);
1020                st.no_spill |= (1 << spill_class);
1021                st.mask = ins->mask;
1022                st.dest_type = st.src_types[1] = ins->dest_type;
1023 
1024                /* Hint: don't rewrite this node */
1025                st.hint = true;
1026 
1027                mir_insert_instruction_after_scheduled(ctx, block, ins, st);
1028             } else {
1029                unsigned bundle = ins->bundle_id;
1030                unsigned dest =
1031                   (bundle == last_id) ? last_spill_index : spill_index++;
1032 
1033                unsigned bytemask = mir_bytemask(ins);
1034                unsigned write_mask =
1035                   mir_from_bytemask(mir_round_bytemask_up(bytemask, 32), 32);
1036 
1037                if (write_count > 1 && bytemask != 0xFFFF &&
1038                    bundle != last_fill) {
1039                   midgard_instruction read =
1040                      v_load_store_scratch(dest, spill_slot, false, 0xF);
1041                   mir_insert_instruction_before_scheduled(ctx, block, ins,
1042                                                           read);
1043                   write_mask = 0xF;
1044                   last_fill = bundle;
1045                }
1046 
1047                ins->dest = dest;
1048                ins->no_spill |= (1 << spill_class);
1049 
1050                bool move = false;
1051 
1052                /* In the same bundle, reads of the destination
1053                 * of the spilt instruction need to be direct */
1054                midgard_instruction *it = ins;
1055                while ((it = list_first_entry(&it->link, midgard_instruction,
1056                                              link)) &&
1057                       (it->bundle_id == bundle)) {
1058 
1059                   if (!mir_has_arg(it, spill_node))
1060                      continue;
1061 
1062                   mir_rewrite_index_src_single(it, spill_node, dest);
1063 
1064                   /* The spilt instruction will write to
1065                    * a work register for `it` to read but
1066                    * the spill needs an LD/ST register */
1067                   move = true;
1068                }
1069 
1070                if (move)
1071                   dest = spill_index++;
1072 
1073                if (last_id == bundle) {
1074                   last_spill->mask |= write_mask;
1075                   u_foreach_bit(c, write_mask)
1076                      last_spill->swizzle[0][c] = c;
1077                } else {
1078                   midgard_instruction st =
1079                      v_load_store_scratch(dest, spill_slot, true, write_mask);
1080                   last_spill = mir_insert_instruction_after_scheduled(
1081                      ctx, block, ins, st);
1082                }
1083 
1084                if (move) {
1085                   midgard_instruction mv = v_mov(ins->dest, dest);
1086                   mv.no_spill |= (1 << spill_class);
1087 
1088                   mir_insert_instruction_after_scheduled(ctx, block, ins, mv);
1089                }
1090 
1091                last_id = bundle;
1092                last_spill_index = ins->dest;
1093             }
1094 
1095             if (!is_special)
1096                ctx->spills++;
1097          }
1098       }
1099    }
1100 
1101    /* Insert a load from TLS before the first consecutive
1102     * use of the node, rewriting to use spilled indices to
1103     * break up the live range. Or, for special, insert a
1104     * move. Ironically the latter *increases* register
1105     * pressure, but the two uses of the spilling mechanism
1106     * are somewhat orthogonal. (special spilling is to use
1107     * work registers to back special registers; TLS
1108     * spilling is to use memory to back work registers) */
1109 
1110    mir_foreach_block(ctx, _block) {
1111       midgard_block *block = (midgard_block *)_block;
1112       mir_foreach_instr_in_block(block, ins) {
1113          /* We can't rewrite the moves used to spill in the
1114           * first place. These moves are hinted. */
1115          if (ins->hint)
1116             continue;
1117 
1118          /* If we don't use the spilled value, nothing to do */
1119          if (!mir_has_arg(ins, spill_node))
1120             continue;
1121 
1122          unsigned index = 0;
1123 
1124          if (!is_special_w) {
1125             index = ++spill_index;
1126 
1127             midgard_instruction *before = ins;
1128             midgard_instruction st;
1129 
1130             if (is_special) {
1131                /* Move */
1132                st = v_mov(spill_node, index);
1133                st.no_spill |= (1 << spill_class);
1134             } else {
1135                /* TLS load */
1136                st = v_load_store_scratch(index, spill_slot, false, 0xF);
1137             }
1138 
1139             /* Mask the load based on the component count
1140              * actually needed to prevent RA loops */
1141 
1142             st.mask =
1143                mir_from_bytemask(mir_round_bytemask_up(read_bytemask, 32), 32);
1144 
1145             mir_insert_instruction_before_scheduled(ctx, block, before, st);
1146          } else {
1147             /* Special writes already have their move spilled in */
1148             index = spill_slot;
1149          }
1150 
1151          /* Rewrite to use */
1152          mir_rewrite_index_src_single(ins, spill_node, index);
1153 
1154          if (!is_special)
1155             ctx->fills++;
1156       }
1157    }
1158 
1159    /* Reset hints */
1160 
1161    mir_foreach_instr_global(ctx, ins) {
1162       ins->hint = false;
1163    }
1164 }
1165 
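/* Demote register-mapped uniforms back into explicit UBO loads, freeing their
 * registers for use as work registers. new_cutoff is the number of uniform
 * vec4s to keep register-mapped. */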
1166 static void
1167 mir_demote_uniforms(compiler_context *ctx, unsigned new_cutoff)
1168 {
1169    unsigned uniforms = ctx->info->push.count / 4;
1170    unsigned old_work_count = 16 - MAX2(uniforms - 8, 0);
1171    unsigned work_count = 16 - MAX2((new_cutoff - 8), 0);
1172 
1173    unsigned min_demote = SSA_FIXED_REGISTER(old_work_count);
1174    unsigned max_demote = SSA_FIXED_REGISTER(work_count);
1175 
1176    mir_foreach_block(ctx, _block) {
1177       midgard_block *block = (midgard_block *)_block;
1178       mir_foreach_instr_in_block(block, ins) {
1179          mir_foreach_src(ins, i) {
1180             if (ins->src[i] < min_demote || ins->src[i] >= max_demote)
1181                continue;
1182 
1183             midgard_instruction *before = ins;
1184 
1185             unsigned temp = make_compiler_temp(ctx);
1186             unsigned idx = (23 - SSA_REG_FROM_FIXED(ins->src[i])) * 4;
1187             assert(idx < ctx->info->push.count);
1188 
1189             ctx->ubo_mask |= BITSET_BIT(ctx->info->push.words[idx].ubo);
1190 
1191             midgard_instruction ld = {
1192                .type = TAG_LOAD_STORE_4,
1193                .mask = 0xF,
1194                .dest = temp,
1195                .dest_type = ins->src_types[i],
1196                .src = {~0, ~0, ~0, ~0},
1197                .swizzle = SWIZZLE_IDENTITY_4,
1198                .op = midgard_op_ld_ubo_128,
1199                .load_store =
1200                   {
1201                      .index_reg = REGISTER_LDST_ZERO,
1202                   },
1203                .constants.u32[0] = ctx->info->push.words[idx].offset,
1204             };
1205 
1206             midgard_pack_ubo_index_imm(&ld.load_store,
1207                                        ctx->info->push.words[idx].ubo);
1208 
1209             mir_insert_instruction_before_scheduled(ctx, block, before, ld);
1210 
1211             mir_rewrite_index_src_single(ins, ins->src[i], temp);
1212          }
1213       }
1214    }
1215 
1216    ctx->info->push.count = MIN2(ctx->info->push.count, new_cutoff * 4);
1217 }
1218 
1219 /* Run register allocation in a loop, spilling until we succeed */
1220 
1221 void
1222 mir_ra(compiler_context *ctx)
1223 {
1224    struct lcra_state *l = NULL;
1225    bool spilled = false;
1226    int iter_count = 1000; /* max iterations */
1227 
1228    /* Number of 128-bit slots in memory we've spilled into */
1229    unsigned spill_count = DIV_ROUND_UP(ctx->info->tls_size, 16);
1230 
1231    mir_create_pipeline_registers(ctx);
1232 
1233    do {
1234       if (spilled) {
1235          signed spill_node = mir_choose_spill_node(ctx, l);
1236          unsigned uniforms = ctx->info->push.count / 4;
1237 
1238          /* It's a lot cheaper to demote uniforms to get more
1239           * work registers than to spill to TLS. */
1240          if (l->spill_class == REG_CLASS_WORK && uniforms > 8) {
1241             mir_demote_uniforms(ctx, MAX2(uniforms - 4, 8));
1242          } else if (spill_node == -1) {
1243             fprintf(stderr, "ERROR: Failed to choose spill node\n");
1244             lcra_free(l);
1245             return;
1246          } else {
1247             mir_spill_register(ctx, spill_node, l->spill_class, &spill_count);
1248          }
1249       }
1250 
1251       mir_squeeze_index(ctx);
1252       mir_invalidate_liveness(ctx);
1253 
1254       if (l) {
1255          lcra_free(l);
1256          l = NULL;
1257       }
1258 
1259       l = allocate_registers(ctx, &spilled);
1260    } while (spilled && ((iter_count--) > 0));
1261 
1262    if (iter_count <= 0) {
1263       fprintf(
1264          stderr,
1265          "panfrost: Gave up allocating registers, rendering will be incomplete\n");
1266       assert(0);
1267    }
1268 
1269    /* Report spilling information. spill_count is in 128-bit slots (vec4 x
1270     * fp32), but tls_size is in bytes, so multiply by 16 */
1271 
1272    ctx->info->tls_size = spill_count * 16;
1273 
1274    install_registers(ctx, l);
1275 
1276    lcra_free(l);
1277 }
1278