xref: /aosp_15_r20/external/mesa3d/src/panfrost/midgard/midgard_schedule.c (revision 6104692788411f58d303aa86923a9ff6ecaded22)
1 /*
2  * Copyright (C) 2018-2019 Alyssa Rosenzweig <[email protected]>
3  * Copyright (C) 2019-2020 Collabora, Ltd.
4  *
5  * Permission is hereby granted, free of charge, to any person obtaining a
6  * copy of this software and associated documentation files (the "Software"),
7  * to deal in the Software without restriction, including without limitation
8  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9  * and/or sell copies of the Software, and to permit persons to whom the
10  * Software is furnished to do so, subject to the following conditions:
11  *
12  * The above copyright notice and this permission notice (including the next
13  * paragraph) shall be included in all copies or substantial portions of the
14  * Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22  * SOFTWARE.
23  */
24 
25 #include "util/half_float.h"
26 #include "util/u_math.h"
27 #include "util/u_memory.h"
28 #include "compiler.h"
29 #include "midgard_ops.h"
30 #include "midgard_quirks.h"
31 
32 /* Scheduling for Midgard is complicated, to say the least. ALU instructions
33  * must be grouped into VLIW bundles according to following model:
34  *
35  * [VMUL] [SADD]
36  * [VADD] [SMUL] [VLUT]
37  *
38  * A given instruction can execute on some subset of the units (or a few can
39  * execute on all). Instructions can be either vector or scalar; only scalar
40  * instructions can execute on SADD/SMUL units. Units on a given line execute
41  * in parallel. Subsequent lines execute separately and can pass results
42  * directly via pipeline registers r24/r25, bypassing the register file.
43  *
44  * A bundle can optionally have 128-bits of embedded constants, shared across
45  * all of the instructions within a bundle.
46  *
47  * Instructions consuming conditionals (branches and conditional selects)
48  * require their condition to be written into the conditional register (r31)
49  * within the same bundle they are consumed.
50  *
51  * Fragment writeout requires its argument to be written in full within the
52  * same bundle as the branch, with no hanging dependencies.
53  *
54  * Load/store instructions are also in bundles of simply two instructions, and
55  * texture instructions have no bundling.
56  *
57  * -------------------------------------------------------------------------
58  *
59  */
60 
61 /* We create the dependency graph with per-byte granularity */
62 
63 #define BYTE_COUNT 16
64 
65 static void
add_dependency(struct util_dynarray * table,unsigned index,uint16_t mask,midgard_instruction ** instructions,unsigned child)66 add_dependency(struct util_dynarray *table, unsigned index, uint16_t mask,
67                midgard_instruction **instructions, unsigned child)
68 {
69    for (unsigned i = 0; i < BYTE_COUNT; ++i) {
70       if (!(mask & (1 << i)))
71          continue;
72 
73       struct util_dynarray *parents = &table[(BYTE_COUNT * index) + i];
74 
75       util_dynarray_foreach(parents, unsigned, parent) {
76          BITSET_WORD *dependents = instructions[*parent]->dependents;
77 
78          /* Already have the dependency */
79          if (BITSET_TEST(dependents, child))
80             continue;
81 
82          BITSET_SET(dependents, child);
83          instructions[child]->nr_dependencies++;
84       }
85    }
86 }
87 
88 static void
mark_access(struct util_dynarray * table,unsigned index,uint16_t mask,unsigned parent)89 mark_access(struct util_dynarray *table, unsigned index, uint16_t mask,
90             unsigned parent)
91 {
92    for (unsigned i = 0; i < BYTE_COUNT; ++i) {
93       if (!(mask & (1 << i)))
94          continue;
95 
96       util_dynarray_append(&table[(BYTE_COUNT * index) + i], unsigned, parent);
97    }
98 }
99 
/* Build the per-block dependency graph used by the scheduler.
 *
 * Instructions are walked bottom-up. For every byte of every value node we
 * keep dynarrays of the instructions that last read / last wrote that byte
 * (last_read / last_write) and draw edges through them:
 *   - read-after-write: a source waits on prior writers of its bytes
 *   - write-after-read / write-after-write: a destination waits on prior
 *     readers and writers of its bytes
 * Load/stores carrying an address additionally get in-order edges within
 * their address-space class (shared / scratch / other).
 *
 * Allocates each instruction's `dependents` bitset (freed later by
 * mir_update_worklist) and initializes nr_dependencies. */
100 static void
mir_create_dependency_graph(midgard_instruction ** instructions,unsigned count,unsigned node_count)101 mir_create_dependency_graph(midgard_instruction **instructions, unsigned count,
102                             unsigned node_count)
103 {
   /* One dynarray per (node, byte) pair */
104    size_t sz = node_count * BYTE_COUNT;
105 
106    struct util_dynarray *last_read = calloc(sizeof(struct util_dynarray), sz);
107    struct util_dynarray *last_write = calloc(sizeof(struct util_dynarray), sz);
108 
109    for (unsigned i = 0; i < sz; ++i) {
110       util_dynarray_init(&last_read[i], NULL);
111       util_dynarray_init(&last_write[i], NULL);
112    }
113 
114    /* Initialize dependency graph */
115    for (unsigned i = 0; i < count; ++i) {
116       instructions[i]->dependents =
117          calloc(BITSET_WORDS(count), sizeof(BITSET_WORD));
118 
119       instructions[i]->nr_dependencies = 0;
120    }
121 
   /* Most recently seen load/store per class: 0 = shared, 1 = scratch,
    * 2 = everything else. ~0 means none seen yet. */
122    unsigned prev_ldst[3] = {~0, ~0, ~0};
123 
124    /* Populate dependency graph */
125    for (signed i = count - 1; i >= 0; --i) {
126       if (instructions[i]->compact_branch)
127          continue;
128 
129       unsigned dest = instructions[i]->dest;
130       unsigned mask = mir_bytemask(instructions[i]);
131 
   /* NOTE(review): the iterator argument appears to be used only to size
    * the src array, which is uniform across instructions, so passing
    * instructions[0] is presumably harmless — confirm against the
    * mir_foreach_src definition */
132       mir_foreach_src((*instructions), s) {
133          unsigned src = instructions[i]->src[s];
134 
135          if (src < node_count) {
136             unsigned readmask =
137                mir_bytemask_of_read_components(instructions[i], src);
            /* Read-after-write: wait on prior writers of these bytes */
138             add_dependency(last_write, src, readmask, instructions, i);
139          }
140       }
141 
142       /* Create a list of dependencies for each type of load/store
143        * instruction to prevent reordering. */
144       if (instructions[i]->type == TAG_LOAD_STORE_4 &&
145           load_store_opcode_props[instructions[i]->op].props & LDST_ADDRESS) {
146 
147          unsigned type = instructions[i]->load_store.arg_reg |
148                          instructions[i]->load_store.arg_comp;
149 
150          unsigned idx;
151          switch (type) {
152          case LDST_SHARED:
153             idx = 0;
154             break;
155          case LDST_SCRATCH:
156             idx = 1;
157             break;
158          default:
159             idx = 2;
160             break;
161          }
162 
163          unsigned prev = prev_ldst[idx];
164 
165          if (prev != ~0) {
166             BITSET_WORD *dependents = instructions[prev]->dependents;
167 
168             /* Already have the dependency */
   /* NOTE(review): when the edge already exists this `continue` also skips
    * the prev_ldst update and the dest/src bookkeeping below for this
    * instruction, not just the edge insertion — verify intended */
169             if (BITSET_TEST(dependents, i))
170                continue;
171 
172             BITSET_SET(dependents, i);
173             instructions[i]->nr_dependencies++;
174          }
175 
176          prev_ldst[idx] = i;
177       }
178 
179       if (dest < node_count) {
         /* Write-after-read and write-after-write, then record this
          * instruction as the most recent writer of these bytes */
180          add_dependency(last_read, dest, mask, instructions, i);
181          add_dependency(last_write, dest, mask, instructions, i);
182          mark_access(last_write, dest, mask, i);
183       }
184 
185       mir_foreach_src((*instructions), s) {
186          unsigned src = instructions[i]->src[s];
187 
188          if (src < node_count) {
189             unsigned readmask =
190                mir_bytemask_of_read_components(instructions[i], src);
191             mark_access(last_read, src, readmask, i);
192          }
193       }
194    }
195 
196    /* If there is a branch, all instructions depend on it, as interblock
197     * execution must be purely in-order */
198 
199    if (instructions[count - 1]->compact_branch) {
200       BITSET_WORD *dependents = instructions[count - 1]->dependents;
201 
202       for (signed i = count - 2; i >= 0; --i) {
203          if (BITSET_TEST(dependents, i))
204             continue;
205 
206          BITSET_SET(dependents, i);
207          instructions[i]->nr_dependencies++;
208       }
209    }
210 
211    /* Free the intermediate structures */
212    for (unsigned i = 0; i < sz; ++i) {
213       util_dynarray_fini(&last_read[i]);
214       util_dynarray_fini(&last_write[i]);
215    }
216 
217    free(last_read);
218    free(last_write);
219 }
220 
221 /* Does the mask cover more than a scalar? */
222 
/* Whether exactly one component (of the low 8) is enabled in the mask,
 * i.e. the write is scalar rather than vector. */
static bool
is_single_component_mask(unsigned mask)
{
   unsigned set_bits = 0;

   for (unsigned c = 0; c < 8; ++c)
      set_bits += (mask >> c) & 1;

   return set_bits == 1;
}
235 
236 /* Helpers for scheduling */
237 
238 static bool
mir_is_scalar(midgard_instruction * ains)239 mir_is_scalar(midgard_instruction *ains)
240 {
241    /* Do we try to use it as a vector op? */
242    if (!is_single_component_mask(ains->mask))
243       return false;
244 
245    /* Otherwise, check mode hazards */
246    bool could_scalar = true;
247    unsigned szd = nir_alu_type_get_type_size(ains->dest_type);
248    unsigned sz0 = nir_alu_type_get_type_size(ains->src_types[0]);
249    unsigned sz1 = nir_alu_type_get_type_size(ains->src_types[1]);
250 
251    /* Only 16/32-bit can run on a scalar unit */
252    could_scalar &= (szd == 16) || (szd == 32);
253 
254    if (ains->src[0] != ~0)
255       could_scalar &= (sz0 == 16) || (sz0 == 32);
256 
257    if (ains->src[1] != ~0)
258       could_scalar &= (sz1 == 16) || (sz1 == 32);
259 
260    if (midgard_is_integer_out_op(ains->op) &&
261        ains->outmod != midgard_outmod_keeplo)
262       return false;
263 
264    return could_scalar;
265 }
266 
267 /* How many bytes does this ALU instruction add to the bundle? */
268 
269 static unsigned
bytes_for_instruction(midgard_instruction * ains)270 bytes_for_instruction(midgard_instruction *ains)
271 {
272    if (ains->unit & UNITS_ANY_VECTOR)
273       return sizeof(midgard_reg_info) + sizeof(midgard_vector_alu);
274    else if (ains->unit == ALU_ENAB_BRANCH)
275       return sizeof(midgard_branch_extended);
276    else if (ains->compact_branch)
277       return sizeof(uint16_t);
278    else
279       return sizeof(midgard_reg_info) + sizeof(midgard_scalar_alu);
280 }
281 
282 /* We would like to flatten the linked list of midgard_instructions in a bundle
283  * to an array of pointers on the heap for easy indexing */
284 
285 static midgard_instruction **
flatten_mir(midgard_block * block,unsigned * len)286 flatten_mir(midgard_block *block, unsigned *len)
287 {
288    *len = list_length(&block->base.instructions);
289 
290    if (!(*len))
291       return NULL;
292 
293    midgard_instruction **instructions =
294       calloc(sizeof(midgard_instruction *), *len);
295 
296    unsigned i = 0;
297 
298    mir_foreach_instr_in_block(block, ins)
299       instructions[i++] = ins;
300 
301    return instructions;
302 }
303 
304 /* The worklist is the set of instructions that can be scheduled now; that is,
305  * the set of instructions with no remaining dependencies */
306 
307 static void
mir_initialize_worklist(BITSET_WORD * worklist,midgard_instruction ** instructions,unsigned count)308 mir_initialize_worklist(BITSET_WORD *worklist,
309                         midgard_instruction **instructions, unsigned count)
310 {
311    for (unsigned i = 0; i < count; ++i) {
312       if (instructions[i]->nr_dependencies == 0)
313          BITSET_SET(worklist, i);
314    }
315 }
316 
317 /* Update the worklist after an instruction terminates. Remove its edges from
318  * the graph and if that causes any node to have no dependencies, add it to the
319  * worklist */
320 
321 static void
mir_update_worklist(BITSET_WORD * worklist,unsigned count,midgard_instruction ** instructions,midgard_instruction * done)322 mir_update_worklist(BITSET_WORD *worklist, unsigned count,
323                     midgard_instruction **instructions,
324                     midgard_instruction *done)
325 {
326    /* Sanity check: if no instruction terminated, there is nothing to do.
327     * If the instruction that terminated had dependencies, that makes no
328     * sense and means we messed up the worklist. Finally, as the purpose
329     * of this routine is to update dependents, we abort early if there are
330     * no dependents defined. */
331 
332    if (!done)
333       return;
334 
335    assert(done->nr_dependencies == 0);
336 
337    if (!done->dependents)
338       return;
339 
340    /* We have an instruction with dependents. Iterate each dependent to
341     * remove one dependency (`done`), adding dependents to the worklist
342     * where possible. */
343 
344    unsigned i;
345    BITSET_FOREACH_SET(i, done->dependents, count) {
346       assert(instructions[i]->nr_dependencies);
347 
348       if (!(--instructions[i]->nr_dependencies))
349          BITSET_SET(worklist, i);
350    }
351 
352    free(done->dependents);
353 }
354 
355 /* While scheduling, we need to choose instructions satisfying certain
356  * criteria. As we schedule backwards, we choose the *last* instruction in the
357  * worklist to simulate in-order scheduling. Chosen instructions must satisfy a
358  * given predicate. */
359 
/* Selection criteria handed to mir_choose_instruction, plus mutable
 * per-bundle state (constants, pipeline registers, conditional budget)
 * updated as instructions are picked in destructive mode. */
360 struct midgard_predicate {
361    /* TAG or ~0 for dont-care */
362    unsigned tag;
363 
364    /* True if we want to pop off the chosen instruction */
365    bool destructive;
366 
367    /* For ALU, choose only this unit */
368    unsigned unit;
369 
370    /* State for bundle constants. constants is the actual constants
371     * for the bundle. constant_mask is a bytemask of the 16-byte
372     * embedded-constant slot marking which bytes are in use. When
373     * picking in destructive mode, the constants array will be updated,
374     * and the instruction will be adjusted to index into the constants array */
375 
376    midgard_constants *constants;
377    unsigned constant_mask;
378 
379    /* Exclude this destination (if not ~0) */
380    unsigned exclude;
381 
382    /* Don't schedule instructions consuming conditionals (since we already
383     * scheduled one). Excludes conditional branches and csel */
384    bool no_cond;
385 
386    /* Require (or reject) a minimal mask and (if nonzero) given
387     * destination. Used for writeout optimizations */
388 
389    unsigned mask;
390    unsigned no_mask;
391    unsigned dest;
392 
393    /* Whether to not-care/only/never schedule imov/fmov instructions.
394     * (0: don't care, 1: no moves, 2: only moves.) This
395     * allows non-move instructions to get priority on each unit */
396    unsigned move_mode;
397 
398    /* For load/store: how many pipeline registers are in use? The two
399     * scheduled instructions cannot use more than the 256-bits of pipeline
400     * space available or RA will fail (as it would run out of pipeline
401     * registers and fail to spill without breaking the schedule) */
402 
403    unsigned pipeline_count;
404 
405    /* For load/store: is a ST_VARY.a32 instruction scheduled into the
406     * bundle? is a non-ST_VARY.a32 instruction scheduled? Potential
407     * hardware issue, unknown cause.
408     */
409    bool any_st_vary_a32, any_non_st_vary_a32;
410 };
410 
411 static bool
mir_adjust_constant(midgard_instruction * ins,unsigned src,unsigned * bundle_constant_mask,unsigned * comp_mapping,uint8_t * bundle_constants,bool upper)412 mir_adjust_constant(midgard_instruction *ins, unsigned src,
413                     unsigned *bundle_constant_mask, unsigned *comp_mapping,
414                     uint8_t *bundle_constants, bool upper)
415 {
416    unsigned type_size = nir_alu_type_get_type_size(ins->src_types[src]) / 8;
417    unsigned type_shift = util_logbase2(type_size);
418    unsigned max_comp = mir_components_for_type(ins->src_types[src]);
419    unsigned comp_mask = mir_from_bytemask(
420       mir_round_bytemask_up(mir_bytemask_of_read_components_index(ins, src),
421                             type_size * 8),
422       type_size * 8);
423    unsigned type_mask = (1 << type_size) - 1;
424 
425    /* Upper only makes sense for 16-bit */
426    if (type_size != 16 && upper)
427       return false;
428 
429    /* For 16-bit, we need to stay on either upper or lower halves to avoid
430     * disrupting the swizzle */
431    unsigned start = upper ? 8 : 0;
432    unsigned length = (type_size == 2) ? 8 : 16;
433 
434    for (unsigned comp = 0; comp < max_comp; comp++) {
435       if (!(comp_mask & (1 << comp)))
436          continue;
437 
438       uint8_t *constantp = ins->constants.u8 + (type_size * comp);
439       unsigned best_reuse_bytes = 0;
440       signed best_place = -1;
441       unsigned i, j;
442 
443       for (i = start; i < (start + length); i += type_size) {
444          unsigned reuse_bytes = 0;
445 
446          for (j = 0; j < type_size; j++) {
447             if (!(*bundle_constant_mask & (1 << (i + j))))
448                continue;
449             if (constantp[j] != bundle_constants[i + j])
450                break;
451             if ((i + j) > (start + length))
452                break;
453 
454             reuse_bytes++;
455          }
456 
457          /* Select the place where existing bytes can be
458           * reused so we leave empty slots to others
459           */
460          if (j == type_size &&
461              (reuse_bytes > best_reuse_bytes || best_place < 0)) {
462             best_reuse_bytes = reuse_bytes;
463             best_place = i;
464             break;
465          }
466       }
467 
468       /* This component couldn't fit in the remaining constant slot,
469        * no need check the remaining components, bail out now
470        */
471       if (best_place < 0)
472          return false;
473 
474       memcpy(&bundle_constants[i], constantp, type_size);
475       *bundle_constant_mask |= type_mask << best_place;
476       comp_mapping[comp] = best_place >> type_shift;
477    }
478 
479    return true;
480 }
481 
482 /* For an instruction that can fit, adjust it to fit and update the constants
483  * array, in destructive mode. Returns whether the fitting was successful. */
484 
485 static bool
mir_adjust_constants(midgard_instruction * ins,struct midgard_predicate * pred,bool destructive)486 mir_adjust_constants(midgard_instruction *ins, struct midgard_predicate *pred,
487                      bool destructive)
488 {
489    /* No constant, nothing to adjust */
490    if (!ins->has_constants)
491       return true;
492 
493    unsigned r_constant = SSA_FIXED_REGISTER(REGISTER_CONSTANT);
494    unsigned bundle_constant_mask = pred->constant_mask;
495    unsigned comp_mapping[2][16] = {};
496    uint8_t bundle_constants[16];
497 
498    memcpy(bundle_constants, pred->constants, 16);
499 
500    /* Let's try to find a place for each active component of the constant
501     * register.
502     */
503    for (unsigned src = 0; src < 2; ++src) {
504       if (ins->src[src] != SSA_FIXED_REGISTER(REGISTER_CONSTANT))
505          continue;
506 
507       /* First, try lower half (or whole for !16) */
508       if (mir_adjust_constant(ins, src, &bundle_constant_mask,
509                               comp_mapping[src], bundle_constants, false))
510          continue;
511 
512       /* Next, try upper half */
513       if (mir_adjust_constant(ins, src, &bundle_constant_mask,
514                               comp_mapping[src], bundle_constants, true))
515          continue;
516 
517       /* Otherwise bail */
518       return false;
519    }
520 
521    /* If non-destructive, we're done */
522    if (!destructive)
523       return true;
524 
525    /* Otherwise update the constant_mask and constant values */
526    pred->constant_mask = bundle_constant_mask;
527    memcpy(pred->constants, bundle_constants, 16);
528 
529    /* Use comp_mapping as a swizzle */
530    mir_foreach_src(ins, s) {
531       if (ins->src[s] == r_constant)
532          mir_compose_swizzle(ins->swizzle[s], comp_mapping[s], ins->swizzle[s]);
533    }
534 
535    return true;
536 }
537 
538 /* Conservative estimate of the pipeline registers required for load/store */
539 
540 static unsigned
mir_pipeline_count(midgard_instruction * ins)541 mir_pipeline_count(midgard_instruction *ins)
542 {
543    unsigned bytecount = 0;
544 
545    mir_foreach_src(ins, i) {
546       /* Skip empty source  */
547       if (ins->src[i] == ~0)
548          continue;
549 
550       if (i == 0) {
551          /* First source is a vector, worst-case the mask */
552          unsigned bytemask = mir_bytemask_of_read_components_index(ins, i);
553          unsigned max = util_logbase2(bytemask) + 1;
554          bytecount += max;
555       } else {
556          /* Sources 1 on are scalars */
557          bytecount += 4;
558       }
559    }
560 
561    unsigned dwords = DIV_ROUND_UP(bytecount, 16);
562    assert(dwords <= 2);
563 
564    return dwords;
565 }
566 
567 /* Matches FADD x, x with modifiers compatible. Since x + x = x * 2, for
568  * any x including of the form f(y) for some swizzle/abs/neg function f */
569 
570 static bool
mir_is_add_2(midgard_instruction * ins)571 mir_is_add_2(midgard_instruction *ins)
572 {
573    if (ins->op != midgard_alu_op_fadd)
574       return false;
575 
576    if (ins->src[0] != ins->src[1])
577       return false;
578 
579    if (ins->src_types[0] != ins->src_types[1])
580       return false;
581 
582    for (unsigned i = 0; i < MIR_VEC_COMPONENTS; ++i) {
583       if (ins->swizzle[0][i] != ins->swizzle[1][i])
584          return false;
585    }
586 
587    if (ins->src_abs[0] != ins->src_abs[1])
588       return false;
589 
590    if (ins->src_neg[0] != ins->src_neg[1])
591       return false;
592 
593    return true;
594 }
595 
596 static void
mir_adjust_unit(midgard_instruction * ins,unsigned unit)597 mir_adjust_unit(midgard_instruction *ins, unsigned unit)
598 {
599    /* FADD x, x = FMUL x, #2 */
600    if (mir_is_add_2(ins) && (unit & (UNITS_MUL | UNIT_VLUT))) {
601       ins->op = midgard_alu_op_fmul;
602 
603       ins->src[1] = ~0;
604       ins->src_abs[1] = false;
605       ins->src_neg[1] = false;
606 
607       ins->has_inline_constant = true;
608       ins->inline_constant = _mesa_float_to_half(2.0);
609    }
610 }
611 
612 static unsigned
mir_has_unit(midgard_instruction * ins,unsigned unit)613 mir_has_unit(midgard_instruction *ins, unsigned unit)
614 {
615    if (alu_opcode_props[ins->op].props & unit)
616       return true;
617 
618    /* FADD x, x can run on any adder or any multiplier */
619    if (mir_is_add_2(ins))
620       return true;
621 
622    return false;
623 }
624 
625 /* Net change in liveness if an instruction were scheduled. Loosely based on
626  * ir3's scheduler. */
627 
628 static int
mir_live_effect(uint16_t * liveness,midgard_instruction * ins,bool destructive)629 mir_live_effect(uint16_t *liveness, midgard_instruction *ins, bool destructive)
630 {
631    /* TODO: what if dest is used multiple times? */
632    int free_live = 0;
633 
634    if (ins->dest < SSA_FIXED_MINIMUM) {
635       unsigned bytemask = mir_bytemask(ins);
636       bytemask = util_next_power_of_two(bytemask + 1) - 1;
637       free_live += util_bitcount(liveness[ins->dest] & bytemask);
638 
639       if (destructive)
640          liveness[ins->dest] &= ~bytemask;
641    }
642 
643    int new_live = 0;
644 
645    mir_foreach_src(ins, s) {
646       unsigned S = ins->src[s];
647 
648       bool dupe = false;
649 
650       for (unsigned q = 0; q < s; ++q)
651          dupe |= (ins->src[q] == S);
652 
653       if (dupe)
654          continue;
655 
656       if (S < SSA_FIXED_MINIMUM) {
657          unsigned bytemask = mir_bytemask_of_read_components(ins, S);
658          bytemask = util_next_power_of_two(bytemask + 1) - 1;
659 
660          /* Count only the new components */
661          new_live += util_bitcount(bytemask & ~(liveness[S]));
662 
663          if (destructive)
664             liveness[S] |= bytemask;
665       }
666    }
667 
668    return new_live - free_live;
669 }
670 
/* Pick the best schedulable instruction from the worklist satisfying
 * `predicate`, or NULL if none qualifies. Candidates are filtered by tag,
 * unit, destination/mask requirements, constant fit, pipeline-register
 * budget, and conditional availability; among survivors the one with the
 * best (lowest) liveness effect wins, ties going to the higher index to
 * mimic in-order selection when scheduling bottom-up. In destructive mode
 * the winner is popped from the worklist and the predicate's bundle state
 * (constants, pipeline count, st_vary flags, no_cond) is updated. */
671 static midgard_instruction *
mir_choose_instruction(midgard_instruction ** instructions,uint16_t * liveness,BITSET_WORD * worklist,unsigned count,struct midgard_predicate * predicate)672 mir_choose_instruction(midgard_instruction **instructions, uint16_t *liveness,
673                        BITSET_WORD *worklist, unsigned count,
674                        struct midgard_predicate *predicate)
675 {
676    /* Parse the predicate */
677    unsigned tag = predicate->tag;
678    unsigned unit = predicate->unit;
679    bool scalar = (unit != ~0) && (unit & UNITS_SCALAR);
680    bool no_cond = predicate->no_cond;
681 
682    unsigned mask = predicate->mask;
683    unsigned dest = predicate->dest;
684    bool needs_dest = mask & 0xF;
685 
686    /* Iterate to find the best instruction satisfying the predicate */
687    unsigned i;
688 
689    signed best_index = -1;
690    signed best_effect = INT_MAX;
691    bool best_conditional = false;
692 
693    /* Enforce a simple metric limiting distance to keep down register
694     * pressure. TODO: replace with liveness tracking for much better
695     * results */
696 
697    unsigned max_active = 0;
698    unsigned max_distance = 36;
699 
700 #ifndef NDEBUG
701    /* Force in-order scheduling */
702    if (midgard_debug & MIDGARD_DBG_INORDER)
703       max_distance = 1;
704 #endif
705 
   /* Highest index currently in the worklist */
706    BITSET_FOREACH_SET(i, worklist, count) {
707       max_active = MAX2(max_active, i);
708    }
709 
710    BITSET_FOREACH_SET(i, worklist, count) {
      /* Too far from the scheduling frontier (i <= max_active, so the
       * unsigned subtraction cannot wrap) */
711       if ((max_active - i) >= max_distance)
712          continue;
713 
714       if (tag != ~0 && instructions[i]->type != tag)
715          continue;
716 
717       bool alu = (instructions[i]->type == TAG_ALU_4);
718       bool ldst = (instructions[i]->type == TAG_LOAD_STORE_4);
719 
720       bool branch = alu && (unit == ALU_ENAB_BR_COMPACT);
721       bool is_move = alu && (instructions[i]->op == midgard_alu_op_imov ||
722                              instructions[i]->op == midgard_alu_op_fmov);
723 
724       if (predicate->exclude != ~0 &&
725           instructions[i]->dest == predicate->exclude)
726          continue;
727 
728       if (alu && !branch && unit != ~0 &&
729           !(mir_has_unit(instructions[i], unit)))
730          continue;
731 
732       /* 0: don't care, 1: no moves, 2: only moves */
733       if (predicate->move_mode && ((predicate->move_mode - 1) != is_move))
734          continue;
735 
736       if (branch && !instructions[i]->compact_branch)
737          continue;
738 
739       if (alu && scalar && !mir_is_scalar(instructions[i]))
740          continue;
741 
      /* Must fit alongside the bundle constants already committed */
742       if (alu && predicate->constants &&
743           !mir_adjust_constants(instructions[i], predicate, false))
744          continue;
745 
746       if (needs_dest && instructions[i]->dest != dest)
747          continue;
748 
      /* Required mask bits must all be written... */
749       if (mask && ((~instructions[i]->mask) & mask))
750          continue;
751 
      /* ...and forbidden mask bits must not be */
752       if (instructions[i]->mask & predicate->no_mask)
753          continue;
754 
      /* Both load/stores together must fit in 2 pipeline registers */
755       if (ldst &&
756           mir_pipeline_count(instructions[i]) + predicate->pipeline_count > 2)
757          continue;
758 
      /* Don't mix ST_VARY.a32 with other load/stores in one bundle
       * (potential hardware issue, see struct midgard_predicate) */
759       bool st_vary_a32 = (instructions[i]->op == midgard_op_st_vary_32);
760 
761       if (ldst && predicate->any_non_st_vary_a32 && st_vary_a32)
762          continue;
763 
764       if (ldst && predicate->any_st_vary_a32 && !st_vary_a32)
765          continue;
766 
767       bool conditional = alu && !branch && OP_IS_CSEL(instructions[i]->op);
768       conditional |= (branch && instructions[i]->branch.conditional);
769 
770       if (conditional && no_cond)
771          continue;
772 
773       int effect = mir_live_effect(liveness, instructions[i], false);
774 
775       if (effect > best_effect)
776          continue;
777 
      /* On ties prefer the later instruction (in-order from the end) */
778       if (effect == best_effect && (signed)i < best_index)
779          continue;
780 
781       best_effect = effect;
782       best_index = i;
783       best_conditional = conditional;
784    }
785 
786    /* Did we find anything?  */
787 
788    if (best_index < 0)
789       return NULL;
790 
791    /* If we found something, remove it from the worklist */
792    assert(best_index < count);
793    midgard_instruction *I = instructions[best_index];
794 
795    if (predicate->destructive) {
796       BITSET_CLEAR(worklist, best_index);
797 
      /* Commit the constant placement trialed above */
798       if (I->type == TAG_ALU_4)
799          mir_adjust_constants(instructions[best_index], predicate, true);
800 
801       if (I->type == TAG_LOAD_STORE_4) {
802          predicate->pipeline_count +=
803             mir_pipeline_count(instructions[best_index]);
804 
805          if (instructions[best_index]->op == midgard_op_st_vary_32)
806             predicate->any_st_vary_a32 = true;
807          else
808             predicate->any_non_st_vary_a32 = true;
809       }
810 
811       if (I->type == TAG_ALU_4)
812          mir_adjust_unit(instructions[best_index], unit);
813 
814       /* Once we schedule a conditional, we can't again */
815       predicate->no_cond |= best_conditional;
816       mir_live_effect(liveness, instructions[best_index], true);
817    }
818 
819    return I;
820 }
821 
822 /* Still, we don't choose instructions in a vacuum. We need a way to choose the
823  * best bundle type (ALU, load/store, texture). Nondestructive. */
824 
825 static unsigned
mir_choose_bundle(midgard_instruction ** instructions,uint16_t * liveness,BITSET_WORD * worklist,unsigned count,unsigned num_ldst)826 mir_choose_bundle(midgard_instruction **instructions, uint16_t *liveness,
827                   BITSET_WORD *worklist, unsigned count, unsigned num_ldst)
828 {
829    /* At the moment, our algorithm is very simple - use the bundle of the
830     * best instruction, regardless of what else could be scheduled
831     * alongside it. This is not optimal but it works okay for in-order */
832 
833    struct midgard_predicate predicate = {
834       .tag = ~0,
835       .unit = ~0,
836       .destructive = false,
837       .exclude = ~0,
838    };
839 
840    midgard_instruction *chosen = mir_choose_instruction(
841       instructions, liveness, worklist, count, &predicate);
842 
843    if (chosen && chosen->type == TAG_LOAD_STORE_4 && !(num_ldst % 2)) {
844       /* Try to schedule load/store ops in pairs */
845 
846       predicate.exclude = chosen->dest;
847       predicate.tag = TAG_LOAD_STORE_4;
848 
849       chosen = mir_choose_instruction(instructions, liveness, worklist, count,
850                                       &predicate);
851       if (chosen)
852          return TAG_LOAD_STORE_4;
853 
854       predicate.tag = ~0;
855 
856       chosen = mir_choose_instruction(instructions, liveness, worklist, count,
857                                       &predicate);
858       assert(chosen == NULL || chosen->type != TAG_LOAD_STORE_4);
859 
860       if (chosen)
861          return chosen->type;
862       else
863          return TAG_LOAD_STORE_4;
864    }
865 
866    if (chosen)
867       return chosen->type;
868    else
869       return ~0;
870 }
871 
872 /* We want to choose an ALU instruction filling a given unit */
873 static void
mir_choose_alu(midgard_instruction ** slot,midgard_instruction ** instructions,uint16_t * liveness,BITSET_WORD * worklist,unsigned len,struct midgard_predicate * predicate,unsigned unit)874 mir_choose_alu(midgard_instruction **slot, midgard_instruction **instructions,
875                uint16_t *liveness, BITSET_WORD *worklist, unsigned len,
876                struct midgard_predicate *predicate, unsigned unit)
877 {
878    /* Did we already schedule to this slot? */
879    if ((*slot) != NULL)
880       return;
881 
882    /* Try to schedule something, if not */
883    predicate->unit = unit;
884    *slot =
885       mir_choose_instruction(instructions, liveness, worklist, len, predicate);
886 
887    /* Store unit upon scheduling */
888    if (*slot && !((*slot)->compact_branch))
889       (*slot)->unit = unit;
890 }
891 
/* When we are scheduling a branch/csel, we need the consumed condition in the
 * same block as a pipeline register. There are two options to enable this:
 *
 *  - Move the conditional into the bundle. Preferred, but only works if the
 *    conditional is used only once and is from this block.
 *  - Copy the conditional.
 *
 * We search for the conditional. If it's in this block, single-use, and
 * without embedded constants, we schedule it immediately. Otherwise, we
 * schedule a move for it.
 *
 * mir_comparison_mobile is a helper to find the moveable condition.
 */
905 
906 static unsigned
mir_comparison_mobile(compiler_context * ctx,midgard_instruction ** instructions,struct midgard_predicate * predicate,unsigned count,unsigned cond)907 mir_comparison_mobile(compiler_context *ctx, midgard_instruction **instructions,
908                       struct midgard_predicate *predicate, unsigned count,
909                       unsigned cond)
910 {
911    if (!mir_single_use(ctx, cond))
912       return ~0;
913 
914    unsigned ret = ~0;
915 
916    for (unsigned i = 0; i < count; ++i) {
917       if (instructions[i]->dest != cond)
918          continue;
919 
920       /* Must fit in an ALU bundle */
921       if (instructions[i]->type != TAG_ALU_4)
922          return ~0;
923 
924       /* If it would itself require a condition, that's recursive */
925       if (OP_IS_CSEL(instructions[i]->op))
926          return ~0;
927 
928       /* We'll need to rewrite to .w but that doesn't work for vector
929        * ops that don't replicate (ball/bany), so bail there */
930 
931       if (GET_CHANNEL_COUNT(alu_opcode_props[instructions[i]->op].props))
932          return ~0;
933 
934       /* Ensure it will fit with constants */
935 
936       if (!mir_adjust_constants(instructions[i], predicate, false))
937          return ~0;
938 
939       /* Ensure it is written only once */
940 
941       if (ret != ~0)
942          return ~0;
943       else
944          ret = i;
945    }
946 
947    /* Inject constants now that we are sure we want to */
948    if (ret != ~0)
949       mir_adjust_constants(instructions[ret], predicate, true);
950 
951    return ret;
952 }
953 
954 /* Using the information about the moveable conditional itself, we either pop
955  * that condition off the worklist for use now, or create a move to
956  * artificially schedule instead as a fallback */
957 
958 static midgard_instruction *
mir_schedule_comparison(compiler_context * ctx,midgard_instruction ** instructions,struct midgard_predicate * predicate,BITSET_WORD * worklist,unsigned count,unsigned cond,bool vector,unsigned * swizzle,midgard_instruction * user)959 mir_schedule_comparison(compiler_context *ctx,
960                         midgard_instruction **instructions,
961                         struct midgard_predicate *predicate,
962                         BITSET_WORD *worklist, unsigned count, unsigned cond,
963                         bool vector, unsigned *swizzle,
964                         midgard_instruction *user)
965 {
966    /* TODO: swizzle when scheduling */
967    unsigned comp_i =
968       (!vector && (swizzle[0] == 0))
969          ? mir_comparison_mobile(ctx, instructions, predicate, count, cond)
970          : ~0;
971 
972    /* If we can, schedule the condition immediately */
973    if ((comp_i != ~0) && BITSET_TEST(worklist, comp_i)) {
974       assert(comp_i < count);
975       BITSET_CLEAR(worklist, comp_i);
976       return instructions[comp_i];
977    }
978 
979    /* Otherwise, we insert a move */
980 
981    midgard_instruction mov = v_mov(cond, cond);
982    mov.mask = vector ? 0xF : 0x1;
983    memcpy(mov.swizzle[1], swizzle, sizeof(mov.swizzle[1]));
984 
985    return mir_insert_instruction_before(ctx, user, mov);
986 }
987 
988 /* Most generally, we need instructions writing to r31 in the appropriate
989  * components */
990 
991 static midgard_instruction *
mir_schedule_condition(compiler_context * ctx,struct midgard_predicate * predicate,BITSET_WORD * worklist,unsigned count,midgard_instruction ** instructions,midgard_instruction * last)992 mir_schedule_condition(compiler_context *ctx,
993                        struct midgard_predicate *predicate,
994                        BITSET_WORD *worklist, unsigned count,
995                        midgard_instruction **instructions,
996                        midgard_instruction *last)
997 {
998    /* For a branch, the condition is the only argument; for csel, third */
999    bool branch = last->compact_branch;
1000    unsigned condition_index = branch ? 0 : 2;
1001 
1002    /* csel_v is vector; otherwise, conditions are scalar */
1003    bool vector = !branch && OP_IS_CSEL_V(last->op);
1004 
1005    /* Grab the conditional instruction */
1006 
1007    midgard_instruction *cond = mir_schedule_comparison(
1008       ctx, instructions, predicate, worklist, count, last->src[condition_index],
1009       vector, last->swizzle[condition_index], last);
1010 
1011    /* We have exclusive reign over this (possibly move) conditional
1012     * instruction. We can rewrite into a pipeline conditional register */
1013 
1014    predicate->exclude = cond->dest;
1015    cond->dest = SSA_FIXED_REGISTER(31);
1016    last->src[condition_index] = cond->dest;
1017 
1018    if (!vector) {
1019       cond->mask = (1 << COMPONENT_W);
1020 
1021       mir_foreach_src(cond, s) {
1022          if (cond->src[s] == ~0)
1023             continue;
1024 
1025          for (unsigned q = 0; q < 4; ++q)
1026             cond->swizzle[s][q + COMPONENT_W] = cond->swizzle[s][q];
1027       }
1028 
1029       last->swizzle[condition_index][0] = COMPONENT_W;
1030    }
1031 
1032    /* Schedule the unit: csel is always in the latter pipeline, so a csel
1033     * condition must be in the former pipeline stage (vmul/sadd),
1034     * depending on scalar/vector of the instruction itself. A branch must
1035     * be written from the latter pipeline stage and a branch condition is
1036     * always scalar, so it is always in smul (exception: ball/bany, which
1037     * will be vadd) */
1038 
1039    if (branch)
1040       cond->unit = UNIT_SMUL;
1041    else
1042       cond->unit = vector ? UNIT_VMUL : UNIT_SADD;
1043 
1044    return cond;
1045 }
1046 
1047 /* Schedules a single bundle of the given type */
1048 
1049 static midgard_bundle
mir_schedule_texture(midgard_instruction ** instructions,uint16_t * liveness,BITSET_WORD * worklist,unsigned len,bool is_vertex)1050 mir_schedule_texture(midgard_instruction **instructions, uint16_t *liveness,
1051                      BITSET_WORD *worklist, unsigned len, bool is_vertex)
1052 {
1053    struct midgard_predicate predicate = {
1054       .tag = TAG_TEXTURE_4,
1055       .destructive = true,
1056       .exclude = ~0,
1057    };
1058 
1059    midgard_instruction *ins =
1060       mir_choose_instruction(instructions, liveness, worklist, len, &predicate);
1061 
1062    mir_update_worklist(worklist, len, instructions, ins);
1063 
1064    struct midgard_bundle out = {
1065       .tag = ins->op == midgard_tex_op_barrier ? TAG_TEXTURE_4_BARRIER
1066              : (ins->op == midgard_tex_op_fetch) || is_vertex
1067                 ? TAG_TEXTURE_4_VTX
1068                 : TAG_TEXTURE_4,
1069       .instruction_count = 1,
1070       .instructions = {ins},
1071    };
1072 
1073    return out;
1074 }
1075 
1076 static midgard_bundle
mir_schedule_ldst(midgard_instruction ** instructions,uint16_t * liveness,BITSET_WORD * worklist,unsigned len,unsigned * num_ldst)1077 mir_schedule_ldst(midgard_instruction **instructions, uint16_t *liveness,
1078                   BITSET_WORD *worklist, unsigned len, unsigned *num_ldst)
1079 {
1080    struct midgard_predicate predicate = {
1081       .tag = TAG_LOAD_STORE_4,
1082       .destructive = true,
1083       .exclude = ~0,
1084    };
1085 
1086    /* Try to pick two load/store ops. Second not gauranteed to exist */
1087 
1088    midgard_instruction *ins =
1089       mir_choose_instruction(instructions, liveness, worklist, len, &predicate);
1090 
1091    midgard_instruction *pair =
1092       mir_choose_instruction(instructions, liveness, worklist, len, &predicate);
1093 
1094    assert(ins != NULL);
1095 
1096    struct midgard_bundle out = {
1097       .tag = TAG_LOAD_STORE_4,
1098       .instruction_count = pair ? 2 : 1,
1099       .instructions = {ins, pair},
1100    };
1101 
1102    *num_ldst -= out.instruction_count;
1103 
1104    /* We have to update the worklist atomically, since the two
1105     * instructions run concurrently (TODO: verify it's not pipelined) */
1106 
1107    mir_update_worklist(worklist, len, instructions, ins);
1108    mir_update_worklist(worklist, len, instructions, pair);
1109 
1110    return out;
1111 }
1112 
1113 static void
mir_schedule_zs_write(compiler_context * ctx,struct midgard_predicate * predicate,midgard_instruction ** instructions,uint16_t * liveness,BITSET_WORD * worklist,unsigned len,midgard_instruction * branch,midgard_instruction ** smul,midgard_instruction ** vadd,midgard_instruction ** vlut,bool stencil)1114 mir_schedule_zs_write(compiler_context *ctx,
1115                       struct midgard_predicate *predicate,
1116                       midgard_instruction **instructions, uint16_t *liveness,
1117                       BITSET_WORD *worklist, unsigned len,
1118                       midgard_instruction *branch, midgard_instruction **smul,
1119                       midgard_instruction **vadd, midgard_instruction **vlut,
1120                       bool stencil)
1121 {
1122    bool success = false;
1123    unsigned idx = stencil ? 3 : 2;
1124    unsigned src =
1125       (branch->src[0] == ~0) ? SSA_FIXED_REGISTER(1) : branch->src[idx];
1126 
1127    predicate->dest = src;
1128    predicate->mask = 0x1;
1129 
1130    midgard_instruction **units[] = {smul, vadd, vlut};
1131    unsigned unit_names[] = {UNIT_SMUL, UNIT_VADD, UNIT_VLUT};
1132 
1133    for (unsigned i = 0; i < 3; ++i) {
1134       if (*(units[i]))
1135          continue;
1136 
1137       predicate->unit = unit_names[i];
1138       midgard_instruction *ins = mir_choose_instruction(
1139          instructions, liveness, worklist, len, predicate);
1140 
1141       if (ins) {
1142          ins->unit = unit_names[i];
1143          *(units[i]) = ins;
1144          success |= true;
1145          break;
1146       }
1147    }
1148 
1149    predicate->dest = predicate->mask = 0;
1150 
1151    if (success)
1152       return;
1153 
1154    midgard_instruction *mov = ralloc(ctx, midgard_instruction);
1155    *mov = v_mov(src, make_compiler_temp(ctx));
1156    mov->mask = 0x1;
1157 
1158    branch->src[idx] = mov->dest;
1159 
1160    if (stencil) {
1161       unsigned swizzle = (branch->src[0] == ~0) ? COMPONENT_Y : COMPONENT_X;
1162 
1163       for (unsigned c = 0; c < 16; ++c)
1164          mov->swizzle[1][c] = swizzle;
1165    }
1166 
1167    for (unsigned i = 0; i < 3; ++i) {
1168       if (!(*(units[i]))) {
1169          *(units[i]) = mov;
1170          mov->unit = unit_names[i];
1171          return;
1172       }
1173    }
1174 
1175    unreachable("Could not schedule Z/S move to any unit");
1176 }
1177 
1178 static midgard_bundle
mir_schedule_alu(compiler_context * ctx,midgard_instruction ** instructions,uint16_t * liveness,BITSET_WORD * worklist,unsigned len)1179 mir_schedule_alu(compiler_context *ctx, midgard_instruction **instructions,
1180                  uint16_t *liveness, BITSET_WORD *worklist, unsigned len)
1181 {
1182    struct midgard_bundle bundle = {};
1183 
1184    unsigned bytes_emitted = sizeof(bundle.control);
1185 
1186    struct midgard_predicate predicate = {
1187       .tag = TAG_ALU_4,
1188       .destructive = true,
1189       .exclude = ~0,
1190       .constants = &bundle.constants,
1191    };
1192 
1193    midgard_instruction *vmul = NULL;
1194    midgard_instruction *vadd = NULL;
1195    midgard_instruction *vlut = NULL;
1196    midgard_instruction *smul = NULL;
1197    midgard_instruction *sadd = NULL;
1198    midgard_instruction *branch = NULL;
1199 
1200    mir_choose_alu(&branch, instructions, liveness, worklist, len, &predicate,
1201                   ALU_ENAB_BR_COMPACT);
1202    mir_update_worklist(worklist, len, instructions, branch);
1203    unsigned writeout = branch ? branch->writeout : 0;
1204 
1205    if (branch && branch->branch.conditional) {
1206       midgard_instruction *cond = mir_schedule_condition(
1207          ctx, &predicate, worklist, len, instructions, branch);
1208 
1209       if (cond->unit == UNIT_VADD)
1210          vadd = cond;
1211       else if (cond->unit == UNIT_SMUL)
1212          smul = cond;
1213       else
1214          unreachable("Bad condition");
1215    }
1216 
1217    /* If we have a render target reference, schedule a move for it. Since
1218     * this will be in sadd, we boost this to prevent scheduling csel into
1219     * smul */
1220 
1221    if (writeout && (branch->constants.u32[0] || ctx->inputs->is_blend)) {
1222       sadd = ralloc(ctx, midgard_instruction);
1223       *sadd = v_mov(~0, make_compiler_temp(ctx));
1224       sadd->unit = UNIT_SADD;
1225       sadd->mask = 0x1;
1226       sadd->has_inline_constant = true;
1227       sadd->inline_constant = branch->constants.u32[0];
1228       branch->src[1] = sadd->dest;
1229       branch->src_types[1] = sadd->dest_type;
1230    }
1231 
1232    if (writeout) {
1233       /* Propagate up */
1234       bundle.last_writeout = branch->last_writeout;
1235 
1236       /* Mask off any conditionals.
1237        * This prevents csel and csel_v being scheduled into smul
1238        * since we might not have room for a conditional in vmul/sadd.
1239        * This is important because both writeout and csel have same-bundle
1240        * requirements on their dependencies. */
1241       predicate.no_cond = true;
1242    }
1243 
1244    /* Set r1.w to the return address so we can return from blend shaders */
1245    if (writeout) {
1246       vadd = ralloc(ctx, midgard_instruction);
1247       *vadd = v_mov(~0, make_compiler_temp(ctx));
1248 
1249       if (!ctx->inputs->is_blend) {
1250          vadd->op = midgard_alu_op_iadd;
1251          vadd->src[0] = SSA_FIXED_REGISTER(31);
1252          vadd->src_types[0] = nir_type_uint32;
1253 
1254          for (unsigned c = 0; c < 16; ++c)
1255             vadd->swizzle[0][c] = COMPONENT_X;
1256 
1257          vadd->has_inline_constant = true;
1258          vadd->inline_constant = 0;
1259       } else {
1260          vadd->src[1] = SSA_FIXED_REGISTER(1);
1261          vadd->src_types[0] = nir_type_uint32;
1262 
1263          for (unsigned c = 0; c < 16; ++c)
1264             vadd->swizzle[1][c] = COMPONENT_W;
1265       }
1266 
1267       vadd->unit = UNIT_VADD;
1268       vadd->mask = 0x1;
1269       branch->dest = vadd->dest;
1270       branch->dest_type = vadd->dest_type;
1271    }
1272 
1273    if (writeout & PAN_WRITEOUT_Z)
1274       mir_schedule_zs_write(ctx, &predicate, instructions, liveness, worklist,
1275                             len, branch, &smul, &vadd, &vlut, false);
1276 
1277    if (writeout & PAN_WRITEOUT_S)
1278       mir_schedule_zs_write(ctx, &predicate, instructions, liveness, worklist,
1279                             len, branch, &smul, &vadd, &vlut, true);
1280 
1281    mir_choose_alu(&smul, instructions, liveness, worklist, len, &predicate,
1282                   UNIT_SMUL);
1283 
1284    for (unsigned mode = 1; mode < 3; ++mode) {
1285       predicate.move_mode = mode;
1286       predicate.no_mask = writeout ? (1 << 3) : 0;
1287       mir_choose_alu(&vlut, instructions, liveness, worklist, len, &predicate,
1288                      UNIT_VLUT);
1289       predicate.no_mask = 0;
1290       mir_choose_alu(&vadd, instructions, liveness, worklist, len, &predicate,
1291                      UNIT_VADD);
1292    }
1293 
1294    /* Reset */
1295    predicate.move_mode = 0;
1296    predicate.exclude = ~0;
1297 
1298    mir_update_worklist(worklist, len, instructions, vlut);
1299    mir_update_worklist(worklist, len, instructions, vadd);
1300    mir_update_worklist(worklist, len, instructions, smul);
1301 
1302    bool vadd_csel = vadd && OP_IS_CSEL(vadd->op);
1303    bool smul_csel = smul && OP_IS_CSEL(smul->op);
1304 
1305    if (vadd_csel || smul_csel) {
1306       midgard_instruction *ins = vadd_csel ? vadd : smul;
1307       midgard_instruction *cond = mir_schedule_condition(
1308          ctx, &predicate, worklist, len, instructions, ins);
1309 
1310       if (cond->unit == UNIT_VMUL)
1311          vmul = cond;
1312       else if (cond->unit == UNIT_SADD)
1313          sadd = cond;
1314       else
1315          unreachable("Bad condition");
1316    }
1317 
1318    /* Stage 2, let's schedule sadd before vmul for writeout */
1319    mir_choose_alu(&sadd, instructions, liveness, worklist, len, &predicate,
1320                   UNIT_SADD);
1321 
1322    /* Check if writeout reads its own register */
1323 
1324    if (writeout) {
1325       midgard_instruction *stages[] = {sadd, vadd, smul, vlut};
1326       unsigned src =
1327          (branch->src[0] == ~0) ? SSA_FIXED_REGISTER(0) : branch->src[0];
1328       unsigned writeout_mask = 0x0;
1329       bool bad_writeout = false;
1330 
1331       for (unsigned i = 0; i < ARRAY_SIZE(stages); ++i) {
1332          if (!stages[i])
1333             continue;
1334 
1335          if (stages[i]->dest != src)
1336             continue;
1337 
1338          writeout_mask |= stages[i]->mask;
1339          bad_writeout |= mir_has_arg(stages[i], branch->src[0]);
1340       }
1341 
1342       /* It's possible we'll be able to schedule something into vmul
1343        * to fill r0. Let's peak into the future, trying to schedule
1344        * vmul specially that way. */
1345 
1346       unsigned full_mask = 0xF;
1347 
1348       if (!bad_writeout && writeout_mask != full_mask) {
1349          predicate.unit = UNIT_VMUL;
1350          predicate.dest = src;
1351          predicate.mask = writeout_mask ^ full_mask;
1352 
1353          struct midgard_instruction *peaked = mir_choose_instruction(
1354             instructions, liveness, worklist, len, &predicate);
1355 
1356          if (peaked) {
1357             vmul = peaked;
1358             vmul->unit = UNIT_VMUL;
1359             writeout_mask |= predicate.mask;
1360             assert(writeout_mask == full_mask);
1361          }
1362 
1363          /* Cleanup */
1364          predicate.dest = predicate.mask = 0;
1365       }
1366 
1367       /* Finally, add a move if necessary */
1368       if (bad_writeout || writeout_mask != full_mask) {
1369          unsigned temp = (branch->src[0] == ~0) ? SSA_FIXED_REGISTER(0)
1370                                                 : make_compiler_temp(ctx);
1371 
1372          vmul = ralloc(ctx, midgard_instruction);
1373          *vmul = v_mov(src, temp);
1374          vmul->unit = UNIT_VMUL;
1375          vmul->mask = full_mask ^ writeout_mask;
1376 
1377          /* Rewrite to use our temp */
1378 
1379          for (unsigned i = 0; i < ARRAY_SIZE(stages); ++i) {
1380             if (stages[i]) {
1381                mir_rewrite_index_dst_single(stages[i], src, temp);
1382                mir_rewrite_index_src_single(stages[i], src, temp);
1383             }
1384          }
1385 
1386          mir_rewrite_index_src_single(branch, src, temp);
1387       }
1388    }
1389 
1390    mir_choose_alu(&vmul, instructions, liveness, worklist, len, &predicate,
1391                   UNIT_VMUL);
1392 
1393    mir_update_worklist(worklist, len, instructions, vmul);
1394    mir_update_worklist(worklist, len, instructions, sadd);
1395 
1396    bundle.has_embedded_constants = predicate.constant_mask != 0;
1397 
1398    unsigned padding = 0;
1399 
1400    /* Now that we have finished scheduling, build up the bundle */
1401    midgard_instruction *stages[] = {vmul, sadd, vadd, smul, vlut, branch};
1402 
1403    for (unsigned i = 0; i < ARRAY_SIZE(stages); ++i) {
1404       if (stages[i]) {
1405          bundle.control |= stages[i]->unit;
1406          bytes_emitted += bytes_for_instruction(stages[i]);
1407          bundle.instructions[bundle.instruction_count++] = stages[i];
1408 
1409          /* If we branch, we can't spill to TLS since the store
1410           * instruction will never get executed. We could try to
1411           * break the bundle but this is probably easier for
1412           * now. */
1413 
1414          if (branch)
1415             stages[i]->no_spill |= (1 << REG_CLASS_WORK);
1416       }
1417    }
1418 
1419    /* Pad ALU op to nearest word */
1420 
1421    if (bytes_emitted & 15) {
1422       padding = 16 - (bytes_emitted & 15);
1423       bytes_emitted += padding;
1424    }
1425 
1426    /* Constants must always be quadwords */
1427    if (bundle.has_embedded_constants)
1428       bytes_emitted += 16;
1429 
1430    /* Size ALU instruction for tag */
1431    bundle.tag = (TAG_ALU_4) + (bytes_emitted / 16) - 1;
1432 
1433    bool tilebuf_wait = branch && branch->compact_branch &&
1434                        branch->branch.target_type == TARGET_TILEBUF_WAIT;
1435 
1436    /* MRT capable GPUs use a special writeout procedure */
1437    if ((writeout || tilebuf_wait) && !(ctx->quirks & MIDGARD_NO_UPPER_ALU))
1438       bundle.tag += 4;
1439 
1440    bundle.padding = padding;
1441    bundle.control |= bundle.tag;
1442 
1443    return bundle;
1444 }
1445 
1446 /* Schedule a single block by iterating its instruction to create bundles.
1447  * While we go, tally about the bundle sizes to compute the block size. */
1448 
1449 static void
schedule_block(compiler_context * ctx,midgard_block * block)1450 schedule_block(compiler_context *ctx, midgard_block *block)
1451 {
1452    /* Copy list to dynamic array */
1453    unsigned len = 0;
1454    midgard_instruction **instructions = flatten_mir(block, &len);
1455 
1456    if (!len)
1457       return;
1458 
1459    /* Calculate dependencies and initial worklist */
1460    unsigned node_count = ctx->temp_count + 1;
1461    mir_create_dependency_graph(instructions, len, node_count);
1462 
1463    /* Allocate the worklist */
1464    size_t sz = BITSET_WORDS(len) * sizeof(BITSET_WORD);
1465    BITSET_WORD *worklist = calloc(sz, 1);
1466    uint16_t *liveness = calloc(node_count, 2);
1467    mir_initialize_worklist(worklist, instructions, len);
1468 
1469    /* Count the number of load/store instructions so we know when it's
1470     * worth trying to schedule them in pairs. */
1471    unsigned num_ldst = 0;
1472    for (unsigned i = 0; i < len; ++i) {
1473       if (instructions[i]->type == TAG_LOAD_STORE_4)
1474          ++num_ldst;
1475    }
1476 
1477    struct util_dynarray bundles;
1478    util_dynarray_init(&bundles, NULL);
1479 
1480    block->quadword_count = 0;
1481 
1482    for (;;) {
1483       unsigned tag =
1484          mir_choose_bundle(instructions, liveness, worklist, len, num_ldst);
1485       midgard_bundle bundle;
1486 
1487       if (tag == TAG_TEXTURE_4)
1488          bundle = mir_schedule_texture(instructions, liveness, worklist, len,
1489                                        ctx->stage != MESA_SHADER_FRAGMENT);
1490       else if (tag == TAG_LOAD_STORE_4)
1491          bundle =
1492             mir_schedule_ldst(instructions, liveness, worklist, len, &num_ldst);
1493       else if (tag == TAG_ALU_4)
1494          bundle = mir_schedule_alu(ctx, instructions, liveness, worklist, len);
1495       else
1496          break;
1497 
1498       for (unsigned i = 0; i < bundle.instruction_count; ++i)
1499          bundle.instructions[i]->bundle_id =
1500             ctx->quadword_count + block->quadword_count;
1501 
1502       util_dynarray_append(&bundles, midgard_bundle, bundle);
1503       block->quadword_count += midgard_tag_props[bundle.tag].size;
1504    }
1505 
1506    assert(num_ldst == 0);
1507 
1508    /* We emitted bundles backwards; copy into the block in reverse-order */
1509 
1510    util_dynarray_init(&block->bundles, block);
1511    util_dynarray_foreach_reverse(&bundles, midgard_bundle, bundle) {
1512       util_dynarray_append(&block->bundles, midgard_bundle, *bundle);
1513    }
1514    util_dynarray_fini(&bundles);
1515 
1516    block->scheduled = true;
1517    ctx->quadword_count += block->quadword_count;
1518 
1519    /* Reorder instructions to match bundled. First remove existing
1520     * instructions and then recreate the list */
1521 
1522    mir_foreach_instr_in_block_safe(block, ins) {
1523       list_del(&ins->link);
1524    }
1525 
1526    mir_foreach_instr_in_block_scheduled_rev(block, ins) {
1527       list_add(&ins->link, &block->base.instructions);
1528    }
1529 
1530    free(instructions); /* Allocated by flatten_mir() */
1531    free(worklist);
1532    free(liveness);
1533 }
1534 
1535 /* Insert moves to ensure we can register allocate load/store registers */
1536 static void
mir_lower_ldst(compiler_context * ctx)1537 mir_lower_ldst(compiler_context *ctx)
1538 {
1539    mir_foreach_instr_global_safe(ctx, I) {
1540       if (I->type != TAG_LOAD_STORE_4)
1541          continue;
1542 
1543       mir_foreach_src(I, s) {
1544          if (s == 0)
1545             continue;
1546          if (I->src[s] == ~0)
1547             continue;
1548          if (I->swizzle[s][0] == 0)
1549             continue;
1550 
1551          unsigned temp = make_compiler_temp(ctx);
1552          midgard_instruction mov = v_mov(I->src[s], temp);
1553          mov.mask = 0x1;
1554          mov.dest_type = I->src_types[s];
1555          for (unsigned c = 0; c < NIR_MAX_VEC_COMPONENTS; ++c)
1556             mov.swizzle[1][c] = I->swizzle[s][0];
1557 
1558          mir_insert_instruction_before(ctx, I, mov);
1559          I->src[s] = mov.dest;
1560          I->swizzle[s][0] = 0;
1561       }
1562    }
1563 }
1564 
1565 /* Insert moves to ensure we can register allocate blend writeout */
1566 static void
mir_lower_blend_input(compiler_context * ctx)1567 mir_lower_blend_input(compiler_context *ctx)
1568 {
1569    mir_foreach_block(ctx, _blk) {
1570       midgard_block *blk = (midgard_block *)_blk;
1571 
1572       if (list_is_empty(&blk->base.instructions))
1573          continue;
1574 
1575       midgard_instruction *I = mir_last_in_block(blk);
1576 
1577       if (!I || I->type != TAG_ALU_4 || !I->writeout)
1578          continue;
1579 
1580       mir_foreach_src(I, s) {
1581          unsigned src = I->src[s];
1582 
1583          if (src >= ctx->temp_count)
1584             continue;
1585 
1586          if (!_blk->live_out[src])
1587             continue;
1588 
1589          unsigned temp = make_compiler_temp(ctx);
1590          midgard_instruction mov = v_mov(src, temp);
1591          mov.mask = 0xF;
1592          mov.dest_type = nir_type_uint32;
1593          mir_insert_instruction_before(ctx, I, mov);
1594          I->src[s] = mov.dest;
1595       }
1596    }
1597 }
1598 
1599 void
midgard_schedule_program(compiler_context * ctx)1600 midgard_schedule_program(compiler_context *ctx)
1601 {
1602    mir_lower_ldst(ctx);
1603    midgard_promote_uniforms(ctx);
1604 
1605    /* Must be lowered right before scheduling */
1606    mir_lower_special_reads(ctx);
1607    mir_squeeze_index(ctx);
1608 
1609    if (ctx->stage == MESA_SHADER_FRAGMENT) {
1610       mir_invalidate_liveness(ctx);
1611       mir_compute_liveness(ctx);
1612       mir_lower_blend_input(ctx);
1613    }
1614 
1615    mir_squeeze_index(ctx);
1616 
1617    /* Lowering can introduce some dead moves */
1618 
1619    mir_foreach_block(ctx, _block) {
1620       midgard_block *block = (midgard_block *)_block;
1621       midgard_opt_dead_move_eliminate(ctx, block);
1622       schedule_block(ctx, block);
1623    }
1624 }
1625