/*
 * Copyright (C) 2022 Collabora Ltd.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "bi_builder.h"
#include "va_compiler.h"
#include "valhall_enums.h"

/*
 * Insert flow control into a scheduled and register allocated shader. This
 * pass only inserts NOPs with the appropriate flow control modifiers. It
 * should be followed by a cleanup pass to merge flow control modifiers on
 * adjacent instructions, eliminating the NOPs. This decouples optimization
 * from correctness, simplifying both passes.
 *
 * This pass is responsible for calculating dependencies, according to the
 * rules:
 *
 * 1. An instruction that depends on the results of a previous asynchronous
 *    instruction must first wait for that instruction's slot, unless all
 *    reaching code paths already depended on it.
 * 2. More generally, any dependencies must be encoded. This includes
 *    Write-After-Write and Write-After-Read hazards with LOAD/STORE to memory.
 * 3. The shader must wait on slot #6 before running BLEND, ATEST.
 * 4. The shader must wait on slot #7 before running BLEND, ST_TILE.
 * 5. BARRIER must wait on every active slot.
 *
 * Unlike Bifrost, it is not necessary to worry about outbound staging
 * registers, as the hardware stalls reading staging registers when issuing
 * asynchronous instructions. So we don't track reads in our model of the
 * hardware scoreboard. This makes things a bit simpler.
 *
 * We may reuse slots for multiple asynchronous instructions, though there may
 * be a performance penalty.
 */
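
/*
 * For illustration of rule 1 (schematic mnemonics, not exact Valhall
 * assembly): suppose an asynchronous load on slot #0 writes r0 and a later
 * instruction reads r0. This pass inserts a wait on slot #0 first:
 *
 *    LOAD.i32.slot0 r0, ...
 *    ...
 *    NOP.wait0            <-- inserted by this pass
 *    FADD.f32 r1, r0, r1
 */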

#define BI_NUM_REGISTERS 64

/*
 * Insert a NOP instruction with given flow control.
 */
static void
bi_flow(bi_context *ctx, bi_cursor cursor, enum va_flow flow)
{
   bi_builder b = bi_init_builder(ctx, cursor);

   bi_nop(&b)->flow = flow;
}

static uint64_t
bi_read_mask(bi_instr *I)
{
   uint64_t mask = 0;

   bi_foreach_src(I, s) {
      if (I->src[s].type == BI_INDEX_REGISTER) {
         unsigned reg = I->src[s].value;
         unsigned count = bi_count_read_registers(I, s);

         mask |= (BITFIELD64_MASK(count) << reg);
      }
   }

   return mask;
}
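
/* For example, a source operand at register r4 reading two registers
 * contributes BITFIELD64_MASK(2) << 4 = 0b11 << 4 = 0x30, covering r4 and r5.
 * bi_write_mask below builds the destination mask the same way.
 */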

static uint64_t
bi_write_mask(bi_instr *I)
{
   uint64_t mask = 0;

   bi_foreach_dest(I, d) {
      assert(I->dest[d].type == BI_INDEX_REGISTER);

      unsigned reg = I->dest[d].value;
      unsigned count = bi_count_write_registers(I, d);

      mask |= (BITFIELD64_MASK(count) << reg);
   }

   return mask;
}

static bool
bi_ld_vary_writes_hidden_register(const bi_instr *I)
{
   /* Only varying loads can write the hidden register */
   if (bi_opcode_props[I->op].message != BIFROST_MESSAGE_VARYING)
      return false;

   /* They only write in some update modes */
   return (I->update == BI_UPDATE_STORE) || (I->update == BI_UPDATE_CLOBBER);
}

static bool
bi_is_memory_access(const bi_instr *I)
{
   /* Some instructions on the attribute unit are functionally a general
    * memory load
    */
   switch (I->op) {
   case BI_OPCODE_LD_ATTR_TEX:
   case BI_OPCODE_LD_TEX:
   case BI_OPCODE_LD_TEX_IMM:
      return true;
   default:
      break;
   }

   /* UBOs are read-only so there are no ordering constraints */
   if (I->seg == BI_SEG_UBO)
      return false;

   switch (bi_opcode_props[I->op].message) {
   case BIFROST_MESSAGE_LOAD:
   case BIFROST_MESSAGE_STORE:
   case BIFROST_MESSAGE_ATOMIC:
      return true;
   default:
      return false;
   }
}

/* Update the scoreboard model to assign an instruction to a given slot */

static void
bi_push_instr(struct bi_scoreboard_state *st, bi_instr *I)
{
   if (bi_opcode_props[I->op].sr_write)
      st->write[I->slot] |= bi_write_mask(I);

   if (bi_is_memory_access(I))
      st->memory |= BITFIELD_BIT(I->slot);

   if (bi_opcode_props[I->op].message == BIFROST_MESSAGE_VARYING)
      st->varying |= BITFIELD_BIT(I->slot);
}

static uint8_t MUST_CHECK
bi_pop_slot(struct bi_scoreboard_state *st, unsigned slot)
{
   st->write[slot] = 0;
   st->varying &= ~BITFIELD_BIT(slot);
   st->memory &= ~BITFIELD_BIT(slot);

   return BITFIELD_BIT(slot);
}
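
/* For example, popping slot #2 clears its pending state and returns
 * BITFIELD_BIT(2) = 0b100, which callers OR into I->flow; the bitmap of
 * general slots lines up with the va_flow encoding for waits.
 */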

/* Adds a dependency on each slot writing any specified register */

static uint8_t MUST_CHECK
bi_depend_on_writers(struct bi_scoreboard_state *st, uint64_t regmask)
{
   uint8_t slots = 0;

   for (unsigned slot = 0; slot < ARRAY_SIZE(st->write); ++slot) {
      if (st->write[slot] & regmask)
         slots |= bi_pop_slot(st, slot);
   }

   return slots;
}

/* Sets the dependencies for a given instruction, updating the model */

static void
bi_set_dependencies(bi_block *block, bi_instr *I,
                    struct bi_scoreboard_state *st)
{
   /* Depend on writers to handle read-after-write and write-after-write
    * dependencies. Write-after-read dependencies are handled in the hardware
    * where necessary, so we don't worry about them.
    */
   I->flow |= bi_depend_on_writers(st, bi_read_mask(I) | bi_write_mask(I));

   /* Handle write-after-write and write-after-read dependencies for the
    * varying hidden registers. Read-after-write dependencies are handled in
    * hardware.
    */
   if (bi_ld_vary_writes_hidden_register(I)) {
      u_foreach_bit(slot, st->varying)
         I->flow |= bi_pop_slot(st, slot);
   }

   /* For now, serialize all memory access */
   if (bi_is_memory_access(I)) {
      u_foreach_bit(slot, st->memory)
         I->flow |= bi_pop_slot(st, slot);
   }

   /* We need to wait for all general slots before a barrier. The reason is
    * unknown. In theory, this is redundant, since the BARRIER instruction will
    * be followed immediately by .wait which waits for all slots. However, that
    * doesn't seem to work properly in practice.
    *
    * The DDK is observed to use the same workaround, going so far as
    * introducing a NOP before a BARRIER at the beginning of a basic block when
    * there are outstanding stores.
    *
    *     NOP.wait12
    *     BARRIER.slot7.wait
    *
    * Luckily, this situation is pretty rare. The wait introduced here can
    * usually be merged into the preceding instruction.
    *
    * We also use the same workaround to serialize all async instructions when
    * debugging this pass with the BIFROST_MESA_DEBUG=nosb option.
    */
   if (I->op == BI_OPCODE_BARRIER || (bifrost_debug & BIFROST_DBG_NOSB)) {
      for (unsigned i = 0; i < VA_NUM_GENERAL_SLOTS; ++i) {
         if (st->write[i] || ((st->varying | st->memory) & BITFIELD_BIT(i)))
            I->flow |= bi_pop_slot(st, i);
      }
   }
}

static bool
scoreboard_block_update(bi_context *ctx, bi_block *blk)
{
   bool progress = false;

   /* scoreboard_in[b] = union over p in pred(b) of scoreboard_out[p] */
   bi_foreach_predecessor(blk, pred) {
      for (unsigned i = 0; i < BI_NUM_SLOTS; ++i) {
         blk->scoreboard_in.read[i] |= (*pred)->scoreboard_out.read[i];
         blk->scoreboard_in.write[i] |= (*pred)->scoreboard_out.write[i];
         blk->scoreboard_in.varying |= (*pred)->scoreboard_out.varying;
         blk->scoreboard_in.memory |= (*pred)->scoreboard_out.memory;
      }
   }

   struct bi_scoreboard_state state = blk->scoreboard_in;

   /* Assign locally */
   bi_foreach_instr_in_block(blk, I) {
      bi_set_dependencies(blk, I, &state);
      bi_push_instr(&state, I);
   }

   /* Insert a wait for varyings at the end of the block.
    *
    * A varying load with .store has to wait for all other varying loads
    * in the quad to complete. The bad case looks like:
    *
    *    if (dynamic) {
    *       x = ld_var()
    *    } else {
    *       x = ld_var()
    *    }
    *
    * Logically, a given thread executes only a single ld_var instruction. But
    * if the quad diverges, the second ld_var has to wait for the first ld_var.
    * For correct handling, we need to maintain a physical control flow graph
    * and do the dataflow analysis on that instead of the logical control flow
    * graph. However, this probably doesn't matter much in practice. This seems
    * like a decent compromise for now.
    *
    * TODO: Consider optimizing this case.
    */
   if (state.varying) {
      uint8_t flow = 0;

      u_foreach_bit(slot, state.varying)
         flow |= bi_pop_slot(&state, slot);

      bi_flow(ctx, bi_after_block(blk), flow);
   }

   /* To figure out progress, diff scoreboard_out */
   progress = !!memcmp(&state, &blk->scoreboard_out, sizeof(state));

   blk->scoreboard_out = state;

   return progress;
}

static void
va_assign_scoreboard(bi_context *ctx)
{
   u_worklist worklist;
   bi_worklist_init(ctx, &worklist);

   bi_foreach_block(ctx, block) {
      bi_worklist_push_tail(&worklist, block);
   }

   /* Perform forward data flow analysis to calculate dependencies */
   while (!u_worklist_is_empty(&worklist)) {
      /* Pop from the front for forward analysis */
      bi_block *blk = bi_worklist_pop_head(&worklist);

      if (scoreboard_block_update(ctx, blk)) {
         bi_foreach_successor(blk, succ)
            bi_worklist_push_tail(&worklist, succ);
      }
   }

   u_worklist_fini(&worklist);
}
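
/* For example, given a loop back edge B -> A, B's scoreboard_out feeds back
 * into A's scoreboard_in, so A is re-pushed and re-analyzed when B's state
 * changes. Since the transfer function is a monotone union over finite
 * bitmaps, the analysis reaches a fixed point and terminates.
 */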

/*
 * Determine if execution should terminate after a given block. Execution
 * cannot terminate within a basic block.
 */
static bool
va_should_end(bi_block *block)
{
   /* Don't end if we're succeeded by instructions */
   for (unsigned i = 0; i < ARRAY_SIZE(block->successors); ++i) {
      bi_block *succ = block->successors[i];

      if (succ)
         return false;
   }

   return true;
}

/*
 * Helper invocations should be discarded as soon as they die, which is after
 * their last use. Either they die after an instruction using helper
 * invocations, or they die along a control flow edge. The former is handled by
 * discarding appropriately after such instructions. The latter is handled by
 * inserting a discard at the _start_ of some blocks:
 *
 * Lemma: If a non-critical edge discards helpers, it is the only edge that
 * enters its destination.
 *
 * Proof: An edge discards helpers if helpers are live at the end of the source
 * block and dead at the start of the destination block. By definition, helpers
 * are live at the end of a block iff they are live at the start of some
 * successor of the block. The source block therefore has a successor with
 * helpers live at the start and a successor with helpers dead at the start. As
 * the source block has at least two successors, the edge is NOT the only edge
 * exiting its source. Hence it is the only edge entering the destination,
 * otherwise the edge would be critical.
 *
 * By corollary, we may handle discards on control flow edges by discarding at
 * the start of blocks with a single predecessor.
 *
 * This routine tests if a block should discard helper invocations at its
 * start.
 */
static bool
va_discard_before_block(bi_block *block)
{
   /* Do not discard if the block requires helpers at the start */
   if (block->pass_flags)
      return false;

   /* By the lemma, if we need to discard, there is a unique predecessor */
   if (bi_num_predecessors(block) != 1)
      return false;

   bi_block *pred = *util_dynarray_element(&block->predecessors, bi_block *, 0);

   /* Discard if helpers are live at the end of the predecessor, due to helpers
    * live at the start of some (other) successor.
    */
   bi_foreach_successor(pred, succ) {
      if (succ->pass_flags)
         return true;
   }

   return false;
}
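
/* For example, in the diamond CFG
 *
 *        A
 *       / \
 *      B   C
 *       \ /
 *        D
 *
 * suppose helpers are live at the start of B but dead at the start of C. The
 * edge A->C then discards helpers: C has the single predecessor A, C does not
 * require helpers at its start, and A has another successor (B) where helpers
 * are live, so va_discard_before_block(C) returns true.
 */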

/*
 * Test if a program is empty, in the sense of having zero instructions. Empty
 * shaders get special handling.
 */
static bool
bi_is_empty(bi_context *ctx)
{
   bi_foreach_instr_global(ctx, _)
      return false;

   return true;
}

/*
 * Given a program with no flow control modifiers, insert NOPs signaling the
 * required flow control. Not much optimization happens here.
 */
void
va_insert_flow_control_nops(bi_context *ctx)
{
   /* Special case: if a program is empty, leave it empty. In particular, do not
    * insert NOP.end. There is special handling in the driver for skipping empty
    * shaders per shader stage. The .end is not necessary and disrupts
    * optimizations.
    */
   if (bi_is_empty(ctx))
      return;

   /* First do dataflow analysis for the scoreboard. This populates I->flow with
    * a bitmap of slots to wait on.
    *
    * Also do dataflow analysis for helper invocations in fragment shaders. This
    * populates block->pass_flags with helper invocation information.
    */
   va_assign_scoreboard(ctx);
   bi_analyze_helper_terminate(ctx);

   bi_foreach_block(ctx, block) {
      /* Handle discards along control flow edges */
      if (va_discard_before_block(block))
         bi_flow(ctx, bi_before_block(block), VA_FLOW_DISCARD);

      bi_foreach_instr_in_block_safe(block, I) {
         switch (I->op) {
         /* Signal barriers immediately */
         case BI_OPCODE_BARRIER:
            bi_flow(ctx, bi_after_instr(I), VA_FLOW_WAIT);
            break;

         /* Insert waits for tilebuffer and depth/stencil instructions. These
          * only happen in regular fragment shaders, as the required waits are
          * assumed to already have happened in blend shaders.
          *
          * For discarded thread handling, ATEST must be serialized against all
          * other asynchronous instructions and should be serialized against
          * all instructions. Wait for slot 0 immediately after the ATEST.
          */
         case BI_OPCODE_BLEND:
         case BI_OPCODE_LD_TILE:
         case BI_OPCODE_ST_TILE:
            if (!ctx->inputs->is_blend)
               bi_flow(ctx, bi_before_instr(I), VA_FLOW_WAIT);
            break;
         case BI_OPCODE_ATEST:
            bi_flow(ctx, bi_before_instr(I), VA_FLOW_WAIT0126);
            bi_flow(ctx, bi_after_instr(I), VA_FLOW_WAIT0);
            break;
         case BI_OPCODE_ZS_EMIT:
            if (!ctx->inputs->is_blend)
               bi_flow(ctx, bi_before_instr(I), VA_FLOW_WAIT0126);
            break;

         default:
            break;
         }

         if (I->flow && I->op != BI_OPCODE_NOP) {
            /* Wait on the results of asynchronous instructions.
             *
             * The bitmap of general slots lines up with the encoding of
             * va_flow for waits on general slots. The dataflow analysis should
             * be ignoring the special slots #6 and #7, which are handled
             * separately.
             */
            assert((I->flow & ~BITFIELD_MASK(VA_NUM_GENERAL_SLOTS)) == 0);

            bi_flow(ctx, bi_before_instr(I), I->flow);
            I->flow = 0;
         }
      }

      /* Terminate helpers after the last use */
      if (ctx->stage == MESA_SHADER_FRAGMENT && !ctx->inputs->is_blend &&
          block->pass_flags && bi_block_terminates_helpers(block)) {

         bi_foreach_instr_in_block_safe_rev(block, I) {
            if (bi_instr_uses_helpers(I)) {
               bi_flow(ctx, bi_after_instr(I), VA_FLOW_DISCARD);
               break;
            }
         }
      }

      /* End execution at the end of the block if needed, or reconverge if we
       * continue but do not need to end execution.
       */
      if (va_should_end(block) || block->needs_nop) {
         /* Don't bother adding a NOP into an unreachable block */
         if (block == bi_start_block(&ctx->blocks) ||
             bi_num_predecessors(block))
            bi_flow(ctx, bi_after_block(block), VA_FLOW_END);
      } else if (bi_reconverge_branches(block)) {
         /* TODO: Do we ever need to reconverge from an empty block? */
         if (!list_is_empty(&block->instructions))
            bi_flow(ctx, bi_after_block(block), VA_FLOW_RECONVERGE);
      }
   }

   /* If helpers are not used anywhere, they are not used at the start, so we
    * terminate at the start. Else, helpers are used somewhere in the shader and
    * are terminated after last use.
    */
   bi_block *start = bi_start_block(&ctx->blocks);
   bool frag = (ctx->stage == MESA_SHADER_FRAGMENT && !ctx->inputs->is_blend);

   if (frag && !start->pass_flags)
      bi_flow(ctx, bi_before_block(start), VA_FLOW_DISCARD);
}
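
/* For illustration (schematic), the pass above turns an input stream
 * containing
 *
 *    ATEST.slot0 ...
 *
 * into
 *
 *    NOP.wait0126
 *    ATEST.slot0 ...
 *    NOP.wait0
 *
 * and the follow-up cleanup pass is expected to fold these modifiers onto
 * adjacent instructions, eliminating the NOPs.
 */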

/*
 * Assign slots to all asynchronous instructions. A few special instructions
 * require specific slots. For the rest, we assign slots in a round-robin
 * fashion to reduce false dependencies when encoding waits.
 *
 * This should be called before va_insert_flow_control_nops.
 */
void
va_assign_slots(bi_context *ctx)
{
   unsigned counter = 0;

   bi_foreach_instr_global(ctx, I) {
      if (I->op == BI_OPCODE_BARRIER) {
         I->slot = 7;
      } else if (I->op == BI_OPCODE_ZS_EMIT || I->op == BI_OPCODE_ATEST) {
         I->slot = 0;
      } else if (bi_opcode_props[I->op].message) {
         I->slot = counter++;

         if (counter == 3)
            counter = 0;
      }
   }
}
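
/* For example, a shader whose only asynchronous instructions are four general
 * message-passing instructions gets slots 0, 1, 2, 0 in program order, while
 * any BARRIER is pinned to slot #7 and ZS_EMIT/ATEST to slot #0.
 */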
539