/*
 * Copyright 2021 Alyssa Rosenzweig
 * SPDX-License-Identifier: MIT
 */

#include "agx_builder.h"
#include "agx_compiler.h"
#include "agx_minifloat.h"
#include "agx_opcodes.h"

/* AGX peephole optimizer responsible for instruction combining. It operates in
 * a forward direction and a backward direction, in each case traversing in
 * source order. SSA means the forward pass satisfies the invariant:
 *
 *    Every def is visited before any of its uses.
 *
 * Dually, the backward pass satisfies the invariant:
 *
 *    Every use of a def is visited before the def.
 *
 * This means the forward pass can propagate modifiers forward, whereas the
 * backward pass propagates modifiers backward. Consider an example:
 *
 *    1 = fabs 0
 *    2 = fround 1
 *    3 = fsat 2
 *
 * The forward pass would propagate the fabs to the fround (since we can
 * look up the fabs from the fround source and do the replacement). By contrast
 * the backward pass would propagate the fsat back to the fround (since when
 * we see the fround we know it has only a single user, fsat). Propagatable
 * instructions have natural directions (like pushforwards and pullbacks).
 *
 * We are careful to update the tracked state whenever we modify an instruction
 * to ensure the passes are linear-time and converge in a single iteration.
 *
 * Size conversions are worth special discussion. Consider the snippet:
 *
 *    2 = fadd 0, 1
 *    3 = f2f16 2
 *    4 = fround 3
 *
 * A priori, we can move the f2f16 in either direction. But the two choices are
 * not equal -- if we move it up to the fadd, we get FP16 for two instructions,
 * whereas if we push it into the fround, we effectively get FP32 for two
 * instructions. So f2f16 is backwards. Likewise, consider
 *
 *    2 = fadd 0, 1
 *    3 = f2f32 2
 *    4 = fround 3
 *
 * This time if we move the f2f32 up to the fadd, we get FP32 for two, but if
 * we move it down to the fround, we get FP16 for two. So f2f32 is forwards.
 */
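
/* To make the first example above concrete (an illustrative sketch rather
 * than actual compiler output): the forward pass folds the fabs into the
 * fround as an absolute-value source modifier, and the backward pass then
 * folds the fsat into that fround as a saturate flag, leaving roughly
 *
 *    3 = fround.sat |0|
 *
 * with the fabs left dead and the fsat removed.
 */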

static bool
agx_is_fmov(agx_instr *def)
{
   return (def->op == AGX_OPCODE_FADD) &&
          agx_is_equiv(def->src[1], agx_negzero());
}

/* Compose floating-point modifiers with floating-point sources */

static agx_index
agx_compose_float_src(agx_index to, agx_index from)
{
   if (to.abs) {
      from.neg = false;
      from.abs = true;
   }

   from.neg ^= to.neg;

   return from;
}
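
/* A quick sketch of what the composition above does, in terms of the usual
 * modifier algebra (illustrative, not from the original source):
 *
 *    outer |x|, inner -y  ->  |y|    (abs on the use discards the inner neg)
 *    outer -x,  inner -y  ->   y     (negations cancel through the XOR)
 *    outer -x,  inner |y| ->  -|y|
 */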

static void
agx_optimizer_fmov(agx_instr **defs, agx_instr *ins)
{
   agx_foreach_ssa_src(ins, s) {
      agx_index src = ins->src[s];
      agx_instr *def = defs[src.value];

      if (def == NULL)
         continue; /* happens for phis in loops */
      if (!agx_is_fmov(def))
         continue;
      if (def->saturate)
         continue;
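
      /* For fcmpsel, sources 2 and 3 are the values being selected rather
       * than the values being compared, so (presumably) float source
       * modifiers would not be applied there and must not be folded in.
       */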
      if (ins->op == AGX_OPCODE_FCMPSEL && s >= 2)
         continue;

      /* We can fold f2f32 into 32-bit instructions, but we can't fold f2f16
       * into 16-bit instructions, since the latter would implicitly promote to
       * a 32-bit instruction which is not exact.
       */
      assert(def->src[0].size == AGX_SIZE_32 ||
             def->src[0].size == AGX_SIZE_16);
      assert(src.size == AGX_SIZE_32 || src.size == AGX_SIZE_16);

      if (src.size == AGX_SIZE_16 && def->src[0].size == AGX_SIZE_32)
         continue;

      ins->src[s] = agx_compose_float_src(src, def->src[0]);
   }
}

static bool
image_write_source_can_be_immediate(agx_instr *I, unsigned s)
{
   assert(I->op == AGX_OPCODE_IMAGE_WRITE);

   /* LOD can always be immediate. Actually, it's just zero so far; we don't
    * support nonzero LOD for images yet.
    */
   if (s == 2)
      return true;

   /* If the "bindless" source (source 3) is an immediate, it means we don't
    * have a bindless image; instead we have a texture state index. We're
    * allowed to have immediate texture state registers (source 4). However,
    * we're not allowed to have immediate bindless offsets (also source 4).
    */
   bool is_texture_state = (I->src[3].type == AGX_INDEX_IMMEDIATE);
   if (s == 4 && is_texture_state)
      return true;

   /* Otherwise, must be from a register */
   return false;
}

static void
agx_optimizer_inline_imm(agx_instr **defs, agx_instr *I)
{
   agx_foreach_ssa_src(I, s) {
      agx_index src = I->src[s];
      if (src.neg)
         continue;

      agx_instr *def = defs[src.value];
      if (!def || def->op != AGX_OPCODE_MOV_IMM)
         continue;

      uint8_t value = def->imm;
      uint16_t value_u16 = def->imm;

      bool float_src = agx_is_float_src(I, s);

      if (I->op == AGX_OPCODE_ST_TILE && s == 0)
         continue;
      if (I->op == AGX_OPCODE_ZS_EMIT && s != 0)
         continue;
      if (I->op == AGX_OPCODE_TEXTURE_SAMPLE && s != 4)
         continue;
      if ((I->op == AGX_OPCODE_DEVICE_STORE ||
           I->op == AGX_OPCODE_LOCAL_STORE || I->op == AGX_OPCODE_ATOMIC ||
           I->op == AGX_OPCODE_LOCAL_ATOMIC) &&
          s != 2)
         continue;
      if (I->op == AGX_OPCODE_ST_VARY && s != 0)
         continue;
      if ((I->op == AGX_OPCODE_LOCAL_LOAD || I->op == AGX_OPCODE_DEVICE_LOAD ||
           I->op == AGX_OPCODE_STACK_STORE) &&
          s != 1)
         continue;
      if (I->op == AGX_OPCODE_SPLIT)
         continue;

      if (I->op == AGX_OPCODE_IMAGE_WRITE &&
          !image_write_source_can_be_immediate(I, s))
         continue;

      if (float_src) {
         bool fp16 = (def->dest[0].size == AGX_SIZE_16);
         assert(fp16 || (def->dest[0].size == AGX_SIZE_32));

         float f = fp16 ? _mesa_half_to_float(def->imm) : uif(def->imm);
         if (!agx_minifloat_exact(f))
            continue;

         I->src[s] = agx_immediate_f(f);
      } else if (value == def->imm) {
         I->src[s] = agx_immediate(value);
      } else if (value_u16 == def->imm && agx_allows_16bit_immediate(I)) {
         I->src[s] = agx_abs(agx_immediate(value_u16));
      }
   }
}
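
/* Rough sketch of the intended effect (pseudo-IR for illustration only; the
 * 1.0 example assumes, plausibly but unverified here, that 1.0 is exactly
 * representable in the minifloat immediate encoding):
 *
 *    1 = mov_imm #0x3f800000
 *    2 = fadd 0, 1             ->    2 = fadd 0, #1.0
 *
 * The mov_imm itself becomes dead once all of its uses have been inlined.
 */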

/*
 * Fuse not into and/or/xor. Specifically, acts on not and fuses:
 *
 *    not(and(x, y)) -> nand(x, y)
 *    not(or(x, y))  -> nor(x, y)
 *    not(xor(x, y)) -> xnor(x, y)
 */
static bool
agx_optimizer_not(agx_instr *I, agx_instr *use)
{
   /* Check for bit op and use of not op */
   if (I->op != AGX_OPCODE_BITOP || use->op != AGX_OPCODE_NOT)
      return false;

   /* Remap operation to the appropriate one */
   I->truth_table ^= 0xF;
   I->dest[0] = use->dest[0];

   return true;
}
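
/* The remap works because the bitop is driven by a 4-entry truth table and
 * complementing every entry (^ 0xF) complements the function. For example,
 * if AND were encoded as table 0b1000 and OR as 0b1110 (an assumed encoding;
 * see the opcode definitions for the real one), the complemented tables
 * 0b0111 and 0b0001 are exactly NAND and NOR.
 */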

static bool
agx_optimizer_fmov_rev(agx_instr *I, agx_instr *use)
{
   if (!agx_is_fmov(use))
      return false;
   if (use->src[0].neg || use->src[0].abs)
      return false;

   /* We can fold f2f16 into 32-bit instructions, but we can't fold f2f32 into
    * 16-bit instructions, since the latter would implicitly promote to a 32-bit
    * instruction which is not exact.
    */
   assert(use->dest[0].size == AGX_SIZE_32 || use->dest[0].size == AGX_SIZE_16);
   assert(I->dest[0].size == AGX_SIZE_32 || I->dest[0].size == AGX_SIZE_16);

   if (I->dest[0].size == AGX_SIZE_16 && use->dest[0].size == AGX_SIZE_32)
      return false;

   /* saturate(saturate(x)) = saturate(x) */
   I->saturate |= use->saturate;
   I->dest[0] = use->dest[0];
   return true;
}
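
/* For instance (an illustrative sketch of the rewrite above): when the only
 * use of an fadd is an fsat-style fmov, the saturate flag and the destination
 * migrate onto the fadd and the fmov can then be removed by the caller:
 *
 *    2 = fadd 0, 1
 *    3 = fsat 2          ->    3 = fadd.sat 0, 1
 */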

static void
agx_optimizer_copyprop(agx_context *ctx, agx_instr **defs, agx_instr *I)
{
   agx_foreach_ssa_src(I, s) {
      agx_index src = I->src[s];
      agx_instr *def = defs[src.value];

      if (def == NULL)
         continue; /* happens for phis in loops */
      if (def->op != AGX_OPCODE_MOV)
         continue;

      /* At the moment, not all instructions support size conversions. Notably
       * RA pseudo instructions don't handle size conversions. This should be
       * refined in the future.
       */
      if (def->src[0].size != src.size)
         continue;

      /* Optimize split(64-bit uniform) so we can get better copyprop of the
       * 32-bit uniform parts. This helps reduce moves with 64-bit uniforms.
       */
      if (I->op == AGX_OPCODE_SPLIT && def->src[0].type == AGX_INDEX_UNIFORM &&
          src.size == AGX_SIZE_64 && I->dest[0].size == AGX_SIZE_32) {

         assert(I->nr_dests == 2 && "decomposing a 64-bit scalar");
         agx_builder b = agx_init_builder(ctx, agx_before_instr(I));

         agx_index lo = def->src[0];
         lo.size = AGX_SIZE_32;

         agx_index hi = lo;
         hi.value += 2 /* half of 64-bits = 32-bits = 2 x 16-bits */;

         defs[I->dest[0].value] = agx_mov_to(&b, I->dest[0], lo);
         defs[I->dest[1].value] = agx_mov_to(&b, I->dest[1], hi);

         agx_remove_instruction(I);
         continue;
      }

      /* Immediate inlining happens elsewhere */
      if (def->src[0].type == AGX_INDEX_IMMEDIATE)
         continue;

      /* ALU instructions cannot take 64-bit */
      if (def->src[0].size == AGX_SIZE_64 &&
          !(I->op == AGX_OPCODE_DEVICE_LOAD && s == 0) &&
          !(I->op == AGX_OPCODE_DEVICE_STORE && s == 1) &&
          !(I->op == AGX_OPCODE_ATOMIC && s == 1))
         continue;

      agx_replace_src(I, s, def->src[0]);
   }
}
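
/* Sketch of the split(64-bit uniform) case above, with illustrative register
 * numbers. Uniform registers are indexed in 16-bit units (hence the "+= 2"
 * for the upper half), so a split of a moved 64-bit uniform starting at slot
 * 4 turns into two 32-bit moves that later copy-propagation can see through:
 *
 *    1 = mov u4 (64-bit)              x = mov u4 (32-bit)
 *    x, y = split 1            ->     y = mov u6 (32-bit)
 */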

/*
 * Fuse conditions into if. Specifically, acts on if_icmp and fuses:
 *
 *    if_icmp(cmp(x, y, *), 0, ne/eq) -> if_cmp(x, y, *)
 */
static void
agx_optimizer_if_cmp(agx_instr **defs, agx_instr *I)
{
   /* Check for unfused if */
   if (!agx_is_equiv(I->src[1], agx_zero()) || I->icond != AGX_ICOND_UEQ ||
       I->src[0].type != AGX_INDEX_NORMAL)
      return;

   /* Check for condition */
   agx_instr *def = defs[I->src[0].value];
   if (def->op != AGX_OPCODE_ICMP && def->op != AGX_OPCODE_FCMP)
      return;

   /* Fuse */
   I->src[0] = def->src[0];
   I->src[1] = def->src[1];
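   /* The unfused if tests (cond == 0), the opposite sense of the comparison,
    * so fusing flips the sense unless the if was already inverted; the
    * comparison's own invert_cond then composes by XOR.
    */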
   I->invert_cond = def->invert_cond ^ !I->invert_cond;

   if (def->op == AGX_OPCODE_ICMP) {
      I->op = AGX_OPCODE_IF_ICMP;
      I->icond = def->icond;
   } else {
      I->op = AGX_OPCODE_IF_FCMP;
      I->fcond = def->fcond;
   }
}

/*
 * Fuse invert into if. Acts on if_icmp and fuses:
 *
 *    if_icmp(xor(x, 1), 0, ne) -> if_cmp(x, 0, eq)
 */
static void
agx_optimizer_if_not(agx_instr **defs, agx_instr *I)
{
   /* Check for unfused if */
   if (!agx_is_equiv(I->src[1], agx_zero()) || I->icond != AGX_ICOND_UEQ ||
       I->src[0].type != AGX_INDEX_NORMAL)
      return;

   /* Check for invert */
   agx_instr *def = defs[I->src[0].value];
   if (def->op != AGX_OPCODE_BITOP ||
       !agx_is_equiv(def->src[1], agx_immediate(1)) ||
       def->truth_table != AGX_BITOP_XOR)
      return;

   /* Fuse */
   I->src[0] = def->src[0];
   I->invert_cond = !I->invert_cond;
}

/*
 * Fuse conditions into select. Specifically, acts on icmpsel and fuses:
 *
 *    icmpsel(cmp(x, y, *), 0, z, w, eq) -> cmpsel(x, y, w, z, *)
 *
 * Care must be taken to invert the condition by swapping cmpsel arguments.
 */
static void
agx_optimizer_cmpsel(agx_instr **defs, agx_instr *I)
{
   /* Check for unfused select */
   if (!agx_is_equiv(I->src[1], agx_zero()) || I->icond != AGX_ICOND_UEQ ||
       I->src[0].type != AGX_INDEX_NORMAL)
      return;

   /* Check for condition */
   agx_instr *def = defs[I->src[0].value];
   if (def->op != AGX_OPCODE_ICMP && def->op != AGX_OPCODE_FCMP)
      return;

   /* Fuse */
   I->src[0] = def->src[0];
   I->src[1] = def->src[1];

   /* In the unfused select, the condition is inverted due to the form:
    *
    *    (cond == 0) ? x : y
    *
    * So we need to swap the arguments when fusing to become cond ? y : x. If
    * the condition was supposed to be inverted, we don't swap since it's
    * already inverted. cmpsel does not have an invert_cond bit to use.
    */
   if (!def->invert_cond) {
      agx_index temp = I->src[2];
      I->src[2] = I->src[3];
      I->src[3] = temp;
   }

   if (def->op == AGX_OPCODE_ICMP) {
      I->op = AGX_OPCODE_ICMPSEL;
      I->icond = def->icond;
   } else {
      I->op = AGX_OPCODE_FCMPSEL;
      I->fcond = def->fcond;
   }
}
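
/* A concrete instance of the rewrite above (pseudo-IR; ult is chosen purely
 * for illustration):
 *
 *    1 = icmp x, y, ult
 *    2 = icmpsel 1, 0, a, b, ueq   ->   2 = icmpsel x, y, b, a, ult
 *
 * The select operands swap so the fused form keeps the original meaning,
 * since the unfused select picked a precisely when the comparison was false.
 */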

/*
 * Fuse conditions into ballots:
 *
 *    ballot(cmp(x, y)) -> ballot_cmp(x, y)
 */
static void
agx_optimizer_ballot(agx_context *ctx, agx_instr **defs, agx_instr *I)
{
   if (I->src[0].type != AGX_INDEX_NORMAL)
      return;

   agx_instr *def = defs[I->src[0].value];
   if (!def || (def->op != AGX_OPCODE_ICMP && def->op != AGX_OPCODE_FCMP))
      return;

   bool quad = I->op == AGX_OPCODE_QUAD_BALLOT;
   assert(quad || I->op == AGX_OPCODE_BALLOT);

   /* Replace with a fused instruction since the # of sources changes */
   agx_builder b = agx_init_builder(ctx, agx_before_instr(I));

   agx_instr *fused = agx_icmp_ballot_to(
      &b, I->dest[0], def->src[0], def->src[1], def->icond, def->invert_cond);

   if (def->op == AGX_OPCODE_ICMP) {
      fused->op = quad ? AGX_OPCODE_ICMP_QUAD_BALLOT : AGX_OPCODE_ICMP_BALLOT;
   } else {
      fused->op = quad ? AGX_OPCODE_FCMP_QUAD_BALLOT : AGX_OPCODE_FCMP_BALLOT;
   }

   agx_remove_instruction(I);
}

/*
 * Fuse not srcs into bitop.
 */
static void
agx_optimizer_bitop(agx_instr **defs, agx_instr *I)
{
   agx_foreach_ssa_src(I, s) {
      agx_index src = I->src[s];
      agx_instr *def = defs[src.value];

      /* Check for not src */
      if (def->op != AGX_OPCODE_NOT)
         continue;

      /* Select the new operation by permuting the truth table: inverting
       * source s swaps the table entries whose indices differ in bit s.
       */
      if (s == 0) {
         I->truth_table =
            ((I->truth_table & 0x5) << 1) | ((I->truth_table & 0xa) >> 1);
      } else if (s == 1) {
         I->truth_table = ((I->truth_table & 0x3) << 2) | (I->truth_table >> 2);
      }

      /* Fuse */
      I->src[s] = def->src[0];
   }
}
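
/* Worked example of the fusion above, assuming (for illustration only) that
 * AND is encoded as truth table 0b1000 with source 0 as the low index bit:
 * for and(not(x), y), the not is dropped and the table permutes to 0b0100,
 * i.e. "y and not x", so the combined result is unchanged.
 */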

static void
agx_optimizer_forward(agx_context *ctx)
{
   agx_instr **defs = calloc(ctx->alloc, sizeof(*defs));

   agx_foreach_instr_global_safe(ctx, I) {
      struct agx_opcode_info info = agx_opcodes_info[I->op];

      agx_foreach_ssa_dest(I, d) {
         defs[I->dest[d].value] = I;
      }

      /* Optimize moves */
      agx_optimizer_copyprop(ctx, defs, I);

      /* Propagate fmov down */
      if (info.is_float || I->op == AGX_OPCODE_FCMPSEL ||
          I->op == AGX_OPCODE_FCMP)
         agx_optimizer_fmov(defs, I);

      /* Inline immediates if we can. TODO: systematic */
      if (I->op != AGX_OPCODE_COLLECT && I->op != AGX_OPCODE_IMAGE_LOAD &&
          I->op != AGX_OPCODE_TEXTURE_LOAD &&
          I->op != AGX_OPCODE_UNIFORM_STORE &&
          I->op != AGX_OPCODE_BLOCK_IMAGE_STORE && I->op != AGX_OPCODE_EXPORT)
         agx_optimizer_inline_imm(defs, I);

      if (I->op == AGX_OPCODE_IF_ICMP) {
         agx_optimizer_if_not(defs, I);
         agx_optimizer_if_cmp(defs, I);
      } else if (I->op == AGX_OPCODE_ICMPSEL) {
         agx_optimizer_cmpsel(defs, I);
      } else if (I->op == AGX_OPCODE_BALLOT ||
                 I->op == AGX_OPCODE_QUAD_BALLOT) {
         agx_optimizer_ballot(ctx, defs, I);
      } else if (I->op == AGX_OPCODE_BITOP) {
         agx_optimizer_bitop(defs, I);
      }
   }

   free(defs);
}

static void
agx_optimizer_backward(agx_context *ctx)
{
   agx_instr **uses = calloc(ctx->alloc, sizeof(*uses));
   BITSET_WORD *multiple = calloc(BITSET_WORDS(ctx->alloc), sizeof(*multiple));

   agx_foreach_instr_global_rev(ctx, I) {
      struct agx_opcode_info info = agx_opcodes_info[I->op];

      agx_foreach_ssa_src(I, s) {
         if (I->src[s].type == AGX_INDEX_NORMAL) {
            unsigned v = I->src[s].value;

            if (uses[v])
               BITSET_SET(multiple, v);
            else
               uses[v] = I;
         }
      }

      if (info.nr_dests != 1)
         continue;

      if (I->dest[0].type != AGX_INDEX_NORMAL)
         continue;

      agx_instr *use = uses[I->dest[0].value];

      if (!use || BITSET_TEST(multiple, I->dest[0].value))
         continue;

      if (agx_optimizer_not(I, use)) {
         agx_remove_instruction(use);
         continue;
      }

      /* Destination has a single use, try to propagate */
      if (info.is_float && agx_optimizer_fmov_rev(I, use)) {
         agx_remove_instruction(use);
         continue;
      }
   }

   free(uses);
   free(multiple);
}

void
agx_optimizer(agx_context *ctx)
{
   agx_optimizer_backward(ctx);
   agx_optimizer_forward(ctx);
}