xref: /aosp_15_r20/external/mesa3d/src/panfrost/compiler/bi_opt_mod_props.c (revision 6104692788411f58d303aa86923a9ff6ecaded22)
1 /*
2  * Copyright (C) 2021 Collabora, Ltd.
3  * Copyright (C) 2021 Alyssa Rosenzweig <[email protected]>
4  *
5  * Permission is hereby granted, free of charge, to any person obtaining a
6  * copy of this software and associated documentation files (the "Software"),
7  * to deal in the Software without restriction, including without limitation
8  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9  * and/or sell copies of the Software, and to permit persons to whom the
10  * Software is furnished to do so, subject to the following conditions:
11  *
12  * The above copyright notice and this permission notice (including the next
13  * paragraph) shall be included in all copies or substantial portions of the
14  * Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22  * SOFTWARE.
23  */
24 
25 #include "bi_builder.h"
26 #include "compiler.h"
27 
28 /*
29  * Due to a Bifrost encoding restriction, some instructions cannot have an abs
30  * modifier on both sources. Check if adding a fabs modifier to a given source
31  * of a binary instruction would cause this restriction to be hit.
32  */
33 static bool
bi_would_impact_abs(unsigned arch,bi_instr * I,bi_index repl,unsigned s)34 bi_would_impact_abs(unsigned arch, bi_instr *I, bi_index repl, unsigned s)
35 {
36    return (arch <= 8) && I->src[1 - s].abs &&
37           bi_is_word_equiv(I->src[1 - s], repl);
38 }
39 
40 static bool
bi_takes_fabs(unsigned arch,bi_instr * I,bi_index repl,unsigned s)41 bi_takes_fabs(unsigned arch, bi_instr *I, bi_index repl, unsigned s)
42 {
43    switch (I->op) {
44    case BI_OPCODE_FCMP_V2F16:
45    case BI_OPCODE_FMAX_V2F16:
46    case BI_OPCODE_FMIN_V2F16:
47       return !bi_would_impact_abs(arch, I, repl, s);
48    case BI_OPCODE_FADD_V2F16:
49       /*
50        * For FADD.v2f16, the FMA pipe has the abs encoding hazard,
51        * while the FADD pipe cannot encode a clamp. Either case in
52        * isolation can be worked around in the scheduler, but both
53        * together is impossible to encode. Avoid the hazard.
54        */
55       return !(I->clamp && bi_would_impact_abs(arch, I, repl, s));
56    case BI_OPCODE_V2F32_TO_V2F16:
57       /* TODO: Needs both match or lower */
58       return false;
59    case BI_OPCODE_FLOG_TABLE_F32:
60       /* TODO: Need to check mode */
61       return false;
62    default:
63       return bi_opcode_props[I->op].abs & BITFIELD_BIT(s);
64    }
65 }
66 
67 static bool
bi_takes_fneg(unsigned arch,bi_instr * I,unsigned s)68 bi_takes_fneg(unsigned arch, bi_instr *I, unsigned s)
69 {
70    switch (I->op) {
71    case BI_OPCODE_CUBE_SSEL:
72    case BI_OPCODE_CUBE_TSEL:
73    case BI_OPCODE_CUBEFACE:
74       /* TODO: Bifrost encoding restriction: need to match or lower */
75       return arch >= 9;
76    case BI_OPCODE_FREXPE_F32:
77    case BI_OPCODE_FREXPE_V2F16:
78    case BI_OPCODE_FLOG_TABLE_F32:
79       /* TODO: Need to check mode */
80       return false;
81    default:
82       return bi_opcode_props[I->op].neg & BITFIELD_BIT(s);
83    }
84 }
85 
86 static bool
bi_is_fabsneg(enum bi_opcode op,enum bi_size size)87 bi_is_fabsneg(enum bi_opcode op, enum bi_size size)
88 {
89    return (size == BI_SIZE_32 && op == BI_OPCODE_FABSNEG_F32) ||
90           (size == BI_SIZE_16 && op == BI_OPCODE_FABSNEG_V2F16);
91 }
92 
93 static enum bi_swizzle
bi_compose_swizzle_16(enum bi_swizzle a,enum bi_swizzle b)94 bi_compose_swizzle_16(enum bi_swizzle a, enum bi_swizzle b)
95 {
96    assert(a <= BI_SWIZZLE_H11);
97    assert(b <= BI_SWIZZLE_H11);
98 
99    bool al = (a & BI_SWIZZLE_H10);
100    bool ar = (a & BI_SWIZZLE_H01);
101    bool bl = (b & BI_SWIZZLE_H10);
102    bool br = (b & BI_SWIZZLE_H01);
103 
104    return ((al ? br : bl) ? BI_SWIZZLE_H10 : 0) |
105           ((ar ? br : bl) ? BI_SWIZZLE_H01 : 0);
106 }
107 
108 /* Like bi_replace_index, but composes instead of overwrites */
109 
110 static inline bi_index
bi_compose_float_index(bi_index old,bi_index repl)111 bi_compose_float_index(bi_index old, bi_index repl)
112 {
113    /* abs(-x) = abs(+x) so ignore repl.neg if old.abs is set, otherwise
114     * -(-x) = x but -(+x) = +(-x) so need to exclusive-or the negates */
115    repl.neg = old.neg ^ (repl.neg && !old.abs);
116 
117    /* +/- abs(+/- abs(x)) = +/- abs(x), etc so just or the two */
118    repl.abs |= old.abs;
119 
120    /* Use the old swizzle to select from the replacement swizzle */
121    repl.swizzle = bi_compose_swizzle_16(old.swizzle, repl.swizzle);
122 
123    return repl;
124 }
125 
126 /* DISCARD.b32(FCMP.f(x, y)) --> DISCARD.f(x, y) */
127 
128 static inline bool
bi_fuse_discard_fcmp(bi_context * ctx,bi_instr * I,bi_instr * mod)129 bi_fuse_discard_fcmp(bi_context *ctx, bi_instr *I, bi_instr *mod)
130 {
131    if (!mod)
132       return false;
133    if (I->op != BI_OPCODE_DISCARD_B32)
134       return false;
135    if (mod->op != BI_OPCODE_FCMP_F32 && mod->op != BI_OPCODE_FCMP_V2F16)
136       return false;
137    if (mod->cmpf >= BI_CMPF_GTLT)
138       return false;
139 
140    /* result_type doesn't matter */
141 
142    /* .abs and .neg modifiers allowed on Valhall DISCARD but not Bifrost */
143    bool absneg = mod->src[0].neg || mod->src[0].abs;
144    absneg |= mod->src[1].neg || mod->src[1].abs;
145 
146    if (ctx->arch <= 8 && absneg)
147       return false;
148 
149    enum bi_swizzle r = I->src[0].swizzle;
150 
151    bi_builder b = bi_init_builder(ctx, bi_before_instr(I));
152    I = bi_discard_f32(&b, mod->src[0], mod->src[1], mod->cmpf);
153 
154    if (mod->op == BI_OPCODE_FCMP_V2F16) {
155       I->src[0].swizzle = bi_compose_swizzle_16(r, I->src[0].swizzle);
156       I->src[1].swizzle = bi_compose_swizzle_16(r, I->src[1].swizzle);
157    }
158 
159    return true;
160 }
161 
162 /*
163  * S32_TO_F32(S8_TO_S32(x)) -> S8_TO_F32 and friends. Round modes don't matter
164  * because all 8-bit and 16-bit integers may be represented exactly as fp32.
165  */
166 struct {
167    enum bi_opcode inner;
168    enum bi_opcode outer;
169    enum bi_opcode replacement;
170 } bi_small_int_patterns[] = {
171    {BI_OPCODE_S8_TO_S32, BI_OPCODE_S32_TO_F32, BI_OPCODE_S8_TO_F32},
172    {BI_OPCODE_U8_TO_U32, BI_OPCODE_U32_TO_F32, BI_OPCODE_U8_TO_F32},
173    {BI_OPCODE_U8_TO_U32, BI_OPCODE_S32_TO_F32, BI_OPCODE_U8_TO_F32},
174    {BI_OPCODE_S16_TO_S32, BI_OPCODE_S32_TO_F32, BI_OPCODE_S16_TO_F32},
175    {BI_OPCODE_U16_TO_U32, BI_OPCODE_U32_TO_F32, BI_OPCODE_U16_TO_F32},
176    {BI_OPCODE_U16_TO_U32, BI_OPCODE_S32_TO_F32, BI_OPCODE_U16_TO_F32},
177 };
178 
179 static inline void
bi_fuse_small_int_to_f32(bi_instr * I,bi_instr * mod)180 bi_fuse_small_int_to_f32(bi_instr *I, bi_instr *mod)
181 {
182    for (unsigned i = 0; i < ARRAY_SIZE(bi_small_int_patterns); ++i) {
183       if (I->op != bi_small_int_patterns[i].outer)
184          continue;
185       if (mod->op != bi_small_int_patterns[i].inner)
186          continue;
187 
188       assert(I->src[0].swizzle == BI_SWIZZLE_H01);
189       I->src[0] = mod->src[0];
190       I->round = BI_ROUND_NONE;
191       I->op = bi_small_int_patterns[i].replacement;
192    }
193 }
194 
195 void
bi_opt_mod_prop_forward(bi_context * ctx)196 bi_opt_mod_prop_forward(bi_context *ctx)
197 {
198    bi_instr **lut = calloc(sizeof(bi_instr *), ctx->ssa_alloc);
199 
200    bi_foreach_instr_global_safe(ctx, I) {
201       /* Try fusing FCMP into DISCARD.b32, building a new DISCARD.f32
202        * instruction. As this is the only optimization DISCARD is
203        * involved in, this shortcircuits other processing.
204        */
205       if (I->op == BI_OPCODE_DISCARD_B32) {
206          if (bi_is_ssa(I->src[0]) &&
207              bi_fuse_discard_fcmp(ctx, I, lut[I->src[0].value])) {
208             bi_remove_instruction(I);
209          }
210 
211          continue;
212       }
213 
214       bi_foreach_dest(I, d) {
215          lut[I->dest[d].value] = I;
216       }
217 
218       bi_foreach_ssa_src(I, s) {
219          bi_instr *mod = lut[I->src[s].value];
220 
221          if (!mod)
222             continue;
223 
224          unsigned size = bi_opcode_props[I->op].size;
225 
226          bi_fuse_small_int_to_f32(I, mod);
227 
228          if (bi_is_fabsneg(mod->op, size)) {
229             if (mod->src[0].abs && !bi_takes_fabs(ctx->arch, I, mod->src[0], s))
230                continue;
231 
232             if (mod->src[0].neg && !bi_takes_fneg(ctx->arch, I, s))
233                continue;
234 
235             I->src[s] = bi_compose_float_index(I->src[s], mod->src[0]);
236          }
237       }
238    }
239 
240    free(lut);
241 }
242 
243 /* RSCALE has restrictions on how the clamp may be used, only used for
244  * specialized transcendental sequences that set the clamp explicitly anyway */
245 
246 static bool
bi_takes_clamp(bi_instr * I)247 bi_takes_clamp(bi_instr *I)
248 {
249    switch (I->op) {
250    case BI_OPCODE_FMA_RSCALE_F32:
251    case BI_OPCODE_FMA_RSCALE_V2F16:
252    case BI_OPCODE_FADD_RSCALE_F32:
253       return false;
254    case BI_OPCODE_FADD_V2F16:
255       /* Encoding restriction */
256       return !(I->src[0].abs && I->src[1].abs &&
257                bi_is_word_equiv(I->src[0], I->src[1]));
258    default:
259       return bi_opcode_props[I->op].clamp;
260    }
261 }
262 
263 static bool
bi_is_fclamp(enum bi_opcode op,enum bi_size size)264 bi_is_fclamp(enum bi_opcode op, enum bi_size size)
265 {
266    return (size == BI_SIZE_32 && op == BI_OPCODE_FCLAMP_F32) ||
267           (size == BI_SIZE_16 && op == BI_OPCODE_FCLAMP_V2F16);
268 }
269 
270 static bool
bi_optimizer_clamp(bi_instr * I,bi_instr * use)271 bi_optimizer_clamp(bi_instr *I, bi_instr *use)
272 {
273    if (!bi_is_fclamp(use->op, bi_opcode_props[I->op].size))
274       return false;
275    if (!bi_takes_clamp(I))
276       return false;
277 
278    /* Clamps are bitfields (clamp_m1_1/clamp_0_inf) so composition is OR */
279    I->clamp |= use->clamp;
280    I->dest[0] = use->dest[0];
281    return true;
282 }
283 
284 static enum bi_opcode
bi_sized_mux_op(unsigned size)285 bi_sized_mux_op(unsigned size)
286 {
287    switch (size) {
288    case 8:
289       return BI_OPCODE_MUX_V4I8;
290    case 16:
291       return BI_OPCODE_MUX_V2I16;
292    case 32:
293       return BI_OPCODE_MUX_I32;
294    default:
295       unreachable("invalid size");
296    }
297 }
298 
299 static bool
bi_is_fixed_mux(bi_instr * I,unsigned size,bi_index v1)300 bi_is_fixed_mux(bi_instr *I, unsigned size, bi_index v1)
301 {
302    return I->op == bi_sized_mux_op(size) &&
303           bi_is_value_equiv(I->src[0], bi_zero()) &&
304           bi_is_value_equiv(I->src[1], v1);
305 }
306 
307 static bool
bi_takes_int_result_type(enum bi_opcode op)308 bi_takes_int_result_type(enum bi_opcode op)
309 {
310    switch (op) {
311    case BI_OPCODE_ICMP_I32:
312    case BI_OPCODE_ICMP_S32:
313    case BI_OPCODE_ICMP_U32:
314    case BI_OPCODE_ICMP_V2I16:
315    case BI_OPCODE_ICMP_V2S16:
316    case BI_OPCODE_ICMP_V2U16:
317    case BI_OPCODE_ICMP_V4I8:
318    case BI_OPCODE_ICMP_V4S8:
319    case BI_OPCODE_ICMP_V4U8:
320    case BI_OPCODE_FCMP_F32:
321    case BI_OPCODE_FCMP_V2F16:
322       return true;
323    default:
324       return false;
325    }
326 }
327 
328 static bool
bi_takes_float_result_type(enum bi_opcode op)329 bi_takes_float_result_type(enum bi_opcode op)
330 {
331    return (op == BI_OPCODE_FCMP_F32) || (op == BI_OPCODE_FCMP_V2F16);
332 }
333 
334 /* CMP+MUX -> CMP with result type */
335 static bool
bi_optimizer_result_type(bi_instr * I,bi_instr * mux)336 bi_optimizer_result_type(bi_instr *I, bi_instr *mux)
337 {
338    if (bi_opcode_props[I->op].size != bi_opcode_props[mux->op].size)
339       return false;
340 
341    if (bi_is_fixed_mux(mux, 32, bi_imm_f32(1.0)) ||
342        bi_is_fixed_mux(mux, 16, bi_imm_f16(1.0))) {
343 
344       if (!bi_takes_float_result_type(I->op))
345          return false;
346 
347       I->result_type = BI_RESULT_TYPE_F1;
348    } else if (bi_is_fixed_mux(mux, 32, bi_imm_u32(1)) ||
349               bi_is_fixed_mux(mux, 16, bi_imm_u16(1)) ||
350               bi_is_fixed_mux(mux, 8, bi_imm_u8(1))) {
351 
352       if (!bi_takes_int_result_type(I->op))
353          return false;
354 
355       I->result_type = BI_RESULT_TYPE_I1;
356    } else {
357       return false;
358    }
359 
360    I->dest[0] = mux->dest[0];
361    return true;
362 }
363 
364 static bool
bi_is_var_tex(bi_instr * var,bi_instr * tex)365 bi_is_var_tex(bi_instr *var, bi_instr *tex)
366 {
367    return (var->op == BI_OPCODE_LD_VAR_IMM) &&
368           (tex->op == BI_OPCODE_TEXS_2D_F16 ||
369            tex->op == BI_OPCODE_TEXS_2D_F32) &&
370           (var->register_format == BI_REGISTER_FORMAT_F32) &&
371           ((var->sample == BI_SAMPLE_CENTER &&
372             var->update == BI_UPDATE_STORE) ||
373            (var->sample == BI_SAMPLE_NONE &&
374             var->update == BI_UPDATE_RETRIEVE)) &&
375           (tex->texture_index == tex->sampler_index) &&
376           (tex->texture_index < 4) && (var->index < 8);
377 }
378 
379 static bool
bi_optimizer_var_tex(bi_context * ctx,bi_instr * var,bi_instr * tex)380 bi_optimizer_var_tex(bi_context *ctx, bi_instr *var, bi_instr *tex)
381 {
382    if (!bi_is_var_tex(var, tex))
383       return false;
384 
385    /* Construct the corresponding VAR_TEX intruction */
386    bi_builder b = bi_init_builder(ctx, bi_after_instr(var));
387 
388    bi_instr *I = bi_var_tex_f32_to(&b, tex->dest[0], tex->lod_mode, var->sample,
389                                    var->update, tex->texture_index, var->index);
390    I->skip = tex->skip;
391 
392    if (tex->op == BI_OPCODE_TEXS_2D_F16)
393       I->op = BI_OPCODE_VAR_TEX_F16;
394 
395    /* Dead code elimination will clean up for us */
396    return true;
397 }
398 
399 void
bi_opt_mod_prop_backward(bi_context * ctx)400 bi_opt_mod_prop_backward(bi_context *ctx)
401 {
402    unsigned count = ctx->ssa_alloc;
403    bi_instr **uses = calloc(count, sizeof(*uses));
404    BITSET_WORD *multiple = calloc(BITSET_WORDS(count), sizeof(*multiple));
405 
406    bi_foreach_instr_global_rev(ctx, I) {
407       bi_foreach_ssa_src(I, s) {
408          unsigned v = I->src[s].value;
409 
410          if (uses[v] && uses[v] != I)
411             BITSET_SET(multiple, v);
412          else
413             uses[v] = I;
414       }
415 
416       if (!I->nr_dests)
417          continue;
418 
419       bi_instr *use = uses[I->dest[0].value];
420 
421       if (!use || BITSET_TEST(multiple, I->dest[0].value))
422          continue;
423 
424       /* Destination has a single use, try to propagate */
425       bool propagated =
426          bi_optimizer_clamp(I, use) || bi_optimizer_result_type(I, use);
427 
428       if (!propagated && I->op == BI_OPCODE_LD_VAR_IMM &&
429           use->op == BI_OPCODE_SPLIT_I32) {
430          /* Need to see through the split in a
431           * ld_var_imm/split/var_tex  sequence
432           */
433          bi_instr *tex = uses[use->dest[0].value];
434 
435          if (!tex || BITSET_TEST(multiple, use->dest[0].value))
436             continue;
437 
438          use = tex;
439          propagated = bi_optimizer_var_tex(ctx, I, use);
440       }
441 
442       if (propagated) {
443          bi_remove_instruction(use);
444          continue;
445       }
446    }
447 
448    free(uses);
449    free(multiple);
450 }
451 
/*
 * Lower pseudo instructions that exist to simplify the optimizer. Returns
 * true if the instruction was lowered (a replacement was emitted and the
 * original should be removed), false if no lowering is needed.
 */
456 static bool
bi_lower_opt_instruction_helper(bi_builder * b,bi_instr * I)457 bi_lower_opt_instruction_helper(bi_builder *b, bi_instr *I)
458 {
459    bi_instr *repl;
460 
461    switch (I->op) {
462    case BI_OPCODE_FABSNEG_F32:
463    case BI_OPCODE_FCLAMP_F32:
464       repl = bi_fadd_f32_to(b, I->dest[0], I->src[0], bi_negzero());
465       repl->clamp = I->clamp;
466       return true;
467 
468    case BI_OPCODE_FABSNEG_V2F16:
469    case BI_OPCODE_FCLAMP_V2F16:
470       repl = bi_fadd_v2f16_to(b, I->dest[0], I->src[0], bi_negzero());
471       repl->clamp = I->clamp;
472       return true;
473 
474    case BI_OPCODE_DISCARD_B32:
475       bi_discard_f32(b, I->src[0], bi_zero(), BI_CMPF_NE);
476       return true;
477 
478    default:
479       return false;
480    }
481 }
482 
483 void
bi_lower_opt_instructions(bi_context * ctx)484 bi_lower_opt_instructions(bi_context *ctx)
485 {
486    bi_foreach_instr_global_safe(ctx, I) {
487       bi_builder b = bi_init_builder(ctx, bi_before_instr(I));
488 
489       if (bi_lower_opt_instruction_helper(&b, I))
490          bi_remove_instruction(I);
491    }
492 }
493