xref: /aosp_15_r20/external/mesa3d/src/panfrost/compiler/bi_lower_swizzle.c (revision 6104692788411f58d303aa86923a9ff6ecaded22)
1 /*
2  * Copyright (C) 2020 Collabora Ltd.
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21  * SOFTWARE.
22  */
23 
24 #include "bi_builder.h"
25 #include "compiler.h"
26 
27 /* Not all 8-bit and 16-bit instructions support all swizzles on all sources.
28  * These passes, intended to run after NIR->BIR but before scheduling/RA, lower
29  * away swizzles that cannot be represented. In the future, we should try to
30  * recombine swizzles where we can as an optimization.
31  */
32 
33 static bool
bi_swizzle_replicates_8(enum bi_swizzle swz)34 bi_swizzle_replicates_8(enum bi_swizzle swz)
35 {
36    switch (swz) {
37    case BI_SWIZZLE_B0000:
38    case BI_SWIZZLE_B1111:
39    case BI_SWIZZLE_B2222:
40    case BI_SWIZZLE_B3333:
41       return true;
42    default:
43       return false;
44    }
45 }
46 
47 static void
lower_swizzle(bi_context * ctx,bi_instr * ins,unsigned src)48 lower_swizzle(bi_context *ctx, bi_instr *ins, unsigned src)
49 {
50    /* TODO: Use the opcode table and be a lot more methodical about this... */
51    switch (ins->op) {
52    /* Some instructions used with 16-bit data never have swizzles */
53    case BI_OPCODE_CSEL_V2F16:
54    case BI_OPCODE_CSEL_V2I16:
55    case BI_OPCODE_CSEL_V2S16:
56    case BI_OPCODE_CSEL_V2U16:
57 
58    /* Despite ostensibly being 32-bit instructions, CLPER does not
59     * inherently interpret the data, so it can be used for v2f16
60     * derivatives, which might require swizzle lowering */
61    case BI_OPCODE_CLPER_I32:
62    case BI_OPCODE_CLPER_OLD_I32:
63 
64    /* Similarly, CSEL.i32 consumes a boolean as a 32-bit argument. If the
65     * boolean is implemented as a 16-bit integer, the swizzle is needed
66     * for correct operation if the instruction producing the 16-bit
67     * boolean does not replicate to both halves of the containing 32-bit
68     * register. As such, we may need to lower a swizzle.
69     *
70     * This is a silly hack. Ideally, code gen would be smart enough to
71     * avoid this case (by replicating). In practice, silly hardware design
72     * decisions force our hand here.
73     */
74    case BI_OPCODE_MUX_I32:
75    case BI_OPCODE_CSEL_I32:
76       break;
77 
78    case BI_OPCODE_IADD_V2S16:
79    case BI_OPCODE_IADD_V2U16:
80    case BI_OPCODE_ISUB_V2S16:
81    case BI_OPCODE_ISUB_V2U16:
82       if (src == 0 && ins->src[src].swizzle != BI_SWIZZLE_H10)
83          break;
84       else
85          return;
86    case BI_OPCODE_LSHIFT_AND_V2I16:
87    case BI_OPCODE_LSHIFT_OR_V2I16:
88    case BI_OPCODE_LSHIFT_XOR_V2I16:
89    case BI_OPCODE_RSHIFT_AND_V2I16:
90    case BI_OPCODE_RSHIFT_OR_V2I16:
91    case BI_OPCODE_RSHIFT_XOR_V2I16:
92       if (src == 2)
93          return;
94       else
95          break;
96 
97    /* For some reason MUX.v2i16 allows swaps but not replication */
98    case BI_OPCODE_MUX_V2I16:
99       if (ins->src[src].swizzle == BI_SWIZZLE_H10)
100          return;
101       else
102          break;
103 
104    /* No swizzles supported */
105    case BI_OPCODE_HADD_V4U8:
106    case BI_OPCODE_HADD_V4S8:
107    case BI_OPCODE_CLZ_V4U8:
108    case BI_OPCODE_IDP_V4I8:
109    case BI_OPCODE_IABS_V4S8:
110    case BI_OPCODE_ICMP_V4I8:
111    case BI_OPCODE_ICMP_V4U8:
112    case BI_OPCODE_MUX_V4I8:
113    case BI_OPCODE_IADD_IMM_V4I8:
114       break;
115 
116    case BI_OPCODE_LSHIFT_AND_V4I8:
117    case BI_OPCODE_LSHIFT_OR_V4I8:
118    case BI_OPCODE_LSHIFT_XOR_V4I8:
119    case BI_OPCODE_RSHIFT_AND_V4I8:
120    case BI_OPCODE_RSHIFT_OR_V4I8:
121    case BI_OPCODE_RSHIFT_XOR_V4I8:
122       /* Last source allows identity or replication */
123       if (src == 2 && bi_swizzle_replicates_8(ins->src[src].swizzle))
124          return;
125 
126       /* Others do not allow swizzles */
127       break;
128 
129    /* We don't want to deal with reswizzling logic in modifier prop. Move
130     * the swizzle outside, it's easier for clamp propagation. */
131    case BI_OPCODE_FCLAMP_V2F16: {
132       bi_builder b = bi_init_builder(ctx, bi_after_instr(ins));
133       bi_index dest = ins->dest[0];
134       bi_index tmp = bi_temp(ctx);
135 
136       bi_index swizzled_src = bi_replace_index(ins->src[0], tmp);
137       ins->src[0].swizzle = BI_SWIZZLE_H01;
138       ins->dest[0] = tmp;
139       bi_swz_v2i16_to(&b, dest, swizzled_src);
140       return;
141    }
142 
143    default:
144       return;
145    }
146 
147    /* First, try to apply a given swizzle to a constant to clear the
148     * runtime swizzle. This is less heavy-handed than ignoring the
149     * swizzle for scalar destinations, since it maintains
150     * replication of the destination.
151     */
152    if (ins->src[src].type == BI_INDEX_CONSTANT) {
153       ins->src[src].value =
154          bi_apply_swizzle(ins->src[src].value, ins->src[src].swizzle);
155       ins->src[src].swizzle = BI_SWIZZLE_H01;
156       return;
157    }
158 
159    /* Even if the source does not replicate, if the consuming instruction
160     * produces a 16-bit scalar, we can ignore the other component.
161     */
162    if (ins->dest[0].swizzle == BI_SWIZZLE_H00 &&
163        ins->src[src].swizzle == BI_SWIZZLE_H00) {
164       ins->src[src].swizzle = BI_SWIZZLE_H01;
165       return;
166    }
167 
168    /* Lower it away */
169    bi_builder b = bi_init_builder(ctx, bi_before_instr(ins));
170 
171    bool is_8 = (bi_opcode_props[ins->op].size == BI_SIZE_8) ||
172                (bi_opcode_props[ins->op].size == BI_SIZE_32 &&
173                 ins->src[src].swizzle >= BI_SWIZZLE_B0000);
174 
175    bi_index orig = ins->src[src];
176    bi_index stripped = bi_replace_index(bi_null(), orig);
177    stripped.swizzle = ins->src[src].swizzle;
178 
179    bi_index swz = is_8 ? bi_swz_v4i8(&b, stripped) : bi_swz_v2i16(&b, stripped);
180 
181    bi_replace_src(ins, src, swz);
182    ins->src[src].swizzle = BI_SWIZZLE_H01;
183 }
184 
185 static bool
bi_swizzle_replicates_16(enum bi_swizzle swz)186 bi_swizzle_replicates_16(enum bi_swizzle swz)
187 {
188    switch (swz) {
189    case BI_SWIZZLE_H00:
190    case BI_SWIZZLE_H11:
191       return true;
192    default:
193       /* If a swizzle replicates every 8-bits, it also replicates
194        * every 16-bits, so allow 8-bit replicating swizzles.
195        */
196       return bi_swizzle_replicates_8(swz);
197    }
198 }
199 
200 static bool
bi_instr_replicates(bi_instr * I,BITSET_WORD * replicates_16)201 bi_instr_replicates(bi_instr *I, BITSET_WORD *replicates_16)
202 {
203    switch (I->op) {
204 
205    /* Instructions that construct vectors have replicated output if their
206     * sources are identical. Check this case first.
207     */
208    case BI_OPCODE_MKVEC_V2I16:
209    case BI_OPCODE_V2F16_TO_V2S16:
210    case BI_OPCODE_V2F16_TO_V2U16:
211    case BI_OPCODE_V2F32_TO_V2F16:
212    case BI_OPCODE_V2S16_TO_V2F16:
213    case BI_OPCODE_V2S8_TO_V2F16:
214    case BI_OPCODE_V2S8_TO_V2S16:
215    case BI_OPCODE_V2U16_TO_V2F16:
216    case BI_OPCODE_V2U8_TO_V2F16:
217    case BI_OPCODE_V2U8_TO_V2U16:
218       return bi_is_value_equiv(I->src[0], I->src[1]);
219 
220    /* 16-bit transcendentals are defined to output zero in their
221     * upper half, so they do not replicate
222     */
223    case BI_OPCODE_FRCP_F16:
224    case BI_OPCODE_FRSQ_F16:
225       return false;
226 
227    /* Not sure, be conservative, we don't use these.. */
228    case BI_OPCODE_VN_ASST1_F16:
229    case BI_OPCODE_FPCLASS_F16:
230    case BI_OPCODE_FPOW_SC_DET_F16:
231       return false;
232 
233    default:
234       break;
235    }
236 
237    /* Replication analysis only makes sense for ALU instructions */
238    if (bi_opcode_props[I->op].message != BIFROST_MESSAGE_NONE)
239       return false;
240 
241    /* We only analyze 16-bit instructions for 16-bit replication. We could
242     * maybe do better.
243     */
244    if (bi_opcode_props[I->op].size != BI_SIZE_16)
245       return false;
246 
247    bi_foreach_src(I, s) {
248       if (bi_is_null(I->src[s]))
249          continue;
250 
251       /* Replicated swizzles */
252       if (bi_swizzle_replicates_16(I->src[s].swizzle))
253          continue;
254 
255       /* Replicated values */
256       if (bi_is_ssa(I->src[s]) && BITSET_TEST(replicates_16, I->src[s].value))
257          continue;
258 
259       /* Replicated constants */
260       if (I->src[s].type == BI_INDEX_CONSTANT &&
261           (I->src[s].value & 0xFFFF) == (I->src[s].value >> 16))
262          continue;
263 
264       return false;
265    }
266 
267    return true;
268 }
269 
270 void
bi_lower_swizzle(bi_context * ctx)271 bi_lower_swizzle(bi_context *ctx)
272 {
273    bi_foreach_instr_global_safe(ctx, ins) {
274       bi_foreach_src(ins, s) {
275          if (bi_is_null(ins->src[s]))
276             continue;
277          if (ins->src[s].swizzle == BI_SWIZZLE_H01)
278             continue;
279 
280          lower_swizzle(ctx, ins, s);
281       }
282    }
283 
284    /* Now that we've lowered swizzles, clean up the mess */
285    BITSET_WORD *replicates_16 = calloc(sizeof(bi_index), ctx->ssa_alloc);
286 
287    bi_foreach_instr_global(ctx, ins) {
288       if (ins->nr_dests && bi_instr_replicates(ins, replicates_16))
289          BITSET_SET(replicates_16, ins->dest[0].value);
290 
291       if (ins->op == BI_OPCODE_SWZ_V2I16 && bi_is_ssa(ins->src[0]) &&
292           BITSET_TEST(replicates_16, ins->src[0].value)) {
293          ins->op = BI_OPCODE_MOV_I32;
294          ins->src[0].swizzle = BI_SWIZZLE_H01;
295       }
296 
297       /* The above passes rely on replicating destinations.  For
298        * Valhall, we will want to optimize this. For now, default
299        * to Bifrost compatible behaviour.
300        */
301       if (ins->nr_dests)
302          ins->dest[0].swizzle = BI_SWIZZLE_H01;
303    }
304 
305    free(replicates_16);
306 }
307