1 /*
2 * Copyright (C) 2020 Collabora Ltd.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 * SOFTWARE.
22 */
23
24 #include "bi_builder.h"
25 #include "compiler.h"
26
27 /* Not all 8-bit and 16-bit instructions support all swizzles on all sources.
28 * These passes, intended to run after NIR->BIR but before scheduling/RA, lower
29 * away swizzles that cannot be represented. In the future, we should try to
30 * recombine swizzles where we can as an optimization.
31 */
32
33 static bool
bi_swizzle_replicates_8(enum bi_swizzle swz)34 bi_swizzle_replicates_8(enum bi_swizzle swz)
35 {
36 switch (swz) {
37 case BI_SWIZZLE_B0000:
38 case BI_SWIZZLE_B1111:
39 case BI_SWIZZLE_B2222:
40 case BI_SWIZZLE_B3333:
41 return true;
42 default:
43 return false;
44 }
45 }
46
47 static void
lower_swizzle(bi_context * ctx,bi_instr * ins,unsigned src)48 lower_swizzle(bi_context *ctx, bi_instr *ins, unsigned src)
49 {
50 /* TODO: Use the opcode table and be a lot more methodical about this... */
51 switch (ins->op) {
52 /* Some instructions used with 16-bit data never have swizzles */
53 case BI_OPCODE_CSEL_V2F16:
54 case BI_OPCODE_CSEL_V2I16:
55 case BI_OPCODE_CSEL_V2S16:
56 case BI_OPCODE_CSEL_V2U16:
57
58 /* Despite ostensibly being 32-bit instructions, CLPER does not
59 * inherently interpret the data, so it can be used for v2f16
60 * derivatives, which might require swizzle lowering */
61 case BI_OPCODE_CLPER_I32:
62 case BI_OPCODE_CLPER_OLD_I32:
63
64 /* Similarly, CSEL.i32 consumes a boolean as a 32-bit argument. If the
65 * boolean is implemented as a 16-bit integer, the swizzle is needed
66 * for correct operation if the instruction producing the 16-bit
67 * boolean does not replicate to both halves of the containing 32-bit
68 * register. As such, we may need to lower a swizzle.
69 *
70 * This is a silly hack. Ideally, code gen would be smart enough to
71 * avoid this case (by replicating). In practice, silly hardware design
72 * decisions force our hand here.
73 */
74 case BI_OPCODE_MUX_I32:
75 case BI_OPCODE_CSEL_I32:
76 break;
77
78 case BI_OPCODE_IADD_V2S16:
79 case BI_OPCODE_IADD_V2U16:
80 case BI_OPCODE_ISUB_V2S16:
81 case BI_OPCODE_ISUB_V2U16:
82 if (src == 0 && ins->src[src].swizzle != BI_SWIZZLE_H10)
83 break;
84 else
85 return;
86 case BI_OPCODE_LSHIFT_AND_V2I16:
87 case BI_OPCODE_LSHIFT_OR_V2I16:
88 case BI_OPCODE_LSHIFT_XOR_V2I16:
89 case BI_OPCODE_RSHIFT_AND_V2I16:
90 case BI_OPCODE_RSHIFT_OR_V2I16:
91 case BI_OPCODE_RSHIFT_XOR_V2I16:
92 if (src == 2)
93 return;
94 else
95 break;
96
97 /* For some reason MUX.v2i16 allows swaps but not replication */
98 case BI_OPCODE_MUX_V2I16:
99 if (ins->src[src].swizzle == BI_SWIZZLE_H10)
100 return;
101 else
102 break;
103
104 /* No swizzles supported */
105 case BI_OPCODE_HADD_V4U8:
106 case BI_OPCODE_HADD_V4S8:
107 case BI_OPCODE_CLZ_V4U8:
108 case BI_OPCODE_IDP_V4I8:
109 case BI_OPCODE_IABS_V4S8:
110 case BI_OPCODE_ICMP_V4I8:
111 case BI_OPCODE_ICMP_V4U8:
112 case BI_OPCODE_MUX_V4I8:
113 case BI_OPCODE_IADD_IMM_V4I8:
114 break;
115
116 case BI_OPCODE_LSHIFT_AND_V4I8:
117 case BI_OPCODE_LSHIFT_OR_V4I8:
118 case BI_OPCODE_LSHIFT_XOR_V4I8:
119 case BI_OPCODE_RSHIFT_AND_V4I8:
120 case BI_OPCODE_RSHIFT_OR_V4I8:
121 case BI_OPCODE_RSHIFT_XOR_V4I8:
122 /* Last source allows identity or replication */
123 if (src == 2 && bi_swizzle_replicates_8(ins->src[src].swizzle))
124 return;
125
126 /* Others do not allow swizzles */
127 break;
128
129 /* We don't want to deal with reswizzling logic in modifier prop. Move
130 * the swizzle outside, it's easier for clamp propagation. */
131 case BI_OPCODE_FCLAMP_V2F16: {
132 bi_builder b = bi_init_builder(ctx, bi_after_instr(ins));
133 bi_index dest = ins->dest[0];
134 bi_index tmp = bi_temp(ctx);
135
136 bi_index swizzled_src = bi_replace_index(ins->src[0], tmp);
137 ins->src[0].swizzle = BI_SWIZZLE_H01;
138 ins->dest[0] = tmp;
139 bi_swz_v2i16_to(&b, dest, swizzled_src);
140 return;
141 }
142
143 default:
144 return;
145 }
146
147 /* First, try to apply a given swizzle to a constant to clear the
148 * runtime swizzle. This is less heavy-handed than ignoring the
149 * swizzle for scalar destinations, since it maintains
150 * replication of the destination.
151 */
152 if (ins->src[src].type == BI_INDEX_CONSTANT) {
153 ins->src[src].value =
154 bi_apply_swizzle(ins->src[src].value, ins->src[src].swizzle);
155 ins->src[src].swizzle = BI_SWIZZLE_H01;
156 return;
157 }
158
159 /* Even if the source does not replicate, if the consuming instruction
160 * produces a 16-bit scalar, we can ignore the other component.
161 */
162 if (ins->dest[0].swizzle == BI_SWIZZLE_H00 &&
163 ins->src[src].swizzle == BI_SWIZZLE_H00) {
164 ins->src[src].swizzle = BI_SWIZZLE_H01;
165 return;
166 }
167
168 /* Lower it away */
169 bi_builder b = bi_init_builder(ctx, bi_before_instr(ins));
170
171 bool is_8 = (bi_opcode_props[ins->op].size == BI_SIZE_8) ||
172 (bi_opcode_props[ins->op].size == BI_SIZE_32 &&
173 ins->src[src].swizzle >= BI_SWIZZLE_B0000);
174
175 bi_index orig = ins->src[src];
176 bi_index stripped = bi_replace_index(bi_null(), orig);
177 stripped.swizzle = ins->src[src].swizzle;
178
179 bi_index swz = is_8 ? bi_swz_v4i8(&b, stripped) : bi_swz_v2i16(&b, stripped);
180
181 bi_replace_src(ins, src, swz);
182 ins->src[src].swizzle = BI_SWIZZLE_H01;
183 }
184
185 static bool
bi_swizzle_replicates_16(enum bi_swizzle swz)186 bi_swizzle_replicates_16(enum bi_swizzle swz)
187 {
188 switch (swz) {
189 case BI_SWIZZLE_H00:
190 case BI_SWIZZLE_H11:
191 return true;
192 default:
193 /* If a swizzle replicates every 8-bits, it also replicates
194 * every 16-bits, so allow 8-bit replicating swizzles.
195 */
196 return bi_swizzle_replicates_8(swz);
197 }
198 }
199
200 static bool
bi_instr_replicates(bi_instr * I,BITSET_WORD * replicates_16)201 bi_instr_replicates(bi_instr *I, BITSET_WORD *replicates_16)
202 {
203 switch (I->op) {
204
205 /* Instructions that construct vectors have replicated output if their
206 * sources are identical. Check this case first.
207 */
208 case BI_OPCODE_MKVEC_V2I16:
209 case BI_OPCODE_V2F16_TO_V2S16:
210 case BI_OPCODE_V2F16_TO_V2U16:
211 case BI_OPCODE_V2F32_TO_V2F16:
212 case BI_OPCODE_V2S16_TO_V2F16:
213 case BI_OPCODE_V2S8_TO_V2F16:
214 case BI_OPCODE_V2S8_TO_V2S16:
215 case BI_OPCODE_V2U16_TO_V2F16:
216 case BI_OPCODE_V2U8_TO_V2F16:
217 case BI_OPCODE_V2U8_TO_V2U16:
218 return bi_is_value_equiv(I->src[0], I->src[1]);
219
220 /* 16-bit transcendentals are defined to output zero in their
221 * upper half, so they do not replicate
222 */
223 case BI_OPCODE_FRCP_F16:
224 case BI_OPCODE_FRSQ_F16:
225 return false;
226
227 /* Not sure, be conservative, we don't use these.. */
228 case BI_OPCODE_VN_ASST1_F16:
229 case BI_OPCODE_FPCLASS_F16:
230 case BI_OPCODE_FPOW_SC_DET_F16:
231 return false;
232
233 default:
234 break;
235 }
236
237 /* Replication analysis only makes sense for ALU instructions */
238 if (bi_opcode_props[I->op].message != BIFROST_MESSAGE_NONE)
239 return false;
240
241 /* We only analyze 16-bit instructions for 16-bit replication. We could
242 * maybe do better.
243 */
244 if (bi_opcode_props[I->op].size != BI_SIZE_16)
245 return false;
246
247 bi_foreach_src(I, s) {
248 if (bi_is_null(I->src[s]))
249 continue;
250
251 /* Replicated swizzles */
252 if (bi_swizzle_replicates_16(I->src[s].swizzle))
253 continue;
254
255 /* Replicated values */
256 if (bi_is_ssa(I->src[s]) && BITSET_TEST(replicates_16, I->src[s].value))
257 continue;
258
259 /* Replicated constants */
260 if (I->src[s].type == BI_INDEX_CONSTANT &&
261 (I->src[s].value & 0xFFFF) == (I->src[s].value >> 16))
262 continue;
263
264 return false;
265 }
266
267 return true;
268 }
269
270 void
bi_lower_swizzle(bi_context * ctx)271 bi_lower_swizzle(bi_context *ctx)
272 {
273 bi_foreach_instr_global_safe(ctx, ins) {
274 bi_foreach_src(ins, s) {
275 if (bi_is_null(ins->src[s]))
276 continue;
277 if (ins->src[s].swizzle == BI_SWIZZLE_H01)
278 continue;
279
280 lower_swizzle(ctx, ins, s);
281 }
282 }
283
284 /* Now that we've lowered swizzles, clean up the mess */
285 BITSET_WORD *replicates_16 = calloc(sizeof(bi_index), ctx->ssa_alloc);
286
287 bi_foreach_instr_global(ctx, ins) {
288 if (ins->nr_dests && bi_instr_replicates(ins, replicates_16))
289 BITSET_SET(replicates_16, ins->dest[0].value);
290
291 if (ins->op == BI_OPCODE_SWZ_V2I16 && bi_is_ssa(ins->src[0]) &&
292 BITSET_TEST(replicates_16, ins->src[0].value)) {
293 ins->op = BI_OPCODE_MOV_I32;
294 ins->src[0].swizzle = BI_SWIZZLE_H01;
295 }
296
297 /* The above passes rely on replicating destinations. For
298 * Valhall, we will want to optimize this. For now, default
299 * to Bifrost compatible behaviour.
300 */
301 if (ins->nr_dests)
302 ins->dest[0].swizzle = BI_SWIZZLE_H01;
303 }
304
305 free(replicates_16);
306 }
307