/*
 * Copyright 2021 Alyssa Rosenzweig
 * SPDX-License-Identifier: MIT
 */

#include "agx_builder.h"
#include "agx_compiler.h"
#include "agx_minifloat.h"
#include "agx_opcodes.h"

/* AGX peephole optimizer responsible for instruction combining. It operates in
 * a forward direction and a backward direction, in each case traversing in
 * source order. SSA means the forward pass satisfies the invariant:
 *
 *    Every def is visited before any of its uses.
 *
 * Dually, the backwards pass satisfies the invariant:
 *
 *    Every use of a def is visited before the def.
 *
 * This means the forward pass can propagate modifiers forward, whereas the
 * backwards pass propagates modifiers backward. Consider an example:
 *
 *    1 = fabs 0
 *    2 = fround 1
 *    3 = fsat 2
 *
 * The forwards pass would propagate the fabs to the fround (since we can look
 * up the fabs from the fround source and do the replacement). By contrast, the
 * backwards pass would propagate the fsat back to the fround (since when we
 * see the fround we know it has only a single user, the fsat). Propagatable
 * instructions have natural directions (like pushforwards and pullbacks).
 *
 * We are careful to update the tracked state whenever we modify an instruction
 * to ensure the passes are linear-time and converge in a single iteration.
 *
 * Size conversions are worth special discussion. Consider the snippet:
 *
 *    2 = fadd 0, 1
 *    3 = f2f16 2
 *    4 = fround 3
 *
 * A priori, we can move the f2f16 in either direction. But the choices are not
 * equal -- if we move it up to the fadd, we get FP16 for two instructions,
 * whereas if we push it into the fround, we effectively get FP32 for two
 * instructions. So f2f16 is backwards. Likewise, consider
 *
 *    2 = fadd 0, 1
 *    3 = f2f32 2
 *    4 = fround 3
 *
 * This time, if we move the f2f32 up to the fadd, we get FP32 for two
 * instructions, but if we move it down to the fround, we get FP16 for two. So
 * f2f32 is forwards.
 */

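/*
 * Floating-point moves are written as "fadd x, -0.0": adding negative zero is
 * an identity under default rounding, so an fadd whose second source is -0.0
 * is recognized and treated as an fmov below.
 */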
static bool
agx_is_fmov(agx_instr *def)
{
   return (def->op == AGX_OPCODE_FADD) &&
          agx_is_equiv(def->src[1], agx_negzero());
}

/* Compose floating-point modifiers with floating-point sources */

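/*
 * For example, if an instruction reads fabs(t) and t was defined as fneg(s),
 * the composed source is fabs(s), since the outer abs discards the inner
 * negate; an outer fneg of an inner fneg cancels to a plain read of s.
 */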
static agx_index
agx_compose_float_src(agx_index to, agx_index from)
{
   if (to.abs) {
      from.neg = false;
      from.abs = true;
   }

   from.neg ^= to.neg;

   return from;
}

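/*
 * Forward-propagate fmovs: when a source is defined by a non-saturating fmov,
 * compose the fmov's abs/neg modifiers (and any f2f32 conversion) into this
 * instruction's source, leaving the fmov's result unused once every reader
 * has been rewritten.
 */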
static void
agx_optimizer_fmov(agx_instr **defs, agx_instr *ins)
{
   agx_foreach_ssa_src(ins, s) {
      agx_index src = ins->src[s];
      agx_instr *def = defs[src.value];

      if (def == NULL)
         continue; /* happens for phis in loops */
      if (!agx_is_fmov(def))
         continue;
      if (def->saturate)
         continue;
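      /* Only the comparison sources of fcmpsel behave as floats; the selected
       * values are passed through bit-exactly, so modifiers must not be
       * attached to them.
       */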
      if (ins->op == AGX_OPCODE_FCMPSEL && s >= 2)
         continue;

      /* We can fold f2f32 into 32-bit instructions, but we can't fold f2f16
       * into 16-bit instructions, since the latter would implicitly promote to
       * a 32-bit instruction which is not exact.
       */
      assert(def->src[0].size == AGX_SIZE_32 ||
             def->src[0].size == AGX_SIZE_16);
      assert(src.size == AGX_SIZE_32 || src.size == AGX_SIZE_16);

      if (src.size == AGX_SIZE_16 && def->src[0].size == AGX_SIZE_32)
         continue;

      ins->src[s] = agx_compose_float_src(src, def->src[0]);
   }
}

static bool
image_write_source_can_be_immediate(agx_instr *I, unsigned s)
{
   assert(I->op == AGX_OPCODE_IMAGE_WRITE);

   /* LOD can always be immediate. Actually, it's just zero so far, since we
    * don't support nonzero LOD for images yet.
    */
   if (s == 2)
      return true;

   /* If the "bindless" source (source 3) is an immediate, it means we don't
    * have a bindless image; instead, we have a texture state index. We're
    * allowed to have immediate texture state registers (source 4). However,
    * we're not allowed to have immediate bindless offsets (also source 4).
    */
   bool is_texture_state = (I->src[3].type == AGX_INDEX_IMMEDIATE);
   if (s == 4 && is_texture_state)
      return true;

   /* Otherwise, must be from a register */
   return false;
}

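/*
 * Inline immediates: when a source is defined by mov_imm, rewrite it as an
 * inline immediate if the opcode and source slot can encode one -- small
 * integers, or floats exactly representable as 8-bit minifloats.
 */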
static void
agx_optimizer_inline_imm(agx_instr **defs, agx_instr *I)
{
   agx_foreach_ssa_src(I, s) {
      agx_index src = I->src[s];
      if (src.neg)
         continue;

      agx_instr *def = defs[src.value];
      if (!def || def->op != AGX_OPCODE_MOV_IMM)
         continue;

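      /* Truncate the constant to 8 and 16 bits; the comparisons below check
       * whether it round-trips, i.e. fits losslessly in the narrower
       * immediate encodings.
       */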
      uint8_t value = def->imm;
      uint16_t value_u16 = def->imm;

      bool float_src = agx_is_float_src(I, s);

      if (I->op == AGX_OPCODE_ST_TILE && s == 0)
         continue;
      if (I->op == AGX_OPCODE_ZS_EMIT && s != 0)
         continue;
      if (I->op == AGX_OPCODE_TEXTURE_SAMPLE && s != 4)
         continue;
      if ((I->op == AGX_OPCODE_DEVICE_STORE ||
           I->op == AGX_OPCODE_LOCAL_STORE || I->op == AGX_OPCODE_ATOMIC ||
           I->op == AGX_OPCODE_LOCAL_ATOMIC) &&
          s != 2)
         continue;
      if (I->op == AGX_OPCODE_ST_VARY && s != 0)
         continue;
      if ((I->op == AGX_OPCODE_LOCAL_LOAD || I->op == AGX_OPCODE_DEVICE_LOAD ||
           I->op == AGX_OPCODE_STACK_STORE) &&
          s != 1)
         continue;
      if (I->op == AGX_OPCODE_SPLIT)
         continue;

      if (I->op == AGX_OPCODE_IMAGE_WRITE &&
          !image_write_source_can_be_immediate(I, s))
         continue;

      if (float_src) {
         bool fp16 = (def->dest[0].size == AGX_SIZE_16);
         assert(fp16 || (def->dest[0].size == AGX_SIZE_32));

         float f = fp16 ? _mesa_half_to_float(def->imm) : uif(def->imm);
         if (!agx_minifloat_exact(f))
            continue;

         I->src[s] = agx_immediate_f(f);
      } else if (value == def->imm) {
         I->src[s] = agx_immediate(value);
      } else if (value_u16 == def->imm && agx_allows_16bit_immediate(I)) {
         I->src[s] = agx_abs(agx_immediate(value_u16));
      }
   }
}

/*
 * Fuse not into and/or/xor. Specifically, acts on not and fuses:
 *
 *    not(and(x, y)) -> nand(x, y)
 *    not(or(x, y)) -> nor(x, y)
 *    not(xor(x, y)) -> xnor(x, y)
 */
static bool
agx_optimizer_not(agx_instr *I, agx_instr *use)
{
   /* Check for a bitop whose use is a not */
   if (I->op != AGX_OPCODE_BITOP || use->op != AGX_OPCODE_NOT)
      return false;

   /* Invert the truth table, remapping and/or/xor to nand/nor/xnor */
   I->truth_table ^= 0xF;
   I->dest[0] = use->dest[0];

   return true;
}

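/*
 * Backward-propagate an fmov: fold the fmov's saturate flag and any f2f16
 * conversion back into the instruction I that defines its source, making I
 * write the fmov's destination directly. The caller guarantees the fmov is
 * the only use of I's destination.
 */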
static bool
agx_optimizer_fmov_rev(agx_instr *I, agx_instr *use)
{
   if (!agx_is_fmov(use))
      return false;
   if (use->src[0].neg || use->src[0].abs)
      return false;

   /* We can fold f2f16 into 32-bit instructions, but we can't fold f2f32 into
    * 16-bit instructions, since the latter would implicitly promote to a
    * 32-bit instruction which is not exact.
    */
   assert(use->dest[0].size == AGX_SIZE_32 || use->dest[0].size == AGX_SIZE_16);
   assert(I->dest[0].size == AGX_SIZE_32 || I->dest[0].size == AGX_SIZE_16);

   if (I->dest[0].size == AGX_SIZE_16 && use->dest[0].size == AGX_SIZE_32)
      return false;

   /* saturate(saturate(x)) = saturate(x) */
   I->saturate |= use->saturate;
   I->dest[0] = use->dest[0];
   return true;
}

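/*
 * Copy propagation: replace sources defined by a plain mov with the mov's own
 * source when types and sizes allow, and rewrite split(64-bit uniform) as two
 * 32-bit uniform moves so the halves can be propagated further.
 */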
static void
agx_optimizer_copyprop(agx_context *ctx, agx_instr **defs, agx_instr *I)
{
   agx_foreach_ssa_src(I, s) {
      agx_index src = I->src[s];
      agx_instr *def = defs[src.value];

      if (def == NULL)
         continue; /* happens for phis in loops */
      if (def->op != AGX_OPCODE_MOV)
         continue;

      /* At the moment, not all instructions support size conversions. Notably,
       * RA pseudo instructions don't handle size conversions. This should be
       * refined in the future.
       */
      if (def->src[0].size != src.size)
         continue;

      /* Optimize split(64-bit uniform) so we can get better copyprop of the
       * 32-bit uniform parts. This helps reduce moves with 64-bit uniforms.
       */
      if (I->op == AGX_OPCODE_SPLIT && def->src[0].type == AGX_INDEX_UNIFORM &&
          src.size == AGX_SIZE_64 && I->dest[0].size == AGX_SIZE_32) {

         assert(I->nr_dests == 2 && "decomposing a 64-bit scalar");
         agx_builder b = agx_init_builder(ctx, agx_before_instr(I));

         agx_index lo = def->src[0];
         lo.size = AGX_SIZE_32;

         agx_index hi = lo;
         hi.value += 2 /* half of 64-bits = 32-bits = 2 x 16-bits */;

         defs[I->dest[0].value] = agx_mov_to(&b, I->dest[0], lo);
         defs[I->dest[1].value] = agx_mov_to(&b, I->dest[1], hi);

         agx_remove_instruction(I);
         continue;
      }

      /* Immediate inlining happens elsewhere */
      if (def->src[0].type == AGX_INDEX_IMMEDIATE)
         continue;

      /* ALU instructions cannot take 64-bit sources; only a handful of memory
       * address operands can, whitelisted below.
       */
      if (def->src[0].size == AGX_SIZE_64 &&
          !(I->op == AGX_OPCODE_DEVICE_LOAD && s == 0) &&
          !(I->op == AGX_OPCODE_DEVICE_STORE && s == 1) &&
          !(I->op == AGX_OPCODE_ATOMIC && s == 1))
         continue;

      agx_replace_src(I, s, def->src[0]);
   }
}

/*
 * Fuse conditions into if. Specifically, acts on if_icmp and fuses:
 *
 *    if_icmp(cmp(x, y, *), 0, ne/eq) -> if_cmp(x, y, *)
 */
static void
agx_optimizer_if_cmp(agx_instr **defs, agx_instr *I)
{
   /* Check for unfused if */
   if (!agx_is_equiv(I->src[1], agx_zero()) || I->icond != AGX_ICOND_UEQ ||
       I->src[0].type != AGX_INDEX_NORMAL)
      return;

   /* Check for condition */
   agx_instr *def = defs[I->src[0].value];
   if (def->op != AGX_OPCODE_ICMP && def->op != AGX_OPCODE_FCMP)
      return;

   /* Fuse */
   I->src[0] = def->src[0];
   I->src[1] = def->src[1];
   I->invert_cond = def->invert_cond ^ !I->invert_cond;

   if (def->op == AGX_OPCODE_ICMP) {
      I->op = AGX_OPCODE_IF_ICMP;
      I->icond = def->icond;
   } else {
      I->op = AGX_OPCODE_IF_FCMP;
      I->fcond = def->fcond;
   }
}

/*
 * Fuse invert into if. Acts on if_icmp and fuses:
 *
 *    if_icmp(xor(x, 1), 0, ne) -> if_cmp(x, 0, eq)
 */
static void
agx_optimizer_if_not(agx_instr **defs, agx_instr *I)
{
   /* Check for unfused if */
   if (!agx_is_equiv(I->src[1], agx_zero()) || I->icond != AGX_ICOND_UEQ ||
       I->src[0].type != AGX_INDEX_NORMAL)
      return;

   /* Check for invert */
   agx_instr *def = defs[I->src[0].value];
   if (def->op != AGX_OPCODE_BITOP ||
       !agx_is_equiv(def->src[1], agx_immediate(1)) ||
       def->truth_table != AGX_BITOP_XOR)
      return;

   /* Fuse */
   I->src[0] = def->src[0];
   I->invert_cond = !I->invert_cond;
}

/*
 * Fuse conditions into select. Specifically, acts on icmpsel and fuses:
 *
 *    icmpsel(cmp(x, y, *), 0, z, w, eq) -> cmpsel(x, y, w, z, *)
 *
 * Care must be taken to invert the condition by swapping cmpsel arguments.
 */
static void
agx_optimizer_cmpsel(agx_instr **defs, agx_instr *I)
{
   /* Check for unfused select */
   if (!agx_is_equiv(I->src[1], agx_zero()) || I->icond != AGX_ICOND_UEQ ||
       I->src[0].type != AGX_INDEX_NORMAL)
      return;

   /* Check for condition */
   agx_instr *def = defs[I->src[0].value];
   if (def->op != AGX_OPCODE_ICMP && def->op != AGX_OPCODE_FCMP)
      return;

   /* Fuse */
   I->src[0] = def->src[0];
   I->src[1] = def->src[1];

   /* In the unfused select, the condition is inverted due to the form:
    *
    *    (cond == 0) ? x : y
    *
    * So we need to swap the arguments when fusing to become cond ? y : x. If
    * the condition was supposed to be inverted, we don't swap since it's
    * already inverted. cmpsel does not have an invert_cond bit to use.
    */
   if (!def->invert_cond) {
      agx_index temp = I->src[2];
      I->src[2] = I->src[3];
      I->src[3] = temp;
   }

   if (def->op == AGX_OPCODE_ICMP) {
      I->op = AGX_OPCODE_ICMPSEL;
      I->icond = def->icond;
   } else {
      I->op = AGX_OPCODE_FCMPSEL;
      I->fcond = def->fcond;
   }
}

/*
 * Fuse conditions into ballots:
 *
 *    ballot(cmp(x, y)) -> ballot_cmp(x, y)
 */
static void
agx_optimizer_ballot(agx_context *ctx, agx_instr **defs, agx_instr *I)
{
   if (I->src[0].type != AGX_INDEX_NORMAL)
      return;

   agx_instr *def = defs[I->src[0].value];
   if (!def || (def->op != AGX_OPCODE_ICMP && def->op != AGX_OPCODE_FCMP))
      return;

   bool quad = I->op == AGX_OPCODE_QUAD_BALLOT;
   assert(quad || I->op == AGX_OPCODE_BALLOT);

   /* Replace with a fused instruction since the # of sources changes */
   agx_builder b = agx_init_builder(ctx, agx_before_instr(I));

   agx_instr *fused = agx_icmp_ballot_to(
      &b, I->dest[0], def->src[0], def->src[1], def->icond, def->invert_cond);

   if (def->op == AGX_OPCODE_ICMP) {
      fused->op = quad ? AGX_OPCODE_ICMP_QUAD_BALLOT : AGX_OPCODE_ICMP_BALLOT;
   } else {
      fused->op = quad ? AGX_OPCODE_FCMP_QUAD_BALLOT : AGX_OPCODE_FCMP_BALLOT;
   }

   agx_remove_instruction(I);
}

/*
 * Fuse not srcs into bitop.
 */
static void
agx_optimizer_bitop(agx_instr **defs, agx_instr *I)
{
   agx_foreach_ssa_src(I, s) {
      agx_index src = I->src[s];
      agx_instr *def = defs[src.value];

      /* Check for not src */
      if (def->op != AGX_OPCODE_NOT)
         continue;

      /* Select new operation */
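      /* Negating source 0 swaps the even and odd bits of the truth table
       * (indexed by (s1 << 1) | s0); negating source 1 swaps its low and high
       * halves. For example, if AND is 0x8 under this encoding, fusing a not
       * on source 0 yields 0x4, i.e. andnot.
       */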
      if (s == 0) {
         I->truth_table =
            ((I->truth_table & 0x5) << 1) | ((I->truth_table & 0xa) >> 1);
      } else if (s == 1) {
         I->truth_table = ((I->truth_table & 0x3) << 2) | (I->truth_table >> 2);
      }

      /* Fuse */
      I->src[s] = def->src[0];
   }
}

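/*
 * Forward pass: walk instructions in source order, recording the defining
 * instruction of each SSA value in defs[]. Since defs dominate their uses
 * (loop phis excepted, hence the NULL checks in the helpers), each rewrite
 * below can look up the definitions of its sources.
 */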
static void
agx_optimizer_forward(agx_context *ctx)
{
   agx_instr **defs = calloc(ctx->alloc, sizeof(*defs));

   agx_foreach_instr_global_safe(ctx, I) {
      struct agx_opcode_info info = agx_opcodes_info[I->op];

      agx_foreach_ssa_dest(I, d) {
         defs[I->dest[d].value] = I;
      }

      /* Optimize moves */
      agx_optimizer_copyprop(ctx, defs, I);

      /* Propagate fmov down */
      if (info.is_float || I->op == AGX_OPCODE_FCMPSEL ||
          I->op == AGX_OPCODE_FCMP)
         agx_optimizer_fmov(defs, I);

      /* Inline immediates if we can. TODO: systematic */
      if (I->op != AGX_OPCODE_COLLECT && I->op != AGX_OPCODE_IMAGE_LOAD &&
          I->op != AGX_OPCODE_TEXTURE_LOAD &&
          I->op != AGX_OPCODE_UNIFORM_STORE &&
          I->op != AGX_OPCODE_BLOCK_IMAGE_STORE && I->op != AGX_OPCODE_EXPORT)
         agx_optimizer_inline_imm(defs, I);

      if (I->op == AGX_OPCODE_IF_ICMP) {
         agx_optimizer_if_not(defs, I);
         agx_optimizer_if_cmp(defs, I);
      } else if (I->op == AGX_OPCODE_ICMPSEL) {
         agx_optimizer_cmpsel(defs, I);
      } else if (I->op == AGX_OPCODE_BALLOT ||
                 I->op == AGX_OPCODE_QUAD_BALLOT) {
         agx_optimizer_ballot(ctx, defs, I);
      } else if (I->op == AGX_OPCODE_BITOP) {
         agx_optimizer_bitop(defs, I);
      }
   }

   free(defs);
}

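/*
 * Backward pass: walk instructions in reverse source order, recording the
 * single use of each SSA value in uses[] (values with several uses are marked
 * in the multiple bitset and skipped). A def whose only use is a not or an
 * fmov can then absorb that use.
 */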
static void
agx_optimizer_backward(agx_context *ctx)
{
   agx_instr **uses = calloc(ctx->alloc, sizeof(*uses));
   BITSET_WORD *multiple = calloc(BITSET_WORDS(ctx->alloc), sizeof(*multiple));

   agx_foreach_instr_global_rev(ctx, I) {
      struct agx_opcode_info info = agx_opcodes_info[I->op];

      agx_foreach_ssa_src(I, s) {
         if (I->src[s].type == AGX_INDEX_NORMAL) {
            unsigned v = I->src[s].value;

            if (uses[v])
               BITSET_SET(multiple, v);
            else
               uses[v] = I;
         }
      }

      if (info.nr_dests != 1)
         continue;

      if (I->dest[0].type != AGX_INDEX_NORMAL)
         continue;

      agx_instr *use = uses[I->dest[0].value];

      if (!use || BITSET_TEST(multiple, I->dest[0].value))
         continue;

      if (agx_optimizer_not(I, use)) {
         agx_remove_instruction(use);
         continue;
      }

      /* Destination has a single use, try to propagate */
      if (info.is_float && agx_optimizer_fmov_rev(I, use)) {
         agx_remove_instruction(use);
         continue;
      }
   }

   free(uses);
   free(multiple);
}

void
agx_optimizer(agx_context *ctx)
{
   agx_optimizer_backward(ctx);
   agx_optimizer_forward(ctx);
}