xref: /aosp_15_r20/external/mesa3d/src/intel/compiler/brw_fs_cmod_propagation.cpp (revision 6104692788411f58d303aa86923a9ff6ecaded22)
/*
 * Copyright © 2014 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */
23 
24 #include "brw_fs.h"
25 #include "brw_cfg.h"
26 #include "brw_eu.h"
27 
/** @file
 *
 * Implements a pass that propagates the conditional modifier from a CMP x 0.0
 * instruction into the instruction that generated x. For instance, in this
 * sequence
 *
 *    add(8)          g70<1>F    g69<8,8,1>F    4096F
 *    cmp.ge.f0(8)    null       g70<8,8,1>F    0F
 *
 * we can do the comparison as part of the ADD instruction directly:
 *
 *    add.ge.f0(8)    g70<1>F    g69<8,8,1>F    4096F
 *
 * If there had been a use of the flag register and another CMP using g70
 *
 *    add.ge.f0(8)    g70<1>F    g69<8,8,1>F    4096F
 *    (+f0) sel(8)    g71<F>     g72<8,8,1>F    g73<8,8,1>F
 *    cmp.ge.f0(8)    null       g70<8,8,1>F    0F
 *
 * we can recognize that the CMP is generating the flag value that already
 * exists and therefore remove the instruction.
 */
50 
51 using namespace brw;
52 
53 static bool
cmod_propagate_cmp_to_add(const intel_device_info * devinfo,bblock_t * block,fs_inst * inst)54 cmod_propagate_cmp_to_add(const intel_device_info *devinfo, bblock_t *block,
55                           fs_inst *inst)
56 {
57    bool read_flag = false;
58    const unsigned flags_written = inst->flags_written(devinfo);
59 
60    foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst) {
61       if (scan_inst->opcode == BRW_OPCODE_ADD &&
62           !scan_inst->predicate &&
63           scan_inst->dst.is_contiguous() &&
64           scan_inst->exec_size == inst->exec_size) {
65          bool negate;
66 
67          /* A CMP is basically a subtraction.  The result of the
68           * subtraction must be the same as the result of the addition.
69           * This means that one of the operands must be negated.  So (a +
70           * b) vs (a == -b) or (a + -b) vs (a == b).
71           */
72          if ((inst->src[0].equals(scan_inst->src[0]) &&
73               inst->src[1].negative_equals(scan_inst->src[1])) ||
74              (inst->src[0].equals(scan_inst->src[1]) &&
75               inst->src[1].negative_equals(scan_inst->src[0]))) {
76             negate = false;
77          } else if ((inst->src[0].negative_equals(scan_inst->src[0]) &&
78                      inst->src[1].equals(scan_inst->src[1])) ||
79                     (inst->src[0].negative_equals(scan_inst->src[1]) &&
80                      inst->src[1].equals(scan_inst->src[0]))) {
81             negate = true;
82          } else {
83             goto not_match;
84          }
85 
86          /* If the scan instruction writes a different flag register than the
87           * instruction we're trying to propagate from, bail.
88           *
89           * FINISHME: The second part of the condition may be too strong.
90           * Perhaps (scan_inst->flags_written() & flags_written) !=
91           * flags_written?
92           */
93          if (scan_inst->flags_written(devinfo) != 0 &&
94              scan_inst->flags_written(devinfo) != flags_written)
95             goto not_match;
96 
97          /* From the Kaby Lake PRM Vol. 7 "Assigning Conditional Flags":
98           *
99           *    * Note that the [post condition signal] bits generated at
100           *      the output of a compute are before the .sat.
101           *
102           * Paragraph about post_zero does not mention saturation, but
103           * testing it on actual GPUs shows that conditional modifiers
104           * are applied after saturation.
105           *
106           *    * post_zero bit: This bit reflects whether the final
107           *      result is zero after all the clamping, normalizing,
108           *      or format conversion logic.
109           *
110           * For signed types we don't care about saturation: it won't
111           * change the result of conditional modifier.
112           *
113           * For floating and unsigned types there two special cases,
114           * when we can remove inst even if scan_inst is saturated: G
115           * and LE. Since conditional modifiers are just comparisons
116           * against zero, saturating positive values to the upper
117           * limit never changes the result of comparison.
118           *
119           * For negative values:
120           * (sat(x) >  0) == (x >  0) --- false
121           * (sat(x) <= 0) == (x <= 0) --- true
122           */
123          const enum brw_conditional_mod cond =
124             negate ? brw_swap_cmod(inst->conditional_mod)
125             : inst->conditional_mod;
126 
127          if (scan_inst->saturate &&
128              (brw_type_is_float(scan_inst->dst.type) ||
129               brw_type_is_uint(scan_inst->dst.type)) &&
130              (cond != BRW_CONDITIONAL_G &&
131               cond != BRW_CONDITIONAL_LE))
132             goto not_match;
133 
134          /* Otherwise, try propagating the conditional. */
135          if (scan_inst->can_do_cmod() &&
136              ((!read_flag && scan_inst->conditional_mod == BRW_CONDITIONAL_NONE) ||
137               scan_inst->conditional_mod == cond)) {
138             scan_inst->conditional_mod = cond;
139             scan_inst->flag_subreg = inst->flag_subreg;
140             inst->remove(block, true);
141             return true;
142          }
143          break;
144       }
145 
146    not_match:
147       if ((scan_inst->flags_written(devinfo) & flags_written) != 0)
148          break;
149 
150       read_flag = read_flag ||
151                   (scan_inst->flags_read(devinfo) & flags_written) != 0;
152    }
153 
154    return false;
155 }
156 
157 /**
158  * Propagate conditional modifiers from NOT instructions
159  *
160  * Attempt to convert sequences like
161  *
162  *    or(8)           g78<8,8,1>      g76<8,8,1>UD    g77<8,8,1>UD
163  *    ...
164  *    not.nz.f0(8)    null            g78<8,8,1>UD
165  *
166  * into
167  *
168  *    or.z.f0(8)      g78<8,8,1>      g76<8,8,1>UD    g77<8,8,1>UD
169  */
170 static bool
cmod_propagate_not(const intel_device_info * devinfo,bblock_t * block,fs_inst * inst)171 cmod_propagate_not(const intel_device_info *devinfo, bblock_t *block,
172                    fs_inst *inst)
173 {
174    const enum brw_conditional_mod cond = brw_negate_cmod(inst->conditional_mod);
175    bool read_flag = false;
176    const unsigned flags_written = inst->flags_written(devinfo);
177 
178    if (cond != BRW_CONDITIONAL_Z && cond != BRW_CONDITIONAL_NZ)
179       return false;
180 
181    foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst) {
182       if (regions_overlap(scan_inst->dst, scan_inst->size_written,
183                           inst->src[0], inst->size_read(0))) {
184          if (scan_inst->opcode != BRW_OPCODE_OR &&
185              scan_inst->opcode != BRW_OPCODE_AND)
186             break;
187 
188          if (scan_inst->predicate ||
189              !scan_inst->dst.is_contiguous() ||
190              scan_inst->dst.offset != inst->src[0].offset ||
191              scan_inst->exec_size != inst->exec_size)
192             break;
193 
194          /* If the scan instruction writes a different flag register than the
195           * instruction we're trying to propagate from, bail.
196           *
197           * FINISHME: The second part of the condition may be too strong.
198           * Perhaps (scan_inst->flags_written() & flags_written) !=
199           * flags_written?
200           */
201          if (scan_inst->flags_written(devinfo) != 0 &&
202              scan_inst->flags_written(devinfo) != flags_written)
203             break;
204 
205          if (scan_inst->can_do_cmod() &&
206              ((!read_flag && scan_inst->conditional_mod == BRW_CONDITIONAL_NONE) ||
207               scan_inst->conditional_mod == cond)) {
208             scan_inst->conditional_mod = cond;
209             scan_inst->flag_subreg = inst->flag_subreg;
210             inst->remove(block, true);
211             return true;
212          }
213          break;
214       }
215 
216       if ((scan_inst->flags_written(devinfo) & flags_written) != 0)
217          break;
218 
219       read_flag = read_flag ||
220                   (scan_inst->flags_read(devinfo) & flags_written) != 0;
221    }
222 
223    return false;
224 }
225 
226 static bool
opt_cmod_propagation_local(const intel_device_info * devinfo,bblock_t * block)227 opt_cmod_propagation_local(const intel_device_info *devinfo, bblock_t *block)
228 {
229    bool progress = false;
230    UNUSED int ip = block->end_ip + 1;
231 
232    foreach_inst_in_block_reverse_safe(fs_inst, inst, block) {
233       ip--;
234 
235       if ((inst->opcode != BRW_OPCODE_AND &&
236            inst->opcode != BRW_OPCODE_CMP &&
237            inst->opcode != BRW_OPCODE_MOV &&
238            inst->opcode != BRW_OPCODE_NOT) ||
239           inst->predicate != BRW_PREDICATE_NONE ||
240           !inst->dst.is_null() ||
241           (inst->src[0].file != VGRF && inst->src[0].file != ATTR &&
242            inst->src[0].file != UNIFORM))
243          continue;
244 
245       /* An ABS source modifier can only be handled when processing a compare
246        * with a value other than zero.
247        */
248       if (inst->src[0].abs &&
249           (inst->opcode != BRW_OPCODE_CMP || inst->src[1].is_zero()))
250          continue;
251 
252       /* Only an AND.NZ can be propagated.  Many AND.Z instructions are
253        * generated (for ir_unop_not in fs_visitor::emit_bool_to_cond_code).
254        * Propagating those would require inverting the condition on the CMP.
255        * This changes both the flag value and the register destination of the
256        * CMP.  That result may be used elsewhere, so we can't change its value
257        * on a whim.
258        */
259       if (inst->opcode == BRW_OPCODE_AND &&
260           !(inst->src[1].is_one() &&
261             inst->conditional_mod == BRW_CONDITIONAL_NZ &&
262             !inst->src[0].negate))
263          continue;
264 
265       /* A CMP with a second source of zero can match with anything.  A CMP
266        * with a second source that is not zero can only match with an ADD
267        * instruction.
268        *
269        * Only apply this optimization to float-point sources.  It can fail for
270        * integers.  For inputs a = 0x80000000, b = 4, int(0x80000000) < 4, but
271        * int(0x80000000) - 4 overflows and results in 0x7ffffffc.  that's not
272        * less than zero, so the flags get set differently than for (a < b).
273        */
274       if (inst->opcode == BRW_OPCODE_CMP && !inst->src[1].is_zero()) {
275          if (brw_type_is_float(inst->src[0].type) &&
276              cmod_propagate_cmp_to_add(devinfo, block, inst))
277             progress = true;
278 
279          continue;
280       }
281 
282       if (inst->opcode == BRW_OPCODE_NOT) {
283          progress = cmod_propagate_not(devinfo, block, inst) || progress;
284          continue;
285       }
286 
287       bool read_flag = false;
288       const unsigned flags_written = inst->flags_written(devinfo);
289       foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst) {
290          if (regions_overlap(scan_inst->dst, scan_inst->size_written,
291                              inst->src[0], inst->size_read(0))) {
292             /* If the scan instruction writes a different flag register than
293              * the instruction we're trying to propagate from, bail.
294              *
295              * FINISHME: The second part of the condition may be too strong.
296              * Perhaps (scan_inst->flags_written() & flags_written) !=
297              * flags_written?
298              */
299             if (scan_inst->flags_written(devinfo) != 0 &&
300                 scan_inst->flags_written(devinfo) != flags_written)
301                break;
302 
303             if (scan_inst->predicate ||
304                 !scan_inst->dst.is_contiguous() ||
305                 scan_inst->dst.offset != inst->src[0].offset ||
306                 scan_inst->exec_size != inst->exec_size)
307                break;
308 
309             /* If the write mask is different we can't propagate. */
310             if (scan_inst->force_writemask_all != inst->force_writemask_all)
311                break;
312 
313             /* CMP's result is the same regardless of dest type. */
314             if (inst->conditional_mod == BRW_CONDITIONAL_NZ &&
315                 scan_inst->opcode == BRW_OPCODE_CMP &&
316                 brw_type_is_int(inst->dst.type)) {
317                inst->remove(block, true);
318                progress = true;
319                break;
320             }
321 
322             /* If the AND wasn't handled by the previous case, it isn't safe
323              * to remove it.
324              */
325             if (inst->opcode == BRW_OPCODE_AND)
326                break;
327 
328             if (inst->opcode == BRW_OPCODE_MOV) {
329                if (brw_type_is_float(scan_inst->dst.type)) {
330                   /* If the destination type of scan_inst is floating-point,
331                    * then:
332                    *
333                    * - The source of the MOV instruction must be the same
334                    *   type.
335                    *
336                    * - The destination of the MOV instruction must be float
337                    *   point with a size at least as large as the destination
338                    *   of inst.  Size-reducing f2f conversions could cause
339                    *   non-zero values to become zero, etc.
340                    */
341                   if (scan_inst->dst.type != inst->src[0].type)
342                      break;
343 
344                   if (!brw_type_is_float(inst->dst.type))
345                      break;
346 
347                   if (brw_type_size_bits(scan_inst->dst.type) >
348                       brw_type_size_bits(inst->dst.type))
349                      break;
350                } else {
351                   /* If the destination type of scan_inst is integer, then:
352                    *
353                    * - The source of the MOV instruction must be integer with
354                    *   the same size.
355                    *
356                    * - If the conditional modifier is neither Z nor NZ, then
357                    *   the source of the MOV instruction has to have same
358                    *   signedness.
359                    *
360                    * - If the conditional modifier is Z or NZ, then the
361                    *   destination type of inst must either be floating point
362                    *   (of any size) or integer with a size at least as large
363                    *   as the destination of inst.
364                    *
365                    * - If the conditional modifier is neither Z nor NZ, then the
366                    *   destination type of inst must either be floating point
367                    *   (of any size) or integer with a size at least as large
368                    *   as the destination of inst and the same signedness.
369                    */
370                   if (!brw_type_is_int(inst->src[0].type) ||
371                       brw_type_size_bits(scan_inst->dst.type) != brw_type_size_bits(inst->src[0].type))
372                      break;
373 
374                   if (inst->conditional_mod != BRW_CONDITIONAL_Z &&
375                       inst->conditional_mod != BRW_CONDITIONAL_NZ &&
376                       brw_type_is_uint(inst->src[0].type) !=
377                       brw_type_is_uint(scan_inst->dst.type))
378                      break;
379 
380                   if (brw_type_is_int(inst->dst.type)) {
381                      if (brw_type_size_bits(inst->dst.type) <
382                          brw_type_size_bits(scan_inst->dst.type))
383                         break;
384 
385                      if (inst->conditional_mod != BRW_CONDITIONAL_Z &&
386                          inst->conditional_mod != BRW_CONDITIONAL_NZ &&
387                          brw_type_is_uint(inst->dst.type) !=
388                          brw_type_is_uint(scan_inst->dst.type))
389                         break;
390                   }
391                }
392             } else {
393                /* Not safe to use inequality operators if the types are
394                 * different.
395                 */
396                if (scan_inst->dst.type != inst->src[0].type &&
397                    inst->conditional_mod != BRW_CONDITIONAL_Z &&
398                    inst->conditional_mod != BRW_CONDITIONAL_NZ)
399                   break;
400 
401                /* Comparisons operate differently for ints and floats */
402                if (scan_inst->dst.type != inst->dst.type) {
403                   /* Comparison result may be altered if the bit-size changes
404                    * since that affects range, denorms, etc
405                    */
406                   if (brw_type_size_bits(scan_inst->dst.type) !=
407                       brw_type_size_bits(inst->dst.type))
408                      break;
409 
410                   if (brw_type_is_float(scan_inst->dst.type) !=
411                       brw_type_is_float(inst->dst.type))
412                      break;
413                }
414             }
415 
416             /* Knowing following:
417              * - CMP writes to flag register the result of
418              *   applying cmod to the `src0 - src1`.
419              *   After that it stores the same value to dst.
420              *   Other instructions first store their result to
421              *   dst, and then store cmod(dst) to the flag
422              *   register.
423              * - inst is either CMP or MOV
424              * - inst->dst is null
425              * - inst->src[0] overlaps with scan_inst->dst
426              * - inst->src[1] is zero
427              * - scan_inst wrote to a flag register
428              *
429              * There can be three possible paths:
430              *
431              * - scan_inst is CMP:
432              *
433              *   Considering that src0 is either 0x0 (false),
434              *   or 0xffffffff (true), and src1 is 0x0:
435              *
436              *   - If inst's cmod is NZ, we can always remove
437              *     scan_inst: NZ is invariant for false and true. This
438              *     holds even if src0 is NaN: .nz is the only cmod,
439              *     that returns true for NaN.
440              *
441              *   - .g is invariant if src0 has a UD type
442              *
443              *   - .l is invariant if src0 has a D type
444              *
445              * - scan_inst and inst have the same cmod:
446              *
447              *   If scan_inst is anything than CMP, it already
448              *   wrote the appropriate value to the flag register.
449              *
450              * - else:
451              *
452              *   We can change cmod of scan_inst to that of inst,
453              *   and remove inst. It is valid as long as we make
454              *   sure that no instruction uses the flag register
455              *   between scan_inst and inst.
456              */
457             if (!inst->src[0].negate &&
458                 scan_inst->flags_written(devinfo)) {
459                if (scan_inst->opcode == BRW_OPCODE_CMP) {
460                   if ((inst->conditional_mod == BRW_CONDITIONAL_NZ) ||
461                       (inst->conditional_mod == BRW_CONDITIONAL_G &&
462                        inst->src[0].type == BRW_TYPE_UD) ||
463                       (inst->conditional_mod == BRW_CONDITIONAL_L &&
464                        inst->src[0].type == BRW_TYPE_D)) {
465                      inst->remove(block, true);
466                      progress = true;
467                      break;
468                   }
469                } else if (scan_inst->conditional_mod == inst->conditional_mod) {
470                   /* sel.cond will not write the flags. */
471                   assert(scan_inst->opcode != BRW_OPCODE_SEL);
472                   inst->remove(block, true);
473                   progress = true;
474                   break;
475                } else if (!read_flag && scan_inst->can_do_cmod()) {
476                   scan_inst->conditional_mod = inst->conditional_mod;
477                   scan_inst->flag_subreg = inst->flag_subreg;
478                   inst->remove(block, true);
479                   progress = true;
480                   break;
481                }
482             }
483 
484             /* The conditional mod of the CMP/CMPN instructions behaves
485              * specially because the flag output is not calculated from the
486              * result of the instruction, but the other way around, which
487              * means that even if the condmod to propagate and the condmod
488              * from the CMP instruction are the same they will in general give
489              * different results because they are evaluated based on different
490              * inputs.
491              */
492             if (scan_inst->opcode == BRW_OPCODE_CMP ||
493                 scan_inst->opcode == BRW_OPCODE_CMPN)
494                break;
495 
496             /* From the Sky Lake PRM, Vol 2a, "Multiply":
497              *
498              *    "When multiplying integer data types, if one of the sources
499              *     is a DW, the resulting full precision data is stored in
500              *     the accumulator. However, if the destination data type is
501              *     either W or DW, the low bits of the result are written to
502              *     the destination register and the remaining high bits are
503              *     discarded. This results in undefined Overflow and Sign
504              *     flags. Therefore, conditional modifiers and saturation
505              *     (.sat) cannot be used in this case."
506              *
507              * We just disallow cmod propagation on all integer multiplies.
508              */
509             if (!brw_type_is_float(scan_inst->dst.type) &&
510                 scan_inst->opcode == BRW_OPCODE_MUL)
511                break;
512 
513             enum brw_conditional_mod cond =
514                inst->src[0].negate ? brw_swap_cmod(inst->conditional_mod)
515                                    : inst->conditional_mod;
516 
517             /* From the Kaby Lake PRM Vol. 7 "Assigning Conditional Flags":
518              *
519              *    * Note that the [post condition signal] bits generated at
520              *      the output of a compute are before the .sat.
521              *
522              * Paragraph about post_zero does not mention saturation, but
523              * testing it on actual GPUs shows that conditional modifiers are
524              * applied after saturation.
525              *
526              *    * post_zero bit: This bit reflects whether the final
527              *      result is zero after all the clamping, normalizing,
528              *      or format conversion logic.
529              *
530              * For this reason, no additional restrictions are necessary on
531              * instructions with saturate.
532              */
533 
534             /* Otherwise, try propagating the conditional. */
535             if (scan_inst->can_do_cmod() &&
536                 ((!read_flag && scan_inst->conditional_mod == BRW_CONDITIONAL_NONE) ||
537                  scan_inst->conditional_mod == cond)) {
538                scan_inst->conditional_mod = cond;
539                scan_inst->flag_subreg = inst->flag_subreg;
540                inst->remove(block, true);
541                progress = true;
542             }
543             break;
544          }
545 
546          if ((scan_inst->flags_written(devinfo) & flags_written) != 0)
547             break;
548 
549          read_flag = read_flag ||
550                      (scan_inst->flags_read(devinfo) & flags_written) != 0;
551       }
552    }
553 
554    /* There is progress if and only if instructions were removed. */
555    assert(progress == (block->end_ip_delta != 0));
556 
557    return progress;
558 }
559 
560 bool
brw_fs_opt_cmod_propagation(fs_visitor & s)561 brw_fs_opt_cmod_propagation(fs_visitor &s)
562 {
563    bool progress = false;
564 
565    foreach_block_reverse(block, s.cfg) {
566       progress = opt_cmod_propagation_local(s.devinfo, block) || progress;
567    }
568 
569    if (progress) {
570       s.cfg->adjust_block_ips();
571 
572       s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
573    }
574 
575    return progress;
576 }
577