/*
 * Copyright © 2010 Intel Corporation
 * SPDX-License-Identifier: MIT
 */

#include "brw_fs.h"
#include "brw_fs_builder.h"

using namespace brw;

/* Wa_14015360517
 *
 * The first instruction of any kernel should have a non-zero emask.
 * Make sure this happens by introducing a dummy mov instruction.
 */
bool
brw_fs_workaround_emit_dummy_mov_instruction(fs_visitor &s)
{
   if (!intel_needs_workaround(s.devinfo, 14015360517))
      return false;

   fs_inst *first_inst =
      s.cfg->first_block()->start();

   /* We can skip the WA if the first instruction is marked with
    * force_writemask_all or its exec_size equals the dispatch width.
    */
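   /* (Presumably: force_writemask_all yields an all-ones execution mask,
    * and a full-dispatch-width instruction uses the whole dispatch mask
    * rather than a channel slice that might be entirely disabled.)
    */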
   if (first_inst->force_writemask_all ||
       first_inst->exec_size == s.dispatch_width)
      return false;

   /* Insert dummy mov as first instruction. */
   const fs_builder ubld =
      fs_builder(&s, s.cfg->first_block(), (fs_inst *)first_inst).exec_all().group(8, 0);
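   /* A MOV to the null register has no architectural side effects; the
    * exec_all() builder just gives the kernel's first instruction an
    * all-ones execution mask.
    */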
   ubld.MOV(ubld.null_reg_ud(), brw_imm_ud(0u));

   s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
   return true;
}

static bool
needs_dummy_fence(const intel_device_info *devinfo, fs_inst *inst)
{
   /* This workaround is about making sure that any instruction writing
    * through UGM has completed before we hit EOT.
    */
   if (inst->sfid != GFX12_SFID_UGM)
      return false;

   /* Any UGM non-scratch-surface store message (not including atomics)
    * where the L1 cache override is NOT among {WB, WS, WT} needs the
    * fence.
    */
   enum lsc_opcode opcode = lsc_msg_desc_opcode(devinfo, inst->desc);
   if (lsc_opcode_is_store(opcode)) {
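      /* The cases below are exactly the cache modes whose L1 override is
       * WB, WS or WT (or that follow the MOCS settings), so they are
       * exempt from the workaround.
       */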
      switch (lsc_msg_desc_cache_ctrl(devinfo, inst->desc)) {
      case LSC_CACHE_STORE_L1STATE_L3MOCS:
      case LSC_CACHE_STORE_L1WB_L3WB:
      case LSC_CACHE_STORE_L1S_L3UC:
      case LSC_CACHE_STORE_L1S_L3WB:
      case LSC_CACHE_STORE_L1WT_L3UC:
      case LSC_CACHE_STORE_L1WT_L3WB:
         return false;

      default:
         return true;
      }
   }

   /* Any UGM atomic message WITHOUT a return value. */
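   /* (With no destination there is no register dependency that would
    * otherwise force the atomic to complete before the EOT message.)
    */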
   if (lsc_opcode_is_atomic(opcode) && inst->dst.file == BAD_FILE)
      return true;

   return false;
}

/* Wa_22013689345
 *
 * We need to emit a UGM fence message before EOT if the shader has any UGM
 * write or atomic message.
 *
 * TODO/FINISHME: According to Curro we could avoid the fence in some cases.
 *                We probably need a better criterion in needs_dummy_fence().
 */
bool
brw_fs_workaround_memory_fence_before_eot(fs_visitor &s)
{
   bool progress = false;
   bool has_ugm_write_or_atomic = false;

   if (!intel_needs_workaround(s.devinfo, 22013689345))
      return false;

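   /* Scan forward, remembering whether any message requiring the fence
    * precedes the EOT send; only then is the dummy fence emitted.
    */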
   foreach_block_and_inst_safe (block, fs_inst, inst, s.cfg) {
      if (!inst->eot) {
         if (needs_dummy_fence(s.devinfo, inst))
            has_ugm_write_or_atomic = true;
         continue;
      }

      if (!has_ugm_write_or_atomic)
         break;

      const fs_builder ibld(&s, block, inst);
      const fs_builder ubld = ibld.exec_all().group(1, 0);

      brw_reg dst = ubld.vgrf(BRW_TYPE_UD);
      fs_inst *dummy_fence = ubld.emit(SHADER_OPCODE_MEMORY_FENCE,
                                       dst, brw_vec8_grf(0, 0),
                                       /* commit enable */ brw_imm_ud(1),
                                       /* bti */ brw_imm_ud(0));
      dummy_fence->sfid = GFX12_SFID_UGM;
      dummy_fence->desc = lsc_fence_msg_desc(s.devinfo, LSC_FENCE_TILE,
                                             LSC_FLUSH_TYPE_NONE_6, false);
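      /* The scheduling fence sources the fence's destination, so the
       * dependency on the fence completion has to be resolved before the
       * EOT send is issued.
       */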
      ubld.emit(FS_OPCODE_SCHEDULING_FENCE, ubld.null_reg_ud(), dst);
      progress = true;
      /* TODO: remove this break if we ever have a shader with multiple EOTs. */
      break;
   }

   if (progress) {
      s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS |
                            DEPENDENCY_VARIABLES);
   }

   return progress;
}

/**
 * Find the first instruction in the program that might start a region of
 * divergent control flow due to a HALT jump.  There is no
 * find_halt_control_flow_region_end(); the region of divergence extends
 * until the only SHADER_OPCODE_HALT_TARGET in the program.
 */
static const fs_inst *
find_halt_control_flow_region_start(const fs_visitor *v)
{
   foreach_block_and_inst(block, fs_inst, inst, v->cfg) {
      if (inst->opcode == BRW_OPCODE_HALT ||
          inst->opcode == SHADER_OPCODE_HALT_TARGET)
         return inst;
   }

   return NULL;
}

/**
 * Work around the Gfx12 hardware bug filed as Wa_1407528679.  EU fusion
 * can cause a BB to be executed with all channels disabled, which will lead
 * to the execution of any NoMask instructions in it, even though any
 * execution-masked instructions will be correctly shot down.  This may break
 * assumptions of some NoMask SEND messages whose descriptor depends on data
 * generated by live invocations of the shader.
 *
 * This avoids the problem by predicating certain instructions on an ANY
 * horizontal predicate that makes sure that their execution is omitted when
 * all channels of the program are disabled.
 */
bool
brw_fs_workaround_nomask_control_flow(fs_visitor &s)
{
   if (s.devinfo->ver != 12)
      return false;

   const brw_predicate pred = s.dispatch_width > 16 ? BRW_PREDICATE_ALIGN1_ANY32H :
                              s.dispatch_width > 8 ? BRW_PREDICATE_ALIGN1_ANY16H :
                              BRW_PREDICATE_ALIGN1_ANY8H;
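   /* The ANY*H predicate spans the full dispatch width, so a predicated
    * instruction executes only if at least one channel of the program is
    * actually enabled.
    */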
   const fs_inst *halt_start = find_halt_control_flow_region_start(&s);
   unsigned depth = 0;
   bool progress = false;

   const fs_live_variables &live_vars = s.live_analysis.require();

   /* Scan the program backwards in order to be able to easily determine
    * whether the flag register is live at any point.
    */
   foreach_block_reverse_safe(block, s.cfg) {
      BITSET_WORD flag_liveout = live_vars.block_data[block->num]
                                          .flag_liveout[0];
      STATIC_ASSERT(ARRAY_SIZE(live_vars.block_data[0].flag_liveout) == 1);

      foreach_inst_in_block_reverse_safe(fs_inst, inst, block) {
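         /* An unpredicated instruction of at least full-subregister
          * execution size definitely overwrites the flag bits it writes,
          * so those bits are dead above this point of the backwards scan.
          */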
         if (!inst->predicate && inst->exec_size >= 8)
            flag_liveout &= ~inst->flags_written(s.devinfo);

         switch (inst->opcode) {
         case BRW_OPCODE_DO:
         case BRW_OPCODE_IF:
            /* Note that this doesn't handle BRW_OPCODE_HALT since only
             * the first one in the program closes the region of divergent
             * control flow due to any HALT instructions -- instead this is
             * handled with the halt_start check below.
             */
            depth--;
            break;

         case BRW_OPCODE_WHILE:
         case BRW_OPCODE_ENDIF:
         case SHADER_OPCODE_HALT_TARGET:
            depth++;
            break;

         default:
            /* Note that the vast majority of NoMask SEND instructions in the
             * program are harmless while executed in a block with all
             * channels disabled, since any instructions with side effects we
             * could hit here should be execution-masked.
             *
             * The main concern is NoMask SEND instructions where the message
             * descriptor or header depends on data generated by live
             * invocations of the shader (RESINFO and
             * FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD with a dynamically
             * computed surface index seem to be the only examples right now
             * where this could easily lead to GPU hangs).  Unfortunately we
             * have no straightforward way to detect that currently, so just
             * predicate any NoMask SEND instructions we find under control
             * flow.
             *
             * If this proves to have a measurable performance impact it can
             * be easily extended with a whitelist of messages we know we can
             * safely omit the predication for.
             */
            if (depth && inst->force_writemask_all &&
                is_send(inst) && !inst->predicate &&
                !inst->has_no_mask_send_params) {
               /* We need to load the execution mask into the flag register by
                * using a builder with channel group matching the whole shader
                * (rather than the default which is derived from the original
                * instruction), in order to avoid getting a right-shifted
                * value.
                */
               const fs_builder ubld = fs_builder(&s, block, inst)
                                       .exec_all().group(s.dispatch_width, 0);
               const brw_reg flag = retype(brw_flag_reg(0, 0),
                                           BRW_TYPE_UD);

               /* Due to the lack of flag register allocation we need to save
                * and restore the flag register if it's live.
                */
               const bool save_flag = flag_liveout &
                                      brw_fs_flag_mask(flag, s.dispatch_width / 8);
               const brw_reg tmp = ubld.group(8, 0).vgrf(flag.type);

               if (save_flag) {
                  ubld.group(8, 0).UNDEF(tmp);
                  ubld.group(1, 0).MOV(tmp, flag);
               }

               ubld.emit(FS_OPCODE_LOAD_LIVE_CHANNELS);

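               /* LOAD_LIVE_CHANNELS leaves the execution mask in f0.0;
                * predicate the SEND on it.  The predicate is trivial in
                * that it cannot change the behavior of a correctly
                * dispatched program -- it only guards against the fused-EU
                * anomaly.
                */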
               set_predicate(pred, inst);
               inst->flag_subreg = 0;
               inst->predicate_trivial = true;

               if (save_flag)
                  ubld.group(1, 0).at(block, inst->next).MOV(flag, tmp);

               progress = true;
            }
            break;
         }

         if (inst == halt_start)
            depth--;

         flag_liveout |= inst->flags_read(s.devinfo);
      }
   }

   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);

   return progress;
}