/*
 * Copyright © 2010 Intel Corporation
 * SPDX-License-Identifier: MIT
 */

#include "brw_fs.h"
#include "brw_fs_builder.h"

using namespace brw;
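
/* Each pass below returns true when it modified the program, following the
 * convention of the other FS passes.  A caller is expected to look roughly
 * like this (a sketch only, assuming an fs_visitor "s"; the actual call
 * sites live elsewhere in the compiler and vary across Mesa versions):
 *
 *    bool progress = false;
 *    progress |= brw_fs_workaround_emit_dummy_mov_instruction(s);
 *    progress |= brw_fs_workaround_memory_fence_before_eot(s);
 *    progress |= brw_fs_workaround_nomask_control_flow(s);
 */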

/* Wa_14015360517
 *
 * The first instruction of any kernel should have a non-zero emask.
 * Make sure this happens by introducing a dummy mov instruction.
 */
bool
brw_fs_workaround_emit_dummy_mov_instruction(fs_visitor &s)
{
   if (!intel_needs_workaround(s.devinfo, 14015360517))
      return false;

   fs_inst *first_inst =
      s.cfg->first_block()->start();

   /* We can skip the workaround if the first instruction is marked with
    * force_writemask_all or if its exec_size equals the dispatch width.
    */
   if (first_inst->force_writemask_all ||
       first_inst->exec_size == s.dispatch_width)
      return false;

   /* Insert a dummy mov as the first instruction. */
   const fs_builder ubld =
      fs_builder(&s, s.cfg->first_block(), (fs_inst *)first_inst).exec_all().group(8, 0);
   ubld.MOV(ubld.null_reg_ud(), brw_imm_ud(0u));
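   /* The instruction emitted above should look roughly like the following
    * in disassembly (illustrative only; exact syntax varies by generation
    * and disassembler):
    *
    *    mov(8)   null<1>:UD   0x00000000:UD   { align1 WE_all 1N }
    */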

   s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
   return true;
}

static bool
needs_dummy_fence(const intel_device_info *devinfo, fs_inst *inst)
{
   /* This workaround is about making sure that any instruction writing
    * through UGM has completed before we hit EOT.
    */
   if (inst->sfid != GFX12_SFID_UGM)
      return false;

   /* Any UGM store message (not including atomics) to a non-scratch
    * surface where the L1 cache override is NOT one of {WB, WS, WT}
    * needs the fence.
    */
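   /* The cases listed below are exactly the L1 overrides from that set
    * ({WB, WS, WT}, plus following MOCS state); any other cache control
    * (e.g. an L1-uncached override) falls through to the default and
    * requires the dummy fence.
    */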
   enum lsc_opcode opcode = lsc_msg_desc_opcode(devinfo, inst->desc);
   if (lsc_opcode_is_store(opcode)) {
      switch (lsc_msg_desc_cache_ctrl(devinfo, inst->desc)) {
      case LSC_CACHE_STORE_L1STATE_L3MOCS:
      case LSC_CACHE_STORE_L1WB_L3WB:
      case LSC_CACHE_STORE_L1S_L3UC:
      case LSC_CACHE_STORE_L1S_L3WB:
      case LSC_CACHE_STORE_L1WT_L3UC:
      case LSC_CACHE_STORE_L1WT_L3WB:
         return false;

      default:
         return true;
      }
   }

   /* Any UGM atomic message without a return value needs the fence. */
   if (lsc_opcode_is_atomic(opcode) && inst->dst.file == BAD_FILE)
      return true;

   return false;
}

/* Wa_22013689345
 *
 * We need to emit a UGM fence message before EOT if the shader has any UGM
 * write or atomic message.
 *
 * TODO/FINISHME: According to Curro we could avoid the fence in some cases.
 * We probably need a better criterion in needs_dummy_fence().
 */
bool
brw_fs_workaround_memory_fence_before_eot(fs_visitor &s)
{
   bool progress = false;
   bool has_ugm_write_or_atomic = false;

   if (!intel_needs_workaround(s.devinfo, 22013689345))
      return false;

   foreach_block_and_inst_safe (block, fs_inst, inst, s.cfg) {
      if (!inst->eot) {
         if (needs_dummy_fence(s.devinfo, inst))
            has_ugm_write_or_atomic = true;
         continue;
      }

      if (!has_ugm_write_or_atomic)
         break;

      const fs_builder ibld(&s, block, inst);
      const fs_builder ubld = ibld.exec_all().group(1, 0);

      brw_reg dst = ubld.vgrf(BRW_TYPE_UD);
      fs_inst *dummy_fence = ubld.emit(SHADER_OPCODE_MEMORY_FENCE,
                                       dst, brw_vec8_grf(0, 0),
                                       /* commit enable */ brw_imm_ud(1),
                                       /* bti */ brw_imm_ud(0));
      dummy_fence->sfid = GFX12_SFID_UGM;
      dummy_fence->desc = lsc_fence_msg_desc(s.devinfo, LSC_FENCE_TILE,
                                             LSC_FLUSH_TYPE_NONE_6, false);
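      /* The fence above is tile-scoped with no flush type: per the
       * workaround, waiting on its response is enough to guarantee that
       * prior UGM writes have completed before EOT.
       */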
      ubld.emit(FS_OPCODE_SCHEDULING_FENCE, ubld.null_reg_ud(), dst);
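      /* The scheduling fence is expected to generate no code of its own;
       * by reading the fence's destination register it keeps the EOT send
       * from being scheduled ahead of the fence response.
       */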
      progress = true;
      /* TODO: remove this break if we ever have a shader with multiple
       * EOT messages.
       */
      break;
   }

   if (progress) {
      s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS |
                            DEPENDENCY_VARIABLES);
   }

   return progress;
}

/**
 * Find the first instruction in the program that might start a region of
 * divergent control flow due to a HALT jump.  There is no
 * find_halt_control_flow_region_end(); the region of divergence extends
 * until the only SHADER_OPCODE_HALT_TARGET in the program.
 */
static const fs_inst *
find_halt_control_flow_region_start(const fs_visitor *v)
{
   foreach_block_and_inst(block, fs_inst, inst, v->cfg) {
      if (inst->opcode == BRW_OPCODE_HALT ||
          inst->opcode == SHADER_OPCODE_HALT_TARGET)
         return inst;
   }

   return NULL;
}

/**
 * Work around the Gfx12 hardware bug filed as Wa_1407528679.  EU fusion
 * can cause a BB to be executed with all channels disabled, which will lead
 * to the execution of any NoMask instructions in it, even though any
 * execution-masked instructions will be correctly shot down.  This may
 * break assumptions of some NoMask SEND messages whose descriptor depends
 * on data generated by live invocations of the shader.
 *
 * This pass avoids the problem by predicating certain instructions on an
 * ANY horizontal predicate that makes sure that their execution is omitted
 * when all channels of the program are disabled.
 */
bool
brw_fs_workaround_nomask_control_flow(fs_visitor &s)
{
   if (s.devinfo->ver != 12)
      return false;

   const brw_predicate pred = s.dispatch_width > 16 ? BRW_PREDICATE_ALIGN1_ANY32H :
                              s.dispatch_width > 8 ? BRW_PREDICATE_ALIGN1_ANY16H :
                              BRW_PREDICATE_ALIGN1_ANY8H;
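   /* The ANY*H predicates test whether any channel in a horizontal group of
    * 8/16/32 is enabled, so the group size is matched to the dispatch width
    * above: a predicated SEND then executes only when at least one
    * invocation of the shader is live.
    */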
   const fs_inst *halt_start = find_halt_control_flow_region_start(&s);
   unsigned depth = 0;
   bool progress = false;

   const fs_live_variables &live_vars = s.live_analysis.require();

   /* Scan the program backwards in order to be able to easily determine
    * whether the flag register is live at any point.
    */
   foreach_block_reverse_safe(block, s.cfg) {
      BITSET_WORD flag_liveout = live_vars.block_data[block->num]
                                 .flag_liveout[0];
      STATIC_ASSERT(ARRAY_SIZE(live_vars.block_data[0].flag_liveout) == 1);

      foreach_inst_in_block_reverse_safe(fs_inst, inst, block) {
         if (!inst->predicate && inst->exec_size >= 8)
            flag_liveout &= ~inst->flags_written(s.devinfo);

         switch (inst->opcode) {
         case BRW_OPCODE_DO:
         case BRW_OPCODE_IF:
            /* Note that this doesn't handle BRW_OPCODE_HALT since only
             * the first one in the program closes the region of divergent
             * control flow due to any HALT instructions; instead, this is
             * handled with the halt_start check below.
             */
            depth--;
            break;

         case BRW_OPCODE_WHILE:
         case BRW_OPCODE_ENDIF:
         case SHADER_OPCODE_HALT_TARGET:
            depth++;
            break;

         default:
            /* Note that the vast majority of NoMask SEND instructions in
             * the program are harmless while executed in a block with all
             * channels disabled, since any instructions with side effects
             * we could hit here should be execution-masked.
             *
             * The main concern is NoMask SEND instructions where the
             * message descriptor or header depends on data generated by
             * live invocations of the shader (RESINFO and
             * FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD with a dynamically
             * computed surface index seem to be the only examples right
             * now where this could easily lead to GPU hangs).
             * Unfortunately we have no straightforward way to detect that
             * currently, so just predicate any NoMask SEND instructions we
             * find under control flow.
             *
             * If this proves to have a measurable performance impact it
             * can be easily extended with a whitelist of messages we know
             * we can safely omit the predication for.
             */
            if (depth && inst->force_writemask_all &&
                is_send(inst) && !inst->predicate &&
                !inst->has_no_mask_send_params) {
               /* We need to load the execution mask into the flag register
                * by using a builder with a channel group matching the whole
                * shader (rather than the default, which is derived from the
                * original instruction), in order to avoid getting a
                * right-shifted value.
                */
               const fs_builder ubld = fs_builder(&s, block, inst)
                                       .exec_all().group(s.dispatch_width, 0);
               const brw_reg flag = retype(brw_flag_reg(0, 0),
                                           BRW_TYPE_UD);

               /* Due to the lack of flag register allocation we need to
                * save and restore the flag register if it's live.
                */
               const bool save_flag = flag_liveout &
                                      brw_fs_flag_mask(flag, s.dispatch_width / 8);
               const brw_reg tmp = ubld.group(8, 0).vgrf(flag.type);

               if (save_flag) {
                  ubld.group(8, 0).UNDEF(tmp);
                  ubld.group(1, 0).MOV(tmp, flag);
               }

               ubld.emit(FS_OPCODE_LOAD_LIVE_CHANNELS);
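               /* FS_OPCODE_LOAD_LIVE_CHANNELS is expected to leave the
                * execution mask in f0.0, which is what the ANY*H predicate
                * installed below will test.
                */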

               set_predicate(pred, inst);
               inst->flag_subreg = 0;
               inst->predicate_trivial = true;

               if (save_flag)
                  ubld.group(1, 0).at(block, inst->next).MOV(flag, tmp);

               progress = true;
            }
            break;
         }

         if (inst == halt_start)
            depth--;

         flag_liveout |= inst->flags_read(s.devinfo);
      }
   }

   if (progress)
      s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);

   return progress;
}