/* Origin: /aosp_15_r20/external/mesa3d/src/amd/compiler/tests/test_insert_nops.cpp
 * (revision 6104692788411f58d303aa86923a9ff6ecaded22)
 */
/*
 * Copyright © 2020 Valve Corporation
 *
 * SPDX-License-Identifier: MIT
 */
6 #include "common/amdgfxregs.h"
7 
8 #include "helpers.h"
9 
10 using namespace aco;
11 
12 void
create_mubuf(unsigned offset,PhysReg dst=PhysReg (256),PhysReg vaddr=PhysReg (256))13 create_mubuf(unsigned offset, PhysReg dst = PhysReg(256), PhysReg vaddr = PhysReg(256))
14 {
15    bld.mubuf(aco_opcode::buffer_load_dword, Definition(dst, v1), Operand(PhysReg(0), s4),
16              Operand(vaddr, v1), Operand::zero(), offset, true);
17 }
18 
19 void
create_mubuf_store(PhysReg src=PhysReg (256))20 create_mubuf_store(PhysReg src = PhysReg(256))
21 {
22    bld.mubuf(aco_opcode::buffer_store_dword, Operand(PhysReg(0), s4), Operand(src, v1),
23              Operand::zero(), Operand(src, v1), 0, true);
24 }
25 
26 void
create_mimg(bool nsa,unsigned addrs,unsigned instr_dwords)27 create_mimg(bool nsa, unsigned addrs, unsigned instr_dwords)
28 {
29    aco_ptr<Instruction> mimg{
30       create_instruction(aco_opcode::image_sample, Format::MIMG, 3 + addrs, 1)};
31    mimg->definitions[0] = Definition(PhysReg(256), v1);
32    mimg->operands[0] = Operand(PhysReg(0), s8);
33    mimg->operands[1] = Operand(PhysReg(0), s4);
34    mimg->operands[2] = Operand(v1);
35    for (unsigned i = 0; i < addrs; i++)
36       mimg->operands[3 + i] = Operand(PhysReg(256 + (nsa ? i * 2 : i)), v1);
37    mimg->mimg().dmask = 0x1;
38    mimg->mimg().dim = ac_image_2d;
39 
40    assert(get_mimg_nsa_dwords(mimg.get()) + 2 == instr_dwords);
41 
42    bld.insert(std::move(mimg));
43 }
44 
45 void
create_bvh()46 create_bvh()
47 {
48    aco_ptr<Instruction> instr{
49       create_instruction(aco_opcode::image_bvh64_intersect_ray, Format::MIMG, 8, 1)};
50    instr->definitions[0] = Definition(PhysReg(256), v4);
51    instr->operands[0] = Operand(PhysReg(0), s4);
52    instr->operands[1] = Operand(s4);
53    instr->operands[2] = Operand(v1);
54    instr->operands[3] = Operand(PhysReg(256 + 0), v2); /* node */
55    instr->operands[4] = Operand(PhysReg(256 + 2), v1); /* tmax */
56    instr->operands[5] = Operand(PhysReg(256 + 3), v3); /* origin */
57    instr->operands[6] = Operand(PhysReg(256 + 6), v3); /* dir */
58    instr->operands[7] = Operand(PhysReg(256 + 9), v3); /* inv dir */
59    instr->mimg().dmask = 0xf;
60    instr->mimg().unrm = true;
61    instr->mimg().r128 = true;
62    bld.insert(std::move(instr));
63 }
64 
65 BEGIN_TEST(insert_nops.nsa_to_vmem_bug)
66    if (!setup_cs(NULL, GFX10))
67       return;
68 
69    /* no nop needed because offset&6==0 */
70    //>> p_unit_test 0
71    //! v1: %0:v[0] = image_sample %0:s[0-7], %0:s[0-3],  v1: undef, %0:v[0], %0:v[2], %0:v[4], %0:v[6], %0:v[8], %0:v[10] 2d
72    //! v1: %0:v[0] = buffer_load_dword %0:s[0-3], %0:v[0], 0 offset:8 offen
73    bld.pseudo(aco_opcode::p_unit_test, Operand::zero());
74    create_mimg(true, 6, 4);
75    create_mubuf(8);
76 
77    /* nop needed */
78    //! p_unit_test 1
79    //! v1: %0:v[0] = image_sample %0:s[0-7], %0:s[0-3],  v1: undef, %0:v[0], %0:v[2], %0:v[4], %0:v[6], %0:v[8], %0:v[10] 2d
80    //! s_nop
81    //! v1: %0:v[0] = buffer_load_dword %0:s[0-3], %0:v[0], 0 offset:4 offen
82    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1u));
83    create_mimg(true, 6, 4);
84    create_mubuf(4);
85 
86    /* no nop needed because the MIMG is not NSA */
87    //! p_unit_test 2
88    //! v1: %0:v[0] = image_sample %0:s[0-7], %0:s[0-3],  v1: undef, %0:v[0], %0:v[1], %0:v[2], %0:v[3], %0:v[4], %0:v[5] 2d
89    //! v1: %0:v[0] = buffer_load_dword %0:s[0-3], %0:v[0], 0 offset:4 offen
90    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2u));
91    create_mimg(false, 6, 2);
92    create_mubuf(4);
93 
94    /* no nop needed because there's already an instruction in-between */
95    //! p_unit_test 3
96    //! v1: %0:v[0] = image_sample %0:s[0-7], %0:s[0-3],  v1: undef, %0:v[0], %0:v[2], %0:v[4], %0:v[6], %0:v[8], %0:v[10] 2d
97    //! v_nop
98    //! v1: %0:v[0] = buffer_load_dword %0:s[0-3], %0:v[0], 0 offset:4 offen
99    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3u));
100    create_mimg(true, 6, 4);
101    bld.vop1(aco_opcode::v_nop);
102    create_mubuf(4);
103 
104    /* no nop needed because the NSA instruction is under 4 dwords */
105    //! p_unit_test 4
106    //! v1: %0:v[0] = image_sample %0:s[0-7], %0:s[0-3],  v1: undef, %0:v[0], %0:v[2] 2d
107    //! v1: %0:v[0] = buffer_load_dword %0:s[0-3], %0:v[0], 0 offset:4 offen
108    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4u));
109    create_mimg(true, 2, 3);
110    create_mubuf(4);
111 
112    /* NSA instruction and MUBUF/MTBUF in a different block */
113    //! p_unit_test 5
114    //! v1: %0:v[0] = image_sample %0:s[0-7], %0:s[0-3],  v1: undef, %0:v[0], %0:v[2], %0:v[4], %0:v[6], %0:v[8], %0:v[10] 2d
115    //! BB1
116    //! /* logical preds: / linear preds: BB0, / kind: uniform, */
117    //! s_nop
118    //! v1: %0:v[0] = buffer_load_dword %0:s[0-3], %0:v[0], 0 offset:4 offen
119    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5u));
120    create_mimg(true, 6, 4);
121    bld.reset(program->create_and_insert_block());
122    create_mubuf(4);
123    program->blocks[0].linear_succs.push_back(1);
124    program->blocks[1].linear_preds.push_back(0);
125 
126    finish_insert_nops_test();
127 END_TEST
128 
129 BEGIN_TEST(insert_nops.writelane_to_nsa_bug)
130    if (!setup_cs(NULL, GFX10))
131       return;
132 
133    /* nop needed */
134    //>> p_unit_test 0
135    //! v1: %0:v[255] = v_writelane_b32_e64 0, 0, %0:v[255]
136    //! s_nop
137    //! v1: %0:v[0] = image_sample %0:s[0-7], %0:s[0-3],  v1: undef, %0:v[0], %0:v[2] 2d
138    bld.pseudo(aco_opcode::p_unit_test, Operand::zero());
139    bld.writelane(Definition(PhysReg(511), v1), Operand::zero(), Operand::zero(),
140                  Operand(PhysReg(511), v1));
141    create_mimg(true, 2, 3);
142 
143    /* no nop needed because the MIMG is not NSA */
144    //! p_unit_test 1
145    //! v1: %0:v[255] = v_writelane_b32_e64 0, 0, %0:v[255]
146    //! v1: %0:v[0] = image_sample %0:s[0-7], %0:s[0-3],  v1: undef, %0:v[0], %0:v[1] 2d
147    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1u));
148    bld.writelane(Definition(PhysReg(511), v1), Operand::zero(), Operand::zero(),
149                  Operand(PhysReg(511), v1));
150    create_mimg(false, 2, 2);
151 
152    /* no nop needed because there's already an instruction in-between */
153    //! p_unit_test 2
154    //! v1: %0:v[255] = v_writelane_b32_e64 0, 0, %0:v[255]
155    //! v_nop
156    //! v1: %0:v[0] = image_sample %0:s[0-7], %0:s[0-3],  v1: undef, %0:v[0], %0:v[2] 2d
157    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2u));
158    bld.writelane(Definition(PhysReg(511), v1), Operand::zero(), Operand::zero(),
159                  Operand(PhysReg(511), v1));
160    bld.vop1(aco_opcode::v_nop);
161    create_mimg(true, 2, 3);
162 
163    /* writelane and NSA instruction in different blocks */
164    //! p_unit_test 3
165    //! v1: %0:v[255] = v_writelane_b32_e64 0, 0, %0:v[255]
166    //! BB1
167    //! /* logical preds: / linear preds: BB0, / kind: uniform, */
168    //! s_nop
169    //! v1: %0:v[0] = image_sample %0:s[0-7], %0:s[0-3],  v1: undef, %0:v[0], %0:v[2] 2d
170    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3u));
171    bld.writelane(Definition(PhysReg(511), v1), Operand::zero(), Operand::zero(),
172                  Operand(PhysReg(511), v1));
173    bld.reset(program->create_and_insert_block());
174    create_mimg(true, 2, 3);
175    program->blocks[0].linear_succs.push_back(1);
176    program->blocks[1].linear_preds.push_back(0);
177 
178    finish_insert_nops_test();
179 END_TEST
180 
181 BEGIN_TEST(insert_nops.vmem_to_scalar_write)
182    if (!setup_cs(NULL, GFX10))
183       return;
184 
185    /* WaR: VMEM load */
186    //>> p_unit_test 0
187    //! v1: %0:v[0] = buffer_load_dword %0:s[0-3], %0:v[0], 0 offen
188    //! s_waitcnt_depctr vm_vsrc(0)
189    //! s1: %0:s[0] = s_mov_b32 0
190    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0));
191    create_mubuf(0);
192    bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(0), s1), Operand::zero());
193 
194    //! p_unit_test 1
195    //! v1: %0:v[0] = buffer_load_dword %0:s[0-3], %0:v[0], 0 offen
196    //! s_waitcnt_depctr vm_vsrc(0)
197    //! s2: %0:exec = s_mov_b64 -1
198    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1));
199    create_mubuf(0);
200    bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand::c64(-1));
201 
202    /* no hazard: VMEM load */
203    //! p_unit_test 2
204    //! v1: %0:v[0] = buffer_load_dword %0:s[0-3], %0:v[0], 0 offen
205    //! s1: %0:s[4] = s_mov_b32 0
206    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2));
207    create_mubuf(0);
208    bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(4), s1), Operand::zero());
209 
210    /* no hazard: VMEM load with VALU in-between */
211    //! p_unit_test 3
212    //! v1: %0:v[0] = buffer_load_dword %0:s[0-3], %0:v[0], 0 offen
213    //! v_nop
214    //! s1: %0:s[0] = s_mov_b32 0
215    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3));
216    create_mubuf(0);
217    bld.vop1(aco_opcode::v_nop);
218    bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(0), s1), Operand::zero());
219 
220    /* WaR: LDS */
221    //! p_unit_test 4
222    //! v1: %0:v[0] = ds_read_b32 %0:v[0], %0:m0
223    //! s_waitcnt_depctr vm_vsrc(0)
224    //! s1: %0:m0 = s_mov_b32 0
225    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4));
226    bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(256), v1), Operand(PhysReg(256), v1),
227           Operand(m0, s1));
228    bld.sop1(aco_opcode::s_mov_b32, Definition(m0, s1), Operand::zero());
229 
230    //! p_unit_test 5
231    //! v1: %0:v[0] = ds_read_b32 %0:v[0], %0:m0
232    //! s_waitcnt_depctr vm_vsrc(0)
233    //! s2: %0:exec = s_mov_b64 -1
234    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5));
235    bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(256), v1), Operand(PhysReg(256), v1),
236           Operand(m0, s1));
237    bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand::c64(-1));
238 
239    /* no hazard: LDS */
240    //! p_unit_test 6
241    //! v1: %0:v[0] = ds_read_b32 %0:v[0], %0:m0
242    //! s1: %0:s[0] = s_mov_b32 0
243    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(6));
244    bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(256), v1), Operand(PhysReg(256), v1),
245           Operand(m0, s1));
246    bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(0), s1), Operand::zero());
247 
248    /* no hazard: LDS with VALU in-between */
249    //! p_unit_test 7
250    //! v1: %0:v[0] = ds_read_b32 %0:v[0], %0:m0
251    //! v_nop
252    //! s1: %0:m0 = s_mov_b32 0
253    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(7));
254    bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(256), v1), Operand(PhysReg(256), v1),
255           Operand(m0, s1));
256    bld.vop1(aco_opcode::v_nop);
257    bld.sop1(aco_opcode::s_mov_b32, Definition(m0, s1), Operand::zero());
258 
259    /* no hazard: VMEM/LDS with the correct waitcnt in-between */
260    //! p_unit_test 8
261    //! v1: %0:v[0] = buffer_load_dword %0:s[0-3], %0:v[0], 0 offen
262    //! s_waitcnt vmcnt(0)
263    //! s1: %0:s[0] = s_mov_b32 0
264    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(8));
265    create_mubuf(0);
266    bld.sopp(aco_opcode::s_waitcnt, 0x3f70);
267    bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(0), s1), Operand::zero());
268 
269    //! p_unit_test 9
270    //! buffer_store_dword %0:s[0-3], %0:v[0], 0, %0:v[0] offen
271    //! s_waitcnt_vscnt %0:null imm:0
272    //! s1: %0:s[0] = s_mov_b32 0
273    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(9));
274    create_mubuf_store();
275    bld.sopk(aco_opcode::s_waitcnt_vscnt, Operand(sgpr_null, s1), 0);
276    bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(0), s1), Operand::zero());
277 
278    //! p_unit_test 10
279    //! v1: %0:v[0] = ds_read_b32 %0:v[0], %0:m0
280    //! s_waitcnt lgkmcnt(0)
281    //! s1: %0:m0 = s_mov_b32 0
282    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(10));
283    bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(256), v1), Operand(PhysReg(256), v1),
284           Operand(m0, s1));
285    bld.sopp(aco_opcode::s_waitcnt, 0xc07f);
286    bld.sop1(aco_opcode::s_mov_b32, Definition(m0, s1), Operand::zero());
287 
288    /* VMEM/LDS with the wrong waitcnt in-between */
289    //! p_unit_test 11
290    //! v1: %0:v[0] = buffer_load_dword %0:s[0-3], %0:v[0], 0 offen
291    //! s_waitcnt_vscnt %0:null imm:0
292    //! s_waitcnt_depctr vm_vsrc(0)
293    //! s1: %0:s[0] = s_mov_b32 0
294    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(11));
295    create_mubuf(0);
296    bld.sopk(aco_opcode::s_waitcnt_vscnt, Operand(sgpr_null, s1), 0);
297    bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(0), s1), Operand::zero());
298 
299    //! p_unit_test 12
300    //! buffer_store_dword %0:s[0-3], %0:v[0], 0, %0:v[0] offen
301    //! s_waitcnt lgkmcnt(0)
302    //! s_waitcnt_depctr vm_vsrc(0)
303    //! s1: %0:s[0] = s_mov_b32 0
304    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(12));
305    create_mubuf_store();
306    bld.sopp(aco_opcode::s_waitcnt, 0xc07f);
307    bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(0), s1), Operand::zero());
308 
309    //! p_unit_test 13
310    //! v1: %0:v[0] = ds_read_b32 %0:v[0], %0:m0
311    //! s_waitcnt vmcnt(0)
312    //! s_waitcnt_depctr vm_vsrc(0)
313    //! s1: %0:m0 = s_mov_b32 0
314    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(13));
315    bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(256), v1), Operand(PhysReg(256), v1),
316           Operand(m0, s1));
317    bld.sopp(aco_opcode::s_waitcnt, 0x3f70);
318    bld.sop1(aco_opcode::s_mov_b32, Definition(m0, s1), Operand::zero());
319 
320    finish_insert_nops_test();
321 END_TEST
322 
323 BEGIN_TEST(insert_nops.lds_direct_valu)
324    for (amd_gfx_level gfx : {GFX11, GFX12}) {
325       if (!setup_cs(NULL, gfx))
326          continue;
327 
328       /* WaW */
329       //>> p_unit_test 0
330       //! v1: %0:v[0] = v_mov_b32 0
331       //! v1: %0:v[0] = lds_direct_load %0:m0 wait_vdst:0
332       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0));
333       bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
334       bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
335 
336       /* WaR */
337       //! p_unit_test 1
338       //! v1: %0:v[1] = v_mov_b32 %0:v[0]
339       //! v1: %0:v[0] = lds_direct_load %0:m0 wait_vdst:0
340       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1));
341       bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1));
342       bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
343 
344       /* No hazard. */
345       //! p_unit_test 2
346       //! v1: %0:v[1] = v_mov_b32 0
347       //! v1: %0:v[0] = lds_direct_load %0:m0
348       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2));
349       bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand::zero());
350       bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
351 
352       /* multiples hazards, nearest should be considered */
353       //! p_unit_test 3
354       //! v1: %0:v[1] = v_mov_b32 %0:v[0]
355       //! v1: %0:v[0] = v_mov_b32 0
356       //! v1: %0:v[0] = lds_direct_load %0:m0 wait_vdst:0
357       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3));
358       bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1));
359       bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
360       bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
361 
362       /* independent VALU increase wait_vdst */
363       //! p_unit_test 4
364       //! v1: %0:v[0] = v_mov_b32 0
365       //! v_nop
366       //! v1: %0:v[0] = lds_direct_load %0:m0 wait_vdst:1
367       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4));
368       bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
369       bld.vop1(aco_opcode::v_nop);
370       bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
371 
372       //! p_unit_test 5
373       //! v1: %0:v[0] = v_mov_b32 0
374       //; for i in range(10): insert_pattern('v_nop')
375       //! v1: %0:v[0] = lds_direct_load %0:m0 wait_vdst:10
376       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5));
377       bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
378       for (unsigned i = 0; i < 10; i++)
379          bld.vop1(aco_opcode::v_nop);
380       bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
381 
382       //! p_unit_test 6
383       //! v1: %0:v[0] = v_mov_b32 0
384       //; for i in range(20): insert_pattern('v_nop')
385       //! v1: %0:v[0] = lds_direct_load %0:m0
386       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(6));
387       bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
388       for (unsigned i = 0; i < 20; i++)
389          bld.vop1(aco_opcode::v_nop);
390       bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
391 
392       /* transcendental requires wait_vdst=0 */
393       //! p_unit_test 7
394       //! v1: %0:v[0] = v_mov_b32 0
395       //! v_nop
396       //! v1: %0:v[1] = v_sqrt_f32 %0:v[1]
397       //! v1: %0:v[0] = lds_direct_load %0:m0 wait_vdst:0
398       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(7));
399       bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
400       bld.vop1(aco_opcode::v_nop);
401       bld.vop1(aco_opcode::v_sqrt_f32, Definition(PhysReg(257), v1), Operand(PhysReg(257), v1));
402       bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
403 
404       //! p_unit_test 8
405       //! v1: %0:v[0] = v_sqrt_f32 %0:v[0]
406       //! v_nop
407       //! v1: %0:v[0] = lds_direct_load %0:m0 wait_vdst:0
408       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(8));
409       bld.vop1(aco_opcode::v_sqrt_f32, Definition(PhysReg(256), v1), Operand(PhysReg(256), v1));
410       bld.vop1(aco_opcode::v_nop);
411       bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
412 
413       /* transcendental is fine if it's before the instruction */
414       //! p_unit_test 9
415       //! v1: %0:v[1] = v_sqrt_f32 %0:v[1]
416       //! v1: %0:v[0] = v_mov_b32 0
417       //! v_nop
418       //! v1: %0:v[0] = lds_direct_load %0:m0 wait_vdst:1
419       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(9));
420       bld.vop1(aco_opcode::v_sqrt_f32, Definition(PhysReg(257), v1), Operand(PhysReg(257), v1));
421       bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
422       bld.vop1(aco_opcode::v_nop);
423       bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
424 
425       /* non-VALU does not increase wait_vdst */
426       //! p_unit_test 10
427       //! v1: %0:v[0] = v_mov_b32 0
428       //! s1: %0:m0 = s_mov_b32 0
429       //! v1: %0:v[0] = lds_direct_load %0:m0 wait_vdst:0
430       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(10));
431       bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
432       bld.sop1(aco_opcode::s_mov_b32, Definition(m0, s1), Operand::zero());
433       bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
434 
435       /* consider instructions which wait on vdst */
436       //! p_unit_test 11
437       //! v1: %0:v[0] = v_mov_b32 0
438       //! v_nop
439       //! s_waitcnt_depctr va_vdst(0)
440       //! v1: %0:v[0] = lds_direct_load %0:m0
441       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(11));
442       bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
443       bld.vop1(aco_opcode::v_nop);
444       bld.sopp(aco_opcode::s_waitcnt_depctr, 0x0fff);
445       bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
446 
447       finish_insert_nops_test();
448    }
449 END_TEST
450 
451 BEGIN_TEST(insert_nops.lds_direct_vmem)
452    for (amd_gfx_level gfx : {GFX11, GFX12}) {
453       if (!setup_cs(NULL, gfx))
454          continue;
455 
456       /* WaR: VMEM */
457       //>> p_unit_test 0
458       //! v1: %0:v[1] = buffer_load_dword %0:s[0-3], %0:v[0], 0 offen
459       //~gfx11! s_waitcnt_depctr vm_vsrc(0)
460       //~gfx11! v1: %0:v[0] = lds_direct_load %0:m0
461       //~gfx12! v1: %0:v[0] = lds_direct_load %0:m0 wait_vsrc:0
462       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0));
463       create_mubuf(0, PhysReg(257));
464       bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
465 
466       /* WaW: VMEM */
467       //! p_unit_test 1
468       //! v1: %0:v[0] = buffer_load_dword %0:s[0-3], %0:v[1], 0 offen
469       //~gfx11! s_waitcnt_depctr vm_vsrc(0)
470       //~gfx11! v1: %0:v[0] = lds_direct_load %0:m0
471       //~gfx12! v1: %0:v[0] = lds_direct_load %0:m0 wait_vsrc:0
472       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1));
473       create_mubuf(0, PhysReg(256), PhysReg(257));
474       bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
475 
476       /* no hazard: VMEM */
477       //! p_unit_test 2
478       //! v1: %0:v[1] = buffer_load_dword %0:s[0-3], %0:v[1], 0 offen
479       //! v1: %0:v[0] = lds_direct_load %0:m0
480       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2));
481       create_mubuf(0, PhysReg(257), PhysReg(257));
482       bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
483 
484       /* no hazard: VMEM with VALU in-between */
485       //! p_unit_test 3
486       //! v1: %0:v[1] = buffer_load_dword %0:s[0-3], %0:v[0], 0 offen
487       //! v_nop
488       //! v1: %0:v[0] = lds_direct_load %0:m0
489       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3));
490       create_mubuf(0, PhysReg(257));
491       bld.vop1(aco_opcode::v_nop);
492       bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
493 
494       /* WaR: LDS */
495       //! p_unit_test 4
496       //! v1: %0:v[1] = ds_read_b32 %0:v[0]
497       //~gfx11! s_waitcnt_depctr vm_vsrc(0)
498       //~gfx11! v1: %0:v[0] = lds_direct_load %0:m0
499       //~gfx12! v1: %0:v[0] = lds_direct_load %0:m0 wait_vsrc:0
500       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4));
501       bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1));
502       bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
503 
504       /* WaW: LDS */
505       //! p_unit_test 5
506       //! v1: %0:v[0] = ds_read_b32 %0:v[1]
507       //~gfx11! s_waitcnt_depctr vm_vsrc(0)
508       //~gfx11! v1: %0:v[0] = lds_direct_load %0:m0
509       //~gfx12! v1: %0:v[0] = lds_direct_load %0:m0 wait_vsrc:0
510       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5));
511       bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(256), v1), Operand(PhysReg(257), v1));
512       bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
513 
514       /* no hazard: LDS */
515       //! p_unit_test 6
516       //! v1: %0:v[1] = ds_read_b32 %0:v[1]
517       //! v1: %0:v[0] = lds_direct_load %0:m0
518       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(6));
519       bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(257), v1), Operand(PhysReg(257), v1));
520       bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
521 
522       /* no hazard: LDS with VALU in-between */
523       //! p_unit_test 7
524       //! v1: %0:v[1] = ds_read_b32 %0:v[0]
525       //! v_nop
526       //! v1: %0:v[0] = lds_direct_load %0:m0
527       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(7));
528       bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1));
529       bld.vop1(aco_opcode::v_nop);
530       bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
531 
532       /* no hazard: VMEM/LDS with the correct waitcnt in-between */
533       //! p_unit_test 8
534       //! v1: %0:v[1] = buffer_load_dword %0:s[0-3], %0:v[0], 0 offen
535       //~gfx11! s_waitcnt vmcnt(0)
536       //~gfx12! s_wait_loadcnt imm:0
537       //! v1: %0:v[0] = lds_direct_load %0:m0
538       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(8));
539       create_mubuf(0, PhysReg(257));
540       if (gfx >= GFX12)
541          bld.sopp(aco_opcode::s_wait_loadcnt, 0);
542       else
543          bld.sopp(aco_opcode::s_waitcnt, 0x3ff);
544       bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
545 
546       //! p_unit_test 9
547       //! buffer_store_dword %0:s[0-3], %0:v[0], 0, %0:v[0] offen
548       //~gfx11! s_waitcnt_vscnt %0:null imm:0
549       //~gfx12! s_wait_storecnt imm:0
550       //! v1: %0:v[0] = lds_direct_load %0:m0
551       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(9));
552       create_mubuf_store();
553       if (gfx >= GFX12)
554          bld.sopp(aco_opcode::s_wait_storecnt, 0);
555       else
556          bld.sopk(aco_opcode::s_waitcnt_vscnt, Operand(sgpr_null, s1), 0);
557       bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
558 
559       //! p_unit_test 10
560       //! v1: %0:v[1] = ds_read_b32 %0:v[0]
561       //~gfx11! s_waitcnt lgkmcnt(0)
562       //~gfx12! s_wait_dscnt imm:0
563       //! v1: %0:v[0] = lds_direct_load %0:m0
564       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(10));
565       bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1));
566       if (gfx >= GFX12)
567          bld.sopp(aco_opcode::s_wait_dscnt, 0);
568       else
569          bld.sopp(aco_opcode::s_waitcnt, 0xfc0f);
570       bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
571 
572       if (gfx >= GFX12) {
573          //~gfx12! p_unit_test 11
574          //~gfx12! v1: %0:v[1] = image_load %0:s[0-7], s4: undef, v1: undef, %0:v[0-1] 2d
575          //~gfx12! s_wait_loadcnt imm:0
576          //~gfx12! v1: %0:v[0] = lds_direct_load %0:m0
577          bld.pseudo(aco_opcode::p_unit_test, Operand::c32(11));
578          Instruction* instr =
579             bld.mimg(aco_opcode::image_load, Definition(PhysReg(257), v1), Operand(PhysReg(0), s8),
580                      Operand(s4), Operand(v1), Operand(PhysReg(256), v2))
581                .instr;
582          instr->mimg().dmask = 0x1;
583          instr->mimg().dim = ac_image_2d;
584          bld.sopp(aco_opcode::s_wait_loadcnt, 0);
585          bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
586 
587          //~gfx12! p_unit_test 12
588          //~gfx12! v1: %0:v[1] = image_sample %0:s[0-7], %0:s[0-3], v1: undef, %0:v[0-1] 2d
589          //~gfx12! s_wait_samplecnt imm:0
590          //~gfx12! v1: %0:v[0] = lds_direct_load %0:m0
591          bld.pseudo(aco_opcode::p_unit_test, Operand::c32(12));
592          instr = bld.mimg(aco_opcode::image_sample, Definition(PhysReg(257), v1),
593                           Operand(PhysReg(0), s8), Operand(PhysReg(0), s4), Operand(v1),
594                           Operand(PhysReg(256), v2))
595                     .instr;
596          instr->mimg().dmask = 0x1;
597          instr->mimg().dim = ac_image_2d;
598          bld.sopp(aco_opcode::s_wait_samplecnt, 0);
599          bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
600 
601          //~gfx12! p_unit_test 13
602          //~gfx12! v4: %0:v[0-3] = image_bvh64_intersect_ray %0:s[0-3], s4: undef, v1: undef, %0:v[0-1], %0:v[2], %0:v[3-5], %0:v[6-8], %0:v[9-11] 1d unrm r128
603          //~gfx12! s_wait_bvhcnt imm:0
604          //~gfx12! v1: %0:v[0] = lds_direct_load %0:m0
605          bld.pseudo(aco_opcode::p_unit_test, Operand::c32(13));
606          create_bvh();
607          bld.sopp(aco_opcode::s_wait_bvhcnt, 0);
608          bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
609       }
610 
611       /* VMEM/LDS with the wrong waitcnt in-between */
612       //! p_unit_test 14
613       //! v1: %0:v[1] = buffer_load_dword %0:s[0-3], %0:v[0], 0 offen
614       //~gfx11! s_waitcnt_vscnt %0:null imm:0
615       //~gfx11! s_waitcnt_depctr vm_vsrc(0)
616       //~gfx11! v1: %0:v[0] = lds_direct_load %0:m0
617       //~gfx12! s_wait_storecnt imm:0
618       //~gfx12! v1: %0:v[0] = lds_direct_load %0:m0 wait_vsrc:0
619       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(14));
620       create_mubuf(0, PhysReg(257));
621       if (gfx >= GFX12)
622          bld.sopp(aco_opcode::s_wait_storecnt, 0);
623       else
624          bld.sopk(aco_opcode::s_waitcnt_vscnt, Operand(sgpr_null, s1), 0);
625       bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
626 
627       //! p_unit_test 15
628       //! buffer_store_dword %0:s[0-3], %0:v[0], 0, %0:v[0] offen
629       //~gfx11! s_waitcnt lgkmcnt(0)
630       //~gfx11! s_waitcnt_depctr vm_vsrc(0)
631       //~gfx11! v1: %0:v[0] = lds_direct_load %0:m0
632       //~gfx12! s_wait_dscnt imm:0
633       //~gfx12! v1: %0:v[0] = lds_direct_load %0:m0 wait_vsrc:0
634       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(15));
635       create_mubuf_store();
636       if (gfx >= GFX12)
637          bld.sopp(aco_opcode::s_wait_dscnt, 0);
638       else
639          bld.sopp(aco_opcode::s_waitcnt, 0xfc0f);
640       bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
641 
642       //! p_unit_test 16
643       //! v1: %0:v[1] = ds_read_b32 %0:v[0]
644       //~gfx11! s_waitcnt vmcnt(0)
645       //~gfx11! s_waitcnt_depctr vm_vsrc(0)
646       //~gfx11! v1: %0:v[0] = lds_direct_load %0:m0
647       //~gfx12! s_wait_loadcnt imm:0
648       //~gfx12! v1: %0:v[0] = lds_direct_load %0:m0 wait_vsrc:0
649       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(16));
650       bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1));
651       if (gfx >= GFX12)
652          bld.sopp(aco_opcode::s_wait_loadcnt, 0);
653       else
654          bld.sopp(aco_opcode::s_waitcnt, 0x3ff);
655       bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
656 
657       //! p_unit_test 17
658       //! v1: %0:v[0] = buffer_load_dword %0:s[0-3], %0:v[1], 0 offen
659       //~gfx11! s_waitcnt_vscnt %0:null imm:0
660       //~gfx11! s_waitcnt_depctr vm_vsrc(0)
661       //~gfx11! v1: %0:v[0] = lds_direct_load %0:m0
662       //~gfx12! s_wait_storecnt imm:0
663       //~gfx12! v1: %0:v[0] = lds_direct_load %0:m0 wait_vsrc:0
664       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(17));
665       create_mubuf(0, PhysReg(256), PhysReg(257));
666       if (gfx >= GFX12)
667          bld.sopp(aco_opcode::s_wait_storecnt, 0);
668       else
669          bld.sopk(aco_opcode::s_waitcnt_vscnt, Operand(sgpr_null, s1), 0);
670       bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
671 
672       if (gfx >= GFX12) {
673          //~gfx12! p_unit_test 18
674          //~gfx12! v1: %0:v[1] = image_load %0:s[0-7], s4: undef, v1: undef, %0:v[0-1] 2d
675          //~gfx12! s_wait_samplecnt imm:0
676          //~gfx12! v1: %0:v[0] = lds_direct_load %0:m0 wait_vsrc:0
677          bld.pseudo(aco_opcode::p_unit_test, Operand::c32(18));
678          Instruction* instr =
679             bld.mimg(aco_opcode::image_load, Definition(PhysReg(257), v1), Operand(PhysReg(0), s8),
680                      Operand(s4), Operand(v1), Operand(PhysReg(256), v2))
681                .instr;
682          instr->mimg().dmask = 0x1;
683          instr->mimg().dim = ac_image_2d;
684          bld.sopp(aco_opcode::s_wait_samplecnt, 0);
685          bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
686 
687          //~gfx12! p_unit_test 19
688          //~gfx12! v1: %0:v[1] = image_sample %0:s[0-7], %0:s[0-3], v1: undef, %0:v[0-1] 2d
689          //~gfx12! s_wait_loadcnt imm:0
690          //~gfx12! v1: %0:v[0] = lds_direct_load %0:m0 wait_vsrc:0
691          bld.pseudo(aco_opcode::p_unit_test, Operand::c32(19));
692          instr = bld.mimg(aco_opcode::image_sample, Definition(PhysReg(257), v1),
693                           Operand(PhysReg(0), s8), Operand(PhysReg(0), s4), Operand(v1),
694                           Operand(PhysReg(256), v2))
695                     .instr;
696          instr->mimg().dmask = 0x1;
697          instr->mimg().dim = ac_image_2d;
698          bld.sopp(aco_opcode::s_wait_loadcnt, 0);
699          bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
700 
701          //~gfx12! p_unit_test 20
702          //~gfx12! v4: %0:v[0-3] = image_bvh64_intersect_ray %0:s[0-3], s4: undef, v1: undef, %0:v[0-1], %0:v[2], %0:v[3-5], %0:v[6-8], %0:v[9-11] 1d unrm r128
703          //~gfx12! s_wait_loadcnt imm:0
704          //~gfx12! v1: %0:v[0] = lds_direct_load %0:m0 wait_vsrc:0
705          bld.pseudo(aco_opcode::p_unit_test, Operand::c32(20));
706          create_bvh();
707          bld.sopp(aco_opcode::s_wait_loadcnt, 0);
708          bld.ldsdir(aco_opcode::lds_direct_load, Definition(PhysReg(256), v1), Operand(m0, s1));
709       }
710 
711       finish_insert_nops_test();
712    }
713 END_TEST
714 
/* insert_nops.valu_trans_use: GFX11 cases in which v_mov_b32 reads v[0] after v_rcp_f32 wrote it,
 * with different instructions placed in between. Every case starts with a p_unit_test marker.
 * The annotation comments in front of each case list the instruction stream the test expects
 * (presumably the output checked after finish_insert_nops_test() - confirm against the test
 * harness), including the s_waitcnt_depctr va_vdst(0) lines that are not emitted by the builder
 * calls below.
 */
715 BEGIN_TEST(insert_nops.valu_trans_use)
716    if (!setup_cs(NULL, GFX11))
717       return;
718
       /* Basic case: the read follows the v_rcp_f32 immediately, a wait is expected. */
719    //>> p_unit_test 0
720    //! v1: %0:v[0] = v_rcp_f32 %0:v[1]
721    //! s_waitcnt_depctr va_vdst(0)
722    //! v1: %0:v[1] = v_mov_b32 %0:v[0]
723    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0));
724    bld.vop1(aco_opcode::v_rcp_f32, Definition(PhysReg(256), v1), Operand(PhysReg(257), v1));
725    bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1));
726
727    /* Sufficient VALU mitigates the hazard. */
       /* Case 1: four v_nop in between are not enough, the wait is still expected. */
728    //! p_unit_test 1
729    //! v1: %0:v[0] = v_rcp_f32 %0:v[1]
730    //; for i in range(4): insert_pattern('v_nop')
731    //! s_waitcnt_depctr va_vdst(0)
732    //! v1: %0:v[1] = v_mov_b32 %0:v[0]
733    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1));
734    bld.vop1(aco_opcode::v_rcp_f32, Definition(PhysReg(256), v1), Operand(PhysReg(257), v1));
735    for (unsigned i = 0; i < 4; i++)
736       bld.vop1(aco_opcode::v_nop);
737    bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1));
738
       /* Case 2: with eight v_nop in between, no wait is expected. */
739    //! p_unit_test 2
740    //! v1: %0:v[0] = v_rcp_f32 %0:v[1]
741    //; for i in range(8): insert_pattern('v_nop')
742    //! v1: %0:v[1] = v_mov_b32 %0:v[0]
743    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2));
744    bld.vop1(aco_opcode::v_rcp_f32, Definition(PhysReg(256), v1), Operand(PhysReg(257), v1));
745    for (unsigned i = 0; i < 8; i++)
746       bld.vop1(aco_opcode::v_nop);
747    bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1));
748
749    /* Sufficient transcendental VALU mitigates the hazard. */
       /* Case 3: a single unrelated v_sqrt_f32 in between is not enough, the wait is expected. */
750    //! p_unit_test 3
751    //! v1: %0:v[0] = v_rcp_f32 %0:v[1]
752    //! v1: %0:v[2] = v_sqrt_f32 %0:v[3]
753    //! s_waitcnt_depctr va_vdst(0)
754    //! v1: %0:v[1] = v_mov_b32 %0:v[0]
755    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3));
756    bld.vop1(aco_opcode::v_rcp_f32, Definition(PhysReg(256), v1), Operand(PhysReg(257), v1));
757    bld.vop1(aco_opcode::v_sqrt_f32, Definition(PhysReg(258), v1), Operand(PhysReg(259), v1));
758    bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1));
759
       /* Case 4: two v_sqrt_f32 in between, no wait is expected. */
760    //! p_unit_test 4
761    //! v1: %0:v[0] = v_rcp_f32 %0:v[1]
762    //! v1: %0:v[2] = v_sqrt_f32 %0:v[3]
763    //! v1: %0:v[2] = v_sqrt_f32 %0:v[3]
764    //! v1: %0:v[1] = v_mov_b32 %0:v[0]
765    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4));
766    bld.vop1(aco_opcode::v_rcp_f32, Definition(PhysReg(256), v1), Operand(PhysReg(257), v1));
767    for (unsigned i = 0; i < 2; i++)
768       bld.vop1(aco_opcode::v_sqrt_f32, Definition(PhysReg(258), v1), Operand(PhysReg(259), v1));
769    bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1));
770
771    /* Transcendental VALU should be counted towards VALU */
       /* Case 5: five v_nop plus one v_sqrt_f32 in between, no wait is expected. */
772    //! p_unit_test 5
773    //! v1: %0:v[0] = v_rcp_f32 %0:v[1]
774    //; for i in range(5): insert_pattern('v_nop')
775    //! v1: %0:v[2] = v_sqrt_f32 %0:v[3]
776    //! v1: %0:v[1] = v_mov_b32 %0:v[0]
777    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5));
778    bld.vop1(aco_opcode::v_rcp_f32, Definition(PhysReg(256), v1), Operand(PhysReg(257), v1));
779    for (unsigned i = 0; i < 5; i++)
780       bld.vop1(aco_opcode::v_nop);
781    bld.vop1(aco_opcode::v_sqrt_f32, Definition(PhysReg(258), v1), Operand(PhysReg(259), v1));
782    bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1));
783
784    /* non-VALU does not mitigate the hazard. */
       /* Case 6: eight s_nop in between, the wait is still expected. */
785    //! p_unit_test 6
786    //! v1: %0:v[0] = v_rcp_f32 %0:v[1]
787    //; for i in range(8): insert_pattern('s_nop')
788    //! s_waitcnt_depctr va_vdst(0)
789    //! v1: %0:v[1] = v_mov_b32 %0:v[0]
790    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(6));
791    bld.vop1(aco_opcode::v_rcp_f32, Definition(PhysReg(256), v1), Operand(PhysReg(257), v1));
792    for (unsigned i = 0; i < 8; i++)
793       bld.sopp(aco_opcode::s_nop, 0);
794    bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand(PhysReg(256), v1));
795
796    finish_insert_nops_test();
797 END_TEST
798 
/* insert_nops.valu_partial_forwarding.basic: GFX11 cases built from the same shape - a VALU
 * write of v[0], an s_mov_b64 write of exec, one or two further VALU writes, and finally a
 * v_max_f32 / v_max3_f32 that reads all of the written VGPRs. The cases vary how many v_nop sit
 * between those instructions. The annotation comments in front of each case list the expected
 * instruction stream, including whether an s_waitcnt_depctr va_vdst(0) is expected before the
 * final read.
 */
799 BEGIN_TEST(insert_nops.valu_partial_forwarding.basic)
800    if (!setup_cs(NULL, GFX11))
801       return;
802
803    /* Basic case. */
804    //>> p_unit_test 0
805    //! v1: %0:v[0] = v_mov_b32 0
806    //! s2: %0:exec = s_mov_b64 -1
807    //! v1: %0:v[1] = v_mov_b32 1
808    //! s_waitcnt_depctr va_vdst(0)
809    //! v1: %0:v[2] = v_max_f32 %0:v[0], %0:v[1]
810    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0));
811    bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
812    bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand::c64(-1));
813    bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand::c32(1));
814    bld.vop2(aco_opcode::v_max_f32, Definition(PhysReg(258), v1), Operand(PhysReg(256), v1),
815             Operand(PhysReg(257), v1));
816
817    /* We should consider both the closest and further VALU after the exec write. */
       /* Case 1: the v[2] write sits directly in front of the read, a wait is expected. */
818    //! p_unit_test 1
819    //! v1: %0:v[0] = v_mov_b32 0
820    //! s2: %0:exec = s_mov_b64 -1
821    //! v1: %0:v[1] = v_mov_b32 1
822    //; for i in range(2): insert_pattern('v_nop')
823    //! v1: %0:v[2] = v_mov_b32 2
824    //! s_waitcnt_depctr va_vdst(0)
825    //! v1: %0:v[2] = v_max3_f32 %0:v[0], %0:v[1], %0:v[2]
826    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1));
827    bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
828    bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand::c64(-1));
829    bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand::c32(1));
830    bld.vop1(aco_opcode::v_nop);
831    bld.vop1(aco_opcode::v_nop);
832    bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(258), v1), Operand::c32(2));
833    bld.vop3(aco_opcode::v_max3_f32, Definition(PhysReg(258), v1), Operand(PhysReg(256), v1),
834             Operand(PhysReg(257), v1), Operand(PhysReg(258), v1));
835
       /* Case 2: both later writes are back to back and four v_nop separate them from the read;
        * a wait is still expected.
        */
836    //! p_unit_test 2
837    //! v1: %0:v[0] = v_mov_b32 0
838    //! s2: %0:exec = s_mov_b64 -1
839    //! v1: %0:v[1] = v_mov_b32 1
840    //! v1: %0:v[2] = v_mov_b32 2
841    //; for i in range(4): insert_pattern('v_nop')
842    //! s_waitcnt_depctr va_vdst(0)
843    //! v1: %0:v[2] = v_max3_f32 %0:v[0], %0:v[1], %0:v[2]
844    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2));
845    bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
846    bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand::c64(-1));
847    bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand::c32(1));
848    bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(258), v1), Operand::c32(2));
849    for (unsigned i = 0; i < 4; i++)
850       bld.vop1(aco_opcode::v_nop);
851    bld.vop3(aco_opcode::v_max3_f32, Definition(PhysReg(258), v1), Operand(PhysReg(256), v1),
852             Operand(PhysReg(257), v1), Operand(PhysReg(258), v1));
853
854    /* If a VALU writes a read VGPR in-between the first and second writes, it should still be
855     * counted towards the distance between the first and second writes.
856     */
       /* Case 3: two v_nop before and three v_nop after the v[2] write, no wait is expected. */
857    //! p_unit_test 3
858    //! v1: %0:v[0] = v_mov_b32 0
859    //! s2: %0:exec = s_mov_b64 -1
860    //! v1: %0:v[1] = v_mov_b32 1
861    //; for i in range(2): insert_pattern('v_nop')
862    //! v1: %0:v[2] = v_mov_b32 2
863    //; for i in range(3): insert_pattern('v_nop')
864    //! v1: %0:v[2] = v_max3_f32 %0:v[0], %0:v[1], %0:v[2]
865    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3));
866    bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
867    bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand::c64(-1));
868    bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand::c32(1));
869    bld.vop1(aco_opcode::v_nop);
870    bld.vop1(aco_opcode::v_nop);
871    bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(258), v1), Operand::c32(2));
872    for (unsigned i = 0; i < 3; i++)
873       bld.vop1(aco_opcode::v_nop);
874    bld.vop3(aco_opcode::v_max3_f32, Definition(PhysReg(258), v1), Operand(PhysReg(256), v1),
875             Operand(PhysReg(257), v1), Operand(PhysReg(258), v1));
876
       /* NOTE(review): this trailing marker has no matching annotation comment above it, unlike
        * every other marker in this test - presumably it only terminates case 3; confirm that the
        * missing expectation is intentional.
        */
877    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4));
878
879    finish_insert_nops_test();
880 END_TEST
881 
/* insert_nops.valu_partial_forwarding.multiple_exec_writes: GFX11 variants of the
 * "VALU write, exec write, VALU write, VALU read of both" sequence in which exec is written
 * twice (first 0, then -1). In both cases the annotation comments expect an
 * s_waitcnt_depctr va_vdst(0) in front of the final v_max_f32.
 */
882 BEGIN_TEST(insert_nops.valu_partial_forwarding.multiple_exec_writes)
883    if (!setup_cs(NULL, GFX11))
884       return;
885
       /* Case 0: both exec writes sit between the v[0] write and the v[1] write. */
886    //>> p_unit_test 0
887    //! v1: %0:v[0] = v_mov_b32 0
888    //! s2: %0:exec = s_mov_b64 0
889    //! s2: %0:exec = s_mov_b64 -1
890    //! v1: %0:v[1] = v_mov_b32 1
891    //! s_waitcnt_depctr va_vdst(0)
892    //! v1: %0:v[2] = v_max_f32 %0:v[0], %0:v[1]
893    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0));
894    bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
895    bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand::c64(0));
896    bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand::c64(-1));
897    bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand::c32(1));
898    bld.vop2(aco_opcode::v_max_f32, Definition(PhysReg(258), v1), Operand(PhysReg(256), v1),
899             Operand(PhysReg(257), v1));
900
       /* Case 1: the v[1] write sits between the two exec writes. */
901    //! p_unit_test 1
902    //! v1: %0:v[0] = v_mov_b32 0
903    //! s2: %0:exec = s_mov_b64 0
904    //! v1: %0:v[1] = v_mov_b32 1
905    //! s2: %0:exec = s_mov_b64 -1
906    //! s_waitcnt_depctr va_vdst(0)
907    //! v1: %0:v[2] = v_max_f32 %0:v[0], %0:v[1]
908    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1));
909    bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
910    bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand::c64(0));
911    bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand::c32(1));
912    bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand::c64(-1));
913    bld.vop2(aco_opcode::v_max_f32, Definition(PhysReg(258), v1), Operand(PhysReg(256), v1),
914             Operand(PhysReg(257), v1));
915
916    finish_insert_nops_test();
917 END_TEST
918 
/* insert_nops.valu_partial_forwarding.control_flow: GFX11 cases where the VALU writes, the exec
 * write and the final v_max_f32 read are spread over several basic blocks. Each case builds a
 * diamond: a block ending in s_cbranch_scc1, two successor blocks, and a merge block that
 * performs the read. Blocks are created with program->create_and_insert_block() and their linear
 * predecessor/successor lists are filled in by hand. In all three cases the annotation comments
 * expect an s_waitcnt_depctr va_vdst(0) in front of the v_max_f32 in the merge block.
 */
919 BEGIN_TEST(insert_nops.valu_partial_forwarding.control_flow)
920    if (!setup_cs(NULL, GFX11))
921       return;
922
923    /* Control flow merges: one branch shouldn't interfere with the other (clobbering VALU closer
924     * than interesting one).
925     */
926    //>> p_unit_test 0
927    //! s_cbranch_scc1 block:BB2
928    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0u));
929    bld.sopp(aco_opcode::s_cbranch_scc1, 2);
930
       /* BB1 (linear successor of BB0): v[0] write followed by the exec write. */
931    //! BB1
932    //! /* logical preds: / linear preds: BB0, / kind: */
933    //! v1: %0:v[0] = v_mov_b32 0
934    //! s2: %0:exec = s_mov_b64 -1
935    //! v_nop
936    //! s_branch block:BB3
937    bld.reset(program->create_and_insert_block());
938    program->blocks[0].linear_succs.push_back(1);
939    program->blocks[1].linear_preds.push_back(0);
940    bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
941    bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand::c64(-1));
942    bld.vop1(aco_opcode::v_nop);
943    bld.sopp(aco_opcode::s_branch, 3);
944
       /* BB2 (other linear successor of BB0): writes v[0] without any exec write. */
945    //! BB2
946    //! /* logical preds: / linear preds: BB0, / kind: */
947    //! v1: %0:v[0] = v_mov_b32 0
948    bld.reset(program->create_and_insert_block());
949    program->blocks[0].linear_succs.push_back(2);
950    program->blocks[2].linear_preds.push_back(0);
951    bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
952
       /* BB3 (merge of BB1 and BB2): second VALU write and the read of v[0] and v[1]. */
953    //! BB3
954    //! /* logical preds: / linear preds: BB1, BB2, / kind: */
955    //! v1: %0:v[1] = v_mov_b32 1
956    //! s_waitcnt_depctr va_vdst(0)
957    //! v1: %0:v[2] = v_max_f32 %0:v[0], %0:v[1]
958    bld.reset(program->create_and_insert_block());
959    program->blocks[1].linear_succs.push_back(3);
960    program->blocks[2].linear_succs.push_back(3);
961    program->blocks[3].linear_preds.push_back(1);
962    program->blocks[3].linear_preds.push_back(2);
963    bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand::c32(1));
964    bld.vop2(aco_opcode::v_max_f32, Definition(PhysReg(258), v1), Operand(PhysReg(256), v1),
965             Operand(PhysReg(257), v1));
966
967    /* Control flow merges: one branch shouldn't interfere with the other (should consider furthest
968     * VALU writes after exec).
969     */
       /* This case continues in BB3, which therefore also becomes the branching block. */
970    //! p_unit_test 1
971    //! s_cbranch_scc1 block:BB5
972    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1u));
973    bld.sopp(aco_opcode::s_cbranch_scc1, 5);
974
       /* BB4: v[0] write, exec write, two v_nop, then the v[1] write and one more v_nop. */
975    //! BB4
976    //! /* logical preds: / linear preds: BB3, / kind: */
977    //! v1: %0:v[0] = v_mov_b32 0
978    //! s2: %0:exec = s_mov_b64 -1
979    //; for i in range(2): insert_pattern('v_nop')
980    //! v1: %0:v[1] = v_mov_b32 1
981    //! v_nop
982    //! s_branch block:BB6
983    bld.reset(program->create_and_insert_block());
984    program->blocks[3].linear_succs.push_back(4);
985    program->blocks[4].linear_preds.push_back(3);
986    bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
987    bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand::c64(-1));
988    bld.vop1(aco_opcode::v_nop);
989    bld.vop1(aco_opcode::v_nop);
990    bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand::c32(1));
991    bld.vop1(aco_opcode::v_nop);
992    bld.sopp(aco_opcode::s_branch, 6);
993
       /* BB5: only writes v[1]. */
994    //! BB5
995    //! /* logical preds: / linear preds: BB3, / kind: */
996    //! v1: %0:v[1] = v_mov_b32 1
997    bld.reset(program->create_and_insert_block());
998    program->blocks[3].linear_succs.push_back(5);
999    program->blocks[5].linear_preds.push_back(3);
1000    bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand::c32(1));
1001
        /* BB6 (merge of BB4 and BB5): the read is the first instruction of the block. */
1002    //! BB6
1003    //! /* logical preds: / linear preds: BB4, BB5, / kind: */
1004    //! s_waitcnt_depctr va_vdst(0)
1005    //! v1: %0:v[2] = v_max_f32 %0:v[0], %0:v[1]
1006    bld.reset(program->create_and_insert_block());
1007    program->blocks[4].linear_succs.push_back(6);
1008    program->blocks[5].linear_succs.push_back(6);
1009    program->blocks[6].linear_preds.push_back(4);
1010    program->blocks[6].linear_preds.push_back(5);
1011    bld.vop2(aco_opcode::v_max_f32, Definition(PhysReg(258), v1), Operand(PhysReg(256), v1),
1012             Operand(PhysReg(257), v1));
1013
1014    /* Control flow merges: one branch shouldn't interfere with the other (should consider closest
1015     * VALU writes after exec).
1016     */
        /* This case continues in BB6, which therefore also becomes the branching block. */
1017    //! p_unit_test 2
1018    //! s_cbranch_scc1 block:BB8
1019    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2u));
1020    bld.sopp(aco_opcode::s_cbranch_scc1, 8);
1021
        /* BB7: v[0] write, exec write, v[1] write, then four v_nop. */
1022    //! BB7
1023    //! /* logical preds: / linear preds: BB6, / kind: */
1024    //! v1: %0:v[0] = v_mov_b32 0
1025    //! s2: %0:exec = s_mov_b64 -1
1026    //! v1: %0:v[1] = v_mov_b32 1
1027    //; for i in range(4): insert_pattern('v_nop')
1028    //! s_branch block:BB9
1029    bld.reset(program->create_and_insert_block());
1030    program->blocks[6].linear_succs.push_back(7);
1031    program->blocks[7].linear_preds.push_back(6);
1032    bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
1033    bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand::c64(-1));
1034    bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand::c32(1));
1035    for (unsigned i = 0; i < 4; i++)
1036       bld.vop1(aco_opcode::v_nop);
1037    bld.sopp(aco_opcode::s_branch, 9);
1038
        /* BB8: v[1] write followed by five v_nop, no exec write. */
1039    //! BB8
1040    //! /* logical preds: / linear preds: BB6, / kind: */
1041    //! v1: %0:v[1] = v_mov_b32 1
1042    //; for i in range(5): insert_pattern('v_nop')
1043    bld.reset(program->create_and_insert_block());
1044    program->blocks[6].linear_succs.push_back(8);
1045    program->blocks[8].linear_preds.push_back(6);
1046    bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand::c32(1));
1047    for (unsigned i = 0; i < 5; i++)
1048       bld.vop1(aco_opcode::v_nop);
1049
        /* BB9 (merge of BB7 and BB8, last block of the test): performs the read. */
1050    //! BB9
1051    //! /* logical preds: / linear preds: BB7, BB8, / kind: uniform, */
1052    //! s_waitcnt_depctr va_vdst(0)
1053    //! v1: %0:v[2] = v_max_f32 %0:v[0], %0:v[1]
1054    bld.reset(program->create_and_insert_block());
1055    program->blocks[7].linear_succs.push_back(9);
1056    program->blocks[8].linear_succs.push_back(9);
1057    program->blocks[9].linear_preds.push_back(7);
1058    program->blocks[9].linear_preds.push_back(8);
1059    bld.vop2(aco_opcode::v_max_f32, Definition(PhysReg(258), v1), Operand(PhysReg(256), v1),
1060             Operand(PhysReg(257), v1));
1061
1062    finish_insert_nops_test();
1063 END_TEST
1064 
/* insert_nops.valu_mask_write: GFX11 cases built around a v_cndmask_b32 that takes an SGPR pair
 * as its third operand, a later s_mov_b32 that overwrites one SGPR of that pair, and a following
 * instruction that reads the overwritten SGPR. The annotation comments in front of each case
 * list the expected instruction stream, including where an s_waitcnt_depctr sa_sdst(0) is
 * expected in front of the read.
 */
1065 BEGIN_TEST(insert_nops.valu_mask_write)
1066    if (!setup_cs(NULL, GFX11))
1067       return;
1068
1069    /* Basic case. */
1070    //>> p_unit_test 0
1071    //! v1: %0:v[0] = v_cndmask_b32 0, 0, %0:s[0-1]
1072    //! s1: %0:s[1] = s_mov_b32 0
1073    //! s_waitcnt_depctr sa_sdst(0)
1074    //! s1: %0:s[2] = s_mov_b32 %0:s[1]
1075    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0));
1076    bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(PhysReg(256), v1), Operand::zero(),
1077                 Operand::zero(), Operand(PhysReg(0), s2));
1078    bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(1), s1), Operand::zero());
1079    bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(2), s1), Operand(PhysReg(1), s1));
1080
1081    /* Mitigation. */
        /* Case 1: a VALU read of s[1] before the SALU write, no wait is expected. */
1082    //! p_unit_test 1
1083    //! v1: %0:v[0] = v_cndmask_b32 0, 0, %0:s[0-1]
1084    //! v1: %0:v[1] = v_mov_b32 %0:s[1]
1085    //! s1: %0:s[1] = s_mov_b32 0
1086    //! s1: %0:s[2] = s_mov_b32 %0:s[1]
1087    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1));
1088    bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(PhysReg(256), v1), Operand::zero(),
1089                 Operand::zero(), Operand(PhysReg(0), s2));
1090    bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand(PhysReg(1), s1));
1091    bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(1), s1), Operand::zero());
1092    bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(2), s1), Operand(PhysReg(1), s1));
1093
        /* Case 2: s[1] is read twice after the write; a wait is expected only before the first
         * read.
         */
1094    //! p_unit_test 2
1095    //! v1: %0:v[0] = v_cndmask_b32 0, 0, %0:s[0-1]
1096    //! s1: %0:s[1] = s_mov_b32 0
1097    //! s_waitcnt_depctr sa_sdst(0)
1098    //! s1: %0:s[2] = s_mov_b32 %0:s[1]
1099    //! s1: %0:s[2] = s_mov_b32 %0:s[1]
1100    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2));
1101    bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(PhysReg(256), v1), Operand::zero(),
1102                 Operand::zero(), Operand(PhysReg(0), s2));
1103    bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(1), s1), Operand::zero());
1104    bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(2), s1), Operand(PhysReg(1), s1));
1105    bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(2), s1), Operand(PhysReg(1), s1));
1106
        /* Case 3: the input already contains an s_waitcnt_depctr with immediate 0xfffe
         * (presumably the one printed as sa_sdst(0) - confirm); the expected stream has exactly
         * one wait, i.e. no additional one is expected.
         */
1107    //! p_unit_test 3
1108    //! v1: %0:v[0] = v_cndmask_b32 0, 0, %0:s[0-1]
1109    //! s1: %0:s[1] = s_mov_b32 0
1110    //! s_waitcnt_depctr sa_sdst(0)
1111    //! s1: %0:s[2] = s_mov_b32 %0:s[1]
1112    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3));
1113    bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(PhysReg(256), v1), Operand::zero(),
1114                 Operand::zero(), Operand(PhysReg(0), s2));
1115    bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(1), s1), Operand::zero());
1116    bld.sopp(aco_opcode::s_waitcnt_depctr, 0xfffe);
1117    bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(2), s1), Operand(PhysReg(1), s1));
1118
1119    /* v_cndmask_b32 is both involved in the hazard and is a mitigation. */
1120    //! p_unit_test 4
1121    //! v1: %0:v[0] = v_cndmask_b32 %0:s[2], 0, %0:s[0-1]
1122    //! s1: %0:s[1] = s_mov_b32 0
1123    //! s_waitcnt_depctr sa_sdst(0)
1124    //! s1: %0:s[2] = s_mov_b32 %0:s[1]
1125    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4));
1126    bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(PhysReg(256), v1), Operand(PhysReg(2), s1),
1127                 Operand::zero(), Operand(PhysReg(0), s2));
1128    bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(1), s1), Operand::zero());
1129    bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(2), s1), Operand(PhysReg(1), s1));
1130
1131    /* VALU reading exec does not mitigate the hazard. We also don't consider literals. */
1132    //! p_unit_test 5
1133    //! v1: %0:v[0] = v_cndmask_b32 0, 0, %0:s[0-1]
1134    //! v1: %0:v[1] = v_mov_b32 %0:exec_lo
1135    //! s1: %0:s[1] = s_mov_b32 0
1136    //! s_waitcnt_depctr sa_sdst(0)
1137    //! s1: %0:s[2] = s_mov_b32 %0:s[1]
1138    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5));
1139    bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(PhysReg(256), v1), Operand::zero(),
1140                 Operand::zero(), Operand(PhysReg(0), s2));
1141    bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand(exec_lo, s1));
1142    bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(1), s1), Operand::zero());
1143    bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(2), s1), Operand(PhysReg(1), s1));
1144
        /* Case 6: same as case 5, with a literal operand (0x200) instead of exec_lo. */
1145    //! p_unit_test 6
1146    //! v1: %0:v[0] = v_cndmask_b32 0, 0, %0:s[0-1]
1147    //! v1: %0:v[1] = v_mov_b32 0x200
1148    //! s1: %0:s[1] = s_mov_b32 0
1149    //! s_waitcnt_depctr sa_sdst(0)
1150    //! s1: %0:s[2] = s_mov_b32 %0:s[1]
1151    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(6));
1152    bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(PhysReg(256), v1), Operand::zero(),
1153                 Operand::zero(), Operand(PhysReg(0), s2));
1154    bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand::literal32(0x200));
1155    bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(1), s1), Operand::zero());
1156    bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(2), s1), Operand(PhysReg(1), s1));
1157
1158    /* Basic case: VALU. */
1159    //! p_unit_test 7
1160    //! v1: %0:v[0] = v_cndmask_b32 0, 0, %0:s[0-1]
1161    //! s1: %0:s[1] = s_mov_b32 0
1162    //! s_waitcnt_depctr sa_sdst(0)
1163    //! v1: %0:v[1] = v_mov_b32 %0:s[1]
1164    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(7));
1165    bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(PhysReg(256), v1), Operand::zero(),
1166                 Operand::zero(), Operand(PhysReg(0), s2));
1167    bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(1), s1), Operand::zero());
1168    bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(257), v1), Operand(PhysReg(1), s1));
1169
1170    /* SALU which both reads and writes a lane mask SGPR. */
1171    //! p_unit_test 8
1172    //! v1: %0:v[0] = v_cndmask_b32 0, 0, %0:s[0-1]
1173    //! s1: %0:s[1] = s_mov_b32 0
1174    //! v1: %0:v[0] = v_cndmask_b32 0, 0, %0:s[2-3]
1175    //! s_waitcnt_depctr sa_sdst(0)
1176    //! s1: %0:s[2] = s_mov_b32 %0:s[1]
1177    //! s_waitcnt_depctr sa_sdst(0)
1178    //! s1: %0:s[4] = s_mov_b32 %0:s[2]
1179    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(8));
1180    bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(PhysReg(256), v1), Operand::zero(),
1181                 Operand::zero(), Operand(PhysReg(0), s2));
1182    bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(1), s1), Operand::zero());
1183    bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(PhysReg(256), v1), Operand::zero(),
1184                 Operand::zero(), Operand(PhysReg(2), s2));
1185    bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(2), s1), Operand(PhysReg(1), s1));
1186    bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(4), s1), Operand(PhysReg(2), s1));
1187
1188    /* When a SALU writes a lane mask, we shouldn't forget the current SGPRs used as lane masks then
1189     * written. */
1190    //! p_unit_test 9
1191    //! v1: %0:v[0] = v_cndmask_b32 0, 0, %0:s[0-1]
1192    //! s1: %0:s[0] = s_mov_b32 0
1193    //! v1: %0:v[0] = v_cndmask_b32 0, 0, %0:s[2-3]
1194    //! s1: %0:s[2] = s_mov_b32 0
1195    //! s_waitcnt_depctr sa_sdst(0)
1196    //! s1: %0:s[4] = s_mov_b32 %0:s[0]
1197    //! s1: %0:s[5] = s_mov_b32 %0:s[2]
1198    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(9));
1199    bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(PhysReg(256), v1), Operand::zero(),
1200                 Operand::zero(), Operand(PhysReg(0), s2));
1201    bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(0), s1), Operand::zero());
1202    bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(PhysReg(256), v1), Operand::zero(),
1203                 Operand::zero(), Operand(PhysReg(2), s2));
1204    bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(2), s1), Operand::zero());
1205    bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(4), s1), Operand(PhysReg(0), s1));
1206    bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(5), s1), Operand(PhysReg(2), s1));
1207
1208    /* When a SALU writes a lane mask, we shouldn't forget all SGPRs used as lane masks, there might
1209     * be later problematic writes. */
        /* Case 10: after the first write/wait/read of s[0], the other half of the pair (s[1]) is
         * written and read; a second wait is expected.
         */
1210    //! p_unit_test 10
1211    //! v1: %0:v[0] = v_cndmask_b32 0, 0, %0:s[0-1]
1212    //! s1: %0:s[0] = s_mov_b32 0
1213    //! s_waitcnt_depctr sa_sdst(0)
1214    //! s1: %0:s[4] = s_mov_b32 %0:s[0]
1215    //! s1: %0:s[1] = s_mov_b32 0
1216    //! s_waitcnt_depctr sa_sdst(0)
1217    //! s1: %0:s[5] = s_mov_b32 %0:s[1]
1218    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(10));
1219    bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(PhysReg(256), v1), Operand::zero(),
1220                 Operand::zero(), Operand(PhysReg(0), s2));
1221    bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(0), s1), Operand::zero());
1222    bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(4), s1), Operand(PhysReg(0), s1));
1223    bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(1), s1), Operand::zero());
1224    bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(5), s1), Operand(PhysReg(1), s1));
1225
        /* Case 11: like case 10, but the same SGPR (s[0]) is written and read a second time; a
         * second wait is expected here as well.
         */
1226    //! p_unit_test 11
1227    //! v1: %0:v[0] = v_cndmask_b32 0, 0, %0:s[0-1]
1228    //! s1: %0:s[0] = s_mov_b32 0
1229    //! s_waitcnt_depctr sa_sdst(0)
1230    //! s1: %0:s[4] = s_mov_b32 %0:s[0]
1231    //! s1: %0:s[0] = s_mov_b32 0
1232    //! s_waitcnt_depctr sa_sdst(0)
1233    //! s1: %0:s[5] = s_mov_b32 %0:s[0]
1234    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(11));
1235    bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(PhysReg(256), v1), Operand::zero(),
1236                 Operand::zero(), Operand(PhysReg(0), s2));
1237    bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(0), s1), Operand::zero());
1238    bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(4), s1), Operand(PhysReg(0), s1));
1239    bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(0), s1), Operand::zero());
1240    bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(5), s1), Operand(PhysReg(0), s1));
1241
        /* Case 12: control flow. BB1 and BB2 are both linear successors of BB0 and each uses a
         * different SGPR pair (s[0-1] and s[2-3]) in v_cndmask_b32. The merge block BB3 then
         * writes and reads one SGPR of each pair; a wait is expected in front of both reads.
         */
1242    //! p_unit_test 12
1243    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(12));
1244
1245    //! BB1
1246    //! /* logical preds: / linear preds: BB0, / kind: */
1247    //! v1: %0:v[0] = v_cndmask_b32 0, 0, %0:s[0-1]
1248    bld.reset(program->create_and_insert_block());
1249    program->blocks[0].linear_succs.push_back(1);
1250    program->blocks[1].linear_preds.push_back(0);
1251    bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(PhysReg(256), v1), Operand::zero(),
1252                 Operand::zero(), Operand(PhysReg(0), s2));
1253
1254    //! BB2
1255    //! /* logical preds: / linear preds: BB0, / kind: */
1256    //! v1: %0:v[0] = v_cndmask_b32 0, 0, %0:s[2-3]
1257    bld.reset(program->create_and_insert_block());
1258    program->blocks[0].linear_succs.push_back(2);
1259    program->blocks[2].linear_preds.push_back(0);
1260    bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(PhysReg(256), v1), Operand::zero(),
1261                 Operand::zero(), Operand(PhysReg(2), s2));
1262
1263    //! BB3
1264    //! /* logical preds: / linear preds: BB1, BB2, / kind: uniform, */
1265    //! s1: %0:s[0] = s_mov_b32 0
1266    //! s_waitcnt_depctr sa_sdst(0)
1267    //! s1: %0:s[4] = s_mov_b32 %0:s[0]
1268    //! s1: %0:s[2] = s_mov_b32 0
1269    //! s_waitcnt_depctr sa_sdst(0)
1270    //! s1: %0:s[5] = s_mov_b32 %0:s[2]
1271    bld.reset(program->create_and_insert_block());
1272    program->blocks[1].linear_succs.push_back(3);
1273    program->blocks[2].linear_succs.push_back(3);
1274    program->blocks[3].linear_preds.push_back(1);
1275    program->blocks[3].linear_preds.push_back(2);
1276    bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(0), s1), Operand::zero());
1277    bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(4), s1), Operand(PhysReg(0), s1));
1278    bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(2), s1), Operand::zero());
1279    bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(5), s1), Operand(PhysReg(2), s1));
1280
1281    finish_insert_nops_test();
1282 END_TEST
1283 
1284 BEGIN_TEST(insert_nops.wmma_raw)
1285    if (!setup_cs(NULL, GFX11))
1286       return;
1287 
1288    /* Basic case. */
1289    //>> p_unit_test 0
1290    //! v4: %_:v[20-23] = v_wmma_f16_16x16x16_f16 %_:v[0-7].xx, %_:v[8-15].xx, %_:v[20-23].xx
1291    //! v_nop
1292    //! v4: %_:v[48-51] = v_wmma_f16_16x16x16_f16 %_:v[24-31].xx, %_:v[16-23].xx, %_:v[48-51].xx
1293    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0));
1294    Operand A(PhysReg(256 + 0), v8);
1295    Operand B(PhysReg(256 + 8), v8);
1296    Operand C(PhysReg(256 + 20), v4);
1297    bld.vop3p(aco_opcode::v_wmma_f16_16x16x16_f16, Definition(C.physReg(), C.regClass()), A, B, C, 0,
1298              0);
1299    A.setFixed(PhysReg(256 + 24));
1300    B.setFixed(PhysReg(256 + 16));
1301    C.setFixed(PhysReg(256 + 48));
1302    bld.vop3p(aco_opcode::v_wmma_f16_16x16x16_f16, Definition(C.physReg(), C.regClass()), A, B, C, 0,
1303              0);
1304 
1305    /* Mitigation. */
1306    //! p_unit_test 1
1307    //! v4: %_:v[20-23] = v_wmma_f16_16x16x16_f16 %_:v[0-7].xx, %_:v[8-15].xx, %_:v[20-23].xx
1308    //! v1: %_:v[56] = v_rcp_f32 0
1309    //! v4: %_:v[48-51] = v_wmma_f16_16x16x16_f16 %_:v[24-31].xx, %_:v[16-23].xx, %_:v[48-51].xx
1310    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1));
1311    A.setFixed(PhysReg(256 + 0));
1312    B.setFixed(PhysReg(256 + 8));
1313    C.setFixed(PhysReg(256 + 20));
1314    bld.vop3p(aco_opcode::v_wmma_f16_16x16x16_f16, Definition(C.physReg(), C.regClass()), A, B, C, 0,
1315              0);
1316    bld.vop1(aco_opcode::v_rcp_f32, Definition(PhysReg(256 + 56), v1), Operand::zero());
1317    A.setFixed(PhysReg(256 + 24));
1318    B.setFixed(PhysReg(256 + 16));
1319    C.setFixed(PhysReg(256 + 48));
1320    bld.vop3p(aco_opcode::v_wmma_f16_16x16x16_f16, Definition(C.physReg(), C.regClass()), A, B, C, 0,
1321              0);
1322 
1323    /* No hazard. */
1324    //>> p_unit_test 2
1325    //! v4: %_:v[20-23] = v_wmma_f16_16x16x16_f16 %_:v[0-7].xx, %_:v[8-15].xx, %_:v[20-23].xx
1326    //! v4: %_:v[48-51] = v_wmma_f16_16x16x16_f16 %_:v[24-31].xx, %_:v[32-39].xx, %_:v[48-51].xx
1327    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2));
1328    A.setFixed(PhysReg(256 + 0));
1329    B.setFixed(PhysReg(256 + 8));
1330    C.setFixed(PhysReg(256 + 20));
1331    bld.vop3p(aco_opcode::v_wmma_f16_16x16x16_f16, Definition(C.physReg(), C.regClass()), A, B, C, 0,
1332              0);
1333    A.setFixed(PhysReg(256 + 24));
1334    B.setFixed(PhysReg(256 + 32));
1335    C.setFixed(PhysReg(256 + 48));
1336    bld.vop3p(aco_opcode::v_wmma_f16_16x16x16_f16, Definition(C.physReg(), C.regClass()), A, B, C, 0,
1337              0);
1338 
1339    //>> p_unit_test 3
1340    //! v4: %_:v[20-23] = v_wmma_f16_16x16x16_f16 %_:v[0-7].xx, %_:v[8-15].xx, %_:v[20-23].xx
1341    //! v4: %_:v[20-23] = v_wmma_f16_16x16x16_f16 %_:v[24-31].xx, %_:v[32-39].xx, %_:v[20-23].xx
1342    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3));
1343    A.setFixed(PhysReg(256 + 0));
1344    B.setFixed(PhysReg(256 + 8));
1345    C.setFixed(PhysReg(256 + 20));
1346    bld.vop3p(aco_opcode::v_wmma_f16_16x16x16_f16, Definition(C.physReg(), C.regClass()), A, B, C, 0,
1347              0);
1348    A.setFixed(PhysReg(256 + 24));
1349    B.setFixed(PhysReg(256 + 32));
1350    C.setFixed(PhysReg(256 + 20));
1351    bld.vop3p(aco_opcode::v_wmma_f16_16x16x16_f16, Definition(C.physReg(), C.regClass()), A, B, C, 0,
1352              0);
1353 
1354    finish_insert_nops_test();
1355 END_TEST
1356 
/* Bit flags for StageInfo::flags; they select how insert_nops.export_priority.stages
 * configures the program for one variant. */
enum StageInfoFlags {
   /* Copied into program->info.merged_shader_compiled_separately. */
   stage_separate = 0x01,
   /* Copied into program->info.vs.has_prolog. */
   stage_has_prolog = 0x02,
   /* The variant emits a position export (instead of a v_nop) and an s_endpgm is expected. */
   stage_has_export = 0x04,
   /* Copied into program->is_prolog. */
   stage_is_prolog = 0x08,
   /* Copied into program->is_epilog. */
   stage_is_epilog = 0x10,
};
1364 
1365 struct StageInfo {
1366    const char* name;
1367    Stage stage;
1368    unsigned flags;
1369 };
1370 
1371 BEGIN_TEST(insert_nops.export_priority.stages)
1372    Stage geometry_ngg(AC_HW_NEXT_GEN_GEOMETRY_SHADER, SWStage::GS);
1373    for (StageInfo stage : (StageInfo[]){
1374            {"_fs_first_last", fragment_fs, stage_has_export},
1375            {"_fs_with_epilog_first", fragment_fs, 0},
1376            {"_fs_prolog_first", fragment_fs, stage_is_prolog},
1377            {"_fs_epilog_last", fragment_fs, stage_is_epilog | stage_has_export},
1378            {"_vs_first_last", vertex_ngg, stage_has_export},
1379            {"_vs_with_prolog_last", vertex_ngg, stage_has_export | stage_has_prolog},
1380            {"_tes_first_last", tess_eval_ngg, stage_has_export},
1381            {"_ms_first_last", mesh_ngg, stage_has_export},
1382            {"_tesgs_first_last", tess_eval_geometry_ngg, stage_has_export},
1383            {"_vsgs_first_last", vertex_geometry_ngg, stage_has_export},
1384            {"_vsgs_with_prolog_last", vertex_geometry_ngg, stage_has_export | stage_has_prolog},
1385            {"_separate_vs_first", vertex_ngg, stage_separate},
1386            {"_separate_vs_with_prolog", vertex_ngg, stage_separate | stage_has_prolog},
1387            {"_separate_tes_first", tess_eval_ngg, stage_separate},
1388            {"_separate_gs_last", geometry_ngg, stage_separate | stage_has_export}}) {
1389       if (!setup_cs(NULL, GFX11_5, CHIP_UNKNOWN, stage.name))
1390          continue;
1391 
1392       program->stage = stage.stage;
1393       program->info.merged_shader_compiled_separately = stage.flags & stage_separate;
1394       program->info.vs.has_prolog = stage.flags & stage_has_prolog;
1395       program->is_prolog = stage.flags & stage_is_prolog;
1396       program->is_epilog = stage.flags & stage_is_epilog;
1397       //>> /* logical preds: / linear preds: / kind: uniform, top-level, */
1398       //~.*first.*! s_setprio imm:2
1399       if (stage.flags & stage_has_export) {
1400          //~.*last.*! exp v1: undef, v1: undef, v1: undef, v1: undef en:**** pos0
1401          //~.*last.*! s_setprio imm:0
1402          //~.*last.*! s_nop
1403          //~.*last.*! s_nop
1404          //~.*last.*! s_endpgm
1405          bld.exp(aco_opcode::exp, Operand(v1), Operand(v1), Operand(v1), Operand(v1), 0x0,
1406                  V_008DFC_SQ_EXP_POS, false);
1407       } else {
1408          //(?!.*last.*)! v_nop
1409          bld.vop1(aco_opcode::v_nop);
1410       }
1411 
1412       finish_insert_nops_test(stage.flags & stage_has_export);
1413    }
1414 END_TEST
1415 
1416 BEGIN_TEST(insert_nops.export_priority.instrs_after_export)
1417    if (!setup_cs(NULL, GFX11_5))
1418       return;
1419 
1420    program->stage = vertex_ngg;
1421    //>> /* logical preds: / linear preds: / kind: uniform, top-level, */
1422    //! s_setprio imm:2
1423    //! exp v1: undef, v1: undef, v1: undef, v1: undef en:**** pos0
1424    //! s_setprio imm:0
1425    //! s_waitcnt_expcnt %0:null imm:0
1426    //! s_nop
1427    //! s_nop
1428    //! s_setprio imm:2
1429    //! v_nop
1430    //! s_endpgm
1431    bld.exp(aco_opcode::exp, Operand(v1), Operand(v1), Operand(v1), Operand(v1), 0x0,
1432            V_008DFC_SQ_EXP_POS, false);
1433    bld.vop1(aco_opcode::v_nop);
1434 
1435    finish_insert_nops_test();
1436 END_TEST
1437 
1438 BEGIN_TEST(insert_nops.export_priority.fallthrough_to_endpgm)
1439    if (!setup_cs(NULL, GFX11_5))
1440       return;
1441 
1442    program->stage = vertex_ngg;
1443    //>> /* logical preds: / linear preds: / kind: top-level, */
1444    //! s_setprio imm:2
1445    //! exp v1: undef, v1: undef, v1: undef, v1: undef en:**** pos0
1446    //! s_setprio imm:0
1447    //! s_nop
1448    //! s_nop
1449    //>> BB1
1450    //>> /* logical preds: BB0, / linear preds: BB0, / kind: uniform, */
1451    //! s_endpgm
1452    bld.exp(aco_opcode::exp, Operand(v1), Operand(v1), Operand(v1), Operand(v1), 0x0,
1453            V_008DFC_SQ_EXP_POS, false);
1454 
1455    bld.reset(program->create_and_insert_block());
1456    program->blocks[0].linear_succs.push_back(1);
1457    program->blocks[0].logical_succs.push_back(1);
1458    program->blocks[1].linear_preds.push_back(0);
1459    program->blocks[1].logical_preds.push_back(0);
1460 
1461    finish_insert_nops_test();
1462 END_TEST
1463 
1464 BEGIN_TEST(insert_nops.export_priority.multiple_exports)
1465    if (!setup_cs(NULL, GFX11_5))
1466       return;
1467 
1468    program->stage = vertex_ngg;
1469    //>> /* logical preds: / linear preds: / kind: uniform, top-level, */
1470    //! s_setprio imm:2
1471    //! exp v1: undef, v1: undef, v1: undef, v1: undef en:**** pos0
1472    //! exp v1: undef, v1: undef, v1: undef, v1: undef en:**** pos1
1473    //! s_setprio imm:0
1474    //! s_nop
1475    //! s_nop
1476    //! s_endpgm
1477    bld.exp(aco_opcode::exp, Operand(v1), Operand(v1), Operand(v1), Operand(v1), 0x0,
1478            V_008DFC_SQ_EXP_POS, false);
1479    bld.exp(aco_opcode::exp, Operand(v1), Operand(v1), Operand(v1), Operand(v1), 0x0,
1480            V_008DFC_SQ_EXP_POS + 1, false);
1481 
1482    finish_insert_nops_test();
1483 END_TEST
1484 
1485 BEGIN_TEST(insert_nops.export_priority.set_prio)
1486    if (!setup_cs(NULL, GFX11_5))
1487       return;
1488 
1489    program->stage = vertex_ngg;
1490    //>> /* logical preds: / linear preds: / kind: uniform, top-level, */
1491    //! s_setprio imm:3
1492    //! v_nop
1493    //! s_setprio imm:2
1494    //! exp v1: undef, v1: undef, v1: undef, v1: undef en:**** pos0
1495    //! s_setprio imm:0
1496    //! s_nop
1497    //! s_nop
1498    //! s_endpgm
1499    bld.sopp(aco_opcode::s_setprio, 3);
1500    bld.vop1(aco_opcode::v_nop);
1501    bld.sopp(aco_opcode::s_setprio, 1);
1502    bld.exp(aco_opcode::exp, Operand(v1), Operand(v1), Operand(v1), Operand(v1), 0x0,
1503            V_008DFC_SQ_EXP_POS, false);
1504 
1505    finish_insert_nops_test();
1506 END_TEST
1507 
1508 BEGIN_TEST(insert_nops.setpc_gfx6)
1509    if (!setup_cs(NULL, GFX6))
1510       return;
1511 
1512    /* SGPR->SMEM hazards */
1513    //>> p_unit_test 0
1514    //! s1: %0:s[0] = s_mov_b32 0
1515    //! s_nop imm:2
1516    //! s_setpc_b64 0
1517    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0));
1518    bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(0), s1), Operand::zero());
1519    bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
1520 
1521    //! p_unit_test 1
1522    //! s1: %0:s[0] = s_mov_b32 0
1523    //! s_nop imm:2
1524    //! s_setpc_b64 0
1525    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1));
1526    bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(0), s1), Operand::zero());
1527    bld.sopp(aco_opcode::s_nop, 2);
1528    bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
1529 
1530    finish_insert_nops_test();
1531 
1532    /* This hazard can't be tested using s_setpc_b64, because the s_setpc_b64 itself resolves it. */
1533 
1534    /* VINTRP->v_readlane_b32/etc */
1535    //>> p_unit_test 2
1536    //! v1: %0:v[0] = v_interp_mov_f32 2, %0:m0 attr0.x
1537    //! s_nop
1538    create_program(GFX6, compute_cs, 64, CHIP_UNKNOWN);
1539    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2));
1540    bld.vintrp(aco_opcode::v_interp_mov_f32, Definition(PhysReg(256), v1), Operand::c32(2u),
1541               Operand(m0, s1), 0, 0);
1542    finish_insert_nops_test(false);
1543 END_TEST
1544 
1545 BEGIN_TEST(insert_nops.setpc_gfx7)
1546    for (amd_gfx_level gfx : {GFX7, GFX9}) {
1547       if (!setup_cs(NULL, gfx))
1548          continue;
1549 
1550       //>> p_unit_test 0
1551       //! s_setpc_b64 0
1552       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0));
1553       bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
1554 
1555       /* Break up SMEM clauses: resolved by the s_setpc_b64 itself */
1556       //! p_unit_test 1
1557       //! s1: %0:s[0] = s_load_dword %0:s[0-1]
1558       //! s_setpc_b64 0
1559       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1));
1560       bld.smem(aco_opcode::s_load_dword, Definition(PhysReg(0), s1), Operand(PhysReg(0), s2));
1561       bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
1562 
1563       /* SALU and GDS hazards */
1564       //! p_unit_test 2
1565       //! s_setreg_imm32_b32 0x0 imm:14337
1566       //! s_nop
1567       //! s_setpc_b64 0
1568       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2));
1569       bld.sopk(aco_opcode::s_setreg_imm32_b32, Operand::literal32(0), (7 << 11) | 1);
1570       bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
1571 
1572       /* VALU writes vcc -> vccz/v_div_fmas */
1573       //! p_unit_test 3
1574       //! s2: %0:vcc = v_cmp_eq_u32 0, 0
1575       //! s_nop imm:3
1576       //! s_setpc_b64 0
1577       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3));
1578       bld.vopc_e64(aco_opcode::v_cmp_eq_u32, Definition(vcc, s2), Operand::zero(), Operand::zero());
1579       bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
1580 
1581       /* VALU writes exec -> execz/DPP */
1582       //! p_unit_test 4
1583       //! s2: %0:exec = v_cmpx_eq_u32 0, 0
1584       //! s_nop imm:3
1585       //! s_setpc_b64 0
1586       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4));
1587       bld.vopc_e64(aco_opcode::v_cmpx_eq_u32, Definition(exec, s2), Operand::zero(),
1588                    Operand::zero());
1589       bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
1590 
1591       /* VALU->DPP */
1592       //! p_unit_test 5
1593       //! v1: %0:v[0] = v_mov_b32 0
1594       //~gfx9! s_nop
1595       //! s_setpc_b64 0
1596       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5));
1597       bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
1598       bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
1599 
1600       /* VALU->v_readlane_b32/VMEM/etc */
1601       //! p_unit_test 6
1602       //! s1: %0:s[0] = v_readfirstlane_b32 %0:v[0]
1603       //! s_nop imm:3
1604       //! s_setpc_b64 0
1605       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(6));
1606       bld.vop1(aco_opcode::v_readfirstlane_b32, Definition(PhysReg(0), s1),
1607                Operand(PhysReg(256), v1));
1608       bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
1609 
1610       finish_insert_nops_test();
1611 
1612       /* These hazards can't be tested using s_setpc_b64, because the s_setpc_b64 itself resolves
1613        * them. */
1614 
1615       //>> p_unit_test 7
1616       //! buffer_store_dwordx3 %0:s[0-3], %0:v[0], 0, %0:v[0-2] offen
1617       //! s_nop
1618       create_program(gfx, compute_cs, 64, CHIP_UNKNOWN);
1619       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(7));
1620       bld.mubuf(aco_opcode::buffer_store_dwordx3, Operand(PhysReg(0), s4),
1621                 Operand(PhysReg(256), v1), Operand::zero(), Operand(PhysReg(256), v3), 0, true);
1622       finish_insert_nops_test(false);
1623 
1624       //>> p_unit_test 8
1625       //! s1: %0:m0 = s_mov_b32 0
1626       //! s_nop
1627       create_program(gfx, compute_cs, 64, CHIP_UNKNOWN);
1628       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(8));
1629       bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(m0), s1), Operand::zero());
1630       finish_insert_nops_test(false);
1631 
1632       /* Break up SMEM clauses */
1633       //>> p_unit_test 9
1634       //! s1: %0:s[0] = s_load_dword %0:s[0-1]
1635       //! s_nop
1636       create_program(gfx, compute_cs, 64, CHIP_UNKNOWN);
1637       bld.pseudo(aco_opcode::p_unit_test, Operand::c32(9));
1638       bld.smem(aco_opcode::s_load_dword, Definition(PhysReg(0), s1), Operand(PhysReg(0), s2));
1639       finish_insert_nops_test(false);
1640    }
1641 END_TEST
1642 
1643 BEGIN_TEST(insert_nops.setpc_gfx10)
1644    if (!setup_cs(NULL, GFX10))
1645       return;
1646 
1647    //>> p_unit_test 0
1648    //! s_setpc_b64 0
1649    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0));
1650    bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
1651 
1652    /* VcmpxPermlaneHazard */
1653    //! p_unit_test 1
1654    //! s2: %0:exec = v_cmpx_eq_u32 0, 0
1655    //! v1: %0:v[0] = v_mov_b32 %0:v[0]
1656    //! s_setpc_b64 0
1657    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1));
1658    bld.vopc_e64(aco_opcode::v_cmpx_eq_u32, Definition(exec, s2), Operand::zero(), Operand::zero());
1659    bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
1660 
1661    /* VMEMtoScalarWriteHazard */
1662    //! p_unit_test 2
1663    //! v1: %0:v[0] = ds_read_b32 %0:v[0]
1664    //! s_waitcnt_vscnt %0:null imm:0
1665    //! s_waitcnt_depctr vm_vsrc(0)
1666    //! s_setpc_b64 0
1667    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2));
1668    bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(256), v1), Operand(PhysReg(256), v1));
1669    bld.sopk(aco_opcode::s_waitcnt_vscnt, Operand(sgpr_null, s1),
1670             0); /* reset LdsBranchVmemWARHazard */
1671    bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
1672 
1673    /* VcmpxExecWARHazard */
1674    //! p_unit_test 3
1675    //! s1: %0:s[0] = s_mov_b32 %0:exec_hi
1676    //! s_waitcnt_depctr sa_sdst(0)
1677    //! s_setpc_b64 0
1678    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3));
1679    bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg(0), s1), Operand(exec_hi, s1));
1680    bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
1681 
1682    /* LdsBranchVmemWARHazard */
1683    //! p_unit_test 4
1684    //! v1: %0:v[0] = ds_read_b32 %0:v[0]
1685    //! v_nop
1686    //! s_branch block:BB0
1687    //! s_waitcnt_vscnt %0:null imm:0
1688    //! s_setpc_b64 0
1689    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4));
1690    bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(256), v1), Operand(PhysReg(256), v1));
1691    bld.vop1(aco_opcode::v_nop); /* reset VMEMtoScalarWriteHazard */
1692    bld.sopp(aco_opcode::s_branch, 0);
1693    bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
1694 
1695    //! p_unit_test 5
1696    //! v1: %0:v[0] = ds_read_b32 %0:v[0]
1697    //! v_nop
1698    //! s_waitcnt_vscnt %0:null imm:0
1699    //! s_setpc_b64 0
1700    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5));
1701    bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(256), v1), Operand(PhysReg(256), v1));
1702    bld.vop1(aco_opcode::v_nop); /* reset VMEMtoScalarWriteHazard */
1703    bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
1704 
1705    /* waNsaCannotFollowWritelane: resolved by the s_setpc_b64 */
1706    //! p_unit_test 6
1707    //! v1: %0:v[0] = v_writelane_b32_e64 %0:v[1], 0, %0:v[0]
1708    //! s_setpc_b64 0
1709    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(6));
1710    bld.vop3(aco_opcode::v_writelane_b32_e64, Definition(PhysReg(256), v1),
1711             Operand(PhysReg(257), v1), Operand::zero(4), Operand(PhysReg(256), v1));
1712    bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
1713 
1714    finish_insert_nops_test();
1715 
1716    /* These hazards can't be tested using s_setpc_b64, because the s_setpc_b64 itself resolves them.
1717     */
1718 
1719    /* SMEMtoVectorWriteHazard */
1720    //>> p_unit_test 7
1721    //! s1: %0:s[0] = s_load_dword %0:s[0-1]
1722    //! s1: %0:null = s_mov_b32 0
1723    create_program(GFX10, compute_cs, 64, CHIP_UNKNOWN);
1724    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(7));
1725    bld.smem(aco_opcode::s_load_dword, Definition(PhysReg(0), s1), Operand(PhysReg(0), s2));
1726    finish_insert_nops_test(false);
1727 
1728    /* NSAToVMEMBug is already resolved indirectly through VMEMtoScalarWriteHazard and
1729     * LdsBranchVmemWARHazard. */
1730    //>> p_unit_test 8
1731    //! v1: %0:v[0] = image_sample %0:s[0-7], %0:s[0-3],  v1: undef, %0:v[0], %0:v[2], %0:v[4], %0:v[6], %0:v[8], %0:v[10] 2d
1732    //! s_waitcnt_depctr vm_vsrc(0)
1733    //! s_waitcnt_vscnt %0:null imm:0
1734    create_program(GFX10, compute_cs, 64, CHIP_UNKNOWN);
1735    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(8));
1736    create_mimg(true, 6, 4);
1737    finish_insert_nops_test(false);
1738 
1739    /* waNsaCannotFollowWritelane */
1740    //>> p_unit_test 9
1741    //! v1: %0:v[0] = v_writelane_b32_e64 %0:v[1], 0, %0:v[0]
1742    //! s_nop
1743    create_program(GFX10, compute_cs, 64, CHIP_UNKNOWN);
1744    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(9));
1745    bld.vop3(aco_opcode::v_writelane_b32_e64, Definition(PhysReg(256), v1),
1746             Operand(PhysReg(257), v1), Operand::zero(4), Operand(PhysReg(256), v1));
1747    finish_insert_nops_test(false);
1748 END_TEST
1749 
1750 BEGIN_TEST(insert_nops.setpc_gfx11)
1751    if (!setup_cs(NULL, GFX11))
1752       return;
1753 
1754    //>> p_unit_test 0
1755    //! s_setpc_b64 0
1756    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0));
1757    bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
1758 
1759    /* LdsDirectVALUHazard */
1760    //! p_unit_test 1
1761    //! s2: %0:vcc = v_cmp_eq_u32 %0:v[0], 0
1762    //! s_waitcnt_depctr va_vdst(0)
1763    //! s_setpc_b64 0
1764    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1));
1765    bld.vopc_e64(aco_opcode::v_cmp_eq_u32, Definition(vcc, s2), Operand(PhysReg(256), v1),
1766                 Operand::zero());
1767    bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
1768 
1769    /* VALUPartialForwardingHazard */
1770    //! p_unit_test 2
1771    //! v1: %0:v[0] = v_mov_b32 0
1772    //! s_waitcnt_depctr va_vdst(0)
1773    //! s_setpc_b64 0
1774    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2));
1775    bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg(256), v1), Operand::zero());
1776    bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
1777 
1778    /* VcmpxPermlaneHazard */
1779    //! p_unit_test 2
1780    //! s2: %0:exec = v_cmpx_eq_u32 0, 0
1781    //! v_nop
1782    //! s_setpc_b64 0
1783    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2));
1784    bld.vopc_e64(aco_opcode::v_cmpx_eq_u32, Definition(exec, s2), Operand::zero(), Operand::zero());
1785    bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
1786 
1787    /* VALUTransUseHazard */
1788    //! p_unit_test 3
1789    //! v1: %0:v[0] = v_rcp_f32 0
1790    //! s_waitcnt_depctr va_vdst(0)
1791    //! s_setpc_b64 0
1792    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3));
1793    bld.vop1(aco_opcode::v_rcp_f32, Definition(PhysReg(256), v1), Operand::zero());
1794    bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
1795 
1796    /* VALUMaskWriteHazard */
1797    //! p_unit_test 4
1798    //! v1: %0:v[0] = v_cndmask_b32 0, 0, %0:vcc
1799    //! s1: %0:vcc_hi = s_mov_b32 0
1800    //! s_waitcnt_depctr va_vdst(0) sa_sdst(0)
1801    //! v1: %0:v[0] = v_xor3_b32 %0:v[0], %0:s[0], %0:s[0]
1802    //! s_waitcnt_depctr va_vdst(0)
1803    //! s_setpc_b64 0
1804    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4));
1805    bld.vop2(aco_opcode::v_cndmask_b32, Definition(PhysReg(256), v1), Operand::zero(),
1806             Operand::zero(), Operand(vcc, s2));
1807    bld.sop1(aco_opcode::s_mov_b32, Definition(vcc_hi, s1), Operand::c32(0));
1808    bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
1809 
1810    //! p_unit_test 8
1811    //! v1: %0:v[0] = v_cndmask_b32 0, 0, %0:vcc
1812    //! s_waitcnt_depctr va_vdst(0)
1813    //! v1: %0:v[0] = v_xor3_b32 %0:v[0], %0:s[0], %0:s[0]
1814    //! s_waitcnt_depctr va_vdst(0)
1815    //! s_setpc_b64 0
1816    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(8));
1817    bld.vop2(aco_opcode::v_cndmask_b32, Definition(PhysReg(256), v1), Operand::zero(),
1818             Operand::zero(), Operand(vcc, s2));
1819    bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
1820 
1821    //! p_unit_test 5
1822    //! v1: %0:v[0] = v_cndmask_b32 0, 0, %0:vcc
1823    //! s2: %0:vcc = s_mov_b64 0
1824    //! s_waitcnt_depctr va_vdst(0) sa_sdst(0)
1825    //! v1: %0:v[0] = v_xor3_b32 %0:v[0], %0:s[0], %0:s[0]
1826    //! s_waitcnt_depctr va_vdst(0)
1827    //! s_setpc_b64 0
1828    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5));
1829    bld.vop2(aco_opcode::v_cndmask_b32, Definition(PhysReg(256), v1), Operand::zero(),
1830             Operand::zero(), Operand(vcc, s2));
1831    bld.sop1(aco_opcode::s_mov_b64, Definition(vcc, s2), Operand::zero(8));
1832    bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
1833 
1834    /* LdsDirectVMEMHazard */
1835    //! p_unit_test 6
1836    //! v1: %0:v[0] = ds_read_b32 %0:v[0]
1837    //! s_waitcnt_depctr vm_vsrc(0)
1838    //! s_setpc_b64 0
1839    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(6));
1840    bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(256), v1), Operand(PhysReg(256), v1));
1841    bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
1842 
1843    /* WMMA Hazards */
1844    //! p_unit_test 7
1845    //! v4: %0:v[20-23] = v_wmma_f16_16x16x16_f16 %0:v[0-7].xx, %0:v[8-15].xx, %0:v[20-23].xx
1846    //! v_nop
1847    //! s_waitcnt_depctr va_vdst(0)
1848    //! s_setpc_b64 0
1849    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(7));
1850    Operand A(PhysReg(256 + 0), v8);
1851    Operand B(PhysReg(256 + 8), v8);
1852    Operand C(PhysReg(256 + 20), v4);
1853    bld.vop3p(aco_opcode::v_wmma_f16_16x16x16_f16, Definition(C.physReg(), C.regClass()), A, B, C, 0,
1854              0);
1855    bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
1856 
1857    finish_insert_nops_test(true);
1858 END_TEST
1859 
1860 BEGIN_TEST(insert_nops.setpc_gfx12)
1861    if (!setup_cs(NULL, GFX12))
1862       return;
1863 
1864    //>> p_unit_test 0
1865    //! s_setpc_b64 0
1866    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(0));
1867    bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
1868 
1869    /* LdsDirectVALUHazard */
1870    //! p_unit_test 1
1871    //! s2: %0:vcc = v_cmp_eq_u32 %0:v[0], 0
1872    //! s_waitcnt_depctr va_vdst(0)
1873    //! s_setpc_b64 0
1874    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1));
1875    bld.vopc_e64(aco_opcode::v_cmp_eq_u32, Definition(vcc, s2), Operand(PhysReg(256), v1),
1876                 Operand::zero());
1877    bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
1878 
1879    /* VcmpxPermlaneHazard */
1880    //! p_unit_test 2
1881    //! s2: %0:exec = v_cmpx_eq_u32 0, 0
1882    //! v_nop
1883    //! s_setpc_b64 0
1884    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2));
1885    bld.vopc_e64(aco_opcode::v_cmpx_eq_u32, Definition(exec, s2), Operand::zero(), Operand::zero());
1886    bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
1887 
1888    /* LdsDirectVMEMHazard */
1889    //! p_unit_test 3
1890    //! v1: %0:v[0] = ds_read_b32 %0:v[0]
1891    //! s_waitcnt_depctr vm_vsrc(0)
1892    //! s_setpc_b64 0
1893    bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3));
1894    bld.ds(aco_opcode::ds_read_b32, Definition(PhysReg(256), v1), Operand(PhysReg(256), v1));
1895    bld.sop1(aco_opcode::s_setpc_b64, Operand::zero(8));
1896 
1897    finish_insert_nops_test(true);
1898 END_TEST
1899