xref: /aosp_15_r20/external/mesa3d/src/amd/compiler/tests/test_optimizer_postRA.cpp (revision 6104692788411f58d303aa86923a9ff6ecaded22)
1 /*
2  * Copyright © 2021 Valve Corporation
3  *
4  * SPDX-License-Identifier: MIT
5  */
6 
7 #include "helpers.h"
8 
9 using namespace aco;
10 
11 BEGIN_TEST(optimizer_postRA.vcmp)
12    PhysReg reg_v0(256);
13    PhysReg reg_s0(0);
14    PhysReg reg_s2(2);
15    PhysReg reg_s4(4);
16 
17    //>> v1: %a:v[0] = p_startpgm
18    ASSERTED bool setup_ok = setup_cs("v1", GFX8);
19    assert(setup_ok);
20 
21    auto& startpgm = bld.instructions->at(0);
22    assert(startpgm->opcode == aco_opcode::p_startpgm);
23    startpgm->definitions[0].setFixed(reg_v0);
24 
25    Temp v_in = inputs[0];
26 
27    {
28       /* Recognize when the result of VOPC goes to VCC, and use that for the branching then. */
29 
30       //! s2: %b:vcc = v_cmp_eq_u32 0, %a:v[0]
31       //! s2: %e:s[2-3] = p_cbranch_z %b:vcc
32       //! p_unit_test 0, %e:s[2-3]
33       auto vcmp = bld.vopc(aco_opcode::v_cmp_eq_u32, bld.def(bld.lm, vcc), Operand::zero(),
34                            Operand(v_in, reg_v0));
35       auto sand = bld.sop2(Builder::s_and, bld.def(bld.lm, reg_s0), bld.def(s1, scc), bld.vcc(vcmp),
36                            Operand(exec, bld.lm));
37       auto br =
38          bld.branch(aco_opcode::p_cbranch_z, bld.def(s2, reg_s2), bld.scc(sand.def(1).getTemp()));
39       writeout(0, Operand(br, reg_s2));
40    }
41 
42    //; del b, e
43 
44    {
45       /* When VCC is overwritten inbetween, don't optimize. */
46 
47       //! s2: %b:vcc = v_cmp_eq_u32 0, %a:v[0]
48       //! s2: %c:s[0-1], s1: %d:scc = s_and_b64 %b:vcc, %x:exec
49       //! s2: %f:vcc = s_mov_b64 0
50       //! s2: %e:s[2-3] = p_cbranch_z %d:scc
51       //! p_unit_test 1, %e:s[2-3], %f:vcc
52       auto vcmp = bld.vopc(aco_opcode::v_cmp_eq_u32, bld.def(bld.lm, vcc), Operand::zero(),
53                            Operand(v_in, reg_v0));
54       auto sand = bld.sop2(Builder::s_and, bld.def(bld.lm, reg_s0), bld.def(s1, scc), bld.vcc(vcmp),
55                            Operand(exec, bld.lm));
56       auto ovrwr = bld.sop1(Builder::s_mov, bld.def(bld.lm, vcc), Operand::zero());
57       auto br =
58          bld.branch(aco_opcode::p_cbranch_z, bld.def(s2, reg_s2), bld.scc(sand.def(1).getTemp()));
59       writeout(1, Operand(br, reg_s2), Operand(ovrwr, vcc));
60    }
61 
62    //; del b, c, d, e, f
63 
64    {
65       /* When part of VCC is overwritten inbetween, don't optimize. */
66 
67       //! s2: %b:vcc = v_cmp_eq_u32 0, %a:v[0]
68       //! s2: %c:s[0-1], s1: %d:scc = s_and_b64 %b:vcc, %x:exec
69       //! s1: %f:vcc_hi = s_mov_b32 0
70       //! s2: %e:s[2-3] = p_cbranch_z %d:scc
71       //! p_unit_test 1, %e:s[2-3], %f:vcc_hi
72       auto vcmp = bld.vopc(aco_opcode::v_cmp_eq_u32, bld.def(bld.lm, vcc), Operand::zero(),
73                            Operand(v_in, reg_v0));
74       auto sand = bld.sop2(Builder::s_and, bld.def(bld.lm, reg_s0), bld.def(s1, scc), bld.vcc(vcmp),
75                            Operand(exec, bld.lm));
76       auto ovrwr = bld.sop1(aco_opcode::s_mov_b32, bld.def(s1, vcc_hi), Operand::zero());
77       auto br =
78          bld.branch(aco_opcode::p_cbranch_z, bld.def(s2, reg_s2), bld.scc(sand.def(1).getTemp()));
79       writeout(1, Operand(br, reg_s2), Operand(ovrwr, vcc_hi));
80    }
81 
82    //; del b, c, d, e, f
83 
84    {
85       /* When the result of VOPC goes to an SGPR pair other than VCC, don't optimize */
86 
87       //! s2: %b:s[4-5] = v_cmp_eq_u32 0, %a:v[0]
88       //! s2: %c:s[0-1], s1: %d:scc = s_and_b64 %b:s[4-5], %x:exec
89       //! s2: %e:s[2-3] = p_cbranch_z %d:scc
90       //! p_unit_test 2, %e:s[2-3]
91       auto vcmp = bld.vopc_e64(aco_opcode::v_cmp_eq_u32, bld.def(bld.lm, reg_s4), Operand::zero(),
92                                Operand(v_in, reg_v0));
93       auto sand = bld.sop2(Builder::s_and, bld.def(bld.lm, reg_s0), bld.def(s1, scc),
94                            Operand(vcmp, reg_s4), Operand(exec, bld.lm));
95       auto br =
96          bld.branch(aco_opcode::p_cbranch_z, bld.def(s2, reg_s2), bld.scc(sand.def(1).getTemp()));
97       writeout(2, Operand(br, reg_s2));
98    }
99 
100    //; del b, c, d, e
101 
102    {
103       /* When the VCC isn't written by VOPC, don't optimize */
104 
105       //! s2: %b:vcc, s1: %f:scc = s_or_b64 1, %0:s[4-5]
106       //! s2: %c:s[0-1], s1: %d:scc = s_and_b64 %b:vcc, %x:exec
107       //! s2: %e:s[2-3] = p_cbranch_z %d:scc
108       //! p_unit_test 2, %e:s[2-3]
109       auto salu = bld.sop2(Builder::s_or, bld.def(bld.lm, vcc), bld.def(s1, scc), Operand::c32(1u),
110                            Operand(reg_s4, bld.lm));
111       auto sand = bld.sop2(Builder::s_and, bld.def(bld.lm, reg_s0), bld.def(s1, scc),
112                            Operand(salu, vcc), Operand(exec, bld.lm));
113       auto br =
114          bld.branch(aco_opcode::p_cbranch_z, bld.def(s2, reg_s2), bld.scc(sand.def(1).getTemp()));
115       writeout(2, Operand(br, reg_s2));
116    }
117 
118    //; del b, c, d, e, f, x
119 
120    {
121       /* When EXEC is overwritten inbetween, don't optimize. */
122 
123       //! s2: %b:vcc = v_cmp_eq_u32 0, %a:v[0]
124       //! s2: %c:s[0-1], s1: %d:scc = s_and_b64 %b:vcc, %x:exec
125       //! s2: %f:exec = s_mov_b64 42
126       //! s2: %e:s[2-3] = p_cbranch_z %d:scc
127       //! p_unit_test 4, %e:s[2-3], %f:exec
128       auto vcmp = bld.vopc(aco_opcode::v_cmp_eq_u32, bld.def(bld.lm, vcc), Operand::zero(),
129                            Operand(v_in, reg_v0));
130       auto sand = bld.sop2(Builder::s_and, bld.def(bld.lm, reg_s0), bld.def(s1, scc), bld.vcc(vcmp),
131                            Operand(exec, bld.lm));
132       auto ovrwr = bld.sop1(Builder::s_mov, bld.def(bld.lm, exec), Operand::c32(42u));
133       auto br =
134          bld.branch(aco_opcode::p_cbranch_z, bld.def(s2, reg_s2), bld.scc(sand.def(1).getTemp()));
135       writeout(4, Operand(br, reg_s2), Operand(ovrwr, exec));
136    }
137 
138    //; del b, c, d, e, f, x
139 
140    finish_optimizer_postRA_test();
141 END_TEST
142 
143 BEGIN_TEST(optimizer_postRA.scc_nocmp_opt)
144    //>> s1: %a, s2: %y, s1: %z = p_startpgm
145    ASSERTED bool setup_ok = setup_cs("s1 s2 s1", GFX6);
146    assert(setup_ok);
147 
148    PhysReg reg_s0{0};
149    PhysReg reg_s2{2};
150    PhysReg reg_s3{3};
151    PhysReg reg_s4{4};
152    PhysReg reg_s6{6};
153    PhysReg reg_s8{8};
154 
155    Temp in_0 = inputs[0];
156    Temp in_1 = inputs[1];
157    Temp in_2 = inputs[2];
158    Operand op_in_0(in_0);
159    op_in_0.setFixed(reg_s0);
160    Operand op_in_1(in_1);
161    op_in_1.setFixed(reg_s4);
162    Operand op_in_2(in_2);
163    op_in_2.setFixed(reg_s6);
164 
165    {
166       //! s1: %d:s[2], s1: %e:scc = s_bfe_u32 %a:s[0], 0x40018
167       //! s2: %f:vcc = p_cbranch_nz %e:scc
168       //! p_unit_test 0, %f:vcc
169       auto salu = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1, reg_s2), bld.def(s1, scc), op_in_0,
170                            Operand::c32(0x40018u));
171       auto scmp = bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), Operand(salu, reg_s2),
172                            Operand::zero());
173       auto br = bld.branch(aco_opcode::p_cbranch_z, bld.def(s2, vcc), bld.scc(scmp));
174       writeout(0, Operand(br, vcc));
175    }
176 
177    //; del d, e, f
178 
179    {
180       //! s1: %d:s[2], s1: %e:scc = s_bfe_u32 %a:s[0], 0x40018
181       //! s2: %f:vcc = p_cbranch_z %e:scc
182       //! p_unit_test 1, %f:vcc
183       auto salu = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1, reg_s2), bld.def(s1, scc), op_in_0,
184                            Operand::c32(0x40018u));
185       auto scmp = bld.sopc(aco_opcode::s_cmp_lg_u32, bld.def(s1, scc), Operand(salu, reg_s2),
186                            Operand::zero());
187       auto br = bld.branch(aco_opcode::p_cbranch_z, bld.def(s2, vcc), bld.scc(scmp));
188       writeout(1, Operand(br, vcc));
189    }
190 
191    //; del d, e, f
192 
193    {
194       //! s1: %d:s[2], s1: %e:scc = s_bfe_u32 %a:s[0], 0x40018
195       //! s2: %f:vcc = p_cbranch_z %e:scc
196       //! p_unit_test 2, %f:vcc
197       auto salu = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1, reg_s2), bld.def(s1, scc), op_in_0,
198                            Operand::c32(0x40018u));
199       auto scmp = bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), Operand(salu, reg_s2),
200                            Operand::zero());
201       auto br = bld.branch(aco_opcode::p_cbranch_nz, bld.def(s2, vcc), bld.scc(scmp));
202       writeout(2, Operand(br, vcc));
203    }
204 
205    //; del d, e, f
206 
207    {
208       //! s1: %d:s[2], s1: %e:scc = s_bfe_u32 %a:s[0], 0x40018
209       //! s2: %f:vcc = p_cbranch_nz %e:scc
210       //! p_unit_test 3, %f:vcc
211       auto salu = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1, reg_s2), bld.def(s1, scc), op_in_0,
212                            Operand::c32(0x40018u));
213       auto scmp = bld.sopc(aco_opcode::s_cmp_lg_u32, bld.def(s1, scc), Operand(salu, reg_s2),
214                            Operand::zero());
215       auto br = bld.branch(aco_opcode::p_cbranch_nz, bld.def(s2, vcc), bld.scc(scmp));
216       writeout(3, Operand(br, vcc));
217    }
218 
219    //; del d, e, f
220 
221    {
222       //! s2: %d:s[2-3], s1: %e:scc = s_and_b64 %y:s[4-5], 0x12345
223       //! s2: %f:vcc = p_cbranch_z %e:scc
224       //! p_unit_test 4, %f:vcc
225       auto salu = bld.sop2(aco_opcode::s_and_b64, bld.def(s2, reg_s2), bld.def(s1, scc), op_in_1,
226                            Operand::c32(0x12345u));
227       auto scmp = bld.sopc(aco_opcode::s_cmp_eq_u64, bld.def(s1, scc), Operand(salu, reg_s2),
228                            Operand::zero(8));
229       auto br = bld.branch(aco_opcode::p_cbranch_nz, bld.def(s2, vcc), bld.scc(scmp));
230       writeout(4, Operand(br, vcc));
231    }
232 
233    //; del d, e, f
234 
235    {
236       /* SCC is overwritten in between, don't optimize */
237 
238       //! s1: %d:s[2], s1: %e:scc = s_bfe_u32 %a:s[0], 0x40018
239       //! s1: %h:s[3], s1: %x:scc = s_add_u32 %a:s[0], 1
240       //! s1: %g:scc = s_cmp_eq_u32 %d:s[2], 0
241       //! s2: %f:vcc = p_cbranch_z %g:scc
242       //! p_unit_test 5, %f:vcc, %h:s[3]
243       auto salu = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1, reg_s2), bld.def(s1, scc), op_in_0,
244                            Operand::c32(0x40018u));
245       auto ovrw = bld.sop2(aco_opcode::s_add_u32, bld.def(s1, reg_s3), bld.def(s1, scc), op_in_0,
246                            Operand::c32(1u));
247       auto scmp = bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), Operand(salu, reg_s2),
248                            Operand::zero());
249       auto br = bld.branch(aco_opcode::p_cbranch_z, bld.def(s2, vcc), bld.scc(scmp));
250       writeout(5, Operand(br, vcc), Operand(ovrw, reg_s3));
251    }
252 
253    //; del d, e, f, g, h, x
254 
255    {
256       /* SCC is overwritten in between, optimize by pulling down */
257 
258       //! s1: %h:s[3], s1: %x:scc = s_add_u32 %a:s[0], 1
259       //! s1: %d:s[2], s1: %e:scc = s_bfe_u32 %a:s[0], 0x40018
260       //! s2: %f:vcc = p_cbranch_z %g:scc
261       //! p_unit_test 5, %f:vcc, %h:s[3]
262       auto salu = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1, reg_s2), bld.def(s1, scc), op_in_0,
263                            Operand::c32(0x40018u));
264       auto ovrw = bld.sop2(aco_opcode::s_add_u32, bld.def(s1, reg_s3), bld.def(s1, scc), op_in_0,
265                            Operand::c32(1u));
266       auto scmp = bld.sopc(aco_opcode::s_cmp_lg_u32, bld.def(s1, scc), Operand(salu, reg_s2),
267                            Operand::zero());
268       auto br = bld.branch(aco_opcode::p_cbranch_z, bld.def(s2, vcc), bld.scc(scmp));
269       writeout(5, Operand(br, vcc), Operand(ovrw, reg_s3));
270    }
271 
272    //; del d, e, f, g, h, x
273 
274    {
275       /* SCC is overwritten in between, optimize by pulling down */
276 
277       //! s1: %h:s[3], s1: %x:scc = s_add_u32 %a:s[0], 1
278       //! s2: %d:s[8-9], s1: %e:scc = s_and_b64 %b:s[4-5], 0x40018
279       //! s2: %f:vcc = p_cbranch_z %g:scc
280       //! p_unit_test 5, %f:vcc, %h:s[3]
281       auto salu = bld.sop2(aco_opcode::s_and_b64, bld.def(s2, reg_s8), bld.def(s1, scc), op_in_1,
282                            Operand::c32(0x40018u));
283       auto ovrw = bld.sop2(aco_opcode::s_add_u32, bld.def(s1, reg_s3), bld.def(s1, scc), op_in_0,
284                            Operand::c32(1u));
285       auto scmp = bld.sopc(aco_opcode::s_cmp_lg_u64, bld.def(s1, scc), Operand(salu, reg_s8),
286                            Operand::zero());
287       auto br = bld.branch(aco_opcode::p_cbranch_z, bld.def(s2, vcc), bld.scc(scmp));
288       writeout(5, Operand(br, vcc), Operand(ovrw, reg_s3));
289    }
290 
291    //; del d, e, f, g, h, x
292 
293    {
294       //! s1: %d:s[2], s1: %e:scc = s_bfe_u32 %a:s[0], 0x40018
295       //! s1: %f:s[4] = s_cselect_b32 %z:s[6], %a:s[0], %e:scc
296       //! p_unit_test 6, %f:s[4]
297       auto salu = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1, reg_s2), bld.def(s1, scc), op_in_0,
298                            Operand::c32(0x40018u));
299       auto scmp = bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), Operand(salu, reg_s2),
300                            Operand::zero());
301       auto br = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1, reg_s4), Operand(op_in_0),
302                          Operand(op_in_2), bld.scc(scmp));
303       writeout(6, Operand(br, reg_s4));
304    }
305 
306    //; del d, e, f
307 
308    {
309       /* SCC is overwritten in between, don't optimize */
310 
311       //! s1: %d:s[2], s1: %e:scc = s_bfe_u32 %a:s[0], 0x40018
312       //! s1: %h:s[3], s1: %x:scc = s_add_u32 %a:s[0], 1
313       //! s1: %g:scc = s_cmp_eq_u32 %d:s[2], 0
314       //! s1: %f:s[4] = s_cselect_b32 %a:s[0], %z:s[6], %g:scc
315       //! p_unit_test 7, %f:s[4], %h:s[3]
316       auto salu = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1, reg_s2), bld.def(s1, scc), op_in_0,
317                            Operand::c32(0x40018u));
318       auto ovrw = bld.sop2(aco_opcode::s_add_u32, bld.def(s1, reg_s3), bld.def(s1, scc), op_in_0,
319                            Operand::c32(1u));
320       auto scmp = bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), Operand(salu, reg_s2),
321                            Operand::zero());
322       auto br = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1, reg_s4), Operand(op_in_0),
323                          Operand(op_in_2), bld.scc(scmp));
324       writeout(7, Operand(br, reg_s4), Operand(ovrw, reg_s3));
325    }
326 
327    //; del d, e, f, g, h, x
328 
329    finish_optimizer_postRA_test();
330 END_TEST
331 
332 BEGIN_TEST(optimizer_postRA.dpp)
333    //>> v1: %a:v[0], v1: %b:v[1], s2: %c:vcc, s2: %d:s[0-1] = p_startpgm
334    if (!setup_cs("v1 v1 s2 s2", GFX10_3))
335       return;
336 
337    bld.instructions->at(0)->definitions[0].setFixed(PhysReg(256));
338    bld.instructions->at(0)->definitions[1].setFixed(PhysReg(257));
339    bld.instructions->at(0)->definitions[2].setFixed(vcc);
340    bld.instructions->at(0)->definitions[3].setFixed(PhysReg(0));
341 
342    PhysReg reg_v0(256);
343    PhysReg reg_v2(258);
344    Operand a(inputs[0], PhysReg(256));
345    Operand b(inputs[1], PhysReg(257));
346    Operand c(inputs[2], vcc);
347    Operand d(inputs[3], PhysReg(0));
348 
349    /* basic optimization */
350    //! v1: %res0:v[2] = v_add_f32 %a:v[0], %b:v[1] row_mirror bound_ctrl:1 fi
351    //! p_unit_test 0, %res0:v[2]
352    Temp tmp0 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1, reg_v2), a, dpp_row_mirror);
353    Temp res0 = bld.vop2(aco_opcode::v_add_f32, bld.def(v1, reg_v2), Operand(tmp0, reg_v2), b);
354    writeout(0, Operand(res0, reg_v2));
355 
356    /* operand swapping */
357    //! v1: %res1:v[2] = v_subrev_f32 %a:v[0], %b:v[1] row_mirror bound_ctrl:1 fi
358    //! p_unit_test 1, %res1:v[2]
359    Temp tmp1 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1, reg_v2), a, dpp_row_mirror);
360    Temp res1 = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1, reg_v2), b, Operand(tmp1, reg_v2));
361    writeout(1, Operand(res1, reg_v2));
362 
363    //! v1: %tmp2:v[2] = v_mov_b32 %a:v[0] row_mirror bound_ctrl:1 fi
364    //! v1: %res2:v[2] = v_sub_f32 %b:v[1], %tmp2:v[2] row_half_mirror bound_ctrl:1 fi
365    //! p_unit_test 2, %res2:v[2]
366    Temp tmp2 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1, reg_v2), a, dpp_row_mirror);
367    Temp res2 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1, reg_v2), b, Operand(tmp2, reg_v2),
368                             dpp_row_half_mirror);
369    writeout(2, Operand(res2, reg_v2));
370 
371    /* modifiers */
372    //! v1: %res3:v[2] = v_add_f32 -%a:v[0], %b:v[1] row_mirror bound_ctrl:1 fi
373    //! p_unit_test 3, %res3:v[2]
374    auto tmp3 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1, reg_v2), a, dpp_row_mirror);
375    tmp3->dpp16().neg[0] = true;
376    Temp res3 = bld.vop2(aco_opcode::v_add_f32, bld.def(v1, reg_v2), Operand(tmp3, reg_v2), b);
377    writeout(3, Operand(res3, reg_v2));
378 
379    //! v1: %res4:v[2] = v_add_f32 -%a:v[0], %b:v[1] row_mirror bound_ctrl:1 fi
380    //! p_unit_test 4, %res4:v[2]
381    Temp tmp4 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1, reg_v2), a, dpp_row_mirror);
382    auto res4 = bld.vop2_e64(aco_opcode::v_add_f32, bld.def(v1, reg_v2), Operand(tmp4, reg_v2), b);
383    res4->valu().neg[0] = true;
384    writeout(4, Operand(res4, reg_v2));
385 
386    //! v1: %tmp5:v[2] = v_mov_b32 %a:v[0] row_mirror bound_ctrl:1 fi
387    //! v1: %res5:v[2] = v_add_f32 %tmp5:v[2], %b:v[1] clamp
388    //! p_unit_test 5, %res5:v[2]
389    Temp tmp5 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1, reg_v2), a, dpp_row_mirror);
390    auto res5 = bld.vop2_e64(aco_opcode::v_add_f32, bld.def(v1, reg_v2), Operand(tmp5, reg_v2), b);
391    res5->valu().clamp = true;
392    writeout(5, Operand(res5, reg_v2));
393 
394    //! v1: %res6:v[2] = v_add_f32 |%a:v[0]|, %b:v[1] row_mirror bound_ctrl:1 fi
395    //! p_unit_test 6, %res6:v[2]
396    auto tmp6 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1, reg_v2), a, dpp_row_mirror);
397    tmp6->dpp16().neg[0] = true;
398    auto res6 = bld.vop2_e64(aco_opcode::v_add_f32, bld.def(v1, reg_v2), Operand(tmp6, reg_v2), b);
399    res6->valu().abs[0] = true;
400    writeout(6, Operand(res6, reg_v2));
401 
402    //! v1: %res7:v[2] = v_subrev_f32 %a:v[0], |%b:v[1]| row_mirror bound_ctrl:1 fi
403    //! p_unit_test 7, %res7:v[2]
404    Temp tmp7 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1, reg_v2), a, dpp_row_mirror);
405    auto res7 = bld.vop2_e64(aco_opcode::v_sub_f32, bld.def(v1, reg_v2), b, Operand(tmp7, reg_v2));
406    res7->valu().abs[0] = true;
407    writeout(7, Operand(res7, reg_v2));
408 
409    //! v1: %tmp12:v[2] = v_mov_b32 -%a:v[0] row_mirror bound_ctrl:1 fi
410    //! v1: %res12:v[2] = v_add_u32 %tmp12:v[2], %b:v[1]
411    //! p_unit_test 12, %res12:v[2]
412    auto tmp12 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1, reg_v2), a, dpp_row_mirror);
413    tmp12->dpp16().neg[0] = true;
414    Temp res12 = bld.vop2(aco_opcode::v_add_u32, bld.def(v1, reg_v2), Operand(tmp12, reg_v2), b);
415    writeout(12, Operand(res12, reg_v2));
416 
417    //! v1: %tmp13:v[2] = v_mov_b32 -%a:v[0] row_mirror bound_ctrl:1 fi
418    //! v1: %res13:v[2] = v_add_f16 %tmp13:v[2], %b:v[1]
419    //! p_unit_test 13, %res13:v[2]
420    auto tmp13 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1, reg_v2), a, dpp_row_mirror);
421    tmp13->dpp16().neg[0] = true;
422    Temp res13 = bld.vop2(aco_opcode::v_add_f16, bld.def(v1, reg_v2), Operand(tmp13, reg_v2), b);
423    writeout(13, Operand(res13, reg_v2));
424 
425    /* vcc */
426    //! v1: %res8:v[2] = v_cndmask_b32 %a:v[0], %b:v[1], %c:vcc row_mirror bound_ctrl:1 fi
427    //! p_unit_test 8, %res8:v[2]
428    Temp tmp8 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1, reg_v2), a, dpp_row_mirror);
429    Temp res8 =
430       bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1, reg_v2), Operand(tmp8, reg_v2), b, c);
431    writeout(8, Operand(res8, reg_v2));
432 
433    //! v1: %tmp9:v[2] = v_mov_b32 %a:v[0] row_mirror bound_ctrl:1 fi
434    //! v1: %res9:v[2] = v_cndmask_b32 %tmp9:v[2], %b:v[1], %d:s[0-1]
435    //! p_unit_test 9, %res9:v[2]
436    Temp tmp9 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1, reg_v2), a, dpp_row_mirror);
437    Temp res9 =
438       bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1, reg_v2), Operand(tmp9, reg_v2), b, d);
439    writeout(9, Operand(res9, reg_v2));
440 
441    /* control flow */
442    //! BB1
443    //! /* logical preds: BB0, / linear preds: BB0, / kind: uniform, */
444    //! v1: %res10:v[2] = v_add_f32 %a:v[0], %b:v[1] row_mirror bound_ctrl:1 fi
445    //! p_unit_test 10, %res10:v[2]
446    Temp tmp10 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1, reg_v2), a, dpp_row_mirror);
447 
448    bld.reset(program->create_and_insert_block());
449    program->blocks[0].linear_succs.push_back(1);
450    program->blocks[0].logical_succs.push_back(1);
451    program->blocks[1].linear_preds.push_back(0);
452    program->blocks[1].logical_preds.push_back(0);
453 
454    Temp res10 = bld.vop2(aco_opcode::v_add_f32, bld.def(v1, reg_v2), Operand(tmp10, reg_v2), b);
455    writeout(10, Operand(res10, reg_v2));
456 
457    /* can't combine if the v_mov_b32's operand is modified */
458    //! v1: %tmp11_1:v[2] = v_mov_b32 %a:v[0] row_mirror bound_ctrl:1 fi
459    //! v1: %tmp11_2:v[0] = v_mov_b32 0
460    //! v1: %res11:v[2] = v_add_f32 %tmp11_1:v[2], %b:v[1]
461    //! p_unit_test 11, %res11_1:v[2], %tmp11_2:v[0]
462    Temp tmp11_1 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1, reg_v2), a, dpp_row_mirror);
463    Temp tmp11_2 = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1, reg_v0), Operand::c32(0));
464    Temp res11 = bld.vop2(aco_opcode::v_add_f32, bld.def(v1, reg_v2), Operand(tmp11_1, reg_v2), b);
465    writeout(11, Operand(res11, reg_v2), Operand(tmp11_2, reg_v0));
466 
467    finish_optimizer_postRA_test();
468 END_TEST
469 
470 BEGIN_TEST(optimizer_postRA.dpp_across_exec)
471    for (amd_gfx_level gfx : {GFX9, GFX10}) {
472       //>> v1: %a:v[0], v1: %b:v[1] = p_startpgm
473       if (!setup_cs("v1 v1", gfx))
474          continue;
475 
476       bld.instructions->at(0)->definitions[0].setFixed(PhysReg(256));
477       bld.instructions->at(0)->definitions[1].setFixed(PhysReg(257));
478 
479       PhysReg reg_v2(258);
480       Operand a(inputs[0], PhysReg(256));
481       Operand b(inputs[1], PhysReg(257));
482 
483       //~gfx9! v1: %tmp0:v[2] = v_mov_b32 %a:v[0] row_mirror bound_ctrl:1
484       //! s2: %0:exec,  s1: %0:scc = s_not_b64 %0:exec
485       //~gfx9! v1: %res0:v[2] = v_add_f32 %tmp0:v[2], %b:v[1]
486       //~gfx10! v1: %res0:v[2] = v_add_f32 %a:v[0], %b:v[1] row_mirror bound_ctrl:1 fi
487       //! p_unit_test 0, %res0:v[2]
488       Temp tmp0 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1, reg_v2), a, dpp_row_mirror);
489       bld.sop1(Builder::s_not, Definition(exec, bld.lm), Definition(scc, s1),
490                Operand(exec, bld.lm));
491       Temp res0 = bld.vop2(aco_opcode::v_add_f32, bld.def(v1, reg_v2), Operand(tmp0, reg_v2), b);
492       writeout(0, Operand(res0, reg_v2));
493 
494       finish_optimizer_postRA_test();
495    }
496 END_TEST
497 
498 BEGIN_TEST(optimizer_postRA.dpp_vcmpx)
499    //>> v1: %a:v[0], v1: %b:v[1] = p_startpgm
500    if (!setup_cs("v1 v1", GFX11))
501       return;
502 
503    bld.instructions->at(0)->definitions[0].setFixed(PhysReg(256));
504    bld.instructions->at(0)->definitions[1].setFixed(PhysReg(257));
505 
506    PhysReg reg_v2(258);
507    Operand a(inputs[0], PhysReg(256));
508    Operand b(inputs[1], PhysReg(257));
509 
510    //! v1: %tmp0:v[2] = v_mov_b32 %a:v[0] row_mirror bound_ctrl:1 fi
511    //! s2: %res0:exec = v_cmpx_lt_f32 %tmp0:v[2], %b:v[1]
512    //! p_unit_test 0, %res0:exec
513    Temp tmp0 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1, reg_v2), a, dpp_row_mirror);
514    Temp res0 = bld.vopc(aco_opcode::v_cmpx_lt_f32, bld.def(bld.lm, exec), Operand(tmp0, reg_v2), b);
515    writeout(0, Operand(res0, exec));
516 
517    finish_optimizer_postRA_test();
518 END_TEST
519 
520 BEGIN_TEST(optimizer_postRA.dpp_across_cf)
521    //>> v1: %a:v[0], v1: %b:v[1], v1: %c:v[2], v1: %d:v[3], s2: %e:s[0-1], s4: %f:s[4-7] = p_startpgm
522    if (!setup_cs("v1 v1 v1 v1 s2 s4", GFX10_3))
523       return;
524 
525    aco_ptr<Instruction>& startpgm = bld.instructions->at(0);
526    startpgm->definitions[0].setFixed(PhysReg(256));
527    startpgm->definitions[1].setFixed(PhysReg(257));
528    startpgm->definitions[2].setFixed(PhysReg(258));
529    startpgm->definitions[3].setFixed(PhysReg(259));
530    startpgm->definitions[4].setFixed(PhysReg(0));
531    startpgm->definitions[5].setFixed(PhysReg(4));
532 
533    Operand a(inputs[0], PhysReg(256)); /* source for DPP */
534    Operand b(inputs[1], PhysReg(257)); /* source for fadd */
535    Operand c(inputs[2], PhysReg(258)); /* buffer store address */
536    Operand d(inputs[3], PhysReg(259)); /* buffer store value */
537    Operand e(inputs[4], PhysReg(0));   /* condition */
538    Operand f(inputs[5], PhysReg(4));   /* buffer descriptor */
539    PhysReg reg_v12(268);               /* temporary register */
540 
541    Temp dpp_tmp = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1, reg_v12), a, dpp_row_mirror);
542 
543    //! s2: %saved_exec:s[84-85],  s1: %0:scc,  s2: %0:exec = s_and_saveexec_b64 %e:s[0-1], %0:exec
544    //! s2: %0:vcc = p_cbranch_nz BB1, BB2
545 
546    emit_divergent_if_else(
547       program.get(), bld, e,
548       [&]() -> void
__anon4f1ff3420102() 549       {
550          /* --- logical then --- */
551          //! BB1
552          //! /* logical preds: BB0, / linear preds: BB0, / kind: */
553          //! p_logical_start
554 
555          //! buffer_store_dword %f:s[4-7], %c:v[2], 0, %d:v[3] offen
556          bld.mubuf(aco_opcode::buffer_store_dword, f, c, Operand::zero(), d, 0, true);
557 
558          //! v1: %res10:v[12] = v_add_f32 %a:v[0], %b:v[1] row_mirror bound_ctrl:1 fi
559          //! p_unit_test 10, %res10:v[12]
560          Temp result =
561             bld.vop2(aco_opcode::v_add_f32, bld.def(v1, reg_v12), Operand(dpp_tmp, reg_v12), b);
562          writeout(10, Operand(result, reg_v12));
563 
564          //! p_logical_end
565          //! s2: %0:vcc = p_branch BB3
566 
567          /* --- linear then --- */
568          //! BB2
569          //! /* logical preds: / linear preds: BB0, / kind: */
570          //! s2: %0:vcc = p_branch BB3
571 
572          /* --- invert --- */
573          //! BB3
574          //! /* logical preds: / linear preds: BB1, BB2, / kind: invert, */
575          //! s2: %0:exec,  s1: %0:scc = s_andn2_b64 %saved_exec:s[84-85], %0:exec
576          //! s2: %0:vcc = p_cbranch_nz BB4, BB5
577       },
578       [&]() -> void
__anon4f1ff3420202() 579       {
580          /* --- logical else --- */
581          //! BB4
582          //! /* logical preds: BB0, / linear preds: BB3, / kind: */
583          //! p_logical_start
584          //! p_logical_end
585          //! s2: %0:vcc = p_branch BB6
586 
587          /* --- linear else --- */
588          //! BB5
589          //! /* logical preds: / linear preds: BB3, / kind: */
590          //! s2: %0:vcc = p_branch BB6
591       });
592 
593    /* --- merge block --- */
594    //! BB6
595    //! /* logical preds: BB1, BB4, / linear preds: BB4, BB5, / kind: uniform, top-level, merge, */
596    //! s2: %0:exec = p_parallelcopy %saved_exec:s[84-85]
597 
598    finish_optimizer_postRA_test();
599 END_TEST
600 
601 BEGIN_TEST(optimizer_postRA.dpp_across_cf_overwritten)
602    //>> v1: %a:v[0], v1: %b:v[1], s4: %c:s[4-7], v1: %d:v[3], s2: %e:s[0-1], s1: %f:s[2] = p_startpgm
603    if (!setup_cs("v1 v1 s4 v1 s2 s1", GFX10_3))
604       return;
605 
606    aco_ptr<Instruction>& startpgm = bld.instructions->at(0);
607    startpgm->definitions[0].setFixed(PhysReg(256));
608    startpgm->definitions[1].setFixed(PhysReg(257));
609    startpgm->definitions[2].setFixed(PhysReg(4));
610    startpgm->definitions[3].setFixed(PhysReg(259));
611    startpgm->definitions[4].setFixed(PhysReg(0));
612    startpgm->definitions[5].setFixed(PhysReg(2));
613 
614    Operand a(inputs[0], PhysReg(256)); /* source for DPP */
615    Operand b(inputs[1], PhysReg(257)); /* source for fadd */
616    Operand c(inputs[2], PhysReg(4));   /* buffer descriptor */
617    Operand d(inputs[3], PhysReg(259)); /* buffer store value */
618    Operand e(inputs[4], PhysReg(0));   /* condition */
619    Operand f(inputs[5], PhysReg(2));   /* buffer store address (scalar) */
620    PhysReg reg_v12(268);               /* temporary register */
621 
622    //! v1: %dpp_tmp:v[12] = v_mov_b32 %a:v[0] row_mirror bound_ctrl:1 fi
623    Temp dpp_tmp = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1, reg_v12), a, dpp_row_mirror);
624 
625    //! s2: %saved_exec:s[84-85],  s1: %0:scc,  s2: %0:exec = s_and_saveexec_b64 %e:s[0-1], %0:exec
626    //! s2: %0:vcc = p_cbranch_nz BB1, BB2
627 
628    emit_divergent_if_else(
629       program.get(), bld, e,
630       [&]() -> void
__anon4f1ff3420302() 631       {
632          /* --- logical then --- */
633          //! BB1
634          //! /* logical preds: BB0, / linear preds: BB0, / kind: */
635          //! p_logical_start
636 
637          //! v1: %addr:v[0] = p_parallelcopy %f:s[2]
638          Temp addr = bld.pseudo(aco_opcode::p_parallelcopy, bld.def(v1, a.physReg()), f);
639 
640          //! buffer_store_dword %c:s[4-7], %addr:v[0], 0, %d:v[3] offen
641          bld.mubuf(aco_opcode::buffer_store_dword, c, Operand(addr, a.physReg()), Operand::zero(),
642                    d, 0, true);
643 
644          //! p_logical_end
645          //! s2: %0:vcc = p_branch BB3
646 
647          /* --- linear then --- */
648          //! BB2
649          //! /* logical preds: / linear preds: BB0, / kind: */
650          //! s2: %0:vcc = p_branch BB3
651 
652          /* --- invert --- */
653          //! BB3
654          //! /* logical preds: / linear preds: BB1, BB2, / kind: invert, */
655          //! s2: %0:exec,  s1: %0:scc = s_andn2_b64 %saved_exec:s[84-85], %0:exec
656          //! s2: %0:vcc = p_cbranch_nz BB4, BB5
657       },
658       [&]() -> void
__anon4f1ff3420402() 659       {
660          /* --- logical else --- */
661          //! BB4
662          //! /* logical preds: BB0, / linear preds: BB3, / kind: */
663          //! p_logical_start
664          //! p_logical_end
665          //! s2: %0:vcc = p_branch BB6
666 
667          /* --- linear else --- */
668          //! BB5
669          //! /* logical preds: / linear preds: BB3, / kind: */
670          //! s2: %0:vcc = p_branch BB6
671       });
672 
673    /* --- merge block --- */
674    //! BB6
675    //! /* logical preds: BB1, BB4, / linear preds: BB4, BB5, / kind: uniform, top-level, merge, */
676    //! s2: %0:exec = p_parallelcopy %saved_exec:s[84-85]
677 
678    //! v1: %result:v[12] = v_add_f32 %dpp_mov_tmp:v[12], %b:v[1]
679    Temp result =
680       bld.vop2(aco_opcode::v_add_f32, bld.def(v1, reg_v12), Operand(dpp_tmp, reg_v12), b);
681    //! p_unit_test 10, %result:v[12]
682    writeout(10, Operand(result, reg_v12));
683 
684    finish_optimizer_postRA_test();
685 END_TEST
686 
687 BEGIN_TEST(optimizer_postRA.dpp_across_cf_linear_clobber)
688    //>> v1: %a:v[0], v1: %b:v[1], s2: %c:s[0-1] = p_startpgm
689    if (!setup_cs("v1 v1 s2", GFX10_3))
690       return;
691 
692    aco_ptr<Instruction>& startpgm = bld.instructions->at(0);
693    startpgm->definitions[0].setFixed(PhysReg(256));
694    startpgm->definitions[1].setFixed(PhysReg(257));
695    startpgm->definitions[2].setFixed(PhysReg(0));
696 
697    Operand a(inputs[0], PhysReg(256)); /* source for DPP */
698    Operand b(inputs[1], PhysReg(257)); /* source for fadd */
699    Operand c(inputs[2], PhysReg(0));   /* condition */
700    PhysReg reg_v12(268);               /* temporary register */
701 
702    //! v1: %dpp_tmp:v[12] = v_mov_b32 %a:v[0] row_mirror bound_ctrl:1 fi
703    Temp dpp_tmp = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1, reg_v12), a, dpp_row_mirror);
704 
705    //! s2: %saved_exec:s[84-85],  s1: %0:scc,  s2: %0:exec = s_and_saveexec_b64 %c:s[0-1], %0:exec
706    //! s2: %0:vcc = p_cbranch_nz BB1, BB2
707 
708    emit_divergent_if_else(
709       program.get(), bld, c,
710       [&]() -> void
__anon4f1ff3420502() 711       {
712          /* --- logical then --- */
713          //! BB1
714          //! /* logical preds: BB0, / linear preds: BB0, / kind: */
715          //! p_logical_start
716 
717          //! v1: %clobber:v[0] = p_parallelcopy 0
718          Temp clobber =
719             bld.pseudo(aco_opcode::p_parallelcopy, bld.def(v1, a.physReg()), Operand::c32(0));
720 
721          //! p_unit_test 0, %clobber:v[0]
722          writeout(0, Operand(clobber, a.physReg()));
723 
724          //! p_logical_end
725          //! s2: %0:vcc = p_branch BB3
726 
727          /* --- linear then --- */
728          //! BB2
729          //! /* logical preds: / linear preds: BB0, / kind: */
730          //! s2: %0:vcc = p_branch BB3
731 
732          /* --- invert --- */
733          //! BB3
734          //! /* logical preds: / linear preds: BB1, BB2, / kind: invert, */
735          //! s2: %0:exec,  s1: %0:scc = s_andn2_b64 %saved_exec:s[84-85], %0:exec
736          //! s2: %0:vcc = p_cbranch_nz BB4, BB5
737       },
738       [&]() -> void
__anon4f1ff3420602() 739       {
740          /* --- logical else --- */
741          //! BB4
742          //! /* logical preds: BB0, / linear preds: BB3, / kind: */
743          //! p_logical_start
744 
745          //! v1: %result:v[12] = v_add_f32 %dpp_mov_tmp:v[12], %b:v[1]
746          Temp result =
747             bld.vop2(aco_opcode::v_add_f32, bld.def(v1, reg_v12), Operand(dpp_tmp, reg_v12), b);
748          //! p_unit_test 1, %result:v[12]
749          writeout(1, Operand(result, reg_v12));
750 
751          //! p_logical_end
752          //! s2: %0:vcc = p_branch BB6
753 
754          /* --- linear else --- */
755          //! BB5
756          //! /* logical preds: / linear preds: BB3, / kind: */
757          //! s2: %0:vcc = p_branch BB6
758       });
759 
760    /* --- merge block --- */
761    //! BB6
762    //! /* logical preds: BB1, BB4, / linear preds: BB4, BB5, / kind: uniform, top-level, merge, */
763    //! s2: %0:exec = p_parallelcopy %saved_exec:s[84-85]
764 
765    finish_optimizer_postRA_test();
766 END_TEST
767 
/* Checks the expected post-RA optimizer output for an s_and_b64 whose result is
 * compared against zero (s_cmp_lg_u64) only after a divergent if/else.
 *
 * The test emits the s_and_b64 before the branch and the s_cmp_lg_u64 plus
 * p_cbranch_z in the merge block. The check lines contain no s_cmp_lg_u64;
 * instead an s_and_b64 is expected in the merge block and its SCC definition
 * feeds the p_cbranch_z.
 */
768 BEGIN_TEST(optimizer_postRA.scc_nocmp_across_cf)
769    //>> s2: %a:s[2-3], v1: %c:v[2], v1: %d:v[3], s2: %e:s[0-1], s4: %f:s[4-7] = p_startpgm
       /* Nothing is emitted when setup_cs() reports failure. */
770    if (!setup_cs("s2 v1 v1 s2 s4", GFX10_3))
771       return;
772 
       /* Pin the five program inputs to the registers named in the check line above. */
773    aco_ptr<Instruction>& startpgm = bld.instructions->at(0);
774    startpgm->definitions[0].setFixed(PhysReg(2));
775    startpgm->definitions[1].setFixed(PhysReg(258));
776    startpgm->definitions[2].setFixed(PhysReg(259));
777    startpgm->definitions[3].setFixed(PhysReg(0));
778    startpgm->definitions[4].setFixed(PhysReg(4));
779 
780    Operand a(inputs[0], PhysReg(2));   /* source for s_and */
781    Operand c(inputs[1], PhysReg(258)); /* buffer store address */
782    Operand d(inputs[2], PhysReg(259)); /* buffer store value */
783    Operand e(inputs[3], PhysReg(0));   /* condition */
784    Operand f(inputs[4], PhysReg(4));   /* buffer descriptor */
785    PhysReg reg_s8(8);                  /* temporary register */
786 
       /* Emitted ahead of the branch. No check line is attached here; the check
        * lines mention this s_and_b64 only in the merge block.
        */
787    auto tmp_salu = bld.sop2(aco_opcode::s_and_b64, bld.def(s2, reg_s8), bld.def(s1, scc), a,
788                             Operand::c32(0x40018u));
789 
790    //! s2: %saved_exec:s[84-85],  s1: %0:scc,  s2: %0:exec = s_and_saveexec_b64 %e:s[0-1], %0:exec
791    //! s2: %0:vcc = p_cbranch_nz BB1, BB2
792 
793    emit_divergent_if_else(
794       program.get(), bld, e,
795       [&]() -> void
__anon4f1ff3420702() 796       {
797          /* --- logical then --- */
798          //! BB1
799          //! /* logical preds: BB0, / linear preds: BB0, / kind: */
800          //! p_logical_start
801 
             /* The then-block contains only this buffer store. */
802          //! buffer_store_dword %f:s[4-7], %c:v[2], 0, %d:v[3] offen
803          bld.mubuf(aco_opcode::buffer_store_dword, f, c, Operand::zero(), d, 0, true);
804 
805          //! p_logical_end
806          //! s2: %0:vcc = p_branch BB3
807 
808          /* --- linear then --- */
809          //! BB2
810          //! /* logical preds: / linear preds: BB0, / kind: */
811          //! s2: %0:vcc = p_branch BB3
812 
813          /* --- invert --- */
814          //! BB3
815          //! /* logical preds: / linear preds: BB1, BB2, / kind: invert, */
816          //! s2: %0:exec,  s1: %0:scc = s_andn2_b64 %saved_exec:s[84-85], %0:exec
817          //! s2: %0:vcc = p_cbranch_nz BB4, BB5
818       },
819       [&]() -> void
__anon4f1ff3420802() 820       {
821          /* --- logical else --- */
822          //! BB4
823          //! /* logical preds: BB0, / linear preds: BB3, / kind: */
824          //! p_logical_start
825          //! p_logical_end
826          //! s2: %0:vcc = p_branch BB6
827 
828          /* --- linear else --- */
829          //! BB5
830          //! /* logical preds: / linear preds: BB3, / kind: */
831          //! s2: %0:vcc = p_branch BB6
832       });
833 
834    /* --- merge block --- */
835    //! BB6
836    //! /* logical preds: BB1, BB4, / linear preds: BB4, BB5, / kind: uniform, top-level, merge, */
837    //! s2: %0:exec = p_parallelcopy %saved_exec:s[84-85]
838 
       /* The code below emits s_cmp_lg_u64 + p_cbranch_z, while the check lines
        * expect s_and_b64 + p_cbranch_z.
        * NOTE(review): presumably the s_and_b64 is re-created here because SCC is
        * redefined between the two points (s_and_saveexec_b64 and s_andn2_b64 both
        * list an scc definition above) - confirm against the optimizer.
        */
839    //! s2: %tmp_salu:s[8-9], s1: %br_scc:scc = s_and_b64 %a:s[2-3], 0x40018
840    //! s2: %br_vcc:vcc = p_cbranch_z %br_scc:scc
841    //! p_unit_test 5, %br_vcc:vcc
842    auto scmp = bld.sopc(aco_opcode::s_cmp_lg_u64, bld.def(s1, scc), Operand(tmp_salu, reg_s8),
843                         Operand::zero());
844    auto br = bld.branch(aco_opcode::p_cbranch_z, bld.def(s2, vcc), bld.scc(scmp));
845    writeout(5, Operand(br, vcc));
846 
847    finish_optimizer_postRA_test();
848 END_TEST
849 
/* Checks the expected post-RA optimizer output for an s_and_b64 followed, after
 * a divergent if/else, by an s_cmp_lg_u64 against zero, when one register of the
 * s_and_b64 source is overwritten inside the then-block.
 *
 * `a` is fixed to s[2-3]; the then-block copies `f` into s[3]. The check lines
 * expect the s_and_b64 to stay before the branch and the s_cmp_lg_u64 to stay
 * in the merge block, i.e. the compare is kept as emitted.
 */
850 BEGIN_TEST(optimizer_postRA.scc_nocmp_across_cf_partially_overwritten)
851    //>> s2: %a:s[2-3], v1: %c:v[2], v1: %d:v[3], s2: %e:s[0-1], s1: %f:s[4], s4: %g:s[8-11] = p_startpgm
       /* Nothing is emitted when setup_cs() reports failure. */
852    if (!setup_cs("s2 v1 v1 s2 s1 s4", GFX10_3))
853       return;
854 
       /* Pin the six program inputs to the registers named in the check line above. */
855    aco_ptr<Instruction>& startpgm = bld.instructions->at(0);
856    startpgm->definitions[0].setFixed(PhysReg(2));
857    startpgm->definitions[1].setFixed(PhysReg(258));
858    startpgm->definitions[2].setFixed(PhysReg(259));
859    startpgm->definitions[3].setFixed(PhysReg(0));
860    startpgm->definitions[4].setFixed(PhysReg(4));
861    startpgm->definitions[5].setFixed(PhysReg(8));
862 
863    Operand a(inputs[0], PhysReg(2));   /* source for s_and */
864    Operand c(inputs[1], PhysReg(258)); /* buffer store address */
865    Operand d(inputs[2], PhysReg(259)); /* buffer store value */
866    Operand e(inputs[3], PhysReg(0));   /* condition */
867    Operand f(inputs[4], PhysReg(4));   /* overwrite value */
868    Operand g(inputs[5], PhysReg(8));   /* buffer descriptor */
869    PhysReg reg_s3(3);                  /* written in the then-block; second register of a's s[2-3] */
870    PhysReg reg_s8(8);                  /* destination of the s_and */
       /* NOTE(review): s[8-9] overlaps the s[8-11] range assigned to g - confirm
        * this is intended.
        */
871 
872    //! s2: %tmp_salu:s[8-9], s1: %tmp_salu_scc:scc = s_and_b64 %a:s[2-3], 0x40018
873    auto tmp_salu = bld.sop2(aco_opcode::s_and_b64, bld.def(s2, reg_s8), bld.def(s1, scc), a,
874                             Operand::c32(0x40018u));
875 
876    //! s2: %saved_exec:s[84-85],  s1: %0:scc,  s2: %0:exec = s_and_saveexec_b64 %e:s[0-1], %0:exec
877    //! s2: %0:vcc = p_cbranch_nz BB1, BB2
878 
879    emit_divergent_if_else(
880       program.get(), bld, e,
881       [&]() -> void
__anon4f1ff3420902() 882       {
883          /* --- logical then --- */
884          //! BB1
885          //! /* logical preds: BB0, / linear preds: BB0, / kind: */
886          //! p_logical_start
887 
             /* Write s[3], which is part of the s[2-3] pair read by the s_and_b64. */
888          //! s1: %ovrwr:s[3] = p_parallelcopy %f:s[4]
889          Temp s_addr = bld.pseudo(aco_opcode::p_parallelcopy, bld.def(s1, reg_s3), f);
890 
             /* Use the copied value as an operand of the buffer store. */
891          //! buffer_store_dword %g:s[8-11], %c:v[2], %ovrwr:s[3], %d:v[3] offen
892          bld.mubuf(aco_opcode::buffer_store_dword, g, c, Operand(s_addr, reg_s3), d, 0, true);
893 
894          //! p_logical_end
895          //! s2: %0:vcc = p_branch BB3
896 
897          /* --- linear then --- */
898          //! BB2
899          //! /* logical preds: / linear preds: BB0, / kind: */
900          //! s2: %0:vcc = p_branch BB3
901 
902          /* --- invert --- */
903          //! BB3
904          //! /* logical preds: / linear preds: BB1, BB2, / kind: invert, */
905          //! s2: %0:exec,  s1: %0:scc = s_andn2_b64 %saved_exec:s[84-85], %0:exec
906          //! s2: %0:vcc = p_cbranch_nz BB4, BB5
907       },
908       [&]() -> void
__anon4f1ff3420a02() 909       {
910          /* --- logical else --- */
911          //! BB4
912          //! /* logical preds: BB0, / linear preds: BB3, / kind: */
913          //! p_logical_start
914          //! p_logical_end
915          //! s2: %0:vcc = p_branch BB6
916 
917          /* --- linear else --- */
918          //! BB5
919          //! /* logical preds: / linear preds: BB3, / kind: */
920          //! s2: %0:vcc = p_branch BB6
921       });
922 
923    /* --- merge block --- */
924    //! BB6
925    //! /* logical preds: BB1, BB4, / linear preds: BB4, BB5, / kind: uniform, top-level, merge, */
926    //! s2: %0:exec = p_parallelcopy %saved_exec:s[84-85]
927 
       /* The compare is expected to appear in the output exactly as emitted here. */
928    //! s1: %br_scc:scc = s_cmp_lg_u64 %tmp_salu:s[8-9], 0
929    //! s2: %br_vcc:vcc = p_cbranch_z %br_scc:scc
930    //! p_unit_test 5, %br_vcc:vcc
931    auto scmp = bld.sopc(aco_opcode::s_cmp_lg_u64, bld.def(s1, scc), Operand(tmp_salu, reg_s8),
932                         Operand::zero());
933    auto br = bld.branch(aco_opcode::p_cbranch_z, bld.def(s2, vcc), bld.scc(scmp));
934    writeout(5, Operand(br, vcc));
935 
936    finish_optimizer_postRA_test();
937 END_TEST
938