1 /* 2 * Copyright © 2021 Valve Corporation 3 * 4 * SPDX-License-Identifier: MIT 5 */ 6 7 #include "helpers.h" 8 9 using namespace aco; 10 11 BEGIN_TEST(optimizer_postRA.vcmp) 12 PhysReg reg_v0(256); 13 PhysReg reg_s0(0); 14 PhysReg reg_s2(2); 15 PhysReg reg_s4(4); 16 17 //>> v1: %a:v[0] = p_startpgm 18 ASSERTED bool setup_ok = setup_cs("v1", GFX8); 19 assert(setup_ok); 20 21 auto& startpgm = bld.instructions->at(0); 22 assert(startpgm->opcode == aco_opcode::p_startpgm); 23 startpgm->definitions[0].setFixed(reg_v0); 24 25 Temp v_in = inputs[0]; 26 27 { 28 /* Recognize when the result of VOPC goes to VCC, and use that for the branching then. */ 29 30 //! s2: %b:vcc = v_cmp_eq_u32 0, %a:v[0] 31 //! s2: %e:s[2-3] = p_cbranch_z %b:vcc 32 //! p_unit_test 0, %e:s[2-3] 33 auto vcmp = bld.vopc(aco_opcode::v_cmp_eq_u32, bld.def(bld.lm, vcc), Operand::zero(), 34 Operand(v_in, reg_v0)); 35 auto sand = bld.sop2(Builder::s_and, bld.def(bld.lm, reg_s0), bld.def(s1, scc), bld.vcc(vcmp), 36 Operand(exec, bld.lm)); 37 auto br = 38 bld.branch(aco_opcode::p_cbranch_z, bld.def(s2, reg_s2), bld.scc(sand.def(1).getTemp())); 39 writeout(0, Operand(br, reg_s2)); 40 } 41 42 //; del b, e 43 44 { 45 /* When VCC is overwritten inbetween, don't optimize. */ 46 47 //! s2: %b:vcc = v_cmp_eq_u32 0, %a:v[0] 48 //! s2: %c:s[0-1], s1: %d:scc = s_and_b64 %b:vcc, %x:exec 49 //! s2: %f:vcc = s_mov_b64 0 50 //! s2: %e:s[2-3] = p_cbranch_z %d:scc 51 //! 
p_unit_test 1, %e:s[2-3], %f:vcc 52 auto vcmp = bld.vopc(aco_opcode::v_cmp_eq_u32, bld.def(bld.lm, vcc), Operand::zero(), 53 Operand(v_in, reg_v0)); 54 auto sand = bld.sop2(Builder::s_and, bld.def(bld.lm, reg_s0), bld.def(s1, scc), bld.vcc(vcmp), 55 Operand(exec, bld.lm)); 56 auto ovrwr = bld.sop1(Builder::s_mov, bld.def(bld.lm, vcc), Operand::zero()); 57 auto br = 58 bld.branch(aco_opcode::p_cbranch_z, bld.def(s2, reg_s2), bld.scc(sand.def(1).getTemp())); 59 writeout(1, Operand(br, reg_s2), Operand(ovrwr, vcc)); 60 } 61 62 //; del b, c, d, e, f 63 64 { 65 /* When part of VCC is overwritten inbetween, don't optimize. */ 66 67 //! s2: %b:vcc = v_cmp_eq_u32 0, %a:v[0] 68 //! s2: %c:s[0-1], s1: %d:scc = s_and_b64 %b:vcc, %x:exec 69 //! s1: %f:vcc_hi = s_mov_b32 0 70 //! s2: %e:s[2-3] = p_cbranch_z %d:scc 71 //! p_unit_test 1, %e:s[2-3], %f:vcc_hi 72 auto vcmp = bld.vopc(aco_opcode::v_cmp_eq_u32, bld.def(bld.lm, vcc), Operand::zero(), 73 Operand(v_in, reg_v0)); 74 auto sand = bld.sop2(Builder::s_and, bld.def(bld.lm, reg_s0), bld.def(s1, scc), bld.vcc(vcmp), 75 Operand(exec, bld.lm)); 76 auto ovrwr = bld.sop1(aco_opcode::s_mov_b32, bld.def(s1, vcc_hi), Operand::zero()); 77 auto br = 78 bld.branch(aco_opcode::p_cbranch_z, bld.def(s2, reg_s2), bld.scc(sand.def(1).getTemp())); 79 writeout(1, Operand(br, reg_s2), Operand(ovrwr, vcc_hi)); 80 } 81 82 //; del b, c, d, e, f 83 84 { 85 /* When the result of VOPC goes to an SGPR pair other than VCC, don't optimize */ 86 87 //! s2: %b:s[4-5] = v_cmp_eq_u32 0, %a:v[0] 88 //! s2: %c:s[0-1], s1: %d:scc = s_and_b64 %b:s[4-5], %x:exec 89 //! s2: %e:s[2-3] = p_cbranch_z %d:scc 90 //! 
p_unit_test 2, %e:s[2-3] 91 auto vcmp = bld.vopc_e64(aco_opcode::v_cmp_eq_u32, bld.def(bld.lm, reg_s4), Operand::zero(), 92 Operand(v_in, reg_v0)); 93 auto sand = bld.sop2(Builder::s_and, bld.def(bld.lm, reg_s0), bld.def(s1, scc), 94 Operand(vcmp, reg_s4), Operand(exec, bld.lm)); 95 auto br = 96 bld.branch(aco_opcode::p_cbranch_z, bld.def(s2, reg_s2), bld.scc(sand.def(1).getTemp())); 97 writeout(2, Operand(br, reg_s2)); 98 } 99 100 //; del b, c, d, e 101 102 { 103 /* When the VCC isn't written by VOPC, don't optimize */ 104 105 //! s2: %b:vcc, s1: %f:scc = s_or_b64 1, %0:s[4-5] 106 //! s2: %c:s[0-1], s1: %d:scc = s_and_b64 %b:vcc, %x:exec 107 //! s2: %e:s[2-3] = p_cbranch_z %d:scc 108 //! p_unit_test 2, %e:s[2-3] 109 auto salu = bld.sop2(Builder::s_or, bld.def(bld.lm, vcc), bld.def(s1, scc), Operand::c32(1u), 110 Operand(reg_s4, bld.lm)); 111 auto sand = bld.sop2(Builder::s_and, bld.def(bld.lm, reg_s0), bld.def(s1, scc), 112 Operand(salu, vcc), Operand(exec, bld.lm)); 113 auto br = 114 bld.branch(aco_opcode::p_cbranch_z, bld.def(s2, reg_s2), bld.scc(sand.def(1).getTemp())); 115 writeout(2, Operand(br, reg_s2)); 116 } 117 118 //; del b, c, d, e, f, x 119 120 { 121 /* When EXEC is overwritten inbetween, don't optimize. */ 122 123 //! s2: %b:vcc = v_cmp_eq_u32 0, %a:v[0] 124 //! s2: %c:s[0-1], s1: %d:scc = s_and_b64 %b:vcc, %x:exec 125 //! s2: %f:exec = s_mov_b64 42 126 //! s2: %e:s[2-3] = p_cbranch_z %d:scc 127 //! 
p_unit_test 4, %e:s[2-3], %f:exec 128 auto vcmp = bld.vopc(aco_opcode::v_cmp_eq_u32, bld.def(bld.lm, vcc), Operand::zero(), 129 Operand(v_in, reg_v0)); 130 auto sand = bld.sop2(Builder::s_and, bld.def(bld.lm, reg_s0), bld.def(s1, scc), bld.vcc(vcmp), 131 Operand(exec, bld.lm)); 132 auto ovrwr = bld.sop1(Builder::s_mov, bld.def(bld.lm, exec), Operand::c32(42u)); 133 auto br = 134 bld.branch(aco_opcode::p_cbranch_z, bld.def(s2, reg_s2), bld.scc(sand.def(1).getTemp())); 135 writeout(4, Operand(br, reg_s2), Operand(ovrwr, exec)); 136 } 137 138 //; del b, c, d, e, f, x 139 140 finish_optimizer_postRA_test(); 141 END_TEST 142 143 BEGIN_TEST(optimizer_postRA.scc_nocmp_opt) 144 //>> s1: %a, s2: %y, s1: %z = p_startpgm 145 ASSERTED bool setup_ok = setup_cs("s1 s2 s1", GFX6); 146 assert(setup_ok); 147 148 PhysReg reg_s0{0}; 149 PhysReg reg_s2{2}; 150 PhysReg reg_s3{3}; 151 PhysReg reg_s4{4}; 152 PhysReg reg_s6{6}; 153 PhysReg reg_s8{8}; 154 155 Temp in_0 = inputs[0]; 156 Temp in_1 = inputs[1]; 157 Temp in_2 = inputs[2]; 158 Operand op_in_0(in_0); 159 op_in_0.setFixed(reg_s0); 160 Operand op_in_1(in_1); 161 op_in_1.setFixed(reg_s4); 162 Operand op_in_2(in_2); 163 op_in_2.setFixed(reg_s6); 164 165 { 166 //! s1: %d:s[2], s1: %e:scc = s_bfe_u32 %a:s[0], 0x40018 167 //! s2: %f:vcc = p_cbranch_nz %e:scc 168 //! p_unit_test 0, %f:vcc 169 auto salu = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1, reg_s2), bld.def(s1, scc), op_in_0, 170 Operand::c32(0x40018u)); 171 auto scmp = bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), Operand(salu, reg_s2), 172 Operand::zero()); 173 auto br = bld.branch(aco_opcode::p_cbranch_z, bld.def(s2, vcc), bld.scc(scmp)); 174 writeout(0, Operand(br, vcc)); 175 } 176 177 //; del d, e, f 178 179 { 180 //! s1: %d:s[2], s1: %e:scc = s_bfe_u32 %a:s[0], 0x40018 181 //! s2: %f:vcc = p_cbranch_z %e:scc 182 //! 
p_unit_test 1, %f:vcc 183 auto salu = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1, reg_s2), bld.def(s1, scc), op_in_0, 184 Operand::c32(0x40018u)); 185 auto scmp = bld.sopc(aco_opcode::s_cmp_lg_u32, bld.def(s1, scc), Operand(salu, reg_s2), 186 Operand::zero()); 187 auto br = bld.branch(aco_opcode::p_cbranch_z, bld.def(s2, vcc), bld.scc(scmp)); 188 writeout(1, Operand(br, vcc)); 189 } 190 191 //; del d, e, f 192 193 { 194 //! s1: %d:s[2], s1: %e:scc = s_bfe_u32 %a:s[0], 0x40018 195 //! s2: %f:vcc = p_cbranch_z %e:scc 196 //! p_unit_test 2, %f:vcc 197 auto salu = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1, reg_s2), bld.def(s1, scc), op_in_0, 198 Operand::c32(0x40018u)); 199 auto scmp = bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), Operand(salu, reg_s2), 200 Operand::zero()); 201 auto br = bld.branch(aco_opcode::p_cbranch_nz, bld.def(s2, vcc), bld.scc(scmp)); 202 writeout(2, Operand(br, vcc)); 203 } 204 205 //; del d, e, f 206 207 { 208 //! s1: %d:s[2], s1: %e:scc = s_bfe_u32 %a:s[0], 0x40018 209 //! s2: %f:vcc = p_cbranch_nz %e:scc 210 //! p_unit_test 3, %f:vcc 211 auto salu = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1, reg_s2), bld.def(s1, scc), op_in_0, 212 Operand::c32(0x40018u)); 213 auto scmp = bld.sopc(aco_opcode::s_cmp_lg_u32, bld.def(s1, scc), Operand(salu, reg_s2), 214 Operand::zero()); 215 auto br = bld.branch(aco_opcode::p_cbranch_nz, bld.def(s2, vcc), bld.scc(scmp)); 216 writeout(3, Operand(br, vcc)); 217 } 218 219 //; del d, e, f 220 221 { 222 //! s2: %d:s[2-3], s1: %e:scc = s_and_b64 %y:s[4-5], 0x12345 223 //! s2: %f:vcc = p_cbranch_z %e:scc 224 //! 
p_unit_test 4, %f:vcc 225 auto salu = bld.sop2(aco_opcode::s_and_b64, bld.def(s2, reg_s2), bld.def(s1, scc), op_in_1, 226 Operand::c32(0x12345u)); 227 auto scmp = bld.sopc(aco_opcode::s_cmp_eq_u64, bld.def(s1, scc), Operand(salu, reg_s2), 228 Operand::zero(8)); 229 auto br = bld.branch(aco_opcode::p_cbranch_nz, bld.def(s2, vcc), bld.scc(scmp)); 230 writeout(4, Operand(br, vcc)); 231 } 232 233 //; del d, e, f 234 235 { 236 /* SCC is overwritten in between, don't optimize */ 237 238 //! s1: %d:s[2], s1: %e:scc = s_bfe_u32 %a:s[0], 0x40018 239 //! s1: %h:s[3], s1: %x:scc = s_add_u32 %a:s[0], 1 240 //! s1: %g:scc = s_cmp_eq_u32 %d:s[2], 0 241 //! s2: %f:vcc = p_cbranch_z %g:scc 242 //! p_unit_test 5, %f:vcc, %h:s[3] 243 auto salu = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1, reg_s2), bld.def(s1, scc), op_in_0, 244 Operand::c32(0x40018u)); 245 auto ovrw = bld.sop2(aco_opcode::s_add_u32, bld.def(s1, reg_s3), bld.def(s1, scc), op_in_0, 246 Operand::c32(1u)); 247 auto scmp = bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), Operand(salu, reg_s2), 248 Operand::zero()); 249 auto br = bld.branch(aco_opcode::p_cbranch_z, bld.def(s2, vcc), bld.scc(scmp)); 250 writeout(5, Operand(br, vcc), Operand(ovrw, reg_s3)); 251 } 252 253 //; del d, e, f, g, h, x 254 255 { 256 /* SCC is overwritten in between, optimize by pulling down */ 257 258 //! s1: %h:s[3], s1: %x:scc = s_add_u32 %a:s[0], 1 259 //! s1: %d:s[2], s1: %e:scc = s_bfe_u32 %a:s[0], 0x40018 260 //! s2: %f:vcc = p_cbranch_z %g:scc 261 //! 
p_unit_test 5, %f:vcc, %h:s[3] 262 auto salu = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1, reg_s2), bld.def(s1, scc), op_in_0, 263 Operand::c32(0x40018u)); 264 auto ovrw = bld.sop2(aco_opcode::s_add_u32, bld.def(s1, reg_s3), bld.def(s1, scc), op_in_0, 265 Operand::c32(1u)); 266 auto scmp = bld.sopc(aco_opcode::s_cmp_lg_u32, bld.def(s1, scc), Operand(salu, reg_s2), 267 Operand::zero()); 268 auto br = bld.branch(aco_opcode::p_cbranch_z, bld.def(s2, vcc), bld.scc(scmp)); 269 writeout(5, Operand(br, vcc), Operand(ovrw, reg_s3)); 270 } 271 272 //; del d, e, f, g, h, x 273 274 { 275 /* SCC is overwritten in between, optimize by pulling down */ 276 277 //! s1: %h:s[3], s1: %x:scc = s_add_u32 %a:s[0], 1 278 //! s2: %d:s[8-9], s1: %e:scc = s_and_b64 %b:s[4-5], 0x40018 279 //! s2: %f:vcc = p_cbranch_z %g:scc 280 //! p_unit_test 5, %f:vcc, %h:s[3] 281 auto salu = bld.sop2(aco_opcode::s_and_b64, bld.def(s2, reg_s8), bld.def(s1, scc), op_in_1, 282 Operand::c32(0x40018u)); 283 auto ovrw = bld.sop2(aco_opcode::s_add_u32, bld.def(s1, reg_s3), bld.def(s1, scc), op_in_0, 284 Operand::c32(1u)); 285 auto scmp = bld.sopc(aco_opcode::s_cmp_lg_u64, bld.def(s1, scc), Operand(salu, reg_s8), 286 Operand::zero()); 287 auto br = bld.branch(aco_opcode::p_cbranch_z, bld.def(s2, vcc), bld.scc(scmp)); 288 writeout(5, Operand(br, vcc), Operand(ovrw, reg_s3)); 289 } 290 291 //; del d, e, f, g, h, x 292 293 { 294 //! s1: %d:s[2], s1: %e:scc = s_bfe_u32 %a:s[0], 0x40018 295 //! s1: %f:s[4] = s_cselect_b32 %z:s[6], %a:s[0], %e:scc 296 //! 
p_unit_test 6, %f:s[4] 297 auto salu = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1, reg_s2), bld.def(s1, scc), op_in_0, 298 Operand::c32(0x40018u)); 299 auto scmp = bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), Operand(salu, reg_s2), 300 Operand::zero()); 301 auto br = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1, reg_s4), Operand(op_in_0), 302 Operand(op_in_2), bld.scc(scmp)); 303 writeout(6, Operand(br, reg_s4)); 304 } 305 306 //; del d, e, f 307 308 { 309 /* SCC is overwritten in between, don't optimize */ 310 311 //! s1: %d:s[2], s1: %e:scc = s_bfe_u32 %a:s[0], 0x40018 312 //! s1: %h:s[3], s1: %x:scc = s_add_u32 %a:s[0], 1 313 //! s1: %g:scc = s_cmp_eq_u32 %d:s[2], 0 314 //! s1: %f:s[4] = s_cselect_b32 %a:s[0], %z:s[6], %g:scc 315 //! p_unit_test 7, %f:s[4], %h:s[3] 316 auto salu = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1, reg_s2), bld.def(s1, scc), op_in_0, 317 Operand::c32(0x40018u)); 318 auto ovrw = bld.sop2(aco_opcode::s_add_u32, bld.def(s1, reg_s3), bld.def(s1, scc), op_in_0, 319 Operand::c32(1u)); 320 auto scmp = bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), Operand(salu, reg_s2), 321 Operand::zero()); 322 auto br = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1, reg_s4), Operand(op_in_0), 323 Operand(op_in_2), bld.scc(scmp)); 324 writeout(7, Operand(br, reg_s4), Operand(ovrw, reg_s3)); 325 } 326 327 //; del d, e, f, g, h, x 328 329 finish_optimizer_postRA_test(); 330 END_TEST 331 332 BEGIN_TEST(optimizer_postRA.dpp) 333 //>> v1: %a:v[0], v1: %b:v[1], s2: %c:vcc, s2: %d:s[0-1] = p_startpgm 334 if (!setup_cs("v1 v1 s2 s2", GFX10_3)) 335 return; 336 337 bld.instructions->at(0)->definitions[0].setFixed(PhysReg(256)); 338 bld.instructions->at(0)->definitions[1].setFixed(PhysReg(257)); 339 bld.instructions->at(0)->definitions[2].setFixed(vcc); 340 bld.instructions->at(0)->definitions[3].setFixed(PhysReg(0)); 341 342 PhysReg reg_v0(256); 343 PhysReg reg_v2(258); 344 Operand a(inputs[0], PhysReg(256)); 345 Operand b(inputs[1], PhysReg(257)); 346 
Operand c(inputs[2], vcc); 347 Operand d(inputs[3], PhysReg(0)); 348 349 /* basic optimization */ 350 //! v1: %res0:v[2] = v_add_f32 %a:v[0], %b:v[1] row_mirror bound_ctrl:1 fi 351 //! p_unit_test 0, %res0:v[2] 352 Temp tmp0 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1, reg_v2), a, dpp_row_mirror); 353 Temp res0 = bld.vop2(aco_opcode::v_add_f32, bld.def(v1, reg_v2), Operand(tmp0, reg_v2), b); 354 writeout(0, Operand(res0, reg_v2)); 355 356 /* operand swapping */ 357 //! v1: %res1:v[2] = v_subrev_f32 %a:v[0], %b:v[1] row_mirror bound_ctrl:1 fi 358 //! p_unit_test 1, %res1:v[2] 359 Temp tmp1 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1, reg_v2), a, dpp_row_mirror); 360 Temp res1 = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1, reg_v2), b, Operand(tmp1, reg_v2)); 361 writeout(1, Operand(res1, reg_v2)); 362 363 //! v1: %tmp2:v[2] = v_mov_b32 %a:v[0] row_mirror bound_ctrl:1 fi 364 //! v1: %res2:v[2] = v_sub_f32 %b:v[1], %tmp2:v[2] row_half_mirror bound_ctrl:1 fi 365 //! p_unit_test 2, %res2:v[2] 366 Temp tmp2 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1, reg_v2), a, dpp_row_mirror); 367 Temp res2 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1, reg_v2), b, Operand(tmp2, reg_v2), 368 dpp_row_half_mirror); 369 writeout(2, Operand(res2, reg_v2)); 370 371 /* modifiers */ 372 //! v1: %res3:v[2] = v_add_f32 -%a:v[0], %b:v[1] row_mirror bound_ctrl:1 fi 373 //! p_unit_test 3, %res3:v[2] 374 auto tmp3 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1, reg_v2), a, dpp_row_mirror); 375 tmp3->dpp16().neg[0] = true; 376 Temp res3 = bld.vop2(aco_opcode::v_add_f32, bld.def(v1, reg_v2), Operand(tmp3, reg_v2), b); 377 writeout(3, Operand(res3, reg_v2)); 378 379 //! v1: %res4:v[2] = v_add_f32 -%a:v[0], %b:v[1] row_mirror bound_ctrl:1 fi 380 //! 
p_unit_test 4, %res4:v[2] 381 Temp tmp4 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1, reg_v2), a, dpp_row_mirror); 382 auto res4 = bld.vop2_e64(aco_opcode::v_add_f32, bld.def(v1, reg_v2), Operand(tmp4, reg_v2), b); 383 res4->valu().neg[0] = true; 384 writeout(4, Operand(res4, reg_v2)); 385 386 //! v1: %tmp5:v[2] = v_mov_b32 %a:v[0] row_mirror bound_ctrl:1 fi 387 //! v1: %res5:v[2] = v_add_f32 %tmp5:v[2], %b:v[1] clamp 388 //! p_unit_test 5, %res5:v[2] 389 Temp tmp5 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1, reg_v2), a, dpp_row_mirror); 390 auto res5 = bld.vop2_e64(aco_opcode::v_add_f32, bld.def(v1, reg_v2), Operand(tmp5, reg_v2), b); 391 res5->valu().clamp = true; 392 writeout(5, Operand(res5, reg_v2)); 393 394 //! v1: %res6:v[2] = v_add_f32 |%a:v[0]|, %b:v[1] row_mirror bound_ctrl:1 fi 395 //! p_unit_test 6, %res6:v[2] 396 auto tmp6 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1, reg_v2), a, dpp_row_mirror); 397 tmp6->dpp16().neg[0] = true; 398 auto res6 = bld.vop2_e64(aco_opcode::v_add_f32, bld.def(v1, reg_v2), Operand(tmp6, reg_v2), b); 399 res6->valu().abs[0] = true; 400 writeout(6, Operand(res6, reg_v2)); 401 402 //! v1: %res7:v[2] = v_subrev_f32 %a:v[0], |%b:v[1]| row_mirror bound_ctrl:1 fi 403 //! p_unit_test 7, %res7:v[2] 404 Temp tmp7 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1, reg_v2), a, dpp_row_mirror); 405 auto res7 = bld.vop2_e64(aco_opcode::v_sub_f32, bld.def(v1, reg_v2), b, Operand(tmp7, reg_v2)); 406 res7->valu().abs[0] = true; 407 writeout(7, Operand(res7, reg_v2)); 408 409 //! v1: %tmp12:v[2] = v_mov_b32 -%a:v[0] row_mirror bound_ctrl:1 fi 410 //! v1: %res12:v[2] = v_add_u32 %tmp12:v[2], %b:v[1] 411 //! p_unit_test 12, %res12:v[2] 412 auto tmp12 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1, reg_v2), a, dpp_row_mirror); 413 tmp12->dpp16().neg[0] = true; 414 Temp res12 = bld.vop2(aco_opcode::v_add_u32, bld.def(v1, reg_v2), Operand(tmp12, reg_v2), b); 415 writeout(12, Operand(res12, reg_v2)); 416 417 //! 
v1: %tmp13:v[2] = v_mov_b32 -%a:v[0] row_mirror bound_ctrl:1 fi 418 //! v1: %res13:v[2] = v_add_f16 %tmp13:v[2], %b:v[1] 419 //! p_unit_test 13, %res13:v[2] 420 auto tmp13 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1, reg_v2), a, dpp_row_mirror); 421 tmp13->dpp16().neg[0] = true; 422 Temp res13 = bld.vop2(aco_opcode::v_add_f16, bld.def(v1, reg_v2), Operand(tmp13, reg_v2), b); 423 writeout(13, Operand(res13, reg_v2)); 424 425 /* vcc */ 426 //! v1: %res8:v[2] = v_cndmask_b32 %a:v[0], %b:v[1], %c:vcc row_mirror bound_ctrl:1 fi 427 //! p_unit_test 8, %res8:v[2] 428 Temp tmp8 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1, reg_v2), a, dpp_row_mirror); 429 Temp res8 = 430 bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1, reg_v2), Operand(tmp8, reg_v2), b, c); 431 writeout(8, Operand(res8, reg_v2)); 432 433 //! v1: %tmp9:v[2] = v_mov_b32 %a:v[0] row_mirror bound_ctrl:1 fi 434 //! v1: %res9:v[2] = v_cndmask_b32 %tmp9:v[2], %b:v[1], %d:s[0-1] 435 //! p_unit_test 9, %res9:v[2] 436 Temp tmp9 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1, reg_v2), a, dpp_row_mirror); 437 Temp res9 = 438 bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1, reg_v2), Operand(tmp9, reg_v2), b, d); 439 writeout(9, Operand(res9, reg_v2)); 440 441 /* control flow */ 442 //! BB1 443 //! /* logical preds: BB0, / linear preds: BB0, / kind: uniform, */ 444 //! v1: %res10:v[2] = v_add_f32 %a:v[0], %b:v[1] row_mirror bound_ctrl:1 fi 445 //! 
p_unit_test 10, %res10:v[2] 446 Temp tmp10 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1, reg_v2), a, dpp_row_mirror); 447 448 bld.reset(program->create_and_insert_block()); 449 program->blocks[0].linear_succs.push_back(1); 450 program->blocks[0].logical_succs.push_back(1); 451 program->blocks[1].linear_preds.push_back(0); 452 program->blocks[1].logical_preds.push_back(0); 453 454 Temp res10 = bld.vop2(aco_opcode::v_add_f32, bld.def(v1, reg_v2), Operand(tmp10, reg_v2), b); 455 writeout(10, Operand(res10, reg_v2)); 456 457 /* can't combine if the v_mov_b32's operand is modified */ 458 //! v1: %tmp11_1:v[2] = v_mov_b32 %a:v[0] row_mirror bound_ctrl:1 fi 459 //! v1: %tmp11_2:v[0] = v_mov_b32 0 460 //! v1: %res11:v[2] = v_add_f32 %tmp11_1:v[2], %b:v[1] 461 //! p_unit_test 11, %res11_1:v[2], %tmp11_2:v[0] 462 Temp tmp11_1 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1, reg_v2), a, dpp_row_mirror); 463 Temp tmp11_2 = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1, reg_v0), Operand::c32(0)); 464 Temp res11 = bld.vop2(aco_opcode::v_add_f32, bld.def(v1, reg_v2), Operand(tmp11_1, reg_v2), b); 465 writeout(11, Operand(res11, reg_v2), Operand(tmp11_2, reg_v0)); 466 467 finish_optimizer_postRA_test(); 468 END_TEST 469 470 BEGIN_TEST(optimizer_postRA.dpp_across_exec) 471 for (amd_gfx_level gfx : {GFX9, GFX10}) { 472 //>> v1: %a:v[0], v1: %b:v[1] = p_startpgm 473 if (!setup_cs("v1 v1", gfx)) 474 continue; 475 476 bld.instructions->at(0)->definitions[0].setFixed(PhysReg(256)); 477 bld.instructions->at(0)->definitions[1].setFixed(PhysReg(257)); 478 479 PhysReg reg_v2(258); 480 Operand a(inputs[0], PhysReg(256)); 481 Operand b(inputs[1], PhysReg(257)); 482 483 //~gfx9! v1: %tmp0:v[2] = v_mov_b32 %a:v[0] row_mirror bound_ctrl:1 484 //! s2: %0:exec, s1: %0:scc = s_not_b64 %0:exec 485 //~gfx9! v1: %res0:v[2] = v_add_f32 %tmp0:v[2], %b:v[1] 486 //~gfx10! v1: %res0:v[2] = v_add_f32 %a:v[0], %b:v[1] row_mirror bound_ctrl:1 fi 487 //! 
p_unit_test 0, %res0:v[2] 488 Temp tmp0 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1, reg_v2), a, dpp_row_mirror); 489 bld.sop1(Builder::s_not, Definition(exec, bld.lm), Definition(scc, s1), 490 Operand(exec, bld.lm)); 491 Temp res0 = bld.vop2(aco_opcode::v_add_f32, bld.def(v1, reg_v2), Operand(tmp0, reg_v2), b); 492 writeout(0, Operand(res0, reg_v2)); 493 494 finish_optimizer_postRA_test(); 495 } 496 END_TEST 497 498 BEGIN_TEST(optimizer_postRA.dpp_vcmpx) 499 //>> v1: %a:v[0], v1: %b:v[1] = p_startpgm 500 if (!setup_cs("v1 v1", GFX11)) 501 return; 502 503 bld.instructions->at(0)->definitions[0].setFixed(PhysReg(256)); 504 bld.instructions->at(0)->definitions[1].setFixed(PhysReg(257)); 505 506 PhysReg reg_v2(258); 507 Operand a(inputs[0], PhysReg(256)); 508 Operand b(inputs[1], PhysReg(257)); 509 510 //! v1: %tmp0:v[2] = v_mov_b32 %a:v[0] row_mirror bound_ctrl:1 fi 511 //! s2: %res0:exec = v_cmpx_lt_f32 %tmp0:v[2], %b:v[1] 512 //! p_unit_test 0, %res0:exec 513 Temp tmp0 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1, reg_v2), a, dpp_row_mirror); 514 Temp res0 = bld.vopc(aco_opcode::v_cmpx_lt_f32, bld.def(bld.lm, exec), Operand(tmp0, reg_v2), b); 515 writeout(0, Operand(res0, exec)); 516 517 finish_optimizer_postRA_test(); 518 END_TEST 519 520 BEGIN_TEST(optimizer_postRA.dpp_across_cf) 521 //>> v1: %a:v[0], v1: %b:v[1], v1: %c:v[2], v1: %d:v[3], s2: %e:s[0-1], s4: %f:s[4-7] = p_startpgm 522 if (!setup_cs("v1 v1 v1 v1 s2 s4", GFX10_3)) 523 return; 524 525 aco_ptr<Instruction>& startpgm = bld.instructions->at(0); 526 startpgm->definitions[0].setFixed(PhysReg(256)); 527 startpgm->definitions[1].setFixed(PhysReg(257)); 528 startpgm->definitions[2].setFixed(PhysReg(258)); 529 startpgm->definitions[3].setFixed(PhysReg(259)); 530 startpgm->definitions[4].setFixed(PhysReg(0)); 531 startpgm->definitions[5].setFixed(PhysReg(4)); 532 533 Operand a(inputs[0], PhysReg(256)); /* source for DPP */ 534 Operand b(inputs[1], PhysReg(257)); /* source for fadd */ 535 Operand 
c(inputs[2], PhysReg(258)); /* buffer store address */ 536 Operand d(inputs[3], PhysReg(259)); /* buffer store value */ 537 Operand e(inputs[4], PhysReg(0)); /* condition */ 538 Operand f(inputs[5], PhysReg(4)); /* buffer descriptor */ 539 PhysReg reg_v12(268); /* temporary register */ 540 541 Temp dpp_tmp = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1, reg_v12), a, dpp_row_mirror); 542 543 //! s2: %saved_exec:s[84-85], s1: %0:scc, s2: %0:exec = s_and_saveexec_b64 %e:s[0-1], %0:exec 544 //! s2: %0:vcc = p_cbranch_nz BB1, BB2 545 546 emit_divergent_if_else( 547 program.get(), bld, e, 548 [&]() -> void __anon4f1ff3420102() 549 { 550 /* --- logical then --- */ 551 //! BB1 552 //! /* logical preds: BB0, / linear preds: BB0, / kind: */ 553 //! p_logical_start 554 555 //! buffer_store_dword %f:s[4-7], %c:v[2], 0, %d:v[3] offen 556 bld.mubuf(aco_opcode::buffer_store_dword, f, c, Operand::zero(), d, 0, true); 557 558 //! v1: %res10:v[12] = v_add_f32 %a:v[0], %b:v[1] row_mirror bound_ctrl:1 fi 559 //! p_unit_test 10, %res10:v[12] 560 Temp result = 561 bld.vop2(aco_opcode::v_add_f32, bld.def(v1, reg_v12), Operand(dpp_tmp, reg_v12), b); 562 writeout(10, Operand(result, reg_v12)); 563 564 //! p_logical_end 565 //! s2: %0:vcc = p_branch BB3 566 567 /* --- linear then --- */ 568 //! BB2 569 //! /* logical preds: / linear preds: BB0, / kind: */ 570 //! s2: %0:vcc = p_branch BB3 571 572 /* --- invert --- */ 573 //! BB3 574 //! /* logical preds: / linear preds: BB1, BB2, / kind: invert, */ 575 //! s2: %0:exec, s1: %0:scc = s_andn2_b64 %saved_exec:s[84-85], %0:exec 576 //! s2: %0:vcc = p_cbranch_nz BB4, BB5 577 }, 578 [&]() -> void __anon4f1ff3420202() 579 { 580 /* --- logical else --- */ 581 //! BB4 582 //! /* logical preds: BB0, / linear preds: BB3, / kind: */ 583 //! p_logical_start 584 //! p_logical_end 585 //! s2: %0:vcc = p_branch BB6 586 587 /* --- linear else --- */ 588 //! BB5 589 //! /* logical preds: / linear preds: BB3, / kind: */ 590 //! 
s2: %0:vcc = p_branch BB6 591 }); 592 593 /* --- merge block --- */ 594 //! BB6 595 //! /* logical preds: BB1, BB4, / linear preds: BB4, BB5, / kind: uniform, top-level, merge, */ 596 //! s2: %0:exec = p_parallelcopy %saved_exec:s[84-85] 597 598 finish_optimizer_postRA_test(); 599 END_TEST 600 601 BEGIN_TEST(optimizer_postRA.dpp_across_cf_overwritten) 602 //>> v1: %a:v[0], v1: %b:v[1], s4: %c:s[4-7], v1: %d:v[3], s2: %e:s[0-1], s1: %f:s[2] = p_startpgm 603 if (!setup_cs("v1 v1 s4 v1 s2 s1", GFX10_3)) 604 return; 605 606 aco_ptr<Instruction>& startpgm = bld.instructions->at(0); 607 startpgm->definitions[0].setFixed(PhysReg(256)); 608 startpgm->definitions[1].setFixed(PhysReg(257)); 609 startpgm->definitions[2].setFixed(PhysReg(4)); 610 startpgm->definitions[3].setFixed(PhysReg(259)); 611 startpgm->definitions[4].setFixed(PhysReg(0)); 612 startpgm->definitions[5].setFixed(PhysReg(2)); 613 614 Operand a(inputs[0], PhysReg(256)); /* source for DPP */ 615 Operand b(inputs[1], PhysReg(257)); /* source for fadd */ 616 Operand c(inputs[2], PhysReg(4)); /* buffer descriptor */ 617 Operand d(inputs[3], PhysReg(259)); /* buffer store value */ 618 Operand e(inputs[4], PhysReg(0)); /* condition */ 619 Operand f(inputs[5], PhysReg(2)); /* buffer store address (scalar) */ 620 PhysReg reg_v12(268); /* temporary register */ 621 622 //! v1: %dpp_tmp:v[12] = v_mov_b32 %a:v[0] row_mirror bound_ctrl:1 fi 623 Temp dpp_tmp = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1, reg_v12), a, dpp_row_mirror); 624 625 //! s2: %saved_exec:s[84-85], s1: %0:scc, s2: %0:exec = s_and_saveexec_b64 %e:s[0-1], %0:exec 626 //! s2: %0:vcc = p_cbranch_nz BB1, BB2 627 628 emit_divergent_if_else( 629 program.get(), bld, e, 630 [&]() -> void __anon4f1ff3420302() 631 { 632 /* --- logical then --- */ 633 //! BB1 634 //! /* logical preds: BB0, / linear preds: BB0, / kind: */ 635 //! p_logical_start 636 637 //! 
v1: %addr:v[0] = p_parallelcopy %f:s[2] 638 Temp addr = bld.pseudo(aco_opcode::p_parallelcopy, bld.def(v1, a.physReg()), f); 639 640 //! buffer_store_dword %c:s[4-7], %addr:v[0], 0, %d:v[3] offen 641 bld.mubuf(aco_opcode::buffer_store_dword, c, Operand(addr, a.physReg()), Operand::zero(), 642 d, 0, true); 643 644 //! p_logical_end 645 //! s2: %0:vcc = p_branch BB3 646 647 /* --- linear then --- */ 648 //! BB2 649 //! /* logical preds: / linear preds: BB0, / kind: */ 650 //! s2: %0:vcc = p_branch BB3 651 652 /* --- invert --- */ 653 //! BB3 654 //! /* logical preds: / linear preds: BB1, BB2, / kind: invert, */ 655 //! s2: %0:exec, s1: %0:scc = s_andn2_b64 %saved_exec:s[84-85], %0:exec 656 //! s2: %0:vcc = p_cbranch_nz BB4, BB5 657 }, 658 [&]() -> void __anon4f1ff3420402() 659 { 660 /* --- logical else --- */ 661 //! BB4 662 //! /* logical preds: BB0, / linear preds: BB3, / kind: */ 663 //! p_logical_start 664 //! p_logical_end 665 //! s2: %0:vcc = p_branch BB6 666 667 /* --- linear else --- */ 668 //! BB5 669 //! /* logical preds: / linear preds: BB3, / kind: */ 670 //! s2: %0:vcc = p_branch BB6 671 }); 672 673 /* --- merge block --- */ 674 //! BB6 675 //! /* logical preds: BB1, BB4, / linear preds: BB4, BB5, / kind: uniform, top-level, merge, */ 676 //! s2: %0:exec = p_parallelcopy %saved_exec:s[84-85] 677 678 //! v1: %result:v[12] = v_add_f32 %dpp_mov_tmp:v[12], %b:v[1] 679 Temp result = 680 bld.vop2(aco_opcode::v_add_f32, bld.def(v1, reg_v12), Operand(dpp_tmp, reg_v12), b); 681 //! 
p_unit_test 10, %result:v[12] 682 writeout(10, Operand(result, reg_v12)); 683 684 finish_optimizer_postRA_test(); 685 END_TEST 686 687 BEGIN_TEST(optimizer_postRA.dpp_across_cf_linear_clobber) 688 //>> v1: %a:v[0], v1: %b:v[1], s2: %c:s[0-1] = p_startpgm 689 if (!setup_cs("v1 v1 s2", GFX10_3)) 690 return; 691 692 aco_ptr<Instruction>& startpgm = bld.instructions->at(0); 693 startpgm->definitions[0].setFixed(PhysReg(256)); 694 startpgm->definitions[1].setFixed(PhysReg(257)); 695 startpgm->definitions[2].setFixed(PhysReg(0)); 696 697 Operand a(inputs[0], PhysReg(256)); /* source for DPP */ 698 Operand b(inputs[1], PhysReg(257)); /* source for fadd */ 699 Operand c(inputs[2], PhysReg(0)); /* condition */ 700 PhysReg reg_v12(268); /* temporary register */ 701 702 //! v1: %dpp_tmp:v[12] = v_mov_b32 %a:v[0] row_mirror bound_ctrl:1 fi 703 Temp dpp_tmp = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1, reg_v12), a, dpp_row_mirror); 704 705 //! s2: %saved_exec:s[84-85], s1: %0:scc, s2: %0:exec = s_and_saveexec_b64 %c:s[0-1], %0:exec 706 //! s2: %0:vcc = p_cbranch_nz BB1, BB2 707 708 emit_divergent_if_else( 709 program.get(), bld, c, 710 [&]() -> void __anon4f1ff3420502() 711 { 712 /* --- logical then --- */ 713 //! BB1 714 //! /* logical preds: BB0, / linear preds: BB0, / kind: */ 715 //! p_logical_start 716 717 //! v1: %clobber:v[0] = p_parallelcopy 0 718 Temp clobber = 719 bld.pseudo(aco_opcode::p_parallelcopy, bld.def(v1, a.physReg()), Operand::c32(0)); 720 721 //! p_unit_test 0, %clobber:v[0] 722 writeout(0, Operand(clobber, a.physReg())); 723 724 //! p_logical_end 725 //! s2: %0:vcc = p_branch BB3 726 727 /* --- linear then --- */ 728 //! BB2 729 //! /* logical preds: / linear preds: BB0, / kind: */ 730 //! s2: %0:vcc = p_branch BB3 731 732 /* --- invert --- */ 733 //! BB3 734 //! /* logical preds: / linear preds: BB1, BB2, / kind: invert, */ 735 //! s2: %0:exec, s1: %0:scc = s_andn2_b64 %saved_exec:s[84-85], %0:exec 736 //! 
s2: %0:vcc = p_cbranch_nz BB4, BB5 737 }, 738 [&]() -> void __anon4f1ff3420602() 739 { 740 /* --- logical else --- */ 741 //! BB4 742 //! /* logical preds: BB0, / linear preds: BB3, / kind: */ 743 //! p_logical_start 744 745 //! v1: %result:v[12] = v_add_f32 %dpp_mov_tmp:v[12], %b:v[1] 746 Temp result = 747 bld.vop2(aco_opcode::v_add_f32, bld.def(v1, reg_v12), Operand(dpp_tmp, reg_v12), b); 748 //! p_unit_test 1, %result:v[12] 749 writeout(1, Operand(result, reg_v12)); 750 751 //! p_logical_end 752 //! s2: %0:vcc = p_branch BB6 753 754 /* --- linear else --- */ 755 //! BB5 756 //! /* logical preds: / linear preds: BB3, / kind: */ 757 //! s2: %0:vcc = p_branch BB6 758 }); 759 760 /* --- merge block --- */ 761 //! BB6 762 //! /* logical preds: BB1, BB4, / linear preds: BB4, BB5, / kind: uniform, top-level, merge, */ 763 //! s2: %0:exec = p_parallelcopy %saved_exec:s[84-85] 764 765 finish_optimizer_postRA_test(); 766 END_TEST 767

BEGIN_TEST(optimizer_postRA.scc_nocmp_across_cf)
   /* An s_and_b64 that defines SCC is built in front of a divergent if/else, while the
    * s_cmp_lg_u64 + p_cbranch_z that consume its result are built after the merge block.
    * The check patterns below contain no s_cmp_lg_u64: they expect the s_and_b64 to show
    * up after the merge block, with the branch reading the SCC it defines.
    */

   //>> s2: %a:s[2-3], v1: %c:v[2], v1: %d:v[3], s2: %e:s[0-1], s4: %f:s[4-7] = p_startpgm
   if (!setup_cs("s2 v1 v1 s2 s4", GFX10_3))
      return;

   /* Pin every p_startpgm definition to the register named in the pattern above. */
   aco_ptr<Instruction>& startpgm = bld.instructions->at(0);
   startpgm->definitions[0].setFixed(PhysReg(2));
   startpgm->definitions[1].setFixed(PhysReg(258));
   startpgm->definitions[2].setFixed(PhysReg(259));
   startpgm->definitions[3].setFixed(PhysReg(0));
   startpgm->definitions[4].setFixed(PhysReg(4));

   Operand a(inputs[0], PhysReg(2));   /* source for s_and */
   Operand c(inputs[1], PhysReg(258)); /* buffer store address */
   Operand d(inputs[2], PhysReg(259)); /* buffer store value */
   Operand e(inputs[3], PhysReg(0));   /* condition */
   Operand f(inputs[4], PhysReg(4));   /* buffer descriptor */
   PhysReg reg_s8(8);                  /* temporary register */

   /* Deliberately no check pattern here: the s_and_b64 is matched after the merge block. */
   auto tmp_salu = bld.sop2(aco_opcode::s_and_b64, bld.def(s2, reg_s8), bld.def(s1, scc), a,
                            Operand::c32(0x40018u));

   //! s2: %saved_exec:s[84-85], s1: %0:scc, s2: %0:exec = s_and_saveexec_b64 %e:s[0-1], %0:exec
   //! s2: %0:vcc = p_cbranch_nz BB1, BB2

   emit_divergent_if_else(
      program.get(), bld, e,
      [&]() -> void
      {
         /* --- logical then --- */
         //! BB1
         //! /* logical preds: BB0, / linear preds: BB0, / kind: */
         //! p_logical_start

         //! buffer_store_dword %f:s[4-7], %c:v[2], 0, %d:v[3] offen
         bld.mubuf(aco_opcode::buffer_store_dword, f, c, Operand::zero(), d, 0, true);

         //! p_logical_end
         //! s2: %0:vcc = p_branch BB3

         /* --- linear then --- */
         //! BB2
         //! /* logical preds: / linear preds: BB0, / kind: */
         //! s2: %0:vcc = p_branch BB3

         /* --- invert --- */
         //! BB3
         //! /* logical preds: / linear preds: BB1, BB2, / kind: invert, */
         //! s2: %0:exec, s1: %0:scc = s_andn2_b64 %saved_exec:s[84-85], %0:exec
         //! s2: %0:vcc = p_cbranch_nz BB4, BB5
      },
      [&]() -> void
      {
         /* --- logical else --- */
         //! BB4
         //! /* logical preds: BB0, / linear preds: BB3, / kind: */
         //! p_logical_start
         //! p_logical_end
         //! s2: %0:vcc = p_branch BB6

         /* --- linear else --- */
         //! BB5
         //! /* logical preds: / linear preds: BB3, / kind: */
         //! s2: %0:vcc = p_branch BB6
      });

   /* --- merge block --- */
   //! BB6
   //! /* logical preds: BB1, BB4, / linear preds: BB4, BB5, / kind: uniform, top-level, merge, */
   //! s2: %0:exec = p_parallelcopy %saved_exec:s[84-85]

   //! s2: %tmp_salu:s[8-9], s1: %br_scc:scc = s_and_b64 %a:s[2-3], 0x40018
   //! s2: %br_vcc:vcc = p_cbranch_z %br_scc:scc
   //! p_unit_test 5, %br_vcc:vcc
   auto scmp = bld.sopc(aco_opcode::s_cmp_lg_u64, bld.def(s1, scc), Operand(tmp_salu, reg_s8),
                        Operand::zero());
   auto br = bld.branch(aco_opcode::p_cbranch_z, bld.def(s2, vcc), bld.scc(scmp));
   writeout(5, Operand(br, vcc));

   finish_optimizer_postRA_test();
END_TEST

BEGIN_TEST(optimizer_postRA.scc_nocmp_across_cf_partially_overwritten)
   /* Same shape as scc_nocmp_across_cf, except that the logical then-block copies %f
    * into s[3], which is the upper half of the s_and_b64 source %a (s[2-3]). The check
    * patterns below expect the s_and_b64 to stay in front of the control flow and the
    * s_cmp_lg_u64 to remain after the merge block.
    */

   //>> s2: %a:s[2-3], v1: %c:v[2], v1: %d:v[3], s2: %e:s[0-1], s1: %f:s[4], s4: %g:s[8-11] = p_startpgm
   if (!setup_cs("s2 v1 v1 s2 s1 s4", GFX10_3))
      return;

   /* Pin every p_startpgm definition to the register named in the pattern above. */
   aco_ptr<Instruction>& startpgm = bld.instructions->at(0);
   startpgm->definitions[0].setFixed(PhysReg(2));
   startpgm->definitions[1].setFixed(PhysReg(258));
   startpgm->definitions[2].setFixed(PhysReg(259));
   startpgm->definitions[3].setFixed(PhysReg(0));
   startpgm->definitions[4].setFixed(PhysReg(4));
   startpgm->definitions[5].setFixed(PhysReg(8));

   Operand a(inputs[0], PhysReg(2));   /* source for s_and */
   Operand c(inputs[1], PhysReg(258)); /* buffer store address */
   Operand d(inputs[2], PhysReg(259)); /* buffer store value */
   Operand e(inputs[3], PhysReg(0));   /* condition */
   Operand f(inputs[4], PhysReg(4));   /* overwrite value */
   Operand g(inputs[5], PhysReg(8));   /* buffer descriptor */
   PhysReg reg_s3(3);                  /* temporary register */
   /* NOTE(review): reg_s8 names the same register as the start of the buffer descriptor
    * %g (s[8-11]), so %tmp_salu (s[8-9]) overlaps it - confirm this is intentional.
    */
   PhysReg reg_s8(8);                  /* temporary register */

   //! s2: %tmp_salu:s[8-9], s1: %tmp_salu_scc:scc = s_and_b64 %a:s[2-3], 0x40018
   auto tmp_salu = bld.sop2(aco_opcode::s_and_b64, bld.def(s2, reg_s8), bld.def(s1, scc), a,
                            Operand::c32(0x40018u));

   //! s2: %saved_exec:s[84-85], s1: %0:scc, s2: %0:exec = s_and_saveexec_b64 %e:s[0-1], %0:exec
   //! s2: %0:vcc = p_cbranch_nz BB1, BB2

   emit_divergent_if_else(
      program.get(), bld, e,
      [&]() -> void
      {
         /* --- logical then --- */
         //! BB1
         //! /* logical preds: BB0, / linear preds: BB0, / kind: */
         //! p_logical_start

         /* Writes s[3], i.e. one half of the register pair read by the s_and_b64. */
         //! s1: %ovrwr:s[3] = p_parallelcopy %f:s[4]
         Temp s_addr = bld.pseudo(aco_opcode::p_parallelcopy, bld.def(s1, reg_s3), f);

         //! buffer_store_dword %g:s[8-11], %c:v[2], %ovrwr:s[3], %d:v[3] offen
         bld.mubuf(aco_opcode::buffer_store_dword, g, c, Operand(s_addr, reg_s3), d, 0, true);

         //! p_logical_end
         //! s2: %0:vcc = p_branch BB3

         /* --- linear then --- */
         //! BB2
         //! /* logical preds: / linear preds: BB0, / kind: */
         //! s2: %0:vcc = p_branch BB3

         /* --- invert --- */
         //! BB3
         //! /* logical preds: / linear preds: BB1, BB2, / kind: invert, */
         //! s2: %0:exec, s1: %0:scc = s_andn2_b64 %saved_exec:s[84-85], %0:exec
         //! s2: %0:vcc = p_cbranch_nz BB4, BB5
      },
      [&]() -> void
      {
         /* --- logical else --- */
         //! BB4
         //! /* logical preds: BB0, / linear preds: BB3, / kind: */
         //! p_logical_start
         //! p_logical_end
         //! s2: %0:vcc = p_branch BB6

         /* --- linear else --- */
         //! BB5
         //! /* logical preds: / linear preds: BB3, / kind: */
         //! s2: %0:vcc = p_branch BB6
      });

   /* --- merge block --- */
   //! BB6
   //! /* logical preds: BB1, BB4, / linear preds: BB4, BB5, / kind: uniform, top-level, merge, */
   //! s2: %0:exec = p_parallelcopy %saved_exec:s[84-85]

   //! s1: %br_scc:scc = s_cmp_lg_u64 %tmp_salu:s[8-9], 0
   //! s2: %br_vcc:vcc = p_cbranch_z %br_scc:scc
   //! p_unit_test 5, %br_vcc:vcc
   auto scmp = bld.sopc(aco_opcode::s_cmp_lg_u64, bld.def(s1, scc), Operand(tmp_salu, reg_s8),
                        Operand::zero());
   auto br = bld.branch(aco_opcode::p_cbranch_z, bld.def(s2, vcc), bld.scc(scmp));
   writeout(5, Operand(br, vcc));

   finish_optimizer_postRA_test();
END_TEST