/*
 * Copyright (C) 2023 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef BERBERIS_HEAVY_OPTIMIZER_RISCV64_FRONTEND_H_
#define BERBERIS_HEAVY_OPTIMIZER_RISCV64_FRONTEND_H_

#include "berberis/backend/x86_64/machine_ir.h"
#include "berberis/backend/x86_64/machine_ir_builder.h"
#include "berberis/base/arena_map.h"
#include "berberis/base/checks.h"
#include "berberis/base/dependent_false.h"
#include "berberis/decoder/riscv64/decoder.h"
#include "berberis/decoder/riscv64/semantics_player.h"
#include "berberis/guest_state/guest_addr.h"
#include "berberis/guest_state/guest_state_arch.h"
#include "berberis/guest_state/guest_state_opaque.h"
#include "berberis/intrinsics/intrinsics.h"
#include "berberis/intrinsics/macro_assembler.h"
#include "berberis/runtime_primitives/memory_region_reservation.h"
#include "berberis/runtime_primitives/platform.h"

#include "call_intrinsic.h"
#include "inline_intrinsic.h"
#include "simd_register.h"

namespace berberis {

class HeavyOptimizerFrontend {
 public:
  using CsrName = berberis::CsrName;
  using Decoder = Decoder<SemanticsPlayer<HeavyOptimizerFrontend>>;
  using Register = MachineReg;
  static constexpr Register no_register = MachineReg{};
  using FpRegister = SimdReg;
  static constexpr SimdReg no_fp_register = SimdReg{};
  using Float32 = intrinsics::Float32;
  using Float64 = intrinsics::Float64;

  struct MemoryOperand {
    Register base{0};
    // We call the following field "index" even though we do not scale it at the
    // moment. We can add a scale as the need arises.
    Register index{0};
    uint64_t disp = 0;
  };

  explicit HeavyOptimizerFrontend(x86_64::MachineIR* machine_ir, GuestAddr pc)
      : pc_(pc),
        success_(true),
        builder_(machine_ir),
        flag_register_(machine_ir->AllocVReg()),
        is_uncond_branch_(false),
        branch_targets_(machine_ir->arena()) {
    StartRegion();
  }

  void CompareAndBranch(Decoder::BranchOpcode opcode, Register arg1, Register arg2, int16_t offset);
  void Branch(int32_t offset);
  void BranchRegister(Register base, int16_t offset);

  [[nodiscard]] Register GetImm(uint64_t imm);

  [[nodiscard]] Register Copy(Register value) {
    Register result = AllocTempReg();
    Gen<PseudoCopy>(result, value, 8);
    return result;
  }

  [[nodiscard]] Register GetReg(uint8_t reg);
  void SetReg(uint8_t reg, Register value);

  void Undefined();
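
  // A Decoder parameterized with SemanticsPlayer<HeavyOptimizerFrontend> dispatches every decoded
  // guest instruction to one of the methods below, which emit x86-64 machine IR through builder_.
  // Illustrative sketch only; the real translation loop lives elsewhere in the heavy optimizer and
  // the decoder/semantics-player wiring is elided:
  //
  //   HeavyOptimizerFrontend frontend(machine_ir, pc);
  //   // ... construct the Decoder/SemanticsPlayer over &frontend ...
  //   while (!frontend.IsRegionEndReached()) {
  //     frontend.StartInsn();
  //     uint8_t insn_size = /* decode the guest instruction at frontend.GetInsnAddr() */;
  //     frontend.IncrementInsnAddr(insn_size);
  //   }
  //   frontend.Finalize(frontend.GetInsnAddr());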

  //
  // Instruction implementations.
  //
  void Nop();
  Register Op(Decoder::OpOpcode opcode, Register arg1, Register arg2);
  Register Op32(Decoder::Op32Opcode opcode, Register arg1, Register arg2);
  Register OpImm(Decoder::OpImmOpcode opcode, Register arg, int16_t imm);
  Register OpImm32(Decoder::OpImm32Opcode opcode, Register arg, int16_t imm);
  Register Slli(Register arg, int8_t imm);
  Register Srli(Register arg, int8_t imm);
  Register Srai(Register arg, int8_t imm);
  Register ShiftImm32(Decoder::ShiftImm32Opcode opcode, Register arg, uint16_t imm);
  Register Rori(Register arg, int8_t shamt);
  Register Roriw(Register arg, int8_t shamt);
  Register Lui(int32_t imm);
  Register Auipc(int32_t imm);

  Register Ecall(Register /* syscall_nr */,
                 Register /* arg0 */,
                 Register /* arg1 */,
                 Register /* arg2 */,
                 Register /* arg3 */,
                 Register /* arg4 */,
                 Register /* arg5 */) {
    Undefined();
    return {};
  }

  void Store(Decoder::MemoryDataOperandType operand_type,
             Register arg,
             int16_t offset,
             Register data);
  Register Load(Decoder::LoadOperandType operand_type, Register arg, int16_t offset);

  template <typename IntType>
  constexpr Decoder::LoadOperandType ToLoadOperandType() {
    if constexpr (std::is_same_v<IntType, int8_t>) {
      return Decoder::LoadOperandType::k8bitSigned;
    } else if constexpr (std::is_same_v<IntType, int16_t>) {
      return Decoder::LoadOperandType::k16bitSigned;
    } else if constexpr (std::is_same_v<IntType, int32_t>) {
      return Decoder::LoadOperandType::k32bitSigned;
    } else if constexpr (std::is_same_v<IntType, int64_t> || std::is_same_v<IntType, uint64_t>) {
      return Decoder::LoadOperandType::k64bit;
    } else if constexpr (std::is_same_v<IntType, uint8_t>) {
      return Decoder::LoadOperandType::k8bitUnsigned;
    } else if constexpr (std::is_same_v<IntType, uint16_t>) {
      return Decoder::LoadOperandType::k16bitUnsigned;
    } else if constexpr (std::is_same_v<IntType, uint32_t>) {
      return Decoder::LoadOperandType::k32bitUnsigned;
    } else {
      static_assert(kDependentTypeFalse<IntType>);
    }
  }

  template <typename IntType>
  constexpr Decoder::MemoryDataOperandType ToMemoryDataOperandType() {
    if constexpr (std::is_same_v<IntType, int8_t> || std::is_same_v<IntType, uint8_t>) {
      return Decoder::MemoryDataOperandType::k8bit;
    } else if constexpr (std::is_same_v<IntType, int16_t> || std::is_same_v<IntType, uint16_t>) {
      return Decoder::MemoryDataOperandType::k16bit;
    } else if constexpr (std::is_same_v<IntType, int32_t> || std::is_same_v<IntType, uint32_t>) {
      return Decoder::MemoryDataOperandType::k32bit;
    } else if constexpr (std::is_same_v<IntType, int64_t> || std::is_same_v<IntType, uint64_t>) {
      return Decoder::MemoryDataOperandType::k64bit;
    } else {
      static_assert(kDependentTypeFalse<IntType>);
    }
  }
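
  // For example, ToLoadOperandType<int32_t>() yields k32bitSigned while
  // ToLoadOperandType<uint32_t>() yields k32bitUnsigned, so templated helpers such as Lr<IntType>
  // below can reuse the scalar load/store machinery with the sign- or zero-extension the guest
  // type requires.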

  // Versions without recovery can be used to access non-guest memory (e.g. CPUState).
  Register LoadWithoutRecovery(Decoder::LoadOperandType operand_type, Register base, int32_t disp);
  Register LoadWithoutRecovery(Decoder::LoadOperandType operand_type,
                               Register base,
                               Register index,
                               int32_t disp);
  void StoreWithoutRecovery(Decoder::MemoryDataOperandType operand_type,
                            Register base,
                            int32_t disp,
                            Register val);
  void StoreWithoutRecovery(Decoder::MemoryDataOperandType operand_type,
                            Register base,
                            Register index,
                            int32_t disp,
                            Register val);

  //
  // Atomic extensions.
  //

  template <typename IntType, bool aq, bool rl>
  Register Lr(Register addr) {
    Register aligned_addr = AllocTempReg();
    Gen<PseudoCopy>(aligned_addr, addr, 8);
    // The immediate is sign extended to 64-bit.
    Gen<x86_64::AndqRegImm>(aligned_addr, ~int32_t{sizeof(Reservation) - 1}, GetFlagsRegister());

    MemoryRegionReservationLoad(aligned_addr);

    Register addr_offset = AllocTempReg();
    Gen<PseudoCopy>(addr_offset, addr, 8);
    Gen<x86_64::SubqRegReg>(addr_offset, aligned_addr, GetFlagsRegister());

    // Load the requested part from CPUState.
    return LoadWithoutRecovery(ToLoadOperandType<IntType>(),
                               x86_64::kMachineRegRBP,
                               addr_offset,
                               GetThreadStateReservationValueOffset());
  }

  template <typename IntType, bool aq, bool rl>
  Register Sc(Register addr, Register data) {
    // Compute aligned_addr.
    auto aligned_addr = AllocTempReg();
    Gen<PseudoCopy>(aligned_addr, addr, 8);
    // The immediate is sign extended to 64-bit.
    Gen<x86_64::AndqRegImm>(aligned_addr, ~int32_t{sizeof(Reservation) - 1}, GetFlagsRegister());

    // Load current monitor value before we clobber it.
    auto reservation_value = AllocTempReg();
    int32_t value_offset = GetThreadStateReservationValueOffset();
    Gen<x86_64::MovqRegMemBaseDisp>(reservation_value, x86_64::kMachineRegRBP, value_offset);
    Register addr_offset = AllocTempReg();
    Gen<PseudoCopy>(addr_offset, addr, 8);
    Gen<x86_64::SubqRegReg>(addr_offset, aligned_addr, GetFlagsRegister());
    // It's okay to clobber reservation_value since we clear out reservation_address in
    // MemoryRegionReservationExchange anyway.
    StoreWithoutRecovery(ToMemoryDataOperandType<IntType>(),
                         x86_64::kMachineRegRBP,
                         addr_offset,
                         value_offset,
                         data);

    return MemoryRegionReservationExchange(aligned_addr, reservation_value);
  }

  void Fence(Decoder::FenceOpcode opcode,
             Register src,
             bool sw,
             bool sr,
             bool so,
             bool si,
             bool pw,
             bool pr,
             bool po,
             bool pi);
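
  // Together Lr/Sc implement the guest LR/SC pair on top of MemoryRegionReservation: Lr caches the
  // aligned reservation granule in CPUState and returns the requested slice of it, while Sc patches
  // the cached value with the new data and attempts to publish it atomically through
  // MemoryRegionReservationExchange, whose result becomes the guest-visible success/failure status
  // of the store-conditional.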

  //
  // F and D extensions.
  //
  [[nodiscard]] FpRegister GetFpReg(uint8_t reg);

  template <typename FloatType>
  [[nodiscard]] FpRegister GetFRegAndUnboxNan(uint8_t reg) {
    CHECK_LE(reg, kNumGuestFpRegs);
    FpRegister result = AllocTempSimdReg();
    builder_.GenGetSimd<8>(result.machine_reg(), GetThreadStateFRegOffset(reg));
    FpRegister unboxed_result = AllocTempSimdReg();
    if (host_platform::kHasAVX) {
      builder_.Gen<x86_64::MacroUnboxNanFloat32AVX>(unboxed_result.machine_reg(),
                                                    result.machine_reg());
    } else {
      builder_.Gen<x86_64::MacroUnboxNanFloat32>(unboxed_result.machine_reg(),
                                                 result.machine_reg());
    }
    return unboxed_result;
  }

  template <typename FloatType>
  void NanBoxFpReg(FpRegister value) {
    if (host_platform::kHasAVX) {
      builder_.Gen<x86_64::MacroNanBoxFloat32AVX>(value.machine_reg(), value.machine_reg());
    } else {
      builder_.Gen<x86_64::MacroNanBoxFloat32>(value.machine_reg());
    }
  }

  template <typename FloatType>
  void NanBoxAndSetFpReg(uint8_t reg, FpRegister value) {
    CHECK_LE(reg, kNumGuestFpRegs);
    if (success()) {
      NanBoxFpReg<FloatType>(value);
      builder_.GenSetSimd<8>(GetThreadStateFRegOffset(reg), value.machine_reg());
    }
  }

  template <typename DataType>
  FpRegister LoadFp(Register arg, int16_t offset) {
    auto res = AllocTempSimdReg();
    if constexpr (std::is_same_v<DataType, Float32>) {
      Gen<x86_64::MovssXRegMemBaseDisp>(res.machine_reg(), arg, offset);
    } else if constexpr (std::is_same_v<DataType, Float64>) {
      Gen<x86_64::MovsdXRegMemBaseDisp>(res.machine_reg(), arg, offset);
    } else {
      static_assert(kDependentTypeFalse<DataType>);
    }
    return res;
  }

  template <typename DataType>
  void StoreFp(Register arg, int16_t offset, FpRegister data) {
    if constexpr (std::is_same_v<DataType, Float32>) {
      Gen<x86_64::MovssMemBaseDispXReg>(arg, offset, data.machine_reg());
    } else if constexpr (std::is_same_v<DataType, Float64>) {
      Gen<x86_64::MovsdMemBaseDispXReg>(arg, offset, data.machine_reg());
    } else {
      static_assert(kDependentTypeFalse<DataType>);
    }
  }

  FpRegister Fmv(FpRegister arg) {
    auto res = AllocTempSimdReg();
    Gen<PseudoCopy>(res.machine_reg(), arg.machine_reg(), 16);
    return res;
  }

  //
  // V extension.
  //

  template <typename VOpArgs, typename... ExtraArgs>
  void OpVector(const VOpArgs& /*args*/, ExtraArgs... /*extra_args*/) {
    // TODO(b/300690740): develop and implement a strategy which would allow us to support vector
    // intrinsics not just in the interpreter.
    Undefined();
  }

  //
  // Csr
  //

  Register UpdateCsr(Decoder::CsrOpcode opcode, Register arg, Register csr);
  Register UpdateCsr(Decoder::CsrImmOpcode opcode, uint8_t imm, Register csr);

  [[nodiscard]] bool success() const { return success_; }

  //
  // Intrinsic proxy methods.
  //

#include "berberis/intrinsics/translator_intrinsics_hooks-inl.h"
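
  // The included hooks are expected to expand into thin proxy methods (e.g. FeGetExceptions(),
  // CPUClockCount(), FeSetRound()) that forward to the CallIntrinsic<...>() overloads in the
  // private section below, so the CSR and floating-point code in this header can invoke
  // intrinsics as ordinary member functions.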

  //
  // Guest state getters/setters.
  //

  [[nodiscard]] GuestAddr GetInsnAddr() const { return pc_; }
  void IncrementInsnAddr(uint8_t insn_size) { pc_ += insn_size; }

  [[nodiscard]] bool IsRegionEndReached() const;
  void StartInsn();
  void Finalize(GuestAddr stop_pc);

  // These methods are exported only for testing.
  [[nodiscard]] const ArenaMap<GuestAddr, MachineInsnPosition>& branch_targets() const {
    return branch_targets_;
  }

  template <CsrName kName>
  [[nodiscard]] Register GetCsr() {
    auto csr_reg = AllocTempReg();
    if constexpr (std::is_same_v<CsrFieldType<kName>, uint8_t>) {
      Gen<x86_64::MovzxblRegMemBaseDisp>(csr_reg, x86_64::kMachineRegRBP, kCsrFieldOffset<kName>);
    } else if constexpr (std::is_same_v<CsrFieldType<kName>, uint64_t>) {
      Gen<x86_64::MovqRegMemBaseDisp>(csr_reg, x86_64::kMachineRegRBP, kCsrFieldOffset<kName>);
    } else {
      static_assert(kDependentTypeFalse<CsrFieldType<kName>>);
    }
    return csr_reg;
  }

  template <CsrName kName>
  void SetCsr(uint8_t imm) {
    // Note: CSR immediates only have 5 bits in the RISC-V encoding, which guarantees that
    // "imm & kCsrMask<kName>" can be used as an 8-bit immediate.
    if constexpr (std::is_same_v<CsrFieldType<kName>, uint8_t>) {
      Gen<x86_64::MovbMemBaseDispImm>(x86_64::kMachineRegRBP,
                                      kCsrFieldOffset<kName>,
                                      static_cast<int8_t>(imm & kCsrMask<kName>));
    } else if constexpr (std::is_same_v<CsrFieldType<kName>, uint64_t>) {
      Gen<x86_64::MovbMemBaseDispImm>(x86_64::kMachineRegRBP,
                                      kCsrFieldOffset<kName>,
                                      static_cast<int8_t>(imm & kCsrMask<kName>));
    } else {
      static_assert(kDependentTypeFalse<CsrFieldType<kName>>);
    }
  }

  template <CsrName kName>
  void SetCsr(Register arg) {
    auto tmp = AllocTempReg();
    Gen<PseudoCopy>(tmp, arg, sizeof(CsrFieldType<kName>));
    if constexpr (sizeof(CsrFieldType<kName>) == 1) {
      Gen<x86_64::AndbRegImm>(tmp, kCsrMask<kName>, GetFlagsRegister());
      Gen<x86_64::MovbMemBaseDispReg>(x86_64::kMachineRegRBP, kCsrFieldOffset<kName>, tmp);
    } else if constexpr (sizeof(CsrFieldType<kName>) == 8) {
      Gen<x86_64::AndqRegMemAbsolute>(
          tmp, constants_pool::kConst<uint64_t{kCsrMask<kName>}>, GetFlagsRegister());
      Gen<x86_64::MovqMemBaseDispReg>(x86_64::kMachineRegRBP, kCsrFieldOffset<kName>, tmp);
    } else {
      static_assert(kDependentTypeFalse<CsrFieldType<kName>>);
    }
  }

 private:
  // Specialization for AssemblerResType=void.
  template <auto kFunction,
            typename AssemblerResType,
            typename... AssemblerArgType,
            std::enable_if_t<std::is_same_v<std::decay_t<AssemblerResType>, void>, bool> = true>
  void CallIntrinsic(AssemblerArgType... args) {
    if (TryInlineIntrinsicForHeavyOptimizerVoid<kFunction>(
            &builder_, GetFlagsRegister(), args...)) {
      return;
    }

    CallIntrinsicImpl(&builder_, kFunction, GetFlagsRegister(), args...);
  }

  template <auto kFunction,
            typename AssemblerResType,
            typename... AssemblerArgType,
            std::enable_if_t<!std::is_same_v<std::decay_t<AssemblerResType>, void>, bool> = true>
  AssemblerResType CallIntrinsic(AssemblerArgType... args) {
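    // Allocate a destination matching the shape of the intrinsic's result (a GPR, a SIMD register,
    // or a tuple of them), then either inline the intrinsic or fall back to an out-of-line call.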
    AssemblerResType result;

    if constexpr (std::is_same_v<AssemblerResType, Register>) {
      result = AllocTempReg();
    } else if constexpr (std::is_same_v<AssemblerResType, SimdReg>) {
      result = AllocTempSimdReg();
    } else if constexpr (std::is_same_v<AssemblerResType, std::tuple<Register, Register>>) {
      result = {AllocTempReg(), AllocTempReg()};
    } else if constexpr (std::is_same_v<AssemblerResType, std::tuple<SimdReg, Register>>) {
      result = {AllocTempSimdReg(), AllocTempReg()};
    } else if constexpr (std::is_same_v<AssemblerResType, std::tuple<SimdReg, SimdReg>>) {
      result = {AllocTempSimdReg(), AllocTempSimdReg()};
    } else if constexpr (std::is_same_v<AssemblerResType, std::tuple<SimdReg, SimdReg, SimdReg>>) {
      result = {AllocTempSimdReg(), AllocTempSimdReg(), AllocTempSimdReg()};
    } else if constexpr (std::is_same_v<AssemblerResType,
                                        std::tuple<SimdReg, SimdReg, SimdReg, SimdReg>>) {
      result = {AllocTempSimdReg(), AllocTempSimdReg(), AllocTempSimdReg(), AllocTempSimdReg()};
    } else {
      // This should not be reached by the compiler. If it is, there is a new result type that
      // needs to be supported.
      static_assert(kDependentTypeFalse<AssemblerResType>, "Unsupported result type");
    }

    if (TryInlineIntrinsicForHeavyOptimizer<kFunction>(
            &builder_, result, GetFlagsRegister(), args...)) {
      return result;
    }

    CallIntrinsicImpl(&builder_, kFunction, result, GetFlagsRegister(), args...);
    return result;
  }

  void MemoryRegionReservationLoad(Register aligned_addr);
  Register MemoryRegionReservationExchange(Register aligned_addr, Register curr_reservation_value);
  void MemoryRegionReservationSwapWithLockedOwner(Register aligned_addr,
                                                  Register curr_reservation_value,
                                                  Register new_reservation_value,
                                                  MachineBasicBlock* failure_bb);

  // Syntax sugar.
  template <typename InsnType, typename... Args>
  /*may_discard*/ InsnType* Gen(Args... args) {
    return builder_.Gen<InsnType, Args...>(args...);
  }

  static x86_64::Assembler::Condition ToAssemblerCond(Decoder::BranchOpcode opcode);

  [[nodiscard]] Register AllocTempReg();
  [[nodiscard]] SimdReg AllocTempSimdReg();
  [[nodiscard]] Register GetFlagsRegister() const { return flag_register_; }

  void GenJump(GuestAddr target);
  void ExitGeneratedCode(GuestAddr target);
  void ExitRegionIndirect(Register target);

  void GenRecoveryBlockForLastInsn();

  void ResolveJumps();
  void ReplaceJumpWithBranch(MachineBasicBlock* bb, MachineBasicBlock* target_bb);
  void UpdateBranchTargetsAfterSplit(GuestAddr addr,
                                     const MachineBasicBlock* old_bb,
                                     MachineBasicBlock* new_bb);

  void StartRegion() {
    auto* region_entry_bb = builder_.ir()->NewBasicBlock();
    auto* cont_bb = builder_.ir()->NewBasicBlock();
    builder_.ir()->AddEdge(region_entry_bb, cont_bb);
    builder_.StartBasicBlock(region_entry_bb);
    Gen<PseudoBranch>(cont_bb);
    builder_.StartBasicBlock(cont_bb);
  }

  GuestAddr pc_;
  bool success_;
  x86_64::MachineIRBuilder builder_;
  MachineReg flag_register_;
  bool is_uncond_branch_;
  // Contains IR positions of all guest instructions of the current region.
  // Also contains all branch targets which the current region jumps to.
  // If the target is outside of the current region, the position is uninitialized,
  // i.e. its basic block (position.first) is nullptr.
  ArenaMap<GuestAddr, MachineInsnPosition> branch_targets_;
};

template <>
[[nodiscard]] inline HeavyOptimizerFrontend::FpRegister
HeavyOptimizerFrontend::GetFRegAndUnboxNan<intrinsics::Float64>(uint8_t reg) {
  return GetFpReg(reg);
}

template <>
inline void HeavyOptimizerFrontend::NanBoxFpReg<intrinsics::Float64>(FpRegister) {}

template <>
[[nodiscard]] inline HeavyOptimizerFrontend::Register
HeavyOptimizerFrontend::GetCsr<CsrName::kCycle>() {
  return CPUClockCount();
}

template <>
[[nodiscard]] inline HeavyOptimizerFrontend::Register
HeavyOptimizerFrontend::GetCsr<CsrName::kFCsr>() {
  auto csr_reg = AllocTempReg();
  auto tmp = AllocTempReg();
  InlineIntrinsicForHeavyOptimizer<&intrinsics::FeGetExceptions>(
      &builder_, tmp, GetFlagsRegister());
  Gen<x86_64::MovzxbqRegMemBaseDisp>(
      csr_reg, x86_64::kMachineRegRBP, kCsrFieldOffset<CsrName::kFrm>);
  Gen<x86_64::ShlbRegImm>(csr_reg, 5, GetFlagsRegister());
  Gen<x86_64::OrbRegReg>(csr_reg, tmp, GetFlagsRegister());
  return csr_reg;
}

template <>
[[nodiscard]] inline HeavyOptimizerFrontend::Register
HeavyOptimizerFrontend::GetCsr<CsrName::kFFlags>() {
  return FeGetExceptions();
}

template <>
[[nodiscard]] inline HeavyOptimizerFrontend::Register
HeavyOptimizerFrontend::GetCsr<CsrName::kVlenb>() {
  return GetImm(16);
}

template <>
[[nodiscard]] inline HeavyOptimizerFrontend::Register
HeavyOptimizerFrontend::GetCsr<CsrName::kVxrm>() {
  auto reg = AllocTempReg();
  Gen<x86_64::MovzxbqRegMemBaseDisp>(reg, x86_64::kMachineRegRBP, kCsrFieldOffset<CsrName::kVcsr>);
  Gen<x86_64::AndbRegImm>(reg, 0b11, GetFlagsRegister());
  return reg;
}

template <>
[[nodiscard]] inline HeavyOptimizerFrontend::Register
HeavyOptimizerFrontend::GetCsr<CsrName::kVxsat>() {
  auto reg = AllocTempReg();
  Gen<x86_64::MovzxbqRegMemBaseDisp>(reg, x86_64::kMachineRegRBP, kCsrFieldOffset<CsrName::kVcsr>);
  Gen<x86_64::ShrbRegImm>(reg, 2, GetFlagsRegister());
  return reg;
}

template <>
inline void HeavyOptimizerFrontend::SetCsr<CsrName::kFCsr>(uint8_t imm) {
  // Note: the Csrrci and Csrrsi instructions couldn't affect Frm because the immediate only has
  // five bits. But these instructions don't pass their immediate-specified argument into `SetCsr`;
  // they combine it with the register first. Fixing that can only be done by changing code in the
  // semantics player.
  //
  // But Csrrwi may clear it. And we actually may only arrive here from Csrrwi.
  // Thus, technically, we know that imm >> 5 is always zero, but it doesn't look like a good idea
  // to rely on that: it's very subtle and it only affects code generation speed.
  Gen<x86_64::MovbMemBaseDispImm>(
      x86_64::kMachineRegRBP, kCsrFieldOffset<CsrName::kFrm>, static_cast<int8_t>(imm >> 5));
  InlineIntrinsicForHeavyOptimizerVoid<&intrinsics::FeSetExceptionsAndRoundImm>(
      &builder_, GetFlagsRegister(), imm);
}
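
// Both kFCsr specializations rely on the architectural fcsr layout: fflags occupies bits 0-4 and
// frm occupies bits 5-7. That is why the immediate variant above extracts the rounding mode with
// `imm >> 5`, while the register variant below isolates the exception flags with 0b1'1111 and
// recovers the rounding mode by shifting the argument right by five bits.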
template <>
inline void HeavyOptimizerFrontend::SetCsr<CsrName::kFCsr>(Register arg) {
  // Check size to be sure we can use Andb and Movb below.
  static_assert(sizeof(kCsrMask<CsrName::kFrm>) == 1);

  auto exceptions = AllocTempReg();
  auto rounding_mode = AllocTempReg();
  Gen<PseudoCopy>(exceptions, arg, 1);
  Gen<x86_64::AndlRegImm>(exceptions, 0b1'1111, GetFlagsRegister());
  // We don't care about the data in rounding_mode because we will shift in the
  // data we need.
  Gen<PseudoDefReg>(rounding_mode);
  Gen<x86_64::ShldlRegRegImm>(rounding_mode, arg, int8_t{32 - 5}, GetFlagsRegister());
  Gen<x86_64::AndbRegImm>(rounding_mode, kCsrMask<CsrName::kFrm>, GetFlagsRegister());
  Gen<x86_64::MovbMemBaseDispReg>(
      x86_64::kMachineRegRBP, kCsrFieldOffset<CsrName::kFrm>, rounding_mode);
  InlineIntrinsicForHeavyOptimizerVoid<&intrinsics::FeSetExceptionsAndRound>(
      &builder_, GetFlagsRegister(), exceptions, rounding_mode);
}

template <>
inline void HeavyOptimizerFrontend::SetCsr<CsrName::kFFlags>(uint8_t imm) {
  FeSetExceptionsImm(static_cast<int8_t>(imm & 0b1'1111));
}

template <>
inline void HeavyOptimizerFrontend::SetCsr<CsrName::kFFlags>(Register arg) {
  auto tmp = AllocTempReg();
  Gen<PseudoCopy>(tmp, arg, 1);
  Gen<x86_64::AndlRegImm>(tmp, 0b1'1111, GetFlagsRegister());
  FeSetExceptions(tmp);
}

template <>
inline void HeavyOptimizerFrontend::SetCsr<CsrName::kFrm>(uint8_t imm) {
  Gen<x86_64::MovbMemBaseDispImm>(x86_64::kMachineRegRBP,
                                  kCsrFieldOffset<CsrName::kFrm>,
                                  static_cast<int8_t>(imm & kCsrMask<CsrName::kFrm>));
  FeSetRoundImm(static_cast<int8_t>(imm & kCsrMask<CsrName::kFrm>));
}

template <>
inline void HeavyOptimizerFrontend::SetCsr<CsrName::kFrm>(Register arg) {
  // Use RCX as a temporary register. We know it would be used by FeSetRound, too.
  auto tmp = AllocTempReg();
  Gen<PseudoCopy>(tmp, arg, 1);
  Gen<x86_64::AndbRegImm>(tmp, kCsrMask<CsrName::kFrm>, GetFlagsRegister());
  Gen<x86_64::MovbMemBaseDispReg>(x86_64::kMachineRegRBP, kCsrFieldOffset<CsrName::kFrm>, tmp);
  FeSetRound(tmp);
}

template <>
inline void HeavyOptimizerFrontend::SetCsr<CsrName::kVxrm>(uint8_t imm) {
  imm &= 0b11;
  if (imm != 0b11) {
    Gen<x86_64::AndbMemBaseDispImm>(
        x86_64::kMachineRegRBP, kCsrFieldOffset<CsrName::kVcsr>, 0b100, GetFlagsRegister());
  }
  if (imm != 0b00) {
    Gen<x86_64::OrbMemBaseDispImm>(
        x86_64::kMachineRegRBP, kCsrFieldOffset<CsrName::kVcsr>, imm, GetFlagsRegister());
  }
}

template <>
inline void HeavyOptimizerFrontend::SetCsr<CsrName::kVxrm>(Register arg) {
  Gen<x86_64::AndbMemBaseDispImm>(
      x86_64::kMachineRegRBP, kCsrFieldOffset<CsrName::kVcsr>, 0b100, GetFlagsRegister());
  Gen<x86_64::AndbRegImm>(arg, 0b11, GetFlagsRegister());
  Gen<x86_64::OrbMemBaseDispReg>(
      x86_64::kMachineRegRBP, kCsrFieldOffset<CsrName::kVcsr>, arg, GetFlagsRegister());
}

template <>
inline void HeavyOptimizerFrontend::SetCsr<CsrName::kVxsat>(uint8_t imm) {
  if (imm & 0b1) {
    Gen<x86_64::OrbMemBaseDispImm>(
        x86_64::kMachineRegRBP, kCsrFieldOffset<CsrName::kVcsr>, 0b100, GetFlagsRegister());
  } else {
    Gen<x86_64::AndbMemBaseDispImm>(
        x86_64::kMachineRegRBP, kCsrFieldOffset<CsrName::kVcsr>, 0b11, GetFlagsRegister());
  }
}

template <>
inline void HeavyOptimizerFrontend::SetCsr<CsrName::kVxsat>(Register arg) {
  using Condition = x86_64::Assembler::Condition;
  Gen<x86_64::AndbMemBaseDispImm>(
      x86_64::kMachineRegRBP, kCsrFieldOffset<CsrName::kVcsr>, 0b11, GetFlagsRegister());
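  // Re-derive vxsat from bit 0 of arg: materialize arg & 1 as 0/1 with setcc, move it into bit 2,
  // and OR it back into vcsr (the And above has already cleared the old vxsat bit).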
  Gen<x86_64::TestbRegImm>(arg, 1, GetFlagsRegister());
  auto tmp = AllocTempReg();
  Gen<x86_64::SetccReg>(Condition::kNotZero, tmp, GetFlagsRegister());
  Gen<x86_64::MovzxbqRegReg>(tmp, tmp);
  Gen<x86_64::ShlbRegImm>(tmp, int8_t{2}, GetFlagsRegister());
  Gen<x86_64::OrbMemBaseDispReg>(
      x86_64::kMachineRegRBP, kCsrFieldOffset<CsrName::kVcsr>, tmp, GetFlagsRegister());
}

}  // namespace berberis

#endif /* BERBERIS_HEAVY_OPTIMIZER_RISCV64_FRONTEND_H_ */