/*
 * Copyright (C) 2023 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef BERBERIS_HEAVY_OPTIMIZER_RISCV64_FRONTEND_H_
#define BERBERIS_HEAVY_OPTIMIZER_RISCV64_FRONTEND_H_

#include "berberis/backend/x86_64/machine_ir.h"
#include "berberis/backend/x86_64/machine_ir_builder.h"
#include "berberis/base/arena_map.h"
#include "berberis/base/checks.h"
#include "berberis/base/dependent_false.h"
#include "berberis/decoder/riscv64/decoder.h"
#include "berberis/decoder/riscv64/semantics_player.h"
#include "berberis/guest_state/guest_addr.h"
#include "berberis/guest_state/guest_state_arch.h"
#include "berberis/guest_state/guest_state_opaque.h"
#include "berberis/intrinsics/intrinsics.h"
#include "berberis/intrinsics/macro_assembler.h"
#include "berberis/runtime_primitives/memory_region_reservation.h"
#include "berberis/runtime_primitives/platform.h"

#include "call_intrinsic.h"
#include "inline_intrinsic.h"
#include "simd_register.h"

namespace berberis {

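// Frontend of the riscv64 "heavy optimizer" translator: it is driven by the
// riscv64 Decoder/SemanticsPlayer and lowers each guest instruction into
// x86-64 machine IR (x86_64::MachineIR) through MachineIRBuilder.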
class HeavyOptimizerFrontend {
 public:
  using CsrName = berberis::CsrName;
  using Decoder = Decoder<SemanticsPlayer<HeavyOptimizerFrontend>>;
  using Register = MachineReg;
  static constexpr Register no_register = MachineReg{};
  using FpRegister = SimdReg;
  static constexpr SimdReg no_fp_register = SimdReg{};
  using Float32 = intrinsics::Float32;
  using Float64 = intrinsics::Float64;

  struct MemoryOperand {
    Register base{0};
    // We call the following field "index" even though we do not scale it at the
    // moment.  We can add a scale as the need arises.
    Register index{0};
    uint64_t disp = 0;
  };

  explicit HeavyOptimizerFrontend(x86_64::MachineIR* machine_ir, GuestAddr pc)
      : pc_(pc),
        success_(true),
        builder_(machine_ir),
        flag_register_(machine_ir->AllocVReg()),
        is_uncond_branch_(false),
        branch_targets_(machine_ir->arena()) {
    StartRegion();
  }

  void CompareAndBranch(Decoder::BranchOpcode opcode, Register arg1, Register arg2, int16_t offset);
  void Branch(int32_t offset);
  void BranchRegister(Register base, int16_t offset);

  [[nodiscard]] Register GetImm(uint64_t imm);
  [[nodiscard]] Register Copy(Register value) {
    Register result = AllocTempReg();
    Gen<PseudoCopy>(result, value, 8);
    return result;
  }

  [[nodiscard]] Register GetReg(uint8_t reg);
  void SetReg(uint8_t reg, Register value);

  void Undefined();

  //
  // Instruction implementations.
  //
  void Nop();
  Register Op(Decoder::OpOpcode opcode, Register arg1, Register arg2);
  Register Op32(Decoder::Op32Opcode opcode, Register arg1, Register arg2);
  Register OpImm(Decoder::OpImmOpcode opcode, Register arg, int16_t imm);
  Register OpImm32(Decoder::OpImm32Opcode opcode, Register arg, int16_t imm);
  Register Slli(Register arg, int8_t imm);
  Register Srli(Register arg, int8_t imm);
  Register Srai(Register arg, int8_t imm);
  Register ShiftImm32(Decoder::ShiftImm32Opcode opcode, Register arg, uint16_t imm);
  Register Rori(Register arg, int8_t shamt);
  Register Roriw(Register arg, int8_t shamt);
  Register Lui(int32_t imm);
  Register Auipc(int32_t imm);

  Register Ecall(Register /* syscall_nr */,
                 Register /* arg0 */,
                 Register /* arg1 */,
                 Register /* arg2 */,
                 Register /* arg3 */,
                 Register /* arg4 */,
                 Register /* arg5 */) {
    Undefined();
    return {};
  }

  void Store(Decoder::MemoryDataOperandType operand_type,
             Register arg,
             int16_t offset,
             Register data);
  Register Load(Decoder::LoadOperandType operand_type, Register arg, int16_t offset);

  template <typename IntType>
  constexpr Decoder::LoadOperandType ToLoadOperandType() {
    if constexpr (std::is_same_v<IntType, int8_t>) {
      return Decoder::LoadOperandType::k8bitSigned;
    } else if constexpr (std::is_same_v<IntType, int16_t>) {
      return Decoder::LoadOperandType::k16bitSigned;
    } else if constexpr (std::is_same_v<IntType, int32_t>) {
      return Decoder::LoadOperandType::k32bitSigned;
    } else if constexpr (std::is_same_v<IntType, int64_t> || std::is_same_v<IntType, uint64_t>) {
      return Decoder::LoadOperandType::k64bit;
    } else if constexpr (std::is_same_v<IntType, uint8_t>) {
      return Decoder::LoadOperandType::k8bitUnsigned;
    } else if constexpr (std::is_same_v<IntType, uint16_t>) {
      return Decoder::LoadOperandType::k16bitUnsigned;
    } else if constexpr (std::is_same_v<IntType, uint32_t>) {
      return Decoder::LoadOperandType::k32bitUnsigned;
    } else {
      static_assert(kDependentTypeFalse<IntType>);
    }
  }

  template <typename IntType>
  constexpr Decoder::MemoryDataOperandType ToMemoryDataOperandType() {
    if constexpr (std::is_same_v<IntType, int8_t> || std::is_same_v<IntType, uint8_t>) {
      return Decoder::MemoryDataOperandType::k8bit;
    } else if constexpr (std::is_same_v<IntType, int16_t> || std::is_same_v<IntType, uint16_t>) {
      return Decoder::MemoryDataOperandType::k16bit;
    } else if constexpr (std::is_same_v<IntType, int32_t> || std::is_same_v<IntType, uint32_t>) {
      return Decoder::MemoryDataOperandType::k32bit;
    } else if constexpr (std::is_same_v<IntType, int64_t> || std::is_same_v<IntType, uint64_t>) {
      return Decoder::MemoryDataOperandType::k64bit;
    } else {
      static_assert(kDependentTypeFalse<IntType>);
    }
  }

  // Versions without recovery can be used to access non-guest memory (e.g. CPUState).
  Register LoadWithoutRecovery(Decoder::LoadOperandType operand_type, Register base, int32_t disp);
  Register LoadWithoutRecovery(Decoder::LoadOperandType operand_type,
                               Register base,
                               Register index,
                               int32_t disp);
  void StoreWithoutRecovery(Decoder::MemoryDataOperandType operand_type,
                            Register base,
                            int32_t disp,
                            Register val);
  void StoreWithoutRecovery(Decoder::MemoryDataOperandType operand_type,
                            Register base,
                            Register index,
                            int32_t disp,
                            Register val);

  //
  // Atomic extensions.
  //

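  // Load-reserved (LR) is emulated through MemoryRegionReservation: the address
  // is aligned down to the reservation granule, MemoryRegionReservationLoad
  // captures that granule into the reservation_value slot of the guest
  // CPUState, and the requested part is then read back from that slot at the
  // original address's offset within the granule.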
  template <typename IntType, bool aq, bool rl>
  Register Lr(Register addr) {
    Register aligned_addr = AllocTempReg();
    Gen<PseudoCopy>(aligned_addr, addr, 8);
    // The immediate is sign extended to 64-bit.
    Gen<x86_64::AndqRegImm>(aligned_addr, ~int32_t{sizeof(Reservation) - 1}, GetFlagsRegister());

    MemoryRegionReservationLoad(aligned_addr);

    Register addr_offset = AllocTempReg();
    Gen<PseudoCopy>(addr_offset, addr, 8);
    Gen<x86_64::SubqRegReg>(addr_offset, aligned_addr, GetFlagsRegister());

    // Load the requested part from CPUState.
    return LoadWithoutRecovery(ToLoadOperandType<IntType>(),
                               x86_64::kMachineRegRBP,
                               addr_offset,
                               GetThreadStateReservationValueOffset());
  }

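  // Store-conditional (SC) mirrors Lr: the new data is first written into the
  // CPUState reservation_value copy at the matching offset, and
  // MemoryRegionReservationExchange then attempts to publish that copy back to
  // memory, yielding the SC success/failure result expected by the guest.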
  template <typename IntType, bool aq, bool rl>
  Register Sc(Register addr, Register data) {
    // Compute aligned_addr.
    auto aligned_addr = AllocTempReg();
    Gen<PseudoCopy>(aligned_addr, addr, 8);
    // The immediate is sign extended to 64-bit.
    Gen<x86_64::AndqRegImm>(aligned_addr, ~int32_t{sizeof(Reservation) - 1}, GetFlagsRegister());

    // Load current monitor value before we clobber it.
    auto reservation_value = AllocTempReg();
    int32_t value_offset = GetThreadStateReservationValueOffset();
    Gen<x86_64::MovqRegMemBaseDisp>(reservation_value, x86_64::kMachineRegRBP, value_offset);
    Register addr_offset = AllocTempReg();
    Gen<PseudoCopy>(addr_offset, addr, 8);
    Gen<x86_64::SubqRegReg>(addr_offset, aligned_addr, GetFlagsRegister());
    // It's okay to clobber reservation_value since we clear out reservation_address in
    // MemoryRegionReservationExchange anyway.
    StoreWithoutRecovery(ToMemoryDataOperandType<IntType>(),
                         x86_64::kMachineRegRBP,
                         addr_offset,
                         value_offset,
                         data);

    return MemoryRegionReservationExchange(aligned_addr, reservation_value);
  }

  void Fence(Decoder::FenceOpcode opcode,
             Register src,
             bool sw,
             bool sr,
             bool so,
             bool si,
             bool pw,
             bool pr,
             bool po,
             bool pi);

  //
  // F and D extensions.
  //
  [[nodiscard]] FpRegister GetFpReg(uint8_t reg);

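  // Float32 values are kept NaN-boxed in the 64-bit guest FP registers (upper
  // 32 bits all ones, per the RISC-V spec), e.g. 1.0f is stored as the bit
  // pattern 0xFFFFFFFF'3F800000. Reads therefore unbox and writes re-box; the
  // Float64 specializations below skip both steps.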
  template <typename FloatType>
  [[nodiscard]] FpRegister GetFRegAndUnboxNan(uint8_t reg) {
    CHECK_LE(reg, kNumGuestFpRegs);
    FpRegister result = AllocTempSimdReg();
    builder_.GenGetSimd<8>(result.machine_reg(), GetThreadStateFRegOffset(reg));
    FpRegister unboxed_result = AllocTempSimdReg();
    if (host_platform::kHasAVX) {
      builder_.Gen<x86_64::MacroUnboxNanFloat32AVX>(unboxed_result.machine_reg(),
                                                    result.machine_reg());
    } else {
      builder_.Gen<x86_64::MacroUnboxNanFloat32>(unboxed_result.machine_reg(),
                                                 result.machine_reg());
    }
    return unboxed_result;
  }

  template <typename FloatType>
  void NanBoxFpReg(FpRegister value) {
    if (host_platform::kHasAVX) {
      builder_.Gen<x86_64::MacroNanBoxFloat32AVX>(value.machine_reg(), value.machine_reg());
    } else {
      builder_.Gen<x86_64::MacroNanBoxFloat32>(value.machine_reg());
    }
  }

  template <typename FloatType>
  void NanBoxAndSetFpReg(uint8_t reg, FpRegister value) {
    CHECK_LE(reg, kNumGuestFpRegs);
    if (success()) {
      NanBoxFpReg<FloatType>(value);
      builder_.GenSetSimd<8>(GetThreadStateFRegOffset(reg), value.machine_reg());
    }
  }

  template <typename DataType>
  FpRegister LoadFp(Register arg, int16_t offset) {
    auto res = AllocTempSimdReg();
    if constexpr (std::is_same_v<DataType, Float32>) {
      Gen<x86_64::MovssXRegMemBaseDisp>(res.machine_reg(), arg, offset);
    } else if constexpr (std::is_same_v<DataType, Float64>) {
      Gen<x86_64::MovsdXRegMemBaseDisp>(res.machine_reg(), arg, offset);
    } else {
      static_assert(kDependentTypeFalse<DataType>);
    }
    return res;
  }

  template <typename DataType>
  void StoreFp(Register arg, int16_t offset, FpRegister data) {
    if constexpr (std::is_same_v<DataType, Float32>) {
      Gen<x86_64::MovssMemBaseDispXReg>(arg, offset, data.machine_reg());
    } else if constexpr (std::is_same_v<DataType, Float64>) {
      Gen<x86_64::MovsdMemBaseDispXReg>(arg, offset, data.machine_reg());
    } else {
      static_assert(kDependentTypeFalse<DataType>);
    }
  }

  FpRegister Fmv(FpRegister arg) {
    auto res = AllocTempSimdReg();
    Gen<PseudoCopy>(res.machine_reg(), arg.machine_reg(), 16);
    return res;
  }

  //
  // V extension.
  //

  template <typename VOpArgs, typename... ExtraArgs>
  void OpVector(const VOpArgs& /*args*/, ExtraArgs... /*extra_args*/) {
    // TODO(b/300690740): develop and implement a strategy that would allow us to support vector
    // intrinsics outside the interpreter as well.
    Undefined();
  }

  //
  // Csr
  //

  Register UpdateCsr(Decoder::CsrOpcode opcode, Register arg, Register csr);
  Register UpdateCsr(Decoder::CsrImmOpcode opcode, uint8_t imm, Register csr);

  [[nodiscard]] bool success() const { return success_; }

  //
  // Intrinsic proxy methods.
  //

#include "berberis/intrinsics/translator_intrinsics_hooks-inl.h"

  //
  // Guest state getters/setters.
  //

  [[nodiscard]] GuestAddr GetInsnAddr() const { return pc_; }
  void IncrementInsnAddr(uint8_t insn_size) { pc_ += insn_size; }

  [[nodiscard]] bool IsRegionEndReached() const;
  void StartInsn();
  void Finalize(GuestAddr stop_pc);

  // These methods are exported only for testing.
  [[nodiscard]] const ArenaMap<GuestAddr, MachineInsnPosition>& branch_targets() const {
    return branch_targets_;
  }

  template <CsrName kName>
  [[nodiscard]] Register GetCsr() {
    auto csr_reg = AllocTempReg();
    if constexpr (std::is_same_v<CsrFieldType<kName>, uint8_t>) {
      Gen<x86_64::MovzxblRegMemBaseDisp>(csr_reg, x86_64::kMachineRegRBP, kCsrFieldOffset<kName>);
    } else if constexpr (std::is_same_v<CsrFieldType<kName>, uint64_t>) {
      Gen<x86_64::MovqRegMemBaseDisp>(csr_reg, x86_64::kMachineRegRBP, kCsrFieldOffset<kName>);
    } else {
      static_assert(kDependentTypeFalse<CsrFieldType<kName>>);
    }
    return csr_reg;
  }

  template <CsrName kName>
  void SetCsr(uint8_t imm) {
    // Note: a csr immediate only has 5 bits in the RISC-V encoding, which guarantees that
    // “imm & kCsrMask<kName>” can be used as an 8-bit immediate.
    if constexpr (std::is_same_v<CsrFieldType<kName>, uint8_t>) {
      Gen<x86_64::MovbMemBaseDispImm>(x86_64::kMachineRegRBP,
                                      kCsrFieldOffset<kName>,
                                      static_cast<int8_t>(imm & kCsrMask<kName>));
    } else if constexpr (std::is_same_v<CsrFieldType<kName>, uint64_t>) {
      Gen<x86_64::MovbMemBaseDispImm>(x86_64::kMachineRegRBP,
                                      kCsrFieldOffset<kName>,
                                      static_cast<int8_t>(imm & kCsrMask<kName>));
    } else {
      static_assert(kDependentTypeFalse<CsrFieldType<kName>>);
    }
  }

  template <CsrName kName>
  void SetCsr(Register arg) {
    auto tmp = AllocTempReg();
    Gen<PseudoCopy>(tmp, arg, sizeof(CsrFieldType<kName>));
    if constexpr (sizeof(CsrFieldType<kName>) == 1) {
      Gen<x86_64::AndbRegImm>(tmp, kCsrMask<kName>, GetFlagsRegister());
      Gen<x86_64::MovbMemBaseDispReg>(x86_64::kMachineRegRBP, kCsrFieldOffset<kName>, tmp);
    } else if constexpr (sizeof(CsrFieldType<kName>) == 8) {
      Gen<x86_64::AndqRegMemAbsolute>(
          tmp, constants_pool::kConst<uint64_t{kCsrMask<kName>}>, GetFlagsRegister());
      Gen<x86_64::MovqMemBaseDispReg>(x86_64::kMachineRegRBP, kCsrFieldOffset<kName>, tmp);
    } else {
      static_assert(kDependentTypeFalse<CsrFieldType<kName>>);
    }
  }

 private:
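  // Intrinsic dispatch: CallIntrinsic first tries to expand the intrinsic
  // inline into the machine IR (TryInlineIntrinsicForHeavyOptimizer*); if no
  // inline expansion is available it falls back to an out-of-line call via
  // CallIntrinsicImpl.
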
  // Specialization for AssemblerResType=void
  template <auto kFunction,
            typename AssemblerResType,
            typename... AssemblerArgType,
            std::enable_if_t<std::is_same_v<std::decay_t<AssemblerResType>, void>, bool> = true>
  void CallIntrinsic(AssemblerArgType... args) {
    if (TryInlineIntrinsicForHeavyOptimizerVoid<kFunction>(
            &builder_, GetFlagsRegister(), args...)) {
      return;
    }

    CallIntrinsicImpl(&builder_, kFunction, GetFlagsRegister(), args...);
  }

  template <auto kFunction,
            typename AssemblerResType,
            typename... AssemblerArgType,
            std::enable_if_t<!std::is_same_v<std::decay_t<AssemblerResType>, void>, bool> = true>
  AssemblerResType CallIntrinsic(AssemblerArgType... args) {
    AssemblerResType result;

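    // Allocate destination temporaries matching the shape of the intrinsic's
    // result: a single general-purpose or SIMD register, or a tuple with one
    // temporary per element.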
    if constexpr (std::is_same_v<AssemblerResType, Register>) {
      result = AllocTempReg();
    } else if constexpr (std::is_same_v<AssemblerResType, SimdReg>) {
      result = AllocTempSimdReg();
    } else if constexpr (std::is_same_v<AssemblerResType, std::tuple<Register, Register>>) {
      result = {AllocTempReg(), AllocTempReg()};
    } else if constexpr (std::is_same_v<AssemblerResType, std::tuple<SimdReg, Register>>) {
      result = {AllocTempSimdReg(), AllocTempReg()};
    } else if constexpr (std::is_same_v<AssemblerResType, std::tuple<SimdReg, SimdReg>>) {
      result = {AllocTempSimdReg(), AllocTempSimdReg()};
    } else if constexpr (std::is_same_v<AssemblerResType, std::tuple<SimdReg, SimdReg, SimdReg>>) {
      result = {AllocTempSimdReg(), AllocTempSimdReg(), AllocTempSimdReg()};
    } else if constexpr (std::is_same_v<AssemblerResType,
                                        std::tuple<SimdReg, SimdReg, SimdReg, SimdReg>>) {
      result = {AllocTempSimdReg(), AllocTempSimdReg(), AllocTempSimdReg(), AllocTempSimdReg()};
    } else {
      // This should not be reached by the compiler. If it is, there is a new result type that
      // needs to be supported.
      static_assert(kDependentTypeFalse<AssemblerResType>, "Unsupported result type");
    }

    if (TryInlineIntrinsicForHeavyOptimizer<kFunction>(
            &builder_, result, GetFlagsRegister(), args...)) {
      return result;
    }

    CallIntrinsicImpl(&builder_, kFunction, result, GetFlagsRegister(), args...);
    return result;
  }

  void MemoryRegionReservationLoad(Register aligned_addr);
  Register MemoryRegionReservationExchange(Register aligned_addr, Register curr_reservation_value);
  void MemoryRegionReservationSwapWithLockedOwner(Register aligned_addr,
                                                  Register curr_reservation_value,
                                                  Register new_reservation_value,
                                                  MachineBasicBlock* failure_bb);

  // Syntax sugar.
  template <typename InsnType, typename... Args>
  /*may_discard*/ InsnType* Gen(Args... args) {
    return builder_.Gen<InsnType, Args...>(args...);
  }

  static x86_64::Assembler::Condition ToAssemblerCond(Decoder::BranchOpcode opcode);

  [[nodiscard]] Register AllocTempReg();
  [[nodiscard]] SimdReg AllocTempSimdReg();
  [[nodiscard]] Register GetFlagsRegister() const { return flag_register_; }

  void GenJump(GuestAddr target);
  void ExitGeneratedCode(GuestAddr target);
  void ExitRegionIndirect(Register target);

  void GenRecoveryBlockForLastInsn();

  void ResolveJumps();
  void ReplaceJumpWithBranch(MachineBasicBlock* bb, MachineBasicBlock* target_bb);
  void UpdateBranchTargetsAfterSplit(GuestAddr addr,
                                     const MachineBasicBlock* old_bb,
                                     MachineBasicBlock* new_bb);

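  // Emits the skeleton of a new translation region: a dedicated entry basic
  // block that only branches to a fresh continuation block. Guest instructions
  // are then emitted into the continuation block, so the region entry itself
  // stays empty.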
  void StartRegion() {
    auto* region_entry_bb = builder_.ir()->NewBasicBlock();
    auto* cont_bb = builder_.ir()->NewBasicBlock();
    builder_.ir()->AddEdge(region_entry_bb, cont_bb);
    builder_.StartBasicBlock(region_entry_bb);
    Gen<PseudoBranch>(cont_bb);
    builder_.StartBasicBlock(cont_bb);
  }

  GuestAddr pc_;
  bool success_;
  x86_64::MachineIRBuilder builder_;
  MachineReg flag_register_;
  bool is_uncond_branch_;
  // Contains IR positions of all guest instructions of the current region.
  // Also contains all branch targets which the current region jumps to.
  // If the target is outside of the current region, the position is uninitialized,
  // i.e. its basic block (position.first) is nullptr.
  ArenaMap<GuestAddr, MachineInsnPosition> branch_targets_;
};

template <>
[[nodiscard]] inline HeavyOptimizerFrontend::FpRegister
HeavyOptimizerFrontend::GetFRegAndUnboxNan<intrinsics::Float64>(uint8_t reg) {
  return GetFpReg(reg);
}

template <>
inline void HeavyOptimizerFrontend::NanBoxFpReg<intrinsics::Float64>(FpRegister) {}

template <>
[[nodiscard]] inline HeavyOptimizerFrontend::Register
HeavyOptimizerFrontend::GetCsr<CsrName::kCycle>() {
  return CPUClockCount();
}

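// fcsr is a composite CSR: fflags occupies bits [4:0] and frm bits [7:5], so a
// read assembles it from the host FP exception flags and the frm field stored
// in guest state.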
template <>
[[nodiscard]] inline HeavyOptimizerFrontend::Register
HeavyOptimizerFrontend::GetCsr<CsrName::kFCsr>() {
  auto csr_reg = AllocTempReg();
  auto tmp = AllocTempReg();
  InlineIntrinsicForHeavyOptimizer<&intrinsics::FeGetExceptions>(
      &builder_, tmp, GetFlagsRegister());
  Gen<x86_64::MovzxbqRegMemBaseDisp>(
      csr_reg, x86_64::kMachineRegRBP, kCsrFieldOffset<CsrName::kFrm>);
  Gen<x86_64::ShlbRegImm>(csr_reg, 5, GetFlagsRegister());
  Gen<x86_64::OrbRegReg>(csr_reg, tmp, GetFlagsRegister());
  return csr_reg;
}

template <>
[[nodiscard]] inline HeavyOptimizerFrontend::Register
HeavyOptimizerFrontend::GetCsr<CsrName::kFFlags>() {
  return FeGetExceptions();
}

template <>
[[nodiscard]] inline HeavyOptimizerFrontend::Register
HeavyOptimizerFrontend::GetCsr<CsrName::kVlenb>() {
  return GetImm(16);
}

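// vxrm and vxsat are stored packed in the vcsr guest-state field, following the
// RISC-V vcsr layout: vxrm in bits [1:0] and vxsat in bit 2. The getters and
// setters below mask and shift accordingly.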
template <>
[[nodiscard]] inline HeavyOptimizerFrontend::Register
HeavyOptimizerFrontend::GetCsr<CsrName::kVxrm>() {
  auto reg = AllocTempReg();
  Gen<x86_64::MovzxbqRegMemBaseDisp>(reg, x86_64::kMachineRegRBP, kCsrFieldOffset<CsrName::kVcsr>);
  Gen<x86_64::AndbRegImm>(reg, 0b11, GetFlagsRegister());
  return reg;
}

template <>
[[nodiscard]] inline HeavyOptimizerFrontend::Register
HeavyOptimizerFrontend::GetCsr<CsrName::kVxsat>() {
  auto reg = AllocTempReg();
  Gen<x86_64::MovzxbqRegMemBaseDisp>(reg, x86_64::kMachineRegRBP, kCsrFieldOffset<CsrName::kVcsr>);
  Gen<x86_64::ShrbRegImm>(reg, 2, GetFlagsRegister());
  return reg;
}

template <>
inline void HeavyOptimizerFrontend::SetCsr<CsrName::kFCsr>(uint8_t imm) {
  // Note: Csrrci and Csrrsi couldn't affect Frm because their immediate only has five bits.
  // But these instructions don't pass their immediate-specified argument into `SetCsr`; they
  // combine it with the register value first. Fixing that can only be done by changing code in
  // the semantics player.
  //
  // Csrrwi, however, may clear Frm, and we may in fact only arrive here from Csrrwi.
  // Thus, technically, we know that imm >> 5 is always zero, but it doesn't look like a good idea
  // to rely on that: it's very subtle and it only affects code generation speed.
  Gen<x86_64::MovbMemBaseDispImm>(
      x86_64::kMachineRegRBP, kCsrFieldOffset<CsrName::kFrm>, static_cast<int8_t>(imm >> 5));
  InlineIntrinsicForHeavyOptimizerVoid<&intrinsics::FeSetExceptionsAndRoundImm>(
      &builder_, GetFlagsRegister(), imm);
}

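// The register variant splits arg back into its fcsr components: the low five
// bits are the exception flags, and the Shldl below shifts arg's upper bits
// into rounding_mode so that arg's bits [7:5] (the frm field) land in its low
// bits.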
template <>
inline void HeavyOptimizerFrontend::SetCsr<CsrName::kFCsr>(Register arg) {
  // Check size to be sure we can use Andb and Movb below.
  static_assert(sizeof(kCsrMask<CsrName::kFrm>) == 1);

  auto exceptions = AllocTempReg();
  auto rounding_mode = AllocTempReg();
  Gen<PseudoCopy>(exceptions, arg, 1);
  Gen<x86_64::AndlRegImm>(exceptions, 0b1'1111, GetFlagsRegister());
  // We don't care about the data in rounding_mode because we will shift in the
  // data we need.
  Gen<PseudoDefReg>(rounding_mode);
  Gen<x86_64::ShldlRegRegImm>(rounding_mode, arg, int8_t{32 - 5}, GetFlagsRegister());
  Gen<x86_64::AndbRegImm>(rounding_mode, kCsrMask<CsrName::kFrm>, GetFlagsRegister());
  Gen<x86_64::MovbMemBaseDispReg>(
      x86_64::kMachineRegRBP, kCsrFieldOffset<CsrName::kFrm>, rounding_mode);
  InlineIntrinsicForHeavyOptimizerVoid<&intrinsics::FeSetExceptionsAndRound>(
      &builder_, GetFlagsRegister(), exceptions, rounding_mode);
}

template <>
inline void HeavyOptimizerFrontend::SetCsr<CsrName::kFFlags>(uint8_t imm) {
  FeSetExceptionsImm(static_cast<int8_t>(imm & 0b1'1111));
}

template <>
inline void HeavyOptimizerFrontend::SetCsr<CsrName::kFFlags>(Register arg) {
  auto tmp = AllocTempReg();
  Gen<PseudoCopy>(tmp, arg, 1);
  Gen<x86_64::AndlRegImm>(tmp, 0b1'1111, GetFlagsRegister());
  FeSetExceptions(tmp);
}

template <>
inline void HeavyOptimizerFrontend::SetCsr<CsrName::kFrm>(uint8_t imm) {
  Gen<x86_64::MovbMemBaseDispImm>(x86_64::kMachineRegRBP,
                                  kCsrFieldOffset<CsrName::kFrm>,
                                  static_cast<int8_t>(imm & kCsrMask<CsrName::kFrm>));
  FeSetRoundImm(static_cast<int8_t>(imm & kCsrMask<CsrName::kFrm>));
}

template <>
inline void HeavyOptimizerFrontend::SetCsr<CsrName::kFrm>(Register arg) {
  // Use RCX as temporary register. We know it would be used by FeSetRound, too.
  auto tmp = AllocTempReg();
  Gen<PseudoCopy>(tmp, arg, 1);
  Gen<x86_64::AndbRegImm>(tmp, kCsrMask<CsrName::kFrm>, GetFlagsRegister());
  Gen<x86_64::MovbMemBaseDispReg>(x86_64::kMachineRegRBP, kCsrFieldOffset<CsrName::kFrm>, tmp);
  FeSetRound(tmp);
}

template <>
inline void HeavyOptimizerFrontend::SetCsr<CsrName::kVxrm>(uint8_t imm) {
  imm &= 0b11;
  if (imm != 0b11) {
    Gen<x86_64::AndbMemBaseDispImm>(
        x86_64::kMachineRegRBP, kCsrFieldOffset<CsrName::kVcsr>, 0b100, GetFlagsRegister());
  }
  if (imm != 0b00) {
    Gen<x86_64::OrbMemBaseDispImm>(
        x86_64::kMachineRegRBP, kCsrFieldOffset<CsrName::kVcsr>, imm, GetFlagsRegister());
  }
}

template <>
inline void HeavyOptimizerFrontend::SetCsr<CsrName::kVxrm>(Register arg) {
  Gen<x86_64::AndbMemBaseDispImm>(
      x86_64::kMachineRegRBP, kCsrFieldOffset<CsrName::kVcsr>, 0b100, GetFlagsRegister());
  Gen<x86_64::AndbRegImm>(arg, 0b11, GetFlagsRegister());
  Gen<x86_64::OrbMemBaseDispReg>(
      x86_64::kMachineRegRBP, kCsrFieldOffset<CsrName::kVcsr>, arg, GetFlagsRegister());
}

template <>
inline void HeavyOptimizerFrontend::SetCsr<CsrName::kVxsat>(uint8_t imm) {
  if (imm & 0b1) {
    Gen<x86_64::OrbMemBaseDispImm>(
        x86_64::kMachineRegRBP, kCsrFieldOffset<CsrName::kVcsr>, 0b100, GetFlagsRegister());
  } else {
    Gen<x86_64::AndbMemBaseDispImm>(
        x86_64::kMachineRegRBP, kCsrFieldOffset<CsrName::kVcsr>, 0b11, GetFlagsRegister());
  }
}

template <>
inline void HeavyOptimizerFrontend::SetCsr<CsrName::kVxsat>(Register arg) {
  using Condition = x86_64::Assembler::Condition;
  Gen<x86_64::AndbMemBaseDispImm>(
      x86_64::kMachineRegRBP, kCsrFieldOffset<CsrName::kVcsr>, 0b11, GetFlagsRegister());
  Gen<x86_64::TestbRegImm>(arg, 1, GetFlagsRegister());
  auto tmp = AllocTempReg();
  Gen<x86_64::SetccReg>(Condition::kNotZero, tmp, GetFlagsRegister());
  Gen<x86_64::MovzxbqRegReg>(tmp, tmp);
  Gen<x86_64::ShlbRegImm>(tmp, int8_t{2}, GetFlagsRegister());
  Gen<x86_64::OrbMemBaseDispReg>(
      x86_64::kMachineRegRBP, kCsrFieldOffset<CsrName::kVcsr>, tmp, GetFlagsRegister());
}

}  // namespace berberis

#endif /* BERBERIS_HEAVY_OPTIMIZER_RISCV64_FRONTEND_H_ */