1 /*
2  * Copyright (C) 2023 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file excenaupt in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "berberis/interpreter/riscv64/interpreter.h"
18 
19 #include <atomic>
20 #include <cfenv>
21 #include <cstdint>
22 #include <cstring>
23 
24 #include "berberis/base/bit_util.h"
25 #include "berberis/base/checks.h"
26 #include "berberis/base/macros.h"
27 #include "berberis/decoder/riscv64/decoder.h"
28 #include "berberis/decoder/riscv64/semantics_player.h"
29 #include "berberis/guest_state/guest_addr.h"
30 #include "berberis/guest_state/guest_state.h"
31 #include "berberis/intrinsics/guest_cpu_flags.h"  // ToHostRoundingMode
32 #include "berberis/intrinsics/intrinsics.h"
33 #include "berberis/intrinsics/riscv64_to_all/vector_intrinsics.h"
34 #include "berberis/intrinsics/simd_register.h"
35 #include "berberis/intrinsics/type_traits.h"
36 #include "berberis/kernel_api/run_guest_syscall.h"
37 #include "berberis/runtime_primitives/memory_region_reservation.h"
38 
39 #if !defined(__aarch64__)
40 #include "berberis/intrinsics/intrinsics_float.h"
41 #include "berberis/runtime_primitives/interpret_helpers.h"
42 #include "berberis/runtime_primitives/recovery_code.h"
43 #endif
44 
45 #include "regs.h"
46 
47 #include "../faulty_memory_accesses.h"
48 
49 namespace berberis {
50 
AqRlToStdMemoryOrder(bool aq,bool rl)51 inline constexpr std::memory_order AqRlToStdMemoryOrder(bool aq, bool rl) {
52   if (aq) {
53     return rl ? std::memory_order_acq_rel : std::memory_order_acquire;
54   } else {
55     return rl ? std::memory_order_release : std::memory_order_relaxed;
56   }
57 }
58 
59 template <typename ConcreteType, template <auto> typename TemplateType>
60 inline constexpr bool IsTypeTemplateOf = false;
61 
62 template <template <auto> typename TemplateType, auto Value>
63 inline constexpr bool IsTypeTemplateOf<TemplateType<Value>, TemplateType> = true;
64 
65 class Interpreter {
66  public:
67   using CsrName = berberis::CsrName;
68   using Decoder = Decoder<SemanticsPlayer<Interpreter>>;
69   using Register = uint64_t;
70   static constexpr Register no_register = 0;
71   using FpRegister = uint64_t;
72   static constexpr FpRegister no_fp_register = 0;
73   using Float32 = intrinsics::Float32;
74   using Float64 = intrinsics::Float64;
75 
Interpreter(ThreadState * state)76   explicit Interpreter(ThreadState* state)
77       : state_(state), branch_taken_(false), exception_raised_(false) {}
78 
79   //
80   // Instruction implementations.
81   //
82 
UpdateCsr(Decoder::CsrOpcode opcode,Register arg,Register csr)83   Register UpdateCsr(Decoder::CsrOpcode opcode, Register arg, Register csr) {
84     switch (opcode) {
85       case Decoder::CsrOpcode::kCsrrs:
86         return arg | csr;
87       case Decoder::CsrOpcode::kCsrrc:
88         return ~arg & csr;
89       default:
90         Undefined();
91         return {};
92     }
93   }
94 
UpdateCsr(Decoder::CsrImmOpcode opcode,uint8_t imm,Register csr)95   Register UpdateCsr(Decoder::CsrImmOpcode opcode, uint8_t imm, Register csr) {
96     return UpdateCsr(static_cast<Decoder::CsrOpcode>(opcode), imm, csr);
97   }
98 
99 #if defined(__aarch64__)
Fence(Decoder::FenceOpcode,Register,bool sw,bool sr,bool,bool,bool pw,bool pr,bool,bool)100   void Fence(Decoder::FenceOpcode /*opcode*/,
101              Register /*src*/,
102              bool sw,
103              bool sr,
104              bool /*so*/,
105              bool /*si*/,
106              bool pw,
107              bool pr,
108              bool /*po*/,
109              bool /*pi*/) {
110     bool read_fence = sr | pr;
111     bool write_fence = sw | pw;
112     // "ish" is for inner shareable access, which is normally needed by userspace programs.
113     if (read_fence) {
114       if (write_fence) {
115         // This is equivalent to "fence rw,rw".
116         asm volatile("dmb ish" ::: "memory");
117       } else {
118         // "ishld" is equivalent to "fence r,rw", which is stronger than what we need here
119         // ("fence r,r"). However, it is the closet option that ARM offers.
120         asm volatile("dmb ishld" ::: "memory");
121       }
122     } else if (write_fence) {
123       // "st" is equivalent to "fence w,w".
124       asm volatile("dmb ishst" ::: "memory");
125     }
126     return;
127   }
128 #else
129   // Note: we prefer not to use C11/C++ atomic_thread_fence or even gcc/clang builtin
130   // __atomic_thread_fence because all these function rely on the fact that compiler never uses
131   // non-temporal loads and stores and only issue “mfence” when sequentially consistent ordering is
132   // requested. They never issue “lfence” or “sfence”.
133   // Instead we pull the page from Linux's kernel book and map read ordereding to “lfence”, write
134   // ordering to “sfence” and read-write ordering to “mfence”.
135   // This can be important in the future if we would start using nontemporal moves in manually
136   // created assembly code.
137   // Ordering affecting I/O devices is not relevant to user-space code thus we just ignore bits
138   // related to devices I/O.
Fence(Decoder::FenceOpcode,Register,bool sw,bool sr,bool,bool,bool pw,bool pr,bool,bool)139   void Fence(Decoder::FenceOpcode /*opcode*/,
140              Register /*src*/,
141              bool sw,
142              bool sr,
143              bool /*so*/,
144              bool /*si*/,
145              bool pw,
146              bool pr,
147              bool /*po*/,
148              bool /*pi*/) {
149     bool read_fence = sr | pr;
150     bool write_fence = sw | pw;
151     // Two types of fences (total store ordering fence and normal fence) are supposed to be
152     // processed differently, but only for the “read_fence && write_fence” case (otherwise total
153     // store ordering fence becomes normal fence for the “forward compatibility”), yet because x86
154     // doesn't distinguish between these two types of fences and since we are supposed to map all
155     // not-yet defined fences to normal fence (again, for the “forward compatibility”) it's Ok to
156     // just ignore opcode field.
157     if (read_fence) {
158       if (write_fence) {
159         asm volatile("mfence" ::: "memory");
160       } else {
161         asm volatile("lfence" ::: "memory");
162       }
163     } else if (write_fence) {
164       asm volatile("sfence" ::: "memory");
165     }
166     return;
167   }
168 #endif
169 
170   template <typename IntType, bool aq, bool rl>
Lr(int64_t addr)171   Register Lr(int64_t addr) {
172     static_assert(std::is_integral_v<IntType>, "Lr: IntType must be integral");
173     static_assert(std::is_signed_v<IntType>, "Lr: IntType must be signed");
174     CHECK(!exception_raised_);
175     // Address must be aligned on size of IntType.
176     CHECK((addr % sizeof(IntType)) == 0ULL);
177     return MemoryRegionReservation::Load<IntType>(&state_->cpu, addr, AqRlToStdMemoryOrder(aq, rl));
178   }
179 
180   template <typename IntType, bool aq, bool rl>
Sc(int64_t addr,IntType val)181   Register Sc(int64_t addr, IntType val) {
182     static_assert(std::is_integral_v<IntType>, "Sc: IntType must be integral");
183     static_assert(std::is_signed_v<IntType>, "Sc: IntType must be signed");
184     CHECK(!exception_raised_);
185     // Address must be aligned on size of IntType.
186     CHECK((addr % sizeof(IntType)) == 0ULL);
187     return static_cast<Register>(MemoryRegionReservation::Store<IntType>(
188         &state_->cpu, addr, val, AqRlToStdMemoryOrder(aq, rl)));
189   }
190 
Op(Decoder::OpOpcode opcode,Register arg1,Register arg2)191   Register Op(Decoder::OpOpcode opcode, Register arg1, Register arg2) {
192     switch (opcode) {
193       case Decoder::OpOpcode::kAdd:
194         return Int64(arg1) + Int64(arg2);
195       case Decoder::OpOpcode::kSub:
196         return Int64(arg1) - Int64(arg2);
197       case Decoder::OpOpcode::kAnd:
198         return Int64(arg1) & Int64(arg2);
199       case Decoder::OpOpcode::kOr:
200         return Int64(arg1) | Int64(arg2);
201       case Decoder::OpOpcode::kXor:
202         return Int64(arg1) ^ Int64(arg2);
203       case Decoder::OpOpcode::kSll:
204         return Int64(arg1) << Int64(arg2);
205       case Decoder::OpOpcode::kSrl:
206         return UInt64(arg1) >> Int64(arg2);
207       case Decoder::OpOpcode::kSra:
208         return Int64(arg1) >> Int64(arg2);
209       case Decoder::OpOpcode::kSlt:
210         return Int64(arg1) < Int64(arg2) ? 1 : 0;
211       case Decoder::OpOpcode::kSltu:
212         return UInt64(arg1) < UInt64(arg2) ? 1 : 0;
213 #if !defined(__aarch64__)
214       case Decoder::OpOpcode::kMul:
215         return Int64(arg1) * Int64(arg2);
216       case Decoder::OpOpcode::kMulh:
217         return NarrowTopHalf(Widen(Int64(arg1)) * Widen(Int64(arg2)));
218       case Decoder::OpOpcode::kMulhsu:
219         return NarrowTopHalf(Widen(Int64(arg1)) * BitCastToSigned(Widen(UInt64(arg2))));
220       case Decoder::OpOpcode::kMulhu:
221         return NarrowTopHalf(Widen(UInt64(arg1)) * Widen(UInt64(arg2)));
222 #endif
223       case Decoder::OpOpcode::kAndn:
224         return Int64(arg1) & (~Int64(arg2));
225       case Decoder::OpOpcode::kOrn:
226         return Int64(arg1) | (~Int64(arg2));
227       case Decoder::OpOpcode::kXnor:
228         return ~(Int64(arg1) ^ Int64(arg2));
229       default:
230         Undefined();
231         return {};
232     }
233   }
234 
Op32(Decoder::Op32Opcode opcode,Register arg1,Register arg2)235   Register Op32(Decoder::Op32Opcode opcode, Register arg1, Register arg2) {
236 #if defined(__aarch64__)
237     UNUSED(opcode, arg1, arg2);
238     Undefined();
239     return {};
240 #else
241     switch (opcode) {
242       case Decoder::Op32Opcode::kAddw:
243         return Widen(TruncateTo<Int32>(arg1) + TruncateTo<Int32>(arg2));
244       case Decoder::Op32Opcode::kSubw:
245         return Widen(TruncateTo<Int32>(arg1) - TruncateTo<Int32>(arg2));
246       case Decoder::Op32Opcode::kSllw:
247         return Widen(TruncateTo<Int32>(arg1) << TruncateTo<Int32>(arg2));
248       case Decoder::Op32Opcode::kSrlw:
249         return Widen(BitCastToSigned(TruncateTo<UInt32>(arg1) >> TruncateTo<Int32>(arg2)));
250       case Decoder::Op32Opcode::kSraw:
251         return Widen(TruncateTo<Int32>(arg1) >> TruncateTo<Int32>(arg2));
252       case Decoder::Op32Opcode::kMulw:
253         return Widen(TruncateTo<Int32>(arg1) * TruncateTo<Int32>(arg2));
254       default:
255         Undefined();
256         return {};
257     }
258 #endif
259   }
260 
Load(Decoder::LoadOperandType operand_type,Register arg,int16_t offset)261   Register Load(Decoder::LoadOperandType operand_type, Register arg, int16_t offset) {
262     void* ptr = ToHostAddr<void>(arg + offset);
263     switch (operand_type) {
264       case Decoder::LoadOperandType::k8bitUnsigned:
265         return Load<uint8_t>(ptr);
266       case Decoder::LoadOperandType::k16bitUnsigned:
267         return Load<uint16_t>(ptr);
268       case Decoder::LoadOperandType::k32bitUnsigned:
269         return Load<uint32_t>(ptr);
270       case Decoder::LoadOperandType::k64bit:
271         return Load<uint64_t>(ptr);
272       case Decoder::LoadOperandType::k8bitSigned:
273         return Load<int8_t>(ptr);
274       case Decoder::LoadOperandType::k16bitSigned:
275         return Load<int16_t>(ptr);
276       case Decoder::LoadOperandType::k32bitSigned:
277         return Load<int32_t>(ptr);
278       default:
279         Undefined();
280         return {};
281     }
282   }
283 
284   template <typename DataType>
LoadFp(Register arg,int16_t offset)285   FpRegister LoadFp(Register arg, int16_t offset) {
286 #if defined(__aarch64__)
287     UNUSED(arg, offset);
288     Undefined();
289     return {};
290 #else
291     static_assert(std::is_same_v<DataType, Float32> || std::is_same_v<DataType, Float64>);
292     CHECK(!exception_raised_);
293     DataType* ptr = ToHostAddr<DataType>(arg + offset);
294     FaultyLoadResult result = FaultyLoad(ptr, sizeof(DataType));
295     if (result.is_fault) {
296       exception_raised_ = true;
297       return {};
298     }
299     return result.value;
300 #endif
301   }
302 
OpImm(Decoder::OpImmOpcode opcode,Register arg,int16_t imm)303   Register OpImm(Decoder::OpImmOpcode opcode, Register arg, int16_t imm) {
304     switch (opcode) {
305       case Decoder::OpImmOpcode::kAddi:
306         return arg + int64_t{imm};
307       case Decoder::OpImmOpcode::kSlti:
308         return bit_cast<int64_t>(arg) < int64_t{imm} ? 1 : 0;
309       case Decoder::OpImmOpcode::kSltiu:
310         return arg < bit_cast<uint64_t>(int64_t{imm}) ? 1 : 0;
311       case Decoder::OpImmOpcode::kXori:
312         return arg ^ int64_t { imm };
313       case Decoder::OpImmOpcode::kOri:
314         return arg | int64_t{imm};
315       case Decoder::OpImmOpcode::kAndi:
316         return arg & int64_t{imm};
317       default:
318         Undefined();
319         return {};
320     }
321   }
322 
Lui(int32_t imm)323   Register Lui(int32_t imm) { return int64_t{imm}; }
324 
Auipc(int32_t imm)325   Register Auipc(int32_t imm) {
326     uint64_t pc = state_->cpu.insn_addr;
327     return pc + int64_t{imm};
328   }
329 
OpImm32(Decoder::OpImm32Opcode opcode,Register arg,int16_t imm)330   Register OpImm32(Decoder::OpImm32Opcode opcode, Register arg, int16_t imm) {
331 #if defined(__aarch64__)
332     UNUSED(opcode, arg, imm);
333     Undefined();
334     return {};
335 #else
336     switch (opcode) {
337       case Decoder::OpImm32Opcode::kAddiw:
338         return int32_t(arg) + int32_t{imm};
339       default:
340         Undefined();
341         return {};
342     }
343 #endif
344   }
345 
346   // TODO(b/232598137): rework ecall to not take parameters explicitly.
Ecall(Register,Register,Register,Register,Register,Register,Register)347   Register Ecall(Register /* syscall_nr */,
348                  Register /* arg0 */,
349                  Register /* arg1 */,
350                  Register /* arg2 */,
351                  Register /* arg3 */,
352                  Register /* arg4 */,
353                  Register /* arg5 */) {
354     CHECK(!exception_raised_);
355     RunGuestSyscall(state_);
356     return state_->cpu.x[A0];
357   }
358 
Slli(Register arg,int8_t imm)359   Register Slli(Register arg, int8_t imm) { return arg << imm; }
360 
Srli(Register arg,int8_t imm)361   Register Srli(Register arg, int8_t imm) { return arg >> imm; }
362 
Srai(Register arg,int8_t imm)363   Register Srai(Register arg, int8_t imm) { return bit_cast<int64_t>(arg) >> imm; }
364 
ShiftImm32(Decoder::ShiftImm32Opcode opcode,Register arg,uint16_t imm)365   Register ShiftImm32(Decoder::ShiftImm32Opcode opcode, Register arg, uint16_t imm) {
366 #if defined(__aarch64__)
367     UNUSED(opcode, arg, imm);
368     Undefined();
369     return {};
370 #else
371     switch (opcode) {
372       case Decoder::ShiftImm32Opcode::kSlliw:
373         return int32_t(arg) << int32_t{imm};
374       case Decoder::ShiftImm32Opcode::kSrliw:
375         return bit_cast<int32_t>(uint32_t(arg) >> uint32_t{imm});
376       case Decoder::ShiftImm32Opcode::kSraiw:
377         return int32_t(arg) >> int32_t{imm};
378       default:
379         Undefined();
380         return {};
381     }
382 #endif
383   }
384 
Rori(Register arg,int8_t shamt)385   Register Rori(Register arg, int8_t shamt) {
386     CheckShamtIsValid(shamt);
387     return (((uint64_t(arg) >> shamt)) | (uint64_t(arg) << (64 - shamt)));
388   }
389 
Roriw(Register arg,int8_t shamt)390   Register Roriw(Register arg, int8_t shamt) {
391 #if defined(__aarch64__)
392     UNUSED(arg, shamt);
393     Undefined();
394     return {};
395 #else
396     CheckShamt32IsValid(shamt);
397     return int32_t(((uint32_t(arg) >> shamt)) | (uint32_t(arg) << (32 - shamt)));
398 #endif
399   }
400 
Store(Decoder::MemoryDataOperandType operand_type,Register arg,int16_t offset,Register data)401   void Store(Decoder::MemoryDataOperandType operand_type,
402              Register arg,
403              int16_t offset,
404              Register data) {
405     void* ptr = ToHostAddr<void>(arg + offset);
406     switch (operand_type) {
407       case Decoder::MemoryDataOperandType::k8bit:
408         Store<uint8_t>(ptr, data);
409         break;
410       case Decoder::MemoryDataOperandType::k16bit:
411         Store<uint16_t>(ptr, data);
412         break;
413       case Decoder::MemoryDataOperandType::k32bit:
414         Store<uint32_t>(ptr, data);
415         break;
416       case Decoder::MemoryDataOperandType::k64bit:
417         Store<uint64_t>(ptr, data);
418         break;
419       default:
420         return Undefined();
421     }
422   }
423 
424   template <typename DataType>
StoreFp(Register arg,int16_t offset,FpRegister data)425   void StoreFp(Register arg, int16_t offset, FpRegister data) {
426 #if defined(__aarch64__)
427     UNUSED(arg, offset, data);
428     Undefined();
429 #else
430     static_assert(std::is_same_v<DataType, Float32> || std::is_same_v<DataType, Float64>);
431     CHECK(!exception_raised_);
432     DataType* ptr = ToHostAddr<DataType>(arg + offset);
433     exception_raised_ = FaultyStore(ptr, sizeof(DataType), data);
434 #endif
435   }
436 
CompareAndBranch(Decoder::BranchOpcode opcode,Register arg1,Register arg2,int16_t offset)437   void CompareAndBranch(Decoder::BranchOpcode opcode,
438                         Register arg1,
439                         Register arg2,
440                         int16_t offset) {
441     bool cond_value;
442     switch (opcode) {
443       case Decoder::BranchOpcode::kBeq:
444         cond_value = arg1 == arg2;
445         break;
446       case Decoder::BranchOpcode::kBne:
447         cond_value = arg1 != arg2;
448         break;
449       case Decoder::BranchOpcode::kBltu:
450         cond_value = arg1 < arg2;
451         break;
452       case Decoder::BranchOpcode::kBgeu:
453         cond_value = arg1 >= arg2;
454         break;
455       case Decoder::BranchOpcode::kBlt:
456         cond_value = bit_cast<int64_t>(arg1) < bit_cast<int64_t>(arg2);
457         break;
458       case Decoder::BranchOpcode::kBge:
459         cond_value = bit_cast<int64_t>(arg1) >= bit_cast<int64_t>(arg2);
460         break;
461       default:
462         return Undefined();
463     }
464 
465     if (cond_value) {
466       Branch(offset);
467     }
468   }
469 
Branch(int32_t offset)470   void Branch(int32_t offset) {
471     CHECK(!exception_raised_);
472     state_->cpu.insn_addr += offset;
473     branch_taken_ = true;
474   }
475 
BranchRegister(Register base,int16_t offset)476   void BranchRegister(Register base, int16_t offset) {
477     CHECK(!exception_raised_);
478     state_->cpu.insn_addr = (base + offset) & ~uint64_t{1};
479     branch_taken_ = true;
480   }
481 
Fmv(FpRegister arg)482   FpRegister Fmv(FpRegister arg) { return arg; }
483 
484   //
485   // V extensions.
486   //
487 
488   using TailProcessing = intrinsics::TailProcessing;
489   using InactiveProcessing = intrinsics::InactiveProcessing;
490 
491   enum class VectorSelectElementWidth {
492     k8bit = 0b000,
493     k16bit = 0b001,
494     k32bit = 0b010,
495     k64bit = 0b011,
496     kMaxValue = 0b111,
497   };
498 
499   enum class VectorRegisterGroupMultiplier {
500     k1register = 0b000,
501     k2registers = 0b001,
502     k4registers = 0b010,
503     k8registers = 0b011,
504     kEigthOfRegister = 0b101,
505     kQuarterOfRegister = 0b110,
506     kHalfOfRegister = 0b111,
507     kMaxValue = 0b111,
508   };
509 
NumberOfRegistersInvolved(VectorRegisterGroupMultiplier vlmul)510   static constexpr size_t NumberOfRegistersInvolved(VectorRegisterGroupMultiplier vlmul) {
511     switch (vlmul) {
512       case VectorRegisterGroupMultiplier::k2registers:
513         return 2;
514       case VectorRegisterGroupMultiplier::k4registers:
515         return 4;
516       case VectorRegisterGroupMultiplier::k8registers:
517         return 8;
518       default:
519         return 1;
520     }
521   }
522 
NumRegistersInvolvedForWideOperand(VectorRegisterGroupMultiplier vlmul)523   static constexpr size_t NumRegistersInvolvedForWideOperand(VectorRegisterGroupMultiplier vlmul) {
524     switch (vlmul) {
525       case VectorRegisterGroupMultiplier::k1register:
526         return 2;
527       case VectorRegisterGroupMultiplier::k2registers:
528         return 4;
529       case VectorRegisterGroupMultiplier::k4registers:
530         return 8;
531       default:
532         return 1;
533     }
534   }
535 
536   template <typename ElementType, VectorRegisterGroupMultiplier vlmul>
GetVlmax()537   static constexpr size_t GetVlmax() {
538     constexpr size_t kElementsCount = sizeof(SIMD128Register) / sizeof(ElementType);
539     switch (vlmul) {
540       case VectorRegisterGroupMultiplier::k1register:
541         return kElementsCount;
542       case VectorRegisterGroupMultiplier::k2registers:
543         return 2 * kElementsCount;
544       case VectorRegisterGroupMultiplier::k4registers:
545         return 4 * kElementsCount;
546       case VectorRegisterGroupMultiplier::k8registers:
547         return 8 * kElementsCount;
548       case VectorRegisterGroupMultiplier::kEigthOfRegister:
549         return kElementsCount / 8;
550       case VectorRegisterGroupMultiplier::kQuarterOfRegister:
551         return kElementsCount / 4;
552       case VectorRegisterGroupMultiplier::kHalfOfRegister:
553         return kElementsCount / 2;
554       default:
555         return 0;
556     }
557   }
558 
559   template <typename VOpArgs, typename... ExtraArgs>
OpVector(const VOpArgs & args,ExtraArgs...extra_args)560   void OpVector(const VOpArgs& args, ExtraArgs... extra_args) {
561     // Note: whole register instructions are not dependent on vtype and are supposed to work even
562     // if vill is set!  Handle them before processing other instructions.
563     // Note: other tupes of loads and store are not special and would be processed as usual.
564     // TODO(khim): Handle vstart properly.
565     if constexpr (std::is_same_v<VOpArgs, Decoder::VLoadUnitStrideArgs>) {
566       if (args.opcode == Decoder::VLUmOpOpcode::kVlXreXX) {
567         if (!IsPowerOf2(args.nf + 1)) {
568           return Undefined();
569         }
570         if ((args.dst & args.nf) != 0) {
571           return Undefined();
572         }
573         auto [src] = std::tuple{extra_args...};
574         __uint128_t* ptr = bit_cast<__uint128_t*>(src);
575         for (size_t index = 0; index <= args.nf; index++) {
576           state_->cpu.v[args.dst + index] = ptr[index];
577         }
578         return;
579       }
580     }
581 
582     if constexpr (std::is_same_v<VOpArgs, Decoder::VStoreUnitStrideArgs>) {
583       if (args.opcode == Decoder::VSUmOpOpcode::kVsX) {
584         if (args.width != Decoder::MemoryDataOperandType::k8bit) {
585           return Undefined();
586         }
587         if (!IsPowerOf2(args.nf + 1)) {
588           return Undefined();
589         }
590         if ((args.data & args.nf) != 0) {
591           return Undefined();
592         }
593         auto [src] = std::tuple{extra_args...};
594         __uint128_t* ptr = bit_cast<__uint128_t*>(src);
595         for (size_t index = 0; index <= args.nf; index++) {
596           ptr[index] = state_->cpu.v[args.data + index];
597         }
598         return;
599       }
600     }
601 
602     // RISC-V V extensions are using 8bit “opcode extension” vtype Csr to make sure 32bit encoding
603     // would be usable.
604     //
605     // Great care is made to ensure that vector code wouldn't need to change vtype Csr often (e.g.
606     // there are special mask instructions which allow one to manipulate on masks without the need
607     // to change the CPU mode.
608     //
609     // Currently we don't have support for multiple CPU mode in Berberis thus we can only handle
610     // these instrtuctions in the interpreter.
611     //
612     // TODO(b/300690740): develop and implement strategy which would allow us to support vector
613     // intrinsics not just in the interpreter. Move code from this function to semantics player.
614     Register vtype = GetCsr<CsrName::kVtype>();
615     if (static_cast<std::make_signed_t<Register>>(vtype) < 0) {
616       return Undefined();
617     }
618     if constexpr (std::is_same_v<VOpArgs, Decoder::VLoadIndexedArgs> ||
619                   std::is_same_v<VOpArgs, Decoder::VLoadStrideArgs> ||
620                   std::is_same_v<VOpArgs, Decoder::VLoadUnitStrideArgs> ||
621                   std::is_same_v<VOpArgs, Decoder::VStoreIndexedArgs> ||
622                   std::is_same_v<VOpArgs, Decoder::VStoreStrideArgs> ||
623                   std::is_same_v<VOpArgs, Decoder::VStoreUnitStrideArgs>) {
624       switch (args.width) {
625         case Decoder::MemoryDataOperandType::k8bit:
626           return OpVector<UInt8>(args, vtype, extra_args...);
627         case Decoder::MemoryDataOperandType::k16bit:
628           return OpVector<UInt16>(args, vtype, extra_args...);
629         case Decoder::MemoryDataOperandType::k32bit:
630           return OpVector<UInt32>(args, vtype, extra_args...);
631         case Decoder::MemoryDataOperandType::k64bit:
632           return OpVector<UInt64>(args, vtype, extra_args...);
633         default:
634           return Undefined();
635       }
636     } else {
637       VectorRegisterGroupMultiplier vlmul = static_cast<VectorRegisterGroupMultiplier>(vtype & 0x7);
638       if constexpr (std::is_same_v<VOpArgs, Decoder::VOpFVfArgs> ||
639                     std::is_same_v<VOpArgs, Decoder::VOpFVvArgs>) {
640         switch (static_cast<VectorSelectElementWidth>((vtype >> 3) & 0b111)) {
641           case VectorSelectElementWidth::k16bit:
642             if constexpr (sizeof...(extra_args) == 0) {
643               return OpVector<intrinsics::Float16>(args, vlmul, vtype);
644             } else {
645               return Undefined();
646             }
647           case VectorSelectElementWidth::k32bit:
648             return OpVector<Float32>(
649                 args,
650                 vlmul,
651                 vtype,
652                 std::get<0>(intrinsics::UnboxNan<Float32>(bit_cast<Float64>(extra_args)))...);
653           case VectorSelectElementWidth::k64bit:
654             // Note: if arguments are 64bit floats then we don't need to do any unboxing.
655             return OpVector<Float64>(args, vlmul, vtype, bit_cast<Float64>(extra_args)...);
656           default:
657             return Undefined();
658         }
659       } else {
660         switch (static_cast<VectorSelectElementWidth>((vtype >> 3) & 0b111)) {
661           case VectorSelectElementWidth::k8bit:
662             return OpVector<UInt8>(args, vlmul, vtype, extra_args...);
663           case VectorSelectElementWidth::k16bit:
664             return OpVector<UInt16>(args, vlmul, vtype, extra_args...);
665           case VectorSelectElementWidth::k32bit:
666             return OpVector<UInt32>(args, vlmul, vtype, extra_args...);
667           case VectorSelectElementWidth::k64bit:
668             return OpVector<UInt64>(args, vlmul, vtype, extra_args...);
669           default:
670             return Undefined();
671         }
672       }
673     }
674   }
675 
676   template <typename ElementType, typename VOpArgs, typename... ExtraArgs>
OpVector(const VOpArgs & args,Register vtype,ExtraArgs...extra_args)677   void OpVector(const VOpArgs& args, Register vtype, ExtraArgs... extra_args) {
678     auto vemul = Decoder::SignExtend<3>(vtype & 0b111);
679     vemul -= ((vtype >> 3) & 0b111);  // Divide by SEW.
680     vemul +=
681         static_cast<std::underlying_type_t<decltype(args.width)>>(args.width);  // Multiply by EEW.
682     if (vemul < -3 || vemul > 3) [[unlikely]] {
683       return Undefined();
684     }
685     // Note: whole register loads and stores treat args.nf differently, but they are processed
686     // separately above anyway, because they also ignore vtype and all the information in it!
687     // For other loads and stores affected number of registers (EMUL * NF) should be 8 or less.
688     if ((vemul > 0) && ((args.nf + 1) * (1 << vemul) > 8)) {
689       return Undefined();
690     }
691     return OpVector<ElementType>(
692         args, static_cast<VectorRegisterGroupMultiplier>(vemul & 0b111), vtype, extra_args...);
693   }
694 
695   template <typename ElementType, typename VOpArgs, typename... ExtraArgs>
OpVector(const VOpArgs & args,VectorRegisterGroupMultiplier vlmul,Register vtype,ExtraArgs...extra_args)696   void OpVector(const VOpArgs& args,
697                 VectorRegisterGroupMultiplier vlmul,
698                 Register vtype,
699                 ExtraArgs... extra_args) {
700     switch (vlmul) {
701       case VectorRegisterGroupMultiplier::k1register:
702         return OpVector<ElementType, VectorRegisterGroupMultiplier::k1register>(
703             args, vtype, extra_args...);
704       case VectorRegisterGroupMultiplier::k2registers:
705         return OpVector<ElementType, VectorRegisterGroupMultiplier::k2registers>(
706             args, vtype, extra_args...);
707       case VectorRegisterGroupMultiplier::k4registers:
708         return OpVector<ElementType, VectorRegisterGroupMultiplier::k4registers>(
709             args, vtype, extra_args...);
710       case VectorRegisterGroupMultiplier::k8registers:
711         return OpVector<ElementType, VectorRegisterGroupMultiplier::k8registers>(
712             args, vtype, extra_args...);
713       case VectorRegisterGroupMultiplier::kEigthOfRegister:
714         return OpVector<ElementType, VectorRegisterGroupMultiplier::kEigthOfRegister>(
715             args, vtype, extra_args...);
716       case VectorRegisterGroupMultiplier::kQuarterOfRegister:
717         return OpVector<ElementType, VectorRegisterGroupMultiplier::kQuarterOfRegister>(
718             args, vtype, extra_args...);
719       case VectorRegisterGroupMultiplier::kHalfOfRegister:
720         return OpVector<ElementType, VectorRegisterGroupMultiplier::kHalfOfRegister>(
721             args, vtype, extra_args...);
722       default:
723         return Undefined();
724     }
725   }
726 
727   template <typename ElementType,
728             VectorRegisterGroupMultiplier vlmul,
729             typename VOpArgs,
730             typename... ExtraArgs>
OpVector(const VOpArgs & args,Register vtype,ExtraArgs...extra_args)731   void OpVector(const VOpArgs& args, Register vtype, ExtraArgs... extra_args) {
732     if (args.vm) {
733       return OpVector<ElementType, vlmul, intrinsics::NoInactiveProcessing{}>(
734           args, vtype, extra_args...);
735     }
736     if (vtype >> 7) {
737       return OpVector<ElementType, vlmul, InactiveProcessing::kAgnostic>(
738           args, vtype, extra_args...);
739     }
740     return OpVector<ElementType, vlmul, InactiveProcessing::kUndisturbed>(
741         args, vtype, extra_args...);
742   }
743 
744   template <typename ElementType,
745             VectorRegisterGroupMultiplier vlmul,
746             auto vma,
747             typename VOpArgs,
748             typename... ExtraArgs>
OpVector(const VOpArgs & args,Register vtype,ExtraArgs...extra_args)749   void OpVector(const VOpArgs& args, Register vtype, ExtraArgs... extra_args) {
750     if constexpr (std::is_same_v<VOpArgs, Decoder::VLoadIndexedArgs> ||
751                   std::is_same_v<VOpArgs, Decoder::VLoadStrideArgs> ||
752                   std::is_same_v<VOpArgs, Decoder::VLoadUnitStrideArgs> ||
753                   std::is_same_v<VOpArgs, Decoder::VStoreIndexedArgs> ||
754                   std::is_same_v<VOpArgs, Decoder::VStoreStrideArgs> ||
755                   std::is_same_v<VOpArgs, Decoder::VStoreUnitStrideArgs>) {
756       constexpr size_t kRegistersInvolved = NumberOfRegistersInvolved(vlmul);
757       // Note: whole register loads and stores treat args.nf differently, but they are processed
758       // separately above anyway, because they also ignore vtype and all the information in it!
759       switch (args.nf) {
760         case 0:
761           return OpVector<ElementType, 1, vlmul, vma>(args, vtype, extra_args...);
762         case 1:
763           if constexpr (kRegistersInvolved > 4) {
764             return Undefined();
765           } else {
766             return OpVector<ElementType, 2, vlmul, vma>(args, vtype, extra_args...);
767           }
768         case 2:
769           if constexpr (kRegistersInvolved > 2) {
770             return Undefined();
771           } else {
772             return OpVector<ElementType, 3, vlmul, vma>(args, vtype, extra_args...);
773           }
774         case 3:
775           if constexpr (kRegistersInvolved > 2) {
776             return Undefined();
777           } else {
778             return OpVector<ElementType, 4, vlmul, vma>(args, vtype, extra_args...);
779           }
780         case 4:
781           if constexpr (kRegistersInvolved > 1) {
782             return Undefined();
783           } else {
784             return OpVector<ElementType, 5, vlmul, vma>(args, vtype, extra_args...);
785           }
786         case 5:
787           if constexpr (kRegistersInvolved > 1) {
788             return Undefined();
789           } else {
790             return OpVector<ElementType, 6, vlmul, vma>(args, vtype, extra_args...);
791           }
792         case 6:
793           if constexpr (kRegistersInvolved > 1) {
794             return Undefined();
795           } else {
796             return OpVector<ElementType, 7, vlmul, vma>(args, vtype, extra_args...);
797           }
798         case 7:
799           if constexpr (kRegistersInvolved > 1) {
800             return Undefined();
801           } else {
802             return OpVector<ElementType, 8, vlmul, vma>(args, vtype, extra_args...);
803           }
804       }
805     } else {
806       if ((vtype >> 6) & 1) {
807         return OpVector<ElementType, vlmul, TailProcessing::kAgnostic, vma>(args, extra_args...);
808       }
809       return OpVector<ElementType, vlmul, TailProcessing::kUndisturbed, vma>(args, extra_args...);
810     }
811   }
812 
813   template <typename ElementType,
814             size_t kSegmentSize,
815             VectorRegisterGroupMultiplier vlmul,
816             auto vma,
817             typename VOpArgs,
818             typename... ExtraArgs>
OpVector(const VOpArgs & args,Register vtype,ExtraArgs...extra_args)819   void OpVector(const VOpArgs& args, Register vtype, ExtraArgs... extra_args) {
820     // Indexed loads and stores have two operands with different ElementType's and lmul sizes,
821     // pass vtype to do further selection.
822     if constexpr (std::is_same_v<VOpArgs, Decoder::VLoadIndexedArgs> ||
823                   std::is_same_v<VOpArgs, Decoder::VStoreIndexedArgs>) {
824       // Because we know that we are dealing with indexed loads and stores and wouldn't need to
825       // convert elmul to anything else we can immediately turn it into kIndexRegistersInvolved
826       // here.
827       if ((vtype >> 6) & 1) {
828         return OpVector<kSegmentSize,
829                         ElementType,
830                         NumberOfRegistersInvolved(vlmul),
831                         TailProcessing::kAgnostic,
832                         vma>(args, vtype, extra_args...);
833       }
834       return OpVector<kSegmentSize,
835                       ElementType,
836                       NumberOfRegistersInvolved(vlmul),
837                       TailProcessing::kUndisturbed,
838                       vma>(args, vtype, extra_args...);
839     } else {
840       // For other instruction we have parsed all the information from vtype and only need to pass
841       // args and extra_args.
842       if ((vtype >> 6) & 1) {
843         return OpVector<ElementType, kSegmentSize, vlmul, TailProcessing::kAgnostic, vma>(
844             args, extra_args...);
845       }
846       return OpVector<ElementType, kSegmentSize, vlmul, TailProcessing::kUndisturbed, vma>(
847           args, extra_args...);
848     }
849   }
850 
851   template <size_t kSegmentSize,
852             typename IndexElementType,
853             size_t kIndexRegistersInvolved,
854             TailProcessing vta,
855             auto vma,
856             typename VOpArgs,
857             typename... ExtraArgs>
OpVector(const VOpArgs & args,Register vtype,ExtraArgs...extra_args)858   void OpVector(const VOpArgs& args, Register vtype, ExtraArgs... extra_args) {
859     VectorRegisterGroupMultiplier vlmul = static_cast<VectorRegisterGroupMultiplier>(vtype & 0b111);
860     switch (static_cast<VectorSelectElementWidth>((vtype >> 3) & 0b111)) {
861       case VectorSelectElementWidth::k8bit:
862         return OpVector<UInt8, kSegmentSize, IndexElementType, kIndexRegistersInvolved, vta, vma>(
863             args, vlmul, extra_args...);
864       case VectorSelectElementWidth::k16bit:
865         return OpVector<UInt16, kSegmentSize, IndexElementType, kIndexRegistersInvolved, vta, vma>(
866             args, vlmul, extra_args...);
867       case VectorSelectElementWidth::k32bit:
868         return OpVector<UInt32, kSegmentSize, IndexElementType, kIndexRegistersInvolved, vta, vma>(
869             args, vlmul, extra_args...);
870       case VectorSelectElementWidth::k64bit:
871         return OpVector<UInt64, kSegmentSize, IndexElementType, kIndexRegistersInvolved, vta, vma>(
872             args, vlmul, extra_args...);
873       default:
874         return Undefined();
875     }
876   }
877 
878   template <typename DataElementType,
879             size_t kSegmentSize,
880             typename IndexElementType,
881             size_t kIndexRegistersInvolved,
882             TailProcessing vta,
883             auto vma,
884             typename VOpArgs,
885             typename... ExtraArgs>
OpVector(const VOpArgs & args,VectorRegisterGroupMultiplier vlmul,ExtraArgs...extra_args)886   void OpVector(const VOpArgs& args, VectorRegisterGroupMultiplier vlmul, ExtraArgs... extra_args) {
887     switch (vlmul) {
888       case VectorRegisterGroupMultiplier::k1register:
889         return OpVector<DataElementType,
890                         VectorRegisterGroupMultiplier::k1register,
891                         IndexElementType,
892                         kSegmentSize,
893                         kIndexRegistersInvolved,
894                         vta,
895                         vma>(args, extra_args...);
896       case VectorRegisterGroupMultiplier::k2registers:
897         return OpVector<DataElementType,
898                         VectorRegisterGroupMultiplier::k2registers,
899                         IndexElementType,
900                         kSegmentSize,
901                         kIndexRegistersInvolved,
902                         vta,
903                         vma>(args, extra_args...);
904       case VectorRegisterGroupMultiplier::k4registers:
905         return OpVector<DataElementType,
906                         VectorRegisterGroupMultiplier::k4registers,
907                         IndexElementType,
908                         kSegmentSize,
909                         kIndexRegistersInvolved,
910                         vta,
911                         vma>(args, extra_args...);
912       case VectorRegisterGroupMultiplier::k8registers:
913         return OpVector<DataElementType,
914                         VectorRegisterGroupMultiplier::k8registers,
915                         IndexElementType,
916                         kSegmentSize,
917                         kIndexRegistersInvolved,
918                         vta,
919                         vma>(args, extra_args...);
920       case VectorRegisterGroupMultiplier::kEigthOfRegister:
921         return OpVector<DataElementType,
922                         VectorRegisterGroupMultiplier::kEigthOfRegister,
923                         IndexElementType,
924                         kSegmentSize,
925                         kIndexRegistersInvolved,
926                         vta,
927                         vma>(args, extra_args...);
928       case VectorRegisterGroupMultiplier::kQuarterOfRegister:
929         return OpVector<DataElementType,
930                         VectorRegisterGroupMultiplier::kQuarterOfRegister,
931                         IndexElementType,
932                         kSegmentSize,
933                         kIndexRegistersInvolved,
934                         vta,
935                         vma>(args, extra_args...);
936       case VectorRegisterGroupMultiplier::kHalfOfRegister:
937         return OpVector<DataElementType,
938                         VectorRegisterGroupMultiplier::kHalfOfRegister,
939                         IndexElementType,
940                         kSegmentSize,
941                         kIndexRegistersInvolved,
942                         vta,
943                         vma>(args, extra_args...);
944       default:
945         return Undefined();
946     }
947   }
948 
949   // CSR registers, that are permitted as an argument of strip-mining instrinsic.
950   using CsrName::kFrm;
951   using CsrName::kVxrm;
952   using CsrName::kVxsat;
953   // Argument of OpVectorXXX function is the number of vector register group.
954   template <auto DefaultElement = intrinsics::NoInactiveProcessing{}>
955   struct Vec {
956     uint8_t start_no;
957   };
958   // Vector argument 2x wide (for narrowing and widening instructions).
959   template <auto DefaultElement = intrinsics::NoInactiveProcessing{}>
960   struct WideVec {
961     uint8_t start_no;
962   };
963 
964   template <typename DataElementType,
965             VectorRegisterGroupMultiplier vlmul,
966             typename IndexElementType,
967             size_t kSegmentSize,
968             size_t kIndexRegistersInvolved,
969             TailProcessing vta,
970             auto vma>
OpVector(const Decoder::VLoadIndexedArgs & args,Register src)971   void OpVector(const Decoder::VLoadIndexedArgs& args, Register src) {
972     return OpVector<DataElementType,
973                     kSegmentSize,
974                     NumberOfRegistersInvolved(vlmul),
975                     IndexElementType,
976                     kIndexRegistersInvolved,
977                     vta,
978                     vma>(args, src);
979   }
980 
981   template <typename DataElementType,
982             size_t kSegmentSize,
983             size_t kNumRegistersInGroup,
984             typename IndexElementType,
985             size_t kIndexRegistersInvolved,
986             TailProcessing vta,
987             auto vma>
OpVector(const Decoder::VLoadIndexedArgs & args,Register src)988   void OpVector(const Decoder::VLoadIndexedArgs& args, Register src) {
989     if (!IsAligned<kIndexRegistersInvolved>(args.idx)) {
990       return Undefined();
991     }
992     constexpr size_t kElementsCount = sizeof(SIMD128Register) / sizeof(IndexElementType);
993     alignas(alignof(SIMD128Register))
994         IndexElementType indexes[kElementsCount * kIndexRegistersInvolved];
995     memcpy(indexes, state_->cpu.v + args.idx, sizeof(SIMD128Register) * kIndexRegistersInvolved);
996     return OpVectorLoad<DataElementType, kSegmentSize, kNumRegistersInGroup, vta, vma>(
997         args.dst, src, [&indexes](size_t index) { return indexes[index]; });
998   }
999 
1000   template <typename ElementType,
1001             size_t kSegmentSize,
1002             VectorRegisterGroupMultiplier vlmul,
1003             TailProcessing vta,
1004             auto vma>
OpVector(const Decoder::VLoadStrideArgs & args,Register src,Register stride)1005   void OpVector(const Decoder::VLoadStrideArgs& args, Register src, Register stride) {
1006     return OpVector<ElementType, kSegmentSize, NumberOfRegistersInvolved(vlmul), vta, vma>(
1007         args, src, stride);
1008   }
1009 
1010   template <typename ElementType,
1011             size_t kSegmentSize,
1012             size_t kNumRegistersInGroup,
1013             TailProcessing vta,
1014             auto vma>
OpVector(const Decoder::VLoadStrideArgs & args,Register src,Register stride)1015   void OpVector(const Decoder::VLoadStrideArgs& args, Register src, Register stride) {
1016     return OpVectorLoad<ElementType, kSegmentSize, kNumRegistersInGroup, vta, vma>(
1017         args.dst, src, [stride](size_t index) { return stride * index; });
1018   }
1019 
1020   template <typename ElementType,
1021             size_t kSegmentSize,
1022             VectorRegisterGroupMultiplier vlmul,
1023             TailProcessing vta,
1024             auto vma>
OpVector(const Decoder::VLoadUnitStrideArgs & args,Register src)1025   void OpVector(const Decoder::VLoadUnitStrideArgs& args, Register src) {
1026     return OpVector<ElementType, kSegmentSize, NumberOfRegistersInvolved(vlmul), vta, vma>(args,
1027                                                                                            src);
1028   }
1029 
1030   template <typename ElementType,
1031             size_t kSegmentSize,
1032             size_t kNumRegistersInGroup,
1033             TailProcessing vta,
1034             auto vma>
OpVector(const Decoder::VLoadUnitStrideArgs & args,Register src)1035   void OpVector(const Decoder::VLoadUnitStrideArgs& args, Register src) {
1036     switch (args.opcode) {
1037       case Decoder::VLUmOpOpcode::kVleXXff:
1038         return OpVectorLoad<ElementType,
1039                             kSegmentSize,
1040                             kNumRegistersInGroup,
1041                             vta,
1042                             vma,
1043                             Decoder::VLUmOpOpcode::kVleXXff>(
1044             args.dst, src, [](size_t index) { return kSegmentSize * sizeof(ElementType) * index; });
1045       case Decoder::VLUmOpOpcode::kVleXX:
1046         return OpVectorLoad<ElementType,
1047                             kSegmentSize,
1048                             kNumRegistersInGroup,
1049                             vta,
1050                             vma,
1051                             Decoder::VLUmOpOpcode::kVleXX>(
1052             args.dst, src, [](size_t index) { return kSegmentSize * sizeof(ElementType) * index; });
1053       case Decoder::VLUmOpOpcode::kVlm:
1054         if constexpr (kSegmentSize == 1 &&
1055                       std::is_same_v<decltype(vma), intrinsics::NoInactiveProcessing>) {
1056           return OpVectorLoad<UInt8,
1057                               1,
1058                               1,
1059                               TailProcessing::kAgnostic,
1060                               vma,
1061                               Decoder::VLUmOpOpcode::kVlm>(
1062               args.dst, src, [](size_t index) { return index; });
1063         }
1064         return Undefined();
1065       default:
1066         return Undefined();
1067     }
1068   }
1069 
1070   // The strided version of segmented load sounds like something very convoluted and complicated
1071   // that no one may ever want to use, but it's not rare and may be illustrated with simple RGB
1072   // bitmap window.
1073   //
1074   // Suppose it's in memory like this (doubles are 8 bytes in size as per IEEE 754)):
1075   //   {R: 0.01}{G: 0.11}{B: 0.21} {R: 1.01}{G: 1.11}{B: 1.21}, {R: 2.01}{G: 2.11}{B: 2.21}
1076   //   {R:10.01}{G:10.11}{B:10.21} {R:11.01}{G:11.11}{B:11.21}, {R:12.01}{G:12.11}{B:12.21}
1077   //   {R:20.01}{G:20.11}{B:20.21} {R:21.01}{G:21.11}{B:21.21}, {R:22.01}{G:22.11}{B:22.21}
1078   //   {R:30.01}{G:30.11}{B:30.21} {R:31.01}{G:31.11}{B:31.21}, {R:32.01}{G:32.11}{B:32.21}
1079   // This is very tiny 3x4 image with 3 components: red, green, blue.
1080   //
1081   // Let's assume that x1 is loaded with address of first element and x2 with 72 (that's how much
1082   // one row of this image takes).
1083   //
1084   // Then we may use the following command to load values in memory (with LMUL = 2, ELEN = 4):
1085   //   vlsseg3e64.v v0, (x1), x2
1086   //
1087   // They would be loaded like this:
1088   //   v0: {R: 0.01}{R:10.01} (first group of 2 registers)
1089   //   v1: {R:20.01}{R:30.01}
1090   //   v2: {G: 0.11}{G:10.11} (second group of 2 registers)
1091   //   v3: {G:20.11}{G:30.11}
1092   //   v4: {B: 0.21}{B:10.21} (third group of 3 registers)
1093   //   v5: {B:20.21}{B:30.21}
1094   // Now we have loaded a column from memory and all three colors are put into a different register
1095   // groups for further processing.
1096   template <typename ElementType,
1097             size_t kSegmentSize,
1098             size_t kNumRegistersInGroup,
1099             TailProcessing vta,
1100             auto vma,
1101             typename Decoder::VLUmOpOpcode opcode = typename Decoder::VLUmOpOpcode{},
1102             typename GetElementOffsetLambdaType>
1103   void OpVectorLoad(uint8_t dst, Register src, GetElementOffsetLambdaType GetElementOffset) {
1104     using MaskType = std::conditional_t<sizeof(ElementType) == sizeof(Int8), UInt16, UInt8>;
1105     if (!IsAligned<kNumRegistersInGroup>(dst)) {
1106       return Undefined();
1107     }
1108     if (dst + kNumRegistersInGroup * kSegmentSize > 32) {
1109       return Undefined();
1110     }
1111     constexpr size_t kElementsCount = 16 / sizeof(ElementType);
1112     size_t vstart = GetCsr<CsrName::kVstart>();
1113     size_t vl = GetCsr<CsrName::kVl>();
1114     if constexpr (opcode == Decoder::VLUmOpOpcode::kVlm) {
1115       vl = AlignUp<CHAR_BIT>(vl) / CHAR_BIT;
1116     }
1117     // In case of memory access fault we may set vstart to non-zero value, set it to zero here to
1118     // simplify the logic below.
1119     SetCsr<CsrName::kVstart>(0);
1120     // When vstart >= vl, there are no body elements, and no elements are updated in any destination
1121     // vector register group, including that no tail elements are updated with agnostic values.
1122     if (vstart >= vl) [[unlikely]] {
1123       return;
1124     }
1125     if constexpr (vta == TailProcessing::kAgnostic) {
1126       vstart = std::min(vstart, vl);
1127     }
1128     // Note: within_group_id is the current register id within a register group. During one
1129     // iteration of this loop we compute results for all registers with the current id in all
1130     // groups. E.g. for the example above we'd compute v0, v2, v4 during the first iteration (id
1131     // within group = 0), and v1, v3, v5 during the second iteration (id within group = 1). This
1132     // ensures that memory is always accessed in ordered fashion.
1133     std::array<SIMD128Register, kSegmentSize> result;
1134     char* ptr = ToHostAddr<char>(src);
1135     auto mask = GetMaskForVectorOperations<vma>();
1136     for (size_t within_group_id = vstart / kElementsCount; within_group_id < kNumRegistersInGroup;
1137          ++within_group_id) {
1138       // No need to continue if we have kUndisturbed vta strategy.
1139       if constexpr (vta == TailProcessing::kUndisturbed) {
1140         if (within_group_id * kElementsCount >= vl) {
1141           break;
1142         }
1143       }
1144       // If we have elements that won't be overwritten then load these from registers.
1145       // For interpreter we could have filled all the registers unconditionally but we'll want to
1146       // reuse this code JITs later.
1147       auto register_mask =
1148           std::get<0>(intrinsics::MaskForRegisterInSequence<ElementType>(mask, within_group_id));
1149       auto full_mask = std::get<0>(intrinsics::FullMaskForRegister<ElementType>(mask));
1150       if (vstart ||
1151           (vl < (within_group_id + 1) * kElementsCount && vta == TailProcessing::kUndisturbed) ||
1152           !(std::is_same_v<decltype(vma), intrinsics::NoInactiveProcessing> ||
1153             static_cast<InactiveProcessing>(vma) != InactiveProcessing::kUndisturbed ||
1154             register_mask == full_mask)) {
1155         for (size_t field = 0; field < kSegmentSize; ++field) {
1156           result[field].Set(state_->cpu.v[dst + within_group_id + field * kNumRegistersInGroup]);
1157         }
1158       }
1159       // Read elements from memory, but only if there are any active ones.
1160       for (size_t within_register_id = vstart % kElementsCount; within_register_id < kElementsCount;
1161            ++within_register_id) {
1162         size_t element_index = kElementsCount * within_group_id + within_register_id;
1163         // Stop if we reached the vl limit.
1164         if (vl <= element_index) {
1165           break;
1166         }
1167         // Don't touch masked-out elements.
1168         if constexpr (!std::is_same_v<decltype(vma), intrinsics::NoInactiveProcessing>) {
1169           if ((MaskType(register_mask) & MaskType{static_cast<typename MaskType::BaseType>(
1170                                              1 << within_register_id)}) == MaskType{0}) {
1171             continue;
1172           }
1173         }
1174         // Load segment from memory.
1175         for (size_t field = 0; field < kSegmentSize; ++field) {
1176           FaultyLoadResult mem_access_result =
1177               FaultyLoad(ptr + field * sizeof(ElementType) + GetElementOffset(element_index),
1178                          sizeof(ElementType));
1179           if (mem_access_result.is_fault) {
1180             // Documentation doesn't tell us what we are supposed to do to remaining elements when
1181             // access fault happens but let's trigger an exception and treat the remaining elements
1182             // using vta-specified strategy by simply just adjusting the vl.
1183             vl = element_index;
1184             if constexpr (opcode == Decoder::VLUmOpOpcode::kVleXXff) {
1185               // Fail-first load only triggers exceptions for the first element, otherwise it
1186               // changes vl to ensure that other operations would only process elements that are
1187               // successfully loaded.
1188               if (element_index == 0) [[unlikely]] {
1189                 exception_raised_ = true;
1190               } else {
1191                 // TODO(b/323994286): Write a test case to verify vl changes correctly.
1192                 SetCsr<CsrName::kVl>(element_index);
1193               }
1194             } else {
1195               // Most load instructions set vstart to failing element which then may be processed
1196               // by exception handler.
1197               exception_raised_ = true;
1198               SetCsr<CsrName::kVstart>(element_index);
1199             }
1200             break;
1201           }
1202           result[field].template Set<ElementType>(static_cast<ElementType>(mem_access_result.value),
1203                                                   within_register_id);
1204         }
1205       }
1206       // Lambda to generate tail mask. We don't want to call MakeBitmaskFromVl eagerly because it's
1207       // not needed, most of the time, and compiler couldn't eliminate access to mmap-backed memory.
1208       auto GetTailMask = [vl, within_group_id] {
1209         return std::get<0>(intrinsics::MakeBitmaskFromVl<ElementType>(
1210             (vl <= within_group_id * kElementsCount) ? 0 : vl - within_group_id * kElementsCount));
1211       };
1212       // If mask has inactive elements and InactiveProcessing::kAgnostic mode is used then set them
1213       // to ~0.
1214       if constexpr (!std::is_same_v<decltype(vma), intrinsics::NoInactiveProcessing>) {
1215         if (register_mask != full_mask) {
1216           auto [simd_mask] =
1217               intrinsics::BitMaskToSimdMask<ElementType>(Int64{MaskType{register_mask}});
1218           for (size_t field = 0; field < kSegmentSize; ++field) {
1219             if constexpr (vma == InactiveProcessing::kAgnostic) {
1220               // vstart equal to zero is supposed to be exceptional. From RISV-V V manual (page 14):
1221               // The vstart CSR is writable by unprivileged code, but non-zero vstart values may
1222               // cause vector instructions to run substantially slower on some implementations, so
1223               // vstart should not be used by application programmers. A few vector instructions
1224               // cannot be executed with a non-zero vstart value and will raise an illegal
1225               // instruction exception as dened below.
1226               // TODO(b/300690740): decide whether to merge two cases after support for vectors in
1227               // heavy optimizer would be implemented.
1228               if (vstart) [[unlikely]] {
1229                 SIMD128Register vstart_mask = std::get<0>(
1230                     intrinsics::MakeBitmaskFromVl<ElementType>(vstart % kElementsCount));
1231                 if constexpr (vta == TailProcessing::kAgnostic) {
1232                   result[field] |= vstart_mask & ~simd_mask;
1233                 } else if (vl < (within_group_id + 1) * kElementsCount) {
1234                   result[field] |= vstart_mask & ~simd_mask & ~GetTailMask();
1235                 } else {
1236                   result[field] |= vstart_mask & ~simd_mask;
1237                 }
1238               } else if constexpr (vta == TailProcessing::kAgnostic) {
1239                 result[field] |= ~simd_mask;
1240               } else {
1241                 if (vl < (within_group_id + 1) * kElementsCount) {
1242                   result[field] |= ~simd_mask & ~GetTailMask();
1243                 } else {
1244                   result[field] |= ~simd_mask;
1245                 }
1246               }
1247             }
1248           }
1249         }
1250       }
1251       // If we have tail elements and TailProcessing::kAgnostic mode then set them to ~0.
1252       if constexpr (vta == TailProcessing::kAgnostic) {
1253         for (size_t field = 0; field < kSegmentSize; ++field) {
1254           if (vl < (within_group_id + 1) * kElementsCount) {
1255             result[field] |= GetTailMask();
1256           }
1257         }
1258       }
1259       // Put values back into register file.
1260       for (size_t field = 0; field < kSegmentSize; ++field) {
1261         state_->cpu.v[dst + within_group_id + field * kNumRegistersInGroup] =
1262             result[field].template Get<__uint128_t>();
1263       }
1264       // Next group should be fully processed.
1265       vstart = 0;
1266     }
1267   }
1268 
1269   // The vector register gather instructions read elements from src1 vector register group at
1270   // locations given by the second source vector src2 register group.
1271   //   src1: element vector register.
1272   //   GetElementIndex: universal lambda that returns index from src2,
1273   template <typename ElementType,
1274             VectorRegisterGroupMultiplier vlmul,
1275             TailProcessing vta,
1276             auto vma,
1277             typename GetElementIndexLambdaType>
OpVectorGather(uint8_t dst,uint8_t src1,GetElementIndexLambdaType GetElementIndex)1278   void OpVectorGather(uint8_t dst, uint8_t src1, GetElementIndexLambdaType GetElementIndex) {
1279     constexpr size_t kRegistersInvolved = NumberOfRegistersInvolved(vlmul);
1280     if (!IsAligned<kRegistersInvolved>(dst | src1)) {
1281       return Undefined();
1282     }
1283     // Source and destination must not overlap.
1284     if (dst < (src1 + kRegistersInvolved) && src1 < (dst + kRegistersInvolved)) {
1285       return Undefined();
1286     }
1287     constexpr size_t kElementsCount = 16 / sizeof(ElementType);
1288     constexpr size_t vlmax = GetVlmax<ElementType, vlmul>();
1289 
1290     size_t vstart = GetCsr<CsrName::kVstart>();
1291     size_t vl = GetCsr<CsrName::kVl>();
1292     auto mask = GetMaskForVectorOperations<vma>();
1293     SetCsr<CsrName::kVstart>(0);
1294     // When vstart >= vl, there are no body elements, and no elements are updated in any destination
1295     // vector register group, including that no tail elements are updated with agnostic values.
1296     if (vstart >= vl) [[unlikely]] {
1297       return;
1298     }
1299 
1300     // Copy vlmul registers into array of elements, access elements of temporary array.
1301     alignas(alignof(SIMD128Register)) ElementType values[vlmax];
1302     memcpy(values, state_->cpu.v + src1, sizeof(values));
1303     // Fill dst first, resolve mask later.
1304     for (size_t index = vstart / kElementsCount; index < kRegistersInvolved; ++index) {
1305       SIMD128Register original_dst_value;
1306       SIMD128Register result{state_->cpu.v[dst + index]};
1307       for (size_t dst_element_index = vstart % kElementsCount; dst_element_index < kElementsCount;
1308            ++dst_element_index) {
1309         size_t src_element_index = GetElementIndex(index * kElementsCount + dst_element_index);
1310 
1311         // If an element index is out of range ( vs1[i] >= VLMAX ) then zero is returned for the
1312         // element value.
1313         ElementType element_value = ElementType{0};
1314         if (src_element_index < vlmax) {
1315           element_value = values[src_element_index];
1316         }
1317         original_dst_value.Set<ElementType>(element_value, dst_element_index);
1318       }
1319 
1320       // Apply mask and put result values into dst register.
1321       result =
1322           VectorMasking<ElementType, vta, vma>(result, original_dst_value, vstart, vl, index, mask);
1323       state_->cpu.v[dst + index] = result.Get<__uint128_t>();
1324       // Next group should be fully processed.
1325       vstart = 0;
1326     }
1327   }
1328 
1329   template <typename ElementType, VectorRegisterGroupMultiplier vlmul, TailProcessing vta, auto vma>
OpVector(const Decoder::VOpFVfArgs & args,ElementType arg2)1330   void OpVector(const Decoder::VOpFVfArgs& args, ElementType arg2) {
1331     using SignedType = Wrapping<std::make_signed_t<typename TypeTraits<ElementType>::Int>>;
1332     if constexpr (sizeof(ElementType) == sizeof(Float32)) {
1333       // Keep cases sorted in opcode order to match RISC-V V manual.
1334       switch (args.opcode) {
1335         case Decoder::VOpFVfOpcode::kVfwaddvf:
1336           return OpVectorWidenvx<intrinsics::Vfwaddvf<ElementType>,
1337                                  ElementType,
1338                                  vlmul,
1339                                  vta,
1340                                  vma,
1341                                  kFrm>(args.dst, args.src1, arg2);
1342         case Decoder::VOpFVfOpcode::kVfwsubvf:
1343           return OpVectorWidenvx<intrinsics::Vfwsubvf<ElementType>,
1344                                  ElementType,
1345                                  vlmul,
1346                                  vta,
1347                                  vma,
1348                                  kFrm>(args.dst, args.src1, arg2);
1349         case Decoder::VOpFVfOpcode::kVfwmulvf:
1350           return OpVectorWidenvx<intrinsics::Vfwmulvf<ElementType>,
1351                                  ElementType,
1352                                  vlmul,
1353                                  vta,
1354                                  vma,
1355                                  kFrm>(args.dst, args.src1, arg2);
1356         case Decoder::VOpFVfOpcode::kVfwaddwf:
1357           return OpVectorWidenwx<intrinsics::Vfwaddwf<ElementType>,
1358                                  ElementType,
1359                                  vlmul,
1360                                  vta,
1361                                  vma,
1362                                  kFrm>(args.dst, args.src1, arg2);
1363         case Decoder::VOpFVfOpcode::kVfwsubwf:
1364           return OpVectorWidenwx<intrinsics::Vfwsubwf<ElementType>,
1365                                  ElementType,
1366                                  vlmul,
1367                                  vta,
1368                                  vma,
1369                                  kFrm>(args.dst, args.src1, arg2);
1370         case Decoder::VOpFVfOpcode::kVfwmaccvf:
1371           return OpVectorWidenvxw<intrinsics::Vfwmaccvf<ElementType>,
1372                                   ElementType,
1373                                   vlmul,
1374                                   vta,
1375                                   vma,
1376                                   kFrm>(args.dst, args.src1, arg2);
1377         case Decoder::VOpFVfOpcode::kVfwnmaccvf:
1378           return OpVectorWidenvxw<intrinsics::Vfwnmaccvf<ElementType>,
1379                                   ElementType,
1380                                   vlmul,
1381                                   vta,
1382                                   vma,
1383                                   kFrm>(args.dst, args.src1, arg2);
1384         case Decoder::VOpFVfOpcode::kVfwmsacvf:
1385           return OpVectorWidenvxw<intrinsics::Vfwmsacvf<ElementType>,
1386                                   ElementType,
1387                                   vlmul,
1388                                   vta,
1389                                   vma,
1390                                   kFrm>(args.dst, args.src1, arg2);
1391         case Decoder::VOpFVfOpcode::kVfwnmsacvf:
1392           return OpVectorWidenvxw<intrinsics::Vfwnmsacvf<ElementType>,
1393                                   ElementType,
1394                                   vlmul,
1395                                   vta,
1396                                   vma,
1397                                   kFrm>(args.dst, args.src1, arg2);
1398         default:
1399           break;
1400       }
1401     }
1402     // Keep cases sorted in opcode order to match RISC-V V manual.
1403     switch (args.opcode) {
1404       case Decoder::VOpFVfOpcode::kVfminvf:
1405         return OpVectorvx<intrinsics::Vfminvx<ElementType>, ElementType, vlmul, vta, vma>(
1406             args.dst, args.src1, arg2);
1407       case Decoder::VOpFVfOpcode::kVfmaxvf:
1408         return OpVectorvx<intrinsics::Vfmaxvx<ElementType>, ElementType, vlmul, vta, vma>(
1409             args.dst, args.src1, arg2);
1410       case Decoder::VOpFVfOpcode::kVfsgnjvf:
1411         return OpVectorvx<intrinsics::Vfsgnjvx<ElementType>, ElementType, vlmul, vta, vma>(
1412             args.dst, args.src1, arg2);
1413       case Decoder::VOpFVfOpcode::kVfsgnjnvf:
1414         return OpVectorvx<intrinsics::Vfsgnjnvx<ElementType>, ElementType, vlmul, vta, vma>(
1415             args.dst, args.src1, arg2);
1416       case Decoder::VOpFVfOpcode::kVfsgnjxvf:
1417         return OpVectorvx<intrinsics::Vfsgnjxvx<ElementType>, ElementType, vlmul, vta, vma>(
1418             args.dst, args.src1, arg2);
1419       case Decoder::VOpFVfOpcode::kVfslide1upvf:
1420         return OpVectorslide1up<ElementType, vlmul, vta, vma>(args.dst, args.src1, arg2);
1421       case Decoder::VOpFVfOpcode::kVfslide1downvf:
1422         return OpVectorslide1down<ElementType, vlmul, vta, vma>(args.dst, args.src1, arg2);
1423       case Decoder::VOpFVfOpcode::kVfmvsf:
1424         if constexpr (!std::is_same_v<decltype(vma), intrinsics::NoInactiveProcessing>) {
1425           return Undefined();
1426         }
1427         if (args.src1 != 0) {
1428           return Undefined();
1429         }
1430         return OpVectorVmvsx<ElementType, vta>(args.dst, arg2);
1431       case Decoder::VOpFVfOpcode::kVfmergevf:
1432         if constexpr (std::is_same_v<decltype(vma), intrinsics::NoInactiveProcessing>) {
1433           if (args.src1 != 0) {
1434             return Undefined();
1435           }
1436           return OpVectorx<intrinsics::Vcopyx<ElementType>, ElementType, vlmul, vta, vma>(args.dst,
1437                                                                                           arg2);
1438         } else {
1439           return OpVectorx<intrinsics::Vcopyx<ElementType>,
1440                            ElementType,
1441                            vlmul,
1442                            vta,
1443                            // Always use "undisturbed" value from source register.
1444                            InactiveProcessing::kUndisturbed>(
1445               args.dst, arg2, /*dst_mask=*/args.src1);
1446         }
1447       case Decoder::VOpFVfOpcode::kVmfeqvf:
1448         return OpVectorToMaskvx<intrinsics::Vfeqvx<ElementType>, ElementType, vlmul, vma>(
1449             args.dst, args.src1, arg2);
1450       case Decoder::VOpFVfOpcode::kVmflevf:
1451         return OpVectorToMaskvx<intrinsics::Vflevx<ElementType>, ElementType, vlmul, vma>(
1452             args.dst, args.src1, arg2);
1453       case Decoder::VOpFVfOpcode::kVmfltvf:
1454         return OpVectorToMaskvx<intrinsics::Vfltvx<ElementType>, ElementType, vlmul, vma>(
1455             args.dst, args.src1, arg2);
1456       case Decoder::VOpFVfOpcode::kVmfnevf:
1457         return OpVectorToMaskvx<intrinsics::Vfnevx<ElementType>, ElementType, vlmul, vma>(
1458             args.dst, args.src1, arg2);
1459       case Decoder::VOpFVfOpcode::kVmfgtvf:
1460         return OpVectorToMaskvx<intrinsics::Vfgtvx<ElementType>, ElementType, vlmul, vma>(
1461             args.dst, args.src1, arg2);
1462       case Decoder::VOpFVfOpcode::kVmfgevf:
1463         return OpVectorToMaskvx<intrinsics::Vfgevx<ElementType>, ElementType, vlmul, vma>(
1464             args.dst, args.src1, arg2);
1465       case Decoder::VOpFVfOpcode::kVfdivvf:
1466         return OpVectorSameWidth<intrinsics::Vfdivvf<ElementType>,
1467                                  ElementType,
1468                                  NumberOfRegistersInvolved(vlmul),
1469                                  vta,
1470                                  vma,
1471                                  kFrm>(args.dst, Vec<SignedType{}>{args.src1}, arg2);
1472       case Decoder::VOpFVfOpcode::kVfrdivvf:
1473         return OpVectorSameWidth<intrinsics::Vfrdivvf<ElementType>,
1474                                  ElementType,
1475                                  NumberOfRegistersInvolved(vlmul),
1476                                  vta,
1477                                  vma,
1478                                  kFrm>(
1479             args.dst,
1480             Vec<SignedType{(sizeof(ElementType) == sizeof(Float32)) ? 0x3f80'0000
1481                                                                     : 0x3ff0'0000'0000'0000}>{
1482                 args.src1},
1483             arg2);
1484       case Decoder::VOpFVfOpcode::kVfmulvf:
1485         return OpVectorSameWidth<intrinsics::Vfmulvf<ElementType>,
1486                                  ElementType,
1487                                  NumberOfRegistersInvolved(vlmul),
1488                                  vta,
1489                                  vma,
1490                                  kFrm>(args.dst, Vec<SignedType{}>{args.src1}, arg2);
1491       case Decoder::VOpFVfOpcode::kVfaddvf:
1492         return OpVectorSameWidth<intrinsics::Vfaddvf<ElementType>,
1493                                  ElementType,
1494                                  NumberOfRegistersInvolved(vlmul),
1495                                  vta,
1496                                  vma,
1497                                  kFrm>(args.dst, Vec<SignedType{}>{args.src1}, arg2);
1498       case Decoder::VOpFVfOpcode::kVfsubvf:
1499         return OpVectorSameWidth<intrinsics::Vfsubvf<ElementType>,
1500                                  ElementType,
1501                                  NumberOfRegistersInvolved(vlmul),
1502                                  vta,
1503                                  vma,
1504                                  kFrm>(args.dst, Vec<SignedType{}>{args.src1}, arg2);
1505       case Decoder::VOpFVfOpcode::kVfrsubvf:
1506         return OpVectorSameWidth<intrinsics::Vfrsubvf<ElementType>,
1507                                  ElementType,
1508                                  NumberOfRegistersInvolved(vlmul),
1509                                  vta,
1510                                  vma,
1511                                  kFrm>(args.dst, Vec<SignedType{}>{args.src1}, arg2);
1512       case Decoder::VOpFVfOpcode::kVfmaccvf:
1513         return OpVectorvxv<intrinsics::Vfmaccvf<ElementType>, ElementType, vlmul, vta, vma, kFrm>(
1514             args.dst, args.src1, arg2);
1515       case Decoder::VOpFVfOpcode::kVfmsacvf:
1516         return OpVectorvxv<intrinsics::Vfmsacvf<ElementType>, ElementType, vlmul, vta, vma, kFrm>(
1517             args.dst, args.src1, arg2);
1518       case Decoder::VOpFVfOpcode::kVfmaddvf:
1519         return OpVectorvxv<intrinsics::Vfmaddvf<ElementType>, ElementType, vlmul, vta, vma, kFrm>(
1520             args.dst, args.src1, arg2);
1521       case Decoder::VOpFVfOpcode::kVfmsubvf:
1522         return OpVectorvxv<intrinsics::Vfmsubvf<ElementType>, ElementType, vlmul, vta, vma, kFrm>(
1523             args.dst, args.src1, arg2);
1524       case Decoder::VOpFVfOpcode::kVfnmaccvf:
1525         return OpVectorvxv<intrinsics::Vfnmaccvf<ElementType>, ElementType, vlmul, vta, vma, kFrm>(
1526             args.dst, args.src1, arg2);
1527       case Decoder::VOpFVfOpcode::kVfnmsacvf:
1528         return OpVectorvxv<intrinsics::Vfnmsacvf<ElementType>, ElementType, vlmul, vta, vma, kFrm>(
1529             args.dst, args.src1, arg2);
1530       case Decoder::VOpFVfOpcode::kVfnmaddvf:
1531         return OpVectorvxv<intrinsics::Vfnmaddvf<ElementType>, ElementType, vlmul, vta, vma, kFrm>(
1532             args.dst, args.src1, arg2);
1533       case Decoder::VOpFVfOpcode::kVfnmsubvf:
1534         return OpVectorvxv<intrinsics::Vfnmsubvf<ElementType>, ElementType, vlmul, vta, vma, kFrm>(
1535             args.dst, args.src1, arg2);
1536       default:
1537         return Undefined();
1538     }
1539   }
1540 
1541   template <typename ElementType, VectorRegisterGroupMultiplier vlmul, TailProcessing vta, auto vma>
OpVector(const Decoder::VOpFVvArgs & args)1542   void OpVector(const Decoder::VOpFVvArgs& args) {
1543     using SignedType = Wrapping<std::make_signed_t<typename TypeTraits<ElementType>::Int>>;
1544     using UnsignedType = Wrapping<std::make_unsigned_t<typename TypeTraits<ElementType>::Int>>;
1545     // Floating point IEEE 754 value -0.0 includes 1 top bit set and the other bits not set:
1546     // https://en.wikipedia.org/wiki/Signed_zero#Representations This is the exact same
1547     // representation minimum negative integer have in two's complement representation:
1548     // https://en.wikipedia.org/wiki/Two%27s_complement#Most_negative_number
1549     // Note: we pass filler elements as integers because `Float32`/`Float64` couldn't be template
1550     // parameters.
1551     constexpr SignedType kNegativeZero{std::numeric_limits<typename SignedType::BaseType>::min()};
1552     // Floating point IEEE 754 value +0.0 includes only zero bits, same as integer zero.
1553     constexpr SignedType kPositiveZero{};
1554     // We currently don't support Float16 operations, but conversion routines that deal with
1555     // double-width floats use these encodings to produce regular Float32 types.
1556     if constexpr (sizeof(ElementType) <= sizeof(Float32)) {
1557       using WideElementType = typename TypeTraits<ElementType>::Wide;
1558       // Keep cases sorted in opcode order to match RISC-V V manual.
1559       switch (args.opcode) {
1560         case Decoder::VOpFVvOpcode::kVFUnary0:
1561           switch (args.vfunary0_opcode) {
1562             case Decoder::VFUnary0Opcode::kVfwcvtfxuv:
1563               return OpVectorWidenv<[](int8_t frm, SIMD128Register src) {
1564                 return intrinsics::Vfcvtv<WideElementType, UnsignedType>(FPFlags::DYN, frm, src);
1565               },
1566                                     UnsignedType,
1567                                     vlmul,
1568                                     vta,
1569                                     vma,
1570                                     kFrm>(args.dst, args.src1);
1571             case Decoder::VFUnary0Opcode::kVfwcvtfxv:
1572               return OpVectorWidenv<[](int8_t frm, SIMD128Register src) {
1573                 return intrinsics::Vfcvtv<WideElementType, SignedType>(FPFlags::DYN, frm, src);
1574               },
1575                                     SignedType,
1576                                     vlmul,
1577                                     vta,
1578                                     vma,
1579                                     kFrm>(args.dst, args.src1);
1580             case Decoder::VFUnary0Opcode::kVfncvtxufw:
1581               return OpVectorNarroww<[](int8_t frm, SIMD128Register src) {
1582                 return intrinsics::Vfcvtv<UnsignedType, WideElementType>(FPFlags::DYN, frm, src);
1583               },
1584                                      UnsignedType,
1585                                      vlmul,
1586                                      vta,
1587                                      vma,
1588                                      kFrm>(args.dst, args.src1);
1589             case Decoder::VFUnary0Opcode::kVfncvtxfw:
1590               return OpVectorNarroww<[](int8_t frm, SIMD128Register src) {
1591                 return intrinsics::Vfcvtv<SignedType, WideElementType>(FPFlags::DYN, frm, src);
1592               },
1593                                      SignedType,
1594                                      vlmul,
1595                                      vta,
1596                                      vma,
1597                                      kFrm>(args.dst, args.src1);
1598             case Decoder::VFUnary0Opcode::kVfncvtrtzxufw:
1599               return OpVectorNarroww<[](int8_t frm, SIMD128Register src) {
1600                 return intrinsics::Vfcvtv<UnsignedType, WideElementType>(FPFlags::RTZ, frm, src);
1601               },
1602                                      UnsignedType,
1603                                      vlmul,
1604                                      vta,
1605                                      vma,
1606                                      kFrm>(args.dst, args.src1);
1607             case Decoder::VFUnary0Opcode::kVfncvtrtzxfw:
1608               return OpVectorNarroww<[](int8_t frm, SIMD128Register src) {
1609                 return intrinsics::Vfcvtv<SignedType, WideElementType>(FPFlags::RTZ, frm, src);
1610               },
1611                                      SignedType,
1612                                      vlmul,
1613                                      vta,
1614                                      vma,
1615                                      kFrm>(args.dst, args.src1);
1616             default:
1617               break;  // Make compiler happy.
1618           }
1619           break;
1620         default:
1621           break;  // Make compiler happy.
1622       }
1623     }
1624     // Widening and narrowing opeation which take floating point “narrow” operand may only work
1625     // correctly with Float32 input: Float16 is not supported yet, while Float64 input would produce
1626     // 128bit output which is currently reserver in RISC-V V.
1627     if constexpr (sizeof(ElementType) == sizeof(Float32)) {
1628       using WideElementType = WideType<ElementType>;
1629       using WideSignedType = WideType<SignedType>;
1630       using WideUnsignedType = WideType<UnsignedType>;
1631       // Keep cases sorted in opcode order to match RISC-V V manual.
1632       switch (args.opcode) {
1633         case Decoder::VOpFVvOpcode::kVfwaddvv:
1634           return OpVectorWidenvv<intrinsics::Vfwaddvv<ElementType>,
1635                                  ElementType,
1636                                  vlmul,
1637                                  vta,
1638                                  vma,
1639                                  kFrm>(args.dst, args.src1, args.src2);
1640         case Decoder::VOpFVvOpcode::kVfwredusumvs:
1641           // 14.3. Vector Single-Width Floating-Point Reduction Instructions:
1642           // The additive identity is +0.0 when rounding down or -0.0 for all other rounding
1643           // modes.
1644           if (GetCsr<kFrm>() != FPFlags::RDN) {
1645             return OpVectorvs<intrinsics::Vfredosumvs<ElementType, WideType<ElementType>>,
1646                               ElementType,
1647                               WideType<ElementType>,
1648                               vlmul,
1649                               vta,
1650                               vma,
1651                               kFrm>(args.dst, Vec<kNegativeZero>{args.src1}, args.src2);
1652           } else {
1653             return OpVectorvs<intrinsics::Vfredosumvs<ElementType, WideType<ElementType>>,
1654                               ElementType,
1655                               WideType<ElementType>,
1656                               vlmul,
1657                               vta,
1658                               vma,
1659                               kFrm>(args.dst, Vec<kPositiveZero>{args.src1}, args.src2);
1660           }
1661         case Decoder::VOpFVvOpcode::kVfwsubvv:
1662           return OpVectorWidenvv<intrinsics::Vfwsubvv<ElementType>,
1663                                  ElementType,
1664                                  vlmul,
1665                                  vta,
1666                                  vma,
1667                                  kFrm>(args.dst, args.src1, args.src2);
1668         case Decoder::VOpFVvOpcode::kVfwredosumvs:
1669           // 14.3. Vector Single-Width Floating-Point Reduction Instructions:
1670           // The additive identity is +0.0 when rounding down or -0.0 for all other rounding
1671           // modes.
1672           if (GetCsr<kFrm>() != FPFlags::RDN) {
1673             return OpVectorvs<intrinsics::Vfredosumvs<ElementType, WideType<ElementType>>,
1674                               ElementType,
1675                               WideType<ElementType>,
1676                               vlmul,
1677                               vta,
1678                               vma,
1679                               kFrm>(args.dst, Vec<kNegativeZero>{args.src1}, args.src2);
1680           } else {
1681             return OpVectorvs<intrinsics::Vfredosumvs<ElementType, WideType<ElementType>>,
1682                               ElementType,
1683                               WideType<ElementType>,
1684                               vlmul,
1685                               vta,
1686                               vma,
1687                               kFrm>(args.dst, Vec<kPositiveZero>{args.src1}, args.src2);
1688           }
1689         case Decoder::VOpFVvOpcode::kVfwmulvv:
1690           return OpVectorWidenvv<intrinsics::Vfwmulvv<ElementType>,
1691                                  ElementType,
1692                                  vlmul,
1693                                  vta,
1694                                  vma,
1695                                  kFrm>(args.dst, args.src1, args.src2);
1696         case Decoder::VOpFVvOpcode::kVfwaddwv:
1697           return OpVectorWidenwv<intrinsics::Vfwaddwv<ElementType>,
1698                                  ElementType,
1699                                  vlmul,
1700                                  vta,
1701                                  vma,
1702                                  kFrm>(args.dst, args.src1, args.src2);
1703         case Decoder::VOpFVvOpcode::kVfwsubwv:
1704           return OpVectorWidenwv<intrinsics::Vfwsubwv<ElementType>,
1705                                  ElementType,
1706                                  vlmul,
1707                                  vta,
1708                                  vma,
1709                                  kFrm>(args.dst, args.src1, args.src2);
1710         case Decoder::VOpFVvOpcode::kVfwmaccvv:
1711           return OpVectorWidenvvw<intrinsics::Vfwmaccvv<ElementType>,
1712                                   ElementType,
1713                                   vlmul,
1714                                   vta,
1715                                   vma,
1716                                   kFrm>(args.dst, args.src1, args.src2);
1717         case Decoder::VOpFVvOpcode::kVfwnmaccvv:
1718           return OpVectorWidenvvw<intrinsics::Vfwnmaccvv<ElementType>,
1719                                   ElementType,
1720                                   vlmul,
1721                                   vta,
1722                                   vma,
1723                                   kFrm>(args.dst, args.src1, args.src2);
1724         case Decoder::VOpFVvOpcode::kVfwmsacvv:
1725           return OpVectorWidenvvw<intrinsics::Vfwmsacvv<ElementType>,
1726                                   ElementType,
1727                                   vlmul,
1728                                   vta,
1729                                   vma,
1730                                   kFrm>(args.dst, args.src1, args.src2);
1731         case Decoder::VOpFVvOpcode::kVfwnmsacvv:
1732           return OpVectorWidenvvw<intrinsics::Vfwnmsacvv<ElementType>,
1733                                   ElementType,
1734                                   vlmul,
1735                                   vta,
1736                                   vma,
1737                                   kFrm>(args.dst, args.src1, args.src2);
1738         case Decoder::VOpFVvOpcode::kVFUnary0:
1739           switch (args.vfunary0_opcode) {
1740             case Decoder::VFUnary0Opcode::kVfwcvtxufv:
1741               return OpVectorWidenv<[](int8_t frm, SIMD128Register src) {
1742                 return intrinsics::Vfcvtv<WideUnsignedType, ElementType>(FPFlags::DYN, frm, src);
1743               },
1744                                     ElementType,
1745                                     vlmul,
1746                                     vta,
1747                                     vma,
1748                                     kFrm>(args.dst, args.src1);
1749             case Decoder::VFUnary0Opcode::kVfwcvtxfv:
1750               return OpVectorWidenv<[](int8_t frm, SIMD128Register src) {
1751                 return intrinsics::Vfcvtv<WideSignedType, ElementType>(FPFlags::DYN, frm, src);
1752               },
1753                                     ElementType,
1754                                     vlmul,
1755                                     vta,
1756                                     vma,
1757                                     kFrm>(args.dst, args.src1);
1758             case Decoder::VFUnary0Opcode::kVfwcvtffv:
1759               return OpVectorWidenv<[](int8_t frm, SIMD128Register src) {
1760                 return intrinsics::Vfcvtv<WideElementType, ElementType>(FPFlags::DYN, frm, src);
1761               },
1762                                     ElementType,
1763                                     vlmul,
1764                                     vta,
1765                                     vma,
1766                                     kFrm>(args.dst, args.src1);
1767             case Decoder::VFUnary0Opcode::kVfwcvtrtzxufv:
1768               return OpVectorWidenv<[](int8_t frm, SIMD128Register src) {
1769                 return intrinsics::Vfcvtv<WideUnsignedType, ElementType>(FPFlags::RTZ, frm, src);
1770               },
1771                                     ElementType,
1772                                     vlmul,
1773                                     vta,
1774                                     vma,
1775                                     kFrm>(args.dst, args.src1);
1776             case Decoder::VFUnary0Opcode::kVfwcvtrtzxfv:
1777               return OpVectorWidenv<[](int8_t frm, SIMD128Register src) {
1778                 return intrinsics::Vfcvtv<WideSignedType, ElementType>(FPFlags::RTZ, frm, src);
1779               },
1780                                     ElementType,
1781                                     vlmul,
1782                                     vta,
1783                                     vma,
1784                                     kFrm>(args.dst, args.src1);
1785             case Decoder::VFUnary0Opcode::kVfncvtfxuw:
1786               return OpVectorNarroww<[](int8_t frm, SIMD128Register src) {
1787                 return intrinsics::Vfcvtv<ElementType, WideUnsignedType>(FPFlags::DYN, frm, src);
1788               },
1789                                      ElementType,
1790                                      vlmul,
1791                                      vta,
1792                                      vma,
1793                                      kFrm>(args.dst, args.src1);
1794             case Decoder::VFUnary0Opcode::kVfncvtffw:
1795               return OpVectorNarroww<[](int8_t frm, SIMD128Register src) {
1796                 return intrinsics::Vfcvtv<ElementType, WideElementType>(FPFlags::DYN, frm, src);
1797               },
1798                                      ElementType,
1799                                      vlmul,
1800                                      vta,
1801                                      vma,
1802                                      kFrm>(args.dst, args.src1);
1803             case Decoder::VFUnary0Opcode::kVfncvtfxw:
1804               return OpVectorNarroww<[](int8_t frm, SIMD128Register src) {
1805                 return intrinsics::Vfcvtv<ElementType, WideSignedType>(FPFlags::DYN, frm, src);
1806               },
1807                                      ElementType,
1808                                      vlmul,
1809                                      vta,
1810                                      vma,
1811                                      kFrm>(args.dst, args.src1);
1812             default:
1813               break;  // Make compiler happy.
1814           }
1815           break;
1816         default:
1817           break;  // Make compiler happy.
1818       }
1819     }
1820     // If our ElementType is Float16 then “straight” operations are unsupported and we whouldn't try
1821     // instantiate any functions since this would lead to compilke-time error.
1822     if constexpr (sizeof(ElementType) >= sizeof(Float32)) {
1823       // Keep cases sorted in opcode order to match RISC-V V manual.
1824       switch (args.opcode) {
1825         case Decoder::VOpFVvOpcode::kVfredusumvs:
1826           // 14.3. Vector Single-Width Floating-Point Reduction Instructions:
1827           // The additive identity is +0.0 when rounding down or -0.0 for all other rounding modes.
1828           if (GetCsr<kFrm>() != FPFlags::RDN) {
1829             return OpVectorvs<intrinsics::Vfredusumvs<ElementType>,
1830                               ElementType,
1831                               vlmul,
1832                               vta,
1833                               vma,
1834                               kFrm>(args.dst, Vec<kNegativeZero>{args.src1}, args.src2);
1835           } else {
1836             return OpVectorvs<intrinsics::Vfredusumvs<ElementType>,
1837                               ElementType,
1838                               vlmul,
1839                               vta,
1840                               vma,
1841                               kFrm>(args.dst, Vec<kPositiveZero>{args.src1}, args.src2);
1842           }
1843         case Decoder::VOpFVvOpcode::kVfredosumvs:
1844           // 14.3. Vector Single-Width Floating-Point Reduction Instructions:
1845           // The additive identity is +0.0 when rounding down or -0.0 for all other rounding modes.
1846           if (GetCsr<kFrm>() != FPFlags::RDN) {
1847             return OpVectorvs<intrinsics::Vfredosumvs<ElementType>,
1848                               ElementType,
1849                               vlmul,
1850                               vta,
1851                               vma,
1852                               kFrm>(args.dst, Vec<kNegativeZero>{args.src1}, args.src2);
1853           } else {
1854             return OpVectorvs<intrinsics::Vfredosumvs<ElementType>,
1855                               ElementType,
1856                               vlmul,
1857                               vta,
1858                               vma,
1859                               kFrm>(args.dst, Vec<kPositiveZero>{args.src1}, args.src2);
1860           }
1861         case Decoder::VOpFVvOpcode::kVfminvv:
1862           return OpVectorvv<intrinsics::Vfminvv<ElementType>, ElementType, vlmul, vta, vma>(
1863               args.dst, args.src1, args.src2);
1864         case Decoder::VOpFVvOpcode::kVfredminvs:
1865           // For Vfredmin the identity element is +inf.
1866           return OpVectorvs<intrinsics::Vfredminvs<ElementType>, ElementType, vlmul, vta, vma>(
1867               args.dst,
1868               Vec<UnsignedType{(sizeof(ElementType) == sizeof(Float32)) ? 0x7f80'0000
1869                                                                         : 0x7ff0'0000'0000'0000}>{
1870                   args.src1},
1871               args.src2);
1872         case Decoder::VOpFVvOpcode::kVfmaxvv:
1873           return OpVectorvv<intrinsics::Vfmaxvv<ElementType>, ElementType, vlmul, vta, vma>(
1874               args.dst, args.src1, args.src2);
1875         case Decoder::VOpFVvOpcode::kVfredmaxvs:
1876           // For Vfredmax the identity element is -inf.
1877           return OpVectorvs<intrinsics::Vfredmaxvs<ElementType>, ElementType, vlmul, vta, vma>(
1878               args.dst,
1879               Vec<UnsignedType{(sizeof(ElementType) == sizeof(Float32)) ? 0xff80'0000
1880                                                                         : 0xfff0'0000'0000'0000}>{
1881                   args.src1},
1882               args.src2);
1883         case Decoder::VOpFVvOpcode::kVfsgnjvv:
1884           return OpVectorvv<intrinsics::Vfsgnjvv<ElementType>, ElementType, vlmul, vta, vma>(
1885               args.dst, args.src1, args.src2);
1886         case Decoder::VOpFVvOpcode::kVfsgnjnvv:
1887           return OpVectorvv<intrinsics::Vfsgnjnvv<ElementType>, ElementType, vlmul, vta, vma>(
1888               args.dst, args.src1, args.src2);
1889         case Decoder::VOpFVvOpcode::kVfsgnjxvv:
1890           return OpVectorvv<intrinsics::Vfsgnjxvv<ElementType>, ElementType, vlmul, vta, vma>(
1891               args.dst, args.src1, args.src2);
1892         case Decoder::VOpFVvOpcode::kVFUnary0:
1893           switch (args.vfunary0_opcode) {
1894             case Decoder::VFUnary0Opcode::kVfcvtxufv:
1895               return OpVectorv<[](int8_t frm, SIMD128Register src) {
1896                 return intrinsics::Vfcvtv<UnsignedType, ElementType>(FPFlags::DYN, frm, src);
1897               },
1898                                ElementType,
1899                                vlmul,
1900                                vta,
1901                                vma,
1902                                kFrm>(args.dst, args.src1);
1903             case Decoder::VFUnary0Opcode::kVfcvtxfv:
1904               return OpVectorv<[](int8_t frm, SIMD128Register src) {
1905                 return intrinsics::Vfcvtv<SignedType, ElementType>(FPFlags::DYN, frm, src);
1906               },
1907                                ElementType,
1908                                vlmul,
1909                                vta,
1910                                vma,
1911                                kFrm>(args.dst, args.src1);
1912             case Decoder::VFUnary0Opcode::kVfcvtfxuv:
1913               return OpVectorv<[](int8_t frm, SIMD128Register src) {
1914                 return intrinsics::Vfcvtv<ElementType, UnsignedType>(FPFlags::DYN, frm, src);
1915               },
1916                                UnsignedType,
1917                                vlmul,
1918                                vta,
1919                                vma,
1920                                kFrm>(args.dst, args.src1);
1921             case Decoder::VFUnary0Opcode::kVfcvtfxv:
1922               return OpVectorv<[](int8_t frm, SIMD128Register src) {
1923                 return intrinsics::Vfcvtv<ElementType, SignedType>(FPFlags::DYN, frm, src);
1924               },
1925                                SignedType,
1926                                vlmul,
1927                                vta,
1928                                vma,
1929                                kFrm>(args.dst, args.src1);
1930             case Decoder::VFUnary0Opcode::kVfcvtrtzxufv:
1931               return OpVectorv<[](int8_t frm, SIMD128Register src) {
1932                 return intrinsics::Vfcvtv<UnsignedType, ElementType>(FPFlags::RTZ, frm, src);
1933               },
1934                                ElementType,
1935                                vlmul,
1936                                vta,
1937                                vma,
1938                                kFrm>(args.dst, args.src1);
1939             case Decoder::VFUnary0Opcode::kVfcvtrtzxfv:
1940               return OpVectorv<[](int8_t frm, SIMD128Register src) {
1941                 return intrinsics::Vfcvtv<SignedType, ElementType>(FPFlags::RTZ, frm, src);
1942               },
1943                                ElementType,
1944                                vlmul,
1945                                vta,
1946                                vma,
1947                                kFrm>(args.dst, args.src1);
1948             default:
1949               break;  // Make compiler happy.
1950           }
1951           break;
1952         case Decoder::VOpFVvOpcode::kVFUnary1:
1953           switch (args.vfunary1_opcode) {
1954             case Decoder::VFUnary1Opcode::kVfsqrtv:
1955               return OpVectorv<intrinsics::Vfsqrtv<ElementType>,
1956                                ElementType,
1957                                vlmul,
1958                                vta,
1959                                vma,
1960                                kFrm>(args.dst, args.src1);
1961               break;
1962             case Decoder::VFUnary1Opcode::kVfrsqrt7v:
1963               return OpVectorv<intrinsics::Vfrsqrt7v<ElementType>, ElementType, vlmul, vta, vma>(
1964                   args.dst, args.src1);
1965               break;
1966             case Decoder::VFUnary1Opcode::kVfclassv:
1967               return OpVectorv<intrinsics::Vfclassv<ElementType>, ElementType, vlmul, vta, vma>(
1968                   args.dst, args.src1);
1969               break;
1970             default:
1971               break;  // Make compiler happy.
1972           }
1973           break;
1974         case Decoder::VOpFVvOpcode::kVfmvfs:
1975           if constexpr (!std::is_same_v<decltype(vma), intrinsics::NoInactiveProcessing>) {
1976             return Undefined();
1977           }
1978           if (args.src2 != 0) {
1979             return Undefined();
1980           }
1981           return OpVectorVmvfs<ElementType>(args.dst, args.src1);
1982         case Decoder::VOpFVvOpcode::kVmfeqvv:
1983           return OpVectorToMaskvv<intrinsics::Vfeqvv<ElementType>, ElementType, vlmul, vma>(
1984               args.dst, args.src1, args.src2);
1985         case Decoder::VOpFVvOpcode::kVmflevv:
1986           return OpVectorToMaskvv<intrinsics::Vflevv<ElementType>, ElementType, vlmul, vma>(
1987               args.dst, args.src1, args.src2);
1988         case Decoder::VOpFVvOpcode::kVmfltvv:
1989           return OpVectorToMaskvv<intrinsics::Vfltvv<ElementType>, ElementType, vlmul, vma>(
1990               args.dst, args.src1, args.src2);
1991         case Decoder::VOpFVvOpcode::kVmfnevv:
1992           return OpVectorToMaskvv<intrinsics::Vfnevv<ElementType>, ElementType, vlmul, vma>(
1993               args.dst, args.src1, args.src2);
1994         case Decoder::VOpFVvOpcode::kVfdivvv:
1995           return OpVectorSameWidth<intrinsics::Vfdivvv<ElementType>,
1996                                    ElementType,
1997                                    NumberOfRegistersInvolved(vlmul),
1998                                    vta,
1999                                    vma,
2000                                    kFrm>(
2001               args.dst,
2002               Vec<SignedType{}>{args.src1},
2003               Vec<SignedType{(sizeof(ElementType) == sizeof(Float32)) ? 0x3f80'0000
2004                                                                       : 0x3ff0'0000'0000'0000}>{
2005                   args.src2});
2006         case Decoder::VOpFVvOpcode::kVfmulvv:
2007           return OpVectorSameWidth<intrinsics::Vfmulvv<ElementType>,
2008                                    ElementType,
2009                                    NumberOfRegistersInvolved(vlmul),
2010                                    vta,
2011                                    vma,
2012                                    kFrm>(
2013               args.dst, Vec<SignedType{}>{args.src1}, Vec<SignedType{}>{args.src2});
2014         case Decoder::VOpFVvOpcode::kVfaddvv:
2015           return OpVectorSameWidth<intrinsics::Vfaddvv<ElementType>,
2016                                    ElementType,
2017                                    NumberOfRegistersInvolved(vlmul),
2018                                    vta,
2019                                    vma,
2020                                    kFrm>(
2021               args.dst, Vec<SignedType{}>{args.src1}, Vec<SignedType{}>{args.src2});
2022         case Decoder::VOpFVvOpcode::kVfsubvv:
2023           return OpVectorSameWidth<intrinsics::Vfsubvv<ElementType>,
2024                                    ElementType,
2025                                    NumberOfRegistersInvolved(vlmul),
2026                                    vta,
2027                                    vma,
2028                                    kFrm>(
2029               args.dst, Vec<SignedType{}>{args.src1}, Vec<SignedType{}>{args.src2});
2030         case Decoder::VOpFVvOpcode::kVfmaccvv:
2031           return OpVectorvvv<intrinsics::Vfmaccvv<ElementType>, ElementType, vlmul, vta, vma, kFrm>(
2032               args.dst, args.src1, args.src2);
2033         case Decoder::VOpFVvOpcode::kVfmsacvv:
2034           return OpVectorvvv<intrinsics::Vfmsacvv<ElementType>, ElementType, vlmul, vta, vma, kFrm>(
2035               args.dst, args.src1, args.src2);
2036         case Decoder::VOpFVvOpcode::kVfmaddvv:
2037           return OpVectorvvv<intrinsics::Vfmaddvv<ElementType>, ElementType, vlmul, vta, vma, kFrm>(
2038               args.dst, args.src1, args.src2);
2039         case Decoder::VOpFVvOpcode::kVfmsubvv:
2040           return OpVectorvvv<intrinsics::Vfmsubvv<ElementType>, ElementType, vlmul, vta, vma, kFrm>(
2041               args.dst, args.src1, args.src2);
2042         case Decoder::VOpFVvOpcode::kVfnmaccvv:
2043           return OpVectorvvv<intrinsics::Vfnmaccvv<ElementType>,
2044                              ElementType,
2045                              vlmul,
2046                              vta,
2047                              vma,
2048                              kFrm>(args.dst, args.src1, args.src2);
2049         case Decoder::VOpFVvOpcode::kVfnmsacvv:
2050           return OpVectorvvv<intrinsics::Vfnmsacvv<ElementType>,
2051                              ElementType,
2052                              vlmul,
2053                              vta,
2054                              vma,
2055                              kFrm>(args.dst, args.src1, args.src2);
2056         case Decoder::VOpFVvOpcode::kVfnmaddvv:
2057           return OpVectorvvv<intrinsics::Vfnmaddvv<ElementType>,
2058                              ElementType,
2059                              vlmul,
2060                              vta,
2061                              vma,
2062                              kFrm>(args.dst, args.src1, args.src2);
2063         case Decoder::VOpFVvOpcode::kVfnmsubvv:
2064           return OpVectorvvv<intrinsics::Vfnmsubvv<ElementType>,
2065                              ElementType,
2066                              vlmul,
2067                              vta,
2068                              vma,
2069                              kFrm>(args.dst, args.src1, args.src2);
2070         default:
2071           break;  // Make compiler happy.
2072       }
2073     }
2074     return Undefined();
2075   }
2076 
2077   template <typename ElementType, VectorRegisterGroupMultiplier vlmul, TailProcessing vta, auto vma>
OpVector(const Decoder::VOpIViArgs & args)2078   void OpVector(const Decoder::VOpIViArgs& args) {
2079     using SignedType = berberis::SignedType<ElementType>;
2080     using UnsignedType = berberis::UnsignedType<ElementType>;
2081     using SaturatingSignedType = SaturatingType<SignedType>;
2082     using SaturatingUnsignedType = SaturatingType<UnsignedType>;
2083     // Keep cases sorted in opcode order to match RISC-V V manual.
2084     switch (args.opcode) {
2085       case Decoder::VOpIViOpcode::kVaddvi:
2086         return OpVectorvx<intrinsics::Vaddvx<SignedType>, SignedType, vlmul, vta, vma>(
2087             args.dst, args.src, SignedType{args.imm});
2088       case Decoder::VOpIViOpcode::kVrsubvi:
2089         return OpVectorvx<intrinsics::Vrsubvx<SignedType>, SignedType, vlmul, vta, vma>(
2090             args.dst, args.src, SignedType{args.imm});
2091       case Decoder::VOpIViOpcode::kVandvi:
2092         return OpVectorvx<intrinsics::Vandvx<SignedType>, SignedType, vlmul, vta, vma>(
2093             args.dst, args.src, SignedType{args.imm});
2094       case Decoder::VOpIViOpcode::kVorvi:
2095         return OpVectorvx<intrinsics::Vorvx<SignedType>, SignedType, vlmul, vta, vma>(
2096             args.dst, args.src, SignedType{args.imm});
2097       case Decoder::VOpIViOpcode::kVxorvi:
2098         return OpVectorvx<intrinsics::Vxorvx<SignedType>, SignedType, vlmul, vta, vma>(
2099             args.dst, args.src, SignedType{args.imm});
2100       case Decoder::VOpIViOpcode::kVrgathervi:
2101         return OpVectorGather<ElementType, vlmul, vta, vma>(
2102             args.dst, args.src, [&args](size_t /*index*/) { return ElementType{args.uimm}; });
2103       case Decoder::VOpIViOpcode::kVadcvi:
2104         return OpVectorvxm<intrinsics::Vadcvx<SignedType>,
2105                            SignedType,
2106                            NumberOfRegistersInvolved(vlmul),
2107                            vta,
2108                            vma>(args.dst, args.src, SignedType{args.imm});
2109       case Decoder::VOpIViOpcode::kVmseqvi:
2110         return OpVectorToMaskvx<intrinsics::Vseqvx<SignedType>, SignedType, vlmul, vma>(
2111             args.dst, args.src, SignedType{args.imm});
2112       case Decoder::VOpIViOpcode::kVmsnevi:
2113         return OpVectorToMaskvx<intrinsics::Vsnevx<SignedType>, SignedType, vlmul, vma>(
2114             args.dst, args.src, SignedType{args.imm});
2115       case Decoder::VOpIViOpcode::kVmsleuvi:
2116         // Note: Vmsleu.vi actually have signed immediate which means that we first need to
2117         // expand it to the width of element as signed value and then bit-cast to unsigned.
2118         return OpVectorToMaskvx<intrinsics::Vslevx<UnsignedType>, UnsignedType, vlmul, vma>(
2119             args.dst, args.src, BitCastToUnsigned(SignedType{args.imm}));
2120       case Decoder::VOpIViOpcode::kVmslevi:
2121         return OpVectorToMaskvx<intrinsics::Vslevx<SignedType>, SignedType, vlmul, vma>(
2122             args.dst, args.src, SignedType{args.imm});
2123       case Decoder::VOpIViOpcode::kVmsgtuvi:
2124         // Note: Vmsleu.vi actually have signed immediate which means that we first need to
2125         // expand it to the width of element as signed value and then bit-cast to unsigned.
2126         return OpVectorToMaskvx<intrinsics::Vsgtvx<UnsignedType>, UnsignedType, vlmul, vma>(
2127             args.dst, args.src, BitCastToUnsigned(SignedType{args.imm}));
2128       case Decoder::VOpIViOpcode::kVmsgtvi:
2129         return OpVectorToMaskvx<intrinsics::Vsgtvx<SignedType>, SignedType, vlmul, vma>(
2130             args.dst, args.src, SignedType{args.imm});
2131       case Decoder::VOpIViOpcode::kVsadduvi:
2132         // Note: Vsaddu.vi actually have signed immediate which means that we first need to
2133         // expand it to the width of element as signed value and then bit-cast to unsigned.
2134         return OpVectorvx<intrinsics::Vaddvx<SaturatingUnsignedType>,
2135                           SaturatingUnsignedType,
2136                           vlmul,
2137                           vta,
2138                           vma>(
2139             args.dst, args.src, BitCastToUnsigned(SaturatingSignedType{args.imm}));
2140       case Decoder::VOpIViOpcode::kVsaddvi:
2141         return OpVectorvx<intrinsics::Vaddvx<SaturatingSignedType>,
2142                           SaturatingSignedType,
2143                           vlmul,
2144                           vta,
2145                           vma>(args.dst, args.src, SaturatingSignedType{args.imm});
2146       case Decoder::VOpIViOpcode::kVsllvi:
2147         return OpVectorvx<intrinsics::Vslvx<UnsignedType>, UnsignedType, vlmul, vta, vma>(
2148             args.dst, args.src, UnsignedType{args.uimm});
2149       case Decoder::VOpIViOpcode::kVsrlvi:
2150         return OpVectorvx<intrinsics::Vsrvx<UnsignedType>, UnsignedType, vlmul, vta, vma>(
2151             args.dst, args.src, UnsignedType{args.uimm});
2152       case Decoder::VOpIViOpcode::kVsravi:
2153         // We need to pass shift value here as signed type but uimm value is always positive
2154         // and always fits into any integer.
2155         return OpVectorvx<intrinsics::Vsrvx<SignedType>, SignedType, vlmul, vta, vma>(
2156             args.dst, args.src, BitCastToSigned(UnsignedType{args.uimm}));
2157       case Decoder::VOpIViOpcode::kVmergevi:
2158         if constexpr (std::is_same_v<decltype(vma), intrinsics::NoInactiveProcessing>) {
2159           if (args.src != 0) {
2160             return Undefined();
2161           }
2162           return OpVectorx<intrinsics::Vcopyx<SignedType>, SignedType, vlmul, vta, vma>(
2163               args.dst, SignedType{args.imm});
2164         } else {
2165           return OpVectorx<intrinsics::Vcopyx<SignedType>,
2166                            SignedType,
2167                            vlmul,
2168                            vta,
2169                            // Always use "undisturbed" value from source register.
2170                            InactiveProcessing::kUndisturbed>(
2171               args.dst, SignedType{args.imm}, /*dst_mask=*/args.src);
2172         }
2173       case Decoder::VOpIViOpcode::kVmvXrv:
2174         // kVmv<nr>rv instruction
2175         if constexpr (std::is_same_v<decltype(vma), intrinsics::NoInactiveProcessing>) {
2176           switch (args.imm) {
2177             case 0:
2178               return OpVectorVmvXrv<ElementType, 1>(args.dst, args.src);
2179             case 1:
2180               return OpVectorVmvXrv<ElementType, 2>(args.dst, args.src);
2181             case 3:
2182               return OpVectorVmvXrv<ElementType, 4>(args.dst, args.src);
2183             case 7:
2184               return OpVectorVmvXrv<ElementType, 8>(args.dst, args.src);
2185             default:
2186               return Undefined();
2187           }
2188         } else {
2189           return Undefined();
2190         }
2191       case Decoder::VOpIViOpcode::kVnsrawi:
2192         // We need to pass shift value here as signed type but uimm value is always positive
2193         // and always fits into any integer.
2194         return OpVectorNarrowwx<intrinsics::Vnsrwx<SignedType>, SignedType, vlmul, vta, vma>(
2195             args.dst, args.src, BitCastToSigned(UnsignedType{args.uimm}));
2196       case Decoder::VOpIViOpcode::kVnsrlwi:
2197         return OpVectorNarrowwx<intrinsics::Vnsrwx<UnsignedType>, UnsignedType, vlmul, vta, vma>(
2198             args.dst, args.src, UnsignedType{args.uimm});
2199       case Decoder::VOpIViOpcode::kVslideupvi:
2200         return OpVectorslideup<UnsignedType, vlmul, vta, vma>(
2201             args.dst, args.src, UnsignedType{args.uimm});
2202       case Decoder::VOpIViOpcode::kVslidedownvi:
2203         return OpVectorslidedown<UnsignedType, vlmul, vta, vma>(
2204             args.dst, args.src, UnsignedType{args.uimm});
2205       case Decoder::VOpIViOpcode::kVnclipuwi:
2206         return OpVectorNarrowwx<intrinsics::Vnclipwx<SaturatingUnsignedType>,
2207                                 SaturatingUnsignedType,
2208                                 vlmul,
2209                                 vta,
2210                                 vma,
2211                                 kVxrm>(args.dst, args.src, UnsignedType{args.uimm});
2212       case Decoder::VOpIViOpcode::kVnclipwi:
2213         return OpVectorNarrowwx<intrinsics::Vnclipwx<SaturatingSignedType>,
2214                                 SaturatingSignedType,
2215                                 vlmul,
2216                                 vta,
2217                                 vma,
2218                                 kVxrm>(args.dst, args.src, UnsignedType{args.uimm});
2219       case Decoder::VOpIViOpcode::kVssrlvi:
2220         return OpVectorvx<intrinsics::Vssrvx<UnsignedType>, UnsignedType, vlmul, vta, vma, kVxrm>(
2221             args.dst, args.src, UnsignedType{args.uimm});
2222       case Decoder::VOpIViOpcode::kVssravi:
2223         return OpVectorvx<intrinsics::Vssrvx<SignedType>, SignedType, vlmul, vta, vma, kVxrm>(
2224             args.dst, args.src, BitCastToSigned(UnsignedType{args.uimm}));
2225       default:
2226         Undefined();
2227     }
2228   }
2229 
2230   template <typename ElementType, VectorRegisterGroupMultiplier vlmul, TailProcessing vta, auto vma>
OpVector(const Decoder::VOpIVvArgs & args)2231   void OpVector(const Decoder::VOpIVvArgs& args) {
2232     using SignedType = berberis::SignedType<ElementType>;
2233     using UnsignedType = berberis::UnsignedType<ElementType>;
2234     using SaturatingSignedType = SaturatingType<SignedType>;
2235     using SaturatingUnsignedType = SaturatingType<UnsignedType>;
2236     // Keep cases sorted in opcode order to match RISC-V V manual.
2237     switch (args.opcode) {
2238       case Decoder::VOpIVvOpcode::kVaddvv:
2239         return OpVectorvv<intrinsics::Vaddvv<ElementType>, ElementType, vlmul, vta, vma>(
2240             args.dst, args.src1, args.src2);
2241       case Decoder::VOpIVvOpcode::kVsubvv:
2242         return OpVectorvv<intrinsics::Vsubvv<ElementType>, ElementType, vlmul, vta, vma>(
2243             args.dst, args.src1, args.src2);
2244       case Decoder::VOpIVvOpcode::kVandvv:
2245         return OpVectorvv<intrinsics::Vandvv<ElementType>, ElementType, vlmul, vta, vma>(
2246             args.dst, args.src1, args.src2);
2247       case Decoder::VOpIVvOpcode::kVorvv:
2248         return OpVectorvv<intrinsics::Vorvv<ElementType>, ElementType, vlmul, vta, vma>(
2249             args.dst, args.src1, args.src2);
2250       case Decoder::VOpIVvOpcode::kVxorvv:
2251         return OpVectorvv<intrinsics::Vxorvv<ElementType>, ElementType, vlmul, vta, vma>(
2252             args.dst, args.src1, args.src2);
2253       case Decoder::VOpIVvOpcode::kVrgathervv: {
2254         constexpr size_t kRegistersInvolved = NumberOfRegistersInvolved(vlmul);
2255         if (!IsAligned<kRegistersInvolved>(args.src2)) {
2256           return Undefined();
2257         }
2258         constexpr size_t vlmax = GetVlmax<ElementType, vlmul>();
2259         alignas(alignof(SIMD128Register)) ElementType indexes[vlmax];
2260         memcpy(indexes, state_->cpu.v + args.src2, sizeof(indexes));
2261         return OpVectorGather<ElementType, vlmul, vta, vma>(
2262             args.dst, args.src1, [&indexes](size_t index) { return indexes[index]; });
2263       }
2264       case Decoder::VOpIVvOpcode::kVadcvv:
2265         return OpVectorvvm<intrinsics::Vadcvv<SignedType>,
2266                            SignedType,
2267                            NumberOfRegistersInvolved(vlmul),
2268                            vta,
2269                            vma>(args.dst, args.src1, args.src2);
2270       case Decoder::VOpIVvOpcode::kVsbcvv:
2271         return OpVectorvvm<intrinsics::Vsbcvv<SignedType>,
2272                            SignedType,
2273                            NumberOfRegistersInvolved(vlmul),
2274                            vta,
2275                            vma>(args.dst, args.src1, args.src2);
2276       case Decoder::VOpIVvOpcode::kVmseqvv:
2277         return OpVectorToMaskvv<intrinsics::Vseqvv<ElementType>, ElementType, vlmul, vma>(
2278             args.dst, args.src1, args.src2);
2279       case Decoder::VOpIVvOpcode::kVmsnevv:
2280         return OpVectorToMaskvv<intrinsics::Vsnevv<ElementType>, ElementType, vlmul, vma>(
2281             args.dst, args.src1, args.src2);
2282       case Decoder::VOpIVvOpcode::kVmsltuvv:
2283         return OpVectorToMaskvv<intrinsics::Vsltvv<UnsignedType>, ElementType, vlmul, vma>(
2284             args.dst, args.src1, args.src2);
2285       case Decoder::VOpIVvOpcode::kVmsltvv:
2286         return OpVectorToMaskvv<intrinsics::Vsltvv<SignedType>, ElementType, vlmul, vma>(
2287             args.dst, args.src1, args.src2);
2288       case Decoder::VOpIVvOpcode::kVmsleuvv:
2289         return OpVectorToMaskvv<intrinsics::Vslevv<UnsignedType>, ElementType, vlmul, vma>(
2290             args.dst, args.src1, args.src2);
2291       case Decoder::VOpIVvOpcode::kVmslevv:
2292         return OpVectorToMaskvv<intrinsics::Vslevv<SignedType>, ElementType, vlmul, vma>(
2293             args.dst, args.src1, args.src2);
2294       case Decoder::VOpIVvOpcode::kVsadduvv:
2295         return OpVectorvv<intrinsics::Vaddvv<SaturatingUnsignedType>,
2296                           SaturatingUnsignedType,
2297                           vlmul,
2298                           vta,
2299                           vma>(args.dst, args.src1, args.src2);
2300       case Decoder::VOpIVvOpcode::kVsaddvv:
2301         return OpVectorvv<intrinsics::Vaddvv<SaturatingSignedType>,
2302                           SaturatingSignedType,
2303                           vlmul,
2304                           vta,
2305                           vma>(args.dst, args.src1, args.src2);
2306       case Decoder::VOpIVvOpcode::kVssubuvv:
2307         return OpVectorvv<intrinsics::Vsubvv<SaturatingUnsignedType>,
2308                           SaturatingUnsignedType,
2309                           vlmul,
2310                           vta,
2311                           vma>(args.dst, args.src1, args.src2);
2312       case Decoder::VOpIVvOpcode::kVssubvv:
2313         return OpVectorvv<intrinsics::Vsubvv<SaturatingSignedType>,
2314                           SaturatingSignedType,
2315                           vlmul,
2316                           vta,
2317                           vma>(args.dst, args.src1, args.src2);
2318       case Decoder::VOpIVvOpcode::kVsllvv:
2319         return OpVectorvv<intrinsics::Vslvv<ElementType>, ElementType, vlmul, vta, vma>(
2320             args.dst, args.src1, args.src2);
2321       case Decoder::VOpIVvOpcode::kVsrlvv:
2322         return OpVectorvv<intrinsics::Vsrvv<UnsignedType>, ElementType, vlmul, vta, vma>(
2323             args.dst, args.src1, args.src2);
2324       case Decoder::VOpIVvOpcode::kVsravv:
2325         return OpVectorvv<intrinsics::Vsrvv<SignedType>, ElementType, vlmul, vta, vma>(
2326             args.dst, args.src1, args.src2);
2327       case Decoder::VOpIVvOpcode::kVminuvv:
2328         return OpVectorvv<intrinsics::Vminvv<UnsignedType>, ElementType, vlmul, vta, vma>(
2329             args.dst, args.src1, args.src2);
2330       case Decoder::VOpIVvOpcode::kVminvv:
2331         return OpVectorvv<intrinsics::Vminvv<SignedType>, ElementType, vlmul, vta, vma>(
2332             args.dst, args.src1, args.src2);
2333       case Decoder::VOpIVvOpcode::kVmaxuvv:
2334         return OpVectorvv<intrinsics::Vmaxvv<UnsignedType>, ElementType, vlmul, vta, vma>(
2335             args.dst, args.src1, args.src2);
2336       case Decoder::VOpIVvOpcode::kVmaxvv:
2337         return OpVectorvv<intrinsics::Vmaxvv<SignedType>, ElementType, vlmul, vta, vma>(
2338             args.dst, args.src1, args.src2);
2339       case Decoder::VOpIVvOpcode::kVmergevv:
2340         if constexpr (std::is_same_v<decltype(vma), intrinsics::NoInactiveProcessing>) {
2341           if (args.src1 != 0) {
2342             return Undefined();
2343           }
2344           return OpVectorv<intrinsics::Vcopyv<ElementType>, ElementType, vlmul, vta, vma>(
2345               args.dst, args.src2);
2346         } else {
2347           return OpVectorv<intrinsics::Vcopyv<ElementType>,
2348                            ElementType,
2349                            vlmul,
2350                            vta,
2351                            // Always use "undisturbed" value from source register.
2352                            InactiveProcessing::kUndisturbed>(
2353               args.dst, args.src2, /*dst_mask=*/args.src1);
2354         }
2355       case Decoder::VOpIVvOpcode::kVnsrawv:
2356         return OpVectorNarrowwv<intrinsics::Vnsrwv<SignedType>, SignedType, vlmul, vta, vma>(
2357             args.dst, args.src1, args.src2);
2358       case Decoder::VOpIVvOpcode::kVnsrlwv:
2359         return OpVectorNarrowwv<intrinsics::Vnsrwv<UnsignedType>, UnsignedType, vlmul, vta, vma>(
2360             args.dst, args.src1, args.src2);
2361       case Decoder::VOpIVvOpcode::kVsmulvv:
2362         return OpVectorvv<intrinsics::Vsmulvv<SaturatingSignedType>,
2363                           ElementType,
2364                           vlmul,
2365                           vta,
2366                           vma,
2367                           kVxrm>(args.dst, args.src1, args.src2);
2368       case Decoder::VOpIVvOpcode::kVssrlvv:
2369         return OpVectorvv<intrinsics::Vssrvv<UnsignedType>, UnsignedType, vlmul, vta, vma, kVxrm>(
2370             args.dst, args.src1, args.src2);
2371       case Decoder::VOpIVvOpcode::kVssravv:
2372         return OpVectorvv<intrinsics::Vssrvv<SignedType>, SignedType, vlmul, vta, vma, kVxrm>(
2373             args.dst, args.src1, args.src2);
2374       case Decoder::VOpIVvOpcode::kVnclipuwv:
2375         return OpVectorNarrowwv<intrinsics::Vnclipwv<SaturatingUnsignedType>,
2376                                 SaturatingUnsignedType,
2377                                 vlmul,
2378                                 vta,
2379                                 vma,
2380                                 kVxrm>(args.dst, args.src1, args.src2);
2381       case Decoder::VOpIVvOpcode::kVnclipwv:
2382         return OpVectorNarrowwv<intrinsics::Vnclipwv<SaturatingSignedType>,
2383                                 SaturatingSignedType,
2384                                 vlmul,
2385                                 vta,
2386                                 vma,
2387                                 kVxrm>(args.dst, args.src1, args.src2);
2388       case Decoder::VOpIVvOpcode::kVwredsumuvs:
2389         return OpVectorvs<intrinsics::Vredsumvs<UnsignedType, WideType<UnsignedType>>,
2390                           UnsignedType,
2391                           WideType<UnsignedType>,
2392                           vlmul,
2393                           vta,
2394                           vma>(args.dst, Vec<UnsignedType{}>{args.src1}, args.src2);
2395       case Decoder::VOpIVvOpcode::kVwredsumvs:
2396         return OpVectorvs<intrinsics::Vredsumvs<SignedType, WideType<SignedType>>,
2397                           SignedType,
2398                           WideType<SignedType>,
2399                           vlmul,
2400                           vta,
2401                           vma>(args.dst, Vec<SignedType{}>{args.src1}, args.src2);
2402       default:
2403         Undefined();
2404     }
2405   }
2406 
2407   template <typename ElementType, VectorRegisterGroupMultiplier vlmul, TailProcessing vta, auto vma>
OpVector(const Decoder::VOpIVxArgs & args,Register arg2)2408   void OpVector(const Decoder::VOpIVxArgs& args, Register arg2) {
2409     using SignedType = berberis::SignedType<ElementType>;
2410     using UnsignedType = berberis::UnsignedType<ElementType>;
2411     using SaturatingSignedType = SaturatingType<SignedType>;
2412     using SaturatingUnsignedType = SaturatingType<UnsignedType>;
2413     // Keep cases sorted in opcode order to match RISC-V V manual.
2414     switch (args.opcode) {
2415       case Decoder::VOpIVxOpcode::kVaddvx:
2416         return OpVectorvx<intrinsics::Vaddvx<ElementType>, ElementType, vlmul, vta, vma>(
2417             args.dst, args.src1, MaybeTruncateTo<ElementType>(arg2));
2418       case Decoder::VOpIVxOpcode::kVsubvx:
2419         return OpVectorvx<intrinsics::Vsubvx<ElementType>, ElementType, vlmul, vta, vma>(
2420             args.dst, args.src1, MaybeTruncateTo<ElementType>(arg2));
2421       case Decoder::VOpIVxOpcode::kVrsubvx:
2422         return OpVectorvx<intrinsics::Vrsubvx<ElementType>, ElementType, vlmul, vta, vma>(
2423             args.dst, args.src1, MaybeTruncateTo<ElementType>(arg2));
2424       case Decoder::VOpIVxOpcode::kVandvx:
2425         return OpVectorvx<intrinsics::Vandvx<ElementType>, ElementType, vlmul, vta, vma>(
2426             args.dst, args.src1, MaybeTruncateTo<ElementType>(arg2));
2427       case Decoder::VOpIVxOpcode::kVorvx:
2428         return OpVectorvx<intrinsics::Vorvx<ElementType>, ElementType, vlmul, vta, vma>(
2429             args.dst, args.src1, MaybeTruncateTo<ElementType>(arg2));
2430       case Decoder::VOpIVxOpcode::kVxorvx:
2431         return OpVectorvx<intrinsics::Vxorvx<ElementType>, ElementType, vlmul, vta, vma>(
2432             args.dst, args.src1, MaybeTruncateTo<ElementType>(arg2));
2433       case Decoder::VOpIVxOpcode::kVrgathervx:
2434         return OpVectorGather<ElementType, vlmul, vta, vma>(
2435             args.dst, args.src1, [&arg2](size_t /*index*/) {
2436               return MaybeTruncateTo<ElementType>(arg2);
2437             });
2438       case Decoder::VOpIVxOpcode::kVadcvx:
2439         return OpVectorvxm<intrinsics::Vadcvx<ElementType>,
2440                            ElementType,
2441                            NumberOfRegistersInvolved(vlmul),
2442                            vta,
2443                            vma>(args.dst, args.src1, MaybeTruncateTo<ElementType>(arg2));
2444       case Decoder::VOpIVxOpcode::kVsbcvx:
2445         return OpVectorvxm<intrinsics::Vsbcvx<ElementType>,
2446                            ElementType,
2447                            NumberOfRegistersInvolved(vlmul),
2448                            vta,
2449                            vma>(args.dst, args.src1, MaybeTruncateTo<ElementType>(arg2));
2450       case Decoder::VOpIVxOpcode::kVmseqvx:
2451         return OpVectorToMaskvx<intrinsics::Vseqvx<ElementType>, ElementType, vlmul, vma>(
2452             args.dst, args.src1, MaybeTruncateTo<ElementType>(arg2));
2453       case Decoder::VOpIVxOpcode::kVmsnevx:
2454         return OpVectorToMaskvx<intrinsics::Vsnevx<ElementType>, ElementType, vlmul, vma>(
2455             args.dst, args.src1, MaybeTruncateTo<ElementType>(arg2));
2456       case Decoder::VOpIVxOpcode::kVmsltuvx:
2457         return OpVectorToMaskvx<intrinsics::Vsltvx<UnsignedType>, UnsignedType, vlmul, vma>(
2458             args.dst, args.src1, MaybeTruncateTo<UnsignedType>(arg2));
2459       case Decoder::VOpIVxOpcode::kVmsltvx:
2460         return OpVectorToMaskvx<intrinsics::Vsltvx<SignedType>, SignedType, vlmul, vma>(
2461             args.dst, args.src1, MaybeTruncateTo<SignedType>(arg2));
2462       case Decoder::VOpIVxOpcode::kVmsleuvx:
2463         return OpVectorToMaskvx<intrinsics::Vslevx<UnsignedType>, UnsignedType, vlmul, vma>(
2464             args.dst, args.src1, MaybeTruncateTo<UnsignedType>(arg2));
2465       case Decoder::VOpIVxOpcode::kVmslevx:
2466         return OpVectorToMaskvx<intrinsics::Vslevx<SignedType>, SignedType, vlmul, vma>(
2467             args.dst, args.src1, MaybeTruncateTo<SignedType>(arg2));
2468       case Decoder::VOpIVxOpcode::kVmsgtuvx:
2469         return OpVectorToMaskvx<intrinsics::Vsgtvx<UnsignedType>, UnsignedType, vlmul, vma>(
2470             args.dst, args.src1, MaybeTruncateTo<UnsignedType>(arg2));
2471       case Decoder::VOpIVxOpcode::kVmsgtvx:
2472         return OpVectorToMaskvx<intrinsics::Vsgtvx<SignedType>, SignedType, vlmul, vma>(
2473             args.dst, args.src1, MaybeTruncateTo<SignedType>(arg2));
2474       case Decoder::VOpIVxOpcode::kVsadduvx:
2475         return OpVectorvx<intrinsics::Vaddvx<SaturatingUnsignedType>,
2476                           SaturatingUnsignedType,
2477                           vlmul,
2478                           vta,
2479                           vma>(args.dst, args.src1, MaybeTruncateTo<ElementType>(arg2));
2480       case Decoder::VOpIVxOpcode::kVsaddvx:
2481         return OpVectorvx<intrinsics::Vaddvx<SaturatingSignedType>,
2482                           SaturatingSignedType,
2483                           vlmul,
2484                           vta,
2485                           vma>(args.dst, args.src1, MaybeTruncateTo<ElementType>(arg2));
2486       case Decoder::VOpIVxOpcode::kVssubuvx:
2487         return OpVectorvx<intrinsics::Vsubvx<SaturatingUnsignedType>,
2488                           SaturatingUnsignedType,
2489                           vlmul,
2490                           vta,
2491                           vma>(args.dst, args.src1, MaybeTruncateTo<ElementType>(arg2));
2492       case Decoder::VOpIVxOpcode::kVssubvx:
2493         return OpVectorvx<intrinsics::Vsubvx<SaturatingSignedType>,
2494                           SaturatingSignedType,
2495                           vlmul,
2496                           vta,
2497                           vma>(args.dst, args.src1, MaybeTruncateTo<ElementType>(arg2));
2498       case Decoder::VOpIVxOpcode::kVsllvx:
2499         return OpVectorvx<intrinsics::Vslvx<ElementType>, ElementType, vlmul, vta, vma>(
2500             args.dst, args.src1, MaybeTruncateTo<ElementType>(arg2));
2501       case Decoder::VOpIVxOpcode::kVsrlvx:
2502         return OpVectorvx<intrinsics::Vsrvx<UnsignedType>, UnsignedType, vlmul, vta, vma>(
2503             args.dst, args.src1, MaybeTruncateTo<UnsignedType>(arg2));
2504       case Decoder::VOpIVxOpcode::kVsravx:
2505         return OpVectorvx<intrinsics::Vsrvx<SignedType>, SignedType, vlmul, vta, vma>(
2506             args.dst, args.src1, MaybeTruncateTo<SignedType>(arg2));
2507       case Decoder::VOpIVxOpcode::kVminuvx:
2508         return OpVectorvx<intrinsics::Vminvx<UnsignedType>, UnsignedType, vlmul, vta, vma>(
2509             args.dst, args.src1, MaybeTruncateTo<UnsignedType>(arg2));
2510       case Decoder::VOpIVxOpcode::kVminvx:
2511         return OpVectorvx<intrinsics::Vminvx<SignedType>, SignedType, vlmul, vta, vma>(
2512             args.dst, args.src1, MaybeTruncateTo<SignedType>(arg2));
2513       case Decoder::VOpIVxOpcode::kVmaxuvx:
2514         return OpVectorvx<intrinsics::Vmaxvx<UnsignedType>, UnsignedType, vlmul, vta, vma>(
2515             args.dst, args.src1, MaybeTruncateTo<UnsignedType>(arg2));
2516       case Decoder::VOpIVxOpcode::kVmaxvx:
2517         return OpVectorvx<intrinsics::Vmaxvx<SignedType>, SignedType, vlmul, vta, vma>(
2518             args.dst, args.src1, MaybeTruncateTo<SignedType>(arg2));
2519       case Decoder::VOpIVxOpcode::kVmergevx:
2520         if constexpr (std::is_same_v<decltype(vma), intrinsics::NoInactiveProcessing>) {
2521           if (args.src1 != 0) {
2522             return Undefined();
2523           }
2524           return OpVectorx<intrinsics::Vcopyx<ElementType>, ElementType, vlmul, vta, vma>(
2525               args.dst, MaybeTruncateTo<ElementType>(arg2));
2526         } else {
2527           return OpVectorx<intrinsics::Vcopyx<ElementType>,
2528                            ElementType,
2529                            vlmul,
2530                            vta,
2531                            // Always use "undisturbed" value from source register.
2532                            InactiveProcessing::kUndisturbed>(
2533               args.dst, MaybeTruncateTo<ElementType>(arg2), /*dst_mask=*/args.src1);
2534         }
2535       case Decoder::VOpIVxOpcode::kVnsrawx:
2536         return OpVectorNarrowwx<intrinsics::Vnsrwx<SignedType>, SignedType, vlmul, vta, vma>(
2537             args.dst, args.src1, MaybeTruncateTo<SignedType>(arg2));
2538       case Decoder::VOpIVxOpcode::kVnsrlwx:
2539         return OpVectorNarrowwx<intrinsics::Vnsrwx<UnsignedType>, UnsignedType, vlmul, vta, vma>(
2540             args.dst, args.src1, MaybeTruncateTo<UnsignedType>(arg2));
2541       case Decoder::VOpIVxOpcode::kVslideupvx:
2542         return OpVectorslideup<ElementType, vlmul, vta, vma>(
2543             args.dst, args.src1, MaybeTruncateTo<UnsignedType>(arg2));
2544       case Decoder::VOpIVxOpcode::kVslidedownvx:
2545         return OpVectorslidedown<ElementType, vlmul, vta, vma>(
2546             args.dst, args.src1, MaybeTruncateTo<UnsignedType>(arg2));
2547       case Decoder::VOpIVxOpcode::kVsmulvx:
2548         return OpVectorvx<intrinsics::Vsmulvx<SaturatingSignedType>,
2549                           SaturatingSignedType,
2550                           vlmul,
2551                           vta,
2552                           vma,
2553                           kVxrm>(args.dst, args.src1, MaybeTruncateTo<SignedType>(arg2));
2554       case Decoder::VOpIVxOpcode::kVssrlvx:
2555         return OpVectorvx<intrinsics::Vssrvx<UnsignedType>, UnsignedType, vlmul, vta, vma, kVxrm>(
2556             args.dst, args.src1, MaybeTruncateTo<UnsignedType>(arg2));
2557       case Decoder::VOpIVxOpcode::kVssravx:
2558         return OpVectorvx<intrinsics::Vssrvx<SignedType>, SignedType, vlmul, vta, vma, kVxrm>(
2559             args.dst, args.src1, MaybeTruncateTo<SignedType>(arg2));
2560       case Decoder::VOpIVxOpcode::kVnclipuwx:
2561         return OpVectorNarrowwx<intrinsics::Vnclipwx<SaturatingUnsignedType>,
2562                                 SaturatingUnsignedType,
2563                                 vlmul,
2564                                 vta,
2565                                 vma,
2566                                 kVxrm>(args.dst, args.src1, MaybeTruncateTo<UnsignedType>(arg2));
2567       case Decoder::VOpIVxOpcode::kVnclipwx:
2568         return OpVectorNarrowwx<intrinsics::Vnclipwx<SaturatingSignedType>,
2569                                 SaturatingSignedType,
2570                                 vlmul,
2571                                 vta,
2572                                 vma,
2573                                 kVxrm>(args.dst, args.src1, MaybeTruncateTo<SignedType>(arg2));
2574       default:
2575         Undefined();
2576     }
2577   }
2578 
2579   template <typename ElementType, VectorRegisterGroupMultiplier vlmul, TailProcessing vta, auto vma>
OpVector(const Decoder::VOpMVvArgs & args)2580   void OpVector(const Decoder::VOpMVvArgs& args) {
2581     using SignedType = berberis::SignedType<ElementType>;
2582     using UnsignedType = berberis::UnsignedType<ElementType>;
2583     if constexpr (std::is_same_v<decltype(vma), intrinsics::NoInactiveProcessing>) {
2584       // Keep cases sorted in opcode order to match RISC-V V manual.
2585       switch (args.opcode) {
2586         case Decoder::VOpMVvOpcode::kVmandnmm:
2587           return OpVectormm<[](SIMD128Register lhs, SIMD128Register rhs) { return lhs & ~rhs; }>(
2588               args.dst, args.src1, args.src2);
2589         case Decoder::VOpMVvOpcode::kVmandmm:
2590           return OpVectormm<[](SIMD128Register lhs, SIMD128Register rhs) { return lhs & rhs; }>(
2591               args.dst, args.src1, args.src2);
2592         case Decoder::VOpMVvOpcode::kVmormm:
2593           return OpVectormm<[](SIMD128Register lhs, SIMD128Register rhs) { return lhs | rhs; }>(
2594               args.dst, args.src1, args.src2);
2595         case Decoder::VOpMVvOpcode::kVmxormm:
2596           return OpVectormm<[](SIMD128Register lhs, SIMD128Register rhs) { return lhs ^ rhs; }>(
2597               args.dst, args.src1, args.src2);
2598         case Decoder::VOpMVvOpcode::kVmornmm:
2599           return OpVectormm<[](SIMD128Register lhs, SIMD128Register rhs) { return lhs | ~rhs; }>(
2600               args.dst, args.src1, args.src2);
2601         case Decoder::VOpMVvOpcode::kVmnandmm:
2602           return OpVectormm<[](SIMD128Register lhs, SIMD128Register rhs) { return ~(lhs & rhs); }>(
2603               args.dst, args.src1, args.src2);
2604         case Decoder::VOpMVvOpcode::kVmnormm:
2605           return OpVectormm<[](SIMD128Register lhs, SIMD128Register rhs) { return ~(lhs | rhs); }>(
2606               args.dst, args.src1, args.src2);
2607         case Decoder::VOpMVvOpcode::kVmxnormm:
2608           return OpVectormm<[](SIMD128Register lhs, SIMD128Register rhs) { return ~(lhs ^ rhs); }>(
2609               args.dst, args.src1, args.src2);
2610         default:;  // Do nothing: handled in next switch.
2611       }
2612     }
2613     // Keep cases sorted in opcode order to match RISC-V V manual.
2614     switch (args.opcode) {
2615       case Decoder::VOpMVvOpcode::kVredsumvs:
2616         return OpVectorvs<intrinsics::Vredsumvs<ElementType>, ElementType, vlmul, vta, vma>(
2617             args.dst, Vec<ElementType{}>{args.src1}, args.src2);
2618       case Decoder::VOpMVvOpcode::kVredandvs:
2619         return OpVectorvs<intrinsics::Vredandvs<ElementType>, ElementType, vlmul, vta, vma>(
2620             args.dst, Vec<~ElementType{}>{args.src1}, args.src2);
2621       case Decoder::VOpMVvOpcode::kVredorvs:
2622         return OpVectorvs<intrinsics::Vredorvs<ElementType>, ElementType, vlmul, vta, vma>(
2623             args.dst, Vec<ElementType{}>{args.src1}, args.src2);
2624       case Decoder::VOpMVvOpcode::kVredxorvs:
2625         return OpVectorvs<intrinsics::Vredxorvs<ElementType>, ElementType, vlmul, vta, vma>(
2626             args.dst, Vec<ElementType{}>{args.src1}, args.src2);
2627       case Decoder::VOpMVvOpcode::kVredminuvs:
2628         return OpVectorvs<intrinsics::Vredminvs<UnsignedType>, UnsignedType, vlmul, vta, vma>(
2629             args.dst,
2630             Vec<UnsignedType{std::numeric_limits<typename UnsignedType::BaseType>::max()}>{
2631                 args.src1},
2632             args.src2);
2633       case Decoder::VOpMVvOpcode::kVredminvs:
2634         return OpVectorvs<intrinsics::Vredminvs<SignedType>, SignedType, vlmul, vta, vma>(
2635             args.dst,
2636             Vec<SignedType{std::numeric_limits<typename SignedType::BaseType>::max()}>{args.src1},
2637             args.src2);
2638       case Decoder::VOpMVvOpcode::kVredmaxuvs:
2639         return OpVectorvs<intrinsics::Vredmaxvs<UnsignedType>, UnsignedType, vlmul, vta, vma>(
2640             args.dst, Vec<UnsignedType{}>{args.src1}, args.src2);
2641       case Decoder::VOpMVvOpcode::kVredmaxvs:
2642         return OpVectorvs<intrinsics::Vredmaxvs<SignedType>, SignedType, vlmul, vta, vma>(
2643             args.dst,
2644             Vec<SignedType{std::numeric_limits<typename SignedType::BaseType>::min()}>{args.src1},
2645             args.src2);
2646       case Decoder::VOpMVvOpcode::kVaadduvv:
2647         return OpVectorvv<intrinsics::Vaaddvv<UnsignedType>, UnsignedType, vlmul, vta, vma, kVxrm>(
2648             args.dst, args.src1, args.src2);
2649       case Decoder::VOpMVvOpcode::kVaaddvv:
2650         return OpVectorvv<intrinsics::Vaaddvv<SignedType>, SignedType, vlmul, vta, vma, kVxrm>(
2651             args.dst, args.src1, args.src2);
2652       case Decoder::VOpMVvOpcode::kVasubuvv:
2653         return OpVectorvv<intrinsics::Vasubvv<UnsignedType>, UnsignedType, vlmul, vta, vma, kVxrm>(
2654             args.dst, args.src1, args.src2);
2655       case Decoder::VOpMVvOpcode::kVasubvv:
2656         return OpVectorvv<intrinsics::Vasubvv<SignedType>, SignedType, vlmul, vta, vma, kVxrm>(
2657             args.dst, args.src1, args.src2);
2658       case Decoder::VOpMVvOpcode::kVWXUnary0:
2659         switch (args.vwxunary0_opcode) {
2660           case Decoder::VWXUnary0Opcode::kVmvxs:
2661             if constexpr (!std::is_same_v<decltype(vma), intrinsics::NoInactiveProcessing>) {
2662               return Undefined();
2663             }
2664             return OpVectorVmvxs<SignedType>(args.dst, args.src1);
2665           case Decoder::VWXUnary0Opcode::kVcpopm:
2666             return OpVectorVWXUnary0<intrinsics::Vcpopm<>, vma>(args.dst, args.src1);
2667           case Decoder::VWXUnary0Opcode::kVfirstm:
2668             return OpVectorVWXUnary0<intrinsics::Vfirstm<>, vma>(args.dst, args.src1);
2669           default:
2670             return Undefined();
2671         }
2672       case Decoder::VOpMVvOpcode::kVFUnary0:
2673         switch (args.vxunary0_opcode) {
2674           case Decoder::VXUnary0Opcode::kVzextvf2m:
2675             if constexpr (sizeof(UnsignedType) >= 2) {
2676               return OpVectorVXUnary0<intrinsics::Vextf2<UnsignedType>,
2677                                       UnsignedType,
2678                                       2,
2679                                       vlmul,
2680                                       vta,
2681                                       vma>(args.dst, args.src1);
2682             }
2683             break;
2684           case Decoder::VXUnary0Opcode::kVsextvf2m:
2685             if constexpr (sizeof(SignedType) >= 2) {
2686               return OpVectorVXUnary0<intrinsics::Vextf2<SignedType>,
2687                                       SignedType,
2688                                       2,
2689                                       vlmul,
2690                                       vta,
2691                                       vma>(args.dst, args.src1);
2692             }
2693             break;
2694           case Decoder::VXUnary0Opcode::kVzextvf4m:
2695             if constexpr (sizeof(UnsignedType) >= 4) {
2696               return OpVectorVXUnary0<intrinsics::Vextf4<UnsignedType>,
2697                                       UnsignedType,
2698                                       4,
2699                                       vlmul,
2700                                       vta,
2701                                       vma>(args.dst, args.src1);
2702             }
2703             break;
2704           case Decoder::VXUnary0Opcode::kVsextvf4m:
2705             if constexpr (sizeof(SignedType) >= 4) {
2706               return OpVectorVXUnary0<intrinsics::Vextf4<SignedType>,
2707                                       SignedType,
2708                                       4,
2709                                       vlmul,
2710                                       vta,
2711                                       vma>(args.dst, args.src1);
2712             }
2713             break;
2714           case Decoder::VXUnary0Opcode::kVzextvf8m:
2715             if constexpr (sizeof(UnsignedType) >= 8) {
2716               return OpVectorVXUnary0<intrinsics::Vextf8<UnsignedType>,
2717                                       UnsignedType,
2718                                       8,
2719                                       vlmul,
2720                                       vta,
2721                                       vma>(args.dst, args.src1);
2722             }
2723             break;
2724           case Decoder::VXUnary0Opcode::kVsextvf8m:
2725             if constexpr (sizeof(SignedType) >= 8) {
2726               return OpVectorVXUnary0<intrinsics::Vextf8<SignedType>,
2727                                       SignedType,
2728                                       8,
2729                                       vlmul,
2730                                       vta,
2731                                       vma>(args.dst, args.src1);
2732             }
2733             break;
2734           case Decoder::VXUnary0Opcode::kVbrev8v:
2735             return OpVectorv<intrinsics::Vbrev8v<ElementType>, ElementType, vlmul, vta, vma>(
2736                 args.dst, args.src1);
2737             break;
2738           default:
2739             return Undefined();
2740         }
2741         return Undefined();
2742       case Decoder::VOpMVvOpcode::kVMUnary0:
2743         switch (args.vmunary0_opcode) {
2744           case Decoder::VMUnary0Opcode::kVmsbfm:
2745             return OpVectorVMUnary0<intrinsics::Vmsbfm<>, vma>(args.dst, args.src1);
2746           case Decoder::VMUnary0Opcode::kVmsofm:
2747             return OpVectorVMUnary0<intrinsics::Vmsofm<>, vma>(args.dst, args.src1);
2748           case Decoder::VMUnary0Opcode::kVmsifm:
2749             return OpVectorVMUnary0<intrinsics::Vmsifm<>, vma>(args.dst, args.src1);
2750           case Decoder::VMUnary0Opcode::kViotam:
2751             return OpVectorViotam<ElementType, vlmul, vta, vma>(args.dst, args.src1);
2752           case Decoder::VMUnary0Opcode::kVidv:
2753             if (args.src1) {
2754               return Undefined();
2755             }
2756             return OpVectorVidv<ElementType, vlmul, vta, vma>(args.dst);
2757           default:
2758             return Undefined();
2759         }
2760       case Decoder::VOpMVvOpcode::kVdivuvv:
2761         return OpVectorvv<intrinsics::Vdivvv<UnsignedType>, UnsignedType, vlmul, vta, vma>(
2762             args.dst, args.src1, args.src2);
2763       case Decoder::VOpMVvOpcode::kVdivvv:
2764         return OpVectorvv<intrinsics::Vdivvv<SignedType>, SignedType, vlmul, vta, vma>(
2765             args.dst, args.src1, args.src2);
2766       case Decoder::VOpMVvOpcode::kVremuvv:
2767         return OpVectorvv<intrinsics::Vremvv<UnsignedType>, UnsignedType, vlmul, vta, vma>(
2768             args.dst, args.src1, args.src2);
2769       case Decoder::VOpMVvOpcode::kVremvv:
2770         return OpVectorvv<intrinsics::Vremvv<SignedType>, SignedType, vlmul, vta, vma>(
2771             args.dst, args.src1, args.src2);
2772       case Decoder::VOpMVvOpcode::kVmulhuvv:
2773         return OpVectorvv<intrinsics::Vmulhvv<UnsignedType>, UnsignedType, vlmul, vta, vma>(
2774             args.dst, args.src1, args.src2);
2775       case Decoder::VOpMVvOpcode::kVmulvv:
2776         return OpVectorvv<intrinsics::Vmulvv<SignedType>, SignedType, vlmul, vta, vma>(
2777             args.dst, args.src1, args.src2);
2778       case Decoder::VOpMVvOpcode::kVmulhsuvv:
2779         return OpVectorvv<intrinsics::Vmulhsuvv<SignedType>, SignedType, vlmul, vta, vma>(
2780             args.dst, args.src1, args.src2);
2781       case Decoder::VOpMVvOpcode::kVmulhvv:
2782         return OpVectorvv<intrinsics::Vmulhvv<SignedType>, SignedType, vlmul, vta, vma>(
2783             args.dst, args.src1, args.src2);
2784       case Decoder::VOpMVvOpcode::kVmaddvv:
2785         return OpVectorvvv<intrinsics::Vmaddvv<ElementType>, ElementType, vlmul, vta, vma>(
2786             args.dst, args.src1, args.src2);
2787       case Decoder::VOpMVvOpcode::kVnmsubvv:
2788         return OpVectorvvv<intrinsics::Vnmsubvv<ElementType>, ElementType, vlmul, vta, vma>(
2789             args.dst, args.src1, args.src2);
2790       case Decoder::VOpMVvOpcode::kVmaccvv:
2791         return OpVectorvvv<intrinsics::Vmaccvv<ElementType>, ElementType, vlmul, vta, vma>(
2792             args.dst, args.src1, args.src2);
2793       case Decoder::VOpMVvOpcode::kVnmsacvv:
2794         return OpVectorvvv<intrinsics::Vnmsacvv<ElementType>, ElementType, vlmul, vta, vma>(
2795             args.dst, args.src1, args.src2);
2796       case Decoder::VOpMVvOpcode::kVwadduvv:
2797         return OpVectorWidenvv<intrinsics::Vwaddvv<UnsignedType>, UnsignedType, vlmul, vta, vma>(
2798             args.dst, args.src1, args.src2);
2799       case Decoder::VOpMVvOpcode::kVwaddvv:
2800         return OpVectorWidenvv<intrinsics::Vwaddvv<SignedType>, SignedType, vlmul, vta, vma>(
2801             args.dst, args.src1, args.src2);
2802       case Decoder::VOpMVvOpcode::kVwsubuvv:
2803         return OpVectorWidenvv<intrinsics::Vwsubvv<UnsignedType>, UnsignedType, vlmul, vta, vma>(
2804             args.dst, args.src1, args.src2);
2805       case Decoder::VOpMVvOpcode::kVwsubvv:
2806         return OpVectorWidenvv<intrinsics::Vwsubvv<SignedType>, SignedType, vlmul, vta, vma>(
2807             args.dst, args.src1, args.src2);
2808       case Decoder::VOpMVvOpcode::kVwadduwv:
2809         return OpVectorWidenwv<intrinsics::Vwaddwv<UnsignedType>, UnsignedType, vlmul, vta, vma>(
2810             args.dst, args.src1, args.src2);
2811       case Decoder::VOpMVvOpcode::kVwaddwv:
2812         return OpVectorWidenwv<intrinsics::Vwaddwv<SignedType>, SignedType, vlmul, vta, vma>(
2813             args.dst, args.src1, args.src2);
2814       case Decoder::VOpMVvOpcode::kVwsubuwv:
2815         return OpVectorWidenwv<intrinsics::Vwsubwv<UnsignedType>, UnsignedType, vlmul, vta, vma>(
2816             args.dst, args.src1, args.src2);
2817       case Decoder::VOpMVvOpcode::kVwsubwv:
2818         return OpVectorWidenwv<intrinsics::Vwsubwv<SignedType>, SignedType, vlmul, vta, vma>(
2819             args.dst, args.src1, args.src2);
2820       case Decoder::VOpMVvOpcode::kVwmuluvv:
2821         return OpVectorWidenvv<intrinsics::Vwmulvv<UnsignedType>, UnsignedType, vlmul, vta, vma>(
2822             args.dst, args.src1, args.src2);
2823       case Decoder::VOpMVvOpcode::kVwmulsuvv:
2824         return OpVectorWidenvv<intrinsics::Vwmulsuvv<ElementType>, ElementType, vlmul, vta, vma>(
2825             args.dst, args.src1, args.src2);
2826       case Decoder::VOpMVvOpcode::kVwmulvv:
2827         return OpVectorWidenvv<intrinsics::Vwmulvv<SignedType>, SignedType, vlmul, vta, vma>(
2828             args.dst, args.src1, args.src2);
2829       case Decoder::VOpMVvOpcode::kVwmaccuvv:
2830         return OpVectorWidenvvw<intrinsics::Vwmaccvv<UnsignedType>, UnsignedType, vlmul, vta, vma>(
2831             args.dst, args.src1, args.src2);
2832       case Decoder::VOpMVvOpcode::kVwmaccvv:
2833         return OpVectorWidenvvw<intrinsics::Vwmaccvv<SignedType>, SignedType, vlmul, vta, vma>(
2834             args.dst, args.src1, args.src2);
2835       case Decoder::VOpMVvOpcode::kVwmaccsuvv:
2836         return OpVectorWidenvvw<intrinsics::Vwmaccsuvv<ElementType>, ElementType, vlmul, vta, vma>(
2837             args.dst, args.src1, args.src2);
2838       default:
2839         Undefined();
2840     }
2841   }
2842 
2843   template <typename ElementType, VectorRegisterGroupMultiplier vlmul, TailProcessing vta, auto vma>
OpVector(const Decoder::VOpMVxArgs & args,Register arg2)2844   void OpVector(const Decoder::VOpMVxArgs& args, Register arg2) {
2845     using SignedType = berberis::SignedType<ElementType>;
2846     using UnsignedType = berberis::UnsignedType<ElementType>;
2847     // Keep cases sorted in opcode order to match RISC-V V manual.
2848     switch (args.opcode) {
2849       case Decoder::VOpMVxOpcode::kVaadduvx:
2850         return OpVectorvx<intrinsics::Vaaddvx<UnsignedType>, UnsignedType, vlmul, vta, vma, kVxrm>(
2851             args.dst, args.src1, MaybeTruncateTo<UnsignedType>(arg2));
2852       case Decoder::VOpMVxOpcode::kVaaddvx:
2853         return OpVectorvx<intrinsics::Vaaddvx<SignedType>, SignedType, vlmul, vta, vma, kVxrm>(
2854             args.dst, args.src1, MaybeTruncateTo<SignedType>(arg2));
2855       case Decoder::VOpMVxOpcode::kVasubuvx:
2856         return OpVectorvx<intrinsics::Vasubvx<UnsignedType>, UnsignedType, vlmul, vta, vma, kVxrm>(
2857             args.dst, args.src1, MaybeTruncateTo<UnsignedType>(arg2));
2858       case Decoder::VOpMVxOpcode::kVasubvx:
2859         return OpVectorvx<intrinsics::Vasubvx<SignedType>, SignedType, vlmul, vta, vma, kVxrm>(
2860             args.dst, args.src1, MaybeTruncateTo<SignedType>(arg2));
2861       case Decoder::VOpMVxOpcode::kVslide1upvx:
2862         return OpVectorslide1up<SignedType, vlmul, vta, vma>(
2863             args.dst, args.src1, MaybeTruncateTo<SignedType>(arg2));
2864       case Decoder::VOpMVxOpcode::kVslide1downvx:
2865         return OpVectorslide1down<SignedType, vlmul, vta, vma>(
2866             args.dst, args.src1, MaybeTruncateTo<SignedType>(arg2));
2867       case Decoder::VOpMVxOpcode::kVRXUnary0:
2868         switch (args.vrxunary0_opcode) {
2869           case Decoder::VRXUnary0Opcode::kVmvsx:
2870             if constexpr (!std::is_same_v<decltype(vma), intrinsics::NoInactiveProcessing>) {
2871               return Undefined();
2872             }
2873             return OpVectorVmvsx<SignedType, vta>(args.dst, MaybeTruncateTo<SignedType>(arg2));
2874           default:
2875             return Undefined();
2876         }
2877       case Decoder::VOpMVxOpcode::kVmulhuvx:
2878         return OpVectorvx<intrinsics::Vmulhvx<UnsignedType>, UnsignedType, vlmul, vta, vma>(
2879             args.dst, args.src1, MaybeTruncateTo<UnsignedType>(arg2));
2880       case Decoder::VOpMVxOpcode::kVmulvx:
2881         return OpVectorvx<intrinsics::Vmulvx<SignedType>, SignedType, vlmul, vta, vma>(
2882             args.dst, args.src1, MaybeTruncateTo<SignedType>(arg2));
2883       case Decoder::VOpMVxOpcode::kVdivuvx:
2884         return OpVectorvx<intrinsics::Vdivvx<UnsignedType>, UnsignedType, vlmul, vta, vma>(
2885             args.dst, args.src1, MaybeTruncateTo<UnsignedType>(arg2));
2886       case Decoder::VOpMVxOpcode::kVdivvx:
2887         return OpVectorvx<intrinsics::Vdivvx<SignedType>, SignedType, vlmul, vta, vma>(
2888             args.dst, args.src1, MaybeTruncateTo<SignedType>(arg2));
2889       case Decoder::VOpMVxOpcode::kVremuvx:
2890         return OpVectorvx<intrinsics::Vremvx<UnsignedType>, UnsignedType, vlmul, vta, vma>(
2891             args.dst, args.src1, MaybeTruncateTo<UnsignedType>(arg2));
2892       case Decoder::VOpMVxOpcode::kVremvx:
2893         return OpVectorvx<intrinsics::Vremvx<SignedType>, SignedType, vlmul, vta, vma>(
2894             args.dst, args.src1, MaybeTruncateTo<SignedType>(arg2));
2895       case Decoder::VOpMVxOpcode::kVmulhsuvx:
2896         return OpVectorvx<intrinsics::Vmulhsuvx<UnsignedType>, UnsignedType, vlmul, vta, vma>(
2897             args.dst, args.src1, MaybeTruncateTo<UnsignedType>(arg2));
2898       case Decoder::VOpMVxOpcode::kVmulhvx:
2899         return OpVectorvx<intrinsics::Vmulhvx<SignedType>, SignedType, vlmul, vta, vma>(
2900             args.dst, args.src1, MaybeTruncateTo<SignedType>(arg2));
2901       case Decoder::VOpMVxOpcode::kVmaddvx:
2902         return OpVectorvxv<intrinsics::Vmaddvx<ElementType>, ElementType, vlmul, vta, vma>(
2903             args.dst, args.src1, MaybeTruncateTo<ElementType>(arg2));
2904       case Decoder::VOpMVxOpcode::kVnmsubvx:
2905         return OpVectorvxv<intrinsics::Vnmsubvx<ElementType>, ElementType, vlmul, vta, vma>(
2906             args.dst, args.src1, MaybeTruncateTo<ElementType>(arg2));
2907       case Decoder::VOpMVxOpcode::kVmaccvx:
2908         return OpVectorvxv<intrinsics::Vmaccvx<ElementType>, ElementType, vlmul, vta, vma>(
2909             args.dst, args.src1, MaybeTruncateTo<ElementType>(arg2));
2910       case Decoder::VOpMVxOpcode::kVnmsacvx:
2911         return OpVectorvxv<intrinsics::Vnmsacvx<ElementType>, ElementType, vlmul, vta, vma>(
2912             args.dst, args.src1, MaybeTruncateTo<ElementType>(arg2));
2913       case Decoder::VOpMVxOpcode::kVwadduvx:
2914         return OpVectorWidenvx<intrinsics::Vwaddvx<UnsignedType>, UnsignedType, vlmul, vta, vma>(
2915             args.dst, args.src1, MaybeTruncateTo<UnsignedType>(arg2));
2916       case Decoder::VOpMVxOpcode::kVwaddvx:
2917         return OpVectorWidenvx<intrinsics::Vwaddvx<SignedType>, SignedType, vlmul, vta, vma>(
2918             args.dst, args.src1, MaybeTruncateTo<SignedType>(arg2));
2919       case Decoder::VOpMVxOpcode::kVwsubuvx:
2920         return OpVectorWidenvx<intrinsics::Vwsubvx<UnsignedType>, UnsignedType, vlmul, vta, vma>(
2921             args.dst, args.src1, MaybeTruncateTo<UnsignedType>(arg2));
2922       case Decoder::VOpMVxOpcode::kVwsubvx:
2923         return OpVectorWidenvx<intrinsics::Vwsubvx<SignedType>, SignedType, vlmul, vta, vma>(
2924             args.dst, args.src1, MaybeTruncateTo<SignedType>(arg2));
2925       case Decoder::VOpMVxOpcode::kVwadduwx:
2926         return OpVectorWidenwx<intrinsics::Vwaddwx<UnsignedType>, UnsignedType, vlmul, vta, vma>(
2927             args.dst, args.src1, MaybeTruncateTo<UnsignedType>(arg2));
2928       case Decoder::VOpMVxOpcode::kVwaddwx:
2929         return OpVectorWidenwx<intrinsics::Vwaddwx<SignedType>, SignedType, vlmul, vta, vma>(
2930             args.dst, args.src1, MaybeTruncateTo<SignedType>(arg2));
2931       case Decoder::VOpMVxOpcode::kVwsubuwx:
2932         return OpVectorWidenwx<intrinsics::Vwsubwx<UnsignedType>, UnsignedType, vlmul, vta, vma>(
2933             args.dst, args.src1, MaybeTruncateTo<UnsignedType>(arg2));
2934       case Decoder::VOpMVxOpcode::kVwsubwx:
2935         return OpVectorWidenwx<intrinsics::Vwsubwx<SignedType>, SignedType, vlmul, vta, vma>(
2936             args.dst, args.src1, MaybeTruncateTo<SignedType>(arg2));
2937       case Decoder::VOpMVxOpcode::kVwmuluvx:
2938         return OpVectorWidenvx<intrinsics::Vwmulvx<UnsignedType>, UnsignedType, vlmul, vta, vma>(
2939             args.dst, args.src1, MaybeTruncateTo<UnsignedType>(arg2));
2940       case Decoder::VOpMVxOpcode::kVwmulsuvx:
2941         return OpVectorWidenvx<intrinsics::Vwmulsuvx<ElementType>, ElementType, vlmul, vta, vma>(
2942             args.dst, args.src1, MaybeTruncateTo<ElementType>(arg2));
2943       case Decoder::VOpMVxOpcode::kVwmulvx:
2944         return OpVectorWidenvx<intrinsics::Vwmulvx<SignedType>, SignedType, vlmul, vta, vma>(
2945             args.dst, args.src1, MaybeTruncateTo<SignedType>(arg2));
2946       case Decoder::VOpMVxOpcode::kVwmaccuvx:
2947         return OpVectorWidenvxw<intrinsics::Vwmaccvx<UnsignedType>, UnsignedType, vlmul, vta, vma>(
2948             args.dst, args.src1, MaybeTruncateTo<UnsignedType>(arg2));
2949       case Decoder::VOpMVxOpcode::kVwmaccvx:
2950         return OpVectorWidenvxw<intrinsics::Vwmaccvx<SignedType>, SignedType, vlmul, vta, vma>(
2951             args.dst, args.src1, MaybeTruncateTo<SignedType>(arg2));
2952       case Decoder::VOpMVxOpcode::kVwmaccusvx:
2953         return OpVectorWidenvxw<intrinsics::Vwmaccusvx<ElementType>, ElementType, vlmul, vta, vma>(
2954             args.dst, args.src1, MaybeTruncateTo<ElementType>(arg2));
2955       case Decoder::VOpMVxOpcode::kVwmaccsuvx:
2956         return OpVectorWidenvxw<intrinsics::Vwmaccsuvx<ElementType>, ElementType, vlmul, vta, vma>(
2957             args.dst, args.src1, MaybeTruncateTo<ElementType>(arg2));
2958       default:
2959         Undefined();
2960     }
2961   }
2962 
2963   template <typename DataElementType,
2964             VectorRegisterGroupMultiplier vlmul,
2965             typename IndexElementType,
2966             size_t kSegmentSize,
2967             size_t kIndexRegistersInvolved,
2968             TailProcessing vta,
2969             auto vma>
OpVector(const Decoder::VStoreIndexedArgs & args,Register src)2970   void OpVector(const Decoder::VStoreIndexedArgs& args, Register src) {
2971     return OpVector<DataElementType,
2972                     kSegmentSize,
2973                     NumberOfRegistersInvolved(vlmul),
2974                     IndexElementType,
2975                     kIndexRegistersInvolved,
2976                     !std::is_same_v<decltype(vma), intrinsics::NoInactiveProcessing>>(args, src);
2977   }
2978 
2979   template <typename DataElementType,
2980             size_t kSegmentSize,
2981             size_t kNumRegistersInGroup,
2982             typename IndexElementType,
2983             size_t kIndexRegistersInvolved,
2984             bool kUseMasking>
OpVector(const Decoder::VStoreIndexedArgs & args,Register src)2985   void OpVector(const Decoder::VStoreIndexedArgs& args, Register src) {
2986     if (!IsAligned<kIndexRegistersInvolved>(args.idx)) {
2987       return Undefined();
2988     }
2989     constexpr size_t kElementsCount = sizeof(SIMD128Register) / sizeof(IndexElementType);
2990     alignas(alignof(SIMD128Register))
2991         IndexElementType indexes[kElementsCount * kIndexRegistersInvolved];
2992     memcpy(indexes, state_->cpu.v + args.idx, sizeof(SIMD128Register) * kIndexRegistersInvolved);
2993     return OpVectorStore<DataElementType, kSegmentSize, kNumRegistersInGroup, kUseMasking>(
2994         args.data, src, [&indexes](size_t index) { return indexes[index]; });
2995   }
2996 
2997   template <typename ElementType,
2998             size_t kSegmentSize,
2999             VectorRegisterGroupMultiplier vlmul,
3000             TailProcessing vta,
3001             auto vma>
OpVector(const Decoder::VStoreStrideArgs & args,Register src,Register stride)3002   void OpVector(const Decoder::VStoreStrideArgs& args, Register src, Register stride) {
3003     return OpVectorStore<ElementType,
3004                          kSegmentSize,
3005                          NumberOfRegistersInvolved(vlmul),
3006                          !std::is_same_v<decltype(vma), intrinsics::NoInactiveProcessing>>(
3007         args.data, src, [stride](size_t index) { return stride * index; });
3008   }
3009 
3010   template <typename ElementType,
3011             size_t kSegmentSize,
3012             VectorRegisterGroupMultiplier vlmul,
3013             TailProcessing vta,
3014             auto vma>
OpVector(const Decoder::VStoreUnitStrideArgs & args,Register src)3015   void OpVector(const Decoder::VStoreUnitStrideArgs& args, Register src) {
3016     switch (args.opcode) {
3017       case Decoder::VSUmOpOpcode::kVseXX:
3018         return OpVectorStore<ElementType,
3019                              kSegmentSize,
3020                              NumberOfRegistersInvolved(vlmul),
3021                              !std::is_same_v<decltype(vma), intrinsics::NoInactiveProcessing>,
3022                              Decoder::VSUmOpOpcode::kVseXX>(args.data, src, [](size_t index) {
3023           return kSegmentSize * sizeof(ElementType) * index;
3024         });
3025       case Decoder::VSUmOpOpcode::kVsm:
3026         if constexpr (kSegmentSize == 1 &&
3027                       std::is_same_v<decltype(vma), intrinsics::NoInactiveProcessing>) {
3028           return OpVectorStore<UInt8,
3029                                1,
3030                                1,
3031                                /*kUseMasking=*/false,
3032                                Decoder::VSUmOpOpcode::kVsm>(
3033               args.data, src, [](size_t index) { return index; });
3034         }
3035         return Undefined();
3036       default:
3037         return Undefined();
3038     }
3039   }
3040 
3041   // Look for VLoadStrideArgs for explanation about semantics: VStoreStrideArgs is almost symmetric,
3042   // except it ignores vta and vma modes and never alters inactive elements in memory.
3043   template <typename ElementType,
3044             size_t kSegmentSize,
3045             size_t kNumRegistersInGroup,
3046             bool kUseMasking,
3047             typename Decoder::VSUmOpOpcode opcode = typename Decoder::VSUmOpOpcode{},
3048             typename GetElementOffsetLambdaType>
3049   void OpVectorStore(uint8_t data, Register src, GetElementOffsetLambdaType GetElementOffset) {
3050     using MaskType = std::conditional_t<sizeof(ElementType) == sizeof(Int8), UInt16, UInt8>;
3051     if (!IsAligned<kNumRegistersInGroup>(data)) {
3052       return Undefined();
3053     }
3054     if (data + kNumRegistersInGroup * kSegmentSize > 32) {
3055       return Undefined();
3056     }
3057     constexpr size_t kElementsCount = 16 / sizeof(ElementType);
3058     size_t vstart = GetCsr<CsrName::kVstart>();
3059     size_t vl = GetCsr<CsrName::kVl>();
3060     if constexpr (opcode == Decoder::VSUmOpOpcode::kVsm) {
3061       vl = AlignUp<CHAR_BIT>(vl) / CHAR_BIT;
3062     }
3063     // In case of memory access fault we may set vstart to non-zero value, set it to zero here to
3064     // simplify the logic below.
3065     SetCsr<CsrName::kVstart>(0);
3066     // When vstart >= vl, there are no body elements, and no elements are updated in any destination
3067     // vector register group, including that no tail elements are updated with agnostic values.
3068     if (vstart >= vl) [[unlikely]] {
3069       // Technically, since stores never touch tail elements it's not needed, but makes it easier to
3070       // reason about the rest of function.
3071       return;
3072     }
3073     char* ptr = ToHostAddr<char>(src);
3074     // Note: within_group_id is the current register id within a register group. During one
3075     // iteration of this loop we store results for all registers with the current id in all
3076     // groups. E.g. for the example above we'd store data from v0, v2, v4 during the first iteration
3077     // (id within group = 0), and v1, v3, v5 during the second iteration (id within group = 1). This
3078     // ensures that memory is always accessed in ordered fashion.
3079     auto mask = GetMaskForVectorOperationsIfNeeded<kUseMasking>();
3080     for (size_t within_group_id = vstart / kElementsCount; within_group_id < kNumRegistersInGroup;
3081          ++within_group_id) {
3082       // No need to continue if we no longer have elements to store.
3083       if (within_group_id * kElementsCount >= vl) {
3084         break;
3085       }
3086       auto register_mask =
3087           std::get<0>(intrinsics::MaskForRegisterInSequence<ElementType>(mask, within_group_id));
3088       // Store elements to memory, but only if there are any active ones.
3089       for (size_t within_register_id = vstart % kElementsCount; within_register_id < kElementsCount;
3090            ++within_register_id) {
3091         size_t element_index = kElementsCount * within_group_id + within_register_id;
3092         // Stop if we reached the vl limit.
3093         if (vl <= element_index) {
3094           break;
3095         }
3096         // Don't touch masked-out elements.
3097         if constexpr (kUseMasking) {
3098           if ((MaskType(register_mask) & MaskType{static_cast<typename MaskType::BaseType>(
3099                                              1 << within_register_id)}) == MaskType{0}) {
3100             continue;
3101           }
3102         }
3103         // Store segment to memory.
3104         for (size_t field = 0; field < kSegmentSize; ++field) {
3105           bool exception_raised = FaultyStore(
3106               ptr + field * sizeof(ElementType) + GetElementOffset(element_index),
3107               sizeof(ElementType),
3108               SIMD128Register{state_->cpu.v[data + within_group_id + field * kNumRegistersInGroup]}
3109                   .Get<ElementType>(within_register_id));
3110           // Stop processing if memory is inaccessible. It's also the only case where we have to set
3111           // vstart to non-zero value!
3112           if (exception_raised) {
3113             SetCsr<CsrName::kVstart>(element_index);
3114             return;
3115           }
3116         }
3117       }
3118       // Next group should be fully processed.
3119       vstart = 0;
3120     }
3121   }
3122 
3123   template <typename ElementType, VectorRegisterGroupMultiplier vlmul, TailProcessing vta, auto vma>
OpVectorViotam(uint8_t dst,uint8_t src1)3124   void OpVectorViotam(uint8_t dst, uint8_t src1) {
3125     return OpVectorViotam<ElementType, NumberOfRegistersInvolved(vlmul), vta, vma>(dst, src1);
3126   }
3127 
3128   template <typename ElementType, size_t kRegistersInvolved, TailProcessing vta, auto vma>
OpVectorViotam(uint8_t dst,uint8_t src1)3129   void OpVectorViotam(uint8_t dst, uint8_t src1) {
3130     constexpr size_t kElementsCount = sizeof(SIMD128Register) / sizeof(ElementType);
3131     size_t vstart = GetCsr<CsrName::kVstart>();
3132     size_t vl = GetCsr<CsrName::kVl>();
3133     if (vstart != 0) {
3134       return Undefined();
3135     }
3136     // When vl = 0, there are no body elements, and no elements are updated in any destination
3137     // vector register group, including that no tail elements are updated with agnostic values.
3138     if (vl == 0) [[unlikely]] {
3139       return;
3140     }
3141     SIMD128Register arg1(state_->cpu.v[src1]);
3142     auto mask = GetMaskForVectorOperations<vma>();
3143     if constexpr (std::is_same_v<decltype(mask), SIMD128Register>) {
3144       arg1 &= mask;
3145     }
3146 
3147     size_t counter = 0;
3148     for (size_t index = 0; index < kRegistersInvolved; ++index) {
3149       SIMD128Register result{state_->cpu.v[dst + index]};
3150       auto [original_dst_value, new_counter] = intrinsics::Viotam<ElementType>(arg1, counter);
3151       arg1.Set(arg1.Get<__uint128_t>() >> kElementsCount);
3152       counter = new_counter;
3153 
3154       // Apply mask and put result values into dst register.
3155       result =
3156           VectorMasking<ElementType, vta, vma>(result, original_dst_value, vstart, vl, index, mask);
3157       state_->cpu.v[dst + index] = result.Get<__uint128_t>();
3158     }
3159   }
3160 
3161   template <typename ElementType, VectorRegisterGroupMultiplier vlmul, TailProcessing vta, auto vma>
OpVectorVidv(uint8_t dst)3162   void OpVectorVidv(uint8_t dst) {
3163     return OpVectorVidv<ElementType, NumberOfRegistersInvolved(vlmul), vta, vma>(dst);
3164   }
3165 
3166   template <typename ElementType, size_t kRegistersInvolved, TailProcessing vta, auto vma>
OpVectorVidv(uint8_t dst)3167   void OpVectorVidv(uint8_t dst) {
3168     if (!IsAligned<kRegistersInvolved>(dst)) {
3169       return Undefined();
3170     }
3171     size_t vstart = GetCsr<CsrName::kVstart>();
3172     size_t vl = GetCsr<CsrName::kVl>();
3173     SetCsr<CsrName::kVstart>(0);
3174     // When vstart >= vl, there are no body elements, and no elements are updated in any destination
3175     // vector register group, including that no tail elements are updated with agnostic values.
3176     if (vstart >= vl) [[unlikely]] {
3177       return;
3178     }
3179     auto mask = GetMaskForVectorOperations<vma>();
3180     for (size_t index = 0; index < kRegistersInvolved; ++index) {
3181       SIMD128Register result{state_->cpu.v[dst + index]};
3182       result = VectorMasking<ElementType, vta, vma>(
3183           result, std::get<0>(intrinsics::Vidv<ElementType>(index)), vstart, vl, index, mask);
3184       state_->cpu.v[dst + index] = result.Get<__uint128_t>();
3185     }
3186   }
3187 
3188   template <typename ElementType>
OpVectorVmvfs(uint8_t dst,uint8_t src)3189   void OpVectorVmvfs(uint8_t dst, uint8_t src) {
3190     // Note: intrinsics::NanBox always received Float64 argument, even if it processes Float32 value
3191     // to not cause recursion in interinsics handling.
3192     // NanBox in the interpreter takes FpRegister and returns FpRegister which is probably the
3193     // cleanest way of processing that data (at least on x86-64 this produces code that's close to
3194     // optimal).
3195     NanBoxAndSetFpReg<ElementType>(dst, SIMD128Register{state_->cpu.v[src]}.Get<FpRegister>(0));
3196     SetCsr<CsrName::kVstart>(0);
3197   }
3198 
3199   template <typename ElementType, TailProcessing vta>
OpVectorVmvsx(uint8_t dst,ElementType element)3200   void OpVectorVmvsx(uint8_t dst, ElementType element) {
3201     size_t vstart = GetCsr<CsrName::kVstart>();
3202     size_t vl = GetCsr<CsrName::kVl>();
3203     // Documentation doesn't specify what happenes when vstart is non-zero but less than vl.
3204     // But at least one hardware implementation treats it as NOP:
3205     //   https://github.com/riscv/riscv-v-spec/issues/937
3206     // We are doing the same here.
3207     if (vstart == 0 && vl != 0) [[likely]] {
3208       SIMD128Register result;
3209       if constexpr (vta == intrinsics::TailProcessing::kAgnostic) {
3210         result = ~SIMD128Register{};
3211       } else {
3212         result.Set(state_->cpu.v[dst]);
3213       }
3214       result.Set(element, 0);
3215       state_->cpu.v[dst] = result.Get<Int128>();
3216     }
3217     SetCsr<CsrName::kVstart>(0);
3218   }
3219 
3220   template <typename ElementType>
OpVectorVmvxs(uint8_t dst,uint8_t src1)3221   void OpVectorVmvxs(uint8_t dst, uint8_t src1) {
3222     static_assert(ElementType::kIsSigned);
3223     // Conversion to Int64 would perform sign-extension if source element is signed.
3224     Register element = Int64{SIMD128Register{state_->cpu.v[src1]}.Get<ElementType>(0)};
3225     SetRegOrIgnore(dst, element);
3226     SetCsr<CsrName::kVstart>(0);
3227   }
3228 
3229   template <auto Intrinsic, auto vma>
OpVectorVWXUnary0(uint8_t dst,uint8_t src1)3230   void OpVectorVWXUnary0(uint8_t dst, uint8_t src1) {
3231     size_t vstart = GetCsr<CsrName::kVstart>();
3232     size_t vl = GetCsr<CsrName::kVl>();
3233     if (vstart != 0) [[unlikely]] {
3234       return Undefined();
3235     }
3236     // Note: vcpop.m  and vfirst.m are explicit exception to the rule that vstart >= vl doesn't
3237     // perform any operations, and they are explicitly defined to perform write even if vl == 0.
3238     SIMD128Register arg1(state_->cpu.v[src1]);
3239     if constexpr (!std::is_same_v<decltype(vma), intrinsics::NoInactiveProcessing>) {
3240       SIMD128Register mask(state_->cpu.v[0]);
3241       arg1 &= mask;
3242     }
3243     const auto [tail_mask] = intrinsics::MakeBitmaskFromVl(vl);
3244     arg1 &= ~tail_mask;
3245     SIMD128Register result = std::get<0>(Intrinsic(arg1.Get<Int128>()));
3246     SetRegOrIgnore(dst, TruncateTo<UInt64>(BitCastToUnsigned(result.Get<Int128>())));
3247   }
3248 
3249   template <auto Intrinsic>
OpVectormm(uint8_t dst,uint8_t src1,uint8_t src2)3250   void OpVectormm(uint8_t dst, uint8_t src1, uint8_t src2) {
3251     size_t vstart = GetCsr<CsrName::kVstart>();
3252     size_t vl = GetCsr<CsrName::kVl>();
3253     SetCsr<CsrName::kVstart>(0);
3254     // When vstart >= vl, there are no body elements, and no elements are updated in any destination
3255     // vector register group, including that no tail elements are updated with agnostic values.
3256     if (vstart >= vl) [[unlikely]] {
3257       return;
3258     }
3259     SIMD128Register arg1(state_->cpu.v[src1]);
3260     SIMD128Register arg2(state_->cpu.v[src2]);
3261     SIMD128Register result;
3262     if (vstart > 0) [[unlikely]] {
3263       const auto [start_mask] = intrinsics::MakeBitmaskFromVl(vstart);
3264       result.Set(state_->cpu.v[dst]);
3265       result = (result & ~start_mask) | (Intrinsic(arg1, arg2) & start_mask);
3266     } else {
3267       result = Intrinsic(arg1, arg2);
3268     }
3269     const auto [tail_mask] = intrinsics::MakeBitmaskFromVl(vl);
3270     result = result | tail_mask;
3271     state_->cpu.v[dst] = result.Get<__uint128_t>();
3272   }
3273 
3274   template <auto Intrinsic, auto vma>
OpVectorVMUnary0(uint8_t dst,uint8_t src1)3275   void OpVectorVMUnary0(uint8_t dst, uint8_t src1) {
3276     size_t vstart = GetCsr<CsrName::kVstart>();
3277     size_t vl = GetCsr<CsrName::kVl>();
3278     if (vstart != 0) {
3279       return Undefined();
3280     }
3281     // When vstart >= vl, there are no body elements, and no elements are updated in any destination
3282     // vector register group, including that no tail elements are updated with agnostic values.
3283     if (vl == 0) [[unlikely]] {
3284       return;
3285     }
3286     SIMD128Register arg1(state_->cpu.v[src1]);
3287     SIMD128Register mask;
3288     if constexpr (!std::is_same_v<decltype(vma), intrinsics::NoInactiveProcessing>) {
3289       mask.Set<__uint128_t>(state_->cpu.v[0]);
3290       arg1 &= mask;
3291     }
3292     const auto [tail_mask] = intrinsics::MakeBitmaskFromVl(vl);
3293     arg1 &= ~tail_mask;
3294     SIMD128Register result = std::get<0>(Intrinsic(arg1.Get<Int128>()));
3295     if constexpr (!std::is_same_v<decltype(vma), intrinsics::NoInactiveProcessing>) {
3296       arg1 &= mask;
3297       if (vma == InactiveProcessing::kUndisturbed) {
3298         result = (result & mask) | (SIMD128Register(state_->cpu.v[dst]) & ~mask);
3299       } else {
3300         result |= ~mask;
3301       }
3302     }
3303     result |= tail_mask;
3304     state_->cpu.v[dst] = result.Get<__uint128_t>();
3305   }
3306 
3307   template <typename ElementType, size_t kRegistersInvolved>
OpVectorVmvXrv(uint8_t dst,uint8_t src)3308   void OpVectorVmvXrv(uint8_t dst, uint8_t src) {
3309     if (!IsAligned<kRegistersInvolved>(dst | src)) {
3310       return Undefined();
3311     }
3312     constexpr size_t kElementsCount = 16 / sizeof(ElementType);
3313     size_t vstart = GetCsr<CsrName::kVstart>();
3314     SetCsr<CsrName::kVstart>(0);
3315     // The usual property that no elements are written if vstart >= vl does not apply to these
3316     // instructions. Instead, no elements are written if vstart >= evl.
3317     if (vstart >= kElementsCount * kRegistersInvolved) [[unlikely]] {
3318       return;
3319     }
3320     if (vstart == 0) [[likely]] {
3321       for (size_t index = 0; index < kRegistersInvolved; ++index) {
3322         state_->cpu.v[dst + index] = state_->cpu.v[src + index];
3323       }
3324       return;
3325     }
3326     size_t index = vstart / kElementsCount;
3327     SIMD128Register destination{state_->cpu.v[dst + index]};
3328     SIMD128Register source{state_->cpu.v[src + index]};
3329     for (size_t element_index = vstart % kElementsCount; element_index < kElementsCount;
3330          ++element_index) {
3331       destination.Set(source.Get<ElementType>(element_index), element_index);
3332     }
3333     state_->cpu.v[dst + index] = destination.Get<__uint128_t>();
3334     for (index++; index < kRegistersInvolved; ++index) {
3335       state_->cpu.v[dst + index] = state_->cpu.v[src + index];
3336     }
3337   }
3338 
3339   template <auto Intrinsic,
3340             typename ElementType,
3341             VectorRegisterGroupMultiplier vlmul,
3342             auto vma,
3343             CsrName... kExtraCsrs>
OpVectorToMaskvv(uint8_t dst,uint8_t src1,uint8_t src2)3344   void OpVectorToMaskvv(uint8_t dst, uint8_t src1, uint8_t src2) {
3345     return OpVectorToMask<Intrinsic,
3346                           ElementType,
3347                           NumberOfRegistersInvolved(vlmul),
3348                           vma,
3349                           kExtraCsrs...>(dst, Vec{src1}, Vec{src2});
3350   }
3351 
3352   template <auto Intrinsic,
3353             typename ElementType,
3354             VectorRegisterGroupMultiplier vlmul,
3355             auto vma,
3356             CsrName... kExtraCsrs>
OpVectorToMaskvx(uint8_t dst,uint8_t src1,ElementType arg2)3357   void OpVectorToMaskvx(uint8_t dst, uint8_t src1, ElementType arg2) {
3358     return OpVectorToMask<Intrinsic,
3359                           ElementType,
3360                           NumberOfRegistersInvolved(vlmul),
3361                           vma,
3362                           kExtraCsrs...>(dst, Vec{src1}, arg2);
3363   }
3364 
3365   template <auto Intrinsic,
3366             typename ElementType,
3367             size_t kRegistersInvolved,
3368             auto vma,
3369             CsrName... kExtraCsrs,
3370             typename... Args>
OpVectorToMask(uint8_t dst,Args...args)3371   void OpVectorToMask(uint8_t dst, Args... args) {
3372     // All args, except dst must be aligned at kRegistersInvolved amount. We'll merge them
3373     // together and then do a combined check for all of them at once.
3374     if (!IsAligned<kRegistersInvolved>(OrValuesOnlyForType<Vec>(args...))) {
3375       return Undefined();
3376     }
3377     SIMD128Register original_result(state_->cpu.v[dst]);
3378     size_t vstart = GetCsr<CsrName::kVstart>();
3379     size_t vl = GetCsr<CsrName::kVl>();
3380     SetCsr<CsrName::kVstart>(0);
3381     SIMD128Register result_before_vl_masking;
3382     // When vstart >= vl, there are no body elements, and no elements are updated in any destination
3383     // vector register group, including that no tail elements are updated with agnostic values.
3384     if (vstart >= vl) [[unlikely]] {
3385       result_before_vl_masking = original_result;
3386     } else {
3387       result_before_vl_masking = CollectBitmaskResult<ElementType, kRegistersInvolved>(
3388           [this, vstart, vl, args...](auto index) {
3389             return Intrinsic(this->GetCsr<kExtraCsrs>()...,
3390                              this->GetVectorArgument<ElementType, TailProcessing::kAgnostic, vma>(
3391                                  args, vstart, vl, index, intrinsics::NoInactiveProcessing{})...);
3392           });
3393       if constexpr (!std::is_same_v<decltype(vma), intrinsics::NoInactiveProcessing>) {
3394         SIMD128Register mask(state_->cpu.v[0]);
3395         if constexpr (vma == InactiveProcessing::kAgnostic) {
3396           result_before_vl_masking |= ~mask;
3397         } else {
3398           result_before_vl_masking = (mask & result_before_vl_masking) | (original_result & ~mask);
3399         }
3400       }
3401       if (vstart > 0) [[unlikely]] {
3402         const auto [start_mask] = intrinsics::MakeBitmaskFromVl(vstart);
3403         result_before_vl_masking =
3404             (original_result & ~start_mask) | (result_before_vl_masking & start_mask);
3405       }
3406     }
3407     const auto [tail_mask] = intrinsics::MakeBitmaskFromVl(vl);
3408     state_->cpu.v[dst] = (result_before_vl_masking | tail_mask).Get<__uint128_t>();
3409   }
3410 
3411   template <auto Intrinsic,
3412             typename ElementType,
3413             VectorRegisterGroupMultiplier vlmul,
3414             TailProcessing vta,
3415             auto vma,
3416             CsrName... kExtraCsrs,
3417             typename... DstMaskType>
OpVectorv(uint8_t dst,uint8_t src1,DstMaskType...dst_mask)3418   void OpVectorv(uint8_t dst, uint8_t src1, DstMaskType... dst_mask) {
3419     return OpVectorv<Intrinsic,
3420                      ElementType,
3421                      NumberOfRegistersInvolved(vlmul),
3422                      vta,
3423                      vma,
3424                      kExtraCsrs...>(dst, src1, dst_mask...);
3425   }
3426 
3427   template <auto Intrinsic,
3428             typename ElementType,
3429             size_t kRegistersInvolved,
3430             TailProcessing vta,
3431             auto vma,
3432             CsrName... kExtraCsrs,
3433             typename... DstMaskType>
OpVectorv(uint8_t dst,uint8_t src,DstMaskType...dst_mask)3434   void OpVectorv(uint8_t dst, uint8_t src, DstMaskType... dst_mask) {
3435     static_assert(sizeof...(dst_mask) <= 1);
3436     if (!IsAligned<kRegistersInvolved>(dst | src | (dst_mask | ... | 0))) {
3437       return Undefined();
3438     }
3439     size_t vstart = GetCsr<CsrName::kVstart>();
3440     size_t vl = GetCsr<CsrName::kVl>();
3441     SetCsr<CsrName::kVstart>(0);
3442     // When vstart >= vl, there are no body elements, and no elements are updated in any destination
3443     // vector register group, including that no tail elements are updated with agnostic values.
3444     if (vstart >= vl) [[unlikely]] {
3445       return;
3446     }
3447     auto mask = GetMaskForVectorOperations<vma>();
3448     for (size_t index = 0; index < kRegistersInvolved; ++index) {
3449       SIMD128Register result{state_->cpu.v[dst + index]};
3450       SIMD128Register result_mask;
3451       if constexpr (sizeof...(DstMaskType) == 0) {
3452         result_mask.Set(state_->cpu.v[dst + index]);
3453       } else {
3454         uint8_t dst_mask_unpacked[1] = {dst_mask...};
3455         result_mask.Set(state_->cpu.v[dst_mask_unpacked[0] + index]);
3456       }
3457       SIMD128Register arg{state_->cpu.v[src + index]};
3458       result =
3459           VectorMasking<ElementType, vta, vma>(result,
3460                                                std::get<0>(Intrinsic(GetCsr<kExtraCsrs>()..., arg)),
3461                                                result_mask,
3462                                                vstart,
3463                                                vl,
3464                                                index,
3465                                                mask);
3466       state_->cpu.v[dst + index] = result.Get<__uint128_t>();
3467     }
3468   }
3469 
3470   template <auto Intrinsic,
3471             typename ElementType,
3472             VectorRegisterGroupMultiplier vlmul,
3473             TailProcessing vta,
3474             auto vma,
3475             CsrName... kExtraCsrs,
3476             auto kDefaultElement>
OpVectorvs(uint8_t dst,Vec<kDefaultElement> src1,uint8_t src2)3477   void OpVectorvs(uint8_t dst, Vec<kDefaultElement> src1, uint8_t src2) {
3478     return OpVectorvs<Intrinsic, ElementType, ElementType, vlmul, vta, vma, kExtraCsrs...>(
3479         dst, src1, src2);
3480   }
3481 
3482   template <auto Intrinsic,
3483             typename ElementType,
3484             typename ResultType,
3485             VectorRegisterGroupMultiplier vlmul,
3486             TailProcessing vta,
3487             auto vma,
3488             CsrName... kExtraCsrs,
3489             auto kDefaultElement>
OpVectorvs(uint8_t dst,Vec<kDefaultElement> src1,uint8_t src2)3490   void OpVectorvs(uint8_t dst, Vec<kDefaultElement> src1, uint8_t src2) {
3491     return OpVectorvs<Intrinsic,
3492                       ElementType,
3493                       ResultType,
3494                       NumberOfRegistersInvolved(vlmul),
3495                       vta,
3496                       vma,
3497                       kExtraCsrs...>(dst, src1, src2);
3498   }
3499 
3500   template <auto Intrinsic,
3501             typename ElementType,
3502             typename ResultType,
3503             size_t kRegistersInvolved,
3504             TailProcessing vta,
3505             auto vma,
3506             CsrName... kExtraCsrs,
3507             auto kDefaultElement>
OpVectorvs(uint8_t dst,Vec<kDefaultElement> src1,uint8_t src2)3508   void OpVectorvs(uint8_t dst, Vec<kDefaultElement> src1, uint8_t src2) {
3509     if (!IsAligned<kRegistersInvolved>(dst | src1.start_no)) {
3510       return Undefined();
3511     }
3512     size_t vstart = GetCsr<CsrName::kVstart>();
3513     size_t vl = GetCsr<CsrName::kVl>();
3514     if (vstart != 0) {
3515       return Undefined();
3516     }
3517     SetCsr<CsrName::kVstart>(0);
3518     // If vl = 0, no operation is performed and the destination register is not updated.
3519     if (vl == 0) [[unlikely]] {
3520       return;
3521     }
3522     auto mask = GetMaskForVectorOperations<vma>();
3523     ResultType init = SIMD128Register{state_->cpu.v[src2]}.Get<ResultType>(0);
3524     for (size_t index = 0; index < kRegistersInvolved; ++index) {
3525       init = std::get<0>(
3526           Intrinsic(GetCsr<kExtraCsrs>()...,
3527                     init,
3528                     GetVectorArgument<ElementType, vta, vma>(src1, vstart, vl, index, mask)));
3529     }
3530     SIMD128Register result{state_->cpu.v[dst]};
3531     result.Set(init, 0);
3532     result = std::get<0>(intrinsics::VectorMasking<ResultType, vta>(result, result, 0, 1));
3533     state_->cpu.v[dst] = result.Get<__uint128_t>();
3534   }
3535 
3536   template <auto Intrinsic,
3537             typename ElementType,
3538             VectorRegisterGroupMultiplier vlmul,
3539             TailProcessing vta,
3540             auto vma,
3541             CsrName... kExtraCsrs>
OpVectorvv(uint8_t dst,uint8_t src1,uint8_t src2)3542   void OpVectorvv(uint8_t dst, uint8_t src1, uint8_t src2) {
3543     return OpVectorSameWidth<Intrinsic,
3544                              ElementType,
3545                              NumberOfRegistersInvolved(vlmul),
3546                              vta,
3547                              vma,
3548                              kExtraCsrs...>(dst, Vec{src1}, Vec{src2});
3549   }
3550 
3551   template <auto Intrinsic,
3552             typename ElementType,
3553             VectorRegisterGroupMultiplier vlmul,
3554             TailProcessing vta,
3555             auto vma,
3556             CsrName... kExtraCsrs>
OpVectorvvv(uint8_t dst,uint8_t src1,uint8_t src2)3557   void OpVectorvvv(uint8_t dst, uint8_t src1, uint8_t src2) {
3558     return OpVectorSameWidth<Intrinsic,
3559                              ElementType,
3560                              NumberOfRegistersInvolved(vlmul),
3561                              vta,
3562                              vma,
3563                              kExtraCsrs...>(dst, Vec{src1}, Vec{src2}, Vec{dst});
3564   }
3565 
3566   template <auto Intrinsic,
3567             typename ElementType,
3568             VectorRegisterGroupMultiplier vlmul,
3569             TailProcessing vta,
3570             auto vma,
3571             CsrName... kExtraCsrs>
OpVectorWidenv(uint8_t dst,uint8_t src)3572   void OpVectorWidenv(uint8_t dst, uint8_t src) {
3573     if constexpr (sizeof(ElementType) < sizeof(Int64) &&
3574                   vlmul != VectorRegisterGroupMultiplier::k8registers) {
3575       return OpVectorWiden<Intrinsic,
3576                            ElementType,
3577                            NumRegistersInvolvedForWideOperand(vlmul),
3578                            NumberOfRegistersInvolved(vlmul),
3579                            vta,
3580                            vma,
3581                            kExtraCsrs...>(dst, Vec{src});
3582     }
3583     return Undefined();
3584   }
3585 
3586   // 2*SEW = SEW op SEW
3587   // Attention: not to confuse with OpVectorWidenwv with 2*SEW = 2*SEW op SEW
3588   template <auto Intrinsic,
3589             typename ElementType,
3590             VectorRegisterGroupMultiplier vlmul,
3591             TailProcessing vta,
3592             auto vma,
3593             CsrName... kExtraCsrs>
OpVectorWidenvv(uint8_t dst,uint8_t src1,uint8_t src2)3594   void OpVectorWidenvv(uint8_t dst, uint8_t src1, uint8_t src2) {
3595     if constexpr (sizeof(ElementType) < sizeof(Int64) &&
3596                   vlmul != VectorRegisterGroupMultiplier::k8registers) {
3597       return OpVectorWiden<Intrinsic,
3598                            ElementType,
3599                            NumRegistersInvolvedForWideOperand(vlmul),
3600                            NumberOfRegistersInvolved(vlmul),
3601                            vta,
3602                            vma,
3603                            kExtraCsrs...>(dst, Vec{src1}, Vec{src2});
3604     }
3605     return Undefined();
3606   }
3607 
3608   // 2*SEW = SEW op SEW op 2*SEW
3609   template <auto Intrinsic,
3610             typename ElementType,
3611             VectorRegisterGroupMultiplier vlmul,
3612             TailProcessing vta,
3613             auto vma,
3614             CsrName... kExtraCsrs>
OpVectorWidenvvw(uint8_t dst,uint8_t src1,uint8_t src2)3615   void OpVectorWidenvvw(uint8_t dst, uint8_t src1, uint8_t src2) {
3616     if constexpr (sizeof(ElementType) < sizeof(Int64) &&
3617                   vlmul != VectorRegisterGroupMultiplier::k8registers) {
3618       return OpVectorWiden<Intrinsic,
3619                            ElementType,
3620                            NumRegistersInvolvedForWideOperand(vlmul),
3621                            NumberOfRegistersInvolved(vlmul),
3622                            vta,
3623                            vma,
3624                            kExtraCsrs...>(dst, Vec{src1}, Vec{src2}, WideVec{dst});
3625     }
3626     return Undefined();
3627   }
3628 
3629   // 2*SEW = 2*SEW op SEW
3630   template <auto Intrinsic,
3631             typename ElementType,
3632             VectorRegisterGroupMultiplier vlmul,
3633             TailProcessing vta,
3634             auto vma,
3635             CsrName... kExtraCsrs>
OpVectorWidenwv(uint8_t dst,uint8_t src1,uint8_t src2)3636   void OpVectorWidenwv(uint8_t dst, uint8_t src1, uint8_t src2) {
3637     if constexpr (sizeof(ElementType) < sizeof(Int64) &&
3638                   vlmul != VectorRegisterGroupMultiplier::k8registers) {
3639       return OpVectorWiden<Intrinsic,
3640                            ElementType,
3641                            NumRegistersInvolvedForWideOperand(vlmul),
3642                            NumberOfRegistersInvolved(vlmul),
3643                            vta,
3644                            vma,
3645                            kExtraCsrs...>(dst, WideVec{src1}, Vec{src2});
3646     }
3647     return Undefined();
3648   }
3649 
3650   template <auto Intrinsic,
3651             typename ElementType,
3652             VectorRegisterGroupMultiplier vlmul,
3653             TailProcessing vta,
3654             auto vma,
3655             CsrName... kExtraCsrs>
OpVectorWidenwx(uint8_t dst,uint8_t src1,ElementType arg2)3656   void OpVectorWidenwx(uint8_t dst, uint8_t src1, ElementType arg2) {
3657     if constexpr (sizeof(ElementType) < sizeof(Int64) &&
3658                   vlmul != VectorRegisterGroupMultiplier::k8registers) {
3659       return OpVectorWiden<Intrinsic,
3660                            ElementType,
3661                            NumRegistersInvolvedForWideOperand(vlmul),
3662                            NumberOfRegistersInvolved(vlmul),
3663                            vta,
3664                            vma,
3665                            kExtraCsrs...>(dst, WideVec{src1}, arg2);
3666     }
3667     return Undefined();
3668   }
3669 
3670   template <auto Intrinsic,
3671             typename ElementType,
3672             VectorRegisterGroupMultiplier vlmul,
3673             TailProcessing vta,
3674             auto vma,
3675             CsrName... kExtraCsrs>
OpVectorWidenvx(uint8_t dst,uint8_t src1,ElementType arg2)3676   void OpVectorWidenvx(uint8_t dst, uint8_t src1, ElementType arg2) {
3677     if constexpr (sizeof(ElementType) < sizeof(Int64) &&
3678                   vlmul != VectorRegisterGroupMultiplier::k8registers) {
3679       return OpVectorWiden<Intrinsic,
3680                            ElementType,
3681                            NumRegistersInvolvedForWideOperand(vlmul),
3682                            NumberOfRegistersInvolved(vlmul),
3683                            vta,
3684                            vma,
3685                            kExtraCsrs...>(dst, Vec{src1}, arg2);
3686     }
3687     return Undefined();
3688   }
3689 
3690   template <auto Intrinsic,
3691             typename ElementType,
3692             VectorRegisterGroupMultiplier vlmul,
3693             TailProcessing vta,
3694             auto vma,
3695             CsrName... kExtraCsrs>
OpVectorWidenvxw(uint8_t dst,uint8_t src1,ElementType arg2)3696   void OpVectorWidenvxw(uint8_t dst, uint8_t src1, ElementType arg2) {
3697     if constexpr (sizeof(ElementType) < sizeof(Int64) &&
3698                   vlmul != VectorRegisterGroupMultiplier::k8registers) {
3699       return OpVectorWiden<Intrinsic,
3700                            ElementType,
3701                            NumRegistersInvolvedForWideOperand(vlmul),
3702                            NumberOfRegistersInvolved(vlmul),
3703                            vta,
3704                            vma,
3705                            kExtraCsrs...>(dst, Vec{src1}, arg2, WideVec{dst});
3706     }
3707     return Undefined();
3708   }
3709 
3710   template <auto Intrinsic,
3711             typename ElementType,
3712             size_t kDestRegistersInvolved,
3713             size_t kRegistersInvolved,
3714             TailProcessing vta,
3715             auto vma,
3716             CsrName... kExtraCsrs,
3717             typename... Args>
OpVectorWiden(uint8_t dst,Args...args)3718   void OpVectorWiden(uint8_t dst, Args... args) {
3719     if constexpr (kDestRegistersInvolved == kRegistersInvolved) {
3720       static_assert(kDestRegistersInvolved == 1);
3721     } else {
3722       static_assert(kDestRegistersInvolved == 2 * kRegistersInvolved);
3723       // All normal (narrow) args must be aligned at kRegistersInvolved amount. We'll merge them
3724       // together and then do a combined check for all of them at once.
3725       uint8_t ored_args = OrValuesOnlyForType<Vec>(args...);
3726       // All wide args must be aligned at kRegistersInvolved amount. We'll merge them together and
3727       // then do a combined check for all of them at once.
3728       uint8_t ored_wide_args = OrValuesOnlyForType<WideVec>(args...) | dst;
3729       if (!IsAligned<kDestRegistersInvolved>(ored_wide_args) ||
3730           !IsAligned<kRegistersInvolved>(ored_args)) {
3731         return Undefined();
3732       }
3733     }
3734     // From RISC-V vectors manual: If destination EEW is greater than the source EEW, the source
3735     // EMUL is at least 1, [then overlap is permitted if ] the overlap is in the highest numbered
3736     // part of the destination register group (e.g., when LMUL=8, vzext.vf4 v0, v6 is legal, but a
3737     // source of v0, v2, or v4 is not).
3738     // Here only one forbidden combination is possible because of static_asserts above and we
3739     // detect and reject it.
3740     if (OrResultsOnlyForType<Vec>([dst](auto arg) { return arg.start_no == dst; }, args...)) {
3741       return Undefined();
3742     }
3743     size_t vstart = GetCsr<CsrName::kVstart>();
3744     size_t vl = GetCsr<CsrName::kVl>();
3745     SetCsr<CsrName::kVstart>(0);
3746     // When vstart >= vl, there are no body elements, and no elements are updated in any destination
3747     // vector register group, including that no tail elements are updated with agnostic values.
3748     if (vstart >= vl) [[unlikely]] {
3749       return;
3750     }
3751     auto mask = GetMaskForVectorOperations<vma>();
3752     for (size_t index = 0; index < kRegistersInvolved; ++index) {
3753       SIMD128Register result(state_->cpu.v[dst + 2 * index]);
3754       result = VectorMasking<WideType<ElementType>, vta, vma>(
3755           result,
3756           std::get<0>(Intrinsic(
3757               GetCsr<kExtraCsrs>()...,
3758               GetLowVectorArgument<ElementType, vta, vma>(args, vstart, vl, index, mask)...)),
3759           vstart,
3760           vl,
3761           2 * index,
3762           mask);
3763       state_->cpu.v[dst + 2 * index] = result.Get<__uint128_t>();
3764       if constexpr (kDestRegistersInvolved > 1) {  // if lmul is one full register or more
3765         result.Set(state_->cpu.v[dst + 2 * index + 1]);
3766         result = VectorMasking<WideType<ElementType>, vta, vma>(
3767             result,
3768             std::get<0>(Intrinsic(
3769                 GetCsr<kExtraCsrs>()...,
3770                 GetHighVectorArgument<ElementType, vta, vma>(args, vstart, vl, index, mask)...)),
3771             vstart,
3772             vl,
3773             2 * index + 1,
3774             mask);
3775         state_->cpu.v[dst + 2 * index + 1] = result.Get<__uint128_t>();
3776       }
3777     }
3778   }
3779 
3780   template <auto Intrinsic,
3781             typename ElementType,
3782             size_t kRegistersInvolved,
3783             TailProcessing vta,
3784             auto vma,
3785             CsrName... kExtraCsrs>
OpVectorvxm(uint8_t dst,uint8_t src1,ElementType arg2)3786   void OpVectorvxm(uint8_t dst, uint8_t src1, ElementType arg2) {
3787     // All args must be aligned at kRegistersInvolved amount. We'll merge them
3788     // together and then do a combined check for all of them at once.
3789     if (!IsAligned<kRegistersInvolved>(dst | src1)) {
3790       return Undefined();
3791     }
3792 
3793     size_t vstart = GetCsr<CsrName::kVstart>();
3794     size_t vl = GetCsr<CsrName::kVl>();
3795     SetCsr<CsrName::kVstart>(0);
3796     // When vstart >= vl, there are no body elements, and no elements are updated in any destination
3797     // vector register group, including that no tail elements are updated with agnostic values.
3798     if (vstart >= vl) [[unlikely]] {
3799       return Undefined();
3800     }
3801 
3802     for (size_t index = 0; index < kRegistersInvolved; ++index) {
3803       SIMD128Register arg1{state_->cpu.v[src1 + index]};
3804       SIMD128Register arg3{};
3805       if constexpr (!std::is_same_v<decltype(vma), intrinsics::NoInactiveProcessing>) {
3806         if constexpr (vma == InactiveProcessing::kUndisturbed) {
3807           arg3 = std::get<0>(
3808               intrinsics::GetMaskVectorArgument<ElementType, vta, vma>(state_->cpu.v[0], index));
3809         }
3810       }
3811 
3812       SIMD128Register result(state_->cpu.v[dst + index]);
3813       result = VectorMasking<ElementType, vta, intrinsics::NoInactiveProcessing{}>(
3814           result,
3815           std::get<0>(Intrinsic(GetCsr<kExtraCsrs>()..., arg1, arg2, arg3)),
3816           vstart,
3817           vl,
3818           index,
3819           intrinsics::NoInactiveProcessing{});
3820       state_->cpu.v[dst + index] = result.Get<__uint128_t>();
3821     }
3822   }
3823 
3824   template <auto Intrinsic,
3825             typename ElementType,
3826             size_t kRegistersInvolved,
3827             TailProcessing vta,
3828             auto vma,
3829             CsrName... kExtraCsrs>
OpVectorvvm(uint8_t dst,uint8_t src1,uint8_t src2)3830   void OpVectorvvm(uint8_t dst, uint8_t src1, uint8_t src2) {
3831     // All args must be aligned at kRegistersInvolved amount. We'll merge them
3832     // together and then do a combined check for all of them at once.
3833     if (!IsAligned<kRegistersInvolved>(dst | src1 | src2)) {
3834       return Undefined();
3835     }
3836 
3837     size_t vstart = GetCsr<CsrName::kVstart>();
3838     size_t vl = GetCsr<CsrName::kVl>();
3839     SetCsr<CsrName::kVstart>(0);
3840     // When vstart >= vl, there are no body elements, and no elements are updated in any destination
3841     // vector register group, including that no tail elements are updated with agnostic values.
3842     if (vstart >= vl) [[unlikely]] {
3843       return Undefined();
3844     }
3845 
3846     for (size_t index = 0; index < kRegistersInvolved; ++index) {
3847       SIMD128Register arg1{state_->cpu.v[src1 + index]};
3848       SIMD128Register arg2{state_->cpu.v[src2 + index]};
3849       SIMD128Register arg3{};
3850       if constexpr (!std::is_same_v<decltype(vma), intrinsics::NoInactiveProcessing>) {
3851         if constexpr (vma == InactiveProcessing::kUndisturbed) {
3852           arg3 = std::get<0>(
3853               intrinsics::GetMaskVectorArgument<ElementType, vta, vma>(state_->cpu.v[0], index));
3854         }
3855       }
3856 
3857       SIMD128Register result(state_->cpu.v[dst + index]);
3858       result = VectorMasking<ElementType, vta, intrinsics::NoInactiveProcessing{}>(
3859           result,
3860           std::get<0>(Intrinsic(GetCsr<kExtraCsrs>()..., arg1, arg2, arg3)),
3861           vstart,
3862           vl,
3863           index,
3864           intrinsics::NoInactiveProcessing{});
3865       state_->cpu.v[dst + index] = result.Get<__uint128_t>();
3866     }
3867   }
3868 
3869   template <auto Intrinsic,
3870             typename ElementType,
3871             VectorRegisterGroupMultiplier vlmul,
3872             TailProcessing vta,
3873             auto vma,
3874             CsrName... kExtraCsrs>
OpVectorvx(uint8_t dst,uint8_t src1,ElementType arg2)3875   void OpVectorvx(uint8_t dst, uint8_t src1, ElementType arg2) {
3876     return OpVectorSameWidth<Intrinsic,
3877                              ElementType,
3878                              NumberOfRegistersInvolved(vlmul),
3879                              vta,
3880                              vma,
3881                              kExtraCsrs...>(dst, Vec{src1}, arg2);
3882   }
3883 
3884   template <auto Intrinsic,
3885             typename ElementType,
3886             size_t kRegistersInvolved,
3887             TailProcessing vta,
3888             auto vma,
3889             CsrName... kExtraCsrs,
3890             typename... Args>
OpVectorSameWidth(uint8_t dst,Args...args)3891   void OpVectorSameWidth(uint8_t dst, Args... args) {
3892     // All args must be aligned at kRegistersInvolved amount. We'll merge them
3893     // together and then do a combined check for all of them at once.
3894     if (!IsAligned<kRegistersInvolved>(OrValuesOnlyForType<Vec>(args...) | dst)) {
3895       return Undefined();
3896     }
3897     size_t vstart = GetCsr<CsrName::kVstart>();
3898     size_t vl = GetCsr<CsrName::kVl>();
3899     SetCsr<CsrName::kVstart>(0);
3900     // When vstart >= vl, there are no body elements, and no elements are updated in any destination
3901     // vector register group, including that no tail elements are updated with agnostic values.
3902     if (vstart >= vl) [[unlikely]] {
3903       return;
3904     }
3905     auto mask = GetMaskForVectorOperations<vma>();
3906     for (size_t index = 0; index < kRegistersInvolved; ++index) {
3907       SIMD128Register result(state_->cpu.v[dst + index]);
3908       result = VectorMasking<ElementType, vta, vma>(
3909           result,
3910           std::get<0>(Intrinsic(
3911               GetCsr<kExtraCsrs>()...,
3912               GetVectorArgument<ElementType, vta, vma>(args, vstart, vl, index, mask)...)),
3913           vstart,
3914           vl,
3915           index,
3916           mask);
3917       state_->cpu.v[dst + index] = result.Get<__uint128_t>();
3918     }
3919   }
3920 
3921   template <auto Intrinsic,
3922             typename TargetElementType,
3923             VectorRegisterGroupMultiplier vlmul,
3924             TailProcessing vta,
3925             auto vma,
3926             CsrName... kExtraCsrs>
OpVectorNarroww(uint8_t dst,uint8_t src)3927   void OpVectorNarroww(uint8_t dst, uint8_t src) {
3928     if constexpr (sizeof(TargetElementType) < sizeof(Int64) &&
3929                   vlmul != VectorRegisterGroupMultiplier::k8registers) {
3930       return OpVectorNarrow<Intrinsic,
3931                             TargetElementType,
3932                             NumberOfRegistersInvolved(vlmul),
3933                             NumRegistersInvolvedForWideOperand(vlmul),
3934                             vta,
3935                             vma,
3936                             kExtraCsrs...>(dst, WideVec{src});
3937     }
3938     return Undefined();
3939   }
3940 
3941   // SEW = 2*SEW op SEW
3942   template <auto Intrinsic,
3943             typename ElementType,
3944             VectorRegisterGroupMultiplier vlmul,
3945             TailProcessing vta,
3946             auto vma,
3947             CsrName... kExtraCsrs>
OpVectorNarrowwx(uint8_t dst,uint8_t src1,ElementType arg2)3948   void OpVectorNarrowwx(uint8_t dst, uint8_t src1, ElementType arg2) {
3949     if constexpr (sizeof(ElementType) < sizeof(Int64) &&
3950                   vlmul != VectorRegisterGroupMultiplier::k8registers) {
3951       return OpVectorNarrow<Intrinsic,
3952                             ElementType,
3953                             NumberOfRegistersInvolved(vlmul),
3954                             NumRegistersInvolvedForWideOperand(vlmul),
3955                             vta,
3956                             vma,
3957                             kExtraCsrs...>(dst, WideVec{src1}, arg2);
3958     }
3959     return Undefined();
3960   }
3961 
3962   // SEW = 2*SEW op SEW
3963   template <auto Intrinsic,
3964             typename ElementType,
3965             VectorRegisterGroupMultiplier vlmul,
3966             TailProcessing vta,
3967             auto vma,
3968             CsrName... kExtraCsrs>
OpVectorNarrowwv(uint8_t dst,uint8_t src1,uint8_t src2)3969   void OpVectorNarrowwv(uint8_t dst, uint8_t src1, uint8_t src2) {
3970     if constexpr (sizeof(ElementType) < sizeof(Int64) &&
3971                   vlmul != VectorRegisterGroupMultiplier::k8registers) {
3972       return OpVectorNarrow<Intrinsic,
3973                             ElementType,
3974                             NumberOfRegistersInvolved(vlmul),
3975                             NumRegistersInvolvedForWideOperand(vlmul),
3976                             vta,
3977                             vma,
3978                             kExtraCsrs...>(dst, WideVec{src1}, Vec{src2});
3979     }
3980     return Undefined();
3981   }
3982 
3983   template <auto Intrinsic,
3984             typename ElementType,
3985             size_t kRegistersInvolved,
3986             size_t kWideSrcRegistersInvolved,
3987             TailProcessing vta,
3988             auto vma,
3989             CsrName... kExtraCsrs,
3990             typename... Args>
OpVectorNarrow(uint8_t dst,Args...args)3991   void OpVectorNarrow(uint8_t dst, Args... args) {
3992     if constexpr (kWideSrcRegistersInvolved == kRegistersInvolved) {
3993       static_assert(kWideSrcRegistersInvolved == 1);
3994     } else {
3995       // All normal (narrow) args must be aligned at kRegistersInvolved amount. We'll merge them
3996       // together and then do a combined check for all of them at once.
3997       uint8_t ored_args = OrValuesOnlyForType<Vec>(args...) | dst;
3998       // All wide args must be aligned at kWideSrcRegistersInvolved amount. We'll merge them
3999       // together and then do a combined check for all of them at once.
4000       uint8_t ored_wide_args = OrValuesOnlyForType<WideVec>(args...);
4001       if (!IsAligned<kWideSrcRegistersInvolved>(ored_wide_args) ||
4002           !IsAligned<kRegistersInvolved>(ored_args)) {
4003         return Undefined();
4004       }
4005       static_assert(kWideSrcRegistersInvolved == 2 * kRegistersInvolved);
4006       // From RISC-V vectors manual: If destination EEW is smaller than the source EEW, [then
4007       // overlap is permitted if] the overlap is in the lowest-numbered part of the source register
4008       // group (e.g., when LMUL=1, vnsrl.wi v0, v0, 3 is legal, but a destination of v1 is not).
4009       // We only have one possible invalid value here because of alignment requirements.
4010       if (OrResultsOnlyForType<Vec>(
4011               [dst](auto arg) { return arg.start_no == dst + kRegistersInvolved; }, args...)) {
4012         return Undefined();
4013       }
4014     }
4015     size_t vstart = GetCsr<CsrName::kVstart>();
4016     size_t vl = GetCsr<CsrName::kVl>();
4017     SetCsr<CsrName::kVstart>(0);
4018     // When vstart >= vl, there are no body elements, and no elements are updated in any destination
4019     // vector register group, including that no tail elements are updated with agnostic values.
4020     if (vstart >= vl) [[unlikely]] {
4021       return;
4022     }
4023     auto mask = GetMaskForVectorOperations<vma>();
4024     for (size_t index = 0; index < kRegistersInvolved; index++) {
4025       SIMD128Register orig_result(state_->cpu.v[dst + index]);
4026       SIMD128Register intrinsic_result = std::get<0>(
4027           Intrinsic(GetCsr<kExtraCsrs>()...,
4028                     GetLowVectorArgument<ElementType, vta, vma>(args, vstart, vl, index, mask)...));
4029       if constexpr (kWideSrcRegistersInvolved > 1) {
4030         SIMD128Register result_high = std::get<0>(Intrinsic(
4031             GetCsr<kExtraCsrs>()...,
4032             GetHighVectorArgument<ElementType, vta, vma>(args, vstart, vl, index, mask)...));
4033         intrinsic_result = std::get<0>(
4034             intrinsics::VMergeBottomHalfToTop<ElementType>(intrinsic_result, result_high));
4035       }
4036       auto result = VectorMasking<ElementType, vta, vma>(
4037           orig_result, intrinsic_result, vstart, vl, index, mask);
4038       state_->cpu.v[dst + index] = result.template Get<__uint128_t>();
4039     }
4040   }
4041 
4042   template <auto Intrinsic,
4043             typename DestElementType,
4044             const uint8_t kFactor,
4045             VectorRegisterGroupMultiplier vlmul,
4046             TailProcessing vta,
4047             auto vma>
OpVectorVXUnary0(uint8_t dst,uint8_t src)4048   void OpVectorVXUnary0(uint8_t dst, uint8_t src) {
4049     static_assert(kFactor == 2 || kFactor == 4 || kFactor == 8);
4050     constexpr size_t kDestRegistersInvolved = NumberOfRegistersInvolved(vlmul);
4051     constexpr size_t kSourceRegistersInvolved = (kDestRegistersInvolved / kFactor) ?: 1;
4052     if (!IsAligned<kDestRegistersInvolved>(dst) || !IsAligned<kSourceRegistersInvolved>(src)) {
4053       return Undefined();
4054     }
4055     size_t vstart = GetCsr<CsrName::kVstart>();
4056     size_t vl = GetCsr<CsrName::kVl>();
4057     // When vstart >= vl, there are no body elements, and no elements are updated in any destination
4058     // vector register group, including that no tail elements are updated with agnostic values.
4059     if (vstart >= vl) [[unlikely]] {
4060       SetCsr<CsrName::kVstart>(0);
4061       return;
4062     }
4063     auto mask = GetMaskForVectorOperations<vma>();
4064     for (size_t dst_index = 0; dst_index < kDestRegistersInvolved; dst_index++) {
4065       size_t src_index = dst_index / kFactor;
4066       size_t src_elem = dst_index % kFactor;
4067       SIMD128Register result{state_->cpu.v[dst + dst_index]};
4068       SIMD128Register arg{state_->cpu.v[src + src_index] >> ((128 / kFactor) * src_elem)};
4069 
4070       result = VectorMasking<DestElementType, vta, vma>(
4071           result, std::get<0>(Intrinsic(arg)), vstart, vl, dst_index, mask);
4072       state_->cpu.v[dst + dst_index] = result.Get<__uint128_t>();
4073     }
4074     SetCsr<CsrName::kVstart>(0);
4075   }
4076 
4077   template <auto Intrinsic,
4078             typename ElementType,
4079             VectorRegisterGroupMultiplier vlmul,
4080             TailProcessing vta,
4081             auto vma,
4082             CsrName... kExtraCsrs>
OpVectorvxv(uint8_t dst,uint8_t src1,ElementType arg2)4083   void OpVectorvxv(uint8_t dst, uint8_t src1, ElementType arg2) {
4084     return OpVectorSameWidth<Intrinsic,
4085                              ElementType,
4086                              NumberOfRegistersInvolved(vlmul),
4087                              vta,
4088                              vma,
4089                              kExtraCsrs...>(dst, Vec{src1}, arg2, Vec{dst});
4090   }
4091 
4092   template <auto Intrinsic,
4093             typename ElementType,
4094             VectorRegisterGroupMultiplier vlmul,
4095             TailProcessing vta,
4096             auto vma,
4097             typename... DstMaskType>
OpVectorx(uint8_t dst,ElementType arg2,DstMaskType...dst_mask)4098   void OpVectorx(uint8_t dst, ElementType arg2, DstMaskType... dst_mask) {
4099     return OpVectorx<Intrinsic, ElementType, NumberOfRegistersInvolved(vlmul), vta, vma>(
4100         dst, arg2, dst_mask...);
4101   }
4102 
4103   template <auto Intrinsic,
4104             typename ElementType,
4105             size_t kRegistersInvolved,
4106             TailProcessing vta,
4107             auto vma,
4108             typename... DstMaskType>
OpVectorx(uint8_t dst,ElementType arg2,DstMaskType...dst_mask)4109   void OpVectorx(uint8_t dst, ElementType arg2, DstMaskType... dst_mask) {
4110     static_assert(sizeof...(dst_mask) <= 1);
4111     if (!IsAligned<kRegistersInvolved>(dst | (dst_mask | ... | 0))) {
4112       return Undefined();
4113     }
4114     size_t vstart = GetCsr<CsrName::kVstart>();
4115     size_t vl = GetCsr<CsrName::kVl>();
4116     SetCsr<CsrName::kVstart>(0);
4117     // When vstart >= vl, there are no body elements, and no elements are updated in any destination
4118     // vector register group, including that no tail elements are updated with agnostic values.
4119     if (vstart >= vl) [[unlikely]] {
4120       return;
4121     }
4122     auto mask = GetMaskForVectorOperations<vma>();
4123     for (size_t index = 0; index < kRegistersInvolved; ++index) {
4124       SIMD128Register result(state_->cpu.v[dst + index]);
4125       SIMD128Register result_mask;
4126       if constexpr (sizeof...(DstMaskType) == 0) {
4127         result_mask.Set(state_->cpu.v[dst + index]);
4128       } else {
4129         uint8_t dst_mask_unpacked[1] = {dst_mask...};
4130         result_mask.Set(state_->cpu.v[dst_mask_unpacked[0] + index]);
4131       }
4132       result = VectorMasking<ElementType, vta, vma>(
4133           result, std::get<0>(Intrinsic(arg2)), result_mask, vstart, vl, index, mask);
4134       state_->cpu.v[dst + index] = result.Get<__uint128_t>();
4135     }
4136   }
4137 
4138   template <typename ElementType, VectorRegisterGroupMultiplier vlmul, TailProcessing vta, auto vma>
OpVectorslideup(uint8_t dst,uint8_t src,Register offset)4139   void OpVectorslideup(uint8_t dst, uint8_t src, Register offset) {
4140     return OpVectorslideup<ElementType, NumberOfRegistersInvolved(vlmul), vta, vma>(
4141         dst, src, offset);
4142   }
4143 
4144   template <typename ElementType, size_t kRegistersInvolved, TailProcessing vta, auto vma>
OpVectorslideup(uint8_t dst,uint8_t src,Register offset)4145   void OpVectorslideup(uint8_t dst, uint8_t src, Register offset) {
4146     constexpr size_t kElementsPerRegister = 16 / sizeof(ElementType);
4147     if (!IsAligned<kRegistersInvolved>(dst | src)) {
4148       return Undefined();
4149     }
4150     // Source and destination must not intersect.
4151     if (dst < (src + kRegistersInvolved) && src < (dst + kRegistersInvolved)) {
4152       return Undefined();
4153     }
4154     size_t vstart = GetCsr<CsrName::kVstart>();
4155     size_t vl = GetCsr<CsrName::kVl>();
4156     SetCsr<CsrName::kVstart>(0);
4157     if (vstart >= vl) [[unlikely]] {
4158       // From 16.3: For all of the [slide instructions], if vstart >= vl, the
4159       // instruction performs no operation and leaves the destination vector
4160       // register unchanged.
4161       return;
4162     }
4163     auto mask = GetMaskForVectorOperations<vma>();
4164     // The slideup operation leaves Elements 0 through MAX(vstart, OFFSET) unchanged.
4165     //
4166     // From 16.3.1: Destination elements OFFSET through vl-1 are written if
4167     // unmasked and if OFFSET < vl.
4168     // However if OFFSET > vl, we still need to apply the tail policy (as
4169     // clarified in https://github.com/riscv/riscv-v-spec/issues/263). Given
4170     // that OFFSET could be well past vl we start at vl rather than OFFSET in
4171     // that case.
4172     const size_t start_elem_index = std::min(std::max(vstart, offset), vl);
4173     for (size_t index = start_elem_index / kElementsPerRegister; index < kRegistersInvolved;
4174          ++index) {
4175       SIMD128Register result(state_->cpu.v[dst + index]);
4176 
4177       // Arguments falling before the input group correspond to the first offset-amount
4178       // result elements, which must remain undisturbed. We zero-initialize them here,
4179       // but their values are eventually ignored by vstart masking in VectorMasking.
4180       ssize_t first_arg_disp = index - 1 - offset / kElementsPerRegister;
4181       SIMD128Register arg1 =
4182           (first_arg_disp < 0) ? SIMD128Register{0} : state_->cpu.v[src + first_arg_disp];
4183       SIMD128Register arg2 =
4184           (first_arg_disp + 1 < 0) ? SIMD128Register{0} : state_->cpu.v[src + first_arg_disp + 1];
4185 
4186       result =
4187           VectorMasking<ElementType, vta, vma>(result,
4188                                                std::get<0>(intrinsics::VectorSlideUp<ElementType>(
4189                                                    offset % kElementsPerRegister, arg1, arg2)),
4190                                                start_elem_index,
4191                                                vl,
4192                                                index,
4193                                                mask);
4194       state_->cpu.v[dst + index] = result.Get<__uint128_t>();
4195     }
4196   }
4197 
4198   template <typename ElementType, VectorRegisterGroupMultiplier vlmul, TailProcessing vta, auto vma>
OpVectorslide1up(uint8_t dst,uint8_t src,ElementType xval)4199   void OpVectorslide1up(uint8_t dst, uint8_t src, ElementType xval) {
4200     // Save the vstart before it's reset by vslideup.
4201     size_t vstart = GetCsr<CsrName::kVstart>();
4202     // Slide all the elements by one.
4203     OpVectorslideup<ElementType, NumberOfRegistersInvolved(vlmul), vta, vma>(dst, src, 1);
4204     if (exception_raised_) {
4205       return;
4206     }
4207     if (vstart > 0) {
4208       // First element is not affected and should remain untouched.
4209       return;
4210     }
4211 
4212     // From 16.3.3: places the x register argument at location 0 of the
4213     // destination vector register group provided that element 0 is active,
4214     // otherwise the destination element update follows the current mask
4215     // agnostic/undisturbed policy.
4216     if constexpr (std::is_same_v<decltype(vma), intrinsics::InactiveProcessing>) {
4217       auto mask = GetMaskForVectorOperations<vma>();
4218       if (!(mask.template Get<uint8_t>(0) & 0x1)) {
4219         // The first element is masked. OpVectorslideup already applied the proper masking to it.
4220         return;
4221       }
4222     }
4223 
4224     SIMD128Register result = state_->cpu.v[dst];
4225     result.Set(xval, 0);
4226     state_->cpu.v[dst] = result.Get<__uint128_t>();
4227   }
4228 
4229   template <typename ElementType, VectorRegisterGroupMultiplier vlmul, TailProcessing vta, auto vma>
OpVectorslidedown(uint8_t dst,uint8_t src,Register offset)4230   void OpVectorslidedown(uint8_t dst, uint8_t src, Register offset) {
4231     return OpVectorslidedown<ElementType,
4232                              NumberOfRegistersInvolved(vlmul),
4233                              GetVlmax<ElementType, vlmul>(),
4234                              vta,
4235                              vma>(dst, src, offset);
4236   }
4237 
4238   template <typename ElementType,
4239             size_t kRegistersInvolved,
4240             size_t kVlmax,
4241             TailProcessing vta,
4242             auto vma>
OpVectorslidedown(uint8_t dst,uint8_t src,Register offset)4243   void OpVectorslidedown(uint8_t dst, uint8_t src, Register offset) {
4244     constexpr size_t kElementsPerRegister = 16 / sizeof(ElementType);
4245     if (!IsAligned<kRegistersInvolved>(dst | src)) {
4246       return Undefined();
4247     }
4248     size_t vstart = GetCsr<CsrName::kVstart>();
4249     size_t vl = GetCsr<CsrName::kVl>();
4250     SetCsr<CsrName::kVstart>(0);
4251     if (vstart >= vl) [[unlikely]] {
4252       // From 16.3: For all of the [slide instructions], if vstart >= vl, the
4253       // instruction performs no operation and leaves the destination vector
4254       // register unchanged.
4255       return;
4256     }
4257     auto mask = GetMaskForVectorOperations<vma>();
4258     for (size_t index = 0; index < kRegistersInvolved; ++index) {
4259       SIMD128Register result(state_->cpu.v[dst + index]);
4260 
4261       size_t first_arg_disp = index + offset / kElementsPerRegister;
4262       SIMD128Register arg1 = state_->cpu.v[src + first_arg_disp];
4263       SIMD128Register arg2 = state_->cpu.v[src + first_arg_disp + 1];
4264       SIMD128Register tunnel_shift_result;
4265       // Elements coming from above vlmax are zeroes.
4266       if (offset >= kVlmax) {
4267         tunnel_shift_result = SIMD128Register{0};
4268       } else {
4269         tunnel_shift_result = std::get<0>(
4270             intrinsics::VectorSlideDown<ElementType>(offset % kElementsPerRegister, arg1, arg2));
4271         tunnel_shift_result =
4272             VectorZeroFill<ElementType>(tunnel_shift_result, kVlmax - offset, kVlmax, index);
4273       }
4274 
4275       result = VectorMasking<ElementType, vta, vma>(
4276           result, tunnel_shift_result, vstart, vl, index, mask);
4277       state_->cpu.v[dst + index] = result.Get<__uint128_t>();
4278     }
4279   }
4280 
4281   template <typename ElementType, VectorRegisterGroupMultiplier vlmul, TailProcessing vta, auto vma>
OpVectorslide1down(uint8_t dst,uint8_t src,ElementType xval)4282   void OpVectorslide1down(uint8_t dst, uint8_t src, ElementType xval) {
4283     constexpr size_t kElementsPerRegister = 16 / sizeof(ElementType);
4284     const size_t vl = GetCsr<CsrName::kVl>();
4285 
4286     // From 16.3.4: ... places the x register argument at location vl-1 in the
4287     // destination vector register, provided that element vl-1 is active,
4288     // otherwise the destination element is **unchanged** (emphasis added.)
4289     //
4290     // This means that element at vl-1 would not follow the Mask Agnostic policy
4291     // and would stay Unchanged when inactive. So we need to undo just this one
4292     // element if using agnostic masking.
4293     ElementType last_elem_value = xval;
4294     const size_t last_elem_register = (vl - 1) / kElementsPerRegister;
4295     const size_t last_elem_within_reg_pos = (vl - 1) % kElementsPerRegister;
4296     bool set_last_element = true;
4297     if constexpr (std::is_same_v<decltype(vma), intrinsics::InactiveProcessing>) {
4298       auto mask = GetMaskForVectorOperations<vma>();
4299       auto [mask_bits] =
4300           intrinsics::MaskForRegisterInSequence<ElementType>(mask, last_elem_register);
4301       using MaskType = decltype(mask_bits);
4302       if ((static_cast<MaskType::BaseType>(mask_bits) & (1 << last_elem_within_reg_pos)) == 0) {
4303         if constexpr (vma == intrinsics::InactiveProcessing::kUndisturbed) {
4304           // Element is inactive and the undisturbed policy will be followed,
4305           // just let Opvectorslidedown handle everything.
4306           set_last_element = false;
4307         } else {
4308           // Element is inactive and the agnostic policy will be followed, get
4309           // the original value to restore before it's changed by
4310           // the agnostic policy.
4311           SIMD128Register original = state_->cpu.v[dst + last_elem_register];
4312           last_elem_value = original.Get<ElementType>(last_elem_within_reg_pos);
4313         }
4314       }
4315     }
4316 
4317     // Slide all the elements by one.
4318     OpVectorslidedown<ElementType,
4319                       NumberOfRegistersInvolved(vlmul),
4320                       GetVlmax<ElementType, vlmul>(),
4321                       vta,
4322                       vma>(dst, src, 1);
4323     if (exception_raised_) {
4324       return;
4325     }
4326     if (!set_last_element) {
4327       return;
4328     }
4329 
4330     SIMD128Register result = state_->cpu.v[dst + last_elem_register];
4331     result.Set(last_elem_value, last_elem_within_reg_pos);
4332     state_->cpu.v[dst + last_elem_register] = result.Get<__uint128_t>();
4333   }
4334 
4335   // Helper function needed to generate bitmak result from non-bitmask inputs.
4336   // We are processing between 1 and 8 registers here and each register produces between 2 bits
4337   // (for 64 bit inputs) and 16 bits (for 8 bit inputs) bitmasks which are then combined into
4338   // final result (between 2 and 128 bits long).
4339   // Note that we are not handling tail here! These bits remain undefined and should be handled
4340   // later.
4341   // TODO(b/317757595): Add separate tests to verify the logic.
4342   template <typename ElementType, size_t kRegistersInvolved, typename Intrinsic>
CollectBitmaskResult(Intrinsic intrinsic)4343   SIMD128Register CollectBitmaskResult(Intrinsic intrinsic) {
4344     // We employ two distinct tactics to handle all possibilities:
4345     //   1. For 8bit/16bit types we get full UInt8/UInt16 result and thus use SIMD128Register.Set.
4346     //   2. For 32bit/64bit types we only get 2bit or 4bit from each call and thus need to use
4347     //      shifts to accumulate the result.
4348     //      But since each of up to 8 results is at most 4bits total bitmask is 32bit (or less).
4349     std::conditional_t<sizeof(ElementType) < sizeof(UInt32), SIMD128Register, UInt32>
4350         bitmask_result{};
4351     for (UInt32 index = UInt32{0}; index < UInt32(kRegistersInvolved); index += UInt32{1}) {
4352       const auto [raw_result] =
4353           intrinsics::SimdMaskToBitMask<ElementType>(std::get<0>(intrinsic(index)));
4354       if constexpr (sizeof(ElementType) < sizeof(Int32)) {
4355         bitmask_result.Set(raw_result, index);
4356       } else {
4357         constexpr UInt32 kElemNum =
4358             UInt32{static_cast<uint32_t>((sizeof(SIMD128Register) / sizeof(ElementType)))};
4359         bitmask_result |= UInt32(UInt8(raw_result)) << (index * kElemNum);
4360       }
4361     }
4362     return SIMD128Register(bitmask_result);
4363   }
4364 
Nop()4365   void Nop() {}
4366 
Undefined()4367   void Undefined() {
4368 #if defined(__aarch64__)
4369     abort();
4370 #else
4371     UndefinedInsn(GetInsnAddr());
4372     // If there is a guest handler registered for SIGILL we'll delay its processing until the next
4373     // sync point (likely the main dispatching loop) due to enabled pending signals. Thus we must
4374     // ensure that insn_addr isn't automatically advanced in FinalizeInsn.
4375     exception_raised_ = true;
4376 #endif
4377   }
4378 
4379   //
4380   // Guest state getters/setters.
4381   //
4382 
GetReg(uint8_t reg)4383   Register GetReg(uint8_t reg) const {
4384     CheckRegIsValid(reg);
4385     return state_->cpu.x[reg];
4386   }
4387 
GetRegOrZero(uint8_t reg)4388   Register GetRegOrZero(uint8_t reg) { return reg == 0 ? 0 : GetReg(reg); }
4389 
SetReg(uint8_t reg,Register value)4390   void SetReg(uint8_t reg, Register value) {
4391     if (exception_raised_) {
4392       // Do not produce side effects.
4393       return;
4394     }
4395     CheckRegIsValid(reg);
4396     state_->cpu.x[reg] = value;
4397   }
4398 
SetRegOrIgnore(uint8_t reg,Register value)4399   void SetRegOrIgnore(uint8_t reg, Register value) {
4400     if (reg != 0) {
4401       SetReg(reg, value);
4402     }
4403   }
4404 
GetFpReg(uint8_t reg)4405   FpRegister GetFpReg(uint8_t reg) const {
4406     CheckFpRegIsValid(reg);
4407     return state_->cpu.f[reg];
4408   }
4409 
4410   template <typename FloatType>
4411   FpRegister GetFRegAndUnboxNan(uint8_t reg);
4412 
4413   template <typename FloatType>
4414   void NanBoxAndSetFpReg(uint8_t reg, FpRegister value);
4415 
4416   //
4417   // Various helper methods.
4418   //
4419 
4420 #if defined(__aarch64__)
4421   template <CsrName kName>
GetCsr()4422   [[nodiscard]] Register GetCsr() {
4423     Undefined();
4424     return {};
4425   }
4426 #else
4427   template <CsrName kName>
GetCsr()4428   [[nodiscard]] Register GetCsr() const {
4429     return state_->cpu.*CsrFieldAddr<kName>;
4430   }
4431 #endif
4432 
4433   template <CsrName kName>
SetCsr(Register arg)4434   void SetCsr(Register arg) {
4435 #if defined(__aarch64__)
4436     UNUSED(arg);
4437     Undefined();
4438 #else
4439     if (exception_raised_) {
4440       return;
4441     }
4442     state_->cpu.*CsrFieldAddr<kName> = arg & kCsrMask<kName>;
4443 #endif
4444   }
4445 
GetImm(uint64_t imm)4446   [[nodiscard]] uint64_t GetImm(uint64_t imm) const { return imm; }
4447 
Copy(Register value)4448   [[nodiscard]] Register Copy(Register value) const { return value; }
4449 
GetInsnAddr()4450   [[nodiscard]] GuestAddr GetInsnAddr() const { return state_->cpu.insn_addr; }
4451 
FinalizeInsn(uint8_t insn_len)4452   void FinalizeInsn(uint8_t insn_len) {
4453     if (!branch_taken_ && !exception_raised_) {
4454       state_->cpu.insn_addr += insn_len;
4455     }
4456   }
4457 
4458 #include "berberis/intrinsics/interpreter_intrinsics_hooks-inl.h"
4459 
4460  private:
4461   template <typename DataType>
Load(const void * ptr)4462   Register Load(const void* ptr) {
4463     static_assert(std::is_integral_v<DataType>);
4464     CHECK(!exception_raised_);
4465     FaultyLoadResult result = FaultyLoad(ptr, sizeof(DataType));
4466     if (result.is_fault) {
4467       exception_raised_ = true;
4468       return {};
4469     }
4470     return static_cast<DataType>(result.value);
4471   }
4472 
4473   template <typename DataType>
Store(void * ptr,uint64_t data)4474   void Store(void* ptr, uint64_t data) {
4475     static_assert(std::is_integral_v<DataType>);
4476     CHECK(!exception_raised_);
4477     exception_raised_ = FaultyStore(ptr, sizeof(DataType), data);
4478   }
4479 
CheckShamtIsValid(int8_t shamt)4480   void CheckShamtIsValid(int8_t shamt) const {
4481     CHECK_GE(shamt, 0);
4482     CHECK_LT(shamt, 64);
4483   }
4484 
CheckShamt32IsValid(int8_t shamt)4485   void CheckShamt32IsValid(int8_t shamt) const {
4486     CHECK_GE(shamt, 0);
4487     CHECK_LT(shamt, 32);
4488   }
4489 
CheckRegIsValid(uint8_t reg)4490   void CheckRegIsValid(uint8_t reg) const {
4491     CHECK_GT(reg, 0u);
4492     CHECK_LE(reg, std::size(state_->cpu.x));
4493   }
4494 
CheckFpRegIsValid(uint8_t reg)4495   void CheckFpRegIsValid(uint8_t reg) const { CHECK_LT(reg, std::size(state_->cpu.f)); }
4496 
4497   template <typename ElementType, TailProcessing vta, auto vma, typename MaskType>
GetHighVectorArgument(Vec<intrinsics::NoInactiveProcessing{}> src,size_t,size_t,size_t index,MaskType)4498   SIMD128Register GetHighVectorArgument(Vec<intrinsics::NoInactiveProcessing{}> src,
4499                                         size_t /*vstart*/,
4500                                         size_t /*vl*/,
4501                                         size_t index,
4502                                         MaskType /*mask*/) {
4503     return std::get<0>(intrinsics::VMovTopHalfToBottom<ElementType>(
4504         SIMD128Register{state_->cpu.v[src.start_no + index]}));
4505   }
4506 
4507   template <typename ElementType, TailProcessing vta, auto vma, typename MaskType>
GetHighVectorArgument(WideVec<intrinsics::NoInactiveProcessing{}> src,size_t,size_t,size_t index,MaskType)4508   SIMD128Register GetHighVectorArgument(WideVec<intrinsics::NoInactiveProcessing{}> src,
4509                                         size_t /*vstart*/,
4510                                         size_t /*vl*/,
4511                                         size_t index,
4512                                         MaskType /*mask*/) {
4513     return SIMD128Register{state_->cpu.v[src.start_no + 2 * index + 1]};
4514   }
4515 
4516   template <typename ElementType, TailProcessing vta, auto vma, typename MaskType>
GetHighVectorArgument(ElementType arg,size_t,size_t,size_t,MaskType)4517   ElementType GetHighVectorArgument(ElementType arg,
4518                                     size_t /*vstart*/,
4519                                     size_t /*vl*/,
4520                                     size_t /*index*/,
4521                                     MaskType /*mask*/) {
4522     return arg;
4523   }
4524 
4525   template <typename ElementType, TailProcessing vta, auto vma, typename MaskType>
GetLowVectorArgument(Vec<intrinsics::NoInactiveProcessing{}> src,size_t,size_t,size_t index,MaskType)4526   SIMD128Register GetLowVectorArgument(Vec<intrinsics::NoInactiveProcessing{}> src,
4527                                        size_t /*vstart*/,
4528                                        size_t /*vl*/,
4529                                        size_t index,
4530                                        MaskType /*mask*/) {
4531     return SIMD128Register{state_->cpu.v[src.start_no + index]};
4532   }
4533 
4534   template <typename ElementType, TailProcessing vta, auto vma, typename MaskType>
GetLowVectorArgument(WideVec<intrinsics::NoInactiveProcessing{}> src,size_t,size_t,size_t index,MaskType)4535   SIMD128Register GetLowVectorArgument(WideVec<intrinsics::NoInactiveProcessing{}> src,
4536                                        size_t /*vstart*/,
4537                                        size_t /*vl*/,
4538                                        size_t index,
4539                                        MaskType /*mask*/) {
4540     return SIMD128Register{state_->cpu.v[src.start_no + 2 * index]};
4541   }
4542 
4543   template <typename ElementType, TailProcessing vta, auto vma, typename MaskType>
GetLowVectorArgument(ElementType arg,size_t,size_t,size_t,MaskType)4544   ElementType GetLowVectorArgument(ElementType arg,
4545                                    size_t /*vstart*/,
4546                                    size_t /*vl*/,
4547                                    size_t /*index*/,
4548                                    MaskType /*mask*/) {
4549     return arg;
4550   }
4551 
4552   template <typename ElementType, TailProcessing vta, auto vma, typename MaskType>
GetVectorArgument(Vec<intrinsics::NoInactiveProcessing{}> src,size_t,size_t,size_t index,MaskType)4553   SIMD128Register GetVectorArgument(Vec<intrinsics::NoInactiveProcessing{}> src,
4554                                     size_t /*vstart*/,
4555                                     size_t /*vl*/,
4556                                     size_t index,
4557                                     MaskType /*mask*/) {
4558     return SIMD128Register{state_->cpu.v[src.start_no + index]};
4559   }
4560 
4561   template <typename ElementType,
4562             TailProcessing vta,
4563             auto vma,
4564             typename MaskType,
4565             auto kDefaultElement>
GetVectorArgument(Vec<kDefaultElement> src,size_t vstart,size_t vl,size_t index,MaskType mask)4566   SIMD128Register GetVectorArgument(Vec<kDefaultElement> src,
4567                                     size_t vstart,
4568                                     size_t vl,
4569                                     size_t index,
4570                                     MaskType mask) {
4571     return VectorMasking<kDefaultElement, vta, vma>(
4572         SIMD128Register{state_->cpu.v[src.start_no + index]}, vstart, vl, index, mask);
4573   }
4574 
4575   template <typename ElementType, TailProcessing vta, auto vma, typename MaskType>
GetVectorArgument(ElementType arg,size_t,size_t,size_t,MaskType)4576   ElementType GetVectorArgument(ElementType arg,
4577                                 size_t /*vstart*/,
4578                                 size_t /*vl*/,
4579                                 size_t /*index*/,
4580                                 MaskType /*mask*/) {
4581     return arg;
4582   }
4583 
4584   template <bool kUseMasking>
4585   std::conditional_t<kUseMasking, SIMD128Register, intrinsics::NoInactiveProcessing>
GetMaskForVectorOperationsIfNeeded()4586   GetMaskForVectorOperationsIfNeeded() {
4587     if constexpr (kUseMasking) {
4588       return {state_->cpu.v[0]};
4589     } else {
4590       return intrinsics::NoInactiveProcessing{};
4591     }
4592   }
4593 
4594   template <auto vma>
4595   std::conditional_t<std::is_same_v<decltype(vma), intrinsics::NoInactiveProcessing>,
4596                      intrinsics::NoInactiveProcessing,
4597                      SIMD128Register>
GetMaskForVectorOperations()4598   GetMaskForVectorOperations() {
4599     return GetMaskForVectorOperationsIfNeeded<
4600         !std::is_same_v<decltype(vma), intrinsics::NoInactiveProcessing>>();
4601   }
4602 
4603   template <auto kDefaultElement, TailProcessing vta, auto vma, typename MaskType>
VectorMasking(SIMD128Register result,size_t vstart,size_t vl,size_t index,MaskType mask)4604   SIMD128Register VectorMasking(SIMD128Register result,
4605                                 size_t vstart,
4606                                 size_t vl,
4607                                 size_t index,
4608                                 MaskType mask) {
4609     return std::get<0>(intrinsics::VectorMasking<kDefaultElement, vta, vma>(
4610         result,
4611         vstart - index * (sizeof(SIMD128Register) / sizeof(kDefaultElement)),
4612         vl - index * (sizeof(SIMD128Register) / sizeof(kDefaultElement)),
4613         std::get<0>(
4614             intrinsics::MaskForRegisterInSequence<decltype(kDefaultElement)>(mask, index))));
4615   }
4616 
4617   template <typename ElementType, TailProcessing vta, auto vma, typename MaskType>
VectorMasking(SIMD128Register dest,SIMD128Register result,size_t vstart,size_t vl,size_t index,MaskType mask)4618   SIMD128Register VectorMasking(SIMD128Register dest,
4619                                 SIMD128Register result,
4620                                 size_t vstart,
4621                                 size_t vl,
4622                                 size_t index,
4623                                 MaskType mask) {
4624     return std::get<0>(intrinsics::VectorMasking<ElementType, vta, vma>(
4625         dest,
4626         result,
4627         vstart - index * (sizeof(SIMD128Register) / sizeof(ElementType)),
4628         vl - index * (sizeof(SIMD128Register) / sizeof(ElementType)),
4629         std::get<0>(intrinsics::MaskForRegisterInSequence<ElementType>(mask, index))));
4630   }
4631 
4632   template <typename ElementType, TailProcessing vta, auto vma, typename MaskType>
VectorMasking(SIMD128Register dest,SIMD128Register result,SIMD128Register result_mask,size_t vstart,size_t vl,size_t index,MaskType mask)4633   SIMD128Register VectorMasking(SIMD128Register dest,
4634                                 SIMD128Register result,
4635                                 SIMD128Register result_mask,
4636                                 size_t vstart,
4637                                 size_t vl,
4638                                 size_t index,
4639                                 MaskType mask) {
4640     return std::get<0>(intrinsics::VectorMasking<ElementType, vta, vma>(
4641         dest,
4642         result,
4643         result_mask,
4644         vstart - index * (sizeof(SIMD128Register) / sizeof(ElementType)),
4645         vl - index * (sizeof(SIMD128Register) / sizeof(ElementType)),
4646         std::get<0>(intrinsics::MaskForRegisterInSequence<ElementType>(mask, index))));
4647   }
4648 
4649   template <typename ElementType>
VectorZeroFill(SIMD128Register src,size_t start,size_t end,size_t index)4650   SIMD128Register VectorZeroFill(SIMD128Register src, size_t start, size_t end, size_t index) {
4651     return VectorMasking<ElementType,
4652                          TailProcessing::kUndisturbed,
4653                          intrinsics::NoInactiveProcessing{}>(
4654         src, SIMD128Register{0}, start, end, index, intrinsics::NoInactiveProcessing{});
4655   }
4656 
4657   template <template <auto> typename ProcessType,
4658             auto kLambda =
4659                 [](auto packaged_value) {
4660                   auto [unpacked_value] = packaged_value;
4661                   return unpacked_value;
4662                 },
4663             auto kDefaultValue = false,
4664             typename... Args>
4665   [[nodiscard]] static constexpr auto OrValuesOnlyForType(Args... args) {
4666     return OrResultsOnlyForType<ProcessType, kDefaultValue>(kLambda, args...);
4667   }
4668 
4669   template <template <auto> typename ProcessTemplateType,
4670             auto kDefaultValue = false,
4671             typename Lambda,
4672             typename... Args>
OrResultsOnlyForType(Lambda lambda,Args...args)4673   [[nodiscard]] static constexpr auto OrResultsOnlyForType(Lambda lambda, Args... args) {
4674 #pragma GCC diagnostic push
4675 #pragma GCC diagnostic ignored "-Wbitwise-instead-of-logical"
4676     return ([lambda](auto arg) {
4677       if constexpr (IsTypeTemplateOf<std::decay_t<decltype(arg)>, ProcessTemplateType>) {
4678         return lambda(arg);
4679       } else {
4680         return kDefaultValue;
4681       }
4682     }(args) |
4683             ...);
4684 #pragma GCC diagnostic pop
4685   }
4686 
4687   template <template <auto> typename ProcessTemplateType, typename Lambda, typename... Args>
ProcessOnlyForType(Lambda lambda,Args...args)4688   static constexpr void ProcessOnlyForType(Lambda lambda, Args... args) {
4689     (
4690         [lambda](auto arg) {
4691           if constexpr (IsTypeTemplateOf<std::decay_t<decltype(arg)>, ProcessTemplateType>) {
4692             lambda(arg);
4693           }
4694         }(args),
4695         ...);
4696   }
4697 
4698   ThreadState* state_;
4699   bool branch_taken_;
4700   // This flag is set by illegal instructions and faulted memory accesses. The former must always
4701   // stop the playback of the current instruction, so we don't need to do anything special. The
4702   // latter may result in having more operations with side effects called before the end of the
4703   // current instruction:
4704   //   Load (faulted)    -> SetReg
4705   //   LoadFp (faulted)  -> NanBoxAndSetFpReg
4706   // If an exception is raised before these operations, we skip them. For all other operations with
4707   // side-effects we check that this flag is never raised.
4708   bool exception_raised_;
4709 };
4710 
4711 #if !defined(__aarch64__)
4712 template <>
4713 [[nodiscard]] Interpreter::Register inline Interpreter::GetCsr<CsrName::kCycle>() const {
4714   return CPUClockCount();
4715 }
4716 
4717 template <>
4718 [[nodiscard]] Interpreter::Register inline Interpreter::GetCsr<CsrName::kFCsr>() const {
4719   return FeGetExceptions() | (state_->cpu.frm << 5);
4720 }
4721 
4722 template <>
4723 [[nodiscard]] Interpreter::Register inline Interpreter::GetCsr<CsrName::kFFlags>() const {
4724   return FeGetExceptions();
4725 }
4726 
4727 template <>
4728 [[nodiscard]] Interpreter::Register inline Interpreter::GetCsr<CsrName::kVlenb>() const {
4729   return 16;
4730 }
4731 
4732 template <>
4733 [[nodiscard]] Interpreter::Register inline Interpreter::GetCsr<CsrName::kVxrm>() const {
4734   return state_->cpu.*CsrFieldAddr<CsrName::kVcsr> & 0b11;
4735 }
4736 
4737 template <>
4738 [[nodiscard]] Interpreter::Register inline Interpreter::GetCsr<CsrName::kVxsat>() const {
4739   return state_->cpu.*CsrFieldAddr<CsrName::kVcsr> >> 2;
4740 }
4741 
4742 template <>
4743 void inline Interpreter::SetCsr<CsrName::kFCsr>(Register arg) {
4744   CHECK(!exception_raised_);
4745   FeSetExceptions(arg & 0b1'1111);
4746   arg = (arg >> 5) & kCsrMask<CsrName::kFrm>;
4747   state_->cpu.frm = arg;
4748   FeSetRound(arg);
4749 }
4750 
4751 template <>
4752 void inline Interpreter::SetCsr<CsrName::kFFlags>(Register arg) {
4753   CHECK(!exception_raised_);
4754   FeSetExceptions(arg & 0b1'1111);
4755 }
4756 
4757 template <>
4758 void inline Interpreter::SetCsr<CsrName::kFrm>(Register arg) {
4759   CHECK(!exception_raised_);
4760   arg &= kCsrMask<CsrName::kFrm>;
4761   state_->cpu.frm = arg;
4762   FeSetRound(arg);
4763 }
4764 
4765 template <>
4766 void inline Interpreter::SetCsr<CsrName::kVxrm>(Register arg) {
4767   CHECK(!exception_raised_);
4768   state_->cpu.*CsrFieldAddr<CsrName::kVcsr> =
4769       (state_->cpu.*CsrFieldAddr<CsrName::kVcsr> & 0b100) | (arg & 0b11);
4770 }
4771 
4772 template <>
4773 void inline Interpreter::SetCsr<CsrName::kVxsat>(Register arg) {
4774   CHECK(!exception_raised_);
4775   state_->cpu.*CsrFieldAddr<CsrName::kVcsr> =
4776       (state_->cpu.*CsrFieldAddr<CsrName::kVcsr> & 0b11) | ((arg & 0b1) << 2);
4777 }
4778 
4779 #endif
4780 
4781 template <>
4782 [[nodiscard]] Interpreter::FpRegister inline Interpreter::GetFRegAndUnboxNan<Interpreter::Float32>(
4783     uint8_t reg) {
4784 #if defined(__aarch64__)
4785   UNUSED(reg);
4786   Interpreter::Undefined();
4787   return {};
4788 #else
4789   CheckFpRegIsValid(reg);
4790   FpRegister value = state_->cpu.f[reg];
4791   return UnboxNan<Float32>(value);
4792 #endif
4793 }
4794 
4795 template <>
4796 [[nodiscard]] Interpreter::FpRegister inline Interpreter::GetFRegAndUnboxNan<Interpreter::Float64>(
4797     uint8_t reg) {
4798 #if defined(__aarch64__)
4799   UNUSED(reg);
4800   Interpreter::Undefined();
4801   return {};
4802 #else
4803   CheckFpRegIsValid(reg);
4804   return state_->cpu.f[reg];
4805 #endif
4806 }
4807 
4808 template <>
4809 void inline Interpreter::NanBoxAndSetFpReg<Interpreter::Float32>(uint8_t reg, FpRegister value) {
4810   if (exception_raised_) {
4811     // Do not produce side effects.
4812     return;
4813   }
4814   CheckFpRegIsValid(reg);
4815   state_->cpu.f[reg] = NanBox<Float32>(value);
4816 }
4817 
4818 template <>
4819 void inline Interpreter::NanBoxAndSetFpReg<Interpreter::Float64>(uint8_t reg, FpRegister value) {
4820   if (exception_raised_) {
4821     // Do not produce side effects.
4822     return;
4823   }
4824   CheckFpRegIsValid(reg);
4825   state_->cpu.f[reg] = value;
4826 }
4827 
4828 #ifdef BERBERIS_RISCV64_INTERPRETER_SEPARATE_INSTANTIATION_OF_VECTOR_OPERATIONS
4829 template <>
4830 extern void SemanticsPlayer<Interpreter>::OpVector(const Decoder::VLoadIndexedArgs& args);
4831 template <>
4832 extern void SemanticsPlayer<Interpreter>::OpVector(const Decoder::VLoadStrideArgs& args);
4833 template <>
4834 extern void SemanticsPlayer<Interpreter>::OpVector(const Decoder::VLoadUnitStrideArgs& args);
4835 template <>
4836 extern void SemanticsPlayer<Interpreter>::OpVector(const Decoder::VOpFVfArgs& args);
4837 template <>
4838 extern void SemanticsPlayer<Interpreter>::OpVector(const Decoder::VOpFVvArgs& args);
4839 template <>
4840 extern void SemanticsPlayer<Interpreter>::OpVector(const Decoder::VOpIViArgs& args);
4841 template <>
4842 extern void SemanticsPlayer<Interpreter>::OpVector(const Decoder::VOpIVvArgs& args);
4843 template <>
4844 extern void SemanticsPlayer<Interpreter>::OpVector(const Decoder::VOpIVxArgs& args);
4845 template <>
4846 extern void SemanticsPlayer<Interpreter>::OpVector(const Decoder::VOpMVvArgs& args);
4847 template <>
4848 extern void SemanticsPlayer<Interpreter>::OpVector(const Decoder::VOpMVxArgs& args);
4849 template <>
4850 extern void SemanticsPlayer<Interpreter>::OpVector(const Decoder::VStoreIndexedArgs& args);
4851 template <>
4852 extern void SemanticsPlayer<Interpreter>::OpVector(const Decoder::VStoreStrideArgs& args);
4853 template <>
4854 extern void SemanticsPlayer<Interpreter>::OpVector(const Decoder::VStoreUnitStrideArgs& args);
4855 #endif
4856 
4857 }  // namespace berberis
4858