1 /*
2 * Copyright (C) 2023 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include "berberis/interpreter/riscv64/interpreter.h"
18
19 #include <atomic>
20 #include <cfenv>
21 #include <cstdint>
22 #include <cstring>
23
24 #include "berberis/base/bit_util.h"
25 #include "berberis/base/checks.h"
26 #include "berberis/base/macros.h"
27 #include "berberis/decoder/riscv64/decoder.h"
28 #include "berberis/decoder/riscv64/semantics_player.h"
29 #include "berberis/guest_state/guest_addr.h"
30 #include "berberis/guest_state/guest_state.h"
31 #include "berberis/intrinsics/guest_cpu_flags.h" // ToHostRoundingMode
32 #include "berberis/intrinsics/intrinsics.h"
33 #include "berberis/intrinsics/riscv64_to_all/vector_intrinsics.h"
34 #include "berberis/intrinsics/simd_register.h"
35 #include "berberis/intrinsics/type_traits.h"
36 #include "berberis/kernel_api/run_guest_syscall.h"
37 #include "berberis/runtime_primitives/memory_region_reservation.h"
38
39 #if !defined(__aarch64__)
40 #include "berberis/intrinsics/intrinsics_float.h"
41 #include "berberis/runtime_primitives/interpret_helpers.h"
42 #include "berberis/runtime_primitives/recovery_code.h"
43 #endif
44
45 #include "regs.h"
46
47 #include "../faulty_memory_accesses.h"
48
49 namespace berberis {
50
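// Maps the RISC-V acquire/release bits of A-extension instructions to the closest
// std::memory_order. For example (illustrative): lr.d.aq maps to std::memory_order_acquire,
// sc.d.aqrl to std::memory_order_acq_rel, and a plain lr.d to std::memory_order_relaxed.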
51 inline constexpr std::memory_order AqRlToStdMemoryOrder(bool aq, bool rl) {
52 if (aq) {
53 return rl ? std::memory_order_acq_rel : std::memory_order_acquire;
54 } else {
55 return rl ? std::memory_order_release : std::memory_order_relaxed;
56 }
57 }
58
59 template <typename ConcreteType, template <auto> typename TemplateType>
60 inline constexpr bool IsTypeTemplateOf = false;
61
62 template <template <auto> typename TemplateType, auto Value>
63 inline constexpr bool IsTypeTemplateOf<TemplateType<Value>, TemplateType> = true;
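// Usage sketch (illustrative; `Tag` is a hypothetical template, not from this file): given
// `template <auto kValue> struct Tag {};`, IsTypeTemplateOf<Tag<1>, Tag> is true while
// IsTypeTemplateOf<int, Tag> is false.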
64
65 class Interpreter {
66 public:
67 using CsrName = berberis::CsrName;
68 using Decoder = Decoder<SemanticsPlayer<Interpreter>>;
69 using Register = uint64_t;
70 static constexpr Register no_register = 0;
71 using FpRegister = uint64_t;
72 static constexpr FpRegister no_fp_register = 0;
73 using Float32 = intrinsics::Float32;
74 using Float64 = intrinsics::Float64;
75
76 explicit Interpreter(ThreadState* state)
77 : state_(state), branch_taken_(false), exception_raised_(false) {}
78
79 //
80 // Instruction implementations.
81 //
82
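// Illustrative note (not from the original source): kCsrrs returns csr with the bits of arg
// set, kCsrrc returns csr with the bits of arg cleared. E.g. UpdateCsr(kCsrrs, 0, csr) leaves
// the CSR value unchanged, which matches how plain CSR reads (csrrs rd, csr, x0) behave.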
83 Register UpdateCsr(Decoder::CsrOpcode opcode, Register arg, Register csr) {
84 switch (opcode) {
85 case Decoder::CsrOpcode::kCsrrs:
86 return arg | csr;
87 case Decoder::CsrOpcode::kCsrrc:
88 return ~arg & csr;
89 default:
90 Undefined();
91 return {};
92 }
93 }
94
95 Register UpdateCsr(Decoder::CsrImmOpcode opcode, uint8_t imm, Register csr) {
96 return UpdateCsr(static_cast<Decoder::CsrOpcode>(opcode), imm, csr);
97 }
98
99 #if defined(__aarch64__)
100 void Fence(Decoder::FenceOpcode /*opcode*/,
101 Register /*src*/,
102 bool sw,
103 bool sr,
104 bool /*so*/,
105 bool /*si*/,
106 bool pw,
107 bool pr,
108 bool /*po*/,
109 bool /*pi*/) {
110 bool read_fence = sr | pr;
111 bool write_fence = sw | pw;
112 // "ish" is for inner shareable access, which is normally needed by userspace programs.
113 if (read_fence) {
114 if (write_fence) {
115 // This is equivalent to "fence rw,rw".
116 asm volatile("dmb ish" ::: "memory");
117 } else {
118 // "ishld" is equivalent to "fence r,rw", which is stronger than what we need here
119 // ("fence r,r"). However, it is the closet option that ARM offers.
120 asm volatile("dmb ishld" ::: "memory");
121 }
122 } else if (write_fence) {
123 // "st" is equivalent to "fence w,w".
124 asm volatile("dmb ishst" ::: "memory");
125 }
126 return;
127 }
128 #else
129 // Note: we prefer not to use C11/C++ atomic_thread_fence or even the gcc/clang builtin
130 // __atomic_thread_fence because all these functions rely on the fact that the compiler never
131 // uses non-temporal loads and stores and only issues “mfence” when sequentially consistent
132 // ordering is requested. They never issue “lfence” or “sfence”.
133 // Instead we take a page from the Linux kernel's book and map read ordering to “lfence”, write
134 // ordering to “sfence” and read-write ordering to “mfence”.
135 // This can become important in the future if we start using non-temporal moves in manually
136 // created assembly code.
137 // Ordering affecting I/O devices is not relevant to user-space code, thus we just ignore the
138 // bits related to device I/O.
139 void Fence(Decoder::FenceOpcode /*opcode*/,
140 Register /*src*/,
141 bool sw,
142 bool sr,
143 bool /*so*/,
144 bool /*si*/,
145 bool pw,
146 bool pr,
147 bool /*po*/,
148 bool /*pi*/) {
149 bool read_fence = sr | pr;
150 bool write_fence = sw | pw;
151 // Two types of fences (total store ordering fence and normal fence) are supposed to be
152 // processed differently, but only for the “read_fence && write_fence” case (otherwise the total
153 // store ordering fence becomes a normal fence for “forward compatibility”). Yet because x86
154 // doesn't distinguish between these two types of fences, and since we are supposed to map all
155 // not-yet-defined fences to the normal fence (again, for “forward compatibility”), it's OK to
156 // just ignore the opcode field.
157 if (read_fence) {
158 if (write_fence) {
159 asm volatile("mfence" ::: "memory");
160 } else {
161 asm volatile("lfence" ::: "memory");
162 }
163 } else if (write_fence) {
164 asm volatile("sfence" ::: "memory");
165 }
166 return;
167 }
168 #endif
169
170 template <typename IntType, bool aq, bool rl>
171 Register Lr(int64_t addr) {
172 static_assert(std::is_integral_v<IntType>, "Lr: IntType must be integral");
173 static_assert(std::is_signed_v<IntType>, "Lr: IntType must be signed");
174 CHECK(!exception_raised_);
175 // Address must be aligned on size of IntType.
176 CHECK((addr % sizeof(IntType)) == 0ULL);
177 return MemoryRegionReservation::Load<IntType>(&state_->cpu, addr, AqRlToStdMemoryOrder(aq, rl));
178 }
179
180 template <typename IntType, bool aq, bool rl>
181 Register Sc(int64_t addr, IntType val) {
182 static_assert(std::is_integral_v<IntType>, "Sc: IntType must be integral");
183 static_assert(std::is_signed_v<IntType>, "Sc: IntType must be signed");
184 CHECK(!exception_raised_);
185 // Address must be aligned on size of IntType.
186 CHECK((addr % sizeof(IntType)) == 0ULL);
187 return static_cast<Register>(MemoryRegionReservation::Store<IntType>(
188 &state_->cpu, addr, val, AqRlToStdMemoryOrder(aq, rl)));
189 }
190
191 Register Op(Decoder::OpOpcode opcode, Register arg1, Register arg2) {
192 switch (opcode) {
193 case Decoder::OpOpcode::kAdd:
194 return Int64(arg1) + Int64(arg2);
195 case Decoder::OpOpcode::kSub:
196 return Int64(arg1) - Int64(arg2);
197 case Decoder::OpOpcode::kAnd:
198 return Int64(arg1) & Int64(arg2);
199 case Decoder::OpOpcode::kOr:
200 return Int64(arg1) | Int64(arg2);
201 case Decoder::OpOpcode::kXor:
202 return Int64(arg1) ^ Int64(arg2);
203 case Decoder::OpOpcode::kSll:
204 return Int64(arg1) << Int64(arg2);
205 case Decoder::OpOpcode::kSrl:
206 return UInt64(arg1) >> Int64(arg2);
207 case Decoder::OpOpcode::kSra:
208 return Int64(arg1) >> Int64(arg2);
209 case Decoder::OpOpcode::kSlt:
210 return Int64(arg1) < Int64(arg2) ? 1 : 0;
211 case Decoder::OpOpcode::kSltu:
212 return UInt64(arg1) < UInt64(arg2) ? 1 : 0;
213 #if !defined(__aarch64__)
214 case Decoder::OpOpcode::kMul:
215 return Int64(arg1) * Int64(arg2);
216 case Decoder::OpOpcode::kMulh:
217 return NarrowTopHalf(Widen(Int64(arg1)) * Widen(Int64(arg2)));
218 case Decoder::OpOpcode::kMulhsu:
219 return NarrowTopHalf(Widen(Int64(arg1)) * BitCastToSigned(Widen(UInt64(arg2))));
220 case Decoder::OpOpcode::kMulhu:
221 return NarrowTopHalf(Widen(UInt64(arg1)) * Widen(UInt64(arg2)));
222 #endif
223 case Decoder::OpOpcode::kAndn:
224 return Int64(arg1) & (~Int64(arg2));
225 case Decoder::OpOpcode::kOrn:
226 return Int64(arg1) | (~Int64(arg2));
227 case Decoder::OpOpcode::kXnor:
228 return ~(Int64(arg1) ^ Int64(arg2));
229 default:
230 Undefined();
231 return {};
232 }
233 }
234
235 Register Op32(Decoder::Op32Opcode opcode, Register arg1, Register arg2) {
236 #if defined(__aarch64__)
237 UNUSED(opcode, arg1, arg2);
238 Undefined();
239 return {};
240 #else
241 switch (opcode) {
242 case Decoder::Op32Opcode::kAddw:
243 return Widen(TruncateTo<Int32>(arg1) + TruncateTo<Int32>(arg2));
244 case Decoder::Op32Opcode::kSubw:
245 return Widen(TruncateTo<Int32>(arg1) - TruncateTo<Int32>(arg2));
246 case Decoder::Op32Opcode::kSllw:
247 return Widen(TruncateTo<Int32>(arg1) << TruncateTo<Int32>(arg2));
248 case Decoder::Op32Opcode::kSrlw:
249 return Widen(BitCastToSigned(TruncateTo<UInt32>(arg1) >> TruncateTo<Int32>(arg2)));
250 case Decoder::Op32Opcode::kSraw:
251 return Widen(TruncateTo<Int32>(arg1) >> TruncateTo<Int32>(arg2));
252 case Decoder::Op32Opcode::kMulw:
253 return Widen(TruncateTo<Int32>(arg1) * TruncateTo<Int32>(arg2));
254 default:
255 Undefined();
256 return {};
257 }
258 #endif
259 }
260
261 Register Load(Decoder::LoadOperandType operand_type, Register arg, int16_t offset) {
262 void* ptr = ToHostAddr<void>(arg + offset);
263 switch (operand_type) {
264 case Decoder::LoadOperandType::k8bitUnsigned:
265 return Load<uint8_t>(ptr);
266 case Decoder::LoadOperandType::k16bitUnsigned:
267 return Load<uint16_t>(ptr);
268 case Decoder::LoadOperandType::k32bitUnsigned:
269 return Load<uint32_t>(ptr);
270 case Decoder::LoadOperandType::k64bit:
271 return Load<uint64_t>(ptr);
272 case Decoder::LoadOperandType::k8bitSigned:
273 return Load<int8_t>(ptr);
274 case Decoder::LoadOperandType::k16bitSigned:
275 return Load<int16_t>(ptr);
276 case Decoder::LoadOperandType::k32bitSigned:
277 return Load<int32_t>(ptr);
278 default:
279 Undefined();
280 return {};
281 }
282 }
283
284 template <typename DataType>
285 FpRegister LoadFp(Register arg, int16_t offset) {
286 #if defined(__aarch64__)
287 UNUSED(arg, offset);
288 Undefined();
289 return {};
290 #else
291 static_assert(std::is_same_v<DataType, Float32> || std::is_same_v<DataType, Float64>);
292 CHECK(!exception_raised_);
293 DataType* ptr = ToHostAddr<DataType>(arg + offset);
294 FaultyLoadResult result = FaultyLoad(ptr, sizeof(DataType));
295 if (result.is_fault) {
296 exception_raised_ = true;
297 return {};
298 }
299 return result.value;
300 #endif
301 }
302
303 Register OpImm(Decoder::OpImmOpcode opcode, Register arg, int16_t imm) {
304 switch (opcode) {
305 case Decoder::OpImmOpcode::kAddi:
306 return arg + int64_t{imm};
307 case Decoder::OpImmOpcode::kSlti:
308 return bit_cast<int64_t>(arg) < int64_t{imm} ? 1 : 0;
309 case Decoder::OpImmOpcode::kSltiu:
310 return arg < bit_cast<uint64_t>(int64_t{imm}) ? 1 : 0;
311 case Decoder::OpImmOpcode::kXori:
312 return arg ^ int64_t{imm};
313 case Decoder::OpImmOpcode::kOri:
314 return arg | int64_t{imm};
315 case Decoder::OpImmOpcode::kAndi:
316 return arg & int64_t{imm};
317 default:
318 Undefined();
319 return {};
320 }
321 }
322
323 Register Lui(int32_t imm) { return int64_t{imm}; }
324
325 Register Auipc(int32_t imm) {
326 uint64_t pc = state_->cpu.insn_addr;
327 return pc + int64_t{imm};
328 }
329
330 Register OpImm32(Decoder::OpImm32Opcode opcode, Register arg, int16_t imm) {
331 #if defined(__aarch64__)
332 UNUSED(opcode, arg, imm);
333 Undefined();
334 return {};
335 #else
336 switch (opcode) {
337 case Decoder::OpImm32Opcode::kAddiw:
338 return int32_t(arg) + int32_t{imm};
339 default:
340 Undefined();
341 return {};
342 }
343 #endif
344 }
345
346 // TODO(b/232598137): rework ecall to not take parameters explicitly.
347 Register Ecall(Register /* syscall_nr */,
348 Register /* arg0 */,
349 Register /* arg1 */,
350 Register /* arg2 */,
351 Register /* arg3 */,
352 Register /* arg4 */,
353 Register /* arg5 */) {
354 CHECK(!exception_raised_);
355 RunGuestSyscall(state_);
356 return state_->cpu.x[A0];
357 }
358
359 Register Slli(Register arg, int8_t imm) { return arg << imm; }
360
361 Register Srli(Register arg, int8_t imm) { return arg >> imm; }
362
363 Register Srai(Register arg, int8_t imm) { return bit_cast<int64_t>(arg) >> imm; }
364
365 Register ShiftImm32(Decoder::ShiftImm32Opcode opcode, Register arg, uint16_t imm) {
366 #if defined(__aarch64__)
367 UNUSED(opcode, arg, imm);
368 Undefined();
369 return {};
370 #else
371 switch (opcode) {
372 case Decoder::ShiftImm32Opcode::kSlliw:
373 return int32_t(arg) << int32_t{imm};
374 case Decoder::ShiftImm32Opcode::kSrliw:
375 return bit_cast<int32_t>(uint32_t(arg) >> uint32_t{imm});
376 case Decoder::ShiftImm32Opcode::kSraiw:
377 return int32_t(arg) >> int32_t{imm};
378 default:
379 Undefined();
380 return {};
381 }
382 #endif
383 }
384
385 Register Rori(Register arg, int8_t shamt) {
386 CheckShamtIsValid(shamt);
387 return (((uint64_t(arg) >> shamt)) | (uint64_t(arg) << (64 - shamt)));
388 }
389
390 Register Roriw(Register arg, int8_t shamt) {
391 #if defined(__aarch64__)
392 UNUSED(arg, shamt);
393 Undefined();
394 return {};
395 #else
396 CheckShamt32IsValid(shamt);
397 return int32_t(((uint32_t(arg) >> shamt)) | (uint32_t(arg) << (32 - shamt)));
398 #endif
399 }
400
401 void Store(Decoder::MemoryDataOperandType operand_type,
402 Register arg,
403 int16_t offset,
404 Register data) {
405 void* ptr = ToHostAddr<void>(arg + offset);
406 switch (operand_type) {
407 case Decoder::MemoryDataOperandType::k8bit:
408 Store<uint8_t>(ptr, data);
409 break;
410 case Decoder::MemoryDataOperandType::k16bit:
411 Store<uint16_t>(ptr, data);
412 break;
413 case Decoder::MemoryDataOperandType::k32bit:
414 Store<uint32_t>(ptr, data);
415 break;
416 case Decoder::MemoryDataOperandType::k64bit:
417 Store<uint64_t>(ptr, data);
418 break;
419 default:
420 return Undefined();
421 }
422 }
423
424 template <typename DataType>
425 void StoreFp(Register arg, int16_t offset, FpRegister data) {
426 #if defined(__aarch64__)
427 UNUSED(arg, offset, data);
428 Undefined();
429 #else
430 static_assert(std::is_same_v<DataType, Float32> || std::is_same_v<DataType, Float64>);
431 CHECK(!exception_raised_);
432 DataType* ptr = ToHostAddr<DataType>(arg + offset);
433 exception_raised_ = FaultyStore(ptr, sizeof(DataType), data);
434 #endif
435 }
436
437 void CompareAndBranch(Decoder::BranchOpcode opcode,
438 Register arg1,
439 Register arg2,
440 int16_t offset) {
441 bool cond_value;
442 switch (opcode) {
443 case Decoder::BranchOpcode::kBeq:
444 cond_value = arg1 == arg2;
445 break;
446 case Decoder::BranchOpcode::kBne:
447 cond_value = arg1 != arg2;
448 break;
449 case Decoder::BranchOpcode::kBltu:
450 cond_value = arg1 < arg2;
451 break;
452 case Decoder::BranchOpcode::kBgeu:
453 cond_value = arg1 >= arg2;
454 break;
455 case Decoder::BranchOpcode::kBlt:
456 cond_value = bit_cast<int64_t>(arg1) < bit_cast<int64_t>(arg2);
457 break;
458 case Decoder::BranchOpcode::kBge:
459 cond_value = bit_cast<int64_t>(arg1) >= bit_cast<int64_t>(arg2);
460 break;
461 default:
462 return Undefined();
463 }
464
465 if (cond_value) {
466 Branch(offset);
467 }
468 }
469
470 void Branch(int32_t offset) {
471 CHECK(!exception_raised_);
472 state_->cpu.insn_addr += offset;
473 branch_taken_ = true;
474 }
475
476 void BranchRegister(Register base, int16_t offset) {
477 CHECK(!exception_raised_);
478 state_->cpu.insn_addr = (base + offset) & ~uint64_t{1};
479 branch_taken_ = true;
480 }
481
482 FpRegister Fmv(FpRegister arg) { return arg; }
483
484 //
485 // V extensions.
486 //
487
488 using TailProcessing = intrinsics::TailProcessing;
489 using InactiveProcessing = intrinsics::InactiveProcessing;
490
491 enum class VectorSelectElementWidth {
492 k8bit = 0b000,
493 k16bit = 0b001,
494 k32bit = 0b010,
495 k64bit = 0b011,
496 kMaxValue = 0b111,
497 };
498
499 enum class VectorRegisterGroupMultiplier {
500 k1register = 0b000,
501 k2registers = 0b001,
502 k4registers = 0b010,
503 k8registers = 0b011,
504 kEigthOfRegister = 0b101,
505 kQuarterOfRegister = 0b110,
506 kHalfOfRegister = 0b111,
507 kMaxValue = 0b111,
508 };
509
510 static constexpr size_t NumberOfRegistersInvolved(VectorRegisterGroupMultiplier vlmul) {
511 switch (vlmul) {
512 case VectorRegisterGroupMultiplier::k2registers:
513 return 2;
514 case VectorRegisterGroupMultiplier::k4registers:
515 return 4;
516 case VectorRegisterGroupMultiplier::k8registers:
517 return 8;
518 default:
519 return 1;
520 }
521 }
522
523 static constexpr size_t NumRegistersInvolvedForWideOperand(VectorRegisterGroupMultiplier vlmul) {
524 switch (vlmul) {
525 case VectorRegisterGroupMultiplier::k1register:
526 return 2;
527 case VectorRegisterGroupMultiplier::k2registers:
528 return 4;
529 case VectorRegisterGroupMultiplier::k4registers:
530 return 8;
531 default:
532 return 1;
533 }
534 }
535
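// Illustrative note (not from the original source): the VLMAX computed below is
// (VLEN / SEW) * LMUL with VLEN fixed at 128 bits. For example, ElementType = UInt16
// (8 elements per register) with vlmul = k4registers gives VLMAX = 32, while
// vlmul = kQuarterOfRegister gives VLMAX = 2.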
536 template <typename ElementType, VectorRegisterGroupMultiplier vlmul>
537 static constexpr size_t GetVlmax() {
538 constexpr size_t kElementsCount = sizeof(SIMD128Register) / sizeof(ElementType);
539 switch (vlmul) {
540 case VectorRegisterGroupMultiplier::k1register:
541 return kElementsCount;
542 case VectorRegisterGroupMultiplier::k2registers:
543 return 2 * kElementsCount;
544 case VectorRegisterGroupMultiplier::k4registers:
545 return 4 * kElementsCount;
546 case VectorRegisterGroupMultiplier::k8registers:
547 return 8 * kElementsCount;
548 case VectorRegisterGroupMultiplier::kEigthOfRegister:
549 return kElementsCount / 8;
550 case VectorRegisterGroupMultiplier::kQuarterOfRegister:
551 return kElementsCount / 4;
552 case VectorRegisterGroupMultiplier::kHalfOfRegister:
553 return kElementsCount / 2;
554 default:
555 return 0;
556 }
557 }
558
559 template <typename VOpArgs, typename... ExtraArgs>
560 void OpVector(const VOpArgs& args, ExtraArgs... extra_args) {
561 // Note: whole register instructions are not dependent on vtype and are supposed to work even
562 // if vill is set! Handle them before processing other instructions.
563 // Note: other types of loads and stores are not special and are processed as usual.
564 // TODO(khim): Handle vstart properly.
565 if constexpr (std::is_same_v<VOpArgs, Decoder::VLoadUnitStrideArgs>) {
566 if (args.opcode == Decoder::VLUmOpOpcode::kVlXreXX) {
567 if (!IsPowerOf2(args.nf + 1)) {
568 return Undefined();
569 }
570 if ((args.dst & args.nf) != 0) {
571 return Undefined();
572 }
573 auto [src] = std::tuple{extra_args...};
574 __uint128_t* ptr = bit_cast<__uint128_t*>(src);
575 for (size_t index = 0; index <= args.nf; index++) {
576 state_->cpu.v[args.dst + index] = ptr[index];
577 }
578 return;
579 }
580 }
581
582 if constexpr (std::is_same_v<VOpArgs, Decoder::VStoreUnitStrideArgs>) {
583 if (args.opcode == Decoder::VSUmOpOpcode::kVsX) {
584 if (args.width != Decoder::MemoryDataOperandType::k8bit) {
585 return Undefined();
586 }
587 if (!IsPowerOf2(args.nf + 1)) {
588 return Undefined();
589 }
590 if ((args.data & args.nf) != 0) {
591 return Undefined();
592 }
593 auto [src] = std::tuple{extra_args...};
594 __uint128_t* ptr = bit_cast<__uint128_t*>(src);
595 for (size_t index = 0; index <= args.nf; index++) {
596 ptr[index] = state_->cpu.v[args.data + index];
597 }
598 return;
599 }
600 }
601
602 // The RISC-V V extension uses the 8-bit “opcode extension” vtype CSR to make sure the 32-bit
603 // encoding remains usable.
604 //
605 // Great care is taken to ensure that vector code rarely needs to change the vtype CSR (e.g.
606 // there are special mask instructions which allow one to manipulate masks without the need
607 // to change the CPU mode).
608 //
609 // Currently we don't have support for multiple CPU modes in Berberis, thus we can only handle
610 // these instructions in the interpreter.
611 //
612 // TODO(b/300690740): develop and implement strategy which would allow us to support vector
613 // intrinsics not just in the interpreter. Move code from this function to semantics player.
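// Rough sketch of the vtype layout as decoded by the checks below (illustrative summary):
// bits [2:0] hold vlmul, bits [5:3] hold vsew, bit 6 is vta, bit 7 is vma, and the sign bit
// (XLEN-1) is vill. E.g. vtype = 0b1'001'001 selects SEW=16, LMUL=2 with tail-agnostic,
// mask-undisturbed handling.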
614 Register vtype = GetCsr<CsrName::kVtype>();
615 if (static_cast<std::make_signed_t<Register>>(vtype) < 0) {
616 return Undefined();
617 }
618 if constexpr (std::is_same_v<VOpArgs, Decoder::VLoadIndexedArgs> ||
619 std::is_same_v<VOpArgs, Decoder::VLoadStrideArgs> ||
620 std::is_same_v<VOpArgs, Decoder::VLoadUnitStrideArgs> ||
621 std::is_same_v<VOpArgs, Decoder::VStoreIndexedArgs> ||
622 std::is_same_v<VOpArgs, Decoder::VStoreStrideArgs> ||
623 std::is_same_v<VOpArgs, Decoder::VStoreUnitStrideArgs>) {
624 switch (args.width) {
625 case Decoder::MemoryDataOperandType::k8bit:
626 return OpVector<UInt8>(args, vtype, extra_args...);
627 case Decoder::MemoryDataOperandType::k16bit:
628 return OpVector<UInt16>(args, vtype, extra_args...);
629 case Decoder::MemoryDataOperandType::k32bit:
630 return OpVector<UInt32>(args, vtype, extra_args...);
631 case Decoder::MemoryDataOperandType::k64bit:
632 return OpVector<UInt64>(args, vtype, extra_args...);
633 default:
634 return Undefined();
635 }
636 } else {
637 VectorRegisterGroupMultiplier vlmul = static_cast<VectorRegisterGroupMultiplier>(vtype & 0x7);
638 if constexpr (std::is_same_v<VOpArgs, Decoder::VOpFVfArgs> ||
639 std::is_same_v<VOpArgs, Decoder::VOpFVvArgs>) {
640 switch (static_cast<VectorSelectElementWidth>((vtype >> 3) & 0b111)) {
641 case VectorSelectElementWidth::k16bit:
642 if constexpr (sizeof...(extra_args) == 0) {
643 return OpVector<intrinsics::Float16>(args, vlmul, vtype);
644 } else {
645 return Undefined();
646 }
647 case VectorSelectElementWidth::k32bit:
648 return OpVector<Float32>(
649 args,
650 vlmul,
651 vtype,
652 std::get<0>(intrinsics::UnboxNan<Float32>(bit_cast<Float64>(extra_args)))...);
653 case VectorSelectElementWidth::k64bit:
654 // Note: if arguments are 64bit floats then we don't need to do any unboxing.
655 return OpVector<Float64>(args, vlmul, vtype, bit_cast<Float64>(extra_args)...);
656 default:
657 return Undefined();
658 }
659 } else {
660 switch (static_cast<VectorSelectElementWidth>((vtype >> 3) & 0b111)) {
661 case VectorSelectElementWidth::k8bit:
662 return OpVector<UInt8>(args, vlmul, vtype, extra_args...);
663 case VectorSelectElementWidth::k16bit:
664 return OpVector<UInt16>(args, vlmul, vtype, extra_args...);
665 case VectorSelectElementWidth::k32bit:
666 return OpVector<UInt32>(args, vlmul, vtype, extra_args...);
667 case VectorSelectElementWidth::k64bit:
668 return OpVector<UInt64>(args, vlmul, vtype, extra_args...);
669 default:
670 return Undefined();
671 }
672 }
673 }
674 }
675
676 template <typename ElementType, typename VOpArgs, typename... ExtraArgs>
677 void OpVector(const VOpArgs& args, Register vtype, ExtraArgs... extra_args) {
678 auto vemul = Decoder::SignExtend<3>(vtype & 0b111);
679 vemul -= ((vtype >> 3) & 0b111); // Divide by SEW.
680 vemul +=
681 static_cast<std::underlying_type_t<decltype(args.width)>>(args.width); // Multiply by EEW.
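// Example (illustrative, assuming the usual log2 encodings of LMUL, SEW and EEW): LMUL=1
// (vlmul=0b000), SEW=32 (vsew=0b010) and EEW=8 (width=0b000) give vemul = 0 - 2 + 0 = -2,
// i.e. an effective EMUL of 1/4.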
682 if (vemul < -3 || vemul > 3) [[unlikely]] {
683 return Undefined();
684 }
685 // Note: whole register loads and stores treat args.nf differently, but they are processed
686 // separately above anyway, because they also ignore vtype and all the information in it!
687 // For other loads and stores the affected number of registers (EMUL * NF) must be 8 or less.
688 if ((vemul > 0) && ((args.nf + 1) * (1 << vemul) > 8)) {
689 return Undefined();
690 }
691 return OpVector<ElementType>(
692 args, static_cast<VectorRegisterGroupMultiplier>(vemul & 0b111), vtype, extra_args...);
693 }
694
695 template <typename ElementType, typename VOpArgs, typename... ExtraArgs>
696 void OpVector(const VOpArgs& args,
697 VectorRegisterGroupMultiplier vlmul,
698 Register vtype,
699 ExtraArgs... extra_args) {
700 switch (vlmul) {
701 case VectorRegisterGroupMultiplier::k1register:
702 return OpVector<ElementType, VectorRegisterGroupMultiplier::k1register>(
703 args, vtype, extra_args...);
704 case VectorRegisterGroupMultiplier::k2registers:
705 return OpVector<ElementType, VectorRegisterGroupMultiplier::k2registers>(
706 args, vtype, extra_args...);
707 case VectorRegisterGroupMultiplier::k4registers:
708 return OpVector<ElementType, VectorRegisterGroupMultiplier::k4registers>(
709 args, vtype, extra_args...);
710 case VectorRegisterGroupMultiplier::k8registers:
711 return OpVector<ElementType, VectorRegisterGroupMultiplier::k8registers>(
712 args, vtype, extra_args...);
713 case VectorRegisterGroupMultiplier::kEigthOfRegister:
714 return OpVector<ElementType, VectorRegisterGroupMultiplier::kEigthOfRegister>(
715 args, vtype, extra_args...);
716 case VectorRegisterGroupMultiplier::kQuarterOfRegister:
717 return OpVector<ElementType, VectorRegisterGroupMultiplier::kQuarterOfRegister>(
718 args, vtype, extra_args...);
719 case VectorRegisterGroupMultiplier::kHalfOfRegister:
720 return OpVector<ElementType, VectorRegisterGroupMultiplier::kHalfOfRegister>(
721 args, vtype, extra_args...);
722 default:
723 return Undefined();
724 }
725 }
726
727 template <typename ElementType,
728 VectorRegisterGroupMultiplier vlmul,
729 typename VOpArgs,
730 typename... ExtraArgs>
731 void OpVector(const VOpArgs& args, Register vtype, ExtraArgs... extra_args) {
732 if (args.vm) {
733 return OpVector<ElementType, vlmul, intrinsics::NoInactiveProcessing{}>(
734 args, vtype, extra_args...);
735 }
736 if (vtype >> 7) {
737 return OpVector<ElementType, vlmul, InactiveProcessing::kAgnostic>(
738 args, vtype, extra_args...);
739 }
740 return OpVector<ElementType, vlmul, InactiveProcessing::kUndisturbed>(
741 args, vtype, extra_args...);
742 }
743
744 template <typename ElementType,
745 VectorRegisterGroupMultiplier vlmul,
746 auto vma,
747 typename VOpArgs,
748 typename... ExtraArgs>
749 void OpVector(const VOpArgs& args, Register vtype, ExtraArgs... extra_args) {
750 if constexpr (std::is_same_v<VOpArgs, Decoder::VLoadIndexedArgs> ||
751 std::is_same_v<VOpArgs, Decoder::VLoadStrideArgs> ||
752 std::is_same_v<VOpArgs, Decoder::VLoadUnitStrideArgs> ||
753 std::is_same_v<VOpArgs, Decoder::VStoreIndexedArgs> ||
754 std::is_same_v<VOpArgs, Decoder::VStoreStrideArgs> ||
755 std::is_same_v<VOpArgs, Decoder::VStoreUnitStrideArgs>) {
756 constexpr size_t kRegistersInvolved = NumberOfRegistersInvolved(vlmul);
757 // Note: whole register loads and stores treat args.nf differently, but they are processed
758 // separately above anyway, because they also ignore vtype and all the information in it!
759 switch (args.nf) {
760 case 0:
761 return OpVector<ElementType, 1, vlmul, vma>(args, vtype, extra_args...);
762 case 1:
763 if constexpr (kRegistersInvolved > 4) {
764 return Undefined();
765 } else {
766 return OpVector<ElementType, 2, vlmul, vma>(args, vtype, extra_args...);
767 }
768 case 2:
769 if constexpr (kRegistersInvolved > 2) {
770 return Undefined();
771 } else {
772 return OpVector<ElementType, 3, vlmul, vma>(args, vtype, extra_args...);
773 }
774 case 3:
775 if constexpr (kRegistersInvolved > 2) {
776 return Undefined();
777 } else {
778 return OpVector<ElementType, 4, vlmul, vma>(args, vtype, extra_args...);
779 }
780 case 4:
781 if constexpr (kRegistersInvolved > 1) {
782 return Undefined();
783 } else {
784 return OpVector<ElementType, 5, vlmul, vma>(args, vtype, extra_args...);
785 }
786 case 5:
787 if constexpr (kRegistersInvolved > 1) {
788 return Undefined();
789 } else {
790 return OpVector<ElementType, 6, vlmul, vma>(args, vtype, extra_args...);
791 }
792 case 6:
793 if constexpr (kRegistersInvolved > 1) {
794 return Undefined();
795 } else {
796 return OpVector<ElementType, 7, vlmul, vma>(args, vtype, extra_args...);
797 }
798 case 7:
799 if constexpr (kRegistersInvolved > 1) {
800 return Undefined();
801 } else {
802 return OpVector<ElementType, 8, vlmul, vma>(args, vtype, extra_args...);
803 }
804 }
805 } else {
806 if ((vtype >> 6) & 1) {
807 return OpVector<ElementType, vlmul, TailProcessing::kAgnostic, vma>(args, extra_args...);
808 }
809 return OpVector<ElementType, vlmul, TailProcessing::kUndisturbed, vma>(args, extra_args...);
810 }
811 }
812
813 template <typename ElementType,
814 size_t kSegmentSize,
815 VectorRegisterGroupMultiplier vlmul,
816 auto vma,
817 typename VOpArgs,
818 typename... ExtraArgs>
819 void OpVector(const VOpArgs& args, Register vtype, ExtraArgs... extra_args) {
820 // Indexed loads and stores have two operands with different ElementTypes and lmul sizes, so
821 // pass vtype to do further selection.
822 if constexpr (std::is_same_v<VOpArgs, Decoder::VLoadIndexedArgs> ||
823 std::is_same_v<VOpArgs, Decoder::VStoreIndexedArgs>) {
824 // Because we know that we are dealing with indexed loads and stores and won't need to
825 // convert elmul to anything else, we can immediately turn it into kIndexRegistersInvolved
826 // here.
827 if ((vtype >> 6) & 1) {
828 return OpVector<kSegmentSize,
829 ElementType,
830 NumberOfRegistersInvolved(vlmul),
831 TailProcessing::kAgnostic,
832 vma>(args, vtype, extra_args...);
833 }
834 return OpVector<kSegmentSize,
835 ElementType,
836 NumberOfRegistersInvolved(vlmul),
837 TailProcessing::kUndisturbed,
838 vma>(args, vtype, extra_args...);
839 } else {
840 // For other instructions we have parsed all the information from vtype and only need to pass
841 // args and extra_args.
842 if ((vtype >> 6) & 1) {
843 return OpVector<ElementType, kSegmentSize, vlmul, TailProcessing::kAgnostic, vma>(
844 args, extra_args...);
845 }
846 return OpVector<ElementType, kSegmentSize, vlmul, TailProcessing::kUndisturbed, vma>(
847 args, extra_args...);
848 }
849 }
850
851 template <size_t kSegmentSize,
852 typename IndexElementType,
853 size_t kIndexRegistersInvolved,
854 TailProcessing vta,
855 auto vma,
856 typename VOpArgs,
857 typename... ExtraArgs>
858 void OpVector(const VOpArgs& args, Register vtype, ExtraArgs... extra_args) {
859 VectorRegisterGroupMultiplier vlmul = static_cast<VectorRegisterGroupMultiplier>(vtype & 0b111);
860 switch (static_cast<VectorSelectElementWidth>((vtype >> 3) & 0b111)) {
861 case VectorSelectElementWidth::k8bit:
862 return OpVector<UInt8, kSegmentSize, IndexElementType, kIndexRegistersInvolved, vta, vma>(
863 args, vlmul, extra_args...);
864 case VectorSelectElementWidth::k16bit:
865 return OpVector<UInt16, kSegmentSize, IndexElementType, kIndexRegistersInvolved, vta, vma>(
866 args, vlmul, extra_args...);
867 case VectorSelectElementWidth::k32bit:
868 return OpVector<UInt32, kSegmentSize, IndexElementType, kIndexRegistersInvolved, vta, vma>(
869 args, vlmul, extra_args...);
870 case VectorSelectElementWidth::k64bit:
871 return OpVector<UInt64, kSegmentSize, IndexElementType, kIndexRegistersInvolved, vta, vma>(
872 args, vlmul, extra_args...);
873 default:
874 return Undefined();
875 }
876 }
877
878 template <typename DataElementType,
879 size_t kSegmentSize,
880 typename IndexElementType,
881 size_t kIndexRegistersInvolved,
882 TailProcessing vta,
883 auto vma,
884 typename VOpArgs,
885 typename... ExtraArgs>
886 void OpVector(const VOpArgs& args, VectorRegisterGroupMultiplier vlmul, ExtraArgs... extra_args) {
887 switch (vlmul) {
888 case VectorRegisterGroupMultiplier::k1register:
889 return OpVector<DataElementType,
890 VectorRegisterGroupMultiplier::k1register,
891 IndexElementType,
892 kSegmentSize,
893 kIndexRegistersInvolved,
894 vta,
895 vma>(args, extra_args...);
896 case VectorRegisterGroupMultiplier::k2registers:
897 return OpVector<DataElementType,
898 VectorRegisterGroupMultiplier::k2registers,
899 IndexElementType,
900 kSegmentSize,
901 kIndexRegistersInvolved,
902 vta,
903 vma>(args, extra_args...);
904 case VectorRegisterGroupMultiplier::k4registers:
905 return OpVector<DataElementType,
906 VectorRegisterGroupMultiplier::k4registers,
907 IndexElementType,
908 kSegmentSize,
909 kIndexRegistersInvolved,
910 vta,
911 vma>(args, extra_args...);
912 case VectorRegisterGroupMultiplier::k8registers:
913 return OpVector<DataElementType,
914 VectorRegisterGroupMultiplier::k8registers,
915 IndexElementType,
916 kSegmentSize,
917 kIndexRegistersInvolved,
918 vta,
919 vma>(args, extra_args...);
920 case VectorRegisterGroupMultiplier::kEigthOfRegister:
921 return OpVector<DataElementType,
922 VectorRegisterGroupMultiplier::kEigthOfRegister,
923 IndexElementType,
924 kSegmentSize,
925 kIndexRegistersInvolved,
926 vta,
927 vma>(args, extra_args...);
928 case VectorRegisterGroupMultiplier::kQuarterOfRegister:
929 return OpVector<DataElementType,
930 VectorRegisterGroupMultiplier::kQuarterOfRegister,
931 IndexElementType,
932 kSegmentSize,
933 kIndexRegistersInvolved,
934 vta,
935 vma>(args, extra_args...);
936 case VectorRegisterGroupMultiplier::kHalfOfRegister:
937 return OpVector<DataElementType,
938 VectorRegisterGroupMultiplier::kHalfOfRegister,
939 IndexElementType,
940 kSegmentSize,
941 kIndexRegistersInvolved,
942 vta,
943 vma>(args, extra_args...);
944 default:
945 return Undefined();
946 }
947 }
948
949 // CSR registers that are permitted as an argument of a strip-mining intrinsic.
950 using CsrName::kFrm;
951 using CsrName::kVxrm;
952 using CsrName::kVxsat;
953 // The argument of an OpVectorXXX function is the number of the vector register group.
954 template <auto DefaultElement = intrinsics::NoInactiveProcessing{}>
955 struct Vec {
956 uint8_t start_no;
957 };
958 // Vector argument 2x wide (for narrowing and widening instructions).
959 template <auto DefaultElement = intrinsics::NoInactiveProcessing{}>
960 struct WideVec {
961 uint8_t start_no;
962 };
963
964 template <typename DataElementType,
965 VectorRegisterGroupMultiplier vlmul,
966 typename IndexElementType,
967 size_t kSegmentSize,
968 size_t kIndexRegistersInvolved,
969 TailProcessing vta,
970 auto vma>
971 void OpVector(const Decoder::VLoadIndexedArgs& args, Register src) {
972 return OpVector<DataElementType,
973 kSegmentSize,
974 NumberOfRegistersInvolved(vlmul),
975 IndexElementType,
976 kIndexRegistersInvolved,
977 vta,
978 vma>(args, src);
979 }
980
981 template <typename DataElementType,
982 size_t kSegmentSize,
983 size_t kNumRegistersInGroup,
984 typename IndexElementType,
985 size_t kIndexRegistersInvolved,
986 TailProcessing vta,
987 auto vma>
988 void OpVector(const Decoder::VLoadIndexedArgs& args, Register src) {
989 if (!IsAligned<kIndexRegistersInvolved>(args.idx)) {
990 return Undefined();
991 }
992 constexpr size_t kElementsCount = sizeof(SIMD128Register) / sizeof(IndexElementType);
993 alignas(alignof(SIMD128Register))
994 IndexElementType indexes[kElementsCount * kIndexRegistersInvolved];
995 memcpy(indexes, state_->cpu.v + args.idx, sizeof(SIMD128Register) * kIndexRegistersInvolved);
996 return OpVectorLoad<DataElementType, kSegmentSize, kNumRegistersInGroup, vta, vma>(
997 args.dst, src, [&indexes](size_t index) { return indexes[index]; });
998 }
999
1000 template <typename ElementType,
1001 size_t kSegmentSize,
1002 VectorRegisterGroupMultiplier vlmul,
1003 TailProcessing vta,
1004 auto vma>
1005 void OpVector(const Decoder::VLoadStrideArgs& args, Register src, Register stride) {
1006 return OpVector<ElementType, kSegmentSize, NumberOfRegistersInvolved(vlmul), vta, vma>(
1007 args, src, stride);
1008 }
1009
1010 template <typename ElementType,
1011 size_t kSegmentSize,
1012 size_t kNumRegistersInGroup,
1013 TailProcessing vta,
1014 auto vma>
1015 void OpVector(const Decoder::VLoadStrideArgs& args, Register src, Register stride) {
1016 return OpVectorLoad<ElementType, kSegmentSize, kNumRegistersInGroup, vta, vma>(
1017 args.dst, src, [stride](size_t index) { return stride * index; });
1018 }
1019
1020 template <typename ElementType,
1021 size_t kSegmentSize,
1022 VectorRegisterGroupMultiplier vlmul,
1023 TailProcessing vta,
1024 auto vma>
1025 void OpVector(const Decoder::VLoadUnitStrideArgs& args, Register src) {
1026 return OpVector<ElementType, kSegmentSize, NumberOfRegistersInvolved(vlmul), vta, vma>(args,
1027 src);
1028 }
1029
1030 template <typename ElementType,
1031 size_t kSegmentSize,
1032 size_t kNumRegistersInGroup,
1033 TailProcessing vta,
1034 auto vma>
1035 void OpVector(const Decoder::VLoadUnitStrideArgs& args, Register src) {
1036 switch (args.opcode) {
1037 case Decoder::VLUmOpOpcode::kVleXXff:
1038 return OpVectorLoad<ElementType,
1039 kSegmentSize,
1040 kNumRegistersInGroup,
1041 vta,
1042 vma,
1043 Decoder::VLUmOpOpcode::kVleXXff>(
1044 args.dst, src, [](size_t index) { return kSegmentSize * sizeof(ElementType) * index; });
1045 case Decoder::VLUmOpOpcode::kVleXX:
1046 return OpVectorLoad<ElementType,
1047 kSegmentSize,
1048 kNumRegistersInGroup,
1049 vta,
1050 vma,
1051 Decoder::VLUmOpOpcode::kVleXX>(
1052 args.dst, src, [](size_t index) { return kSegmentSize * sizeof(ElementType) * index; });
1053 case Decoder::VLUmOpOpcode::kVlm:
1054 if constexpr (kSegmentSize == 1 &&
1055 std::is_same_v<decltype(vma), intrinsics::NoInactiveProcessing>) {
1056 return OpVectorLoad<UInt8,
1057 1,
1058 1,
1059 TailProcessing::kAgnostic,
1060 vma,
1061 Decoder::VLUmOpOpcode::kVlm>(
1062 args.dst, src, [](size_t index) { return index; });
1063 }
1064 return Undefined();
1065 default:
1066 return Undefined();
1067 }
1068 }
1069
1070 // The strided version of the segmented load may sound like something very convoluted and
1071 // complicated that no one would ever want to use, but it's not rare and can be illustrated
1072 // with a simple RGB bitmap window.
1073 //
1074 // Suppose it's laid out in memory like this (doubles are 8 bytes in size as per IEEE 754):
1075 // {R: 0.01}{G: 0.11}{B: 0.21} {R: 1.01}{G: 1.11}{B: 1.21}, {R: 2.01}{G: 2.11}{B: 2.21}
1076 // {R:10.01}{G:10.11}{B:10.21} {R:11.01}{G:11.11}{B:11.21}, {R:12.01}{G:12.11}{B:12.21}
1077 // {R:20.01}{G:20.11}{B:20.21} {R:21.01}{G:21.11}{B:21.21}, {R:22.01}{G:22.11}{B:22.21}
1078 // {R:30.01}{G:30.11}{B:30.21} {R:31.01}{G:31.11}{B:31.21}, {R:32.01}{G:32.11}{B:32.21}
1079 // This is very tiny 3x4 image with 3 components: red, green, blue.
1080 //
1081 // Let's assume that x1 is loaded with the address of the first element and x2 with 72 (that's
1082 // how many bytes one row of this image takes).
1083 //
1084 // Then we may use the following command to load values from memory (with LMUL = 2, vl = 4):
1085 // vlsseg3e64.v v0, (x1), x2
1086 //
1087 // They would be loaded like this:
1088 // v0: {R: 0.01}{R:10.01} (first group of 2 registers)
1089 // v1: {R:20.01}{R:30.01}
1090 // v2: {G: 0.11}{G:10.11} (second group of 2 registers)
1091 // v3: {G:20.11}{G:30.11}
1092 // v4: {B: 0.21}{B:10.21} (third group of 2 registers)
1093 // v5: {B:20.21}{B:30.21}
1094 // Now we have loaded a column from memory and all three colors are put into different register
1095 // groups for further processing.
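// Roughly equivalent scalar code for the example above (an illustrative sketch, not part of the
// implementation; assumes vl = 4 and the x1/x2 values described above):
//   for (size_t i = 0; i < 4; ++i) {
//     const double* segment = reinterpret_cast<const double*>(x1 + i * x2);
//     r[i] = segment[0];  // lands in the v0/v1 group
//     g[i] = segment[1];  // lands in the v2/v3 group
//     b[i] = segment[2];  // lands in the v4/v5 group
//   }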
1096 template <typename ElementType,
1097 size_t kSegmentSize,
1098 size_t kNumRegistersInGroup,
1099 TailProcessing vta,
1100 auto vma,
1101 typename Decoder::VLUmOpOpcode opcode = typename Decoder::VLUmOpOpcode{},
1102 typename GetElementOffsetLambdaType>
1103 void OpVectorLoad(uint8_t dst, Register src, GetElementOffsetLambdaType GetElementOffset) {
1104 using MaskType = std::conditional_t<sizeof(ElementType) == sizeof(Int8), UInt16, UInt8>;
1105 if (!IsAligned<kNumRegistersInGroup>(dst)) {
1106 return Undefined();
1107 }
1108 if (dst + kNumRegistersInGroup * kSegmentSize > 32) {
1109 return Undefined();
1110 }
1111 constexpr size_t kElementsCount = 16 / sizeof(ElementType);
1112 size_t vstart = GetCsr<CsrName::kVstart>();
1113 size_t vl = GetCsr<CsrName::kVl>();
1114 if constexpr (opcode == Decoder::VLUmOpOpcode::kVlm) {
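// vlm.v loads mask bits: the element width is effectively one byte, so vl is rounded up to
// whole bytes (ceil(vl / CHAR_BIT)), which is what the AlignUp below computes.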
1115 vl = AlignUp<CHAR_BIT>(vl) / CHAR_BIT;
1116 }
1117 // In case of a memory access fault we may set vstart to a non-zero value, so set it to zero
1118 // here to simplify the logic below.
1119 SetCsr<CsrName::kVstart>(0);
1120 // When vstart >= vl, there are no body elements, and no elements are updated in any destination
1121 // vector register group, including that no tail elements are updated with agnostic values.
1122 if (vstart >= vl) [[unlikely]] {
1123 return;
1124 }
1125 if constexpr (vta == TailProcessing::kAgnostic) {
1126 vstart = std::min(vstart, vl);
1127 }
1128 // Note: within_group_id is the current register id within a register group. During one
1129 // iteration of this loop we compute results for all registers with the current id in all
1130 // groups. E.g. for the example above we'd compute v0, v2, v4 during the first iteration (id
1131 // within group = 0), and v1, v3, v5 during the second iteration (id within group = 1). This
1132 // ensures that memory is always accessed in ordered fashion.
1133 std::array<SIMD128Register, kSegmentSize> result;
1134 char* ptr = ToHostAddr<char>(src);
1135 auto mask = GetMaskForVectorOperations<vma>();
1136 for (size_t within_group_id = vstart / kElementsCount; within_group_id < kNumRegistersInGroup;
1137 ++within_group_id) {
1138 // No need to continue if we have kUndisturbed vta strategy.
1139 if constexpr (vta == TailProcessing::kUndisturbed) {
1140 if (within_group_id * kElementsCount >= vl) {
1141 break;
1142 }
1143 }
1144 // If we have elements that won't be overwritten then load these from registers.
1145 // For the interpreter we could have filled all the registers unconditionally, but we'll want
1146 // to reuse this code in the JITs later.
1147 auto register_mask =
1148 std::get<0>(intrinsics::MaskForRegisterInSequence<ElementType>(mask, within_group_id));
1149 auto full_mask = std::get<0>(intrinsics::FullMaskForRegister<ElementType>(mask));
1150 if (vstart ||
1151 (vl < (within_group_id + 1) * kElementsCount && vta == TailProcessing::kUndisturbed) ||
1152 !(std::is_same_v<decltype(vma), intrinsics::NoInactiveProcessing> ||
1153 static_cast<InactiveProcessing>(vma) != InactiveProcessing::kUndisturbed ||
1154 register_mask == full_mask)) {
1155 for (size_t field = 0; field < kSegmentSize; ++field) {
1156 result[field].Set(state_->cpu.v[dst + within_group_id + field * kNumRegistersInGroup]);
1157 }
1158 }
1159 // Read elements from memory, but only if there are any active ones.
1160 for (size_t within_register_id = vstart % kElementsCount; within_register_id < kElementsCount;
1161 ++within_register_id) {
1162 size_t element_index = kElementsCount * within_group_id + within_register_id;
1163 // Stop if we reached the vl limit.
1164 if (vl <= element_index) {
1165 break;
1166 }
1167 // Don't touch masked-out elements.
1168 if constexpr (!std::is_same_v<decltype(vma), intrinsics::NoInactiveProcessing>) {
1169 if ((MaskType(register_mask) & MaskType{static_cast<typename MaskType::BaseType>(
1170 1 << within_register_id)}) == MaskType{0}) {
1171 continue;
1172 }
1173 }
1174 // Load segment from memory.
1175 for (size_t field = 0; field < kSegmentSize; ++field) {
1176 FaultyLoadResult mem_access_result =
1177 FaultyLoad(ptr + field * sizeof(ElementType) + GetElementOffset(element_index),
1178 sizeof(ElementType));
1179 if (mem_access_result.is_fault) {
1180 // The documentation doesn't tell us what we are supposed to do with the remaining elements
1181 // when an access fault happens, so let's trigger an exception and treat the remaining elements
1182 // using the vta-specified strategy by simply adjusting vl.
1183 vl = element_index;
1184 if constexpr (opcode == Decoder::VLUmOpOpcode::kVleXXff) {
1185 // Fail-first load only triggers exceptions for the first element, otherwise it
1186 // changes vl to ensure that other operations would only process elements that are
1187 // successfully loaded.
1188 if (element_index == 0) [[unlikely]] {
1189 exception_raised_ = true;
1190 } else {
1191 // TODO(b/323994286): Write a test case to verify vl changes correctly.
1192 SetCsr<CsrName::kVl>(element_index);
1193 }
1194 } else {
1195 // Most load instructions set vstart to failing element which then may be processed
1196 // by exception handler.
1197 exception_raised_ = true;
1198 SetCsr<CsrName::kVstart>(element_index);
1199 }
1200 break;
1201 }
1202 result[field].template Set<ElementType>(static_cast<ElementType>(mem_access_result.value),
1203 within_register_id);
1204 }
1205 }
1206 // Lambda to generate the tail mask. We don't want to call MakeBitmaskFromVl eagerly because it's
1207 // not needed most of the time, and the compiler can't eliminate the access to mmap-backed memory.
1208 auto GetTailMask = [vl, within_group_id] {
1209 return std::get<0>(intrinsics::MakeBitmaskFromVl<ElementType>(
1210 (vl <= within_group_id * kElementsCount) ? 0 : vl - within_group_id * kElementsCount));
1211 };
1212 // If mask has inactive elements and InactiveProcessing::kAgnostic mode is used then set them
1213 // to ~0.
1214 if constexpr (!std::is_same_v<decltype(vma), intrinsics::NoInactiveProcessing>) {
1215 if (register_mask != full_mask) {
1216 auto [simd_mask] =
1217 intrinsics::BitMaskToSimdMask<ElementType>(Int64{MaskType{register_mask}});
1218 for (size_t field = 0; field < kSegmentSize; ++field) {
1219 if constexpr (vma == InactiveProcessing::kAgnostic) {
1220 // A non-zero vstart is supposed to be exceptional. From the RISC-V V manual (page 14):
1221 // The vstart CSR is writable by unprivileged code, but non-zero vstart values may
1222 // cause vector instructions to run substantially slower on some implementations, so
1223 // vstart should not be used by application programmers. A few vector instructions
1224 // cannot be executed with a non-zero vstart value and will raise an illegal
1225 // instruction exception as defined below.
1226 // TODO(b/300690740): decide whether to merge two cases after support for vectors in
1227 // heavy optimizer would be implemented.
1228 if (vstart) [[unlikely]] {
1229 SIMD128Register vstart_mask = std::get<0>(
1230 intrinsics::MakeBitmaskFromVl<ElementType>(vstart % kElementsCount));
1231 if constexpr (vta == TailProcessing::kAgnostic) {
1232 result[field] |= vstart_mask & ~simd_mask;
1233 } else if (vl < (within_group_id + 1) * kElementsCount) {
1234 result[field] |= vstart_mask & ~simd_mask & ~GetTailMask();
1235 } else {
1236 result[field] |= vstart_mask & ~simd_mask;
1237 }
1238 } else if constexpr (vta == TailProcessing::kAgnostic) {
1239 result[field] |= ~simd_mask;
1240 } else {
1241 if (vl < (within_group_id + 1) * kElementsCount) {
1242 result[field] |= ~simd_mask & ~GetTailMask();
1243 } else {
1244 result[field] |= ~simd_mask;
1245 }
1246 }
1247 }
1248 }
1249 }
1250 }
1251 // If we have tail elements and TailProcessing::kAgnostic mode then set them to ~0.
1252 if constexpr (vta == TailProcessing::kAgnostic) {
1253 for (size_t field = 0; field < kSegmentSize; ++field) {
1254 if (vl < (within_group_id + 1) * kElementsCount) {
1255 result[field] |= GetTailMask();
1256 }
1257 }
1258 }
1259 // Put values back into register file.
1260 for (size_t field = 0; field < kSegmentSize; ++field) {
1261 state_->cpu.v[dst + within_group_id + field * kNumRegistersInGroup] =
1262 result[field].template Get<__uint128_t>();
1263 }
1264 // Next group should be fully processed.
1265 vstart = 0;
1266 }
1267 }
1268
1269 // The vector register gather instructions read elements from the src1 vector register group at
1270 // locations given by the second source vector register group, src2.
1271 // src1: element vector register.
1272 // GetElementIndex: universal lambda that returns the index from src2.
1273 template <typename ElementType,
1274 VectorRegisterGroupMultiplier vlmul,
1275 TailProcessing vta,
1276 auto vma,
1277 typename GetElementIndexLambdaType>
1278 void OpVectorGather(uint8_t dst, uint8_t src1, GetElementIndexLambdaType GetElementIndex) {
1279 constexpr size_t kRegistersInvolved = NumberOfRegistersInvolved(vlmul);
1280 if (!IsAligned<kRegistersInvolved>(dst | src1)) {
1281 return Undefined();
1282 }
1283 // Source and destination must not overlap.
1284 if (dst < (src1 + kRegistersInvolved) && src1 < (dst + kRegistersInvolved)) {
1285 return Undefined();
1286 }
1287 constexpr size_t kElementsCount = 16 / sizeof(ElementType);
1288 constexpr size_t vlmax = GetVlmax<ElementType, vlmul>();
1289
1290 size_t vstart = GetCsr<CsrName::kVstart>();
1291 size_t vl = GetCsr<CsrName::kVl>();
1292 auto mask = GetMaskForVectorOperations<vma>();
1293 SetCsr<CsrName::kVstart>(0);
1294 // When vstart >= vl, there are no body elements, and no elements are updated in any destination
1295 // vector register group, including that no tail elements are updated with agnostic values.
1296 if (vstart >= vl) [[unlikely]] {
1297 return;
1298 }
1299
1300 // Copy vlmul registers into array of elements, access elements of temporary array.
1301 alignas(alignof(SIMD128Register)) ElementType values[vlmax];
1302 memcpy(values, state_->cpu.v + src1, sizeof(values));
1303 // Fill dst first, resolve mask later.
1304 for (size_t index = vstart / kElementsCount; index < kRegistersInvolved; ++index) {
1305 SIMD128Register original_dst_value;
1306 SIMD128Register result{state_->cpu.v[dst + index]};
1307 for (size_t dst_element_index = vstart % kElementsCount; dst_element_index < kElementsCount;
1308 ++dst_element_index) {
1309 size_t src_element_index = GetElementIndex(index * kElementsCount + dst_element_index);
1310
1311 // If an element index is out of range ( vs1[i] >= VLMAX ) then zero is returned for the
1312 // element value.
1313 ElementType element_value = ElementType{0};
1314 if (src_element_index < vlmax) {
1315 element_value = values[src_element_index];
1316 }
1317 original_dst_value.Set<ElementType>(element_value, dst_element_index);
1318 }
1319
1320 // Apply mask and put result values into dst register.
1321 result =
1322 VectorMasking<ElementType, vta, vma>(result, original_dst_value, vstart, vl, index, mask);
1323 state_->cpu.v[dst + index] = result.Get<__uint128_t>();
1324 // Next group should be fully processed.
1325 vstart = 0;
1326 }
1327 }
1328
1329 template <typename ElementType, VectorRegisterGroupMultiplier vlmul, TailProcessing vta, auto vma>
1330 void OpVector(const Decoder::VOpFVfArgs& args, ElementType arg2) {
1331 using SignedType = Wrapping<std::make_signed_t<typename TypeTraits<ElementType>::Int>>;
1332 if constexpr (sizeof(ElementType) == sizeof(Float32)) {
1333 // Keep cases sorted in opcode order to match RISC-V V manual.
1334 switch (args.opcode) {
1335 case Decoder::VOpFVfOpcode::kVfwaddvf:
1336 return OpVectorWidenvx<intrinsics::Vfwaddvf<ElementType>,
1337 ElementType,
1338 vlmul,
1339 vta,
1340 vma,
1341 kFrm>(args.dst, args.src1, arg2);
1342 case Decoder::VOpFVfOpcode::kVfwsubvf:
1343 return OpVectorWidenvx<intrinsics::Vfwsubvf<ElementType>,
1344 ElementType,
1345 vlmul,
1346 vta,
1347 vma,
1348 kFrm>(args.dst, args.src1, arg2);
1349 case Decoder::VOpFVfOpcode::kVfwmulvf:
1350 return OpVectorWidenvx<intrinsics::Vfwmulvf<ElementType>,
1351 ElementType,
1352 vlmul,
1353 vta,
1354 vma,
1355 kFrm>(args.dst, args.src1, arg2);
1356 case Decoder::VOpFVfOpcode::kVfwaddwf:
1357 return OpVectorWidenwx<intrinsics::Vfwaddwf<ElementType>,
1358 ElementType,
1359 vlmul,
1360 vta,
1361 vma,
1362 kFrm>(args.dst, args.src1, arg2);
1363 case Decoder::VOpFVfOpcode::kVfwsubwf:
1364 return OpVectorWidenwx<intrinsics::Vfwsubwf<ElementType>,
1365 ElementType,
1366 vlmul,
1367 vta,
1368 vma,
1369 kFrm>(args.dst, args.src1, arg2);
1370 case Decoder::VOpFVfOpcode::kVfwmaccvf:
1371 return OpVectorWidenvxw<intrinsics::Vfwmaccvf<ElementType>,
1372 ElementType,
1373 vlmul,
1374 vta,
1375 vma,
1376 kFrm>(args.dst, args.src1, arg2);
1377 case Decoder::VOpFVfOpcode::kVfwnmaccvf:
1378 return OpVectorWidenvxw<intrinsics::Vfwnmaccvf<ElementType>,
1379 ElementType,
1380 vlmul,
1381 vta,
1382 vma,
1383 kFrm>(args.dst, args.src1, arg2);
1384 case Decoder::VOpFVfOpcode::kVfwmsacvf:
1385 return OpVectorWidenvxw<intrinsics::Vfwmsacvf<ElementType>,
1386 ElementType,
1387 vlmul,
1388 vta,
1389 vma,
1390 kFrm>(args.dst, args.src1, arg2);
1391 case Decoder::VOpFVfOpcode::kVfwnmsacvf:
1392 return OpVectorWidenvxw<intrinsics::Vfwnmsacvf<ElementType>,
1393 ElementType,
1394 vlmul,
1395 vta,
1396 vma,
1397 kFrm>(args.dst, args.src1, arg2);
1398 default:
1399 break;
1400 }
1401 }
1402 // Keep cases sorted in opcode order to match RISC-V V manual.
1403 switch (args.opcode) {
1404 case Decoder::VOpFVfOpcode::kVfminvf:
1405 return OpVectorvx<intrinsics::Vfminvx<ElementType>, ElementType, vlmul, vta, vma>(
1406 args.dst, args.src1, arg2);
1407 case Decoder::VOpFVfOpcode::kVfmaxvf:
1408 return OpVectorvx<intrinsics::Vfmaxvx<ElementType>, ElementType, vlmul, vta, vma>(
1409 args.dst, args.src1, arg2);
1410 case Decoder::VOpFVfOpcode::kVfsgnjvf:
1411 return OpVectorvx<intrinsics::Vfsgnjvx<ElementType>, ElementType, vlmul, vta, vma>(
1412 args.dst, args.src1, arg2);
1413 case Decoder::VOpFVfOpcode::kVfsgnjnvf:
1414 return OpVectorvx<intrinsics::Vfsgnjnvx<ElementType>, ElementType, vlmul, vta, vma>(
1415 args.dst, args.src1, arg2);
1416 case Decoder::VOpFVfOpcode::kVfsgnjxvf:
1417 return OpVectorvx<intrinsics::Vfsgnjxvx<ElementType>, ElementType, vlmul, vta, vma>(
1418 args.dst, args.src1, arg2);
1419 case Decoder::VOpFVfOpcode::kVfslide1upvf:
1420 return OpVectorslide1up<ElementType, vlmul, vta, vma>(args.dst, args.src1, arg2);
1421 case Decoder::VOpFVfOpcode::kVfslide1downvf:
1422 return OpVectorslide1down<ElementType, vlmul, vta, vma>(args.dst, args.src1, arg2);
1423 case Decoder::VOpFVfOpcode::kVfmvsf:
1424 if constexpr (!std::is_same_v<decltype(vma), intrinsics::NoInactiveProcessing>) {
1425 return Undefined();
1426 }
1427 if (args.src1 != 0) {
1428 return Undefined();
1429 }
1430 return OpVectorVmvsx<ElementType, vta>(args.dst, arg2);
1431 case Decoder::VOpFVfOpcode::kVfmergevf:
1432 if constexpr (std::is_same_v<decltype(vma), intrinsics::NoInactiveProcessing>) {
1433 if (args.src1 != 0) {
1434 return Undefined();
1435 }
1436 return OpVectorx<intrinsics::Vcopyx<ElementType>, ElementType, vlmul, vta, vma>(args.dst,
1437 arg2);
1438 } else {
1439 return OpVectorx<intrinsics::Vcopyx<ElementType>,
1440 ElementType,
1441 vlmul,
1442 vta,
1443 // Always use "undisturbed" value from source register.
1444 InactiveProcessing::kUndisturbed>(
1445 args.dst, arg2, /*dst_mask=*/args.src1);
1446 }
1447 case Decoder::VOpFVfOpcode::kVmfeqvf:
1448 return OpVectorToMaskvx<intrinsics::Vfeqvx<ElementType>, ElementType, vlmul, vma>(
1449 args.dst, args.src1, arg2);
1450 case Decoder::VOpFVfOpcode::kVmflevf:
1451 return OpVectorToMaskvx<intrinsics::Vflevx<ElementType>, ElementType, vlmul, vma>(
1452 args.dst, args.src1, arg2);
1453 case Decoder::VOpFVfOpcode::kVmfltvf:
1454 return OpVectorToMaskvx<intrinsics::Vfltvx<ElementType>, ElementType, vlmul, vma>(
1455 args.dst, args.src1, arg2);
1456 case Decoder::VOpFVfOpcode::kVmfnevf:
1457 return OpVectorToMaskvx<intrinsics::Vfnevx<ElementType>, ElementType, vlmul, vma>(
1458 args.dst, args.src1, arg2);
1459 case Decoder::VOpFVfOpcode::kVmfgtvf:
1460 return OpVectorToMaskvx<intrinsics::Vfgtvx<ElementType>, ElementType, vlmul, vma>(
1461 args.dst, args.src1, arg2);
1462 case Decoder::VOpFVfOpcode::kVmfgevf:
1463 return OpVectorToMaskvx<intrinsics::Vfgevx<ElementType>, ElementType, vlmul, vma>(
1464 args.dst, args.src1, arg2);
1465 case Decoder::VOpFVfOpcode::kVfdivvf:
1466 return OpVectorSameWidth<intrinsics::Vfdivvf<ElementType>,
1467 ElementType,
1468 NumberOfRegistersInvolved(vlmul),
1469 vta,
1470 vma,
1471 kFrm>(args.dst, Vec<SignedType{}>{args.src1}, arg2);
1472 case Decoder::VOpFVfOpcode::kVfrdivvf:
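        // Note: the filler constant below is the bit pattern of 1.0 (0x3f80'0000 for Float32,
        // 0x3ff0'0000'0000'0000 for Float64); presumably a non-zero value is substituted for
        // masked-off divisor elements so the intrinsic cannot raise spurious divide-by-zero flags.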
1473 return OpVectorSameWidth<intrinsics::Vfrdivvf<ElementType>,
1474 ElementType,
1475 NumberOfRegistersInvolved(vlmul),
1476 vta,
1477 vma,
1478 kFrm>(
1479 args.dst,
1480 Vec<SignedType{(sizeof(ElementType) == sizeof(Float32)) ? 0x3f80'0000
1481 : 0x3ff0'0000'0000'0000}>{
1482 args.src1},
1483 arg2);
1484 case Decoder::VOpFVfOpcode::kVfmulvf:
1485 return OpVectorSameWidth<intrinsics::Vfmulvf<ElementType>,
1486 ElementType,
1487 NumberOfRegistersInvolved(vlmul),
1488 vta,
1489 vma,
1490 kFrm>(args.dst, Vec<SignedType{}>{args.src1}, arg2);
1491 case Decoder::VOpFVfOpcode::kVfaddvf:
1492 return OpVectorSameWidth<intrinsics::Vfaddvf<ElementType>,
1493 ElementType,
1494 NumberOfRegistersInvolved(vlmul),
1495 vta,
1496 vma,
1497 kFrm>(args.dst, Vec<SignedType{}>{args.src1}, arg2);
1498 case Decoder::VOpFVfOpcode::kVfsubvf:
1499 return OpVectorSameWidth<intrinsics::Vfsubvf<ElementType>,
1500 ElementType,
1501 NumberOfRegistersInvolved(vlmul),
1502 vta,
1503 vma,
1504 kFrm>(args.dst, Vec<SignedType{}>{args.src1}, arg2);
1505 case Decoder::VOpFVfOpcode::kVfrsubvf:
1506 return OpVectorSameWidth<intrinsics::Vfrsubvf<ElementType>,
1507 ElementType,
1508 NumberOfRegistersInvolved(vlmul),
1509 vta,
1510 vma,
1511 kFrm>(args.dst, Vec<SignedType{}>{args.src1}, arg2);
1512 case Decoder::VOpFVfOpcode::kVfmaccvf:
1513 return OpVectorvxv<intrinsics::Vfmaccvf<ElementType>, ElementType, vlmul, vta, vma, kFrm>(
1514 args.dst, args.src1, arg2);
1515 case Decoder::VOpFVfOpcode::kVfmsacvf:
1516 return OpVectorvxv<intrinsics::Vfmsacvf<ElementType>, ElementType, vlmul, vta, vma, kFrm>(
1517 args.dst, args.src1, arg2);
1518 case Decoder::VOpFVfOpcode::kVfmaddvf:
1519 return OpVectorvxv<intrinsics::Vfmaddvf<ElementType>, ElementType, vlmul, vta, vma, kFrm>(
1520 args.dst, args.src1, arg2);
1521 case Decoder::VOpFVfOpcode::kVfmsubvf:
1522 return OpVectorvxv<intrinsics::Vfmsubvf<ElementType>, ElementType, vlmul, vta, vma, kFrm>(
1523 args.dst, args.src1, arg2);
1524 case Decoder::VOpFVfOpcode::kVfnmaccvf:
1525 return OpVectorvxv<intrinsics::Vfnmaccvf<ElementType>, ElementType, vlmul, vta, vma, kFrm>(
1526 args.dst, args.src1, arg2);
1527 case Decoder::VOpFVfOpcode::kVfnmsacvf:
1528 return OpVectorvxv<intrinsics::Vfnmsacvf<ElementType>, ElementType, vlmul, vta, vma, kFrm>(
1529 args.dst, args.src1, arg2);
1530 case Decoder::VOpFVfOpcode::kVfnmaddvf:
1531 return OpVectorvxv<intrinsics::Vfnmaddvf<ElementType>, ElementType, vlmul, vta, vma, kFrm>(
1532 args.dst, args.src1, arg2);
1533 case Decoder::VOpFVfOpcode::kVfnmsubvf:
1534 return OpVectorvxv<intrinsics::Vfnmsubvf<ElementType>, ElementType, vlmul, vta, vma, kFrm>(
1535 args.dst, args.src1, arg2);
1536 default:
1537 return Undefined();
1538 }
1539 }
1540
1541 template <typename ElementType, VectorRegisterGroupMultiplier vlmul, TailProcessing vta, auto vma>
1542   void OpVector(const Decoder::VOpFVvArgs& args) {
1543 using SignedType = Wrapping<std::make_signed_t<typename TypeTraits<ElementType>::Int>>;
1544 using UnsignedType = Wrapping<std::make_unsigned_t<typename TypeTraits<ElementType>::Int>>;
1545     // The IEEE 754 floating point value -0.0 has only the top bit set and all other bits clear:
1546     // https://en.wikipedia.org/wiki/Signed_zero#Representations This is exactly the
1547     // representation the minimum negative integer has in two's complement:
1548     // https://en.wikipedia.org/wiki/Two%27s_complement#Most_negative_number
1549     // Note: we pass filler elements as integers because `Float32`/`Float64` cannot be used as
1550     // template parameters.
1551 constexpr SignedType kNegativeZero{std::numeric_limits<typename SignedType::BaseType>::min()};
1552 // Floating point IEEE 754 value +0.0 includes only zero bits, same as integer zero.
1553 constexpr SignedType kPositiveZero{};
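    // For example, with Float32 elements the bit pattern of -0.0f is 0x8000'0000, which is also
    // std::numeric_limits<int32_t>::min(), while +0.0f is 0x0000'0000, the same as int32_t{0}.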
1554 // We currently don't support Float16 operations, but conversion routines that deal with
1555 // double-width floats use these encodings to produce regular Float32 types.
1556 if constexpr (sizeof(ElementType) <= sizeof(Float32)) {
1557 using WideElementType = typename TypeTraits<ElementType>::Wide;
1558 // Keep cases sorted in opcode order to match RISC-V V manual.
1559 switch (args.opcode) {
1560 case Decoder::VOpFVvOpcode::kVFUnary0:
1561 switch (args.vfunary0_opcode) {
1562 case Decoder::VFUnary0Opcode::kVfwcvtfxuv:
1563 return OpVectorWidenv<[](int8_t frm, SIMD128Register src) {
1564 return intrinsics::Vfcvtv<WideElementType, UnsignedType>(FPFlags::DYN, frm, src);
1565 },
1566 UnsignedType,
1567 vlmul,
1568 vta,
1569 vma,
1570 kFrm>(args.dst, args.src1);
1571 case Decoder::VFUnary0Opcode::kVfwcvtfxv:
1572 return OpVectorWidenv<[](int8_t frm, SIMD128Register src) {
1573 return intrinsics::Vfcvtv<WideElementType, SignedType>(FPFlags::DYN, frm, src);
1574 },
1575 SignedType,
1576 vlmul,
1577 vta,
1578 vma,
1579 kFrm>(args.dst, args.src1);
1580 case Decoder::VFUnary0Opcode::kVfncvtxufw:
1581 return OpVectorNarroww<[](int8_t frm, SIMD128Register src) {
1582 return intrinsics::Vfcvtv<UnsignedType, WideElementType>(FPFlags::DYN, frm, src);
1583 },
1584 UnsignedType,
1585 vlmul,
1586 vta,
1587 vma,
1588 kFrm>(args.dst, args.src1);
1589 case Decoder::VFUnary0Opcode::kVfncvtxfw:
1590 return OpVectorNarroww<[](int8_t frm, SIMD128Register src) {
1591 return intrinsics::Vfcvtv<SignedType, WideElementType>(FPFlags::DYN, frm, src);
1592 },
1593 SignedType,
1594 vlmul,
1595 vta,
1596 vma,
1597 kFrm>(args.dst, args.src1);
1598 case Decoder::VFUnary0Opcode::kVfncvtrtzxufw:
1599 return OpVectorNarroww<[](int8_t frm, SIMD128Register src) {
1600 return intrinsics::Vfcvtv<UnsignedType, WideElementType>(FPFlags::RTZ, frm, src);
1601 },
1602 UnsignedType,
1603 vlmul,
1604 vta,
1605 vma,
1606 kFrm>(args.dst, args.src1);
1607 case Decoder::VFUnary0Opcode::kVfncvtrtzxfw:
1608 return OpVectorNarroww<[](int8_t frm, SIMD128Register src) {
1609 return intrinsics::Vfcvtv<SignedType, WideElementType>(FPFlags::RTZ, frm, src);
1610 },
1611 SignedType,
1612 vlmul,
1613 vta,
1614 vma,
1615 kFrm>(args.dst, args.src1);
1616 default:
1617 break; // Make compiler happy.
1618 }
1619 break;
1620 default:
1621 break; // Make compiler happy.
1622 }
1623 }
1624    // Widening and narrowing operations which take a floating point “narrow” operand can only work
1625    // correctly with Float32 input: Float16 is not supported yet, while Float64 input would produce
1626    // a 128bit result which is currently reserved in RISC-V V.
1627 if constexpr (sizeof(ElementType) == sizeof(Float32)) {
1628 using WideElementType = WideType<ElementType>;
1629 using WideSignedType = WideType<SignedType>;
1630 using WideUnsignedType = WideType<UnsignedType>;
1631 // Keep cases sorted in opcode order to match RISC-V V manual.
1632 switch (args.opcode) {
1633 case Decoder::VOpFVvOpcode::kVfwaddvv:
1634 return OpVectorWidenvv<intrinsics::Vfwaddvv<ElementType>,
1635 ElementType,
1636 vlmul,
1637 vta,
1638 vma,
1639 kFrm>(args.dst, args.src1, args.src2);
1640 case Decoder::VOpFVvOpcode::kVfwredusumvs:
1641 // 14.3. Vector Single-Width Floating-Point Reduction Instructions:
1642 // The additive identity is +0.0 when rounding down or -0.0 for all other rounding
1643 // modes.
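          // Rationale: under round-down adding zeroes of opposite signs yields -0.0, so +0.0
          // leaves every element (including -0.0) unchanged there; in all other rounding modes
          // that sum is +0.0, so -0.0 is the neutral element instead.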
1644 if (GetCsr<kFrm>() != FPFlags::RDN) {
1645 return OpVectorvs<intrinsics::Vfredosumvs<ElementType, WideType<ElementType>>,
1646 ElementType,
1647 WideType<ElementType>,
1648 vlmul,
1649 vta,
1650 vma,
1651 kFrm>(args.dst, Vec<kNegativeZero>{args.src1}, args.src2);
1652 } else {
1653 return OpVectorvs<intrinsics::Vfredosumvs<ElementType, WideType<ElementType>>,
1654 ElementType,
1655 WideType<ElementType>,
1656 vlmul,
1657 vta,
1658 vma,
1659 kFrm>(args.dst, Vec<kPositiveZero>{args.src1}, args.src2);
1660 }
1661 case Decoder::VOpFVvOpcode::kVfwsubvv:
1662 return OpVectorWidenvv<intrinsics::Vfwsubvv<ElementType>,
1663 ElementType,
1664 vlmul,
1665 vta,
1666 vma,
1667 kFrm>(args.dst, args.src1, args.src2);
1668 case Decoder::VOpFVvOpcode::kVfwredosumvs:
1669 // 14.3. Vector Single-Width Floating-Point Reduction Instructions:
1670 // The additive identity is +0.0 when rounding down or -0.0 for all other rounding
1671 // modes.
1672 if (GetCsr<kFrm>() != FPFlags::RDN) {
1673 return OpVectorvs<intrinsics::Vfredosumvs<ElementType, WideType<ElementType>>,
1674 ElementType,
1675 WideType<ElementType>,
1676 vlmul,
1677 vta,
1678 vma,
1679 kFrm>(args.dst, Vec<kNegativeZero>{args.src1}, args.src2);
1680 } else {
1681 return OpVectorvs<intrinsics::Vfredosumvs<ElementType, WideType<ElementType>>,
1682 ElementType,
1683 WideType<ElementType>,
1684 vlmul,
1685 vta,
1686 vma,
1687 kFrm>(args.dst, Vec<kPositiveZero>{args.src1}, args.src2);
1688 }
1689 case Decoder::VOpFVvOpcode::kVfwmulvv:
1690 return OpVectorWidenvv<intrinsics::Vfwmulvv<ElementType>,
1691 ElementType,
1692 vlmul,
1693 vta,
1694 vma,
1695 kFrm>(args.dst, args.src1, args.src2);
1696 case Decoder::VOpFVvOpcode::kVfwaddwv:
1697 return OpVectorWidenwv<intrinsics::Vfwaddwv<ElementType>,
1698 ElementType,
1699 vlmul,
1700 vta,
1701 vma,
1702 kFrm>(args.dst, args.src1, args.src2);
1703 case Decoder::VOpFVvOpcode::kVfwsubwv:
1704 return OpVectorWidenwv<intrinsics::Vfwsubwv<ElementType>,
1705 ElementType,
1706 vlmul,
1707 vta,
1708 vma,
1709 kFrm>(args.dst, args.src1, args.src2);
1710 case Decoder::VOpFVvOpcode::kVfwmaccvv:
1711 return OpVectorWidenvvw<intrinsics::Vfwmaccvv<ElementType>,
1712 ElementType,
1713 vlmul,
1714 vta,
1715 vma,
1716 kFrm>(args.dst, args.src1, args.src2);
1717 case Decoder::VOpFVvOpcode::kVfwnmaccvv:
1718 return OpVectorWidenvvw<intrinsics::Vfwnmaccvv<ElementType>,
1719 ElementType,
1720 vlmul,
1721 vta,
1722 vma,
1723 kFrm>(args.dst, args.src1, args.src2);
1724 case Decoder::VOpFVvOpcode::kVfwmsacvv:
1725 return OpVectorWidenvvw<intrinsics::Vfwmsacvv<ElementType>,
1726 ElementType,
1727 vlmul,
1728 vta,
1729 vma,
1730 kFrm>(args.dst, args.src1, args.src2);
1731 case Decoder::VOpFVvOpcode::kVfwnmsacvv:
1732 return OpVectorWidenvvw<intrinsics::Vfwnmsacvv<ElementType>,
1733 ElementType,
1734 vlmul,
1735 vta,
1736 vma,
1737 kFrm>(args.dst, args.src1, args.src2);
1738 case Decoder::VOpFVvOpcode::kVFUnary0:
1739 switch (args.vfunary0_opcode) {
1740 case Decoder::VFUnary0Opcode::kVfwcvtxufv:
1741 return OpVectorWidenv<[](int8_t frm, SIMD128Register src) {
1742 return intrinsics::Vfcvtv<WideUnsignedType, ElementType>(FPFlags::DYN, frm, src);
1743 },
1744 ElementType,
1745 vlmul,
1746 vta,
1747 vma,
1748 kFrm>(args.dst, args.src1);
1749 case Decoder::VFUnary0Opcode::kVfwcvtxfv:
1750 return OpVectorWidenv<[](int8_t frm, SIMD128Register src) {
1751 return intrinsics::Vfcvtv<WideSignedType, ElementType>(FPFlags::DYN, frm, src);
1752 },
1753 ElementType,
1754 vlmul,
1755 vta,
1756 vma,
1757 kFrm>(args.dst, args.src1);
1758 case Decoder::VFUnary0Opcode::kVfwcvtffv:
1759 return OpVectorWidenv<[](int8_t frm, SIMD128Register src) {
1760 return intrinsics::Vfcvtv<WideElementType, ElementType>(FPFlags::DYN, frm, src);
1761 },
1762 ElementType,
1763 vlmul,
1764 vta,
1765 vma,
1766 kFrm>(args.dst, args.src1);
1767 case Decoder::VFUnary0Opcode::kVfwcvtrtzxufv:
1768 return OpVectorWidenv<[](int8_t frm, SIMD128Register src) {
1769 return intrinsics::Vfcvtv<WideUnsignedType, ElementType>(FPFlags::RTZ, frm, src);
1770 },
1771 ElementType,
1772 vlmul,
1773 vta,
1774 vma,
1775 kFrm>(args.dst, args.src1);
1776 case Decoder::VFUnary0Opcode::kVfwcvtrtzxfv:
1777 return OpVectorWidenv<[](int8_t frm, SIMD128Register src) {
1778 return intrinsics::Vfcvtv<WideSignedType, ElementType>(FPFlags::RTZ, frm, src);
1779 },
1780 ElementType,
1781 vlmul,
1782 vta,
1783 vma,
1784 kFrm>(args.dst, args.src1);
1785 case Decoder::VFUnary0Opcode::kVfncvtfxuw:
1786 return OpVectorNarroww<[](int8_t frm, SIMD128Register src) {
1787 return intrinsics::Vfcvtv<ElementType, WideUnsignedType>(FPFlags::DYN, frm, src);
1788 },
1789 ElementType,
1790 vlmul,
1791 vta,
1792 vma,
1793 kFrm>(args.dst, args.src1);
1794 case Decoder::VFUnary0Opcode::kVfncvtffw:
1795 return OpVectorNarroww<[](int8_t frm, SIMD128Register src) {
1796 return intrinsics::Vfcvtv<ElementType, WideElementType>(FPFlags::DYN, frm, src);
1797 },
1798 ElementType,
1799 vlmul,
1800 vta,
1801 vma,
1802 kFrm>(args.dst, args.src1);
1803 case Decoder::VFUnary0Opcode::kVfncvtfxw:
1804 return OpVectorNarroww<[](int8_t frm, SIMD128Register src) {
1805 return intrinsics::Vfcvtv<ElementType, WideSignedType>(FPFlags::DYN, frm, src);
1806 },
1807 ElementType,
1808 vlmul,
1809 vta,
1810 vma,
1811 kFrm>(args.dst, args.src1);
1812 default:
1813 break; // Make compiler happy.
1814 }
1815 break;
1816 default:
1817 break; // Make compiler happy.
1818 }
1819 }
1820    // If our ElementType is Float16 then “straight” operations are unsupported and we shouldn't try
1821    // to instantiate any functions since this would lead to a compile-time error.
1822 if constexpr (sizeof(ElementType) >= sizeof(Float32)) {
1823 // Keep cases sorted in opcode order to match RISC-V V manual.
1824 switch (args.opcode) {
1825 case Decoder::VOpFVvOpcode::kVfredusumvs:
1826 // 14.3. Vector Single-Width Floating-Point Reduction Instructions:
1827 // The additive identity is +0.0 when rounding down or -0.0 for all other rounding modes.
1828 if (GetCsr<kFrm>() != FPFlags::RDN) {
1829 return OpVectorvs<intrinsics::Vfredusumvs<ElementType>,
1830 ElementType,
1831 vlmul,
1832 vta,
1833 vma,
1834 kFrm>(args.dst, Vec<kNegativeZero>{args.src1}, args.src2);
1835 } else {
1836 return OpVectorvs<intrinsics::Vfredusumvs<ElementType>,
1837 ElementType,
1838 vlmul,
1839 vta,
1840 vma,
1841 kFrm>(args.dst, Vec<kPositiveZero>{args.src1}, args.src2);
1842 }
1843 case Decoder::VOpFVvOpcode::kVfredosumvs:
1844 // 14.3. Vector Single-Width Floating-Point Reduction Instructions:
1845 // The additive identity is +0.0 when rounding down or -0.0 for all other rounding modes.
1846 if (GetCsr<kFrm>() != FPFlags::RDN) {
1847 return OpVectorvs<intrinsics::Vfredosumvs<ElementType>,
1848 ElementType,
1849 vlmul,
1850 vta,
1851 vma,
1852 kFrm>(args.dst, Vec<kNegativeZero>{args.src1}, args.src2);
1853 } else {
1854 return OpVectorvs<intrinsics::Vfredosumvs<ElementType>,
1855 ElementType,
1856 vlmul,
1857 vta,
1858 vma,
1859 kFrm>(args.dst, Vec<kPositiveZero>{args.src1}, args.src2);
1860 }
1861 case Decoder::VOpFVvOpcode::kVfminvv:
1862 return OpVectorvv<intrinsics::Vfminvv<ElementType>, ElementType, vlmul, vta, vma>(
1863 args.dst, args.src1, args.src2);
1864 case Decoder::VOpFVvOpcode::kVfredminvs:
1865 // For Vfredmin the identity element is +inf.
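          // (0x7f80'0000 is +inf encoded as Float32, 0x7ff0'0000'0000'0000 as Float64.)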
1866 return OpVectorvs<intrinsics::Vfredminvs<ElementType>, ElementType, vlmul, vta, vma>(
1867 args.dst,
1868 Vec<UnsignedType{(sizeof(ElementType) == sizeof(Float32)) ? 0x7f80'0000
1869 : 0x7ff0'0000'0000'0000}>{
1870 args.src1},
1871 args.src2);
1872 case Decoder::VOpFVvOpcode::kVfmaxvv:
1873 return OpVectorvv<intrinsics::Vfmaxvv<ElementType>, ElementType, vlmul, vta, vma>(
1874 args.dst, args.src1, args.src2);
1875 case Decoder::VOpFVvOpcode::kVfredmaxvs:
1876 // For Vfredmax the identity element is -inf.
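          // (0xff80'0000 is -inf encoded as Float32, 0xfff0'0000'0000'0000 as Float64.)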
1877 return OpVectorvs<intrinsics::Vfredmaxvs<ElementType>, ElementType, vlmul, vta, vma>(
1878 args.dst,
1879 Vec<UnsignedType{(sizeof(ElementType) == sizeof(Float32)) ? 0xff80'0000
1880 : 0xfff0'0000'0000'0000}>{
1881 args.src1},
1882 args.src2);
1883 case Decoder::VOpFVvOpcode::kVfsgnjvv:
1884 return OpVectorvv<intrinsics::Vfsgnjvv<ElementType>, ElementType, vlmul, vta, vma>(
1885 args.dst, args.src1, args.src2);
1886 case Decoder::VOpFVvOpcode::kVfsgnjnvv:
1887 return OpVectorvv<intrinsics::Vfsgnjnvv<ElementType>, ElementType, vlmul, vta, vma>(
1888 args.dst, args.src1, args.src2);
1889 case Decoder::VOpFVvOpcode::kVfsgnjxvv:
1890 return OpVectorvv<intrinsics::Vfsgnjxvv<ElementType>, ElementType, vlmul, vta, vma>(
1891 args.dst, args.src1, args.src2);
1892 case Decoder::VOpFVvOpcode::kVFUnary0:
1893 switch (args.vfunary0_opcode) {
1894 case Decoder::VFUnary0Opcode::kVfcvtxufv:
1895 return OpVectorv<[](int8_t frm, SIMD128Register src) {
1896 return intrinsics::Vfcvtv<UnsignedType, ElementType>(FPFlags::DYN, frm, src);
1897 },
1898 ElementType,
1899 vlmul,
1900 vta,
1901 vma,
1902 kFrm>(args.dst, args.src1);
1903 case Decoder::VFUnary0Opcode::kVfcvtxfv:
1904 return OpVectorv<[](int8_t frm, SIMD128Register src) {
1905 return intrinsics::Vfcvtv<SignedType, ElementType>(FPFlags::DYN, frm, src);
1906 },
1907 ElementType,
1908 vlmul,
1909 vta,
1910 vma,
1911 kFrm>(args.dst, args.src1);
1912 case Decoder::VFUnary0Opcode::kVfcvtfxuv:
1913 return OpVectorv<[](int8_t frm, SIMD128Register src) {
1914 return intrinsics::Vfcvtv<ElementType, UnsignedType>(FPFlags::DYN, frm, src);
1915 },
1916 UnsignedType,
1917 vlmul,
1918 vta,
1919 vma,
1920 kFrm>(args.dst, args.src1);
1921 case Decoder::VFUnary0Opcode::kVfcvtfxv:
1922 return OpVectorv<[](int8_t frm, SIMD128Register src) {
1923 return intrinsics::Vfcvtv<ElementType, SignedType>(FPFlags::DYN, frm, src);
1924 },
1925 SignedType,
1926 vlmul,
1927 vta,
1928 vma,
1929 kFrm>(args.dst, args.src1);
1930 case Decoder::VFUnary0Opcode::kVfcvtrtzxufv:
1931 return OpVectorv<[](int8_t frm, SIMD128Register src) {
1932 return intrinsics::Vfcvtv<UnsignedType, ElementType>(FPFlags::RTZ, frm, src);
1933 },
1934 ElementType,
1935 vlmul,
1936 vta,
1937 vma,
1938 kFrm>(args.dst, args.src1);
1939 case Decoder::VFUnary0Opcode::kVfcvtrtzxfv:
1940 return OpVectorv<[](int8_t frm, SIMD128Register src) {
1941 return intrinsics::Vfcvtv<SignedType, ElementType>(FPFlags::RTZ, frm, src);
1942 },
1943 ElementType,
1944 vlmul,
1945 vta,
1946 vma,
1947 kFrm>(args.dst, args.src1);
1948 default:
1949 break; // Make compiler happy.
1950 }
1951 break;
1952 case Decoder::VOpFVvOpcode::kVFUnary1:
1953 switch (args.vfunary1_opcode) {
1954 case Decoder::VFUnary1Opcode::kVfsqrtv:
1955 return OpVectorv<intrinsics::Vfsqrtv<ElementType>,
1956 ElementType,
1957 vlmul,
1958 vta,
1959 vma,
1960 kFrm>(args.dst, args.src1);
1962 case Decoder::VFUnary1Opcode::kVfrsqrt7v:
1963 return OpVectorv<intrinsics::Vfrsqrt7v<ElementType>, ElementType, vlmul, vta, vma>(
1964 args.dst, args.src1);
1966 case Decoder::VFUnary1Opcode::kVfclassv:
1967 return OpVectorv<intrinsics::Vfclassv<ElementType>, ElementType, vlmul, vta, vma>(
1968 args.dst, args.src1);
1970 default:
1971 break; // Make compiler happy.
1972 }
1973 break;
1974 case Decoder::VOpFVvOpcode::kVfmvfs:
1975 if constexpr (!std::is_same_v<decltype(vma), intrinsics::NoInactiveProcessing>) {
1976 return Undefined();
1977 }
1978 if (args.src2 != 0) {
1979 return Undefined();
1980 }
1981 return OpVectorVmvfs<ElementType>(args.dst, args.src1);
1982 case Decoder::VOpFVvOpcode::kVmfeqvv:
1983 return OpVectorToMaskvv<intrinsics::Vfeqvv<ElementType>, ElementType, vlmul, vma>(
1984 args.dst, args.src1, args.src2);
1985 case Decoder::VOpFVvOpcode::kVmflevv:
1986 return OpVectorToMaskvv<intrinsics::Vflevv<ElementType>, ElementType, vlmul, vma>(
1987 args.dst, args.src1, args.src2);
1988 case Decoder::VOpFVvOpcode::kVmfltvv:
1989 return OpVectorToMaskvv<intrinsics::Vfltvv<ElementType>, ElementType, vlmul, vma>(
1990 args.dst, args.src1, args.src2);
1991 case Decoder::VOpFVvOpcode::kVmfnevv:
1992 return OpVectorToMaskvv<intrinsics::Vfnevv<ElementType>, ElementType, vlmul, vma>(
1993 args.dst, args.src1, args.src2);
1994 case Decoder::VOpFVvOpcode::kVfdivvv:
1995 return OpVectorSameWidth<intrinsics::Vfdivvv<ElementType>,
1996 ElementType,
1997 NumberOfRegistersInvolved(vlmul),
1998 vta,
1999 vma,
2000 kFrm>(
2001 args.dst,
2002 Vec<SignedType{}>{args.src1},
2003 Vec<SignedType{(sizeof(ElementType) == sizeof(Float32)) ? 0x3f80'0000
2004 : 0x3ff0'0000'0000'0000}>{
2005 args.src2});
2006 case Decoder::VOpFVvOpcode::kVfmulvv:
2007 return OpVectorSameWidth<intrinsics::Vfmulvv<ElementType>,
2008 ElementType,
2009 NumberOfRegistersInvolved(vlmul),
2010 vta,
2011 vma,
2012 kFrm>(
2013 args.dst, Vec<SignedType{}>{args.src1}, Vec<SignedType{}>{args.src2});
2014 case Decoder::VOpFVvOpcode::kVfaddvv:
2015 return OpVectorSameWidth<intrinsics::Vfaddvv<ElementType>,
2016 ElementType,
2017 NumberOfRegistersInvolved(vlmul),
2018 vta,
2019 vma,
2020 kFrm>(
2021 args.dst, Vec<SignedType{}>{args.src1}, Vec<SignedType{}>{args.src2});
2022 case Decoder::VOpFVvOpcode::kVfsubvv:
2023 return OpVectorSameWidth<intrinsics::Vfsubvv<ElementType>,
2024 ElementType,
2025 NumberOfRegistersInvolved(vlmul),
2026 vta,
2027 vma,
2028 kFrm>(
2029 args.dst, Vec<SignedType{}>{args.src1}, Vec<SignedType{}>{args.src2});
2030 case Decoder::VOpFVvOpcode::kVfmaccvv:
2031 return OpVectorvvv<intrinsics::Vfmaccvv<ElementType>, ElementType, vlmul, vta, vma, kFrm>(
2032 args.dst, args.src1, args.src2);
2033 case Decoder::VOpFVvOpcode::kVfmsacvv:
2034 return OpVectorvvv<intrinsics::Vfmsacvv<ElementType>, ElementType, vlmul, vta, vma, kFrm>(
2035 args.dst, args.src1, args.src2);
2036 case Decoder::VOpFVvOpcode::kVfmaddvv:
2037 return OpVectorvvv<intrinsics::Vfmaddvv<ElementType>, ElementType, vlmul, vta, vma, kFrm>(
2038 args.dst, args.src1, args.src2);
2039 case Decoder::VOpFVvOpcode::kVfmsubvv:
2040 return OpVectorvvv<intrinsics::Vfmsubvv<ElementType>, ElementType, vlmul, vta, vma, kFrm>(
2041 args.dst, args.src1, args.src2);
2042 case Decoder::VOpFVvOpcode::kVfnmaccvv:
2043 return OpVectorvvv<intrinsics::Vfnmaccvv<ElementType>,
2044 ElementType,
2045 vlmul,
2046 vta,
2047 vma,
2048 kFrm>(args.dst, args.src1, args.src2);
2049 case Decoder::VOpFVvOpcode::kVfnmsacvv:
2050 return OpVectorvvv<intrinsics::Vfnmsacvv<ElementType>,
2051 ElementType,
2052 vlmul,
2053 vta,
2054 vma,
2055 kFrm>(args.dst, args.src1, args.src2);
2056 case Decoder::VOpFVvOpcode::kVfnmaddvv:
2057 return OpVectorvvv<intrinsics::Vfnmaddvv<ElementType>,
2058 ElementType,
2059 vlmul,
2060 vta,
2061 vma,
2062 kFrm>(args.dst, args.src1, args.src2);
2063 case Decoder::VOpFVvOpcode::kVfnmsubvv:
2064 return OpVectorvvv<intrinsics::Vfnmsubvv<ElementType>,
2065 ElementType,
2066 vlmul,
2067 vta,
2068 vma,
2069 kFrm>(args.dst, args.src1, args.src2);
2070 default:
2071 break; // Make compiler happy.
2072 }
2073 }
2074 return Undefined();
2075 }
2076
2077 template <typename ElementType, VectorRegisterGroupMultiplier vlmul, TailProcessing vta, auto vma>
2078   void OpVector(const Decoder::VOpIViArgs& args) {
2079 using SignedType = berberis::SignedType<ElementType>;
2080 using UnsignedType = berberis::UnsignedType<ElementType>;
2081 using SaturatingSignedType = SaturatingType<SignedType>;
2082 using SaturatingUnsignedType = SaturatingType<UnsignedType>;
2083 // Keep cases sorted in opcode order to match RISC-V V manual.
2084 switch (args.opcode) {
2085 case Decoder::VOpIViOpcode::kVaddvi:
2086 return OpVectorvx<intrinsics::Vaddvx<SignedType>, SignedType, vlmul, vta, vma>(
2087 args.dst, args.src, SignedType{args.imm});
2088 case Decoder::VOpIViOpcode::kVrsubvi:
2089 return OpVectorvx<intrinsics::Vrsubvx<SignedType>, SignedType, vlmul, vta, vma>(
2090 args.dst, args.src, SignedType{args.imm});
2091 case Decoder::VOpIViOpcode::kVandvi:
2092 return OpVectorvx<intrinsics::Vandvx<SignedType>, SignedType, vlmul, vta, vma>(
2093 args.dst, args.src, SignedType{args.imm});
2094 case Decoder::VOpIViOpcode::kVorvi:
2095 return OpVectorvx<intrinsics::Vorvx<SignedType>, SignedType, vlmul, vta, vma>(
2096 args.dst, args.src, SignedType{args.imm});
2097 case Decoder::VOpIViOpcode::kVxorvi:
2098 return OpVectorvx<intrinsics::Vxorvx<SignedType>, SignedType, vlmul, vta, vma>(
2099 args.dst, args.src, SignedType{args.imm});
2100 case Decoder::VOpIViOpcode::kVrgathervi:
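        // vrgather.vi uses the same immediate index for every destination element, so it
        // effectively broadcasts source element [uimm] (out-of-range indexes read as zero).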
2101 return OpVectorGather<ElementType, vlmul, vta, vma>(
2102 args.dst, args.src, [&args](size_t /*index*/) { return ElementType{args.uimm}; });
2103 case Decoder::VOpIViOpcode::kVadcvi:
2104 return OpVectorvxm<intrinsics::Vadcvx<SignedType>,
2105 SignedType,
2106 NumberOfRegistersInvolved(vlmul),
2107 vta,
2108 vma>(args.dst, args.src, SignedType{args.imm});
2109 case Decoder::VOpIViOpcode::kVmseqvi:
2110 return OpVectorToMaskvx<intrinsics::Vseqvx<SignedType>, SignedType, vlmul, vma>(
2111 args.dst, args.src, SignedType{args.imm});
2112 case Decoder::VOpIViOpcode::kVmsnevi:
2113 return OpVectorToMaskvx<intrinsics::Vsnevx<SignedType>, SignedType, vlmul, vma>(
2114 args.dst, args.src, SignedType{args.imm});
2115 case Decoder::VOpIViOpcode::kVmsleuvi:
2116        // Note: Vmsleu.vi actually has a signed immediate which means that we first need to
2117        // sign-extend it to the element width as a signed value and then bit-cast to unsigned.
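        // E.g. with 16-bit elements the 5-bit immediate -1 sign-extends to SignedType{-1}, i.e.
        // bit pattern 0xffff, which is then compared as UnsignedType{0xffff}.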
2118 return OpVectorToMaskvx<intrinsics::Vslevx<UnsignedType>, UnsignedType, vlmul, vma>(
2119 args.dst, args.src, BitCastToUnsigned(SignedType{args.imm}));
2120 case Decoder::VOpIViOpcode::kVmslevi:
2121 return OpVectorToMaskvx<intrinsics::Vslevx<SignedType>, SignedType, vlmul, vma>(
2122 args.dst, args.src, SignedType{args.imm});
2123 case Decoder::VOpIViOpcode::kVmsgtuvi:
2124        // Note: Vmsgtu.vi actually has a signed immediate which means that we first need to
2125        // sign-extend it to the element width as a signed value and then bit-cast to unsigned.
2126 return OpVectorToMaskvx<intrinsics::Vsgtvx<UnsignedType>, UnsignedType, vlmul, vma>(
2127 args.dst, args.src, BitCastToUnsigned(SignedType{args.imm}));
2128 case Decoder::VOpIViOpcode::kVmsgtvi:
2129 return OpVectorToMaskvx<intrinsics::Vsgtvx<SignedType>, SignedType, vlmul, vma>(
2130 args.dst, args.src, SignedType{args.imm});
2131 case Decoder::VOpIViOpcode::kVsadduvi:
2132        // Note: Vsaddu.vi actually has a signed immediate which means that we first need to
2133        // sign-extend it to the element width as a signed value and then bit-cast to unsigned.
2134 return OpVectorvx<intrinsics::Vaddvx<SaturatingUnsignedType>,
2135 SaturatingUnsignedType,
2136 vlmul,
2137 vta,
2138 vma>(
2139 args.dst, args.src, BitCastToUnsigned(SaturatingSignedType{args.imm}));
2140 case Decoder::VOpIViOpcode::kVsaddvi:
2141 return OpVectorvx<intrinsics::Vaddvx<SaturatingSignedType>,
2142 SaturatingSignedType,
2143 vlmul,
2144 vta,
2145 vma>(args.dst, args.src, SaturatingSignedType{args.imm});
2146 case Decoder::VOpIViOpcode::kVsllvi:
2147 return OpVectorvx<intrinsics::Vslvx<UnsignedType>, UnsignedType, vlmul, vta, vma>(
2148 args.dst, args.src, UnsignedType{args.uimm});
2149 case Decoder::VOpIViOpcode::kVsrlvi:
2150 return OpVectorvx<intrinsics::Vsrvx<UnsignedType>, UnsignedType, vlmul, vta, vma>(
2151 args.dst, args.src, UnsignedType{args.uimm});
2152 case Decoder::VOpIViOpcode::kVsravi:
2153        // We need to pass the shift value here as a signed type, but the uimm value is always
2154        // positive and always fits into any integer type.
2155 return OpVectorvx<intrinsics::Vsrvx<SignedType>, SignedType, vlmul, vta, vma>(
2156 args.dst, args.src, BitCastToSigned(UnsignedType{args.uimm}));
2157 case Decoder::VOpIViOpcode::kVmergevi:
2158 if constexpr (std::is_same_v<decltype(vma), intrinsics::NoInactiveProcessing>) {
2159 if (args.src != 0) {
2160 return Undefined();
2161 }
2162 return OpVectorx<intrinsics::Vcopyx<SignedType>, SignedType, vlmul, vta, vma>(
2163 args.dst, SignedType{args.imm});
2164 } else {
2165 return OpVectorx<intrinsics::Vcopyx<SignedType>,
2166 SignedType,
2167 vlmul,
2168 vta,
2169 // Always use "undisturbed" value from source register.
2170 InactiveProcessing::kUndisturbed>(
2171 args.dst, SignedType{args.imm}, /*dst_mask=*/args.src);
2172 }
2173 case Decoder::VOpIViOpcode::kVmvXrv:
2174 // kVmv<nr>rv instruction
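        // The immediate encodes nf - 1, so only the values 0, 1, 3 and 7 (whole-register moves of
        // 1, 2, 4 and 8 registers) are valid; everything else is reserved.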
2175 if constexpr (std::is_same_v<decltype(vma), intrinsics::NoInactiveProcessing>) {
2176 switch (args.imm) {
2177 case 0:
2178 return OpVectorVmvXrv<ElementType, 1>(args.dst, args.src);
2179 case 1:
2180 return OpVectorVmvXrv<ElementType, 2>(args.dst, args.src);
2181 case 3:
2182 return OpVectorVmvXrv<ElementType, 4>(args.dst, args.src);
2183 case 7:
2184 return OpVectorVmvXrv<ElementType, 8>(args.dst, args.src);
2185 default:
2186 return Undefined();
2187 }
2188 } else {
2189 return Undefined();
2190 }
2191 case Decoder::VOpIViOpcode::kVnsrawi:
2192        // We need to pass the shift value here as a signed type, but the uimm value is always
2193        // positive and always fits into any integer type.
2194 return OpVectorNarrowwx<intrinsics::Vnsrwx<SignedType>, SignedType, vlmul, vta, vma>(
2195 args.dst, args.src, BitCastToSigned(UnsignedType{args.uimm}));
2196 case Decoder::VOpIViOpcode::kVnsrlwi:
2197 return OpVectorNarrowwx<intrinsics::Vnsrwx<UnsignedType>, UnsignedType, vlmul, vta, vma>(
2198 args.dst, args.src, UnsignedType{args.uimm});
2199 case Decoder::VOpIViOpcode::kVslideupvi:
2200 return OpVectorslideup<UnsignedType, vlmul, vta, vma>(
2201 args.dst, args.src, UnsignedType{args.uimm});
2202 case Decoder::VOpIViOpcode::kVslidedownvi:
2203 return OpVectorslidedown<UnsignedType, vlmul, vta, vma>(
2204 args.dst, args.src, UnsignedType{args.uimm});
2205 case Decoder::VOpIViOpcode::kVnclipuwi:
2206 return OpVectorNarrowwx<intrinsics::Vnclipwx<SaturatingUnsignedType>,
2207 SaturatingUnsignedType,
2208 vlmul,
2209 vta,
2210 vma,
2211 kVxrm>(args.dst, args.src, UnsignedType{args.uimm});
2212 case Decoder::VOpIViOpcode::kVnclipwi:
2213 return OpVectorNarrowwx<intrinsics::Vnclipwx<SaturatingSignedType>,
2214 SaturatingSignedType,
2215 vlmul,
2216 vta,
2217 vma,
2218 kVxrm>(args.dst, args.src, UnsignedType{args.uimm});
2219 case Decoder::VOpIViOpcode::kVssrlvi:
2220 return OpVectorvx<intrinsics::Vssrvx<UnsignedType>, UnsignedType, vlmul, vta, vma, kVxrm>(
2221 args.dst, args.src, UnsignedType{args.uimm});
2222 case Decoder::VOpIViOpcode::kVssravi:
2223 return OpVectorvx<intrinsics::Vssrvx<SignedType>, SignedType, vlmul, vta, vma, kVxrm>(
2224 args.dst, args.src, BitCastToSigned(UnsignedType{args.uimm}));
2225 default:
2226 Undefined();
2227 }
2228 }
2229
2230 template <typename ElementType, VectorRegisterGroupMultiplier vlmul, TailProcessing vta, auto vma>
2231   void OpVector(const Decoder::VOpIVvArgs& args) {
2232 using SignedType = berberis::SignedType<ElementType>;
2233 using UnsignedType = berberis::UnsignedType<ElementType>;
2234 using SaturatingSignedType = SaturatingType<SignedType>;
2235 using SaturatingUnsignedType = SaturatingType<UnsignedType>;
2236 // Keep cases sorted in opcode order to match RISC-V V manual.
2237 switch (args.opcode) {
2238 case Decoder::VOpIVvOpcode::kVaddvv:
2239 return OpVectorvv<intrinsics::Vaddvv<ElementType>, ElementType, vlmul, vta, vma>(
2240 args.dst, args.src1, args.src2);
2241 case Decoder::VOpIVvOpcode::kVsubvv:
2242 return OpVectorvv<intrinsics::Vsubvv<ElementType>, ElementType, vlmul, vta, vma>(
2243 args.dst, args.src1, args.src2);
2244 case Decoder::VOpIVvOpcode::kVandvv:
2245 return OpVectorvv<intrinsics::Vandvv<ElementType>, ElementType, vlmul, vta, vma>(
2246 args.dst, args.src1, args.src2);
2247 case Decoder::VOpIVvOpcode::kVorvv:
2248 return OpVectorvv<intrinsics::Vorvv<ElementType>, ElementType, vlmul, vta, vma>(
2249 args.dst, args.src1, args.src2);
2250 case Decoder::VOpIVvOpcode::kVxorvv:
2251 return OpVectorvv<intrinsics::Vxorvv<ElementType>, ElementType, vlmul, vta, vma>(
2252 args.dst, args.src1, args.src2);
2253 case Decoder::VOpIVvOpcode::kVrgathervv: {
2254 constexpr size_t kRegistersInvolved = NumberOfRegistersInvolved(vlmul);
2255 if (!IsAligned<kRegistersInvolved>(args.src2)) {
2256 return Undefined();
2257 }
2258 constexpr size_t vlmax = GetVlmax<ElementType, vlmul>();
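        // Take a local snapshot of the index operand (vs1) so that the gather callback below can
        // simply read plain array elements while the destination group is being written.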
2259 alignas(alignof(SIMD128Register)) ElementType indexes[vlmax];
2260 memcpy(indexes, state_->cpu.v + args.src2, sizeof(indexes));
2261 return OpVectorGather<ElementType, vlmul, vta, vma>(
2262 args.dst, args.src1, [&indexes](size_t index) { return indexes[index]; });
2263 }
2264 case Decoder::VOpIVvOpcode::kVadcvv:
2265 return OpVectorvvm<intrinsics::Vadcvv<SignedType>,
2266 SignedType,
2267 NumberOfRegistersInvolved(vlmul),
2268 vta,
2269 vma>(args.dst, args.src1, args.src2);
2270 case Decoder::VOpIVvOpcode::kVsbcvv:
2271 return OpVectorvvm<intrinsics::Vsbcvv<SignedType>,
2272 SignedType,
2273 NumberOfRegistersInvolved(vlmul),
2274 vta,
2275 vma>(args.dst, args.src1, args.src2);
2276 case Decoder::VOpIVvOpcode::kVmseqvv:
2277 return OpVectorToMaskvv<intrinsics::Vseqvv<ElementType>, ElementType, vlmul, vma>(
2278 args.dst, args.src1, args.src2);
2279 case Decoder::VOpIVvOpcode::kVmsnevv:
2280 return OpVectorToMaskvv<intrinsics::Vsnevv<ElementType>, ElementType, vlmul, vma>(
2281 args.dst, args.src1, args.src2);
2282 case Decoder::VOpIVvOpcode::kVmsltuvv:
2283 return OpVectorToMaskvv<intrinsics::Vsltvv<UnsignedType>, ElementType, vlmul, vma>(
2284 args.dst, args.src1, args.src2);
2285 case Decoder::VOpIVvOpcode::kVmsltvv:
2286 return OpVectorToMaskvv<intrinsics::Vsltvv<SignedType>, ElementType, vlmul, vma>(
2287 args.dst, args.src1, args.src2);
2288 case Decoder::VOpIVvOpcode::kVmsleuvv:
2289 return OpVectorToMaskvv<intrinsics::Vslevv<UnsignedType>, ElementType, vlmul, vma>(
2290 args.dst, args.src1, args.src2);
2291 case Decoder::VOpIVvOpcode::kVmslevv:
2292 return OpVectorToMaskvv<intrinsics::Vslevv<SignedType>, ElementType, vlmul, vma>(
2293 args.dst, args.src1, args.src2);
2294 case Decoder::VOpIVvOpcode::kVsadduvv:
2295 return OpVectorvv<intrinsics::Vaddvv<SaturatingUnsignedType>,
2296 SaturatingUnsignedType,
2297 vlmul,
2298 vta,
2299 vma>(args.dst, args.src1, args.src2);
2300 case Decoder::VOpIVvOpcode::kVsaddvv:
2301 return OpVectorvv<intrinsics::Vaddvv<SaturatingSignedType>,
2302 SaturatingSignedType,
2303 vlmul,
2304 vta,
2305 vma>(args.dst, args.src1, args.src2);
2306 case Decoder::VOpIVvOpcode::kVssubuvv:
2307 return OpVectorvv<intrinsics::Vsubvv<SaturatingUnsignedType>,
2308 SaturatingUnsignedType,
2309 vlmul,
2310 vta,
2311 vma>(args.dst, args.src1, args.src2);
2312 case Decoder::VOpIVvOpcode::kVssubvv:
2313 return OpVectorvv<intrinsics::Vsubvv<SaturatingSignedType>,
2314 SaturatingSignedType,
2315 vlmul,
2316 vta,
2317 vma>(args.dst, args.src1, args.src2);
2318 case Decoder::VOpIVvOpcode::kVsllvv:
2319 return OpVectorvv<intrinsics::Vslvv<ElementType>, ElementType, vlmul, vta, vma>(
2320 args.dst, args.src1, args.src2);
2321 case Decoder::VOpIVvOpcode::kVsrlvv:
2322 return OpVectorvv<intrinsics::Vsrvv<UnsignedType>, ElementType, vlmul, vta, vma>(
2323 args.dst, args.src1, args.src2);
2324 case Decoder::VOpIVvOpcode::kVsravv:
2325 return OpVectorvv<intrinsics::Vsrvv<SignedType>, ElementType, vlmul, vta, vma>(
2326 args.dst, args.src1, args.src2);
2327 case Decoder::VOpIVvOpcode::kVminuvv:
2328 return OpVectorvv<intrinsics::Vminvv<UnsignedType>, ElementType, vlmul, vta, vma>(
2329 args.dst, args.src1, args.src2);
2330 case Decoder::VOpIVvOpcode::kVminvv:
2331 return OpVectorvv<intrinsics::Vminvv<SignedType>, ElementType, vlmul, vta, vma>(
2332 args.dst, args.src1, args.src2);
2333 case Decoder::VOpIVvOpcode::kVmaxuvv:
2334 return OpVectorvv<intrinsics::Vmaxvv<UnsignedType>, ElementType, vlmul, vta, vma>(
2335 args.dst, args.src1, args.src2);
2336 case Decoder::VOpIVvOpcode::kVmaxvv:
2337 return OpVectorvv<intrinsics::Vmaxvv<SignedType>, ElementType, vlmul, vta, vma>(
2338 args.dst, args.src1, args.src2);
2339 case Decoder::VOpIVvOpcode::kVmergevv:
2340 if constexpr (std::is_same_v<decltype(vma), intrinsics::NoInactiveProcessing>) {
2341 if (args.src1 != 0) {
2342 return Undefined();
2343 }
2344 return OpVectorv<intrinsics::Vcopyv<ElementType>, ElementType, vlmul, vta, vma>(
2345 args.dst, args.src2);
2346 } else {
2347 return OpVectorv<intrinsics::Vcopyv<ElementType>,
2348 ElementType,
2349 vlmul,
2350 vta,
2351 // Always use "undisturbed" value from source register.
2352 InactiveProcessing::kUndisturbed>(
2353 args.dst, args.src2, /*dst_mask=*/args.src1);
2354 }
2355 case Decoder::VOpIVvOpcode::kVnsrawv:
2356 return OpVectorNarrowwv<intrinsics::Vnsrwv<SignedType>, SignedType, vlmul, vta, vma>(
2357 args.dst, args.src1, args.src2);
2358 case Decoder::VOpIVvOpcode::kVnsrlwv:
2359 return OpVectorNarrowwv<intrinsics::Vnsrwv<UnsignedType>, UnsignedType, vlmul, vta, vma>(
2360 args.dst, args.src1, args.src2);
2361 case Decoder::VOpIVvOpcode::kVsmulvv:
2362 return OpVectorvv<intrinsics::Vsmulvv<SaturatingSignedType>,
2363 ElementType,
2364 vlmul,
2365 vta,
2366 vma,
2367 kVxrm>(args.dst, args.src1, args.src2);
2368 case Decoder::VOpIVvOpcode::kVssrlvv:
2369 return OpVectorvv<intrinsics::Vssrvv<UnsignedType>, UnsignedType, vlmul, vta, vma, kVxrm>(
2370 args.dst, args.src1, args.src2);
2371 case Decoder::VOpIVvOpcode::kVssravv:
2372 return OpVectorvv<intrinsics::Vssrvv<SignedType>, SignedType, vlmul, vta, vma, kVxrm>(
2373 args.dst, args.src1, args.src2);
2374 case Decoder::VOpIVvOpcode::kVnclipuwv:
2375 return OpVectorNarrowwv<intrinsics::Vnclipwv<SaturatingUnsignedType>,
2376 SaturatingUnsignedType,
2377 vlmul,
2378 vta,
2379 vma,
2380 kVxrm>(args.dst, args.src1, args.src2);
2381 case Decoder::VOpIVvOpcode::kVnclipwv:
2382 return OpVectorNarrowwv<intrinsics::Vnclipwv<SaturatingSignedType>,
2383 SaturatingSignedType,
2384 vlmul,
2385 vta,
2386 vma,
2387 kVxrm>(args.dst, args.src1, args.src2);
2388 case Decoder::VOpIVvOpcode::kVwredsumuvs:
2389 return OpVectorvs<intrinsics::Vredsumvs<UnsignedType, WideType<UnsignedType>>,
2390 UnsignedType,
2391 WideType<UnsignedType>,
2392 vlmul,
2393 vta,
2394 vma>(args.dst, Vec<UnsignedType{}>{args.src1}, args.src2);
2395 case Decoder::VOpIVvOpcode::kVwredsumvs:
2396 return OpVectorvs<intrinsics::Vredsumvs<SignedType, WideType<SignedType>>,
2397 SignedType,
2398 WideType<SignedType>,
2399 vlmul,
2400 vta,
2401 vma>(args.dst, Vec<SignedType{}>{args.src1}, args.src2);
2402 default:
2403 Undefined();
2404 }
2405 }
2406
2407 template <typename ElementType, VectorRegisterGroupMultiplier vlmul, TailProcessing vta, auto vma>
2408   void OpVector(const Decoder::VOpIVxArgs& args, Register arg2) {
2409 using SignedType = berberis::SignedType<ElementType>;
2410 using UnsignedType = berberis::UnsignedType<ElementType>;
2411 using SaturatingSignedType = SaturatingType<SignedType>;
2412 using SaturatingUnsignedType = SaturatingType<UnsignedType>;
2413 // Keep cases sorted in opcode order to match RISC-V V manual.
2414 switch (args.opcode) {
2415 case Decoder::VOpIVxOpcode::kVaddvx:
2416 return OpVectorvx<intrinsics::Vaddvx<ElementType>, ElementType, vlmul, vta, vma>(
2417 args.dst, args.src1, MaybeTruncateTo<ElementType>(arg2));
2418 case Decoder::VOpIVxOpcode::kVsubvx:
2419 return OpVectorvx<intrinsics::Vsubvx<ElementType>, ElementType, vlmul, vta, vma>(
2420 args.dst, args.src1, MaybeTruncateTo<ElementType>(arg2));
2421 case Decoder::VOpIVxOpcode::kVrsubvx:
2422 return OpVectorvx<intrinsics::Vrsubvx<ElementType>, ElementType, vlmul, vta, vma>(
2423 args.dst, args.src1, MaybeTruncateTo<ElementType>(arg2));
2424 case Decoder::VOpIVxOpcode::kVandvx:
2425 return OpVectorvx<intrinsics::Vandvx<ElementType>, ElementType, vlmul, vta, vma>(
2426 args.dst, args.src1, MaybeTruncateTo<ElementType>(arg2));
2427 case Decoder::VOpIVxOpcode::kVorvx:
2428 return OpVectorvx<intrinsics::Vorvx<ElementType>, ElementType, vlmul, vta, vma>(
2429 args.dst, args.src1, MaybeTruncateTo<ElementType>(arg2));
2430 case Decoder::VOpIVxOpcode::kVxorvx:
2431 return OpVectorvx<intrinsics::Vxorvx<ElementType>, ElementType, vlmul, vta, vma>(
2432 args.dst, args.src1, MaybeTruncateTo<ElementType>(arg2));
2433 case Decoder::VOpIVxOpcode::kVrgathervx:
2434 return OpVectorGather<ElementType, vlmul, vta, vma>(
2435 args.dst, args.src1, [&arg2](size_t /*index*/) {
2436 return MaybeTruncateTo<ElementType>(arg2);
2437 });
2438 case Decoder::VOpIVxOpcode::kVadcvx:
2439 return OpVectorvxm<intrinsics::Vadcvx<ElementType>,
2440 ElementType,
2441 NumberOfRegistersInvolved(vlmul),
2442 vta,
2443 vma>(args.dst, args.src1, MaybeTruncateTo<ElementType>(arg2));
2444 case Decoder::VOpIVxOpcode::kVsbcvx:
2445 return OpVectorvxm<intrinsics::Vsbcvx<ElementType>,
2446 ElementType,
2447 NumberOfRegistersInvolved(vlmul),
2448 vta,
2449 vma>(args.dst, args.src1, MaybeTruncateTo<ElementType>(arg2));
2450 case Decoder::VOpIVxOpcode::kVmseqvx:
2451 return OpVectorToMaskvx<intrinsics::Vseqvx<ElementType>, ElementType, vlmul, vma>(
2452 args.dst, args.src1, MaybeTruncateTo<ElementType>(arg2));
2453 case Decoder::VOpIVxOpcode::kVmsnevx:
2454 return OpVectorToMaskvx<intrinsics::Vsnevx<ElementType>, ElementType, vlmul, vma>(
2455 args.dst, args.src1, MaybeTruncateTo<ElementType>(arg2));
2456 case Decoder::VOpIVxOpcode::kVmsltuvx:
2457 return OpVectorToMaskvx<intrinsics::Vsltvx<UnsignedType>, UnsignedType, vlmul, vma>(
2458 args.dst, args.src1, MaybeTruncateTo<UnsignedType>(arg2));
2459 case Decoder::VOpIVxOpcode::kVmsltvx:
2460 return OpVectorToMaskvx<intrinsics::Vsltvx<SignedType>, SignedType, vlmul, vma>(
2461 args.dst, args.src1, MaybeTruncateTo<SignedType>(arg2));
2462 case Decoder::VOpIVxOpcode::kVmsleuvx:
2463 return OpVectorToMaskvx<intrinsics::Vslevx<UnsignedType>, UnsignedType, vlmul, vma>(
2464 args.dst, args.src1, MaybeTruncateTo<UnsignedType>(arg2));
2465 case Decoder::VOpIVxOpcode::kVmslevx:
2466 return OpVectorToMaskvx<intrinsics::Vslevx<SignedType>, SignedType, vlmul, vma>(
2467 args.dst, args.src1, MaybeTruncateTo<SignedType>(arg2));
2468 case Decoder::VOpIVxOpcode::kVmsgtuvx:
2469 return OpVectorToMaskvx<intrinsics::Vsgtvx<UnsignedType>, UnsignedType, vlmul, vma>(
2470 args.dst, args.src1, MaybeTruncateTo<UnsignedType>(arg2));
2471 case Decoder::VOpIVxOpcode::kVmsgtvx:
2472 return OpVectorToMaskvx<intrinsics::Vsgtvx<SignedType>, SignedType, vlmul, vma>(
2473 args.dst, args.src1, MaybeTruncateTo<SignedType>(arg2));
2474 case Decoder::VOpIVxOpcode::kVsadduvx:
2475 return OpVectorvx<intrinsics::Vaddvx<SaturatingUnsignedType>,
2476 SaturatingUnsignedType,
2477 vlmul,
2478 vta,
2479 vma>(args.dst, args.src1, MaybeTruncateTo<ElementType>(arg2));
2480 case Decoder::VOpIVxOpcode::kVsaddvx:
2481 return OpVectorvx<intrinsics::Vaddvx<SaturatingSignedType>,
2482 SaturatingSignedType,
2483 vlmul,
2484 vta,
2485 vma>(args.dst, args.src1, MaybeTruncateTo<ElementType>(arg2));
2486 case Decoder::VOpIVxOpcode::kVssubuvx:
2487 return OpVectorvx<intrinsics::Vsubvx<SaturatingUnsignedType>,
2488 SaturatingUnsignedType,
2489 vlmul,
2490 vta,
2491 vma>(args.dst, args.src1, MaybeTruncateTo<ElementType>(arg2));
2492 case Decoder::VOpIVxOpcode::kVssubvx:
2493 return OpVectorvx<intrinsics::Vsubvx<SaturatingSignedType>,
2494 SaturatingSignedType,
2495 vlmul,
2496 vta,
2497 vma>(args.dst, args.src1, MaybeTruncateTo<ElementType>(arg2));
2498 case Decoder::VOpIVxOpcode::kVsllvx:
2499 return OpVectorvx<intrinsics::Vslvx<ElementType>, ElementType, vlmul, vta, vma>(
2500 args.dst, args.src1, MaybeTruncateTo<ElementType>(arg2));
2501 case Decoder::VOpIVxOpcode::kVsrlvx:
2502 return OpVectorvx<intrinsics::Vsrvx<UnsignedType>, UnsignedType, vlmul, vta, vma>(
2503 args.dst, args.src1, MaybeTruncateTo<UnsignedType>(arg2));
2504 case Decoder::VOpIVxOpcode::kVsravx:
2505 return OpVectorvx<intrinsics::Vsrvx<SignedType>, SignedType, vlmul, vta, vma>(
2506 args.dst, args.src1, MaybeTruncateTo<SignedType>(arg2));
2507 case Decoder::VOpIVxOpcode::kVminuvx:
2508 return OpVectorvx<intrinsics::Vminvx<UnsignedType>, UnsignedType, vlmul, vta, vma>(
2509 args.dst, args.src1, MaybeTruncateTo<UnsignedType>(arg2));
2510 case Decoder::VOpIVxOpcode::kVminvx:
2511 return OpVectorvx<intrinsics::Vminvx<SignedType>, SignedType, vlmul, vta, vma>(
2512 args.dst, args.src1, MaybeTruncateTo<SignedType>(arg2));
2513 case Decoder::VOpIVxOpcode::kVmaxuvx:
2514 return OpVectorvx<intrinsics::Vmaxvx<UnsignedType>, UnsignedType, vlmul, vta, vma>(
2515 args.dst, args.src1, MaybeTruncateTo<UnsignedType>(arg2));
2516 case Decoder::VOpIVxOpcode::kVmaxvx:
2517 return OpVectorvx<intrinsics::Vmaxvx<SignedType>, SignedType, vlmul, vta, vma>(
2518 args.dst, args.src1, MaybeTruncateTo<SignedType>(arg2));
2519 case Decoder::VOpIVxOpcode::kVmergevx:
2520 if constexpr (std::is_same_v<decltype(vma), intrinsics::NoInactiveProcessing>) {
2521 if (args.src1 != 0) {
2522 return Undefined();
2523 }
2524 return OpVectorx<intrinsics::Vcopyx<ElementType>, ElementType, vlmul, vta, vma>(
2525 args.dst, MaybeTruncateTo<ElementType>(arg2));
2526 } else {
2527 return OpVectorx<intrinsics::Vcopyx<ElementType>,
2528 ElementType,
2529 vlmul,
2530 vta,
2531 // Always use "undisturbed" value from source register.
2532 InactiveProcessing::kUndisturbed>(
2533 args.dst, MaybeTruncateTo<ElementType>(arg2), /*dst_mask=*/args.src1);
2534 }
2535 case Decoder::VOpIVxOpcode::kVnsrawx:
2536 return OpVectorNarrowwx<intrinsics::Vnsrwx<SignedType>, SignedType, vlmul, vta, vma>(
2537 args.dst, args.src1, MaybeTruncateTo<SignedType>(arg2));
2538 case Decoder::VOpIVxOpcode::kVnsrlwx:
2539 return OpVectorNarrowwx<intrinsics::Vnsrwx<UnsignedType>, UnsignedType, vlmul, vta, vma>(
2540 args.dst, args.src1, MaybeTruncateTo<UnsignedType>(arg2));
2541 case Decoder::VOpIVxOpcode::kVslideupvx:
2542 return OpVectorslideup<ElementType, vlmul, vta, vma>(
2543 args.dst, args.src1, MaybeTruncateTo<UnsignedType>(arg2));
2544 case Decoder::VOpIVxOpcode::kVslidedownvx:
2545 return OpVectorslidedown<ElementType, vlmul, vta, vma>(
2546 args.dst, args.src1, MaybeTruncateTo<UnsignedType>(arg2));
2547 case Decoder::VOpIVxOpcode::kVsmulvx:
2548 return OpVectorvx<intrinsics::Vsmulvx<SaturatingSignedType>,
2549 SaturatingSignedType,
2550 vlmul,
2551 vta,
2552 vma,
2553 kVxrm>(args.dst, args.src1, MaybeTruncateTo<SignedType>(arg2));
2554 case Decoder::VOpIVxOpcode::kVssrlvx:
2555 return OpVectorvx<intrinsics::Vssrvx<UnsignedType>, UnsignedType, vlmul, vta, vma, kVxrm>(
2556 args.dst, args.src1, MaybeTruncateTo<UnsignedType>(arg2));
2557 case Decoder::VOpIVxOpcode::kVssravx:
2558 return OpVectorvx<intrinsics::Vssrvx<SignedType>, SignedType, vlmul, vta, vma, kVxrm>(
2559 args.dst, args.src1, MaybeTruncateTo<SignedType>(arg2));
2560 case Decoder::VOpIVxOpcode::kVnclipuwx:
2561 return OpVectorNarrowwx<intrinsics::Vnclipwx<SaturatingUnsignedType>,
2562 SaturatingUnsignedType,
2563 vlmul,
2564 vta,
2565 vma,
2566 kVxrm>(args.dst, args.src1, MaybeTruncateTo<UnsignedType>(arg2));
2567 case Decoder::VOpIVxOpcode::kVnclipwx:
2568 return OpVectorNarrowwx<intrinsics::Vnclipwx<SaturatingSignedType>,
2569 SaturatingSignedType,
2570 vlmul,
2571 vta,
2572 vma,
2573 kVxrm>(args.dst, args.src1, MaybeTruncateTo<SignedType>(arg2));
2574 default:
2575 Undefined();
2576 }
2577 }
2578
2579 template <typename ElementType, VectorRegisterGroupMultiplier vlmul, TailProcessing vta, auto vma>
2580   void OpVector(const Decoder::VOpMVvArgs& args) {
2581 using SignedType = berberis::SignedType<ElementType>;
2582 using UnsignedType = berberis::UnsignedType<ElementType>;
2583 if constexpr (std::is_same_v<decltype(vma), intrinsics::NoInactiveProcessing>) {
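      // Mask-register logical instructions (vmand.mm, vmor.mm, ...) are always unmasked in the
      // ISA, so they only need to be handled when no inactive-element processing was requested.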
2584 // Keep cases sorted in opcode order to match RISC-V V manual.
2585 switch (args.opcode) {
2586 case Decoder::VOpMVvOpcode::kVmandnmm:
2587 return OpVectormm<[](SIMD128Register lhs, SIMD128Register rhs) { return lhs & ~rhs; }>(
2588 args.dst, args.src1, args.src2);
2589 case Decoder::VOpMVvOpcode::kVmandmm:
2590 return OpVectormm<[](SIMD128Register lhs, SIMD128Register rhs) { return lhs & rhs; }>(
2591 args.dst, args.src1, args.src2);
2592 case Decoder::VOpMVvOpcode::kVmormm:
2593 return OpVectormm<[](SIMD128Register lhs, SIMD128Register rhs) { return lhs | rhs; }>(
2594 args.dst, args.src1, args.src2);
2595 case Decoder::VOpMVvOpcode::kVmxormm:
2596 return OpVectormm<[](SIMD128Register lhs, SIMD128Register rhs) { return lhs ^ rhs; }>(
2597 args.dst, args.src1, args.src2);
2598 case Decoder::VOpMVvOpcode::kVmornmm:
2599 return OpVectormm<[](SIMD128Register lhs, SIMD128Register rhs) { return lhs | ~rhs; }>(
2600 args.dst, args.src1, args.src2);
2601 case Decoder::VOpMVvOpcode::kVmnandmm:
2602 return OpVectormm<[](SIMD128Register lhs, SIMD128Register rhs) { return ~(lhs & rhs); }>(
2603 args.dst, args.src1, args.src2);
2604 case Decoder::VOpMVvOpcode::kVmnormm:
2605 return OpVectormm<[](SIMD128Register lhs, SIMD128Register rhs) { return ~(lhs | rhs); }>(
2606 args.dst, args.src1, args.src2);
2607 case Decoder::VOpMVvOpcode::kVmxnormm:
2608 return OpVectormm<[](SIMD128Register lhs, SIMD128Register rhs) { return ~(lhs ^ rhs); }>(
2609 args.dst, args.src1, args.src2);
2610 default:; // Do nothing: handled in next switch.
2611 }
2612 }
2613 // Keep cases sorted in opcode order to match RISC-V V manual.
2614 switch (args.opcode) {
2615 case Decoder::VOpMVvOpcode::kVredsumvs:
2616 return OpVectorvs<intrinsics::Vredsumvs<ElementType>, ElementType, vlmul, vta, vma>(
2617 args.dst, Vec<ElementType{}>{args.src1}, args.src2);
2618 case Decoder::VOpMVvOpcode::kVredandvs:
2619 return OpVectorvs<intrinsics::Vredandvs<ElementType>, ElementType, vlmul, vta, vma>(
2620 args.dst, Vec<~ElementType{}>{args.src1}, args.src2);
2621 case Decoder::VOpMVvOpcode::kVredorvs:
2622 return OpVectorvs<intrinsics::Vredorvs<ElementType>, ElementType, vlmul, vta, vma>(
2623 args.dst, Vec<ElementType{}>{args.src1}, args.src2);
2624 case Decoder::VOpMVvOpcode::kVredxorvs:
2625 return OpVectorvs<intrinsics::Vredxorvs<ElementType>, ElementType, vlmul, vta, vma>(
2626 args.dst, Vec<ElementType{}>{args.src1}, args.src2);
2627 case Decoder::VOpMVvOpcode::kVredminuvs:
2628 return OpVectorvs<intrinsics::Vredminvs<UnsignedType>, UnsignedType, vlmul, vta, vma>(
2629 args.dst,
2630 Vec<UnsignedType{std::numeric_limits<typename UnsignedType::BaseType>::max()}>{
2631 args.src1},
2632 args.src2);
2633 case Decoder::VOpMVvOpcode::kVredminvs:
2634 return OpVectorvs<intrinsics::Vredminvs<SignedType>, SignedType, vlmul, vta, vma>(
2635 args.dst,
2636 Vec<SignedType{std::numeric_limits<typename SignedType::BaseType>::max()}>{args.src1},
2637 args.src2);
2638 case Decoder::VOpMVvOpcode::kVredmaxuvs:
2639 return OpVectorvs<intrinsics::Vredmaxvs<UnsignedType>, UnsignedType, vlmul, vta, vma>(
2640 args.dst, Vec<UnsignedType{}>{args.src1}, args.src2);
2641 case Decoder::VOpMVvOpcode::kVredmaxvs:
2642 return OpVectorvs<intrinsics::Vredmaxvs<SignedType>, SignedType, vlmul, vta, vma>(
2643 args.dst,
2644 Vec<SignedType{std::numeric_limits<typename SignedType::BaseType>::min()}>{args.src1},
2645 args.src2);
2646 case Decoder::VOpMVvOpcode::kVaadduvv:
2647 return OpVectorvv<intrinsics::Vaaddvv<UnsignedType>, UnsignedType, vlmul, vta, vma, kVxrm>(
2648 args.dst, args.src1, args.src2);
2649 case Decoder::VOpMVvOpcode::kVaaddvv:
2650 return OpVectorvv<intrinsics::Vaaddvv<SignedType>, SignedType, vlmul, vta, vma, kVxrm>(
2651 args.dst, args.src1, args.src2);
2652 case Decoder::VOpMVvOpcode::kVasubuvv:
2653 return OpVectorvv<intrinsics::Vasubvv<UnsignedType>, UnsignedType, vlmul, vta, vma, kVxrm>(
2654 args.dst, args.src1, args.src2);
2655 case Decoder::VOpMVvOpcode::kVasubvv:
2656 return OpVectorvv<intrinsics::Vasubvv<SignedType>, SignedType, vlmul, vta, vma, kVxrm>(
2657 args.dst, args.src1, args.src2);
2658 case Decoder::VOpMVvOpcode::kVWXUnary0:
2659 switch (args.vwxunary0_opcode) {
2660 case Decoder::VWXUnary0Opcode::kVmvxs:
2661 if constexpr (!std::is_same_v<decltype(vma), intrinsics::NoInactiveProcessing>) {
2662 return Undefined();
2663 }
2664 return OpVectorVmvxs<SignedType>(args.dst, args.src1);
2665 case Decoder::VWXUnary0Opcode::kVcpopm:
2666 return OpVectorVWXUnary0<intrinsics::Vcpopm<>, vma>(args.dst, args.src1);
2667 case Decoder::VWXUnary0Opcode::kVfirstm:
2668 return OpVectorVWXUnary0<intrinsics::Vfirstm<>, vma>(args.dst, args.src1);
2669 default:
2670 return Undefined();
2671 }
2672 case Decoder::VOpMVvOpcode::kVFUnary0:
2673 switch (args.vxunary0_opcode) {
2674 case Decoder::VXUnary0Opcode::kVzextvf2m:
2675 if constexpr (sizeof(UnsignedType) >= 2) {
2676 return OpVectorVXUnary0<intrinsics::Vextf2<UnsignedType>,
2677 UnsignedType,
2678 2,
2679 vlmul,
2680 vta,
2681 vma>(args.dst, args.src1);
2682 }
2683 break;
2684 case Decoder::VXUnary0Opcode::kVsextvf2m:
2685 if constexpr (sizeof(SignedType) >= 2) {
2686 return OpVectorVXUnary0<intrinsics::Vextf2<SignedType>,
2687 SignedType,
2688 2,
2689 vlmul,
2690 vta,
2691 vma>(args.dst, args.src1);
2692 }
2693 break;
2694 case Decoder::VXUnary0Opcode::kVzextvf4m:
2695 if constexpr (sizeof(UnsignedType) >= 4) {
2696 return OpVectorVXUnary0<intrinsics::Vextf4<UnsignedType>,
2697 UnsignedType,
2698 4,
2699 vlmul,
2700 vta,
2701 vma>(args.dst, args.src1);
2702 }
2703 break;
2704 case Decoder::VXUnary0Opcode::kVsextvf4m:
2705 if constexpr (sizeof(SignedType) >= 4) {
2706 return OpVectorVXUnary0<intrinsics::Vextf4<SignedType>,
2707 SignedType,
2708 4,
2709 vlmul,
2710 vta,
2711 vma>(args.dst, args.src1);
2712 }
2713 break;
2714 case Decoder::VXUnary0Opcode::kVzextvf8m:
2715 if constexpr (sizeof(UnsignedType) >= 8) {
2716 return OpVectorVXUnary0<intrinsics::Vextf8<UnsignedType>,
2717 UnsignedType,
2718 8,
2719 vlmul,
2720 vta,
2721 vma>(args.dst, args.src1);
2722 }
2723 break;
2724 case Decoder::VXUnary0Opcode::kVsextvf8m:
2725 if constexpr (sizeof(SignedType) >= 8) {
2726 return OpVectorVXUnary0<intrinsics::Vextf8<SignedType>,
2727 SignedType,
2728 8,
2729 vlmul,
2730 vta,
2731 vma>(args.dst, args.src1);
2732 }
2733 break;
2734 case Decoder::VXUnary0Opcode::kVbrev8v:
2735 return OpVectorv<intrinsics::Vbrev8v<ElementType>, ElementType, vlmul, vta, vma>(
2736 args.dst, args.src1);
2738 default:
2739 return Undefined();
2740 }
2741 return Undefined();
2742 case Decoder::VOpMVvOpcode::kVMUnary0:
2743 switch (args.vmunary0_opcode) {
2744 case Decoder::VMUnary0Opcode::kVmsbfm:
2745 return OpVectorVMUnary0<intrinsics::Vmsbfm<>, vma>(args.dst, args.src1);
2746 case Decoder::VMUnary0Opcode::kVmsofm:
2747 return OpVectorVMUnary0<intrinsics::Vmsofm<>, vma>(args.dst, args.src1);
2748 case Decoder::VMUnary0Opcode::kVmsifm:
2749 return OpVectorVMUnary0<intrinsics::Vmsifm<>, vma>(args.dst, args.src1);
2750 case Decoder::VMUnary0Opcode::kViotam:
2751 return OpVectorViotam<ElementType, vlmul, vta, vma>(args.dst, args.src1);
2752 case Decoder::VMUnary0Opcode::kVidv:
2753 if (args.src1) {
2754 return Undefined();
2755 }
2756 return OpVectorVidv<ElementType, vlmul, vta, vma>(args.dst);
2757 default:
2758 return Undefined();
2759 }
2760 case Decoder::VOpMVvOpcode::kVdivuvv:
2761 return OpVectorvv<intrinsics::Vdivvv<UnsignedType>, UnsignedType, vlmul, vta, vma>(
2762 args.dst, args.src1, args.src2);
2763 case Decoder::VOpMVvOpcode::kVdivvv:
2764 return OpVectorvv<intrinsics::Vdivvv<SignedType>, SignedType, vlmul, vta, vma>(
2765 args.dst, args.src1, args.src2);
2766 case Decoder::VOpMVvOpcode::kVremuvv:
2767 return OpVectorvv<intrinsics::Vremvv<UnsignedType>, UnsignedType, vlmul, vta, vma>(
2768 args.dst, args.src1, args.src2);
2769 case Decoder::VOpMVvOpcode::kVremvv:
2770 return OpVectorvv<intrinsics::Vremvv<SignedType>, SignedType, vlmul, vta, vma>(
2771 args.dst, args.src1, args.src2);
2772 case Decoder::VOpMVvOpcode::kVmulhuvv:
2773 return OpVectorvv<intrinsics::Vmulhvv<UnsignedType>, UnsignedType, vlmul, vta, vma>(
2774 args.dst, args.src1, args.src2);
2775 case Decoder::VOpMVvOpcode::kVmulvv:
2776 return OpVectorvv<intrinsics::Vmulvv<SignedType>, SignedType, vlmul, vta, vma>(
2777 args.dst, args.src1, args.src2);
2778 case Decoder::VOpMVvOpcode::kVmulhsuvv:
2779 return OpVectorvv<intrinsics::Vmulhsuvv<SignedType>, SignedType, vlmul, vta, vma>(
2780 args.dst, args.src1, args.src2);
2781 case Decoder::VOpMVvOpcode::kVmulhvv:
2782 return OpVectorvv<intrinsics::Vmulhvv<SignedType>, SignedType, vlmul, vta, vma>(
2783 args.dst, args.src1, args.src2);
2784 case Decoder::VOpMVvOpcode::kVmaddvv:
2785 return OpVectorvvv<intrinsics::Vmaddvv<ElementType>, ElementType, vlmul, vta, vma>(
2786 args.dst, args.src1, args.src2);
2787 case Decoder::VOpMVvOpcode::kVnmsubvv:
2788 return OpVectorvvv<intrinsics::Vnmsubvv<ElementType>, ElementType, vlmul, vta, vma>(
2789 args.dst, args.src1, args.src2);
2790 case Decoder::VOpMVvOpcode::kVmaccvv:
2791 return OpVectorvvv<intrinsics::Vmaccvv<ElementType>, ElementType, vlmul, vta, vma>(
2792 args.dst, args.src1, args.src2);
2793 case Decoder::VOpMVvOpcode::kVnmsacvv:
2794 return OpVectorvvv<intrinsics::Vnmsacvv<ElementType>, ElementType, vlmul, vta, vma>(
2795 args.dst, args.src1, args.src2);
2796 case Decoder::VOpMVvOpcode::kVwadduvv:
2797 return OpVectorWidenvv<intrinsics::Vwaddvv<UnsignedType>, UnsignedType, vlmul, vta, vma>(
2798 args.dst, args.src1, args.src2);
2799 case Decoder::VOpMVvOpcode::kVwaddvv:
2800 return OpVectorWidenvv<intrinsics::Vwaddvv<SignedType>, SignedType, vlmul, vta, vma>(
2801 args.dst, args.src1, args.src2);
2802 case Decoder::VOpMVvOpcode::kVwsubuvv:
2803 return OpVectorWidenvv<intrinsics::Vwsubvv<UnsignedType>, UnsignedType, vlmul, vta, vma>(
2804 args.dst, args.src1, args.src2);
2805 case Decoder::VOpMVvOpcode::kVwsubvv:
2806 return OpVectorWidenvv<intrinsics::Vwsubvv<SignedType>, SignedType, vlmul, vta, vma>(
2807 args.dst, args.src1, args.src2);
2808 case Decoder::VOpMVvOpcode::kVwadduwv:
2809 return OpVectorWidenwv<intrinsics::Vwaddwv<UnsignedType>, UnsignedType, vlmul, vta, vma>(
2810 args.dst, args.src1, args.src2);
2811 case Decoder::VOpMVvOpcode::kVwaddwv:
2812 return OpVectorWidenwv<intrinsics::Vwaddwv<SignedType>, SignedType, vlmul, vta, vma>(
2813 args.dst, args.src1, args.src2);
2814 case Decoder::VOpMVvOpcode::kVwsubuwv:
2815 return OpVectorWidenwv<intrinsics::Vwsubwv<UnsignedType>, UnsignedType, vlmul, vta, vma>(
2816 args.dst, args.src1, args.src2);
2817 case Decoder::VOpMVvOpcode::kVwsubwv:
2818 return OpVectorWidenwv<intrinsics::Vwsubwv<SignedType>, SignedType, vlmul, vta, vma>(
2819 args.dst, args.src1, args.src2);
2820 case Decoder::VOpMVvOpcode::kVwmuluvv:
2821 return OpVectorWidenvv<intrinsics::Vwmulvv<UnsignedType>, UnsignedType, vlmul, vta, vma>(
2822 args.dst, args.src1, args.src2);
2823 case Decoder::VOpMVvOpcode::kVwmulsuvv:
2824 return OpVectorWidenvv<intrinsics::Vwmulsuvv<ElementType>, ElementType, vlmul, vta, vma>(
2825 args.dst, args.src1, args.src2);
2826 case Decoder::VOpMVvOpcode::kVwmulvv:
2827 return OpVectorWidenvv<intrinsics::Vwmulvv<SignedType>, SignedType, vlmul, vta, vma>(
2828 args.dst, args.src1, args.src2);
2829 case Decoder::VOpMVvOpcode::kVwmaccuvv:
2830 return OpVectorWidenvvw<intrinsics::Vwmaccvv<UnsignedType>, UnsignedType, vlmul, vta, vma>(
2831 args.dst, args.src1, args.src2);
2832 case Decoder::VOpMVvOpcode::kVwmaccvv:
2833 return OpVectorWidenvvw<intrinsics::Vwmaccvv<SignedType>, SignedType, vlmul, vta, vma>(
2834 args.dst, args.src1, args.src2);
2835 case Decoder::VOpMVvOpcode::kVwmaccsuvv:
2836 return OpVectorWidenvvw<intrinsics::Vwmaccsuvv<ElementType>, ElementType, vlmul, vta, vma>(
2837 args.dst, args.src1, args.src2);
2838 default:
2839 Undefined();
2840 }
2841 }
2842
2843 template <typename ElementType, VectorRegisterGroupMultiplier vlmul, TailProcessing vta, auto vma>
OpVector(const Decoder::VOpMVxArgs & args,Register arg2)2844 void OpVector(const Decoder::VOpMVxArgs& args, Register arg2) {
2845 using SignedType = berberis::SignedType<ElementType>;
2846 using UnsignedType = berberis::UnsignedType<ElementType>;
2847     // Keep cases sorted in opcode order to match the RISC-V V manual.
2848 switch (args.opcode) {
2849 case Decoder::VOpMVxOpcode::kVaadduvx:
2850 return OpVectorvx<intrinsics::Vaaddvx<UnsignedType>, UnsignedType, vlmul, vta, vma, kVxrm>(
2851 args.dst, args.src1, MaybeTruncateTo<UnsignedType>(arg2));
2852 case Decoder::VOpMVxOpcode::kVaaddvx:
2853 return OpVectorvx<intrinsics::Vaaddvx<SignedType>, SignedType, vlmul, vta, vma, kVxrm>(
2854 args.dst, args.src1, MaybeTruncateTo<SignedType>(arg2));
2855 case Decoder::VOpMVxOpcode::kVasubuvx:
2856 return OpVectorvx<intrinsics::Vasubvx<UnsignedType>, UnsignedType, vlmul, vta, vma, kVxrm>(
2857 args.dst, args.src1, MaybeTruncateTo<UnsignedType>(arg2));
2858 case Decoder::VOpMVxOpcode::kVasubvx:
2859 return OpVectorvx<intrinsics::Vasubvx<SignedType>, SignedType, vlmul, vta, vma, kVxrm>(
2860 args.dst, args.src1, MaybeTruncateTo<SignedType>(arg2));
2861 case Decoder::VOpMVxOpcode::kVslide1upvx:
2862 return OpVectorslide1up<SignedType, vlmul, vta, vma>(
2863 args.dst, args.src1, MaybeTruncateTo<SignedType>(arg2));
2864 case Decoder::VOpMVxOpcode::kVslide1downvx:
2865 return OpVectorslide1down<SignedType, vlmul, vta, vma>(
2866 args.dst, args.src1, MaybeTruncateTo<SignedType>(arg2));
2867 case Decoder::VOpMVxOpcode::kVRXUnary0:
2868 switch (args.vrxunary0_opcode) {
2869 case Decoder::VRXUnary0Opcode::kVmvsx:
2870 if constexpr (!std::is_same_v<decltype(vma), intrinsics::NoInactiveProcessing>) {
2871 return Undefined();
2872 }
2873 return OpVectorVmvsx<SignedType, vta>(args.dst, MaybeTruncateTo<SignedType>(arg2));
2874 default:
2875 return Undefined();
2876 }
2877 case Decoder::VOpMVxOpcode::kVmulhuvx:
2878 return OpVectorvx<intrinsics::Vmulhvx<UnsignedType>, UnsignedType, vlmul, vta, vma>(
2879 args.dst, args.src1, MaybeTruncateTo<UnsignedType>(arg2));
2880 case Decoder::VOpMVxOpcode::kVmulvx:
2881 return OpVectorvx<intrinsics::Vmulvx<SignedType>, SignedType, vlmul, vta, vma>(
2882 args.dst, args.src1, MaybeTruncateTo<SignedType>(arg2));
2883 case Decoder::VOpMVxOpcode::kVdivuvx:
2884 return OpVectorvx<intrinsics::Vdivvx<UnsignedType>, UnsignedType, vlmul, vta, vma>(
2885 args.dst, args.src1, MaybeTruncateTo<UnsignedType>(arg2));
2886 case Decoder::VOpMVxOpcode::kVdivvx:
2887 return OpVectorvx<intrinsics::Vdivvx<SignedType>, SignedType, vlmul, vta, vma>(
2888 args.dst, args.src1, MaybeTruncateTo<SignedType>(arg2));
2889 case Decoder::VOpMVxOpcode::kVremuvx:
2890 return OpVectorvx<intrinsics::Vremvx<UnsignedType>, UnsignedType, vlmul, vta, vma>(
2891 args.dst, args.src1, MaybeTruncateTo<UnsignedType>(arg2));
2892 case Decoder::VOpMVxOpcode::kVremvx:
2893 return OpVectorvx<intrinsics::Vremvx<SignedType>, SignedType, vlmul, vta, vma>(
2894 args.dst, args.src1, MaybeTruncateTo<SignedType>(arg2));
2895 case Decoder::VOpMVxOpcode::kVmulhsuvx:
2896 return OpVectorvx<intrinsics::Vmulhsuvx<UnsignedType>, UnsignedType, vlmul, vta, vma>(
2897 args.dst, args.src1, MaybeTruncateTo<UnsignedType>(arg2));
2898 case Decoder::VOpMVxOpcode::kVmulhvx:
2899 return OpVectorvx<intrinsics::Vmulhvx<SignedType>, SignedType, vlmul, vta, vma>(
2900 args.dst, args.src1, MaybeTruncateTo<SignedType>(arg2));
2901 case Decoder::VOpMVxOpcode::kVmaddvx:
2902 return OpVectorvxv<intrinsics::Vmaddvx<ElementType>, ElementType, vlmul, vta, vma>(
2903 args.dst, args.src1, MaybeTruncateTo<ElementType>(arg2));
2904 case Decoder::VOpMVxOpcode::kVnmsubvx:
2905 return OpVectorvxv<intrinsics::Vnmsubvx<ElementType>, ElementType, vlmul, vta, vma>(
2906 args.dst, args.src1, MaybeTruncateTo<ElementType>(arg2));
2907 case Decoder::VOpMVxOpcode::kVmaccvx:
2908 return OpVectorvxv<intrinsics::Vmaccvx<ElementType>, ElementType, vlmul, vta, vma>(
2909 args.dst, args.src1, MaybeTruncateTo<ElementType>(arg2));
2910 case Decoder::VOpMVxOpcode::kVnmsacvx:
2911 return OpVectorvxv<intrinsics::Vnmsacvx<ElementType>, ElementType, vlmul, vta, vma>(
2912 args.dst, args.src1, MaybeTruncateTo<ElementType>(arg2));
2913 case Decoder::VOpMVxOpcode::kVwadduvx:
2914 return OpVectorWidenvx<intrinsics::Vwaddvx<UnsignedType>, UnsignedType, vlmul, vta, vma>(
2915 args.dst, args.src1, MaybeTruncateTo<UnsignedType>(arg2));
2916 case Decoder::VOpMVxOpcode::kVwaddvx:
2917 return OpVectorWidenvx<intrinsics::Vwaddvx<SignedType>, SignedType, vlmul, vta, vma>(
2918 args.dst, args.src1, MaybeTruncateTo<SignedType>(arg2));
2919 case Decoder::VOpMVxOpcode::kVwsubuvx:
2920 return OpVectorWidenvx<intrinsics::Vwsubvx<UnsignedType>, UnsignedType, vlmul, vta, vma>(
2921 args.dst, args.src1, MaybeTruncateTo<UnsignedType>(arg2));
2922 case Decoder::VOpMVxOpcode::kVwsubvx:
2923 return OpVectorWidenvx<intrinsics::Vwsubvx<SignedType>, SignedType, vlmul, vta, vma>(
2924 args.dst, args.src1, MaybeTruncateTo<SignedType>(arg2));
2925 case Decoder::VOpMVxOpcode::kVwadduwx:
2926 return OpVectorWidenwx<intrinsics::Vwaddwx<UnsignedType>, UnsignedType, vlmul, vta, vma>(
2927 args.dst, args.src1, MaybeTruncateTo<UnsignedType>(arg2));
2928 case Decoder::VOpMVxOpcode::kVwaddwx:
2929 return OpVectorWidenwx<intrinsics::Vwaddwx<SignedType>, SignedType, vlmul, vta, vma>(
2930 args.dst, args.src1, MaybeTruncateTo<SignedType>(arg2));
2931 case Decoder::VOpMVxOpcode::kVwsubuwx:
2932 return OpVectorWidenwx<intrinsics::Vwsubwx<UnsignedType>, UnsignedType, vlmul, vta, vma>(
2933 args.dst, args.src1, MaybeTruncateTo<UnsignedType>(arg2));
2934 case Decoder::VOpMVxOpcode::kVwsubwx:
2935 return OpVectorWidenwx<intrinsics::Vwsubwx<SignedType>, SignedType, vlmul, vta, vma>(
2936 args.dst, args.src1, MaybeTruncateTo<SignedType>(arg2));
2937 case Decoder::VOpMVxOpcode::kVwmuluvx:
2938 return OpVectorWidenvx<intrinsics::Vwmulvx<UnsignedType>, UnsignedType, vlmul, vta, vma>(
2939 args.dst, args.src1, MaybeTruncateTo<UnsignedType>(arg2));
2940 case Decoder::VOpMVxOpcode::kVwmulsuvx:
2941 return OpVectorWidenvx<intrinsics::Vwmulsuvx<ElementType>, ElementType, vlmul, vta, vma>(
2942 args.dst, args.src1, MaybeTruncateTo<ElementType>(arg2));
2943 case Decoder::VOpMVxOpcode::kVwmulvx:
2944 return OpVectorWidenvx<intrinsics::Vwmulvx<SignedType>, SignedType, vlmul, vta, vma>(
2945 args.dst, args.src1, MaybeTruncateTo<SignedType>(arg2));
2946 case Decoder::VOpMVxOpcode::kVwmaccuvx:
2947 return OpVectorWidenvxw<intrinsics::Vwmaccvx<UnsignedType>, UnsignedType, vlmul, vta, vma>(
2948 args.dst, args.src1, MaybeTruncateTo<UnsignedType>(arg2));
2949 case Decoder::VOpMVxOpcode::kVwmaccvx:
2950 return OpVectorWidenvxw<intrinsics::Vwmaccvx<SignedType>, SignedType, vlmul, vta, vma>(
2951 args.dst, args.src1, MaybeTruncateTo<SignedType>(arg2));
2952 case Decoder::VOpMVxOpcode::kVwmaccusvx:
2953 return OpVectorWidenvxw<intrinsics::Vwmaccusvx<ElementType>, ElementType, vlmul, vta, vma>(
2954 args.dst, args.src1, MaybeTruncateTo<ElementType>(arg2));
2955 case Decoder::VOpMVxOpcode::kVwmaccsuvx:
2956 return OpVectorWidenvxw<intrinsics::Vwmaccsuvx<ElementType>, ElementType, vlmul, vta, vma>(
2957 args.dst, args.src1, MaybeTruncateTo<ElementType>(arg2));
2958 default:
2959 Undefined();
2960 }
2961 }
2962
2963 template <typename DataElementType,
2964 VectorRegisterGroupMultiplier vlmul,
2965 typename IndexElementType,
2966 size_t kSegmentSize,
2967 size_t kIndexRegistersInvolved,
2968 TailProcessing vta,
2969 auto vma>
OpVector(const Decoder::VStoreIndexedArgs & args,Register src)2970 void OpVector(const Decoder::VStoreIndexedArgs& args, Register src) {
2971 return OpVector<DataElementType,
2972 kSegmentSize,
2973 NumberOfRegistersInvolved(vlmul),
2974 IndexElementType,
2975 kIndexRegistersInvolved,
2976 !std::is_same_v<decltype(vma), intrinsics::NoInactiveProcessing>>(args, src);
2977 }
2978
2979 template <typename DataElementType,
2980 size_t kSegmentSize,
2981 size_t kNumRegistersInGroup,
2982 typename IndexElementType,
2983 size_t kIndexRegistersInvolved,
2984 bool kUseMasking>
OpVector(const Decoder::VStoreIndexedArgs & args,Register src)2985 void OpVector(const Decoder::VStoreIndexedArgs& args, Register src) {
2986 if (!IsAligned<kIndexRegistersInvolved>(args.idx)) {
2987 return Undefined();
2988 }
2989 constexpr size_t kElementsCount = sizeof(SIMD128Register) / sizeof(IndexElementType);
2990 alignas(alignof(SIMD128Register))
2991 IndexElementType indexes[kElementsCount * kIndexRegistersInvolved];
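    // Copy the whole index register group into a local buffer so the offset callback below can read
    // the guest-supplied indexes without touching vector state again.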
2992 memcpy(indexes, state_->cpu.v + args.idx, sizeof(SIMD128Register) * kIndexRegistersInvolved);
2993 return OpVectorStore<DataElementType, kSegmentSize, kNumRegistersInGroup, kUseMasking>(
2994 args.data, src, [&indexes](size_t index) { return indexes[index]; });
2995 }
2996
2997 template <typename ElementType,
2998 size_t kSegmentSize,
2999 VectorRegisterGroupMultiplier vlmul,
3000 TailProcessing vta,
3001 auto vma>
OpVector(const Decoder::VStoreStrideArgs & args,Register src,Register stride)3002 void OpVector(const Decoder::VStoreStrideArgs& args, Register src, Register stride) {
3003 return OpVectorStore<ElementType,
3004 kSegmentSize,
3005 NumberOfRegistersInvolved(vlmul),
3006 !std::is_same_v<decltype(vma), intrinsics::NoInactiveProcessing>>(
3007 args.data, src, [stride](size_t index) { return stride * index; });
3008 }
3009
3010 template <typename ElementType,
3011 size_t kSegmentSize,
3012 VectorRegisterGroupMultiplier vlmul,
3013 TailProcessing vta,
3014 auto vma>
OpVector(const Decoder::VStoreUnitStrideArgs & args,Register src)3015 void OpVector(const Decoder::VStoreUnitStrideArgs& args, Register src) {
3016 switch (args.opcode) {
3017 case Decoder::VSUmOpOpcode::kVseXX:
3018 return OpVectorStore<ElementType,
3019 kSegmentSize,
3020 NumberOfRegistersInvolved(vlmul),
3021 !std::is_same_v<decltype(vma), intrinsics::NoInactiveProcessing>,
3022 Decoder::VSUmOpOpcode::kVseXX>(args.data, src, [](size_t index) {
3023 return kSegmentSize * sizeof(ElementType) * index;
3024 });
3025 case Decoder::VSUmOpOpcode::kVsm:
3026 if constexpr (kSegmentSize == 1 &&
3027 std::is_same_v<decltype(vma), intrinsics::NoInactiveProcessing>) {
3028 return OpVectorStore<UInt8,
3029 1,
3030 1,
3031 /*kUseMasking=*/false,
3032 Decoder::VSUmOpOpcode::kVsm>(
3033 args.data, src, [](size_t index) { return index; });
3034 }
3035 return Undefined();
3036 default:
3037 return Undefined();
3038 }
3039 }
3040
3041   // See the VLoadStrideArgs handler for an explanation of the semantics: VStoreStrideArgs is almost
3042   // symmetric, except it ignores the vta and vma modes and never alters inactive elements in memory.
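  // A rough sketch (illustrative only) of the byte offsets the GetElementOffset callback passed to
  // OpVectorStore produces for the store flavors handled above:
  //   unit-stride: kSegmentSize * sizeof(ElementType) * element_index
  //   strided:     stride * element_index
  //   indexed:     indexes[element_index], i.e. guest-supplied byte offsets
  // so the store loop below only ever deals with per-element byte offsets.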
3043 template <typename ElementType,
3044 size_t kSegmentSize,
3045 size_t kNumRegistersInGroup,
3046 bool kUseMasking,
3047 typename Decoder::VSUmOpOpcode opcode = typename Decoder::VSUmOpOpcode{},
3048 typename GetElementOffsetLambdaType>
3049 void OpVectorStore(uint8_t data, Register src, GetElementOffsetLambdaType GetElementOffset) {
3050 using MaskType = std::conditional_t<sizeof(ElementType) == sizeof(Int8), UInt16, UInt8>;
3051 if (!IsAligned<kNumRegistersInGroup>(data)) {
3052 return Undefined();
3053 }
3054 if (data + kNumRegistersInGroup * kSegmentSize > 32) {
3055 return Undefined();
3056 }
3057 constexpr size_t kElementsCount = 16 / sizeof(ElementType);
3058 size_t vstart = GetCsr<CsrName::kVstart>();
3059 size_t vl = GetCsr<CsrName::kVl>();
3060 if constexpr (opcode == Decoder::VSUmOpOpcode::kVsm) {
3061 vl = AlignUp<CHAR_BIT>(vl) / CHAR_BIT;
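      // E.g. vl == 17 mask bits is stored as AlignUp<8>(17) / 8 == 3 bytes.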
3062 }
3063     // In case of a memory access fault we may set vstart to a non-zero value, so set it to zero
3064     // here to simplify the logic below.
3065 SetCsr<CsrName::kVstart>(0);
3066 // When vstart >= vl, there are no body elements, and no elements are updated in any destination
3067 // vector register group, including that no tail elements are updated with agnostic values.
3068 if (vstart >= vl) [[unlikely]] {
3069       // Technically, since stores never touch tail elements this is not needed, but it makes it
3070       // easier to reason about the rest of the function.
3071 return;
3072 }
3073 char* ptr = ToHostAddr<char>(src);
3074     // Note: within_group_id is the current register id within a register group. During one
3075     // iteration of this loop we store results for all registers with the current id in all
3076     // groups. E.g. with kSegmentSize = 3 and two registers per group (data in v0) we'd store from
3077     // v0, v2, v4 during the first iteration (id within group = 0), and from v1, v3, v5 during the
3078     // second iteration (id within group = 1). This ensures that memory is always accessed in order.
3079 auto mask = GetMaskForVectorOperationsIfNeeded<kUseMasking>();
3080 for (size_t within_group_id = vstart / kElementsCount; within_group_id < kNumRegistersInGroup;
3081 ++within_group_id) {
3082 // No need to continue if we no longer have elements to store.
3083 if (within_group_id * kElementsCount >= vl) {
3084 break;
3085 }
3086 auto register_mask =
3087 std::get<0>(intrinsics::MaskForRegisterInSequence<ElementType>(mask, within_group_id));
3088 // Store elements to memory, but only if there are any active ones.
3089 for (size_t within_register_id = vstart % kElementsCount; within_register_id < kElementsCount;
3090 ++within_register_id) {
3091 size_t element_index = kElementsCount * within_group_id + within_register_id;
3092 // Stop if we reached the vl limit.
3093 if (vl <= element_index) {
3094 break;
3095 }
3096 // Don't touch masked-out elements.
3097 if constexpr (kUseMasking) {
3098 if ((MaskType(register_mask) & MaskType{static_cast<typename MaskType::BaseType>(
3099 1 << within_register_id)}) == MaskType{0}) {
3100 continue;
3101 }
3102 }
3103 // Store segment to memory.
3104 for (size_t field = 0; field < kSegmentSize; ++field) {
3105 bool exception_raised = FaultyStore(
3106 ptr + field * sizeof(ElementType) + GetElementOffset(element_index),
3107 sizeof(ElementType),
3108 SIMD128Register{state_->cpu.v[data + within_group_id + field * kNumRegistersInGroup]}
3109 .Get<ElementType>(within_register_id));
3110           // Stop processing if memory is inaccessible. It's also the only case where we have to
3111           // set vstart to a non-zero value!
3112 if (exception_raised) {
3113 SetCsr<CsrName::kVstart>(element_index);
3114 return;
3115 }
3116 }
3117 }
3118 // Next group should be fully processed.
3119 vstart = 0;
3120 }
3121 }
3122
3123 template <typename ElementType, VectorRegisterGroupMultiplier vlmul, TailProcessing vta, auto vma>
OpVectorViotam(uint8_t dst,uint8_t src1)3124 void OpVectorViotam(uint8_t dst, uint8_t src1) {
3125 return OpVectorViotam<ElementType, NumberOfRegistersInvolved(vlmul), vta, vma>(dst, src1);
3126 }
3127
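  // Note (illustrative): viota.m writes, into each active destination element i, the number of set
  // bits among the lower-numbered active elements of the source mask. E.g. for unmasked source mask
  // bits 1,0,1,1,0 in elements 0..4 the results are 0,1,1,2,3.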
3128 template <typename ElementType, size_t kRegistersInvolved, TailProcessing vta, auto vma>
OpVectorViotam(uint8_t dst,uint8_t src1)3129 void OpVectorViotam(uint8_t dst, uint8_t src1) {
3130 constexpr size_t kElementsCount = sizeof(SIMD128Register) / sizeof(ElementType);
3131 size_t vstart = GetCsr<CsrName::kVstart>();
3132 size_t vl = GetCsr<CsrName::kVl>();
3133 if (vstart != 0) {
3134 return Undefined();
3135 }
3136 // When vl = 0, there are no body elements, and no elements are updated in any destination
3137 // vector register group, including that no tail elements are updated with agnostic values.
3138 if (vl == 0) [[unlikely]] {
3139 return;
3140 }
3141 SIMD128Register arg1(state_->cpu.v[src1]);
3142 auto mask = GetMaskForVectorOperations<vma>();
3143 if constexpr (std::is_same_v<decltype(mask), SIMD128Register>) {
3144 arg1 &= mask;
3145 }
3146
3147 size_t counter = 0;
3148 for (size_t index = 0; index < kRegistersInvolved; ++index) {
3149 SIMD128Register result{state_->cpu.v[dst + index]};
3150       auto [iota_result, new_counter] = intrinsics::Viotam<ElementType>(arg1, counter);
3151       arg1.Set(arg1.Get<__uint128_t>() >> kElementsCount);
3152       counter = new_counter;
3153
3154       // Apply mask and put the computed iota values into the dst register.
3155       result =
3156           VectorMasking<ElementType, vta, vma>(result, iota_result, vstart, vl, index, mask);
3157 state_->cpu.v[dst + index] = result.Get<__uint128_t>();
3158 }
3159 }
3160
3161 template <typename ElementType, VectorRegisterGroupMultiplier vlmul, TailProcessing vta, auto vma>
OpVectorVidv(uint8_t dst)3162 void OpVectorVidv(uint8_t dst) {
3163 return OpVectorVidv<ElementType, NumberOfRegistersInvolved(vlmul), vta, vma>(dst);
3164 }
3165
3166 template <typename ElementType, size_t kRegistersInvolved, TailProcessing vta, auto vma>
OpVectorVidv(uint8_t dst)3167 void OpVectorVidv(uint8_t dst) {
3168 if (!IsAligned<kRegistersInvolved>(dst)) {
3169 return Undefined();
3170 }
3171 size_t vstart = GetCsr<CsrName::kVstart>();
3172 size_t vl = GetCsr<CsrName::kVl>();
3173 SetCsr<CsrName::kVstart>(0);
3174 // When vstart >= vl, there are no body elements, and no elements are updated in any destination
3175 // vector register group, including that no tail elements are updated with agnostic values.
3176 if (vstart >= vl) [[unlikely]] {
3177 return;
3178 }
3179 auto mask = GetMaskForVectorOperations<vma>();
3180 for (size_t index = 0; index < kRegistersInvolved; ++index) {
3181 SIMD128Register result{state_->cpu.v[dst + index]};
3182 result = VectorMasking<ElementType, vta, vma>(
3183 result, std::get<0>(intrinsics::Vidv<ElementType>(index)), vstart, vl, index, mask);
3184 state_->cpu.v[dst + index] = result.Get<__uint128_t>();
3185 }
3186 }
3187
3188 template <typename ElementType>
OpVectorVmvfs(uint8_t dst,uint8_t src)3189 void OpVectorVmvfs(uint8_t dst, uint8_t src) {
3190     // Note: intrinsics::NanBox always receives a Float64 argument, even if it processes a Float32
3191     // value, so as not to cause recursion in intrinsics handling.
3192     // NanBox in the interpreter takes an FpRegister and returns an FpRegister, which is probably
3193     // the cleanest way of processing that data (at least on x86-64 this produces code that's close
3194     // to optimal).
3195 NanBoxAndSetFpReg<ElementType>(dst, SIMD128Register{state_->cpu.v[src]}.Get<FpRegister>(0));
3196 SetCsr<CsrName::kVstart>(0);
3197 }
3198
3199 template <typename ElementType, TailProcessing vta>
OpVectorVmvsx(uint8_t dst,ElementType element)3200 void OpVectorVmvsx(uint8_t dst, ElementType element) {
3201 size_t vstart = GetCsr<CsrName::kVstart>();
3202 size_t vl = GetCsr<CsrName::kVl>();
3203     // The documentation doesn't specify what happens when vstart is non-zero but less than vl.
3204     // However, at least one hardware implementation treats it as a NOP:
3205     // https://github.com/riscv/riscv-v-spec/issues/937
3206     // We do the same here.
3207 if (vstart == 0 && vl != 0) [[likely]] {
3208 SIMD128Register result;
3209 if constexpr (vta == intrinsics::TailProcessing::kAgnostic) {
3210 result = ~SIMD128Register{};
3211 } else {
3212 result.Set(state_->cpu.v[dst]);
3213 }
3214 result.Set(element, 0);
3215 state_->cpu.v[dst] = result.Get<Int128>();
3216 }
3217 SetCsr<CsrName::kVstart>(0);
3218 }
3219
3220 template <typename ElementType>
OpVectorVmvxs(uint8_t dst,uint8_t src1)3221 void OpVectorVmvxs(uint8_t dst, uint8_t src1) {
3222 static_assert(ElementType::kIsSigned);
3223     // Conversion to Int64 performs sign-extension since the source element type is signed.
3224 Register element = Int64{SIMD128Register{state_->cpu.v[src1]}.Get<ElementType>(0)};
3225 SetRegOrIgnore(dst, element);
3226 SetCsr<CsrName::kVstart>(0);
3227 }
3228
3229 template <auto Intrinsic, auto vma>
OpVectorVWXUnary0(uint8_t dst,uint8_t src1)3230 void OpVectorVWXUnary0(uint8_t dst, uint8_t src1) {
3231 size_t vstart = GetCsr<CsrName::kVstart>();
3232 size_t vl = GetCsr<CsrName::kVl>();
3233 if (vstart != 0) [[unlikely]] {
3234 return Undefined();
3235 }
3236     // Note: vcpop.m and vfirst.m are an explicit exception to the rule that vstart >= vl performs
3237     // no operation; they are explicitly defined to perform a write even when vl == 0.
3238 SIMD128Register arg1(state_->cpu.v[src1]);
3239 if constexpr (!std::is_same_v<decltype(vma), intrinsics::NoInactiveProcessing>) {
3240 SIMD128Register mask(state_->cpu.v[0]);
3241 arg1 &= mask;
3242 }
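    // Keep only the first vl mask bits: tail_mask covers the bit positions at and above vl, so
    // ANDing with its complement zeroes everything past the body before the intrinsic runs.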
3243 const auto [tail_mask] = intrinsics::MakeBitmaskFromVl(vl);
3244 arg1 &= ~tail_mask;
3245 SIMD128Register result = std::get<0>(Intrinsic(arg1.Get<Int128>()));
3246 SetRegOrIgnore(dst, TruncateTo<UInt64>(BitCastToUnsigned(result.Get<Int128>())));
3247 }
3248
3249 template <auto Intrinsic>
OpVectormm(uint8_t dst,uint8_t src1,uint8_t src2)3250 void OpVectormm(uint8_t dst, uint8_t src1, uint8_t src2) {
3251 size_t vstart = GetCsr<CsrName::kVstart>();
3252 size_t vl = GetCsr<CsrName::kVl>();
3253 SetCsr<CsrName::kVstart>(0);
3254 // When vstart >= vl, there are no body elements, and no elements are updated in any destination
3255 // vector register group, including that no tail elements are updated with agnostic values.
3256 if (vstart >= vl) [[unlikely]] {
3257 return;
3258 }
3259 SIMD128Register arg1(state_->cpu.v[src1]);
3260 SIMD128Register arg2(state_->cpu.v[src2]);
3261 SIMD128Register result;
3262 if (vstart > 0) [[unlikely]] {
3263 const auto [start_mask] = intrinsics::MakeBitmaskFromVl(vstart);
3264 result.Set(state_->cpu.v[dst]);
3265 result = (result & ~start_mask) | (Intrinsic(arg1, arg2) & start_mask);
3266 } else {
3267 result = Intrinsic(arg1, arg2);
3268 }
3269 const auto [tail_mask] = intrinsics::MakeBitmaskFromVl(vl);
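    // Per the RVV spec, mask-producing instructions always treat the destination tail as
    // tail-agnostic, so the tail bits are simply set to all ones here.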
3270 result = result | tail_mask;
3271 state_->cpu.v[dst] = result.Get<__uint128_t>();
3272 }
3273
3274 template <auto Intrinsic, auto vma>
OpVectorVMUnary0(uint8_t dst,uint8_t src1)3275 void OpVectorVMUnary0(uint8_t dst, uint8_t src1) {
3276 size_t vstart = GetCsr<CsrName::kVstart>();
3277 size_t vl = GetCsr<CsrName::kVl>();
3278 if (vstart != 0) {
3279 return Undefined();
3280 }
3281 // When vstart >= vl, there are no body elements, and no elements are updated in any destination
3282 // vector register group, including that no tail elements are updated with agnostic values.
3283 if (vl == 0) [[unlikely]] {
3284 return;
3285 }
3286 SIMD128Register arg1(state_->cpu.v[src1]);
3287 SIMD128Register mask;
3288 if constexpr (!std::is_same_v<decltype(vma), intrinsics::NoInactiveProcessing>) {
3289 mask.Set<__uint128_t>(state_->cpu.v[0]);
3290 arg1 &= mask;
3291 }
3292 const auto [tail_mask] = intrinsics::MakeBitmaskFromVl(vl);
3293 arg1 &= ~tail_mask;
3294 SIMD128Register result = std::get<0>(Intrinsic(arg1.Get<Int128>()));
3295 if constexpr (!std::is_same_v<decltype(vma), intrinsics::NoInactiveProcessing>) {
3296 arg1 &= mask;
3297 if (vma == InactiveProcessing::kUndisturbed) {
3298 result = (result & mask) | (SIMD128Register(state_->cpu.v[dst]) & ~mask);
3299 } else {
3300 result |= ~mask;
3301 }
3302 }
3303 result |= tail_mask;
3304 state_->cpu.v[dst] = result.Get<__uint128_t>();
3305 }
3306
3307 template <typename ElementType, size_t kRegistersInvolved>
OpVectorVmvXrv(uint8_t dst,uint8_t src)3308 void OpVectorVmvXrv(uint8_t dst, uint8_t src) {
3309 if (!IsAligned<kRegistersInvolved>(dst | src)) {
3310 return Undefined();
3311 }
3312 constexpr size_t kElementsCount = 16 / sizeof(ElementType);
3313 size_t vstart = GetCsr<CsrName::kVstart>();
3314 SetCsr<CsrName::kVstart>(0);
3315 // The usual property that no elements are written if vstart >= vl does not apply to these
3316 // instructions. Instead, no elements are written if vstart >= evl.
3317 if (vstart >= kElementsCount * kRegistersInvolved) [[unlikely]] {
3318 return;
3319 }
3320 if (vstart == 0) [[likely]] {
3321 for (size_t index = 0; index < kRegistersInvolved; ++index) {
3322 state_->cpu.v[dst + index] = state_->cpu.v[src + index];
3323 }
3324 return;
3325 }
3326 size_t index = vstart / kElementsCount;
3327 SIMD128Register destination{state_->cpu.v[dst + index]};
3328 SIMD128Register source{state_->cpu.v[src + index]};
3329 for (size_t element_index = vstart % kElementsCount; element_index < kElementsCount;
3330 ++element_index) {
3331 destination.Set(source.Get<ElementType>(element_index), element_index);
3332 }
3333 state_->cpu.v[dst + index] = destination.Get<__uint128_t>();
3334 for (index++; index < kRegistersInvolved; ++index) {
3335 state_->cpu.v[dst + index] = state_->cpu.v[src + index];
3336 }
3337 }
3338
3339 template <auto Intrinsic,
3340 typename ElementType,
3341 VectorRegisterGroupMultiplier vlmul,
3342 auto vma,
3343 CsrName... kExtraCsrs>
OpVectorToMaskvv(uint8_t dst,uint8_t src1,uint8_t src2)3344 void OpVectorToMaskvv(uint8_t dst, uint8_t src1, uint8_t src2) {
3345 return OpVectorToMask<Intrinsic,
3346 ElementType,
3347 NumberOfRegistersInvolved(vlmul),
3348 vma,
3349 kExtraCsrs...>(dst, Vec{src1}, Vec{src2});
3350 }
3351
3352 template <auto Intrinsic,
3353 typename ElementType,
3354 VectorRegisterGroupMultiplier vlmul,
3355 auto vma,
3356 CsrName... kExtraCsrs>
OpVectorToMaskvx(uint8_t dst,uint8_t src1,ElementType arg2)3357 void OpVectorToMaskvx(uint8_t dst, uint8_t src1, ElementType arg2) {
3358 return OpVectorToMask<Intrinsic,
3359 ElementType,
3360 NumberOfRegistersInvolved(vlmul),
3361 vma,
3362 kExtraCsrs...>(dst, Vec{src1}, arg2);
3363 }
3364
3365 template <auto Intrinsic,
3366 typename ElementType,
3367 size_t kRegistersInvolved,
3368 auto vma,
3369 CsrName... kExtraCsrs,
3370 typename... Args>
OpVectorToMask(uint8_t dst,Args...args)3371 void OpVectorToMask(uint8_t dst, Args... args) {
3372     // All args except dst must be aligned to kRegistersInvolved. We'll merge them together and
3373     // then do a combined check for all of them at once.
3374 if (!IsAligned<kRegistersInvolved>(OrValuesOnlyForType<Vec>(args...))) {
3375 return Undefined();
3376 }
3377 SIMD128Register original_result(state_->cpu.v[dst]);
3378 size_t vstart = GetCsr<CsrName::kVstart>();
3379 size_t vl = GetCsr<CsrName::kVl>();
3380 SetCsr<CsrName::kVstart>(0);
3381 SIMD128Register result_before_vl_masking;
3382 // When vstart >= vl, there are no body elements, and no elements are updated in any destination
3383 // vector register group, including that no tail elements are updated with agnostic values.
3384 if (vstart >= vl) [[unlikely]] {
3385 result_before_vl_masking = original_result;
3386 } else {
3387 result_before_vl_masking = CollectBitmaskResult<ElementType, kRegistersInvolved>(
3388 [this, vstart, vl, args...](auto index) {
3389 return Intrinsic(this->GetCsr<kExtraCsrs>()...,
3390 this->GetVectorArgument<ElementType, TailProcessing::kAgnostic, vma>(
3391 args, vstart, vl, index, intrinsics::NoInactiveProcessing{})...);
3392 });
3393 if constexpr (!std::is_same_v<decltype(vma), intrinsics::NoInactiveProcessing>) {
3394 SIMD128Register mask(state_->cpu.v[0]);
3395 if constexpr (vma == InactiveProcessing::kAgnostic) {
3396 result_before_vl_masking |= ~mask;
3397 } else {
3398 result_before_vl_masking = (mask & result_before_vl_masking) | (original_result & ~mask);
3399 }
3400 }
3401 if (vstart > 0) [[unlikely]] {
3402 const auto [start_mask] = intrinsics::MakeBitmaskFromVl(vstart);
3403 result_before_vl_masking =
3404 (original_result & ~start_mask) | (result_before_vl_masking & start_mask);
3405 }
3406 }
3407 const auto [tail_mask] = intrinsics::MakeBitmaskFromVl(vl);
3408 state_->cpu.v[dst] = (result_before_vl_masking | tail_mask).Get<__uint128_t>();
3409 }
3410
3411 template <auto Intrinsic,
3412 typename ElementType,
3413 VectorRegisterGroupMultiplier vlmul,
3414 TailProcessing vta,
3415 auto vma,
3416 CsrName... kExtraCsrs,
3417 typename... DstMaskType>
OpVectorv(uint8_t dst,uint8_t src1,DstMaskType...dst_mask)3418 void OpVectorv(uint8_t dst, uint8_t src1, DstMaskType... dst_mask) {
3419 return OpVectorv<Intrinsic,
3420 ElementType,
3421 NumberOfRegistersInvolved(vlmul),
3422 vta,
3423 vma,
3424 kExtraCsrs...>(dst, src1, dst_mask...);
3425 }
3426
3427 template <auto Intrinsic,
3428 typename ElementType,
3429 size_t kRegistersInvolved,
3430 TailProcessing vta,
3431 auto vma,
3432 CsrName... kExtraCsrs,
3433 typename... DstMaskType>
OpVectorv(uint8_t dst,uint8_t src,DstMaskType...dst_mask)3434 void OpVectorv(uint8_t dst, uint8_t src, DstMaskType... dst_mask) {
3435 static_assert(sizeof...(dst_mask) <= 1);
3436 if (!IsAligned<kRegistersInvolved>(dst | src | (dst_mask | ... | 0))) {
3437 return Undefined();
3438 }
3439 size_t vstart = GetCsr<CsrName::kVstart>();
3440 size_t vl = GetCsr<CsrName::kVl>();
3441 SetCsr<CsrName::kVstart>(0);
3442 // When vstart >= vl, there are no body elements, and no elements are updated in any destination
3443 // vector register group, including that no tail elements are updated with agnostic values.
3444 if (vstart >= vl) [[unlikely]] {
3445 return;
3446 }
3447 auto mask = GetMaskForVectorOperations<vma>();
3448 for (size_t index = 0; index < kRegistersInvolved; ++index) {
3449 SIMD128Register result{state_->cpu.v[dst + index]};
3450 SIMD128Register result_mask;
3451 if constexpr (sizeof...(DstMaskType) == 0) {
3452 result_mask.Set(state_->cpu.v[dst + index]);
3453 } else {
3454 uint8_t dst_mask_unpacked[1] = {dst_mask...};
3455 result_mask.Set(state_->cpu.v[dst_mask_unpacked[0] + index]);
3456 }
3457 SIMD128Register arg{state_->cpu.v[src + index]};
3458 result =
3459 VectorMasking<ElementType, vta, vma>(result,
3460 std::get<0>(Intrinsic(GetCsr<kExtraCsrs>()..., arg)),
3461 result_mask,
3462 vstart,
3463 vl,
3464 index,
3465 mask);
3466 state_->cpu.v[dst + index] = result.Get<__uint128_t>();
3467 }
3468 }
3469
3470 template <auto Intrinsic,
3471 typename ElementType,
3472 VectorRegisterGroupMultiplier vlmul,
3473 TailProcessing vta,
3474 auto vma,
3475 CsrName... kExtraCsrs,
3476 auto kDefaultElement>
OpVectorvs(uint8_t dst,Vec<kDefaultElement> src1,uint8_t src2)3477 void OpVectorvs(uint8_t dst, Vec<kDefaultElement> src1, uint8_t src2) {
3478 return OpVectorvs<Intrinsic, ElementType, ElementType, vlmul, vta, vma, kExtraCsrs...>(
3479 dst, src1, src2);
3480 }
3481
3482 template <auto Intrinsic,
3483 typename ElementType,
3484 typename ResultType,
3485 VectorRegisterGroupMultiplier vlmul,
3486 TailProcessing vta,
3487 auto vma,
3488 CsrName... kExtraCsrs,
3489 auto kDefaultElement>
OpVectorvs(uint8_t dst,Vec<kDefaultElement> src1,uint8_t src2)3490 void OpVectorvs(uint8_t dst, Vec<kDefaultElement> src1, uint8_t src2) {
3491 return OpVectorvs<Intrinsic,
3492 ElementType,
3493 ResultType,
3494 NumberOfRegistersInvolved(vlmul),
3495 vta,
3496 vma,
3497 kExtraCsrs...>(dst, src1, src2);
3498 }
3499
3500 template <auto Intrinsic,
3501 typename ElementType,
3502 typename ResultType,
3503 size_t kRegistersInvolved,
3504 TailProcessing vta,
3505 auto vma,
3506 CsrName... kExtraCsrs,
3507 auto kDefaultElement>
OpVectorvs(uint8_t dst,Vec<kDefaultElement> src1,uint8_t src2)3508 void OpVectorvs(uint8_t dst, Vec<kDefaultElement> src1, uint8_t src2) {
3509 if (!IsAligned<kRegistersInvolved>(dst | src1.start_no)) {
3510 return Undefined();
3511 }
3512 size_t vstart = GetCsr<CsrName::kVstart>();
3513 size_t vl = GetCsr<CsrName::kVl>();
3514 if (vstart != 0) {
3515 return Undefined();
3516 }
3517 SetCsr<CsrName::kVstart>(0);
3518 // If vl = 0, no operation is performed and the destination register is not updated.
3519 if (vl == 0) [[unlikely]] {
3520 return;
3521 }
3522 auto mask = GetMaskForVectorOperations<vma>();
3523 ResultType init = SIMD128Register{state_->cpu.v[src2]}.Get<ResultType>(0);
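    // The reduction accumulator is seeded with element 0 of the src2 register and then folds in the
    // active elements of the src1 group one register at a time; only element 0 of dst is written.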
3524 for (size_t index = 0; index < kRegistersInvolved; ++index) {
3525 init = std::get<0>(
3526 Intrinsic(GetCsr<kExtraCsrs>()...,
3527 init,
3528 GetVectorArgument<ElementType, vta, vma>(src1, vstart, vl, index, mask)));
3529 }
3530 SIMD128Register result{state_->cpu.v[dst]};
3531 result.Set(init, 0);
3532 result = std::get<0>(intrinsics::VectorMasking<ResultType, vta>(result, result, 0, 1));
3533 state_->cpu.v[dst] = result.Get<__uint128_t>();
3534 }
3535
3536 template <auto Intrinsic,
3537 typename ElementType,
3538 VectorRegisterGroupMultiplier vlmul,
3539 TailProcessing vta,
3540 auto vma,
3541 CsrName... kExtraCsrs>
OpVectorvv(uint8_t dst,uint8_t src1,uint8_t src2)3542 void OpVectorvv(uint8_t dst, uint8_t src1, uint8_t src2) {
3543 return OpVectorSameWidth<Intrinsic,
3544 ElementType,
3545 NumberOfRegistersInvolved(vlmul),
3546 vta,
3547 vma,
3548 kExtraCsrs...>(dst, Vec{src1}, Vec{src2});
3549 }
3550
3551 template <auto Intrinsic,
3552 typename ElementType,
3553 VectorRegisterGroupMultiplier vlmul,
3554 TailProcessing vta,
3555 auto vma,
3556 CsrName... kExtraCsrs>
OpVectorvvv(uint8_t dst,uint8_t src1,uint8_t src2)3557 void OpVectorvvv(uint8_t dst, uint8_t src1, uint8_t src2) {
3558 return OpVectorSameWidth<Intrinsic,
3559 ElementType,
3560 NumberOfRegistersInvolved(vlmul),
3561 vta,
3562 vma,
3563 kExtraCsrs...>(dst, Vec{src1}, Vec{src2}, Vec{dst});
3564 }
3565
3566 template <auto Intrinsic,
3567 typename ElementType,
3568 VectorRegisterGroupMultiplier vlmul,
3569 TailProcessing vta,
3570 auto vma,
3571 CsrName... kExtraCsrs>
OpVectorWidenv(uint8_t dst,uint8_t src)3572 void OpVectorWidenv(uint8_t dst, uint8_t src) {
3573 if constexpr (sizeof(ElementType) < sizeof(Int64) &&
3574 vlmul != VectorRegisterGroupMultiplier::k8registers) {
3575 return OpVectorWiden<Intrinsic,
3576 ElementType,
3577 NumRegistersInvolvedForWideOperand(vlmul),
3578 NumberOfRegistersInvolved(vlmul),
3579 vta,
3580 vma,
3581 kExtraCsrs...>(dst, Vec{src});
3582 }
3583 return Undefined();
3584 }
3585
3586 // 2*SEW = SEW op SEW
3587   // Attention: not to be confused with OpVectorWidenwv, where 2*SEW = 2*SEW op SEW.
3588 template <auto Intrinsic,
3589 typename ElementType,
3590 VectorRegisterGroupMultiplier vlmul,
3591 TailProcessing vta,
3592 auto vma,
3593 CsrName... kExtraCsrs>
OpVectorWidenvv(uint8_t dst,uint8_t src1,uint8_t src2)3594 void OpVectorWidenvv(uint8_t dst, uint8_t src1, uint8_t src2) {
3595 if constexpr (sizeof(ElementType) < sizeof(Int64) &&
3596 vlmul != VectorRegisterGroupMultiplier::k8registers) {
3597 return OpVectorWiden<Intrinsic,
3598 ElementType,
3599 NumRegistersInvolvedForWideOperand(vlmul),
3600 NumberOfRegistersInvolved(vlmul),
3601 vta,
3602 vma,
3603 kExtraCsrs...>(dst, Vec{src1}, Vec{src2});
3604 }
3605 return Undefined();
3606 }
3607
3608 // 2*SEW = SEW op SEW op 2*SEW
3609 template <auto Intrinsic,
3610 typename ElementType,
3611 VectorRegisterGroupMultiplier vlmul,
3612 TailProcessing vta,
3613 auto vma,
3614 CsrName... kExtraCsrs>
OpVectorWidenvvw(uint8_t dst,uint8_t src1,uint8_t src2)3615 void OpVectorWidenvvw(uint8_t dst, uint8_t src1, uint8_t src2) {
3616 if constexpr (sizeof(ElementType) < sizeof(Int64) &&
3617 vlmul != VectorRegisterGroupMultiplier::k8registers) {
3618 return OpVectorWiden<Intrinsic,
3619 ElementType,
3620 NumRegistersInvolvedForWideOperand(vlmul),
3621 NumberOfRegistersInvolved(vlmul),
3622 vta,
3623 vma,
3624 kExtraCsrs...>(dst, Vec{src1}, Vec{src2}, WideVec{dst});
3625 }
3626 return Undefined();
3627 }
3628
3629 // 2*SEW = 2*SEW op SEW
3630 template <auto Intrinsic,
3631 typename ElementType,
3632 VectorRegisterGroupMultiplier vlmul,
3633 TailProcessing vta,
3634 auto vma,
3635 CsrName... kExtraCsrs>
OpVectorWidenwv(uint8_t dst,uint8_t src1,uint8_t src2)3636 void OpVectorWidenwv(uint8_t dst, uint8_t src1, uint8_t src2) {
3637 if constexpr (sizeof(ElementType) < sizeof(Int64) &&
3638 vlmul != VectorRegisterGroupMultiplier::k8registers) {
3639 return OpVectorWiden<Intrinsic,
3640 ElementType,
3641 NumRegistersInvolvedForWideOperand(vlmul),
3642 NumberOfRegistersInvolved(vlmul),
3643 vta,
3644 vma,
3645 kExtraCsrs...>(dst, WideVec{src1}, Vec{src2});
3646 }
3647 return Undefined();
3648 }
3649
3650 template <auto Intrinsic,
3651 typename ElementType,
3652 VectorRegisterGroupMultiplier vlmul,
3653 TailProcessing vta,
3654 auto vma,
3655 CsrName... kExtraCsrs>
OpVectorWidenwx(uint8_t dst,uint8_t src1,ElementType arg2)3656 void OpVectorWidenwx(uint8_t dst, uint8_t src1, ElementType arg2) {
3657 if constexpr (sizeof(ElementType) < sizeof(Int64) &&
3658 vlmul != VectorRegisterGroupMultiplier::k8registers) {
3659 return OpVectorWiden<Intrinsic,
3660 ElementType,
3661 NumRegistersInvolvedForWideOperand(vlmul),
3662 NumberOfRegistersInvolved(vlmul),
3663 vta,
3664 vma,
3665 kExtraCsrs...>(dst, WideVec{src1}, arg2);
3666 }
3667 return Undefined();
3668 }
3669
3670 template <auto Intrinsic,
3671 typename ElementType,
3672 VectorRegisterGroupMultiplier vlmul,
3673 TailProcessing vta,
3674 auto vma,
3675 CsrName... kExtraCsrs>
OpVectorWidenvx(uint8_t dst,uint8_t src1,ElementType arg2)3676 void OpVectorWidenvx(uint8_t dst, uint8_t src1, ElementType arg2) {
3677 if constexpr (sizeof(ElementType) < sizeof(Int64) &&
3678 vlmul != VectorRegisterGroupMultiplier::k8registers) {
3679 return OpVectorWiden<Intrinsic,
3680 ElementType,
3681 NumRegistersInvolvedForWideOperand(vlmul),
3682 NumberOfRegistersInvolved(vlmul),
3683 vta,
3684 vma,
3685 kExtraCsrs...>(dst, Vec{src1}, arg2);
3686 }
3687 return Undefined();
3688 }
3689
3690 template <auto Intrinsic,
3691 typename ElementType,
3692 VectorRegisterGroupMultiplier vlmul,
3693 TailProcessing vta,
3694 auto vma,
3695 CsrName... kExtraCsrs>
OpVectorWidenvxw(uint8_t dst,uint8_t src1,ElementType arg2)3696 void OpVectorWidenvxw(uint8_t dst, uint8_t src1, ElementType arg2) {
3697 if constexpr (sizeof(ElementType) < sizeof(Int64) &&
3698 vlmul != VectorRegisterGroupMultiplier::k8registers) {
3699 return OpVectorWiden<Intrinsic,
3700 ElementType,
3701 NumRegistersInvolvedForWideOperand(vlmul),
3702 NumberOfRegistersInvolved(vlmul),
3703 vta,
3704 vma,
3705 kExtraCsrs...>(dst, Vec{src1}, arg2, WideVec{dst});
3706 }
3707 return Undefined();
3708 }
3709
3710 template <auto Intrinsic,
3711 typename ElementType,
3712 size_t kDestRegistersInvolved,
3713 size_t kRegistersInvolved,
3714 TailProcessing vta,
3715 auto vma,
3716 CsrName... kExtraCsrs,
3717 typename... Args>
OpVectorWiden(uint8_t dst,Args...args)3718 void OpVectorWiden(uint8_t dst, Args... args) {
3719 if constexpr (kDestRegistersInvolved == kRegistersInvolved) {
3720 static_assert(kDestRegistersInvolved == 1);
3721 } else {
3722 static_assert(kDestRegistersInvolved == 2 * kRegistersInvolved);
3723       // All normal (narrow) args must be aligned to kRegistersInvolved. We'll merge them together
3724       // and then do a combined check for all of them at once.
3725       uint8_t ored_args = OrValuesOnlyForType<Vec>(args...);
3726       // All wide args (and dst) must be aligned to kDestRegistersInvolved. We'll merge them
3727       // together and then do a combined check for all of them at once.
3728       uint8_t ored_wide_args = OrValuesOnlyForType<WideVec>(args...) | dst;
3729 if (!IsAligned<kDestRegistersInvolved>(ored_wide_args) ||
3730 !IsAligned<kRegistersInvolved>(ored_args)) {
3731 return Undefined();
3732 }
3733 }
3734     // From the RISC-V vector manual: if the destination EEW is greater than the source EEW and the
3735     // source EMUL is at least 1, overlap is permitted only in the highest-numbered part of the
3736     // destination register group (e.g., when LMUL=8, vzext.vf4 v0, v6 is legal, but a source of
3737     // v0, v2, or v4 is not).
3738     // Because of the static_asserts above only one forbidden combination is possible here, and we
3739     // detect and reject it.
3740 if (OrResultsOnlyForType<Vec>([dst](auto arg) { return arg.start_no == dst; }, args...)) {
3741 return Undefined();
3742 }
3743 size_t vstart = GetCsr<CsrName::kVstart>();
3744 size_t vl = GetCsr<CsrName::kVl>();
3745 SetCsr<CsrName::kVstart>(0);
3746 // When vstart >= vl, there are no body elements, and no elements are updated in any destination
3747 // vector register group, including that no tail elements are updated with agnostic values.
3748 if (vstart >= vl) [[unlikely]] {
3749 return;
3750 }
3751 auto mask = GetMaskForVectorOperations<vma>();
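    // Each iteration consumes one narrow source register and fills two consecutive wide destination
    // registers: the low halves of the narrow operands feed dst + 2*index and the high halves feed
    // dst + 2*index + 1 (the second write only happens when the wide group spans several registers).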
3752 for (size_t index = 0; index < kRegistersInvolved; ++index) {
3753 SIMD128Register result(state_->cpu.v[dst + 2 * index]);
3754 result = VectorMasking<WideType<ElementType>, vta, vma>(
3755 result,
3756 std::get<0>(Intrinsic(
3757 GetCsr<kExtraCsrs>()...,
3758 GetLowVectorArgument<ElementType, vta, vma>(args, vstart, vl, index, mask)...)),
3759 vstart,
3760 vl,
3761 2 * index,
3762 mask);
3763 state_->cpu.v[dst + 2 * index] = result.Get<__uint128_t>();
3764 if constexpr (kDestRegistersInvolved > 1) { // if lmul is one full register or more
3765 result.Set(state_->cpu.v[dst + 2 * index + 1]);
3766 result = VectorMasking<WideType<ElementType>, vta, vma>(
3767 result,
3768 std::get<0>(Intrinsic(
3769 GetCsr<kExtraCsrs>()...,
3770 GetHighVectorArgument<ElementType, vta, vma>(args, vstart, vl, index, mask)...)),
3771 vstart,
3772 vl,
3773 2 * index + 1,
3774 mask);
3775 state_->cpu.v[dst + 2 * index + 1] = result.Get<__uint128_t>();
3776 }
3777 }
3778 }
3779
3780 template <auto Intrinsic,
3781 typename ElementType,
3782 size_t kRegistersInvolved,
3783 TailProcessing vta,
3784 auto vma,
3785 CsrName... kExtraCsrs>
OpVectorvxm(uint8_t dst,uint8_t src1,ElementType arg2)3786 void OpVectorvxm(uint8_t dst, uint8_t src1, ElementType arg2) {
3787     // All vector register args must be aligned to kRegistersInvolved. We'll merge them together
3788     // and then do a combined check for all of them at once.
3789 if (!IsAligned<kRegistersInvolved>(dst | src1)) {
3790 return Undefined();
3791 }
3792
3793 size_t vstart = GetCsr<CsrName::kVstart>();
3794 size_t vl = GetCsr<CsrName::kVl>();
3795 SetCsr<CsrName::kVstart>(0);
3796 // When vstart >= vl, there are no body elements, and no elements are updated in any destination
3797 // vector register group, including that no tail elements are updated with agnostic values.
3798 if (vstart >= vl) [[unlikely]] {
3799 return Undefined();
3800 }
3801
3802 for (size_t index = 0; index < kRegistersInvolved; ++index) {
3803 SIMD128Register arg1{state_->cpu.v[src1 + index]};
3804 SIMD128Register arg3{};
3805 if constexpr (!std::is_same_v<decltype(vma), intrinsics::NoInactiveProcessing>) {
3806 if constexpr (vma == InactiveProcessing::kUndisturbed) {
3807 arg3 = std::get<0>(
3808 intrinsics::GetMaskVectorArgument<ElementType, vta, vma>(state_->cpu.v[0], index));
3809 }
3810 }
3811
3812 SIMD128Register result(state_->cpu.v[dst + index]);
3813 result = VectorMasking<ElementType, vta, intrinsics::NoInactiveProcessing{}>(
3814 result,
3815 std::get<0>(Intrinsic(GetCsr<kExtraCsrs>()..., arg1, arg2, arg3)),
3816 vstart,
3817 vl,
3818 index,
3819 intrinsics::NoInactiveProcessing{});
3820 state_->cpu.v[dst + index] = result.Get<__uint128_t>();
3821 }
3822 }
3823
3824 template <auto Intrinsic,
3825 typename ElementType,
3826 size_t kRegistersInvolved,
3827 TailProcessing vta,
3828 auto vma,
3829 CsrName... kExtraCsrs>
OpVectorvvm(uint8_t dst,uint8_t src1,uint8_t src2)3830 void OpVectorvvm(uint8_t dst, uint8_t src1, uint8_t src2) {
3831     // All vector register args must be aligned to kRegistersInvolved. We'll merge them together
3832     // and then do a combined check for all of them at once.
3833 if (!IsAligned<kRegistersInvolved>(dst | src1 | src2)) {
3834 return Undefined();
3835 }
3836
3837 size_t vstart = GetCsr<CsrName::kVstart>();
3838 size_t vl = GetCsr<CsrName::kVl>();
3839 SetCsr<CsrName::kVstart>(0);
3840 // When vstart >= vl, there are no body elements, and no elements are updated in any destination
3841 // vector register group, including that no tail elements are updated with agnostic values.
3842 if (vstart >= vl) [[unlikely]] {
3843 return Undefined();
3844 }
3845
3846 for (size_t index = 0; index < kRegistersInvolved; ++index) {
3847 SIMD128Register arg1{state_->cpu.v[src1 + index]};
3848 SIMD128Register arg2{state_->cpu.v[src2 + index]};
3849 SIMD128Register arg3{};
3850 if constexpr (!std::is_same_v<decltype(vma), intrinsics::NoInactiveProcessing>) {
3851 if constexpr (vma == InactiveProcessing::kUndisturbed) {
3852 arg3 = std::get<0>(
3853 intrinsics::GetMaskVectorArgument<ElementType, vta, vma>(state_->cpu.v[0], index));
3854 }
3855 }
3856
3857 SIMD128Register result(state_->cpu.v[dst + index]);
3858 result = VectorMasking<ElementType, vta, intrinsics::NoInactiveProcessing{}>(
3859 result,
3860 std::get<0>(Intrinsic(GetCsr<kExtraCsrs>()..., arg1, arg2, arg3)),
3861 vstart,
3862 vl,
3863 index,
3864 intrinsics::NoInactiveProcessing{});
3865 state_->cpu.v[dst + index] = result.Get<__uint128_t>();
3866 }
3867 }
3868
3869 template <auto Intrinsic,
3870 typename ElementType,
3871 VectorRegisterGroupMultiplier vlmul,
3872 TailProcessing vta,
3873 auto vma,
3874 CsrName... kExtraCsrs>
OpVectorvx(uint8_t dst,uint8_t src1,ElementType arg2)3875 void OpVectorvx(uint8_t dst, uint8_t src1, ElementType arg2) {
3876 return OpVectorSameWidth<Intrinsic,
3877 ElementType,
3878 NumberOfRegistersInvolved(vlmul),
3879 vta,
3880 vma,
3881 kExtraCsrs...>(dst, Vec{src1}, arg2);
3882 }
3883
3884 template <auto Intrinsic,
3885 typename ElementType,
3886 size_t kRegistersInvolved,
3887 TailProcessing vta,
3888 auto vma,
3889 CsrName... kExtraCsrs,
3890 typename... Args>
OpVectorSameWidth(uint8_t dst,Args...args)3891 void OpVectorSameWidth(uint8_t dst, Args... args) {
3892     // All vector register args must be aligned to kRegistersInvolved. We'll merge them together
3893     // and then do a combined check for all of them at once.
3894 if (!IsAligned<kRegistersInvolved>(OrValuesOnlyForType<Vec>(args...) | dst)) {
3895 return Undefined();
3896 }
3897 size_t vstart = GetCsr<CsrName::kVstart>();
3898 size_t vl = GetCsr<CsrName::kVl>();
3899 SetCsr<CsrName::kVstart>(0);
3900 // When vstart >= vl, there are no body elements, and no elements are updated in any destination
3901 // vector register group, including that no tail elements are updated with agnostic values.
3902 if (vstart >= vl) [[unlikely]] {
3903 return;
3904 }
3905 auto mask = GetMaskForVectorOperations<vma>();
3906 for (size_t index = 0; index < kRegistersInvolved; ++index) {
3907 SIMD128Register result(state_->cpu.v[dst + index]);
3908 result = VectorMasking<ElementType, vta, vma>(
3909 result,
3910 std::get<0>(Intrinsic(
3911 GetCsr<kExtraCsrs>()...,
3912 GetVectorArgument<ElementType, vta, vma>(args, vstart, vl, index, mask)...)),
3913 vstart,
3914 vl,
3915 index,
3916 mask);
3917 state_->cpu.v[dst + index] = result.Get<__uint128_t>();
3918 }
3919 }
3920
3921 template <auto Intrinsic,
3922 typename TargetElementType,
3923 VectorRegisterGroupMultiplier vlmul,
3924 TailProcessing vta,
3925 auto vma,
3926 CsrName... kExtraCsrs>
OpVectorNarroww(uint8_t dst,uint8_t src)3927 void OpVectorNarroww(uint8_t dst, uint8_t src) {
3928 if constexpr (sizeof(TargetElementType) < sizeof(Int64) &&
3929 vlmul != VectorRegisterGroupMultiplier::k8registers) {
3930 return OpVectorNarrow<Intrinsic,
3931 TargetElementType,
3932 NumberOfRegistersInvolved(vlmul),
3933 NumRegistersInvolvedForWideOperand(vlmul),
3934 vta,
3935 vma,
3936 kExtraCsrs...>(dst, WideVec{src});
3937 }
3938 return Undefined();
3939 }
3940
3941 // SEW = 2*SEW op SEW
3942 template <auto Intrinsic,
3943 typename ElementType,
3944 VectorRegisterGroupMultiplier vlmul,
3945 TailProcessing vta,
3946 auto vma,
3947 CsrName... kExtraCsrs>
OpVectorNarrowwx(uint8_t dst,uint8_t src1,ElementType arg2)3948 void OpVectorNarrowwx(uint8_t dst, uint8_t src1, ElementType arg2) {
3949 if constexpr (sizeof(ElementType) < sizeof(Int64) &&
3950 vlmul != VectorRegisterGroupMultiplier::k8registers) {
3951 return OpVectorNarrow<Intrinsic,
3952 ElementType,
3953 NumberOfRegistersInvolved(vlmul),
3954 NumRegistersInvolvedForWideOperand(vlmul),
3955 vta,
3956 vma,
3957 kExtraCsrs...>(dst, WideVec{src1}, arg2);
3958 }
3959 return Undefined();
3960 }
3961
3962 // SEW = 2*SEW op SEW
3963 template <auto Intrinsic,
3964 typename ElementType,
3965 VectorRegisterGroupMultiplier vlmul,
3966 TailProcessing vta,
3967 auto vma,
3968 CsrName... kExtraCsrs>
OpVectorNarrowwv(uint8_t dst,uint8_t src1,uint8_t src2)3969 void OpVectorNarrowwv(uint8_t dst, uint8_t src1, uint8_t src2) {
3970 if constexpr (sizeof(ElementType) < sizeof(Int64) &&
3971 vlmul != VectorRegisterGroupMultiplier::k8registers) {
3972 return OpVectorNarrow<Intrinsic,
3973 ElementType,
3974 NumberOfRegistersInvolved(vlmul),
3975 NumRegistersInvolvedForWideOperand(vlmul),
3976 vta,
3977 vma,
3978 kExtraCsrs...>(dst, WideVec{src1}, Vec{src2});
3979 }
3980 return Undefined();
3981 }
3982
3983 template <auto Intrinsic,
3984 typename ElementType,
3985 size_t kRegistersInvolved,
3986 size_t kWideSrcRegistersInvolved,
3987 TailProcessing vta,
3988 auto vma,
3989 CsrName... kExtraCsrs,
3990 typename... Args>
OpVectorNarrow(uint8_t dst,Args...args)3991 void OpVectorNarrow(uint8_t dst, Args... args) {
3992 if constexpr (kWideSrcRegistersInvolved == kRegistersInvolved) {
3993 static_assert(kWideSrcRegistersInvolved == 1);
3994 } else {
3995       // All normal (narrow) args (and dst) must be aligned to kRegistersInvolved. We'll merge them
3996       // together and then do a combined check for all of them at once.
3997       uint8_t ored_args = OrValuesOnlyForType<Vec>(args...) | dst;
3998       // All wide args must be aligned to kWideSrcRegistersInvolved. We'll merge them together and
3999       // then do a combined check for all of them at once.
4000 uint8_t ored_wide_args = OrValuesOnlyForType<WideVec>(args...);
4001 if (!IsAligned<kWideSrcRegistersInvolved>(ored_wide_args) ||
4002 !IsAligned<kRegistersInvolved>(ored_args)) {
4003 return Undefined();
4004 }
4005 static_assert(kWideSrcRegistersInvolved == 2 * kRegistersInvolved);
4006       // From the RISC-V vector manual: if the destination EEW is smaller than the source EEW,
4007       // overlap is permitted only in the lowest-numbered part of the source register group (e.g.,
4008       // when LMUL=1, vnsrl.wi v0, v0, 3 is legal, but a destination of v1 is not).
4009       // Only one invalid value is possible here because of the alignment requirements.
4010 if (OrResultsOnlyForType<Vec>(
4011 [dst](auto arg) { return arg.start_no == dst + kRegistersInvolved; }, args...)) {
4012 return Undefined();
4013 }
4014 }
4015 size_t vstart = GetCsr<CsrName::kVstart>();
4016 size_t vl = GetCsr<CsrName::kVl>();
4017 SetCsr<CsrName::kVstart>(0);
4018 // When vstart >= vl, there are no body elements, and no elements are updated in any destination
4019 // vector register group, including that no tail elements are updated with agnostic values.
4020 if (vstart >= vl) [[unlikely]] {
4021 return;
4022 }
4023 auto mask = GetMaskForVectorOperations<vma>();
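    // Each iteration assembles one narrow destination register: the intrinsic runs on the low wide
    // source register and, when the wide group spans more than one register, on the high one as
    // well, and the two half-width results are merged into a single register before masking.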
4024 for (size_t index = 0; index < kRegistersInvolved; index++) {
4025 SIMD128Register orig_result(state_->cpu.v[dst + index]);
4026 SIMD128Register intrinsic_result = std::get<0>(
4027 Intrinsic(GetCsr<kExtraCsrs>()...,
4028 GetLowVectorArgument<ElementType, vta, vma>(args, vstart, vl, index, mask)...));
4029 if constexpr (kWideSrcRegistersInvolved > 1) {
4030 SIMD128Register result_high = std::get<0>(Intrinsic(
4031 GetCsr<kExtraCsrs>()...,
4032 GetHighVectorArgument<ElementType, vta, vma>(args, vstart, vl, index, mask)...));
4033 intrinsic_result = std::get<0>(
4034 intrinsics::VMergeBottomHalfToTop<ElementType>(intrinsic_result, result_high));
4035 }
4036 auto result = VectorMasking<ElementType, vta, vma>(
4037 orig_result, intrinsic_result, vstart, vl, index, mask);
4038 state_->cpu.v[dst + index] = result.template Get<__uint128_t>();
4039 }
4040 }
4041
4042 template <auto Intrinsic,
4043 typename DestElementType,
4044 const uint8_t kFactor,
4045 VectorRegisterGroupMultiplier vlmul,
4046 TailProcessing vta,
4047 auto vma>
OpVectorVXUnary0(uint8_t dst,uint8_t src)4048 void OpVectorVXUnary0(uint8_t dst, uint8_t src) {
4049 static_assert(kFactor == 2 || kFactor == 4 || kFactor == 8);
4050 constexpr size_t kDestRegistersInvolved = NumberOfRegistersInvolved(vlmul);
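    // Note: `x ?: y` is the GCC/Clang conditional extension yielding x unless it is zero, so this is
    // effectively max(kDestRegistersInvolved / kFactor, 1).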
4051 constexpr size_t kSourceRegistersInvolved = (kDestRegistersInvolved / kFactor) ?: 1;
4052 if (!IsAligned<kDestRegistersInvolved>(dst) || !IsAligned<kSourceRegistersInvolved>(src)) {
4053 return Undefined();
4054 }
4055 size_t vstart = GetCsr<CsrName::kVstart>();
4056 size_t vl = GetCsr<CsrName::kVl>();
4057 // When vstart >= vl, there are no body elements, and no elements are updated in any destination
4058 // vector register group, including that no tail elements are updated with agnostic values.
4059 if (vstart >= vl) [[unlikely]] {
4060 SetCsr<CsrName::kVstart>(0);
4061 return;
4062 }
4063 auto mask = GetMaskForVectorOperations<vma>();
4064 for (size_t dst_index = 0; dst_index < kDestRegistersInvolved; dst_index++) {
4065 size_t src_index = dst_index / kFactor;
4066 size_t src_elem = dst_index % kFactor;
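      // Shift the source register so that the 128/kFactor-bit slice feeding this destination
      // register lands in the low bits. E.g. with kFactor == 4 and dst_index == 2 the shift is 64,
      // i.e. bits [64, 96) of the source register supply the elements being extended.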
4067 SIMD128Register result{state_->cpu.v[dst + dst_index]};
4068 SIMD128Register arg{state_->cpu.v[src + src_index] >> ((128 / kFactor) * src_elem)};
4069
4070 result = VectorMasking<DestElementType, vta, vma>(
4071 result, std::get<0>(Intrinsic(arg)), vstart, vl, dst_index, mask);
4072 state_->cpu.v[dst + dst_index] = result.Get<__uint128_t>();
4073 }
4074 SetCsr<CsrName::kVstart>(0);
4075 }
4076
4077 template <auto Intrinsic,
4078 typename ElementType,
4079 VectorRegisterGroupMultiplier vlmul,
4080 TailProcessing vta,
4081 auto vma,
4082 CsrName... kExtraCsrs>
OpVectorvxv(uint8_t dst,uint8_t src1,ElementType arg2)4083 void OpVectorvxv(uint8_t dst, uint8_t src1, ElementType arg2) {
4084 return OpVectorSameWidth<Intrinsic,
4085 ElementType,
4086 NumberOfRegistersInvolved(vlmul),
4087 vta,
4088 vma,
4089 kExtraCsrs...>(dst, Vec{src1}, arg2, Vec{dst});
4090 }
4091
4092 template <auto Intrinsic,
4093 typename ElementType,
4094 VectorRegisterGroupMultiplier vlmul,
4095 TailProcessing vta,
4096 auto vma,
4097 typename... DstMaskType>
OpVectorx(uint8_t dst,ElementType arg2,DstMaskType...dst_mask)4098 void OpVectorx(uint8_t dst, ElementType arg2, DstMaskType... dst_mask) {
4099 return OpVectorx<Intrinsic, ElementType, NumberOfRegistersInvolved(vlmul), vta, vma>(
4100 dst, arg2, dst_mask...);
4101 }
4102
4103 template <auto Intrinsic,
4104 typename ElementType,
4105 size_t kRegistersInvolved,
4106 TailProcessing vta,
4107 auto vma,
4108 typename... DstMaskType>
OpVectorx(uint8_t dst,ElementType arg2,DstMaskType...dst_mask)4109 void OpVectorx(uint8_t dst, ElementType arg2, DstMaskType... dst_mask) {
4110 static_assert(sizeof...(dst_mask) <= 1);
4111 if (!IsAligned<kRegistersInvolved>(dst | (dst_mask | ... | 0))) {
4112 return Undefined();
4113 }
4114 size_t vstart = GetCsr<CsrName::kVstart>();
4115 size_t vl = GetCsr<CsrName::kVl>();
4116 SetCsr<CsrName::kVstart>(0);
4117 // When vstart >= vl, there are no body elements, and no elements are updated in any destination
4118 // vector register group, including that no tail elements are updated with agnostic values.
4119 if (vstart >= vl) [[unlikely]] {
4120 return;
4121 }
4122 auto mask = GetMaskForVectorOperations<vma>();
4123 for (size_t index = 0; index < kRegistersInvolved; ++index) {
4124 SIMD128Register result(state_->cpu.v[dst + index]);
4125 SIMD128Register result_mask;
4126 if constexpr (sizeof...(DstMaskType) == 0) {
4127 result_mask.Set(state_->cpu.v[dst + index]);
4128 } else {
4129 uint8_t dst_mask_unpacked[1] = {dst_mask...};
4130 result_mask.Set(state_->cpu.v[dst_mask_unpacked[0] + index]);
4131 }
4132 result = VectorMasking<ElementType, vta, vma>(
4133 result, std::get<0>(Intrinsic(arg2)), result_mask, vstart, vl, index, mask);
4134 state_->cpu.v[dst + index] = result.Get<__uint128_t>();
4135 }
4136 }
4137
4138 template <typename ElementType, VectorRegisterGroupMultiplier vlmul, TailProcessing vta, auto vma>
OpVectorslideup(uint8_t dst,uint8_t src,Register offset)4139 void OpVectorslideup(uint8_t dst, uint8_t src, Register offset) {
4140 return OpVectorslideup<ElementType, NumberOfRegistersInvolved(vlmul), vta, vma>(
4141 dst, src, offset);
4142 }
4143
4144 template <typename ElementType, size_t kRegistersInvolved, TailProcessing vta, auto vma>
OpVectorslideup(uint8_t dst,uint8_t src,Register offset)4145 void OpVectorslideup(uint8_t dst, uint8_t src, Register offset) {
4146 constexpr size_t kElementsPerRegister = 16 / sizeof(ElementType);
4147 if (!IsAligned<kRegistersInvolved>(dst | src)) {
4148 return Undefined();
4149 }
4150 // Source and destination must not intersect.
4151 if (dst < (src + kRegistersInvolved) && src < (dst + kRegistersInvolved)) {
4152 return Undefined();
4153 }
4154 size_t vstart = GetCsr<CsrName::kVstart>();
4155 size_t vl = GetCsr<CsrName::kVl>();
4156 SetCsr<CsrName::kVstart>(0);
4157 if (vstart >= vl) [[unlikely]] {
4158 // From 16.3: For all of the [slide instructions], if vstart >= vl, the
4159 // instruction performs no operation and leaves the destination vector
4160 // register unchanged.
4161 return;
4162 }
4163 auto mask = GetMaskForVectorOperations<vma>();
4164 // The slideup operation leaves elements 0 through MAX(vstart, OFFSET)-1 unchanged.
4165 //
4166 // From 16.3.1: Destination elements OFFSET through vl-1 are written if
4167 // unmasked and if OFFSET < vl.
4168 // However if OFFSET > vl, we still need to apply the tail policy (as
4169 // clarified in https://github.com/riscv/riscv-v-spec/issues/263). Given
4170 // that OFFSET could be well past vl we start at vl rather than OFFSET in
4171 // that case.
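    // For example, with 32-bit elements (4 per register), OFFSET=6, vstart=0 and vl=4:
    // start_elem_index = min(max(0, 6), 4) = 4, so the loop below starts at register index 1,
    // register 0 is left untouched, and the remaining registers only receive tail processing.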
4172 const size_t start_elem_index = std::min(std::max(vstart, offset), vl);
4173 for (size_t index = start_elem_index / kElementsPerRegister; index < kRegistersInvolved;
4174 ++index) {
4175 SIMD128Register result(state_->cpu.v[dst + index]);
4176
4177 // Source registers that fall before the input group correspond to the first `offset`
4178 // result elements, which must remain undisturbed. We substitute zeroes for them here,
4179 // but those values are ultimately ignored thanks to vstart masking in VectorMasking.
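      // For example, with 64-bit elements (2 per register) and offset=3, offset / kElementsPerRegister
      // is 1, so destination register index 2 draws its inputs from source registers 0 and 1
      // (first_arg_disp = 0), while destination registers 0 and 1 see one or both arguments zeroed.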
4180 ssize_t first_arg_disp = index - 1 - offset / kElementsPerRegister;
4181 SIMD128Register arg1 =
4182 (first_arg_disp < 0) ? SIMD128Register{0} : state_->cpu.v[src + first_arg_disp];
4183 SIMD128Register arg2 =
4184 (first_arg_disp + 1 < 0) ? SIMD128Register{0} : state_->cpu.v[src + first_arg_disp + 1];
4185
4186 result =
4187 VectorMasking<ElementType, vta, vma>(result,
4188 std::get<0>(intrinsics::VectorSlideUp<ElementType>(
4189 offset % kElementsPerRegister, arg1, arg2)),
4190 start_elem_index,
4191 vl,
4192 index,
4193 mask);
4194 state_->cpu.v[dst + index] = result.Get<__uint128_t>();
4195 }
4196 }
4197
4198 template <typename ElementType, VectorRegisterGroupMultiplier vlmul, TailProcessing vta, auto vma>
OpVectorslide1up(uint8_t dst,uint8_t src,ElementType xval)4199 void OpVectorslide1up(uint8_t dst, uint8_t src, ElementType xval) {
4200 // Save the vstart before it's reset by vslideup.
4201 size_t vstart = GetCsr<CsrName::kVstart>();
4202 // Slide all the elements by one.
4203 OpVectorslideup<ElementType, NumberOfRegistersInvolved(vlmul), vta, vma>(dst, src, 1);
4204 if (exception_raised_) {
4205 return;
4206 }
4207 if (vstart > 0) {
4208 // First element is not affected and should remain untouched.
4209 return;
4210 }
4211
4212 // From 16.3.3: places the x register argument at location 0 of the
4213 // destination vector register group provided that element 0 is active,
4214 // otherwise the destination element update follows the current mask
4215 // agnostic/undisturbed policy.
4216 if constexpr (std::is_same_v<decltype(vma), intrinsics::InactiveProcessing>) {
4217 auto mask = GetMaskForVectorOperations<vma>();
4218 if (!(mask.template Get<uint8_t>(0) & 0x1)) {
4219 // The first element is masked. OpVectorslideup already applied the proper masking to it.
4220 return;
4221 }
4222 }
4223
4224 SIMD128Register result = state_->cpu.v[dst];
4225 result.Set(xval, 0);
4226 state_->cpu.v[dst] = result.Get<__uint128_t>();
4227 }
4228
4229 template <typename ElementType, VectorRegisterGroupMultiplier vlmul, TailProcessing vta, auto vma>
OpVectorslidedown(uint8_t dst,uint8_t src,Register offset)4230 void OpVectorslidedown(uint8_t dst, uint8_t src, Register offset) {
4231 return OpVectorslidedown<ElementType,
4232 NumberOfRegistersInvolved(vlmul),
4233 GetVlmax<ElementType, vlmul>(),
4234 vta,
4235 vma>(dst, src, offset);
4236 }
4237
4238 template <typename ElementType,
4239 size_t kRegistersInvolved,
4240 size_t kVlmax,
4241 TailProcessing vta,
4242 auto vma>
OpVectorslidedown(uint8_t dst,uint8_t src,Register offset)4243 void OpVectorslidedown(uint8_t dst, uint8_t src, Register offset) {
4244 constexpr size_t kElementsPerRegister = 16 / sizeof(ElementType);
4245 if (!IsAligned<kRegistersInvolved>(dst | src)) {
4246 return Undefined();
4247 }
4248 size_t vstart = GetCsr<CsrName::kVstart>();
4249 size_t vl = GetCsr<CsrName::kVl>();
4250 SetCsr<CsrName::kVstart>(0);
4251 if (vstart >= vl) [[unlikely]] {
4252 // From 16.3: For all of the [slide instructions], if vstart >= vl, the
4253 // instruction performs no operation and leaves the destination vector
4254 // register unchanged.
4255 return;
4256 }
4257 auto mask = GetMaskForVectorOperations<vma>();
4258 for (size_t index = 0; index < kRegistersInvolved; ++index) {
4259 SIMD128Register result(state_->cpu.v[dst + index]);
4260
4261 size_t first_arg_disp = index + offset / kElementsPerRegister;
4262 SIMD128Register arg1 = state_->cpu.v[src + first_arg_disp];
4263 SIMD128Register arg2 = state_->cpu.v[src + first_arg_disp + 1];
4264 SIMD128Register tunnel_shift_result;
4265 // Elements coming from above vlmax are zeroes.
4266 if (offset >= kVlmax) {
4267 tunnel_shift_result = SIMD128Register{0};
4268 } else {
4269 tunnel_shift_result = std::get<0>(
4270 intrinsics::VectorSlideDown<ElementType>(offset % kElementsPerRegister, arg1, arg2));
4271 tunnel_shift_result =
4272 VectorZeroFill<ElementType>(tunnel_shift_result, kVlmax - offset, kVlmax, index);
4273 }
4274
4275 result = VectorMasking<ElementType, vta, vma>(
4276 result, tunnel_shift_result, vstart, vl, index, mask);
4277 state_->cpu.v[dst + index] = result.Get<__uint128_t>();
4278 }
4279 }
4280
4281 template <typename ElementType, VectorRegisterGroupMultiplier vlmul, TailProcessing vta, auto vma>
OpVectorslide1down(uint8_t dst,uint8_t src,ElementType xval)4282 void OpVectorslide1down(uint8_t dst, uint8_t src, ElementType xval) {
4283 constexpr size_t kElementsPerRegister = 16 / sizeof(ElementType);
4284 const size_t vl = GetCsr<CsrName::kVl>();
4285
4286 // From 16.3.4: ... places the x register argument at location vl-1 in the
4287 // destination vector register, provided that element vl-1 is active,
4288 // otherwise the destination element is **unchanged** (emphasis added).
4289 //
4290 // This means that the element at vl-1 does not follow the mask-agnostic policy
4291 // and stays unchanged when inactive. So we need to undo just this one
4292 // element when agnostic masking is in use.
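    // For example, with 16-bit elements (8 per register) and vl=13, the last element lives in
    // register dst + 1 at position 4 within that register.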
4293 ElementType last_elem_value = xval;
4294 const size_t last_elem_register = (vl - 1) / kElementsPerRegister;
4295 const size_t last_elem_within_reg_pos = (vl - 1) % kElementsPerRegister;
4296 bool set_last_element = true;
4297 if constexpr (std::is_same_v<decltype(vma), intrinsics::InactiveProcessing>) {
4298 auto mask = GetMaskForVectorOperations<vma>();
4299 auto [mask_bits] =
4300 intrinsics::MaskForRegisterInSequence<ElementType>(mask, last_elem_register);
4301 using MaskType = decltype(mask_bits);
4302 if ((static_cast<MaskType::BaseType>(mask_bits) & (1 << last_elem_within_reg_pos)) == 0) {
4303 if constexpr (vma == intrinsics::InactiveProcessing::kUndisturbed) {
4304 // Element is inactive and the undisturbed policy will be followed,
4305 // just let OpVectorslidedown handle everything.
4306 set_last_element = false;
4307 } else {
4308 // Element is inactive and the agnostic policy will be followed; save
4309 // the original value now so we can restore it after the agnostic
4310 // policy overwrites it.
4311 SIMD128Register original = state_->cpu.v[dst + last_elem_register];
4312 last_elem_value = original.Get<ElementType>(last_elem_within_reg_pos);
4313 }
4314 }
4315 }
4316
4317 // Slide all the elements by one.
4318 OpVectorslidedown<ElementType,
4319 NumberOfRegistersInvolved(vlmul),
4320 GetVlmax<ElementType, vlmul>(),
4321 vta,
4322 vma>(dst, src, 1);
4323 if (exception_raised_) {
4324 return;
4325 }
4326 if (!set_last_element) {
4327 return;
4328 }
4329
4330 SIMD128Register result = state_->cpu.v[dst + last_elem_register];
4331 result.Set(last_elem_value, last_elem_within_reg_pos);
4332 state_->cpu.v[dst + last_elem_register] = result.Get<__uint128_t>();
4333 }
4334
4335 // Helper function needed to generate a bitmask result from non-bitmask inputs.
4336 // We process between 1 and 8 registers here, and each register produces a bitmask of between
4337 // 2 bits (for 64-bit inputs) and 16 bits (for 8-bit inputs); these are then combined into the
4338 // final result (between 2 and 128 bits long).
4339 // Note that we are not handling the tail here! Those bits remain undefined and should be
4340 // handled later.
4341 // TODO(b/317757595): Add separate tests to verify the logic.
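  // For example, with 64-bit elements each register contributes 2 bits, so 8 registers yield a
  // 16-bit mask; with 8-bit elements each register contributes 16 bits, so 8 registers yield the
  // full 128-bit mask.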
4342 template <typename ElementType, size_t kRegistersInvolved, typename Intrinsic>
CollectBitmaskResult(Intrinsic intrinsic)4343 SIMD128Register CollectBitmaskResult(Intrinsic intrinsic) {
4344 // We employ two distinct tactics to handle all possibilities:
4345 //   1. For 8bit/16bit types we get a full UInt8/UInt16 result and thus use SIMD128Register.Set.
4346 //   2. For 32bit/64bit types we only get 2 or 4 bits from each call and thus need to use
4347 //      shifts to accumulate the result.
4348 //      Since each of the up to 8 results is at most 4 bits, the total bitmask is 32 bits (or less).
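    // For example, for 32-bit elements kElemNum below is 4, so register index 2 contributes
    // bits [8, 11] of the accumulated bitmask.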
4349 std::conditional_t<sizeof(ElementType) < sizeof(UInt32), SIMD128Register, UInt32>
4350 bitmask_result{};
4351 for (UInt32 index = UInt32{0}; index < UInt32(kRegistersInvolved); index += UInt32{1}) {
4352 const auto [raw_result] =
4353 intrinsics::SimdMaskToBitMask<ElementType>(std::get<0>(intrinsic(index)));
4354 if constexpr (sizeof(ElementType) < sizeof(Int32)) {
4355 bitmask_result.Set(raw_result, index);
4356 } else {
4357 constexpr UInt32 kElemNum =
4358 UInt32{static_cast<uint32_t>((sizeof(SIMD128Register) / sizeof(ElementType)))};
4359 bitmask_result |= UInt32(UInt8(raw_result)) << (index * kElemNum);
4360 }
4361 }
4362 return SIMD128Register(bitmask_result);
4363 }
4364
Nop()4365 void Nop() {}
4366
Undefined()4367 void Undefined() {
4368 #if defined(__aarch64__)
4369 abort();
4370 #else
4371 UndefinedInsn(GetInsnAddr());
4372 // If there is a guest handler registered for SIGILL, its processing is delayed until the next
4373 // sync point (likely the main dispatching loop) because pending signals are enabled. Thus we
4374 // must ensure that insn_addr isn't automatically advanced in FinalizeInsn.
4375 exception_raised_ = true;
4376 #endif
4377 }
4378
4379 //
4380 // Guest state getters/setters.
4381 //
4382
GetReg(uint8_t reg)4383 Register GetReg(uint8_t reg) const {
4384 CheckRegIsValid(reg);
4385 return state_->cpu.x[reg];
4386 }
4387
GetRegOrZero(uint8_t reg)4388 Register GetRegOrZero(uint8_t reg) { return reg == 0 ? 0 : GetReg(reg); }
4389
SetReg(uint8_t reg,Register value)4390 void SetReg(uint8_t reg, Register value) {
4391 if (exception_raised_) {
4392 // Do not produce side effects.
4393 return;
4394 }
4395 CheckRegIsValid(reg);
4396 state_->cpu.x[reg] = value;
4397 }
4398
SetRegOrIgnore(uint8_t reg,Register value)4399 void SetRegOrIgnore(uint8_t reg, Register value) {
4400 if (reg != 0) {
4401 SetReg(reg, value);
4402 }
4403 }
4404
GetFpReg(uint8_t reg)4405 FpRegister GetFpReg(uint8_t reg) const {
4406 CheckFpRegIsValid(reg);
4407 return state_->cpu.f[reg];
4408 }
4409
4410 template <typename FloatType>
4411 FpRegister GetFRegAndUnboxNan(uint8_t reg);
4412
4413 template <typename FloatType>
4414 void NanBoxAndSetFpReg(uint8_t reg, FpRegister value);
4415
4416 //
4417 // Various helper methods.
4418 //
4419
4420 #if defined(__aarch64__)
4421 template <CsrName kName>
GetCsr()4422 [[nodiscard]] Register GetCsr() {
4423 Undefined();
4424 return {};
4425 }
4426 #else
4427 template <CsrName kName>
GetCsr()4428 [[nodiscard]] Register GetCsr() const {
4429 return state_->cpu.*CsrFieldAddr<kName>;
4430 }
4431 #endif
4432
4433 template <CsrName kName>
SetCsr(Register arg)4434 void SetCsr(Register arg) {
4435 #if defined(__aarch64__)
4436 UNUSED(arg);
4437 Undefined();
4438 #else
4439 if (exception_raised_) {
4440 return;
4441 }
4442 state_->cpu.*CsrFieldAddr<kName> = arg & kCsrMask<kName>;
4443 #endif
4444 }
4445
GetImm(uint64_t imm)4446 [[nodiscard]] uint64_t GetImm(uint64_t imm) const { return imm; }
4447
Copy(Register value)4448 [[nodiscard]] Register Copy(Register value) const { return value; }
4449
GetInsnAddr()4450 [[nodiscard]] GuestAddr GetInsnAddr() const { return state_->cpu.insn_addr; }
4451
FinalizeInsn(uint8_t insn_len)4452 void FinalizeInsn(uint8_t insn_len) {
4453 if (!branch_taken_ && !exception_raised_) {
4454 state_->cpu.insn_addr += insn_len;
4455 }
4456 }
4457
4458 #include "berberis/intrinsics/interpreter_intrinsics_hooks-inl.h"
4459
4460 private:
4461 template <typename DataType>
Load(const void * ptr)4462 Register Load(const void* ptr) {
4463 static_assert(std::is_integral_v<DataType>);
4464 CHECK(!exception_raised_);
4465 FaultyLoadResult result = FaultyLoad(ptr, sizeof(DataType));
4466 if (result.is_fault) {
4467 exception_raised_ = true;
4468 return {};
4469 }
4470 return static_cast<DataType>(result.value);
4471 }
4472
4473 template <typename DataType>
Store(void * ptr,uint64_t data)4474 void Store(void* ptr, uint64_t data) {
4475 static_assert(std::is_integral_v<DataType>);
4476 CHECK(!exception_raised_);
4477 exception_raised_ = FaultyStore(ptr, sizeof(DataType), data);
4478 }
4479
CheckShamtIsValid(int8_t shamt)4480 void CheckShamtIsValid(int8_t shamt) const {
4481 CHECK_GE(shamt, 0);
4482 CHECK_LT(shamt, 64);
4483 }
4484
CheckShamt32IsValid(int8_t shamt)4485 void CheckShamt32IsValid(int8_t shamt) const {
4486 CHECK_GE(shamt, 0);
4487 CHECK_LT(shamt, 32);
4488 }
4489
CheckRegIsValid(uint8_t reg)4490 void CheckRegIsValid(uint8_t reg) const {
4491 CHECK_GT(reg, 0u);
4492 CHECK_LE(reg, std::size(state_->cpu.x));
4493 }
4494
CheckFpRegIsValid(uint8_t reg)4495 void CheckFpRegIsValid(uint8_t reg) const { CHECK_LT(reg, std::size(state_->cpu.f)); }
4496
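  // The Get{High,Low}VectorArgument helpers below select the data an intrinsic should see for a
  // given group index: for a regular Vec source the "low" helper returns the register unchanged
  // and the "high" helper returns it with its top half moved to the bottom, while for a WideVec
  // source (a group twice as wide) they pick registers 2 * index and 2 * index + 1 respectively.
  // Scalar arguments are passed through unchanged.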
4497 template <typename ElementType, TailProcessing vta, auto vma, typename MaskType>
GetHighVectorArgument(Vec<intrinsics::NoInactiveProcessing{}> src,size_t,size_t,size_t index,MaskType)4498 SIMD128Register GetHighVectorArgument(Vec<intrinsics::NoInactiveProcessing{}> src,
4499 size_t /*vstart*/,
4500 size_t /*vl*/,
4501 size_t index,
4502 MaskType /*mask*/) {
4503 return std::get<0>(intrinsics::VMovTopHalfToBottom<ElementType>(
4504 SIMD128Register{state_->cpu.v[src.start_no + index]}));
4505 }
4506
4507 template <typename ElementType, TailProcessing vta, auto vma, typename MaskType>
GetHighVectorArgument(WideVec<intrinsics::NoInactiveProcessing{}> src,size_t,size_t,size_t index,MaskType)4508 SIMD128Register GetHighVectorArgument(WideVec<intrinsics::NoInactiveProcessing{}> src,
4509 size_t /*vstart*/,
4510 size_t /*vl*/,
4511 size_t index,
4512 MaskType /*mask*/) {
4513 return SIMD128Register{state_->cpu.v[src.start_no + 2 * index + 1]};
4514 }
4515
4516 template <typename ElementType, TailProcessing vta, auto vma, typename MaskType>
GetHighVectorArgument(ElementType arg,size_t,size_t,size_t,MaskType)4517 ElementType GetHighVectorArgument(ElementType arg,
4518 size_t /*vstart*/,
4519 size_t /*vl*/,
4520 size_t /*index*/,
4521 MaskType /*mask*/) {
4522 return arg;
4523 }
4524
4525 template <typename ElementType, TailProcessing vta, auto vma, typename MaskType>
GetLowVectorArgument(Vec<intrinsics::NoInactiveProcessing{}> src,size_t,size_t,size_t index,MaskType)4526 SIMD128Register GetLowVectorArgument(Vec<intrinsics::NoInactiveProcessing{}> src,
4527 size_t /*vstart*/,
4528 size_t /*vl*/,
4529 size_t index,
4530 MaskType /*mask*/) {
4531 return SIMD128Register{state_->cpu.v[src.start_no + index]};
4532 }
4533
4534 template <typename ElementType, TailProcessing vta, auto vma, typename MaskType>
GetLowVectorArgument(WideVec<intrinsics::NoInactiveProcessing{}> src,size_t,size_t,size_t index,MaskType)4535 SIMD128Register GetLowVectorArgument(WideVec<intrinsics::NoInactiveProcessing{}> src,
4536 size_t /*vstart*/,
4537 size_t /*vl*/,
4538 size_t index,
4539 MaskType /*mask*/) {
4540 return SIMD128Register{state_->cpu.v[src.start_no + 2 * index]};
4541 }
4542
4543 template <typename ElementType, TailProcessing vta, auto vma, typename MaskType>
GetLowVectorArgument(ElementType arg,size_t,size_t,size_t,MaskType)4544 ElementType GetLowVectorArgument(ElementType arg,
4545 size_t /*vstart*/,
4546 size_t /*vl*/,
4547 size_t /*index*/,
4548 MaskType /*mask*/) {
4549 return arg;
4550 }
4551
4552 template <typename ElementType, TailProcessing vta, auto vma, typename MaskType>
GetVectorArgument(Vec<intrinsics::NoInactiveProcessing{}> src,size_t,size_t,size_t index,MaskType)4553 SIMD128Register GetVectorArgument(Vec<intrinsics::NoInactiveProcessing{}> src,
4554 size_t /*vstart*/,
4555 size_t /*vl*/,
4556 size_t index,
4557 MaskType /*mask*/) {
4558 return SIMD128Register{state_->cpu.v[src.start_no + index]};
4559 }
4560
4561 template <typename ElementType,
4562 TailProcessing vta,
4563 auto vma,
4564 typename MaskType,
4565 auto kDefaultElement>
GetVectorArgument(Vec<kDefaultElement> src,size_t vstart,size_t vl,size_t index,MaskType mask)4566 SIMD128Register GetVectorArgument(Vec<kDefaultElement> src,
4567 size_t vstart,
4568 size_t vl,
4569 size_t index,
4570 MaskType mask) {
4571 return VectorMasking<kDefaultElement, vta, vma>(
4572 SIMD128Register{state_->cpu.v[src.start_no + index]}, vstart, vl, index, mask);
4573 }
4574
4575 template <typename ElementType, TailProcessing vta, auto vma, typename MaskType>
GetVectorArgument(ElementType arg,size_t,size_t,size_t,MaskType)4576 ElementType GetVectorArgument(ElementType arg,
4577 size_t /*vstart*/,
4578 size_t /*vl*/,
4579 size_t /*index*/,
4580 MaskType /*mask*/) {
4581 return arg;
4582 }
4583
4584 template <bool kUseMasking>
4585 std::conditional_t<kUseMasking, SIMD128Register, intrinsics::NoInactiveProcessing>
GetMaskForVectorOperationsIfNeeded()4586 GetMaskForVectorOperationsIfNeeded() {
4587 if constexpr (kUseMasking) {
4588 return {state_->cpu.v[0]};
4589 } else {
4590 return intrinsics::NoInactiveProcessing{};
4591 }
4592 }
4593
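  // Returns the contents of v0 (the mask register) as a SIMD128Register when vma implies masking,
  // or a NoInactiveProcessing tag when the operation is unmasked.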
4594 template <auto vma>
4595 std::conditional_t<std::is_same_v<decltype(vma), intrinsics::NoInactiveProcessing>,
4596 intrinsics::NoInactiveProcessing,
4597 SIMD128Register>
GetMaskForVectorOperations()4598 GetMaskForVectorOperations() {
4599 return GetMaskForVectorOperationsIfNeeded<
4600 !std::is_same_v<decltype(vma), intrinsics::NoInactiveProcessing>>();
4601 }
4602
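  // The VectorMasking overloads below operate on one 128-bit register of a group at a time: they
  // rebase vstart and vl from absolute element indices to indices within register `index`, extract
  // the matching slice of the mask, and delegate to intrinsics::VectorMasking to apply the
  // tail/inactive policies.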
4603 template <auto kDefaultElement, TailProcessing vta, auto vma, typename MaskType>
VectorMasking(SIMD128Register result,size_t vstart,size_t vl,size_t index,MaskType mask)4604 SIMD128Register VectorMasking(SIMD128Register result,
4605 size_t vstart,
4606 size_t vl,
4607 size_t index,
4608 MaskType mask) {
4609 return std::get<0>(intrinsics::VectorMasking<kDefaultElement, vta, vma>(
4610 result,
4611 vstart - index * (sizeof(SIMD128Register) / sizeof(kDefaultElement)),
4612 vl - index * (sizeof(SIMD128Register) / sizeof(kDefaultElement)),
4613 std::get<0>(
4614 intrinsics::MaskForRegisterInSequence<decltype(kDefaultElement)>(mask, index))));
4615 }
4616
4617 template <typename ElementType, TailProcessing vta, auto vma, typename MaskType>
VectorMasking(SIMD128Register dest,SIMD128Register result,size_t vstart,size_t vl,size_t index,MaskType mask)4618 SIMD128Register VectorMasking(SIMD128Register dest,
4619 SIMD128Register result,
4620 size_t vstart,
4621 size_t vl,
4622 size_t index,
4623 MaskType mask) {
4624 return std::get<0>(intrinsics::VectorMasking<ElementType, vta, vma>(
4625 dest,
4626 result,
4627 vstart - index * (sizeof(SIMD128Register) / sizeof(ElementType)),
4628 vl - index * (sizeof(SIMD128Register) / sizeof(ElementType)),
4629 std::get<0>(intrinsics::MaskForRegisterInSequence<ElementType>(mask, index))));
4630 }
4631
4632 template <typename ElementType, TailProcessing vta, auto vma, typename MaskType>
VectorMasking(SIMD128Register dest,SIMD128Register result,SIMD128Register result_mask,size_t vstart,size_t vl,size_t index,MaskType mask)4633 SIMD128Register VectorMasking(SIMD128Register dest,
4634 SIMD128Register result,
4635 SIMD128Register result_mask,
4636 size_t vstart,
4637 size_t vl,
4638 size_t index,
4639 MaskType mask) {
4640 return std::get<0>(intrinsics::VectorMasking<ElementType, vta, vma>(
4641 dest,
4642 result,
4643 result_mask,
4644 vstart - index * (sizeof(SIMD128Register) / sizeof(ElementType)),
4645 vl - index * (sizeof(SIMD128Register) / sizeof(ElementType)),
4646 std::get<0>(intrinsics::MaskForRegisterInSequence<ElementType>(mask, index))));
4647 }
4648
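  // Returns `src` with the elements of register `index` whose absolute element indices fall in
  // [start, end) zeroed; all other elements are left undisturbed.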
4649 template <typename ElementType>
VectorZeroFill(SIMD128Register src,size_t start,size_t end,size_t index)4650 SIMD128Register VectorZeroFill(SIMD128Register src, size_t start, size_t end, size_t index) {
4651 return VectorMasking<ElementType,
4652 TailProcessing::kUndisturbed,
4653 intrinsics::NoInactiveProcessing{}>(
4654 src, SIMD128Register{0}, start, end, index, intrinsics::NoInactiveProcessing{});
4655 }
4656
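  // The helpers below fold over a heterogeneous argument pack but only act on arguments whose type
  // is an instantiation of the given template (e.g. Vec<...>): OrResultsOnlyForType OR-combines
  // lambda(arg) for matching arguments and kDefaultValue for the rest, OrValuesOnlyForType does
  // the same with a default lambda that unwraps a single-element structured-bindable value, and
  // ProcessOnlyForType simply invokes the lambda on every matching argument.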
4657 template <template <auto> typename ProcessType,
4658 auto kLambda =
4659 [](auto packaged_value) {
4660 auto [unpacked_value] = packaged_value;
4661 return unpacked_value;
4662 },
4663 auto kDefaultValue = false,
4664 typename... Args>
4665 [[nodiscard]] static constexpr auto OrValuesOnlyForType(Args... args) {
4666 return OrResultsOnlyForType<ProcessType, kDefaultValue>(kLambda, args...);
4667 }
4668
4669 template <template <auto> typename ProcessTemplateType,
4670 auto kDefaultValue = false,
4671 typename Lambda,
4672 typename... Args>
OrResultsOnlyForType(Lambda lambda,Args...args)4673 [[nodiscard]] static constexpr auto OrResultsOnlyForType(Lambda lambda, Args... args) {
4674 #pragma GCC diagnostic push
4675 #pragma GCC diagnostic ignored "-Wbitwise-instead-of-logical"
4676 return ([lambda](auto arg) {
4677 if constexpr (IsTypeTemplateOf<std::decay_t<decltype(arg)>, ProcessTemplateType>) {
4678 return lambda(arg);
4679 } else {
4680 return kDefaultValue;
4681 }
4682 }(args) |
4683 ...);
4684 #pragma GCC diagnostic pop
4685 }
4686
4687 template <template <auto> typename ProcessTemplateType, typename Lambda, typename... Args>
ProcessOnlyForType(Lambda lambda,Args...args)4688 static constexpr void ProcessOnlyForType(Lambda lambda, Args... args) {
4689 (
4690 [lambda](auto arg) {
4691 if constexpr (IsTypeTemplateOf<std::decay_t<decltype(arg)>, ProcessTemplateType>) {
4692 lambda(arg);
4693 }
4694 }(args),
4695 ...);
4696 }
4697
4698 ThreadState* state_;
4699 bool branch_taken_;
4700 // This flag is set by illegal instructions and faulted memory accesses. The former must always
4701 // stop the playback of the current instruction, so we don't need to do anything special. The
4702 // latter may result in more operations with side effects being called before the end of the
4703 // current instruction:
4704 // Load (faulted) -> SetReg
4705 // LoadFp (faulted) -> NanBoxAndSetFpReg
4706 // If an exception is raised before these operations, we skip them. For all other operations with
4707 // side-effects we check that this flag is never raised.
4708 bool exception_raised_;
4709 };
4710
4711 #if !defined(__aarch64__)
4712 template <>
4713 [[nodiscard]] Interpreter::Register inline Interpreter::GetCsr<CsrName::kCycle>() const {
4714 return CPUClockCount();
4715 }
4716
4717 template <>
4718 [[nodiscard]] Interpreter::Register inline Interpreter::GetCsr<CsrName::kFCsr>() const {
4719 return FeGetExceptions() | (state_->cpu.frm << 5);
4720 }
4721
4722 template <>
4723 [[nodiscard]] Interpreter::Register inline Interpreter::GetCsr<CsrName::kFFlags>() const {
4724 return FeGetExceptions();
4725 }
4726
4727 template <>
4728 [[nodiscard]] Interpreter::Register inline Interpreter::GetCsr<CsrName::kVlenb>() const {
4729 return 16;
4730 }
4731
4732 template <>
4733 [[nodiscard]] Interpreter::Register inline Interpreter::GetCsr<CsrName::kVxrm>() const {
4734 return state_->cpu.*CsrFieldAddr<CsrName::kVcsr> & 0b11;
4735 }
4736
4737 template <>
4738 [[nodiscard]] Interpreter::Register inline Interpreter::GetCsr<CsrName::kVxsat>() const {
4739 return state_->cpu.*CsrFieldAddr<CsrName::kVcsr> >> 2;
4740 }
4741
4742 template <>
4743 void inline Interpreter::SetCsr<CsrName::kFCsr>(Register arg) {
4744 CHECK(!exception_raised_);
4745 FeSetExceptions(arg & 0b1'1111);
4746 arg = (arg >> 5) & kCsrMask<CsrName::kFrm>;
4747 state_->cpu.frm = arg;
4748 FeSetRound(arg);
4749 }
4750
4751 template <>
4752 void inline Interpreter::SetCsr<CsrName::kFFlags>(Register arg) {
4753 CHECK(!exception_raised_);
4754 FeSetExceptions(arg & 0b1'1111);
4755 }
4756
4757 template <>
4758 void inline Interpreter::SetCsr<CsrName::kFrm>(Register arg) {
4759 CHECK(!exception_raised_);
4760 arg &= kCsrMask<CsrName::kFrm>;
4761 state_->cpu.frm = arg;
4762 FeSetRound(arg);
4763 }
4764
4765 template <>
4766 void inline Interpreter::SetCsr<CsrName::kVxrm>(Register arg) {
4767 CHECK(!exception_raised_);
4768 state_->cpu.*CsrFieldAddr<CsrName::kVcsr> =
4769 (state_->cpu.*CsrFieldAddr<CsrName::kVcsr> & 0b100) | (arg & 0b11);
4770 }
4771
4772 template <>
4773 void inline Interpreter::SetCsr<CsrName::kVxsat>(Register arg) {
4774 CHECK(!exception_raised_);
4775 state_->cpu.*CsrFieldAddr<CsrName::kVcsr> =
4776 (state_->cpu.*CsrFieldAddr<CsrName::kVcsr> & 0b11) | ((arg & 0b1) << 2);
4777 }
4778
4779 #endif
4780
4781 template <>
4782 [[nodiscard]] Interpreter::FpRegister inline Interpreter::GetFRegAndUnboxNan<Interpreter::Float32>(
4783 uint8_t reg) {
4784 #if defined(__aarch64__)
4785 UNUSED(reg);
4786 Interpreter::Undefined();
4787 return {};
4788 #else
4789 CheckFpRegIsValid(reg);
4790 FpRegister value = state_->cpu.f[reg];
4791 return UnboxNan<Float32>(value);
4792 #endif
4793 }
4794
4795 template <>
4796 [[nodiscard]] Interpreter::FpRegister inline Interpreter::GetFRegAndUnboxNan<Interpreter::Float64>(
4797 uint8_t reg) {
4798 #if defined(__aarch64__)
4799 UNUSED(reg);
4800 Interpreter::Undefined();
4801 return {};
4802 #else
4803 CheckFpRegIsValid(reg);
4804 return state_->cpu.f[reg];
4805 #endif
4806 }
4807
4808 template <>
4809 void inline Interpreter::NanBoxAndSetFpReg<Interpreter::Float32>(uint8_t reg, FpRegister value) {
4810 if (exception_raised_) {
4811 // Do not produce side effects.
4812 return;
4813 }
4814 CheckFpRegIsValid(reg);
4815 state_->cpu.f[reg] = NanBox<Float32>(value);
4816 }
4817
4818 template <>
4819 void inline Interpreter::NanBoxAndSetFpReg<Interpreter::Float64>(uint8_t reg, FpRegister value) {
4820 if (exception_raised_) {
4821 // Do not produce side effects.
4822 return;
4823 }
4824 CheckFpRegIsValid(reg);
4825 state_->cpu.f[reg] = value;
4826 }
4827
4828 #ifdef BERBERIS_RISCV64_INTERPRETER_SEPARATE_INSTANTIATION_OF_VECTOR_OPERATIONS
4829 template <>
4830 extern void SemanticsPlayer<Interpreter>::OpVector(const Decoder::VLoadIndexedArgs& args);
4831 template <>
4832 extern void SemanticsPlayer<Interpreter>::OpVector(const Decoder::VLoadStrideArgs& args);
4833 template <>
4834 extern void SemanticsPlayer<Interpreter>::OpVector(const Decoder::VLoadUnitStrideArgs& args);
4835 template <>
4836 extern void SemanticsPlayer<Interpreter>::OpVector(const Decoder::VOpFVfArgs& args);
4837 template <>
4838 extern void SemanticsPlayer<Interpreter>::OpVector(const Decoder::VOpFVvArgs& args);
4839 template <>
4840 extern void SemanticsPlayer<Interpreter>::OpVector(const Decoder::VOpIViArgs& args);
4841 template <>
4842 extern void SemanticsPlayer<Interpreter>::OpVector(const Decoder::VOpIVvArgs& args);
4843 template <>
4844 extern void SemanticsPlayer<Interpreter>::OpVector(const Decoder::VOpIVxArgs& args);
4845 template <>
4846 extern void SemanticsPlayer<Interpreter>::OpVector(const Decoder::VOpMVvArgs& args);
4847 template <>
4848 extern void SemanticsPlayer<Interpreter>::OpVector(const Decoder::VOpMVxArgs& args);
4849 template <>
4850 extern void SemanticsPlayer<Interpreter>::OpVector(const Decoder::VStoreIndexedArgs& args);
4851 template <>
4852 extern void SemanticsPlayer<Interpreter>::OpVector(const Decoder::VStoreStrideArgs& args);
4853 template <>
4854 extern void SemanticsPlayer<Interpreter>::OpVector(const Decoder::VStoreUnitStrideArgs& args);
4855 #endif
4856
4857 } // namespace berberis
4858