1 // Copyright © 2022 Collabora, Ltd.
2 // SPDX-License-Identifier: MIT
3
4 use crate::ir::*;
5 use crate::legalize::{
6 src_is_reg, src_is_upred_reg, swap_srcs_if_not_reg, LegalizeBuildHelpers,
7 LegalizeBuilder,
8 };
9 use bitview::*;
10
11 use std::collections::HashMap;
12 use std::ops::Range;
13
/// Shader model description for SM70 and newer ("Volta+") hardware.
pub struct ShaderModel70 {
    /// SM version number.  [`ShaderModel70::new`] guarantees it is >= 70.
    sm: u8,
}

impl ShaderModel70 {
    /// Creates the shader model for SM version `sm`.
    ///
    /// # Panics
    ///
    /// Panics if `sm` is less than 70.
    pub fn new(sm: u8) -> Self {
        assert!(sm >= 70);
        ShaderModel70 { sm }
    }

    /// Returns `true` when the uniform ALU (and with it the uniform register
    /// files) is available, i.e. for SM version 75 and up.
    fn has_uniform_alu(&self) -> bool {
        let version = self.sm;
        version >= 75
    }
}
28
29 impl ShaderModel for ShaderModel70 {
sm(&self) -> u830 fn sm(&self) -> u8 {
31 self.sm
32 }
33
num_regs(&self, file: RegFile) -> u3234 fn num_regs(&self, file: RegFile) -> u32 {
35 match file {
36 RegFile::GPR => {
37 // Volta+ has a maximum of 253 registers. Presumably
38 // because two registers get burned for UGPRs? Unclear
39 // on why we need it on Volta though.
40 253
41 }
42 RegFile::UGPR => {
43 if self.has_uniform_alu() {
44 63
45 } else {
46 0
47 }
48 }
49 RegFile::Pred => 7,
50 RegFile::UPred => {
51 if self.has_uniform_alu() {
52 7
53 } else {
54 0
55 }
56 }
57 RegFile::Carry => 0,
58 RegFile::Bar => 16,
59 RegFile::Mem => RegRef::MAX_IDX + 1,
60 }
61 }
62
crs_size(&self, max_crs_depth: u32) -> u3263 fn crs_size(&self, max_crs_depth: u32) -> u32 {
64 assert!(max_crs_depth == 0);
65 0
66 }
67
op_can_be_uniform(&self, op: &Op) -> bool68 fn op_can_be_uniform(&self, op: &Op) -> bool {
69 if !self.has_uniform_alu() {
70 return false;
71 }
72
73 match op {
74 Op::R2UR(_)
75 | Op::S2R(_)
76 | Op::BMsk(_)
77 | Op::BRev(_)
78 | Op::Flo(_)
79 | Op::IAdd3(_)
80 | Op::IAdd3X(_)
81 | Op::IMad(_)
82 | Op::IMad64(_)
83 | Op::ISetP(_)
84 | Op::Lop3(_)
85 | Op::Mov(_)
86 | Op::PLop3(_)
87 | Op::PopC(_)
88 | Op::Prmt(_)
89 | Op::PSetP(_)
90 | Op::Sel(_)
91 | Op::Shf(_)
92 | Op::Shl(_)
93 | Op::Shr(_)
94 | Op::Vote(_)
95 | Op::Copy(_)
96 | Op::Pin(_)
97 | Op::Unpin(_) => true,
98 Op::Ldc(op) => op.offset.is_zero(),
99 // UCLEA USHL USHR
100 _ => false,
101 }
102 }
103
legalize_op(&self, b: &mut LegalizeBuilder, op: &mut Op)104 fn legalize_op(&self, b: &mut LegalizeBuilder, op: &mut Op) {
105 as_sm70_op_mut(op).legalize(b);
106 }
107
encode_shader(&self, s: &Shader<'_>) -> Vec<u32>108 fn encode_shader(&self, s: &Shader<'_>) -> Vec<u32> {
109 encode_sm70_shader(self, s)
110 }
111 }
112
/// A per-op trait that implements Volta+ opcode semantics
trait SM70Op {
    /// Rewrites the op in place so that it can be encoded, using `b` for any
    /// extra work the rewrite needs (the implementations in this file use it
    /// to copy sources).
    fn legalize(&mut self, b: &mut LegalizeBuilder);

    /// Writes the encoding of this op into the instruction held by `e`.
    fn encode(&self, e: &mut SM70Encoder<'_>);
}
118
/// Encoder state for a single SM70+ instruction.
struct SM70Encoder<'a> {
    /// Shader model being encoded for; consulted for version checks.
    sm: &'a ShaderModel70,
    /// NOTE(review): presumably the position of the instruction currently
    /// being encoded -- confirm against the code that builds the encoder.
    ip: usize,
    /// Map from labels to `usize` values.  NOTE(review): presumably the
    /// instruction positions used to resolve branch targets -- confirm.
    labels: &'a HashMap<Label, usize>,
    /// The instruction bits, stored as four 32-bit words (128 bits).
    inst: [u32; 4],
}
125
126 impl BitViewable for SM70Encoder<'_> {
bits(&self) -> usize127 fn bits(&self) -> usize {
128 BitView::new(&self.inst).bits()
129 }
130
get_bit_range_u64(&self, range: Range<usize>) -> u64131 fn get_bit_range_u64(&self, range: Range<usize>) -> u64 {
132 BitView::new(&self.inst).get_bit_range_u64(range)
133 }
134 }
135
136 impl BitMutViewable for SM70Encoder<'_> {
set_bit_range_u64(&mut self, range: Range<usize>, val: u64)137 fn set_bit_range_u64(&mut self, range: Range<usize>, val: u64) {
138 BitMutView::new(&mut self.inst).set_bit_range_u64(range, val);
139 }
140 }
141
142 impl SetFieldU64 for SM70Encoder<'_> {
set_field_u64(&mut self, range: Range<usize>, val: u64)143 fn set_field_u64(&mut self, range: Range<usize>, val: u64) {
144 BitMutView::new(&mut self.inst).set_field_u64(range, val);
145 }
146 }
147
148 impl SM70Encoder<'_> {
set_opcode(&mut self, opcode: u16)149 fn set_opcode(&mut self, opcode: u16) {
150 self.set_field(0..12, opcode);
151 }
152
set_reg(&mut self, range: Range<usize>, reg: RegRef)153 fn set_reg(&mut self, range: Range<usize>, reg: RegRef) {
154 assert!(range.len() == 8);
155 assert!(reg.file() == RegFile::GPR);
156 self.set_field(range, reg.base_idx());
157 }
158
set_ureg(&mut self, range: Range<usize>, reg: RegRef)159 fn set_ureg(&mut self, range: Range<usize>, reg: RegRef) {
160 assert!(self.sm.sm >= 75);
161 assert!(range.len() == 8);
162 assert!(reg.file() == RegFile::UGPR);
163 assert!(reg.base_idx() <= 63);
164 self.set_field(range, reg.base_idx());
165 }
166
set_pred_reg(&mut self, range: Range<usize>, reg: RegRef)167 fn set_pred_reg(&mut self, range: Range<usize>, reg: RegRef) {
168 assert!(range.len() == 3);
169 assert!(reg.base_idx() <= 7);
170 assert!(reg.comps() == 1);
171 self.set_field(range, reg.base_idx());
172 }
173
set_reg_src(&mut self, range: Range<usize>, src: Src)174 fn set_reg_src(&mut self, range: Range<usize>, src: Src) {
175 assert!(src.src_mod.is_none());
176 match src.src_ref {
177 SrcRef::Zero => self.set_reg(range, RegRef::zero(RegFile::GPR, 1)),
178 SrcRef::Reg(reg) => self.set_reg(range, reg),
179 _ => panic!("Not a register"),
180 }
181 }
182
set_pred_dst(&mut self, range: Range<usize>, dst: Dst)183 fn set_pred_dst(&mut self, range: Range<usize>, dst: Dst) {
184 match dst {
185 Dst::None => {
186 self.set_pred_reg(range, RegRef::zero(RegFile::Pred, 1));
187 }
188 Dst::Reg(reg) => self.set_pred_reg(range, reg),
189 _ => panic!("Not a register"),
190 }
191 }
192
set_pred_src_file( &mut self, range: Range<usize>, not_bit: usize, src: Src, file: RegFile, )193 fn set_pred_src_file(
194 &mut self,
195 range: Range<usize>,
196 not_bit: usize,
197 src: Src,
198 file: RegFile,
199 ) {
200 // The default for predicates is true
201 let true_reg = RegRef::new(file, 7, 1);
202
203 let (not, reg) = match src.src_ref {
204 SrcRef::True => (false, true_reg),
205 SrcRef::False => (true, true_reg),
206 SrcRef::Reg(reg) => {
207 assert!(reg.file() == file);
208 (false, reg)
209 }
210 _ => panic!("Not a register"),
211 };
212 self.set_pred_reg(range, reg);
213 self.set_bit(not_bit, not ^ src_mod_is_bnot(src.src_mod));
214 }
215
set_pred_src(&mut self, range: Range<usize>, not_bit: usize, src: Src)216 fn set_pred_src(&mut self, range: Range<usize>, not_bit: usize, src: Src) {
217 self.set_pred_src_file(range, not_bit, src, RegFile::Pred);
218 }
219
set_upred_src(&mut self, range: Range<usize>, not_bit: usize, src: Src)220 fn set_upred_src(&mut self, range: Range<usize>, not_bit: usize, src: Src) {
221 self.set_pred_src_file(range, not_bit, src, RegFile::UPred);
222 }
223
set_src_cb(&mut self, range: Range<usize>, cx_bit: usize, cb: &CBufRef)224 fn set_src_cb(&mut self, range: Range<usize>, cx_bit: usize, cb: &CBufRef) {
225 let mut v = BitMutView::new_subset(self, range);
226 v.set_field(6..22, cb.offset);
227 match cb.buf {
228 CBuf::Binding(idx) => {
229 v.set_field(22..27, idx);
230 self.set_bit(cx_bit, false);
231 }
232 CBuf::BindlessUGPR(reg) => {
233 assert!(reg.base_idx() <= 63);
234 assert!(reg.file() == RegFile::UGPR);
235 v.set_field(0..6, reg.base_idx());
236 self.set_bit(cx_bit, true);
237 }
238 CBuf::BindlessSSA(_) => panic!("SSA values must be lowered"),
239 }
240 }
241
set_pred(&mut self, pred: &Pred)242 fn set_pred(&mut self, pred: &Pred) {
243 assert!(!pred.is_false());
244 self.set_pred_reg(
245 12..15,
246 match pred.pred_ref {
247 PredRef::None => RegRef::zero(RegFile::Pred, 1),
248 PredRef::Reg(reg) => reg,
249 PredRef::SSA(_) => panic!("SSA values must be lowered"),
250 },
251 );
252 self.set_bit(15, pred.pred_inv);
253 }
254
set_dst(&mut self, dst: Dst)255 fn set_dst(&mut self, dst: Dst) {
256 match dst {
257 Dst::None => self.set_reg(16..24, RegRef::zero(RegFile::GPR, 1)),
258 Dst::Reg(reg) => self.set_reg(16..24, reg),
259 _ => panic!("Not a register"),
260 }
261 }
262
set_udst(&mut self, dst: Dst)263 fn set_udst(&mut self, dst: Dst) {
264 match dst {
265 Dst::None => self.set_ureg(16..24, RegRef::zero(RegFile::UGPR, 1)),
266 Dst::Reg(reg) => self.set_ureg(16..24, reg),
267 _ => panic!("Not a register"),
268 }
269 }
270
set_bar_reg(&mut self, range: Range<usize>, reg: RegRef)271 fn set_bar_reg(&mut self, range: Range<usize>, reg: RegRef) {
272 assert!(range.len() == 4);
273 assert!(reg.file() == RegFile::Bar);
274 assert!(reg.comps() == 1);
275 self.set_field(range, reg.base_idx());
276 }
277
set_bar_dst(&mut self, range: Range<usize>, dst: Dst)278 fn set_bar_dst(&mut self, range: Range<usize>, dst: Dst) {
279 self.set_bar_reg(range, *dst.as_reg().unwrap());
280 }
281
set_bar_src(&mut self, range: Range<usize>, src: Src)282 fn set_bar_src(&mut self, range: Range<usize>, src: Src) {
283 assert!(src.src_mod.is_none());
284 self.set_bar_reg(range, *src.src_ref.as_reg().unwrap());
285 }
286
set_instr_deps(&mut self, deps: &InstrDeps)287 fn set_instr_deps(&mut self, deps: &InstrDeps) {
288 self.set_field(105..109, deps.delay);
289 self.set_bit(109, deps.yld);
290 self.set_field(110..113, deps.wr_bar().unwrap_or(7));
291 self.set_field(113..116, deps.rd_bar().unwrap_or(7));
292 self.set_field(116..122, deps.wt_bar_mask);
293 self.set_field(122..126, deps.reuse_mask);
294 }
295 }
296
297 //
298 // Helpers for encoding of ALU instructions
299 //
300
/// A register operand of an ALU instruction together with its modifiers.
struct ALURegRef {
    /// The register being read.
    pub reg: RegRef,
    /// Whether the source modifier includes an absolute value.
    pub abs: bool,
    /// Whether the source modifier includes a negation (this is also set for
    /// integer negation and bitwise not, see `src_mod_has_neg`).
    pub neg: bool,
    /// Swizzle applied to the source; only encoded for fp16 ALU ops.
    pub swizzle: SrcSwizzle,
}
307
/// A constant-buffer operand of an ALU instruction together with its
/// modifiers.
struct ALUCBufRef {
    /// The constant-buffer location being read.
    pub cb: CBufRef,
    /// Whether the source modifier includes an absolute value.
    pub abs: bool,
    /// Whether the source modifier includes a negation (this is also set for
    /// integer negation and bitwise not, see `src_mod_has_neg`).
    pub neg: bool,
    /// Swizzle applied to the source; only encoded for fp16 ALU ops.
    pub swizzle: SrcSwizzle,
}
314
/// An ALU source operand, classified by how it gets encoded.
enum ALUSrc {
    /// The instruction has no source in this position.
    None,
    /// A 32-bit immediate.
    Imm32(u32),
    /// A register from the op's own register file: a GPR for regular ops, a
    /// UGPR for uniform ops.
    Reg(ALURegRef),
    /// A UGPR read by a non-uniform op.
    UReg(ALURegRef),
    /// A constant-buffer value.
    CBuf(ALUCBufRef),
}
322
src_is_zero_or_gpr(src: &Src) -> bool323 fn src_is_zero_or_gpr(src: &Src) -> bool {
324 match src.src_ref {
325 SrcRef::Zero => true,
326 SrcRef::Reg(reg) => reg.file() == RegFile::GPR,
327 _ => false,
328 }
329 }
330
src_mod_has_abs(src_mod: SrcMod) -> bool331 fn src_mod_has_abs(src_mod: SrcMod) -> bool {
332 match src_mod {
333 SrcMod::None | SrcMod::FNeg | SrcMod::INeg | SrcMod::BNot => false,
334 SrcMod::FAbs | SrcMod::FNegAbs => true,
335 }
336 }
337
src_mod_has_neg(src_mod: SrcMod) -> bool338 fn src_mod_has_neg(src_mod: SrcMod) -> bool {
339 match src_mod {
340 SrcMod::None | SrcMod::FAbs => false,
341 SrcMod::FNeg | SrcMod::FNegAbs | SrcMod::INeg | SrcMod::BNot => true,
342 }
343 }
344
src_mod_is_bnot(src_mod: SrcMod) -> bool345 fn src_mod_is_bnot(src_mod: SrcMod) -> bool {
346 match src_mod {
347 SrcMod::None => false,
348 SrcMod::BNot => true,
349 _ => panic!("Not an predicate source modifier"),
350 }
351 }
352
dst_is_bar(dst: Dst) -> bool353 fn dst_is_bar(dst: Dst) -> bool {
354 match dst {
355 Dst::None => false,
356 Dst::SSA(ssa) => ssa.file().unwrap() == RegFile::Bar,
357 Dst::Reg(reg) => reg.file() == RegFile::Bar,
358 }
359 }
360
361 impl ALUSrc {
from_src(src: Option<&Src>, op_is_uniform: bool) -> ALUSrc362 fn from_src(src: Option<&Src>, op_is_uniform: bool) -> ALUSrc {
363 let Some(src) = src else {
364 return ALUSrc::None;
365 };
366
367 match src.src_ref {
368 SrcRef::Zero | SrcRef::Reg(_) => {
369 let reg = match src.src_ref {
370 SrcRef::Zero => {
371 let file = if op_is_uniform {
372 RegFile::UGPR
373 } else {
374 RegFile::GPR
375 };
376 RegRef::zero(file, 1)
377 }
378 SrcRef::Reg(reg) => reg,
379 _ => panic!("Invalid source ref"),
380 };
381 assert!(reg.comps() <= 2);
382 let alu_ref = ALURegRef {
383 reg: reg,
384 abs: src_mod_has_abs(src.src_mod),
385 neg: src_mod_has_neg(src.src_mod),
386 swizzle: src.src_swizzle,
387 };
388 if op_is_uniform {
389 assert!(reg.file() == RegFile::UGPR);
390 ALUSrc::Reg(alu_ref)
391 } else {
392 match reg.file() {
393 RegFile::GPR => ALUSrc::Reg(alu_ref),
394 RegFile::UGPR => ALUSrc::UReg(alu_ref),
395 _ => panic!("Invalid ALU register file"),
396 }
397 }
398 }
399 SrcRef::Imm32(i) => {
400 assert!(src.src_mod.is_none());
401 assert!(src.src_swizzle.is_none());
402 ALUSrc::Imm32(i)
403 }
404 SrcRef::CBuf(cb) => {
405 let alu_ref = ALUCBufRef {
406 cb: cb,
407 abs: src_mod_has_abs(src.src_mod),
408 neg: src_mod_has_neg(src.src_mod),
409 swizzle: src.src_swizzle,
410 };
411 ALUSrc::CBuf(alu_ref)
412 }
413 _ => panic!("Invalid ALU source"),
414 }
415 }
416
has_src_mod(&self) -> bool417 pub fn has_src_mod(&self) -> bool {
418 match self {
419 ALUSrc::Reg(reg) | ALUSrc::UReg(reg) => reg.abs || reg.neg,
420 ALUSrc::CBuf(cb) => cb.abs || cb.neg,
421 _ => false,
422 }
423 }
424 }
425
426 impl SM70Encoder<'_> {
set_swizzle(&mut self, range: Range<usize>, swizzle: SrcSwizzle)427 fn set_swizzle(&mut self, range: Range<usize>, swizzle: SrcSwizzle) {
428 assert!(range.len() == 2);
429
430 self.set_field(
431 range,
432 match swizzle {
433 SrcSwizzle::None => 0x00_u8,
434 SrcSwizzle::Xx => 0x02_u8,
435 SrcSwizzle::Yy => 0x03_u8,
436 },
437 );
438 }
439
set_alu_reg( &mut self, range: Range<usize>, abs_bit: usize, neg_bit: usize, swizzle_range: Range<usize>, file: RegFile, is_fp16_alu: bool, has_mod: bool, reg: &ALURegRef, )440 fn set_alu_reg(
441 &mut self,
442 range: Range<usize>,
443 abs_bit: usize,
444 neg_bit: usize,
445 swizzle_range: Range<usize>,
446 file: RegFile,
447 is_fp16_alu: bool,
448 has_mod: bool,
449 reg: &ALURegRef,
450 ) {
451 match file {
452 RegFile::GPR => self.set_reg(range, reg.reg),
453 RegFile::UGPR => self.set_ureg(range, reg.reg),
454 _ => panic!("Invalid ALU src register file"),
455 }
456
457 if has_mod {
458 self.set_bit(abs_bit, reg.abs);
459 self.set_bit(neg_bit, reg.neg);
460 } else {
461 assert!(!reg.abs && !reg.neg);
462 }
463
464 if is_fp16_alu {
465 self.set_swizzle(swizzle_range, reg.swizzle);
466 } else {
467 assert!(reg.swizzle == SrcSwizzle::None);
468 }
469 }
470
encode_alu_src0( &mut self, src: &ALUSrc, file: RegFile, is_fp16_alu: bool, )471 fn encode_alu_src0(
472 &mut self,
473 src: &ALUSrc,
474 file: RegFile,
475 is_fp16_alu: bool,
476 ) {
477 let reg = match src {
478 ALUSrc::None => return,
479 ALUSrc::Reg(reg) => reg,
480 _ => panic!("Invalid ALU src"),
481 };
482 self.set_alu_reg(24..32, 73, 72, 74..76, file, is_fp16_alu, true, reg);
483 }
484
encode_alu_src2( &mut self, src: &ALUSrc, file: RegFile, is_fp16_alu: bool, bit74_75_are_mod: bool, )485 fn encode_alu_src2(
486 &mut self,
487 src: &ALUSrc,
488 file: RegFile,
489 is_fp16_alu: bool,
490 bit74_75_are_mod: bool,
491 ) {
492 let reg = match src {
493 ALUSrc::None => return,
494 ALUSrc::Reg(reg) => reg,
495 _ => panic!("Invalid ALU src"),
496 };
497 self.set_alu_reg(
498 64..72,
499 74,
500 75,
501 81..83,
502 file,
503 is_fp16_alu,
504 bit74_75_are_mod,
505 reg,
506 );
507 }
508
encode_alu_reg(&mut self, reg: &ALURegRef, is_fp16_alu: bool)509 fn encode_alu_reg(&mut self, reg: &ALURegRef, is_fp16_alu: bool) {
510 self.set_alu_reg(
511 32..40,
512 62,
513 63,
514 60..62,
515 RegFile::GPR,
516 is_fp16_alu,
517 true,
518 reg,
519 );
520 }
521
encode_alu_ureg(&mut self, reg: &ALURegRef, is_fp16_alu: bool)522 fn encode_alu_ureg(&mut self, reg: &ALURegRef, is_fp16_alu: bool) {
523 self.set_ureg(32..40, reg.reg);
524 self.set_bit(62, reg.abs);
525 self.set_bit(63, reg.neg);
526
527 if is_fp16_alu {
528 self.set_swizzle(60..62, reg.swizzle);
529 } else {
530 assert!(reg.swizzle == SrcSwizzle::None);
531 }
532
533 self.set_bit(91, true);
534 }
535
encode_alu_imm(&mut self, imm: &u32)536 fn encode_alu_imm(&mut self, imm: &u32) {
537 self.set_field(32..64, *imm);
538 }
539
encode_alu_cb(&mut self, cb: &ALUCBufRef, is_fp16_alu: bool)540 fn encode_alu_cb(&mut self, cb: &ALUCBufRef, is_fp16_alu: bool) {
541 self.set_src_cb(32..59, 91, &cb.cb);
542 self.set_bit(62, cb.abs);
543 self.set_bit(63, cb.neg);
544
545 if is_fp16_alu {
546 self.set_swizzle(60..62, cb.swizzle);
547 } else {
548 assert!(cb.swizzle == SrcSwizzle::None);
549 }
550 }
551
encode_alu_base( &mut self, opcode: u16, dst: Option<&Dst>, src0: Option<&Src>, src1: Option<&Src>, src2: Option<&Src>, is_fp16_alu: bool, )552 fn encode_alu_base(
553 &mut self,
554 opcode: u16,
555 dst: Option<&Dst>,
556 src0: Option<&Src>,
557 src1: Option<&Src>,
558 src2: Option<&Src>,
559 is_fp16_alu: bool,
560 ) {
561 if let Some(dst) = dst {
562 self.set_dst(*dst);
563 }
564
565 let src0 = ALUSrc::from_src(src0, false);
566 let src1 = ALUSrc::from_src(src1, false);
567 let src2 = ALUSrc::from_src(src2, false);
568
569 // Bits 74..76 are used both for the swizzle on src0 and for the source
570 // modifier for the register source of src1 and src2. When both are
571 // registers, it's used for src2. The hardware elects to always support
572 // a swizzle and not support source modifiers in that case.
573 let bit74_75_are_mod = !is_fp16_alu
574 || matches!(src1, ALUSrc::None)
575 || matches!(src2, ALUSrc::None);
576 debug_assert!(bit74_75_are_mod || !src0.has_src_mod());
577
578 self.encode_alu_src0(&src0, RegFile::GPR, is_fp16_alu);
579
580 let form = match &src2 {
581 ALUSrc::None | ALUSrc::Reg(_) => {
582 self.encode_alu_src2(
583 &src2,
584 RegFile::GPR,
585 is_fp16_alu,
586 bit74_75_are_mod,
587 );
588 match &src1 {
589 ALUSrc::None => 1_u8, // form
590 ALUSrc::Reg(reg1) => {
591 self.encode_alu_reg(reg1, is_fp16_alu);
592 1_u8 // form
593 }
594 ALUSrc::UReg(reg1) => {
595 self.encode_alu_ureg(reg1, is_fp16_alu);
596 6_u8 // form
597 }
598 ALUSrc::Imm32(imm1) => {
599 self.encode_alu_imm(imm1);
600 4_u8 // form
601 }
602 ALUSrc::CBuf(cb1) => {
603 self.encode_alu_cb(cb1, is_fp16_alu);
604 5_u8 // form
605 }
606 }
607 }
608 ALUSrc::UReg(reg2) => {
609 self.encode_alu_ureg(reg2, is_fp16_alu);
610 self.encode_alu_src2(
611 &src1,
612 RegFile::GPR,
613 is_fp16_alu,
614 bit74_75_are_mod,
615 );
616 7_u8 // form
617 }
618 ALUSrc::Imm32(imm2) => {
619 self.encode_alu_imm(imm2);
620 self.encode_alu_src2(
621 &src1,
622 RegFile::GPR,
623 is_fp16_alu,
624 bit74_75_are_mod,
625 );
626 2_u8 // form
627 }
628 ALUSrc::CBuf(cb2) => {
629 // TODO set_src_cx
630 self.encode_alu_cb(cb2, is_fp16_alu);
631 self.encode_alu_src2(
632 &src1,
633 RegFile::GPR,
634 is_fp16_alu,
635 bit74_75_are_mod,
636 );
637 3_u8 // form
638 }
639 };
640
641 self.set_field(0..9, opcode);
642 self.set_field(9..12, form);
643 }
644
encode_alu( &mut self, opcode: u16, dst: Option<&Dst>, src0: Option<&Src>, src1: Option<&Src>, src2: Option<&Src>, )645 fn encode_alu(
646 &mut self,
647 opcode: u16,
648 dst: Option<&Dst>,
649 src0: Option<&Src>,
650 src1: Option<&Src>,
651 src2: Option<&Src>,
652 ) {
653 self.encode_alu_base(opcode, dst, src0, src1, src2, false);
654 }
655
encode_fp16_alu( &mut self, opcode: u16, dst: Option<&Dst>, src0: Option<&Src>, src1: Option<&Src>, src2: Option<&Src>, )656 fn encode_fp16_alu(
657 &mut self,
658 opcode: u16,
659 dst: Option<&Dst>,
660 src0: Option<&Src>,
661 src1: Option<&Src>,
662 src2: Option<&Src>,
663 ) {
664 self.encode_alu_base(opcode, dst, src0, src1, src2, true);
665 }
666
encode_ualu( &mut self, opcode: u16, dst: Option<&Dst>, src0: Option<&Src>, src1: Option<&Src>, src2: Option<&Src>, )667 fn encode_ualu(
668 &mut self,
669 opcode: u16,
670 dst: Option<&Dst>,
671 src0: Option<&Src>,
672 src1: Option<&Src>,
673 src2: Option<&Src>,
674 ) {
675 if let Some(dst) = dst {
676 self.set_udst(*dst);
677 }
678
679 let src0 = ALUSrc::from_src(src0, true);
680 let src1 = ALUSrc::from_src(src1, true);
681 let src2 = ALUSrc::from_src(src2, true);
682
683 // All uniform ALU requires bit 91 set
684 self.set_bit(91, true);
685
686 self.encode_alu_src0(&src0, RegFile::UGPR, false);
687 let form = match &src2 {
688 ALUSrc::None | ALUSrc::Reg(_) => {
689 self.encode_alu_src2(&src2, RegFile::UGPR, false, true);
690 match &src1 {
691 ALUSrc::None => 1_u8, // form
692 ALUSrc::Reg(reg1) => {
693 self.encode_alu_ureg(reg1, false);
694 1_u8 // form
695 }
696 ALUSrc::UReg(_) => panic!("UALU never has UReg"),
697 ALUSrc::Imm32(imm1) => {
698 self.encode_alu_imm(imm1);
699 4_u8 // form
700 }
701 ALUSrc::CBuf(_) => panic!("UALU does not support cbufs"),
702 }
703 }
704 ALUSrc::UReg(_) => panic!("UALU never has UReg"),
705 ALUSrc::Imm32(imm2) => {
706 self.encode_alu_imm(imm2);
707 self.encode_alu_src2(&src1, RegFile::UGPR, false, true);
708 2_u8 // form
709 }
710 ALUSrc::CBuf(_) => panic!("UALU does not support cbufs"),
711 };
712
713 self.set_field(0..9, opcode);
714 self.set_field(9..12, form);
715 }
716
set_rnd_mode(&mut self, range: Range<usize>, rnd_mode: FRndMode)717 fn set_rnd_mode(&mut self, range: Range<usize>, rnd_mode: FRndMode) {
718 assert!(range.len() == 2);
719 self.set_field(
720 range,
721 match rnd_mode {
722 FRndMode::NearestEven => 0_u8,
723 FRndMode::NegInf => 1_u8,
724 FRndMode::PosInf => 2_u8,
725 FRndMode::Zero => 3_u8,
726 },
727 );
728 }
729 }
730
731 //
732 // Legalization helpers
733 //
734
op_gpr(op: &impl DstsAsSlice) -> RegFile735 fn op_gpr(op: &impl DstsAsSlice) -> RegFile {
736 if op.is_uniform() {
737 RegFile::UGPR
738 } else {
739 RegFile::GPR
740 }
741 }
742
743 /// Helper to legalize extended or external instructions
744 ///
745 /// These are instructions which reach out external units such as load/store
746 /// and texture ops. They typically can't take anything but GPRs and are the
747 /// only types of instructions that support vectors. They also can never be
748 /// uniform so we always evict uniform sources.
749 ///
legalize_ext_instr(op: &mut impl SrcsAsSlice, b: &mut LegalizeBuilder)750 fn legalize_ext_instr(op: &mut impl SrcsAsSlice, b: &mut LegalizeBuilder) {
751 let src_types = op.src_types();
752 for (i, src) in op.srcs_as_mut_slice().iter_mut().enumerate() {
753 match src_types[i] {
754 SrcType::SSA | SrcType::GPR => match &mut src.src_ref {
755 SrcRef::Zero | SrcRef::True | SrcRef::False => {
756 assert!(src_types[i] != SrcType::SSA);
757 }
758 SrcRef::SSA(ssa) => {
759 b.copy_ssa_ref_if_uniform(ssa);
760 }
761 _ => panic!("Unsupported source reference"),
762 },
763 SrcType::ALU
764 | SrcType::F16
765 | SrcType::F16v2
766 | SrcType::F32
767 | SrcType::F64
768 | SrcType::I32
769 | SrcType::B32 => {
770 panic!("ALU srcs must be legalized explicitly");
771 }
772 SrcType::Pred => {
773 panic!("Predicates must be legalized explicitly");
774 }
775 SrcType::Carry => {
776 panic!("Carry is invalid on Volta+");
777 }
778 SrcType::Bar => (),
779 }
780 }
781 }
782
783 //
784 // Implementations of SM70Op for each op we support on Volta+
785 //
786
787 impl SM70Op for OpFAdd {
legalize(&mut self, b: &mut LegalizeBuilder)788 fn legalize(&mut self, b: &mut LegalizeBuilder) {
789 let gpr = op_gpr(self);
790 let [src0, src1] = &mut self.srcs;
791 swap_srcs_if_not_reg(src0, src1, gpr);
792 b.copy_alu_src_if_not_reg(src0, gpr, SrcType::F32);
793 }
794
encode(&self, e: &mut SM70Encoder<'_>)795 fn encode(&self, e: &mut SM70Encoder<'_>) {
796 if src_is_zero_or_gpr(&self.srcs[1]) {
797 e.encode_alu(
798 0x021,
799 Some(&self.dst),
800 Some(&self.srcs[0]),
801 Some(&self.srcs[1]),
802 None,
803 )
804 } else {
805 e.encode_alu(
806 0x021,
807 Some(&self.dst),
808 Some(&self.srcs[0]),
809 Some(&Src::new_zero()),
810 Some(&self.srcs[1]),
811 )
812 };
813 e.set_bit(77, self.saturate);
814 e.set_rnd_mode(78..80, self.rnd_mode);
815 e.set_bit(80, self.ftz);
816 }
817 }
818
819 impl SM70Op for OpFFma {
legalize(&mut self, b: &mut LegalizeBuilder)820 fn legalize(&mut self, b: &mut LegalizeBuilder) {
821 let gpr = op_gpr(self);
822 let [src0, src1, src2] = &mut self.srcs;
823 swap_srcs_if_not_reg(src0, src1, gpr);
824 b.copy_alu_src_if_not_reg(src0, gpr, SrcType::F32);
825 b.copy_alu_src_if_both_not_reg(src1, src2, gpr, SrcType::F32);
826 }
827
encode(&self, e: &mut SM70Encoder<'_>)828 fn encode(&self, e: &mut SM70Encoder<'_>) {
829 e.encode_alu(
830 0x023,
831 Some(&self.dst),
832 Some(&self.srcs[0]),
833 Some(&self.srcs[1]),
834 Some(&self.srcs[2]),
835 );
836 e.set_bit(76, self.dnz);
837 e.set_bit(77, self.saturate);
838 e.set_rnd_mode(78..80, self.rnd_mode);
839 e.set_bit(80, self.ftz);
840 }
841 }
842
843 impl SM70Op for OpFMnMx {
legalize(&mut self, b: &mut LegalizeBuilder)844 fn legalize(&mut self, b: &mut LegalizeBuilder) {
845 let gpr = op_gpr(self);
846 let [src0, src1] = &mut self.srcs;
847 swap_srcs_if_not_reg(src0, src1, gpr);
848 b.copy_alu_src_if_not_reg(src0, gpr, SrcType::F32);
849 }
850
encode(&self, e: &mut SM70Encoder<'_>)851 fn encode(&self, e: &mut SM70Encoder<'_>) {
852 e.encode_alu(
853 0x009,
854 Some(&self.dst),
855 Some(&self.srcs[0]),
856 Some(&self.srcs[1]),
857 Some(&Src::new_zero()),
858 );
859 e.set_pred_src(87..90, 90, self.min);
860 e.set_bit(80, self.ftz);
861 }
862 }
863
864 impl SM70Op for OpFMul {
legalize(&mut self, b: &mut LegalizeBuilder)865 fn legalize(&mut self, b: &mut LegalizeBuilder) {
866 let gpr = op_gpr(self);
867 let [src0, src1] = &mut self.srcs;
868 swap_srcs_if_not_reg(src0, src1, gpr);
869 b.copy_alu_src_if_not_reg(src0, gpr, SrcType::F32);
870 }
871
encode(&self, e: &mut SM70Encoder<'_>)872 fn encode(&self, e: &mut SM70Encoder<'_>) {
873 e.encode_alu(
874 0x020,
875 Some(&self.dst),
876 Some(&self.srcs[0]),
877 Some(&self.srcs[1]),
878 Some(&Src::new_zero()),
879 );
880 e.set_bit(76, self.dnz);
881 e.set_bit(77, self.saturate);
882 e.set_rnd_mode(78..80, self.rnd_mode);
883 e.set_bit(80, self.ftz);
884 e.set_field(84..87, 0x4_u8); // TODO: PDIV
885 }
886 }
887
888 impl SM70Encoder<'_> {
set_float_cmp_op(&mut self, range: Range<usize>, op: FloatCmpOp)889 fn set_float_cmp_op(&mut self, range: Range<usize>, op: FloatCmpOp) {
890 assert!(range.len() == 4);
891 self.set_field(
892 range,
893 match op {
894 FloatCmpOp::OrdLt => 0x01_u8,
895 FloatCmpOp::OrdEq => 0x02_u8,
896 FloatCmpOp::OrdLe => 0x03_u8,
897 FloatCmpOp::OrdGt => 0x04_u8,
898 FloatCmpOp::OrdNe => 0x05_u8,
899 FloatCmpOp::OrdGe => 0x06_u8,
900 FloatCmpOp::UnordLt => 0x09_u8,
901 FloatCmpOp::UnordEq => 0x0a_u8,
902 FloatCmpOp::UnordLe => 0x0b_u8,
903 FloatCmpOp::UnordGt => 0x0c_u8,
904 FloatCmpOp::UnordNe => 0x0d_u8,
905 FloatCmpOp::UnordGe => 0x0e_u8,
906 FloatCmpOp::IsNum => 0x07_u8,
907 FloatCmpOp::IsNan => 0x08_u8,
908 },
909 );
910 }
911
set_pred_set_op(&mut self, range: Range<usize>, op: PredSetOp)912 fn set_pred_set_op(&mut self, range: Range<usize>, op: PredSetOp) {
913 assert!(range.len() == 2);
914 self.set_field(
915 range,
916 match op {
917 PredSetOp::And => 0_u8,
918 PredSetOp::Or => 1_u8,
919 PredSetOp::Xor => 2_u8,
920 },
921 );
922 }
923
set_int_cmp_op(&mut self, range: Range<usize>, op: IntCmpOp)924 fn set_int_cmp_op(&mut self, range: Range<usize>, op: IntCmpOp) {
925 assert!(range.len() == 3);
926 self.set_field(
927 range,
928 match op {
929 IntCmpOp::Eq => 2_u8,
930 IntCmpOp::Ne => 5_u8,
931 IntCmpOp::Lt => 1_u8,
932 IntCmpOp::Le => 3_u8,
933 IntCmpOp::Gt => 4_u8,
934 IntCmpOp::Ge => 6_u8,
935 },
936 );
937 }
938 }
939
940 impl SM70Op for OpFSet {
legalize(&mut self, b: &mut LegalizeBuilder)941 fn legalize(&mut self, b: &mut LegalizeBuilder) {
942 let gpr = op_gpr(self);
943 let [src0, src1] = &mut self.srcs;
944 if !src_is_reg(src0, gpr) && src_is_reg(src1, gpr) {
945 std::mem::swap(src0, src1);
946 self.cmp_op = self.cmp_op.flip();
947 }
948 b.copy_alu_src_if_not_reg(src0, gpr, SrcType::F32);
949 }
950
encode(&self, e: &mut SM70Encoder<'_>)951 fn encode(&self, e: &mut SM70Encoder<'_>) {
952 e.encode_alu(
953 0x00a,
954 Some(&self.dst),
955 Some(&self.srcs[0]),
956 Some(&self.srcs[1]),
957 None,
958 );
959 e.set_float_cmp_op(76..80, self.cmp_op);
960 e.set_bit(80, self.ftz);
961 e.set_field(87..90, 0x7_u8); // TODO: src predicate
962 }
963 }
964
965 impl SM70Op for OpFSetP {
legalize(&mut self, b: &mut LegalizeBuilder)966 fn legalize(&mut self, b: &mut LegalizeBuilder) {
967 let gpr = op_gpr(self);
968 let [src0, src1] = &mut self.srcs;
969 if !src_is_reg(src0, gpr) && src_is_reg(src1, gpr) {
970 std::mem::swap(src0, src1);
971 self.cmp_op = self.cmp_op.flip();
972 }
973 b.copy_alu_src_if_not_reg(src0, gpr, SrcType::F32);
974 }
975
encode(&self, e: &mut SM70Encoder<'_>)976 fn encode(&self, e: &mut SM70Encoder<'_>) {
977 e.encode_alu(
978 0x00b,
979 None,
980 Some(&self.srcs[0]),
981 Some(&self.srcs[1]),
982 None,
983 );
984
985 e.set_pred_set_op(74..76, self.set_op);
986 e.set_float_cmp_op(76..80, self.cmp_op);
987 e.set_bit(80, self.ftz);
988
989 e.set_pred_dst(81..84, self.dst);
990 e.set_pred_dst(84..87, Dst::None); // dst1
991
992 e.set_pred_src(87..90, 90, self.accum);
993 }
994 }
995
996 impl SM70Op for OpFSwzAdd {
legalize(&mut self, b: &mut LegalizeBuilder)997 fn legalize(&mut self, b: &mut LegalizeBuilder) {
998 let gpr = op_gpr(self);
999 let [src0, src1] = &mut self.srcs;
1000 b.copy_alu_src_if_not_reg(src0, gpr, SrcType::F32);
1001 b.copy_alu_src_if_not_reg(src1, gpr, SrcType::F32);
1002 }
1003
encode(&self, e: &mut SM70Encoder<'_>)1004 fn encode(&self, e: &mut SM70Encoder<'_>) {
1005 e.set_opcode(0x822);
1006 e.set_dst(self.dst);
1007
1008 e.set_reg_src(24..32, self.srcs[0]);
1009 e.set_reg_src(64..72, self.srcs[1]);
1010
1011 let mut subop = 0x0_u8;
1012
1013 for (i, swz_op) in self.ops.iter().enumerate() {
1014 let swz_op = match swz_op {
1015 FSwzAddOp::Add => 0,
1016 FSwzAddOp::SubRight => 2,
1017 FSwzAddOp::SubLeft => 1,
1018 FSwzAddOp::MoveLeft => 3,
1019 };
1020
1021 subop |= swz_op << ((self.ops.len() - i - 1) * 2);
1022 }
1023
1024 e.set_field(32..40, subop);
1025
1026 e.set_bit(77, false); // NDV
1027 e.set_rnd_mode(78..80, self.rnd_mode);
1028 e.set_bit(80, self.ftz);
1029 }
1030 }
1031
1032 impl SM70Op for OpMuFu {
legalize(&mut self, _b: &mut LegalizeBuilder)1033 fn legalize(&mut self, _b: &mut LegalizeBuilder) {
1034 // Nothing to do
1035 }
1036
encode(&self, e: &mut SM70Encoder<'_>)1037 fn encode(&self, e: &mut SM70Encoder<'_>) {
1038 e.encode_alu(0x108, Some(&self.dst), None, Some(&self.src), None);
1039 e.set_field(
1040 74..80,
1041 match self.op {
1042 MuFuOp::Cos => 0_u8,
1043 MuFuOp::Sin => 1_u8,
1044 MuFuOp::Exp2 => 2_u8,
1045 MuFuOp::Log2 => 3_u8,
1046 MuFuOp::Rcp => 4_u8,
1047 MuFuOp::Rsq => 5_u8,
1048 MuFuOp::Rcp64H => 6_u8,
1049 MuFuOp::Rsq64H => 7_u8,
1050 MuFuOp::Sqrt => 8_u8,
1051 MuFuOp::Tanh => 9_u8,
1052 },
1053 );
1054 }
1055 }
1056
1057 impl SM70Op for OpDAdd {
legalize(&mut self, b: &mut LegalizeBuilder)1058 fn legalize(&mut self, b: &mut LegalizeBuilder) {
1059 let gpr = op_gpr(self);
1060 let [src0, src1] = &mut self.srcs;
1061 swap_srcs_if_not_reg(src0, src1, gpr);
1062 b.copy_alu_src_if_not_reg(src0, gpr, SrcType::F64);
1063 }
1064
encode(&self, e: &mut SM70Encoder<'_>)1065 fn encode(&self, e: &mut SM70Encoder<'_>) {
1066 e.encode_alu(
1067 0x029,
1068 Some(&self.dst),
1069 Some(&self.srcs[0]),
1070 None,
1071 Some(&self.srcs[1]),
1072 );
1073 e.set_rnd_mode(78..80, self.rnd_mode);
1074 }
1075 }
1076
1077 impl SM70Op for OpDFma {
legalize(&mut self, b: &mut LegalizeBuilder)1078 fn legalize(&mut self, b: &mut LegalizeBuilder) {
1079 let gpr = op_gpr(self);
1080 let [src0, src1, src2] = &mut self.srcs;
1081 swap_srcs_if_not_reg(src0, src1, gpr);
1082 b.copy_alu_src_if_not_reg(src0, gpr, SrcType::F64);
1083 b.copy_alu_src_if_both_not_reg(src1, src2, gpr, SrcType::F64);
1084 }
1085
encode(&self, e: &mut SM70Encoder<'_>)1086 fn encode(&self, e: &mut SM70Encoder<'_>) {
1087 e.encode_alu(
1088 0x02b,
1089 Some(&self.dst),
1090 Some(&self.srcs[0]),
1091 Some(&self.srcs[1]),
1092 Some(&self.srcs[2]),
1093 );
1094 e.set_rnd_mode(78..80, self.rnd_mode);
1095 }
1096 }
1097
1098 impl SM70Op for OpDMul {
legalize(&mut self, b: &mut LegalizeBuilder)1099 fn legalize(&mut self, b: &mut LegalizeBuilder) {
1100 let gpr = op_gpr(self);
1101 let [src0, src1] = &mut self.srcs;
1102 swap_srcs_if_not_reg(src0, src1, gpr);
1103 b.copy_alu_src_if_not_reg(src0, gpr, SrcType::F64);
1104 }
1105
encode(&self, e: &mut SM70Encoder<'_>)1106 fn encode(&self, e: &mut SM70Encoder<'_>) {
1107 e.encode_alu(
1108 0x028,
1109 Some(&self.dst),
1110 Some(&self.srcs[0]),
1111 Some(&self.srcs[1]),
1112 None,
1113 );
1114 e.set_rnd_mode(78..80, self.rnd_mode);
1115 }
1116 }
1117
1118 impl SM70Op for OpDSetP {
legalize(&mut self, b: &mut LegalizeBuilder)1119 fn legalize(&mut self, b: &mut LegalizeBuilder) {
1120 let gpr = op_gpr(self);
1121 let [src0, src1] = &mut self.srcs;
1122 if !src_is_reg(src0, gpr) && src_is_reg(src1, gpr) {
1123 std::mem::swap(src0, src1);
1124 self.cmp_op = self.cmp_op.flip();
1125 }
1126 b.copy_alu_src_if_not_reg(src0, gpr, SrcType::F64);
1127 }
1128
encode(&self, e: &mut SM70Encoder<'_>)1129 fn encode(&self, e: &mut SM70Encoder<'_>) {
1130 if src_is_zero_or_gpr(&self.srcs[1]) {
1131 e.encode_alu(
1132 0x02a,
1133 None,
1134 Some(&self.srcs[0]),
1135 Some(&self.srcs[1]),
1136 None,
1137 )
1138 } else {
1139 e.encode_alu(
1140 0x02a,
1141 None,
1142 Some(&self.srcs[0]),
1143 None,
1144 Some(&self.srcs[1]),
1145 )
1146 };
1147
1148 e.set_pred_set_op(74..76, self.set_op);
1149 e.set_float_cmp_op(76..80, self.cmp_op);
1150
1151 e.set_pred_dst(81..84, self.dst);
1152 e.set_pred_dst(84..87, Dst::None); /* dst1 */
1153
1154 e.set_pred_src(87..90, 90, self.accum);
1155 }
1156 }
1157
1158 impl SM70Op for OpHAdd2 {
legalize(&mut self, b: &mut LegalizeBuilder)1159 fn legalize(&mut self, b: &mut LegalizeBuilder) {
1160 let gpr = op_gpr(self);
1161 let [src0, src1] = &mut self.srcs;
1162 swap_srcs_if_not_reg(src0, src1, gpr);
1163 b.copy_alu_src_if_not_reg(src0, gpr, SrcType::F16v2);
1164 }
1165
encode(&self, e: &mut SM70Encoder<'_>)1166 fn encode(&self, e: &mut SM70Encoder<'_>) {
1167 if src_is_zero_or_gpr(&self.srcs[1]) {
1168 e.encode_fp16_alu(
1169 0x030,
1170 Some(&self.dst),
1171 Some(&self.srcs[0]),
1172 Some(&self.srcs[1]),
1173 None,
1174 )
1175 } else {
1176 e.encode_fp16_alu(
1177 0x030,
1178 Some(&self.dst),
1179 Some(&self.srcs[0]),
1180 None,
1181 Some(&self.srcs[1]),
1182 )
1183 };
1184
1185 e.set_bit(77, self.saturate);
1186 e.set_bit(78, self.f32);
1187 e.set_bit(80, self.ftz);
1188 e.set_bit(85, false); // .BF16_V2 (SM90+)
1189 }
1190 }
1191
1192 impl SM70Op for OpHFma2 {
legalize(&mut self, b: &mut LegalizeBuilder)1193 fn legalize(&mut self, b: &mut LegalizeBuilder) {
1194 let gpr = op_gpr(self);
1195 let [src0, src1, src2] = &mut self.srcs;
1196 swap_srcs_if_not_reg(src0, src1, gpr);
1197 b.copy_alu_src_if_not_reg(src0, gpr, SrcType::F16v2);
1198 b.copy_alu_src_if_not_reg(src1, gpr, SrcType::F16v2);
1199 b.copy_alu_src_if_both_not_reg(src1, src2, gpr, SrcType::F16v2);
1200
1201 // HFMA2 doesn't have fabs or fneg on SRC2.
1202 if !src2.src_mod.is_none() {
1203 b.copy_alu_src_and_lower_fmod(src2, SrcType::F16v2);
1204 }
1205 }
1206
encode(&self, e: &mut SM70Encoder<'_>)1207 fn encode(&self, e: &mut SM70Encoder<'_>) {
1208 // HFMA2 doesn't have fneg and fabs on SRC2.
1209 assert!(self.srcs[2].src_mod.is_none());
1210
1211 e.encode_fp16_alu(
1212 0x031,
1213 Some(&self.dst),
1214 Some(&self.srcs[0]),
1215 Some(&self.srcs[1]),
1216 Some(&self.srcs[2]),
1217 );
1218
1219 e.set_bit(76, self.dnz);
1220 e.set_bit(77, self.saturate);
1221 e.set_bit(78, self.f32);
1222 e.set_bit(79, false); // .RELU (SM86+)
1223 e.set_bit(80, self.ftz);
1224 e.set_bit(85, false); // .BF16_V2 (SM86+)
1225 }
1226 }
1227
1228 impl SM70Op for OpHMul2 {
legalize(&mut self, b: &mut LegalizeBuilder)1229 fn legalize(&mut self, b: &mut LegalizeBuilder) {
1230 let gpr = op_gpr(self);
1231 let [src0, src1] = &mut self.srcs;
1232 swap_srcs_if_not_reg(src0, src1, gpr);
1233 b.copy_alu_src_if_not_reg(src0, gpr, SrcType::F16v2);
1234 }
1235
encode(&self, e: &mut SM70Encoder<'_>)1236 fn encode(&self, e: &mut SM70Encoder<'_>) {
1237 e.encode_fp16_alu(
1238 0x032,
1239 Some(&self.dst),
1240 Some(&self.srcs[0]),
1241 Some(&self.srcs[1]),
1242 None,
1243 );
1244
1245 e.set_bit(76, self.dnz);
1246 e.set_bit(77, self.saturate);
1247 e.set_bit(78, false); // .F32 (SM70-SM75)
1248 e.set_bit(79, false); // .RELU (SM86+)
1249 e.set_bit(80, self.ftz);
1250 e.set_bit(85, false); // .BF16_V2 (SM90+)
1251 }
1252 }
1253
1254 impl SM70Op for OpHSet2 {
legalize(&mut self, b: &mut LegalizeBuilder)1255 fn legalize(&mut self, b: &mut LegalizeBuilder) {
1256 let gpr = op_gpr(self);
1257 let [src0, src1] = &mut self.srcs;
1258 if !src_is_reg(src0, gpr) && src_is_reg(src1, gpr) {
1259 std::mem::swap(src0, src1);
1260 self.cmp_op = self.cmp_op.flip();
1261 }
1262 b.copy_alu_src_if_not_reg(src0, gpr, SrcType::F16v2);
1263 }
1264
encode(&self, e: &mut SM70Encoder<'_>)1265 fn encode(&self, e: &mut SM70Encoder<'_>) {
1266 if src_is_zero_or_gpr(&self.srcs[1]) {
1267 e.encode_fp16_alu(
1268 0x033,
1269 Some(&self.dst),
1270 Some(&self.srcs[0]),
1271 Some(&self.srcs[1]),
1272 None,
1273 )
1274 } else {
1275 e.encode_fp16_alu(
1276 0x033,
1277 Some(&self.dst),
1278 Some(&self.srcs[0]),
1279 None,
1280 Some(&self.srcs[1]),
1281 )
1282 };
1283
1284 e.set_bit(65, false); // .BF16_V2 (SM90+)
1285 e.set_pred_set_op(69..71, self.set_op);
1286
1287 // This differentiate between integer and fp16 output
1288 e.set_bit(71, true); // .BF
1289 e.set_float_cmp_op(76..80, self.cmp_op);
1290 e.set_bit(80, self.ftz);
1291
1292 e.set_pred_src(87..90, 90, self.accum);
1293 }
1294 }
1295
1296 impl SM70Op for OpHSetP2 {
legalize(&mut self, b: &mut LegalizeBuilder)1297 fn legalize(&mut self, b: &mut LegalizeBuilder) {
1298 let gpr = op_gpr(self);
1299 let [src0, src1] = &mut self.srcs;
1300 if !src_is_reg(src0, gpr) && src_is_reg(src1, gpr) {
1301 std::mem::swap(src0, src1);
1302 self.cmp_op = self.cmp_op.flip();
1303 }
1304 b.copy_alu_src_if_not_reg(src0, gpr, SrcType::F16v2);
1305 }
1306
encode(&self, e: &mut SM70Encoder<'_>)1307 fn encode(&self, e: &mut SM70Encoder<'_>) {
1308 if src_is_zero_or_gpr(&self.srcs[1]) {
1309 e.encode_fp16_alu(
1310 0x034,
1311 None,
1312 Some(&self.srcs[0]),
1313 Some(&self.srcs[1]),
1314 None,
1315 )
1316 } else {
1317 e.encode_fp16_alu(
1318 0x034,
1319 None,
1320 Some(&self.srcs[0]),
1321 None,
1322 Some(&self.srcs[1]),
1323 )
1324 };
1325
1326 e.set_bit(65, false); // .BF16_V2 (SM90+)
1327 e.set_pred_set_op(69..71, self.set_op);
1328 e.set_bit(71, self.horizontal); // .H_AND
1329 e.set_float_cmp_op(76..80, self.cmp_op);
1330 e.set_bit(80, self.ftz);
1331
1332 e.set_pred_dst(81..84, self.dsts[0]);
1333 e.set_pred_dst(84..87, self.dsts[1]);
1334
1335 e.set_pred_src(87..90, 90, self.accum);
1336 }
1337 }
1338
1339 impl SM70Op for OpHMnMx2 {
legalize(&mut self, b: &mut LegalizeBuilder)1340 fn legalize(&mut self, b: &mut LegalizeBuilder) {
1341 let gpr = op_gpr(self);
1342 let [src0, src1] = &mut self.srcs;
1343 swap_srcs_if_not_reg(src0, src1, gpr);
1344 b.copy_alu_src_if_not_reg(src0, gpr, SrcType::F16v2);
1345 }
1346
encode(&self, e: &mut SM70Encoder<'_>)1347 fn encode(&self, e: &mut SM70Encoder<'_>) {
1348 assert!(e.sm.sm >= 80);
1349
1350 e.encode_fp16_alu(
1351 0x040,
1352 Some(&self.dst),
1353 Some(&self.srcs[0]),
1354 Some(&self.srcs[1]),
1355 None,
1356 );
1357
1358 // This differentiate between integer and fp16 output
1359 e.set_bit(78, false); // .F32 (SM86)
1360 e.set_bit(80, self.ftz);
1361 e.set_bit(81, false); // .NAN
1362 e.set_bit(82, false); // .XORSIGN
1363 e.set_bit(85, false); // .BF16_V2
1364
1365 e.set_pred_src(87..90, 90, self.min);
1366 }
1367 }
1368
1369 impl SM70Op for OpBMsk {
legalize(&mut self, b: &mut LegalizeBuilder)1370 fn legalize(&mut self, b: &mut LegalizeBuilder) {
1371 let gpr = op_gpr(self);
1372 b.copy_alu_src_if_not_reg(&mut self.pos, gpr, SrcType::ALU);
1373 }
1374
encode(&self, e: &mut SM70Encoder<'_>)1375 fn encode(&self, e: &mut SM70Encoder<'_>) {
1376 if self.is_uniform() {
1377 e.encode_ualu(
1378 0x09b,
1379 Some(&self.dst),
1380 Some(&self.pos),
1381 Some(&self.width),
1382 None,
1383 )
1384 } else {
1385 e.encode_alu(
1386 0x01b,
1387 Some(&self.dst),
1388 Some(&self.pos),
1389 Some(&self.width),
1390 None,
1391 )
1392 };
1393
1394 e.set_bit(75, self.wrap);
1395 }
1396 }
1397
1398 impl SM70Op for OpBRev {
legalize(&mut self, _b: &mut LegalizeBuilder)1399 fn legalize(&mut self, _b: &mut LegalizeBuilder) {
1400 // Nothing to do
1401 }
1402
encode(&self, e: &mut SM70Encoder<'_>)1403 fn encode(&self, e: &mut SM70Encoder<'_>) {
1404 if self.is_uniform() {
1405 e.encode_ualu(0x0be, Some(&self.dst), None, Some(&self.src), None)
1406 } else {
1407 e.encode_alu(0x101, Some(&self.dst), None, Some(&self.src), None)
1408 }
1409 }
1410 }
1411
1412 impl SM70Op for OpFlo {
legalize(&mut self, _b: &mut LegalizeBuilder)1413 fn legalize(&mut self, _b: &mut LegalizeBuilder) {
1414 // Nothing to do
1415 }
1416
encode(&self, e: &mut SM70Encoder<'_>)1417 fn encode(&self, e: &mut SM70Encoder<'_>) {
1418 if self.is_uniform() {
1419 e.encode_ualu(0x0bd, Some(&self.dst), None, Some(&self.src), None)
1420 } else {
1421 e.encode_alu(0x100, Some(&self.dst), None, Some(&self.src), None)
1422 };
1423 e.set_pred_dst(81..84, Dst::None);
1424 e.set_field(74..75, self.return_shift_amount as u8);
1425 e.set_field(73..74, self.signed as u8);
1426 let not_mod = matches!(self.src.src_mod, SrcMod::BNot);
1427 e.set_field(63..64, not_mod)
1428 }
1429 }
1430
1431 impl SM70Op for OpIAbs {
legalize(&mut self, _b: &mut LegalizeBuilder)1432 fn legalize(&mut self, _b: &mut LegalizeBuilder) {
1433 // Nothing to do
1434 }
1435
encode(&self, e: &mut SM70Encoder<'_>)1436 fn encode(&self, e: &mut SM70Encoder<'_>) {
1437 e.encode_alu(0x013, Some(&self.dst), None, Some(&self.src), None)
1438 }
1439 }
1440
1441 impl SM70Op for OpIAdd3 {
legalize(&mut self, b: &mut LegalizeBuilder)1442 fn legalize(&mut self, b: &mut LegalizeBuilder) {
1443 let gpr = op_gpr(self);
1444 let [src0, src1, src2] = &mut self.srcs;
1445 swap_srcs_if_not_reg(src0, src1, gpr);
1446 swap_srcs_if_not_reg(src2, src1, gpr);
1447 if !src0.src_mod.is_none() && !src1.src_mod.is_none() {
1448 assert!(self.overflow[0].is_none());
1449 assert!(self.overflow[1].is_none());
1450 let val = b.alloc_ssa(gpr, 1);
1451 b.push_op(OpIAdd3 {
1452 srcs: [Src::new_zero(), *src0, Src::new_zero()],
1453 overflow: [Dst::None; 2],
1454 dst: val.into(),
1455 });
1456 *src0 = val.into();
1457 }
1458 b.copy_alu_src_if_not_reg(src0, gpr, SrcType::I32);
1459 b.copy_alu_src_if_both_not_reg(src1, src2, gpr, SrcType::I32);
1460 if !self.overflow[0].is_none() || !self.overflow[1].is_none() {
1461 b.copy_alu_src_if_ineg_imm(src1, gpr, SrcType::I32);
1462 b.copy_alu_src_if_ineg_imm(src2, gpr, SrcType::I32);
1463 }
1464 }
1465
encode(&self, e: &mut SM70Encoder<'_>)1466 fn encode(&self, e: &mut SM70Encoder<'_>) {
1467 // Hardware requires at least one of these be unmodified
1468 assert!(
1469 self.srcs[0].src_mod.is_none() || self.srcs[1].src_mod.is_none()
1470 );
1471
1472 if self.is_uniform() {
1473 e.encode_ualu(
1474 0x090,
1475 Some(&self.dst),
1476 Some(&self.srcs[0]),
1477 Some(&self.srcs[1]),
1478 Some(&self.srcs[2]),
1479 )
1480 } else {
1481 e.encode_alu(
1482 0x010,
1483 Some(&self.dst),
1484 Some(&self.srcs[0]),
1485 Some(&self.srcs[1]),
1486 Some(&self.srcs[2]),
1487 )
1488 };
1489
1490 e.set_pred_src(87..90, 90, false.into());
1491 e.set_pred_src(77..80, 80, false.into());
1492
1493 e.set_pred_dst(81..84, self.overflow[0]);
1494 e.set_pred_dst(84..87, self.overflow[1]);
1495 }
1496 }
1497
1498 impl SM70Op for OpIAdd3X {
legalize(&mut self, b: &mut LegalizeBuilder)1499 fn legalize(&mut self, b: &mut LegalizeBuilder) {
1500 let gpr = op_gpr(self);
1501 let [src0, src1, src2] = &mut self.srcs;
1502 swap_srcs_if_not_reg(src0, src1, gpr);
1503 swap_srcs_if_not_reg(src2, src1, gpr);
1504 if !src0.src_mod.is_none() && !src1.src_mod.is_none() {
1505 let val = b.alloc_ssa(gpr, 1);
1506 b.push_op(OpIAdd3X {
1507 srcs: [Src::new_zero(), *src0, Src::new_zero()],
1508 overflow: [Dst::None; 2],
1509 dst: val.into(),
1510 carry: [false.into(); 2],
1511 });
1512 *src0 = val.into();
1513 }
1514 b.copy_alu_src_if_not_reg(src0, gpr, SrcType::B32);
1515 b.copy_alu_src_if_both_not_reg(src1, src2, gpr, SrcType::B32);
1516 if !self.is_uniform() {
1517 b.copy_src_if_upred(&mut self.carry[0]);
1518 b.copy_src_if_upred(&mut self.carry[1]);
1519 }
1520 }
1521
encode(&self, e: &mut SM70Encoder<'_>)1522 fn encode(&self, e: &mut SM70Encoder<'_>) {
1523 // Hardware requires at least one of these be unmodified
1524 assert!(
1525 self.srcs[0].src_mod.is_none() || self.srcs[1].src_mod.is_none()
1526 );
1527
1528 if self.is_uniform() {
1529 e.encode_ualu(
1530 0x090,
1531 Some(&self.dst),
1532 Some(&self.srcs[0]),
1533 Some(&self.srcs[1]),
1534 Some(&self.srcs[2]),
1535 );
1536
1537 e.set_upred_src(87..90, 90, self.carry[0]);
1538 e.set_upred_src(77..80, 80, self.carry[1]);
1539 } else {
1540 e.encode_alu(
1541 0x010,
1542 Some(&self.dst),
1543 Some(&self.srcs[0]),
1544 Some(&self.srcs[1]),
1545 Some(&self.srcs[2]),
1546 );
1547
1548 e.set_pred_src(87..90, 90, self.carry[0]);
1549 e.set_pred_src(77..80, 80, self.carry[1]);
1550 }
1551
1552 e.set_bit(74, true); // .X
1553
1554 e.set_pred_dst(81..84, self.overflow[0]);
1555 e.set_pred_dst(84..87, self.overflow[1]);
1556 }
1557 }
1558
1559 impl SM70Op for OpIDp4 {
legalize(&mut self, b: &mut LegalizeBuilder)1560 fn legalize(&mut self, b: &mut LegalizeBuilder) {
1561 let gpr = op_gpr(self);
1562 let [src_type0, src_type1] = &mut self.src_types;
1563 let [src0, src1, src2] = &mut self.srcs;
1564 if swap_srcs_if_not_reg(src0, src1, gpr) {
1565 std::mem::swap(src_type0, src_type1);
1566 }
1567 b.copy_alu_src_if_not_reg(src0, gpr, SrcType::ALU);
1568 b.copy_alu_src_if_ineg_imm(src1, gpr, SrcType::I32);
1569 b.copy_alu_src_if_not_reg(src2, gpr, SrcType::ALU);
1570 }
1571
encode(&self, e: &mut SM70Encoder<'_>)1572 fn encode(&self, e: &mut SM70Encoder<'_>) {
1573 e.encode_alu(
1574 0x026,
1575 Some(&self.dst),
1576 Some(&self.srcs[0]),
1577 Some(&self.srcs[1]),
1578 Some(&self.srcs[2]),
1579 );
1580
1581 e.set_bit(
1582 73,
1583 match self.src_types[0] {
1584 IntType::U8 => false,
1585 IntType::I8 => true,
1586 _ => panic!("Invalid DP4 source type"),
1587 },
1588 );
1589 e.set_bit(
1590 74,
1591 match self.src_types[1] {
1592 IntType::U8 => false,
1593 IntType::I8 => true,
1594 _ => panic!("Invalid DP4 source type"),
1595 },
1596 );
1597 }
1598 }
1599
1600 impl SM70Op for OpIMad {
legalize(&mut self, b: &mut LegalizeBuilder)1601 fn legalize(&mut self, b: &mut LegalizeBuilder) {
1602 let gpr = op_gpr(self);
1603 let [src0, src1, src2] = &mut self.srcs;
1604 swap_srcs_if_not_reg(src0, src1, gpr);
1605 b.copy_alu_src_if_not_reg(src0, gpr, SrcType::ALU);
1606 b.copy_alu_src_if_both_not_reg(src1, src2, gpr, SrcType::ALU);
1607 }
1608
encode(&self, e: &mut SM70Encoder<'_>)1609 fn encode(&self, e: &mut SM70Encoder<'_>) {
1610 if self.is_uniform() {
1611 e.encode_ualu(
1612 0x0a4,
1613 Some(&self.dst),
1614 Some(&self.srcs[0]),
1615 Some(&self.srcs[1]),
1616 Some(&self.srcs[2]),
1617 )
1618 } else {
1619 e.encode_alu(
1620 0x024,
1621 Some(&self.dst),
1622 Some(&self.srcs[0]),
1623 Some(&self.srcs[1]),
1624 Some(&self.srcs[2]),
1625 )
1626 };
1627 e.set_pred_dst(81..84, Dst::None);
1628 e.set_bit(73, self.signed);
1629 }
1630 }
1631
1632 impl SM70Op for OpIMad64 {
legalize(&mut self, b: &mut LegalizeBuilder)1633 fn legalize(&mut self, b: &mut LegalizeBuilder) {
1634 let gpr = op_gpr(self);
1635 let [src0, src1, src2] = &mut self.srcs;
1636 swap_srcs_if_not_reg(src0, src1, gpr);
1637 b.copy_alu_src_if_not_reg(src0, gpr, SrcType::ALU);
1638 b.copy_alu_src_if_both_not_reg(src1, src2, gpr, SrcType::ALU);
1639 }
1640
encode(&self, e: &mut SM70Encoder<'_>)1641 fn encode(&self, e: &mut SM70Encoder<'_>) {
1642 if self.is_uniform() {
1643 e.encode_ualu(
1644 0x0a5,
1645 Some(&self.dst),
1646 Some(&self.srcs[0]),
1647 Some(&self.srcs[1]),
1648 Some(&self.srcs[2]),
1649 )
1650 } else {
1651 e.encode_alu(
1652 0x025,
1653 Some(&self.dst),
1654 Some(&self.srcs[0]),
1655 Some(&self.srcs[1]),
1656 Some(&self.srcs[2]),
1657 )
1658 };
1659 e.set_pred_dst(81..84, Dst::None);
1660 e.set_bit(73, self.signed);
1661 }
1662 }
1663
1664 impl SM70Op for OpIMnMx {
legalize(&mut self, b: &mut LegalizeBuilder)1665 fn legalize(&mut self, b: &mut LegalizeBuilder) {
1666 let gpr = op_gpr(self);
1667 let [src0, src1] = &mut self.srcs;
1668 swap_srcs_if_not_reg(src0, src1, gpr);
1669 b.copy_alu_src_if_not_reg(src0, gpr, SrcType::ALU);
1670 }
1671
encode(&self, e: &mut SM70Encoder<'_>)1672 fn encode(&self, e: &mut SM70Encoder<'_>) {
1673 e.encode_alu(
1674 0x017,
1675 Some(&self.dst),
1676 Some(&self.srcs[0]),
1677 Some(&self.srcs[1]),
1678 None,
1679 );
1680 e.set_pred_src(87..90, 90, self.min);
1681 e.set_bit(
1682 73,
1683 match self.cmp_type {
1684 IntCmpType::U32 => false,
1685 IntCmpType::I32 => true,
1686 },
1687 );
1688 }
1689 }
1690
1691 impl SM70Op for OpISetP {
legalize(&mut self, b: &mut LegalizeBuilder)1692 fn legalize(&mut self, b: &mut LegalizeBuilder) {
1693 let gpr = op_gpr(self);
1694 let [src0, src1] = &mut self.srcs;
1695 if !src_is_reg(src0, gpr) && src_is_reg(src1, gpr) {
1696 std::mem::swap(src0, src1);
1697 self.cmp_op = self.cmp_op.flip();
1698 }
1699 b.copy_alu_src_if_not_reg(src0, gpr, SrcType::ALU);
1700 if !self.is_uniform() {
1701 b.copy_src_if_upred(&mut self.low_cmp);
1702 b.copy_src_if_upred(&mut self.accum);
1703 }
1704 }
1705
encode(&self, e: &mut SM70Encoder<'_>)1706 fn encode(&self, e: &mut SM70Encoder<'_>) {
1707 if self.is_uniform() {
1708 e.encode_ualu(
1709 0x08c,
1710 None,
1711 Some(&self.srcs[0]),
1712 Some(&self.srcs[1]),
1713 None,
1714 );
1715
1716 e.set_upred_src(68..71, 71, self.low_cmp);
1717 e.set_upred_src(87..90, 90, self.accum);
1718 } else {
1719 e.encode_alu(
1720 0x00c,
1721 None,
1722 Some(&self.srcs[0]),
1723 Some(&self.srcs[1]),
1724 None,
1725 );
1726
1727 e.set_pred_src(68..71, 71, self.low_cmp);
1728 e.set_pred_src(87..90, 90, self.accum);
1729 }
1730
1731 e.set_bit(72, self.ex);
1732
1733 e.set_field(
1734 73..74,
1735 match self.cmp_type {
1736 IntCmpType::U32 => 0_u32,
1737 IntCmpType::I32 => 1_u32,
1738 },
1739 );
1740 e.set_pred_set_op(74..76, self.set_op);
1741 e.set_int_cmp_op(76..79, self.cmp_op);
1742
1743 e.set_pred_dst(81..84, self.dst);
1744 e.set_pred_dst(84..87, Dst::None); // dst1
1745 }
1746 }
1747
src_as_lop_imm(src: &Src) -> Option<bool>1748 fn src_as_lop_imm(src: &Src) -> Option<bool> {
1749 let x = match src.src_ref {
1750 SrcRef::Zero => false,
1751 SrcRef::True => true,
1752 SrcRef::False => false,
1753 SrcRef::Imm32(i) => {
1754 if i == 0 {
1755 false
1756 } else if i == !0 {
1757 true
1758 } else {
1759 return None;
1760 }
1761 }
1762 _ => return None,
1763 };
1764 Some(x ^ src.src_mod.is_bnot())
1765 }
1766
fold_lop_src(src: &Src, x: &mut u8)1767 fn fold_lop_src(src: &Src, x: &mut u8) {
1768 if let Some(i) = src_as_lop_imm(src) {
1769 *x = if i { !0 } else { 0 };
1770 }
1771 if src.src_mod.is_bnot() {
1772 *x = !*x;
1773 }
1774 }
1775
1776 impl SM70Op for OpLop3 {
legalize(&mut self, b: &mut LegalizeBuilder)1777 fn legalize(&mut self, b: &mut LegalizeBuilder) {
1778 let gpr = op_gpr(self);
1779 // Fold constants and modifiers if we can
1780 self.op = LogicOp3::new_lut(&|mut x, mut y, mut z| {
1781 fold_lop_src(&self.srcs[0], &mut x);
1782 fold_lop_src(&self.srcs[1], &mut y);
1783 fold_lop_src(&self.srcs[2], &mut z);
1784 self.op.eval(x, y, z)
1785 });
1786 for src in &mut self.srcs {
1787 src.src_mod = SrcMod::None;
1788 if src_as_lop_imm(src).is_some() {
1789 src.src_ref = SrcRef::Zero;
1790 }
1791 }
1792
1793 let [src0, src1, src2] = &mut self.srcs;
1794 if !src_is_reg(src0, gpr) && src_is_reg(src1, gpr) {
1795 std::mem::swap(src0, src1);
1796 self.op = LogicOp3::new_lut(&|x, y, z| self.op.eval(y, x, z))
1797 }
1798 if !src_is_reg(src2, gpr) && src_is_reg(src1, gpr) {
1799 std::mem::swap(src2, src1);
1800 self.op = LogicOp3::new_lut(&|x, y, z| self.op.eval(x, z, y))
1801 }
1802
1803 b.copy_alu_src_if_not_reg(src0, gpr, SrcType::ALU);
1804 b.copy_alu_src_if_not_reg(src2, gpr, SrcType::ALU);
1805 }
1806
encode(&self, e: &mut SM70Encoder<'_>)1807 fn encode(&self, e: &mut SM70Encoder<'_>) {
1808 if self.is_uniform() {
1809 e.encode_ualu(
1810 0x092,
1811 Some(&self.dst),
1812 Some(&self.srcs[0]),
1813 Some(&self.srcs[1]),
1814 Some(&self.srcs[2]),
1815 );
1816
1817 e.set_upred_src(87..90, 90, SrcRef::False.into());
1818 } else {
1819 e.encode_alu(
1820 0x012,
1821 Some(&self.dst),
1822 Some(&self.srcs[0]),
1823 Some(&self.srcs[1]),
1824 Some(&self.srcs[2]),
1825 );
1826
1827 e.set_pred_src(87..90, 90, SrcRef::False.into());
1828 }
1829
1830 e.set_field(72..80, self.op.lut);
1831 e.set_bit(80, false); // .PAND
1832 e.set_field(81..84, 7_u32); // pred
1833 }
1834 }
1835
1836 impl SM70Op for OpPopC {
legalize(&mut self, _b: &mut LegalizeBuilder)1837 fn legalize(&mut self, _b: &mut LegalizeBuilder) {
1838 // Nothing to do
1839 }
1840
encode(&self, e: &mut SM70Encoder<'_>)1841 fn encode(&self, e: &mut SM70Encoder<'_>) {
1842 if self.is_uniform() {
1843 e.encode_ualu(0x0bf, Some(&self.dst), None, Some(&self.src), None)
1844 } else {
1845 e.encode_alu(0x109, Some(&self.dst), None, Some(&self.src), None)
1846 };
1847
1848 let not_mod = matches!(self.src.src_mod, SrcMod::BNot);
1849 e.set_field(63..64, not_mod);
1850 }
1851 }
1852
1853 impl SM70Op for OpShf {
legalize(&mut self, b: &mut LegalizeBuilder)1854 fn legalize(&mut self, b: &mut LegalizeBuilder) {
1855 let gpr = op_gpr(self);
1856 b.copy_alu_src_if_not_reg(&mut self.low, gpr, SrcType::ALU);
1857 b.copy_alu_src_if_both_not_reg(
1858 &self.shift,
1859 &mut self.high,
1860 gpr,
1861 SrcType::ALU,
1862 );
1863 }
1864
encode(&self, e: &mut SM70Encoder<'_>)1865 fn encode(&self, e: &mut SM70Encoder<'_>) {
1866 if self.is_uniform() {
1867 e.encode_ualu(
1868 0x099,
1869 Some(&self.dst),
1870 Some(&self.low),
1871 Some(&self.shift),
1872 Some(&self.high),
1873 )
1874 } else {
1875 e.encode_alu(
1876 0x019,
1877 Some(&self.dst),
1878 Some(&self.low),
1879 Some(&self.shift),
1880 Some(&self.high),
1881 )
1882 };
1883
1884 e.set_field(
1885 73..75,
1886 match self.data_type {
1887 IntType::I64 => 0_u8,
1888 IntType::U64 => 1_u8,
1889 IntType::I32 => 2_u8,
1890 IntType::U32 => 3_u8,
1891 _ => panic!("Invalid shift data type"),
1892 },
1893 );
1894 e.set_bit(75, self.wrap);
1895 e.set_bit(76, self.right);
1896 e.set_bit(80, self.dst_high);
1897 }
1898 }
1899
1900 impl SM70Op for OpF2F {
legalize(&mut self, _b: &mut LegalizeBuilder)1901 fn legalize(&mut self, _b: &mut LegalizeBuilder) {
1902 // Nothing to do
1903 }
1904
encode(&self, e: &mut SM70Encoder<'_>)1905 fn encode(&self, e: &mut SM70Encoder<'_>) {
1906 assert!(!self.integer_rnd);
1907 if self.src_type.bits() <= 32 && self.dst_type.bits() <= 32 {
1908 e.encode_alu(0x104, Some(&self.dst), None, Some(&self.src), None)
1909 } else {
1910 e.encode_alu(0x110, Some(&self.dst), None, Some(&self.src), None)
1911 };
1912
1913 if self.high {
1914 e.set_field(60..62, 1_u8); // .H1
1915 }
1916
1917 e.set_field(75..77, (self.dst_type.bits() / 8).ilog2());
1918 e.set_rnd_mode(78..80, self.rnd_mode);
1919 e.set_bit(80, self.ftz);
1920 e.set_field(84..86, (self.src_type.bits() / 8).ilog2());
1921 }
1922 }
1923
1924 impl SM70Op for OpF2FP {
legalize(&mut self, b: &mut LegalizeBuilder)1925 fn legalize(&mut self, b: &mut LegalizeBuilder) {
1926 let gpr = op_gpr(self);
1927 let [src0, src1] = &mut self.srcs;
1928 swap_srcs_if_not_reg(src0, src1, gpr);
1929
1930 b.copy_alu_src_if_not_reg(src0, gpr, SrcType::ALU);
1931 }
1932
encode(&self, e: &mut SM70Encoder<'_>)1933 fn encode(&self, e: &mut SM70Encoder<'_>) {
1934 e.encode_alu(
1935 0x03e,
1936 Some(&self.dst),
1937 Some(&self.srcs[0]),
1938 Some(&self.srcs[1]),
1939 Some(&Src::new_zero()),
1940 );
1941
1942 // .MERGE_C behavior
1943 // Use src1 and src2, src0 is unused
1944 // src1 get converted and packed in the lower 16 bits of dest.
1945 // src2 lower or high 16 bits (decided by .H1 flag) get packed in the upper of dest.
1946 e.set_bit(78, false); // TODO: .MERGE_C
1947 e.set_bit(72, false); // .H1 (MERGE_C only)
1948 e.set_rnd_mode(79..81, self.rnd_mode);
1949 }
1950 }
1951
1952 impl SM70Op for OpF2I {
legalize(&mut self, _b: &mut LegalizeBuilder)1953 fn legalize(&mut self, _b: &mut LegalizeBuilder) {
1954 // Nothing to do
1955 }
1956
encode(&self, e: &mut SM70Encoder<'_>)1957 fn encode(&self, e: &mut SM70Encoder<'_>) {
1958 if self.src_type.bits() <= 32 && self.dst_type.bits() <= 32 {
1959 e.encode_alu(0x105, Some(&self.dst), None, Some(&self.src), None)
1960 } else {
1961 e.encode_alu(0x111, Some(&self.dst), None, Some(&self.src), None)
1962 };
1963
1964 e.set_bit(72, self.dst_type.is_signed());
1965 e.set_field(75..77, (self.dst_type.bits() / 8).ilog2());
1966 e.set_bit(77, false); // NTZ
1967 e.set_rnd_mode(78..80, self.rnd_mode);
1968 e.set_bit(80, self.ftz);
1969 e.set_field(84..86, (self.src_type.bits() / 8).ilog2());
1970 }
1971 }
1972
1973 impl SM70Op for OpI2F {
legalize(&mut self, _b: &mut LegalizeBuilder)1974 fn legalize(&mut self, _b: &mut LegalizeBuilder) {
1975 // Nothing to do
1976 }
1977
encode(&self, e: &mut SM70Encoder<'_>)1978 fn encode(&self, e: &mut SM70Encoder<'_>) {
1979 if self.src_type.bits() <= 32 && self.dst_type.bits() <= 32 {
1980 e.encode_alu(0x106, Some(&self.dst), None, Some(&self.src), None)
1981 } else {
1982 e.encode_alu(0x112, Some(&self.dst), None, Some(&self.src), None)
1983 };
1984
1985 e.set_field(60..62, 0_u8); // TODO: subop
1986 e.set_bit(74, self.src_type.is_signed());
1987 e.set_field(75..77, (self.dst_type.bits() / 8).ilog2());
1988 e.set_rnd_mode(78..80, self.rnd_mode);
1989 e.set_field(84..86, (self.src_type.bits() / 8).ilog2());
1990 }
1991 }
1992
1993 impl SM70Op for OpFRnd {
legalize(&mut self, _b: &mut LegalizeBuilder)1994 fn legalize(&mut self, _b: &mut LegalizeBuilder) {
1995 // Nothing to do
1996 }
1997
encode(&self, e: &mut SM70Encoder<'_>)1998 fn encode(&self, e: &mut SM70Encoder<'_>) {
1999 if self.src_type.bits() <= 32 && self.dst_type.bits() <= 32 {
2000 e.encode_alu(0x107, Some(&self.dst), None, Some(&self.src), None)
2001 } else {
2002 e.encode_alu(0x113, Some(&self.dst), None, Some(&self.src), None)
2003 };
2004
2005 e.set_field(84..86, (self.src_type.bits() / 8).ilog2());
2006 e.set_bit(80, self.ftz);
2007 e.set_rnd_mode(78..80, self.rnd_mode);
2008 e.set_field(75..77, (self.dst_type.bits() / 8).ilog2());
2009 }
2010 }
2011
2012 impl SM70Op for OpMov {
legalize(&mut self, _b: &mut LegalizeBuilder)2013 fn legalize(&mut self, _b: &mut LegalizeBuilder) {
2014 // Nothing to do
2015 }
2016
encode(&self, e: &mut SM70Encoder<'_>)2017 fn encode(&self, e: &mut SM70Encoder<'_>) {
2018 if self.is_uniform() {
2019 e.set_opcode(0xc82);
2020 e.set_udst(self.dst);
2021
2022 // umov is encoded like a non-uniform ALU op
2023 let src = ALUSrc::from_src(Some(&self.src), true);
2024 let form: u8 = match &src {
2025 ALUSrc::Reg(reg) => {
2026 e.encode_alu_ureg(reg, false);
2027 0x6 // form
2028 }
2029 ALUSrc::Imm32(imm) => {
2030 e.encode_alu_imm(imm);
2031 0x4 // form
2032 }
2033 _ => panic!("Invalid umov src"),
2034 };
2035 e.set_field(9..12, form);
2036 } else {
2037 e.encode_alu(0x002, Some(&self.dst), None, Some(&self.src), None);
2038 e.set_field(72..76, self.quad_lanes);
2039 }
2040 }
2041 }
2042
2043 impl SM70Op for OpPrmt {
legalize(&mut self, b: &mut LegalizeBuilder)2044 fn legalize(&mut self, b: &mut LegalizeBuilder) {
2045 let gpr = op_gpr(self);
2046 let [src0, src1] = &mut self.srcs;
2047 b.copy_alu_src_if_not_reg(src0, gpr, SrcType::ALU);
2048 b.copy_alu_src_if_not_reg(src1, gpr, SrcType::ALU);
2049 }
2050
encode(&self, e: &mut SM70Encoder<'_>)2051 fn encode(&self, e: &mut SM70Encoder<'_>) {
2052 if self.is_uniform() {
2053 e.encode_ualu(
2054 0x96,
2055 Some(&self.dst),
2056 Some(&self.srcs[0]),
2057 Some(&self.sel),
2058 Some(&self.srcs[1]),
2059 )
2060 } else {
2061 e.encode_alu(
2062 0x16,
2063 Some(&self.dst),
2064 Some(&self.srcs[0]),
2065 Some(&self.sel),
2066 Some(&self.srcs[1]),
2067 )
2068 };
2069
2070 e.set_field(
2071 72..75,
2072 match self.mode {
2073 PrmtMode::Index => 0_u8,
2074 PrmtMode::Forward4Extract => 1_u8,
2075 PrmtMode::Backward4Extract => 2_u8,
2076 PrmtMode::Replicate8 => 3_u8,
2077 PrmtMode::EdgeClampLeft => 4_u8,
2078 PrmtMode::EdgeClampRight => 5_u8,
2079 PrmtMode::Replicate16 => 6_u8,
2080 },
2081 );
2082 }
2083 }
2084
2085 impl SM70Op for OpSel {
legalize(&mut self, b: &mut LegalizeBuilder)2086 fn legalize(&mut self, b: &mut LegalizeBuilder) {
2087 let gpr = op_gpr(self);
2088 if !self.is_uniform() {
2089 b.copy_src_if_upred(&mut self.cond);
2090 }
2091 let [src0, src1] = &mut self.srcs;
2092 if swap_srcs_if_not_reg(src0, src1, gpr) {
2093 self.cond = self.cond.bnot();
2094 }
2095 b.copy_alu_src_if_not_reg(src0, gpr, SrcType::ALU);
2096 }
2097
encode(&self, e: &mut SM70Encoder<'_>)2098 fn encode(&self, e: &mut SM70Encoder<'_>) {
2099 if self.is_uniform() {
2100 e.encode_ualu(
2101 0x087,
2102 Some(&self.dst),
2103 Some(&self.srcs[0]),
2104 Some(&self.srcs[1]),
2105 None,
2106 );
2107
2108 e.set_upred_src(87..90, 90, self.cond);
2109 } else {
2110 e.encode_alu(
2111 0x007,
2112 Some(&self.dst),
2113 Some(&self.srcs[0]),
2114 Some(&self.srcs[1]),
2115 None,
2116 );
2117
2118 e.set_pred_src(87..90, 90, self.cond);
2119 }
2120 }
2121 }
2122
2123 impl SM70Op for OpShfl {
legalize(&mut self, b: &mut LegalizeBuilder)2124 fn legalize(&mut self, b: &mut LegalizeBuilder) {
2125 let gpr = op_gpr(self);
2126 b.copy_alu_src_if_not_reg(&mut self.src, gpr, SrcType::GPR);
2127 b.copy_alu_src_if_not_reg_or_imm(&mut self.lane, gpr, SrcType::ALU);
2128 b.copy_alu_src_if_not_reg_or_imm(&mut self.c, gpr, SrcType::ALU);
2129 }
2130
encode(&self, e: &mut SM70Encoder<'_>)2131 fn encode(&self, e: &mut SM70Encoder<'_>) {
2132 assert!(self.lane.src_mod.is_none());
2133 assert!(self.c.src_mod.is_none());
2134
2135 match &self.lane.src_ref {
2136 SrcRef::Zero | SrcRef::Reg(_) => match &self.c.src_ref {
2137 SrcRef::Zero | SrcRef::Reg(_) => {
2138 e.set_opcode(0x389);
2139 e.set_reg_src(32..40, self.lane);
2140 e.set_reg_src(64..72, self.c);
2141 }
2142 SrcRef::Imm32(imm_c) => {
2143 e.set_opcode(0x589);
2144 e.set_reg_src(32..40, self.lane);
2145 e.set_field(40..53, *imm_c & 0x1f1f);
2146 }
2147 _ => panic!("Invalid instruction form"),
2148 },
2149 SrcRef::Imm32(imm_lane) => match &self.c.src_ref {
2150 SrcRef::Zero | SrcRef::Reg(_) => {
2151 e.set_opcode(0x989);
2152 e.set_field(53..58, *imm_lane & 0x1f);
2153 e.set_reg_src(64..72, self.c);
2154 }
2155 SrcRef::Imm32(imm_c) => {
2156 e.set_opcode(0xf89);
2157 e.set_field(40..53, *imm_c & 0x1f1f);
2158 e.set_field(53..58, *imm_lane & 0x1f);
2159 }
2160 _ => panic!("Invalid instruction form"),
2161 },
2162 _ => panic!("Invalid instruction form"),
2163 };
2164
2165 e.set_dst(self.dst);
2166 e.set_pred_dst(81..84, self.in_bounds);
2167 e.set_reg_src(24..32, self.src);
2168 e.set_field(
2169 58..60,
2170 match self.op {
2171 ShflOp::Idx => 0_u8,
2172 ShflOp::Up => 1_u8,
2173 ShflOp::Down => 2_u8,
2174 ShflOp::Bfly => 3_u8,
2175 },
2176 );
2177 }
2178 }
2179
2180 impl SM70Op for OpPLop3 {
legalize(&mut self, b: &mut LegalizeBuilder)2181 fn legalize(&mut self, b: &mut LegalizeBuilder) {
2182 // Fold constants and modifiers if we can
2183 for lop in &mut self.ops {
2184 *lop = LogicOp3::new_lut(&|mut x, mut y, mut z| {
2185 fold_lop_src(&self.srcs[0], &mut x);
2186 fold_lop_src(&self.srcs[1], &mut y);
2187 fold_lop_src(&self.srcs[2], &mut z);
2188 lop.eval(x, y, z)
2189 });
2190 }
2191 for src in &mut self.srcs {
2192 src.src_mod = SrcMod::None;
2193 if src_as_lop_imm(src).is_some() {
2194 src.src_ref = SrcRef::True;
2195 }
2196 }
2197
2198 if !self.is_uniform() {
2199 // The warp form of plop3 allows a single uniform predicate in
2200 // src2. If we have a uniform predicate anywhere, try to move it
2201 // there.
2202 let [src0, src1, src2] = &mut self.srcs;
2203 if src_is_upred_reg(src0) && !src_is_upred_reg(src2) {
2204 std::mem::swap(src0, src2);
2205 for lop in &mut self.ops {
2206 *lop = LogicOp3::new_lut(&|x, y, z| lop.eval(z, y, x))
2207 }
2208 }
2209 if src_is_upred_reg(src1) && !src_is_upred_reg(src2) {
2210 std::mem::swap(src1, src2);
2211 for lop in &mut self.ops {
2212 *lop = LogicOp3::new_lut(&|x, y, z| lop.eval(x, z, y))
2213 }
2214 }
2215 b.copy_src_if_upred(src0);
2216 b.copy_src_if_upred(src1);
2217 }
2218 }
2219
encode(&self, e: &mut SM70Encoder<'_>)2220 fn encode(&self, e: &mut SM70Encoder<'_>) {
2221 if self.is_uniform() {
2222 e.set_opcode(0x89c);
2223
2224 e.set_upred_src(68..71, 71, self.srcs[2]);
2225 e.set_upred_src(77..80, 80, self.srcs[1]);
2226 e.set_upred_src(87..90, 90, self.srcs[0]);
2227 } else {
2228 e.set_opcode(0x81c);
2229
2230 if self.srcs[2]
2231 .src_ref
2232 .as_reg()
2233 .is_some_and(|r| r.is_uniform())
2234 {
2235 e.set_upred_src(68..71, 71, self.srcs[2]);
2236 e.set_bit(67, true);
2237 } else {
2238 e.set_pred_src(68..71, 71, self.srcs[2]);
2239 }
2240 e.set_pred_src(77..80, 80, self.srcs[1]);
2241 e.set_pred_src(87..90, 90, self.srcs[0]);
2242 }
2243 e.set_field(16..24, self.ops[1].lut);
2244 e.set_field(64..67, self.ops[0].lut & 0x7);
2245 e.set_field(72..77, self.ops[0].lut >> 3);
2246
2247 e.set_pred_dst(81..84, self.dsts[0]);
2248 e.set_pred_dst(84..87, self.dsts[1]);
2249 }
2250 }
2251
2252 impl SM70Op for OpR2UR {
legalize(&mut self, _b: &mut LegalizeBuilder)2253 fn legalize(&mut self, _b: &mut LegalizeBuilder) {
2254 // Nothing to do
2255 }
2256
encode(&self, e: &mut SM70Encoder<'_>)2257 fn encode(&self, e: &mut SM70Encoder<'_>) {
2258 e.set_opcode(0x3c2);
2259 e.set_udst(self.dst);
2260 e.set_reg_src(24..32, self.src);
2261 e.set_pred_dst(81..84, Dst::None);
2262 }
2263 }
2264
2265 impl SM70Encoder<'_> {
set_tex_dim(&mut self, range: Range<usize>, dim: TexDim)2266 fn set_tex_dim(&mut self, range: Range<usize>, dim: TexDim) {
2267 assert!(range.len() == 3);
2268 self.set_field(
2269 range,
2270 match dim {
2271 TexDim::_1D => 0_u8,
2272 TexDim::Array1D => 4_u8,
2273 TexDim::_2D => 1_u8,
2274 TexDim::Array2D => 5_u8,
2275 TexDim::_3D => 2_u8,
2276 TexDim::Cube => 3_u8,
2277 TexDim::ArrayCube => 7_u8,
2278 },
2279 );
2280 }
2281
set_tex_lod_mode(&mut self, range: Range<usize>, lod_mode: TexLodMode)2282 fn set_tex_lod_mode(&mut self, range: Range<usize>, lod_mode: TexLodMode) {
2283 assert!(range.len() == 3);
2284 self.set_field(
2285 range,
2286 match lod_mode {
2287 TexLodMode::Auto => 0_u8,
2288 TexLodMode::Zero => 1_u8,
2289 TexLodMode::Bias => 2_u8,
2290 TexLodMode::Lod => 3_u8,
2291 TexLodMode::Clamp => 4_u8,
2292 TexLodMode::BiasClamp => 5_u8,
2293 },
2294 );
2295 }
2296
set_image_dim(&mut self, range: Range<usize>, dim: ImageDim)2297 fn set_image_dim(&mut self, range: Range<usize>, dim: ImageDim) {
2298 assert!(range.len() == 3);
2299 self.set_field(
2300 range,
2301 match dim {
2302 ImageDim::_1D => 0_u8,
2303 ImageDim::_1DBuffer => 1_u8,
2304 ImageDim::_1DArray => 2_u8,
2305 ImageDim::_2D => 3_u8,
2306 ImageDim::_2DArray => 4_u8,
2307 ImageDim::_3D => 5_u8,
2308 },
2309 );
2310 }
2311 }
2312
2313 impl SM70Op for OpTex {
legalize(&mut self, b: &mut LegalizeBuilder)2314 fn legalize(&mut self, b: &mut LegalizeBuilder) {
2315 legalize_ext_instr(self, b);
2316 }
2317
encode(&self, e: &mut SM70Encoder<'_>)2318 fn encode(&self, e: &mut SM70Encoder<'_>) {
2319 e.set_opcode(0x361);
2320 e.set_bit(59, true); // .B
2321
2322 e.set_dst(self.dsts[0]);
2323 if let Dst::Reg(reg) = self.dsts[1] {
2324 e.set_reg(64..72, reg);
2325 } else {
2326 e.set_field(64..72, 255_u8);
2327 }
2328 e.set_pred_dst(81..84, self.fault);
2329
2330 e.set_reg_src(24..32, self.srcs[0]);
2331 e.set_reg_src(32..40, self.srcs[1]);
2332
2333 e.set_tex_dim(61..64, self.dim);
2334 e.set_field(72..76, self.mask);
2335 e.set_bit(76, self.offset);
2336 e.set_bit(77, false); // ToDo: NDV
2337 e.set_bit(78, self.z_cmpr);
2338 e.set_field(84..87, 1);
2339 e.set_tex_lod_mode(87..90, self.lod_mode);
2340 e.set_bit(90, false); // TODO: .NODEP
2341 }
2342 }
2343
2344 impl SM70Op for OpTld {
legalize(&mut self, b: &mut LegalizeBuilder)2345 fn legalize(&mut self, b: &mut LegalizeBuilder) {
2346 legalize_ext_instr(self, b);
2347 }
2348
encode(&self, e: &mut SM70Encoder<'_>)2349 fn encode(&self, e: &mut SM70Encoder<'_>) {
2350 e.set_opcode(0x367);
2351 e.set_bit(59, true); // .B
2352
2353 e.set_dst(self.dsts[0]);
2354 if let Dst::Reg(reg) = self.dsts[1] {
2355 e.set_reg(64..72, reg);
2356 } else {
2357 e.set_field(64..72, 255_u8);
2358 }
2359 e.set_pred_dst(81..84, self.fault);
2360
2361 e.set_reg_src(24..32, self.srcs[0]);
2362 e.set_reg_src(32..40, self.srcs[1]);
2363
2364 e.set_tex_dim(61..64, self.dim);
2365 e.set_field(72..76, self.mask);
2366 e.set_bit(76, self.offset);
2367 // bit 77: .CL
2368 e.set_bit(78, self.is_ms);
2369 // bits 79..81: .F16
2370 assert!(
2371 self.lod_mode == TexLodMode::Zero
2372 || self.lod_mode == TexLodMode::Lod
2373 );
2374 e.set_tex_lod_mode(87..90, self.lod_mode);
2375 e.set_bit(90, false); // TODO: .NODEP
2376 }
2377 }
2378
2379 impl SM70Op for OpTld4 {
legalize(&mut self, b: &mut LegalizeBuilder)2380 fn legalize(&mut self, b: &mut LegalizeBuilder) {
2381 legalize_ext_instr(self, b);
2382 }
2383
encode(&self, e: &mut SM70Encoder<'_>)2384 fn encode(&self, e: &mut SM70Encoder<'_>) {
2385 e.set_opcode(0x364);
2386 e.set_bit(59, true); // .B
2387
2388 e.set_dst(self.dsts[0]);
2389 if let Dst::Reg(reg) = self.dsts[1] {
2390 e.set_reg(64..72, reg);
2391 } else {
2392 e.set_field(64..72, 255_u8);
2393 }
2394 e.set_pred_dst(81..84, self.fault);
2395
2396 e.set_reg_src(24..32, self.srcs[0]);
2397 e.set_reg_src(32..40, self.srcs[1]);
2398
2399 e.set_tex_dim(61..64, self.dim);
2400 e.set_field(72..76, self.mask);
2401 e.set_field(
2402 76..78,
2403 match self.offset_mode {
2404 Tld4OffsetMode::None => 0_u8,
2405 Tld4OffsetMode::AddOffI => 1_u8,
2406 Tld4OffsetMode::PerPx => 2_u8,
2407 },
2408 );
2409 // bit 77: .CL
2410 e.set_bit(78, self.z_cmpr);
2411 e.set_bit(84, true); // !.EF
2412 e.set_field(87..89, self.comp);
2413 e.set_bit(90, false); // TODO: .NODEP
2414 }
2415 }
2416
2417 impl SM70Op for OpTmml {
legalize(&mut self, b: &mut LegalizeBuilder)2418 fn legalize(&mut self, b: &mut LegalizeBuilder) {
2419 legalize_ext_instr(self, b);
2420 }
2421
encode(&self, e: &mut SM70Encoder<'_>)2422 fn encode(&self, e: &mut SM70Encoder<'_>) {
2423 e.set_opcode(0x36a);
2424 e.set_bit(59, true); // .B
2425
2426 e.set_dst(self.dsts[0]);
2427 if let Dst::Reg(reg) = self.dsts[1] {
2428 e.set_reg(64..72, reg);
2429 } else {
2430 e.set_field(64..72, 255_u8);
2431 }
2432
2433 e.set_reg_src(24..32, self.srcs[0]);
2434 e.set_reg_src(32..40, self.srcs[1]);
2435
2436 e.set_tex_dim(61..64, self.dim);
2437 e.set_field(72..76, self.mask);
2438 e.set_bit(77, false); // ToDo: NDV
2439 e.set_bit(90, false); // TODO: .NODEP
2440 }
2441 }
2442
2443 impl SM70Op for OpTxd {
legalize(&mut self, b: &mut LegalizeBuilder)2444 fn legalize(&mut self, b: &mut LegalizeBuilder) {
2445 legalize_ext_instr(self, b);
2446 }
2447
encode(&self, e: &mut SM70Encoder<'_>)2448 fn encode(&self, e: &mut SM70Encoder<'_>) {
2449 e.set_opcode(0x36d);
2450 e.set_bit(59, true); // .B
2451
2452 e.set_dst(self.dsts[0]);
2453 if let Dst::Reg(reg) = self.dsts[1] {
2454 e.set_reg(64..72, reg);
2455 } else {
2456 e.set_field(64..72, 255_u8);
2457 }
2458 e.set_pred_dst(81..84, self.fault);
2459
2460 e.set_reg_src(24..32, self.srcs[0]);
2461 e.set_reg_src(32..40, self.srcs[1]);
2462
2463 e.set_tex_dim(61..64, self.dim);
2464 e.set_field(72..76, self.mask);
2465 e.set_bit(76, self.offset);
2466 e.set_bit(77, false); // ToDo: NDV
2467 e.set_bit(90, false); // TODO: .NODEP
2468 }
2469 }
2470
2471 impl SM70Op for OpTxq {
legalize(&mut self, b: &mut LegalizeBuilder)2472 fn legalize(&mut self, b: &mut LegalizeBuilder) {
2473 legalize_ext_instr(self, b);
2474 }
2475
encode(&self, e: &mut SM70Encoder<'_>)2476 fn encode(&self, e: &mut SM70Encoder<'_>) {
2477 e.set_opcode(0x370);
2478 e.set_bit(59, true); // .B
2479
2480 e.set_dst(self.dsts[0]);
2481 if let Dst::Reg(reg) = self.dsts[1] {
2482 e.set_reg(64..72, reg);
2483 } else {
2484 e.set_field(64..72, 255_u8);
2485 }
2486
2487 e.set_reg_src(24..32, self.src);
2488 e.set_field(
2489 62..64,
2490 match self.query {
2491 TexQuery::Dimension => 0_u8,
2492 TexQuery::TextureType => 1_u8,
2493 TexQuery::SamplerPos => 2_u8,
2494 },
2495 );
2496 e.set_field(72..76, self.mask);
2497 }
2498 }
2499
2500 impl SM70Encoder<'_> {
set_mem_order(&mut self, order: &MemOrder)2501 fn set_mem_order(&mut self, order: &MemOrder) {
2502 if self.sm.sm < 80 {
2503 let scope = match order {
2504 MemOrder::Constant => MemScope::System,
2505 MemOrder::Weak => MemScope::CTA,
2506 MemOrder::Strong(s) => *s,
2507 };
2508 self.set_field(
2509 77..79,
2510 match scope {
2511 MemScope::CTA => 0_u8,
2512 // SM => 1_u8,
2513 MemScope::GPU => 2_u8,
2514 MemScope::System => 3_u8,
2515 },
2516 );
2517 self.set_field(
2518 79..81,
2519 match order {
2520 MemOrder::Constant => 0_u8,
2521 MemOrder::Weak => 1_u8,
2522 MemOrder::Strong(_) => 2_u8,
2523 // MMIO => 3_u8,
2524 },
2525 );
2526 } else {
2527 self.set_field(
2528 77..81,
2529 match order {
2530 MemOrder::Constant => 0x4_u8,
2531 MemOrder::Weak => 0x0_u8,
2532 MemOrder::Strong(MemScope::CTA) => 0x5_u8,
2533 MemOrder::Strong(MemScope::GPU) => 0x7_u8,
2534 MemOrder::Strong(MemScope::System) => 0xa_u8,
2535 },
2536 );
2537 }
2538 }
2539
set_eviction_priority(&mut self, pri: &MemEvictionPriority)2540 fn set_eviction_priority(&mut self, pri: &MemEvictionPriority) {
2541 self.set_field(
2542 84..86,
2543 match pri {
2544 MemEvictionPriority::First => 0_u8,
2545 MemEvictionPriority::Normal => 1_u8,
2546 MemEvictionPriority::Last => 2_u8,
2547 MemEvictionPriority::Unchanged => 3_u8,
2548 },
2549 );
2550 }
2551
set_mem_type(&mut self, range: Range<usize>, mem_type: MemType)2552 fn set_mem_type(&mut self, range: Range<usize>, mem_type: MemType) {
2553 assert!(range.len() == 3);
2554 self.set_field(
2555 range,
2556 match mem_type {
2557 MemType::U8 => 0_u8,
2558 MemType::I8 => 1_u8,
2559 MemType::U16 => 2_u8,
2560 MemType::I16 => 3_u8,
2561 MemType::B32 => 4_u8,
2562 MemType::B64 => 5_u8,
2563 MemType::B128 => 6_u8,
2564 },
2565 );
2566 }
2567
set_mem_access(&mut self, access: &MemAccess)2568 fn set_mem_access(&mut self, access: &MemAccess) {
2569 self.set_field(
2570 72..73,
2571 match access.space.addr_type() {
2572 MemAddrType::A32 => 0_u8,
2573 MemAddrType::A64 => 1_u8,
2574 },
2575 );
2576 self.set_mem_type(73..76, access.mem_type);
2577 self.set_mem_order(&access.order);
2578 self.set_eviction_priority(&access.eviction_priority);
2579 }
2580 }
2581
2582 impl SM70Op for OpSuLd {
legalize(&mut self, b: &mut LegalizeBuilder)2583 fn legalize(&mut self, b: &mut LegalizeBuilder) {
2584 legalize_ext_instr(self, b);
2585 }
2586
encode(&self, e: &mut SM70Encoder<'_>)2587 fn encode(&self, e: &mut SM70Encoder<'_>) {
2588 e.set_opcode(0x998);
2589
2590 e.set_dst(self.dst);
2591 e.set_reg_src(24..32, self.coord);
2592 e.set_reg_src(64..72, self.handle);
2593 e.set_pred_dst(81..84, self.fault);
2594
2595 e.set_image_dim(61..64, self.image_dim);
2596 e.set_mem_order(&self.mem_order);
2597 e.set_eviction_priority(&self.mem_eviction_priority);
2598
2599 assert!(self.mask == 0x1 || self.mask == 0x3 || self.mask == 0xf);
2600 e.set_field(72..76, self.mask);
2601 }
2602 }
2603
2604 impl SM70Op for OpSuSt {
legalize(&mut self, b: &mut LegalizeBuilder)2605 fn legalize(&mut self, b: &mut LegalizeBuilder) {
2606 legalize_ext_instr(self, b);
2607 }
2608
encode(&self, e: &mut SM70Encoder<'_>)2609 fn encode(&self, e: &mut SM70Encoder<'_>) {
2610 e.set_opcode(0x99c);
2611
2612 e.set_reg_src(24..32, self.coord);
2613 e.set_reg_src(32..40, self.data);
2614 e.set_reg_src(64..72, self.handle);
2615
2616 e.set_image_dim(61..64, self.image_dim);
2617 e.set_mem_order(&self.mem_order);
2618 e.set_eviction_priority(&self.mem_eviction_priority);
2619
2620 assert!(self.mask == 0x1 || self.mask == 0x3 || self.mask == 0xf);
2621 e.set_field(72..76, self.mask);
2622 }
2623 }
2624
2625 impl SM70Op for OpSuAtom {
legalize(&mut self, b: &mut LegalizeBuilder)2626 fn legalize(&mut self, b: &mut LegalizeBuilder) {
2627 legalize_ext_instr(self, b);
2628 }
2629
encode(&self, e: &mut SM70Encoder<'_>)2630 fn encode(&self, e: &mut SM70Encoder<'_>) {
2631 if self.dst.is_none() {
2632 e.set_opcode(0x3a0);
2633 e.set_atom_op(87..90, self.atom_op);
2634 } else if let AtomOp::CmpExch(cmp_src) = self.atom_op {
2635 e.set_opcode(0x396);
2636 assert!(cmp_src == AtomCmpSrc::Packed);
2637 } else {
2638 e.set_opcode(0x394);
2639 e.set_atom_op(87..91, self.atom_op);
2640 };
2641
2642 e.set_dst(self.dst);
2643 e.set_reg_src(24..32, self.coord);
2644 e.set_reg_src(32..40, self.data);
2645 e.set_reg_src(64..72, self.handle);
2646 e.set_pred_dst(81..84, self.fault);
2647
2648 e.set_image_dim(61..64, self.image_dim);
2649 e.set_mem_order(&self.mem_order);
2650 e.set_eviction_priority(&self.mem_eviction_priority);
2651
2652 e.set_bit(72, false); // .BA
2653 e.set_atom_type(73..76, self.atom_type);
2654 }
2655 }
2656
2657 impl SM70Op for OpLd {
legalize(&mut self, b: &mut LegalizeBuilder)2658 fn legalize(&mut self, b: &mut LegalizeBuilder) {
2659 legalize_ext_instr(self, b);
2660 }
2661
encode(&self, e: &mut SM70Encoder<'_>)2662 fn encode(&self, e: &mut SM70Encoder<'_>) {
2663 match self.access.space {
2664 MemSpace::Global(_) => {
2665 e.set_opcode(0x381);
2666 e.set_pred_dst(81..84, Dst::None);
2667 e.set_mem_access(&self.access);
2668 }
2669 MemSpace::Local => {
2670 e.set_opcode(0x983);
2671 e.set_field(84..87, 1_u8);
2672
2673 e.set_mem_type(73..76, self.access.mem_type);
2674 assert!(self.access.order == MemOrder::Strong(MemScope::CTA));
2675 assert!(
2676 self.access.eviction_priority
2677 == MemEvictionPriority::Normal
2678 );
2679 }
2680 MemSpace::Shared => {
2681 e.set_opcode(0x984);
2682
2683 e.set_mem_type(73..76, self.access.mem_type);
2684 assert!(self.access.order == MemOrder::Strong(MemScope::CTA));
2685 assert!(
2686 self.access.eviction_priority
2687 == MemEvictionPriority::Normal
2688 );
2689
2690 e.set_bit(87, false); // !.ZD - Returns a predicate?
2691 }
2692 }
2693
2694 e.set_dst(self.dst);
2695 e.set_reg_src(24..32, self.addr);
2696 e.set_field(40..64, self.offset);
2697 }
2698 }
2699
2700 impl SM70Op for OpLdc {
legalize(&mut self, b: &mut LegalizeBuilder)2701 fn legalize(&mut self, b: &mut LegalizeBuilder) {
2702 let gpr = op_gpr(self);
2703 b.copy_alu_src_if_not_reg(&mut self.offset, gpr, SrcType::GPR);
2704 }
2705
encode(&self, e: &mut SM70Encoder<'_>)2706 fn encode(&self, e: &mut SM70Encoder<'_>) {
2707 let SrcRef::CBuf(cb) = &self.cb.src_ref else {
2708 panic!("LDC must take a cbuf source");
2709 };
2710
2711 match cb.buf {
2712 CBuf::Binding(idx) => {
2713 if self.is_uniform() {
2714 e.set_opcode(0xab9);
2715 e.set_udst(self.dst);
2716
2717 assert!(self.offset.is_zero());
2718 assert!(self.mode == LdcMode::Indexed);
2719 } else {
2720 e.set_opcode(0xb82);
2721 e.set_dst(self.dst);
2722
2723 e.set_reg_src(24..32, self.offset);
2724 e.set_field(
2725 78..80,
2726 match self.mode {
2727 LdcMode::Indexed => 0_u8,
2728 LdcMode::IndexedLinear => 1_u8,
2729 LdcMode::IndexedSegmented => 2_u8,
2730 LdcMode::IndexedSegmentedLinear => 3_u8,
2731 },
2732 );
2733 }
2734 e.set_field(54..59, idx);
2735 e.set_bit(91, false); // Bound
2736 }
2737 CBuf::BindlessUGPR(handle) => {
2738 if self.is_uniform() {
2739 e.set_opcode(0xab9);
2740 e.set_udst(self.dst);
2741
2742 assert!(self.offset.is_zero());
2743 } else {
2744 e.set_opcode(0x582);
2745 e.set_dst(self.dst);
2746
2747 e.set_reg_src(64..72, self.offset);
2748 }
2749
2750 e.set_ureg(24..32, handle);
2751 e.set_reg_src(64..72, self.offset);
2752 assert!(self.mode == LdcMode::Indexed);
2753 e.set_bit(91, true); // Bindless
2754 }
2755 CBuf::BindlessSSA(_) => panic!("SSA values must be lowered"),
2756 }
2757
2758 e.set_field(38..54, cb.offset);
2759 e.set_mem_type(73..76, self.mem_type);
2760 }
2761 }
2762
2763 impl SM70Op for OpSt {
legalize(&mut self, b: &mut LegalizeBuilder)2764 fn legalize(&mut self, b: &mut LegalizeBuilder) {
2765 legalize_ext_instr(self, b);
2766 }
2767
encode(&self, e: &mut SM70Encoder<'_>)2768 fn encode(&self, e: &mut SM70Encoder<'_>) {
2769 match self.access.space {
2770 MemSpace::Global(_) => {
2771 e.set_opcode(0x386);
2772 e.set_mem_access(&self.access);
2773 }
2774 MemSpace::Local => {
2775 e.set_opcode(0x387);
2776 e.set_field(84..87, 1_u8);
2777
2778 e.set_mem_type(73..76, self.access.mem_type);
2779 assert!(self.access.order == MemOrder::Strong(MemScope::CTA));
2780 assert!(
2781 self.access.eviction_priority
2782 == MemEvictionPriority::Normal
2783 );
2784 }
2785 MemSpace::Shared => {
2786 e.set_opcode(0x388);
2787
2788 e.set_mem_type(73..76, self.access.mem_type);
2789 assert!(self.access.order == MemOrder::Strong(MemScope::CTA));
2790 assert!(
2791 self.access.eviction_priority
2792 == MemEvictionPriority::Normal
2793 );
2794 }
2795 }
2796
2797 e.set_reg_src(24..32, self.addr);
2798 e.set_reg_src(32..40, self.data);
2799 e.set_field(40..64, self.offset);
2800 }
2801 }
2802
2803 impl SM70Encoder<'_> {
set_atom_op(&mut self, range: Range<usize>, atom_op: AtomOp)2804 fn set_atom_op(&mut self, range: Range<usize>, atom_op: AtomOp) {
2805 self.set_field(
2806 range,
2807 match atom_op {
2808 AtomOp::Add => 0_u8,
2809 AtomOp::Min => 1_u8,
2810 AtomOp::Max => 2_u8,
2811 AtomOp::Inc => 3_u8,
2812 AtomOp::Dec => 4_u8,
2813 AtomOp::And => 5_u8,
2814 AtomOp::Or => 6_u8,
2815 AtomOp::Xor => 7_u8,
2816 AtomOp::Exch => 8_u8,
2817 AtomOp::CmpExch(_) => panic!("CmpExch is a separate opcode"),
2818 },
2819 );
2820 }
2821
set_atom_type(&mut self, range: Range<usize>, atom_type: AtomType)2822 fn set_atom_type(&mut self, range: Range<usize>, atom_type: AtomType) {
2823 assert!(range.len() == 3);
2824 self.set_field(
2825 range,
2826 match atom_type {
2827 AtomType::U32 => 0_u8,
2828 AtomType::I32 => 1_u8,
2829 AtomType::U64 => 2_u8,
2830 AtomType::F32 => 3_u8,
2831 AtomType::F16x2 => 4_u8,
2832 AtomType::I64 => 5_u8,
2833 AtomType::F64 => 6_u8,
2834 },
2835 );
2836 }
2837 }
2838
2839 impl SM70Op for OpAtom {
legalize(&mut self, b: &mut LegalizeBuilder)2840 fn legalize(&mut self, b: &mut LegalizeBuilder) {
2841 legalize_ext_instr(self, b);
2842 }
2843
encode(&self, e: &mut SM70Encoder<'_>)2844 fn encode(&self, e: &mut SM70Encoder<'_>) {
2845 match self.mem_space {
2846 MemSpace::Global(_) => {
2847 if self.dst.is_none() {
2848 e.set_opcode(0x98e);
2849
2850 e.set_reg_src(32..40, self.data);
2851 e.set_atom_op(87..90, self.atom_op);
2852 } else if let AtomOp::CmpExch(cmp_src) = self.atom_op {
2853 e.set_opcode(0x3a9);
2854
2855 assert!(cmp_src == AtomCmpSrc::Separate);
2856 e.set_reg_src(32..40, self.cmpr);
2857 e.set_reg_src(64..72, self.data);
2858 } else {
2859 e.set_opcode(0x3a8);
2860
2861 e.set_reg_src(32..40, self.data);
2862 e.set_atom_op(87..91, self.atom_op);
2863 }
2864
2865 e.set_pred_dst(81..84, Dst::None);
2866
2867 e.set_field(
2868 72..73,
2869 match self.mem_space.addr_type() {
2870 MemAddrType::A32 => 0_u8,
2871 MemAddrType::A64 => 1_u8,
2872 },
2873 );
2874
2875 e.set_mem_order(&self.mem_order);
2876 e.set_eviction_priority(&self.mem_eviction_priority);
2877 }
2878 MemSpace::Local => panic!("Atomics do not support local"),
2879 MemSpace::Shared => {
2880 if let AtomOp::CmpExch(cmp_src) = self.atom_op {
2881 e.set_opcode(0x38d);
2882
2883 assert!(cmp_src == AtomCmpSrc::Separate);
2884 e.set_reg_src(32..40, self.cmpr);
2885 e.set_reg_src(64..72, self.data);
2886 } else {
2887 e.set_opcode(0x38c);
2888
2889 e.set_reg_src(32..40, self.data);
2890 e.set_atom_op(87..91, self.atom_op);
2891 }
2892
2893 assert!(self.mem_order == MemOrder::Strong(MemScope::CTA));
2894 assert!(
2895 self.mem_eviction_priority == MemEvictionPriority::Normal
2896 );
2897 }
2898 }
2899
2900 e.set_dst(self.dst);
2901 e.set_reg_src(24..32, self.addr);
2902 e.set_field(40..64, self.addr_offset);
2903 e.set_atom_type(73..76, self.atom_type);
2904 }
2905 }
2906
2907 impl SM70Op for OpAL2P {
legalize(&mut self, b: &mut LegalizeBuilder)2908 fn legalize(&mut self, b: &mut LegalizeBuilder) {
2909 legalize_ext_instr(self, b);
2910 }
2911
encode(&self, e: &mut SM70Encoder<'_>)2912 fn encode(&self, e: &mut SM70Encoder<'_>) {
2913 e.set_opcode(0x920);
2914
2915 e.set_dst(self.dst);
2916 e.set_reg_src(24..32, self.offset);
2917
2918 e.set_field(40..50, self.access.addr);
2919 e.set_field(74..76, 0_u8); // comps
2920 assert!(!self.access.patch);
2921 e.set_bit(79, self.access.output);
2922 }
2923 }
2924
2925 impl SM70Op for OpALd {
legalize(&mut self, b: &mut LegalizeBuilder)2926 fn legalize(&mut self, b: &mut LegalizeBuilder) {
2927 legalize_ext_instr(self, b);
2928 }
2929
encode(&self, e: &mut SM70Encoder<'_>)2930 fn encode(&self, e: &mut SM70Encoder<'_>) {
2931 e.set_opcode(0x321);
2932
2933 e.set_dst(self.dst);
2934 e.set_reg_src(32..40, self.vtx);
2935 e.set_reg_src(24..32, self.offset);
2936
2937 e.set_field(40..50, self.access.addr);
2938 e.set_field(74..76, self.access.comps - 1);
2939 e.set_field(76..77, self.access.patch);
2940 e.set_field(77..78, self.access.phys);
2941 e.set_field(79..80, self.access.output);
2942 }
2943 }
2944
2945 impl SM70Op for OpASt {
legalize(&mut self, b: &mut LegalizeBuilder)2946 fn legalize(&mut self, b: &mut LegalizeBuilder) {
2947 legalize_ext_instr(self, b);
2948 }
2949
encode(&self, e: &mut SM70Encoder<'_>)2950 fn encode(&self, e: &mut SM70Encoder<'_>) {
2951 e.set_opcode(0x322);
2952
2953 e.set_reg_src(32..40, self.data);
2954 e.set_reg_src(64..72, self.vtx);
2955 e.set_reg_src(24..32, self.offset);
2956
2957 e.set_field(40..50, self.access.addr);
2958 e.set_field(74..76, self.access.comps - 1);
2959 e.set_field(76..77, self.access.patch);
2960 e.set_field(77..78, self.access.phys);
2961 assert!(self.access.output);
2962 }
2963 }
2964
2965 impl SM70Op for OpIpa {
legalize(&mut self, b: &mut LegalizeBuilder)2966 fn legalize(&mut self, b: &mut LegalizeBuilder) {
2967 legalize_ext_instr(self, b);
2968 }
2969
encode(&self, e: &mut SM70Encoder<'_>)2970 fn encode(&self, e: &mut SM70Encoder<'_>) {
2971 e.set_opcode(0x326);
2972
2973 e.set_dst(self.dst);
2974
2975 assert!(self.addr % 4 == 0);
2976 e.set_field(64..72, self.addr >> 2);
2977
2978 e.set_field(
2979 76..78,
2980 match self.loc {
2981 InterpLoc::Default => 0_u8,
2982 InterpLoc::Centroid => 1_u8,
2983 InterpLoc::Offset => 2_u8,
2984 },
2985 );
2986 e.set_field(
2987 78..80,
2988 match self.freq {
2989 InterpFreq::Pass => 0_u8,
2990 InterpFreq::Constant => 1_u8,
2991 InterpFreq::State => 2_u8,
2992 InterpFreq::PassMulW => {
2993 panic!("InterpFreq::PassMulW is invalid on SM70+");
2994 }
2995 },
2996 );
2997
2998 assert!(self.inv_w.is_zero());
2999 e.set_reg_src(32..40, self.offset);
3000
3001 // TODO: What is this for?
3002 e.set_pred_dst(81..84, Dst::None);
3003 }
3004 }
3005
3006 impl SM70Op for OpLdTram {
legalize(&mut self, b: &mut LegalizeBuilder)3007 fn legalize(&mut self, b: &mut LegalizeBuilder) {
3008 legalize_ext_instr(self, b);
3009 }
3010
encode(&self, e: &mut SM70Encoder<'_>)3011 fn encode(&self, e: &mut SM70Encoder<'_>) {
3012 e.set_opcode(0x3ad);
3013 e.set_dst(self.dst);
3014 e.set_ureg(24..32, RegRef::zero(RegFile::UGPR, 1));
3015
3016 assert!(self.addr % 4 == 0);
3017 e.set_field(64..72, self.addr >> 2);
3018
3019 e.set_bit(72, self.use_c);
3020
3021 // Unknown but required
3022 e.set_bit(91, true);
3023 }
3024 }
3025
3026 impl SM70Op for OpCCtl {
legalize(&mut self, b: &mut LegalizeBuilder)3027 fn legalize(&mut self, b: &mut LegalizeBuilder) {
3028 legalize_ext_instr(self, b);
3029 }
3030
encode(&self, e: &mut SM70Encoder<'_>)3031 fn encode(&self, e: &mut SM70Encoder<'_>) {
3032 assert!(matches!(self.mem_space, MemSpace::Global(_)));
3033 e.set_opcode(0x98f);
3034
3035 e.set_reg_src(24..32, self.addr);
3036 e.set_field(32..64, self.addr_offset);
3037
3038 e.set_field(
3039 87..91,
3040 match self.op {
3041 CCtlOp::PF1 => 0_u8,
3042 CCtlOp::PF2 => 1_u8,
3043 CCtlOp::WB => 2_u8,
3044 CCtlOp::IV => 3_u8,
3045 CCtlOp::IVAll => 4_u8,
3046 CCtlOp::RS => 5_u8,
3047 CCtlOp::IVAllP => 6_u8,
3048 CCtlOp::WBAll => 7_u8,
3049 CCtlOp::WBAllP => 8_u8,
3050 op => panic!("Unsupported cache control {op:?}"),
3051 },
3052 );
3053 }
3054 }
3055
3056 impl SM70Op for OpMemBar {
legalize(&mut self, _b: &mut LegalizeBuilder)3057 fn legalize(&mut self, _b: &mut LegalizeBuilder) {
3058 // Nothing to do
3059 }
3060
encode(&self, e: &mut SM70Encoder<'_>)3061 fn encode(&self, e: &mut SM70Encoder<'_>) {
3062 e.set_opcode(0x992);
3063
3064 e.set_bit(72, false); // !.MMIO
3065 e.set_field(
3066 76..79,
3067 match self.scope {
3068 MemScope::CTA => 0_u8,
3069 // SM => 1_u8,
3070 MemScope::GPU => 2_u8,
3071 MemScope::System => 3_u8,
3072 },
3073 );
3074 e.set_bit(80, false); // .SC
3075 }
3076 }
3077
3078 impl SM70Encoder<'_> {
set_rel_offset(&mut self, range: Range<usize>, label: &Label)3079 fn set_rel_offset(&mut self, range: Range<usize>, label: &Label) {
3080 let ip = u64::try_from(self.ip).unwrap();
3081 let ip = i64::try_from(ip).unwrap();
3082
3083 let target_ip = *self.labels.get(label).unwrap();
3084 let target_ip = u64::try_from(target_ip).unwrap();
3085 let target_ip = i64::try_from(target_ip).unwrap();
3086
3087 let rel_offset = target_ip - ip - 4;
3088
3089 self.set_field(range, rel_offset);
3090 }
3091 }
3092
3093 impl SM70Op for OpBClear {
legalize(&mut self, _b: &mut LegalizeBuilder)3094 fn legalize(&mut self, _b: &mut LegalizeBuilder) {
3095 // Nothing to do
3096 }
3097
encode(&self, e: &mut SM70Encoder<'_>)3098 fn encode(&self, e: &mut SM70Encoder<'_>) {
3099 e.set_opcode(0x355);
3100
3101 e.set_dst(Dst::None);
3102 e.set_bar_dst(24..28, self.dst);
3103
3104 e.set_bit(84, true); // .CLEAR
3105 }
3106 }
3107
3108 impl SM70Op for OpBMov {
legalize(&mut self, _b: &mut LegalizeBuilder)3109 fn legalize(&mut self, _b: &mut LegalizeBuilder) {
3110 // Nothing to do
3111 }
3112
encode(&self, e: &mut SM70Encoder<'_>)3113 fn encode(&self, e: &mut SM70Encoder<'_>) {
3114 if dst_is_bar(self.dst) {
3115 e.set_opcode(0x356);
3116
3117 e.set_bar_dst(24..28, self.dst);
3118 e.set_reg_src(32..40, self.src);
3119
3120 e.set_bit(84, self.clear);
3121 } else {
3122 e.set_opcode(0x355);
3123
3124 e.set_dst(self.dst);
3125 e.set_bar_src(24..28, self.src);
3126
3127 e.set_bit(84, self.clear);
3128 }
3129 }
3130 }
3131
3132 impl SM70Op for OpBreak {
legalize(&mut self, _b: &mut LegalizeBuilder)3133 fn legalize(&mut self, _b: &mut LegalizeBuilder) {
3134 // Nothing to do
3135 }
3136
encode(&self, e: &mut SM70Encoder<'_>)3137 fn encode(&self, e: &mut SM70Encoder<'_>) {
3138 e.set_opcode(0x942);
3139 assert!(self.bar_in.src_ref.as_reg() == self.bar_out.as_reg());
3140 e.set_bar_dst(16..20, self.bar_out);
3141 e.set_pred_src(87..90, 90, self.cond);
3142 }
3143 }
3144
3145 impl SM70Op for OpBSSy {
legalize(&mut self, _b: &mut LegalizeBuilder)3146 fn legalize(&mut self, _b: &mut LegalizeBuilder) {
3147 // Nothing to do
3148 }
3149
encode(&self, e: &mut SM70Encoder<'_>)3150 fn encode(&self, e: &mut SM70Encoder<'_>) {
3151 e.set_opcode(0x945);
3152 assert!(self.bar_in.src_ref.as_reg() == self.bar_out.as_reg());
3153 e.set_bar_dst(16..20, self.bar_out);
3154 e.set_rel_offset(34..64, &self.target);
3155 e.set_pred_src(87..90, 90, self.cond);
3156 }
3157 }
3158
3159 impl SM70Op for OpBSync {
legalize(&mut self, _b: &mut LegalizeBuilder)3160 fn legalize(&mut self, _b: &mut LegalizeBuilder) {
3161 // Nothing to do
3162 }
3163
encode(&self, e: &mut SM70Encoder<'_>)3164 fn encode(&self, e: &mut SM70Encoder<'_>) {
3165 e.set_opcode(0x941);
3166 e.set_bar_src(16..20, self.bar);
3167 e.set_pred_src(87..90, 90, self.cond);
3168 }
3169 }
3170
3171 impl SM70Op for OpBra {
legalize(&mut self, _b: &mut LegalizeBuilder)3172 fn legalize(&mut self, _b: &mut LegalizeBuilder) {
3173 // Nothing to do
3174 }
3175
encode(&self, e: &mut SM70Encoder<'_>)3176 fn encode(&self, e: &mut SM70Encoder<'_>) {
3177 e.set_opcode(0x947);
3178 e.set_rel_offset(34..82, &self.target);
3179 e.set_field(87..90, 0x7_u8); // TODO: Pred?
3180 }
3181 }
3182
3183 impl SM70Op for OpExit {
legalize(&mut self, _b: &mut LegalizeBuilder)3184 fn legalize(&mut self, _b: &mut LegalizeBuilder) {
3185 // Nothing to do
3186 }
3187
encode(&self, e: &mut SM70Encoder<'_>)3188 fn encode(&self, e: &mut SM70Encoder<'_>) {
3189 e.set_opcode(0x94d);
3190
3191 // ./.KEEPREFCOUNT/.PREEMPTED/.INVALID3
3192 e.set_field(84..85, false);
3193 e.set_field(85..86, false); // .NO_ATEXIT
3194 e.set_field(87..90, 0x7_u8); // TODO: Predicate
3195 e.set_field(90..91, false); // NOT
3196 }
3197 }
3198
3199 impl SM70Op for OpWarpSync {
legalize(&mut self, _b: &mut LegalizeBuilder)3200 fn legalize(&mut self, _b: &mut LegalizeBuilder) {
3201 // Nothing to do
3202 }
3203
encode(&self, e: &mut SM70Encoder<'_>)3204 fn encode(&self, e: &mut SM70Encoder<'_>) {
3205 e.encode_alu(0x148, None, None, Some(&Src::from(self.mask)), None);
3206 e.set_pred_src(87..90, 90, SrcRef::True.into());
3207 }
3208 }
3209
3210 impl SM70Op for OpBar {
legalize(&mut self, _b: &mut LegalizeBuilder)3211 fn legalize(&mut self, _b: &mut LegalizeBuilder) {
3212 // Nothing to do
3213 }
3214
encode(&self, e: &mut SM70Encoder<'_>)3215 fn encode(&self, e: &mut SM70Encoder<'_>) {
3216 e.set_opcode(0xb1d);
3217
3218 // e.set_opcode(0x31d);
3219
3220 // // src0 == src1
3221 // e.set_reg_src(32..40, SrcRef::Zero.into());
3222
3223 // // 00: RED.POPC
3224 // // 01: RED.AND
3225 // // 02: RED.OR
3226 // e.set_field(74..76, 0_u8);
3227
3228 // // 00: SYNC
3229 // // 01: ARV
3230 // // 02: RED
3231 // // 03: SCAN
3232 // e.set_field(77..79, 0_u8);
3233
3234 // e.set_pred_src(87..90, 90, SrcRef::True.into());
3235 }
3236 }
3237
3238 impl SM70Op for OpCS2R {
legalize(&mut self, _b: &mut LegalizeBuilder)3239 fn legalize(&mut self, _b: &mut LegalizeBuilder) {
3240 // Nothing to do
3241 }
3242
encode(&self, e: &mut SM70Encoder<'_>)3243 fn encode(&self, e: &mut SM70Encoder<'_>) {
3244 e.set_opcode(0x805);
3245 e.set_dst(self.dst);
3246 e.set_field(72..80, self.idx);
3247 e.set_bit(80, self.dst.as_reg().unwrap().comps() == 2); // .64
3248 }
3249 }
3250
3251 impl SM70Op for OpIsberd {
legalize(&mut self, _b: &mut LegalizeBuilder)3252 fn legalize(&mut self, _b: &mut LegalizeBuilder) {
3253 // Nothing to do
3254 }
3255
encode(&self, e: &mut SM70Encoder<'_>)3256 fn encode(&self, e: &mut SM70Encoder<'_>) {
3257 e.set_opcode(0x923);
3258 e.set_dst(self.dst);
3259 e.set_reg_src(24..32, self.idx);
3260 }
3261 }
3262
3263 impl SM70Op for OpKill {
legalize(&mut self, _b: &mut LegalizeBuilder)3264 fn legalize(&mut self, _b: &mut LegalizeBuilder) {
3265 // Nothing to do
3266 }
3267
encode(&self, e: &mut SM70Encoder<'_>)3268 fn encode(&self, e: &mut SM70Encoder<'_>) {
3269 e.set_opcode(0x95b);
3270 e.set_pred_src(87..90, 90, SrcRef::True.into());
3271 }
3272 }
3273
3274 impl SM70Op for OpNop {
legalize(&mut self, _b: &mut LegalizeBuilder)3275 fn legalize(&mut self, _b: &mut LegalizeBuilder) {
3276 // Nothing to do
3277 }
3278
encode(&self, e: &mut SM70Encoder<'_>)3279 fn encode(&self, e: &mut SM70Encoder<'_>) {
3280 e.set_opcode(0x918);
3281 }
3282 }
3283
3284 impl SM70Op for OpPixLd {
legalize(&mut self, _b: &mut LegalizeBuilder)3285 fn legalize(&mut self, _b: &mut LegalizeBuilder) {
3286 // Nothing to do
3287 }
3288
encode(&self, e: &mut SM70Encoder<'_>)3289 fn encode(&self, e: &mut SM70Encoder<'_>) {
3290 e.set_opcode(0x925);
3291 e.set_dst(self.dst);
3292 e.set_field(
3293 78..81,
3294 match &self.val {
3295 PixVal::MsCount => 0_u8,
3296 PixVal::CovMask => 1_u8,
3297 PixVal::CentroidOffset => 2_u8,
3298 PixVal::MyIndex => 3_u8,
3299 PixVal::InnerCoverage => 4_u8,
3300 other => panic!("Unsupported PixVal: {other}"),
3301 },
3302 );
3303 e.set_pred_dst(81..84, Dst::None);
3304 }
3305 }
3306
3307 impl SM70Op for OpS2R {
legalize(&mut self, _b: &mut LegalizeBuilder)3308 fn legalize(&mut self, _b: &mut LegalizeBuilder) {
3309 // Nothing to do
3310 }
3311
encode(&self, e: &mut SM70Encoder<'_>)3312 fn encode(&self, e: &mut SM70Encoder<'_>) {
3313 assert!(!self.is_uniform());
3314 e.set_opcode(if self.is_uniform() { 0x9c3 } else { 0x919 });
3315 e.set_dst(self.dst);
3316 e.set_field(72..80, self.idx);
3317 }
3318 }
3319
3320 impl SM70Op for OpOut {
legalize(&mut self, b: &mut LegalizeBuilder)3321 fn legalize(&mut self, b: &mut LegalizeBuilder) {
3322 let gpr = op_gpr(self);
3323 b.copy_alu_src_if_not_reg(&mut self.handle, gpr, SrcType::GPR);
3324 b.copy_alu_src_if_not_reg_or_imm(&mut self.stream, gpr, SrcType::ALU);
3325 }
3326
encode(&self, e: &mut SM70Encoder<'_>)3327 fn encode(&self, e: &mut SM70Encoder<'_>) {
3328 e.encode_alu(
3329 0x124,
3330 Some(&self.dst),
3331 Some(&self.handle),
3332 Some(&self.stream),
3333 None,
3334 );
3335
3336 e.set_field(
3337 78..80,
3338 match self.out_type {
3339 OutType::Emit => 1_u8,
3340 OutType::Cut => 2_u8,
3341 OutType::EmitThenCut => 3_u8,
3342 },
3343 );
3344 }
3345 }
3346
3347 impl SM70Op for OpOutFinal {
legalize(&mut self, b: &mut LegalizeBuilder)3348 fn legalize(&mut self, b: &mut LegalizeBuilder) {
3349 let gpr = op_gpr(self);
3350 b.copy_alu_src_if_not_reg(&mut self.handle, gpr, SrcType::GPR);
3351 }
3352
encode(&self, e: &mut SM70Encoder<'_>)3353 fn encode(&self, e: &mut SM70Encoder<'_>) {
3354 e.encode_alu(
3355 0x124,
3356 Some(&Dst::None),
3357 Some(&self.handle),
3358 Some(&Src::new_zero()),
3359 None,
3360 );
3361 }
3362 }
3363
3364 impl SM70Op for OpVote {
legalize(&mut self, b: &mut LegalizeBuilder)3365 fn legalize(&mut self, b: &mut LegalizeBuilder) {
3366 b.copy_src_if_upred(&mut self.pred);
3367 }
3368
encode(&self, e: &mut SM70Encoder<'_>)3369 fn encode(&self, e: &mut SM70Encoder<'_>) {
3370 if self.is_uniform() {
3371 e.set_opcode(0x886);
3372 e.set_udst(self.ballot);
3373 } else {
3374 e.set_opcode(0x806);
3375 e.set_dst(self.ballot);
3376 }
3377
3378 e.set_field(
3379 72..74,
3380 match self.op {
3381 VoteOp::All => 0_u8,
3382 VoteOp::Any => 1_u8,
3383 VoteOp::Eq => 2_u8,
3384 },
3385 );
3386
3387 e.set_pred_dst(81..84, self.vote);
3388 e.set_pred_src(87..90, 90, self.pred);
3389 }
3390 }
3391
// Expands to a `match` that unwraps an `Op` into the payload of whichever
// variant it holds.  Each arm yields a different concrete type, so the
// expansion is only usable where the expected type lets all arms coerce to a
// common one (the callers below return `&dyn SM70Op` / `&mut dyn SM70Op`).
// It is a macro so the same arm list serves both the shared and the mutable
// accessor.  Any variant not listed here panics.
macro_rules! as_sm70_op_match {
    ($instr_op: expr) => {
        match $instr_op {
            // Single-precision float
            Op::FAdd(inner) => inner,
            Op::FFma(inner) => inner,
            Op::FMnMx(inner) => inner,
            Op::FMul(inner) => inner,
            Op::FSet(inner) => inner,
            Op::FSetP(inner) => inner,
            Op::FSwzAdd(inner) => inner,
            // Double-precision float
            Op::DAdd(inner) => inner,
            Op::DFma(inner) => inner,
            Op::DMul(inner) => inner,
            Op::DSetP(inner) => inner,
            // Half-precision float
            Op::HAdd2(inner) => inner,
            Op::HFma2(inner) => inner,
            Op::HMul2(inner) => inner,
            Op::HSet2(inner) => inner,
            Op::HSetP2(inner) => inner,
            Op::HMnMx2(inner) => inner,
            Op::MuFu(inner) => inner,
            // Integer and bit manipulation
            Op::BMsk(inner) => inner,
            Op::BRev(inner) => inner,
            Op::Flo(inner) => inner,
            Op::IAbs(inner) => inner,
            Op::IAdd3(inner) => inner,
            Op::IAdd3X(inner) => inner,
            Op::IDp4(inner) => inner,
            Op::IMad(inner) => inner,
            Op::IMad64(inner) => inner,
            Op::IMnMx(inner) => inner,
            Op::ISetP(inner) => inner,
            Op::Lop3(inner) => inner,
            Op::PopC(inner) => inner,
            Op::Shf(inner) => inner,
            // Conversions
            Op::F2F(inner) => inner,
            Op::F2FP(inner) => inner,
            Op::F2I(inner) => inner,
            Op::I2F(inner) => inner,
            Op::FRnd(inner) => inner,
            // Data movement and predicates
            Op::Mov(inner) => inner,
            Op::Prmt(inner) => inner,
            Op::Sel(inner) => inner,
            Op::Shfl(inner) => inner,
            Op::PLop3(inner) => inner,
            Op::R2UR(inner) => inner,
            // Textures
            Op::Tex(inner) => inner,
            Op::Tld(inner) => inner,
            Op::Tld4(inner) => inner,
            Op::Tmml(inner) => inner,
            Op::Txd(inner) => inner,
            Op::Txq(inner) => inner,
            // Surfaces
            Op::SuLd(inner) => inner,
            Op::SuSt(inner) => inner,
            Op::SuAtom(inner) => inner,
            // Memory and attributes
            Op::Ld(inner) => inner,
            Op::Ldc(inner) => inner,
            Op::St(inner) => inner,
            Op::Atom(inner) => inner,
            Op::AL2P(inner) => inner,
            Op::ALd(inner) => inner,
            Op::ASt(inner) => inner,
            Op::Ipa(inner) => inner,
            Op::LdTram(inner) => inner,
            Op::CCtl(inner) => inner,
            Op::MemBar(inner) => inner,
            // Control flow and barriers
            Op::BClear(inner) => inner,
            Op::BMov(inner) => inner,
            Op::Break(inner) => inner,
            Op::BSSy(inner) => inner,
            Op::BSync(inner) => inner,
            Op::Bra(inner) => inner,
            Op::Exit(inner) => inner,
            Op::WarpSync(inner) => inner,
            Op::Bar(inner) => inner,
            // Miscellaneous
            Op::CS2R(inner) => inner,
            Op::Isberd(inner) => inner,
            Op::Kill(inner) => inner,
            Op::Nop(inner) => inner,
            Op::PixLd(inner) => inner,
            Op::S2R(inner) => inner,
            Op::Out(inner) => inner,
            Op::OutFinal(inner) => inner,
            Op::Vote(inner) => inner,
            _ => panic!("Unsupported op: {}", $instr_op),
        }
    };
}
3480
/// Borrows the payload of `op` as a `&dyn SM70Op`.
///
/// # Panics
///
/// Panics with "Unsupported op: ..." if `op` is a variant that
/// `as_sm70_op_match!` does not list.
fn as_sm70_op(op: &Op) -> &dyn SM70Op {
    as_sm70_op_match!(op)
}
3484
/// Mutably borrows the payload of `op` as a `&mut dyn SM70Op`.
///
/// # Panics
///
/// Panics with "Unsupported op: ..." if `op` is a variant that
/// `as_sm70_op_match!` does not list.
fn as_sm70_op_mut(op: &mut Op) -> &mut dyn SM70Op {
    as_sm70_op_match!(op)
}
3488
encode_sm70_shader(sm: &ShaderModel70, s: &Shader<'_>) -> Vec<u32>3489 fn encode_sm70_shader(sm: &ShaderModel70, s: &Shader<'_>) -> Vec<u32> {
3490 assert!(s.functions.len() == 1);
3491 let func = &s.functions[0];
3492
3493 let mut ip = 0_usize;
3494 let mut labels = HashMap::new();
3495 for b in &func.blocks {
3496 labels.insert(b.label, ip);
3497 for instr in &b.instrs {
3498 if let Op::Nop(op) = &instr.op {
3499 if let Some(label) = op.label {
3500 labels.insert(label, ip);
3501 }
3502 }
3503 ip += 4;
3504 }
3505 }
3506
3507 let mut encoded = Vec::new();
3508 for b in &func.blocks {
3509 for instr in &b.instrs {
3510 let mut e = SM70Encoder {
3511 sm,
3512 ip: encoded.len(),
3513 labels: &labels,
3514 inst: [0_u32; 4],
3515 };
3516 as_sm70_op(&instr.op).encode(&mut e);
3517 e.set_pred(&instr.pred);
3518 e.set_instr_deps(&instr.deps);
3519 encoded.extend_from_slice(&e.inst[..]);
3520 }
3521 }
3522 encoded
3523 }
3524