xref: /aosp_15_r20/external/mesa3d/src/nouveau/compiler/nak/sm70.rs (revision 6104692788411f58d303aa86923a9ff6ecaded22)
1 // Copyright © 2022 Collabora, Ltd.
2 // SPDX-License-Identifier: MIT
3 
4 use crate::ir::*;
5 use crate::legalize::{
6     src_is_reg, src_is_upred_reg, swap_srcs_if_not_reg, LegalizeBuildHelpers,
7     LegalizeBuilder,
8 };
9 use bitview::*;
10 
11 use std::collections::HashMap;
12 use std::ops::Range;
13 
/// Shader model description for Volta (SM70) and later hardware.
pub struct ShaderModel70 {
    /// SM version number; `new` guarantees it is at least 70.
    sm: u8,
}

impl ShaderModel70 {
    /// Creates a shader model for SM version `sm`.
    ///
    /// # Panics
    ///
    /// Panics if `sm` is below 70.
    pub fn new(sm: u8) -> Self {
        assert!(sm >= 70);
        Self { sm }
    }

    /// Returns whether this SM version has the uniform ALU, i.e. whether
    /// it is SM75 or newer.
    fn has_uniform_alu(&self) -> bool {
        // First SM version for which the uniform files are non-empty
        const FIRST_UNIFORM_SM: u8 = 75;
        self.sm >= FIRST_UNIFORM_SM
    }
}
28 
29 impl ShaderModel for ShaderModel70 {
sm(&self) -> u830     fn sm(&self) -> u8 {
31         self.sm
32     }
33 
num_regs(&self, file: RegFile) -> u3234     fn num_regs(&self, file: RegFile) -> u32 {
35         match file {
36             RegFile::GPR => {
37                 // Volta+ has a maximum of 253 registers.  Presumably
38                 // because two registers get burned for UGPRs? Unclear
39                 // on why we need it on Volta though.
40                 253
41             }
42             RegFile::UGPR => {
43                 if self.has_uniform_alu() {
44                     63
45                 } else {
46                     0
47                 }
48             }
49             RegFile::Pred => 7,
50             RegFile::UPred => {
51                 if self.has_uniform_alu() {
52                     7
53                 } else {
54                     0
55                 }
56             }
57             RegFile::Carry => 0,
58             RegFile::Bar => 16,
59             RegFile::Mem => RegRef::MAX_IDX + 1,
60         }
61     }
62 
crs_size(&self, max_crs_depth: u32) -> u3263     fn crs_size(&self, max_crs_depth: u32) -> u32 {
64         assert!(max_crs_depth == 0);
65         0
66     }
67 
op_can_be_uniform(&self, op: &Op) -> bool68     fn op_can_be_uniform(&self, op: &Op) -> bool {
69         if !self.has_uniform_alu() {
70             return false;
71         }
72 
73         match op {
74             Op::R2UR(_)
75             | Op::S2R(_)
76             | Op::BMsk(_)
77             | Op::BRev(_)
78             | Op::Flo(_)
79             | Op::IAdd3(_)
80             | Op::IAdd3X(_)
81             | Op::IMad(_)
82             | Op::IMad64(_)
83             | Op::ISetP(_)
84             | Op::Lop3(_)
85             | Op::Mov(_)
86             | Op::PLop3(_)
87             | Op::PopC(_)
88             | Op::Prmt(_)
89             | Op::PSetP(_)
90             | Op::Sel(_)
91             | Op::Shf(_)
92             | Op::Shl(_)
93             | Op::Shr(_)
94             | Op::Vote(_)
95             | Op::Copy(_)
96             | Op::Pin(_)
97             | Op::Unpin(_) => true,
98             Op::Ldc(op) => op.offset.is_zero(),
99             // UCLEA  USHL  USHR
100             _ => false,
101         }
102     }
103 
legalize_op(&self, b: &mut LegalizeBuilder, op: &mut Op)104     fn legalize_op(&self, b: &mut LegalizeBuilder, op: &mut Op) {
105         as_sm70_op_mut(op).legalize(b);
106     }
107 
encode_shader(&self, s: &Shader<'_>) -> Vec<u32>108     fn encode_shader(&self, s: &Shader<'_>) -> Vec<u32> {
109         encode_sm70_shader(self, s)
110     }
111 }
112 
/// A per-op trait that implements Volta+ opcode semantics
///
/// Each supported op provides the two steps this file needs: rewriting its
/// sources into a form the encoder accepts, and writing its bits.
trait SM70Op {
    /// Rewrites this op in place, using `b` to insert any helper
    /// instructions, so that `encode` can handle it.
    fn legalize(&mut self, b: &mut LegalizeBuilder);
    /// Writes this op's fields into the instruction held by `e`.
    fn encode(&self, e: &mut SM70Encoder<'_>);
}
118 
/// Encoder state for a single Volta+ instruction.
struct SM70Encoder<'a> {
    /// Shader model being targeted; consulted for SM-version checks.
    sm: &'a ShaderModel70,
    /// Presumably the position of the instruction being encoded.
    /// TODO confirm against the code that constructs the encoder.
    ip: usize,
    /// Presumably maps each label to its position in the shader.
    /// TODO confirm against the code that constructs the encoder.
    labels: &'a HashMap<Label, usize>,
    /// The four 32-bit words (128 bits) the instruction is assembled into.
    inst: [u32; 4],
}
125 
126 impl BitViewable for SM70Encoder<'_> {
bits(&self) -> usize127     fn bits(&self) -> usize {
128         BitView::new(&self.inst).bits()
129     }
130 
get_bit_range_u64(&self, range: Range<usize>) -> u64131     fn get_bit_range_u64(&self, range: Range<usize>) -> u64 {
132         BitView::new(&self.inst).get_bit_range_u64(range)
133     }
134 }
135 
136 impl BitMutViewable for SM70Encoder<'_> {
set_bit_range_u64(&mut self, range: Range<usize>, val: u64)137     fn set_bit_range_u64(&mut self, range: Range<usize>, val: u64) {
138         BitMutView::new(&mut self.inst).set_bit_range_u64(range, val);
139     }
140 }
141 
142 impl SetFieldU64 for SM70Encoder<'_> {
set_field_u64(&mut self, range: Range<usize>, val: u64)143     fn set_field_u64(&mut self, range: Range<usize>, val: u64) {
144         BitMutView::new(&mut self.inst).set_field_u64(range, val);
145     }
146 }
147 
148 impl SM70Encoder<'_> {
set_opcode(&mut self, opcode: u16)149     fn set_opcode(&mut self, opcode: u16) {
150         self.set_field(0..12, opcode);
151     }
152 
set_reg(&mut self, range: Range<usize>, reg: RegRef)153     fn set_reg(&mut self, range: Range<usize>, reg: RegRef) {
154         assert!(range.len() == 8);
155         assert!(reg.file() == RegFile::GPR);
156         self.set_field(range, reg.base_idx());
157     }
158 
set_ureg(&mut self, range: Range<usize>, reg: RegRef)159     fn set_ureg(&mut self, range: Range<usize>, reg: RegRef) {
160         assert!(self.sm.sm >= 75);
161         assert!(range.len() == 8);
162         assert!(reg.file() == RegFile::UGPR);
163         assert!(reg.base_idx() <= 63);
164         self.set_field(range, reg.base_idx());
165     }
166 
set_pred_reg(&mut self, range: Range<usize>, reg: RegRef)167     fn set_pred_reg(&mut self, range: Range<usize>, reg: RegRef) {
168         assert!(range.len() == 3);
169         assert!(reg.base_idx() <= 7);
170         assert!(reg.comps() == 1);
171         self.set_field(range, reg.base_idx());
172     }
173 
set_reg_src(&mut self, range: Range<usize>, src: Src)174     fn set_reg_src(&mut self, range: Range<usize>, src: Src) {
175         assert!(src.src_mod.is_none());
176         match src.src_ref {
177             SrcRef::Zero => self.set_reg(range, RegRef::zero(RegFile::GPR, 1)),
178             SrcRef::Reg(reg) => self.set_reg(range, reg),
179             _ => panic!("Not a register"),
180         }
181     }
182 
set_pred_dst(&mut self, range: Range<usize>, dst: Dst)183     fn set_pred_dst(&mut self, range: Range<usize>, dst: Dst) {
184         match dst {
185             Dst::None => {
186                 self.set_pred_reg(range, RegRef::zero(RegFile::Pred, 1));
187             }
188             Dst::Reg(reg) => self.set_pred_reg(range, reg),
189             _ => panic!("Not a register"),
190         }
191     }
192 
set_pred_src_file( &mut self, range: Range<usize>, not_bit: usize, src: Src, file: RegFile, )193     fn set_pred_src_file(
194         &mut self,
195         range: Range<usize>,
196         not_bit: usize,
197         src: Src,
198         file: RegFile,
199     ) {
200         // The default for predicates is true
201         let true_reg = RegRef::new(file, 7, 1);
202 
203         let (not, reg) = match src.src_ref {
204             SrcRef::True => (false, true_reg),
205             SrcRef::False => (true, true_reg),
206             SrcRef::Reg(reg) => {
207                 assert!(reg.file() == file);
208                 (false, reg)
209             }
210             _ => panic!("Not a register"),
211         };
212         self.set_pred_reg(range, reg);
213         self.set_bit(not_bit, not ^ src_mod_is_bnot(src.src_mod));
214     }
215 
set_pred_src(&mut self, range: Range<usize>, not_bit: usize, src: Src)216     fn set_pred_src(&mut self, range: Range<usize>, not_bit: usize, src: Src) {
217         self.set_pred_src_file(range, not_bit, src, RegFile::Pred);
218     }
219 
set_upred_src(&mut self, range: Range<usize>, not_bit: usize, src: Src)220     fn set_upred_src(&mut self, range: Range<usize>, not_bit: usize, src: Src) {
221         self.set_pred_src_file(range, not_bit, src, RegFile::UPred);
222     }
223 
set_src_cb(&mut self, range: Range<usize>, cx_bit: usize, cb: &CBufRef)224     fn set_src_cb(&mut self, range: Range<usize>, cx_bit: usize, cb: &CBufRef) {
225         let mut v = BitMutView::new_subset(self, range);
226         v.set_field(6..22, cb.offset);
227         match cb.buf {
228             CBuf::Binding(idx) => {
229                 v.set_field(22..27, idx);
230                 self.set_bit(cx_bit, false);
231             }
232             CBuf::BindlessUGPR(reg) => {
233                 assert!(reg.base_idx() <= 63);
234                 assert!(reg.file() == RegFile::UGPR);
235                 v.set_field(0..6, reg.base_idx());
236                 self.set_bit(cx_bit, true);
237             }
238             CBuf::BindlessSSA(_) => panic!("SSA values must be lowered"),
239         }
240     }
241 
set_pred(&mut self, pred: &Pred)242     fn set_pred(&mut self, pred: &Pred) {
243         assert!(!pred.is_false());
244         self.set_pred_reg(
245             12..15,
246             match pred.pred_ref {
247                 PredRef::None => RegRef::zero(RegFile::Pred, 1),
248                 PredRef::Reg(reg) => reg,
249                 PredRef::SSA(_) => panic!("SSA values must be lowered"),
250             },
251         );
252         self.set_bit(15, pred.pred_inv);
253     }
254 
set_dst(&mut self, dst: Dst)255     fn set_dst(&mut self, dst: Dst) {
256         match dst {
257             Dst::None => self.set_reg(16..24, RegRef::zero(RegFile::GPR, 1)),
258             Dst::Reg(reg) => self.set_reg(16..24, reg),
259             _ => panic!("Not a register"),
260         }
261     }
262 
set_udst(&mut self, dst: Dst)263     fn set_udst(&mut self, dst: Dst) {
264         match dst {
265             Dst::None => self.set_ureg(16..24, RegRef::zero(RegFile::UGPR, 1)),
266             Dst::Reg(reg) => self.set_ureg(16..24, reg),
267             _ => panic!("Not a register"),
268         }
269     }
270 
set_bar_reg(&mut self, range: Range<usize>, reg: RegRef)271     fn set_bar_reg(&mut self, range: Range<usize>, reg: RegRef) {
272         assert!(range.len() == 4);
273         assert!(reg.file() == RegFile::Bar);
274         assert!(reg.comps() == 1);
275         self.set_field(range, reg.base_idx());
276     }
277 
set_bar_dst(&mut self, range: Range<usize>, dst: Dst)278     fn set_bar_dst(&mut self, range: Range<usize>, dst: Dst) {
279         self.set_bar_reg(range, *dst.as_reg().unwrap());
280     }
281 
set_bar_src(&mut self, range: Range<usize>, src: Src)282     fn set_bar_src(&mut self, range: Range<usize>, src: Src) {
283         assert!(src.src_mod.is_none());
284         self.set_bar_reg(range, *src.src_ref.as_reg().unwrap());
285     }
286 
set_instr_deps(&mut self, deps: &InstrDeps)287     fn set_instr_deps(&mut self, deps: &InstrDeps) {
288         self.set_field(105..109, deps.delay);
289         self.set_bit(109, deps.yld);
290         self.set_field(110..113, deps.wr_bar().unwrap_or(7));
291         self.set_field(113..116, deps.rd_bar().unwrap_or(7));
292         self.set_field(116..122, deps.wt_bar_mask);
293         self.set_field(122..126, deps.reuse_mask);
294     }
295 }
296 
297 //
298 // Helpers for encoding of ALU instructions
299 //
300 
/// A register ALU source together with its decoded source modifiers.
struct ALURegRef {
    /// The register being read
    pub reg: RegRef,
    /// Whether the source modifier includes an absolute value
    pub abs: bool,
    /// Whether the source modifier includes a negation (or bitwise not)
    pub neg: bool,
    /// Swizzle applied to the source; only encodable by fp16 ALU forms
    pub swizzle: SrcSwizzle,
}
307 
/// A constant buffer ALU source together with its decoded source modifiers.
struct ALUCBufRef {
    /// The constant buffer location being read
    pub cb: CBufRef,
    /// Whether the source modifier includes an absolute value
    pub abs: bool,
    /// Whether the source modifier includes a negation (or bitwise not)
    pub neg: bool,
    /// Swizzle applied to the source; only encodable by fp16 ALU forms
    pub swizzle: SrcSwizzle,
}
314 
/// An ALU source classified by how it has to be encoded.
enum ALUSrc {
    /// The source slot is unused
    None,
    /// A 32-bit immediate
    Imm32(u32),
    /// A register from the op's own file: a GPR for regular ops, a UGPR
    /// for uniform ops
    Reg(ALURegRef),
    /// A UGPR read by a non-uniform op
    UReg(ALURegRef),
    /// A constant buffer reference
    CBuf(ALUCBufRef),
}
322 
src_is_zero_or_gpr(src: &Src) -> bool323 fn src_is_zero_or_gpr(src: &Src) -> bool {
324     match src.src_ref {
325         SrcRef::Zero => true,
326         SrcRef::Reg(reg) => reg.file() == RegFile::GPR,
327         _ => false,
328     }
329 }
330 
src_mod_has_abs(src_mod: SrcMod) -> bool331 fn src_mod_has_abs(src_mod: SrcMod) -> bool {
332     match src_mod {
333         SrcMod::None | SrcMod::FNeg | SrcMod::INeg | SrcMod::BNot => false,
334         SrcMod::FAbs | SrcMod::FNegAbs => true,
335     }
336 }
337 
src_mod_has_neg(src_mod: SrcMod) -> bool338 fn src_mod_has_neg(src_mod: SrcMod) -> bool {
339     match src_mod {
340         SrcMod::None | SrcMod::FAbs => false,
341         SrcMod::FNeg | SrcMod::FNegAbs | SrcMod::INeg | SrcMod::BNot => true,
342     }
343 }
344 
src_mod_is_bnot(src_mod: SrcMod) -> bool345 fn src_mod_is_bnot(src_mod: SrcMod) -> bool {
346     match src_mod {
347         SrcMod::None => false,
348         SrcMod::BNot => true,
349         _ => panic!("Not an predicate source modifier"),
350     }
351 }
352 
dst_is_bar(dst: Dst) -> bool353 fn dst_is_bar(dst: Dst) -> bool {
354     match dst {
355         Dst::None => false,
356         Dst::SSA(ssa) => ssa.file().unwrap() == RegFile::Bar,
357         Dst::Reg(reg) => reg.file() == RegFile::Bar,
358     }
359 }
360 
361 impl ALUSrc {
from_src(src: Option<&Src>, op_is_uniform: bool) -> ALUSrc362     fn from_src(src: Option<&Src>, op_is_uniform: bool) -> ALUSrc {
363         let Some(src) = src else {
364             return ALUSrc::None;
365         };
366 
367         match src.src_ref {
368             SrcRef::Zero | SrcRef::Reg(_) => {
369                 let reg = match src.src_ref {
370                     SrcRef::Zero => {
371                         let file = if op_is_uniform {
372                             RegFile::UGPR
373                         } else {
374                             RegFile::GPR
375                         };
376                         RegRef::zero(file, 1)
377                     }
378                     SrcRef::Reg(reg) => reg,
379                     _ => panic!("Invalid source ref"),
380                 };
381                 assert!(reg.comps() <= 2);
382                 let alu_ref = ALURegRef {
383                     reg: reg,
384                     abs: src_mod_has_abs(src.src_mod),
385                     neg: src_mod_has_neg(src.src_mod),
386                     swizzle: src.src_swizzle,
387                 };
388                 if op_is_uniform {
389                     assert!(reg.file() == RegFile::UGPR);
390                     ALUSrc::Reg(alu_ref)
391                 } else {
392                     match reg.file() {
393                         RegFile::GPR => ALUSrc::Reg(alu_ref),
394                         RegFile::UGPR => ALUSrc::UReg(alu_ref),
395                         _ => panic!("Invalid ALU register file"),
396                     }
397                 }
398             }
399             SrcRef::Imm32(i) => {
400                 assert!(src.src_mod.is_none());
401                 assert!(src.src_swizzle.is_none());
402                 ALUSrc::Imm32(i)
403             }
404             SrcRef::CBuf(cb) => {
405                 let alu_ref = ALUCBufRef {
406                     cb: cb,
407                     abs: src_mod_has_abs(src.src_mod),
408                     neg: src_mod_has_neg(src.src_mod),
409                     swizzle: src.src_swizzle,
410                 };
411                 ALUSrc::CBuf(alu_ref)
412             }
413             _ => panic!("Invalid ALU source"),
414         }
415     }
416 
has_src_mod(&self) -> bool417     pub fn has_src_mod(&self) -> bool {
418         match self {
419             ALUSrc::Reg(reg) | ALUSrc::UReg(reg) => reg.abs || reg.neg,
420             ALUSrc::CBuf(cb) => cb.abs || cb.neg,
421             _ => false,
422         }
423     }
424 }
425 
/// ALU encoding helpers.
///
/// An ALU instruction has up to three sources.  src0 always lives in bits
/// 24..32.  Of the other two, one goes in the register slot at bits 64..72
/// and the other in the flexible slot starting at bit 32, which can hold a
/// register, uniform register, immediate or constant buffer reference.  The
/// 3-bit "form" in bits 9..12 records which arrangement was used.
impl SM70Encoder<'_> {
    /// Writes the 2-bit encoding of `swizzle` into `range`: 0 for `None`,
    /// 2 for `Xx` and 3 for `Yy`.
    ///
    /// Panics if `range` is not 2 bits wide.
    fn set_swizzle(&mut self, range: Range<usize>, swizzle: SrcSwizzle) {
        assert!(range.len() == 2);

        self.set_field(
            range,
            match swizzle {
                SrcSwizzle::None => 0x00_u8,
                SrcSwizzle::Xx => 0x02_u8,
                SrcSwizzle::Yy => 0x03_u8,
            },
        );
    }

    /// Writes the register source `reg` and, where supported, its modifiers
    /// and swizzle.
    ///
    /// * `range` - 8-bit field receiving the register index
    /// * `abs_bit`, `neg_bit` - modifier bits, written only if `has_mod`
    /// * `swizzle_range` - swizzle field, written only if `is_fp16_alu`
    /// * `file` - register file `reg` must belong to (GPR or UGPR)
    ///
    /// Panics if `reg` carries a modifier while `has_mod` is false, or a
    /// swizzle while `is_fp16_alu` is false.
    fn set_alu_reg(
        &mut self,
        range: Range<usize>,
        abs_bit: usize,
        neg_bit: usize,
        swizzle_range: Range<usize>,
        file: RegFile,
        is_fp16_alu: bool,
        has_mod: bool,
        reg: &ALURegRef,
    ) {
        match file {
            RegFile::GPR => self.set_reg(range, reg.reg),
            RegFile::UGPR => self.set_ureg(range, reg.reg),
            _ => panic!("Invalid ALU src register file"),
        }

        if has_mod {
            self.set_bit(abs_bit, reg.abs);
            self.set_bit(neg_bit, reg.neg);
        } else {
            // The modifier bits are not available in this form, so the
            // source must not need them.
            assert!(!reg.abs && !reg.neg);
        }

        if is_fp16_alu {
            self.set_swizzle(swizzle_range, reg.swizzle);
        } else {
            assert!(reg.swizzle == SrcSwizzle::None);
        }
    }

    /// Encodes src0: register in bits 24..32, abs in bit 73, neg in bit 72
    /// and, for fp16 ops, the swizzle in bits 74..76.
    ///
    /// Does nothing for `ALUSrc::None`; panics for anything that is not a
    /// register of the op's own file.
    fn encode_alu_src0(
        &mut self,
        src: &ALUSrc,
        file: RegFile,
        is_fp16_alu: bool,
    ) {
        let reg = match src {
            ALUSrc::None => return,
            ALUSrc::Reg(reg) => reg,
            _ => panic!("Invalid ALU src"),
        };
        self.set_alu_reg(24..32, 73, 72, 74..76, file, is_fp16_alu, true, reg);
    }

    /// Encodes the source that goes in the register slot at bits 64..72,
    /// with abs in bit 74, neg in bit 75 and, for fp16 ops, the swizzle in
    /// bits 81..83.
    ///
    /// `bit74_75_are_mod` says whether bits 74 and 75 are available as
    /// modifier bits; when it is false the source must be unmodified.
    /// Does nothing for `ALUSrc::None`; panics for anything that is not a
    /// register of the op's own file.
    fn encode_alu_src2(
        &mut self,
        src: &ALUSrc,
        file: RegFile,
        is_fp16_alu: bool,
        bit74_75_are_mod: bool,
    ) {
        let reg = match src {
            ALUSrc::None => return,
            ALUSrc::Reg(reg) => reg,
            _ => panic!("Invalid ALU src"),
        };
        self.set_alu_reg(
            64..72,
            74,
            75,
            81..83,
            file,
            is_fp16_alu,
            bit74_75_are_mod,
            reg,
        );
    }

    /// Encodes a GPR in the flexible slot: register in bits 32..40, abs in
    /// bit 62, neg in bit 63 and, for fp16 ops, the swizzle in bits 60..62.
    fn encode_alu_reg(&mut self, reg: &ALURegRef, is_fp16_alu: bool) {
        self.set_alu_reg(
            32..40,
            62,
            63,
            60..62,
            RegFile::GPR,
            is_fp16_alu,
            true,
            reg,
        );
    }

    /// Encodes a UGPR in the flexible slot, using the same bit positions as
    /// `encode_alu_reg`, and additionally sets bit 91.
    fn encode_alu_ureg(&mut self, reg: &ALURegRef, is_fp16_alu: bool) {
        self.set_ureg(32..40, reg.reg);
        self.set_bit(62, reg.abs);
        self.set_bit(63, reg.neg);

        if is_fp16_alu {
            self.set_swizzle(60..62, reg.swizzle);
        } else {
            assert!(reg.swizzle == SrcSwizzle::None);
        }

        self.set_bit(91, true);
    }

    /// Encodes a 32-bit immediate in the flexible slot, bits 32..64.
    fn encode_alu_imm(&mut self, imm: &u32) {
        self.set_field(32..64, *imm);
    }

    /// Encodes a constant buffer reference in the flexible slot: the
    /// reference in bits 32..59 with bit 91 as its bindless flag, abs in
    /// bit 62, neg in bit 63 and, for fp16 ops, the swizzle in bits 60..62.
    fn encode_alu_cb(&mut self, cb: &ALUCBufRef, is_fp16_alu: bool) {
        self.set_src_cb(32..59, 91, &cb.cb);
        self.set_bit(62, cb.abs);
        self.set_bit(63, cb.neg);

        if is_fp16_alu {
            self.set_swizzle(60..62, cb.swizzle);
        } else {
            assert!(cb.swizzle == SrcSwizzle::None);
        }
    }

    /// Encodes a non-uniform ALU instruction: destination, up to three
    /// sources, the 9-bit `opcode` in bits 0..9 and the form in bits 9..12.
    ///
    /// If `src2` is absent or a GPR it takes the 64..72 register slot and
    /// `src1` takes the flexible slot.  Otherwise `src2` takes the flexible
    /// slot and `src1`, which must then be absent or a GPR, takes the
    /// register slot.
    fn encode_alu_base(
        &mut self,
        opcode: u16,
        dst: Option<&Dst>,
        src0: Option<&Src>,
        src1: Option<&Src>,
        src2: Option<&Src>,
        is_fp16_alu: bool,
    ) {
        if let Some(dst) = dst {
            self.set_dst(*dst);
        }

        let src0 = ALUSrc::from_src(src0, false);
        let src1 = ALUSrc::from_src(src1, false);
        let src2 = ALUSrc::from_src(src2, false);

        // Bits 74..76 are used both for the swizzle on src0 and for the source
        // modifier for the register source of src1 and src2.  When both are
        // registers, it's used for src2.  The hardware elects to always support
        // a swizzle and not support source modifiers in that case.
        let bit74_75_are_mod = !is_fp16_alu
            || matches!(src1, ALUSrc::None)
            || matches!(src2, ALUSrc::None);
        // NOTE(review): this checks src0, whose abs/neg bits are 72/73 and
        // are always written.  The source that loses bits 74/75 is the one
        // placed in the 64..72 slot (set_alu_reg asserts that one is
        // unmodified) — confirm which source was intended here.
        debug_assert!(bit74_75_are_mod || !src0.has_src_mod());

        self.encode_alu_src0(&src0, RegFile::GPR, is_fp16_alu);

        let form = match &src2 {
            // src2 fits the register slot; the form depends on src1 alone
            ALUSrc::None | ALUSrc::Reg(_) => {
                self.encode_alu_src2(
                    &src2,
                    RegFile::GPR,
                    is_fp16_alu,
                    bit74_75_are_mod,
                );
                match &src1 {
                    ALUSrc::None => 1_u8, // form
                    ALUSrc::Reg(reg1) => {
                        self.encode_alu_reg(reg1, is_fp16_alu);
                        1_u8 // form
                    }
                    ALUSrc::UReg(reg1) => {
                        self.encode_alu_ureg(reg1, is_fp16_alu);
                        6_u8 // form
                    }
                    ALUSrc::Imm32(imm1) => {
                        self.encode_alu_imm(imm1);
                        4_u8 // form
                    }
                    ALUSrc::CBuf(cb1) => {
                        self.encode_alu_cb(cb1, is_fp16_alu);
                        5_u8 // form
                    }
                }
            }
            // In the remaining cases src2 needs the flexible slot, so src1
            // is moved to the register slot at 64..72.
            ALUSrc::UReg(reg2) => {
                self.encode_alu_ureg(reg2, is_fp16_alu);
                self.encode_alu_src2(
                    &src1,
                    RegFile::GPR,
                    is_fp16_alu,
                    bit74_75_are_mod,
                );
                7_u8 // form
            }
            ALUSrc::Imm32(imm2) => {
                self.encode_alu_imm(imm2);
                self.encode_alu_src2(
                    &src1,
                    RegFile::GPR,
                    is_fp16_alu,
                    bit74_75_are_mod,
                );
                2_u8 // form
            }
            ALUSrc::CBuf(cb2) => {
                // TODO set_src_cx
                self.encode_alu_cb(cb2, is_fp16_alu);
                self.encode_alu_src2(
                    &src1,
                    RegFile::GPR,
                    is_fp16_alu,
                    bit74_75_are_mod,
                );
                3_u8 // form
            }
        };

        self.set_field(0..9, opcode);
        self.set_field(9..12, form);
    }

    /// Encodes a regular (non-fp16) ALU instruction; see `encode_alu_base`.
    fn encode_alu(
        &mut self,
        opcode: u16,
        dst: Option<&Dst>,
        src0: Option<&Src>,
        src1: Option<&Src>,
        src2: Option<&Src>,
    ) {
        self.encode_alu_base(opcode, dst, src0, src1, src2, false);
    }

    /// Encodes an fp16 ALU instruction, which may carry source swizzles;
    /// see `encode_alu_base`.
    fn encode_fp16_alu(
        &mut self,
        opcode: u16,
        dst: Option<&Dst>,
        src0: Option<&Src>,
        src1: Option<&Src>,
        src2: Option<&Src>,
    ) {
        self.encode_alu_base(opcode, dst, src0, src1, src2, true);
    }

    /// Encodes a uniform ALU instruction.
    ///
    /// This mirrors `encode_alu_base` but with a UGPR destination and UGPR
    /// sources.  Only register and immediate sources are accepted; constant
    /// buffer sources panic.  Swizzles are never encoded.
    fn encode_ualu(
        &mut self,
        opcode: u16,
        dst: Option<&Dst>,
        src0: Option<&Src>,
        src1: Option<&Src>,
        src2: Option<&Src>,
    ) {
        if let Some(dst) = dst {
            self.set_udst(*dst);
        }

        // With op_is_uniform set, from_src returns UGPRs as ALUSrc::Reg
        // and never produces ALUSrc::UReg.
        let src0 = ALUSrc::from_src(src0, true);
        let src1 = ALUSrc::from_src(src1, true);
        let src2 = ALUSrc::from_src(src2, true);

        // All uniform ALU requires bit 91 set
        self.set_bit(91, true);

        self.encode_alu_src0(&src0, RegFile::UGPR, false);
        let form = match &src2 {
            ALUSrc::None | ALUSrc::Reg(_) => {
                self.encode_alu_src2(&src2, RegFile::UGPR, false, true);
                match &src1 {
                    ALUSrc::None => 1_u8, // form
                    ALUSrc::Reg(reg1) => {
                        self.encode_alu_ureg(reg1, false);
                        1_u8 // form
                    }
                    ALUSrc::UReg(_) => panic!("UALU never has UReg"),
                    ALUSrc::Imm32(imm1) => {
                        self.encode_alu_imm(imm1);
                        4_u8 // form
                    }
                    ALUSrc::CBuf(_) => panic!("UALU does not support cbufs"),
                }
            }
            ALUSrc::UReg(_) => panic!("UALU never has UReg"),
            ALUSrc::Imm32(imm2) => {
                // The immediate takes the flexible slot, so src1 moves to
                // the register slot at 64..72.
                self.encode_alu_imm(imm2);
                self.encode_alu_src2(&src1, RegFile::UGPR, false, true);
                2_u8 // form
            }
            ALUSrc::CBuf(_) => panic!("UALU does not support cbufs"),
        };

        self.set_field(0..9, opcode);
        self.set_field(9..12, form);
    }

    /// Writes the 2-bit encoding of `rnd_mode` into `range`: 0 for
    /// `NearestEven`, 1 for `NegInf`, 2 for `PosInf` and 3 for `Zero`.
    ///
    /// Panics if `range` is not 2 bits wide.
    fn set_rnd_mode(&mut self, range: Range<usize>, rnd_mode: FRndMode) {
        assert!(range.len() == 2);
        self.set_field(
            range,
            match rnd_mode {
                FRndMode::NearestEven => 0_u8,
                FRndMode::NegInf => 1_u8,
                FRndMode::PosInf => 2_u8,
                FRndMode::Zero => 3_u8,
            },
        );
    }
}
730 
731 //
732 // Legalization helpers
733 //
734 
op_gpr(op: &impl DstsAsSlice) -> RegFile735 fn op_gpr(op: &impl DstsAsSlice) -> RegFile {
736     if op.is_uniform() {
737         RegFile::UGPR
738     } else {
739         RegFile::GPR
740     }
741 }
742 
743 /// Helper to legalize extended or external instructions
744 ///
745 /// These are instructions which reach out external units such as load/store
746 /// and texture ops.  They typically can't take anything but GPRs and are the
747 /// only types of instructions that support vectors.  They also can never be
748 /// uniform so we always evict uniform sources.
749 ///
legalize_ext_instr(op: &mut impl SrcsAsSlice, b: &mut LegalizeBuilder)750 fn legalize_ext_instr(op: &mut impl SrcsAsSlice, b: &mut LegalizeBuilder) {
751     let src_types = op.src_types();
752     for (i, src) in op.srcs_as_mut_slice().iter_mut().enumerate() {
753         match src_types[i] {
754             SrcType::SSA | SrcType::GPR => match &mut src.src_ref {
755                 SrcRef::Zero | SrcRef::True | SrcRef::False => {
756                     assert!(src_types[i] != SrcType::SSA);
757                 }
758                 SrcRef::SSA(ssa) => {
759                     b.copy_ssa_ref_if_uniform(ssa);
760                 }
761                 _ => panic!("Unsupported source reference"),
762             },
763             SrcType::ALU
764             | SrcType::F16
765             | SrcType::F16v2
766             | SrcType::F32
767             | SrcType::F64
768             | SrcType::I32
769             | SrcType::B32 => {
770                 panic!("ALU srcs must be legalized explicitly");
771             }
772             SrcType::Pred => {
773                 panic!("Predicates must be legalized explicitly");
774             }
775             SrcType::Carry => {
776                 panic!("Carry is invalid on Volta+");
777             }
778             SrcType::Bar => (),
779         }
780     }
781 }
782 
783 //
784 // Implementations of SM70Op for each op we support on Volta+
785 //
786 
787 impl SM70Op for OpFAdd {
legalize(&mut self, b: &mut LegalizeBuilder)788     fn legalize(&mut self, b: &mut LegalizeBuilder) {
789         let gpr = op_gpr(self);
790         let [src0, src1] = &mut self.srcs;
791         swap_srcs_if_not_reg(src0, src1, gpr);
792         b.copy_alu_src_if_not_reg(src0, gpr, SrcType::F32);
793     }
794 
encode(&self, e: &mut SM70Encoder<'_>)795     fn encode(&self, e: &mut SM70Encoder<'_>) {
796         if src_is_zero_or_gpr(&self.srcs[1]) {
797             e.encode_alu(
798                 0x021,
799                 Some(&self.dst),
800                 Some(&self.srcs[0]),
801                 Some(&self.srcs[1]),
802                 None,
803             )
804         } else {
805             e.encode_alu(
806                 0x021,
807                 Some(&self.dst),
808                 Some(&self.srcs[0]),
809                 Some(&Src::new_zero()),
810                 Some(&self.srcs[1]),
811             )
812         };
813         e.set_bit(77, self.saturate);
814         e.set_rnd_mode(78..80, self.rnd_mode);
815         e.set_bit(80, self.ftz);
816     }
817 }
818 
819 impl SM70Op for OpFFma {
legalize(&mut self, b: &mut LegalizeBuilder)820     fn legalize(&mut self, b: &mut LegalizeBuilder) {
821         let gpr = op_gpr(self);
822         let [src0, src1, src2] = &mut self.srcs;
823         swap_srcs_if_not_reg(src0, src1, gpr);
824         b.copy_alu_src_if_not_reg(src0, gpr, SrcType::F32);
825         b.copy_alu_src_if_both_not_reg(src1, src2, gpr, SrcType::F32);
826     }
827 
encode(&self, e: &mut SM70Encoder<'_>)828     fn encode(&self, e: &mut SM70Encoder<'_>) {
829         e.encode_alu(
830             0x023,
831             Some(&self.dst),
832             Some(&self.srcs[0]),
833             Some(&self.srcs[1]),
834             Some(&self.srcs[2]),
835         );
836         e.set_bit(76, self.dnz);
837         e.set_bit(77, self.saturate);
838         e.set_rnd_mode(78..80, self.rnd_mode);
839         e.set_bit(80, self.ftz);
840     }
841 }
842 
843 impl SM70Op for OpFMnMx {
legalize(&mut self, b: &mut LegalizeBuilder)844     fn legalize(&mut self, b: &mut LegalizeBuilder) {
845         let gpr = op_gpr(self);
846         let [src0, src1] = &mut self.srcs;
847         swap_srcs_if_not_reg(src0, src1, gpr);
848         b.copy_alu_src_if_not_reg(src0, gpr, SrcType::F32);
849     }
850 
encode(&self, e: &mut SM70Encoder<'_>)851     fn encode(&self, e: &mut SM70Encoder<'_>) {
852         e.encode_alu(
853             0x009,
854             Some(&self.dst),
855             Some(&self.srcs[0]),
856             Some(&self.srcs[1]),
857             Some(&Src::new_zero()),
858         );
859         e.set_pred_src(87..90, 90, self.min);
860         e.set_bit(80, self.ftz);
861     }
862 }
863 
864 impl SM70Op for OpFMul {
legalize(&mut self, b: &mut LegalizeBuilder)865     fn legalize(&mut self, b: &mut LegalizeBuilder) {
866         let gpr = op_gpr(self);
867         let [src0, src1] = &mut self.srcs;
868         swap_srcs_if_not_reg(src0, src1, gpr);
869         b.copy_alu_src_if_not_reg(src0, gpr, SrcType::F32);
870     }
871 
encode(&self, e: &mut SM70Encoder<'_>)872     fn encode(&self, e: &mut SM70Encoder<'_>) {
873         e.encode_alu(
874             0x020,
875             Some(&self.dst),
876             Some(&self.srcs[0]),
877             Some(&self.srcs[1]),
878             Some(&Src::new_zero()),
879         );
880         e.set_bit(76, self.dnz);
881         e.set_bit(77, self.saturate);
882         e.set_rnd_mode(78..80, self.rnd_mode);
883         e.set_bit(80, self.ftz);
884         e.set_field(84..87, 0x4_u8); // TODO: PDIV
885     }
886 }
887 
888 impl SM70Encoder<'_> {
set_float_cmp_op(&mut self, range: Range<usize>, op: FloatCmpOp)889     fn set_float_cmp_op(&mut self, range: Range<usize>, op: FloatCmpOp) {
890         assert!(range.len() == 4);
891         self.set_field(
892             range,
893             match op {
894                 FloatCmpOp::OrdLt => 0x01_u8,
895                 FloatCmpOp::OrdEq => 0x02_u8,
896                 FloatCmpOp::OrdLe => 0x03_u8,
897                 FloatCmpOp::OrdGt => 0x04_u8,
898                 FloatCmpOp::OrdNe => 0x05_u8,
899                 FloatCmpOp::OrdGe => 0x06_u8,
900                 FloatCmpOp::UnordLt => 0x09_u8,
901                 FloatCmpOp::UnordEq => 0x0a_u8,
902                 FloatCmpOp::UnordLe => 0x0b_u8,
903                 FloatCmpOp::UnordGt => 0x0c_u8,
904                 FloatCmpOp::UnordNe => 0x0d_u8,
905                 FloatCmpOp::UnordGe => 0x0e_u8,
906                 FloatCmpOp::IsNum => 0x07_u8,
907                 FloatCmpOp::IsNan => 0x08_u8,
908             },
909         );
910     }
911 
set_pred_set_op(&mut self, range: Range<usize>, op: PredSetOp)912     fn set_pred_set_op(&mut self, range: Range<usize>, op: PredSetOp) {
913         assert!(range.len() == 2);
914         self.set_field(
915             range,
916             match op {
917                 PredSetOp::And => 0_u8,
918                 PredSetOp::Or => 1_u8,
919                 PredSetOp::Xor => 2_u8,
920             },
921         );
922     }
923 
set_int_cmp_op(&mut self, range: Range<usize>, op: IntCmpOp)924     fn set_int_cmp_op(&mut self, range: Range<usize>, op: IntCmpOp) {
925         assert!(range.len() == 3);
926         self.set_field(
927             range,
928             match op {
929                 IntCmpOp::Eq => 2_u8,
930                 IntCmpOp::Ne => 5_u8,
931                 IntCmpOp::Lt => 1_u8,
932                 IntCmpOp::Le => 3_u8,
933                 IntCmpOp::Gt => 4_u8,
934                 IntCmpOp::Ge => 6_u8,
935             },
936         );
937     }
938 }
939 
940 impl SM70Op for OpFSet {
legalize(&mut self, b: &mut LegalizeBuilder)941     fn legalize(&mut self, b: &mut LegalizeBuilder) {
942         let gpr = op_gpr(self);
943         let [src0, src1] = &mut self.srcs;
944         if !src_is_reg(src0, gpr) && src_is_reg(src1, gpr) {
945             std::mem::swap(src0, src1);
946             self.cmp_op = self.cmp_op.flip();
947         }
948         b.copy_alu_src_if_not_reg(src0, gpr, SrcType::F32);
949     }
950 
encode(&self, e: &mut SM70Encoder<'_>)951     fn encode(&self, e: &mut SM70Encoder<'_>) {
952         e.encode_alu(
953             0x00a,
954             Some(&self.dst),
955             Some(&self.srcs[0]),
956             Some(&self.srcs[1]),
957             None,
958         );
959         e.set_float_cmp_op(76..80, self.cmp_op);
960         e.set_bit(80, self.ftz);
961         e.set_field(87..90, 0x7_u8); // TODO: src predicate
962     }
963 }
964 
965 impl SM70Op for OpFSetP {
legalize(&mut self, b: &mut LegalizeBuilder)966     fn legalize(&mut self, b: &mut LegalizeBuilder) {
967         let gpr = op_gpr(self);
968         let [src0, src1] = &mut self.srcs;
969         if !src_is_reg(src0, gpr) && src_is_reg(src1, gpr) {
970             std::mem::swap(src0, src1);
971             self.cmp_op = self.cmp_op.flip();
972         }
973         b.copy_alu_src_if_not_reg(src0, gpr, SrcType::F32);
974     }
975 
encode(&self, e: &mut SM70Encoder<'_>)976     fn encode(&self, e: &mut SM70Encoder<'_>) {
977         e.encode_alu(
978             0x00b,
979             None,
980             Some(&self.srcs[0]),
981             Some(&self.srcs[1]),
982             None,
983         );
984 
985         e.set_pred_set_op(74..76, self.set_op);
986         e.set_float_cmp_op(76..80, self.cmp_op);
987         e.set_bit(80, self.ftz);
988 
989         e.set_pred_dst(81..84, self.dst);
990         e.set_pred_dst(84..87, Dst::None); // dst1
991 
992         e.set_pred_src(87..90, 90, self.accum);
993     }
994 }
995 
996 impl SM70Op for OpFSwzAdd {
legalize(&mut self, b: &mut LegalizeBuilder)997     fn legalize(&mut self, b: &mut LegalizeBuilder) {
998         let gpr = op_gpr(self);
999         let [src0, src1] = &mut self.srcs;
1000         b.copy_alu_src_if_not_reg(src0, gpr, SrcType::F32);
1001         b.copy_alu_src_if_not_reg(src1, gpr, SrcType::F32);
1002     }
1003 
encode(&self, e: &mut SM70Encoder<'_>)1004     fn encode(&self, e: &mut SM70Encoder<'_>) {
1005         e.set_opcode(0x822);
1006         e.set_dst(self.dst);
1007 
1008         e.set_reg_src(24..32, self.srcs[0]);
1009         e.set_reg_src(64..72, self.srcs[1]);
1010 
1011         let mut subop = 0x0_u8;
1012 
1013         for (i, swz_op) in self.ops.iter().enumerate() {
1014             let swz_op = match swz_op {
1015                 FSwzAddOp::Add => 0,
1016                 FSwzAddOp::SubRight => 2,
1017                 FSwzAddOp::SubLeft => 1,
1018                 FSwzAddOp::MoveLeft => 3,
1019             };
1020 
1021             subop |= swz_op << ((self.ops.len() - i - 1) * 2);
1022         }
1023 
1024         e.set_field(32..40, subop);
1025 
1026         e.set_bit(77, false); // NDV
1027         e.set_rnd_mode(78..80, self.rnd_mode);
1028         e.set_bit(80, self.ftz);
1029     }
1030 }
1031 
1032 impl SM70Op for OpMuFu {
legalize(&mut self, _b: &mut LegalizeBuilder)1033     fn legalize(&mut self, _b: &mut LegalizeBuilder) {
1034         // Nothing to do
1035     }
1036 
encode(&self, e: &mut SM70Encoder<'_>)1037     fn encode(&self, e: &mut SM70Encoder<'_>) {
1038         e.encode_alu(0x108, Some(&self.dst), None, Some(&self.src), None);
1039         e.set_field(
1040             74..80,
1041             match self.op {
1042                 MuFuOp::Cos => 0_u8,
1043                 MuFuOp::Sin => 1_u8,
1044                 MuFuOp::Exp2 => 2_u8,
1045                 MuFuOp::Log2 => 3_u8,
1046                 MuFuOp::Rcp => 4_u8,
1047                 MuFuOp::Rsq => 5_u8,
1048                 MuFuOp::Rcp64H => 6_u8,
1049                 MuFuOp::Rsq64H => 7_u8,
1050                 MuFuOp::Sqrt => 8_u8,
1051                 MuFuOp::Tanh => 9_u8,
1052             },
1053         );
1054     }
1055 }
1056 
1057 impl SM70Op for OpDAdd {
legalize(&mut self, b: &mut LegalizeBuilder)1058     fn legalize(&mut self, b: &mut LegalizeBuilder) {
1059         let gpr = op_gpr(self);
1060         let [src0, src1] = &mut self.srcs;
1061         swap_srcs_if_not_reg(src0, src1, gpr);
1062         b.copy_alu_src_if_not_reg(src0, gpr, SrcType::F64);
1063     }
1064 
encode(&self, e: &mut SM70Encoder<'_>)1065     fn encode(&self, e: &mut SM70Encoder<'_>) {
1066         e.encode_alu(
1067             0x029,
1068             Some(&self.dst),
1069             Some(&self.srcs[0]),
1070             None,
1071             Some(&self.srcs[1]),
1072         );
1073         e.set_rnd_mode(78..80, self.rnd_mode);
1074     }
1075 }
1076 
1077 impl SM70Op for OpDFma {
legalize(&mut self, b: &mut LegalizeBuilder)1078     fn legalize(&mut self, b: &mut LegalizeBuilder) {
1079         let gpr = op_gpr(self);
1080         let [src0, src1, src2] = &mut self.srcs;
1081         swap_srcs_if_not_reg(src0, src1, gpr);
1082         b.copy_alu_src_if_not_reg(src0, gpr, SrcType::F64);
1083         b.copy_alu_src_if_both_not_reg(src1, src2, gpr, SrcType::F64);
1084     }
1085 
encode(&self, e: &mut SM70Encoder<'_>)1086     fn encode(&self, e: &mut SM70Encoder<'_>) {
1087         e.encode_alu(
1088             0x02b,
1089             Some(&self.dst),
1090             Some(&self.srcs[0]),
1091             Some(&self.srcs[1]),
1092             Some(&self.srcs[2]),
1093         );
1094         e.set_rnd_mode(78..80, self.rnd_mode);
1095     }
1096 }
1097 
1098 impl SM70Op for OpDMul {
legalize(&mut self, b: &mut LegalizeBuilder)1099     fn legalize(&mut self, b: &mut LegalizeBuilder) {
1100         let gpr = op_gpr(self);
1101         let [src0, src1] = &mut self.srcs;
1102         swap_srcs_if_not_reg(src0, src1, gpr);
1103         b.copy_alu_src_if_not_reg(src0, gpr, SrcType::F64);
1104     }
1105 
encode(&self, e: &mut SM70Encoder<'_>)1106     fn encode(&self, e: &mut SM70Encoder<'_>) {
1107         e.encode_alu(
1108             0x028,
1109             Some(&self.dst),
1110             Some(&self.srcs[0]),
1111             Some(&self.srcs[1]),
1112             None,
1113         );
1114         e.set_rnd_mode(78..80, self.rnd_mode);
1115     }
1116 }
1117 
1118 impl SM70Op for OpDSetP {
legalize(&mut self, b: &mut LegalizeBuilder)1119     fn legalize(&mut self, b: &mut LegalizeBuilder) {
1120         let gpr = op_gpr(self);
1121         let [src0, src1] = &mut self.srcs;
1122         if !src_is_reg(src0, gpr) && src_is_reg(src1, gpr) {
1123             std::mem::swap(src0, src1);
1124             self.cmp_op = self.cmp_op.flip();
1125         }
1126         b.copy_alu_src_if_not_reg(src0, gpr, SrcType::F64);
1127     }
1128 
encode(&self, e: &mut SM70Encoder<'_>)1129     fn encode(&self, e: &mut SM70Encoder<'_>) {
1130         if src_is_zero_or_gpr(&self.srcs[1]) {
1131             e.encode_alu(
1132                 0x02a,
1133                 None,
1134                 Some(&self.srcs[0]),
1135                 Some(&self.srcs[1]),
1136                 None,
1137             )
1138         } else {
1139             e.encode_alu(
1140                 0x02a,
1141                 None,
1142                 Some(&self.srcs[0]),
1143                 None,
1144                 Some(&self.srcs[1]),
1145             )
1146         };
1147 
1148         e.set_pred_set_op(74..76, self.set_op);
1149         e.set_float_cmp_op(76..80, self.cmp_op);
1150 
1151         e.set_pred_dst(81..84, self.dst);
1152         e.set_pred_dst(84..87, Dst::None); /* dst1 */
1153 
1154         e.set_pred_src(87..90, 90, self.accum);
1155     }
1156 }
1157 
1158 impl SM70Op for OpHAdd2 {
legalize(&mut self, b: &mut LegalizeBuilder)1159     fn legalize(&mut self, b: &mut LegalizeBuilder) {
1160         let gpr = op_gpr(self);
1161         let [src0, src1] = &mut self.srcs;
1162         swap_srcs_if_not_reg(src0, src1, gpr);
1163         b.copy_alu_src_if_not_reg(src0, gpr, SrcType::F16v2);
1164     }
1165 
encode(&self, e: &mut SM70Encoder<'_>)1166     fn encode(&self, e: &mut SM70Encoder<'_>) {
1167         if src_is_zero_or_gpr(&self.srcs[1]) {
1168             e.encode_fp16_alu(
1169                 0x030,
1170                 Some(&self.dst),
1171                 Some(&self.srcs[0]),
1172                 Some(&self.srcs[1]),
1173                 None,
1174             )
1175         } else {
1176             e.encode_fp16_alu(
1177                 0x030,
1178                 Some(&self.dst),
1179                 Some(&self.srcs[0]),
1180                 None,
1181                 Some(&self.srcs[1]),
1182             )
1183         };
1184 
1185         e.set_bit(77, self.saturate);
1186         e.set_bit(78, self.f32);
1187         e.set_bit(80, self.ftz);
1188         e.set_bit(85, false); // .BF16_V2 (SM90+)
1189     }
1190 }
1191 
1192 impl SM70Op for OpHFma2 {
legalize(&mut self, b: &mut LegalizeBuilder)1193     fn legalize(&mut self, b: &mut LegalizeBuilder) {
1194         let gpr = op_gpr(self);
1195         let [src0, src1, src2] = &mut self.srcs;
1196         swap_srcs_if_not_reg(src0, src1, gpr);
1197         b.copy_alu_src_if_not_reg(src0, gpr, SrcType::F16v2);
1198         b.copy_alu_src_if_not_reg(src1, gpr, SrcType::F16v2);
1199         b.copy_alu_src_if_both_not_reg(src1, src2, gpr, SrcType::F16v2);
1200 
1201         // HFMA2 doesn't have fabs or fneg on SRC2.
1202         if !src2.src_mod.is_none() {
1203             b.copy_alu_src_and_lower_fmod(src2, SrcType::F16v2);
1204         }
1205     }
1206 
encode(&self, e: &mut SM70Encoder<'_>)1207     fn encode(&self, e: &mut SM70Encoder<'_>) {
1208         // HFMA2 doesn't have fneg and fabs on SRC2.
1209         assert!(self.srcs[2].src_mod.is_none());
1210 
1211         e.encode_fp16_alu(
1212             0x031,
1213             Some(&self.dst),
1214             Some(&self.srcs[0]),
1215             Some(&self.srcs[1]),
1216             Some(&self.srcs[2]),
1217         );
1218 
1219         e.set_bit(76, self.dnz);
1220         e.set_bit(77, self.saturate);
1221         e.set_bit(78, self.f32);
1222         e.set_bit(79, false); // .RELU (SM86+)
1223         e.set_bit(80, self.ftz);
1224         e.set_bit(85, false); // .BF16_V2 (SM86+)
1225     }
1226 }
1227 
1228 impl SM70Op for OpHMul2 {
legalize(&mut self, b: &mut LegalizeBuilder)1229     fn legalize(&mut self, b: &mut LegalizeBuilder) {
1230         let gpr = op_gpr(self);
1231         let [src0, src1] = &mut self.srcs;
1232         swap_srcs_if_not_reg(src0, src1, gpr);
1233         b.copy_alu_src_if_not_reg(src0, gpr, SrcType::F16v2);
1234     }
1235 
encode(&self, e: &mut SM70Encoder<'_>)1236     fn encode(&self, e: &mut SM70Encoder<'_>) {
1237         e.encode_fp16_alu(
1238             0x032,
1239             Some(&self.dst),
1240             Some(&self.srcs[0]),
1241             Some(&self.srcs[1]),
1242             None,
1243         );
1244 
1245         e.set_bit(76, self.dnz);
1246         e.set_bit(77, self.saturate);
1247         e.set_bit(78, false); // .F32 (SM70-SM75)
1248         e.set_bit(79, false); // .RELU (SM86+)
1249         e.set_bit(80, self.ftz);
1250         e.set_bit(85, false); // .BF16_V2 (SM90+)
1251     }
1252 }
1253 
1254 impl SM70Op for OpHSet2 {
legalize(&mut self, b: &mut LegalizeBuilder)1255     fn legalize(&mut self, b: &mut LegalizeBuilder) {
1256         let gpr = op_gpr(self);
1257         let [src0, src1] = &mut self.srcs;
1258         if !src_is_reg(src0, gpr) && src_is_reg(src1, gpr) {
1259             std::mem::swap(src0, src1);
1260             self.cmp_op = self.cmp_op.flip();
1261         }
1262         b.copy_alu_src_if_not_reg(src0, gpr, SrcType::F16v2);
1263     }
1264 
encode(&self, e: &mut SM70Encoder<'_>)1265     fn encode(&self, e: &mut SM70Encoder<'_>) {
1266         if src_is_zero_or_gpr(&self.srcs[1]) {
1267             e.encode_fp16_alu(
1268                 0x033,
1269                 Some(&self.dst),
1270                 Some(&self.srcs[0]),
1271                 Some(&self.srcs[1]),
1272                 None,
1273             )
1274         } else {
1275             e.encode_fp16_alu(
1276                 0x033,
1277                 Some(&self.dst),
1278                 Some(&self.srcs[0]),
1279                 None,
1280                 Some(&self.srcs[1]),
1281             )
1282         };
1283 
1284         e.set_bit(65, false); // .BF16_V2 (SM90+)
1285         e.set_pred_set_op(69..71, self.set_op);
1286 
1287         // This differentiate between integer and fp16 output
1288         e.set_bit(71, true); // .BF
1289         e.set_float_cmp_op(76..80, self.cmp_op);
1290         e.set_bit(80, self.ftz);
1291 
1292         e.set_pred_src(87..90, 90, self.accum);
1293     }
1294 }
1295 
1296 impl SM70Op for OpHSetP2 {
legalize(&mut self, b: &mut LegalizeBuilder)1297     fn legalize(&mut self, b: &mut LegalizeBuilder) {
1298         let gpr = op_gpr(self);
1299         let [src0, src1] = &mut self.srcs;
1300         if !src_is_reg(src0, gpr) && src_is_reg(src1, gpr) {
1301             std::mem::swap(src0, src1);
1302             self.cmp_op = self.cmp_op.flip();
1303         }
1304         b.copy_alu_src_if_not_reg(src0, gpr, SrcType::F16v2);
1305     }
1306 
encode(&self, e: &mut SM70Encoder<'_>)1307     fn encode(&self, e: &mut SM70Encoder<'_>) {
1308         if src_is_zero_or_gpr(&self.srcs[1]) {
1309             e.encode_fp16_alu(
1310                 0x034,
1311                 None,
1312                 Some(&self.srcs[0]),
1313                 Some(&self.srcs[1]),
1314                 None,
1315             )
1316         } else {
1317             e.encode_fp16_alu(
1318                 0x034,
1319                 None,
1320                 Some(&self.srcs[0]),
1321                 None,
1322                 Some(&self.srcs[1]),
1323             )
1324         };
1325 
1326         e.set_bit(65, false); // .BF16_V2 (SM90+)
1327         e.set_pred_set_op(69..71, self.set_op);
1328         e.set_bit(71, self.horizontal); // .H_AND
1329         e.set_float_cmp_op(76..80, self.cmp_op);
1330         e.set_bit(80, self.ftz);
1331 
1332         e.set_pred_dst(81..84, self.dsts[0]);
1333         e.set_pred_dst(84..87, self.dsts[1]);
1334 
1335         e.set_pred_src(87..90, 90, self.accum);
1336     }
1337 }
1338 
1339 impl SM70Op for OpHMnMx2 {
legalize(&mut self, b: &mut LegalizeBuilder)1340     fn legalize(&mut self, b: &mut LegalizeBuilder) {
1341         let gpr = op_gpr(self);
1342         let [src0, src1] = &mut self.srcs;
1343         swap_srcs_if_not_reg(src0, src1, gpr);
1344         b.copy_alu_src_if_not_reg(src0, gpr, SrcType::F16v2);
1345     }
1346 
encode(&self, e: &mut SM70Encoder<'_>)1347     fn encode(&self, e: &mut SM70Encoder<'_>) {
1348         assert!(e.sm.sm >= 80);
1349 
1350         e.encode_fp16_alu(
1351             0x040,
1352             Some(&self.dst),
1353             Some(&self.srcs[0]),
1354             Some(&self.srcs[1]),
1355             None,
1356         );
1357 
1358         // This differentiate between integer and fp16 output
1359         e.set_bit(78, false); // .F32 (SM86)
1360         e.set_bit(80, self.ftz);
1361         e.set_bit(81, false); // .NAN
1362         e.set_bit(82, false); // .XORSIGN
1363         e.set_bit(85, false); // .BF16_V2
1364 
1365         e.set_pred_src(87..90, 90, self.min);
1366     }
1367 }
1368 
1369 impl SM70Op for OpBMsk {
legalize(&mut self, b: &mut LegalizeBuilder)1370     fn legalize(&mut self, b: &mut LegalizeBuilder) {
1371         let gpr = op_gpr(self);
1372         b.copy_alu_src_if_not_reg(&mut self.pos, gpr, SrcType::ALU);
1373     }
1374 
encode(&self, e: &mut SM70Encoder<'_>)1375     fn encode(&self, e: &mut SM70Encoder<'_>) {
1376         if self.is_uniform() {
1377             e.encode_ualu(
1378                 0x09b,
1379                 Some(&self.dst),
1380                 Some(&self.pos),
1381                 Some(&self.width),
1382                 None,
1383             )
1384         } else {
1385             e.encode_alu(
1386                 0x01b,
1387                 Some(&self.dst),
1388                 Some(&self.pos),
1389                 Some(&self.width),
1390                 None,
1391             )
1392         };
1393 
1394         e.set_bit(75, self.wrap);
1395     }
1396 }
1397 
1398 impl SM70Op for OpBRev {
legalize(&mut self, _b: &mut LegalizeBuilder)1399     fn legalize(&mut self, _b: &mut LegalizeBuilder) {
1400         // Nothing to do
1401     }
1402 
encode(&self, e: &mut SM70Encoder<'_>)1403     fn encode(&self, e: &mut SM70Encoder<'_>) {
1404         if self.is_uniform() {
1405             e.encode_ualu(0x0be, Some(&self.dst), None, Some(&self.src), None)
1406         } else {
1407             e.encode_alu(0x101, Some(&self.dst), None, Some(&self.src), None)
1408         }
1409     }
1410 }
1411 
1412 impl SM70Op for OpFlo {
legalize(&mut self, _b: &mut LegalizeBuilder)1413     fn legalize(&mut self, _b: &mut LegalizeBuilder) {
1414         // Nothing to do
1415     }
1416 
encode(&self, e: &mut SM70Encoder<'_>)1417     fn encode(&self, e: &mut SM70Encoder<'_>) {
1418         if self.is_uniform() {
1419             e.encode_ualu(0x0bd, Some(&self.dst), None, Some(&self.src), None)
1420         } else {
1421             e.encode_alu(0x100, Some(&self.dst), None, Some(&self.src), None)
1422         };
1423         e.set_pred_dst(81..84, Dst::None);
1424         e.set_field(74..75, self.return_shift_amount as u8);
1425         e.set_field(73..74, self.signed as u8);
1426         let not_mod = matches!(self.src.src_mod, SrcMod::BNot);
1427         e.set_field(63..64, not_mod)
1428     }
1429 }
1430 
1431 impl SM70Op for OpIAbs {
legalize(&mut self, _b: &mut LegalizeBuilder)1432     fn legalize(&mut self, _b: &mut LegalizeBuilder) {
1433         // Nothing to do
1434     }
1435 
encode(&self, e: &mut SM70Encoder<'_>)1436     fn encode(&self, e: &mut SM70Encoder<'_>) {
1437         e.encode_alu(0x013, Some(&self.dst), None, Some(&self.src), None)
1438     }
1439 }
1440 
1441 impl SM70Op for OpIAdd3 {
legalize(&mut self, b: &mut LegalizeBuilder)1442     fn legalize(&mut self, b: &mut LegalizeBuilder) {
1443         let gpr = op_gpr(self);
1444         let [src0, src1, src2] = &mut self.srcs;
1445         swap_srcs_if_not_reg(src0, src1, gpr);
1446         swap_srcs_if_not_reg(src2, src1, gpr);
1447         if !src0.src_mod.is_none() && !src1.src_mod.is_none() {
1448             assert!(self.overflow[0].is_none());
1449             assert!(self.overflow[1].is_none());
1450             let val = b.alloc_ssa(gpr, 1);
1451             b.push_op(OpIAdd3 {
1452                 srcs: [Src::new_zero(), *src0, Src::new_zero()],
1453                 overflow: [Dst::None; 2],
1454                 dst: val.into(),
1455             });
1456             *src0 = val.into();
1457         }
1458         b.copy_alu_src_if_not_reg(src0, gpr, SrcType::I32);
1459         b.copy_alu_src_if_both_not_reg(src1, src2, gpr, SrcType::I32);
1460         if !self.overflow[0].is_none() || !self.overflow[1].is_none() {
1461             b.copy_alu_src_if_ineg_imm(src1, gpr, SrcType::I32);
1462             b.copy_alu_src_if_ineg_imm(src2, gpr, SrcType::I32);
1463         }
1464     }
1465 
encode(&self, e: &mut SM70Encoder<'_>)1466     fn encode(&self, e: &mut SM70Encoder<'_>) {
1467         // Hardware requires at least one of these be unmodified
1468         assert!(
1469             self.srcs[0].src_mod.is_none() || self.srcs[1].src_mod.is_none()
1470         );
1471 
1472         if self.is_uniform() {
1473             e.encode_ualu(
1474                 0x090,
1475                 Some(&self.dst),
1476                 Some(&self.srcs[0]),
1477                 Some(&self.srcs[1]),
1478                 Some(&self.srcs[2]),
1479             )
1480         } else {
1481             e.encode_alu(
1482                 0x010,
1483                 Some(&self.dst),
1484                 Some(&self.srcs[0]),
1485                 Some(&self.srcs[1]),
1486                 Some(&self.srcs[2]),
1487             )
1488         };
1489 
1490         e.set_pred_src(87..90, 90, false.into());
1491         e.set_pred_src(77..80, 80, false.into());
1492 
1493         e.set_pred_dst(81..84, self.overflow[0]);
1494         e.set_pred_dst(84..87, self.overflow[1]);
1495     }
1496 }
1497 
1498 impl SM70Op for OpIAdd3X {
legalize(&mut self, b: &mut LegalizeBuilder)1499     fn legalize(&mut self, b: &mut LegalizeBuilder) {
1500         let gpr = op_gpr(self);
1501         let [src0, src1, src2] = &mut self.srcs;
1502         swap_srcs_if_not_reg(src0, src1, gpr);
1503         swap_srcs_if_not_reg(src2, src1, gpr);
1504         if !src0.src_mod.is_none() && !src1.src_mod.is_none() {
1505             let val = b.alloc_ssa(gpr, 1);
1506             b.push_op(OpIAdd3X {
1507                 srcs: [Src::new_zero(), *src0, Src::new_zero()],
1508                 overflow: [Dst::None; 2],
1509                 dst: val.into(),
1510                 carry: [false.into(); 2],
1511             });
1512             *src0 = val.into();
1513         }
1514         b.copy_alu_src_if_not_reg(src0, gpr, SrcType::B32);
1515         b.copy_alu_src_if_both_not_reg(src1, src2, gpr, SrcType::B32);
1516         if !self.is_uniform() {
1517             b.copy_src_if_upred(&mut self.carry[0]);
1518             b.copy_src_if_upred(&mut self.carry[1]);
1519         }
1520     }
1521 
encode(&self, e: &mut SM70Encoder<'_>)1522     fn encode(&self, e: &mut SM70Encoder<'_>) {
1523         // Hardware requires at least one of these be unmodified
1524         assert!(
1525             self.srcs[0].src_mod.is_none() || self.srcs[1].src_mod.is_none()
1526         );
1527 
1528         if self.is_uniform() {
1529             e.encode_ualu(
1530                 0x090,
1531                 Some(&self.dst),
1532                 Some(&self.srcs[0]),
1533                 Some(&self.srcs[1]),
1534                 Some(&self.srcs[2]),
1535             );
1536 
1537             e.set_upred_src(87..90, 90, self.carry[0]);
1538             e.set_upred_src(77..80, 80, self.carry[1]);
1539         } else {
1540             e.encode_alu(
1541                 0x010,
1542                 Some(&self.dst),
1543                 Some(&self.srcs[0]),
1544                 Some(&self.srcs[1]),
1545                 Some(&self.srcs[2]),
1546             );
1547 
1548             e.set_pred_src(87..90, 90, self.carry[0]);
1549             e.set_pred_src(77..80, 80, self.carry[1]);
1550         }
1551 
1552         e.set_bit(74, true); // .X
1553 
1554         e.set_pred_dst(81..84, self.overflow[0]);
1555         e.set_pred_dst(84..87, self.overflow[1]);
1556     }
1557 }
1558 
1559 impl SM70Op for OpIDp4 {
legalize(&mut self, b: &mut LegalizeBuilder)1560     fn legalize(&mut self, b: &mut LegalizeBuilder) {
1561         let gpr = op_gpr(self);
1562         let [src_type0, src_type1] = &mut self.src_types;
1563         let [src0, src1, src2] = &mut self.srcs;
1564         if swap_srcs_if_not_reg(src0, src1, gpr) {
1565             std::mem::swap(src_type0, src_type1);
1566         }
1567         b.copy_alu_src_if_not_reg(src0, gpr, SrcType::ALU);
1568         b.copy_alu_src_if_ineg_imm(src1, gpr, SrcType::I32);
1569         b.copy_alu_src_if_not_reg(src2, gpr, SrcType::ALU);
1570     }
1571 
encode(&self, e: &mut SM70Encoder<'_>)1572     fn encode(&self, e: &mut SM70Encoder<'_>) {
1573         e.encode_alu(
1574             0x026,
1575             Some(&self.dst),
1576             Some(&self.srcs[0]),
1577             Some(&self.srcs[1]),
1578             Some(&self.srcs[2]),
1579         );
1580 
1581         e.set_bit(
1582             73,
1583             match self.src_types[0] {
1584                 IntType::U8 => false,
1585                 IntType::I8 => true,
1586                 _ => panic!("Invalid DP4 source type"),
1587             },
1588         );
1589         e.set_bit(
1590             74,
1591             match self.src_types[1] {
1592                 IntType::U8 => false,
1593                 IntType::I8 => true,
1594                 _ => panic!("Invalid DP4 source type"),
1595             },
1596         );
1597     }
1598 }
1599 
1600 impl SM70Op for OpIMad {
legalize(&mut self, b: &mut LegalizeBuilder)1601     fn legalize(&mut self, b: &mut LegalizeBuilder) {
1602         let gpr = op_gpr(self);
1603         let [src0, src1, src2] = &mut self.srcs;
1604         swap_srcs_if_not_reg(src0, src1, gpr);
1605         b.copy_alu_src_if_not_reg(src0, gpr, SrcType::ALU);
1606         b.copy_alu_src_if_both_not_reg(src1, src2, gpr, SrcType::ALU);
1607     }
1608 
encode(&self, e: &mut SM70Encoder<'_>)1609     fn encode(&self, e: &mut SM70Encoder<'_>) {
1610         if self.is_uniform() {
1611             e.encode_ualu(
1612                 0x0a4,
1613                 Some(&self.dst),
1614                 Some(&self.srcs[0]),
1615                 Some(&self.srcs[1]),
1616                 Some(&self.srcs[2]),
1617             )
1618         } else {
1619             e.encode_alu(
1620                 0x024,
1621                 Some(&self.dst),
1622                 Some(&self.srcs[0]),
1623                 Some(&self.srcs[1]),
1624                 Some(&self.srcs[2]),
1625             )
1626         };
1627         e.set_pred_dst(81..84, Dst::None);
1628         e.set_bit(73, self.signed);
1629     }
1630 }
1631 
1632 impl SM70Op for OpIMad64 {
legalize(&mut self, b: &mut LegalizeBuilder)1633     fn legalize(&mut self, b: &mut LegalizeBuilder) {
1634         let gpr = op_gpr(self);
1635         let [src0, src1, src2] = &mut self.srcs;
1636         swap_srcs_if_not_reg(src0, src1, gpr);
1637         b.copy_alu_src_if_not_reg(src0, gpr, SrcType::ALU);
1638         b.copy_alu_src_if_both_not_reg(src1, src2, gpr, SrcType::ALU);
1639     }
1640 
encode(&self, e: &mut SM70Encoder<'_>)1641     fn encode(&self, e: &mut SM70Encoder<'_>) {
1642         if self.is_uniform() {
1643             e.encode_ualu(
1644                 0x0a5,
1645                 Some(&self.dst),
1646                 Some(&self.srcs[0]),
1647                 Some(&self.srcs[1]),
1648                 Some(&self.srcs[2]),
1649             )
1650         } else {
1651             e.encode_alu(
1652                 0x025,
1653                 Some(&self.dst),
1654                 Some(&self.srcs[0]),
1655                 Some(&self.srcs[1]),
1656                 Some(&self.srcs[2]),
1657             )
1658         };
1659         e.set_pred_dst(81..84, Dst::None);
1660         e.set_bit(73, self.signed);
1661     }
1662 }
1663 
1664 impl SM70Op for OpIMnMx {
legalize(&mut self, b: &mut LegalizeBuilder)1665     fn legalize(&mut self, b: &mut LegalizeBuilder) {
1666         let gpr = op_gpr(self);
1667         let [src0, src1] = &mut self.srcs;
1668         swap_srcs_if_not_reg(src0, src1, gpr);
1669         b.copy_alu_src_if_not_reg(src0, gpr, SrcType::ALU);
1670     }
1671 
encode(&self, e: &mut SM70Encoder<'_>)1672     fn encode(&self, e: &mut SM70Encoder<'_>) {
1673         e.encode_alu(
1674             0x017,
1675             Some(&self.dst),
1676             Some(&self.srcs[0]),
1677             Some(&self.srcs[1]),
1678             None,
1679         );
1680         e.set_pred_src(87..90, 90, self.min);
1681         e.set_bit(
1682             73,
1683             match self.cmp_type {
1684                 IntCmpType::U32 => false,
1685                 IntCmpType::I32 => true,
1686             },
1687         );
1688     }
1689 }
1690 
1691 impl SM70Op for OpISetP {
legalize(&mut self, b: &mut LegalizeBuilder)1692     fn legalize(&mut self, b: &mut LegalizeBuilder) {
1693         let gpr = op_gpr(self);
1694         let [src0, src1] = &mut self.srcs;
1695         if !src_is_reg(src0, gpr) && src_is_reg(src1, gpr) {
1696             std::mem::swap(src0, src1);
1697             self.cmp_op = self.cmp_op.flip();
1698         }
1699         b.copy_alu_src_if_not_reg(src0, gpr, SrcType::ALU);
1700         if !self.is_uniform() {
1701             b.copy_src_if_upred(&mut self.low_cmp);
1702             b.copy_src_if_upred(&mut self.accum);
1703         }
1704     }
1705 
encode(&self, e: &mut SM70Encoder<'_>)1706     fn encode(&self, e: &mut SM70Encoder<'_>) {
1707         if self.is_uniform() {
1708             e.encode_ualu(
1709                 0x08c,
1710                 None,
1711                 Some(&self.srcs[0]),
1712                 Some(&self.srcs[1]),
1713                 None,
1714             );
1715 
1716             e.set_upred_src(68..71, 71, self.low_cmp);
1717             e.set_upred_src(87..90, 90, self.accum);
1718         } else {
1719             e.encode_alu(
1720                 0x00c,
1721                 None,
1722                 Some(&self.srcs[0]),
1723                 Some(&self.srcs[1]),
1724                 None,
1725             );
1726 
1727             e.set_pred_src(68..71, 71, self.low_cmp);
1728             e.set_pred_src(87..90, 90, self.accum);
1729         }
1730 
1731         e.set_bit(72, self.ex);
1732 
1733         e.set_field(
1734             73..74,
1735             match self.cmp_type {
1736                 IntCmpType::U32 => 0_u32,
1737                 IntCmpType::I32 => 1_u32,
1738             },
1739         );
1740         e.set_pred_set_op(74..76, self.set_op);
1741         e.set_int_cmp_op(76..79, self.cmp_op);
1742 
1743         e.set_pred_dst(81..84, self.dst);
1744         e.set_pred_dst(84..87, Dst::None); // dst1
1745     }
1746 }
1747 
src_as_lop_imm(src: &Src) -> Option<bool>1748 fn src_as_lop_imm(src: &Src) -> Option<bool> {
1749     let x = match src.src_ref {
1750         SrcRef::Zero => false,
1751         SrcRef::True => true,
1752         SrcRef::False => false,
1753         SrcRef::Imm32(i) => {
1754             if i == 0 {
1755                 false
1756             } else if i == !0 {
1757                 true
1758             } else {
1759                 return None;
1760             }
1761         }
1762         _ => return None,
1763     };
1764     Some(x ^ src.src_mod.is_bnot())
1765 }
1766 
fold_lop_src(src: &Src, x: &mut u8)1767 fn fold_lop_src(src: &Src, x: &mut u8) {
1768     if let Some(i) = src_as_lop_imm(src) {
1769         *x = if i { !0 } else { 0 };
1770     }
1771     if src.src_mod.is_bnot() {
1772         *x = !*x;
1773     }
1774 }
1775 
1776 impl SM70Op for OpLop3 {
legalize(&mut self, b: &mut LegalizeBuilder)1777     fn legalize(&mut self, b: &mut LegalizeBuilder) {
1778         let gpr = op_gpr(self);
1779         // Fold constants and modifiers if we can
1780         self.op = LogicOp3::new_lut(&|mut x, mut y, mut z| {
1781             fold_lop_src(&self.srcs[0], &mut x);
1782             fold_lop_src(&self.srcs[1], &mut y);
1783             fold_lop_src(&self.srcs[2], &mut z);
1784             self.op.eval(x, y, z)
1785         });
1786         for src in &mut self.srcs {
1787             src.src_mod = SrcMod::None;
1788             if src_as_lop_imm(src).is_some() {
1789                 src.src_ref = SrcRef::Zero;
1790             }
1791         }
1792 
1793         let [src0, src1, src2] = &mut self.srcs;
1794         if !src_is_reg(src0, gpr) && src_is_reg(src1, gpr) {
1795             std::mem::swap(src0, src1);
1796             self.op = LogicOp3::new_lut(&|x, y, z| self.op.eval(y, x, z))
1797         }
1798         if !src_is_reg(src2, gpr) && src_is_reg(src1, gpr) {
1799             std::mem::swap(src2, src1);
1800             self.op = LogicOp3::new_lut(&|x, y, z| self.op.eval(x, z, y))
1801         }
1802 
1803         b.copy_alu_src_if_not_reg(src0, gpr, SrcType::ALU);
1804         b.copy_alu_src_if_not_reg(src2, gpr, SrcType::ALU);
1805     }
1806 
encode(&self, e: &mut SM70Encoder<'_>)1807     fn encode(&self, e: &mut SM70Encoder<'_>) {
1808         if self.is_uniform() {
1809             e.encode_ualu(
1810                 0x092,
1811                 Some(&self.dst),
1812                 Some(&self.srcs[0]),
1813                 Some(&self.srcs[1]),
1814                 Some(&self.srcs[2]),
1815             );
1816 
1817             e.set_upred_src(87..90, 90, SrcRef::False.into());
1818         } else {
1819             e.encode_alu(
1820                 0x012,
1821                 Some(&self.dst),
1822                 Some(&self.srcs[0]),
1823                 Some(&self.srcs[1]),
1824                 Some(&self.srcs[2]),
1825             );
1826 
1827             e.set_pred_src(87..90, 90, SrcRef::False.into());
1828         }
1829 
1830         e.set_field(72..80, self.op.lut);
1831         e.set_bit(80, false); // .PAND
1832         e.set_field(81..84, 7_u32); // pred
1833     }
1834 }
1835 
1836 impl SM70Op for OpPopC {
legalize(&mut self, _b: &mut LegalizeBuilder)1837     fn legalize(&mut self, _b: &mut LegalizeBuilder) {
1838         // Nothing to do
1839     }
1840 
encode(&self, e: &mut SM70Encoder<'_>)1841     fn encode(&self, e: &mut SM70Encoder<'_>) {
1842         if self.is_uniform() {
1843             e.encode_ualu(0x0bf, Some(&self.dst), None, Some(&self.src), None)
1844         } else {
1845             e.encode_alu(0x109, Some(&self.dst), None, Some(&self.src), None)
1846         };
1847 
1848         let not_mod = matches!(self.src.src_mod, SrcMod::BNot);
1849         e.set_field(63..64, not_mod);
1850     }
1851 }
1852 
1853 impl SM70Op for OpShf {
legalize(&mut self, b: &mut LegalizeBuilder)1854     fn legalize(&mut self, b: &mut LegalizeBuilder) {
1855         let gpr = op_gpr(self);
1856         b.copy_alu_src_if_not_reg(&mut self.low, gpr, SrcType::ALU);
1857         b.copy_alu_src_if_both_not_reg(
1858             &self.shift,
1859             &mut self.high,
1860             gpr,
1861             SrcType::ALU,
1862         );
1863     }
1864 
encode(&self, e: &mut SM70Encoder<'_>)1865     fn encode(&self, e: &mut SM70Encoder<'_>) {
1866         if self.is_uniform() {
1867             e.encode_ualu(
1868                 0x099,
1869                 Some(&self.dst),
1870                 Some(&self.low),
1871                 Some(&self.shift),
1872                 Some(&self.high),
1873             )
1874         } else {
1875             e.encode_alu(
1876                 0x019,
1877                 Some(&self.dst),
1878                 Some(&self.low),
1879                 Some(&self.shift),
1880                 Some(&self.high),
1881             )
1882         };
1883 
1884         e.set_field(
1885             73..75,
1886             match self.data_type {
1887                 IntType::I64 => 0_u8,
1888                 IntType::U64 => 1_u8,
1889                 IntType::I32 => 2_u8,
1890                 IntType::U32 => 3_u8,
1891                 _ => panic!("Invalid shift data type"),
1892             },
1893         );
1894         e.set_bit(75, self.wrap);
1895         e.set_bit(76, self.right);
1896         e.set_bit(80, self.dst_high);
1897     }
1898 }
1899 
1900 impl SM70Op for OpF2F {
legalize(&mut self, _b: &mut LegalizeBuilder)1901     fn legalize(&mut self, _b: &mut LegalizeBuilder) {
1902         // Nothing to do
1903     }
1904 
encode(&self, e: &mut SM70Encoder<'_>)1905     fn encode(&self, e: &mut SM70Encoder<'_>) {
1906         assert!(!self.integer_rnd);
1907         if self.src_type.bits() <= 32 && self.dst_type.bits() <= 32 {
1908             e.encode_alu(0x104, Some(&self.dst), None, Some(&self.src), None)
1909         } else {
1910             e.encode_alu(0x110, Some(&self.dst), None, Some(&self.src), None)
1911         };
1912 
1913         if self.high {
1914             e.set_field(60..62, 1_u8); // .H1
1915         }
1916 
1917         e.set_field(75..77, (self.dst_type.bits() / 8).ilog2());
1918         e.set_rnd_mode(78..80, self.rnd_mode);
1919         e.set_bit(80, self.ftz);
1920         e.set_field(84..86, (self.src_type.bits() / 8).ilog2());
1921     }
1922 }
1923 
1924 impl SM70Op for OpF2FP {
legalize(&mut self, b: &mut LegalizeBuilder)1925     fn legalize(&mut self, b: &mut LegalizeBuilder) {
1926         let gpr = op_gpr(self);
1927         let [src0, src1] = &mut self.srcs;
1928         swap_srcs_if_not_reg(src0, src1, gpr);
1929 
1930         b.copy_alu_src_if_not_reg(src0, gpr, SrcType::ALU);
1931     }
1932 
encode(&self, e: &mut SM70Encoder<'_>)1933     fn encode(&self, e: &mut SM70Encoder<'_>) {
1934         e.encode_alu(
1935             0x03e,
1936             Some(&self.dst),
1937             Some(&self.srcs[0]),
1938             Some(&self.srcs[1]),
1939             Some(&Src::new_zero()),
1940         );
1941 
1942         // .MERGE_C behavior
1943         // Use src1 and src2, src0 is unused
1944         // src1 get converted and packed in the lower 16 bits of dest.
1945         // src2 lower or high 16 bits (decided by .H1 flag) get packed in the upper of dest.
1946         e.set_bit(78, false); // TODO: .MERGE_C
1947         e.set_bit(72, false); // .H1 (MERGE_C only)
1948         e.set_rnd_mode(79..81, self.rnd_mode);
1949     }
1950 }
1951 
1952 impl SM70Op for OpF2I {
legalize(&mut self, _b: &mut LegalizeBuilder)1953     fn legalize(&mut self, _b: &mut LegalizeBuilder) {
1954         // Nothing to do
1955     }
1956 
encode(&self, e: &mut SM70Encoder<'_>)1957     fn encode(&self, e: &mut SM70Encoder<'_>) {
1958         if self.src_type.bits() <= 32 && self.dst_type.bits() <= 32 {
1959             e.encode_alu(0x105, Some(&self.dst), None, Some(&self.src), None)
1960         } else {
1961             e.encode_alu(0x111, Some(&self.dst), None, Some(&self.src), None)
1962         };
1963 
1964         e.set_bit(72, self.dst_type.is_signed());
1965         e.set_field(75..77, (self.dst_type.bits() / 8).ilog2());
1966         e.set_bit(77, false); // NTZ
1967         e.set_rnd_mode(78..80, self.rnd_mode);
1968         e.set_bit(80, self.ftz);
1969         e.set_field(84..86, (self.src_type.bits() / 8).ilog2());
1970     }
1971 }
1972 
1973 impl SM70Op for OpI2F {
legalize(&mut self, _b: &mut LegalizeBuilder)1974     fn legalize(&mut self, _b: &mut LegalizeBuilder) {
1975         // Nothing to do
1976     }
1977 
encode(&self, e: &mut SM70Encoder<'_>)1978     fn encode(&self, e: &mut SM70Encoder<'_>) {
1979         if self.src_type.bits() <= 32 && self.dst_type.bits() <= 32 {
1980             e.encode_alu(0x106, Some(&self.dst), None, Some(&self.src), None)
1981         } else {
1982             e.encode_alu(0x112, Some(&self.dst), None, Some(&self.src), None)
1983         };
1984 
1985         e.set_field(60..62, 0_u8); // TODO: subop
1986         e.set_bit(74, self.src_type.is_signed());
1987         e.set_field(75..77, (self.dst_type.bits() / 8).ilog2());
1988         e.set_rnd_mode(78..80, self.rnd_mode);
1989         e.set_field(84..86, (self.src_type.bits() / 8).ilog2());
1990     }
1991 }
1992 
1993 impl SM70Op for OpFRnd {
legalize(&mut self, _b: &mut LegalizeBuilder)1994     fn legalize(&mut self, _b: &mut LegalizeBuilder) {
1995         // Nothing to do
1996     }
1997 
encode(&self, e: &mut SM70Encoder<'_>)1998     fn encode(&self, e: &mut SM70Encoder<'_>) {
1999         if self.src_type.bits() <= 32 && self.dst_type.bits() <= 32 {
2000             e.encode_alu(0x107, Some(&self.dst), None, Some(&self.src), None)
2001         } else {
2002             e.encode_alu(0x113, Some(&self.dst), None, Some(&self.src), None)
2003         };
2004 
2005         e.set_field(84..86, (self.src_type.bits() / 8).ilog2());
2006         e.set_bit(80, self.ftz);
2007         e.set_rnd_mode(78..80, self.rnd_mode);
2008         e.set_field(75..77, (self.dst_type.bits() / 8).ilog2());
2009     }
2010 }
2011 
2012 impl SM70Op for OpMov {
legalize(&mut self, _b: &mut LegalizeBuilder)2013     fn legalize(&mut self, _b: &mut LegalizeBuilder) {
2014         // Nothing to do
2015     }
2016 
encode(&self, e: &mut SM70Encoder<'_>)2017     fn encode(&self, e: &mut SM70Encoder<'_>) {
2018         if self.is_uniform() {
2019             e.set_opcode(0xc82);
2020             e.set_udst(self.dst);
2021 
2022             // umov is encoded like a non-uniform ALU op
2023             let src = ALUSrc::from_src(Some(&self.src), true);
2024             let form: u8 = match &src {
2025                 ALUSrc::Reg(reg) => {
2026                     e.encode_alu_ureg(reg, false);
2027                     0x6 // form
2028                 }
2029                 ALUSrc::Imm32(imm) => {
2030                     e.encode_alu_imm(imm);
2031                     0x4 // form
2032                 }
2033                 _ => panic!("Invalid umov src"),
2034             };
2035             e.set_field(9..12, form);
2036         } else {
2037             e.encode_alu(0x002, Some(&self.dst), None, Some(&self.src), None);
2038             e.set_field(72..76, self.quad_lanes);
2039         }
2040     }
2041 }
2042 
2043 impl SM70Op for OpPrmt {
legalize(&mut self, b: &mut LegalizeBuilder)2044     fn legalize(&mut self, b: &mut LegalizeBuilder) {
2045         let gpr = op_gpr(self);
2046         let [src0, src1] = &mut self.srcs;
2047         b.copy_alu_src_if_not_reg(src0, gpr, SrcType::ALU);
2048         b.copy_alu_src_if_not_reg(src1, gpr, SrcType::ALU);
2049     }
2050 
encode(&self, e: &mut SM70Encoder<'_>)2051     fn encode(&self, e: &mut SM70Encoder<'_>) {
2052         if self.is_uniform() {
2053             e.encode_ualu(
2054                 0x96,
2055                 Some(&self.dst),
2056                 Some(&self.srcs[0]),
2057                 Some(&self.sel),
2058                 Some(&self.srcs[1]),
2059             )
2060         } else {
2061             e.encode_alu(
2062                 0x16,
2063                 Some(&self.dst),
2064                 Some(&self.srcs[0]),
2065                 Some(&self.sel),
2066                 Some(&self.srcs[1]),
2067             )
2068         };
2069 
2070         e.set_field(
2071             72..75,
2072             match self.mode {
2073                 PrmtMode::Index => 0_u8,
2074                 PrmtMode::Forward4Extract => 1_u8,
2075                 PrmtMode::Backward4Extract => 2_u8,
2076                 PrmtMode::Replicate8 => 3_u8,
2077                 PrmtMode::EdgeClampLeft => 4_u8,
2078                 PrmtMode::EdgeClampRight => 5_u8,
2079                 PrmtMode::Replicate16 => 6_u8,
2080             },
2081         );
2082     }
2083 }
2084 
2085 impl SM70Op for OpSel {
legalize(&mut self, b: &mut LegalizeBuilder)2086     fn legalize(&mut self, b: &mut LegalizeBuilder) {
2087         let gpr = op_gpr(self);
2088         if !self.is_uniform() {
2089             b.copy_src_if_upred(&mut self.cond);
2090         }
2091         let [src0, src1] = &mut self.srcs;
2092         if swap_srcs_if_not_reg(src0, src1, gpr) {
2093             self.cond = self.cond.bnot();
2094         }
2095         b.copy_alu_src_if_not_reg(src0, gpr, SrcType::ALU);
2096     }
2097 
encode(&self, e: &mut SM70Encoder<'_>)2098     fn encode(&self, e: &mut SM70Encoder<'_>) {
2099         if self.is_uniform() {
2100             e.encode_ualu(
2101                 0x087,
2102                 Some(&self.dst),
2103                 Some(&self.srcs[0]),
2104                 Some(&self.srcs[1]),
2105                 None,
2106             );
2107 
2108             e.set_upred_src(87..90, 90, self.cond);
2109         } else {
2110             e.encode_alu(
2111                 0x007,
2112                 Some(&self.dst),
2113                 Some(&self.srcs[0]),
2114                 Some(&self.srcs[1]),
2115                 None,
2116             );
2117 
2118             e.set_pred_src(87..90, 90, self.cond);
2119         }
2120     }
2121 }
2122 
2123 impl SM70Op for OpShfl {
legalize(&mut self, b: &mut LegalizeBuilder)2124     fn legalize(&mut self, b: &mut LegalizeBuilder) {
2125         let gpr = op_gpr(self);
2126         b.copy_alu_src_if_not_reg(&mut self.src, gpr, SrcType::GPR);
2127         b.copy_alu_src_if_not_reg_or_imm(&mut self.lane, gpr, SrcType::ALU);
2128         b.copy_alu_src_if_not_reg_or_imm(&mut self.c, gpr, SrcType::ALU);
2129     }
2130 
encode(&self, e: &mut SM70Encoder<'_>)2131     fn encode(&self, e: &mut SM70Encoder<'_>) {
2132         assert!(self.lane.src_mod.is_none());
2133         assert!(self.c.src_mod.is_none());
2134 
2135         match &self.lane.src_ref {
2136             SrcRef::Zero | SrcRef::Reg(_) => match &self.c.src_ref {
2137                 SrcRef::Zero | SrcRef::Reg(_) => {
2138                     e.set_opcode(0x389);
2139                     e.set_reg_src(32..40, self.lane);
2140                     e.set_reg_src(64..72, self.c);
2141                 }
2142                 SrcRef::Imm32(imm_c) => {
2143                     e.set_opcode(0x589);
2144                     e.set_reg_src(32..40, self.lane);
2145                     e.set_field(40..53, *imm_c & 0x1f1f);
2146                 }
2147                 _ => panic!("Invalid instruction form"),
2148             },
2149             SrcRef::Imm32(imm_lane) => match &self.c.src_ref {
2150                 SrcRef::Zero | SrcRef::Reg(_) => {
2151                     e.set_opcode(0x989);
2152                     e.set_field(53..58, *imm_lane & 0x1f);
2153                     e.set_reg_src(64..72, self.c);
2154                 }
2155                 SrcRef::Imm32(imm_c) => {
2156                     e.set_opcode(0xf89);
2157                     e.set_field(40..53, *imm_c & 0x1f1f);
2158                     e.set_field(53..58, *imm_lane & 0x1f);
2159                 }
2160                 _ => panic!("Invalid instruction form"),
2161             },
2162             _ => panic!("Invalid instruction form"),
2163         };
2164 
2165         e.set_dst(self.dst);
2166         e.set_pred_dst(81..84, self.in_bounds);
2167         e.set_reg_src(24..32, self.src);
2168         e.set_field(
2169             58..60,
2170             match self.op {
2171                 ShflOp::Idx => 0_u8,
2172                 ShflOp::Up => 1_u8,
2173                 ShflOp::Down => 2_u8,
2174                 ShflOp::Bfly => 3_u8,
2175             },
2176         );
2177     }
2178 }
2179 
2180 impl SM70Op for OpPLop3 {
legalize(&mut self, b: &mut LegalizeBuilder)2181     fn legalize(&mut self, b: &mut LegalizeBuilder) {
2182         // Fold constants and modifiers if we can
2183         for lop in &mut self.ops {
2184             *lop = LogicOp3::new_lut(&|mut x, mut y, mut z| {
2185                 fold_lop_src(&self.srcs[0], &mut x);
2186                 fold_lop_src(&self.srcs[1], &mut y);
2187                 fold_lop_src(&self.srcs[2], &mut z);
2188                 lop.eval(x, y, z)
2189             });
2190         }
2191         for src in &mut self.srcs {
2192             src.src_mod = SrcMod::None;
2193             if src_as_lop_imm(src).is_some() {
2194                 src.src_ref = SrcRef::True;
2195             }
2196         }
2197 
2198         if !self.is_uniform() {
2199             // The warp form of plop3 allows a single uniform predicate in
2200             // src2. If we have a uniform predicate anywhere, try to move it
2201             // there.
2202             let [src0, src1, src2] = &mut self.srcs;
2203             if src_is_upred_reg(src0) && !src_is_upred_reg(src2) {
2204                 std::mem::swap(src0, src2);
2205                 for lop in &mut self.ops {
2206                     *lop = LogicOp3::new_lut(&|x, y, z| lop.eval(z, y, x))
2207                 }
2208             }
2209             if src_is_upred_reg(src1) && !src_is_upred_reg(src2) {
2210                 std::mem::swap(src1, src2);
2211                 for lop in &mut self.ops {
2212                     *lop = LogicOp3::new_lut(&|x, y, z| lop.eval(x, z, y))
2213                 }
2214             }
2215             b.copy_src_if_upred(src0);
2216             b.copy_src_if_upred(src1);
2217         }
2218     }
2219 
encode(&self, e: &mut SM70Encoder<'_>)2220     fn encode(&self, e: &mut SM70Encoder<'_>) {
2221         if self.is_uniform() {
2222             e.set_opcode(0x89c);
2223 
2224             e.set_upred_src(68..71, 71, self.srcs[2]);
2225             e.set_upred_src(77..80, 80, self.srcs[1]);
2226             e.set_upred_src(87..90, 90, self.srcs[0]);
2227         } else {
2228             e.set_opcode(0x81c);
2229 
2230             if self.srcs[2]
2231                 .src_ref
2232                 .as_reg()
2233                 .is_some_and(|r| r.is_uniform())
2234             {
2235                 e.set_upred_src(68..71, 71, self.srcs[2]);
2236                 e.set_bit(67, true);
2237             } else {
2238                 e.set_pred_src(68..71, 71, self.srcs[2]);
2239             }
2240             e.set_pred_src(77..80, 80, self.srcs[1]);
2241             e.set_pred_src(87..90, 90, self.srcs[0]);
2242         }
2243         e.set_field(16..24, self.ops[1].lut);
2244         e.set_field(64..67, self.ops[0].lut & 0x7);
2245         e.set_field(72..77, self.ops[0].lut >> 3);
2246 
2247         e.set_pred_dst(81..84, self.dsts[0]);
2248         e.set_pred_dst(84..87, self.dsts[1]);
2249     }
2250 }
2251 
2252 impl SM70Op for OpR2UR {
legalize(&mut self, _b: &mut LegalizeBuilder)2253     fn legalize(&mut self, _b: &mut LegalizeBuilder) {
2254         // Nothing to do
2255     }
2256 
encode(&self, e: &mut SM70Encoder<'_>)2257     fn encode(&self, e: &mut SM70Encoder<'_>) {
2258         e.set_opcode(0x3c2);
2259         e.set_udst(self.dst);
2260         e.set_reg_src(24..32, self.src);
2261         e.set_pred_dst(81..84, Dst::None);
2262     }
2263 }
2264 
2265 impl SM70Encoder<'_> {
set_tex_dim(&mut self, range: Range<usize>, dim: TexDim)2266     fn set_tex_dim(&mut self, range: Range<usize>, dim: TexDim) {
2267         assert!(range.len() == 3);
2268         self.set_field(
2269             range,
2270             match dim {
2271                 TexDim::_1D => 0_u8,
2272                 TexDim::Array1D => 4_u8,
2273                 TexDim::_2D => 1_u8,
2274                 TexDim::Array2D => 5_u8,
2275                 TexDim::_3D => 2_u8,
2276                 TexDim::Cube => 3_u8,
2277                 TexDim::ArrayCube => 7_u8,
2278             },
2279         );
2280     }
2281 
set_tex_lod_mode(&mut self, range: Range<usize>, lod_mode: TexLodMode)2282     fn set_tex_lod_mode(&mut self, range: Range<usize>, lod_mode: TexLodMode) {
2283         assert!(range.len() == 3);
2284         self.set_field(
2285             range,
2286             match lod_mode {
2287                 TexLodMode::Auto => 0_u8,
2288                 TexLodMode::Zero => 1_u8,
2289                 TexLodMode::Bias => 2_u8,
2290                 TexLodMode::Lod => 3_u8,
2291                 TexLodMode::Clamp => 4_u8,
2292                 TexLodMode::BiasClamp => 5_u8,
2293             },
2294         );
2295     }
2296 
set_image_dim(&mut self, range: Range<usize>, dim: ImageDim)2297     fn set_image_dim(&mut self, range: Range<usize>, dim: ImageDim) {
2298         assert!(range.len() == 3);
2299         self.set_field(
2300             range,
2301             match dim {
2302                 ImageDim::_1D => 0_u8,
2303                 ImageDim::_1DBuffer => 1_u8,
2304                 ImageDim::_1DArray => 2_u8,
2305                 ImageDim::_2D => 3_u8,
2306                 ImageDim::_2DArray => 4_u8,
2307                 ImageDim::_3D => 5_u8,
2308             },
2309         );
2310     }
2311 }
2312 
2313 impl SM70Op for OpTex {
legalize(&mut self, b: &mut LegalizeBuilder)2314     fn legalize(&mut self, b: &mut LegalizeBuilder) {
2315         legalize_ext_instr(self, b);
2316     }
2317 
encode(&self, e: &mut SM70Encoder<'_>)2318     fn encode(&self, e: &mut SM70Encoder<'_>) {
2319         e.set_opcode(0x361);
2320         e.set_bit(59, true); // .B
2321 
2322         e.set_dst(self.dsts[0]);
2323         if let Dst::Reg(reg) = self.dsts[1] {
2324             e.set_reg(64..72, reg);
2325         } else {
2326             e.set_field(64..72, 255_u8);
2327         }
2328         e.set_pred_dst(81..84, self.fault);
2329 
2330         e.set_reg_src(24..32, self.srcs[0]);
2331         e.set_reg_src(32..40, self.srcs[1]);
2332 
2333         e.set_tex_dim(61..64, self.dim);
2334         e.set_field(72..76, self.mask);
2335         e.set_bit(76, self.offset);
2336         e.set_bit(77, false); // ToDo: NDV
2337         e.set_bit(78, self.z_cmpr);
2338         e.set_field(84..87, 1);
2339         e.set_tex_lod_mode(87..90, self.lod_mode);
2340         e.set_bit(90, false); // TODO: .NODEP
2341     }
2342 }
2343 
2344 impl SM70Op for OpTld {
legalize(&mut self, b: &mut LegalizeBuilder)2345     fn legalize(&mut self, b: &mut LegalizeBuilder) {
2346         legalize_ext_instr(self, b);
2347     }
2348 
encode(&self, e: &mut SM70Encoder<'_>)2349     fn encode(&self, e: &mut SM70Encoder<'_>) {
2350         e.set_opcode(0x367);
2351         e.set_bit(59, true); // .B
2352 
2353         e.set_dst(self.dsts[0]);
2354         if let Dst::Reg(reg) = self.dsts[1] {
2355             e.set_reg(64..72, reg);
2356         } else {
2357             e.set_field(64..72, 255_u8);
2358         }
2359         e.set_pred_dst(81..84, self.fault);
2360 
2361         e.set_reg_src(24..32, self.srcs[0]);
2362         e.set_reg_src(32..40, self.srcs[1]);
2363 
2364         e.set_tex_dim(61..64, self.dim);
2365         e.set_field(72..76, self.mask);
2366         e.set_bit(76, self.offset);
2367         // bit 77: .CL
2368         e.set_bit(78, self.is_ms);
2369         // bits 79..81: .F16
2370         assert!(
2371             self.lod_mode == TexLodMode::Zero
2372                 || self.lod_mode == TexLodMode::Lod
2373         );
2374         e.set_tex_lod_mode(87..90, self.lod_mode);
2375         e.set_bit(90, false); // TODO: .NODEP
2376     }
2377 }
2378 
2379 impl SM70Op for OpTld4 {
legalize(&mut self, b: &mut LegalizeBuilder)2380     fn legalize(&mut self, b: &mut LegalizeBuilder) {
2381         legalize_ext_instr(self, b);
2382     }
2383 
encode(&self, e: &mut SM70Encoder<'_>)2384     fn encode(&self, e: &mut SM70Encoder<'_>) {
2385         e.set_opcode(0x364);
2386         e.set_bit(59, true); // .B
2387 
2388         e.set_dst(self.dsts[0]);
2389         if let Dst::Reg(reg) = self.dsts[1] {
2390             e.set_reg(64..72, reg);
2391         } else {
2392             e.set_field(64..72, 255_u8);
2393         }
2394         e.set_pred_dst(81..84, self.fault);
2395 
2396         e.set_reg_src(24..32, self.srcs[0]);
2397         e.set_reg_src(32..40, self.srcs[1]);
2398 
2399         e.set_tex_dim(61..64, self.dim);
2400         e.set_field(72..76, self.mask);
2401         e.set_field(
2402             76..78,
2403             match self.offset_mode {
2404                 Tld4OffsetMode::None => 0_u8,
2405                 Tld4OffsetMode::AddOffI => 1_u8,
2406                 Tld4OffsetMode::PerPx => 2_u8,
2407             },
2408         );
2409         // bit 77: .CL
2410         e.set_bit(78, self.z_cmpr);
2411         e.set_bit(84, true); // !.EF
2412         e.set_field(87..89, self.comp);
2413         e.set_bit(90, false); // TODO: .NODEP
2414     }
2415 }
2416 
2417 impl SM70Op for OpTmml {
legalize(&mut self, b: &mut LegalizeBuilder)2418     fn legalize(&mut self, b: &mut LegalizeBuilder) {
2419         legalize_ext_instr(self, b);
2420     }
2421 
encode(&self, e: &mut SM70Encoder<'_>)2422     fn encode(&self, e: &mut SM70Encoder<'_>) {
2423         e.set_opcode(0x36a);
2424         e.set_bit(59, true); // .B
2425 
2426         e.set_dst(self.dsts[0]);
2427         if let Dst::Reg(reg) = self.dsts[1] {
2428             e.set_reg(64..72, reg);
2429         } else {
2430             e.set_field(64..72, 255_u8);
2431         }
2432 
2433         e.set_reg_src(24..32, self.srcs[0]);
2434         e.set_reg_src(32..40, self.srcs[1]);
2435 
2436         e.set_tex_dim(61..64, self.dim);
2437         e.set_field(72..76, self.mask);
2438         e.set_bit(77, false); // ToDo: NDV
2439         e.set_bit(90, false); // TODO: .NODEP
2440     }
2441 }
2442 
2443 impl SM70Op for OpTxd {
legalize(&mut self, b: &mut LegalizeBuilder)2444     fn legalize(&mut self, b: &mut LegalizeBuilder) {
2445         legalize_ext_instr(self, b);
2446     }
2447 
encode(&self, e: &mut SM70Encoder<'_>)2448     fn encode(&self, e: &mut SM70Encoder<'_>) {
2449         e.set_opcode(0x36d);
2450         e.set_bit(59, true); // .B
2451 
2452         e.set_dst(self.dsts[0]);
2453         if let Dst::Reg(reg) = self.dsts[1] {
2454             e.set_reg(64..72, reg);
2455         } else {
2456             e.set_field(64..72, 255_u8);
2457         }
2458         e.set_pred_dst(81..84, self.fault);
2459 
2460         e.set_reg_src(24..32, self.srcs[0]);
2461         e.set_reg_src(32..40, self.srcs[1]);
2462 
2463         e.set_tex_dim(61..64, self.dim);
2464         e.set_field(72..76, self.mask);
2465         e.set_bit(76, self.offset);
2466         e.set_bit(77, false); // ToDo: NDV
2467         e.set_bit(90, false); // TODO: .NODEP
2468     }
2469 }
2470 
2471 impl SM70Op for OpTxq {
legalize(&mut self, b: &mut LegalizeBuilder)2472     fn legalize(&mut self, b: &mut LegalizeBuilder) {
2473         legalize_ext_instr(self, b);
2474     }
2475 
encode(&self, e: &mut SM70Encoder<'_>)2476     fn encode(&self, e: &mut SM70Encoder<'_>) {
2477         e.set_opcode(0x370);
2478         e.set_bit(59, true); // .B
2479 
2480         e.set_dst(self.dsts[0]);
2481         if let Dst::Reg(reg) = self.dsts[1] {
2482             e.set_reg(64..72, reg);
2483         } else {
2484             e.set_field(64..72, 255_u8);
2485         }
2486 
2487         e.set_reg_src(24..32, self.src);
2488         e.set_field(
2489             62..64,
2490             match self.query {
2491                 TexQuery::Dimension => 0_u8,
2492                 TexQuery::TextureType => 1_u8,
2493                 TexQuery::SamplerPos => 2_u8,
2494             },
2495         );
2496         e.set_field(72..76, self.mask);
2497     }
2498 }
2499 
2500 impl SM70Encoder<'_> {
set_mem_order(&mut self, order: &MemOrder)2501     fn set_mem_order(&mut self, order: &MemOrder) {
2502         if self.sm.sm < 80 {
2503             let scope = match order {
2504                 MemOrder::Constant => MemScope::System,
2505                 MemOrder::Weak => MemScope::CTA,
2506                 MemOrder::Strong(s) => *s,
2507             };
2508             self.set_field(
2509                 77..79,
2510                 match scope {
2511                     MemScope::CTA => 0_u8,
2512                     // SM => 1_u8,
2513                     MemScope::GPU => 2_u8,
2514                     MemScope::System => 3_u8,
2515                 },
2516             );
2517             self.set_field(
2518                 79..81,
2519                 match order {
2520                     MemOrder::Constant => 0_u8,
2521                     MemOrder::Weak => 1_u8,
2522                     MemOrder::Strong(_) => 2_u8,
2523                     // MMIO => 3_u8,
2524                 },
2525             );
2526         } else {
2527             self.set_field(
2528                 77..81,
2529                 match order {
2530                     MemOrder::Constant => 0x4_u8,
2531                     MemOrder::Weak => 0x0_u8,
2532                     MemOrder::Strong(MemScope::CTA) => 0x5_u8,
2533                     MemOrder::Strong(MemScope::GPU) => 0x7_u8,
2534                     MemOrder::Strong(MemScope::System) => 0xa_u8,
2535                 },
2536             );
2537         }
2538     }
2539 
set_eviction_priority(&mut self, pri: &MemEvictionPriority)2540     fn set_eviction_priority(&mut self, pri: &MemEvictionPriority) {
2541         self.set_field(
2542             84..86,
2543             match pri {
2544                 MemEvictionPriority::First => 0_u8,
2545                 MemEvictionPriority::Normal => 1_u8,
2546                 MemEvictionPriority::Last => 2_u8,
2547                 MemEvictionPriority::Unchanged => 3_u8,
2548             },
2549         );
2550     }
2551 
set_mem_type(&mut self, range: Range<usize>, mem_type: MemType)2552     fn set_mem_type(&mut self, range: Range<usize>, mem_type: MemType) {
2553         assert!(range.len() == 3);
2554         self.set_field(
2555             range,
2556             match mem_type {
2557                 MemType::U8 => 0_u8,
2558                 MemType::I8 => 1_u8,
2559                 MemType::U16 => 2_u8,
2560                 MemType::I16 => 3_u8,
2561                 MemType::B32 => 4_u8,
2562                 MemType::B64 => 5_u8,
2563                 MemType::B128 => 6_u8,
2564             },
2565         );
2566     }
2567 
set_mem_access(&mut self, access: &MemAccess)2568     fn set_mem_access(&mut self, access: &MemAccess) {
2569         self.set_field(
2570             72..73,
2571             match access.space.addr_type() {
2572                 MemAddrType::A32 => 0_u8,
2573                 MemAddrType::A64 => 1_u8,
2574             },
2575         );
2576         self.set_mem_type(73..76, access.mem_type);
2577         self.set_mem_order(&access.order);
2578         self.set_eviction_priority(&access.eviction_priority);
2579     }
2580 }
2581 
2582 impl SM70Op for OpSuLd {
legalize(&mut self, b: &mut LegalizeBuilder)2583     fn legalize(&mut self, b: &mut LegalizeBuilder) {
2584         legalize_ext_instr(self, b);
2585     }
2586 
encode(&self, e: &mut SM70Encoder<'_>)2587     fn encode(&self, e: &mut SM70Encoder<'_>) {
2588         e.set_opcode(0x998);
2589 
2590         e.set_dst(self.dst);
2591         e.set_reg_src(24..32, self.coord);
2592         e.set_reg_src(64..72, self.handle);
2593         e.set_pred_dst(81..84, self.fault);
2594 
2595         e.set_image_dim(61..64, self.image_dim);
2596         e.set_mem_order(&self.mem_order);
2597         e.set_eviction_priority(&self.mem_eviction_priority);
2598 
2599         assert!(self.mask == 0x1 || self.mask == 0x3 || self.mask == 0xf);
2600         e.set_field(72..76, self.mask);
2601     }
2602 }
2603 
2604 impl SM70Op for OpSuSt {
legalize(&mut self, b: &mut LegalizeBuilder)2605     fn legalize(&mut self, b: &mut LegalizeBuilder) {
2606         legalize_ext_instr(self, b);
2607     }
2608 
encode(&self, e: &mut SM70Encoder<'_>)2609     fn encode(&self, e: &mut SM70Encoder<'_>) {
2610         e.set_opcode(0x99c);
2611 
2612         e.set_reg_src(24..32, self.coord);
2613         e.set_reg_src(32..40, self.data);
2614         e.set_reg_src(64..72, self.handle);
2615 
2616         e.set_image_dim(61..64, self.image_dim);
2617         e.set_mem_order(&self.mem_order);
2618         e.set_eviction_priority(&self.mem_eviction_priority);
2619 
2620         assert!(self.mask == 0x1 || self.mask == 0x3 || self.mask == 0xf);
2621         e.set_field(72..76, self.mask);
2622     }
2623 }
2624 
2625 impl SM70Op for OpSuAtom {
legalize(&mut self, b: &mut LegalizeBuilder)2626     fn legalize(&mut self, b: &mut LegalizeBuilder) {
2627         legalize_ext_instr(self, b);
2628     }
2629 
encode(&self, e: &mut SM70Encoder<'_>)2630     fn encode(&self, e: &mut SM70Encoder<'_>) {
2631         if self.dst.is_none() {
2632             e.set_opcode(0x3a0);
2633             e.set_atom_op(87..90, self.atom_op);
2634         } else if let AtomOp::CmpExch(cmp_src) = self.atom_op {
2635             e.set_opcode(0x396);
2636             assert!(cmp_src == AtomCmpSrc::Packed);
2637         } else {
2638             e.set_opcode(0x394);
2639             e.set_atom_op(87..91, self.atom_op);
2640         };
2641 
2642         e.set_dst(self.dst);
2643         e.set_reg_src(24..32, self.coord);
2644         e.set_reg_src(32..40, self.data);
2645         e.set_reg_src(64..72, self.handle);
2646         e.set_pred_dst(81..84, self.fault);
2647 
2648         e.set_image_dim(61..64, self.image_dim);
2649         e.set_mem_order(&self.mem_order);
2650         e.set_eviction_priority(&self.mem_eviction_priority);
2651 
2652         e.set_bit(72, false); // .BA
2653         e.set_atom_type(73..76, self.atom_type);
2654     }
2655 }
2656 
2657 impl SM70Op for OpLd {
legalize(&mut self, b: &mut LegalizeBuilder)2658     fn legalize(&mut self, b: &mut LegalizeBuilder) {
2659         legalize_ext_instr(self, b);
2660     }
2661 
encode(&self, e: &mut SM70Encoder<'_>)2662     fn encode(&self, e: &mut SM70Encoder<'_>) {
2663         match self.access.space {
2664             MemSpace::Global(_) => {
2665                 e.set_opcode(0x381);
2666                 e.set_pred_dst(81..84, Dst::None);
2667                 e.set_mem_access(&self.access);
2668             }
2669             MemSpace::Local => {
2670                 e.set_opcode(0x983);
2671                 e.set_field(84..87, 1_u8);
2672 
2673                 e.set_mem_type(73..76, self.access.mem_type);
2674                 assert!(self.access.order == MemOrder::Strong(MemScope::CTA));
2675                 assert!(
2676                     self.access.eviction_priority
2677                         == MemEvictionPriority::Normal
2678                 );
2679             }
2680             MemSpace::Shared => {
2681                 e.set_opcode(0x984);
2682 
2683                 e.set_mem_type(73..76, self.access.mem_type);
2684                 assert!(self.access.order == MemOrder::Strong(MemScope::CTA));
2685                 assert!(
2686                     self.access.eviction_priority
2687                         == MemEvictionPriority::Normal
2688                 );
2689 
2690                 e.set_bit(87, false); // !.ZD - Returns a predicate?
2691             }
2692         }
2693 
2694         e.set_dst(self.dst);
2695         e.set_reg_src(24..32, self.addr);
2696         e.set_field(40..64, self.offset);
2697     }
2698 }
2699 
2700 impl SM70Op for OpLdc {
legalize(&mut self, b: &mut LegalizeBuilder)2701     fn legalize(&mut self, b: &mut LegalizeBuilder) {
2702         let gpr = op_gpr(self);
2703         b.copy_alu_src_if_not_reg(&mut self.offset, gpr, SrcType::GPR);
2704     }
2705 
encode(&self, e: &mut SM70Encoder<'_>)2706     fn encode(&self, e: &mut SM70Encoder<'_>) {
2707         let SrcRef::CBuf(cb) = &self.cb.src_ref else {
2708             panic!("LDC must take a cbuf source");
2709         };
2710 
2711         match cb.buf {
2712             CBuf::Binding(idx) => {
2713                 if self.is_uniform() {
2714                     e.set_opcode(0xab9);
2715                     e.set_udst(self.dst);
2716 
2717                     assert!(self.offset.is_zero());
2718                     assert!(self.mode == LdcMode::Indexed);
2719                 } else {
2720                     e.set_opcode(0xb82);
2721                     e.set_dst(self.dst);
2722 
2723                     e.set_reg_src(24..32, self.offset);
2724                     e.set_field(
2725                         78..80,
2726                         match self.mode {
2727                             LdcMode::Indexed => 0_u8,
2728                             LdcMode::IndexedLinear => 1_u8,
2729                             LdcMode::IndexedSegmented => 2_u8,
2730                             LdcMode::IndexedSegmentedLinear => 3_u8,
2731                         },
2732                     );
2733                 }
2734                 e.set_field(54..59, idx);
2735                 e.set_bit(91, false); // Bound
2736             }
2737             CBuf::BindlessUGPR(handle) => {
2738                 if self.is_uniform() {
2739                     e.set_opcode(0xab9);
2740                     e.set_udst(self.dst);
2741 
2742                     assert!(self.offset.is_zero());
2743                 } else {
2744                     e.set_opcode(0x582);
2745                     e.set_dst(self.dst);
2746 
2747                     e.set_reg_src(64..72, self.offset);
2748                 }
2749 
2750                 e.set_ureg(24..32, handle);
2751                 e.set_reg_src(64..72, self.offset);
2752                 assert!(self.mode == LdcMode::Indexed);
2753                 e.set_bit(91, true); // Bindless
2754             }
2755             CBuf::BindlessSSA(_) => panic!("SSA values must be lowered"),
2756         }
2757 
2758         e.set_field(38..54, cb.offset);
2759         e.set_mem_type(73..76, self.mem_type);
2760     }
2761 }
2762 
2763 impl SM70Op for OpSt {
legalize(&mut self, b: &mut LegalizeBuilder)2764     fn legalize(&mut self, b: &mut LegalizeBuilder) {
2765         legalize_ext_instr(self, b);
2766     }
2767 
encode(&self, e: &mut SM70Encoder<'_>)2768     fn encode(&self, e: &mut SM70Encoder<'_>) {
2769         match self.access.space {
2770             MemSpace::Global(_) => {
2771                 e.set_opcode(0x386);
2772                 e.set_mem_access(&self.access);
2773             }
2774             MemSpace::Local => {
2775                 e.set_opcode(0x387);
2776                 e.set_field(84..87, 1_u8);
2777 
2778                 e.set_mem_type(73..76, self.access.mem_type);
2779                 assert!(self.access.order == MemOrder::Strong(MemScope::CTA));
2780                 assert!(
2781                     self.access.eviction_priority
2782                         == MemEvictionPriority::Normal
2783                 );
2784             }
2785             MemSpace::Shared => {
2786                 e.set_opcode(0x388);
2787 
2788                 e.set_mem_type(73..76, self.access.mem_type);
2789                 assert!(self.access.order == MemOrder::Strong(MemScope::CTA));
2790                 assert!(
2791                     self.access.eviction_priority
2792                         == MemEvictionPriority::Normal
2793                 );
2794             }
2795         }
2796 
2797         e.set_reg_src(24..32, self.addr);
2798         e.set_reg_src(32..40, self.data);
2799         e.set_field(40..64, self.offset);
2800     }
2801 }
2802 
2803 impl SM70Encoder<'_> {
set_atom_op(&mut self, range: Range<usize>, atom_op: AtomOp)2804     fn set_atom_op(&mut self, range: Range<usize>, atom_op: AtomOp) {
2805         self.set_field(
2806             range,
2807             match atom_op {
2808                 AtomOp::Add => 0_u8,
2809                 AtomOp::Min => 1_u8,
2810                 AtomOp::Max => 2_u8,
2811                 AtomOp::Inc => 3_u8,
2812                 AtomOp::Dec => 4_u8,
2813                 AtomOp::And => 5_u8,
2814                 AtomOp::Or => 6_u8,
2815                 AtomOp::Xor => 7_u8,
2816                 AtomOp::Exch => 8_u8,
2817                 AtomOp::CmpExch(_) => panic!("CmpExch is a separate opcode"),
2818             },
2819         );
2820     }
2821 
set_atom_type(&mut self, range: Range<usize>, atom_type: AtomType)2822     fn set_atom_type(&mut self, range: Range<usize>, atom_type: AtomType) {
2823         assert!(range.len() == 3);
2824         self.set_field(
2825             range,
2826             match atom_type {
2827                 AtomType::U32 => 0_u8,
2828                 AtomType::I32 => 1_u8,
2829                 AtomType::U64 => 2_u8,
2830                 AtomType::F32 => 3_u8,
2831                 AtomType::F16x2 => 4_u8,
2832                 AtomType::I64 => 5_u8,
2833                 AtomType::F64 => 6_u8,
2834             },
2835         );
2836     }
2837 }
2838 
2839 impl SM70Op for OpAtom {
legalize(&mut self, b: &mut LegalizeBuilder)2840     fn legalize(&mut self, b: &mut LegalizeBuilder) {
2841         legalize_ext_instr(self, b);
2842     }
2843 
encode(&self, e: &mut SM70Encoder<'_>)2844     fn encode(&self, e: &mut SM70Encoder<'_>) {
2845         match self.mem_space {
2846             MemSpace::Global(_) => {
2847                 if self.dst.is_none() {
2848                     e.set_opcode(0x98e);
2849 
2850                     e.set_reg_src(32..40, self.data);
2851                     e.set_atom_op(87..90, self.atom_op);
2852                 } else if let AtomOp::CmpExch(cmp_src) = self.atom_op {
2853                     e.set_opcode(0x3a9);
2854 
2855                     assert!(cmp_src == AtomCmpSrc::Separate);
2856                     e.set_reg_src(32..40, self.cmpr);
2857                     e.set_reg_src(64..72, self.data);
2858                 } else {
2859                     e.set_opcode(0x3a8);
2860 
2861                     e.set_reg_src(32..40, self.data);
2862                     e.set_atom_op(87..91, self.atom_op);
2863                 }
2864 
2865                 e.set_pred_dst(81..84, Dst::None);
2866 
2867                 e.set_field(
2868                     72..73,
2869                     match self.mem_space.addr_type() {
2870                         MemAddrType::A32 => 0_u8,
2871                         MemAddrType::A64 => 1_u8,
2872                     },
2873                 );
2874 
2875                 e.set_mem_order(&self.mem_order);
2876                 e.set_eviction_priority(&self.mem_eviction_priority);
2877             }
2878             MemSpace::Local => panic!("Atomics do not support local"),
2879             MemSpace::Shared => {
2880                 if let AtomOp::CmpExch(cmp_src) = self.atom_op {
2881                     e.set_opcode(0x38d);
2882 
2883                     assert!(cmp_src == AtomCmpSrc::Separate);
2884                     e.set_reg_src(32..40, self.cmpr);
2885                     e.set_reg_src(64..72, self.data);
2886                 } else {
2887                     e.set_opcode(0x38c);
2888 
2889                     e.set_reg_src(32..40, self.data);
2890                     e.set_atom_op(87..91, self.atom_op);
2891                 }
2892 
2893                 assert!(self.mem_order == MemOrder::Strong(MemScope::CTA));
2894                 assert!(
2895                     self.mem_eviction_priority == MemEvictionPriority::Normal
2896                 );
2897             }
2898         }
2899 
2900         e.set_dst(self.dst);
2901         e.set_reg_src(24..32, self.addr);
2902         e.set_field(40..64, self.addr_offset);
2903         e.set_atom_type(73..76, self.atom_type);
2904     }
2905 }
2906 
2907 impl SM70Op for OpAL2P {
legalize(&mut self, b: &mut LegalizeBuilder)2908     fn legalize(&mut self, b: &mut LegalizeBuilder) {
2909         legalize_ext_instr(self, b);
2910     }
2911 
encode(&self, e: &mut SM70Encoder<'_>)2912     fn encode(&self, e: &mut SM70Encoder<'_>) {
2913         e.set_opcode(0x920);
2914 
2915         e.set_dst(self.dst);
2916         e.set_reg_src(24..32, self.offset);
2917 
2918         e.set_field(40..50, self.access.addr);
2919         e.set_field(74..76, 0_u8); // comps
2920         assert!(!self.access.patch);
2921         e.set_bit(79, self.access.output);
2922     }
2923 }
2924 
2925 impl SM70Op for OpALd {
legalize(&mut self, b: &mut LegalizeBuilder)2926     fn legalize(&mut self, b: &mut LegalizeBuilder) {
2927         legalize_ext_instr(self, b);
2928     }
2929 
encode(&self, e: &mut SM70Encoder<'_>)2930     fn encode(&self, e: &mut SM70Encoder<'_>) {
2931         e.set_opcode(0x321);
2932 
2933         e.set_dst(self.dst);
2934         e.set_reg_src(32..40, self.vtx);
2935         e.set_reg_src(24..32, self.offset);
2936 
2937         e.set_field(40..50, self.access.addr);
2938         e.set_field(74..76, self.access.comps - 1);
2939         e.set_field(76..77, self.access.patch);
2940         e.set_field(77..78, self.access.phys);
2941         e.set_field(79..80, self.access.output);
2942     }
2943 }
2944 
2945 impl SM70Op for OpASt {
legalize(&mut self, b: &mut LegalizeBuilder)2946     fn legalize(&mut self, b: &mut LegalizeBuilder) {
2947         legalize_ext_instr(self, b);
2948     }
2949 
encode(&self, e: &mut SM70Encoder<'_>)2950     fn encode(&self, e: &mut SM70Encoder<'_>) {
2951         e.set_opcode(0x322);
2952 
2953         e.set_reg_src(32..40, self.data);
2954         e.set_reg_src(64..72, self.vtx);
2955         e.set_reg_src(24..32, self.offset);
2956 
2957         e.set_field(40..50, self.access.addr);
2958         e.set_field(74..76, self.access.comps - 1);
2959         e.set_field(76..77, self.access.patch);
2960         e.set_field(77..78, self.access.phys);
2961         assert!(self.access.output);
2962     }
2963 }
2964 
2965 impl SM70Op for OpIpa {
legalize(&mut self, b: &mut LegalizeBuilder)2966     fn legalize(&mut self, b: &mut LegalizeBuilder) {
2967         legalize_ext_instr(self, b);
2968     }
2969 
encode(&self, e: &mut SM70Encoder<'_>)2970     fn encode(&self, e: &mut SM70Encoder<'_>) {
2971         e.set_opcode(0x326);
2972 
2973         e.set_dst(self.dst);
2974 
2975         assert!(self.addr % 4 == 0);
2976         e.set_field(64..72, self.addr >> 2);
2977 
2978         e.set_field(
2979             76..78,
2980             match self.loc {
2981                 InterpLoc::Default => 0_u8,
2982                 InterpLoc::Centroid => 1_u8,
2983                 InterpLoc::Offset => 2_u8,
2984             },
2985         );
2986         e.set_field(
2987             78..80,
2988             match self.freq {
2989                 InterpFreq::Pass => 0_u8,
2990                 InterpFreq::Constant => 1_u8,
2991                 InterpFreq::State => 2_u8,
2992                 InterpFreq::PassMulW => {
2993                     panic!("InterpFreq::PassMulW is invalid on SM70+");
2994                 }
2995             },
2996         );
2997 
2998         assert!(self.inv_w.is_zero());
2999         e.set_reg_src(32..40, self.offset);
3000 
3001         // TODO: What is this for?
3002         e.set_pred_dst(81..84, Dst::None);
3003     }
3004 }
3005 
3006 impl SM70Op for OpLdTram {
legalize(&mut self, b: &mut LegalizeBuilder)3007     fn legalize(&mut self, b: &mut LegalizeBuilder) {
3008         legalize_ext_instr(self, b);
3009     }
3010 
encode(&self, e: &mut SM70Encoder<'_>)3011     fn encode(&self, e: &mut SM70Encoder<'_>) {
3012         e.set_opcode(0x3ad);
3013         e.set_dst(self.dst);
3014         e.set_ureg(24..32, RegRef::zero(RegFile::UGPR, 1));
3015 
3016         assert!(self.addr % 4 == 0);
3017         e.set_field(64..72, self.addr >> 2);
3018 
3019         e.set_bit(72, self.use_c);
3020 
3021         // Unknown but required
3022         e.set_bit(91, true);
3023     }
3024 }
3025 
3026 impl SM70Op for OpCCtl {
legalize(&mut self, b: &mut LegalizeBuilder)3027     fn legalize(&mut self, b: &mut LegalizeBuilder) {
3028         legalize_ext_instr(self, b);
3029     }
3030 
encode(&self, e: &mut SM70Encoder<'_>)3031     fn encode(&self, e: &mut SM70Encoder<'_>) {
3032         assert!(matches!(self.mem_space, MemSpace::Global(_)));
3033         e.set_opcode(0x98f);
3034 
3035         e.set_reg_src(24..32, self.addr);
3036         e.set_field(32..64, self.addr_offset);
3037 
3038         e.set_field(
3039             87..91,
3040             match self.op {
3041                 CCtlOp::PF1 => 0_u8,
3042                 CCtlOp::PF2 => 1_u8,
3043                 CCtlOp::WB => 2_u8,
3044                 CCtlOp::IV => 3_u8,
3045                 CCtlOp::IVAll => 4_u8,
3046                 CCtlOp::RS => 5_u8,
3047                 CCtlOp::IVAllP => 6_u8,
3048                 CCtlOp::WBAll => 7_u8,
3049                 CCtlOp::WBAllP => 8_u8,
3050                 op => panic!("Unsupported cache control {op:?}"),
3051             },
3052         );
3053     }
3054 }
3055 
3056 impl SM70Op for OpMemBar {
legalize(&mut self, _b: &mut LegalizeBuilder)3057     fn legalize(&mut self, _b: &mut LegalizeBuilder) {
3058         // Nothing to do
3059     }
3060 
encode(&self, e: &mut SM70Encoder<'_>)3061     fn encode(&self, e: &mut SM70Encoder<'_>) {
3062         e.set_opcode(0x992);
3063 
3064         e.set_bit(72, false); // !.MMIO
3065         e.set_field(
3066             76..79,
3067             match self.scope {
3068                 MemScope::CTA => 0_u8,
3069                 // SM => 1_u8,
3070                 MemScope::GPU => 2_u8,
3071                 MemScope::System => 3_u8,
3072             },
3073         );
3074         e.set_bit(80, false); // .SC
3075     }
3076 }
3077 
3078 impl SM70Encoder<'_> {
set_rel_offset(&mut self, range: Range<usize>, label: &Label)3079     fn set_rel_offset(&mut self, range: Range<usize>, label: &Label) {
3080         let ip = u64::try_from(self.ip).unwrap();
3081         let ip = i64::try_from(ip).unwrap();
3082 
3083         let target_ip = *self.labels.get(label).unwrap();
3084         let target_ip = u64::try_from(target_ip).unwrap();
3085         let target_ip = i64::try_from(target_ip).unwrap();
3086 
3087         let rel_offset = target_ip - ip - 4;
3088 
3089         self.set_field(range, rel_offset);
3090     }
3091 }
3092 
3093 impl SM70Op for OpBClear {
legalize(&mut self, _b: &mut LegalizeBuilder)3094     fn legalize(&mut self, _b: &mut LegalizeBuilder) {
3095         // Nothing to do
3096     }
3097 
encode(&self, e: &mut SM70Encoder<'_>)3098     fn encode(&self, e: &mut SM70Encoder<'_>) {
3099         e.set_opcode(0x355);
3100 
3101         e.set_dst(Dst::None);
3102         e.set_bar_dst(24..28, self.dst);
3103 
3104         e.set_bit(84, true); // .CLEAR
3105     }
3106 }
3107 
3108 impl SM70Op for OpBMov {
legalize(&mut self, _b: &mut LegalizeBuilder)3109     fn legalize(&mut self, _b: &mut LegalizeBuilder) {
3110         // Nothing to do
3111     }
3112 
encode(&self, e: &mut SM70Encoder<'_>)3113     fn encode(&self, e: &mut SM70Encoder<'_>) {
3114         if dst_is_bar(self.dst) {
3115             e.set_opcode(0x356);
3116 
3117             e.set_bar_dst(24..28, self.dst);
3118             e.set_reg_src(32..40, self.src);
3119 
3120             e.set_bit(84, self.clear);
3121         } else {
3122             e.set_opcode(0x355);
3123 
3124             e.set_dst(self.dst);
3125             e.set_bar_src(24..28, self.src);
3126 
3127             e.set_bit(84, self.clear);
3128         }
3129     }
3130 }
3131 
3132 impl SM70Op for OpBreak {
legalize(&mut self, _b: &mut LegalizeBuilder)3133     fn legalize(&mut self, _b: &mut LegalizeBuilder) {
3134         // Nothing to do
3135     }
3136 
encode(&self, e: &mut SM70Encoder<'_>)3137     fn encode(&self, e: &mut SM70Encoder<'_>) {
3138         e.set_opcode(0x942);
3139         assert!(self.bar_in.src_ref.as_reg() == self.bar_out.as_reg());
3140         e.set_bar_dst(16..20, self.bar_out);
3141         e.set_pred_src(87..90, 90, self.cond);
3142     }
3143 }
3144 
3145 impl SM70Op for OpBSSy {
legalize(&mut self, _b: &mut LegalizeBuilder)3146     fn legalize(&mut self, _b: &mut LegalizeBuilder) {
3147         // Nothing to do
3148     }
3149 
encode(&self, e: &mut SM70Encoder<'_>)3150     fn encode(&self, e: &mut SM70Encoder<'_>) {
3151         e.set_opcode(0x945);
3152         assert!(self.bar_in.src_ref.as_reg() == self.bar_out.as_reg());
3153         e.set_bar_dst(16..20, self.bar_out);
3154         e.set_rel_offset(34..64, &self.target);
3155         e.set_pred_src(87..90, 90, self.cond);
3156     }
3157 }
3158 
3159 impl SM70Op for OpBSync {
legalize(&mut self, _b: &mut LegalizeBuilder)3160     fn legalize(&mut self, _b: &mut LegalizeBuilder) {
3161         // Nothing to do
3162     }
3163 
encode(&self, e: &mut SM70Encoder<'_>)3164     fn encode(&self, e: &mut SM70Encoder<'_>) {
3165         e.set_opcode(0x941);
3166         e.set_bar_src(16..20, self.bar);
3167         e.set_pred_src(87..90, 90, self.cond);
3168     }
3169 }
3170 
3171 impl SM70Op for OpBra {
legalize(&mut self, _b: &mut LegalizeBuilder)3172     fn legalize(&mut self, _b: &mut LegalizeBuilder) {
3173         // Nothing to do
3174     }
3175 
encode(&self, e: &mut SM70Encoder<'_>)3176     fn encode(&self, e: &mut SM70Encoder<'_>) {
3177         e.set_opcode(0x947);
3178         e.set_rel_offset(34..82, &self.target);
3179         e.set_field(87..90, 0x7_u8); // TODO: Pred?
3180     }
3181 }
3182 
3183 impl SM70Op for OpExit {
legalize(&mut self, _b: &mut LegalizeBuilder)3184     fn legalize(&mut self, _b: &mut LegalizeBuilder) {
3185         // Nothing to do
3186     }
3187 
encode(&self, e: &mut SM70Encoder<'_>)3188     fn encode(&self, e: &mut SM70Encoder<'_>) {
3189         e.set_opcode(0x94d);
3190 
3191         // ./.KEEPREFCOUNT/.PREEMPTED/.INVALID3
3192         e.set_field(84..85, false);
3193         e.set_field(85..86, false); // .NO_ATEXIT
3194         e.set_field(87..90, 0x7_u8); // TODO: Predicate
3195         e.set_field(90..91, false); // NOT
3196     }
3197 }
3198 
3199 impl SM70Op for OpWarpSync {
legalize(&mut self, _b: &mut LegalizeBuilder)3200     fn legalize(&mut self, _b: &mut LegalizeBuilder) {
3201         // Nothing to do
3202     }
3203 
encode(&self, e: &mut SM70Encoder<'_>)3204     fn encode(&self, e: &mut SM70Encoder<'_>) {
3205         e.encode_alu(0x148, None, None, Some(&Src::from(self.mask)), None);
3206         e.set_pred_src(87..90, 90, SrcRef::True.into());
3207     }
3208 }
3209 
3210 impl SM70Op for OpBar {
legalize(&mut self, _b: &mut LegalizeBuilder)3211     fn legalize(&mut self, _b: &mut LegalizeBuilder) {
3212         // Nothing to do
3213     }
3214 
encode(&self, e: &mut SM70Encoder<'_>)3215     fn encode(&self, e: &mut SM70Encoder<'_>) {
3216         e.set_opcode(0xb1d);
3217 
3218         // e.set_opcode(0x31d);
3219 
3220         // // src0 == src1
3221         // e.set_reg_src(32..40, SrcRef::Zero.into());
3222 
3223         // // 00: RED.POPC
3224         // // 01: RED.AND
3225         // // 02: RED.OR
3226         // e.set_field(74..76, 0_u8);
3227 
3228         // // 00: SYNC
3229         // // 01: ARV
3230         // // 02: RED
3231         // // 03: SCAN
3232         // e.set_field(77..79, 0_u8);
3233 
3234         // e.set_pred_src(87..90, 90, SrcRef::True.into());
3235     }
3236 }
3237 
3238 impl SM70Op for OpCS2R {
legalize(&mut self, _b: &mut LegalizeBuilder)3239     fn legalize(&mut self, _b: &mut LegalizeBuilder) {
3240         // Nothing to do
3241     }
3242 
encode(&self, e: &mut SM70Encoder<'_>)3243     fn encode(&self, e: &mut SM70Encoder<'_>) {
3244         e.set_opcode(0x805);
3245         e.set_dst(self.dst);
3246         e.set_field(72..80, self.idx);
3247         e.set_bit(80, self.dst.as_reg().unwrap().comps() == 2); // .64
3248     }
3249 }
3250 
3251 impl SM70Op for OpIsberd {
legalize(&mut self, _b: &mut LegalizeBuilder)3252     fn legalize(&mut self, _b: &mut LegalizeBuilder) {
3253         // Nothing to do
3254     }
3255 
encode(&self, e: &mut SM70Encoder<'_>)3256     fn encode(&self, e: &mut SM70Encoder<'_>) {
3257         e.set_opcode(0x923);
3258         e.set_dst(self.dst);
3259         e.set_reg_src(24..32, self.idx);
3260     }
3261 }
3262 
3263 impl SM70Op for OpKill {
legalize(&mut self, _b: &mut LegalizeBuilder)3264     fn legalize(&mut self, _b: &mut LegalizeBuilder) {
3265         // Nothing to do
3266     }
3267 
encode(&self, e: &mut SM70Encoder<'_>)3268     fn encode(&self, e: &mut SM70Encoder<'_>) {
3269         e.set_opcode(0x95b);
3270         e.set_pred_src(87..90, 90, SrcRef::True.into());
3271     }
3272 }
3273 
3274 impl SM70Op for OpNop {
legalize(&mut self, _b: &mut LegalizeBuilder)3275     fn legalize(&mut self, _b: &mut LegalizeBuilder) {
3276         // Nothing to do
3277     }
3278 
encode(&self, e: &mut SM70Encoder<'_>)3279     fn encode(&self, e: &mut SM70Encoder<'_>) {
3280         e.set_opcode(0x918);
3281     }
3282 }
3283 
3284 impl SM70Op for OpPixLd {
legalize(&mut self, _b: &mut LegalizeBuilder)3285     fn legalize(&mut self, _b: &mut LegalizeBuilder) {
3286         // Nothing to do
3287     }
3288 
encode(&self, e: &mut SM70Encoder<'_>)3289     fn encode(&self, e: &mut SM70Encoder<'_>) {
3290         e.set_opcode(0x925);
3291         e.set_dst(self.dst);
3292         e.set_field(
3293             78..81,
3294             match &self.val {
3295                 PixVal::MsCount => 0_u8,
3296                 PixVal::CovMask => 1_u8,
3297                 PixVal::CentroidOffset => 2_u8,
3298                 PixVal::MyIndex => 3_u8,
3299                 PixVal::InnerCoverage => 4_u8,
3300                 other => panic!("Unsupported PixVal: {other}"),
3301             },
3302         );
3303         e.set_pred_dst(81..84, Dst::None);
3304     }
3305 }
3306 
3307 impl SM70Op for OpS2R {
legalize(&mut self, _b: &mut LegalizeBuilder)3308     fn legalize(&mut self, _b: &mut LegalizeBuilder) {
3309         // Nothing to do
3310     }
3311 
encode(&self, e: &mut SM70Encoder<'_>)3312     fn encode(&self, e: &mut SM70Encoder<'_>) {
3313         assert!(!self.is_uniform());
3314         e.set_opcode(if self.is_uniform() { 0x9c3 } else { 0x919 });
3315         e.set_dst(self.dst);
3316         e.set_field(72..80, self.idx);
3317     }
3318 }
3319 
3320 impl SM70Op for OpOut {
legalize(&mut self, b: &mut LegalizeBuilder)3321     fn legalize(&mut self, b: &mut LegalizeBuilder) {
3322         let gpr = op_gpr(self);
3323         b.copy_alu_src_if_not_reg(&mut self.handle, gpr, SrcType::GPR);
3324         b.copy_alu_src_if_not_reg_or_imm(&mut self.stream, gpr, SrcType::ALU);
3325     }
3326 
encode(&self, e: &mut SM70Encoder<'_>)3327     fn encode(&self, e: &mut SM70Encoder<'_>) {
3328         e.encode_alu(
3329             0x124,
3330             Some(&self.dst),
3331             Some(&self.handle),
3332             Some(&self.stream),
3333             None,
3334         );
3335 
3336         e.set_field(
3337             78..80,
3338             match self.out_type {
3339                 OutType::Emit => 1_u8,
3340                 OutType::Cut => 2_u8,
3341                 OutType::EmitThenCut => 3_u8,
3342             },
3343         );
3344     }
3345 }
3346 
3347 impl SM70Op for OpOutFinal {
legalize(&mut self, b: &mut LegalizeBuilder)3348     fn legalize(&mut self, b: &mut LegalizeBuilder) {
3349         let gpr = op_gpr(self);
3350         b.copy_alu_src_if_not_reg(&mut self.handle, gpr, SrcType::GPR);
3351     }
3352 
encode(&self, e: &mut SM70Encoder<'_>)3353     fn encode(&self, e: &mut SM70Encoder<'_>) {
3354         e.encode_alu(
3355             0x124,
3356             Some(&Dst::None),
3357             Some(&self.handle),
3358             Some(&Src::new_zero()),
3359             None,
3360         );
3361     }
3362 }
3363 
3364 impl SM70Op for OpVote {
legalize(&mut self, b: &mut LegalizeBuilder)3365     fn legalize(&mut self, b: &mut LegalizeBuilder) {
3366         b.copy_src_if_upred(&mut self.pred);
3367     }
3368 
encode(&self, e: &mut SM70Encoder<'_>)3369     fn encode(&self, e: &mut SM70Encoder<'_>) {
3370         if self.is_uniform() {
3371             e.set_opcode(0x886);
3372             e.set_udst(self.ballot);
3373         } else {
3374             e.set_opcode(0x806);
3375             e.set_dst(self.ballot);
3376         }
3377 
3378         e.set_field(
3379             72..74,
3380             match self.op {
3381                 VoteOp::All => 0_u8,
3382                 VoteOp::Any => 1_u8,
3383                 VoteOp::Eq => 2_u8,
3384             },
3385         );
3386 
3387         e.set_pred_dst(81..84, self.vote);
3388         e.set_pred_src(87..90, 90, self.pred);
3389     }
3390 }
3391 
// Expands to a `match` that projects an `Op` onto the payload of whichever
// variant has an SM70 implementation, and panics for every other variant.
// Being a macro, the same arm list serves both shared and mutable
// references.
macro_rules! as_sm70_op_match {
    ($scrutinee: expr) => {
        match $scrutinee {
            Op::FAdd(inner) => inner,
            Op::FFma(inner) => inner,
            Op::FMnMx(inner) => inner,
            Op::FMul(inner) => inner,
            Op::FSet(inner) => inner,
            Op::FSetP(inner) => inner,
            Op::FSwzAdd(inner) => inner,
            Op::DAdd(inner) => inner,
            Op::DFma(inner) => inner,
            Op::DMul(inner) => inner,
            Op::DSetP(inner) => inner,
            Op::HAdd2(inner) => inner,
            Op::HFma2(inner) => inner,
            Op::HMul2(inner) => inner,
            Op::HSet2(inner) => inner,
            Op::HSetP2(inner) => inner,
            Op::HMnMx2(inner) => inner,
            Op::MuFu(inner) => inner,
            Op::BMsk(inner) => inner,
            Op::BRev(inner) => inner,
            Op::Flo(inner) => inner,
            Op::IAbs(inner) => inner,
            Op::IAdd3(inner) => inner,
            Op::IAdd3X(inner) => inner,
            Op::IDp4(inner) => inner,
            Op::IMad(inner) => inner,
            Op::IMad64(inner) => inner,
            Op::IMnMx(inner) => inner,
            Op::ISetP(inner) => inner,
            Op::Lop3(inner) => inner,
            Op::PopC(inner) => inner,
            Op::Shf(inner) => inner,
            Op::F2F(inner) => inner,
            Op::F2FP(inner) => inner,
            Op::F2I(inner) => inner,
            Op::I2F(inner) => inner,
            Op::FRnd(inner) => inner,
            Op::Mov(inner) => inner,
            Op::Prmt(inner) => inner,
            Op::Sel(inner) => inner,
            Op::Shfl(inner) => inner,
            Op::PLop3(inner) => inner,
            Op::R2UR(inner) => inner,
            Op::Tex(inner) => inner,
            Op::Tld(inner) => inner,
            Op::Tld4(inner) => inner,
            Op::Tmml(inner) => inner,
            Op::Txd(inner) => inner,
            Op::Txq(inner) => inner,
            Op::SuLd(inner) => inner,
            Op::SuSt(inner) => inner,
            Op::SuAtom(inner) => inner,
            Op::Ld(inner) => inner,
            Op::Ldc(inner) => inner,
            Op::St(inner) => inner,
            Op::Atom(inner) => inner,
            Op::AL2P(inner) => inner,
            Op::ALd(inner) => inner,
            Op::ASt(inner) => inner,
            Op::Ipa(inner) => inner,
            Op::LdTram(inner) => inner,
            Op::CCtl(inner) => inner,
            Op::MemBar(inner) => inner,
            Op::BClear(inner) => inner,
            Op::BMov(inner) => inner,
            Op::Break(inner) => inner,
            Op::BSSy(inner) => inner,
            Op::BSync(inner) => inner,
            Op::Bra(inner) => inner,
            Op::Exit(inner) => inner,
            Op::WarpSync(inner) => inner,
            Op::Bar(inner) => inner,
            Op::CS2R(inner) => inner,
            Op::Isberd(inner) => inner,
            Op::Kill(inner) => inner,
            Op::Nop(inner) => inner,
            Op::PixLd(inner) => inner,
            Op::S2R(inner) => inner,
            Op::Out(inner) => inner,
            Op::OutFinal(inner) => inner,
            Op::Vote(inner) => inner,
            _ => panic!("Unsupported op: {}", $scrutinee),
        }
    };
}
3480 
as_sm70_op(op: &Op) -> &dyn SM70Op3481 fn as_sm70_op(op: &Op) -> &dyn SM70Op {
3482     as_sm70_op_match!(op)
3483 }
3484 
as_sm70_op_mut(op: &mut Op) -> &mut dyn SM70Op3485 fn as_sm70_op_mut(op: &mut Op) -> &mut dyn SM70Op {
3486     as_sm70_op_match!(op)
3487 }
3488 
encode_sm70_shader(sm: &ShaderModel70, s: &Shader<'_>) -> Vec<u32>3489 fn encode_sm70_shader(sm: &ShaderModel70, s: &Shader<'_>) -> Vec<u32> {
3490     assert!(s.functions.len() == 1);
3491     let func = &s.functions[0];
3492 
3493     let mut ip = 0_usize;
3494     let mut labels = HashMap::new();
3495     for b in &func.blocks {
3496         labels.insert(b.label, ip);
3497         for instr in &b.instrs {
3498             if let Op::Nop(op) = &instr.op {
3499                 if let Some(label) = op.label {
3500                     labels.insert(label, ip);
3501                 }
3502             }
3503             ip += 4;
3504         }
3505     }
3506 
3507     let mut encoded = Vec::new();
3508     for b in &func.blocks {
3509         for instr in &b.instrs {
3510             let mut e = SM70Encoder {
3511                 sm,
3512                 ip: encoded.len(),
3513                 labels: &labels,
3514                 inst: [0_u32; 4],
3515             };
3516             as_sm70_op(&instr.op).encode(&mut e);
3517             e.set_pred(&instr.pred);
3518             e.set_instr_deps(&instr.deps);
3519             encoded.extend_from_slice(&e.inst[..]);
3520         }
3521     }
3522     encoded
3523 }
3524