//===- IntrinsicsAMDGPU.td - Defines AMDGPU intrinsics -----*- tablegen -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines all of the R600-specific intrinsics.
//
//===----------------------------------------------------------------------===//

def global_ptr_ty : LLVMQualPointerType<1>;

class AMDGPUReadPreloadRegisterIntrinsic
  : DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable]>;

class AMDGPUReadPreloadRegisterIntrinsicNamed<string name>
  : DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable]>,
    ClangBuiltin<name>;

// Used to tag image and resource intrinsics with information used to generate
// mem operands.
class AMDGPURsrcIntrinsic<int rsrcarg, bit isimage = false> {
  int RsrcArg = rsrcarg;
  bit IsImage = isimage;
}

let TargetPrefix = "r600" in {

multiclass AMDGPUReadPreloadRegisterIntrinsic_xyz {
  def _x : AMDGPUReadPreloadRegisterIntrinsic;
  def _y : AMDGPUReadPreloadRegisterIntrinsic;
  def _z : AMDGPUReadPreloadRegisterIntrinsic;
}

multiclass AMDGPUReadPreloadRegisterIntrinsic_xyz_named<string prefix> {
  def _x : AMDGPUReadPreloadRegisterIntrinsicNamed<!strconcat(prefix, "_x")>;
  def _y : AMDGPUReadPreloadRegisterIntrinsicNamed<!strconcat(prefix, "_y")>;
  def _z : AMDGPUReadPreloadRegisterIntrinsicNamed<!strconcat(prefix, "_z")>;
}

defm int_r600_read_global_size : AMDGPUReadPreloadRegisterIntrinsic_xyz_named
                                 <"__builtin_r600_read_global_size">;
defm int_r600_read_ngroups : AMDGPUReadPreloadRegisterIntrinsic_xyz_named
                             <"__builtin_r600_read_ngroups">;
defm int_r600_read_tgid : AMDGPUReadPreloadRegisterIntrinsic_xyz_named
                          <"__builtin_r600_read_tgid">;

defm int_r600_read_local_size : AMDGPUReadPreloadRegisterIntrinsic_xyz;
defm int_r600_read_tidig : AMDGPUReadPreloadRegisterIntrinsic_xyz;

def int_r600_group_barrier : ClangBuiltin<"__builtin_r600_group_barrier">,
  Intrinsic<[], [], [IntrConvergent, IntrWillReturn]>;

// AS 7 is PARAM_I_ADDRESS, used for kernel arguments
def int_r600_implicitarg_ptr :
  ClangBuiltin<"__builtin_r600_implicitarg_ptr">,
  DefaultAttrsIntrinsic<[LLVMQualPointerType<7>], [],
  [IntrNoMem, IntrSpeculatable]>;

def int_r600_rat_store_typed :
  // 1st parameter: Data
  // 2nd parameter: Index
  // 3rd parameter: Constant RAT ID
  DefaultAttrsIntrinsic<[], [llvm_v4i32_ty, llvm_v4i32_ty, llvm_i32_ty], []>,
  ClangBuiltin<"__builtin_r600_rat_store_typed">;

def int_r600_recipsqrt_ieee :  DefaultAttrsIntrinsic<
  [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]
>;

def int_r600_recipsqrt_clamped : DefaultAttrsIntrinsic<
  [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]
>;

def int_r600_cube : DefaultAttrsIntrinsic<
  [llvm_v4f32_ty], [llvm_v4f32_ty], [IntrNoMem, IntrSpeculatable]
>;

def int_r600_store_stream_output : DefaultAttrsIntrinsic<
  [], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []
>;

class TextureIntrinsicFloatInput : DefaultAttrsIntrinsic<[llvm_v4f32_ty], [
  llvm_v4f32_ty, // Coord
  llvm_i32_ty,   // offset_x
  llvm_i32_ty,   // offset_y,
  llvm_i32_ty,   // offset_z,
  llvm_i32_ty,   // resource_id
  llvm_i32_ty,   // samplerid
  llvm_i32_ty,   // coord_type_x
  llvm_i32_ty,   // coord_type_y
  llvm_i32_ty,   // coord_type_z
  llvm_i32_ty],  // coord_type_w
  [IntrNoMem]
>;

class TextureIntrinsicInt32Input : DefaultAttrsIntrinsic<[llvm_v4i32_ty], [
  llvm_v4i32_ty, // Coord
  llvm_i32_ty,   // offset_x
  llvm_i32_ty,   // offset_y,
  llvm_i32_ty,   // offset_z,
  llvm_i32_ty,   // resource_id
  llvm_i32_ty,   // samplerid
  llvm_i32_ty,   // coord_type_x
  llvm_i32_ty,   // coord_type_y
  llvm_i32_ty,   // coord_type_z
  llvm_i32_ty],  // coord_type_w
  [IntrNoMem]
>;

def int_r600_store_swizzle :
  Intrinsic<[], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty],
            [IntrWillReturn, IntrNoCallback, IntrNoFree]
>;

def int_r600_tex : TextureIntrinsicFloatInput;
def int_r600_texc : TextureIntrinsicFloatInput;
def int_r600_txl : TextureIntrinsicFloatInput;
def int_r600_txlc : TextureIntrinsicFloatInput;
def int_r600_txb : TextureIntrinsicFloatInput;
def int_r600_txbc : TextureIntrinsicFloatInput;
def int_r600_txf : TextureIntrinsicInt32Input;
def int_r600_txq : TextureIntrinsicInt32Input;
def int_r600_ddx : TextureIntrinsicFloatInput;
def int_r600_ddy : TextureIntrinsicFloatInput;

def int_r600_dot4 : DefaultAttrsIntrinsic<[llvm_float_ty],
  [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem, IntrSpeculatable]
>;

def int_r600_kill : DefaultAttrsIntrinsic<[], [llvm_float_ty], []>;

} // End TargetPrefix = "r600"

let TargetPrefix = "amdgcn" in {

//===----------------------------------------------------------------------===//
// ABI Special Intrinsics
//===----------------------------------------------------------------------===//

defm int_amdgcn_workitem_id : AMDGPUReadPreloadRegisterIntrinsic_xyz;
defm int_amdgcn_workgroup_id : AMDGPUReadPreloadRegisterIntrinsic_xyz_named
                               <"__builtin_amdgcn_workgroup_id">;

def int_amdgcn_dispatch_ptr :
  DefaultAttrsIntrinsic<[LLVMQualPointerType<4>], [],
  [Align<RetIndex, 4>, IntrNoMem, IntrSpeculatable]>;

def int_amdgcn_queue_ptr :
  ClangBuiltin<"__builtin_amdgcn_queue_ptr">,
  DefaultAttrsIntrinsic<[LLVMQualPointerType<4>], [],
  [Align<RetIndex, 4>, IntrNoMem, IntrSpeculatable]>;

def int_amdgcn_kernarg_segment_ptr :
  ClangBuiltin<"__builtin_amdgcn_kernarg_segment_ptr">,
  DefaultAttrsIntrinsic<[LLVMQualPointerType<4>], [],
  [Align<RetIndex, 4>, IntrNoMem, IntrSpeculatable]>;

def int_amdgcn_implicitarg_ptr :
  ClangBuiltin<"__builtin_amdgcn_implicitarg_ptr">,
  DefaultAttrsIntrinsic<[LLVMQualPointerType<4>], [],
  [Align<RetIndex, 4>, IntrNoMem, IntrSpeculatable]>;

def int_amdgcn_groupstaticsize :
  ClangBuiltin<"__builtin_amdgcn_groupstaticsize">,
  DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable]>;

def int_amdgcn_dispatch_id :
  ClangBuiltin<"__builtin_amdgcn_dispatch_id">,
  DefaultAttrsIntrinsic<[llvm_i64_ty], [], [IntrNoMem, IntrSpeculatable]>;

// For internal use. Coordinates LDS lowering between IR transform and backend.
def int_amdgcn_lds_kernel_id :
  DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable]>;

def int_amdgcn_implicit_buffer_ptr :
  ClangBuiltin<"__builtin_amdgcn_implicit_buffer_ptr">,
  DefaultAttrsIntrinsic<[LLVMQualPointerType<4>], [],
  [Align<RetIndex, 4>, IntrNoMem, IntrSpeculatable]>;

// Set EXEC to the 64-bit value given.
// This is always moved to the beginning of the basic block.
// FIXME: Should be mangled for wave size.
def int_amdgcn_init_exec : Intrinsic<[],
  [llvm_i64_ty],      // 64-bit literal constant
  [IntrConvergent, IntrNoMem, IntrHasSideEffects, IntrNoCallback,
   IntrNoFree, IntrWillReturn, ImmArg<ArgIndex<0>>]>;

// Set EXEC according to a thread count packed in an SGPR input:
//    thread_count = (input >> bitoffset) & 0x7f;
// This is always moved to the beginning of the basic block.
// Note: only inreg arguments to the parent function are valid as
// inputs to this intrinsic, computed values cannot be used.
def int_amdgcn_init_exec_from_input : Intrinsic<[],
  [llvm_i32_ty,       // 32-bit SGPR input
   llvm_i32_ty],      // bit offset of the thread count
  [IntrConvergent, IntrHasSideEffects, IntrNoMem, IntrNoCallback,
   IntrNoFree, IntrWillReturn, ImmArg<ArgIndex<1>>]>;

def int_amdgcn_wavefrontsize :
  ClangBuiltin<"__builtin_amdgcn_wavefrontsize">,
  DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable]>;

// Represent a relocation constant.
def int_amdgcn_reloc_constant : DefaultAttrsIntrinsic<
  [llvm_i32_ty], [llvm_metadata_ty],
  [IntrNoMem, IntrSpeculatable]
>;

//===----------------------------------------------------------------------===//
// Instruction Intrinsics
//===----------------------------------------------------------------------===//

// The first parameter is s_sendmsg immediate (i16),
// the second one is copied to m0
def int_amdgcn_s_sendmsg : ClangBuiltin<"__builtin_amdgcn_s_sendmsg">,
  Intrinsic <[], [llvm_i32_ty, llvm_i32_ty],
  [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects]>;
def int_amdgcn_s_sendmsghalt : ClangBuiltin<"__builtin_amdgcn_s_sendmsghalt">,
  Intrinsic <[], [llvm_i32_ty, llvm_i32_ty],
  [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects]>;


// gfx11 intrinsic
// The first parameter is s_sendmsg immediate (i16). Return type is i32 or i64.
def int_amdgcn_s_sendmsg_rtn : Intrinsic <[llvm_anyint_ty], [llvm_i32_ty],
  [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects]>;

def int_amdgcn_s_barrier : ClangBuiltin<"__builtin_amdgcn_s_barrier">,
  Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrConvergent,
                     IntrWillReturn, IntrNoCallback, IntrNoFree]>;

def int_amdgcn_s_barrier_signal : ClangBuiltin<"__builtin_amdgcn_s_barrier_signal">,
  Intrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn,
                                IntrNoCallback, IntrNoFree]>;

def int_amdgcn_s_barrier_signal_var : ClangBuiltin<"__builtin_amdgcn_s_barrier_signal_var">,
  Intrinsic<[], [llvm_i32_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn,
                                IntrNoCallback, IntrNoFree]>;

def int_amdgcn_s_barrier_signal_isfirst : ClangBuiltin<"__builtin_amdgcn_s_barrier_signal_isfirst">,
  Intrinsic<[llvm_i1_ty], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects, IntrConvergent,
                                          IntrWillReturn, IntrNoCallback, IntrNoFree]>;

def int_amdgcn_s_barrier_signal_isfirst_var : ClangBuiltin<"__builtin_amdgcn_s_barrier_signal_isfirst_var">,
  Intrinsic<[llvm_i1_ty], [llvm_i32_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn,
                                          IntrNoCallback, IntrNoFree]>;

def int_amdgcn_s_barrier_init : ClangBuiltin<"__builtin_amdgcn_s_barrier_init">,
  Intrinsic<[], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent,
                                             IntrWillReturn, IntrNoCallback, IntrNoFree]>;

def int_amdgcn_s_barrier_join : ClangBuiltin<"__builtin_amdgcn_s_barrier_join">,
  Intrinsic<[], [llvm_i32_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn,
                                IntrNoCallback, IntrNoFree]>;

def int_amdgcn_s_wakeup_barrier : ClangBuiltin<"__builtin_amdgcn_s_wakeup_barrier">,
  Intrinsic<[], [llvm_i32_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn,
                                IntrNoCallback, IntrNoFree]>;

def int_amdgcn_s_barrier_wait : ClangBuiltin<"__builtin_amdgcn_s_barrier_wait">,
  Intrinsic<[], [llvm_i16_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects, IntrConvergent,
                                IntrWillReturn, IntrNoCallback, IntrNoFree]>;

def int_amdgcn_s_barrier_leave : ClangBuiltin<"__builtin_amdgcn_s_barrier_leave">,
  Intrinsic<[llvm_i1_ty], [], [IntrNoMem, IntrHasSideEffects, IntrConvergent,
                               IntrWillReturn, IntrNoCallback, IntrNoFree]>;

def int_amdgcn_s_get_barrier_state : ClangBuiltin<"__builtin_amdgcn_s_get_barrier_state">,
  Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn,
                                           IntrNoCallback, IntrNoFree]>;

def int_amdgcn_wave_barrier : ClangBuiltin<"__builtin_amdgcn_wave_barrier">,
  Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrConvergent,
                     IntrWillReturn, IntrNoCallback, IntrNoFree]>;

// The 1st parameter is a mask for the types of instructions that may be allowed
// to cross the SCHED_BARRIER during scheduling.
//     MASK = 0x0000 0000: No instructions may be scheduled across SCHED_BARRIER.
//     MASK = 0x0000 0001: ALL, non-memory, non-side-effect producing instructions may be
//                         scheduled across SCHED_BARRIER, i.e. allow ALU instructions to pass.
//     MASK = 0x0000 0002: VALU instructions may be scheduled across SCHED_BARRIER.
//     MASK = 0x0000 0004: SALU instructions may be scheduled across SCHED_BARRIER.
//     MASK = 0x0000 0008: MFMA/WMMA instructions may be scheduled across SCHED_BARRIER.
//     MASK = 0x0000 0010: ALL VMEM instructions may be scheduled across SCHED_BARRIER.
//     MASK = 0x0000 0020: VMEM read instructions may be scheduled across SCHED_BARRIER.
//     MASK = 0x0000 0040: VMEM write instructions may be scheduled across SCHED_BARRIER.
//     MASK = 0x0000 0080: ALL DS instructions may be scheduled across SCHED_BARRIER.
//     MASK = 0x0000 0100: ALL DS read instructions may be scheduled across SCHED_BARRIER.
//     MASK = 0x0000 0200: ALL DS write instructions may be scheduled across SCHED_BARRIER.
def int_amdgcn_sched_barrier : ClangBuiltin<"__builtin_amdgcn_sched_barrier">,
  Intrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects, IntrConvergent,
                                IntrWillReturn, IntrNoCallback, IntrNoFree]>;

// The first parameter is a mask that determines the types of instructions that
// you would like to synchronize around and add to a scheduling group. The
// values of the mask are defined above for sched_barrier. These instructions
// will be selected from the bottom up starting from the sched_group_barrier's
// location during instruction scheduling. The second parameter is the number of
// matching instructions that will be associated with this sched_group_barrier.
// The third parameter is an identifier which is used to describe what other
// sched_group_barriers should be synchronized with.
def int_amdgcn_sched_group_barrier : ClangBuiltin<"__builtin_amdgcn_sched_group_barrier">,
  Intrinsic<[], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
            [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>, IntrNoMem, IntrHasSideEffects,
             IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;

// Scheduler optimization hint.
//     MASK = 0: Small gemm opt
def int_amdgcn_iglp_opt : ClangBuiltin<"__builtin_amdgcn_iglp_opt">,
  Intrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects, IntrConvergent,
                                IntrWillReturn, IntrNoCallback, IntrNoFree]>;

def int_amdgcn_s_waitcnt : ClangBuiltin<"__builtin_amdgcn_s_waitcnt">,
  Intrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects,
                                IntrWillReturn, IntrNoCallback, IntrNoFree]>;

// GFX12 intrinsics
class AMDGPUWaitIntrinsic :
  Intrinsic<[], [llvm_i16_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects,
                                IntrWillReturn, IntrNoCallback, IntrNoFree]>;
def int_amdgcn_s_wait_bvhcnt : AMDGPUWaitIntrinsic;
def int_amdgcn_s_wait_dscnt : AMDGPUWaitIntrinsic;
def int_amdgcn_s_wait_expcnt : AMDGPUWaitIntrinsic;
def int_amdgcn_s_wait_kmcnt : AMDGPUWaitIntrinsic;
def int_amdgcn_s_wait_loadcnt : AMDGPUWaitIntrinsic;
def int_amdgcn_s_wait_samplecnt : AMDGPUWaitIntrinsic;
def int_amdgcn_s_wait_storecnt : AMDGPUWaitIntrinsic;

def int_amdgcn_div_scale : DefaultAttrsIntrinsic<
  // 1st parameter: Numerator
  // 2nd parameter: Denominator
  // 3rd parameter: Select quotient. Must equal Numerator or Denominator.
  //                (0 = Denominator, 1 = Numerator).
  [llvm_anyfloat_ty, llvm_i1_ty],
  [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i1_ty],
  [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<2>>]
>;

def int_amdgcn_div_fmas : DefaultAttrsIntrinsic<[llvm_anyfloat_ty],
  [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>, llvm_i1_ty],
  [IntrNoMem, IntrSpeculatable]
>;

def int_amdgcn_div_fixup : DefaultAttrsIntrinsic<[llvm_anyfloat_ty],
  [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>],
  [IntrNoMem, IntrSpeculatable]
>;

// Look Up 2.0 / pi src0 with segment select src1[4:0]
def int_amdgcn_trig_preop : DefaultAttrsIntrinsic<
  [llvm_anyfloat_ty], [LLVMMatchType<0>, llvm_i32_ty],
  [IntrNoMem, IntrSpeculatable]
>;

def int_amdgcn_sin : DefaultAttrsIntrinsic<
  [llvm_anyfloat_ty], [LLVMMatchType<0>],
  [IntrNoMem, IntrSpeculatable]
>;

def int_amdgcn_cos : DefaultAttrsIntrinsic<
  [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]
>;

// v_log_{f16|f32}, performs log2. f32 version does not handle
// denormals. There is no reason to use this for f16 as it does
// support denormals, and the generic log2 intrinsic should be
// preferred.
def int_amdgcn_log : DefaultAttrsIntrinsic<
  [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]
>;

// v_exp_{f16|f32} (int_amdgcn_exp was taken by export
// already). Performs exp2. f32 version does not handle
// denormals. There is no reason to use this for f16 as it does
// support denormals, and the generic exp2 intrinsic should be
// preferred.
def int_amdgcn_exp2 : DefaultAttrsIntrinsic<
  [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]
>;

def int_amdgcn_log_clamp : DefaultAttrsIntrinsic<
  [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]
>;

def int_amdgcn_fmul_legacy : ClangBuiltin<"__builtin_amdgcn_fmul_legacy">,
  DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty],
  [IntrNoMem, IntrSpeculatable, Commutative]
>;

// Fused single-precision multiply-add with legacy behaviour for the multiply,
// which is that +/- 0.0 * anything (even NaN or infinity) is +0.0. This is
// intended for use on subtargets that have the v_fma_legacy_f32 and/or
// v_fmac_legacy_f32 instructions. (Note that v_fma_legacy_f16 is unrelated and
// has a completely different kind of legacy behaviour.)
def int_amdgcn_fma_legacy :
  DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty],
  [IntrNoMem, IntrSpeculatable, Commutative]
>;

def int_amdgcn_rcp : DefaultAttrsIntrinsic<
  [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]
>;

def int_amdgcn_rcp_legacy : ClangBuiltin<"__builtin_amdgcn_rcp_legacy">,
  DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty],
  [IntrNoMem, IntrSpeculatable]
>;

def int_amdgcn_sqrt :  DefaultAttrsIntrinsic<
  [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]
>;

def int_amdgcn_rsq :  DefaultAttrsIntrinsic<
  [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]
>;

def int_amdgcn_rsq_legacy :  ClangBuiltin<"__builtin_amdgcn_rsq_legacy">,
  DefaultAttrsIntrinsic<
  [llvm_float_ty], [llvm_float_ty], [IntrNoMem, IntrSpeculatable]
>;

// out = 1.0 / sqrt(a) result clamped to +/- max_float.
def int_amdgcn_rsq_clamp : DefaultAttrsIntrinsic<
  [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]>;

def int_amdgcn_frexp_mant : DefaultAttrsIntrinsic<
  [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]
>;

def int_amdgcn_frexp_exp : DefaultAttrsIntrinsic<
  [llvm_anyint_ty], [llvm_anyfloat_ty], [IntrNoMem, IntrSpeculatable]
>;

// v_fract is buggy on SI/CI. It mishandles infinities, may return 1.0
// and always uses rtz, so is not suitable for implementing the OpenCL
// fract function. It should be ok on VI.
def int_amdgcn_fract : DefaultAttrsIntrinsic<
  [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]
>;

def int_amdgcn_cvt_pkrtz : ClangBuiltin<"__builtin_amdgcn_cvt_pkrtz">,
  DefaultAttrsIntrinsic<[llvm_v2f16_ty], [llvm_float_ty, llvm_float_ty],
            [IntrNoMem, IntrSpeculatable]
>;

def int_amdgcn_cvt_pknorm_i16 :
  ClangBuiltin<"__builtin_amdgcn_cvt_pknorm_i16">,
  DefaultAttrsIntrinsic<[llvm_v2i16_ty], [llvm_float_ty, llvm_float_ty],
            [IntrNoMem, IntrSpeculatable]
>;

def int_amdgcn_cvt_pknorm_u16 :
  ClangBuiltin<"__builtin_amdgcn_cvt_pknorm_u16">,
  DefaultAttrsIntrinsic<[llvm_v2i16_ty], [llvm_float_ty, llvm_float_ty],
            [IntrNoMem, IntrSpeculatable]
>;

def int_amdgcn_cvt_pk_i16 :
    ClangBuiltin<"__builtin_amdgcn_cvt_pk_i16">,
    DefaultAttrsIntrinsic<
  [llvm_v2i16_ty], [llvm_i32_ty, llvm_i32_ty],
  [IntrNoMem, IntrSpeculatable]
>;

def int_amdgcn_cvt_pk_u16 : ClangBuiltin<"__builtin_amdgcn_cvt_pk_u16">,
  DefaultAttrsIntrinsic<[llvm_v2i16_ty], [llvm_i32_ty, llvm_i32_ty],
    [IntrNoMem, IntrSpeculatable]
>;

def int_amdgcn_class : DefaultAttrsIntrinsic<
  [llvm_i1_ty], [llvm_anyfloat_ty, llvm_i32_ty],
  [IntrNoMem, IntrSpeculatable]
>;

def int_amdgcn_fmed3 :
  DefaultAttrsIntrinsic<[llvm_anyfloat_ty],
    [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>],
    [IntrNoMem, IntrSpeculatable]
>;

def int_amdgcn_cubeid : ClangBuiltin<"__builtin_amdgcn_cubeid">,
  DefaultAttrsIntrinsic<[llvm_float_ty],
    [llvm_float_ty, llvm_float_ty, llvm_float_ty],
    [IntrNoMem, IntrSpeculatable]
>;

def int_amdgcn_cubema : ClangBuiltin<"__builtin_amdgcn_cubema">,
  DefaultAttrsIntrinsic<[llvm_float_ty],
  [llvm_float_ty, llvm_float_ty, llvm_float_ty],
  [IntrNoMem, IntrSpeculatable]
>;

def int_amdgcn_cubesc : ClangBuiltin<"__builtin_amdgcn_cubesc">,
  DefaultAttrsIntrinsic<[llvm_float_ty],
    [llvm_float_ty, llvm_float_ty, llvm_float_ty],
    [IntrNoMem, IntrSpeculatable]
>;

def int_amdgcn_cubetc : ClangBuiltin<"__builtin_amdgcn_cubetc">,
  DefaultAttrsIntrinsic<[llvm_float_ty],
    [llvm_float_ty, llvm_float_ty, llvm_float_ty],
    [IntrNoMem, IntrSpeculatable]
>;

// v_ffbh_i32, as opposed to v_ffbh_u32. For v_ffbh_u32, llvm.ctlz
// should be used.
def int_amdgcn_sffbh :
  DefaultAttrsIntrinsic<[llvm_anyint_ty], [LLVMMatchType<0>],
  [IntrNoMem, IntrSpeculatable]
>;

// v_mad_f32|f16/v_mac_f32|f16, selected regardless of denorm support.
def int_amdgcn_fmad_ftz :
  DefaultAttrsIntrinsic<[llvm_anyfloat_ty],
            [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>],
            [IntrNoMem, IntrSpeculatable]
>;

class AMDGPULDSIntrin :
  Intrinsic<[llvm_any_ty],
    [LLVMQualPointerType<3>,
    LLVMMatchType<0>,
    llvm_i32_ty, // ordering
    llvm_i32_ty, // scope
    llvm_i1_ty], // isVolatile
    [IntrArgMemOnly, IntrWillReturn, NoCapture<ArgIndex<0>>,
     ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>,
     IntrNoCallback, IntrNoFree]
>;

// FIXME: The m0 argument should be moved after the normal arguments
class AMDGPUDSOrderedIntrinsic : Intrinsic<
  [llvm_i32_ty],
  // M0 = {hi16:address, lo16:waveID}. Allow passing M0 as a pointer, so that
  // the bit packing can be optimized at the IR level.
  [LLVMQualPointerType<2>, // IntToPtr(M0)
   llvm_i32_ty, // value to add or swap
   llvm_i32_ty, // ordering
   llvm_i32_ty, // scope
   llvm_i1_ty,  // isVolatile
   llvm_i32_ty, // ordered count index (OA index), also added to the address
                // gfx10: bits 24-27 indicate the number of active threads/dwords
   llvm_i1_ty,  // wave release, usually set to 1
   llvm_i1_ty], // wave done, set to 1 for the last ordered instruction
  [IntrWillReturn, NoCapture<ArgIndex<0>>,
   ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>,
   ImmArg<ArgIndex<5>>, ImmArg<ArgIndex<6>>, ImmArg<ArgIndex<7>>,
   IntrNoCallback, IntrNoFree
  ]
>;

class AMDGPUDSAppendConsumedIntrinsic : Intrinsic<
  [llvm_i32_ty],
  [llvm_anyptr_ty, // LDS or GDS ptr
   llvm_i1_ty], // isVolatile
   [IntrConvergent, IntrWillReturn, IntrArgMemOnly,
    NoCapture<ArgIndex<0>>, ImmArg<ArgIndex<1>>, IntrNoCallback, IntrNoFree],
   "",
   [SDNPMemOperand]
>;

def int_amdgcn_ds_ordered_add : AMDGPUDSOrderedIntrinsic;
def int_amdgcn_ds_ordered_swap : AMDGPUDSOrderedIntrinsic;

// The pointer argument is assumed to be dynamically uniform if a VGPR.
def int_amdgcn_ds_append : AMDGPUDSAppendConsumedIntrinsic;
def int_amdgcn_ds_consume : AMDGPUDSAppendConsumedIntrinsic;

def int_amdgcn_ds_fadd : AMDGPULDSIntrin;
def int_amdgcn_ds_fmin : AMDGPULDSIntrin;
def int_amdgcn_ds_fmax : AMDGPULDSIntrin;

} // TargetPrefix = "amdgcn"

// New-style image intrinsics

//////////////////////////////////////////////////////////////////////////
// Dimension-aware image intrinsics framework
//////////////////////////////////////////////////////////////////////////

// Helper class to represent (type, name) combinations of arguments. The
// argument names are explanatory and used as DAG operand names for codegen
// pattern matching.
class AMDGPUArg<LLVMType ty, string name> {
  LLVMType Type = ty;
  string Name = name;
}

// Return [AMDGPUArg<basety, names[0]>, AMDGPUArg<LLVMMatchType<0>, names[1]>, ...]
class makeArgList<list<string> names, LLVMType basety> {
  list<AMDGPUArg> ret =
    !listconcat([AMDGPUArg<basety, names[0]>],
                !foreach(name, !tail(names), AMDGPUArg<LLVMMatchType<0>, name>));
}

// Return arglist, with LLVMMatchType's references shifted by 'shift'.
class arglistmatchshift<list<AMDGPUArg> arglist, int shift> {
  list<AMDGPUArg> ret =
    !foreach(arg, arglist,
             !if(!isa<LLVMMatchType>(arg.Type),
                 AMDGPUArg<LLVMMatchType<!add(!cast<LLVMMatchType>(arg.Type).Number, shift)>,
                           arg.Name>,
                 arg));
}

// Return the concatenation of the given arglists. LLVMMatchType's are adjusted
// accordingly, and shifted by an additional 'shift'.
class arglistconcat<list<list<AMDGPUArg>> arglists, int shift = 0> {
  list<AMDGPUArg> ret =
    !foldl([]<AMDGPUArg>, arglists, lhs, rhs,
           !listconcat(
             lhs,
             arglistmatchshift<rhs,
               !add(shift, !foldl(0, lhs, a, b,
                                  !add(a, b.Type.isAny)))>.ret));
}

// Represent texture/image types / dimensionality.
class AMDGPUDimProps<bits<3> enc, string name, string asmsuffix,
                     list<string> coord_names, list<string> slice_names,
                     bit msaa = 0> {
  AMDGPUDimProps Dim = !cast<AMDGPUDimProps>(NAME);
  string Name = name; // e.g. "2darraymsaa"
  string AsmSuffix = asmsuffix; // e.g. 2D_MSAA_ARRAY (used in assembly strings)
  bits<3> Encoding = enc;
  bit DA = 0; // DA bit in MIMG encoding
  bit MSAA = msaa;

  list<AMDGPUArg> CoordSliceArgs =
    makeArgList<!listconcat(coord_names, slice_names), llvm_anyfloat_ty>.ret;
  list<AMDGPUArg> CoordSliceIntArgs =
    makeArgList<!listconcat(coord_names, slice_names), llvm_anyint_ty>.ret;
  list<AMDGPUArg> GradientArgs =
    makeArgList<!listconcat(!foreach(name, coord_names, "d" # name # "dh"),
                            !foreach(name, coord_names, "d" # name # "dv")),
                llvm_anyfloat_ty>.ret;

  bits<8> NumCoords = !size(CoordSliceArgs);
  bits<8> NumGradients = !size(GradientArgs);
}

def AMDGPUDim1D : AMDGPUDimProps<0x0, "1d", "1D", ["s"], []>;
def AMDGPUDim2D : AMDGPUDimProps<0x1, "2d", "2D", ["s", "t"], []>;
def AMDGPUDim3D : AMDGPUDimProps<0x2, "3d", "3D", ["s", "t", "r"], []>;
let DA = 1 in {
  def AMDGPUDimCube : AMDGPUDimProps<0x3, "cube", "CUBE", ["s", "t"], ["face"]>;
  def AMDGPUDim1DArray : AMDGPUDimProps<0x4, "1darray", "1D_ARRAY", ["s"], ["slice"]>;
  def AMDGPUDim2DArray : AMDGPUDimProps<0x5, "2darray", "2D_ARRAY", ["s", "t"], ["slice"]>;
}
def AMDGPUDim2DMsaa : AMDGPUDimProps<0x6, "2dmsaa", "2D_MSAA", ["s", "t"], ["fragid"], 1>;
let DA = 1 in {
  def AMDGPUDim2DArrayMsaa : AMDGPUDimProps<0x7, "2darraymsaa", "2D_MSAA_ARRAY", ["s", "t"], ["slice", "fragid"], 1>;
}

def AMDGPUDims {
  list<AMDGPUDimProps> NoMsaa = [AMDGPUDim1D, AMDGPUDim2D, AMDGPUDim3D,
                                 AMDGPUDimCube, AMDGPUDim1DArray,
                                 AMDGPUDim2DArray];
  list<AMDGPUDimProps> Msaa = [AMDGPUDim2DMsaa, AMDGPUDim2DArrayMsaa];
  list<AMDGPUDimProps> All = !listconcat(NoMsaa, Msaa);
}

// Represent sample variants, i.e. _C, _O, _B, ... and combinations thereof.
class AMDGPUSampleVariant<string ucmod, string lcmod, list<AMDGPUArg> extra_addr> {
  string UpperCaseMod = ucmod;
  string LowerCaseMod = lcmod;

  // {offset} {bias} {z-compare}
  list<AMDGPUArg> ExtraAddrArgs = extra_addr;
  bit Offset = false;
  bit Bias = false;
  bit ZCompare = false;

  bit Gradients = false;

  // Name of the {lod} or {clamp} argument that is appended to the coordinates,
  // if any.
  string LodOrClamp = "";
}

// AMDGPUSampleVariants: all variants supported by IMAGE_SAMPLE
// AMDGPUSampleVariantsNoGradients: variants supported by IMAGE_GATHER4
defset list<AMDGPUSampleVariant> AMDGPUSampleVariants = {
  multiclass AMDGPUSampleHelper_Offset<string ucmod, string lcmod,
                                       list<AMDGPUArg> extra_addr> {
    def NAME#lcmod : AMDGPUSampleVariant<ucmod, lcmod, extra_addr>;
    let Offset = true in
    def NAME#lcmod#_o : AMDGPUSampleVariant<
        ucmod#"_O", lcmod#"_o",
        !listconcat([AMDGPUArg<llvm_i32_ty, "offset">], extra_addr)>;
  }

  multiclass AMDGPUSampleHelper_Compare<string ucmod, string lcmod,
                                        list<AMDGPUArg> extra_addr> {
    defm NAME : AMDGPUSampleHelper_Offset<ucmod, lcmod, extra_addr>;
    let ZCompare = true in
    defm NAME : AMDGPUSampleHelper_Offset<
        "_C"#ucmod, "_c"#lcmod,
        !listconcat(extra_addr, [AMDGPUArg<llvm_float_ty, "zcompare">])>;
  }

  multiclass AMDGPUSampleHelper_Clamp<string ucmod, string lcmod,
                                      list<AMDGPUArg> extra_addr> {
    defm NAME : AMDGPUSampleHelper_Compare<ucmod, lcmod, extra_addr>;
    let LodOrClamp = "clamp" in
    defm NAME : AMDGPUSampleHelper_Compare<ucmod#"_CL", lcmod#"_cl", extra_addr>;
  }

  defset list<AMDGPUSampleVariant> AMDGPUSampleVariantsNoGradients = {
    defm AMDGPUSample : AMDGPUSampleHelper_Clamp<"", "", []>;
    let Bias = true in
    defm AMDGPUSample : AMDGPUSampleHelper_Clamp<
        "_B", "_b", [AMDGPUArg<llvm_anyfloat_ty, "bias">]>;
    let LodOrClamp = "lod" in
    defm AMDGPUSample : AMDGPUSampleHelper_Compare<"_L", "_l", []>;
    defm AMDGPUSample : AMDGPUSampleHelper_Compare<"_LZ", "_lz", []>;
  }

  let Gradients = true in {
    defm AMDGPUSample : AMDGPUSampleHelper_Clamp<"_D", "_d", []>;
    defm AMDGPUSample : AMDGPUSampleHelper_Clamp<"_CD", "_cd", []>;
  }
}

// Helper class to capture the profile of a dimension-aware image intrinsic.
// This information is used to generate the intrinsic's type and to inform
// codegen pattern matching.
class AMDGPUDimProfile<string opmod,
                       AMDGPUDimProps dim> {
  AMDGPUDimProps Dim = dim;
  string OpMod = opmod; // the corresponding instruction is named IMAGE_OpMod

  // These are intended to be overwritten by subclasses
  bit IsSample = false;
  bit IsAtomic = false;
  list<LLVMType> RetTypes = [];
  list<AMDGPUArg> DataArgs = [];
  list<AMDGPUArg> ExtraAddrArgs = [];
  bit Offset = false;
  bit Bias = false;
  bit ZCompare = false;
  bit Gradients = false;
  string LodClampMip = "";

  int NumRetAndDataAnyTypes =
    !foldl(0, !listconcat(RetTypes, !foreach(arg, DataArgs, arg.Type)), a, b,
           !add(a, b.isAny));

  list<AMDGPUArg> AddrArgs =
    arglistconcat<[ExtraAddrArgs,
                   !if(Gradients, dim.GradientArgs, []),
                   !listconcat(!if(IsSample, dim.CoordSliceArgs, dim.CoordSliceIntArgs),
                               !if(!empty(LodClampMip),
                                   []<AMDGPUArg>,
                                   [AMDGPUArg<LLVMMatchType<0>, LodClampMip>]))],
                  NumRetAndDataAnyTypes>.ret;
  list<LLVMType> AddrTypes = !foreach(arg, AddrArgs, arg.Type);
  list<AMDGPUArg> AddrDefaultArgs =
    !foreach(arg, AddrArgs,
             AMDGPUArg<!if(!or(arg.Type.isAny, !isa<LLVMMatchType>(arg.Type)),
                           !if(IsSample, llvm_float_ty, llvm_i32_ty), arg.Type),
                       arg.Name>);
  list<AMDGPUArg> AddrA16Args =
    !foreach(arg, AddrArgs,
             AMDGPUArg<!if(!or(arg.Type.isAny, !isa<LLVMMatchType>(arg.Type)),
                           !if(IsSample, llvm_half_ty, llvm_i16_ty), arg.Type),
                       arg.Name>);
}

class AMDGPUDimProfileCopy<AMDGPUDimProfile base> : AMDGPUDimProfile<base.OpMod, base.Dim> {
  let IsSample = base.IsSample;
  let IsAtomic = base.IsAtomic;
  let RetTypes = base.RetTypes;
  let DataArgs = base.DataArgs;
  let ExtraAddrArgs = base.ExtraAddrArgs;
  let Offset = base.Offset;
  let Bias = base.Bias;
  let ZCompare = base.ZCompare;
  let Gradients = base.Gradients;
  let LodClampMip = base.LodClampMip;
}

class AMDGPUDimSampleProfile<string opmod,
                             AMDGPUDimProps dim,
                             AMDGPUSampleVariant sample> : AMDGPUDimProfile<opmod, dim> {
  let IsSample = true;
  let RetTypes = [llvm_any_ty];
  let ExtraAddrArgs = sample.ExtraAddrArgs;
  let Offset = sample.Offset;
  let Bias = sample.Bias;
  let ZCompare = sample.ZCompare;
  let Gradients = sample.Gradients;
  let LodClampMip = sample.LodOrClamp;
}

class AMDGPUDimNoSampleProfile<string opmod,
                               AMDGPUDimProps dim,
                               list<LLVMType> retty,
                               list<AMDGPUArg> dataargs,
                               bit Mip = false> : AMDGPUDimProfile<opmod, dim> {
  let RetTypes = retty;
  let DataArgs = dataargs;
  let LodClampMip = !if(Mip, "mip", "");
}

class AMDGPUDimAtomicProfile<string opmod,
                             AMDGPUDimProps dim,
                             list<AMDGPUArg> dataargs> : AMDGPUDimProfile<opmod, dim> {
  let RetTypes = [llvm_anyint_ty];
  let DataArgs = dataargs;
  let IsAtomic = true;
}

class AMDGPUDimAtomicFloatProfile<string opmod, AMDGPUDimProps dim,
                                  list<AMDGPUArg> dataargs>
    : AMDGPUDimAtomicProfile<opmod, dim, dataargs> {
  let RetTypes = [llvm_anyfloat_ty];
}

class AMDGPUDimGetResInfoProfile<AMDGPUDimProps dim>
    : AMDGPUDimProfile<"GET_RESINFO", dim> {
  let RetTypes = [llvm_anyfloat_ty];
  let DataArgs = [];
  let AddrArgs = [AMDGPUArg<llvm_anyint_ty, "mip">];
  let LodClampMip = "mip";
}

// Helper class for figuring out image intrinsic argument indexes.
class AMDGPUImageDimIntrinsicEval<AMDGPUDimProfile P_> {
  // Argument-group sizes, derived from the profile.
  int NumDataArgs = !size(P_.DataArgs);
  int NumDmaskArgs = !not(P_.IsAtomic); // atomics have no dmask operand
  int NumOffsetArgs = !if(P_.Offset, 1, 0);
  int NumBiasArgs = !if(P_.Bias, 1, 0);
  int NumZCompareArgs = !if(P_.ZCompare, 1, 0);
  int NumExtraAddrArgs = !add(NumOffsetArgs, NumBiasArgs, NumZCompareArgs);
  int NumVAddrArgs = !size(P_.AddrArgs);
  int NumGradientArgs = !if(P_.Gradients, !size(P_.Dim.GradientArgs), 0);
  int NumCoordArgs = !if(P_.IsSample, !size(P_.Dim.CoordSliceArgs), !size(P_.Dim.CoordSliceIntArgs));
  int NumRSrcArgs = 1;
  int NumSampArgs = !if(P_.IsSample, 2, 0); // sampler descriptor + unorm flag
  // Zero-based argument indexes, following the operand layout built by
  // AMDGPUImageDimIntrinsic:
  //   [vdata] [dmask] [offset] [bias] [zcompare] [gradients] [coords/slice]
  //   [lod/clamp/mip] [rsrc] [samp, unorm] [texfailctrl] [cachepolicy]
  int DmaskArgIndex = NumDataArgs;
  int VAddrArgIndex = !add(DmaskArgIndex, NumDmaskArgs);
  int OffsetArgIndex = VAddrArgIndex;
  int BiasArgIndex = !add(VAddrArgIndex, NumOffsetArgs);
  int ZCompareArgIndex = !add(BiasArgIndex, NumBiasArgs);
  int GradientArgIndex = !add(VAddrArgIndex, NumExtraAddrArgs);
  int CoordArgIndex = !add(GradientArgIndex, NumGradientArgs);
  // The lod/clamp (or mip) argument, when present, is the last vaddr operand.
  int LodArgIndex = !add(VAddrArgIndex, NumVAddrArgs, -1);
  int MipArgIndex = LodArgIndex;
  int RsrcArgIndex = !add(VAddrArgIndex, NumVAddrArgs);
  int SampArgIndex = !add(RsrcArgIndex, NumRSrcArgs);
  int UnormArgIndex = !add(SampArgIndex, 1);
  int TexFailCtrlArgIndex = !add(SampArgIndex, NumSampArgs);
  int CachePolicyArgIndex = !add(TexFailCtrlArgIndex, 1);
}

// All dimension-aware intrinsics are derived from this class.
class AMDGPUImageDimIntrinsic<AMDGPUDimProfile P_,
                              list<IntrinsicProperty> props,
                              list<SDNodeProperty> sdnodeprops> : Intrinsic<
    P_.RetTypes,        // vdata(VGPR) -- for load/atomic-with-return
    !listconcat(
      !foreach(arg, P_.DataArgs, arg.Type),        // vdata(VGPR) -- for store/atomic
      !if(P_.IsAtomic, [], [llvm_i32_ty]),         // dmask(imm)
      P_.AddrTypes,                                // vaddr(VGPR)
      [llvm_v8i32_ty],                             // rsrc(SGPR)
      !if(P_.IsSample, [llvm_v4i32_ty,             // samp(SGPR)
                        llvm_i1_ty], []),          // unorm(imm)
      [llvm_i32_ty,                                // texfailctrl(imm; bit 0 = tfe, bit 1 = lwe)
       llvm_i32_ty]),                              // cachepolicy(imm; bit 0 = glc, bit 1 = slc, bit 2 = dlc;
                                                   //   gfx12+ imm: bits [0-2] = th, bits [3-4] = scope)

     // Every image intrinsic gets IntrNoCallback/IntrNoFree/IntrWillReturn in
     // addition to the caller-supplied props; immediate operands are marked
     // via the indexes computed by AMDGPUImageDimIntrinsicEval.
     !listconcat(props, [IntrNoCallback, IntrNoFree, IntrWillReturn],
          !if(P_.IsAtomic, [], [ImmArg<ArgIndex<AMDGPUImageDimIntrinsicEval<P_>.DmaskArgIndex>>]),
          !if(P_.IsSample, [ImmArg<ArgIndex<AMDGPUImageDimIntrinsicEval<P_>.UnormArgIndex>>], []),
          [ImmArg<ArgIndex<AMDGPUImageDimIntrinsicEval<P_>.TexFailCtrlArgIndex>>,
           ImmArg<ArgIndex<AMDGPUImageDimIntrinsicEval<P_>.CachePolicyArgIndex>>],
          // Atomics are not marked IntrNoSync.
          !if(P_.IsAtomic, [], [IntrNoSync])),


      "", sdnodeprops>,
  AMDGPURsrcIntrinsic<!add(!size(P_.DataArgs), !size(P_.AddrTypes),
                           !if(P_.IsAtomic, 0, 1)), 1> {
  AMDGPUDimProfile P = P_;

  // Self-reference so generic tablegen code can recover this record from NAME.
  AMDGPUImageDimIntrinsic Intr = !cast<AMDGPUImageDimIntrinsic>(NAME);

  let TargetPrefix = "amdgcn";
}

// Marker class for intrinsics with a DMask that determines the returned
// channels.
class AMDGPUImageDMaskIntrinsic;

defset list<AMDGPUImageDimIntrinsic> AMDGPUImageDimIntrinsics = {

  //////////////////////////////////////////////////////////////////////////
  // Load and store intrinsics
  //////////////////////////////////////////////////////////////////////////
  multiclass AMDGPUImageDimIntrinsicsNoMsaa<string opmod,
                                            list<LLVMType> retty,
                                            list<AMDGPUArg> dataargs,
                                            list<IntrinsicProperty> props,
                                            list<SDNodeProperty> sdnodeprops,
                                            bit Mip = false> {
    foreach dim = AMDGPUDims.NoMsaa in {
      def !strconcat(NAME, "_", dim.Name)
        : AMDGPUImageDimIntrinsic<
            AMDGPUDimNoSampleProfile<opmod, dim, retty, dataargs, Mip>,
            props, sdnodeprops>;
    }
  }

  multiclass AMDGPUImageDimIntrinsicsAll<string opmod,
                                         list<LLVMType> retty,
                                         list<AMDGPUArg> dataargs,
                                         list<IntrinsicProperty> props,
                                         list<SDNodeProperty> sdnodeprops,
                                         bit Mip = false> {
    foreach dim = AMDGPUDims.All in {
      def !strconcat(NAME, "_", dim.Name)
        : AMDGPUImageDimIntrinsic<
            AMDGPUDimNoSampleProfile<opmod, dim, retty, dataargs, Mip>,
            props, sdnodeprops>;
    }
  }

  // Note: AMDGPUImageDimIntrinsic adds IntrWillReturn (along with
  // IntrNoCallback and IntrNoFree) to every image intrinsic, so it is not
  // repeated in the props lists below.
  defm int_amdgcn_image_load
    : AMDGPUImageDimIntrinsicsAll<"LOAD", [llvm_any_ty], [], [IntrReadMem],
                                  [SDNPMemOperand]>,
      AMDGPUImageDMaskIntrinsic;
  defm int_amdgcn_image_load_mip
    : AMDGPUImageDimIntrinsicsNoMsaa<"LOAD_MIP", [llvm_any_ty], [],
                                     [IntrReadMem], [SDNPMemOperand], 1>,
      AMDGPUImageDMaskIntrinsic;

  defm int_amdgcn_image_store : AMDGPUImageDimIntrinsicsAll<
              "STORE", [], [AMDGPUArg<llvm_anyfloat_ty, "vdata">],
              [IntrWriteMem], [SDNPMemOperand]>,
              AMDGPUImageDMaskIntrinsic;
  defm int_amdgcn_image_store_mip : AMDGPUImageDimIntrinsicsNoMsaa<
              "STORE_MIP", [], [AMDGPUArg<llvm_anyfloat_ty, "vdata">],
              [IntrWriteMem], [SDNPMemOperand], 1>,
              AMDGPUImageDMaskIntrinsic;

  //////////////////////////////////////////////////////////////////////////
  // MSAA intrinsics
  //////////////////////////////////////////////////////////////////////////
  foreach dim = AMDGPUDims.Msaa in {
    def int_amdgcn_image_msaa_load_x # _ # dim.Name:
        AMDGPUImageDimIntrinsic<
            AMDGPUDimNoSampleProfile<"MSAA_LOAD_X", dim, [llvm_any_ty], []>,
            [IntrReadMem], [SDNPMemOperand]>;
  }

  foreach dim = AMDGPUDims.Msaa in {
    def int_amdgcn_image_msaa_load # _ # dim.Name:
        AMDGPUImageDimIntrinsic<
            AMDGPUDimNoSampleProfile<"MSAA_LOAD", dim, [llvm_any_ty], []>,
            [IntrReadMem], [SDNPMemOperand]>;
  }

  //////////////////////////////////////////////////////////////////////////
  // sample and getlod intrinsics
  //////////////////////////////////////////////////////////////////////////
  multiclass AMDGPUImageDimSampleDims<string opmod,
                                      AMDGPUSampleVariant sample,
                                      bit NoMem = false> {
    foreach dim = AMDGPUDims.NoMsaa in {
      def !strconcat(NAME, "_", dim.Name) : AMDGPUImageDimIntrinsic<
          AMDGPUDimSampleProfile<opmod, dim, sample>,
          !if(NoMem, [IntrNoMem], [IntrReadMem]),
          !if(NoMem, [], [SDNPMemOperand])>;
    }
  }

  foreach sample = AMDGPUSampleVariants in {
    defm int_amdgcn_image_sample # sample.LowerCaseMod
      : AMDGPUImageDimSampleDims<"SAMPLE" # sample.UpperCaseMod, sample>,
        AMDGPUImageDMaskIntrinsic;
  }

  // getlod reads no memory (it queries the computed LOD only).
  defm int_amdgcn_image_getlod
    : AMDGPUImageDimSampleDims<"GET_LOD", AMDGPUSample, 1>,
      AMDGPUImageDMaskIntrinsic;

  //////////////////////////////////////////////////////////////////////////
  // getresinfo intrinsics
  //////////////////////////////////////////////////////////////////////////
  foreach dim = AMDGPUDims.All in {
    def !strconcat("int_amdgcn_image_getresinfo_", dim.Name)
      : AMDGPUImageDimIntrinsic<AMDGPUDimGetResInfoProfile<dim>, [IntrNoMem], []>,
        AMDGPUImageDMaskIntrinsic;
  }

  //////////////////////////////////////////////////////////////////////////
  // gather4 intrinsics
  //////////////////////////////////////////////////////////////////////////
  foreach sample = AMDGPUSampleVariantsNoGradients in {
    foreach dim = [AMDGPUDim2D, AMDGPUDimCube, AMDGPUDim2DArray] in {
      def int_amdgcn_image_gather4 # sample.LowerCaseMod # _ # dim.Name:
          AMDGPUImageDimIntrinsic<
              AMDGPUDimSampleProfile<"GATHER4" # sample.UpperCaseMod, dim, sample>,
              [IntrReadMem], [SDNPMemOperand]>;
    }
  }
}

//////////////////////////////////////////////////////////////////////////
// atomic intrinsics
//////////////////////////////////////////////////////////////////////////
defset list<AMDGPUImageDimIntrinsic> AMDGPUImageDimAtomicIntrinsics = {
  multiclass AMDGPUImageDimAtomicX<string opmod, list<AMDGPUArg> dataargs,
                                   int isFloat = 0> {
    foreach dim = AMDGPUDims.All in {
      def !strconcat(NAME, "_", dim.Name): AMDGPUImageDimIntrinsic<
          !if (isFloat, AMDGPUDimAtomicFloatProfile<opmod, dim, dataargs>,
               AMDGPUDimAtomicProfile<opmod, dim, dataargs>),
          [], [SDNPMemOperand]>;
    }
  }

  multiclass AMDGPUImageDimAtomic<string opmod, int isFloat = 0> {
    defm ""
        : AMDGPUImageDimAtomicX<opmod, [AMDGPUArg<LLVMMatchType<0>, "vdata">],
                                isFloat>;
  }

  multiclass AMDGPUImageDimFloatAtomic<string opmod> {
    defm "" : AMDGPUImageDimAtomic<opmod, 1 /*isFloat*/>;
  }

  defm int_amdgcn_image_atomic_swap : AMDGPUImageDimAtomic<"ATOMIC_SWAP">;
  defm int_amdgcn_image_atomic_add : AMDGPUImageDimAtomic<"ATOMIC_ADD">;
  defm int_amdgcn_image_atomic_sub : AMDGPUImageDimAtomic<"ATOMIC_SUB">;
  defm int_amdgcn_image_atomic_smin : AMDGPUImageDimAtomic<"ATOMIC_SMIN">;
  defm int_amdgcn_image_atomic_umin : AMDGPUImageDimAtomic<"ATOMIC_UMIN">;
  defm int_amdgcn_image_atomic_fmin : AMDGPUImageDimFloatAtomic<"ATOMIC_FMIN">;
  defm int_amdgcn_image_atomic_smax : AMDGPUImageDimAtomic<"ATOMIC_SMAX">;
  defm int_amdgcn_image_atomic_umax : AMDGPUImageDimAtomic<"ATOMIC_UMAX">;
  defm int_amdgcn_image_atomic_fmax : AMDGPUImageDimFloatAtomic<"ATOMIC_FMAX">;
  defm int_amdgcn_image_atomic_and : AMDGPUImageDimAtomic<"ATOMIC_AND">;
  defm int_amdgcn_image_atomic_or : AMDGPUImageDimAtomic<"ATOMIC_OR">;
  defm int_amdgcn_image_atomic_xor : AMDGPUImageDimAtomic<"ATOMIC_XOR">;
  defm int_amdgcn_image_atomic_inc : AMDGPUImageDimAtomic<"ATOMIC_INC">;
  defm int_amdgcn_image_atomic_dec : AMDGPUImageDimAtomic<"ATOMIC_DEC">;
  defm int_amdgcn_image_atomic_add_flt : AMDGPUImageDimFloatAtomic<"ATOMIC_ADD_FLT">;
  defm int_amdgcn_image_atomic_min_flt : AMDGPUImageDimFloatAtomic<"ATOMIC_MIN_FLT">;
  defm int_amdgcn_image_atomic_max_flt : AMDGPUImageDimFloatAtomic<"ATOMIC_MAX_FLT">;

  defm int_amdgcn_image_atomic_cmpswap :
      AMDGPUImageDimAtomicX<"ATOMIC_CMPSWAP", [AMDGPUArg<LLVMMatchType<0>, "src">,
                                               AMDGPUArg<LLVMMatchType<0>, "cmp">]>;

  defm int_amdgcn_image_atomic_pk_add_f16 : AMDGPUImageDimFloatAtomic<"ATOMIC_PK_ADD_F16">;
  defm int_amdgcn_image_atomic_pk_add_bf16 : AMDGPUImageDimFloatAtomic<"ATOMIC_PK_ADD_BF16">;
}

//////////////////////////////////////////////////////////////////////////
// Buffer intrinsics
//////////////////////////////////////////////////////////////////////////

// Data type for buffer resources (V#). Maybe, in the future, we can create a
// similar one for textures (T#).
def AMDGPUBufferRsrcTy : LLVMQualPointerType<8>;

let TargetPrefix = "amdgcn" in {

def int_amdgcn_make_buffer_rsrc : DefaultAttrsIntrinsic <
  [AMDGPUBufferRsrcTy],
  [llvm_anyptr_ty, // base
   llvm_i16_ty,    // stride (and swizzle control)
   llvm_i32_ty,    // NumRecords / extent
   llvm_i32_ty],   // flags
  // Attributes lifted from ptrmask + some extra argument attributes.
  [IntrNoMem, ReadNone<ArgIndex<0>>,
   IntrSpeculatable, IntrWillReturn]>;

defset list<AMDGPURsrcIntrinsic> AMDGPUBufferIntrinsics = {

class AMDGPUBufferLoad<LLVMType data_ty = llvm_any_ty> : DefaultAttrsIntrinsic <
  [data_ty],
  [llvm_v4i32_ty,     // rsrc(SGPR)
   llvm_i32_ty,       // vindex(VGPR)
   llvm_i32_ty,       // offset(SGPR/VGPR/imm)
   llvm_i1_ty,        // glc(imm)
   llvm_i1_ty],       // slc(imm)
  [IntrReadMem, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<0>;
def int_amdgcn_buffer_load_format : AMDGPUBufferLoad<llvm_anyfloat_ty>;
def int_amdgcn_buffer_load : AMDGPUBufferLoad;

// Generate a buffer_load instruction that may be optimized to s_buffer_load if
// the offset argument is uniform.
def int_amdgcn_s_buffer_load : DefaultAttrsIntrinsic <
  [llvm_any_ty],
  [llvm_v4i32_ty,     // rsrc(SGPR)
   llvm_i32_ty,       // byte offset
   llvm_i32_ty],      // cachepolicy(imm; bit 0 = glc, bit 1 = slc, bit 2 = dlc;
                      //   gfx12+ imm: bits [0-2] = th, bits [3-4] = scope)
  // Note: volatile bit is **not** permitted here.
  [IntrNoMem, ImmArg<ArgIndex<2>>]>,
  AMDGPURsrcIntrinsic<0>;

class AMDGPUBufferStore<LLVMType data_ty = llvm_any_ty> : DefaultAttrsIntrinsic <
  [],
  [data_ty,           // vdata(VGPR)
   llvm_v4i32_ty,     // rsrc(SGPR)
   llvm_i32_ty,       // vindex(VGPR)
   llvm_i32_ty,       // offset(SGPR/VGPR/imm)
   llvm_i1_ty,        // glc(imm)
   llvm_i1_ty],       // slc(imm)
  [IntrWriteMem, ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<1>;
def int_amdgcn_buffer_store_format : AMDGPUBufferStore<llvm_anyfloat_ty>;
def int_amdgcn_buffer_store : AMDGPUBufferStore;

// New buffer intrinsics with separate raw and struct variants. The raw
// variant never has an index. The struct variant always has an index, even if
// it is const 0.
// A struct intrinsic with constant 0 index is different to the
// corresponding raw intrinsic on gfx9+ because the behavior of bound checking
// and swizzling changes depending on whether idxen is set in the instruction.
// These new intrinsics also keep the offset and soffset arguments separate as
// they behave differently in bounds checking and swizzling.

// The versions of these intrinsics that take <4 x i32> arguments are deprecated
// in favor of their .ptr.buffer variants that take ptr addrspace(8) arguments,
// which allow for improved reasoning about memory accesses.
//
// Note that in the cachepolicy for all these intrinsics, bit 31 is not preserved
// through to final assembly selection and is used to signal that the buffer
// operation is volatile.
class AMDGPURawBufferLoad<LLVMType data_ty = llvm_any_ty> : DefaultAttrsIntrinsic <
  [data_ty],
  [llvm_v4i32_ty,     // rsrc(SGPR)
   llvm_i32_ty,       // offset(VGPR/imm, included in bounds checking and swizzling)
   llvm_i32_ty,       // soffset(SGPR/imm, excluded from bounds checking and swizzling)
   llvm_i32_ty],      // auxiliary data (imm, cachepolicy: bit 0 = glc, bit 1 = slc,
                      //   bit 2 = dlc on gfx10/gfx11; swizzled buffer: bit 3 = swz;
                      //   gfx12+: bits [0-2] = th, bits [3-4] = scope, bit 6 = swz;
                      //   all: volatile op (bit 31, stripped at lowering))
  [IntrReadMem, ImmArg<ArgIndex<3>>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<0>;
def int_amdgcn_raw_buffer_load_format : AMDGPURawBufferLoad<llvm_anyfloat_ty>;
def int_amdgcn_raw_buffer_load : AMDGPURawBufferLoad;

// ptr addrspace(8) variant of AMDGPURawBufferLoad.
class AMDGPURawPtrBufferLoad<LLVMType data_ty = llvm_any_ty> : DefaultAttrsIntrinsic <
  [data_ty],
  [AMDGPUBufferRsrcTy, // rsrc(SGPR)
   llvm_i32_ty,        // offset(VGPR/imm, included in bounds checking and swizzling)
   llvm_i32_ty,        // soffset(SGPR/imm, excluded from bounds checking and swizzling)
   llvm_i32_ty],       // auxiliary data (imm, cachepolicy: bit 0 = glc, bit 1 = slc,
                       //   bit 2 = dlc on gfx10/gfx11; swizzled buffer: bit 3 = swz;
                       //   gfx12+: bits [0-2] = th, bits [3-4] = scope, bit 6 = swz;
                       //   all: volatile op (bit 31, stripped at lowering))
  [IntrArgMemOnly, IntrReadMem, ReadOnly<ArgIndex<0>>, NoCapture<ArgIndex<0>>,
   ImmArg<ArgIndex<3>>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<0>;
def int_amdgcn_raw_ptr_buffer_load_format : AMDGPURawPtrBufferLoad<llvm_anyfloat_ty>;
def int_amdgcn_raw_ptr_buffer_load : AMDGPURawPtrBufferLoad;

class AMDGPUStructBufferLoad<LLVMType data_ty = llvm_any_ty> : DefaultAttrsIntrinsic <
  [data_ty],
  [llvm_v4i32_ty,     // rsrc(SGPR)
   llvm_i32_ty,       // vindex(VGPR)
   llvm_i32_ty,       // offset(VGPR/imm, included in bounds checking and swizzling)
   llvm_i32_ty,       // soffset(SGPR/imm, excluded from bounds checking and swizzling)
   llvm_i32_ty],      // auxiliary data (imm, cachepolicy: bit 0 = glc, bit 1 = slc,
                      //   bit 2 = dlc on gfx10/gfx11; swizzled buffer: bit 3 = swz;
                      //   gfx12+: bits [0-2] = th, bits [3-4] = scope, bit 6 = swz;
                      //   all: volatile op (bit 31, stripped at lowering))
  [IntrReadMem, ImmArg<ArgIndex<4>>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<0>;
def int_amdgcn_struct_buffer_load_format : AMDGPUStructBufferLoad;
def int_amdgcn_struct_buffer_load : AMDGPUStructBufferLoad;

// ptr addrspace(8) variant of AMDGPUStructBufferLoad.
class AMDGPUStructPtrBufferLoad<LLVMType data_ty = llvm_any_ty> : DefaultAttrsIntrinsic <
  [data_ty],
  [AMDGPUBufferRsrcTy, // rsrc(SGPR)
   llvm_i32_ty,        // vindex(VGPR)
   llvm_i32_ty,        // offset(VGPR/imm, included in bounds checking and swizzling)
   llvm_i32_ty,        // soffset(SGPR/imm, excluded from bounds checking and swizzling)
   llvm_i32_ty],       // auxiliary data (imm, cachepolicy: bit 0 = glc, bit 1 = slc,
                       //   bit 2 = dlc on gfx10/gfx11; swizzled buffer: bit 3 = swz;
                       //   gfx12+: bits [0-2] = th, bits [3-4] = scope, bit 6 = swz;
                       //   all: volatile op (bit 31, stripped at lowering))
  [IntrArgMemOnly, IntrReadMem, ReadOnly<ArgIndex<0>>, NoCapture<ArgIndex<0>>,
   ImmArg<ArgIndex<4>>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<0>;
def int_amdgcn_struct_ptr_buffer_load_format : AMDGPUStructPtrBufferLoad;
def int_amdgcn_struct_ptr_buffer_load : AMDGPUStructPtrBufferLoad;

class AMDGPURawBufferStore<LLVMType data_ty = llvm_any_ty> : DefaultAttrsIntrinsic <
  [],
  [data_ty,           // vdata(VGPR)
   llvm_v4i32_ty,     // rsrc(SGPR)
   llvm_i32_ty,       // offset(VGPR/imm, included in bounds checking and swizzling)
   llvm_i32_ty,       // soffset(SGPR/imm, excluded from bounds checking and swizzling)
   llvm_i32_ty],      // auxiliary data (imm, cachepolicy: bit 0 = glc, bit 1 = slc,
                      //   bit 2 = dlc on gfx10/gfx11; swizzled buffer: bit 3 = swz;
                      //   gfx12+: bits [0-2] = th, bits [3-4] = scope, bit 6 = swz;
                      //   all: volatile op (bit 31, stripped at lowering))
  [IntrWriteMem, ImmArg<ArgIndex<4>>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<1>;
def int_amdgcn_raw_buffer_store_format : AMDGPURawBufferStore<llvm_anyfloat_ty>;
def int_amdgcn_raw_buffer_store : AMDGPURawBufferStore;

// ptr addrspace(8) variant of AMDGPURawBufferStore.
class AMDGPURawPtrBufferStore<LLVMType data_ty = llvm_any_ty> : DefaultAttrsIntrinsic <
  [],
  [data_ty,            // vdata(VGPR)
   AMDGPUBufferRsrcTy, // rsrc(SGPR)
   llvm_i32_ty,        // offset(VGPR/imm, included in bounds checking and swizzling)
   llvm_i32_ty,        // soffset(SGPR/imm, excluded from bounds checking and swizzling)
   llvm_i32_ty],       // auxiliary data (imm, cachepolicy: bit 0 = glc, bit 1 = slc,
                       //   bit 2 = dlc on gfx10/gfx11; swizzled buffer: bit 3 = swz;
                       //   gfx12+: bits [0-2] = th, bits [3-4] = scope, bit 6 = swz;
                       //   all: volatile op (bit 31, stripped at lowering))
  [IntrArgMemOnly, IntrWriteMem, WriteOnly<ArgIndex<1>>, NoCapture<ArgIndex<1>>,
   ImmArg<ArgIndex<4>>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<1>;
def int_amdgcn_raw_ptr_buffer_store_format : AMDGPURawPtrBufferStore<llvm_anyfloat_ty>;
def int_amdgcn_raw_ptr_buffer_store : AMDGPURawPtrBufferStore;

class AMDGPUStructBufferStore<LLVMType data_ty = llvm_any_ty> : DefaultAttrsIntrinsic <
  [],
  [data_ty,           // vdata(VGPR)
   llvm_v4i32_ty,     // rsrc(SGPR)
   llvm_i32_ty,       // vindex(VGPR)
   llvm_i32_ty,       // offset(VGPR/imm, included in bounds checking and swizzling)
   llvm_i32_ty,       // soffset(SGPR/imm, excluded from bounds checking and swizzling)
   llvm_i32_ty],      // auxiliary data (imm, cachepolicy: bit 0 = glc, bit 1 = slc,
                      //   bit 2 = dlc on gfx10/gfx11; swizzled buffer: bit 3 = swz;
                      //   gfx12+: bits [0-2] = th, bits [3-4] = scope, bit 6 = swz;
                      //   all: volatile op (bit 31, stripped at lowering))
  [IntrWriteMem, ImmArg<ArgIndex<5>>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<1>;
def int_amdgcn_struct_buffer_store_format : AMDGPUStructBufferStore;
def int_amdgcn_struct_buffer_store : AMDGPUStructBufferStore;

// ptr addrspace(8) variant of AMDGPUStructBufferStore.
class AMDGPUStructPtrBufferStore<LLVMType data_ty = llvm_any_ty> : DefaultAttrsIntrinsic <
  [],
  [data_ty,            // vdata(VGPR)
   AMDGPUBufferRsrcTy, // rsrc(SGPR)
   llvm_i32_ty,        // vindex(VGPR)
   llvm_i32_ty,        // offset(VGPR/imm, included in bounds checking and swizzling)
   llvm_i32_ty,        // soffset(SGPR/imm, excluded from bounds checking and swizzling)
   llvm_i32_ty],       // auxiliary data (imm, cachepolicy: bit 0 = glc, bit 1 = slc,
                       //   bit 2 = dlc on gfx10/gfx11; swizzled buffer: bit 3 = swz;
                       //   gfx12+: bits [0-2] = th, bits [3-4] = scope, bit 6 = swz;
                       //   all: volatile op (bit 31, stripped at lowering))
  [IntrArgMemOnly, IntrWriteMem, WriteOnly<ArgIndex<1>>, NoCapture<ArgIndex<1>>,
   ImmArg<ArgIndex<5>>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<1>;
def int_amdgcn_struct_ptr_buffer_store_format : AMDGPUStructPtrBufferStore;
def int_amdgcn_struct_ptr_buffer_store : AMDGPUStructPtrBufferStore;

class AMDGPURawBufferAtomic<LLVMType data_ty = llvm_any_ty> : Intrinsic <
  [data_ty],
  [LLVMMatchType<0>,  // vdata(VGPR)
   llvm_v4i32_ty,     // rsrc(SGPR)
   llvm_i32_ty,       // offset(VGPR/imm, included in bounds checking and swizzling)
   llvm_i32_ty,       // soffset(SGPR/imm, excluded from bounds checking and swizzling)
   llvm_i32_ty],      // cachepolicy(imm; bit 1 = slc, ..., bit 31 = volatile)
  [ImmArg<ArgIndex<4>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<1, 0>;
def int_amdgcn_raw_buffer_atomic_swap : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_add : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_sub : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_smin : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_umin : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_fmin : AMDGPURawBufferAtomic<llvm_anyfloat_ty>;
def int_amdgcn_raw_buffer_atomic_smax : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_umax : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_fmax : AMDGPURawBufferAtomic<llvm_anyfloat_ty>;
def int_amdgcn_raw_buffer_atomic_and : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_or : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_xor : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_inc : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_dec : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_cond_sub_u32 : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_cmpswap : Intrinsic<
  [llvm_anyint_ty],
  [LLVMMatchType<0>,  // src(VGPR)
   LLVMMatchType<0>,  // cmp(VGPR)
   llvm_v4i32_ty,     // rsrc(SGPR)
   llvm_i32_ty,       // offset(VGPR/imm, included in bounds checking and swizzling)
   llvm_i32_ty,       // soffset(SGPR/imm, excluded from bounds checking and swizzling)
   llvm_i32_ty],      // cachepolicy(imm; bit 1 = slc, ..., bit 31 = volatile)
  [ImmArg<ArgIndex<5>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<2, 0>;

// ptr addrspace(8) variant of AMDGPURawBufferAtomic.
class AMDGPURawPtrBufferAtomic<LLVMType data_ty = llvm_any_ty> : Intrinsic <
  [data_ty],
  [LLVMMatchType<0>,   // vdata(VGPR)
   AMDGPUBufferRsrcTy, // rsrc(SGPR)
   llvm_i32_ty,        // offset(VGPR/imm, included in bounds checking and swizzling)
   llvm_i32_ty,        // soffset(SGPR/imm, excluded from bounds checking and swizzling)
   llvm_i32_ty],       // cachepolicy(imm; bit 1 = slc, ..., bit 31 = volatile)
  [IntrArgMemOnly, NoCapture<ArgIndex<1>>,
   ImmArg<ArgIndex<4>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<1, 0>;

def int_amdgcn_raw_ptr_buffer_atomic_swap : AMDGPURawPtrBufferAtomic;
def int_amdgcn_raw_ptr_buffer_atomic_add : AMDGPURawPtrBufferAtomic;
def int_amdgcn_raw_ptr_buffer_atomic_sub : AMDGPURawPtrBufferAtomic;
def int_amdgcn_raw_ptr_buffer_atomic_smin : AMDGPURawPtrBufferAtomic;
def int_amdgcn_raw_ptr_buffer_atomic_umin : AMDGPURawPtrBufferAtomic;
def int_amdgcn_raw_ptr_buffer_atomic_fmin : AMDGPURawPtrBufferAtomic<llvm_anyfloat_ty>;
def int_amdgcn_raw_ptr_buffer_atomic_smax : AMDGPURawPtrBufferAtomic;
def int_amdgcn_raw_ptr_buffer_atomic_umax : AMDGPURawPtrBufferAtomic;
def int_amdgcn_raw_ptr_buffer_atomic_fmax : AMDGPURawPtrBufferAtomic<llvm_anyfloat_ty>;
def int_amdgcn_raw_ptr_buffer_atomic_and : AMDGPURawPtrBufferAtomic;
def int_amdgcn_raw_ptr_buffer_atomic_or : AMDGPURawPtrBufferAtomic;
def int_amdgcn_raw_ptr_buffer_atomic_xor : AMDGPURawPtrBufferAtomic;
def int_amdgcn_raw_ptr_buffer_atomic_inc : AMDGPURawPtrBufferAtomic;
def int_amdgcn_raw_ptr_buffer_atomic_dec : AMDGPURawPtrBufferAtomic;
def int_amdgcn_raw_ptr_buffer_atomic_cond_sub_u32 : AMDGPURawPtrBufferAtomic;
def int_amdgcn_raw_ptr_buffer_atomic_cmpswap : Intrinsic<
  [llvm_anyint_ty],
  [LLVMMatchType<0>,   // src(VGPR)
   LLVMMatchType<0>,   // cmp(VGPR)
   AMDGPUBufferRsrcTy, // rsrc(SGPR)
   llvm_i32_ty,        // offset(VGPR/imm, included in bounds checking and swizzling)
   llvm_i32_ty,        // soffset(SGPR/imm, excluded from bounds checking and swizzling)
   llvm_i32_ty],       // cachepolicy(imm; bit 1 = slc, ..., bit 31 = volatile)
  [IntrArgMemOnly, NoCapture<ArgIndex<2>>,
   ImmArg<ArgIndex<5>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<2, 0>;

// gfx908 intrinsic
def int_amdgcn_raw_buffer_atomic_fadd : AMDGPURawBufferAtomic<llvm_anyfloat_ty>;
def int_amdgcn_raw_ptr_buffer_atomic_fadd : AMDGPURawPtrBufferAtomic<llvm_anyfloat_ty>;
// gfx12+ intrinsic
def int_amdgcn_raw_buffer_atomic_fadd_v2bf16 : Intrinsic <
  [llvm_v2bf16_ty],
  [llvm_v2bf16_ty, // vdata(VGPR)
   llvm_v4i32_ty,  // rsrc(SGPR)
   llvm_i32_ty,    // offset
   llvm_i32_ty,    // soffset
   llvm_i32_ty],   // cachepolicy(imm)
  [ImmArg<ArgIndex<4>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<1, 0>;
def int_amdgcn_raw_ptr_buffer_atomic_fadd_v2bf16 : Intrinsic <
  [llvm_v2bf16_ty],
  [llvm_v2bf16_ty,     // vdata(VGPR)
   AMDGPUBufferRsrcTy, // rsrc(SGPR)
   llvm_i32_ty,        // offset
   llvm_i32_ty,        // soffset
   llvm_i32_ty],       // cachepolicy(imm)
  [IntrArgMemOnly, NoCapture<ArgIndex<1>>,
   ImmArg<ArgIndex<4>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<1, 0>;

class AMDGPUStructBufferAtomic<LLVMType data_ty = llvm_any_ty> : Intrinsic <
  [data_ty],
  [LLVMMatchType<0>,  // vdata(VGPR)
   llvm_v4i32_ty,     // rsrc(SGPR)
   llvm_i32_ty,       // vindex(VGPR)
   llvm_i32_ty,       // offset(VGPR/imm, included in bounds checking and swizzling)
   llvm_i32_ty,       // soffset(SGPR/imm, excluded from bounds checking and swizzling)
   llvm_i32_ty],      // cachepolicy(imm; bit 1 = slc, ..., bit 31 = volatile)
  [ImmArg<ArgIndex<5>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<1, 0>;
def int_amdgcn_struct_buffer_atomic_swap : AMDGPUStructBufferAtomic;
def int_amdgcn_struct_buffer_atomic_add : AMDGPUStructBufferAtomic;
def int_amdgcn_struct_buffer_atomic_sub : AMDGPUStructBufferAtomic;
def int_amdgcn_struct_buffer_atomic_smin : AMDGPUStructBufferAtomic;
def int_amdgcn_struct_buffer_atomic_umin : AMDGPUStructBufferAtomic;
def int_amdgcn_struct_buffer_atomic_smax : AMDGPUStructBufferAtomic;
def int_amdgcn_struct_buffer_atomic_umax : AMDGPUStructBufferAtomic;
def int_amdgcn_struct_buffer_atomic_and : AMDGPUStructBufferAtomic;
def int_amdgcn_struct_buffer_atomic_or : AMDGPUStructBufferAtomic;
def int_amdgcn_struct_buffer_atomic_xor : AMDGPUStructBufferAtomic;
def int_amdgcn_struct_buffer_atomic_inc : AMDGPUStructBufferAtomic;
def int_amdgcn_struct_buffer_atomic_dec : AMDGPUStructBufferAtomic;
def int_amdgcn_struct_buffer_atomic_cond_sub_u32 : AMDGPUStructBufferAtomic;
def int_amdgcn_struct_buffer_atomic_cmpswap : Intrinsic<
  [llvm_anyint_ty],
  [LLVMMatchType<0>,  // src(VGPR)
   LLVMMatchType<0>,  // cmp(VGPR)
   llvm_v4i32_ty,     // rsrc(SGPR)
   llvm_i32_ty,       // vindex(VGPR)
   llvm_i32_ty,       // offset(VGPR/imm, included in bounds checking and swizzling)
   llvm_i32_ty,       // soffset(SGPR/imm, excluded from bounds checking and swizzling)
   llvm_i32_ty],      // cachepolicy(imm; bit 1 = slc, ..., bit 31 = volatile)
  [ImmArg<ArgIndex<6>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<2, 0>;

// ptr addrspace(8) variant of AMDGPUStructBufferAtomic.
class AMDGPUStructPtrBufferAtomic<LLVMType data_ty = llvm_any_ty> : Intrinsic <
  [data_ty],
  [LLVMMatchType<0>,   // vdata(VGPR)
   AMDGPUBufferRsrcTy, // rsrc(SGPR)
   llvm_i32_ty,        // vindex(VGPR)
   llvm_i32_ty,        // offset(VGPR/imm, included in bounds checking and swizzling)
   llvm_i32_ty,        // soffset(SGPR/imm, excluded from bounds checking and swizzling)
   llvm_i32_ty],       // cachepolicy(imm; bit 1 = slc, ..., bit 31 = volatile)
  [IntrArgMemOnly, NoCapture<ArgIndex<1>>,
   ImmArg<ArgIndex<5>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<1, 0>;
def int_amdgcn_struct_ptr_buffer_atomic_swap : AMDGPUStructPtrBufferAtomic;
def int_amdgcn_struct_ptr_buffer_atomic_add : AMDGPUStructPtrBufferAtomic;
def int_amdgcn_struct_ptr_buffer_atomic_sub : AMDGPUStructPtrBufferAtomic;
def int_amdgcn_struct_ptr_buffer_atomic_smin : AMDGPUStructPtrBufferAtomic;
def int_amdgcn_struct_ptr_buffer_atomic_umin : AMDGPUStructPtrBufferAtomic;
def int_amdgcn_struct_ptr_buffer_atomic_smax : AMDGPUStructPtrBufferAtomic;
def int_amdgcn_struct_ptr_buffer_atomic_umax : AMDGPUStructPtrBufferAtomic;
def int_amdgcn_struct_ptr_buffer_atomic_and : AMDGPUStructPtrBufferAtomic;
def int_amdgcn_struct_ptr_buffer_atomic_or : AMDGPUStructPtrBufferAtomic;
def int_amdgcn_struct_ptr_buffer_atomic_xor : AMDGPUStructPtrBufferAtomic;
def int_amdgcn_struct_ptr_buffer_atomic_inc : AMDGPUStructPtrBufferAtomic;
def int_amdgcn_struct_ptr_buffer_atomic_dec : AMDGPUStructPtrBufferAtomic;
def int_amdgcn_struct_ptr_buffer_atomic_cond_sub_u32 : AMDGPUStructPtrBufferAtomic;
def int_amdgcn_struct_ptr_buffer_atomic_cmpswap : Intrinsic<
  [llvm_anyint_ty],
  [LLVMMatchType<0>,   // src(VGPR)
   LLVMMatchType<0>,   // cmp(VGPR)
   AMDGPUBufferRsrcTy, // rsrc(SGPR)
   llvm_i32_ty,        // vindex(VGPR)
   llvm_i32_ty,        // offset(VGPR/imm, included in bounds checking and swizzling)
   llvm_i32_ty,        // soffset(SGPR/imm, excluded from bounds checking and swizzling)
   llvm_i32_ty],       // cachepolicy(imm; bit 1 = slc, ..., bit 31 = volatile)
  [IntrArgMemOnly, NoCapture<ArgIndex<2>>,
   ImmArg<ArgIndex<6>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<2, 0>;

// gfx908 intrinsic
def int_amdgcn_struct_buffer_atomic_fadd : AMDGPUStructBufferAtomic<llvm_anyfloat_ty>;
def int_amdgcn_struct_ptr_buffer_atomic_fadd : AMDGPUStructPtrBufferAtomic<llvm_anyfloat_ty>;
// gfx12 intrinsic
def int_amdgcn_struct_buffer_atomic_fadd_v2bf16 : Intrinsic <
  [llvm_v2bf16_ty],
  [llvm_v2bf16_ty, // vdata(VGPR)
   llvm_v4i32_ty,  // rsrc(SGPR)
   llvm_i32_ty,    // vindex
   llvm_i32_ty,    // offset
   llvm_i32_ty,    // soffset
   llvm_i32_ty],   // cachepolicy(imm)
  [ImmArg<ArgIndex<5>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<1, 0>;
def int_amdgcn_struct_ptr_buffer_atomic_fadd_v2bf16 : Intrinsic <
  [llvm_v2bf16_ty],
  [llvm_v2bf16_ty,     // vdata(VGPR)
   AMDGPUBufferRsrcTy, // rsrc(SGPR)
   llvm_i32_ty,        // vindex
   llvm_i32_ty,        // offset
   llvm_i32_ty,        // soffset
   llvm_i32_ty],       // cachepolicy(imm)
  [IntrArgMemOnly, NoCapture<ArgIndex<1>>,
   ImmArg<ArgIndex<5>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<1, 0>;

// gfx90a intrinsics
def int_amdgcn_struct_buffer_atomic_fmin : AMDGPUStructBufferAtomic<llvm_anyfloat_ty>;
def int_amdgcn_struct_buffer_atomic_fmax : AMDGPUStructBufferAtomic<llvm_anyfloat_ty>;

def int_amdgcn_struct_ptr_buffer_atomic_fmin : AMDGPUStructPtrBufferAtomic<llvm_anyfloat_ty>;
def int_amdgcn_struct_ptr_buffer_atomic_fmax : AMDGPUStructPtrBufferAtomic<llvm_anyfloat_ty>;

// Obsolescent tbuffer intrinsics.
def int_amdgcn_tbuffer_load : DefaultAttrsIntrinsic <
    [llvm_any_ty],    // overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32
    [llvm_v4i32_ty,   // rsrc(SGPR)
     llvm_i32_ty,     // vindex(VGPR)
     llvm_i32_ty,     // voffset(VGPR)
     llvm_i32_ty,     // soffset(SGPR)
     llvm_i32_ty,     // offset(imm)
     llvm_i32_ty,     // dfmt(imm)
     llvm_i32_ty,     // nfmt(imm)
     llvm_i1_ty,      // glc(imm)
     llvm_i1_ty],     // slc(imm)
    [IntrReadMem,
     ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>, ImmArg<ArgIndex<6>>,
     ImmArg<ArgIndex<7>>, ImmArg<ArgIndex<8>>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<0>;

def int_amdgcn_tbuffer_store : DefaultAttrsIntrinsic <
    [],
    [llvm_any_ty,    // vdata(VGPR), overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32
     llvm_v4i32_ty,  // rsrc(SGPR)
     llvm_i32_ty,    // vindex(VGPR)
     llvm_i32_ty,    // voffset(VGPR)
     llvm_i32_ty,    // soffset(SGPR)
     llvm_i32_ty,    // offset(imm)
     llvm_i32_ty,    // dfmt(imm)
     llvm_i32_ty,    // nfmt(imm)
     llvm_i1_ty,     // glc(imm)
     llvm_i1_ty],    // slc(imm)
    [IntrWriteMem, ImmArg<ArgIndex<5>>,
     ImmArg<ArgIndex<6>>, ImmArg<ArgIndex<7>>,
     ImmArg<ArgIndex<8>>, ImmArg<ArgIndex<9>>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<1>;

// New tbuffer intrinsics, with:
// - raw and struct variants
// - joint format field
// - joint cachepolicy field
def int_amdgcn_raw_tbuffer_load : DefaultAttrsIntrinsic <
    [llvm_any_ty],    // overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32
    [llvm_v4i32_ty,   // rsrc(SGPR)
     llvm_i32_ty,     // offset(VGPR/imm, included in bounds checking and swizzling)
     llvm_i32_ty,     // soffset(SGPR/imm, excluded from bounds checking and swizzling)
     llvm_i32_ty,     // format(imm; bits 3..0 = dfmt, bits 6..4 = nfmt)
     llvm_i32_ty],    // auxiliary data (imm, cachepolicy (bit 0 = glc,
                      //                                   bit 1 = slc,
                      //                                   bit 2 = dlc on gfx10/gfx11),
                      //                 swizzled buffer (bit 3 = swz))
                      // gfx12+:
                      //                 cachepolicy (bits [0-2] = th,
                      //                              bits [3-4] = scope)
                      //                 swizzled buffer (bit 6 = swz)
    [IntrReadMem,
     ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<0>;

def int_amdgcn_raw_ptr_tbuffer_load : DefaultAttrsIntrinsic <
    [llvm_any_ty],      // overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32
    [AMDGPUBufferRsrcTy,// rsrc(SGPR)
     llvm_i32_ty,       // offset(VGPR/imm, included in bounds checking and swizzling)
     llvm_i32_ty,       // soffset(SGPR/imm, excluded from bounds checking and swizzling)
     llvm_i32_ty,       // format(imm; bits 3..0 = dfmt, bits 6..4 = nfmt)
     llvm_i32_ty],      // auxiliary data (imm, cachepolicy (bit 0 = glc,
                        //                                   bit 1 = slc,
                        //                                   bit 2 = dlc on gfx10/gfx11),
                        //                 swizzled buffer (bit 3 = swz),
                        // gfx12+:
                        //                 cachepolicy (bits [0-2] = th,
                        //                              bits [3-4] = scope)
                        //                 swizzled buffer (bit 6 = swz)
                        //                 volatile op (bit 31, stripped at lowering))
    [IntrArgMemOnly, IntrReadMem, ReadOnly<ArgIndex<0>>, NoCapture<ArgIndex<0>>,
     ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<0>;

def int_amdgcn_raw_tbuffer_store : DefaultAttrsIntrinsic <
    [],
    [llvm_any_ty,    // vdata(VGPR), overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32
     llvm_v4i32_ty,  // rsrc(SGPR)
     llvm_i32_ty,    // offset(VGPR/imm, included in bounds checking and swizzling)
     llvm_i32_ty,    // soffset(SGPR/imm, excluded from bounds checking and swizzling)
     llvm_i32_ty,    // format(imm; bits 3..0 = dfmt, bits 6..4 = nfmt)
     llvm_i32_ty],   // auxiliary data (imm, cachepolicy (bit 0 = glc,
                     //                                   bit 1 = slc,
                     //                                   bit 2 = dlc on gfx10/gfx11),
                     //                 swizzled buffer (bit 3 = swz),
                     // gfx12+:
                     //                 cachepolicy (bits [0-2] = th,
                     //                              bits [3-4] = scope)
                     //                 swizzled buffer (bit 6 = swz),
                     // all:
                     //                 volatile op (bit 31, stripped at lowering))
    [IntrWriteMem,
     ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<1>;

def int_amdgcn_raw_ptr_tbuffer_store : DefaultAttrsIntrinsic <
    [],
    [llvm_any_ty,       // vdata(VGPR), overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32
     AMDGPUBufferRsrcTy,// rsrc(SGPR)
     llvm_i32_ty,       // offset(VGPR/imm, included in bounds checking and swizzling)
     llvm_i32_ty,       // soffset(SGPR/imm, excluded from bounds checking and swizzling)
     llvm_i32_ty,       // format(imm; bits 3..0 = dfmt, bits 6..4 = nfmt)
     llvm_i32_ty],      // auxiliary data (imm, cachepolicy (bit 0 = glc,
                        //                                   bit 1 = slc,
                        //                                   bit 2 = dlc on gfx10/gfx11),
                        //                 swizzled buffer (bit 3 = swz),
                        // gfx12+:
                        //                 cachepolicy (bits [0-2] = th,
                        //                              bits [3-4] = scope)
                        //                 swizzled buffer (bit 6 = swz),
                        // all:
                        //                 volatile op (bit 31, stripped at lowering))
    [IntrArgMemOnly, IntrWriteMem, WriteOnly<ArgIndex<1>>, NoCapture<ArgIndex<1>>,
     ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<1>;

def int_amdgcn_struct_tbuffer_load : DefaultAttrsIntrinsic <
    [llvm_any_ty],    // overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32
    [llvm_v4i32_ty,   // rsrc(SGPR)
     llvm_i32_ty,     // vindex(VGPR)
     llvm_i32_ty,     // offset(VGPR/imm, included in bounds checking and swizzling)
     llvm_i32_ty,     // soffset(SGPR/imm, excluded from bounds checking and swizzling)
     llvm_i32_ty,     // format(imm; bits 3..0 = dfmt, bits 6..4 = nfmt)
     llvm_i32_ty],    // auxiliary data (imm, cachepolicy (bit 0 = glc,
                      //                                   bit 1 = slc,
                      //                                   bit 2 = dlc on gfx10/gfx11),
                      //                 swizzled buffer (bit 3 = swz),
                      // gfx12+:
                      //                 cachepolicy (bits [0-2] = th,
                      //                              bits [3-4] = scope)
                      //                 swizzled buffer (bit 6 = swz),
                      // all:
                      //                 volatile op (bit 31, stripped at lowering))
    [IntrReadMem,
     ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<0>;

def int_amdgcn_struct_ptr_tbuffer_load : DefaultAttrsIntrinsic <
    [llvm_any_ty],      // overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32
    [AMDGPUBufferRsrcTy,// rsrc(SGPR)
     llvm_i32_ty,       // vindex(VGPR)
     llvm_i32_ty,       // offset(VGPR/imm, included in bounds checking and swizzling)
     llvm_i32_ty,       // soffset(SGPR/imm, excluded from bounds checking and swizzling)
     llvm_i32_ty,       // format(imm; bits 3..0 = dfmt, bits 6..4 = nfmt)
     llvm_i32_ty],      // auxiliary data (imm, cachepolicy (bit 0 = glc,
                        //                                   bit 1 = slc,
                        //                                   bit 2 = dlc on gfx10/gfx11),
                        //                 swizzled buffer (bit 3 = swz),
                        // gfx12+:
                        //                 cachepolicy (bits [0-2] = th,
                        //                              bits [3-4] = scope)
                        //                 swizzled buffer (bit 6 = swz),
                        // all:
                        //                 volatile op (bit 31, stripped at lowering))
    [IntrArgMemOnly, IntrReadMem, ReadOnly<ArgIndex<0>>, NoCapture<ArgIndex<0>>,
     ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<0>;

def int_amdgcn_struct_ptr_tbuffer_store : DefaultAttrsIntrinsic <
    [],
    [llvm_any_ty,       // vdata(VGPR), overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32
     AMDGPUBufferRsrcTy,// rsrc(SGPR)
     llvm_i32_ty,       // vindex(VGPR)
     llvm_i32_ty,       // offset(VGPR/imm, included in bounds checking and swizzling)
     llvm_i32_ty,       // soffset(SGPR/imm, excluded from bounds checking and swizzling)
     llvm_i32_ty,       // format(imm; bits 3..0 = dfmt, bits 6..4 = nfmt)
     llvm_i32_ty],      // auxiliary data (imm, cachepolicy (bit 0 = glc,
                        //                                   bit 1 = slc,
                        //                                   bit 2 = dlc on gfx10/gfx11),
                        //                 swizzled buffer (bit 3 = swz),
                        // gfx12+:
                        //                 cachepolicy (bits [0-2] = th,
                        //                              bits [3-4] = scope)
                        //                 swizzled buffer (bit 6 = swz),
                        // all:
                        //                 volatile op (bit 31, stripped at lowering))
    [IntrArgMemOnly, IntrWriteMem, WriteOnly<ArgIndex<1>>, NoCapture<ArgIndex<1>>,
     ImmArg<ArgIndex<5>>, ImmArg<ArgIndex<6>>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<1>;

def int_amdgcn_struct_tbuffer_store : DefaultAttrsIntrinsic <
    [],
    [llvm_any_ty,    // vdata(VGPR), overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32
     llvm_v4i32_ty,  // rsrc(SGPR)
     llvm_i32_ty,    // vindex(VGPR)
     llvm_i32_ty,    // offset(VGPR/imm, included in bounds checking and swizzling)
     llvm_i32_ty,    // soffset(SGPR/imm, excluded from bounds checking and swizzling)
     llvm_i32_ty,    // format(imm; bits 3..0 = dfmt, bits 6..4 = nfmt)
     llvm_i32_ty],   // auxiliary data (imm, cachepolicy (bit 0 = glc,
                     //                                   bit 1 = slc,
                     //                                   bit 2 = dlc on gfx10/gfx11),
                     //                 swizzled buffer (bit 3 = swz),
                     // gfx12+:
                     //                 cachepolicy (bits [0-2] = th,
                     //                              bits [3-4] = scope)
                     //                 swizzled buffer (bit 6 = swz),
                     // all:
                     //                 volatile op (bit 31, stripped at lowering))
    [IntrWriteMem,
     ImmArg<ArgIndex<5>>, ImmArg<ArgIndex<6>>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<1>;

// Legacy (pre raw/struct split) buffer atomics.
class AMDGPUBufferAtomic : Intrinsic <
  [llvm_anyint_ty],
  [LLVMMatchType<0>,  // vdata(VGPR)
   llvm_v4i32_ty,     // rsrc(SGPR)
   llvm_i32_ty,       // vindex(VGPR)
   llvm_i32_ty,       // offset(SGPR/VGPR/imm)
   llvm_i1_ty],       // slc(imm)
  [ImmArg<ArgIndex<4>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<1, 0>;
def int_amdgcn_buffer_atomic_swap : AMDGPUBufferAtomic;
def int_amdgcn_buffer_atomic_add : AMDGPUBufferAtomic;
def int_amdgcn_buffer_atomic_sub : AMDGPUBufferAtomic;
def int_amdgcn_buffer_atomic_smin : AMDGPUBufferAtomic;
def int_amdgcn_buffer_atomic_umin : AMDGPUBufferAtomic;
def int_amdgcn_buffer_atomic_smax : AMDGPUBufferAtomic;
def int_amdgcn_buffer_atomic_umax : AMDGPUBufferAtomic;
def int_amdgcn_buffer_atomic_and : AMDGPUBufferAtomic;
def int_amdgcn_buffer_atomic_or : AMDGPUBufferAtomic;
def int_amdgcn_buffer_atomic_xor : AMDGPUBufferAtomic;
def int_amdgcn_buffer_atomic_cmpswap : Intrinsic<
  [llvm_i32_ty],
  [llvm_i32_ty,    // src(VGPR)
   llvm_i32_ty,    // cmp(VGPR)
   llvm_v4i32_ty,  // rsrc(SGPR)
   llvm_i32_ty,    // vindex(VGPR)
   llvm_i32_ty,    // offset(SGPR/VGPR/imm)
   llvm_i1_ty],    // slc(imm)
  [ImmArg<ArgIndex<5>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<2, 0>;

def int_amdgcn_buffer_atomic_csub : AMDGPUBufferAtomic;

class AMDGPUBufferAtomicFP : Intrinsic <
  [llvm_anyfloat_ty],
  [LLVMMatchType<0>,  // vdata(VGPR)
   llvm_v4i32_ty,     // rsrc(SGPR)
   llvm_i32_ty,       // vindex(VGPR)
   llvm_i32_ty,       // offset(SGPR/VGPR/imm)
   llvm_i1_ty],       // slc(imm)
  [ImmArg<ArgIndex<4>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<1, 0>;

// Legacy form of the intrinsic. raw and struct forms should be preferred.
def int_amdgcn_buffer_atomic_fadd : AMDGPUBufferAtomicFP;

// Buffer load that writes its result directly into LDS instead of VGPRs.
class AMDGPURawBufferLoadLDS : Intrinsic <
  [],
  [llvm_v4i32_ty,            // rsrc(SGPR)
   LLVMQualPointerType<3>,   // LDS base offset
   llvm_i32_ty,              // Data byte size: 1/2/4
   llvm_i32_ty,              // voffset(VGPR, included in bounds checking and swizzling)
   llvm_i32_ty,              // soffset(SGPR/imm, excluded from bounds checking and swizzling)
   llvm_i32_ty,              // imm offset(imm, included in bounds checking and swizzling)
   llvm_i32_ty],             // auxiliary data (imm, cachepolicy (bit 0 = glc,
                             //                                   bit 1 = slc,
                             //                                   bit 2 = dlc on gfx10/gfx11))
                             //                 swizzled buffer (bit 3 = swz),
                             // gfx12+:
                             //                 cachepolicy (bits [0-2] = th,
                             //                              bits [3-4] = scope)
                             //                 swizzled buffer (bit 6 = swz),
                             // all:
                             //                 volatile op (bit 31, stripped at lowering))
  [IntrWillReturn, NoCapture<ArgIndex<1>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<5>>,
   ImmArg<ArgIndex<6>>, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<0>;
def int_amdgcn_raw_buffer_load_lds : AMDGPURawBufferLoadLDS;

class AMDGPURawPtrBufferLoadLDS : Intrinsic <
  [],
  [AMDGPUBufferRsrcTy,       // rsrc(SGPR)
   LLVMQualPointerType<3>,   // LDS base offset
   llvm_i32_ty,              // Data byte size: 1/2/4
   llvm_i32_ty,              // voffset(VGPR, included in bounds checking and swizzling)
   llvm_i32_ty,              // soffset(SGPR/imm, excluded from bounds checking and swizzling)
   llvm_i32_ty,              // imm offset(imm, included in bounds checking and swizzling)
   llvm_i32_ty],             // auxiliary data (imm, cachepolicy (bit 0 = glc,
                             //                                   bit 1 = slc,
                             //                                   bit 2 = dlc on gfx10/gfx11))
                             //                 swizzled buffer (bit 3 = swz),
                             // gfx12+:
                             //                 cachepolicy (bits [0-2] = th,
                             //                              bits [3-4] = scope)
                             //                 swizzled buffer (bit 6 = swz),
                             // all:
                             //                 volatile op (bit 31, stripped at lowering))
  [IntrWillReturn, IntrArgMemOnly,
   ReadOnly<ArgIndex<0>>, NoCapture<ArgIndex<0>>,
   WriteOnly<ArgIndex<1>>, NoCapture<ArgIndex<1>>,
   ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<5>>,
   ImmArg<ArgIndex<6>>, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<0>;
def int_amdgcn_raw_ptr_buffer_load_lds : AMDGPURawPtrBufferLoadLDS;

class AMDGPUStructBufferLoadLDS : Intrinsic <
  [],
  [llvm_v4i32_ty,            // rsrc(SGPR)
   LLVMQualPointerType<3>,   // LDS base offset
   llvm_i32_ty,              // Data byte size: 1/2/4
   llvm_i32_ty,              // vindex(VGPR)
   llvm_i32_ty,              // voffset(VGPR, included in bounds checking and swizzling)
   llvm_i32_ty,              // soffset(SGPR/imm, excluded from bounds checking and swizzling)
   llvm_i32_ty,              // imm offset(imm, included in bounds checking and swizzling)
   llvm_i32_ty],             // auxiliary data (imm, cachepolicy (bit 0 = glc,
                             //                                   bit 1 = slc,
                             //                                   bit 2 = dlc on gfx10/gfx11))
                             //                 swizzled buffer (bit 3 = swz),
                             // gfx12+:
                             //                 cachepolicy (bits [0-2] = th,
                             //                              bits [3-4] = scope)
                             //                 swizzled buffer (bit 6 = swz),
                             // all:
                             //                 volatile op (bit 31, stripped at lowering))
  [IntrWillReturn, NoCapture<ArgIndex<1>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<6>>,
   ImmArg<ArgIndex<7>>, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<0>;
def int_amdgcn_struct_buffer_load_lds : AMDGPUStructBufferLoadLDS;

class AMDGPUStructPtrBufferLoadLDS : Intrinsic <
  [],
  [AMDGPUBufferRsrcTy,       // rsrc(SGPR)
   LLVMQualPointerType<3>,   // LDS base offset
   llvm_i32_ty,              // Data byte size: 1/2/4
   llvm_i32_ty,              // vindex(VGPR)
   llvm_i32_ty,              // voffset(VGPR, included in bounds checking and swizzling)
   llvm_i32_ty,              // soffset(SGPR/imm, excluded from bounds checking and swizzling)
   llvm_i32_ty,              // imm offset(imm, included in bounds checking and swizzling)
   llvm_i32_ty],             // auxiliary data (imm, cachepolicy (bit 0 = glc,
                             //                                   bit 1 = slc,
                             //                                   bit 2 = dlc on gfx10/gfx11))
                             //                 swizzled buffer (bit 3 = swz),
                             // gfx12+:
                             //                 cachepolicy (bits [0-2] = th,
                             //                              bits [3-4] = scope)
                             //                 swizzled buffer (bit 6 = swz),
                             // all:
                             //                 volatile op (bit 31, stripped at lowering))
  [IntrWillReturn, IntrArgMemOnly,
   ReadOnly<ArgIndex<0>>, NoCapture<ArgIndex<0>>,
   WriteOnly<ArgIndex<1>>, NoCapture<ArgIndex<1>>,
   ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<6>>,
   ImmArg<ArgIndex<7>>, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<0>;
def int_amdgcn_struct_ptr_buffer_load_lds : AMDGPUStructPtrBufferLoadLDS;

} // defset AMDGPUBufferIntrinsics

// Uses that do not set the done bit should set IntrWriteMem on the
// call site.
def int_amdgcn_exp : DefaultAttrsIntrinsic <[], [
  llvm_i32_ty,       // tgt,
  llvm_i32_ty,       // en
  llvm_any_ty,       // src0 (f32 or i32)
  LLVMMatchType<0>,  // src1
  LLVMMatchType<0>,  // src2
  LLVMMatchType<0>,  // src3
  llvm_i1_ty,        // done
  llvm_i1_ty         // vm (ignored on GFX11+)
  ],
  [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<6>>,
   ImmArg<ArgIndex<7>>, IntrWriteMem, IntrInaccessibleMemOnly]
>;

// exp with row_en bit set. Only supported on GFX11+.
def int_amdgcn_exp_row : DefaultAttrsIntrinsic <[], [
  llvm_i32_ty,       // tgt,
  llvm_i32_ty,       // en
  llvm_any_ty,       // src0 (f32 or i32)
  LLVMMatchType<0>,  // src1
  LLVMMatchType<0>,  // src2
  LLVMMatchType<0>,  // src3
  llvm_i1_ty,        // done
  llvm_i32_ty],      // row number
  [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<6>>,
   IntrWriteMem, IntrInaccessibleMemOnly]
>;

// exp with compr bit set. Not supported on GFX11+.
def int_amdgcn_exp_compr : DefaultAttrsIntrinsic <[], [
  llvm_i32_ty,        // tgt,
  llvm_i32_ty,        // en
  llvm_anyvector_ty,  // src0 (v2f16 or v2i16)
  LLVMMatchType<0>,   // src1
  llvm_i1_ty,         // done
  llvm_i1_ty],        // vm
  [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<4>>,
   ImmArg<ArgIndex<5>>, IntrWriteMem, IntrInaccessibleMemOnly]
>;

def int_amdgcn_buffer_wbinvl1_sc :
  ClangBuiltin<"__builtin_amdgcn_buffer_wbinvl1_sc">,
  DefaultAttrsIntrinsic<[], [], [IntrNoMem, IntrHasSideEffects]>;

def int_amdgcn_buffer_wbinvl1 :
  ClangBuiltin<"__builtin_amdgcn_buffer_wbinvl1">,
  DefaultAttrsIntrinsic<[], [], [IntrNoMem, IntrHasSideEffects]>;

def int_amdgcn_s_dcache_inv :
  ClangBuiltin<"__builtin_amdgcn_s_dcache_inv">,
  DefaultAttrsIntrinsic<[], [], [IntrNoMem, IntrHasSideEffects]>;

def int_amdgcn_s_memtime :
  ClangBuiltin<"__builtin_amdgcn_s_memtime">,
  DefaultAttrsIntrinsic<[llvm_i64_ty], [], [IntrNoMem, IntrHasSideEffects]>;

def int_amdgcn_s_sleep :
  ClangBuiltin<"__builtin_amdgcn_s_sleep">,
  DefaultAttrsIntrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrNoMem,
                                IntrHasSideEffects]> {
}

def int_amdgcn_s_sleep_var
    : ClangBuiltin<"__builtin_amdgcn_s_sleep_var">,
      Intrinsic<[], [llvm_i32_ty],
                [IntrNoMem, IntrHasSideEffects, IntrWillReturn]> {
}

def int_amdgcn_s_nop :
  DefaultAttrsIntrinsic<[], [llvm_i16_ty], [ImmArg<ArgIndex<0>>, IntrNoMem,
                                IntrHasSideEffects]> {
}

def int_amdgcn_s_incperflevel :
  ClangBuiltin<"__builtin_amdgcn_s_incperflevel">,
  DefaultAttrsIntrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrNoMem,
                                IntrHasSideEffects]> {
}

def int_amdgcn_s_decperflevel :
  ClangBuiltin<"__builtin_amdgcn_s_decperflevel">,
  DefaultAttrsIntrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrNoMem,
                                IntrHasSideEffects]> {
}

def int_amdgcn_s_sethalt :
  DefaultAttrsIntrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrNoMem,
                                IntrHasSideEffects]>;

def int_amdgcn_s_setprio :
  ClangBuiltin<"__builtin_amdgcn_s_setprio">,
  DefaultAttrsIntrinsic<[], [llvm_i16_ty], [ImmArg<ArgIndex<0>>, IntrNoMem,
                                IntrHasSideEffects]>;

def int_amdgcn_s_ttracedata :
  DefaultAttrsIntrinsic<[], [llvm_i32_ty],
                        [IntrNoMem, IntrHasSideEffects]>;
def int_amdgcn_s_ttracedata_imm :
  DefaultAttrsIntrinsic<[], [llvm_i16_ty],
                        [IntrNoMem, IntrHasSideEffects, ImmArg<ArgIndex<0>>]>;

// This is IntrHasSideEffects so it can be used to read cycle counters.
def int_amdgcn_s_getreg :
  ClangBuiltin<"__builtin_amdgcn_s_getreg">,
  DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty],
  [IntrNoMem, IntrHasSideEffects, ImmArg<ArgIndex<0>>]
>;

// Note this can be used to set FP environment properties that are
// unsafe to change in non-strictfp functions. The register properties
// available (and value required to access them) may differ per
// subtarget. llvm.amdgcn.s.setreg(hwmode, value)
def int_amdgcn_s_setreg :
  ClangBuiltin<"__builtin_amdgcn_s_setreg">,
  DefaultAttrsIntrinsic<[], [llvm_i32_ty, llvm_i32_ty],
  [IntrNoMem, IntrHasSideEffects, ImmArg<ArgIndex<0>>]
>;

// int_amdgcn_s_getpc is provided to allow a specific style of position
// independent code to determine the high part of its address when it is
// known (through convention) that the code and any data of interest does
// not cross a 4Gb address boundary. Use for any other purpose may not
// produce the desired results as optimizations may cause code movement,
// especially as we explicitly use IntrNoMem to allow optimizations.
// This intrinsic always returns PC sign-extended from 48 bits even if the
// s_getpc_b64 instruction returns a zero-extended value.
def int_amdgcn_s_getpc :
  ClangBuiltin<"__builtin_amdgcn_s_getpc">,
  DefaultAttrsIntrinsic<[llvm_i64_ty], [], [IntrNoMem, IntrSpeculatable,
                                            IntrWillReturn]>;

// __builtin_amdgcn_interp_mov <param>, <attr_chan>, <attr>, <m0>
// param values: 0 = P10, 1 = P20, 2 = P0
def int_amdgcn_interp_mov :
  ClangBuiltin<"__builtin_amdgcn_interp_mov">,
  DefaultAttrsIntrinsic<[llvm_float_ty],
            [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
            [IntrNoMem, IntrSpeculatable,
             ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>]>;

// __builtin_amdgcn_interp_p1 <i>, <attr_chan>, <attr>, <m0>
// This intrinsic reads from lds, but the memory values are constant,
// so it behaves like IntrNoMem.
def int_amdgcn_interp_p1 :
  ClangBuiltin<"__builtin_amdgcn_interp_p1">,
  DefaultAttrsIntrinsic<[llvm_float_ty],
            [llvm_float_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
            [IntrNoMem, IntrSpeculatable,
             ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>]>;

// __builtin_amdgcn_interp_p2 <p1>, <j>, <attr_chan>, <attr>, <m0>
def int_amdgcn_interp_p2 :
  ClangBuiltin<"__builtin_amdgcn_interp_p2">,
  DefaultAttrsIntrinsic<[llvm_float_ty],
            [llvm_float_ty, llvm_float_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
            [IntrNoMem, IntrSpeculatable,
             ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>]>;
          // See int_amdgcn_v_interp_p1 for why this is IntrNoMem.

// __builtin_amdgcn_interp_p1_f16 <i>, <attr_chan>, <attr>, <high>, <m0>
// high selects whether high or low 16-bits are loaded from LDS
def int_amdgcn_interp_p1_f16 :
  ClangBuiltin<"__builtin_amdgcn_interp_p1_f16">,
  DefaultAttrsIntrinsic<[llvm_float_ty],
            [llvm_float_ty, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i32_ty],
            [IntrNoMem, IntrSpeculatable,
             ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>]>;

// __builtin_amdgcn_interp_p2_f16 <p1>, <j>, <attr_chan>, <attr>, <high>, <m0>
// high selects whether high or low 16-bits are loaded from LDS
def int_amdgcn_interp_p2_f16 :
  ClangBuiltin<"__builtin_amdgcn_interp_p2_f16">,
  DefaultAttrsIntrinsic<[llvm_half_ty],
            [llvm_float_ty, llvm_float_ty, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i32_ty],
            [IntrNoMem, IntrSpeculatable,
             ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>]>;

// llvm.amdgcn.lds.direct.load <m0>
// The input argument is m0, which contains a packed combination of address
// offset and flags describing the data type.
def int_amdgcn_lds_direct_load :
  DefaultAttrsIntrinsic<[llvm_any_ty], // overloaded for types u8, u16, i32/f32, i8, i16
            [llvm_i32_ty],
            [IntrReadMem, IntrSpeculatable]>;

// llvm.amdgcn.lds.param.load <attr_chan>, <attr>, <m0>
// Like interp intrinsics, this reads from lds, but the memory values are constant,
// so it behaves like IntrNoMem.
def int_amdgcn_lds_param_load :
  DefaultAttrsIntrinsic<[llvm_float_ty],
            [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
            [IntrNoMem, IntrSpeculatable,
             ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>]>;

// llvm.amdgcn.interp.inreg.p10 <p>, <i>, <p0>
def int_amdgcn_interp_inreg_p10 :
  DefaultAttrsIntrinsic<[llvm_float_ty],
            [llvm_float_ty, llvm_float_ty, llvm_float_ty],
            [IntrNoMem, IntrSpeculatable]>;

// llvm.amdgcn.interp.inreg.p2 <p>, <j>, <tmp>
def int_amdgcn_interp_inreg_p2 :
  DefaultAttrsIntrinsic<[llvm_float_ty],
            [llvm_float_ty, llvm_float_ty, llvm_float_ty],
            [IntrNoMem, IntrSpeculatable]>;

// llvm.amdgcn.interp.inreg.p10.f16 <p>, <i>, <p0>, <high>
// high selects whether high or low 16-bits are used for p and p0 operands
def int_amdgcn_interp_inreg_p10_f16:
  DefaultAttrsIntrinsic<[llvm_float_ty],
            [llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_i1_ty],
            [IntrNoMem, IntrSpeculatable,
             ImmArg<ArgIndex<3>>]>;

// llvm.amdgcn.interp.inreg.p2.f16 <p>, <j>, <tmp>, <high>
// high selects whether high or low 16-bits are used for p operand
def int_amdgcn_interp_inreg_p2_f16 :
  DefaultAttrsIntrinsic<[llvm_half_ty],
            [llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_i1_ty],
            [IntrNoMem, IntrSpeculatable,
             ImmArg<ArgIndex<3>>]>;

// Deprecated: use llvm.amdgcn.live.mask instead.
def int_amdgcn_ps_live : DefaultAttrsIntrinsic <
  [llvm_i1_ty],
  [],
  [IntrNoMem]>;

// Query currently live lanes.
// Returns true if lane is live (and not a helper lane).
def int_amdgcn_live_mask : DefaultAttrsIntrinsic <[llvm_i1_ty],
  [], [IntrReadMem, IntrInaccessibleMemOnly]
>;

def int_amdgcn_mbcnt_lo :
  ClangBuiltin<"__builtin_amdgcn_mbcnt_lo">,
  DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
   [IntrNoMem]>;

def int_amdgcn_mbcnt_hi :
  ClangBuiltin<"__builtin_amdgcn_mbcnt_hi">,
  DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
            [IntrNoMem]>;

// llvm.amdgcn.ds.swizzle src offset
def int_amdgcn_ds_swizzle :
  ClangBuiltin<"__builtin_amdgcn_ds_swizzle">,
  Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
            [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree,
             ImmArg<ArgIndex<1>>]>;

def int_amdgcn_ubfe : DefaultAttrsIntrinsic<[llvm_anyint_ty],
    [LLVMMatchType<0>, llvm_i32_ty, llvm_i32_ty],
    [IntrNoMem, IntrSpeculatable]
>;

def int_amdgcn_sbfe : DefaultAttrsIntrinsic<[llvm_anyint_ty],
    [LLVMMatchType<0>, llvm_i32_ty, llvm_i32_ty],
    [IntrNoMem, IntrSpeculatable]
>;

def int_amdgcn_lerp :
  ClangBuiltin<"__builtin_amdgcn_lerp">,
  DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
  [IntrNoMem, IntrSpeculatable]
>;

def int_amdgcn_sad_u8 :
  ClangBuiltin<"__builtin_amdgcn_sad_u8">,
  DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
  [IntrNoMem, IntrSpeculatable]
>;

def int_amdgcn_msad_u8 :
  ClangBuiltin<"__builtin_amdgcn_msad_u8">,
  DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
  [IntrNoMem, IntrSpeculatable]
>;

def int_amdgcn_sad_hi_u8 :
  ClangBuiltin<"__builtin_amdgcn_sad_hi_u8">,
  DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
  [IntrNoMem, IntrSpeculatable]
>;

def int_amdgcn_sad_u16 :
  ClangBuiltin<"__builtin_amdgcn_sad_u16">,
  DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
  [IntrNoMem, IntrSpeculatable]
>;

def int_amdgcn_qsad_pk_u16_u8 :
  ClangBuiltin<"__builtin_amdgcn_qsad_pk_u16_u8">,
  DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i32_ty, llvm_i64_ty],
  [IntrNoMem, IntrSpeculatable]
>;

def int_amdgcn_mqsad_pk_u16_u8 :
  ClangBuiltin<"__builtin_amdgcn_mqsad_pk_u16_u8">,
  DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i32_ty, llvm_i64_ty],
  [IntrNoMem, IntrSpeculatable]
>;

def int_amdgcn_mqsad_u32_u8 :
  ClangBuiltin<"__builtin_amdgcn_mqsad_u32_u8">,
  DefaultAttrsIntrinsic<[llvm_v4i32_ty], [llvm_i64_ty, llvm_i32_ty, llvm_v4i32_ty],
  [IntrNoMem, IntrSpeculatable]
>;

def int_amdgcn_cvt_pk_u8_f32 :
  ClangBuiltin<"__builtin_amdgcn_cvt_pk_u8_f32">,
  DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_float_ty, llvm_i32_ty, llvm_i32_ty],
  [IntrNoMem, IntrSpeculatable]
>;

def int_amdgcn_icmp :
  Intrinsic<[llvm_anyint_ty], [llvm_anyint_ty, LLVMMatchType<1>, llvm_i32_ty],
            [IntrNoMem, IntrConvergent,
             ImmArg<ArgIndex<2>>, IntrWillReturn, IntrNoCallback, IntrNoFree]>;

def int_amdgcn_fcmp :
  Intrinsic<[llvm_anyint_ty], [llvm_anyfloat_ty, LLVMMatchType<1>, llvm_i32_ty],
            [IntrNoMem, IntrConvergent,
             ImmArg<ArgIndex<2>>, IntrWillReturn, IntrNoCallback, IntrNoFree]>;

def int_amdgcn_ballot :
  Intrinsic<[llvm_anyint_ty], [llvm_i1_ty],
            [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;

def int_amdgcn_inverse_ballot :
  Intrinsic<[llvm_i1_ty], [llvm_anyint_ty],
            [IntrNoMem, IntrWillReturn, IntrNoCallback, IntrNoFree]>;

// Lowers to S_BITREPLICATE_B64_B32.
// The argument must be uniform; otherwise, the result is undefined.
def int_amdgcn_s_bitreplicate :
  DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_i32_ty], [IntrNoMem, IntrConvergent]>;

// Lowers to S_QUADMASK_B{32,64}
// The argument must be uniform; otherwise, the result is undefined.
def int_amdgcn_s_quadmask :
  DefaultAttrsIntrinsic<[llvm_anyint_ty], [llvm_anyint_ty], [IntrNoMem, IntrConvergent]>;

// Lowers to S_WQM_B{32,64}
// The argument must be uniform; otherwise, the result is undefined.
// Does not set WQM; merely calculates the bitmask.
def int_amdgcn_s_wqm :
  DefaultAttrsIntrinsic<[llvm_anyint_ty], [llvm_anyint_ty], [IntrNoMem, IntrConvergent]>;

class AMDGPUWaveReduce<LLVMType data_ty = llvm_anyint_ty> : Intrinsic<
    [data_ty],
    [
      LLVMMatchType<0>,   // llvm value to reduce (SGPR/VGPR)
      llvm_i32_ty         // Reduction Strategy Switch for lowering ( 0: Default,
                          //                                          1: Iterative strategy, and
                          //                                          2. DPP)
    ],
    [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree, ImmArg<ArgIndex<1>>]>;

def int_amdgcn_wave_reduce_umin : AMDGPUWaveReduce;
def int_amdgcn_wave_reduce_umax : AMDGPUWaveReduce;

def int_amdgcn_readfirstlane :
  ClangBuiltin<"__builtin_amdgcn_readfirstlane">,
  Intrinsic<[llvm_i32_ty], [llvm_i32_ty],
            [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;

// The lane argument must be uniform across the currently active threads of the
// current wave. Otherwise, the result is undefined.
def int_amdgcn_readlane :
  ClangBuiltin<"__builtin_amdgcn_readlane">,
  Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
            [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;

// The value to write and lane select arguments must be uniform across the
// currently active threads of the current wave. Otherwise, the result is
// undefined.
2224def int_amdgcn_writelane : 2225 ClangBuiltin<"__builtin_amdgcn_writelane">, 2226 Intrinsic<[llvm_i32_ty], [ 2227 llvm_i32_ty, // uniform value to write: returned by the selected lane 2228 llvm_i32_ty, // uniform lane select 2229 llvm_i32_ty // returned by all lanes other than the selected one 2230 ], 2231 [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree] 2232>; 2233 2234def int_amdgcn_alignbyte : ClangBuiltin<"__builtin_amdgcn_alignbyte">, 2235 DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], 2236 [IntrNoMem, IntrSpeculatable] 2237>; 2238 2239// mul24 intrinsics can return i32 or i64. 2240// When returning i64, they're lowered to a mul24/mulhi24 pair. 2241def int_amdgcn_mul_i24 : DefaultAttrsIntrinsic<[llvm_anyint_ty], 2242 [llvm_i32_ty, llvm_i32_ty], 2243 [IntrNoMem, IntrSpeculatable] 2244>; 2245 2246def int_amdgcn_mul_u24 : DefaultAttrsIntrinsic<[llvm_anyint_ty], 2247 [llvm_i32_ty, llvm_i32_ty], 2248 [IntrNoMem, IntrSpeculatable] 2249>; 2250 2251def int_amdgcn_mulhi_i24 : DefaultAttrsIntrinsic<[llvm_i32_ty], 2252 [llvm_i32_ty, llvm_i32_ty], 2253 [IntrNoMem, IntrSpeculatable] 2254>; 2255 2256def int_amdgcn_mulhi_u24 : DefaultAttrsIntrinsic<[llvm_i32_ty], 2257 [llvm_i32_ty, llvm_i32_ty], 2258 [IntrNoMem, IntrSpeculatable] 2259>; 2260 2261// llvm.amdgcn.ds.gws.init(i32 bar_val, i32 resource_id) 2262// 2263// bar_val is the total number of waves that will wait on this 2264// barrier, minus 1. 2265def int_amdgcn_ds_gws_init : 2266 ClangBuiltin<"__builtin_amdgcn_ds_gws_init">, 2267 Intrinsic<[], 2268 [llvm_i32_ty, llvm_i32_ty], 2269 [IntrConvergent, IntrWriteMem, 2270 IntrInaccessibleMemOnly, IntrWillReturn, IntrNoCallback, IntrNoFree], "", 2271 [SDNPMemOperand] 2272>; 2273 2274// llvm.amdgcn.ds.gws.barrier(i32 vsrc0, i32 resource_id) 2275// bar_val is the total number of waves that will wait on this 2276// barrier, minus 1. 
2277def int_amdgcn_ds_gws_barrier : 2278 ClangBuiltin<"__builtin_amdgcn_ds_gws_barrier">, 2279 Intrinsic<[], 2280 [llvm_i32_ty, llvm_i32_ty], 2281 [IntrConvergent, IntrInaccessibleMemOnly, IntrWillReturn, IntrNoCallback, IntrNoFree], "", 2282 [SDNPMemOperand] 2283>; 2284 2285// llvm.amdgcn.ds.gws.sema.v(i32 resource_id) 2286def int_amdgcn_ds_gws_sema_v : 2287 ClangBuiltin<"__builtin_amdgcn_ds_gws_sema_v">, 2288 Intrinsic<[], 2289 [llvm_i32_ty], 2290 [IntrConvergent, IntrInaccessibleMemOnly, IntrWillReturn, IntrNoCallback, IntrNoFree], "", 2291 [SDNPMemOperand] 2292>; 2293 2294// llvm.amdgcn.ds.gws.sema.br(i32 vsrc, i32 resource_id) 2295def int_amdgcn_ds_gws_sema_br : 2296 ClangBuiltin<"__builtin_amdgcn_ds_gws_sema_br">, 2297 Intrinsic<[], 2298 [llvm_i32_ty, llvm_i32_ty], 2299 [IntrConvergent, IntrInaccessibleMemOnly, IntrWillReturn, IntrNoCallback, IntrNoFree], "", 2300 [SDNPMemOperand] 2301>; 2302 2303// llvm.amdgcn.ds.gws.sema.p(i32 resource_id) 2304def int_amdgcn_ds_gws_sema_p : 2305 ClangBuiltin<"__builtin_amdgcn_ds_gws_sema_p">, 2306 Intrinsic<[], 2307 [llvm_i32_ty], 2308 [IntrConvergent, IntrInaccessibleMemOnly, IntrWillReturn, IntrNoCallback, IntrNoFree], "", 2309 [SDNPMemOperand] 2310>; 2311 2312// llvm.amdgcn.ds.gws.sema.release.all(i32 resource_id) 2313def int_amdgcn_ds_gws_sema_release_all : 2314 ClangBuiltin<"__builtin_amdgcn_ds_gws_sema_release_all">, 2315 Intrinsic<[], 2316 [llvm_i32_ty], 2317 [IntrConvergent, IntrInaccessibleMemOnly, IntrWillReturn, IntrNoCallback, IntrNoFree], "", 2318 [SDNPMemOperand] 2319>; 2320 2321 2322// Copies the source value to the destination value, with the guarantee that 2323// the source value is computed as if the entire program were executed in WQM. 
2324def int_amdgcn_wqm : Intrinsic<[llvm_any_ty], 2325 [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable, IntrWillReturn, IntrNoCallback, IntrNoFree] 2326>; 2327 2328// Copies the source value to the destination value, such that the source 2329// is computed as if the entire program were executed in WQM if any other 2330// program code executes in WQM. 2331def int_amdgcn_softwqm : Intrinsic<[llvm_any_ty], 2332 [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable, IntrWillReturn, IntrNoCallback, IntrNoFree] 2333>; 2334 2335// Return true if at least one thread within the pixel quad passes true into 2336// the function. 2337def int_amdgcn_wqm_vote : Intrinsic<[llvm_i1_ty], 2338 [llvm_i1_ty], [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree] 2339>; 2340 2341// If false, set EXEC=0 for the current thread until the end of program. 2342// FIXME: Should this be IntrNoMem, IntrHasSideEffects, or IntrWillReturn? 2343def int_amdgcn_kill : Intrinsic<[], [llvm_i1_ty], [IntrNoCallback, IntrNoFree]>; 2344 2345def int_amdgcn_endpgm : ClangBuiltin<"__builtin_amdgcn_endpgm">, 2346 Intrinsic<[], [], [IntrNoReturn, IntrCold, IntrNoMem, IntrHasSideEffects, IntrConvergent, 2347 IntrNoCallback, IntrNoFree] 2348>; 2349 2350// If false, mark all active lanes as helper lanes until the end of program. 2351def int_amdgcn_wqm_demote : Intrinsic<[], 2352 [llvm_i1_ty], [IntrWriteMem, IntrInaccessibleMemOnly, IntrNoCallback, IntrNoFree] 2353>; 2354 2355// Copies the active channels of the source value to the destination value, 2356// with the guarantee that the source value is computed as if the entire 2357// program were executed in Whole Wavefront Mode, i.e. with all channels 2358// enabled, with a few exceptions: - Phi nodes which require WWM return an 2359// undefined value. 2360def int_amdgcn_strict_wwm : Intrinsic<[llvm_any_ty], 2361 [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable, 2362 IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree] 2363>; 2364// Deprecated. 
Use int_amdgcn_strict_wwm instead. 2365def int_amdgcn_wwm : Intrinsic<[llvm_any_ty], 2366 [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable, 2367 IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree] 2368>; 2369def int_amdgcn_strict_wqm : Intrinsic<[llvm_any_ty], 2370 [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable, 2371 IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree] 2372>; 2373 2374// Given a value, copies it while setting all the inactive lanes to a given 2375// value. Note that OpenGL helper lanes are considered active, so if the 2376// program ever uses WQM, then the instruction and the first source will be 2377// computed in WQM. 2378def int_amdgcn_set_inactive : 2379 Intrinsic<[llvm_anyint_ty], 2380 [LLVMMatchType<0>, // value to be copied 2381 LLVMMatchType<0>], // value for the inactive lanes to take 2382 [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>; 2383 2384// Similar to int_amdgcn_set_inactive, but the value for the inactive lanes must 2385// be a VGPR function argument. 2386// Can only be used in functions with the `amdgpu_cs_chain` or 2387// `amdgpu_cs_chain_preserve` calling conventions, and only in uniform control 2388// flow. 2389def int_amdgcn_set_inactive_chain_arg : 2390 Intrinsic<[llvm_anyint_ty], 2391 [LLVMMatchType<0>, // value to be copied 2392 LLVMMatchType<0>], // value for the inactive lanes to take 2393 [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>; 2394 2395// Return if the given flat pointer points to a local memory address. 2396def int_amdgcn_is_shared : ClangBuiltin<"__builtin_amdgcn_is_shared">, 2397 DefaultAttrsIntrinsic<[llvm_i1_ty], [llvm_ptr_ty], 2398 [IntrNoMem, IntrSpeculatable, NoCapture<ArgIndex<0>>] 2399>; 2400 2401// Return if the given flat pointer points to a private memory address. 
2402def int_amdgcn_is_private : ClangBuiltin<"__builtin_amdgcn_is_private">, 2403 DefaultAttrsIntrinsic<[llvm_i1_ty], [llvm_ptr_ty], 2404 [IntrNoMem, IntrSpeculatable, NoCapture<ArgIndex<0>>] 2405>; 2406 2407// A uniform tail call to a function with the `amdgpu_cs_chain` or 2408// `amdgpu_cs_chain_preserve` calling convention. It will populate the SGPRs 2409// starting at s0 and the VGPRs starting at v8, set EXEC and perform a jump to 2410// the given function. 2411// Can only be used in functions with the `amdgpu_cs`, `amdgpu_cs_chain` or 2412// `amdgpu_cs_chain_preserve` calling conventions, and only in uniform control 2413// flow. 2414def int_amdgcn_cs_chain: 2415 Intrinsic<[], 2416 [llvm_anyptr_ty, // The function to jump to. 2417 llvm_anyint_ty, // Value to put in EXEC (should be i32 or i64). 2418 llvm_any_ty, // Arguments that will be copied into SGPRs (s0+). 2419 // Must be uniform. 2420 llvm_any_ty, // Arguments that will be copied into VGPRs (v8+). 2421 // Need not be uniform. 2422 llvm_i32_ty, // Flags. 2423 llvm_vararg_ty // Additional arguments. Only present if Flags is 2424 // non-zero. 
2425 ], 2426 [IntrConvergent, IntrNoReturn, ImmArg<ArgIndex<4>>]>; 2427 2428 2429//===----------------------------------------------------------------------===// 2430// CI+ Intrinsics 2431//===----------------------------------------------------------------------===// 2432 2433def int_amdgcn_s_dcache_inv_vol : 2434 ClangBuiltin<"__builtin_amdgcn_s_dcache_inv_vol">, 2435 DefaultAttrsIntrinsic<[], [], [IntrNoMem, IntrHasSideEffects]>; 2436 2437def int_amdgcn_buffer_wbinvl1_vol : 2438 ClangBuiltin<"__builtin_amdgcn_buffer_wbinvl1_vol">, 2439 DefaultAttrsIntrinsic<[], [], [IntrNoMem, IntrHasSideEffects]>; 2440 2441//===----------------------------------------------------------------------===// 2442// VI Intrinsics 2443//===----------------------------------------------------------------------===// 2444 2445// llvm.amdgcn.mov.dpp.i32 <src> <dpp_ctrl> <row_mask> <bank_mask> <bound_ctrl> 2446def int_amdgcn_mov_dpp : 2447 Intrinsic<[llvm_anyint_ty], 2448 [LLVMMatchType<0>, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, 2449 llvm_i1_ty], 2450 [IntrNoMem, IntrConvergent, IntrWillReturn, 2451 ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>, 2452 ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, IntrNoCallback, IntrNoFree]>; 2453 2454// llvm.amdgcn.update.dpp.i32 <old> <src> <dpp_ctrl> <row_mask> <bank_mask> <bound_ctrl> 2455// Should be equivalent to: 2456// v_mov_b32 <dest> <old> 2457// v_mov_b32 <dest> <src> <dpp_ctrl> <row_mask> <bank_mask> <bound_ctrl> 2458def int_amdgcn_update_dpp : 2459 Intrinsic<[llvm_any_ty], 2460 [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty, 2461 llvm_i32_ty, llvm_i32_ty, llvm_i1_ty], 2462 [IntrNoMem, IntrConvergent, IntrWillReturn, 2463 ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, 2464 ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>, IntrNoCallback, IntrNoFree]>; 2465 2466def int_amdgcn_s_dcache_wb : 2467 ClangBuiltin<"__builtin_amdgcn_s_dcache_wb">, 2468 Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrWillReturn, IntrNoCallback, IntrNoFree]>; 2469 2470def 
int_amdgcn_s_dcache_wb_vol : 2471 ClangBuiltin<"__builtin_amdgcn_s_dcache_wb_vol">, 2472 Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrWillReturn, IntrNoCallback, IntrNoFree]>; 2473 2474def int_amdgcn_s_memrealtime : 2475 ClangBuiltin<"__builtin_amdgcn_s_memrealtime">, 2476 Intrinsic<[llvm_i64_ty], [], [IntrNoMem, IntrHasSideEffects, IntrWillReturn, IntrNoCallback, IntrNoFree]>; 2477 2478// llvm.amdgcn.ds.permute <index> <src> 2479def int_amdgcn_ds_permute : 2480 ClangBuiltin<"__builtin_amdgcn_ds_permute">, 2481 Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], 2482 [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>; 2483 2484// llvm.amdgcn.ds.bpermute <index> <src> 2485def int_amdgcn_ds_bpermute : 2486 ClangBuiltin<"__builtin_amdgcn_ds_bpermute">, 2487 Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], 2488 [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>; 2489 2490// llvm.amdgcn.perm <src0> <src1> <selector> 2491def int_amdgcn_perm : 2492 ClangBuiltin<"__builtin_amdgcn_perm">, 2493 Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], 2494 [IntrNoMem, IntrSpeculatable, IntrWillReturn, IntrNoCallback, IntrNoFree]>; 2495 2496//===----------------------------------------------------------------------===// 2497// GFX9 Intrinsics 2498//===----------------------------------------------------------------------===// 2499 2500class AMDGPUGlobalLoadLDS : Intrinsic < 2501 [], 2502 [LLVMQualPointerType<1>, // Base global pointer to load from 2503 LLVMQualPointerType<3>, // LDS base pointer to store to 2504 llvm_i32_ty, // Data byte size: 1/2/4 2505 llvm_i32_ty, // imm offset (applied to both global and LDS address) 2506 llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = glc/sc0, 2507 // bit 1 = slc/sc1, 2508 // bit 2 = dlc on gfx10/gfx11)) 2509 // bit 4 = scc/nt on gfx90a+)) 2510 // gfx12+: 2511 // cachepolicy (bits [0-2] = th, 2512 // bits [3-4] = scope) 2513 // swizzled buffer (bit 6 = swz), 
2514 [IntrWillReturn, NoCapture<ArgIndex<0>>, NoCapture<ArgIndex<1>>, 2515 ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, IntrNoCallback, IntrNoFree], 2516 "", [SDNPMemOperand]>; 2517def int_amdgcn_global_load_lds : AMDGPUGlobalLoadLDS; 2518 2519//===----------------------------------------------------------------------===// 2520// GFX10 Intrinsics 2521//===----------------------------------------------------------------------===// 2522 2523// llvm.amdgcn.permlane16 <old> <src0> <src1> <src2> <fi> <bound_control> 2524def int_amdgcn_permlane16 : ClangBuiltin<"__builtin_amdgcn_permlane16">, 2525 Intrinsic<[llvm_i32_ty], 2526 [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i1_ty], 2527 [IntrNoMem, IntrConvergent, IntrWillReturn, 2528 ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>, IntrNoCallback, IntrNoFree]>; 2529 2530// llvm.amdgcn.permlanex16 <old> <src0> <src1> <src2> <fi> <bound_control> 2531def int_amdgcn_permlanex16 : ClangBuiltin<"__builtin_amdgcn_permlanex16">, 2532 Intrinsic<[llvm_i32_ty], 2533 [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i1_ty], 2534 [IntrNoMem, IntrConvergent, IntrWillReturn, 2535 ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>, IntrNoCallback, IntrNoFree]>; 2536 2537// llvm.amdgcn.mov.dpp8.i32 <src> <sel> 2538// <sel> is a 32-bit constant whose high 8 bits must be zero which selects 2539// the lanes to read from. 
2540def int_amdgcn_mov_dpp8 : 2541 Intrinsic<[llvm_anyint_ty], 2542 [LLVMMatchType<0>, llvm_i32_ty], 2543 [IntrNoMem, IntrConvergent, IntrWillReturn, 2544 ImmArg<ArgIndex<1>>, IntrNoCallback, IntrNoFree]>; 2545 2546def int_amdgcn_s_get_waveid_in_workgroup : 2547 ClangBuiltin<"__builtin_amdgcn_s_get_waveid_in_workgroup">, 2548 Intrinsic<[llvm_i32_ty], [], 2549 [IntrNoMem, IntrHasSideEffects, IntrWillReturn, IntrNoCallback, IntrNoFree]>; 2550 2551class AMDGPUAtomicRtn<LLVMType vt, LLVMType pt = llvm_anyptr_ty> : Intrinsic < 2552 [vt], 2553 [pt, // vaddr 2554 vt], // vdata(VGPR) 2555 [IntrArgMemOnly, IntrWillReturn, NoCapture<ArgIndex<0>>, IntrNoCallback, IntrNoFree], "", 2556 [SDNPMemOperand]>; 2557 2558def int_amdgcn_global_atomic_csub : AMDGPUAtomicRtn<llvm_i32_ty>; 2559 2560// uint4 llvm.amdgcn.image.bvh.intersect.ray <node_ptr>, <ray_extent>, <ray_origin>, 2561// <ray_dir>, <ray_inv_dir>, <texture_descr> 2562// <node_ptr> is i32 or i64. 2563// <ray_dir> and <ray_inv_dir> are both v3f16 or both v3f32. 
2564def int_amdgcn_image_bvh_intersect_ray : 2565 DefaultAttrsIntrinsic<[llvm_v4i32_ty], 2566 [llvm_anyint_ty, llvm_float_ty, llvm_v3f32_ty, llvm_anyvector_ty, 2567 LLVMMatchType<1>, llvm_v4i32_ty], 2568 [IntrReadMem, IntrWillReturn, IntrNoCallback, IntrNoFree]>; 2569 2570//===----------------------------------------------------------------------===// 2571// GFX11 Intrinsics 2572//===----------------------------------------------------------------------===// 2573 2574// llvm.amdgcn.permlane64 <src0> 2575def int_amdgcn_permlane64 : 2576 ClangBuiltin<"__builtin_amdgcn_permlane64">, 2577 Intrinsic<[llvm_i32_ty], [llvm_i32_ty], 2578 [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>; 2579 2580def int_amdgcn_ds_add_gs_reg_rtn : 2581 ClangBuiltin<"__builtin_amdgcn_ds_add_gs_reg_rtn">, 2582 Intrinsic<[llvm_anyint_ty], [llvm_i32_ty, llvm_i32_ty], 2583 [ImmArg<ArgIndex<1>>, IntrHasSideEffects, IntrWillReturn, IntrNoCallback, IntrNoFree], 2584 "", [SDNPMemOperand]>; 2585 2586def int_amdgcn_ds_sub_gs_reg_rtn : 2587 ClangBuiltin<"__builtin_amdgcn_ds_sub_gs_reg_rtn">, 2588 Intrinsic<[llvm_anyint_ty], [llvm_i32_ty, llvm_i32_ty], 2589 [ImmArg<ArgIndex<1>>, IntrHasSideEffects, IntrWillReturn, IntrNoCallback, IntrNoFree], 2590 "", [SDNPMemOperand]>; 2591 2592def int_amdgcn_ds_bvh_stack_rtn : 2593 Intrinsic< 2594 [llvm_i32_ty, llvm_i32_ty], // %vdst, %addr 2595 [ 2596 llvm_i32_ty, // %addr 2597 llvm_i32_ty, // %data0 2598 llvm_v4i32_ty, // %data1 2599 llvm_i32_ty, // %offset 2600 ], 2601 [ImmArg<ArgIndex<3>>, IntrWillReturn, IntrNoCallback, IntrNoFree] 2602 >; 2603 2604def int_amdgcn_s_wait_event_export_ready : 2605 ClangBuiltin<"__builtin_amdgcn_s_wait_event_export_ready">, 2606 Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrWillReturn] 2607>; 2608 2609// WMMA (Wave Matrix Multiply-Accumulate) intrinsics 2610// 2611// These operations perform a matrix multiplication and accumulation of 2612// the form: D = A * B + C . 
2613 2614class AMDGPUWmmaIntrinsic<LLVMType AB, LLVMType CD> : 2615 Intrinsic< 2616 [CD], // %D 2617 [ 2618 AB, // %A 2619 LLVMMatchType<1>, // %B 2620 LLVMMatchType<0>, // %C 2621 ], 2622 [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree] 2623>; 2624 2625class AMDGPUWmmaIntrinsicOPSEL<LLVMType AB, LLVMType CD> : 2626 Intrinsic< 2627 [CD], // %D 2628 [ 2629 AB, // %A 2630 LLVMMatchType<1>, // %B 2631 LLVMMatchType<0>, // %C 2632 llvm_i1_ty, // %high (op_sel) for GFX11, 0 for GFX12 2633 ], 2634 [IntrNoMem, IntrConvergent, ImmArg<ArgIndex<3>>, IntrWillReturn, IntrNoCallback, IntrNoFree] 2635>; 2636 2637class AMDGPUWmmaIntrinsicIU<LLVMType AB, LLVMType CD> : 2638 Intrinsic< 2639 [CD], // %D 2640 [ 2641 llvm_i1_ty, // %A_sign 2642 AB, // %A 2643 llvm_i1_ty, // %B_sign 2644 LLVMMatchType<1>, // %B 2645 LLVMMatchType<0>, // %C 2646 llvm_i1_ty, // %clamp 2647 ], 2648 [IntrNoMem, IntrConvergent, ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<5>>, IntrWillReturn, IntrNoCallback, IntrNoFree] 2649>; 2650 2651// WMMA GFX11Only 2652 2653// The OPSEL intrinsics read from and write to one half of the registers, selected by the op_sel bit. 2654// The tied versions of the f16/bf16 wmma intrinsics tie the destination matrix registers to the input accumulator registers. 2655// The content of the other 16-bit half is preserved from the input. 
2656def int_amdgcn_wmma_f16_16x16x16_f16_tied : AMDGPUWmmaIntrinsicOPSEL<llvm_anyfloat_ty, llvm_anyfloat_ty>; 2657def int_amdgcn_wmma_bf16_16x16x16_bf16_tied : AMDGPUWmmaIntrinsicOPSEL<llvm_anyint_ty, llvm_anyint_ty>; 2658 2659// WMMA GFX11Plus 2660 2661def int_amdgcn_wmma_f32_16x16x16_f16 : AMDGPUWmmaIntrinsic<llvm_anyfloat_ty, llvm_anyfloat_ty>; 2662def int_amdgcn_wmma_f32_16x16x16_bf16 : AMDGPUWmmaIntrinsic<llvm_anyint_ty, llvm_anyfloat_ty>; 2663def int_amdgcn_wmma_i32_16x16x16_iu8 : AMDGPUWmmaIntrinsicIU<llvm_anyint_ty, llvm_anyint_ty>; 2664def int_amdgcn_wmma_i32_16x16x16_iu4 : AMDGPUWmmaIntrinsicIU<llvm_anyint_ty, llvm_anyint_ty>; 2665 2666// GFX11: The OPSEL intrinsics read from and write to one half of the registers, selected by the op_sel bit. 2667// The content of the other 16-bit half is undefined. 2668// GFX12: The op_sel bit must be 0. 2669def int_amdgcn_wmma_f16_16x16x16_f16 : AMDGPUWmmaIntrinsicOPSEL<llvm_anyfloat_ty, llvm_anyfloat_ty>; 2670def int_amdgcn_wmma_bf16_16x16x16_bf16 : AMDGPUWmmaIntrinsicOPSEL<llvm_anyint_ty, llvm_anyint_ty>; 2671 2672//===----------------------------------------------------------------------===// 2673// GFX12 Intrinsics 2674//===----------------------------------------------------------------------===// 2675 2676// llvm.amdgcn.permlane16.var <old> <src0> <src1> <fi> <bound_control> 2677def int_amdgcn_permlane16_var : ClangBuiltin<"__builtin_amdgcn_permlane16_var">, 2678 Intrinsic<[llvm_i32_ty], 2679 [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i1_ty], 2680 [IntrNoMem, IntrConvergent, IntrWillReturn, 2681 ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, IntrNoCallback, IntrNoFree]>; 2682 2683// llvm.amdgcn.permlanex16.var <old> <src0> <src1> <fi> <bound_control> 2684def int_amdgcn_permlanex16_var : ClangBuiltin<"__builtin_amdgcn_permlanex16_var">, 2685 Intrinsic<[llvm_i32_ty], 2686 [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i1_ty], 2687 [IntrNoMem, IntrConvergent, IntrWillReturn, 2688 
ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, IntrNoCallback, IntrNoFree]>; 2689 2690 2691// WMMA (Wave Matrix Multiply-Accumulate) intrinsics 2692// 2693// These operations perform a matrix multiplication and accumulation of 2694// the form: D = A * B + C . 2695 2696// A and B are <8 x fp8> or <8 x bf8>, but since fp8 and bf8 are not supported by llvm we use <2 x i32>. 2697def int_amdgcn_wmma_f32_16x16x16_fp8_fp8 : AMDGPUWmmaIntrinsic<llvm_anyint_ty, llvm_anyfloat_ty>; 2698def int_amdgcn_wmma_f32_16x16x16_fp8_bf8 : AMDGPUWmmaIntrinsic<llvm_anyint_ty, llvm_anyfloat_ty>; 2699def int_amdgcn_wmma_f32_16x16x16_bf8_fp8 : AMDGPUWmmaIntrinsic<llvm_anyint_ty, llvm_anyfloat_ty>; 2700def int_amdgcn_wmma_f32_16x16x16_bf8_bf8 : AMDGPUWmmaIntrinsic<llvm_anyint_ty, llvm_anyfloat_ty>; 2701// A and B are <16 x iu4>. 2702def int_amdgcn_wmma_i32_16x16x32_iu4 : AMDGPUWmmaIntrinsicIU<llvm_anyint_ty, llvm_anyint_ty>; 2703 2704// SWMMAC (Wave Matrix(sparse) Multiply-Accumulate) intrinsics 2705// 2706// These operations perform a sparse matrix multiplication and accumulation of 2707// the form: D = A * B + C. 2708// A is sparse matrix, half the size of B, and is expanded using sparsity index. 
2709 2710class AMDGPUSWmmacIntrinsicIdx<LLVMType A, LLVMType B, LLVMType CD, LLVMType Index> : 2711 Intrinsic< 2712 [CD], // %D 2713 [ 2714 A, // %A 2715 B, // %B 2716 LLVMMatchType<0>, // %C 2717 Index // %Sparsity index for A 2718 ], 2719 [IntrNoMem, IntrConvergent, IntrWillReturn] 2720>; 2721 2722class AMDGPUSWmmacIntrinsicIUIdx<LLVMType A, LLVMType B, LLVMType CD, LLVMType Index> : 2723 Intrinsic< 2724 [CD], // %D 2725 [ 2726 llvm_i1_ty, // %A_sign 2727 A, // %A 2728 llvm_i1_ty, // %B_sign 2729 B, // %B 2730 LLVMMatchType<0>, // %C 2731 Index, // %Sparsity index for A 2732 llvm_i1_ty, // %clamp 2733 ], 2734 [IntrNoMem, IntrConvergent, IntrWillReturn, ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<6>>] 2735>; 2736 2737def int_amdgcn_swmmac_f32_16x16x32_f16 : AMDGPUSWmmacIntrinsicIdx<llvm_anyfloat_ty, llvm_anyfloat_ty, llvm_anyfloat_ty, llvm_anyint_ty>; 2738def int_amdgcn_swmmac_f32_16x16x32_bf16 : AMDGPUSWmmacIntrinsicIdx<llvm_anyint_ty, llvm_anyint_ty, llvm_anyfloat_ty, llvm_anyint_ty>; 2739def int_amdgcn_swmmac_f16_16x16x32_f16 : AMDGPUSWmmacIntrinsicIdx<llvm_anyfloat_ty, llvm_anyfloat_ty, llvm_anyfloat_ty, llvm_anyint_ty>; 2740def int_amdgcn_swmmac_bf16_16x16x32_bf16 : AMDGPUSWmmacIntrinsicIdx<llvm_anyint_ty, llvm_anyint_ty, llvm_anyint_ty, llvm_anyint_ty>; 2741def int_amdgcn_swmmac_i32_16x16x32_iu8 : AMDGPUSWmmacIntrinsicIUIdx<llvm_anyint_ty, llvm_anyint_ty, llvm_anyint_ty, llvm_anyint_ty>; 2742def int_amdgcn_swmmac_i32_16x16x32_iu4 : AMDGPUSWmmacIntrinsicIUIdx<llvm_anyint_ty, llvm_anyint_ty, llvm_anyint_ty, llvm_anyint_ty>; 2743def int_amdgcn_swmmac_i32_16x16x64_iu4 : AMDGPUSWmmacIntrinsicIUIdx<llvm_anyint_ty, llvm_anyint_ty, llvm_anyint_ty, llvm_anyint_ty>; 2744def int_amdgcn_swmmac_f32_16x16x32_fp8_fp8 : AMDGPUSWmmacIntrinsicIdx<llvm_anyint_ty, llvm_anyint_ty, llvm_anyfloat_ty, llvm_anyint_ty>; 2745def int_amdgcn_swmmac_f32_16x16x32_fp8_bf8 : AMDGPUSWmmacIntrinsicIdx<llvm_anyint_ty, llvm_anyint_ty, llvm_anyfloat_ty, llvm_anyint_ty>; 2746def 
int_amdgcn_swmmac_f32_16x16x32_bf8_fp8 : AMDGPUSWmmacIntrinsicIdx<llvm_anyint_ty, llvm_anyint_ty, llvm_anyfloat_ty, llvm_anyint_ty>; 2747def int_amdgcn_swmmac_f32_16x16x32_bf8_bf8 : AMDGPUSWmmacIntrinsicIdx<llvm_anyint_ty, llvm_anyint_ty, llvm_anyfloat_ty, llvm_anyint_ty>; 2748 2749def int_amdgcn_global_atomic_ordered_add_b64 : AMDGPUAtomicRtn<llvm_i64_ty, global_ptr_ty>; 2750 2751def int_amdgcn_flat_atomic_fmin_num : AMDGPUAtomicRtn<llvm_anyfloat_ty>; 2752def int_amdgcn_flat_atomic_fmax_num : AMDGPUAtomicRtn<llvm_anyfloat_ty>; 2753def int_amdgcn_global_atomic_fmin_num : AMDGPUAtomicRtn<llvm_anyfloat_ty>; 2754def int_amdgcn_global_atomic_fmax_num : AMDGPUAtomicRtn<llvm_anyfloat_ty>; 2755 2756def int_amdgcn_atomic_cond_sub_u32 : AMDGPUAtomicRtn<llvm_i32_ty>; 2757 2758class AMDGPULoadIntrinsic<LLVMType ptr_ty>: 2759 Intrinsic< 2760 [llvm_any_ty], 2761 [ptr_ty], 2762 [IntrReadMem, IntrWillReturn, IntrConvergent, NoCapture<ArgIndex<0>>, IntrNoCallback, IntrNoFree], 2763 "", 2764 [SDNPMemOperand] 2765 >; 2766 2767// Wave32 2768// <2 x i32> @llvm.amdgcn.global.load.tr.v2i32(ptr addrspace(1)) -> global_load_tr_b64 2769// <8 x i16> @llvm.amdgcn.global.load.tr.v8i16(ptr addrspace(1)) -> global_load_tr_b128 2770// <8 x half> @llvm.amdgcn.global.load.tr.v8f16(ptr addrspace(1)) -> global_load_tr_b128 2771// <8 x bfloat> @llvm.amdgcn.global.load.tr.v8bf16(ptr addrspace(1)) -> global_load_tr_b128 2772// Wave64 2773// i32 @llvm.amdgcn.global.load.tr.i32(ptr addrspace(1)) -> global_load_tr_b64 2774// <4 x i16> @llvm.amdgcn.global.load.tr.v4i16(ptr addrspace(1)) -> global_load_tr_b128 2775// <4 x half> @llvm.amdgcn.global.load.tr.v4f16(ptr addrspace(1)) -> global_load_tr_b128 2776// <4 x bfloat> @llvm.amdgcn.global.load.tr.v4bf16(ptr addrspace(1)) -> global_load_tr_b128 2777 2778def int_amdgcn_global_load_tr : AMDGPULoadIntrinsic<global_ptr_ty>; 2779 2780// i32 @llvm.amdgcn.wave.id() 2781def int_amdgcn_wave_id : 2782 DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, 
IntrSpeculatable]>; 2783 2784//===----------------------------------------------------------------------===// 2785// Deep learning intrinsics. 2786//===----------------------------------------------------------------------===// 2787 2788// f32 %r = llvm.amdgcn.fdot2(v2f16 %a, v2f16 %b, f32 %c, i1 %clamp) 2789// %r = %a[0] * %b[0] + %a[1] * %b[1] + %c 2790def int_amdgcn_fdot2 : 2791 ClangBuiltin<"__builtin_amdgcn_fdot2">, 2792 DefaultAttrsIntrinsic< 2793 [llvm_float_ty], // %r 2794 [ 2795 llvm_v2f16_ty, // %a 2796 llvm_v2f16_ty, // %b 2797 llvm_float_ty, // %c 2798 llvm_i1_ty // %clamp 2799 ], 2800 [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<3>>] 2801 >; 2802 2803// f16 %r = llvm.amdgcn.fdot2.f16.f16(v2f16 %a, v2f16 %b, f16 %c) 2804// %r = %a[0] * %b[0] + %a[1] * %b[1] + %c 2805def int_amdgcn_fdot2_f16_f16 : 2806 ClangBuiltin<"__builtin_amdgcn_fdot2_f16_f16">, 2807 DefaultAttrsIntrinsic< 2808 [llvm_half_ty], // %r 2809 [ 2810 llvm_v2f16_ty, // %a 2811 llvm_v2f16_ty, // %b 2812 llvm_half_ty // %c 2813 ], 2814 [IntrNoMem, IntrSpeculatable] 2815 >; 2816 2817// bf16 %r = llvm.amdgcn.fdot2.bf16.bf16(v2bf16 %a, v2bf16 %b, bf16 %c) 2818// %r = %a[0] * %b[0] + %a[1] * %b[1] + %c 2819def int_amdgcn_fdot2_bf16_bf16 : 2820 ClangBuiltin<"__builtin_amdgcn_fdot2_bf16_bf16">, 2821 DefaultAttrsIntrinsic< 2822 [llvm_bfloat_ty], // %r 2823 [ 2824 llvm_v2bf16_ty, // %a 2825 llvm_v2bf16_ty, // %b 2826 llvm_bfloat_ty // %c 2827 ], 2828 [IntrNoMem, IntrSpeculatable] 2829 >; 2830 2831// f32 %r = llvm.amdgcn.fdot2.f32.bf16(v2bf16 %a, v2bf16 %b, f32 %c, i1 %clamp) 2832// %r = %a[0] * %b[0] + %a[1] * %b[1] + %c 2833def int_amdgcn_fdot2_f32_bf16 : 2834 ClangBuiltin<"__builtin_amdgcn_fdot2_f32_bf16">, 2835 DefaultAttrsIntrinsic< 2836 [llvm_float_ty], // %r 2837 [ 2838 llvm_v2bf16_ty, // %a 2839 llvm_v2bf16_ty, // %b 2840 llvm_float_ty, // %c 2841 llvm_i1_ty // %clamp 2842 ], 2843 [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<3>>] 2844 >; 2845 2846// i32 %r = llvm.amdgcn.sdot2(v2i16 %a, v2i16 
%b, i32 %c, i1 %clamp) 2847// %r = %a[0] * %b[0] + %a[1] * %b[1] + %c 2848def int_amdgcn_sdot2 : 2849 ClangBuiltin<"__builtin_amdgcn_sdot2">, 2850 DefaultAttrsIntrinsic< 2851 [llvm_i32_ty], // %r 2852 [ 2853 llvm_v2i16_ty, // %a 2854 llvm_v2i16_ty, // %b 2855 llvm_i32_ty, // %c 2856 llvm_i1_ty // %clamp 2857 ], 2858 [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<3>>] 2859 >; 2860 2861// u32 %r = llvm.amdgcn.udot2(v2u16 %a, v2u16 %b, u32 %c, i1 %clamp) 2862// %r = %a[0] * %b[0] + %a[1] * %b[1] + %c 2863def int_amdgcn_udot2 : 2864 ClangBuiltin<"__builtin_amdgcn_udot2">, 2865 DefaultAttrsIntrinsic< 2866 [llvm_i32_ty], // %r 2867 [ 2868 llvm_v2i16_ty, // %a 2869 llvm_v2i16_ty, // %b 2870 llvm_i32_ty, // %c 2871 llvm_i1_ty // %clamp 2872 ], 2873 [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<3>>] 2874 >; 2875 2876// i32 %r = llvm.amdgcn.sdot4(v4i8 (as i32) %a, v4i8 (as i32) %b, i32 %c, i1 %clamp) 2877// %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] + %c 2878def int_amdgcn_sdot4 : 2879 ClangBuiltin<"__builtin_amdgcn_sdot4">, 2880 DefaultAttrsIntrinsic< 2881 [llvm_i32_ty], // %r 2882 [ 2883 llvm_i32_ty, // %a 2884 llvm_i32_ty, // %b 2885 llvm_i32_ty, // %c 2886 llvm_i1_ty // %clamp 2887 ], 2888 [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<3>>] 2889 >; 2890 2891// u32 %r = llvm.amdgcn.udot4(v4u8 (as u32) %a, v4u8 (as u32) %b, u32 %c, i1 %clamp) 2892// %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] + %c 2893def int_amdgcn_udot4 : 2894 ClangBuiltin<"__builtin_amdgcn_udot4">, 2895 DefaultAttrsIntrinsic< 2896 [llvm_i32_ty], // %r 2897 [ 2898 llvm_i32_ty, // %a 2899 llvm_i32_ty, // %b 2900 llvm_i32_ty, // %c 2901 llvm_i1_ty // %clamp 2902 ], 2903 [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<3>>] 2904 >; 2905 2906// i32 %r = llvm.amdgcn.sudot4(i1 %a_sign, v4i8 (as i32) %a, i1 %b_sign, v4i8 (as i32) %b, i32 %c, i1 %clamp) 2907// Treat input as signed (_sign = 1) or unsigned (_sign = 0). 2908// a[i in 0. . . 3] = (%a_sign ? 
a.i8[i] : promoteToSigned(a.u8[i])); 2909// b[i in 0. . . 3] = (%b_sign ? b.i8[i] : promoteToSigned(b.u8[i])); 2910// %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] + %c 2911def int_amdgcn_sudot4 : 2912 ClangBuiltin<"__builtin_amdgcn_sudot4">, 2913 DefaultAttrsIntrinsic< 2914 [llvm_i32_ty], // %r 2915 [ 2916 llvm_i1_ty, // %a_sign 2917 llvm_i32_ty, // %a 2918 llvm_i1_ty, // %b_sign 2919 llvm_i32_ty, // %b 2920 llvm_i32_ty, // %c 2921 llvm_i1_ty // %clamp 2922 ], 2923 [IntrNoMem, IntrSpeculatable, 2924 ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<5>>] 2925 >; 2926 2927// i32 %r = llvm.amdgcn.sdot8(v8i4 (as i32) %a, v8i4 (as i32) %b, i32 %c, i1 %clamp) 2928// %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] + 2929// %a[4] * %b[4] + %a[5] * %b[5] + %a[6] * %b[6] + %a[7] * %b[7] + %c 2930def int_amdgcn_sdot8 : 2931 ClangBuiltin<"__builtin_amdgcn_sdot8">, 2932 DefaultAttrsIntrinsic< 2933 [llvm_i32_ty], // %r 2934 [ 2935 llvm_i32_ty, // %a 2936 llvm_i32_ty, // %b 2937 llvm_i32_ty, // %c 2938 llvm_i1_ty // %clamp 2939 ], 2940 [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<3>>] 2941 >; 2942 2943// u32 %r = llvm.amdgcn.udot8(v8u4 (as u32) %a, v8u4 (as u32) %b, u32 %c, i1 %clamp) 2944// %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] + 2945// %a[4] * %b[4] + %a[5] * %b[5] + %a[6] * %b[6] + %a[7] * %b[7] + %c 2946def int_amdgcn_udot8 : 2947 ClangBuiltin<"__builtin_amdgcn_udot8">, 2948 DefaultAttrsIntrinsic< 2949 [llvm_i32_ty], // %r 2950 [ 2951 llvm_i32_ty, // %a 2952 llvm_i32_ty, // %b 2953 llvm_i32_ty, // %c 2954 llvm_i1_ty // %clamp 2955 ], 2956 [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<3>>] 2957 >; 2958 2959// i32 %r = llvm.amdgcn.sudot8(i1 %a_sign, v8i4 (as i32) %a, i1 %b_sign, v8i4 (as i32) %b, i32 %c, i1 %clamp) 2960// Treat input as signed (_sign = 1) or unsigned (_sign = 0). 2961// a[i in 0. . . 7] = (%a_sign ? a.i4[i] : promoteToSigned(a.u4[i])); 2962// b[i in 0. . . 7] = (%b_sign ? 
b.i4[i] : promoteToSigned(b.u4[i])); 2963// %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] + 2964// %a[4] * %b[4] + %a[5] * %b[5] + %a[6] * %b[6] + %a[7] * %b[7] + %c 2965def int_amdgcn_sudot8 : 2966 ClangBuiltin<"__builtin_amdgcn_sudot8">, 2967 DefaultAttrsIntrinsic< 2968 [llvm_i32_ty], // %r 2969 [ 2970 llvm_i1_ty, // %a_sign 2971 llvm_i32_ty, // %a 2972 llvm_i1_ty, // %b_sign 2973 llvm_i32_ty, // %b 2974 llvm_i32_ty, // %c 2975 llvm_i1_ty // %clamp 2976 ], 2977 [IntrNoMem, IntrSpeculatable, 2978 ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<5>>] 2979 >; 2980 2981// f32 %r = llvm.amdgcn.dot4.f32.type_a.type_b (v4type_a (as i32) %a, v4type_b (as i32) %b, f32 %c) 2982// %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] + %c 2983class AMDGPU8bitFloatDot4Intrinsic : 2984 ClangBuiltin<!subst("int", "__builtin", NAME)>, 2985 DefaultAttrsIntrinsic< 2986 [llvm_float_ty], // %r 2987 [ 2988 llvm_i32_ty, // %a 2989 llvm_i32_ty, // %b 2990 llvm_float_ty, // %c 2991 ], 2992 [IntrNoMem, IntrSpeculatable] 2993 >; 2994 2995def int_amdgcn_dot4_f32_fp8_bf8 : AMDGPU8bitFloatDot4Intrinsic; 2996def int_amdgcn_dot4_f32_bf8_fp8 : AMDGPU8bitFloatDot4Intrinsic; 2997def int_amdgcn_dot4_f32_fp8_fp8 : AMDGPU8bitFloatDot4Intrinsic; 2998def int_amdgcn_dot4_f32_bf8_bf8 : AMDGPU8bitFloatDot4Intrinsic; 2999 3000//===----------------------------------------------------------------------===// 3001// gfx908 intrinsics 3002//===----------------------------------------------------------------------===// 3003 3004def int_amdgcn_global_atomic_fadd : AMDGPUAtomicRtn<llvm_anyfloat_ty>; 3005 3006// llvm.amdgcn.mfma.*.* vdst, srcA, srcB, srcC, cbsz, abid, blgp 3007class AMDGPUMfmaIntrinsic<LLVMType DestTy, LLVMType SrcABTy> : 3008 ClangBuiltin<!subst("int", "__builtin", NAME)>, 3009 DefaultAttrsIntrinsic<[DestTy], 3010 [SrcABTy, SrcABTy, DestTy, 3011 llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], 3012 [IntrConvergent, IntrNoMem, 3013 ImmArg<ArgIndex<3>>, 
ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>]>; 3014 3015def int_amdgcn_mfma_f32_32x32x1f32 : AMDGPUMfmaIntrinsic<llvm_v32f32_ty, llvm_float_ty>; 3016def int_amdgcn_mfma_f32_16x16x1f32 : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_float_ty>; 3017def int_amdgcn_mfma_f32_4x4x1f32 : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_float_ty>; 3018def int_amdgcn_mfma_f32_32x32x2f32 : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_float_ty>; 3019def int_amdgcn_mfma_f32_16x16x4f32 : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_float_ty>; 3020def int_amdgcn_mfma_f32_32x32x4f16 : AMDGPUMfmaIntrinsic<llvm_v32f32_ty, llvm_v4f16_ty>; 3021def int_amdgcn_mfma_f32_16x16x4f16 : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v4f16_ty>; 3022def int_amdgcn_mfma_f32_4x4x4f16 : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_v4f16_ty>; 3023def int_amdgcn_mfma_f32_32x32x8f16 : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v4f16_ty>; 3024def int_amdgcn_mfma_f32_16x16x16f16 : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_v4f16_ty>; 3025def int_amdgcn_mfma_i32_32x32x4i8 : AMDGPUMfmaIntrinsic<llvm_v32i32_ty, llvm_i32_ty>; 3026def int_amdgcn_mfma_i32_16x16x4i8 : AMDGPUMfmaIntrinsic<llvm_v16i32_ty, llvm_i32_ty>; 3027def int_amdgcn_mfma_i32_4x4x4i8 : AMDGPUMfmaIntrinsic<llvm_v4i32_ty, llvm_i32_ty>; 3028def int_amdgcn_mfma_i32_32x32x8i8 : AMDGPUMfmaIntrinsic<llvm_v16i32_ty, llvm_i32_ty>; 3029def int_amdgcn_mfma_i32_16x16x16i8 : AMDGPUMfmaIntrinsic<llvm_v4i32_ty, llvm_i32_ty>; 3030def int_amdgcn_mfma_f32_32x32x2bf16 : AMDGPUMfmaIntrinsic<llvm_v32f32_ty, llvm_v2i16_ty>; 3031def int_amdgcn_mfma_f32_16x16x2bf16 : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v2i16_ty>; 3032def int_amdgcn_mfma_f32_4x4x2bf16 : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_v2i16_ty>; 3033def int_amdgcn_mfma_f32_32x32x4bf16 : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v2i16_ty>; 3034def int_amdgcn_mfma_f32_16x16x8bf16 : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_v2i16_ty>; 3035 3036//===----------------------------------------------------------------------===// 3037// gfx90a intrinsics 
//===----------------------------------------------------------------------===//

def int_amdgcn_global_atomic_fmin : AMDGPUAtomicRtn<llvm_anyfloat_ty>;
def int_amdgcn_global_atomic_fmax : AMDGPUAtomicRtn<llvm_anyfloat_ty>;
def int_amdgcn_flat_atomic_fadd : AMDGPUAtomicRtn<llvm_anyfloat_ty>;
def int_amdgcn_flat_atomic_fmin : AMDGPUAtomicRtn<llvm_anyfloat_ty>;
def int_amdgcn_flat_atomic_fmax : AMDGPUAtomicRtn<llvm_anyfloat_ty>;

// "_1k" bf16 MFMA variants; bf16 sources are passed as v4i16.
def int_amdgcn_mfma_f32_32x32x4bf16_1k : AMDGPUMfmaIntrinsic<llvm_v32f32_ty, llvm_v4i16_ty>;
def int_amdgcn_mfma_f32_16x16x4bf16_1k : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v4i16_ty>;
def int_amdgcn_mfma_f32_4x4x4bf16_1k : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_v4i16_ty>;
def int_amdgcn_mfma_f32_32x32x8bf16_1k : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v4i16_ty>;
def int_amdgcn_mfma_f32_16x16x16bf16_1k : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_v4i16_ty>;

// Note: in gfx940 BLGP argument is replaced by NEG bitfield in the DGEMM MFMA.
// Three bits corresponding to the neg modifier applied to the respective
// source operand.
def int_amdgcn_mfma_f64_16x16x4f64 : AMDGPUMfmaIntrinsic<llvm_v4f64_ty, llvm_double_ty>;
def int_amdgcn_mfma_f64_4x4x4f64 : AMDGPUMfmaIntrinsic<llvm_double_ty, llvm_double_ty>;

//===----------------------------------------------------------------------===//
// gfx940 intrinsics
//===----------------------------------------------------------------------===//

// bf16 atomics use v2i16 argument since there is no bf16 data type in the llvm.
def int_amdgcn_global_atomic_fadd_v2bf16 : AMDGPUAtomicRtn<llvm_v2i16_ty>;
def int_amdgcn_flat_atomic_fadd_v2bf16 : AMDGPUAtomicRtn<llvm_v2i16_ty>;
// LDS (address space 3) bf16 fadd; only touches memory through its pointer
// argument (IntrArgMemOnly) and does not capture it.
def int_amdgcn_ds_fadd_v2bf16 : DefaultAttrsIntrinsic<
    [llvm_v2i16_ty],
    [LLVMQualPointerType<3>, llvm_v2i16_ty],
    [IntrArgMemOnly, NoCapture<ArgIndex<0>>]>,
  ClangBuiltin<"__builtin_amdgcn_ds_atomic_fadd_v2bf16">;

def int_amdgcn_mfma_i32_16x16x32_i8 : AMDGPUMfmaIntrinsic<llvm_v4i32_ty, llvm_i64_ty>;
def int_amdgcn_mfma_i32_32x32x16_i8 : AMDGPUMfmaIntrinsic<llvm_v16i32_ty, llvm_i64_ty>;
def int_amdgcn_mfma_f32_16x16x8_xf32 : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_v2f32_ty>;
def int_amdgcn_mfma_f32_32x32x4_xf32 : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v2f32_ty>;

// 8-bit float MFMA: sources are packed into an i64.
// (TableGen keeps class and multiclass names in separate namespaces, so the
// class below and the multiclass of the same name may coexist.)
class AMDGPUMFp8MfmaIntrinsic<LLVMType DestTy> :
  AMDGPUMfmaIntrinsic<DestTy, llvm_i64_ty>;

// Expands NAME into the four fp8/bf8 source-format combinations.
multiclass AMDGPUMFp8MfmaIntrinsic<LLVMType DestTy> {
  foreach kind = ["bf8_bf8", "bf8_fp8", "fp8_bf8", "fp8_fp8"] in
    def NAME#"_"#kind : AMDGPUMFp8MfmaIntrinsic<DestTy>;
}

defm int_amdgcn_mfma_f32_16x16x32 : AMDGPUMFp8MfmaIntrinsic<llvm_v4f32_ty>;
defm int_amdgcn_mfma_f32_32x32x16 : AMDGPUMFp8MfmaIntrinsic<llvm_v16f32_ty>;

// llvm.amdgcn.smfmac.?32.* vdst, srcA, srcB, srcC, index, cbsz, abid
// Sparse MFMA with an extra i32 index operand; cbsz/abid (args 4-5) must be
// immediates (ImmArg).
class AMDGPUMSmfmacIntrinsic<LLVMType DestTy, LLVMType SrcA, LLVMType SrcB> :
  ClangBuiltin<!subst("int", "__builtin", NAME)>,
  DefaultAttrsIntrinsic<[DestTy],
                        [SrcA, SrcB, DestTy, llvm_i32_ty,
                         llvm_i32_ty, llvm_i32_ty],
                        [IntrConvergent, IntrNoMem,
                         ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>]>;

def int_amdgcn_smfmac_f32_16x16x32_f16 : AMDGPUMSmfmacIntrinsic<llvm_v4f32_ty, llvm_v4f16_ty, llvm_v8f16_ty>;
def int_amdgcn_smfmac_f32_32x32x16_f16 : AMDGPUMSmfmacIntrinsic<llvm_v16f32_ty, llvm_v4f16_ty, llvm_v8f16_ty>;
def int_amdgcn_smfmac_f32_16x16x32_bf16 : AMDGPUMSmfmacIntrinsic<llvm_v4f32_ty, llvm_v4i16_ty, llvm_v8i16_ty>;
def int_amdgcn_smfmac_f32_32x32x16_bf16 : AMDGPUMSmfmacIntrinsic<llvm_v16f32_ty, llvm_v4i16_ty, llvm_v8i16_ty>;
def int_amdgcn_smfmac_i32_16x16x64_i8 : AMDGPUMSmfmacIntrinsic<llvm_v4i32_ty, llvm_v2i32_ty, llvm_v4i32_ty>;
def int_amdgcn_smfmac_i32_32x32x32_i8 : AMDGPUMSmfmacIntrinsic<llvm_v16i32_ty, llvm_v2i32_ty, llvm_v4i32_ty>;

// 8-bit float SMFMAC: sources are packed into v2i32 (A) and v4i32 (B).
class AMDGPUMFp8SmfmacIntrinsic<LLVMType DestTy> :
  AMDGPUMSmfmacIntrinsic<DestTy, llvm_v2i32_ty, llvm_v4i32_ty>;

// Expands NAME into the four fp8/bf8 source-format combinations.
multiclass AMDGPUMFp8SmfmacIntrinsic<LLVMType DestTy> {
  foreach kind = ["bf8_bf8", "bf8_fp8", "fp8_bf8", "fp8_fp8"] in
    def NAME#"_"#kind : AMDGPUMFp8SmfmacIntrinsic<DestTy>;
}

defm int_amdgcn_smfmac_f32_16x16x64 : AMDGPUMFp8SmfmacIntrinsic<llvm_v4f32_ty>;
defm int_amdgcn_smfmac_f32_32x32x32 : AMDGPUMFp8SmfmacIntrinsic<llvm_v16f32_ty>;

// llvm.amdgcn.cvt.f32.bf8 float vdst, int srcA, imm byte_sel [0..3]
// byte_sel selects byte from srcA.
def int_amdgcn_cvt_f32_bf8 : ClangBuiltin<"__builtin_amdgcn_cvt_f32_bf8">,
  DefaultAttrsIntrinsic<[llvm_float_ty],
                        [llvm_i32_ty, llvm_i32_ty],
                        [IntrNoMem, ImmArg<ArgIndex<1>>]>;

// llvm.amdgcn.cvt.f32.fp8 float vdst, int srcA, imm byte_sel [0..3]
def int_amdgcn_cvt_f32_fp8 : ClangBuiltin<"__builtin_amdgcn_cvt_f32_fp8">,
  DefaultAttrsIntrinsic<[llvm_float_ty],
                        [llvm_i32_ty, llvm_i32_ty],
                        [IntrNoMem, ImmArg<ArgIndex<1>>]>;

// llvm.amdgcn.cvt.pk.f32.bf8 float2 vdst, int srcA, imm word_sel
// word_sel = 1 selects 2 high bytes, 0 selects 2 low bytes.
def int_amdgcn_cvt_pk_f32_bf8 : ClangBuiltin<"__builtin_amdgcn_cvt_pk_f32_bf8">,
  DefaultAttrsIntrinsic<[llvm_v2f32_ty],
                        [llvm_i32_ty, llvm_i1_ty],
                        [IntrNoMem, ImmArg<ArgIndex<1>>]>;

// llvm.amdgcn.cvt.pk.f32.fp8 float2 vdst, int srcA, imm word_sel.
def int_amdgcn_cvt_pk_f32_fp8 : ClangBuiltin<"__builtin_amdgcn_cvt_pk_f32_fp8">,
  DefaultAttrsIntrinsic<[llvm_v2f32_ty],
                        [llvm_i32_ty, llvm_i1_ty],
                        [IntrNoMem, ImmArg<ArgIndex<1>>]>;

// llvm.amdgcn.cvt.pk.bf8.f32 int vdst, float srcA, float srcB, int old, imm word_sel
// word_sel = 1 selects 2 high bytes in the vdst, 0 selects 2 low bytes.
def int_amdgcn_cvt_pk_bf8_f32 : ClangBuiltin<"__builtin_amdgcn_cvt_pk_bf8_f32">,
  DefaultAttrsIntrinsic<[llvm_i32_ty],
                        [llvm_float_ty, llvm_float_ty, llvm_i32_ty, llvm_i1_ty],
                        [IntrNoMem, ImmArg<ArgIndex<3>>]>;

// llvm.amdgcn.cvt.pk.fp8.f32 int vdst, float srcA, float srcB, int old, imm word_sel
def int_amdgcn_cvt_pk_fp8_f32 : ClangBuiltin<"__builtin_amdgcn_cvt_pk_fp8_f32">,
  DefaultAttrsIntrinsic<[llvm_i32_ty],
                        [llvm_float_ty, llvm_float_ty, llvm_i32_ty, llvm_i1_ty],
                        [IntrNoMem, ImmArg<ArgIndex<3>>]>;

// llvm.amdgcn.cvt.sr.bf8.f32 int vdst, float srcA, int srcB, int old, imm byte_sel [0..3]
// byte_sel selects byte to write into vdst.
def int_amdgcn_cvt_sr_bf8_f32 : ClangBuiltin<"__builtin_amdgcn_cvt_sr_bf8_f32">,
  DefaultAttrsIntrinsic<[llvm_i32_ty],
                        [llvm_float_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
                        [IntrNoMem, ImmArg<ArgIndex<3>>]>;

// llvm.amdgcn.cvt.sr.fp8.f32 int vdst, float srcA, int srcB, int old, imm byte_sel [0..3]
def int_amdgcn_cvt_sr_fp8_f32 : ClangBuiltin<"__builtin_amdgcn_cvt_sr_fp8_f32">,
  DefaultAttrsIntrinsic<[llvm_i32_ty],
                        [llvm_float_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
                        [IntrNoMem, ImmArg<ArgIndex<3>>]>;

//===----------------------------------------------------------------------===//
// Special Intrinsics for backend internal use only. No frontend
// should emit calls to these.
//===----------------------------------------------------------------------===//
// NOTE(review): the llvm_anyint_ty operand/result threaded through the
// structurization intrinsics below appears to be a lane-mask value produced by
// int_amdgcn_if/else and consumed by loop/end_cf — confirm against the
// SIAnnotateControlFlow / SILowerControlFlow passes before relying on it.
def int_amdgcn_if : Intrinsic<[llvm_i1_ty, llvm_anyint_ty],
  [llvm_i1_ty], [IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]
>;

def int_amdgcn_else : Intrinsic<[llvm_i1_ty, llvm_anyint_ty],
  [llvm_anyint_ty], [IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]
>;

def int_amdgcn_if_break : Intrinsic<[llvm_anyint_ty],
  [llvm_i1_ty, LLVMMatchType<0>],
  [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]
>;

def int_amdgcn_loop : Intrinsic<[llvm_i1_ty],
  [llvm_anyint_ty], [IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]
>;

def int_amdgcn_end_cf : Intrinsic<[], [llvm_anyint_ty],
  [IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;

// Represent unreachable in a divergent region.
// (Deliberately lacks IntrWillReturn, unlike its siblings above.)
def int_amdgcn_unreachable : Intrinsic<[], [], [IntrConvergent, IntrNoCallback, IntrNoFree]>;

// Emit 2.5 ulp, no denormal division. Should only be inserted by
// pass based on !fpmath metadata.
def int_amdgcn_fdiv_fast : DefaultAttrsIntrinsic<
  [llvm_float_ty], [llvm_float_ty, llvm_float_ty],
  [IntrNoMem, IntrSpeculatable]
>;

/// Emit an addrspacecast without null pointer checking.
/// Should only be inserted by a pass based on analysis of an addrspacecast's src.
def int_amdgcn_addrspacecast_nonnull : DefaultAttrsIntrinsic<
  [llvm_anyptr_ty], [llvm_anyptr_ty],
  [IntrNoMem, IntrSpeculatable]
>;
}