//===- IntrinsicsAMDGPU.td - Defines AMDGPU intrinsics -----*- tablegen -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines all of the AMDGPU-specific intrinsics.
//
//===----------------------------------------------------------------------===//

class AMDGPUReadPreloadRegisterIntrinsic
  : DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable]>;

class AMDGPUReadPreloadRegisterIntrinsicNamed<string name>
  : DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable]>, ClangBuiltin<name>;

// Used to tag image and resource intrinsics with information used to generate
// mem operands.
class AMDGPURsrcIntrinsic<int rsrcarg, bit isimage = false> {
  int RsrcArg = rsrcarg;
  bit IsImage = isimage;
}

let TargetPrefix = "r600" in {

multiclass AMDGPUReadPreloadRegisterIntrinsic_xyz {
  def _x : AMDGPUReadPreloadRegisterIntrinsic;
  def _y : AMDGPUReadPreloadRegisterIntrinsic;
  def _z : AMDGPUReadPreloadRegisterIntrinsic;
}

multiclass AMDGPUReadPreloadRegisterIntrinsic_xyz_named<string prefix> {
  def _x : AMDGPUReadPreloadRegisterIntrinsicNamed<!strconcat(prefix, "_x")>;
  def _y : AMDGPUReadPreloadRegisterIntrinsicNamed<!strconcat(prefix, "_y")>;
  def _z : AMDGPUReadPreloadRegisterIntrinsicNamed<!strconcat(prefix, "_z")>;
}

defm int_r600_read_global_size : AMDGPUReadPreloadRegisterIntrinsic_xyz_named
                                 <"__builtin_r600_read_global_size">;
defm int_r600_read_ngroups : AMDGPUReadPreloadRegisterIntrinsic_xyz_named
                             <"__builtin_r600_read_ngroups">;
defm int_r600_read_tgid : AMDGPUReadPreloadRegisterIntrinsic_xyz_named
                          <"__builtin_r600_read_tgid">;

defm int_r600_read_local_size : AMDGPUReadPreloadRegisterIntrinsic_xyz;
defm int_r600_read_tidig : AMDGPUReadPreloadRegisterIntrinsic_xyz;

def int_r600_group_barrier : ClangBuiltin<"__builtin_r600_group_barrier">,
  Intrinsic<[], [], [IntrConvergent, IntrWillReturn]>;

// AS 7 is PARAM_I_ADDRESS, used for kernel arguments
def int_r600_implicitarg_ptr :
  ClangBuiltin<"__builtin_r600_implicitarg_ptr">,
  DefaultAttrsIntrinsic<[LLVMQualPointerType<7>], [],
  [IntrNoMem, IntrSpeculatable]>;

def int_r600_rat_store_typed :
  // 1st parameter: Data
  // 2nd parameter: Index
  // 3rd parameter: Constant RAT ID
  DefaultAttrsIntrinsic<[], [llvm_v4i32_ty, llvm_v4i32_ty, llvm_i32_ty], []>,
  ClangBuiltin<"__builtin_r600_rat_store_typed">;

def int_r600_recipsqrt_ieee : DefaultAttrsIntrinsic<
  [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]
>;

def int_r600_recipsqrt_clamped : DefaultAttrsIntrinsic<
  [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]
>;

def int_r600_cube : DefaultAttrsIntrinsic<
  [llvm_v4f32_ty], [llvm_v4f32_ty], [IntrNoMem, IntrSpeculatable]
>;

def int_r600_store_stream_output : DefaultAttrsIntrinsic<
  [], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []
>;

class TextureIntrinsicFloatInput : DefaultAttrsIntrinsic<[llvm_v4f32_ty], [
  llvm_v4f32_ty, // Coord
  llvm_i32_ty,   // offset_x
  llvm_i32_ty,   // offset_y,
  llvm_i32_ty,   // offset_z,
  llvm_i32_ty,   // resource_id
  llvm_i32_ty,   // samplerid
  llvm_i32_ty,   // coord_type_x
  llvm_i32_ty,   // coord_type_y
  llvm_i32_ty,   // coord_type_z
  llvm_i32_ty],  // coord_type_w
  [IntrNoMem]
>;

class TextureIntrinsicInt32Input : DefaultAttrsIntrinsic<[llvm_v4i32_ty], [
  llvm_v4i32_ty, // Coord
  llvm_i32_ty,   // offset_x
  llvm_i32_ty,   // offset_y,
  llvm_i32_ty,   // offset_z,
  llvm_i32_ty,   // resource_id
  llvm_i32_ty,   // samplerid
  llvm_i32_ty,   // coord_type_x
  llvm_i32_ty,   // coord_type_y
  llvm_i32_ty,   // coord_type_z
  llvm_i32_ty],  // coord_type_w
  [IntrNoMem]
>;

def int_r600_store_swizzle :
  Intrinsic<[], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty], [IntrWillReturn, IntrNoCallback, IntrNoFree]
>;

def int_r600_tex : TextureIntrinsicFloatInput;
def int_r600_texc : TextureIntrinsicFloatInput;
def int_r600_txl : TextureIntrinsicFloatInput;
def int_r600_txlc : TextureIntrinsicFloatInput;
def int_r600_txb : TextureIntrinsicFloatInput;
def int_r600_txbc : TextureIntrinsicFloatInput;
def int_r600_txf : TextureIntrinsicInt32Input;
def int_r600_txq : TextureIntrinsicInt32Input;
def int_r600_ddx : TextureIntrinsicFloatInput;
def int_r600_ddy : TextureIntrinsicFloatInput;

def int_r600_dot4 : DefaultAttrsIntrinsic<[llvm_float_ty],
  [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem, IntrSpeculatable]
>;

def int_r600_kill : DefaultAttrsIntrinsic<[], [llvm_float_ty], []>;

} // End TargetPrefix = "r600"

let TargetPrefix = "amdgcn" in {

//===----------------------------------------------------------------------===//
// ABI Special Intrinsics
//===----------------------------------------------------------------------===//

defm int_amdgcn_workitem_id : AMDGPUReadPreloadRegisterIntrinsic_xyz;
defm int_amdgcn_workgroup_id : AMDGPUReadPreloadRegisterIntrinsic_xyz_named
                               <"__builtin_amdgcn_workgroup_id">;

def int_amdgcn_dispatch_ptr :
  DefaultAttrsIntrinsic<[LLVMQualPointerType<4>], [],
  [Align<RetIndex, 4>, IntrNoMem, IntrSpeculatable]>;

def int_amdgcn_queue_ptr :
  ClangBuiltin<"__builtin_amdgcn_queue_ptr">,
  DefaultAttrsIntrinsic<[LLVMQualPointerType<4>], [],
  [Align<RetIndex, 4>, IntrNoMem, IntrSpeculatable]>;

def int_amdgcn_kernarg_segment_ptr :
  ClangBuiltin<"__builtin_amdgcn_kernarg_segment_ptr">,
  DefaultAttrsIntrinsic<[LLVMQualPointerType<4>], [],
  [Align<RetIndex, 4>, IntrNoMem, IntrSpeculatable]>;

def int_amdgcn_implicitarg_ptr :
  ClangBuiltin<"__builtin_amdgcn_implicitarg_ptr">,
  DefaultAttrsIntrinsic<[LLVMQualPointerType<4>], [],
  [Align<RetIndex, 4>, IntrNoMem, IntrSpeculatable]>;

def int_amdgcn_groupstaticsize :
  ClangBuiltin<"__builtin_amdgcn_groupstaticsize">,
  DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable]>;

def int_amdgcn_dispatch_id :
  ClangBuiltin<"__builtin_amdgcn_dispatch_id">,
  DefaultAttrsIntrinsic<[llvm_i64_ty], [], [IntrNoMem, IntrSpeculatable]>;

// For internal use. Coordinates LDS lowering between IR transform and backend.
def int_amdgcn_lds_kernel_id :
  DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable]>;

def int_amdgcn_implicit_buffer_ptr :
  ClangBuiltin<"__builtin_amdgcn_implicit_buffer_ptr">,
  DefaultAttrsIntrinsic<[LLVMQualPointerType<4>], [],
  [Align<RetIndex, 4>, IntrNoMem, IntrSpeculatable]>;
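
// Illustrative IR using the preloaded-ID intrinsics defined above (a usage
// sketch, not a definition in this file): a workitem's coordinates are
// typically read as
//   %gid.x = call i32 @llvm.amdgcn.workgroup.id.x()
//   %tid.x = call i32 @llvm.amdgcn.workitem.id.x()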

// Set EXEC to the 64-bit value given.
// This is always moved to the beginning of the basic block.
// FIXME: Should be mangled for wave size.
def int_amdgcn_init_exec : Intrinsic<[],
  [llvm_i64_ty],      // 64-bit literal constant
  [IntrConvergent, IntrNoMem, IntrHasSideEffects, IntrNoCallback,
   IntrNoFree, IntrWillReturn, ImmArg<ArgIndex<0>>]>;

// Set EXEC according to a thread count packed in an SGPR input:
//    thread_count = (input >> bitoffset) & 0x7f;
// This is always moved to the beginning of the basic block.
// Note: only inreg arguments to the parent function are valid as
// inputs to this intrinsic, computed values cannot be used.
def int_amdgcn_init_exec_from_input : Intrinsic<[],
  [llvm_i32_ty,       // 32-bit SGPR input
   llvm_i32_ty],      // bit offset of the thread count
  [IntrConvergent, IntrHasSideEffects, IntrNoMem, IntrNoCallback,
   IntrNoFree, IntrWillReturn, ImmArg<ArgIndex<1>>]>;

def int_amdgcn_wavefrontsize :
  ClangBuiltin<"__builtin_amdgcn_wavefrontsize">,
  DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable]>;

// Represent a relocation constant.
def int_amdgcn_reloc_constant : DefaultAttrsIntrinsic<
  [llvm_i32_ty], [llvm_metadata_ty],
  [IntrNoMem, IntrSpeculatable]
>;

//===----------------------------------------------------------------------===//
// Instruction Intrinsics
//===----------------------------------------------------------------------===//

// The first parameter is s_sendmsg immediate (i16),
// the second one is copied to m0
def int_amdgcn_s_sendmsg : ClangBuiltin<"__builtin_amdgcn_s_sendmsg">,
  Intrinsic <[], [llvm_i32_ty, llvm_i32_ty],
  [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects]>;
def int_amdgcn_s_sendmsghalt : ClangBuiltin<"__builtin_amdgcn_s_sendmsghalt">,
  Intrinsic <[], [llvm_i32_ty, llvm_i32_ty],
  [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects]>;
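
// Illustrative IR sketch (message IDs and payload meanings vary by
// generation; the constant here is an assumption for illustration only):
//   call void @llvm.amdgcn.s.sendmsg(i32 3, i32 0)
// where the first operand is the s_sendmsg immediate and the second is
// copied to m0.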

// gfx11 intrinsic
// The first parameter is s_sendmsg immediate (i16). Return type is i32 or i64.
def int_amdgcn_s_sendmsg_rtn : Intrinsic <[llvm_anyint_ty], [llvm_i32_ty],
  [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects]>;

def int_amdgcn_s_barrier : ClangBuiltin<"__builtin_amdgcn_s_barrier">,
  Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;

def int_amdgcn_s_barrier_signal : ClangBuiltin<"__builtin_amdgcn_s_barrier_signal">,
  Intrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn,
                                IntrNoCallback, IntrNoFree]>;

def int_amdgcn_s_barrier_signal_var : ClangBuiltin<"__builtin_amdgcn_s_barrier_signal_var">,
  Intrinsic<[], [llvm_i32_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn,
                                IntrNoCallback, IntrNoFree]>;

def int_amdgcn_s_barrier_signal_isfirst : ClangBuiltin<"__builtin_amdgcn_s_barrier_signal_isfirst">,
  Intrinsic<[llvm_i1_ty], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects, IntrConvergent,
                                          IntrWillReturn, IntrNoCallback, IntrNoFree]>;

def int_amdgcn_s_barrier_signal_isfirst_var : ClangBuiltin<"__builtin_amdgcn_s_barrier_signal_isfirst_var">,
  Intrinsic<[llvm_i1_ty], [llvm_i32_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn,
                                          IntrNoCallback, IntrNoFree]>;

def int_amdgcn_s_barrier_init : ClangBuiltin<"__builtin_amdgcn_s_barrier_init">,
  Intrinsic<[], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent,
                                             IntrWillReturn, IntrNoCallback, IntrNoFree]>;

def int_amdgcn_s_barrier_join : ClangBuiltin<"__builtin_amdgcn_s_barrier_join">,
  Intrinsic<[], [llvm_i32_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn,
                                IntrNoCallback, IntrNoFree]>;

def int_amdgcn_s_wakeup_barrier : ClangBuiltin<"__builtin_amdgcn_s_wakeup_barrier">,
  Intrinsic<[], [llvm_i32_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn,
                                IntrNoCallback, IntrNoFree]>;

def int_amdgcn_s_barrier_wait : ClangBuiltin<"__builtin_amdgcn_s_barrier_wait">,
  Intrinsic<[], [llvm_i16_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects, IntrConvergent,
                                IntrWillReturn, IntrNoCallback, IntrNoFree]>;

def int_amdgcn_s_barrier_leave : ClangBuiltin<"__builtin_amdgcn_s_barrier_leave">,
  Intrinsic<[llvm_i1_ty], [], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;

def int_amdgcn_s_get_barrier_state : ClangBuiltin<"__builtin_amdgcn_s_get_barrier_state">,
  Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn,
                                           IntrNoCallback, IntrNoFree]>;

def int_amdgcn_wave_barrier : ClangBuiltin<"__builtin_amdgcn_wave_barrier">,
  Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;

// The 1st parameter is a mask for the types of instructions that may be allowed
// to cross the SCHED_BARRIER during scheduling.
//     MASK = 0x0000 0000: No instructions may be scheduled across SCHED_BARRIER.
//     MASK = 0x0000 0001: ALL, non-memory, non-side-effect producing instructions may be
//                         scheduled across SCHED_BARRIER, i.e. allow ALU instructions to pass.
//     MASK = 0x0000 0002: VALU instructions may be scheduled across SCHED_BARRIER.
//     MASK = 0x0000 0004: SALU instructions may be scheduled across SCHED_BARRIER.
//     MASK = 0x0000 0008: MFMA/WMMA instructions may be scheduled across SCHED_BARRIER.
//     MASK = 0x0000 0010: ALL VMEM instructions may be scheduled across SCHED_BARRIER.
//     MASK = 0x0000 0020: VMEM read instructions may be scheduled across SCHED_BARRIER.
//     MASK = 0x0000 0040: VMEM write instructions may be scheduled across SCHED_BARRIER.
//     MASK = 0x0000 0080: ALL DS instructions may be scheduled across SCHED_BARRIER.
//     MASK = 0x0000 0100: ALL DS read instructions may be scheduled across SCHED_BARRIER.
//     MASK = 0x0000 0200: ALL DS write instructions may be scheduled across SCHED_BARRIER.
def int_amdgcn_sched_barrier : ClangBuiltin<"__builtin_amdgcn_sched_barrier">,
  Intrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects, IntrConvergent,
                                IntrWillReturn, IntrNoCallback, IntrNoFree]>;
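
// For example (illustrative), to let only ALU instructions cross the barrier
// during scheduling (mask values as documented above):
//   call void @llvm.amdgcn.sched.barrier(i32 1)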

// The first parameter is a mask that determines the types of instructions that
// you would like to synchronize around and add to a scheduling group. The
// values of the mask are defined above for sched_barrier. These instructions
// will be selected from the bottom up starting from the sched_group_barrier's
// location during instruction scheduling. The second parameter is the number of
// matching instructions that will be associated with this sched_group_barrier.
// The third parameter is an identifier which is used to describe what other
// sched_group_barriers should be synchronized with.
def int_amdgcn_sched_group_barrier : ClangBuiltin<"__builtin_amdgcn_sched_group_barrier">,
  Intrinsic<[], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
  [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>, IntrNoMem, IntrHasSideEffects,
   IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
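
// Illustrative sketch: alternating groups with the same sync ID (here 0)
// interleave one MFMA with two DS reads (mask values as documented for
// sched_barrier above):
//   call void @llvm.amdgcn.sched.group.barrier(i32 8,   i32 1, i32 0) ; 1 MFMA
//   call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 2, i32 0) ; 2 DS reads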

// Scheduler optimization hint.
//     MASK = 0: Small gemm opt
def int_amdgcn_iglp_opt : ClangBuiltin<"__builtin_amdgcn_iglp_opt">,
  Intrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects, IntrConvergent,
                                IntrWillReturn, IntrNoCallback, IntrNoFree]>;

def int_amdgcn_s_waitcnt : ClangBuiltin<"__builtin_amdgcn_s_waitcnt">,
  Intrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects, IntrWillReturn, IntrNoCallback, IntrNoFree]>;

def int_amdgcn_div_scale : DefaultAttrsIntrinsic<
  // 1st parameter: Numerator
  // 2nd parameter: Denominator
  // 3rd parameter: Select quotient. Must equal Numerator or Denominator.
  //                (0 = Denominator, 1 = Numerator).
  [llvm_anyfloat_ty, llvm_i1_ty],
  [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i1_ty],
  [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<2>>]
>;

def int_amdgcn_div_fmas : DefaultAttrsIntrinsic<[llvm_anyfloat_ty],
  [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>, llvm_i1_ty],
  [IntrNoMem, IntrSpeculatable]
>;

def int_amdgcn_div_fixup : DefaultAttrsIntrinsic<[llvm_anyfloat_ty],
  [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>],
  [IntrNoMem, IntrSpeculatable]
>;

// Look Up 2.0 / pi src0 with segment select src1[4:0]
def int_amdgcn_trig_preop : DefaultAttrsIntrinsic<
  [llvm_anyfloat_ty], [LLVMMatchType<0>, llvm_i32_ty],
  [IntrNoMem, IntrSpeculatable]
>;

def int_amdgcn_sin : DefaultAttrsIntrinsic<
  [llvm_anyfloat_ty], [LLVMMatchType<0>],
  [IntrNoMem, IntrSpeculatable]
>;

def int_amdgcn_cos : DefaultAttrsIntrinsic<
  [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]
>;

// v_log_{f16|f32}, performs log2. f32 version does not handle
// denormals. There is no reason to use this for f16 as it does
// support denormals, and the generic log2 intrinsic should be
// preferred.
def int_amdgcn_log : DefaultAttrsIntrinsic<
  [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]
>;

// v_exp_{f16|f32} (int_amdgcn_exp was taken by export
// already). Performs exp2. f32 version does not handle
// denormals. There is no reason to use this for f16 as it does
// support denormals, and the generic exp2 intrinsic should be
// preferred.
def int_amdgcn_exp2 : DefaultAttrsIntrinsic<
  [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]
>;

def int_amdgcn_log_clamp : DefaultAttrsIntrinsic<
  [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]
>;

def int_amdgcn_fmul_legacy : ClangBuiltin<"__builtin_amdgcn_fmul_legacy">,
  DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty],
  [IntrNoMem, IntrSpeculatable, Commutative]
>;

// Fused single-precision multiply-add with legacy behaviour for the multiply,
// which is that +/- 0.0 * anything (even NaN or infinity) is +0.0. This is
// intended for use on subtargets that have the v_fma_legacy_f32 and/or
// v_fmac_legacy_f32 instructions. (Note that v_fma_legacy_f16 is unrelated and
// has a completely different kind of legacy behaviour.)
def int_amdgcn_fma_legacy :
  DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty],
  [IntrNoMem, IntrSpeculatable, Commutative]
>;

def int_amdgcn_rcp : DefaultAttrsIntrinsic<
  [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]
>;

def int_amdgcn_rcp_legacy : ClangBuiltin<"__builtin_amdgcn_rcp_legacy">,
  DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty],
  [IntrNoMem, IntrSpeculatable]
>;

def int_amdgcn_sqrt : DefaultAttrsIntrinsic<
  [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]
>;

def int_amdgcn_rsq : DefaultAttrsIntrinsic<
  [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]
>;

def int_amdgcn_rsq_legacy : ClangBuiltin<"__builtin_amdgcn_rsq_legacy">,
  DefaultAttrsIntrinsic<
  [llvm_float_ty], [llvm_float_ty], [IntrNoMem, IntrSpeculatable]
>;

// out = 1.0 / sqrt(a) result clamped to +/- max_float.
def int_amdgcn_rsq_clamp : DefaultAttrsIntrinsic<
  [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]>;

def int_amdgcn_frexp_mant : DefaultAttrsIntrinsic<
  [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]
>;

def int_amdgcn_frexp_exp : DefaultAttrsIntrinsic<
  [llvm_anyint_ty], [llvm_anyfloat_ty], [IntrNoMem, IntrSpeculatable]
>;
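
// Together these implement frexp-style decomposition: x = mant * 2^exp with
// |mant| in [0.5, 1.0). Worked example: for x = 8.0, frexp_mant returns 0.5
// and frexp_exp returns 4, since 8.0 = 0.5 * 2^4.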

// v_fract is buggy on SI/CI. It mishandles infinities, may return 1.0
// and always uses rtz, so is not suitable for implementing the OpenCL
// fract function. It should be ok on VI.
def int_amdgcn_fract : DefaultAttrsIntrinsic<
  [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]
>;

def int_amdgcn_cvt_pkrtz : ClangBuiltin<"__builtin_amdgcn_cvt_pkrtz">,
  DefaultAttrsIntrinsic<[llvm_v2f16_ty], [llvm_float_ty, llvm_float_ty],
  [IntrNoMem, IntrSpeculatable]
>;

def int_amdgcn_cvt_pknorm_i16 :
  ClangBuiltin<"__builtin_amdgcn_cvt_pknorm_i16">,
  DefaultAttrsIntrinsic<[llvm_v2i16_ty], [llvm_float_ty, llvm_float_ty],
  [IntrNoMem, IntrSpeculatable]
>;

def int_amdgcn_cvt_pknorm_u16 :
  ClangBuiltin<"__builtin_amdgcn_cvt_pknorm_u16">,
  DefaultAttrsIntrinsic<[llvm_v2i16_ty], [llvm_float_ty, llvm_float_ty],
  [IntrNoMem, IntrSpeculatable]
>;

def int_amdgcn_cvt_pk_i16 :
  ClangBuiltin<"__builtin_amdgcn_cvt_pk_i16">,
  DefaultAttrsIntrinsic<
  [llvm_v2i16_ty], [llvm_i32_ty, llvm_i32_ty],
  [IntrNoMem, IntrSpeculatable]
>;

def int_amdgcn_cvt_pk_u16 : ClangBuiltin<"__builtin_amdgcn_cvt_pk_u16">,
  DefaultAttrsIntrinsic<[llvm_v2i16_ty], [llvm_i32_ty, llvm_i32_ty],
  [IntrNoMem, IntrSpeculatable]
>;

def int_amdgcn_class : DefaultAttrsIntrinsic<
  [llvm_i1_ty], [llvm_anyfloat_ty, llvm_i32_ty],
  [IntrNoMem, IntrSpeculatable]
>;

def int_amdgcn_fmed3 :
  DefaultAttrsIntrinsic<[llvm_anyfloat_ty],
  [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>],
  [IntrNoMem, IntrSpeculatable]
>;

def int_amdgcn_cubeid : ClangBuiltin<"__builtin_amdgcn_cubeid">,
  DefaultAttrsIntrinsic<[llvm_float_ty],
  [llvm_float_ty, llvm_float_ty, llvm_float_ty],
  [IntrNoMem, IntrSpeculatable]
>;

def int_amdgcn_cubema : ClangBuiltin<"__builtin_amdgcn_cubema">,
  DefaultAttrsIntrinsic<[llvm_float_ty],
  [llvm_float_ty, llvm_float_ty, llvm_float_ty],
  [IntrNoMem, IntrSpeculatable]
>;

def int_amdgcn_cubesc : ClangBuiltin<"__builtin_amdgcn_cubesc">,
  DefaultAttrsIntrinsic<[llvm_float_ty],
  [llvm_float_ty, llvm_float_ty, llvm_float_ty],
  [IntrNoMem, IntrSpeculatable]
>;

def int_amdgcn_cubetc : ClangBuiltin<"__builtin_amdgcn_cubetc">,
  DefaultAttrsIntrinsic<[llvm_float_ty],
  [llvm_float_ty, llvm_float_ty, llvm_float_ty],
  [IntrNoMem, IntrSpeculatable]
>;

// v_ffbh_i32, as opposed to v_ffbh_u32. For v_ffbh_u32, llvm.ctlz
// should be used.
def int_amdgcn_sffbh :
  DefaultAttrsIntrinsic<[llvm_anyint_ty], [LLVMMatchType<0>],
  [IntrNoMem, IntrSpeculatable]
>;

// v_mad_f32|f16/v_mac_f32|f16, selected regardless of denorm support.
def int_amdgcn_fmad_ftz :
  DefaultAttrsIntrinsic<[llvm_anyfloat_ty],
  [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>],
  [IntrNoMem, IntrSpeculatable]
>;

class AMDGPULDSIntrin :
  Intrinsic<[llvm_any_ty],
    [LLVMQualPointerType<3>,
     LLVMMatchType<0>,
     llvm_i32_ty, // ordering
     llvm_i32_ty, // scope
     llvm_i1_ty], // isVolatile
    [IntrArgMemOnly, IntrWillReturn, NoCapture<ArgIndex<0>>,
     ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, IntrNoCallback, IntrNoFree]
>;
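
// AMDGPULDSIntrin is instantiated below as int_amdgcn_ds_fadd/fmin/fmax.
// Illustrative IR call, derived from the class's argument list (constants
// shown for ordering/scope are assumptions for illustration):
//   %old = call float @llvm.amdgcn.ds.fadd(ptr addrspace(3) %p, float %v,
//                                          i32 0, i32 0, i1 false)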

// FIXME: The m0 argument should be moved after the normal arguments
class AMDGPUDSOrderedIntrinsic : Intrinsic<
  [llvm_i32_ty],
  // M0 = {hi16:address, lo16:waveID}. Allow passing M0 as a pointer, so that
  // the bit packing can be optimized at the IR level.
  [LLVMQualPointerType<2>, // IntToPtr(M0)
   llvm_i32_ty, // value to add or swap
   llvm_i32_ty, // ordering
   llvm_i32_ty, // scope
   llvm_i1_ty,  // isVolatile
   llvm_i32_ty, // ordered count index (OA index), also added to the address
                // gfx10: bits 24-27 indicate the number of active threads/dwords
   llvm_i1_ty,  // wave release, usually set to 1
   llvm_i1_ty], // wave done, set to 1 for the last ordered instruction
  [IntrWillReturn, NoCapture<ArgIndex<0>>,
   ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>,
   ImmArg<ArgIndex<5>>, ImmArg<ArgIndex<6>>, ImmArg<ArgIndex<7>>, IntrNoCallback, IntrNoFree
  ]
>;

class AMDGPUDSAppendConsumedIntrinsic : Intrinsic<
  [llvm_i32_ty],
  [llvm_anyptr_ty, // LDS or GDS ptr
   llvm_i1_ty],    // isVolatile
  [IntrConvergent, IntrWillReturn, IntrArgMemOnly,
   NoCapture<ArgIndex<0>>, ImmArg<ArgIndex<1>>, IntrNoCallback, IntrNoFree],
  "",
  [SDNPMemOperand]
>;

def int_amdgcn_ds_ordered_add : AMDGPUDSOrderedIntrinsic;
def int_amdgcn_ds_ordered_swap : AMDGPUDSOrderedIntrinsic;

// The pointer argument is assumed to be dynamically uniform if a VGPR.
def int_amdgcn_ds_append : AMDGPUDSAppendConsumedIntrinsic;
def int_amdgcn_ds_consume : AMDGPUDSAppendConsumedIntrinsic;

def int_amdgcn_ds_fadd : AMDGPULDSIntrin;
def int_amdgcn_ds_fmin : AMDGPULDSIntrin;
def int_amdgcn_ds_fmax : AMDGPULDSIntrin;

} // TargetPrefix = "amdgcn"

// New-style image intrinsics

//////////////////////////////////////////////////////////////////////////
// Dimension-aware image intrinsics framework
//////////////////////////////////////////////////////////////////////////

// Helper class to represent (type, name) combinations of arguments. The
// argument names are explanatory and used as DAG operand names for codegen
// pattern matching.
class AMDGPUArg<LLVMType ty, string name> {
  LLVMType Type = ty;
  string Name = name;
}

// Return [AMDGPUArg<basety, names[0]>, AMDGPUArg<LLVMMatchType<0>, names[1]>, ...]
class makeArgList<list<string> names, LLVMType basety> {
  list<AMDGPUArg> ret =
    !listconcat([AMDGPUArg<basety, names[0]>],
                !foreach(name, !tail(names), AMDGPUArg<LLVMMatchType<0>, name>));
}
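
// For example (illustrative), makeArgList<["s", "t"], llvm_anyfloat_ty>.ret
// yields [AMDGPUArg<llvm_anyfloat_ty, "s">, AMDGPUArg<LLVMMatchType<0>, "t">]:
// the first argument carries the overloaded type and the rest match it.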

// Return arglist, with LLVMMatchType's references shifted by 'shift'.
class arglistmatchshift<list<AMDGPUArg> arglist, int shift> {
  list<AMDGPUArg> ret =
    !foreach(arg, arglist,
             !if(!isa<LLVMMatchType>(arg.Type),
                 AMDGPUArg<LLVMMatchType<!add(!cast<LLVMMatchType>(arg.Type).Number, shift)>,
                           arg.Name>,
                 arg));
}

// Return the concatenation of the given arglists. LLVMMatchType's are adjusted
// accordingly, and shifted by an additional 'shift'.
class arglistconcat<list<list<AMDGPUArg>> arglists, int shift = 0> {
  list<AMDGPUArg> ret =
    !foldl([]<AMDGPUArg>, arglists, lhs, rhs,
           !listconcat(
             lhs,
             arglistmatchshift<rhs,
               !add(shift, !foldl(0, lhs, a, b,
                                  !add(a, b.Type.isAny)))>.ret));
}

// Represent texture/image types / dimensionality.
class AMDGPUDimProps<bits<3> enc, string name, string asmsuffix,
                     list<string> coord_names, list<string> slice_names,
                     bit msaa = 0> {
  AMDGPUDimProps Dim = !cast<AMDGPUDimProps>(NAME);
  string Name = name;           // e.g. "2darraymsaa"
  string AsmSuffix = asmsuffix; // e.g. 2D_MSAA_ARRAY (used in assembly strings)
  bits<3> Encoding = enc;
  bit DA = 0;                   // DA bit in MIMG encoding
  bit MSAA = msaa;

  list<AMDGPUArg> CoordSliceArgs =
    makeArgList<!listconcat(coord_names, slice_names), llvm_anyfloat_ty>.ret;
  list<AMDGPUArg> CoordSliceIntArgs =
    makeArgList<!listconcat(coord_names, slice_names), llvm_anyint_ty>.ret;
  list<AMDGPUArg> GradientArgs =
    makeArgList<!listconcat(!foreach(name, coord_names, "d" # name # "dh"),
                            !foreach(name, coord_names, "d" # name # "dv")),
                llvm_anyfloat_ty>.ret;

  bits<8> NumCoords = !size(CoordSliceArgs);
  bits<8> NumGradients = !size(GradientArgs);
}

def AMDGPUDim1D : AMDGPUDimProps<0x0, "1d", "1D", ["s"], []>;
def AMDGPUDim2D : AMDGPUDimProps<0x1, "2d", "2D", ["s", "t"], []>;
def AMDGPUDim3D : AMDGPUDimProps<0x2, "3d", "3D", ["s", "t", "r"], []>;
let DA = 1 in {
  def AMDGPUDimCube : AMDGPUDimProps<0x3, "cube", "CUBE", ["s", "t"], ["face"]>;
  def AMDGPUDim1DArray : AMDGPUDimProps<0x4, "1darray", "1D_ARRAY", ["s"], ["slice"]>;
  def AMDGPUDim2DArray : AMDGPUDimProps<0x5, "2darray", "2D_ARRAY", ["s", "t"], ["slice"]>;
}
def AMDGPUDim2DMsaa : AMDGPUDimProps<0x6, "2dmsaa", "2D_MSAA", ["s", "t"], ["fragid"], 1>;
let DA = 1 in {
  def AMDGPUDim2DArrayMsaa : AMDGPUDimProps<0x7, "2darraymsaa", "2D_MSAA_ARRAY", ["s", "t"], ["slice", "fragid"], 1>;
}

def AMDGPUDims {
  list<AMDGPUDimProps> NoMsaa = [AMDGPUDim1D, AMDGPUDim2D, AMDGPUDim3D,
                                 AMDGPUDimCube, AMDGPUDim1DArray,
                                 AMDGPUDim2DArray];
  list<AMDGPUDimProps> Msaa = [AMDGPUDim2DMsaa, AMDGPUDim2DArrayMsaa];
  list<AMDGPUDimProps> All = !listconcat(NoMsaa, Msaa);
}

// Represent sample variants, i.e. _C, _O, _B, ... and combinations thereof.
class AMDGPUSampleVariant<string ucmod, string lcmod, list<AMDGPUArg> extra_addr> {
  string UpperCaseMod = ucmod;
  string LowerCaseMod = lcmod;

  // {offset} {bias} {z-compare}
  list<AMDGPUArg> ExtraAddrArgs = extra_addr;
  bit Offset = false;
  bit Bias = false;
  bit ZCompare = false;
  bit Gradients = false;

  // Name of the {lod} or {clamp} argument that is appended to the coordinates,
  // if any.
  string LodOrClamp = "";
}

// AMDGPUSampleVariants: all variants supported by IMAGE_SAMPLE
// AMDGPUSampleVariantsNoGradients: variants supported by IMAGE_GATHER4
defset list<AMDGPUSampleVariant> AMDGPUSampleVariants = {
  multiclass AMDGPUSampleHelper_Offset<string ucmod, string lcmod,
                                       list<AMDGPUArg> extra_addr> {
    def NAME#lcmod : AMDGPUSampleVariant<ucmod, lcmod, extra_addr>;
    let Offset = true in
    def NAME#lcmod#_o : AMDGPUSampleVariant<
        ucmod#"_O", lcmod#"_o", !listconcat([AMDGPUArg<llvm_i32_ty, "offset">], extra_addr)>;
  }

  multiclass AMDGPUSampleHelper_Compare<string ucmod, string lcmod,
                                        list<AMDGPUArg> extra_addr> {
    defm NAME : AMDGPUSampleHelper_Offset<ucmod, lcmod, extra_addr>;
    let ZCompare = true in
    defm NAME : AMDGPUSampleHelper_Offset<
        "_C"#ucmod, "_c"#lcmod, !listconcat(extra_addr, [AMDGPUArg<llvm_float_ty, "zcompare">])>;
  }

  multiclass AMDGPUSampleHelper_Clamp<string ucmod, string lcmod,
                                      list<AMDGPUArg> extra_addr> {
    defm NAME : AMDGPUSampleHelper_Compare<ucmod, lcmod, extra_addr>;
    let LodOrClamp = "clamp" in
    defm NAME : AMDGPUSampleHelper_Compare<ucmod#"_CL", lcmod#"_cl", extra_addr>;
  }

  defset list<AMDGPUSampleVariant> AMDGPUSampleVariantsNoGradients = {
    defm AMDGPUSample : AMDGPUSampleHelper_Clamp<"", "", []>;
    let Bias = true in
    defm AMDGPUSample : AMDGPUSampleHelper_Clamp<
        "_B", "_b", [AMDGPUArg<llvm_anyfloat_ty, "bias">]>;
    let LodOrClamp = "lod" in
    defm AMDGPUSample : AMDGPUSampleHelper_Compare<"_L", "_l", []>;
    defm AMDGPUSample : AMDGPUSampleHelper_Compare<"_LZ", "_lz", []>;
  }

  let Gradients = true in {
    defm AMDGPUSample : AMDGPUSampleHelper_Clamp<"_D", "_d", []>;
    defm AMDGPUSample : AMDGPUSampleHelper_Clamp<"_CD", "_cd", []>;
  }
}
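
// Expanded, these helpers produce lowercase modifier combinations such as
// "" (plain), _cl, _c, _c_cl, _b, _l, _lz, _d and _cd, each also in an _o
// (offset) form, which become intrinsic name suffixes like
// int_amdgcn_image_sample_c_cl_2d below (illustrative subset, not the full
// expansion).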

// Helper class to capture the profile of a dimension-aware image intrinsic.
// This information is used to generate the intrinsic's type and to inform
// codegen pattern matching.
class AMDGPUDimProfile<string opmod,
                       AMDGPUDimProps dim> {
  AMDGPUDimProps Dim = dim;
  string OpMod = opmod; // the corresponding instruction is named IMAGE_OpMod

  // These are intended to be overwritten by subclasses
  bit IsSample = false;
  bit IsAtomic = false;
  list<LLVMType> RetTypes = [];
  list<AMDGPUArg> DataArgs = [];
  list<AMDGPUArg> ExtraAddrArgs = [];
  bit Offset = false;
  bit Bias = false;
  bit ZCompare = false;
  bit Gradients = false;
  string LodClampMip = "";

  int NumRetAndDataAnyTypes =
    !foldl(0, !listconcat(RetTypes, !foreach(arg, DataArgs, arg.Type)), a, b,
           !add(a, b.isAny));

  list<AMDGPUArg> AddrArgs =
    arglistconcat<[ExtraAddrArgs,
                   !if(Gradients, dim.GradientArgs, []),
                   !listconcat(!if(IsSample, dim.CoordSliceArgs, dim.CoordSliceIntArgs),
                               !if(!empty(LodClampMip),
                                   []<AMDGPUArg>,
                                   [AMDGPUArg<LLVMMatchType<0>, LodClampMip>]))],
                  NumRetAndDataAnyTypes>.ret;
  list<LLVMType> AddrTypes = !foreach(arg, AddrArgs, arg.Type);
  list<AMDGPUArg> AddrDefaultArgs =
    !foreach(arg, AddrArgs,
             AMDGPUArg<!if(!or(arg.Type.isAny, !isa<LLVMMatchType>(arg.Type)),
                           !if(IsSample, llvm_float_ty, llvm_i32_ty), arg.Type),
                       arg.Name>);
  list<AMDGPUArg> AddrA16Args =
    !foreach(arg, AddrArgs,
             AMDGPUArg<!if(!or(arg.Type.isAny, !isa<LLVMMatchType>(arg.Type)),
                           !if(IsSample, llvm_half_ty, llvm_i16_ty), arg.Type),
                       arg.Name>);
}

class AMDGPUDimProfileCopy<AMDGPUDimProfile base> : AMDGPUDimProfile<base.OpMod, base.Dim> {
  let IsSample = base.IsSample;
  let IsAtomic = base.IsAtomic;
  let RetTypes = base.RetTypes;
  let DataArgs = base.DataArgs;
  let ExtraAddrArgs = base.ExtraAddrArgs;
  let Offset = base.Offset;
  let Bias = base.Bias;
  let ZCompare = base.ZCompare;
  let Gradients = base.Gradients;
  let LodClampMip = base.LodClampMip;
}

class AMDGPUDimSampleProfile<string opmod,
                             AMDGPUDimProps dim,
                             AMDGPUSampleVariant sample> : AMDGPUDimProfile<opmod, dim> {
  let IsSample = true;
  let RetTypes = [llvm_any_ty];
  let ExtraAddrArgs = sample.ExtraAddrArgs;
  let Offset = sample.Offset;
  let Bias = sample.Bias;
  let ZCompare = sample.ZCompare;
  let Gradients = sample.Gradients;
  let LodClampMip = sample.LodOrClamp;
}

class AMDGPUDimNoSampleProfile<string opmod,
                               AMDGPUDimProps dim,
                               list<LLVMType> retty,
                               list<AMDGPUArg> dataargs,
                               bit Mip = false> : AMDGPUDimProfile<opmod, dim> {
  let RetTypes = retty;
  let DataArgs = dataargs;
  let LodClampMip = !if(Mip, "mip", "");
}

class AMDGPUDimAtomicProfile<string opmod,
                             AMDGPUDimProps dim,
                             list<AMDGPUArg> dataargs> : AMDGPUDimProfile<opmod, dim> {
  let RetTypes = [llvm_anyint_ty];
  let DataArgs = dataargs;
  let IsAtomic = true;
}

class AMDGPUDimAtomicFloatProfile<string opmod, AMDGPUDimProps dim,
                                  list<AMDGPUArg> dataargs>
    : AMDGPUDimAtomicProfile<opmod, dim, dataargs> {
  let RetTypes = [llvm_anyfloat_ty];
}

class AMDGPUDimGetResInfoProfile<AMDGPUDimProps dim>
    : AMDGPUDimProfile<"GET_RESINFO", dim> {
  let RetTypes = [llvm_anyfloat_ty];
  let DataArgs = [];
  let AddrArgs = [AMDGPUArg<llvm_anyint_ty, "mip">];
  let LodClampMip = "mip";
}

// Helper class for figuring out image intrinsic argument indexes.
class AMDGPUImageDimIntrinsicEval<AMDGPUDimProfile P_> {
  int NumDataArgs = !size(P_.DataArgs);
  int NumDmaskArgs = !not(P_.IsAtomic);
  int NumOffsetArgs = !if(P_.Offset, 1, 0);
  int NumBiasArgs = !if(P_.Bias, 1, 0);
  int NumZCompareArgs = !if(P_.ZCompare, 1, 0);
  int NumExtraAddrArgs = !add(NumOffsetArgs, NumBiasArgs, NumZCompareArgs);
  int NumVAddrArgs = !size(P_.AddrArgs);
  int NumGradientArgs = !if(P_.Gradients, !size(P_.Dim.GradientArgs), 0);
  int NumCoordArgs = !if(P_.IsSample, !size(P_.Dim.CoordSliceArgs), !size(P_.Dim.CoordSliceIntArgs));
  int NumRSrcArgs = 1;
  int NumSampArgs = !if(P_.IsSample, 2, 0);
  int DmaskArgIndex = NumDataArgs;
  int VAddrArgIndex = !add(DmaskArgIndex, NumDmaskArgs);
  int OffsetArgIndex = VAddrArgIndex;
  int BiasArgIndex = !add(VAddrArgIndex, NumOffsetArgs);
  int ZCompareArgIndex = !add(BiasArgIndex, NumBiasArgs);
  int GradientArgIndex = !add(VAddrArgIndex, NumExtraAddrArgs);
  int CoordArgIndex = !add(GradientArgIndex, NumGradientArgs);
  int LodArgIndex = !add(VAddrArgIndex, NumVAddrArgs, -1);
  int MipArgIndex = LodArgIndex;
  int RsrcArgIndex = !add(VAddrArgIndex, NumVAddrArgs);
  int SampArgIndex = !add(RsrcArgIndex, NumRSrcArgs);
  int UnormArgIndex = !add(SampArgIndex, 1);
  int TexFailCtrlArgIndex = !add(SampArgIndex, NumSampArgs);
  int CachePolicyArgIndex = !add(TexFailCtrlArgIndex, 1);
}

// All dimension-aware intrinsics are derived from this class.
class AMDGPUImageDimIntrinsic<AMDGPUDimProfile P_,
                              list<IntrinsicProperty> props,
                              list<SDNodeProperty> sdnodeprops> : DefaultAttrsIntrinsic<
    P_.RetTypes,        // vdata(VGPR) -- for load/atomic-with-return
    !listconcat(
      !foreach(arg, P_.DataArgs, arg.Type),  // vdata(VGPR) -- for store/atomic
      !if(P_.IsAtomic, [], [llvm_i32_ty]),   // dmask(imm)
      P_.AddrTypes,                          // vaddr(VGPR)
      [llvm_v8i32_ty],                       // rsrc(SGPR)
      !if(P_.IsSample, [llvm_v4i32_ty,       // samp(SGPR)
                        llvm_i1_ty], []),    // unorm(imm)
      [llvm_i32_ty,                          // texfailctrl(imm; bit 0 = tfe, bit 1 = lwe)
       llvm_i32_ty]),                        // cachepolicy(imm; bit 0 = glc, bit 1 = slc, bit 2 = dlc;
                                             // gfx12+ imm: bits [0-2] = th, bits [3-4] = scope)
                                             // TODO-GFX12: Update all other cachepolicy descriptions.

    !listconcat(props,
        !if(P_.IsAtomic, [], [ImmArg<ArgIndex<AMDGPUImageDimIntrinsicEval<P_>.DmaskArgIndex>>]),
        !if(P_.IsSample, [ImmArg<ArgIndex<AMDGPUImageDimIntrinsicEval<P_>.UnormArgIndex>>], []),
        [ImmArg<ArgIndex<AMDGPUImageDimIntrinsicEval<P_>.TexFailCtrlArgIndex>>,
         ImmArg<ArgIndex<AMDGPUImageDimIntrinsicEval<P_>.CachePolicyArgIndex>>]),


    "", sdnodeprops>,
  AMDGPURsrcIntrinsic<!add(!size(P_.DataArgs), !size(P_.AddrTypes),
                           !if(P_.IsAtomic, 0, 1)), 1> {
  AMDGPUDimProfile P = P_;

  AMDGPUImageDimIntrinsic Intr = !cast<AMDGPUImageDimIntrinsic>(NAME);

  let TargetPrefix = "amdgcn";
}
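
// Worked example (illustrative): for a basic 2D sample with no extra address
// arguments, the eval class above yields the argument layout
//   dmask = 0, vaddr = {s, t} = 1..2, rsrc = 3, samp = 4, unorm = 5,
//   texfailctrl = 6, cachepolicy = 7,
// matching IR such as:
//   %v = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(
//            i32 15, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp,
//            i1 false, i32 0, i32 0)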

// Marker class for intrinsics with a DMask that determines the returned
// channels.
class AMDGPUImageDMaskIntrinsic;

defset list<AMDGPUImageDimIntrinsic> AMDGPUImageDimIntrinsics = {

  //////////////////////////////////////////////////////////////////////////
  // Load and store intrinsics
  //////////////////////////////////////////////////////////////////////////
  multiclass AMDGPUImageDimIntrinsicsNoMsaa<string opmod,
                                            list<LLVMType> retty,
                                            list<AMDGPUArg> dataargs,
                                            list<IntrinsicProperty> props,
                                            list<SDNodeProperty> sdnodeprops,
                                            bit Mip = false> {
    foreach dim = AMDGPUDims.NoMsaa in {
      def !strconcat(NAME, "_", dim.Name)
        : AMDGPUImageDimIntrinsic<
            AMDGPUDimNoSampleProfile<opmod, dim, retty, dataargs, Mip>,
            props, sdnodeprops>;
    }
  }

  multiclass AMDGPUImageDimIntrinsicsAll<string opmod,
                                         list<LLVMType> retty,
                                         list<AMDGPUArg> dataargs,
                                         list<IntrinsicProperty> props,
                                         list<SDNodeProperty> sdnodeprops,
                                         bit Mip = false> {
    foreach dim = AMDGPUDims.All in {
      def !strconcat(NAME, "_", dim.Name)
        : AMDGPUImageDimIntrinsic<
            AMDGPUDimNoSampleProfile<opmod, dim, retty, dataargs, Mip>,
            props, sdnodeprops>;
    }
  }

  defm int_amdgcn_image_load
    : AMDGPUImageDimIntrinsicsAll<"LOAD", [llvm_any_ty], [], [IntrReadMem],
                                  [SDNPMemOperand]>,
      AMDGPUImageDMaskIntrinsic;
  defm int_amdgcn_image_load_mip
    : AMDGPUImageDimIntrinsicsNoMsaa<"LOAD_MIP", [llvm_any_ty], [],
                                     [IntrReadMem, IntrWillReturn], [SDNPMemOperand], 1>,
      AMDGPUImageDMaskIntrinsic;

  defm int_amdgcn_image_store : AMDGPUImageDimIntrinsicsAll<
      "STORE", [], [AMDGPUArg<llvm_anyfloat_ty, "vdata">],
      [IntrWriteMem, IntrWillReturn], [SDNPMemOperand]>,
      AMDGPUImageDMaskIntrinsic;
  defm int_amdgcn_image_store_mip : AMDGPUImageDimIntrinsicsNoMsaa<
      "STORE_MIP", [], [AMDGPUArg<llvm_anyfloat_ty, "vdata">],
      [IntrWriteMem, IntrWillReturn], [SDNPMemOperand], 1>,
      AMDGPUImageDMaskIntrinsic;
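
  // Illustrative IR for the loads/stores defined above (dmask = 15 selects
  // all four channels):
  //   %v = call <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i32(
  //            i32 15, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
  //   call void @llvm.amdgcn.image.store.2d.v4f32.i32(
  //            <4 x float> %v, i32 15, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)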

  //////////////////////////////////////////////////////////////////////////
  // MSAA intrinsics
  //////////////////////////////////////////////////////////////////////////
  foreach dim = AMDGPUDims.Msaa in {
    def int_amdgcn_image_msaa_load_x # _ # dim.Name:
        AMDGPUImageDimIntrinsic<
            AMDGPUDimNoSampleProfile<"MSAA_LOAD_X", dim, [llvm_any_ty], []>,
            [IntrReadMem], [SDNPMemOperand]>;
  }

  foreach dim = AMDGPUDims.Msaa in {
    def int_amdgcn_image_msaa_load # _ # dim.Name:
        AMDGPUImageDimIntrinsic<
            AMDGPUDimNoSampleProfile<"MSAA_LOAD", dim, [llvm_any_ty], []>,
            [IntrReadMem], [SDNPMemOperand]>;
  }

  //////////////////////////////////////////////////////////////////////////
  // sample and getlod intrinsics
  //////////////////////////////////////////////////////////////////////////
  multiclass AMDGPUImageDimSampleDims<string opmod,
                                      AMDGPUSampleVariant sample,
                                      bit NoMem = false> {
    foreach dim = AMDGPUDims.NoMsaa in {
      def !strconcat(NAME, "_", dim.Name) : AMDGPUImageDimIntrinsic<
          AMDGPUDimSampleProfile<opmod, dim, sample>,
          !if(NoMem, [IntrNoMem], [IntrReadMem]),
          !if(NoMem, [], [SDNPMemOperand])>;
    }
  }

  foreach sample = AMDGPUSampleVariants in {
    defm int_amdgcn_image_sample # sample.LowerCaseMod
      : AMDGPUImageDimSampleDims<"SAMPLE" # sample.UpperCaseMod, sample>,
        AMDGPUImageDMaskIntrinsic;
  }

  defm int_amdgcn_image_getlod
    : AMDGPUImageDimSampleDims<"GET_LOD", AMDGPUSample, 1>,
      AMDGPUImageDMaskIntrinsic;

  //////////////////////////////////////////////////////////////////////////
  // getresinfo intrinsics
  //////////////////////////////////////////////////////////////////////////
  foreach dim = AMDGPUDims.All in {
    def !strconcat("int_amdgcn_image_getresinfo_", dim.Name)
      : AMDGPUImageDimIntrinsic<AMDGPUDimGetResInfoProfile<dim>, [IntrNoMem], []>,
        AMDGPUImageDMaskIntrinsic;
  }

  //////////////////////////////////////////////////////////////////////////
  // gather4 intrinsics
  //////////////////////////////////////////////////////////////////////////
  foreach sample = AMDGPUSampleVariantsNoGradients in {
    foreach dim = [AMDGPUDim2D, AMDGPUDimCube, AMDGPUDim2DArray] in {
      def int_amdgcn_image_gather4 # sample.LowerCaseMod # _ # dim.Name:
          AMDGPUImageDimIntrinsic<
              AMDGPUDimSampleProfile<"GATHER4" # sample.UpperCaseMod, dim, sample>,
              [IntrReadMem], [SDNPMemOperand]>;
    }
  }
}

//////////////////////////////////////////////////////////////////////////
// atomic intrinsics
//////////////////////////////////////////////////////////////////////////
defset list<AMDGPUImageDimIntrinsic> AMDGPUImageDimAtomicIntrinsics = {
  multiclass AMDGPUImageDimAtomicX<string opmod, list<AMDGPUArg> dataargs,
                                   int isFloat = 0> {
    foreach dim = AMDGPUDims.All in {
      def !strconcat(NAME, "_", dim.Name): AMDGPUImageDimIntrinsic<
          !if(isFloat, AMDGPUDimAtomicFloatProfile<opmod, dim, dataargs>,
              AMDGPUDimAtomicProfile<opmod, dim, dataargs>),
          [], [SDNPMemOperand]>;
    }
  }

  multiclass AMDGPUImageDimAtomic<string opmod, int isFloat = 0> {
    defm ""
      : AMDGPUImageDimAtomicX<opmod, [AMDGPUArg<LLVMMatchType<0>, "vdata">],
                              isFloat>;
  }

  multiclass AMDGPUImageDimFloatAtomic<string opmod> {
    defm "" : AMDGPUImageDimAtomic<opmod, 1 /*isFloat*/>;
  }

  defm int_amdgcn_image_atomic_swap : AMDGPUImageDimAtomic<"ATOMIC_SWAP">;
  defm int_amdgcn_image_atomic_add : AMDGPUImageDimAtomic<"ATOMIC_ADD">;
  defm int_amdgcn_image_atomic_sub : AMDGPUImageDimAtomic<"ATOMIC_SUB">;
  defm int_amdgcn_image_atomic_smin : AMDGPUImageDimAtomic<"ATOMIC_SMIN">;
  defm int_amdgcn_image_atomic_umin : AMDGPUImageDimAtomic<"ATOMIC_UMIN">;
  defm int_amdgcn_image_atomic_fmin : AMDGPUImageDimFloatAtomic<"ATOMIC_FMIN">;
  defm int_amdgcn_image_atomic_smax : AMDGPUImageDimAtomic<"ATOMIC_SMAX">;
  defm int_amdgcn_image_atomic_umax : AMDGPUImageDimAtomic<"ATOMIC_UMAX">;
  defm int_amdgcn_image_atomic_fmax : AMDGPUImageDimFloatAtomic<"ATOMIC_FMAX">;
  defm int_amdgcn_image_atomic_and : AMDGPUImageDimAtomic<"ATOMIC_AND">;
  defm int_amdgcn_image_atomic_or : AMDGPUImageDimAtomic<"ATOMIC_OR">;
  defm int_amdgcn_image_atomic_xor : AMDGPUImageDimAtomic<"ATOMIC_XOR">;
  defm int_amdgcn_image_atomic_inc : AMDGPUImageDimAtomic<"ATOMIC_INC">;
  defm int_amdgcn_image_atomic_dec : AMDGPUImageDimAtomic<"ATOMIC_DEC">;

  defm int_amdgcn_image_atomic_cmpswap :
      AMDGPUImageDimAtomicX<"ATOMIC_CMPSWAP", [AMDGPUArg<LLVMMatchType<0>, "src">,
                                               AMDGPUArg<LLVMMatchType<0>, "cmp">]>;
}
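
// Illustrative IR for an image atomic (atomics carry no dmask operand and
// return the pre-operation value):
//   %old = call i32 @llvm.amdgcn.image.atomic.add.2d.i32.i32(
//              i32 %data, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)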

//////////////////////////////////////////////////////////////////////////
// Buffer intrinsics
//////////////////////////////////////////////////////////////////////////

// Data type for buffer resources (V#). Maybe, in the future, we can create a
// similar one for textures (T#).
def AMDGPUBufferRsrcTy : LLVMQualPointerType<8>;

let TargetPrefix = "amdgcn" in {

def int_amdgcn_make_buffer_rsrc : DefaultAttrsIntrinsic <
  [AMDGPUBufferRsrcTy],
  [llvm_anyptr_ty, // base
   llvm_i16_ty,    // stride (and swizzle control)
   llvm_i32_ty,    // NumRecords / extent
   llvm_i32_ty],   // flags
  // Attributes lifted from ptrmask + some extra argument attributes.
  [IntrNoMem, ReadNone<ArgIndex<0>>,
   IntrSpeculatable, IntrWillReturn]>;
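
// Illustrative IR sketch (the intrinsic is overloaded on the base pointer's
// address space; the mangling suffix shown is an assumption for a plain
// addrspace(0) base):
//   %rsrc = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p0(
//               ptr %base, i16 0, i32 %num.records, i32 0)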

defset list<AMDGPURsrcIntrinsic> AMDGPUBufferIntrinsics = {

class AMDGPUBufferLoad<LLVMType data_ty = llvm_any_ty> : DefaultAttrsIntrinsic <
  [data_ty],
  [llvm_v4i32_ty, // rsrc(SGPR)
   llvm_i32_ty,   // vindex(VGPR)
   llvm_i32_ty,   // offset(SGPR/VGPR/imm)
   llvm_i1_ty,    // glc(imm)
   llvm_i1_ty],   // slc(imm)
  [IntrReadMem, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<0>;
def int_amdgcn_buffer_load_format : AMDGPUBufferLoad<llvm_anyfloat_ty>;
def int_amdgcn_buffer_load : AMDGPUBufferLoad;

// Generate a buffer_load instruction that may be optimized to s_buffer_load if
// the offset argument is uniform.
def int_amdgcn_s_buffer_load : DefaultAttrsIntrinsic <
  [llvm_any_ty],
  [llvm_v4i32_ty, // rsrc(SGPR)
   llvm_i32_ty,   // byte offset
   llvm_i32_ty],  // cachepolicy(imm; bit 0 = glc, bit 2 = dlc)
  [IntrNoMem, ImmArg<ArgIndex<2>>]>,
  AMDGPURsrcIntrinsic<0>;

class AMDGPUBufferStore<LLVMType data_ty = llvm_any_ty> : DefaultAttrsIntrinsic <
  [],
  [data_ty,       // vdata(VGPR)
   llvm_v4i32_ty, // rsrc(SGPR)
   llvm_i32_ty,   // vindex(VGPR)
   llvm_i32_ty,   // offset(SGPR/VGPR/imm)
   llvm_i1_ty,    // glc(imm)
   llvm_i1_ty],   // slc(imm)
  [IntrWriteMem, ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<1>;
def int_amdgcn_buffer_store_format : AMDGPUBufferStore<llvm_anyfloat_ty>;
def int_amdgcn_buffer_store : AMDGPUBufferStore;

// New buffer intrinsics with separate raw and struct variants. The raw
// variant never has an index. The struct variant always has an index, even if
// it is const 0. A struct intrinsic with constant 0 index is different from the
// corresponding raw intrinsic on gfx9+ because the behavior of bounds checking
// and swizzling changes depending on whether idxen is set in the instruction.
// These new intrinsics also keep the offset and soffset arguments separate as
// they behave differently in bounds checking and swizzling.

// The versions of these intrinsics that take <4 x i32> arguments are deprecated
// in favor of their .ptr.buffer variants that take ptr addrspace(8) arguments,
// which allow for improved reasoning about memory accesses.
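
// Illustrative IR contrasting the raw and struct forms (the struct form adds
// a leading vindex operand):
//   %r = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(
//            ptr addrspace(8) %rsrc, i32 %voffset, i32 %soffset, i32 0)
//   %s = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(
//            ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)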

class AMDGPURawBufferLoad<LLVMType data_ty = llvm_any_ty> : DefaultAttrsIntrinsic <
  [data_ty],
  [llvm_v4i32_ty, // rsrc(SGPR)
   llvm_i32_ty,   // offset(VGPR/imm, included in bounds checking and swizzling)
   llvm_i32_ty,   // soffset(SGPR/imm, excluded from bounds checking and swizzling)
   llvm_i32_ty],  // auxiliary data (imm, cachepolicy (bit 0 = glc,
                  //                                   bit 1 = slc,
                  //                                   bit 2 = dlc on gfx10+),
                  //                 swizzled buffer (bit 3 = swz))
  [IntrReadMem, ImmArg<ArgIndex<3>>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<0>;
def int_amdgcn_raw_buffer_load_format : AMDGPURawBufferLoad<llvm_anyfloat_ty>;
def int_amdgcn_raw_buffer_load : AMDGPURawBufferLoad;

class AMDGPURawPtrBufferLoad<LLVMType data_ty = llvm_any_ty> : DefaultAttrsIntrinsic <
  [data_ty],
  [AMDGPUBufferRsrcTy, // rsrc(SGPR)
   llvm_i32_ty,        // offset(VGPR/imm, included in bounds checking and swizzling)
   llvm_i32_ty,        // soffset(SGPR/imm, excluded from bounds checking and swizzling)
   llvm_i32_ty],       // auxiliary data (imm, cachepolicy (bit 0 = glc,
                       //                                   bit 1 = slc,
                       //                                   bit 2 = dlc on gfx10+),
                       //                 swizzled buffer (bit 3 = swz))
  [IntrArgMemOnly, IntrReadMem, ReadOnly<ArgIndex<0>>, NoCapture<ArgIndex<0>>,
   ImmArg<ArgIndex<3>>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<0>;
def int_amdgcn_raw_ptr_buffer_load_format : AMDGPURawPtrBufferLoad<llvm_anyfloat_ty>;
def int_amdgcn_raw_ptr_buffer_load : AMDGPURawPtrBufferLoad;

class AMDGPUStructBufferLoad<LLVMType data_ty = llvm_any_ty> : DefaultAttrsIntrinsic <
  [data_ty],
  [llvm_v4i32_ty, // rsrc(SGPR)
   llvm_i32_ty,   // vindex(VGPR)
   llvm_i32_ty,   // offset(VGPR/imm, included in bounds checking and swizzling)
   llvm_i32_ty,   // soffset(SGPR/imm, excluded from bounds checking and swizzling)
   llvm_i32_ty],  // auxiliary data (imm, cachepolicy (bit 0 = glc,
                  //                                   bit 1 = slc,
                  //                                   bit 2 = dlc on gfx10+),
                  //                 swizzled buffer (bit 3 = swz))
  [IntrReadMem, ImmArg<ArgIndex<4>>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<0>;
def int_amdgcn_struct_buffer_load_format : AMDGPUStructBufferLoad;
def int_amdgcn_struct_buffer_load : AMDGPUStructBufferLoad;

class AMDGPUStructPtrBufferLoad<LLVMType data_ty = llvm_any_ty> : DefaultAttrsIntrinsic <
  [data_ty],
  [AMDGPUBufferRsrcTy, // rsrc(SGPR)
   llvm_i32_ty,        // vindex(VGPR)
   llvm_i32_ty,        // offset(VGPR/imm, included in bounds checking and swizzling)
   llvm_i32_ty,        // soffset(SGPR/imm, excluded from bounds checking and swizzling)
   llvm_i32_ty],       // auxiliary data (imm, cachepolicy (bit 0 = glc,
                       //                                   bit 1 = slc,
                       //                                   bit 2 = dlc on gfx10+),
                       //                 swizzled buffer (bit 3 = swz))
  [IntrArgMemOnly, IntrReadMem, ReadOnly<ArgIndex<0>>, NoCapture<ArgIndex<0>>,
   ImmArg<ArgIndex<4>>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<0>;
def int_amdgcn_struct_ptr_buffer_load_format : AMDGPUStructPtrBufferLoad;
def int_amdgcn_struct_ptr_buffer_load : AMDGPUStructPtrBufferLoad;

class AMDGPURawBufferStore<LLVMType data_ty = llvm_any_ty> : DefaultAttrsIntrinsic <
  [],
  [data_ty,       // vdata(VGPR)
   llvm_v4i32_ty, // rsrc(SGPR)
   llvm_i32_ty,   // offset(VGPR/imm, included in bounds checking and swizzling)
   llvm_i32_ty,   // soffset(SGPR/imm, excluded from bounds checking and swizzling)
   llvm_i32_ty],  // auxiliary data (imm, cachepolicy (bit 0 = glc,
                  //                                   bit 1 = slc,
                  //                                   bit 2 = dlc on gfx10+),
                  //                 swizzled buffer (bit 3 = swz))
  [IntrWriteMem, ImmArg<ArgIndex<4>>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<1>;
def int_amdgcn_raw_buffer_store_format : AMDGPURawBufferStore<llvm_anyfloat_ty>;
def int_amdgcn_raw_buffer_store : AMDGPURawBufferStore;

class AMDGPURawPtrBufferStore<LLVMType data_ty = llvm_any_ty> : DefaultAttrsIntrinsic <
  [],
  [data_ty,            // vdata(VGPR)
   AMDGPUBufferRsrcTy, // rsrc(SGPR)
   llvm_i32_ty,        // offset(VGPR/imm, included in bounds checking and swizzling)
   llvm_i32_ty,        // soffset(SGPR/imm, excluded from bounds checking and swizzling)
   llvm_i32_ty],       // auxiliary data (imm, cachepolicy (bit 0 = glc,
                       //                                   bit 1 = slc,
                       //                                   bit 2 = dlc on gfx10+),
                       //                 swizzled buffer (bit 3 = swz))
  [IntrArgMemOnly, IntrWriteMem, WriteOnly<ArgIndex<1>>, NoCapture<ArgIndex<1>>,
   ImmArg<ArgIndex<4>>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<1>;
def int_amdgcn_raw_ptr_buffer_store_format : AMDGPURawPtrBufferStore<llvm_anyfloat_ty>;
def int_amdgcn_raw_ptr_buffer_store : AMDGPURawPtrBufferStore;

class AMDGPUStructBufferStore<LLVMType data_ty = llvm_any_ty> : DefaultAttrsIntrinsic <
  [],
  [data_ty,       // vdata(VGPR)
   llvm_v4i32_ty, // rsrc(SGPR)
   llvm_i32_ty,   // vindex(VGPR)
   llvm_i32_ty,   // offset(VGPR/imm, included in bounds checking and swizzling)
   llvm_i32_ty,   // soffset(SGPR/imm, excluded from bounds checking and swizzling)
   llvm_i32_ty],  // auxiliary data (imm, cachepolicy (bit 0 = glc,
                  //                                   bit 1 = slc,
                  //                                   bit 2 = dlc on gfx10+),
                  //                 swizzled buffer (bit 3 = swz))
  [IntrWriteMem, ImmArg<ArgIndex<5>>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<1>;
def int_amdgcn_struct_buffer_store_format : AMDGPUStructBufferStore;
def int_amdgcn_struct_buffer_store : AMDGPUStructBufferStore;

class AMDGPUStructPtrBufferStore<LLVMType data_ty = llvm_any_ty> : DefaultAttrsIntrinsic <
  [],
  [data_ty,            // vdata(VGPR)
   AMDGPUBufferRsrcTy, // rsrc(SGPR)
   llvm_i32_ty,        // vindex(VGPR)
   llvm_i32_ty,        // offset(VGPR/imm, included in bounds checking and swizzling)
   llvm_i32_ty,        // soffset(SGPR/imm, excluded from bounds checking and swizzling)
   llvm_i32_ty],       // auxiliary data (imm, cachepolicy (bit 0 = glc,
                       //                                   bit 1 = slc,
                       //                                   bit 2 = dlc on gfx10+),
                       //                 swizzled buffer (bit 3 = swz))
  [IntrArgMemOnly, IntrWriteMem, WriteOnly<ArgIndex<1>>, NoCapture<ArgIndex<1>>,
   ImmArg<ArgIndex<5>>], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<1>;
def int_amdgcn_struct_ptr_buffer_store_format : AMDGPUStructPtrBufferStore;
def int_amdgcn_struct_ptr_buffer_store : AMDGPUStructPtrBufferStore;

class AMDGPURawBufferAtomic<LLVMType data_ty = llvm_any_ty> : Intrinsic <
  [data_ty],
  [LLVMMatchType<0>, // vdata(VGPR)
   llvm_v4i32_ty,    // rsrc(SGPR)
   llvm_i32_ty,      // offset(VGPR/imm, included in bounds checking and swizzling)
   llvm_i32_ty,      // soffset(SGPR/imm, excluded from bounds checking and swizzling)
   llvm_i32_ty],     // cachepolicy(imm; bit 1 = slc)
  [ImmArg<ArgIndex<4>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<1, 0>;
def int_amdgcn_raw_buffer_atomic_swap : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_add : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_sub : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_smin : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_umin : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_fmin : AMDGPURawBufferAtomic<llvm_anyfloat_ty>;
def int_amdgcn_raw_buffer_atomic_smax : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_umax : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_fmax : AMDGPURawBufferAtomic<llvm_anyfloat_ty>;
def int_amdgcn_raw_buffer_atomic_and : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_or : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_xor : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_inc : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_dec : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_cmpswap : Intrinsic<
  [llvm_anyint_ty],
  [LLVMMatchType<0>, // src(VGPR)
   LLVMMatchType<0>, // cmp(VGPR)
   llvm_v4i32_ty,    // rsrc(SGPR)
   llvm_i32_ty,      // offset(VGPR/imm, included in bounds checking and swizzling)
   llvm_i32_ty,      // soffset(SGPR/imm, excluded from bounds checking and swizzling)
   llvm_i32_ty],     // cachepolicy(imm; bit 1 = slc)
  [ImmArg<ArgIndex<5>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<2, 0>;
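
// Illustrative IR for the raw buffer atomics above (each returns the
// pre-operation value):
//   %old = call i32 @llvm.amdgcn.raw.buffer.atomic.add.i32(
//              i32 %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)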

class AMDGPURawPtrBufferAtomic<LLVMType data_ty = llvm_any_ty> : Intrinsic <
  [data_ty],
  [LLVMMatchType<0>,   // vdata(VGPR)
   AMDGPUBufferRsrcTy, // rsrc(SGPR)
   llvm_i32_ty,        // offset(VGPR/imm, included in bounds checking and swizzling)
   llvm_i32_ty,        // soffset(SGPR/imm, excluded from bounds checking and swizzling)
   llvm_i32_ty],       // cachepolicy(imm; bit 1 = slc)
  [IntrArgMemOnly, NoCapture<ArgIndex<1>>,
   ImmArg<ArgIndex<4>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<1, 0>;

def int_amdgcn_raw_ptr_buffer_atomic_swap : AMDGPURawPtrBufferAtomic;
def int_amdgcn_raw_ptr_buffer_atomic_add : AMDGPURawPtrBufferAtomic;
def int_amdgcn_raw_ptr_buffer_atomic_sub : AMDGPURawPtrBufferAtomic;
def int_amdgcn_raw_ptr_buffer_atomic_smin : AMDGPURawPtrBufferAtomic;
def int_amdgcn_raw_ptr_buffer_atomic_umin : AMDGPURawPtrBufferAtomic;
def int_amdgcn_raw_ptr_buffer_atomic_fmin : AMDGPURawPtrBufferAtomic<llvm_anyfloat_ty>;
def int_amdgcn_raw_ptr_buffer_atomic_smax : AMDGPURawPtrBufferAtomic;
def int_amdgcn_raw_ptr_buffer_atomic_umax : AMDGPURawPtrBufferAtomic;
def int_amdgcn_raw_ptr_buffer_atomic_fmax : AMDGPURawPtrBufferAtomic<llvm_anyfloat_ty>;
def int_amdgcn_raw_ptr_buffer_atomic_and : AMDGPURawPtrBufferAtomic;
def int_amdgcn_raw_ptr_buffer_atomic_or : AMDGPURawPtrBufferAtomic;
def int_amdgcn_raw_ptr_buffer_atomic_xor : AMDGPURawPtrBufferAtomic;
def int_amdgcn_raw_ptr_buffer_atomic_inc : AMDGPURawPtrBufferAtomic;
def int_amdgcn_raw_ptr_buffer_atomic_dec : AMDGPURawPtrBufferAtomic;
def int_amdgcn_raw_ptr_buffer_atomic_cmpswap : Intrinsic<
  [llvm_anyint_ty],
  [LLVMMatchType<0>,   // src(VGPR)
   LLVMMatchType<0>,   // cmp(VGPR)
   AMDGPUBufferRsrcTy, // rsrc(SGPR)
   llvm_i32_ty,        // offset(VGPR/imm, included in bounds checking and swizzling)
   llvm_i32_ty,        // soffset(SGPR/imm, excluded from bounds checking and swizzling)
   llvm_i32_ty],       // cachepolicy(imm; bit 1 = slc)
  [IntrArgMemOnly, NoCapture<ArgIndex<2>>,
   ImmArg<ArgIndex<5>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<2, 0>;

// gfx908 intrinsic
def int_amdgcn_raw_buffer_atomic_fadd : AMDGPURawBufferAtomic<llvm_anyfloat_ty>;
def int_amdgcn_raw_ptr_buffer_atomic_fadd : AMDGPURawPtrBufferAtomic<llvm_anyfloat_ty>;

class AMDGPUStructBufferAtomic<LLVMType data_ty = llvm_any_ty> : Intrinsic <
  [data_ty],
  [LLVMMatchType<0>, // vdata(VGPR)
   llvm_v4i32_ty,    // rsrc(SGPR)
   llvm_i32_ty,      // vindex(VGPR)
   llvm_i32_ty,      // offset(VGPR/imm, included in bounds checking and swizzling)
   llvm_i32_ty,      // soffset(SGPR/imm, excluded from bounds checking and swizzling)
   llvm_i32_ty],     // cachepolicy(imm; bit 1 = slc)
  [ImmArg<ArgIndex<5>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<1, 0>;
def int_amdgcn_struct_buffer_atomic_swap : AMDGPUStructBufferAtomic;
def int_amdgcn_struct_buffer_atomic_add : AMDGPUStructBufferAtomic;
def int_amdgcn_struct_buffer_atomic_sub : AMDGPUStructBufferAtomic;
def int_amdgcn_struct_buffer_atomic_smin : AMDGPUStructBufferAtomic;
def int_amdgcn_struct_buffer_atomic_umin : AMDGPUStructBufferAtomic;
def int_amdgcn_struct_buffer_atomic_smax : AMDGPUStructBufferAtomic;
def int_amdgcn_struct_buffer_atomic_umax : AMDGPUStructBufferAtomic;
def int_amdgcn_struct_buffer_atomic_and : AMDGPUStructBufferAtomic;
def int_amdgcn_struct_buffer_atomic_or : AMDGPUStructBufferAtomic;
def int_amdgcn_struct_buffer_atomic_xor : AMDGPUStructBufferAtomic;
def int_amdgcn_struct_buffer_atomic_inc : AMDGPUStructBufferAtomic;
def int_amdgcn_struct_buffer_atomic_dec : AMDGPUStructBufferAtomic;
def int_amdgcn_struct_buffer_atomic_cmpswap : Intrinsic<
  [llvm_anyint_ty],
  [LLVMMatchType<0>, // src(VGPR)
   LLVMMatchType<0>, // cmp(VGPR)
   llvm_v4i32_ty,    // rsrc(SGPR)
   llvm_i32_ty,      // vindex(VGPR)
   llvm_i32_ty,      // offset(VGPR/imm, included in bounds checking and swizzling)
   llvm_i32_ty,      // soffset(SGPR/imm, excluded from bounds checking and swizzling)
   llvm_i32_ty],     // cachepolicy(imm; bit 1 = slc)
  [ImmArg<ArgIndex<6>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>,
  AMDGPURsrcIntrinsic<2, 0>;
AMDGPURawPtrBufferAtomic<llvm_anyfloat_ty>; 1297 1298class AMDGPUStructBufferAtomic<LLVMType data_ty = llvm_any_ty> : Intrinsic < 1299 [data_ty], 1300 [LLVMMatchType<0>, // vdata(VGPR) 1301 llvm_v4i32_ty, // rsrc(SGPR) 1302 llvm_i32_ty, // vindex(VGPR) 1303 llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) 1304 llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) 1305 llvm_i32_ty], // cachepolicy(imm; bit 1 = slc) 1306 [ImmArg<ArgIndex<5>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>, 1307 AMDGPURsrcIntrinsic<1, 0>; 1308def int_amdgcn_struct_buffer_atomic_swap : AMDGPUStructBufferAtomic; 1309def int_amdgcn_struct_buffer_atomic_add : AMDGPUStructBufferAtomic; 1310def int_amdgcn_struct_buffer_atomic_sub : AMDGPUStructBufferAtomic; 1311def int_amdgcn_struct_buffer_atomic_smin : AMDGPUStructBufferAtomic; 1312def int_amdgcn_struct_buffer_atomic_umin : AMDGPUStructBufferAtomic; 1313def int_amdgcn_struct_buffer_atomic_smax : AMDGPUStructBufferAtomic; 1314def int_amdgcn_struct_buffer_atomic_umax : AMDGPUStructBufferAtomic; 1315def int_amdgcn_struct_buffer_atomic_and : AMDGPUStructBufferAtomic; 1316def int_amdgcn_struct_buffer_atomic_or : AMDGPUStructBufferAtomic; 1317def int_amdgcn_struct_buffer_atomic_xor : AMDGPUStructBufferAtomic; 1318def int_amdgcn_struct_buffer_atomic_inc : AMDGPUStructBufferAtomic; 1319def int_amdgcn_struct_buffer_atomic_dec : AMDGPUStructBufferAtomic; 1320def int_amdgcn_struct_buffer_atomic_cmpswap : Intrinsic< 1321 [llvm_anyint_ty], 1322 [LLVMMatchType<0>, // src(VGPR) 1323 LLVMMatchType<0>, // cmp(VGPR) 1324 llvm_v4i32_ty, // rsrc(SGPR) 1325 llvm_i32_ty, // vindex(VGPR) 1326 llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) 1327 llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) 1328 llvm_i32_ty], // cachepolicy(imm; bit 1 = slc) 1329 [ImmArg<ArgIndex<6>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>, 1330 AMDGPURsrcIntrinsic<2, 0>; 1331 1332class AMDGPUStructPtrBufferAtomic<LLVMType data_ty = llvm_any_ty> : Intrinsic < 1333 [data_ty], 1334 [LLVMMatchType<0>, // vdata(VGPR) 1335 AMDGPUBufferRsrcTy, // rsrc(SGPR) 1336 llvm_i32_ty, // vindex(VGPR) 1337 llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) 1338 llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) 1339 llvm_i32_ty], // cachepolicy(imm; bit 1 = slc) 1340 [IntrArgMemOnly, NoCapture<ArgIndex<1>>, 1341 ImmArg<ArgIndex<5>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>, 1342 AMDGPURsrcIntrinsic<1, 0>; 1343def int_amdgcn_struct_ptr_buffer_atomic_swap : AMDGPUStructPtrBufferAtomic; 1344def int_amdgcn_struct_ptr_buffer_atomic_add : AMDGPUStructPtrBufferAtomic; 1345def int_amdgcn_struct_ptr_buffer_atomic_sub : AMDGPUStructPtrBufferAtomic; 1346def int_amdgcn_struct_ptr_buffer_atomic_smin : AMDGPUStructPtrBufferAtomic; 1347def int_amdgcn_struct_ptr_buffer_atomic_umin : AMDGPUStructPtrBufferAtomic; 1348def int_amdgcn_struct_ptr_buffer_atomic_smax : AMDGPUStructPtrBufferAtomic; 1349def int_amdgcn_struct_ptr_buffer_atomic_umax : AMDGPUStructPtrBufferAtomic; 1350def int_amdgcn_struct_ptr_buffer_atomic_and : AMDGPUStructPtrBufferAtomic; 1351def int_amdgcn_struct_ptr_buffer_atomic_or : AMDGPUStructPtrBufferAtomic; 1352def int_amdgcn_struct_ptr_buffer_atomic_xor : AMDGPUStructPtrBufferAtomic; 1353def int_amdgcn_struct_ptr_buffer_atomic_inc : AMDGPUStructPtrBufferAtomic; 1354def 
int_amdgcn_struct_ptr_buffer_atomic_dec : AMDGPUStructPtrBufferAtomic; 1355def int_amdgcn_struct_ptr_buffer_atomic_cmpswap : Intrinsic< 1356 [llvm_anyint_ty], 1357 [LLVMMatchType<0>, // src(VGPR) 1358 LLVMMatchType<0>, // cmp(VGPR) 1359 AMDGPUBufferRsrcTy, // rsrc(SGPR) 1360 llvm_i32_ty, // vindex(VGPR) 1361 llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) 1362 llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) 1363 llvm_i32_ty], // cachepolicy(imm; bit 1 = slc) 1364 [IntrArgMemOnly, NoCapture<ArgIndex<2>>, 1365 ImmArg<ArgIndex<6>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>, 1366 AMDGPURsrcIntrinsic<2, 0>; 1367 1368// gfx908 intrinsic 1369def int_amdgcn_struct_buffer_atomic_fadd : AMDGPUStructBufferAtomic<llvm_anyfloat_ty>; 1370def int_amdgcn_struct_ptr_buffer_atomic_fadd : AMDGPUStructPtrBufferAtomic<llvm_anyfloat_ty>; 1371 1372// gfx90a intrinsics 1373def int_amdgcn_struct_buffer_atomic_fmin : AMDGPUStructBufferAtomic<llvm_anyfloat_ty>; 1374def int_amdgcn_struct_buffer_atomic_fmax : AMDGPUStructBufferAtomic<llvm_anyfloat_ty>; 1375 1376def int_amdgcn_struct_ptr_buffer_atomic_fmin : AMDGPUStructPtrBufferAtomic<llvm_anyfloat_ty>; 1377def int_amdgcn_struct_ptr_buffer_atomic_fmax : AMDGPUStructPtrBufferAtomic<llvm_anyfloat_ty>; 1378 1379// Obsolescent tbuffer intrinsics. 1380def int_amdgcn_tbuffer_load : DefaultAttrsIntrinsic < 1381 [llvm_any_ty], // overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32 1382 [llvm_v4i32_ty, // rsrc(SGPR) 1383 llvm_i32_ty, // vindex(VGPR) 1384 llvm_i32_ty, // voffset(VGPR) 1385 llvm_i32_ty, // soffset(SGPR) 1386 llvm_i32_ty, // offset(imm) 1387 llvm_i32_ty, // dfmt(imm) 1388 llvm_i32_ty, // nfmt(imm) 1389 llvm_i1_ty, // glc(imm) 1390 llvm_i1_ty], // slc(imm) 1391 [IntrReadMem, 1392 ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>, ImmArg<ArgIndex<6>>, 1393 ImmArg<ArgIndex<7>>, ImmArg<ArgIndex<8>>], "", [SDNPMemOperand]>, 1394 AMDGPURsrcIntrinsic<0>; 1395 1396def int_amdgcn_tbuffer_store : DefaultAttrsIntrinsic < 1397 [], 1398 [llvm_any_ty, // vdata(VGPR), overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32 1399 llvm_v4i32_ty, // rsrc(SGPR) 1400 llvm_i32_ty, // vindex(VGPR) 1401 llvm_i32_ty, // voffset(VGPR) 1402 llvm_i32_ty, // soffset(SGPR) 1403 llvm_i32_ty, // offset(imm) 1404 llvm_i32_ty, // dfmt(imm) 1405 llvm_i32_ty, // nfmt(imm) 1406 llvm_i1_ty, // glc(imm) 1407 llvm_i1_ty], // slc(imm) 1408 [IntrWriteMem, ImmArg<ArgIndex<5>>, 1409 ImmArg<ArgIndex<6>>, ImmArg<ArgIndex<7>>, 1410 ImmArg<ArgIndex<8>>, ImmArg<ArgIndex<9>>], "", [SDNPMemOperand]>, 1411 AMDGPURsrcIntrinsic<1>; 1412 1413// New tbuffer intrinsics, with: 1414// - raw and struct variants 1415// - joint format field 1416// - joint cachepolicy field 1417def int_amdgcn_raw_tbuffer_load : DefaultAttrsIntrinsic < 1418 [llvm_any_ty], // overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32 1419 [llvm_v4i32_ty, // rsrc(SGPR) 1420 llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) 1421 llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) 1422 llvm_i32_ty, // format(imm; bits 3..0 = dfmt, bits 6..4 = nfmt) 1423 llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = glc, 1424 // bit 1 = slc, 1425 // bit 2 = dlc on gfx10+), 1426 // swizzled buffer (bit 3 = swz)) 1427 [IntrReadMem, 1428 ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>], "", [SDNPMemOperand]>, 1429 AMDGPURsrcIntrinsic<0>; 1430 1431def int_amdgcn_raw_ptr_tbuffer_load : DefaultAttrsIntrinsic < 1432 [llvm_any_ty], // 
overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32 1433 [AMDGPUBufferRsrcTy, // rsrc(SGPR) 1434 llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) 1435 llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) 1436 llvm_i32_ty, // format(imm; bits 3..0 = dfmt, bits 6..4 = nfmt) 1437 llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = glc, 1438 // bit 1 = slc, 1439 // bit 2 = dlc on gfx10+), 1440 // swizzled buffer (bit 3 = swz)) 1441 [IntrArgMemOnly, IntrReadMem, ReadOnly<ArgIndex<0>>, NoCapture<ArgIndex<0>>, 1442 ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>], "", [SDNPMemOperand]>, 1443 AMDGPURsrcIntrinsic<0>; 1444 1445def int_amdgcn_raw_tbuffer_store : DefaultAttrsIntrinsic < 1446 [], 1447 [llvm_any_ty, // vdata(VGPR), overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32 1448 llvm_v4i32_ty, // rsrc(SGPR) 1449 llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) 1450 llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) 1451 llvm_i32_ty, // format(imm; bits 3..0 = dfmt, bits 6..4 = nfmt) 1452 llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = glc, 1453 // bit 1 = slc, 1454 // bit 2 = dlc on gfx10+), 1455 // swizzled buffer (bit 3 = swz)) 1456 [IntrWriteMem, 1457 ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>], "", [SDNPMemOperand]>, 1458 AMDGPURsrcIntrinsic<1>; 1459 1460def int_amdgcn_raw_ptr_tbuffer_store : DefaultAttrsIntrinsic < 1461 [], 1462 [llvm_any_ty, // vdata(VGPR), overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32 1463 AMDGPUBufferRsrcTy, // rsrc(SGPR) 1464 llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) 1465 llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) 1466 llvm_i32_ty, // format(imm; bits 3..0 = dfmt, bits 6..4 = nfmt) 1467 llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = glc, 1468 // bit 1 = slc, 1469 // bit 2 = dlc on gfx10+), 1470 // swizzled buffer (bit 3 = swz)) 1471 [IntrArgMemOnly, IntrWriteMem, WriteOnly<ArgIndex<1>>, NoCapture<ArgIndex<1>>, 1472 ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>], "", [SDNPMemOperand]>, 1473 AMDGPURsrcIntrinsic<1>; 1474 1475def int_amdgcn_struct_tbuffer_load : DefaultAttrsIntrinsic < 1476 [llvm_any_ty], // overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32 1477 [llvm_v4i32_ty, // rsrc(SGPR) 1478 llvm_i32_ty, // vindex(VGPR) 1479 llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) 1480 llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) 1481 llvm_i32_ty, // format(imm; bits 3..0 = dfmt, bits 6..4 = nfmt) 1482 llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = glc, 1483 // bit 1 = slc, 1484 // bit 2 = dlc on gfx10+), 1485 // swizzled buffer (bit 3 = swz)) 1486 [IntrReadMem, 1487 ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>], "", [SDNPMemOperand]>, 1488 AMDGPURsrcIntrinsic<0>; 1489 1490def int_amdgcn_struct_ptr_tbuffer_load : DefaultAttrsIntrinsic < 1491 [llvm_any_ty], // overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32 1492 [AMDGPUBufferRsrcTy, // rsrc(SGPR) 1493 llvm_i32_ty, // vindex(VGPR) 1494 llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) 1495 llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) 1496 llvm_i32_ty, // format(imm; bits 3..0 = dfmt, bits 6..4 = nfmt) 1497 llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = glc, 1498 // bit 1 = slc, 1499 // bit 2 = dlc on gfx10+), 1500 // swizzled buffer (bit 3 = swz)) 1501 [IntrArgMemOnly,
IntrReadMem, ReadOnly<ArgIndex<0>>, NoCapture<ArgIndex<0>>, 1502 ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>], "", [SDNPMemOperand]>, 1503 AMDGPURsrcIntrinsic<0>; 1504 1505def int_amdgcn_struct_ptr_tbuffer_store : DefaultAttrsIntrinsic < 1506 [], 1507 [llvm_any_ty, // vdata(VGPR), overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32 1508 AMDGPUBufferRsrcTy, // rsrc(SGPR) 1509 llvm_i32_ty, // vindex(VGPR) 1510 llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) 1511 llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) 1512 llvm_i32_ty, // format(imm; bits 3..0 = dfmt, bits 6..4 = nfmt) 1513 llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = glc, 1514 // bit 1 = slc, 1515 // bit 2 = dlc on gfx10+), 1516 // swizzled buffer (bit 3 = swz)) 1517 [IntrArgMemOnly, IntrWriteMem, WriteOnly<ArgIndex<1>>, NoCapture<ArgIndex<1>>, 1518 ImmArg<ArgIndex<5>>, ImmArg<ArgIndex<6>>], "", [SDNPMemOperand]>, 1519 AMDGPURsrcIntrinsic<1>; 1520 1521def int_amdgcn_struct_tbuffer_store : DefaultAttrsIntrinsic < 1522 [], 1523 [llvm_any_ty, // vdata(VGPR), overloaded for types f32/i32, v2f32/v2i32, v4f32/v4i32 1524 llvm_v4i32_ty, // rsrc(SGPR) 1525 llvm_i32_ty, // vindex(VGPR) 1526 llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) 1527 llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) 1528 llvm_i32_ty, // format(imm; bits 3..0 = dfmt, bits 6..4 = nfmt) 1529 llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = glc, 1530 // bit 1 = slc, 1531 // bit 2 = dlc on gfx10+), 1532 // swizzled buffer (bit 3 = swz)) 1533 [IntrWriteMem, 1534 ImmArg<ArgIndex<5>>, ImmArg<ArgIndex<6>>], "", [SDNPMemOperand]>, 1535 AMDGPURsrcIntrinsic<1>; 1536 1537class AMDGPUBufferAtomic : Intrinsic < 1538 [llvm_anyint_ty], 1539 [LLVMMatchType<0>, // vdata(VGPR) 1540 llvm_v4i32_ty, // rsrc(SGPR) 1541 llvm_i32_ty, // vindex(VGPR) 1542 llvm_i32_ty, // offset(SGPR/VGPR/imm) 1543 llvm_i1_ty], // slc(imm) 1544 [ImmArg<ArgIndex<4>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>, 1545 AMDGPURsrcIntrinsic<1, 0>; 1546def int_amdgcn_buffer_atomic_swap : AMDGPUBufferAtomic; 1547def int_amdgcn_buffer_atomic_add : AMDGPUBufferAtomic; 1548def int_amdgcn_buffer_atomic_sub : AMDGPUBufferAtomic; 1549def int_amdgcn_buffer_atomic_smin : AMDGPUBufferAtomic; 1550def int_amdgcn_buffer_atomic_umin : AMDGPUBufferAtomic; 1551def int_amdgcn_buffer_atomic_smax : AMDGPUBufferAtomic; 1552def int_amdgcn_buffer_atomic_umax : AMDGPUBufferAtomic; 1553def int_amdgcn_buffer_atomic_and : AMDGPUBufferAtomic; 1554def int_amdgcn_buffer_atomic_or : AMDGPUBufferAtomic; 1555def int_amdgcn_buffer_atomic_xor : AMDGPUBufferAtomic; 1556def int_amdgcn_buffer_atomic_cmpswap : Intrinsic< 1557 [llvm_i32_ty], 1558 [llvm_i32_ty, // src(VGPR) 1559 llvm_i32_ty, // cmp(VGPR) 1560 llvm_v4i32_ty, // rsrc(SGPR) 1561 llvm_i32_ty, // vindex(VGPR) 1562 llvm_i32_ty, // offset(SGPR/VGPR/imm) 1563 llvm_i1_ty], // slc(imm) 1564 [ImmArg<ArgIndex<5>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>, 1565 AMDGPURsrcIntrinsic<2, 0>; 1566 1567def int_amdgcn_buffer_atomic_csub : AMDGPUBufferAtomic; 1568 1569class AMDGPUBufferAtomicFP : Intrinsic < 1570 [llvm_anyfloat_ty], 1571 [LLVMMatchType<0>, // vdata(VGPR) 1572 llvm_v4i32_ty, // rsrc(SGPR) 1573 llvm_i32_ty, // vindex(VGPR) 1574 llvm_i32_ty, // offset(SGPR/VGPR/imm) 1575 llvm_i1_ty], // slc(imm) 1576 [ImmArg<ArgIndex<4>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>, 1577 
AMDGPURsrcIntrinsic<1, 0>; 1578 1579// Legacy form of the intrinsic. raw and struct forms should be preferred. 1580def int_amdgcn_buffer_atomic_fadd : AMDGPUBufferAtomicFP; 1581 1582class AMDGPURawBufferLoadLDS : Intrinsic < 1583 [], 1584 [llvm_v4i32_ty, // rsrc(SGPR) 1585 LLVMQualPointerType<3>, // LDS base offset 1586 llvm_i32_ty, // Data byte size: 1/2/4 1587 llvm_i32_ty, // voffset(VGPR, included in bounds checking and swizzling) 1588 llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) 1589 llvm_i32_ty, // imm offset(imm, included in bounds checking and swizzling) 1590 llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = glc, 1591 // bit 1 = slc, 1592 // bit 2 = dlc on gfx10+)) 1593 // swizzled buffer (bit 3 = swz)) 1594 [IntrWillReturn, NoCapture<ArgIndex<1>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<5>>, 1595 ImmArg<ArgIndex<6>>, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<0>; 1596def int_amdgcn_raw_buffer_load_lds : AMDGPURawBufferLoadLDS; 1597 1598class AMDGPURawPtrBufferLoadLDS : Intrinsic < 1599 [], 1600 [AMDGPUBufferRsrcTy, // rsrc(SGPR) 1601 LLVMQualPointerType<3>, // LDS base offset 1602 llvm_i32_ty, // Data byte size: 1/2/4 1603 llvm_i32_ty, // voffset(VGPR, included in bounds checking and swizzling) 1604 llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) 1605 llvm_i32_ty, // imm offset(imm, included in bounds checking and swizzling) 1606 llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = glc, 1607 // bit 1 = slc, 1608 // bit 2 = dlc on gfx10+)) 1609 // swizzled buffer (bit 3 = swz)) 1610 [IntrWillReturn, IntrArgMemOnly, 1611 ReadOnly<ArgIndex<0>>, NoCapture<ArgIndex<0>>, 1612 WriteOnly<ArgIndex<1>>, NoCapture<ArgIndex<1>>, 1613 ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<5>>, 1614 ImmArg<ArgIndex<6>>, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<0>; 1615def int_amdgcn_raw_ptr_buffer_load_lds : AMDGPURawPtrBufferLoadLDS; 1616 1617class AMDGPUStructBufferLoadLDS : Intrinsic < 1618 [], 1619 [llvm_v4i32_ty, // rsrc(SGPR) 1620 LLVMQualPointerType<3>, // LDS base offset 1621 llvm_i32_ty, // Data byte size: 1/2/4 1622 llvm_i32_ty, // vindex(VGPR) 1623 llvm_i32_ty, // voffset(VGPR, included in bounds checking and swizzling) 1624 llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) 1625 llvm_i32_ty, // imm offset(imm, included in bounds checking and swizzling) 1626 llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = glc, 1627 // bit 1 = slc, 1628 // bit 2 = dlc on gfx10+)) 1629 // swizzled buffer (bit 3 = swz)) 1630 [IntrWillReturn, NoCapture<ArgIndex<1>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<6>>, 1631 ImmArg<ArgIndex<7>>, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<0>; 1632def int_amdgcn_struct_buffer_load_lds : AMDGPUStructBufferLoadLDS; 1633 1634class AMDGPUStructPtrBufferLoadLDS : Intrinsic < 1635 [], 1636 [AMDGPUBufferRsrcTy, // rsrc(SGPR) 1637 LLVMQualPointerType<3> , // LDS base offset 1638 llvm_i32_ty, // Data byte size: 1/2/4 1639 llvm_i32_ty, // vindex(VGPR) 1640 llvm_i32_ty, // voffset(VGPR, included in bounds checking and swizzling) 1641 llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) 1642 llvm_i32_ty, // imm offset(imm, included in bounds checking and swizzling) 1643 llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = glc, 1644 // bit 1 = slc, 1645 // bit 2 = dlc on gfx10+)) 1646 // swizzled buffer (bit 3 = swz)) 1647 [IntrWillReturn, IntrArgMemOnly, 1648 
ReadOnly<ArgIndex<0>>, NoCapture<ArgIndex<0>>, 1649 WriteOnly<ArgIndex<1>>, NoCapture<ArgIndex<1>>, 1650 ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<6>>, 1651 ImmArg<ArgIndex<7>>, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<0>; 1652def int_amdgcn_struct_ptr_buffer_load_lds : AMDGPUStructPtrBufferLoadLDS; 1653 1654} // defset AMDGPUBufferIntrinsics 1655 1656// Uses that do not set the done bit should set IntrWriteMem on the 1657// call site. 1658def int_amdgcn_exp : DefaultAttrsIntrinsic <[], [ 1659 llvm_i32_ty, // tgt, 1660 llvm_i32_ty, // en 1661 llvm_any_ty, // src0 (f32 or i32) 1662 LLVMMatchType<0>, // src1 1663 LLVMMatchType<0>, // src2 1664 LLVMMatchType<0>, // src3 1665 llvm_i1_ty, // done 1666 llvm_i1_ty // vm (ignored on GFX11+) 1667 ], 1668 [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<6>>, 1669 ImmArg<ArgIndex<7>>, IntrWriteMem, IntrInaccessibleMemOnly] 1670>; 1671 1672// exp with row_en bit set. Only supported on GFX11+. 1673def int_amdgcn_exp_row : DefaultAttrsIntrinsic <[], [ 1674 llvm_i32_ty, // tgt, 1675 llvm_i32_ty, // en 1676 llvm_any_ty, // src0 (f32 or i32) 1677 LLVMMatchType<0>, // src1 1678 LLVMMatchType<0>, // src2 1679 LLVMMatchType<0>, // src3 1680 llvm_i1_ty, // done 1681 llvm_i32_ty], // row number 1682 [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<6>>, 1683 IntrWriteMem, IntrInaccessibleMemOnly] 1684>; 1685 1686// exp with compr bit set. Not supported on GFX11+. 1687def int_amdgcn_exp_compr : DefaultAttrsIntrinsic <[], [ 1688 llvm_i32_ty, // tgt, 1689 llvm_i32_ty, // en 1690 llvm_anyvector_ty, // src0 (v2f16 or v2i16) 1691 LLVMMatchType<0>, // src1 1692 llvm_i1_ty, // done 1693 llvm_i1_ty], // vm 1694 [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<4>>, 1695 ImmArg<ArgIndex<5>>, IntrWriteMem, IntrInaccessibleMemOnly] 1696>; 1697 1698def int_amdgcn_buffer_wbinvl1_sc : 1699 ClangBuiltin<"__builtin_amdgcn_buffer_wbinvl1_sc">, 1700 DefaultAttrsIntrinsic<[], [], [IntrNoMem, IntrHasSideEffects]>; 1701 1702def int_amdgcn_buffer_wbinvl1 : 1703 ClangBuiltin<"__builtin_amdgcn_buffer_wbinvl1">, 1704 DefaultAttrsIntrinsic<[], [], [IntrNoMem, IntrHasSideEffects]>; 1705 1706def int_amdgcn_s_dcache_inv : 1707 ClangBuiltin<"__builtin_amdgcn_s_dcache_inv">, 1708 DefaultAttrsIntrinsic<[], [], [IntrNoMem, IntrHasSideEffects]>; 1709 1710def int_amdgcn_s_memtime : 1711 ClangBuiltin<"__builtin_amdgcn_s_memtime">, 1712 DefaultAttrsIntrinsic<[llvm_i64_ty], [], [IntrNoMem, IntrHasSideEffects]>; 1713 1714def int_amdgcn_s_sleep : 1715 ClangBuiltin<"__builtin_amdgcn_s_sleep">, 1716 DefaultAttrsIntrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, 1717 IntrHasSideEffects]> { 1718} 1719 1720def int_amdgcn_s_sleep_var 1721 : ClangBuiltin<"__builtin_amdgcn_s_sleep_var">, 1722 Intrinsic<[], [llvm_i32_ty], 1723 [IntrNoMem, IntrHasSideEffects, IntrWillReturn]> { 1724} 1725 1726def int_amdgcn_s_nop : 1727 DefaultAttrsIntrinsic<[], [llvm_i16_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, 1728 IntrHasSideEffects]> { 1729} 1730 1731def int_amdgcn_s_incperflevel : 1732 ClangBuiltin<"__builtin_amdgcn_s_incperflevel">, 1733 DefaultAttrsIntrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, 1734 IntrHasSideEffects]> { 1735} 1736 1737def int_amdgcn_s_decperflevel : 1738 ClangBuiltin<"__builtin_amdgcn_s_decperflevel">, 1739 DefaultAttrsIntrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, 1740 IntrHasSideEffects]> { 1741} 1742 1743def int_amdgcn_s_sethalt : 1744 DefaultAttrsIntrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, 
1745 IntrHasSideEffects]>; 1746 1747def int_amdgcn_s_setprio : 1748 ClangBuiltin<"__builtin_amdgcn_s_setprio">, 1749 DefaultAttrsIntrinsic<[], [llvm_i16_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, 1750 IntrHasSideEffects]>; 1751 1752def int_amdgcn_s_ttracedata : 1753 DefaultAttrsIntrinsic<[], [llvm_i32_ty], 1754 [IntrNoMem, IntrHasSideEffects]>; 1755def int_amdgcn_s_ttracedata_imm : 1756 DefaultAttrsIntrinsic<[], [llvm_i16_ty], 1757 [IntrNoMem, IntrHasSideEffects, ImmArg<ArgIndex<0>>]>; 1758 1759// This is IntrHasSideEffects so it can be used to read cycle counters. 1760def int_amdgcn_s_getreg : 1761 ClangBuiltin<"__builtin_amdgcn_s_getreg">, 1762 DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty], 1763 [IntrNoMem, IntrHasSideEffects, ImmArg<ArgIndex<0>>] 1764>; 1765 1766// Note this can be used to set FP environment properties that are 1767// unsafe to change in non-strictfp functions. The register properties 1768// available (and value required to access them) may differ per 1769// subtarget. llvm.amdgcn.s.setreg(hwmode, value) 1770def int_amdgcn_s_setreg : 1771 ClangBuiltin<"__builtin_amdgcn_s_setreg">, 1772 DefaultAttrsIntrinsic<[], [llvm_i32_ty, llvm_i32_ty], 1773 [IntrNoMem, IntrHasSideEffects, ImmArg<ArgIndex<0>>] 1774>; 1775 1776// int_amdgcn_s_getpc is provided to allow a specific style of position 1777// independent code to determine the high part of its address when it is 1778// known (through convention) that the code and any data of interest do 1779// not cross a 4GB address boundary. Use for any other purpose may not 1780// produce the desired results as optimizations may cause code movement, 1781// especially as we explicitly use IntrNoMem to allow optimizations. 1782def int_amdgcn_s_getpc : 1783 ClangBuiltin<"__builtin_amdgcn_s_getpc">, 1784 DefaultAttrsIntrinsic<[llvm_i64_ty], [], [IntrNoMem, IntrSpeculatable, 1785 IntrWillReturn]>; 1786 1787// __builtin_amdgcn_interp_mov <param>, <attr_chan>, <attr>, <m0> 1788// param values: 0 = P10, 1 = P20, 2 = P0 1789def int_amdgcn_interp_mov : 1790 ClangBuiltin<"__builtin_amdgcn_interp_mov">, 1791 DefaultAttrsIntrinsic<[llvm_float_ty], 1792 [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], 1793 [IntrNoMem, IntrSpeculatable, 1794 ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>]>; 1795 1796// __builtin_amdgcn_interp_p1 <i>, <attr_chan>, <attr>, <m0> 1797// This intrinsic reads from lds, but the memory values are constant, 1798// so it behaves like IntrNoMem. 1799def int_amdgcn_interp_p1 : 1800 ClangBuiltin<"__builtin_amdgcn_interp_p1">, 1801 DefaultAttrsIntrinsic<[llvm_float_ty], 1802 [llvm_float_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], 1803 [IntrNoMem, IntrSpeculatable, 1804 ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>]>; 1805 1806// __builtin_amdgcn_interp_p2 <p1>, <j>, <attr_chan>, <attr>, <m0> 1807def int_amdgcn_interp_p2 : 1808 ClangBuiltin<"__builtin_amdgcn_interp_p2">, 1809 DefaultAttrsIntrinsic<[llvm_float_ty], 1810 [llvm_float_ty, llvm_float_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], 1811 [IntrNoMem, IntrSpeculatable, 1812 ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>]>; 1813 // See int_amdgcn_interp_p1 for why this is IntrNoMem.
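// Illustrative usage (a sketch, not part of the definitions): full two-stage
// attribute interpolation chains p1 and p2; in LLVM IR, with hypothetical
// value names and attribute/channel 0:
//   %p1 = call float @llvm.amdgcn.interp.p1(float %i, i32 0, i32 0, i32 %m0)
//   %v  = call float @llvm.amdgcn.interp.p2(float %p1, float %j, i32 0, i32 0, i32 %m0)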
1814 1815// __builtin_amdgcn_interp_p1_f16 <i>, <attr_chan>, <attr>, <high>, <m0> 1816// high selects whether high or low 16-bits are loaded from LDS 1817def int_amdgcn_interp_p1_f16 : 1818 ClangBuiltin<"__builtin_amdgcn_interp_p1_f16">, 1819 DefaultAttrsIntrinsic<[llvm_float_ty], 1820 [llvm_float_ty, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i32_ty], 1821 [IntrNoMem, IntrSpeculatable, 1822 ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>]>; 1823 1824// __builtin_amdgcn_interp_p2_f16 <p1>, <j>, <attr_chan>, <attr>, <high>, <m0> 1825// high selects whether high or low 16-bits are loaded from LDS 1826def int_amdgcn_interp_p2_f16 : 1827 ClangBuiltin<"__builtin_amdgcn_interp_p2_f16">, 1828 DefaultAttrsIntrinsic<[llvm_half_ty], 1829 [llvm_float_ty, llvm_float_ty, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i32_ty], 1830 [IntrNoMem, IntrSpeculatable, 1831 ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>]>; 1832 1833// llvm.amdgcn.lds.direct.load <m0> 1834// The input argument is m0, which contains a packed combination of address 1835// offset and flags describing the data type. 1836def int_amdgcn_lds_direct_load : 1837 DefaultAttrsIntrinsic<[llvm_any_ty], // overloaded for types u8, u16, i32/f32, i8, i16 1838 [llvm_i32_ty], 1839 [IntrReadMem, IntrSpeculatable]>; 1840 1841// llvm.amdgcn.lds.param.load <attr_chan>, <attr>, <m0> 1842// Like interp intrinsics, this reads from lds, but the memory values are constant, 1843// so it behaves like IntrNoMem. 1844def int_amdgcn_lds_param_load : 1845 DefaultAttrsIntrinsic<[llvm_float_ty], 1846 [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], 1847 [IntrNoMem, IntrSpeculatable, 1848 ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>]>; 1849 1850// llvm.amdgcn.interp.inreg.p10 <p>, <i>, <p0> 1851def int_amdgcn_interp_inreg_p10 : 1852 DefaultAttrsIntrinsic<[llvm_float_ty], 1853 [llvm_float_ty, llvm_float_ty, llvm_float_ty], 1854 [IntrNoMem, IntrSpeculatable]>; 1855 1856// llvm.amdgcn.interp.inreg.p2 <p>, <j>, <tmp> 1857def int_amdgcn_interp_inreg_p2 : 1858 DefaultAttrsIntrinsic<[llvm_float_ty], 1859 [llvm_float_ty, llvm_float_ty, llvm_float_ty], 1860 [IntrNoMem, IntrSpeculatable]>; 1861 1862// llvm.amdgcn.interp.inreg.p10.f16 <p>, <i>, <p0>, <high> 1863// high selects whether high or low 16-bits are used for p and p0 operands 1864def int_amdgcn_interp_inreg_p10_f16: 1865 DefaultAttrsIntrinsic<[llvm_float_ty], 1866 [llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_i1_ty], 1867 [IntrNoMem, IntrSpeculatable, 1868 ImmArg<ArgIndex<3>>]>; 1869 1870// llvm.amdgcn.interp.inreg.p2.f16 <p>, <j>, <tmp>, <high> 1871// high selects whether high or low 16-bits are used for p operand 1872def int_amdgcn_interp_inreg_p2_f16 : 1873 DefaultAttrsIntrinsic<[llvm_half_ty], 1874 [llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_i1_ty], 1875 [IntrNoMem, IntrSpeculatable, 1876 ImmArg<ArgIndex<3>>]>; 1877 1878// Deprecated: use llvm.amdgcn.live.mask instead. 1879def int_amdgcn_ps_live : DefaultAttrsIntrinsic < 1880 [llvm_i1_ty], 1881 [], 1882 [IntrNoMem]>; 1883 1884// Query currently live lanes. 1885// Returns true if lane is live (and not a helper lane). 
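// For example (illustrative), in a pixel shader:
//   %live = call i1 @llvm.amdgcn.live.mask()
// %live is false in helper lanes.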
1886def int_amdgcn_live_mask : DefaultAttrsIntrinsic <[llvm_i1_ty], 1887 [], [IntrReadMem, IntrInaccessibleMemOnly] 1888>; 1889 1890def int_amdgcn_mbcnt_lo : 1891 ClangBuiltin<"__builtin_amdgcn_mbcnt_lo">, 1892 DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], 1893 [IntrNoMem]>; 1894 1895def int_amdgcn_mbcnt_hi : 1896 ClangBuiltin<"__builtin_amdgcn_mbcnt_hi">, 1897 DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], 1898 [IntrNoMem]>; 1899 1900// llvm.amdgcn.ds.swizzle src offset 1901def int_amdgcn_ds_swizzle : 1902 ClangBuiltin<"__builtin_amdgcn_ds_swizzle">, 1903 Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], 1904 [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree, 1905 ImmArg<ArgIndex<1>>]>; 1906 1907def int_amdgcn_ubfe : DefaultAttrsIntrinsic<[llvm_anyint_ty], 1908 [LLVMMatchType<0>, llvm_i32_ty, llvm_i32_ty], 1909 [IntrNoMem, IntrSpeculatable] 1910>; 1911 1912def int_amdgcn_sbfe : DefaultAttrsIntrinsic<[llvm_anyint_ty], 1913 [LLVMMatchType<0>, llvm_i32_ty, llvm_i32_ty], 1914 [IntrNoMem, IntrSpeculatable] 1915>; 1916 1917def int_amdgcn_lerp : 1918 ClangBuiltin<"__builtin_amdgcn_lerp">, 1919 DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], 1920 [IntrNoMem, IntrSpeculatable] 1921>; 1922 1923def int_amdgcn_sad_u8 : 1924 ClangBuiltin<"__builtin_amdgcn_sad_u8">, 1925 DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], 1926 [IntrNoMem, IntrSpeculatable] 1927>; 1928 1929def int_amdgcn_msad_u8 : 1930 ClangBuiltin<"__builtin_amdgcn_msad_u8">, 1931 DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], 1932 [IntrNoMem, IntrSpeculatable] 1933>; 1934 1935def int_amdgcn_sad_hi_u8 : 1936 ClangBuiltin<"__builtin_amdgcn_sad_hi_u8">, 1937 DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], 1938 [IntrNoMem, IntrSpeculatable] 1939>; 1940 1941def int_amdgcn_sad_u16 : 1942 ClangBuiltin<"__builtin_amdgcn_sad_u16">, 1943 DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], 1944 [IntrNoMem, IntrSpeculatable] 1945>; 1946 1947def int_amdgcn_qsad_pk_u16_u8 : 1948 ClangBuiltin<"__builtin_amdgcn_qsad_pk_u16_u8">, 1949 DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i32_ty, llvm_i64_ty], 1950 [IntrNoMem, IntrSpeculatable] 1951>; 1952 1953def int_amdgcn_mqsad_pk_u16_u8 : 1954 ClangBuiltin<"__builtin_amdgcn_mqsad_pk_u16_u8">, 1955 DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i32_ty, llvm_i64_ty], 1956 [IntrNoMem, IntrSpeculatable] 1957>; 1958 1959def int_amdgcn_mqsad_u32_u8 : 1960 ClangBuiltin<"__builtin_amdgcn_mqsad_u32_u8">, 1961 DefaultAttrsIntrinsic<[llvm_v4i32_ty], [llvm_i64_ty, llvm_i32_ty, llvm_v4i32_ty], 1962 [IntrNoMem, IntrSpeculatable] 1963>; 1964 1965def int_amdgcn_cvt_pk_u8_f32 : 1966 ClangBuiltin<"__builtin_amdgcn_cvt_pk_u8_f32">, 1967 DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_float_ty, llvm_i32_ty, llvm_i32_ty], 1968 [IntrNoMem, IntrSpeculatable] 1969>; 1970 1971def int_amdgcn_icmp : 1972 Intrinsic<[llvm_anyint_ty], [llvm_anyint_ty, LLVMMatchType<1>, llvm_i32_ty], 1973 [IntrNoMem, IntrConvergent, 1974 ImmArg<ArgIndex<2>>, IntrWillReturn, IntrNoCallback, IntrNoFree]>; 1975 1976def int_amdgcn_fcmp : 1977 Intrinsic<[llvm_anyint_ty], [llvm_anyfloat_ty, LLVMMatchType<1>, llvm_i32_ty], 1978 [IntrNoMem, IntrConvergent, 1979 ImmArg<ArgIndex<2>>, IntrWillReturn, IntrNoCallback, IntrNoFree]>; 1980 1981def int_amdgcn_ballot : 1982 Intrinsic<[llvm_anyint_ty], [llvm_i1_ty], 1983 [IntrNoMem, IntrConvergent, 
IntrWillReturn, IntrNoCallback, IntrNoFree]>; 1984 1985def int_amdgcn_inverse_ballot : 1986 Intrinsic<[llvm_i1_ty], [llvm_anyint_ty], 1987 [IntrNoMem, IntrWillReturn, IntrNoCallback, IntrNoFree]>; 1988 1989// Lowers to S_BITREPLICATE_B64_B32. 1990// The argument must be uniform; otherwise, the result is undefined. 1991def int_amdgcn_s_bitreplicate : 1992 DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_i32_ty], [IntrNoMem, IntrConvergent]>; 1993 1994// Lowers to S_QUADMASK_B{32,64} 1995// The argument must be uniform; otherwise, the result is undefined. 1996def int_amdgcn_s_quadmask : 1997 DefaultAttrsIntrinsic<[llvm_anyint_ty], [llvm_anyint_ty], [IntrNoMem, IntrConvergent]>; 1998 1999// Lowers to S_WQM_B{32,64} 2000// The argument must be uniform; otherwise, the result is undefined. 2001// Does not set WQM; merely calculates the bitmask. 2002def int_amdgcn_s_wqm : 2003 DefaultAttrsIntrinsic<[llvm_anyint_ty], [llvm_anyint_ty], [IntrNoMem, IntrConvergent]>; 2004 2005class AMDGPUWaveReduce<LLVMType data_ty = llvm_anyint_ty> : Intrinsic< 2006 [data_ty], 2007 [ 2008 LLVMMatchType<0>, // LLVM value to reduce (SGPR/VGPR) 2009 llvm_i32_ty // Reduction Strategy Switch for lowering (0: Default, 2010 // 1: Iterative strategy, and 2011 // 2: DPP) 2012 ], 2013 [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree, ImmArg<ArgIndex<1>>]>; 2014 2015def int_amdgcn_wave_reduce_umin : AMDGPUWaveReduce; 2016def int_amdgcn_wave_reduce_umax : AMDGPUWaveReduce; 2017 2018def int_amdgcn_readfirstlane : 2019 ClangBuiltin<"__builtin_amdgcn_readfirstlane">, 2020 Intrinsic<[llvm_i32_ty], [llvm_i32_ty], 2021 [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>; 2022 2023// The lane argument must be uniform across the currently active threads of the 2024// current wave. Otherwise, the result is undefined. 2025def int_amdgcn_readlane : 2026 ClangBuiltin<"__builtin_amdgcn_readlane">, 2027 Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], 2028 [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>; 2029 2030// The value to write and lane select arguments must be uniform across the 2031// currently active threads of the current wave. Otherwise, the result is 2032// undefined. 2033def int_amdgcn_writelane : 2034 ClangBuiltin<"__builtin_amdgcn_writelane">, 2035 Intrinsic<[llvm_i32_ty], [ 2036 llvm_i32_ty, // uniform value to write: returned by the selected lane 2037 llvm_i32_ty, // uniform lane select 2038 llvm_i32_ty // returned by all lanes other than the selected one 2039 ], 2040 [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree] 2041>; 2042 2043def int_amdgcn_alignbyte : ClangBuiltin<"__builtin_amdgcn_alignbyte">, 2044 DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], 2045 [IntrNoMem, IntrSpeculatable] 2046>; 2047 2048// mul24 intrinsics can return i32 or i64. 2049// When returning i64, they're lowered to a mul24/mulhi24 pair.
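// For example (a sketch; the overload is selected by the return type, with
// the usual type-mangled names and hypothetical values):
//   %p32 = call i32 @llvm.amdgcn.mul.u24.i32(i32 %a, i32 %b)
//   %p64 = call i64 @llvm.amdgcn.mul.u24.i64(i32 %a, i32 %b) ; mul24 + mulhi24 pair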
2050def int_amdgcn_mul_i24 : DefaultAttrsIntrinsic<[llvm_anyint_ty], 2051 [llvm_i32_ty, llvm_i32_ty], 2052 [IntrNoMem, IntrSpeculatable] 2053>; 2054 2055def int_amdgcn_mul_u24 : DefaultAttrsIntrinsic<[llvm_anyint_ty], 2056 [llvm_i32_ty, llvm_i32_ty], 2057 [IntrNoMem, IntrSpeculatable] 2058>; 2059 2060def int_amdgcn_mulhi_i24 : DefaultAttrsIntrinsic<[llvm_i32_ty], 2061 [llvm_i32_ty, llvm_i32_ty], 2062 [IntrNoMem, IntrSpeculatable] 2063>; 2064 2065def int_amdgcn_mulhi_u24 : DefaultAttrsIntrinsic<[llvm_i32_ty], 2066 [llvm_i32_ty, llvm_i32_ty], 2067 [IntrNoMem, IntrSpeculatable] 2068>; 2069 2070// llvm.amdgcn.ds.gws.init(i32 bar_val, i32 resource_id) 2071// 2072// bar_val is the total number of waves that will wait on this 2073// barrier, minus 1. 2074def int_amdgcn_ds_gws_init : 2075 ClangBuiltin<"__builtin_amdgcn_ds_gws_init">, 2076 Intrinsic<[], 2077 [llvm_i32_ty, llvm_i32_ty], 2078 [IntrConvergent, IntrWriteMem, 2079 IntrInaccessibleMemOnly, IntrWillReturn, IntrNoCallback, IntrNoFree], "", 2080 [SDNPMemOperand] 2081>; 2082 2083// llvm.amdgcn.ds.gws.barrier(i32 vsrc0, i32 resource_id) 2084// vsrc0 is the total number of waves that will wait on this 2085// barrier, minus 1. 2086def int_amdgcn_ds_gws_barrier : 2087 ClangBuiltin<"__builtin_amdgcn_ds_gws_barrier">, 2088 Intrinsic<[], 2089 [llvm_i32_ty, llvm_i32_ty], 2090 [IntrConvergent, IntrInaccessibleMemOnly, IntrWillReturn, IntrNoCallback, IntrNoFree], "", 2091 [SDNPMemOperand] 2092>; 2093 2094// llvm.amdgcn.ds.gws.sema.v(i32 resource_id) 2095def int_amdgcn_ds_gws_sema_v : 2096 ClangBuiltin<"__builtin_amdgcn_ds_gws_sema_v">, 2097 Intrinsic<[], 2098 [llvm_i32_ty], 2099 [IntrConvergent, IntrInaccessibleMemOnly, IntrWillReturn, IntrNoCallback, IntrNoFree], "", 2100 [SDNPMemOperand] 2101>; 2102 2103// llvm.amdgcn.ds.gws.sema.br(i32 vsrc, i32 resource_id) 2104def int_amdgcn_ds_gws_sema_br : 2105 ClangBuiltin<"__builtin_amdgcn_ds_gws_sema_br">, 2106 Intrinsic<[], 2107 [llvm_i32_ty, llvm_i32_ty], 2108 [IntrConvergent, IntrInaccessibleMemOnly, IntrWillReturn, IntrNoCallback, IntrNoFree], "", 2109 [SDNPMemOperand] 2110>; 2111 2112// llvm.amdgcn.ds.gws.sema.p(i32 resource_id) 2113def int_amdgcn_ds_gws_sema_p : 2114 ClangBuiltin<"__builtin_amdgcn_ds_gws_sema_p">, 2115 Intrinsic<[], 2116 [llvm_i32_ty], 2117 [IntrConvergent, IntrInaccessibleMemOnly, IntrWillReturn, IntrNoCallback, IntrNoFree], "", 2118 [SDNPMemOperand] 2119>; 2120 2121// llvm.amdgcn.ds.gws.sema.release.all(i32 resource_id) 2122def int_amdgcn_ds_gws_sema_release_all : 2123 ClangBuiltin<"__builtin_amdgcn_ds_gws_sema_release_all">, 2124 Intrinsic<[], 2125 [llvm_i32_ty], 2126 [IntrConvergent, IntrInaccessibleMemOnly, IntrWillReturn, IntrNoCallback, IntrNoFree], "", 2127 [SDNPMemOperand] 2128>; 2129 2130 2131// Copies the source value to the destination value, with the guarantee that 2132// the source value is computed as if the entire program were executed in WQM. 2133def int_amdgcn_wqm : Intrinsic<[llvm_any_ty], 2134 [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable, IntrWillReturn, IntrNoCallback, IntrNoFree] 2135>; 2136 2137// Copies the source value to the destination value, such that the source 2138// is computed as if the entire program were executed in WQM if any other 2139// program code executes in WQM. 2140def int_amdgcn_softwqm : Intrinsic<[llvm_any_ty], 2141 [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable, IntrWillReturn, IntrNoCallback, IntrNoFree] 2142>; 2143 2144// Return true if at least one thread within the pixel quad passes true into 2145// the function.
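// For example (illustrative):
//   %any = call i1 @llvm.amdgcn.wqm.vote(i1 %cond)
// %any is true in a lane whenever at least one lane of its quad passes a
// true %cond.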
2146def int_amdgcn_wqm_vote : Intrinsic<[llvm_i1_ty], 2147 [llvm_i1_ty], [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree] 2148>; 2149 2150// If false, set EXEC=0 for the current thread until the end of program. 2151// FIXME: Should this be IntrNoMem, IntrHasSideEffects, or IntrWillReturn? 2152def int_amdgcn_kill : Intrinsic<[], [llvm_i1_ty], [IntrNoCallback, IntrNoFree]>; 2153 2154def int_amdgcn_endpgm : ClangBuiltin<"__builtin_amdgcn_endpgm">, 2155 Intrinsic<[], [], [IntrNoReturn, IntrCold, IntrNoMem, IntrHasSideEffects, IntrConvergent, 2156 IntrNoCallback, IntrNoFree] 2157>; 2158 2159// If false, mark all active lanes as helper lanes until the end of program. 2160def int_amdgcn_wqm_demote : Intrinsic<[], 2161 [llvm_i1_ty], [IntrWriteMem, IntrInaccessibleMemOnly, IntrNoCallback, IntrNoFree] 2162>; 2163 2164// Copies the active channels of the source value to the destination value, 2165// with the guarantee that the source value is computed as if the entire 2166// program were executed in Whole Wavefront Mode, i.e. with all channels 2167// enabled, with a few exceptions: phi nodes which require WWM return an 2168// undefined value. 2169def int_amdgcn_strict_wwm : Intrinsic<[llvm_any_ty], 2170 [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable, 2171 IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree] 2172>; 2173// Deprecated. Use int_amdgcn_strict_wwm instead. 2174def int_amdgcn_wwm : Intrinsic<[llvm_any_ty], 2175 [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable, 2176 IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree] 2177>; 2178def int_amdgcn_strict_wqm : Intrinsic<[llvm_any_ty], 2179 [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable, 2180 IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree] 2181>; 2182 2183// Given a value, copies it while setting all the inactive lanes to a given 2184// value. Note that OpenGL helper lanes are considered active, so if the 2185// program ever uses WQM, then the instruction and the first source will be 2186// computed in WQM. 2187def int_amdgcn_set_inactive : 2188 Intrinsic<[llvm_anyint_ty], 2189 [LLVMMatchType<0>, // value to be copied 2190 LLVMMatchType<0>], // value for the inactive lanes to take 2191 [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>; 2192 2193// Similar to int_amdgcn_set_inactive, but the value for the inactive lanes must 2194// be a VGPR function argument. 2195// Can only be used in functions with the `amdgpu_cs_chain` or 2196// `amdgpu_cs_chain_preserve` calling conventions, and only in uniform control 2197// flow. 2198def int_amdgcn_set_inactive_chain_arg : 2199 Intrinsic<[llvm_anyint_ty], 2200 [LLVMMatchType<0>, // value to be copied 2201 LLVMMatchType<0>], // value for the inactive lanes to take 2202 [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>; 2203 2204// Return true if the given flat pointer points to a local memory address. 2205def int_amdgcn_is_shared : ClangBuiltin<"__builtin_amdgcn_is_shared">, 2206 DefaultAttrsIntrinsic<[llvm_i1_ty], [llvm_ptr_ty], 2207 [IntrNoMem, IntrSpeculatable, NoCapture<ArgIndex<0>>] 2208>; 2209 2210// Return true if the given flat pointer points to a private memory address. 2211def int_amdgcn_is_private : ClangBuiltin<"__builtin_amdgcn_is_private">, 2212 DefaultAttrsIntrinsic<[llvm_i1_ty], [llvm_ptr_ty], 2213 [IntrNoMem, IntrSpeculatable, NoCapture<ArgIndex<0>>] 2214>; 2215 2216// A uniform tail call to a function with the `amdgpu_cs_chain` or 2217// `amdgpu_cs_chain_preserve` calling convention.
It will populate the SGPRs 2218// starting at s0 and the VGPRs starting at v8, set EXEC and perform a jump to 2219// the given function. 2220// Can only be used in functions with the `amdgpu_cs`, `amdgpu_cs_chain` or 2221// `amdgpu_cs_chain_preserve` calling conventions, and only in uniform control 2222// flow. 2223def int_amdgcn_cs_chain: 2224 Intrinsic<[], 2225 [llvm_anyptr_ty, // The function to jump to. 2226 llvm_anyint_ty, // Value to put in EXEC (should be i32 or i64). 2227 llvm_any_ty, // Arguments that will be copied into SGPRs (s0+). 2228 // Must be uniform. 2229 llvm_any_ty, // Arguments that will be copied into VGPRs (v8+). 2230 // Need not be uniform. 2231 llvm_i32_ty, // Flags. 2232 llvm_vararg_ty // Additional arguments. Only present if Flags is 2233 // non-zero. 2234 ], 2235 [IntrConvergent, IntrNoReturn, ImmArg<ArgIndex<4>>]>; 2236 2237 2238//===----------------------------------------------------------------------===// 2239// CI+ Intrinsics 2240//===----------------------------------------------------------------------===// 2241 2242def int_amdgcn_s_dcache_inv_vol : 2243 ClangBuiltin<"__builtin_amdgcn_s_dcache_inv_vol">, 2244 DefaultAttrsIntrinsic<[], [], [IntrNoMem, IntrHasSideEffects]>; 2245 2246def int_amdgcn_buffer_wbinvl1_vol : 2247 ClangBuiltin<"__builtin_amdgcn_buffer_wbinvl1_vol">, 2248 DefaultAttrsIntrinsic<[], [], [IntrNoMem, IntrHasSideEffects]>; 2249 2250//===----------------------------------------------------------------------===// 2251// VI Intrinsics 2252//===----------------------------------------------------------------------===// 2253 2254// llvm.amdgcn.mov.dpp.i32 <src> <dpp_ctrl> <row_mask> <bank_mask> <bound_ctrl> 2255def int_amdgcn_mov_dpp : 2256 Intrinsic<[llvm_anyint_ty], 2257 [LLVMMatchType<0>, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, 2258 llvm_i1_ty], 2259 [IntrNoMem, IntrConvergent, IntrWillReturn, 2260 ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>, 2261 ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, IntrNoCallback, IntrNoFree]>; 2262 2263// llvm.amdgcn.update.dpp.i32 <old> <src> <dpp_ctrl> <row_mask> <bank_mask> <bound_ctrl> 2264// Should be equivalent to: 2265// v_mov_b32 <dest> <old> 2266// v_mov_b32 <dest> <src> <dpp_ctrl> <row_mask> <bank_mask> <bound_ctrl> 2267def int_amdgcn_update_dpp : 2268 Intrinsic<[llvm_any_ty], 2269 [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty, 2270 llvm_i32_ty, llvm_i32_ty, llvm_i1_ty], 2271 [IntrNoMem, IntrConvergent, IntrWillReturn, 2272 ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, 2273 ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>, IntrNoCallback, IntrNoFree]>; 2274 2275def int_amdgcn_s_dcache_wb : 2276 ClangBuiltin<"__builtin_amdgcn_s_dcache_wb">, 2277 Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrWillReturn, IntrNoCallback, IntrNoFree]>; 2278 2279def int_amdgcn_s_dcache_wb_vol : 2280 ClangBuiltin<"__builtin_amdgcn_s_dcache_wb_vol">, 2281 Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrWillReturn, IntrNoCallback, IntrNoFree]>; 2282 2283def int_amdgcn_s_memrealtime : 2284 ClangBuiltin<"__builtin_amdgcn_s_memrealtime">, 2285 Intrinsic<[llvm_i64_ty], [], [IntrNoMem, IntrHasSideEffects, IntrWillReturn, IntrNoCallback, IntrNoFree]>; 2286 2287// llvm.amdgcn.ds.permute <index> <src> 2288def int_amdgcn_ds_permute : 2289 ClangBuiltin<"__builtin_amdgcn_ds_permute">, 2290 Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], 2291 [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>; 2292 2293// llvm.amdgcn.ds.bpermute <index> <src> 2294def int_amdgcn_ds_bpermute : 2295 
ClangBuiltin<"__builtin_amdgcn_ds_bpermute">, 2296 Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], 2297 [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>; 2298 2299// llvm.amdgcn.perm <src0> <src1> <selector> 2300def int_amdgcn_perm : 2301 ClangBuiltin<"__builtin_amdgcn_perm">, 2302 Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], 2303 [IntrNoMem, IntrSpeculatable, IntrWillReturn, IntrNoCallback, IntrNoFree]>; 2304 2305//===----------------------------------------------------------------------===// 2306// GFX9 Intrinsics 2307//===----------------------------------------------------------------------===// 2308 2309class AMDGPUGlobalLoadLDS : Intrinsic < 2310 [], 2311 [LLVMQualPointerType<1>, // Base global pointer to load from 2312 LLVMQualPointerType<3>, // LDS base pointer to store to 2313 llvm_i32_ty, // Data byte size: 1/2/4 2314 llvm_i32_ty, // imm offset (applied to both global and LDS address) 2315 llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = glc/sc0, 2316 // bit 1 = slc/sc1, 2317 // bit 2 = dlc on gfx10+)) 2318 // bit 4 = scc/nt on gfx90a+)) 2319 [IntrWillReturn, NoCapture<ArgIndex<0>>, NoCapture<ArgIndex<1>>, 2320 ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, IntrNoCallback, IntrNoFree], 2321 "", [SDNPMemOperand]>; 2322def int_amdgcn_global_load_lds : AMDGPUGlobalLoadLDS; 2323 2324//===----------------------------------------------------------------------===// 2325// GFX10 Intrinsics 2326//===----------------------------------------------------------------------===// 2327 2328// llvm.amdgcn.permlane16 <old> <src0> <src1> <src2> <fi> <bound_control> 2329def int_amdgcn_permlane16 : ClangBuiltin<"__builtin_amdgcn_permlane16">, 2330 Intrinsic<[llvm_i32_ty], 2331 [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i1_ty], 2332 [IntrNoMem, IntrConvergent, IntrWillReturn, 2333 ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>, IntrNoCallback, IntrNoFree]>; 2334 2335// llvm.amdgcn.permlanex16 <old> <src0> <src1> <src2> <fi> <bound_control> 2336def int_amdgcn_permlanex16 : ClangBuiltin<"__builtin_amdgcn_permlanex16">, 2337 Intrinsic<[llvm_i32_ty], 2338 [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i1_ty], 2339 [IntrNoMem, IntrConvergent, IntrWillReturn, 2340 ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>, IntrNoCallback, IntrNoFree]>; 2341 2342// llvm.amdgcn.mov.dpp8.i32 <src> <sel> 2343// <sel> is a 32-bit constant whose high 8 bits must be zero which selects 2344// the lanes to read from. 2345def int_amdgcn_mov_dpp8 : 2346 Intrinsic<[llvm_anyint_ty], 2347 [LLVMMatchType<0>, llvm_i32_ty], 2348 [IntrNoMem, IntrConvergent, IntrWillReturn, 2349 ImmArg<ArgIndex<1>>, IntrNoCallback, IntrNoFree]>; 2350 2351def int_amdgcn_s_get_waveid_in_workgroup : 2352 ClangBuiltin<"__builtin_amdgcn_s_get_waveid_in_workgroup">, 2353 Intrinsic<[llvm_i32_ty], [], 2354 [IntrNoMem, IntrHasSideEffects, IntrWillReturn, IntrNoCallback, IntrNoFree]>; 2355 2356class AMDGPUAtomicRtn<LLVMType vt> : Intrinsic < 2357 [vt], 2358 [llvm_anyptr_ty, // vaddr 2359 vt], // vdata(VGPR) 2360 [IntrArgMemOnly, IntrWillReturn, NoCapture<ArgIndex<0>>, IntrNoCallback, IntrNoFree], "", 2361 [SDNPMemOperand]>; 2362 2363def int_amdgcn_global_atomic_csub : AMDGPUAtomicRtn<llvm_i32_ty>; 2364 2365// uint4 llvm.amdgcn.image.bvh.intersect.ray <node_ptr>, <ray_extent>, <ray_origin>, 2366// <ray_dir>, <ray_inv_dir>, <texture_descr> 2367// <node_ptr> is i32 or i64. 2368// <ray_dir> and <ray_inv_dir> are both v3f16 or both v3f32. 
2369def int_amdgcn_image_bvh_intersect_ray : 2370 DefaultAttrsIntrinsic<[llvm_v4i32_ty], 2371 [llvm_anyint_ty, llvm_float_ty, llvm_v3f32_ty, llvm_anyvector_ty, 2372 LLVMMatchType<1>, llvm_v4i32_ty], 2373 [IntrReadMem, IntrWillReturn, IntrNoCallback, IntrNoFree]>; 2374 2375//===----------------------------------------------------------------------===// 2376// GFX11 Intrinsics 2377//===----------------------------------------------------------------------===// 2378 2379// llvm.amdgcn.permlane64 <src0> 2380def int_amdgcn_permlane64 : 2381 ClangBuiltin<"__builtin_amdgcn_permlane64">, 2382 Intrinsic<[llvm_i32_ty], [llvm_i32_ty], 2383 [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>; 2384 2385def int_amdgcn_ds_add_gs_reg_rtn : 2386 ClangBuiltin<"__builtin_amdgcn_ds_add_gs_reg_rtn">, 2387 Intrinsic<[llvm_anyint_ty], [llvm_i32_ty, llvm_i32_ty], 2388 [ImmArg<ArgIndex<1>>, IntrHasSideEffects, IntrWillReturn, IntrNoCallback, IntrNoFree], 2389 "", [SDNPMemOperand]>; 2390 2391def int_amdgcn_ds_sub_gs_reg_rtn : 2392 ClangBuiltin<"__builtin_amdgcn_ds_sub_gs_reg_rtn">, 2393 Intrinsic<[llvm_anyint_ty], [llvm_i32_ty, llvm_i32_ty], 2394 [ImmArg<ArgIndex<1>>, IntrHasSideEffects, IntrWillReturn, IntrNoCallback, IntrNoFree], 2395 "", [SDNPMemOperand]>; 2396 2397def int_amdgcn_ds_bvh_stack_rtn : 2398 Intrinsic< 2399 [llvm_i32_ty, llvm_i32_ty], // %vdst, %addr 2400 [ 2401 llvm_i32_ty, // %addr 2402 llvm_i32_ty, // %data0 2403 llvm_v4i32_ty, // %data1 2404 llvm_i32_ty, // %offset 2405 ], 2406 [ImmArg<ArgIndex<3>>, IntrWillReturn, IntrNoCallback, IntrNoFree] 2407 >; 2408 2409// WMMA (Wave Matrix Multiply-Accumulate) intrinsics 2410// 2411// These operations perform a matrix multiplication and accumulation of 2412// the form: D = A * B + C. 2413 2414class AMDGPUWmmaIntrinsic<LLVMType AB, LLVMType CD> : 2415 Intrinsic< 2416 [CD], // %D 2417 [ 2418 AB, // %A 2419 AB, // %B 2420 LLVMMatchType<0>, // %C 2421 ], 2422 [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree] 2423>; 2424 2425class AMDGPUWmmaIntrinsicOPSEL<LLVMType AB, LLVMType CD> : 2426 Intrinsic< 2427 [CD], // %D 2428 [ 2429 AB, // %A 2430 AB, // %B 2431 LLVMMatchType<0>, // %C 2432 llvm_i1_ty, // %high 2433 ], 2434 [IntrNoMem, IntrConvergent, ImmArg<ArgIndex<3>>, IntrWillReturn, IntrNoCallback, IntrNoFree] 2435>; 2436 2437class AMDGPUWmmaIntrinsicIU<LLVMType AB, LLVMType CD> : 2438 Intrinsic< 2439 [CD], // %D 2440 [ 2441 llvm_i1_ty, // %A_sign 2442 AB, // %A 2443 llvm_i1_ty, // %B_sign 2444 AB, // %B 2445 LLVMMatchType<0>, // %C 2446 llvm_i1_ty, // %clamp 2447 ], 2448 [IntrNoMem, IntrConvergent, ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<5>>, IntrWillReturn, IntrNoCallback, IntrNoFree] 2449>; 2450 2451def int_amdgcn_wmma_f32_16x16x16_f16 : AMDGPUWmmaIntrinsic<llvm_v16f16_ty, llvm_anyfloat_ty>; 2452def int_amdgcn_wmma_f32_16x16x16_bf16 : AMDGPUWmmaIntrinsic<llvm_v16i16_ty, llvm_anyfloat_ty>; 2453// The regular, untied f16/bf16 wmma intrinsics only write to one half 2454// of the registers (set via the op_sel bit). 2455// The content of the other 16 bits of the registers is undefined. 2456def int_amdgcn_wmma_f16_16x16x16_f16 : AMDGPUWmmaIntrinsicOPSEL<llvm_v16f16_ty, llvm_anyfloat_ty>; 2457def int_amdgcn_wmma_bf16_16x16x16_bf16 : AMDGPUWmmaIntrinsicOPSEL<llvm_v16i16_ty, llvm_anyint_ty>; 2458// The tied versions of the f16/bf16 wmma intrinsics tie the destination matrix 2459// registers to the input accumulator registers. 2460// Essentially, the content of the other 16 bits is preserved from the input.
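// For example (a sketch of the untied f16 form defined above; overload
// mangling and value names are assumptions for illustration):
//   %d = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16(
//          <16 x half> %a, <16 x half> %b, <16 x half> %c, i1 false)
// Here the unused half of each %d register is undefined; the _tied variants
// below instead preserve it from %c.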
2461def int_amdgcn_wmma_f16_16x16x16_f16_tied : AMDGPUWmmaIntrinsicOPSEL<llvm_v16f16_ty, llvm_anyfloat_ty>; 2462def int_amdgcn_wmma_bf16_16x16x16_bf16_tied : AMDGPUWmmaIntrinsicOPSEL<llvm_v16i16_ty, llvm_anyint_ty>; 2463def int_amdgcn_wmma_i32_16x16x16_iu8 : AMDGPUWmmaIntrinsicIU<llvm_v4i32_ty, llvm_anyint_ty>; 2464def int_amdgcn_wmma_i32_16x16x16_iu4 : AMDGPUWmmaIntrinsicIU<llvm_v2i32_ty, llvm_anyint_ty>; 2465 2466def int_amdgcn_s_wait_event_export_ready : 2467 ClangBuiltin<"__builtin_amdgcn_s_wait_event_export_ready">, 2468 Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrWillReturn] 2469>; 2470 2471//===----------------------------------------------------------------------===// 2472// GFX12 Intrinsics 2473//===----------------------------------------------------------------------===// 2474 2475// llvm.amdgcn.permlane16.var <old> <src0> <src1> <fi> <bound_control> 2476def int_amdgcn_permlane16_var : ClangBuiltin<"__builtin_amdgcn_permlane16_var">, 2477 Intrinsic<[llvm_i32_ty], 2478 [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i1_ty], 2479 [IntrNoMem, IntrConvergent, IntrWillReturn, 2480 ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, IntrNoCallback, IntrNoFree]>; 2481 2482// llvm.amdgcn.permlanex16.var <old> <src0> <src1> <fi> <bound_control> 2483def int_amdgcn_permlanex16_var : ClangBuiltin<"__builtin_amdgcn_permlanex16_var">, 2484 Intrinsic<[llvm_i32_ty], 2485 [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i1_ty], 2486 [IntrNoMem, IntrConvergent, IntrWillReturn, 2487 ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, IntrNoCallback, IntrNoFree]>; 2488 2489def int_amdgcn_flat_atomic_fmin_num : AMDGPUAtomicRtn<llvm_anyfloat_ty>; 2490def int_amdgcn_flat_atomic_fmax_num : AMDGPUAtomicRtn<llvm_anyfloat_ty>; 2491def int_amdgcn_global_atomic_fmin_num : AMDGPUAtomicRtn<llvm_anyfloat_ty>; 2492def int_amdgcn_global_atomic_fmax_num : AMDGPUAtomicRtn<llvm_anyfloat_ty>; 2493 2494//===----------------------------------------------------------------------===// 2495// Deep learning intrinsics. 
2496//===----------------------------------------------------------------------===// 2497 2498// f32 %r = llvm.amdgcn.fdot2(v2f16 %a, v2f16 %b, f32 %c, i1 %clamp) 2499// %r = %a[0] * %b[0] + %a[1] * %b[1] + %c 2500def int_amdgcn_fdot2 : 2501 ClangBuiltin<"__builtin_amdgcn_fdot2">, 2502 DefaultAttrsIntrinsic< 2503 [llvm_float_ty], // %r 2504 [ 2505 llvm_v2f16_ty, // %a 2506 llvm_v2f16_ty, // %b 2507 llvm_float_ty, // %c 2508 llvm_i1_ty // %clamp 2509 ], 2510 [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<3>>] 2511 >; 2512 2513// f16 %r = llvm.amdgcn.fdot2.f16.f16(v2f16 %a, v2f16 %b, f16 %c) 2514// %r = %a[0] * %b[0] + %a[1] * %b[1] + %c 2515def int_amdgcn_fdot2_f16_f16 : 2516 ClangBuiltin<"__builtin_amdgcn_fdot2_f16_f16">, 2517 DefaultAttrsIntrinsic< 2518 [llvm_half_ty], // %r 2519 [ 2520 llvm_v2f16_ty, // %a 2521 llvm_v2f16_ty, // %b 2522 llvm_half_ty // %c 2523 ], 2524 [IntrNoMem, IntrSpeculatable] 2525 >; 2526 2527// bf16 %r = llvm.amdgcn.fdot2.bf16.bf16(v2bf16 %a, v2bf16 %b, bf16 %c) 2528// %r = %a[0] * %b[0] + %a[1] * %b[1] + %c 2529def int_amdgcn_fdot2_bf16_bf16 : 2530 ClangBuiltin<"__builtin_amdgcn_fdot2_bf16_bf16">, 2531 DefaultAttrsIntrinsic< 2532 [llvm_i16_ty], // %r 2533 [ 2534 llvm_v2i16_ty, // %a 2535 llvm_v2i16_ty, // %b 2536 llvm_i16_ty // %c 2537 ], 2538 [IntrNoMem, IntrSpeculatable] 2539 >; 2540 2541// f32 %r = llvm.amdgcn.fdot2.f32.bf16(v2bf16 %a, v2bf16 %b, f32 %c, i1 %clamp) 2542// %r = %a[0] * %b[0] + %a[1] * %b[1] + %c 2543def int_amdgcn_fdot2_f32_bf16 : 2544 ClangBuiltin<"__builtin_amdgcn_fdot2_f32_bf16">, 2545 DefaultAttrsIntrinsic< 2546 [llvm_float_ty], // %r 2547 [ 2548 llvm_v2i16_ty, // %a 2549 llvm_v2i16_ty, // %b 2550 llvm_float_ty, // %c 2551 llvm_i1_ty // %clamp 2552 ], 2553 [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<3>>] 2554 >; 2555 2556// i32 %r = llvm.amdgcn.sdot2(v2i16 %a, v2i16 %b, i32 %c, i1 %clamp) 2557// %r = %a[0] * %b[0] + %a[1] * %b[1] + %c 2558def int_amdgcn_sdot2 : 2559 ClangBuiltin<"__builtin_amdgcn_sdot2">, 2560 DefaultAttrsIntrinsic< 2561 [llvm_i32_ty], // %r 2562 [ 2563 llvm_v2i16_ty, // %a 2564 llvm_v2i16_ty, // %b 2565 llvm_i32_ty, // %c 2566 llvm_i1_ty // %clamp 2567 ], 2568 [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<3>>] 2569 >; 2570 2571// u32 %r = llvm.amdgcn.udot2(v2u16 %a, v2u16 %b, u32 %c, i1 %clamp) 2572// %r = %a[0] * %b[0] + %a[1] * %b[1] + %c 2573def int_amdgcn_udot2 : 2574 ClangBuiltin<"__builtin_amdgcn_udot2">, 2575 DefaultAttrsIntrinsic< 2576 [llvm_i32_ty], // %r 2577 [ 2578 llvm_v2i16_ty, // %a 2579 llvm_v2i16_ty, // %b 2580 llvm_i32_ty, // %c 2581 llvm_i1_ty // %clamp 2582 ], 2583 [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<3>>] 2584 >; 2585 2586// i32 %r = llvm.amdgcn.sdot4(v4i8 (as i32) %a, v4i8 (as i32) %b, i32 %c, i1 %clamp) 2587// %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] + %c 2588def int_amdgcn_sdot4 : 2589 ClangBuiltin<"__builtin_amdgcn_sdot4">, 2590 DefaultAttrsIntrinsic< 2591 [llvm_i32_ty], // %r 2592 [ 2593 llvm_i32_ty, // %a 2594 llvm_i32_ty, // %b 2595 llvm_i32_ty, // %c 2596 llvm_i1_ty // %clamp 2597 ], 2598 [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<3>>] 2599 >; 2600 2601// u32 %r = llvm.amdgcn.udot4(v4u8 (as u32) %a, v4u8 (as u32) %b, u32 %c, i1 %clamp) 2602// %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] + %c 2603def int_amdgcn_udot4 : 2604 ClangBuiltin<"__builtin_amdgcn_udot4">, 2605 DefaultAttrsIntrinsic< 2606 [llvm_i32_ty], // %r 2607 [ 2608 llvm_i32_ty, // %a 2609 llvm_i32_ty, // %b 2610 llvm_i32_ty, // %c 2611 llvm_i1_ty // %clamp 2612 ], 2613 

// i32 %r = llvm.amdgcn.sudot4(i1 %a_sign, v4i8 (as i32) %a, i1 %b_sign, v4i8 (as i32) %b, i32 %c, i1 %clamp)
//   Treat input as signed (_sign = 1) or unsigned (_sign = 0).
//   a[i in 0..3] = (%a_sign ? a.i8[i] : promoteToSigned(a.u8[i]));
//   b[i in 0..3] = (%b_sign ? b.i8[i] : promoteToSigned(b.u8[i]));
//   %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] + %c
def int_amdgcn_sudot4 :
  ClangBuiltin<"__builtin_amdgcn_sudot4">,
  DefaultAttrsIntrinsic<
    [llvm_i32_ty], // %r
    [
      llvm_i1_ty,  // %a_sign
      llvm_i32_ty, // %a
      llvm_i1_ty,  // %b_sign
      llvm_i32_ty, // %b
      llvm_i32_ty, // %c
      llvm_i1_ty   // %clamp
    ],
    [IntrNoMem, IntrSpeculatable,
     ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<5>>]
  >;

// i32 %r = llvm.amdgcn.sdot8(v8i4 (as i32) %a, v8i4 (as i32) %b, i32 %c, i1 %clamp)
//   %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] +
//        %a[4] * %b[4] + %a[5] * %b[5] + %a[6] * %b[6] + %a[7] * %b[7] + %c
def int_amdgcn_sdot8 :
  ClangBuiltin<"__builtin_amdgcn_sdot8">,
  DefaultAttrsIntrinsic<
    [llvm_i32_ty], // %r
    [
      llvm_i32_ty, // %a
      llvm_i32_ty, // %b
      llvm_i32_ty, // %c
      llvm_i1_ty   // %clamp
    ],
    [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<3>>]
  >;

// u32 %r = llvm.amdgcn.udot8(v8u4 (as u32) %a, v8u4 (as u32) %b, u32 %c, i1 %clamp)
//   %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] +
//        %a[4] * %b[4] + %a[5] * %b[5] + %a[6] * %b[6] + %a[7] * %b[7] + %c
def int_amdgcn_udot8 :
  ClangBuiltin<"__builtin_amdgcn_udot8">,
  DefaultAttrsIntrinsic<
    [llvm_i32_ty], // %r
    [
      llvm_i32_ty, // %a
      llvm_i32_ty, // %b
      llvm_i32_ty, // %c
      llvm_i1_ty   // %clamp
    ],
    [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<3>>]
  >;
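
// Illustrative usage sketch (assumed operand values): a mixed-signedness
// 4x8-bit dot product treating %pa as signed and %pb as unsigned, per the
// comment above:
//   %r = call i32 @llvm.amdgcn.sudot4(i1 true, i32 %pa, i1 false, i32 %pb,
//                                     i32 %c, i1 false)
// %a_sign, %b_sign and %clamp are ImmArgs and must be constants.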

// i32 %r = llvm.amdgcn.sudot8(i1 %a_sign, v8i4 (as i32) %a, i1 %b_sign, v8i4 (as i32) %b, i32 %c, i1 %clamp)
//   Treat input as signed (_sign = 1) or unsigned (_sign = 0).
//   a[i in 0..7] = (%a_sign ? a.i4[i] : promoteToSigned(a.u4[i]));
//   b[i in 0..7] = (%b_sign ? b.i4[i] : promoteToSigned(b.u4[i]));
//   %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] +
//        %a[4] * %b[4] + %a[5] * %b[5] + %a[6] * %b[6] + %a[7] * %b[7] + %c
def int_amdgcn_sudot8 :
  ClangBuiltin<"__builtin_amdgcn_sudot8">,
  DefaultAttrsIntrinsic<
    [llvm_i32_ty], // %r
    [
      llvm_i1_ty,  // %a_sign
      llvm_i32_ty, // %a
      llvm_i1_ty,  // %b_sign
      llvm_i32_ty, // %b
      llvm_i32_ty, // %c
      llvm_i1_ty   // %clamp
    ],
    [IntrNoMem, IntrSpeculatable,
     ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<5>>]
  >;

//===----------------------------------------------------------------------===//
// gfx908 intrinsics
//===----------------------------------------------------------------------===//

def int_amdgcn_global_atomic_fadd : AMDGPUAtomicRtn<llvm_anyfloat_ty>;

// llvm.amdgcn.mfma.*.* vdst, srcA, srcB, srcC, cbsz, abid, blgp
class AMDGPUMfmaIntrinsic<LLVMType DestTy, LLVMType SrcABTy> :
  ClangBuiltin<!subst("int", "__builtin", NAME)>,
  DefaultAttrsIntrinsic<[DestTy],
    [SrcABTy, SrcABTy, DestTy,
     llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
    [IntrConvergent, IntrNoMem,
     ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>]>;

def int_amdgcn_mfma_f32_32x32x1f32 : AMDGPUMfmaIntrinsic<llvm_v32f32_ty, llvm_float_ty>;
def int_amdgcn_mfma_f32_16x16x1f32 : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_float_ty>;
def int_amdgcn_mfma_f32_4x4x1f32 : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_float_ty>;
def int_amdgcn_mfma_f32_32x32x2f32 : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_float_ty>;
def int_amdgcn_mfma_f32_16x16x4f32 : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_float_ty>;
def int_amdgcn_mfma_f32_32x32x4f16 : AMDGPUMfmaIntrinsic<llvm_v32f32_ty, llvm_v4f16_ty>;
def int_amdgcn_mfma_f32_16x16x4f16 : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v4f16_ty>;
def int_amdgcn_mfma_f32_4x4x4f16 : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_v4f16_ty>;
def int_amdgcn_mfma_f32_32x32x8f16 : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v4f16_ty>;
def int_amdgcn_mfma_f32_16x16x16f16 : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_v4f16_ty>;
def int_amdgcn_mfma_i32_32x32x4i8 : AMDGPUMfmaIntrinsic<llvm_v32i32_ty, llvm_i32_ty>;
def int_amdgcn_mfma_i32_16x16x4i8 : AMDGPUMfmaIntrinsic<llvm_v16i32_ty, llvm_i32_ty>;
def int_amdgcn_mfma_i32_4x4x4i8 : AMDGPUMfmaIntrinsic<llvm_v4i32_ty, llvm_i32_ty>;
def int_amdgcn_mfma_i32_32x32x8i8 : AMDGPUMfmaIntrinsic<llvm_v16i32_ty, llvm_i32_ty>;
def int_amdgcn_mfma_i32_16x16x16i8 : AMDGPUMfmaIntrinsic<llvm_v4i32_ty, llvm_i32_ty>;
def int_amdgcn_mfma_f32_32x32x2bf16 : AMDGPUMfmaIntrinsic<llvm_v32f32_ty, llvm_v2i16_ty>;
def int_amdgcn_mfma_f32_16x16x2bf16 : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v2i16_ty>;
def int_amdgcn_mfma_f32_4x4x2bf16 : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_v2i16_ty>;
def int_amdgcn_mfma_f32_32x32x4bf16 : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v2i16_ty>;
def int_amdgcn_mfma_f32_16x16x8bf16 : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_v2i16_ty>;
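
// Illustrative usage sketch (assumed operand values, based only on the class
// above): each lane of the 32x32x1 f32 MFMA supplies scalar f32 A/B operands
// and a 32xf32 accumulator, with cbsz/abid/blgp as immediates:
//   %d = call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(
//            float %a, float %b, <32 x float> %c, i32 0, i32 0, i32 0)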

//===----------------------------------------------------------------------===//
// gfx90a intrinsics
//===----------------------------------------------------------------------===//

def int_amdgcn_global_atomic_fmin : AMDGPUAtomicRtn<llvm_anyfloat_ty>;
def int_amdgcn_global_atomic_fmax : AMDGPUAtomicRtn<llvm_anyfloat_ty>;
def int_amdgcn_flat_atomic_fadd : AMDGPUAtomicRtn<llvm_anyfloat_ty>;
def int_amdgcn_flat_atomic_fmin : AMDGPUAtomicRtn<llvm_anyfloat_ty>;
def int_amdgcn_flat_atomic_fmax : AMDGPUAtomicRtn<llvm_anyfloat_ty>;

def int_amdgcn_mfma_f32_32x32x4bf16_1k : AMDGPUMfmaIntrinsic<llvm_v32f32_ty, llvm_v4i16_ty>;
def int_amdgcn_mfma_f32_16x16x4bf16_1k : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v4i16_ty>;
def int_amdgcn_mfma_f32_4x4x4bf16_1k : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_v4i16_ty>;
def int_amdgcn_mfma_f32_32x32x8bf16_1k : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v4i16_ty>;
def int_amdgcn_mfma_f32_16x16x16bf16_1k : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_v4i16_ty>;

// Note: on gfx940 the BLGP argument is replaced by a NEG bitfield in the
// DGEMM MFMA: three bits, each corresponding to the neg modifier applied to
// the respective source operand.
def int_amdgcn_mfma_f64_16x16x4f64 : AMDGPUMfmaIntrinsic<llvm_v4f64_ty, llvm_double_ty>;
def int_amdgcn_mfma_f64_4x4x4f64 : AMDGPUMfmaIntrinsic<llvm_double_ty, llvm_double_ty>;

//===----------------------------------------------------------------------===//
// gfx940 intrinsics
//===----------------------------------------------------------------------===//

// bf16 atomics use a v2i16 argument since there is no bf16 data type in LLVM.
def int_amdgcn_global_atomic_fadd_v2bf16 : AMDGPUAtomicRtn<llvm_v2i16_ty>;
def int_amdgcn_flat_atomic_fadd_v2bf16 : AMDGPUAtomicRtn<llvm_v2i16_ty>;
def int_amdgcn_ds_fadd_v2bf16 : DefaultAttrsIntrinsic<
    [llvm_v2i16_ty],
    [LLVMQualPointerType<3>, llvm_v2i16_ty],
    [IntrArgMemOnly, NoCapture<ArgIndex<0>>]>,
  ClangBuiltin<"__builtin_amdgcn_ds_atomic_fadd_v2bf16">;

def int_amdgcn_mfma_i32_16x16x32_i8 : AMDGPUMfmaIntrinsic<llvm_v4i32_ty, llvm_i64_ty>;
def int_amdgcn_mfma_i32_32x32x16_i8 : AMDGPUMfmaIntrinsic<llvm_v16i32_ty, llvm_i64_ty>;
def int_amdgcn_mfma_f32_16x16x8_xf32 : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_v2f32_ty>;
def int_amdgcn_mfma_f32_32x32x4_xf32 : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v2f32_ty>;

class AMDGPUMFp8MfmaIntrinsic<LLVMType DestTy> :
  AMDGPUMfmaIntrinsic<DestTy, llvm_i64_ty>;

multiclass AMDGPUMFp8MfmaIntrinsic<LLVMType DestTy> {
  foreach kind = ["bf8_bf8", "bf8_fp8", "fp8_bf8", "fp8_fp8"] in
    def NAME#"_"#kind : AMDGPUMFp8MfmaIntrinsic<DestTy>;
}

defm int_amdgcn_mfma_f32_16x16x32 : AMDGPUMFp8MfmaIntrinsic<llvm_v4f32_ty>;
defm int_amdgcn_mfma_f32_32x32x16 : AMDGPUMFp8MfmaIntrinsic<llvm_v16f32_ty>;
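
// The two defms above expand, via the foreach in the multiclass, into one
// intrinsic per A/B format pair: e.g. int_amdgcn_mfma_f32_16x16x32_bf8_bf8,
// ..._bf8_fp8, ..._fp8_bf8 and ..._fp8_fp8, each taking the packed fp8/bf8
// A and B operands as i64.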

// llvm.amdgcn.smfmac.?32.* vdst, srcA, srcB, srcC, index, cbsz, abid
class AMDGPUMSmfmacIntrinsic<LLVMType DestTy, LLVMType SrcA, LLVMType SrcB> :
  ClangBuiltin<!subst("int", "__builtin", NAME)>,
  DefaultAttrsIntrinsic<[DestTy],
    [SrcA, SrcB, DestTy, llvm_i32_ty,
     llvm_i32_ty, llvm_i32_ty],
    [IntrConvergent, IntrNoMem,
     ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>]>;

def int_amdgcn_smfmac_f32_16x16x32_f16 : AMDGPUMSmfmacIntrinsic<llvm_v4f32_ty, llvm_v4f16_ty, llvm_v8f16_ty>;
def int_amdgcn_smfmac_f32_32x32x16_f16 : AMDGPUMSmfmacIntrinsic<llvm_v16f32_ty, llvm_v4f16_ty, llvm_v8f16_ty>;
def int_amdgcn_smfmac_f32_16x16x32_bf16 : AMDGPUMSmfmacIntrinsic<llvm_v4f32_ty, llvm_v4i16_ty, llvm_v8i16_ty>;
def int_amdgcn_smfmac_f32_32x32x16_bf16 : AMDGPUMSmfmacIntrinsic<llvm_v16f32_ty, llvm_v4i16_ty, llvm_v8i16_ty>;
def int_amdgcn_smfmac_i32_16x16x64_i8 : AMDGPUMSmfmacIntrinsic<llvm_v4i32_ty, llvm_v2i32_ty, llvm_v4i32_ty>;
def int_amdgcn_smfmac_i32_32x32x32_i8 : AMDGPUMSmfmacIntrinsic<llvm_v16i32_ty, llvm_v2i32_ty, llvm_v4i32_ty>;

class AMDGPUMFp8SmfmacIntrinsic<LLVMType DestTy> :
  AMDGPUMSmfmacIntrinsic<DestTy, llvm_v2i32_ty, llvm_v4i32_ty>;

multiclass AMDGPUMFp8SmfmacIntrinsic<LLVMType DestTy> {
  foreach kind = ["bf8_bf8", "bf8_fp8", "fp8_bf8", "fp8_fp8"] in
    def NAME#"_"#kind : AMDGPUMFp8SmfmacIntrinsic<DestTy>;
}

defm int_amdgcn_smfmac_f32_16x16x64 : AMDGPUMFp8SmfmacIntrinsic<llvm_v4f32_ty>;
defm int_amdgcn_smfmac_f32_32x32x32 : AMDGPUMFp8SmfmacIntrinsic<llvm_v16f32_ty>;

// llvm.amdgcn.cvt.f32.bf8 float vdst, int srcA, imm byte_sel [0..3]
// byte_sel selects byte from srcA.
def int_amdgcn_cvt_f32_bf8 : ClangBuiltin<"__builtin_amdgcn_cvt_f32_bf8">,
  DefaultAttrsIntrinsic<[llvm_float_ty],
    [llvm_i32_ty, llvm_i32_ty],
    [IntrNoMem, ImmArg<ArgIndex<1>>]>;

// llvm.amdgcn.cvt.f32.fp8 float vdst, int srcA, imm byte_sel [0..3]
def int_amdgcn_cvt_f32_fp8 : ClangBuiltin<"__builtin_amdgcn_cvt_f32_fp8">,
  DefaultAttrsIntrinsic<[llvm_float_ty],
    [llvm_i32_ty, llvm_i32_ty],
    [IntrNoMem, ImmArg<ArgIndex<1>>]>;

// llvm.amdgcn.cvt.pk.f32.bf8 float2 vdst, int srcA, imm word_sel
// word_sel = 1 selects 2 high bytes, 0 selects 2 low bytes.
def int_amdgcn_cvt_pk_f32_bf8 : ClangBuiltin<"__builtin_amdgcn_cvt_pk_f32_bf8">,
  DefaultAttrsIntrinsic<[llvm_v2f32_ty],
    [llvm_i32_ty, llvm_i1_ty],
    [IntrNoMem, ImmArg<ArgIndex<1>>]>;

// llvm.amdgcn.cvt.pk.f32.fp8 float2 vdst, int srcA, imm word_sel
def int_amdgcn_cvt_pk_f32_fp8 : ClangBuiltin<"__builtin_amdgcn_cvt_pk_f32_fp8">,
  DefaultAttrsIntrinsic<[llvm_v2f32_ty],
    [llvm_i32_ty, llvm_i1_ty],
    [IntrNoMem, ImmArg<ArgIndex<1>>]>;

// llvm.amdgcn.cvt.pk.bf8.f32 int vdst, float srcA, float srcB, int old, imm word_sel
// word_sel = 1 selects 2 high bytes in the vdst, 0 selects 2 low bytes.
def int_amdgcn_cvt_pk_bf8_f32 : ClangBuiltin<"__builtin_amdgcn_cvt_pk_bf8_f32">,
  DefaultAttrsIntrinsic<[llvm_i32_ty],
    [llvm_float_ty, llvm_float_ty, llvm_i32_ty, llvm_i1_ty],
    [IntrNoMem, ImmArg<ArgIndex<3>>]>;

// llvm.amdgcn.cvt.pk.fp8.f32 int vdst, float srcA, float srcB, int old, imm word_sel
def int_amdgcn_cvt_pk_fp8_f32 : ClangBuiltin<"__builtin_amdgcn_cvt_pk_fp8_f32">,
  DefaultAttrsIntrinsic<[llvm_i32_ty],
    [llvm_float_ty, llvm_float_ty, llvm_i32_ty, llvm_i1_ty],
    [IntrNoMem, ImmArg<ArgIndex<3>>]>;

// llvm.amdgcn.cvt.sr.bf8.f32 int vdst, float srcA, int srcB, int old, imm byte_sel [0..3]
// byte_sel selects byte to write into vdst.
def int_amdgcn_cvt_sr_bf8_f32 : ClangBuiltin<"__builtin_amdgcn_cvt_sr_bf8_f32">,
  DefaultAttrsIntrinsic<[llvm_i32_ty],
    [llvm_float_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
    [IntrNoMem, ImmArg<ArgIndex<3>>]>;

// llvm.amdgcn.cvt.sr.fp8.f32 int vdst, float srcA, int srcB, int old, imm byte_sel [0..3]
def int_amdgcn_cvt_sr_fp8_f32 : ClangBuiltin<"__builtin_amdgcn_cvt_sr_fp8_f32">,
  DefaultAttrsIntrinsic<[llvm_i32_ty],
    [llvm_float_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
    [IntrNoMem, ImmArg<ArgIndex<3>>]>;
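
// Illustrative usage sketch (assumed operand values, from the signatures
// above): unpack byte 0 of a packed-fp8 register, then re-pack two floats
// into the low word of %old:
//   %f  = call float @llvm.amdgcn.cvt.f32.fp8(i32 %packed, i32 0)
//   %pk = call i32 @llvm.amdgcn.cvt.pk.fp8.f32(float %x, float %y,
//                                              i32 %old, i1 false)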

//===----------------------------------------------------------------------===//
// Special intrinsics for backend internal use only. No frontend
// should emit calls to these.
//===----------------------------------------------------------------------===//

def int_amdgcn_if : Intrinsic<[llvm_i1_ty, llvm_anyint_ty],
  [llvm_i1_ty], [IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]
>;

def int_amdgcn_else : Intrinsic<[llvm_i1_ty, llvm_anyint_ty],
  [llvm_anyint_ty], [IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]
>;

def int_amdgcn_if_break : Intrinsic<[llvm_anyint_ty],
  [llvm_i1_ty, LLVMMatchType<0>],
  [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]
>;

def int_amdgcn_loop : Intrinsic<[llvm_i1_ty],
  [llvm_anyint_ty], [IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]
>;

def int_amdgcn_end_cf : Intrinsic<[], [llvm_anyint_ty],
  [IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;

// Represent unreachable in a divergent region.
def int_amdgcn_unreachable : Intrinsic<[], [], [IntrConvergent, IntrNoCallback, IntrNoFree]>;

// Emit a 2.5 ulp, no-denormal division. Should only be inserted by a pass
// based on !fpmath metadata.
def int_amdgcn_fdiv_fast : DefaultAttrsIntrinsic<
  [llvm_float_ty], [llvm_float_ty, llvm_float_ty],
  [IntrNoMem, IntrSpeculatable]
>;
}