//===-- VOP3PInstructions.td - Vector Instruction Definitions -------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

//===----------------------------------------------------------------------===//
// VOP3P Classes
//===----------------------------------------------------------------------===//

// Used for FMA_MIX* and MAD_MIX* insts
// Their operands are only sort of f16 operands. Depending on
// op_sel_hi, these may be interpreted as f32. The inline immediate
// values are really f16 converted to f32, so we treat these as f16
// operands.
class VOP3P_Mix_Profile<VOPProfile P, VOP3Features Features = VOP3_REGULAR,
                        bit useTiedOutput = 0> : VOP3_Profile<P, Features> {
  bit UseTiedOutput = useTiedOutput;

  dag srcs =
        (ins FP16InputMods:$src0_modifiers, VCSrc_f16:$src0,
             FP16InputMods:$src1_modifiers, VCSrc_f16:$src1,
             FP16InputMods:$src2_modifiers, VCSrc_f16:$src2);

  // FIXME: clampmod0 misbehaves with the non-default vdst_in
  // following it. For now workaround this by requiring clamp
  // in tied patterns. This should use undef_tied_input, but it
  // seems underdeveloped and doesn't apply the right register
  // class constraints.
  dag mods = !con(!if(UseTiedOutput, (ins clampmod:$clamp, VGPR_32:$vdst_in),
                                     (ins clampmod0:$clamp)),
                  (ins op_sel0:$op_sel, op_sel_hi0:$op_sel_hi));
  // We use Ins64 because that is the one which populates InOperandList
  // due to the logic in class VOP3_Pseudo
  let Ins64 = !con(srcs, mods);
  let Asm64 =
    "$vdst, $src0_modifiers, $src1_modifiers, $src2_modifiers$op_sel$op_sel_hi$clamp";
}

multiclass VOP3PInst<string OpName, VOPProfile P,
                     SDPatternOperator node = null_frag, bit HasExplicitClamp = 0> {
  def NAME : VOP3P_Pseudo<OpName, P,
                          !if (P.HasModifiers,
                               getVOP3PModPat<P, node, HasExplicitClamp>.ret,
                               getVOP3Pat<P, node>.ret)>;
}

// Non-packed instructions that use the VOP3P encoding.
// VOP3 neg/abs and VOP3P opsel/opsel_hi modifiers are allowed.
multiclass VOP3_VOP3PInst<string OpName, VOP3P_Mix_Profile P> {
  def NAME : VOP3P_Pseudo<OpName, P> {
    let Constraints = !if(P.UseTiedOutput, "$vdst = $vdst_in", "");
    let DisableEncoding = !if(P.UseTiedOutput, "$vdst_in", "");
  }
}

let isCommutable = 1 in {
defm V_PK_MAD_I16 : VOP3PInst<"v_pk_mad_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16_V2I16>>;
defm V_PK_MAD_U16 : VOP3PInst<"v_pk_mad_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16_V2I16>>;

let FPDPRounding = 1 in {
defm V_PK_FMA_F16 : VOP3PInst<"v_pk_fma_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16_V2F16>, any_fma>;
defm V_PK_ADD_F16 : VOP3PInst<"v_pk_add_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, any_fadd>;
defm V_PK_MUL_F16 : VOP3PInst<"v_pk_mul_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, any_fmul>;
} // End FPDPRounding = 1
defm V_PK_MAX_F16 : VOP3PInst<"v_pk_max_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, fmaxnum_like>;
defm V_PK_MIN_F16 : VOP3PInst<"v_pk_min_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, fminnum_like>;

defm V_PK_ADD_U16 : VOP3PInst<"v_pk_add_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, add>;
defm V_PK_ADD_I16 : VOP3PInst<"v_pk_add_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>>;
defm V_PK_MUL_LO_U16 : VOP3PInst<"v_pk_mul_lo_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, mul>;

defm V_PK_MIN_I16 : VOP3PInst<"v_pk_min_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, smin>;
defm V_PK_MIN_U16 : VOP3PInst<"v_pk_min_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, umin>;
defm V_PK_MAX_I16 : VOP3PInst<"v_pk_max_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, smax>;
defm V_PK_MAX_U16 : VOP3PInst<"v_pk_max_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, umax>;
} // End isCommutable = 1

defm V_PK_SUB_U16 : VOP3PInst<"v_pk_sub_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>>;
defm V_PK_SUB_I16 : VOP3PInst<"v_pk_sub_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, sub>;

defm V_PK_LSHLREV_B16 : VOP3PInst<"v_pk_lshlrev_b16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, clshl_rev_16>;
defm V_PK_ASHRREV_I16 : VOP3PInst<"v_pk_ashrrev_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, cashr_rev_16>;
defm V_PK_LSHRREV_B16 : VOP3PInst<"v_pk_lshrrev_b16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, clshr_rev_16>;


let SubtargetPredicate = HasVOP3PInsts in {

// Undo sub x, c -> add x, -c canonicalization since c is more likely
// an inline immediate than -c.
// The constant will be emitted as a mov, and folded later.
// TODO: We could directly encode the immediate now
def : GCNPat<
  (add (v2i16 (VOP3PMods v2i16:$src0, i32:$src0_modifiers)), NegSubInlineConstV216:$src1),
  (V_PK_SUB_U16 $src0_modifiers, $src0, SRCMODS.OP_SEL_1, NegSubInlineConstV216:$src1)
>;

// Integer operations with clamp bit set.
class VOP3PSatPat<SDPatternOperator pat, Instruction inst> : GCNPat<
  (pat (v2i16 (VOP3PMods v2i16:$src0, i32:$src0_modifiers)),
       (v2i16 (VOP3PMods v2i16:$src1, i32:$src1_modifiers))),
  (inst $src0_modifiers, $src0, $src1_modifiers, $src1, DSTCLAMP.ENABLE)
>;

def : VOP3PSatPat<uaddsat, V_PK_ADD_U16>;
def : VOP3PSatPat<saddsat, V_PK_ADD_I16>;
def : VOP3PSatPat<usubsat, V_PK_SUB_U16>;
def : VOP3PSatPat<ssubsat, V_PK_SUB_I16>;
} // End SubtargetPredicate = HasVOP3PInsts

multiclass MadFmaMixPats<SDPatternOperator fma_like,
                         Instruction mixlo_inst,
                         Instruction mixhi_inst> {
  def : GCNPat <
    (f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)),
                            (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)),
                            (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers))))),
    (mixlo_inst $src0_modifiers, $src0,
                $src1_modifiers, $src1,
                $src2_modifiers, $src2,
                DSTCLAMP.NONE,
                (i32 (IMPLICIT_DEF)))
  >;

  // FIXME: Special case handling for mixhi (especially for clamp)
  // because dealing with the write to high half of the register is
  // difficult.
  def : GCNPat <
    (build_vector f16:$elt0, (fpround (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)),
                                                (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)),
                                                (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers))))),
    (v2f16 (mixhi_inst $src0_modifiers, $src0,
                       $src1_modifiers, $src1,
                       $src2_modifiers, $src2,
                       DSTCLAMP.NONE,
                       $elt0))
  >;

  def : GCNPat <
    (build_vector
      f16:$elt0,
      (AMDGPUclamp (fpround (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)),
                                      (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)),
                                      (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers)))))),
    (v2f16 (mixhi_inst $src0_modifiers, $src0,
                       $src1_modifiers, $src1,
                       $src2_modifiers, $src2,
                       DSTCLAMP.ENABLE,
                       $elt0))
  >;

  def : GCNPat <
    (AMDGPUclamp (build_vector
      (fpround (fma_like (f32 (VOP3PMadMixMods f16:$lo_src0, i32:$lo_src0_modifiers)),
                         (f32 (VOP3PMadMixMods f16:$lo_src1, i32:$lo_src1_modifiers)),
                         (f32 (VOP3PMadMixMods f16:$lo_src2, i32:$lo_src2_modifiers)))),
      (fpround (fma_like (f32 (VOP3PMadMixMods f16:$hi_src0, i32:$hi_src0_modifiers)),
                         (f32 (VOP3PMadMixMods f16:$hi_src1, i32:$hi_src1_modifiers)),
                         (f32 (VOP3PMadMixMods f16:$hi_src2, i32:$hi_src2_modifiers)))))),
    (v2f16 (mixhi_inst $hi_src0_modifiers, $hi_src0,
                       $hi_src1_modifiers, $hi_src1,
                       $hi_src2_modifiers, $hi_src2,
                       DSTCLAMP.ENABLE,
                       (mixlo_inst $lo_src0_modifiers, $lo_src0,
                                   $lo_src1_modifiers, $lo_src1,
                                   $lo_src2_modifiers, $lo_src2,
                                   DSTCLAMP.ENABLE,
                                   (i32 (IMPLICIT_DEF)))))
  >;
}

let SubtargetPredicate = HasMadMixInsts in {

// These are VOP3a-like opcodes which accept no omod.
// Size of src arguments (16/32) is controlled by op_sel.
// For 16-bit src arguments their location (hi/lo) are controlled by op_sel_hi.
let isCommutable = 1, mayRaiseFPException = 0 in {
defm V_MAD_MIX_F32 : VOP3_VOP3PInst<"v_mad_mix_f32", VOP3P_Mix_Profile<VOP_F32_F16_F16_F16, VOP3_OPSEL>>;

let FPDPRounding = 1 in {
// Clamp modifier is applied after conversion to f16.
defm V_MAD_MIXLO_F16 : VOP3_VOP3PInst<"v_mad_mixlo_f16", VOP3P_Mix_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL, 1>>;

let ClampLo = 0, ClampHi = 1 in {
defm V_MAD_MIXHI_F16 : VOP3_VOP3PInst<"v_mad_mixhi_f16", VOP3P_Mix_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL, 1>>;
} // End ClampLo = 0, ClampHi = 1
} // End FPDPRounding = 1
} // End isCommutable = 1, mayRaiseFPException = 0

defm : MadFmaMixPats<fmad, V_MAD_MIXLO_F16, V_MAD_MIXHI_F16>;
} // End SubtargetPredicate = HasMadMixInsts


// Essentially the same as the mad_mix versions
let SubtargetPredicate = HasFmaMixInsts in {
let isCommutable = 1 in {
defm V_FMA_MIX_F32 : VOP3_VOP3PInst<"v_fma_mix_f32", VOP3P_Mix_Profile<VOP_F32_F16_F16_F16, VOP3_OPSEL>>;

let FPDPRounding = 1 in {
// Clamp modifier is applied after conversion to f16.
defm V_FMA_MIXLO_F16 : VOP3_VOP3PInst<"v_fma_mixlo_f16", VOP3P_Mix_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL, 1>>;

let ClampLo = 0, ClampHi = 1 in {
defm V_FMA_MIXHI_F16 : VOP3_VOP3PInst<"v_fma_mixhi_f16", VOP3P_Mix_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL, 1>>;
} // End ClampLo = 0, ClampHi = 1
} // End FPDPRounding = 1
} // End isCommutable = 1

defm : MadFmaMixPats<fma, V_FMA_MIXLO_F16, V_FMA_MIXHI_F16>;
} // End SubtargetPredicate = HasFmaMixInsts

// Defines patterns that extract signed 4bit from each Idx[0].
foreach Idx = [[0,28],[4,24],[8,20],[12,16],[16,12],[20,8],[24,4]] in
  def ExtractSigned4bit_#Idx[0] : PatFrag<(ops node:$src),
                                          (sra (shl node:$src, (i32 Idx[1])), (i32 28))>;

// Defines code pattern that extracts U(unsigned/signed) 4/8bit from FromBitIndex.
// Extract an unsigned (U=1) or signed (U=0) BitMask-wide field starting at
// FromBitIndex from a 32-bit scalar. Special cases: the topmost field needs
// only a shift (the shift itself sign/zero-extends), and the lowest field
// needs only a mask (unsigned) or sign-extension (signed).
class Extract<int FromBitIndex, int BitMask, bit U>: PatFrag<
  (ops node:$src),
  !if (!or (!and (!eq (BitMask, 255), !eq (FromBitIndex, 24)), !eq (FromBitIndex, 28)), // last element
       !if (U, (srl node:$src, (i32 FromBitIndex)), (sra node:$src, (i32 FromBitIndex))),
       !if (!eq (FromBitIndex, 0), // first element
            !if (U, (and node:$src, (i32 BitMask)),
                 !if (!eq (BitMask, 15), (!cast<PatFrag>("ExtractSigned4bit_"#FromBitIndex) node:$src),
                      (sext_inreg node:$src, i8))),
            !if (U, (and (srl node:$src, (i32 FromBitIndex)), (i32 BitMask)),
                 !if (!eq (BitMask, 15), (!cast<PatFrag>("ExtractSigned4bit_"#FromBitIndex) node:$src),
                      (sext_inreg (srl node:$src, (i32 FromBitIndex)), i8)))))>;


foreach Type = ["I", "U"] in
  foreach Index = 0-3 in {
    // Defines patterns that extract each Index'ed 8bit from an unsigned
    // 32bit scalar value;
    def Type#Index#"_8bit" : Extract<!shl(Index, 3), 255, !eq (Type, "U")>;

    // Defines multiplication patterns where the multiplication is happening on each
    // Index'ed 8bit of a 32bit scalar value.

    def Mul#Type#_Elt#Index : PatFrag<
      (ops node:$src0, node:$src1),
      (!cast<HasOneUseBinOp>(!if (!eq (Type, "I"), AMDGPUmul_i24_oneuse, AMDGPUmul_u24_oneuse))
                             (!cast<Extract>(Type#Index#"_8bit") node:$src0),
                             (!cast<Extract>(Type#Index#"_8bit") node:$src1))>;
  }

// Different variants of dot8 patterns cause a huge increase in the compile time.
// Define non-associative/commutative add/mul to prevent permutation in the dot8
// pattern.
def NonACAdd : SDNode<"ISD::ADD" , SDTIntBinOp>;
def NonACAdd_oneuse : HasOneUseBinOp<NonACAdd>;

def NonACAMDGPUmul_u24 : SDNode<"AMDGPUISD::MUL_U24" , SDTIntBinOp>;
def NonACAMDGPUmul_u24_oneuse : HasOneUseBinOp<NonACAMDGPUmul_u24>;

def NonACAMDGPUmul_i24 : SDNode<"AMDGPUISD::MUL_I24" , SDTIntBinOp>;
def NonACAMDGPUmul_i24_oneuse : HasOneUseBinOp<NonACAMDGPUmul_i24>;

foreach Type = ["I", "U"] in
  foreach Index = 0-7 in {
    // Defines patterns that extract each Index'ed 4bit from an unsigned
    // 32bit scalar value;
    def Type#Index#"_4bit" : Extract<!shl(Index, 2), 15, !eq (Type, "U")>;

    // Defines multiplication patterns where the multiplication is happening on each
    // Index'ed 4bit of a 32bit scalar value.
    def Mul#Type#Index#"_4bit" : PatFrag<
      (ops node:$src0, node:$src1),
      (!cast<HasOneUseBinOp>(!if (!eq (Type, "I"), NonACAMDGPUmul_i24_oneuse, NonACAMDGPUmul_u24_oneuse))
                             (!cast<Extract>(Type#Index#"_4bit") node:$src0),
                             (!cast<Extract>(Type#Index#"_4bit") node:$src1))>;
  }

class UDot2Pat<Instruction Inst> : GCNPat <
  (add (add_oneuse (AMDGPUmul_u24_oneuse (srl i32:$src0, (i32 16)),
                                         (srl i32:$src1, (i32 16))), i32:$src2),
       (AMDGPUmul_u24_oneuse (and i32:$src0, (i32 65535)),
                             (and i32:$src1, (i32 65535)))
   ),
  (Inst (i32 8), $src0, (i32 8), $src1, (i32 8), $src2, (i1 0))> {
  let SubtargetPredicate = !cast<VOP_Pseudo>(Inst).SubtargetPredicate;
}

class SDot2Pat<Instruction Inst> : GCNPat <
  (add (add_oneuse (AMDGPUmul_i24_oneuse (sra i32:$src0, (i32 16)),
                                         (sra i32:$src1, (i32 16))), i32:$src2),
       (AMDGPUmul_i24_oneuse (sext_inreg i32:$src0, i16),
                             (sext_inreg i32:$src1, i16))),
  (Inst (i32 8), $src0, (i32 8), $src1, (i32 8), $src2, (i1 0))> {
  let SubtargetPredicate = !cast<VOP_Pseudo>(Inst).SubtargetPredicate;
}

let IsDOT = 1 in {
let SubtargetPredicate = HasDot2Insts in {

defm V_DOT2_I32_I16 : VOP3PInst<"v_dot2_i32_i16",
  VOP3_Profile<VOP_I32_V2I16_V2I16_I32>, int_amdgcn_sdot2, 1>;
defm V_DOT2_U32_U16 : VOP3PInst<"v_dot2_u32_u16",
  VOP3_Profile<VOP_I32_V2I16_V2I16_I32>, int_amdgcn_udot2, 1>;

} // End SubtargetPredicate = HasDot2Insts

let SubtargetPredicate = HasDot7Insts in {

defm V_DOT2_F32_F16 : VOP3PInst<"v_dot2_f32_f16",
  VOP3_Profile<VOP_F32_V2F16_V2F16_F32>,
  AMDGPUfdot2, 1/*ExplicitClamp*/>;
defm V_DOT4_U32_U8 : VOP3PInst<"v_dot4_u32_u8",
  VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_udot4, 1>;
defm V_DOT8_U32_U4 : VOP3PInst<"v_dot8_u32_u4",
  VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_udot8, 1>;

} // End SubtargetPredicate = HasDot7Insts

let SubtargetPredicate = HasDot1Insts in {

defm V_DOT4_I32_I8 : VOP3PInst<"v_dot4_i32_i8",
  VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_sdot4, 1>;
defm V_DOT8_I32_I4 : VOP3PInst<"v_dot8_i32_i4",
  VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_sdot8, 1>;

} // End SubtargetPredicate = HasDot1Insts
} // End let IsDOT = 1

def : UDot2Pat<V_DOT2_U32_U16>;
def : SDot2Pat<V_DOT2_I32_I16>;

foreach Type = ["U", "I"] in
  let SubtargetPredicate = !cast<VOP_Pseudo>("V_DOT4_"#Type#"32_"#Type#8).SubtargetPredicate in
  def : GCNPat <
    !cast<dag>(!foldl((i32 i32:$src2), [0, 1, 2, 3], lhs, y,
                      (add_oneuse lhs, (!cast<PatFrag>("Mul"#Type#"_Elt"#y) i32:$src0, i32:$src1)))),
    (!cast<VOP3P_Pseudo>("V_DOT4_"#Type#"32_"#Type#8) (i32 8), $src0, (i32 8), $src1, (i32 8), $src2, (i1 0))>;

foreach Type = ["U", "I"] in
  let SubtargetPredicate = !cast<VOP_Pseudo>("V_DOT8_"#Type#"32_"#Type#4).SubtargetPredicate in
  def : GCNPat <
    !cast<dag>(!foldl((add_oneuse i32:$src2, (!cast<PatFrag>("Mul"#Type#"0_4bit") i32:$src0, i32:$src1)),
                      [1, 2, 3, 4, 5, 6, 7], lhs, y,
                      (NonACAdd_oneuse lhs, (!cast<PatFrag>("Mul"#Type#y#"_4bit") i32:$src0, i32:$src1)))),
    (!cast<VOP3P_Pseudo>("V_DOT8_"#Type#"32_"#Type#4) (i32 8), $src0, (i32 8), $src1, (i32 8), $src2, (i1 0))>;

// Different variants of dot8 code-gen dag patterns are not generated through table-gen due to a huge increase
// in the compile time. Directly handle the pattern generated by the FE here.
foreach Type = ["U", "I"] in
  let SubtargetPredicate = !cast<VOP_Pseudo>("V_DOT8_"#Type#"32_"#Type#4).SubtargetPredicate in
  def : GCNPat <
    !cast<dag>(!foldl((add_oneuse i32:$src2, (!cast<PatFrag>("Mul"#Type#"0_4bit") i32:$src0, i32:$src1)),
                      [7, 1, 2, 3, 4, 5, 6], lhs, y,
                      (NonACAdd_oneuse lhs, (!cast<PatFrag>("Mul"#Type#y#"_4bit") i32:$src0, i32:$src1)))),
    (!cast<VOP3P_Pseudo>("V_DOT8_"#Type#"32_"#Type#4) (i32 8), $src0, (i32 8), $src1, (i32 8), $src2, (i1 0))>;

def ADst_32   : VOPDstOperand<AGPR_32>;
def ADst_64   : VOPDstOperand<AReg_64>;
def ADst_128  : VOPDstOperand<AReg_128>;
def ADst_256  : VOPDstOperand<AReg_256>;
def ADst_512  : VOPDstOperand<AReg_512>;
def ADst_1024 : VOPDstOperand<AReg_1024>;
def VDst_64   : VOPDstOperand<VReg_64>;
def VDst_128  : VOPDstOperand<VReg_128>;
def VDst_256  : VOPDstOperand<VReg_256>;
def VDst_512  : VOPDstOperand<VReg_512>;
def VDst_1024 : VOPDstOperand<VReg_1024>;

def VOPProfileAccRead : VOP3_Profile<VOP_I32_I32, VOP3_MAI> {
  let Src0RC64 = ARegSrc_32;
}

def VOPProfileAccWrite : VOP3_Profile<VOP_I32_I32, VOP3_MAI> {
  let DstRC = ADst_32;
  let Src0RC64 = VISrc_b32;
}

class VOPProfileMAI<VOPProfile P, RegisterOperand _SrcRC, RegisterOperand _DstRC,
                    RegisterOperand SrcABRC = AVSrc_32>
  : VOP3_Profile<P, VOP3_MAI> {
  let DstRC = _DstRC;
  let Src0RC64 = SrcABRC;
  let Src1RC64 = SrcABRC;
  let Src2RC64 = _SrcRC;
  let HasOpSel = 0;
  let HasClamp = 0;
  let HasIntClamp = 0;
  let HasOMod = 0;
  let HasModifiers = 0;
  let Asm64 = "$vdst, $src0, $src1, $src2$cbsz$abid$blgp";
  let Ins64 = (ins Src0RC64:$src0, Src1RC64:$src1, Src2RC64:$src2, cbsz:$cbsz, abid:$abid, blgp:$blgp);
  // Dst and SrcC cannot partially overlap if SrcC/Dst is bigger than 4 VGPRs.
  // We then create two versions of the instruction: with tied dst and src2
  // and with the earlyclobber flag on the dst. This is stricter than the
  // actual HW restriction. In particular earlyclobber also affects src0 and
  // src1 allocation which is not required.
  bit NoDstOverlap = !gt(DstVT.Size, 128);
}

def VOPProfileMAI_F32_F32_X4     : VOPProfileMAI<VOP_V4F32_F32_F32_V4F32,       AISrc_128_f32,  ADst_128>;
def VOPProfileMAI_F32_F32_X16    : VOPProfileMAI<VOP_V16F32_F32_F32_V16F32,     AISrc_512_f32,  ADst_512>;
def VOPProfileMAI_F32_F32_X32    : VOPProfileMAI<VOP_V32F32_F32_F32_V32F32,     AISrc_1024_f32, ADst_1024>;
def VOPProfileMAI_I32_I32_X4     : VOPProfileMAI<VOP_V4I32_I32_I32_V4I32,       AISrc_128_b32,  ADst_128>;
def VOPProfileMAI_I32_I32_X16    : VOPProfileMAI<VOP_V16I32_I32_I32_V16I32,     AISrc_512_b32,  ADst_512>;
def VOPProfileMAI_I32_I32_X32    : VOPProfileMAI<VOP_V32I32_I32_I32_V32I32,     AISrc_1024_b32, ADst_1024>;
def VOPProfileMAI_F32_V2I16_X4   : VOPProfileMAI<VOP_V4F32_V2I16_V2I16_V4F32,   AISrc_128_b32,  ADst_128>;
def VOPProfileMAI_F32_V2I16_X16  : VOPProfileMAI<VOP_V16F32_V2I16_V2I16_V16F32, AISrc_512_b32,  ADst_512>;
def VOPProfileMAI_F32_V2I16_X32  : VOPProfileMAI<VOP_V32F32_V2I16_V2I16_V32F32, AISrc_1024_b32, ADst_1024>;
def VOPProfileMAI_F32_V4F16_X4   : VOPProfileMAI<VOP_V4F32_V4F16_V4F16_V4F32,   AISrc_128_b32,  ADst_128,  AVSrc_64>;
def VOPProfileMAI_F32_V4F16_X16  : VOPProfileMAI<VOP_V16F32_V4F16_V4F16_V16F32, AISrc_512_b32,  ADst_512,  AVSrc_64>;
def VOPProfileMAI_F32_V4F16_X32  : VOPProfileMAI<VOP_V32F32_V4F16_V4F16_V32F32, AISrc_1024_b32, ADst_1024, AVSrc_64>;
def VOPProfileMAI_F32_V4I16_X4   : VOPProfileMAI<VOP_V4F32_V4I16_V4I16_V4F32,   AISrc_128_b32,  ADst_128,  AVSrc_64>;
def VOPProfileMAI_F32_V4I16_X16  : VOPProfileMAI<VOP_V16F32_V4I16_V4I16_V16F32, AISrc_512_b32,  ADst_512,  AVSrc_64>;
def VOPProfileMAI_F32_V4I16_X32  : VOPProfileMAI<VOP_V32F32_V4I16_V4I16_V32F32, AISrc_1024_b32, ADst_1024, AVSrc_64>;
def VOPProfileMAI_F64_16X16X4F64 : VOPProfileMAI<VOP_V4F64_F64_F64_V4F64,       AISrc_256_f64,  ADst_256,  AVSrc_64>;
def VOPProfileMAI_F64_4X4X4F64   : VOPProfileMAI<VOP_F64_F64_F64_F64,           AISrc_64_f64,   ADst_64,   AVSrc_64>;

def VOPProfileMAI_F32_F32_X4_VCD     : VOPProfileMAI<VOP_V4F32_F32_F32_V4F32,       VISrc_128_f32,  VDst_128>;
def VOPProfileMAI_F32_F32_X16_VCD    : VOPProfileMAI<VOP_V16F32_F32_F32_V16F32,     VISrc_512_f32,  VDst_512>;
def VOPProfileMAI_F32_F32_X32_VCD    : VOPProfileMAI<VOP_V32F32_F32_F32_V32F32,     VISrc_1024_f32, VDst_1024>;
def VOPProfileMAI_I32_I32_X4_VCD     : VOPProfileMAI<VOP_V4I32_I32_I32_V4I32,       VISrc_128_b32,  VDst_128>;
def VOPProfileMAI_I32_I32_X16_VCD    : VOPProfileMAI<VOP_V16I32_I32_I32_V16I32,     VISrc_512_b32,  VDst_512>;
def VOPProfileMAI_I32_I32_X32_VCD    : VOPProfileMAI<VOP_V32I32_I32_I32_V32I32,     VISrc_1024_b32, VDst_1024>;
def VOPProfileMAI_F32_V2I16_X4_VCD   : VOPProfileMAI<VOP_V4F32_V2I16_V2I16_V4F32,   VISrc_128_b32,  VDst_128>;
def VOPProfileMAI_F32_V2I16_X16_VCD  : VOPProfileMAI<VOP_V16F32_V2I16_V2I16_V16F32, VISrc_512_b32,  VDst_512>;
def VOPProfileMAI_F32_V2I16_X32_VCD  : VOPProfileMAI<VOP_V32F32_V2I16_V2I16_V32F32, VISrc_1024_b32, VDst_1024>;
def VOPProfileMAI_F32_V4F16_X4_VCD   : VOPProfileMAI<VOP_V4F32_V4F16_V4F16_V4F32,   VISrc_128_b32,  VDst_128,  AVSrc_64>;
def VOPProfileMAI_F32_V4F16_X16_VCD  : VOPProfileMAI<VOP_V16F32_V4F16_V4F16_V16F32, VISrc_512_b32,  VDst_512,  AVSrc_64>;
def VOPProfileMAI_F32_V4F16_X32_VCD  : VOPProfileMAI<VOP_V32F32_V4F16_V4F16_V32F32, VISrc_1024_b32, VDst_1024, AVSrc_64>;
def VOPProfileMAI_F32_V4I16_X4_VCD   : VOPProfileMAI<VOP_V4F32_V4I16_V4I16_V4F32,   VISrc_128_b32,  VDst_128,  AVSrc_64>;
def VOPProfileMAI_F32_V4I16_X16_VCD  : VOPProfileMAI<VOP_V16F32_V4I16_V4I16_V16F32, VISrc_512_b32,  VDst_512,  AVSrc_64>;
def VOPProfileMAI_F32_V4I16_X32_VCD  : VOPProfileMAI<VOP_V32F32_V4I16_V4I16_V32F32, VISrc_1024_b32, VDst_1024, AVSrc_64>;
def VOPProfileMAI_F64_16X16X4F64_VCD : VOPProfileMAI<VOP_V4F64_F64_F64_V4F64,       VISrc_256_f64,  VDst_256,  AVSrc_64>;
def VOPProfileMAI_F64_4X4X4F64_VCD   : VOPProfileMAI<VOP_F64_F64_F64_F64,           VISrc_64_f64,   VDst_64,   AVSrc_64>;

class MFMATable <bit is_mac, string Name> {
  bit IsMac = is_mac;
  string FMAOp = Name;
}

let Predicates = [HasMAIInsts] in {

let isAsCheapAsAMove = 1, isReMaterializable = 1 in {
  defm V_ACCVGPR_READ_B32 : VOP3Inst<"v_accvgpr_read_b32", VOPProfileAccRead>;
  let isMoveImm = 1 in {
    defm V_ACCVGPR_WRITE_B32 : VOP3Inst<"v_accvgpr_write_b32", VOPProfileAccWrite>;
  } // End isMoveImm = 1
} // End isAsCheapAsAMove = 1, isReMaterializable = 1

multiclass MAIInst<string OpName, string P, SDPatternOperator node,
                   bit NoDstOverlap = !cast<VOPProfileMAI>("VOPProfileMAI_" # P).NoDstOverlap> {
  let isConvergent = 1, mayRaiseFPException = 0, ReadsModeReg = 1 in {
    // FP32 denorm mode is respected, rounding mode is not. Exceptions are not supported.
    let Constraints = !if(NoDstOverlap, "@earlyclobber $vdst", "") in {
      defm "" : VOP3Inst<OpName, !cast<VOPProfileMAI>("VOPProfileMAI_" # P), !if(NoDstOverlap, null_frag, node)>,
                MFMATable<0, NAME # "_e64">;

      let SubtargetPredicate = isGFX90APlus, Mnemonic = OpName in
      defm _vgprcd : VOP3Inst<OpName # "_vgprcd", !cast<VOPProfileMAI>("VOPProfileMAI_" # P # "_VCD")>,
                     MFMATable<0, NAME # "_vgprcd_e64">;
    }

    foreach _ = BoolToList<NoDstOverlap>.ret in {
      let Constraints = !if(NoDstOverlap, "$vdst = $src2", ""),
          isConvertibleToThreeAddress = NoDstOverlap,
          Mnemonic = OpName in {
        defm "_mac" : VOP3Inst<OpName # "_mac", !cast<VOPProfileMAI>("VOPProfileMAI_" # P), node>,
                      MFMATable<1, NAME # "_e64">;

        let SubtargetPredicate = isGFX90APlus in
        defm _mac_vgprcd : VOP3Inst<OpName # "_mac_vgprcd", !cast<VOPProfileMAI>("VOPProfileMAI_" # P # "_VCD")>,
                           MFMATable<1, NAME # "_vgprcd_e64">;
      }
    }
  } // End isConvergent = 1, mayRaiseFPException = 0, ReadsModeReg = 1
}

defm V_MFMA_F32_4X4X1F32    : MAIInst<"v_mfma_f32_4x4x1f32",    "F32_F32_X4",    int_amdgcn_mfma_f32_4x4x1f32>;
defm V_MFMA_F32_4X4X4F16    : MAIInst<"v_mfma_f32_4x4x4f16",    "F32_V4F16_X4",  int_amdgcn_mfma_f32_4x4x4f16>;
defm V_MFMA_I32_4X4X4I8     : MAIInst<"v_mfma_i32_4x4x4i8",     "I32_I32_X4",    int_amdgcn_mfma_i32_4x4x4i8>;
defm V_MFMA_F32_16X16X1F32  : MAIInst<"v_mfma_f32_16x16x1f32",  "F32_F32_X16",   int_amdgcn_mfma_f32_16x16x1f32>;
defm V_MFMA_F32_16X16X4F32  : MAIInst<"v_mfma_f32_16x16x4f32",  "F32_F32_X4",    int_amdgcn_mfma_f32_16x16x4f32>;
defm V_MFMA_F32_16X16X4F16  : MAIInst<"v_mfma_f32_16x16x4f16",  "F32_V4F16_X16", int_amdgcn_mfma_f32_16x16x4f16>;
defm V_MFMA_F32_16X16X16F16 : MAIInst<"v_mfma_f32_16x16x16f16", "F32_V4F16_X4",  int_amdgcn_mfma_f32_16x16x16f16>;
defm V_MFMA_I32_16X16X4I8   : MAIInst<"v_mfma_i32_16x16x4i8",   "I32_I32_X16",   int_amdgcn_mfma_i32_16x16x4i8>;
defm V_MFMA_F32_32X32X1F32  : MAIInst<"v_mfma_f32_32x32x1f32",  "F32_F32_X32",   int_amdgcn_mfma_f32_32x32x1f32>;
defm V_MFMA_F32_32X32X2F32  : MAIInst<"v_mfma_f32_32x32x2f32",  "F32_F32_X16",   int_amdgcn_mfma_f32_32x32x2f32>;
defm V_MFMA_F32_32X32X4F16  : MAIInst<"v_mfma_f32_32x32x4f16",  "F32_V4F16_X32", int_amdgcn_mfma_f32_32x32x4f16>;
defm V_MFMA_F32_32X32X8F16  : MAIInst<"v_mfma_f32_32x32x8f16",  "F32_V4F16_X16", int_amdgcn_mfma_f32_32x32x8f16>;
defm V_MFMA_I32_32X32X4I8   : MAIInst<"v_mfma_i32_32x32x4i8",   "I32_I32_X32",   int_amdgcn_mfma_i32_32x32x4i8>;
defm V_MFMA_I32_16X16X16I8  : MAIInst<"v_mfma_i32_16x16x16i8",  "I32_I32_X4",    int_amdgcn_mfma_i32_16x16x16i8>;
defm V_MFMA_I32_32X32X8I8   : MAIInst<"v_mfma_i32_32x32x8i8",   "I32_I32_X16",   int_amdgcn_mfma_i32_32x32x8i8>;
defm V_MFMA_F32_4X4X2BF16   : MAIInst<"v_mfma_f32_4x4x2bf16",   "F32_V2I16_X4",  int_amdgcn_mfma_f32_4x4x2bf16>;
defm V_MFMA_F32_16X16X2BF16 : MAIInst<"v_mfma_f32_16x16x2bf16", "F32_V2I16_X16", int_amdgcn_mfma_f32_16x16x2bf16>;
defm V_MFMA_F32_16X16X8BF16 : MAIInst<"v_mfma_f32_16x16x8bf16", "F32_V2I16_X4",  int_amdgcn_mfma_f32_16x16x8bf16>;
defm V_MFMA_F32_32X32X2BF16 : MAIInst<"v_mfma_f32_32x32x2bf16", "F32_V2I16_X32", int_amdgcn_mfma_f32_32x32x2bf16>;
defm V_MFMA_F32_32X32X4BF16 : MAIInst<"v_mfma_f32_32x32x4bf16", "F32_V2I16_X16", int_amdgcn_mfma_f32_32x32x4bf16>;

} // End Predicates = [HasMAIInsts]

let Predicates = [isGFX90APlus] in {
  defm V_MFMA_F32_32X32X4BF16_1K  : MAIInst<"v_mfma_f32_32x32x4bf16_1k",  "F32_V4I16_X32", int_amdgcn_mfma_f32_32x32x4bf16_1k>;
  defm V_MFMA_F32_16X16X4BF16_1K  : MAIInst<"v_mfma_f32_16x16x4bf16_1k",  "F32_V4I16_X16", int_amdgcn_mfma_f32_16x16x4bf16_1k>;
  defm V_MFMA_F32_4X4X4BF16_1K    : MAIInst<"v_mfma_f32_4x4x4bf16_1k",    "F32_V4I16_X4",  int_amdgcn_mfma_f32_4x4x4bf16_1k>;
  defm V_MFMA_F32_32X32X8BF16_1K  : MAIInst<"v_mfma_f32_32x32x8bf16_1k",  "F32_V4I16_X16", int_amdgcn_mfma_f32_32x32x8bf16_1k>;
  defm V_MFMA_F32_16X16X16BF16_1K : MAIInst<"v_mfma_f32_16x16x16bf16_1k", "F32_V4I16_X4",  int_amdgcn_mfma_f32_16x16x16bf16_1k>;

  defm V_MFMA_F64_16X16X4F64      : MAIInst<"v_mfma_f64_16x16x4f64",      "F64_16X16X4F64", int_amdgcn_mfma_f64_16x16x4f64>;
  defm V_MFMA_F64_4X4X4F64        : MAIInst<"v_mfma_f64_4x4x4f64",        "F64_4X4X4F64",   int_amdgcn_mfma_f64_4x4x4f64>;
} // End Predicates = [isGFX90APlus]

let SubtargetPredicate = HasPackedFP32Ops, isCommutable = 1 in {
  defm V_PK_FMA_F32 : VOP3PInst<"v_pk_fma_f32", VOP3_Profile<VOP_V2F32_V2F32_V2F32_V2F32, VOP3_PACKED>, any_fma>;
  defm V_PK_MUL_F32 : VOP3PInst<"v_pk_mul_f32", VOP3_Profile<VOP_V2F32_V2F32_V2F32, VOP3_PACKED>, any_fmul>;
  defm V_PK_ADD_F32 : VOP3PInst<"v_pk_add_f32", VOP3_Profile<VOP_V2F32_V2F32_V2F32, VOP3_PACKED>, any_fadd>;
  defm V_PK_MOV_B32 : VOP3PInst<"v_pk_mov_b32", VOP3_Profile<VOP_V2I32_V2I32_V2I32, VOP3_PACKED>>;
} // End SubtargetPredicate = HasPackedFP32Ops, isCommutable = 1

def : MnemonicAlias<"v_accvgpr_read",  "v_accvgpr_read_b32">;
def : MnemonicAlias<"v_accvgpr_write", "v_accvgpr_write_b32">;

//===----------------------------------------------------------------------===//
// Begin Real Encodings
//===----------------------------------------------------------------------===//

//===----------------------------------------------------------------------===//
// GFX8 (VI)
//===----------------------------------------------------------------------===//

multiclass VOP3P_Real_vi<bits<7> op> {
  def _vi : VOP3P_Real<!cast<VOP3_Pseudo>(NAME), SIEncodingFamily.VI>,
            VOP3Pe <op, !cast<VOP3_Pseudo>(NAME).Pfl> {
    let AssemblerPredicate = HasVOP3PInsts;
    let DecoderNamespace = "GFX8";
    let VOP3P = 1;
  }
}

multiclass VOP3P_Real_MAI<bits<7> op> {
  def _vi : VOP3P_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.VI>,
            VOP3Pe_MAI <op, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl, ?> {
    let AssemblerPredicate = HasMAIInsts;
    let DecoderNamespace = "GFX8";
    let Inst{14} = ?; // op_sel_hi(2)
    let Inst{59} = ?; // op_sel_hi(0)
    let Inst{60} = ?; // op_sel_hi(1)
  }
}

let Constraints = "" in {
multiclass VOP3P_Real_MFMA_gfx90a<bits<7> op> {
  let SubtargetPredicate = isGFX90AOnly,
      AssemblerPredicate = isGFX90AOnly, DecoderNamespace = "GFX90A" in {
  def _gfx90a_acd : VOP3P_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.GFX90A>,
                    VOP3Pe_MAI <op, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl, 1>;

  def _gfx90a_vcd : VOP3P_Real<!cast<VOP3_Pseudo>(NAME # "_vgprcd" # "_e64"), SIEncodingFamily.GFX90A>,
                    VOP3Pe_MAI <op, !cast<VOP3_Pseudo>(NAME # "_vgprcd" # "_e64").Pfl, 0>;
  } // End AssemblerPredicate = isGFX90AOnly, DecoderNamespace = "GFX90A"
}

multiclass VOP3P_Real_MFMA<bits<7> op> :
  VOP3P_Real_MFMA_gfx90a <op> {
  def _vi : VOP3P_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.VI>,
            VOP3Pe_MAI <op, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl, ?> {
    let AssemblerPredicate = HasMAIInsts;
    let DecoderNamespace = "GFX8";
  }
}
} // End Constraints = ""

defm V_PK_MAD_I16     : VOP3P_Real_vi <0x00>;
defm V_PK_MUL_LO_U16  : VOP3P_Real_vi <0x01>;
defm V_PK_ADD_I16     : VOP3P_Real_vi <0x02>;
defm V_PK_SUB_I16     : VOP3P_Real_vi <0x03>;
defm V_PK_LSHLREV_B16 : VOP3P_Real_vi <0x04>;
defm V_PK_LSHRREV_B16 : VOP3P_Real_vi <0x05>;
defm V_PK_ASHRREV_I16 : VOP3P_Real_vi <0x06>;
defm V_PK_MAX_I16     : VOP3P_Real_vi <0x07>;
defm V_PK_MIN_I16     : VOP3P_Real_vi <0x08>;
defm V_PK_MAD_U16     : VOP3P_Real_vi <0x09>;

defm V_PK_ADD_U16     : VOP3P_Real_vi <0x0a>;
defm V_PK_SUB_U16     : VOP3P_Real_vi <0x0b>;
defm V_PK_MAX_U16     : VOP3P_Real_vi <0x0c>;
defm V_PK_MIN_U16     : VOP3P_Real_vi <0x0d>;
defm V_PK_FMA_F16     : VOP3P_Real_vi <0x0e>;
defm V_PK_ADD_F16     : VOP3P_Real_vi <0x0f>;
defm V_PK_MUL_F16     : VOP3P_Real_vi <0x10>;
defm V_PK_MIN_F16     : VOP3P_Real_vi <0x11>;
defm V_PK_MAX_F16     : VOP3P_Real_vi <0x12>;


let SubtargetPredicate = HasMadMixInsts in {
defm V_MAD_MIX_F32    : VOP3P_Real_vi <0x20>;
defm V_MAD_MIXLO_F16  : VOP3P_Real_vi <0x21>;
defm V_MAD_MIXHI_F16  : VOP3P_Real_vi <0x22>;
} // End SubtargetPredicate = HasMadMixInsts

let SubtargetPredicate = HasFmaMixInsts in {
let DecoderNamespace = "GFX9_DL" in {
// The mad_mix instructions were renamed and their behaviors changed,
// but the opcode stayed the same so we need to put these in a
// different DecoderNamespace to avoid the ambiguity.
defm V_FMA_MIX_F32    : VOP3P_Real_vi <0x20>;
defm V_FMA_MIXLO_F16  : VOP3P_Real_vi <0x21>;
defm V_FMA_MIXHI_F16  : VOP3P_Real_vi <0x22>;
} // End DecoderNamespace = "GFX9_DL"
} // End SubtargetPredicate = HasFmaMixInsts


let SubtargetPredicate = HasDot2Insts in {

defm V_DOT2_I32_I16 : VOP3P_Real_vi <0x26>;
defm V_DOT2_U32_U16 : VOP3P_Real_vi <0x27>;

} // End SubtargetPredicate = HasDot2Insts

let SubtargetPredicate = HasDot7Insts in {

defm V_DOT2_F32_F16 : VOP3P_Real_vi <0x23>;
defm V_DOT4_U32_U8  : VOP3P_Real_vi <0x29>;
defm V_DOT8_U32_U4  : VOP3P_Real_vi <0x2b>;

} // End SubtargetPredicate = HasDot7Insts

let SubtargetPredicate = HasDot1Insts in {

defm V_DOT4_I32_I8  : VOP3P_Real_vi <0x28>;
defm V_DOT8_I32_I4  : VOP3P_Real_vi <0x2a>;

} // End SubtargetPredicate = HasDot1Insts

let SubtargetPredicate = HasMAIInsts in {

defm V_ACCVGPR_READ_B32     : VOP3P_Real_MAI <0x58>;
defm V_ACCVGPR_WRITE_B32    : VOP3P_Real_MAI <0x59>;
defm V_MFMA_F32_32X32X1F32  : VOP3P_Real_MFMA <0x40>;
defm V_MFMA_F32_16X16X1F32  : VOP3P_Real_MFMA <0x41>;
defm V_MFMA_F32_4X4X1F32    : VOP3P_Real_MFMA <0x42>;
defm V_MFMA_F32_32X32X2F32  : VOP3P_Real_MFMA <0x44>;
defm V_MFMA_F32_16X16X4F32  : VOP3P_Real_MFMA <0x45>;
defm V_MFMA_F32_32X32X4F16  : VOP3P_Real_MFMA <0x48>;
defm V_MFMA_F32_16X16X4F16  : VOP3P_Real_MFMA <0x49>;
defm V_MFMA_F32_4X4X4F16    : VOP3P_Real_MFMA <0x4a>;
defm V_MFMA_F32_32X32X8F16  : VOP3P_Real_MFMA <0x4c>;
defm V_MFMA_F32_16X16X16F16 : VOP3P_Real_MFMA <0x4d>;
defm V_MFMA_I32_32X32X4I8   : VOP3P_Real_MFMA <0x50>;
defm V_MFMA_I32_16X16X4I8   : VOP3P_Real_MFMA <0x51>;
defm V_MFMA_I32_4X4X4I8     : VOP3P_Real_MFMA <0x52>;
defm V_MFMA_I32_16X16X16I8  : VOP3P_Real_MFMA <0x55>;
defm V_MFMA_I32_32X32X8I8   : VOP3P_Real_MFMA <0x54>;
defm V_MFMA_F32_32X32X2BF16 : VOP3P_Real_MFMA <0x68>;
defm V_MFMA_F32_16X16X2BF16 : VOP3P_Real_MFMA <0x69>;
defm V_MFMA_F32_4X4X2BF16   : VOP3P_Real_MFMA <0x6b>;
defm V_MFMA_F32_32X32X4BF16 : VOP3P_Real_MFMA <0x6c>;
defm V_MFMA_F32_16X16X8BF16 : VOP3P_Real_MFMA <0x6d>;

} // End SubtargetPredicate = HasMAIInsts

defm V_MFMA_F32_32X32X4BF16_1K  : VOP3P_Real_MFMA_gfx90a <0x63>;
defm V_MFMA_F32_16X16X4BF16_1K  : VOP3P_Real_MFMA_gfx90a <0x64>;
defm V_MFMA_F32_4X4X4BF16_1K    : VOP3P_Real_MFMA_gfx90a <0x65>;
defm V_MFMA_F32_32X32X8BF16_1K  : VOP3P_Real_MFMA_gfx90a <0x66>;
defm V_MFMA_F32_16X16X16BF16_1K : VOP3P_Real_MFMA_gfx90a <0x67>;
defm V_MFMA_F64_16X16X4F64      : VOP3P_Real_MFMA_gfx90a <0x6e>;
defm V_MFMA_F64_4X4X4F64        : VOP3P_Real_MFMA_gfx90a <0x6f>;

let SubtargetPredicate = HasPackedFP32Ops in {
  defm V_PK_FMA_F32 : VOP3P_Real_vi <0x30>;
  defm V_PK_MUL_F32 : VOP3P_Real_vi <0x31>;
  defm V_PK_ADD_F32 : VOP3P_Real_vi <0x32>;
  defm V_PK_MOV_B32 : VOP3P_Real_vi <0x33>;
} // End SubtargetPredicate = HasPackedFP32Ops

//===----------------------------------------------------------------------===//
// GFX10.
//===----------------------------------------------------------------------===//

let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10", VOP3P = 1 in {
  multiclass VOP3P_Real_gfx10<bits<7> op> {
    def _gfx10 : VOP3P_Real<!cast<VOP3P_Pseudo>(NAME), SIEncodingFamily.GFX10>,
                 VOP3Pe_gfx10 <op, !cast<VOP3P_Pseudo>(NAME).Pfl>;
  }
} // End AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10", VOP3P = 1

defm V_PK_MAD_I16     : VOP3P_Real_gfx10<0x00>;
defm V_PK_MUL_LO_U16  : VOP3P_Real_gfx10<0x01>;
defm V_PK_ADD_I16     : VOP3P_Real_gfx10<0x02>;
defm V_PK_SUB_I16     : VOP3P_Real_gfx10<0x03>;
defm V_PK_LSHLREV_B16 : VOP3P_Real_gfx10<0x04>;
defm V_PK_LSHRREV_B16 : VOP3P_Real_gfx10<0x05>;
defm V_PK_ASHRREV_I16 : VOP3P_Real_gfx10<0x06>;
defm V_PK_MAX_I16     : VOP3P_Real_gfx10<0x07>;
defm V_PK_MIN_I16     : VOP3P_Real_gfx10<0x08>;
defm V_PK_MAD_U16     : VOP3P_Real_gfx10<0x09>;
defm V_PK_ADD_U16     : VOP3P_Real_gfx10<0x0a>;
defm V_PK_SUB_U16     : VOP3P_Real_gfx10<0x0b>;
defm V_PK_MAX_U16     : VOP3P_Real_gfx10<0x0c>;
defm V_PK_MIN_U16     : VOP3P_Real_gfx10<0x0d>;
defm V_PK_FMA_F16     : VOP3P_Real_gfx10<0x0e>;
defm V_PK_ADD_F16     : VOP3P_Real_gfx10<0x0f>;
defm V_PK_MUL_F16     : VOP3P_Real_gfx10<0x10>;
defm V_PK_MIN_F16     : VOP3P_Real_gfx10<0x11>;
defm V_PK_MAX_F16     : VOP3P_Real_gfx10<0x12>;
defm V_FMA_MIX_F32    : VOP3P_Real_gfx10<0x20>;
defm V_FMA_MIXLO_F16  : VOP3P_Real_gfx10<0x21>;
defm V_FMA_MIXHI_F16  : VOP3P_Real_gfx10<0x22>;

let SubtargetPredicate = HasDot2Insts in {

defm V_DOT2_I32_I16 : VOP3P_Real_gfx10 <0x14>;
defm V_DOT2_U32_U16 : VOP3P_Real_gfx10 <0x15>;

} // End SubtargetPredicate = HasDot2Insts

let SubtargetPredicate = HasDot7Insts in {

defm V_DOT2_F32_F16 : VOP3P_Real_gfx10 <0x13>;
defm V_DOT4_U32_U8  : VOP3P_Real_gfx10 <0x17>;
defm V_DOT8_U32_U4  : VOP3P_Real_gfx10 <0x19>;

} // End SubtargetPredicate = HasDot7Insts

let SubtargetPredicate = HasDot1Insts in {

defm V_DOT4_I32_I8  : VOP3P_Real_gfx10 <0x16>;
defm V_DOT8_I32_I4  : VOP3P_Real_gfx10 <0x18>;

} // End SubtargetPredicate = HasDot1Insts