//===-- VOP3PInstructions.td - Vector Instruction Definitions -------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

//===----------------------------------------------------------------------===//
// VOP3P Classes
//===----------------------------------------------------------------------===//

// Pseudo for a packed VOP3P instruction. The selection pattern is built from
// `node`; the modifier-aware pattern is used when the profile has modifiers.
class VOP3PInst<string OpName, VOPProfile P, SDPatternOperator node = null_frag> :
  VOP3P_Pseudo<OpName, P,
    !if(P.HasModifiers, getVOP3PModPat<P, node>.ret, getVOP3Pat<P, node>.ret)
>;

// Non-packed instructions that use the VOP3P encoding.
// VOP3 neg/abs and VOP3P opsel/opsel_hi modifiers are allowed.
//
// When UseTiedOutput is set, an extra $vdst_in operand is added and tied to
// $vdst (and excluded from the encoding).
// NOTE(review): `node` is accepted but not referenced in this class body.
class VOP3_VOP3PInst<string OpName, VOPProfile P, bit UseTiedOutput = 0,
                     SDPatternOperator node = null_frag> :
  VOP3P_Pseudo<OpName, P> {
  // These operands are only sort of f16 operands. Depending on
  // op_sel_hi, these may be interpreted as f32. The inline immediate
  // values are really f16 converted to f32, so we treat these as f16
  // operands.
  let InOperandList =
    !con(
      !con(
        (ins FP16InputMods:$src0_modifiers, VCSrc_f16:$src0,
             FP16InputMods:$src1_modifiers, VCSrc_f16:$src1,
             FP16InputMods:$src2_modifiers, VCSrc_f16:$src2,
             clampmod:$clamp),
        !if(UseTiedOutput, (ins VGPR_32:$vdst_in), (ins))),
      (ins op_sel:$op_sel, op_sel_hi:$op_sel_hi));

  let Constraints = !if(UseTiedOutput, "$vdst = $vdst_in", "");
  let DisableEncoding = !if(UseTiedOutput, "$vdst_in", "");
  let AsmOperands =
    " $vdst, $src0_modifiers, $src1_modifiers, $src2_modifiers$op_sel$op_sel_hi$clamp";
}

let isCommutable = 1 in {
def V_PK_MAD_I16 : VOP3PInst<"v_pk_mad_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16_V2I16>>;
def V_PK_MAD_U16 : VOP3PInst<"v_pk_mad_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16_V2I16>>;

let FPDPRounding = 1 in {
def V_PK_FMA_F16 : VOP3PInst<"v_pk_fma_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16_V2F16>, fma>;
def V_PK_ADD_F16 : VOP3PInst<"v_pk_add_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, fadd>;
def V_PK_MUL_F16 : VOP3PInst<"v_pk_mul_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, fmul>;
} // End FPDPRounding = 1
def V_PK_MAX_F16 : VOP3PInst<"v_pk_max_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, fmaxnum_like>;
def V_PK_MIN_F16 : VOP3PInst<"v_pk_min_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, fminnum_like>;

def V_PK_ADD_U16 : VOP3PInst<"v_pk_add_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, add>;
def V_PK_ADD_I16 : VOP3PInst<"v_pk_add_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>>;
def V_PK_MUL_LO_U16 : VOP3PInst<"v_pk_mul_lo_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, mul>;

def V_PK_MIN_I16 : VOP3PInst<"v_pk_min_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, smin>;
def V_PK_MIN_U16 : VOP3PInst<"v_pk_min_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, umin>;
def V_PK_MAX_I16 : VOP3PInst<"v_pk_max_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, smax>;
def V_PK_MAX_U16 : VOP3PInst<"v_pk_max_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, umax>;
} // End isCommutable = 1

def V_PK_SUB_U16 : VOP3PInst<"v_pk_sub_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>>;
def V_PK_SUB_I16 : VOP3PInst<"v_pk_sub_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, sub>;

def V_PK_LSHLREV_B16 : VOP3PInst<"v_pk_lshlrev_b16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, lshl_rev>;
def V_PK_ASHRREV_I16 : VOP3PInst<"v_pk_ashrrev_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, ashr_rev>;
def V_PK_LSHRREV_B16 : VOP3PInst<"v_pk_lshrrev_b16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, lshr_rev>;


// Undo sub x, c -> add x, -c canonicalization since c is more likely
// an inline immediate than -c.
// The constant will be emitted as a mov, and folded later.
// TODO: We could directly encode the immediate now
def : GCNPat<
  (add (v2i16 (VOP3PMods0 v2i16:$src0, i32:$src0_modifiers, i1:$clamp)), NegSubInlineConstV216:$src1),
  (V_PK_SUB_U16 $src0_modifiers, $src0, SRCMODS.OP_SEL_1, NegSubInlineConstV216:$src1, $clamp)
>;

// Selection patterns shared by the mad_mix and fma_mix instruction families.
// Each pattern matches an fpround of `fma_like` whose three f32 inputs come
// through VOP3PMadMixMods, and selects the lo/hi f16-result instruction.
// NOTE(review): `mix_inst` is part of the interface but is not referenced by
// any pattern below.
multiclass MadFmaMixPats<SDPatternOperator fma_like,
                         Instruction mix_inst,
                         Instruction mixlo_inst,
                         Instruction mixhi_inst> {
  // Scalar f16 result: use the mixlo form with an undefined tied input.
  def : GCNPat <
    (f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)),
                            (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)),
                            (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers))))),
    (mixlo_inst $src0_modifiers, $src0,
                $src1_modifiers, $src1,
                $src2_modifiers, $src2,
                DSTCLAMP.NONE,
                (i32 (IMPLICIT_DEF)))
  >;

  // FIXME: Special case handling for mixhi (especially for clamp)
  // because dealing with the write to high half of the register is
  // difficult.

  // Result written as the second build_vector element; $elt0 is passed
  // through as the tied input.
  def : GCNPat <
    (build_vector f16:$elt0, (fpround (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)),
                                                (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)),
                                                (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers))))),
    (v2f16 (mixhi_inst $src0_modifiers, $src0,
                       $src1_modifiers, $src1,
                       $src2_modifiers, $src2,
                       DSTCLAMP.NONE,
                       $elt0))
  >;

  // Same as above, with the clamp applied to the second element only.
  def : GCNPat <
    (build_vector
      f16:$elt0,
      (AMDGPUclamp (fpround (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)),
                                      (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)),
                                      (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers)))))),
    (v2f16 (mixhi_inst $src0_modifiers, $src0,
                       $src1_modifiers, $src1,
                       $src2_modifiers, $src2,
                       DSTCLAMP.ENABLE,
                       $elt0))
  >;

  // Clamp of a whole two-element vector where both elements are mixed
  // operations: emit mixlo for the first element and feed it to mixhi as the
  // tied input, with clamp enabled on both.
  def : GCNPat <
    (AMDGPUclamp (build_vector
      (fpround (fma_like (f32 (VOP3PMadMixMods f16:$lo_src0, i32:$lo_src0_modifiers)),
                         (f32 (VOP3PMadMixMods f16:$lo_src1, i32:$lo_src1_modifiers)),
                         (f32 (VOP3PMadMixMods f16:$lo_src2, i32:$lo_src2_modifiers)))),
      (fpround (fma_like (f32 (VOP3PMadMixMods f16:$hi_src0, i32:$hi_src0_modifiers)),
                         (f32 (VOP3PMadMixMods f16:$hi_src1, i32:$hi_src1_modifiers)),
                         (f32 (VOP3PMadMixMods f16:$hi_src2, i32:$hi_src2_modifiers)))))),
    (v2f16 (mixhi_inst $hi_src0_modifiers, $hi_src0,
                       $hi_src1_modifiers, $hi_src1,
                       $hi_src2_modifiers, $hi_src2,
                       DSTCLAMP.ENABLE,
                       (mixlo_inst $lo_src0_modifiers, $lo_src0,
                                   $lo_src1_modifiers, $lo_src1,
                                   $lo_src2_modifiers, $lo_src2,
                                   DSTCLAMP.ENABLE,
                                   (i32 (IMPLICIT_DEF)))))
  >;
}

let SubtargetPredicate = HasMadMixInsts in {
// These are VOP3a-like opcodes which accept no omod.
// Size of src arguments (16/32) is controlled by op_sel_hi.
// For 16-bit src arguments their location (hi/lo) is controlled by op_sel.
let isCommutable = 1 in {
def V_MAD_MIX_F32 : VOP3_VOP3PInst<"v_mad_mix_f32", VOP3_Profile<VOP_F32_F16_F16_F16, VOP3_OPSEL>>;

let FPDPRounding = 1 in {
// Clamp modifier is applied after conversion to f16.
def V_MAD_MIXLO_F16 : VOP3_VOP3PInst<"v_mad_mixlo_f16", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, 1>;

let ClampLo = 0, ClampHi = 1 in {
def V_MAD_MIXHI_F16 : VOP3_VOP3PInst<"v_mad_mixhi_f16", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, 1>;
} // End ClampLo = 0, ClampHi = 1
} // End FPDPRounding = 1
} // End isCommutable = 1

defm : MadFmaMixPats<fmad, V_MAD_MIX_F32, V_MAD_MIXLO_F16, V_MAD_MIXHI_F16>;
} // End SubtargetPredicate = HasMadMixInsts


// Essentially the same as the mad_mix versions
let SubtargetPredicate = HasFmaMixInsts in {
let isCommutable = 1 in {
def V_FMA_MIX_F32 : VOP3_VOP3PInst<"v_fma_mix_f32", VOP3_Profile<VOP_F32_F16_F16_F16, VOP3_OPSEL>>;

let FPDPRounding = 1 in {
// Clamp modifier is applied after conversion to f16.
def V_FMA_MIXLO_F16 : VOP3_VOP3PInst<"v_fma_mixlo_f16", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, 1>;

let ClampLo = 0, ClampHi = 1 in {
def V_FMA_MIXHI_F16 : VOP3_VOP3PInst<"v_fma_mixhi_f16", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, 1>;
} // End ClampLo = 0, ClampHi = 1
} // End FPDPRounding = 1
} // End isCommutable = 1

defm : MadFmaMixPats<fma, V_FMA_MIX_F32, V_FMA_MIXLO_F16, V_FMA_MIXHI_F16>;
} // End SubtargetPredicate = HasFmaMixInsts

// Defines patterns that extract a signed 4-bit field starting at bit Idx[0].
// Each pair is [bit offset, left-shift amount]: the field is shifted up to the
// top 4 bits (shl by Idx[1] = 28 - Idx[0]) and then arithmetic-shifted right
// by 28 to sign-extend it. Offset 28 needs no shl and is handled by Extract.
foreach Idx = [[0,28],[4,24],[8,20],[12,16],[16,12],[20,8],[24,4]] in
  def ExtractSigned4bit_#Idx[0] : PatFrag<(ops node:$src),
                                          (sra (shl node:$src, (i32 Idx[1])), (i32 28))>;

// Defines code pattern that extracts U(unsigned/signed) 4/8bit from FromBitIndex.
// Extract a BitMask-wide field (BitMask is 15 for 4 bits, 255 for 8 bits)
// that starts at bit FromBitIndex of a 32-bit value. U selects unsigned (1)
// or signed (0) extraction.
//   - Last element (8-bit field at bit 24, or any field at bit 28): a single
//     srl (unsigned) or sra (signed) by FromBitIndex.
//   - First element (FromBitIndex == 0): mask with BitMask (unsigned),
//     ExtractSigned4bit_0 (signed 4-bit) or sext_inreg i8 (signed 8-bit).
//   - Middle elements: srl + mask (unsigned), ExtractSigned4bit_<FromBitIndex>
//     (signed 4-bit) or srl + sext_inreg i8 (signed 8-bit).
class Extract<int FromBitIndex, int BitMask, bit U>: PatFrag<
  (ops node:$src),
  !if (!or (!and (!eq (BitMask, 255), !eq (FromBitIndex, 24)), !eq (FromBitIndex, 28)), // last element
       !if (U, (srl node:$src, (i32 FromBitIndex)), (sra node:$src, (i32 FromBitIndex))),
       !if (!eq (FromBitIndex, 0), // first element
            !if (U, (and node:$src, (i32 BitMask)),
                 !if (!eq (BitMask, 15), (!cast<PatFrag>("ExtractSigned4bit_"#FromBitIndex) node:$src),
                                         (sext_inreg node:$src, i8))),
            !if (U, (and (srl node:$src, (i32 FromBitIndex)), (i32 BitMask)),
                 !if (!eq (BitMask, 15), (!cast<PatFrag>("ExtractSigned4bit_"#FromBitIndex) node:$src),
                      (sext_inreg (srl node:$src, (i32 FromBitIndex)), i8)))))>;


foreach Type = ["I", "U"] in
  foreach Index = 0-3 in {
    // Defines patterns that extract each Index'ed 8bit from an unsigned
    // 32bit scalar value;
    def #Type#Index#"_8bit" : Extract<!shl(Index, 3), 255, !if (!eq (Type, "U"), 1, 0)>;

    // Defines multiplication patterns where the multiplication is happening on each
    // Index'ed 8bit of a 32bit scalar value.

    def Mul#Type#_Elt#Index : PatFrag<
      (ops node:$src0, node:$src1),
      (!cast<HasOneUseBinOp>(!if (!eq (Type, "I"), AMDGPUmul_i24_oneuse, AMDGPUmul_u24_oneuse))
                            (!cast<Extract>(#Type#Index#"_8bit") node:$src0),
                            (!cast<Extract>(#Type#Index#"_8bit") node:$src1))>;
  }

// Different variants of dot8 patterns cause a huge increase in the compile time.
// Define non-associative/commutative add/mul to prevent permutation in the dot8
// pattern.
def NonACAdd                   : SDNode<"ISD::ADD"       , SDTIntBinOp>;
def NonACAdd_oneuse            : HasOneUseBinOp<NonACAdd>;

def NonACAMDGPUmul_u24         : SDNode<"AMDGPUISD::MUL_U24" , SDTIntBinOp>;
def NonACAMDGPUmul_u24_oneuse  : HasOneUseBinOp<NonACAMDGPUmul_u24>;

def NonACAMDGPUmul_i24         : SDNode<"AMDGPUISD::MUL_I24" , SDTIntBinOp>;
def NonACAMDGPUmul_i24_oneuse  : HasOneUseBinOp<NonACAMDGPUmul_i24>;

foreach Type = ["I", "U"] in
  foreach Index = 0-7 in {
    // Defines patterns that extract each Index'ed 4bit from an unsigned
    // 32bit scalar value;
    def #Type#Index#"_4bit" : Extract<!shl(Index, 2), 15, !if (!eq (Type, "U"), 1, 0)>;

    // Defines multiplication patterns where the multiplication is happening on each
    // Index'ed 4bit of a 32bit scalar value.
    def Mul#Type#Index#"_4bit" : PatFrag<
      (ops node:$src0, node:$src1),
      (!cast<HasOneUseBinOp>(!if (!eq (Type, "I"), NonACAMDGPUmul_i24_oneuse, NonACAMDGPUmul_u24_oneuse))
                             (!cast<Extract>(#Type#Index#"_4bit") node:$src0),
                             (!cast<Extract>(#Type#Index#"_4bit") node:$src1))>;
  }

// Matches an open-coded unsigned 2-element dot product:
//   (hi16(src0) * hi16(src1) + src2) + lo16(src0) * lo16(src1)
// and selects Inst with clamp disabled.
// NOTE(review): the (i32 8) source-modifier operands presumably encode the
// default op_sel_hi bit (SRCMODS.OP_SEL_1) -- confirm against SIDefines.
class UDot2Pat<Instruction Inst> : GCNPat <
  (add (add_oneuse (AMDGPUmul_u24_oneuse (srl i32:$src0, (i32 16)),
                                         (srl i32:$src1, (i32 16))), i32:$src2),
       (AMDGPUmul_u24_oneuse (and i32:$src0, (i32 65535)),
                             (and i32:$src1, (i32 65535)))
   ),
  (Inst (i32 8), $src0, (i32 8), $src1, (i32 8), $src2, (i1 0))> {
  let SubtargetPredicate = !cast<VOP_Pseudo>(Inst).SubtargetPredicate;
}

// Signed counterpart of UDot2Pat: the high halves are taken with sra and the
// low halves with sext_inreg i16.
class SDot2Pat<Instruction Inst> : GCNPat <
  (add (add_oneuse (AMDGPUmul_i24_oneuse (sra i32:$src0, (i32 16)),
                                         (sra i32:$src1, (i32 16))), i32:$src2),
       (AMDGPUmul_i24_oneuse (sext_inreg i32:$src0, i16),
                             (sext_inreg i32:$src1, i16))),
  (Inst (i32 8), $src0, (i32 8), $src1, (i32 8), $src2, (i1 0))> {
  let SubtargetPredicate = !cast<VOP_Pseudo>(Inst).SubtargetPredicate;
}

let SubtargetPredicate = HasDot2Insts in {

def V_DOT2_F32_F16 : VOP3PInst<"v_dot2_f32_f16", VOP3_Profile<VOP_F32_V2F16_V2F16_F32>>;
def V_DOT2_I32_I16 : VOP3PInst<"v_dot2_i32_i16", VOP3_Profile<VOP_I32_V2I16_V2I16_I32>>;
def V_DOT2_U32_U16 : VOP3PInst<"v_dot2_u32_u16", VOP3_Profile<VOP_I32_V2I16_V2I16_I32>>;
def V_DOT4_U32_U8  : VOP3PInst<"v_dot4_u32_u8", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>>;
def V_DOT8_U32_U4  : VOP3PInst<"v_dot8_u32_u4", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>>;

} // End SubtargetPredicate = HasDot2Insts

let SubtargetPredicate = HasDot1Insts in {

def V_DOT4_I32_I8  : VOP3PInst<"v_dot4_i32_i8", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>>;
def V_DOT8_I32_I4  : VOP3PInst<"v_dot8_i32_i4", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>>;

} // End SubtargetPredicate = HasDot1Insts

// Selects `dot_inst` for the dot-product operator `dot_op`, forwarding the
// source modifiers and the clamp bit. The pattern inherits the instruction's
// SubtargetPredicate.
multiclass DotPats<SDPatternOperator dot_op,
                   VOP3PInst dot_inst> {
  let SubtargetPredicate = dot_inst.SubtargetPredicate in
  def : GCNPat <
    (dot_op (dot_inst.Pfl.Src0VT (VOP3PMods0 dot_inst.Pfl.Src0VT:$src0, i32:$src0_modifiers)),
            (dot_inst.Pfl.Src1VT (VOP3PMods dot_inst.Pfl.Src1VT:$src1, i32:$src1_modifiers)),
            (dot_inst.Pfl.Src2VT (VOP3PMods dot_inst.Pfl.Src2VT:$src2, i32:$src2_modifiers)), i1:$clamp),
    (dot_inst $src0_modifiers, $src0, $src1_modifiers, $src1, $src2_modifiers, $src2, (as_i1imm $clamp))>;
}

defm : DotPats<AMDGPUfdot2, V_DOT2_F32_F16>;
defm : DotPats<int_amdgcn_sdot2, V_DOT2_I32_I16>;
defm : DotPats<int_amdgcn_udot2, V_DOT2_U32_U16>;
defm : DotPats<int_amdgcn_sdot4, V_DOT4_I32_I8>;
defm : DotPats<int_amdgcn_udot4, V_DOT4_U32_U8>;
defm : DotPats<int_amdgcn_sdot8, V_DOT8_I32_I4>;
defm : DotPats<int_amdgcn_udot8, V_DOT8_U32_U4>;

def : UDot2Pat<V_DOT2_U32_U16>;
def : SDot2Pat<V_DOT2_I32_I16>;

// Open-coded dot4: fold the four per-byte products onto $src2 with
// add_oneuse, in element order 0..3.
foreach Type = ["U", "I"] in
  let SubtargetPredicate = !cast<VOP_Pseudo>("V_DOT4_"#Type#"32_"#Type#8).SubtargetPredicate in
  def : GCNPat <
    !cast<dag>(!foldl((i32 i32:$src2), [0, 1, 2, 3], lhs, y,
                      (add_oneuse lhs, (!cast<PatFrag>("Mul"#Type#"_Elt"#y) i32:$src0, i32:$src1)))),
    (!cast<VOP3PInst>("V_DOT4_"#Type#"32_"#Type#8) (i32 8), $src0, (i32 8), $src1, (i32 8), $src2, (i1 0))>;

// Open-coded dot8: element 0 is added to $src2 first, then elements 1..7 are
// accumulated with the non-associative/commutative add so that no permuted
// variants of the pattern are generated.
foreach Type = ["U", "I"] in
  let SubtargetPredicate = !cast<VOP_Pseudo>("V_DOT8_"#Type#"32_"#Type#4).SubtargetPredicate in
  def : GCNPat <
    !cast<dag>(!foldl((add_oneuse i32:$src2, (!cast<PatFrag>("Mul"#Type#"0_4bit") i32:$src0, i32:$src1)),
                      [1, 2, 3, 4, 5, 6, 7], lhs, y,
                      (NonACAdd_oneuse lhs, (!cast<PatFrag>("Mul"#Type#y#"_4bit") i32:$src0, i32:$src1)))),
    (!cast<VOP3PInst>("V_DOT8_"#Type#"32_"#Type#4) (i32 8), $src0, (i32 8), $src1, (i32 8), $src2, (i1 0))>;

// Different variants of dot8 code-gen dag patterns are not generated through table-gen due to a huge increase
// in the compile time. Directly handle the pattern generated by the FE here.
// This variant accumulates element 7 right after element 0, then 1..6.
foreach Type = ["U", "I"] in
  let SubtargetPredicate = !cast<VOP_Pseudo>("V_DOT8_"#Type#"32_"#Type#4).SubtargetPredicate in
  def : GCNPat <
    !cast<dag>(!foldl((add_oneuse i32:$src2, (!cast<PatFrag>("Mul"#Type#"0_4bit") i32:$src0, i32:$src1)),
                      [7, 1, 2, 3, 4, 5, 6], lhs, y,
                      (NonACAdd_oneuse lhs, (!cast<PatFrag>("Mul"#Type#y#"_4bit") i32:$src0, i32:$src1)))),
    (!cast<VOP3PInst>("V_DOT8_"#Type#"32_"#Type#4) (i32 8), $src0, (i32 8), $src1, (i32 8), $src2, (i1 0))>;

// Destination operands backed by the AGPR_32 / AReg_* register classes.
def ADst_32   : VOPDstOperand<AGPR_32>;
def ADst_128  : VOPDstOperand<AReg_128>;
def ADst_512  : VOPDstOperand<AReg_512>;
def ADst_1024 : VOPDstOperand<AReg_1024>;

// Profile for v_accvgpr_read_b32: the source operand class is ARegSrc_32.
def VOPProfileAccRead : VOP3_Profile<VOP_I32_I32, VOP3_MAI> {
  let Src0RC64 = ARegSrc_32;
}

// Profile for v_accvgpr_write_b32: the destination is ADst_32 and the source
// operand class is VISrc_b32.
def VOPProfileAccWrite : VOP3_Profile<VOP_I32_I32, VOP3_MAI> {
  let DstRC = ADst_32;
  let Src0RC64 = VISrc_b32;
}

// Common profile for the MFMA instructions. _SrcRC is the operand class of
// src2, _DstRC the destination, and SrcABRC the class shared by src0/src1.
// opsel, clamp and source modifiers are disabled; the cbsz/abid/blgp
// operands are appended instead.
class VOPProfileMAI<VOPProfile P, RegisterOperand _SrcRC, RegisterOperand _DstRC,
                    RegisterOperand SrcABRC = AVSrc_32>
  : VOP3_Profile<P, VOP3_MAI> {
  let DstRC = _DstRC;
  let Src0RC64 = SrcABRC;
  let Src1RC64 = SrcABRC;
  let Src2RC64 = _SrcRC;
  let HasOpSel = 0;
  let HasClamp = 0;
  let HasModifiers = 0;
  let Asm64 = " $vdst, $src0, $src1, $src2$cbsz$abid$blgp";
  let Ins64 = (ins Src0RC64:$src0, Src1RC64:$src1, Src2RC64:$src2, cbsz:$cbsz, abid:$abid, blgp:$blgp);
}

def VOPProfileMAI_F32_F32_X4    : VOPProfileMAI<VOP_V4F32_F32_F32_V4F32,       AISrc_128_f32,  ADst_128>;
def VOPProfileMAI_F32_F32_X16   : VOPProfileMAI<VOP_V16F32_F32_F32_V16F32,     AISrc_512_f32,  ADst_512>;
def VOPProfileMAI_F32_F32_X32   : VOPProfileMAI<VOP_V32F32_F32_F32_V32F32,     AISrc_1024_f32, ADst_1024>;
def VOPProfileMAI_I32_I32_X4    : VOPProfileMAI<VOP_V4I32_I32_I32_V4I32,       AISrc_128_b32,  ADst_128>;
def VOPProfileMAI_I32_I32_X16   : VOPProfileMAI<VOP_V16I32_I32_I32_V16I32,     AISrc_512_b32,  ADst_512>;
def VOPProfileMAI_I32_I32_X32   : VOPProfileMAI<VOP_V32I32_I32_I32_V32I32,     AISrc_1024_b32, ADst_1024>;
def VOPProfileMAI_F32_V2I16_X4  : VOPProfileMAI<VOP_V4F32_V2I16_V2I16_V4F32,   AISrc_128_b32,  ADst_128>;
def VOPProfileMAI_F32_V2I16_X16 : VOPProfileMAI<VOP_V16F32_V2I16_V2I16_V16F32, AISrc_512_b32,  ADst_512>;
def VOPProfileMAI_F32_V2I16_X32 : VOPProfileMAI<VOP_V32F32_V2I16_V2I16_V32F32, AISrc_1024_b32, ADst_1024>;
def VOPProfileMAI_F32_V4F16_X4  : VOPProfileMAI<VOP_V4F32_V4F16_V4F16_V4F32,   AISrc_128_b32,  ADst_128,  AVSrc_64>;
def VOPProfileMAI_F32_V4F16_X16 : VOPProfileMAI<VOP_V16F32_V4F16_V4F16_V16F32, AISrc_512_b32,  ADst_512,  AVSrc_64>;
def VOPProfileMAI_F32_V4F16_X32 : VOPProfileMAI<VOP_V32F32_V4F16_V4F16_V32F32, AISrc_1024_b32, ADst_1024, AVSrc_64>;

let Predicates = [HasMAIInsts] in {
def V_ACCVGPR_READ_B32  : VOP3Inst<"v_accvgpr_read_b32",  VOPProfileAccRead>;
def V_ACCVGPR_WRITE_B32 : VOP3Inst<"v_accvgpr_write_b32", VOPProfileAccWrite> {
  let isMoveImm = 1;
}

let isConvergent = 1 in {
def V_MFMA_F32_4X4X1F32    : VOP3Inst<"v_mfma_f32_4x4x1f32",    VOPProfileMAI_F32_F32_X4,    int_amdgcn_mfma_f32_4x4x1f32>;
def V_MFMA_F32_4X4X4F16    : VOP3Inst<"v_mfma_f32_4x4x4f16",    VOPProfileMAI_F32_V4F16_X4,  int_amdgcn_mfma_f32_4x4x4f16>;
def V_MFMA_I32_4X4X4I8     : VOP3Inst<"v_mfma_i32_4x4x4i8",     VOPProfileMAI_I32_I32_X4,    int_amdgcn_mfma_i32_4x4x4i8>;
def V_MFMA_F32_4X4X2BF16   : VOP3Inst<"v_mfma_f32_4x4x2bf16",   VOPProfileMAI_F32_V2I16_X4,  int_amdgcn_mfma_f32_4x4x2bf16>;
def V_MFMA_F32_16X16X1F32  : VOP3Inst<"v_mfma_f32_16x16x1f32",  VOPProfileMAI_F32_F32_X16,   int_amdgcn_mfma_f32_16x16x1f32>;
def V_MFMA_F32_16X16X4F32  : VOP3Inst<"v_mfma_f32_16x16x4f32",  VOPProfileMAI_F32_F32_X4,    int_amdgcn_mfma_f32_16x16x4f32>;
def V_MFMA_F32_16X16X4F16  : VOP3Inst<"v_mfma_f32_16x16x4f16",  VOPProfileMAI_F32_V4F16_X16, int_amdgcn_mfma_f32_16x16x4f16>;
def V_MFMA_F32_16X16X16F16 : VOP3Inst<"v_mfma_f32_16x16x16f16", VOPProfileMAI_F32_V4F16_X4,  int_amdgcn_mfma_f32_16x16x16f16>;
def V_MFMA_I32_16X16X4I8   : VOP3Inst<"v_mfma_i32_16x16x4i8",   VOPProfileMAI_I32_I32_X16,   int_amdgcn_mfma_i32_16x16x4i8>;
def V_MFMA_I32_16X16X16I8  : VOP3Inst<"v_mfma_i32_16x16x16i8",  VOPProfileMAI_I32_I32_X4,    int_amdgcn_mfma_i32_16x16x16i8>;
def V_MFMA_F32_16X16X2BF16 : VOP3Inst<"v_mfma_f32_16x16x2bf16", VOPProfileMAI_F32_V2I16_X16, int_amdgcn_mfma_f32_16x16x2bf16>;
def V_MFMA_F32_16X16X8BF16 : VOP3Inst<"v_mfma_f32_16x16x8bf16", VOPProfileMAI_F32_V2I16_X4,  int_amdgcn_mfma_f32_16x16x8bf16>;
def V_MFMA_F32_32X32X1F32  : VOP3Inst<"v_mfma_f32_32x32x1f32",  VOPProfileMAI_F32_F32_X32,   int_amdgcn_mfma_f32_32x32x1f32>;
def V_MFMA_F32_32X32X2F32  : VOP3Inst<"v_mfma_f32_32x32x2f32",  VOPProfileMAI_F32_F32_X16,   int_amdgcn_mfma_f32_32x32x2f32>;
def V_MFMA_F32_32X32X4F16  : VOP3Inst<"v_mfma_f32_32x32x4f16",  VOPProfileMAI_F32_V4F16_X32, int_amdgcn_mfma_f32_32x32x4f16>;
def V_MFMA_F32_32X32X8F16  : VOP3Inst<"v_mfma_f32_32x32x8f16",  VOPProfileMAI_F32_V4F16_X16, int_amdgcn_mfma_f32_32x32x8f16>;
def V_MFMA_I32_32X32X4I8   : VOP3Inst<"v_mfma_i32_32x32x4i8",   VOPProfileMAI_I32_I32_X32,   int_amdgcn_mfma_i32_32x32x4i8>;
def V_MFMA_I32_32X32X8I8   : VOP3Inst<"v_mfma_i32_32x32x8i8",   VOPProfileMAI_I32_I32_X16,   int_amdgcn_mfma_i32_32x32x8i8>;
def V_MFMA_F32_32X32X2BF16 : VOP3Inst<"v_mfma_f32_32x32x2bf16", VOPProfileMAI_F32_V2I16_X32, int_amdgcn_mfma_f32_32x32x2bf16>;
def V_MFMA_F32_32X32X4BF16 : VOP3Inst<"v_mfma_f32_32x32x4bf16", VOPProfileMAI_F32_V2I16_X16, int_amdgcn_mfma_f32_32x32x4bf16>;
} // End isConvergent = 1

} // End Predicates = [HasMAIInsts]

def : MnemonicAlias<"v_accvgpr_read", "v_accvgpr_read_b32">;
def : MnemonicAlias<"v_accvgpr_write", "v_accvgpr_write_b32">;

// Real (encoded) VI-family instruction for the VOP3P pseudo named NAME.
multiclass VOP3P_Real_vi<bits<10> op> {
  def _vi : VOP3P_Real<!cast<VOP3_Pseudo>(NAME), SIEncodingFamily.VI>,
            VOP3Pe <op, !cast<VOP3_Pseudo>(NAME).Pfl> {
    let AssemblerPredicates = [HasVOP3PInsts];
    let DecoderNamespace = "GFX8";
  }
}

// Real (encoded) VI-family instruction for the MAI pseudo named NAME, using
// the VOP3Pe_MAI encoding.
multiclass VOP3P_Real_MAI<bits<10> op> {
  def _vi : VOP3P_Real<!cast<VOP3_Pseudo>(NAME), SIEncodingFamily.VI>,
            VOP3Pe_MAI <op, !cast<VOP3_Pseudo>(NAME).Pfl> {
    let AssemblerPredicates = [HasMAIInsts];
    let DecoderNamespace = "GFX8";
  }
}

defm V_PK_MAD_I16     : VOP3P_Real_vi <0x380>;
defm V_PK_MUL_LO_U16  : VOP3P_Real_vi <0x381>;
defm V_PK_ADD_I16     : VOP3P_Real_vi <0x382>;
defm V_PK_SUB_I16     : VOP3P_Real_vi <0x383>;
defm V_PK_LSHLREV_B16 : VOP3P_Real_vi <0x384>;
defm V_PK_LSHRREV_B16 : VOP3P_Real_vi <0x385>;
defm V_PK_ASHRREV_I16 : VOP3P_Real_vi <0x386>;
defm V_PK_MAX_I16     : VOP3P_Real_vi <0x387>;
defm V_PK_MIN_I16     : VOP3P_Real_vi <0x388>;
defm V_PK_MAD_U16     : VOP3P_Real_vi <0x389>;

defm V_PK_ADD_U16     : VOP3P_Real_vi <0x38a>;
defm V_PK_SUB_U16     : VOP3P_Real_vi <0x38b>;
defm V_PK_MAX_U16     : VOP3P_Real_vi <0x38c>;
defm V_PK_MIN_U16     : VOP3P_Real_vi <0x38d>;
defm V_PK_FMA_F16     : VOP3P_Real_vi <0x38e>;
defm V_PK_ADD_F16     : VOP3P_Real_vi <0x38f>;
defm V_PK_MUL_F16     : VOP3P_Real_vi <0x390>;
defm V_PK_MIN_F16     : VOP3P_Real_vi <0x391>;
defm V_PK_MAX_F16     : VOP3P_Real_vi <0x392>;


let SubtargetPredicate = HasMadMixInsts in {
defm V_MAD_MIX_F32   : VOP3P_Real_vi <0x3a0>;
defm V_MAD_MIXLO_F16 : VOP3P_Real_vi <0x3a1>;
defm V_MAD_MIXHI_F16 : VOP3P_Real_vi <0x3a2>;
} // End SubtargetPredicate = HasMadMixInsts

let SubtargetPredicate = HasFmaMixInsts in {
let DecoderNamespace = "GFX9_DL" in {
// The mad_mix instructions were renamed and their behaviors changed,
// but the opcode stayed the same so we need to put these in a
// different DecoderNamespace to avoid the ambiguity.
defm V_FMA_MIX_F32   : VOP3P_Real_vi <0x3a0>;
defm V_FMA_MIXLO_F16 : VOP3P_Real_vi <0x3a1>;
defm V_FMA_MIXHI_F16 : VOP3P_Real_vi <0x3a2>;
} // End DecoderNamespace = "GFX9_DL"
} // End SubtargetPredicate = HasFmaMixInsts


let SubtargetPredicate = HasDot2Insts in {

defm V_DOT2_F32_F16 : VOP3P_Real_vi <0x3a3>;
defm V_DOT2_I32_I16 : VOP3P_Real_vi <0x3a6>;
defm V_DOT2_U32_U16 : VOP3P_Real_vi <0x3a7>;
defm V_DOT4_U32_U8  : VOP3P_Real_vi <0x3a9>;
defm V_DOT8_U32_U4  : VOP3P_Real_vi <0x3ab>;

} // End SubtargetPredicate = HasDot2Insts

let SubtargetPredicate = HasDot1Insts in {

defm V_DOT4_I32_I8  : VOP3P_Real_vi <0x3a8>;
defm V_DOT8_I32_I4  : VOP3P_Real_vi <0x3aa>;

} // End SubtargetPredicate = HasDot1Insts

let SubtargetPredicate = HasMAIInsts in {

defm V_ACCVGPR_READ_B32     : VOP3P_Real_MAI <0x3d8>;
defm V_ACCVGPR_WRITE_B32    : VOP3P_Real_MAI <0x3d9>;
defm V_MFMA_F32_32X32X1F32  : VOP3P_Real_MAI <0x3c0>;
defm V_MFMA_F32_16X16X1F32  : VOP3P_Real_MAI <0x3c1>;
defm V_MFMA_F32_4X4X1F32    : VOP3P_Real_MAI <0x3c2>;
defm V_MFMA_F32_32X32X2F32  : VOP3P_Real_MAI <0x3c4>;
defm V_MFMA_F32_16X16X4F32  : VOP3P_Real_MAI <0x3c5>;
defm V_MFMA_F32_32X32X4F16  : VOP3P_Real_MAI <0x3c8>;
defm V_MFMA_F32_16X16X4F16  : VOP3P_Real_MAI <0x3c9>;
defm V_MFMA_F32_4X4X4F16    : VOP3P_Real_MAI <0x3ca>;
defm V_MFMA_F32_32X32X8F16  : VOP3P_Real_MAI <0x3cc>;
defm V_MFMA_F32_16X16X16F16 : VOP3P_Real_MAI <0x3cd>;
defm V_MFMA_I32_32X32X4I8   : VOP3P_Real_MAI <0x3d0>;
defm V_MFMA_I32_16X16X4I8   : VOP3P_Real_MAI <0x3d1>;
defm V_MFMA_I32_4X4X4I8     : VOP3P_Real_MAI <0x3d2>;
defm V_MFMA_I32_32X32X8I8   : VOP3P_Real_MAI <0x3d4>;
defm V_MFMA_I32_16X16X16I8  : VOP3P_Real_MAI <0x3d5>;
defm V_MFMA_F32_32X32X2BF16 : VOP3P_Real_MAI <0x3e8>;
defm V_MFMA_F32_16X16X2BF16 : VOP3P_Real_MAI <0x3e9>;
defm V_MFMA_F32_4X4X2BF16   : VOP3P_Real_MAI <0x3eb>;
defm V_MFMA_F32_32X32X4BF16 : VOP3P_Real_MAI <0x3ec>;
defm V_MFMA_F32_16X16X8BF16 : VOP3P_Real_MAI <0x3ed>;

} // End SubtargetPredicate = HasMAIInsts

//===----------------------------------------------------------------------===//
// GFX10.
//===----------------------------------------------------------------------===//

let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in {
  // Real (encoded) GFX10-family instruction for the VOP3P pseudo named NAME.
  multiclass VOP3P_Real_gfx10<bits<10> op> {
    def _gfx10 : VOP3P_Real<!cast<VOP3P_Pseudo>(NAME), SIEncodingFamily.GFX10>,
                 VOP3Pe_gfx10 <op, !cast<VOP3P_Pseudo>(NAME).Pfl>;
  }
} // End AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10"

defm V_PK_MAD_I16     : VOP3P_Real_gfx10<0x000>;
defm V_PK_MUL_LO_U16  : VOP3P_Real_gfx10<0x001>;
defm V_PK_ADD_I16     : VOP3P_Real_gfx10<0x002>;
defm V_PK_SUB_I16     : VOP3P_Real_gfx10<0x003>;
defm V_PK_LSHLREV_B16 : VOP3P_Real_gfx10<0x004>;
defm V_PK_LSHRREV_B16 : VOP3P_Real_gfx10<0x005>;
defm V_PK_ASHRREV_I16 : VOP3P_Real_gfx10<0x006>;
defm V_PK_MAX_I16     : VOP3P_Real_gfx10<0x007>;
defm V_PK_MIN_I16     : VOP3P_Real_gfx10<0x008>;
defm V_PK_MAD_U16     : VOP3P_Real_gfx10<0x009>;
defm V_PK_ADD_U16     : VOP3P_Real_gfx10<0x00a>;
defm V_PK_SUB_U16     : VOP3P_Real_gfx10<0x00b>;
defm V_PK_MAX_U16     : VOP3P_Real_gfx10<0x00c>;
defm V_PK_MIN_U16     : VOP3P_Real_gfx10<0x00d>;
defm V_PK_FMA_F16     : VOP3P_Real_gfx10<0x00e>;
defm V_PK_ADD_F16     : VOP3P_Real_gfx10<0x00f>;
defm V_PK_MUL_F16     : VOP3P_Real_gfx10<0x010>;
defm V_PK_MIN_F16     : VOP3P_Real_gfx10<0x011>;
defm V_PK_MAX_F16     : VOP3P_Real_gfx10<0x012>;
defm V_FMA_MIX_F32    : VOP3P_Real_gfx10<0x020>;
defm V_FMA_MIXLO_F16  : VOP3P_Real_gfx10<0x021>;
defm V_FMA_MIXHI_F16  : VOP3P_Real_gfx10<0x022>;

let SubtargetPredicate = HasDot2Insts in {

defm V_DOT2_F32_F16 : VOP3P_Real_gfx10 <0x013>;
defm V_DOT2_I32_I16 : VOP3P_Real_gfx10 <0x014>;
defm V_DOT2_U32_U16 : VOP3P_Real_gfx10 <0x015>;
defm V_DOT4_U32_U8  : VOP3P_Real_gfx10 <0x017>;
defm V_DOT8_U32_U4  : VOP3P_Real_gfx10 <0x019>;

} // End SubtargetPredicate = HasDot2Insts

let SubtargetPredicate = HasDot1Insts in {

defm V_DOT4_I32_I8  : VOP3P_Real_gfx10 <0x016>;
defm V_DOT8_I32_I4  : VOP3P_Real_gfx10 <0x018>;

} // End SubtargetPredicate = HasDot1Insts