//===-- VOP3PInstructions.td - Vector Instruction Definitions -------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

//===----------------------------------------------------------------------===//
// VOP3P Classes
//===----------------------------------------------------------------------===//

// Pseudo for an instruction in the VOP3P encoding. The selection pattern is
// taken from getVOP3PModPat when the profile has modifiers and from
// getVOP3Pat otherwise; with the default null_frag node no operator is given.
class VOP3PInst<string OpName, VOPProfile P,
                SDPatternOperator node = null_frag>
  : VOP3P_Pseudo<OpName, P,
                 !if(P.HasModifiers,
                     getVOP3PModPat<P, node>.ret,
                     getVOP3Pat<P, node>.ret)>;

// Non-packed instructions that use the VOP3P encoding.
// VOP3 neg/abs and VOP3P opsel/opsel_hi modifiers are allowed.
//
// UseTiedOutput appends a $vdst_in operand that is tied to $vdst and excluded
// from the encoding. The node parameter is accepted but is not referenced in
// this class body.
class VOP3_VOP3PInst<string OpName, VOPProfile P, bit UseTiedOutput = 0,
                     SDPatternOperator node = null_frag>
  : VOP3P_Pseudo<OpName, P> {
  // These operands are only sort of f16 operands. Depending on
  // op_sel_hi, these may be interpreted as f32. The inline immediate
  // values are really f16 converted to f32, so we treat these as f16
  // operands.
  let InOperandList =
    !con(
      !con(
        (ins FP16InputMods:$src0_modifiers, VCSrc_f16:$src0,
             FP16InputMods:$src1_modifiers, VCSrc_f16:$src1,
             FP16InputMods:$src2_modifiers, VCSrc_f16:$src2,
             clampmod:$clamp),
        // Optional tied input sits between clamp and the op_sel operands.
        !if(UseTiedOutput, (ins VGPR_32:$vdst_in), (ins))),
      (ins op_sel:$op_sel, op_sel_hi:$op_sel_hi));

  let Constraints = !if(UseTiedOutput, "$vdst = $vdst_in", "");
  let DisableEncoding = !if(UseTiedOutput, "$vdst_in", "");
  let AsmOperands =
    " $vdst, $src0_modifiers, $src1_modifiers, $src2_modifiers$op_sel$op_sel_hi$clamp";
}

// Packed 16-bit operations marked commutable.
let isCommutable = 1 in {
  def V_PK_MAD_I16 : VOP3PInst<"v_pk_mad_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16_V2I16>>;
  def V_PK_MAD_U16 : VOP3PInst<"v_pk_mad_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16_V2I16>>;

  let FPDPRounding = 1 in {
    def V_PK_FMA_F16 : VOP3PInst<"v_pk_fma_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16_V2F16>, fma>;
    def V_PK_ADD_F16 : VOP3PInst<"v_pk_add_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, fadd>;
    def V_PK_MUL_F16 : VOP3PInst<"v_pk_mul_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, fmul>;
  } // End FPDPRounding = 1
  def V_PK_MAX_F16 : VOP3PInst<"v_pk_max_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, fmaxnum_like>;
  def V_PK_MIN_F16 : VOP3PInst<"v_pk_min_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, fminnum_like>;

  def V_PK_ADD_U16 : VOP3PInst<"v_pk_add_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, add>;
  def V_PK_ADD_I16 : VOP3PInst<"v_pk_add_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>>;
  def V_PK_MUL_LO_U16 : VOP3PInst<"v_pk_mul_lo_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, mul>;

  def V_PK_MIN_I16 : VOP3PInst<"v_pk_min_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, smin>;
  def V_PK_MIN_U16 : VOP3PInst<"v_pk_min_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, umin>;
  def V_PK_MAX_I16 : VOP3PInst<"v_pk_max_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, smax>;
  def V_PK_MAX_U16 : VOP3PInst<"v_pk_max_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, umax>;
} // End isCommutable = 1

// Subtractions and reversed-operand shifts are defined outside the
// commutable group.
def V_PK_SUB_U16 : VOP3PInst<"v_pk_sub_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>>;
def V_PK_SUB_I16 : VOP3PInst<"v_pk_sub_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, sub>;

def V_PK_LSHLREV_B16 : VOP3PInst<"v_pk_lshlrev_b16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, lshl_rev>;
def V_PK_ASHRREV_I16 : VOP3PInst<"v_pk_ashrrev_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, ashr_rev>;
def V_PK_LSHRREV_B16 : VOP3PInst<"v_pk_lshrrev_b16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, lshr_rev>;


// Undo sub x, c -> add x, -c canonicalization since c is more likely
// an inline immediate than -c.
// The constant will be emitted as a mov, and folded later.
// TODO: We could directly encode the immediate now
def : GCNPat<
  (add (v2i16 (VOP3PMods0 v2i16:$src0, i32:$src0_modifiers, i1:$clamp)),
       NegSubInlineConstV216:$src1),
  (V_PK_SUB_U16 $src0_modifiers, $src0,
                SRCMODS.OP_SEL_1, NegSubInlineConstV216:$src1,
                $clamp)
>;

// Selection patterns shared by the mad_mix and fma_mix families. Each pattern
// matches fma_like on three f32 values produced by VOP3PMadMixMods from f16
// sources, rounded back to f16. Only mixlo_inst and mixhi_inst are selected
// here; mix_inst is not referenced by these patterns.
multiclass MadFmaMixPats<SDPatternOperator fma_like,
                         Instruction mix_inst,
                         Instruction mixlo_inst,
                         Instruction mixhi_inst> {
  // Scalar f16 result: mixlo with an undefined tied input.
  def : GCNPat <
    (f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)),
                            (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)),
                            (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers))))),
    (mixlo_inst $src0_modifiers, $src0,
                $src1_modifiers, $src1,
                $src2_modifiers, $src2,
                DSTCLAMP.NONE,
                (i32 (IMPLICIT_DEF)))
  >;

  // FIXME: Special case handling for mixhi (especially for clamp)
  // because dealing with the write to high half of the register is
  // difficult.

  // Result becomes the second build_vector element; $elt0 is the tied input.
  def : GCNPat <
    (build_vector f16:$elt0,
                  (fpround (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)),
                                     (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)),
                                     (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers))))),
    (v2f16 (mixhi_inst $src0_modifiers, $src0,
                       $src1_modifiers, $src1,
                       $src2_modifiers, $src2,
                       DSTCLAMP.NONE,
                       $elt0))
  >;

  // Same as above with the rounded result wrapped in AMDGPUclamp.
  def : GCNPat <
    (build_vector
      f16:$elt0,
      (AMDGPUclamp (fpround (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)),
                                      (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)),
                                      (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers)))))),
    (v2f16 (mixhi_inst $src0_modifiers, $src0,
                       $src1_modifiers, $src1,
                       $src2_modifiers, $src2,
                       DSTCLAMP.ENABLE,
                       $elt0))
  >;

  // Clamp applied to the whole vector: mixlo produces the first element and
  // feeds the tied input of mixhi, both with clamp enabled.
  def : GCNPat <
    (AMDGPUclamp (build_vector
      (fpround (fma_like (f32 (VOP3PMadMixMods f16:$lo_src0, i32:$lo_src0_modifiers)),
                         (f32 (VOP3PMadMixMods f16:$lo_src1, i32:$lo_src1_modifiers)),
                         (f32 (VOP3PMadMixMods f16:$lo_src2, i32:$lo_src2_modifiers)))),
      (fpround (fma_like (f32 (VOP3PMadMixMods f16:$hi_src0, i32:$hi_src0_modifiers)),
                         (f32 (VOP3PMadMixMods f16:$hi_src1, i32:$hi_src1_modifiers)),
                         (f32 (VOP3PMadMixMods f16:$hi_src2, i32:$hi_src2_modifiers)))))),
    (v2f16 (mixhi_inst $hi_src0_modifiers, $hi_src0,
                       $hi_src1_modifiers, $hi_src1,
                       $hi_src2_modifiers, $hi_src2,
                       DSTCLAMP.ENABLE,
                       (mixlo_inst $lo_src0_modifiers, $lo_src0,
                                   $lo_src1_modifiers, $lo_src1,
                                   $lo_src2_modifiers, $lo_src2,
                                   DSTCLAMP.ENABLE,
                                   (i32 (IMPLICIT_DEF)))))
  >;
}

let SubtargetPredicate = HasMadMixInsts in {
  // These are VOP3a-like opcodes which accept no omod.
  // Size of src arguments (16/32) is controlled by op_sel.
  // For 16-bit src arguments their location (hi/lo) are controlled by op_sel_hi.
  let isCommutable = 1 in {
    def V_MAD_MIX_F32 : VOP3_VOP3PInst<"v_mad_mix_f32", VOP3_Profile<VOP_F32_F16_F16_F16, VOP3_OPSEL>>;

    let FPDPRounding = 1 in {
      // Clamp modifier is applied after conversion to f16.
      def V_MAD_MIXLO_F16 : VOP3_VOP3PInst<"v_mad_mixlo_f16", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, 1>;

      let ClampLo = 0, ClampHi = 1 in {
        def V_MAD_MIXHI_F16 : VOP3_VOP3PInst<"v_mad_mixhi_f16", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, 1>;
      }
    } // End FPDPRounding = 1
  } // End isCommutable = 1

  defm : MadFmaMixPats<fmad, V_MAD_MIX_F32, V_MAD_MIXLO_F16, V_MAD_MIXHI_F16>;
} // End SubtargetPredicate = HasMadMixInsts


// Essentially the same as the mad_mix versions
let SubtargetPredicate = HasFmaMixInsts in {
  let isCommutable = 1 in {
    def V_FMA_MIX_F32 : VOP3_VOP3PInst<"v_fma_mix_f32", VOP3_Profile<VOP_F32_F16_F16_F16, VOP3_OPSEL>>;

    let FPDPRounding = 1 in {
      // Clamp modifier is applied after conversion to f16.
      def V_FMA_MIXLO_F16 : VOP3_VOP3PInst<"v_fma_mixlo_f16", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, 1>;

      let ClampLo = 0, ClampHi = 1 in {
        def V_FMA_MIXHI_F16 : VOP3_VOP3PInst<"v_fma_mixhi_f16", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, 1>;
      }
    } // End FPDPRounding = 1
  } // End isCommutable = 1

  defm : MadFmaMixPats<fma, V_FMA_MIX_F32, V_FMA_MIXLO_F16, V_FMA_MIXHI_F16>;
} // End SubtargetPredicate = HasFmaMixInsts

// Defines patterns that extract a signed 4-bit field starting at bit Idx[0]:
// shift left by Idx[1] (= 28 - Idx[0]), then arithmetic shift right by 28.
// The field at bit 28 has no entry in this list.
foreach Idx = [[0,28],[4,24],[8,20],[12,16],[16,12],[20,8],[24,4]] in
  def ExtractSigned4bit_#Idx[0] : PatFrag<(ops node:$src),
                                          (sra (shl node:$src, (i32 Idx[1])), (i32 28))>;

// Defines code pattern that extracts U(unsigned/signed) 4/8bit from FromBitIndex.
// Extract<FromBitIndex, BitMask, U>: fragment yielding the field of $src that
// starts at bit FromBitIndex. BitMask is 15 for 4-bit fields and 255 for 8-bit
// fields; U = 1 selects the unsigned form, U = 0 the signed one.
class Extract<int FromBitIndex, int BitMask, bit U>: PatFrag<
  (ops node:$src),
  !if (!or (!and (!eq (BitMask, 255), !eq (FromBitIndex, 24)),
            !eq (FromBitIndex, 28)),
       // Last element (bit 24 with an 8-bit mask, or bit 28): a single shift
       // is used, logical when unsigned and arithmetic when signed.
       !if (U,
            (srl node:$src, (i32 FromBitIndex)),
            (sra node:$src, (i32 FromBitIndex))),
       !if (!eq (FromBitIndex, 0),
            // First element: no shift is applied to the source.
            !if (U,
                 (and node:$src, (i32 BitMask)),
                 !if (!eq (BitMask, 15),
                      (!cast<PatFrag>("ExtractSigned4bit_"#FromBitIndex) node:$src),
                      (sext_inreg node:$src, i8))),
            // Middle elements: shift right, then mask (unsigned) or
            // sign-extend (signed).
            !if (U,
                 (and (srl node:$src, (i32 FromBitIndex)), (i32 BitMask)),
                 !if (!eq (BitMask, 15),
                      (!cast<PatFrag>("ExtractSigned4bit_"#FromBitIndex) node:$src),
                      (sext_inreg (srl node:$src, (i32 FromBitIndex)), i8)))))>;


foreach Type = ["I", "U"] in
  foreach Index = 0-3 in {
    // Defines patterns that extract each Index'ed 8bit from a
    // 32bit scalar value; the field starts at bit Index * 8.
    def #Type#Index#"_8bit" : Extract<!shl(Index, 3), 255, !if (!eq (Type, "U"), 1, 0)>;

    // Defines multiplication patterns where the multiplication is happening on each
    // Index'ed 8bit of a 32bit scalar value.

    def Mul#Type#_Elt#Index : PatFrag<
      (ops node:$src0, node:$src1),
      (!cast<HasOneUseBinOp>(!if (!eq (Type, "I"), AMDGPUmul_i24_oneuse, AMDGPUmul_u24_oneuse))
        (!cast<Extract>(#Type#Index#"_8bit") node:$src0),
        (!cast<Extract>(#Type#Index#"_8bit") node:$src1))>;
  }

// Different variants of dot8 patterns cause a huge increase in the compile time.
// Define non-associative/commutative add/mul to prevent permutation in the dot8
// pattern.
// Nodes for ISD::ADD and the AMDGPU 24-bit multiplies declared with only
// SDTIntBinOp (no property list), each with a one-use wrapper.
def NonACAdd : SDNode<"ISD::ADD", SDTIntBinOp>;
def NonACAdd_oneuse : HasOneUseBinOp<NonACAdd>;

def NonACAMDGPUmul_u24 : SDNode<"AMDGPUISD::MUL_U24", SDTIntBinOp>;
def NonACAMDGPUmul_u24_oneuse : HasOneUseBinOp<NonACAMDGPUmul_u24>;

def NonACAMDGPUmul_i24 : SDNode<"AMDGPUISD::MUL_I24", SDTIntBinOp>;
def NonACAMDGPUmul_i24_oneuse : HasOneUseBinOp<NonACAMDGPUmul_i24>;

foreach Type = ["I", "U"] in
  foreach Index = 0-7 in {
    // Defines patterns that extract each Index'ed 4bit from a
    // 32bit scalar value; the field starts at bit Index * 4.
    def #Type#Index#"_4bit" : Extract<!shl(Index, 2), 15, !if (!eq (Type, "U"), 1, 0)>;

    // Defines multiplication patterns where the multiplication is happening on each
    // Index'ed 4bit of a 32bit scalar value.
    def Mul#Type#Index#"_4bit" : PatFrag<
      (ops node:$src0, node:$src1),
      (!cast<HasOneUseBinOp>(!if (!eq (Type, "I"), NonACAMDGPUmul_i24_oneuse, NonACAMDGPUmul_u24_oneuse))
        (!cast<Extract>(#Type#Index#"_4bit") node:$src0),
        (!cast<Extract>(#Type#Index#"_4bit") node:$src1))>;
  }

// Matches (mul_u24(src0 >> 16, src1 >> 16) + src2)
//          + mul_u24(src0 & 65535, src1 & 65535)
// with one-use inner nodes and selects Inst with each source-modifier operand
// set to 8 and clamp set to 0.
// NOTE(review): 8 is presumably SRCMODS.OP_SEL_1 -- confirm.
class UDot2Pat<Instruction Inst> : GCNPat <
  (add (add_oneuse (AMDGPUmul_u24_oneuse (srl i32:$src0, (i32 16)),
                                         (srl i32:$src1, (i32 16))),
                   i32:$src2),
       (AMDGPUmul_u24_oneuse (and i32:$src0, (i32 65535)),
                             (and i32:$src1, (i32 65535)))
   ),
  (Inst (i32 8), $src0, (i32 8), $src1, (i32 8), $src2, (i1 0))> {
  let SubtargetPredicate = !cast<VOP_Pseudo>(Inst).SubtargetPredicate;
}

// Signed counterpart of UDot2Pat: arithmetic shifts for the upper halves and
// sext_inreg from i16 for the lower halves, multiplied with mul_i24.
class SDot2Pat<Instruction Inst> : GCNPat <
  (add (add_oneuse (AMDGPUmul_i24_oneuse (sra i32:$src0, (i32 16)),
                                         (sra i32:$src1, (i32 16))),
                   i32:$src2),
       (AMDGPUmul_i24_oneuse (sext_inreg i32:$src0, i16),
                             (sext_inreg i32:$src1, i16))),
  (Inst (i32 8), $src0, (i32 8), $src1, (i32 8), $src2, (i1 0))> {
  let SubtargetPredicate = !cast<VOP_Pseudo>(Inst).SubtargetPredicate;
}

let IsDOT = 1 in {
  let SubtargetPredicate = HasDot2Insts in {

    def V_DOT2_F32_F16 : VOP3PInst<"v_dot2_f32_f16", VOP3_Profile<VOP_F32_V2F16_V2F16_F32>>;
    def V_DOT2_I32_I16 : VOP3PInst<"v_dot2_i32_i16", VOP3_Profile<VOP_I32_V2I16_V2I16_I32>>;
    def V_DOT2_U32_U16 : VOP3PInst<"v_dot2_u32_u16", VOP3_Profile<VOP_I32_V2I16_V2I16_I32>>;
    def V_DOT4_U32_U8 : VOP3PInst<"v_dot4_u32_u8", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>>;
    def V_DOT8_U32_U4 : VOP3PInst<"v_dot8_u32_u4", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>>;

  } // End SubtargetPredicate = HasDot2Insts

  let SubtargetPredicate = HasDot1Insts in {

    def V_DOT4_I32_I8 : VOP3PInst<"v_dot4_i32_i8", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>>;
    def V_DOT8_I32_I4 : VOP3PInst<"v_dot8_i32_i4", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>>;

  } // End SubtargetPredicate = HasDot1Insts
} // End let IsDOT = 1

// Maps a dot operator taking (src0, src1, src2, clamp) directly onto dot_inst.
// Value types and the subtarget predicate are read from the instruction.
multiclass DotPats<SDPatternOperator dot_op,
                   VOP3PInst dot_inst> {
  let SubtargetPredicate = dot_inst.SubtargetPredicate in
  def : GCNPat <
    (dot_op (dot_inst.Pfl.Src0VT (VOP3PMods0 dot_inst.Pfl.Src0VT:$src0, i32:$src0_modifiers)),
            (dot_inst.Pfl.Src1VT (VOP3PMods dot_inst.Pfl.Src1VT:$src1, i32:$src1_modifiers)),
            (dot_inst.Pfl.Src2VT (VOP3PMods dot_inst.Pfl.Src2VT:$src2, i32:$src2_modifiers)),
            i1:$clamp),
    (dot_inst $src0_modifiers, $src0,
              $src1_modifiers, $src1,
              $src2_modifiers, $src2,
              (as_i1imm $clamp))>;
}

defm : DotPats<AMDGPUfdot2, V_DOT2_F32_F16>;
defm : DotPats<int_amdgcn_sdot2, V_DOT2_I32_I16>;
defm : DotPats<int_amdgcn_udot2, V_DOT2_U32_U16>;
defm : DotPats<int_amdgcn_sdot4, V_DOT4_I32_I8>;
defm : DotPats<int_amdgcn_udot4, V_DOT4_U32_U8>;
defm : DotPats<int_amdgcn_sdot8, V_DOT8_I32_I4>;
defm : DotPats<int_amdgcn_udot8, V_DOT8_U32_U4>;

def : UDot2Pat<V_DOT2_U32_U16>;
def : SDot2Pat<V_DOT2_I32_I16>;

// dot4: fold the four per-byte products onto $src2 with one-use adds,
// element 0 innermost.
foreach Type = ["U", "I"] in
  let SubtargetPredicate = !cast<VOP_Pseudo>("V_DOT4_"#Type#"32_"#Type#8).SubtargetPredicate in
  def : GCNPat <
    !cast<dag>(!foldl((i32 i32:$src2),
                      [0, 1, 2, 3], lhs, y,
                      (add_oneuse lhs, (!cast<PatFrag>("Mul"#Type#"_Elt"#y) i32:$src0, i32:$src1)))),
    (!cast<VOP3PInst>("V_DOT4_"#Type#"32_"#Type#8) (i32 8), $src0, (i32 8), $src1, (i32 8), $src2, (i1 0))>;

// dot8: the seed is src2 + element 0 using add_oneuse; elements 1..7 are then
// accumulated in that fixed order with NonACAdd_oneuse.
foreach Type = ["U", "I"] in
  let SubtargetPredicate = !cast<VOP_Pseudo>("V_DOT8_"#Type#"32_"#Type#4).SubtargetPredicate in
  def : GCNPat <
    !cast<dag>(!foldl((add_oneuse i32:$src2, (!cast<PatFrag>("Mul"#Type#"0_4bit") i32:$src0, i32:$src1)),
                      [1, 2, 3, 4, 5, 6, 7], lhs, y,
                      (NonACAdd_oneuse lhs, (!cast<PatFrag>("Mul"#Type#y#"_4bit") i32:$src0, i32:$src1)))),
    (!cast<VOP3PInst>("V_DOT8_"#Type#"32_"#Type#4) (i32 8), $src0, (i32 8), $src1, (i32 8), $src2, (i1 0))>;

// Different variants of dot8 code-gen dag patterns are not generated through table-gen due to a huge increase
// in the compile time. Directly handle the pattern generated by the FE here.
// This variant accumulates element 7 first, then elements 1..6.
foreach Type = ["U", "I"] in
  let SubtargetPredicate = !cast<VOP_Pseudo>("V_DOT8_"#Type#"32_"#Type#4).SubtargetPredicate in
  def : GCNPat <
    !cast<dag>(!foldl((add_oneuse i32:$src2, (!cast<PatFrag>("Mul"#Type#"0_4bit") i32:$src0, i32:$src1)),
                      [7, 1, 2, 3, 4, 5, 6], lhs, y,
                      (NonACAdd_oneuse lhs, (!cast<PatFrag>("Mul"#Type#y#"_4bit") i32:$src0, i32:$src1)))),
    (!cast<VOP3PInst>("V_DOT8_"#Type#"32_"#Type#4) (i32 8), $src0, (i32 8), $src1, (i32 8), $src2, (i1 0))>;

// Destination operands over the AGPR_32 / AReg_* register classes.
def ADst_32 : VOPDstOperand<AGPR_32>;
def ADst_128 : VOPDstOperand<AReg_128>;
def ADst_512 : VOPDstOperand<AReg_512>;
def ADst_1024 : VOPDstOperand<AReg_1024>;

// Profile for v_accvgpr_read_b32: src0 uses ARegSrc_32.
def VOPProfileAccRead : VOP3_Profile<VOP_I32_I32, VOP3_MAI> {
  let Src0RC64 = ARegSrc_32;
}

// Profile for v_accvgpr_write_b32: destination is ADst_32, src0 uses VISrc_b32.
def VOPProfileAccWrite : VOP3_Profile<VOP_I32_I32, VOP3_MAI> {
  let DstRC = ADst_32;
  let Src0RC64 = VISrc_b32;
}

// Profile for the MFMA instructions.
//   _SrcRC  - operand class of src2
//   _DstRC  - operand class of the destination
//   SrcABRC - operand class shared by src0 and src1
// Op_sel, clamp and source modifiers are disabled; the instruction takes the
// extra cbsz, abid and blgp operands instead.
class VOPProfileMAI<VOPProfile P, RegisterOperand _SrcRC, RegisterOperand _DstRC,
                    RegisterOperand SrcABRC = AVSrc_32>
  : VOP3_Profile<P, VOP3_MAI> {
  let DstRC = _DstRC;
  let Src0RC64 = SrcABRC;
  let Src1RC64 = SrcABRC;
  let Src2RC64 = _SrcRC;
  let HasOpSel = 0;
  let HasClamp = 0;
  let HasModifiers = 0;
  let Asm64 = " $vdst, $src0, $src1, $src2$cbsz$abid$blgp";
  let Ins64 = (ins Src0RC64:$src0, Src1RC64:$src1, Src2RC64:$src2,
                   cbsz:$cbsz, abid:$abid, blgp:$blgp);
}

def VOPProfileMAI_F32_F32_X4 : VOPProfileMAI<VOP_V4F32_F32_F32_V4F32, AISrc_128_f32, ADst_128>;
def VOPProfileMAI_F32_F32_X16 : VOPProfileMAI<VOP_V16F32_F32_F32_V16F32, AISrc_512_f32, ADst_512>;
def VOPProfileMAI_F32_F32_X32 : VOPProfileMAI<VOP_V32F32_F32_F32_V32F32, AISrc_1024_f32, ADst_1024>;
def VOPProfileMAI_I32_I32_X4 : VOPProfileMAI<VOP_V4I32_I32_I32_V4I32, AISrc_128_b32, ADst_128>;
def VOPProfileMAI_I32_I32_X16 : VOPProfileMAI<VOP_V16I32_I32_I32_V16I32, AISrc_512_b32, ADst_512>;
def VOPProfileMAI_I32_I32_X32 : VOPProfileMAI<VOP_V32I32_I32_I32_V32I32, AISrc_1024_b32, ADst_1024>;
def VOPProfileMAI_F32_V2I16_X4 : VOPProfileMAI<VOP_V4F32_V2I16_V2I16_V4F32, AISrc_128_b32, ADst_128>;
def VOPProfileMAI_F32_V2I16_X16 : VOPProfileMAI<VOP_V16F32_V2I16_V2I16_V16F32, AISrc_512_b32, ADst_512>;
def VOPProfileMAI_F32_V2I16_X32 : VOPProfileMAI<VOP_V32F32_V2I16_V2I16_V32F32, AISrc_1024_b32, ADst_1024>;
def VOPProfileMAI_F32_V4F16_X4 : VOPProfileMAI<VOP_V4F32_V4F16_V4F16_V4F32, AISrc_128_b32, ADst_128, AVSrc_64>;
def VOPProfileMAI_F32_V4F16_X16 : VOPProfileMAI<VOP_V16F32_V4F16_V4F16_V16F32, AISrc_512_b32, ADst_512, AVSrc_64>;
def VOPProfileMAI_F32_V4F16_X32 : VOPProfileMAI<VOP_V32F32_V4F16_V4F16_V32F32, AISrc_1024_b32, ADst_1024, AVSrc_64>;

let Predicates = [HasMAIInsts] in {
  def V_ACCVGPR_READ_B32 : VOP3Inst<"v_accvgpr_read_b32", VOPProfileAccRead>;
  def V_ACCVGPR_WRITE_B32 : VOP3Inst<"v_accvgpr_write_b32", VOPProfileAccWrite> {
    let isMoveImm = 1;
  }

  let isConvergent = 1 in {
    def V_MFMA_F32_4X4X1F32 : VOP3Inst<"v_mfma_f32_4x4x1f32",
                                       VOPProfileMAI_F32_F32_X4, int_amdgcn_mfma_f32_4x4x1f32>;
    def V_MFMA_F32_4X4X4F16 : VOP3Inst<"v_mfma_f32_4x4x4f16",
                                       VOPProfileMAI_F32_V4F16_X4, int_amdgcn_mfma_f32_4x4x4f16>;
    def V_MFMA_I32_4X4X4I8 : VOP3Inst<"v_mfma_i32_4x4x4i8",
                                      VOPProfileMAI_I32_I32_X4, int_amdgcn_mfma_i32_4x4x4i8>;
    def V_MFMA_F32_4X4X2BF16 : VOP3Inst<"v_mfma_f32_4x4x2bf16",
                                        VOPProfileMAI_F32_V2I16_X4, int_amdgcn_mfma_f32_4x4x2bf16>;
    def V_MFMA_F32_16X16X1F32 : VOP3Inst<"v_mfma_f32_16x16x1f32",
                                         VOPProfileMAI_F32_F32_X16, int_amdgcn_mfma_f32_16x16x1f32>;
    def V_MFMA_F32_16X16X4F32 : VOP3Inst<"v_mfma_f32_16x16x4f32",
                                         VOPProfileMAI_F32_F32_X4, int_amdgcn_mfma_f32_16x16x4f32>;
    def V_MFMA_F32_16X16X4F16 : VOP3Inst<"v_mfma_f32_16x16x4f16",
                                         VOPProfileMAI_F32_V4F16_X16, int_amdgcn_mfma_f32_16x16x4f16>;
    def V_MFMA_F32_16X16X16F16 : VOP3Inst<"v_mfma_f32_16x16x16f16",
                                          VOPProfileMAI_F32_V4F16_X4, int_amdgcn_mfma_f32_16x16x16f16>;
    def V_MFMA_I32_16X16X4I8 : VOP3Inst<"v_mfma_i32_16x16x4i8",
                                        VOPProfileMAI_I32_I32_X16, int_amdgcn_mfma_i32_16x16x4i8>;
    def V_MFMA_I32_16X16X16I8 : VOP3Inst<"v_mfma_i32_16x16x16i8",
                                         VOPProfileMAI_I32_I32_X4, int_amdgcn_mfma_i32_16x16x16i8>;
    def V_MFMA_F32_16X16X2BF16 : VOP3Inst<"v_mfma_f32_16x16x2bf16",
                                          VOPProfileMAI_F32_V2I16_X16, int_amdgcn_mfma_f32_16x16x2bf16>;
    def V_MFMA_F32_16X16X8BF16 : VOP3Inst<"v_mfma_f32_16x16x8bf16",
                                          VOPProfileMAI_F32_V2I16_X4, int_amdgcn_mfma_f32_16x16x8bf16>;
    def V_MFMA_F32_32X32X1F32 : VOP3Inst<"v_mfma_f32_32x32x1f32",
                                         VOPProfileMAI_F32_F32_X32, int_amdgcn_mfma_f32_32x32x1f32>;
    def V_MFMA_F32_32X32X2F32 : VOP3Inst<"v_mfma_f32_32x32x2f32",
                                         VOPProfileMAI_F32_F32_X16, int_amdgcn_mfma_f32_32x32x2f32>;
    def V_MFMA_F32_32X32X4F16 : VOP3Inst<"v_mfma_f32_32x32x4f16",
                                         VOPProfileMAI_F32_V4F16_X32, int_amdgcn_mfma_f32_32x32x4f16>;
    def V_MFMA_F32_32X32X8F16 : VOP3Inst<"v_mfma_f32_32x32x8f16",
                                         VOPProfileMAI_F32_V4F16_X16, int_amdgcn_mfma_f32_32x32x8f16>;
    def V_MFMA_I32_32X32X4I8 : VOP3Inst<"v_mfma_i32_32x32x4i8",
                                        VOPProfileMAI_I32_I32_X32, int_amdgcn_mfma_i32_32x32x4i8>;
    def V_MFMA_I32_32X32X8I8 : VOP3Inst<"v_mfma_i32_32x32x8i8",
                                        VOPProfileMAI_I32_I32_X16, int_amdgcn_mfma_i32_32x32x8i8>;
    def V_MFMA_F32_32X32X2BF16 : VOP3Inst<"v_mfma_f32_32x32x2bf16",
                                          VOPProfileMAI_F32_V2I16_X32, int_amdgcn_mfma_f32_32x32x2bf16>;
    def V_MFMA_F32_32X32X4BF16 : VOP3Inst<"v_mfma_f32_32x32x4bf16",
                                          VOPProfileMAI_F32_V2I16_X16, int_amdgcn_mfma_f32_32x32x4bf16>;
  } // End isConvergent = 1

} // End Predicates = [HasMAIInsts]

def : MnemonicAlias<"v_accvgpr_read", "v_accvgpr_read_b32">;
def : MnemonicAlias<"v_accvgpr_write", "v_accvgpr_write_b32">;

// Real instruction in the VI encoding family for the pseudo named NAME,
// using the VOP3Pe encoding with opcode op.
multiclass VOP3P_Real_vi<bits<10> op> {
  def _vi : VOP3P_Real<!cast<VOP3_Pseudo>(NAME), SIEncodingFamily.VI>,
            VOP3Pe <op, !cast<VOP3_Pseudo>(NAME).Pfl> {
    let AssemblerPredicates = [HasVOP3PInsts];
    let DecoderNamespace = "GFX8";
  }
}

// Same as VOP3P_Real_vi but with the VOP3Pe_MAI encoding and the HasMAIInsts
// assembler predicate.
multiclass VOP3P_Real_MAI<bits<10> op> {
  def _vi : VOP3P_Real<!cast<VOP3_Pseudo>(NAME), SIEncodingFamily.VI>,
            VOP3Pe_MAI <op, !cast<VOP3_Pseudo>(NAME).Pfl> {
    let AssemblerPredicates = [HasMAIInsts];
    let DecoderNamespace = "GFX8";
  }
}

defm V_PK_MAD_I16     : VOP3P_Real_vi <0x380>;
defm V_PK_MUL_LO_U16  : VOP3P_Real_vi <0x381>;
defm V_PK_ADD_I16     : VOP3P_Real_vi <0x382>;
defm V_PK_SUB_I16     : VOP3P_Real_vi <0x383>;
defm V_PK_LSHLREV_B16 : VOP3P_Real_vi <0x384>;
defm V_PK_LSHRREV_B16 : VOP3P_Real_vi <0x385>;
defm V_PK_ASHRREV_I16 : VOP3P_Real_vi <0x386>;
defm V_PK_MAX_I16     : VOP3P_Real_vi <0x387>;
defm V_PK_MIN_I16     : VOP3P_Real_vi <0x388>;
defm V_PK_MAD_U16     : VOP3P_Real_vi <0x389>;

defm V_PK_ADD_U16     : VOP3P_Real_vi <0x38a>;
defm V_PK_SUB_U16     : VOP3P_Real_vi <0x38b>;
defm V_PK_MAX_U16     : VOP3P_Real_vi <0x38c>;
defm V_PK_MIN_U16     : VOP3P_Real_vi <0x38d>;
defm V_PK_FMA_F16     : VOP3P_Real_vi <0x38e>;
defm V_PK_ADD_F16     : VOP3P_Real_vi <0x38f>;
defm V_PK_MUL_F16     : VOP3P_Real_vi <0x390>;
defm V_PK_MIN_F16     : VOP3P_Real_vi <0x391>;
defm V_PK_MAX_F16     : VOP3P_Real_vi <0x392>;


let SubtargetPredicate = HasMadMixInsts in {
  defm V_MAD_MIX_F32   : VOP3P_Real_vi <0x3a0>;
  defm V_MAD_MIXLO_F16 : VOP3P_Real_vi <0x3a1>;
  defm V_MAD_MIXHI_F16 : VOP3P_Real_vi <0x3a2>;
} // End SubtargetPredicate = HasMadMixInsts

let SubtargetPredicate = HasFmaMixInsts in {
  let DecoderNamespace = "GFX9_DL" in {
    // The mad_mix instructions were renamed and their behaviors changed,
    // but the opcode stayed the same so we need to put these in a
    // different DecoderNamespace to avoid the ambiguity.
    defm V_FMA_MIX_F32   : VOP3P_Real_vi <0x3a0>;
    defm V_FMA_MIXLO_F16 : VOP3P_Real_vi <0x3a1>;
    defm V_FMA_MIXHI_F16 : VOP3P_Real_vi <0x3a2>;
  } // End DecoderNamespace = "GFX9_DL"
} // End SubtargetPredicate = HasFmaMixInsts


let SubtargetPredicate = HasDot2Insts in {

  defm V_DOT2_F32_F16 : VOP3P_Real_vi <0x3a3>;
  defm V_DOT2_I32_I16 : VOP3P_Real_vi <0x3a6>;
  defm V_DOT2_U32_U16 : VOP3P_Real_vi <0x3a7>;
  defm V_DOT4_U32_U8  : VOP3P_Real_vi <0x3a9>;
  defm V_DOT8_U32_U4  : VOP3P_Real_vi <0x3ab>;

} // End SubtargetPredicate = HasDot2Insts

let SubtargetPredicate = HasDot1Insts in {

  defm V_DOT4_I32_I8 : VOP3P_Real_vi <0x3a8>;
  defm V_DOT8_I32_I4 : VOP3P_Real_vi <0x3aa>;

} // End SubtargetPredicate = HasDot1Insts

let SubtargetPredicate = HasMAIInsts in {

  defm V_ACCVGPR_READ_B32     : VOP3P_Real_MAI <0x3d8>;
  defm V_ACCVGPR_WRITE_B32    : VOP3P_Real_MAI <0x3d9>;
  defm V_MFMA_F32_32X32X1F32  : VOP3P_Real_MAI <0x3c0>;
  defm V_MFMA_F32_16X16X1F32  : VOP3P_Real_MAI <0x3c1>;
  defm V_MFMA_F32_4X4X1F32    : VOP3P_Real_MAI <0x3c2>;
  defm V_MFMA_F32_32X32X2F32  : VOP3P_Real_MAI <0x3c4>;
  defm V_MFMA_F32_16X16X4F32  : VOP3P_Real_MAI <0x3c5>;
  defm V_MFMA_F32_32X32X4F16  : VOP3P_Real_MAI <0x3c8>;
  defm V_MFMA_F32_16X16X4F16  : VOP3P_Real_MAI <0x3c9>;
  defm V_MFMA_F32_4X4X4F16    : VOP3P_Real_MAI <0x3ca>;
  defm V_MFMA_F32_32X32X8F16  : VOP3P_Real_MAI <0x3cc>;
  defm V_MFMA_F32_16X16X16F16 : VOP3P_Real_MAI <0x3cd>;
  defm V_MFMA_I32_32X32X4I8   : VOP3P_Real_MAI <0x3d0>;
  defm V_MFMA_I32_16X16X4I8   : VOP3P_Real_MAI <0x3d1>;
  defm V_MFMA_I32_4X4X4I8     : VOP3P_Real_MAI <0x3d2>;
  defm V_MFMA_I32_32X32X8I8   : VOP3P_Real_MAI <0x3d4>;
  defm V_MFMA_I32_16X16X16I8  : VOP3P_Real_MAI <0x3d5>;
  defm V_MFMA_F32_32X32X2BF16 : VOP3P_Real_MAI <0x3e8>;
  defm V_MFMA_F32_16X16X2BF16 : VOP3P_Real_MAI <0x3e9>;
  defm V_MFMA_F32_4X4X2BF16   : VOP3P_Real_MAI <0x3eb>;
  defm V_MFMA_F32_32X32X4BF16 : VOP3P_Real_MAI <0x3ec>;
  defm V_MFMA_F32_16X16X8BF16 : VOP3P_Real_MAI <0x3ed>;

} // End SubtargetPredicate = HasMAIInsts

//===----------------------------------------------------------------------===//
// GFX10.
//===----------------------------------------------------------------------===//

let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in {
  // Real instruction in the GFX10 encoding family for the pseudo named NAME,
  // using the VOP3Pe_gfx10 encoding with opcode op.
  multiclass VOP3P_Real_gfx10<bits<10> op> {
    def _gfx10 : VOP3P_Real<!cast<VOP3P_Pseudo>(NAME), SIEncodingFamily.GFX10>,
                 VOP3Pe_gfx10 <op, !cast<VOP3P_Pseudo>(NAME).Pfl>;
  }
} // End AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10"

defm V_PK_MAD_I16     : VOP3P_Real_gfx10<0x000>;
defm V_PK_MUL_LO_U16  : VOP3P_Real_gfx10<0x001>;
defm V_PK_ADD_I16     : VOP3P_Real_gfx10<0x002>;
defm V_PK_SUB_I16     : VOP3P_Real_gfx10<0x003>;
defm V_PK_LSHLREV_B16 : VOP3P_Real_gfx10<0x004>;
defm V_PK_LSHRREV_B16 : VOP3P_Real_gfx10<0x005>;
defm V_PK_ASHRREV_I16 : VOP3P_Real_gfx10<0x006>;
defm V_PK_MAX_I16     : VOP3P_Real_gfx10<0x007>;
defm V_PK_MIN_I16     : VOP3P_Real_gfx10<0x008>;
defm V_PK_MAD_U16     : VOP3P_Real_gfx10<0x009>;
defm V_PK_ADD_U16     : VOP3P_Real_gfx10<0x00a>;
defm V_PK_SUB_U16     : VOP3P_Real_gfx10<0x00b>;
defm V_PK_MAX_U16     : VOP3P_Real_gfx10<0x00c>;
defm V_PK_MIN_U16     : VOP3P_Real_gfx10<0x00d>;
defm V_PK_FMA_F16     : VOP3P_Real_gfx10<0x00e>;
defm V_PK_ADD_F16     : VOP3P_Real_gfx10<0x00f>;
defm V_PK_MUL_F16     : VOP3P_Real_gfx10<0x010>;
defm V_PK_MIN_F16     : VOP3P_Real_gfx10<0x011>;
defm V_PK_MAX_F16     : VOP3P_Real_gfx10<0x012>;
defm V_FMA_MIX_F32    : VOP3P_Real_gfx10<0x020>;
defm V_FMA_MIXLO_F16  : VOP3P_Real_gfx10<0x021>;
defm V_FMA_MIXHI_F16  : VOP3P_Real_gfx10<0x022>;

let SubtargetPredicate = HasDot2Insts in {

  defm V_DOT2_F32_F16 : VOP3P_Real_gfx10 <0x013>;
  defm V_DOT2_I32_I16 : VOP3P_Real_gfx10 <0x014>;
  defm V_DOT2_U32_U16 : VOP3P_Real_gfx10 <0x015>;
  defm V_DOT4_U32_U8  : VOP3P_Real_gfx10 <0x017>;
  defm V_DOT8_U32_U4  : VOP3P_Real_gfx10 <0x019>;

} // End SubtargetPredicate = HasDot2Insts

let SubtargetPredicate = HasDot1Insts in {

  defm V_DOT4_I32_I8 : VOP3P_Real_gfx10 <0x016>;
  defm V_DOT8_I32_I4 : VOP3P_Real_gfx10 <0x018>;

} // End SubtargetPredicate = HasDot1Insts