//===-- SIInstructions.td - SI Instruction Definitions --------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// This file was originally auto-generated from a GPU register header file and
// all the instruction definitions were originally commented out. Instructions
// that are not yet supported remain commented out.
//===----------------------------------------------------------------------===//

class GCNPat<dag pattern, dag result> : Pat<pattern, result>, PredicateControl;

class UniformSextInreg<ValueType VT> : PatFrag<
  (ops node:$src),
  (sext_inreg $src, VT),
  [{ return !N->isDivergent(); }]>;

class DivergentSextInreg<ValueType VT> : PatFrag<
  (ops node:$src),
  (sext_inreg $src, VT),
  [{ return N->isDivergent(); }]>;

include "SOPInstructions.td"
include "VOPInstructions.td"
include "SMInstructions.td"
include "FLATInstructions.td"
include "BUFInstructions.td"
include "EXPInstructions.td"
include "DSDIRInstructions.td"
include "VINTERPInstructions.td"

//===----------------------------------------------------------------------===//
// VINTRP Instructions
//===----------------------------------------------------------------------===//

// Used to inject printing of "_e32" suffix for VI (there are "_e64" variants for VI)
def VINTRPDst : VINTRPDstOperand <VGPR_32>;

let Uses = [MODE, M0, EXEC] in {

// FIXME: Specify SchedRW for VINTRP instructions.

multiclass V_INTERP_P1_F32_m : VINTRP_m <
  0x00000000,
  (outs VINTRPDst:$vdst),
  (ins VGPR_32:$vsrc, InterpAttr:$attr, InterpAttrChan:$attrchan),
  "v_interp_p1_f32$vdst, $vsrc, $attr$attrchan",
  [(set f32:$vdst, (int_amdgcn_interp_p1 f32:$vsrc,
                   (i32 timm:$attrchan), (i32 timm:$attr), M0))]
>;

let OtherPredicates = [has32BankLDS, isNotGFX90APlus] in {

defm V_INTERP_P1_F32 : V_INTERP_P1_F32_m;

} // End OtherPredicates = [has32BankLDS, isNotGFX90APlus]

let OtherPredicates = [has16BankLDS, isNotGFX90APlus],
    Constraints = "@earlyclobber $vdst", isAsmParserOnly=1 in {

defm V_INTERP_P1_F32_16bank : V_INTERP_P1_F32_m;

} // End OtherPredicates = [has16BankLDS, isNotGFX90APlus],
  //     Constraints = "@earlyclobber $vdst", isAsmParserOnly=1

let OtherPredicates = [isNotGFX90APlus] in {
let DisableEncoding = "$src0", Constraints = "$src0 = $vdst" in {

defm V_INTERP_P2_F32 : VINTRP_m <
  0x00000001,
  (outs VINTRPDst:$vdst),
  (ins VGPR_32:$src0, VGPR_32:$vsrc, InterpAttr:$attr,
       InterpAttrChan:$attrchan),
  "v_interp_p2_f32$vdst, $vsrc, $attr$attrchan",
  [(set f32:$vdst, (int_amdgcn_interp_p2 f32:$src0, f32:$vsrc,
                   (i32 timm:$attrchan), (i32 timm:$attr), M0))]>;

} // End DisableEncoding = "$src0", Constraints = "$src0 = $vdst"

defm V_INTERP_MOV_F32 : VINTRP_m <
  0x00000002,
  (outs VINTRPDst:$vdst),
  (ins InterpSlot:$vsrc, InterpAttr:$attr, InterpAttrChan:$attrchan),
  "v_interp_mov_f32$vdst, $vsrc, $attr$attrchan",
  [(set f32:$vdst, (int_amdgcn_interp_mov (i32 timm:$vsrc),
                   (i32 timm:$attrchan), (i32 timm:$attr), M0))]>;

} // End OtherPredicates = [isNotGFX90APlus]

} // End Uses = [MODE, M0, EXEC]

//===----------------------------------------------------------------------===//
// Pseudo Instructions
//===----------------------------------------------------------------------===//

// Insert a branch to an endpgm block to use as a fallback trap.
def ENDPGM_TRAP : SPseudoInstSI<
  (outs), (ins),
  [(AMDGPUendpgm_trap)],
  "ENDPGM_TRAP"> {
  let hasSideEffects = 1;
  let usesCustomInserter = 1;
}

def SIMULATED_TRAP : SPseudoInstSI<(outs), (ins), [(AMDGPUsimulated_trap)],
                                   "SIMULATED_TRAP"> {
  let hasSideEffects = 1;
  let usesCustomInserter = 1;
}

def ATOMIC_FENCE : SPseudoInstSI<
  (outs), (ins i32imm:$ordering, i32imm:$scope),
  [(atomic_fence (i32 timm:$ordering), (i32 timm:$scope))],
  "ATOMIC_FENCE $ordering, $scope"> {
  let hasSideEffects = 1;
}

let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] in {

// For use in patterns
def V_CNDMASK_B64_PSEUDO : VOP3Common <(outs VReg_64:$vdst),
  (ins VSrc_b64:$src0, VSrc_b64:$src1, SSrc_b64:$src2), "", []> {
  let isPseudo = 1;
  let isCodeGenOnly = 1;
  let usesCustomInserter = 1;
}

// 64-bit vector move instruction. This is mainly used by the
// SIFoldOperands pass to enable folding of inline immediates.
def V_MOV_B64_PSEUDO : VPseudoInstSI <(outs VReg_64:$vdst),
                                      (ins VSrc_b64:$src0)> {
  let isReMaterializable = 1;
  let isAsCheapAsAMove = 1;
  let isMoveImm = 1;
  let SchedRW = [Write64Bit];
  let Size = 4;
  let UseNamedOperandTable = 1;
}

// 64-bit vector move with dpp. Expanded post-RA.
def V_MOV_B64_DPP_PSEUDO : VOP_DPP_Pseudo <"v_mov_b64_dpp", VOP_I64_I64> {
  let Size = 16; // Requires two 8-byte v_mov_b32_dpp to complete.
}

// 64-bit scalar move immediate instruction. This is used to avoid subreg
// initialization and allow rematerialization.
def S_MOV_B64_IMM_PSEUDO : SPseudoInstSI <(outs SReg_64:$sdst),
                                          (ins i64imm:$src0)> {
  let isReMaterializable = 1;
  let isAsCheapAsAMove = 1;
  let isMoveImm = 1;
  let SchedRW = [WriteSALU, Write64Bit];
  let Size = 4;
  let Uses = [];
  let UseNamedOperandTable = 1;
}

// Pseudoinstruction for @llvm.amdgcn.wqm. It is turned into a copy after the
// WQM pass processes it.
def WQM : PseudoInstSI <(outs unknown:$vdst), (ins unknown:$src0)>;

// Pseudoinstruction for @llvm.amdgcn.softwqm. Like @llvm.amdgcn.wqm it is
// turned into a copy by the WQM pass, but does not seed WQM requirements.
def SOFT_WQM : PseudoInstSI <(outs unknown:$vdst), (ins unknown:$src0)>;

// Pseudoinstruction for @llvm.amdgcn.strict.wwm. It is turned into a copy post-RA, so
// that the @earlyclobber is respected. The @earlyclobber is to make sure that
// the instruction that defines $src0 (which is run in Whole Wave Mode) doesn't
// accidentally clobber inactive channels of $vdst.
let Constraints = "@earlyclobber $vdst" in {
def STRICT_WWM : PseudoInstSI <(outs unknown:$vdst), (ins unknown:$src0)>;
def STRICT_WQM : PseudoInstSI <(outs unknown:$vdst), (ins unknown:$src0)>;
}

} // End let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC]

def WWM_COPY : SPseudoInstSI <
  (outs unknown:$dst), (ins unknown:$src)> {
  let hasSideEffects = 0;
  let isAsCheapAsAMove = 1;
  let isConvergent = 1;
}

def ENTER_STRICT_WWM : SPseudoInstSI <(outs SReg_1:$sdst), (ins i64imm:$src0)> {
  let Uses = [EXEC];
  let Defs = [EXEC, SCC];
  let hasSideEffects = 0;
  let mayLoad = 0;
  let mayStore = 0;
}

def EXIT_STRICT_WWM : SPseudoInstSI <(outs SReg_1:$sdst), (ins SReg_1:$src0)> {
  let hasSideEffects = 0;
  let mayLoad = 0;
  let mayStore = 0;
}

def ENTER_STRICT_WQM : SPseudoInstSI <(outs SReg_1:$sdst), (ins i64imm:$src0)> {
  let Uses = [EXEC];
  let Defs = [EXEC, SCC];
  let hasSideEffects = 0;
  let mayLoad = 0;
  let mayStore = 0;
}

def EXIT_STRICT_WQM : SPseudoInstSI <(outs SReg_1:$sdst), (ins SReg_1:$src0)> {
  let hasSideEffects = 0;
  let mayLoad = 0;
  let mayStore = 0;
}

let usesCustomInserter = 1 in {
let WaveSizePredicate = isWave32 in
def S_INVERSE_BALLOT_U32 : SPseudoInstSI<
  (outs SReg_32:$sdst), (ins SSrc_b32:$mask),
  [(set i1:$sdst, (int_amdgcn_inverse_ballot i32:$mask))]
>;

let WaveSizePredicate = isWave64 in
def S_INVERSE_BALLOT_U64 : SPseudoInstSI<
  (outs SReg_64:$sdst), (ins SSrc_b64:$mask),
  [(set i1:$sdst, (int_amdgcn_inverse_ballot i64:$mask))]
>;
} // End usesCustomInserter = 1

// Pseudo instructions used for @llvm.fptrunc.round upward
// and @llvm.fptrunc.round downward.
// These intrinsics will be legalized to G_FPTRUNC_ROUND_UPWARD
// and G_FPTRUNC_ROUND_DOWNWARD before being lowered to
// FPTRUNC_UPWARD_PSEUDO and FPTRUNC_DOWNWARD_PSEUDO.
// The final codegen is done in the ModeRegister pass.
let Uses = [MODE, EXEC] in {
def FPTRUNC_UPWARD_PSEUDO : VPseudoInstSI <(outs VGPR_32:$vdst),
  (ins VGPR_32:$src0),
  [(set f16:$vdst, (SIfptrunc_round_upward f32:$src0))]>;

def FPTRUNC_DOWNWARD_PSEUDO : VPseudoInstSI <(outs VGPR_32:$vdst),
  (ins VGPR_32:$src0),
  [(set f16:$vdst, (SIfptrunc_round_downward f32:$src0))]>;
} // End Uses = [MODE, EXEC]

// Invert the exec mask and overwrite the inactive lanes of dst with the
// inactive operand, restoring the exec mask after we're done.
let Defs = [SCC], isConvergent = 1 in {
def V_SET_INACTIVE_B32 : VPseudoInstSI <(outs VGPR_32:$vdst),
  (ins VSrc_b32: $src, VSrc_b32:$inactive), []>;

def V_SET_INACTIVE_B64 : VPseudoInstSI <(outs VReg_64:$vdst),
  (ins VSrc_b64: $src, VSrc_b64:$inactive), []>;
} // End Defs = [SCC], isConvergent = 1

foreach vt = Reg32Types.types in {
def : GCNPat <(vt (int_amdgcn_set_inactive vt:$src, vt:$inactive)),
     (V_SET_INACTIVE_B32 VSrc_b32:$src, VSrc_b32:$inactive)>;
}

foreach vt = Reg64Types.types in {
def : GCNPat <(vt (int_amdgcn_set_inactive vt:$src, vt:$inactive)),
     (V_SET_INACTIVE_B64 VSrc_b64:$src, VSrc_b64:$inactive)>;
}

def : GCNPat<(i32 (int_amdgcn_set_inactive_chain_arg i32:$src, i32:$inactive)),
    (V_SET_INACTIVE_B32 VGPR_32:$src, VGPR_32:$inactive)>;

def : GCNPat<(i64 (int_amdgcn_set_inactive_chain_arg i64:$src, i64:$inactive)),
    (V_SET_INACTIVE_B64 VReg_64:$src, VReg_64:$inactive)>;

let usesCustomInserter = 1, hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] in {
  def WAVE_REDUCE_UMIN_PSEUDO_U32 : VPseudoInstSI <(outs SGPR_32:$sdst),
    (ins VSrc_b32: $src, VSrc_b32:$strategy),
    [(set i32:$sdst, (int_amdgcn_wave_reduce_umin i32:$src, i32:$strategy))]> {
  }

  def WAVE_REDUCE_UMAX_PSEUDO_U32 : VPseudoInstSI <(outs SGPR_32:$sdst),
    (ins VSrc_b32: $src, VSrc_b32:$strategy),
    [(set i32:$sdst, (int_amdgcn_wave_reduce_umax i32:$src, i32:$strategy))]> {
  }
}

let usesCustomInserter = 1, Defs = [VCC] in {
def V_ADD_U64_PSEUDO : VPseudoInstSI <
  (outs VReg_64:$vdst), (ins VSrc_b64:$src0, VSrc_b64:$src1),
  [(set VReg_64:$vdst, (DivergentBinFrag<add> i64:$src0, i64:$src1))]
>;

def V_SUB_U64_PSEUDO : VPseudoInstSI <
  (outs VReg_64:$vdst), (ins VSrc_b64:$src0, VSrc_b64:$src1),
  [(set VReg_64:$vdst, (DivergentBinFrag<sub> i64:$src0, i64:$src1))]
>;
} // End usesCustomInserter = 1, Defs = [VCC]

let usesCustomInserter = 1, Defs = [SCC] in {
def S_ADD_U64_PSEUDO : SPseudoInstSI <
  (outs SReg_64:$sdst), (ins SSrc_b64:$src0, SSrc_b64:$src1),
  [(set SReg_64:$sdst, (UniformBinFrag<add> i64:$src0, i64:$src1))]
>;

def S_SUB_U64_PSEUDO : SPseudoInstSI <
  (outs SReg_64:$sdst), (ins SSrc_b64:$src0, SSrc_b64:$src1),
  [(set SReg_64:$sdst, (UniformBinFrag<sub> i64:$src0, i64:$src1))]
>;

def S_ADD_CO_PSEUDO : SPseudoInstSI <
  (outs SReg_32:$sdst, SSrc_i1:$scc_out), (ins SSrc_b32:$src0, SSrc_b32:$src1, SSrc_i1:$scc_in)
>;

def S_SUB_CO_PSEUDO : SPseudoInstSI <
  (outs SReg_32:$sdst, SSrc_i1:$scc_out), (ins SSrc_b32:$src0, SSrc_b32:$src1, SSrc_i1:$scc_in)
>;

def S_UADDO_PSEUDO : SPseudoInstSI <
  (outs SReg_32:$sdst, SSrc_i1:$scc_out), (ins SSrc_b32:$src0, SSrc_b32:$src1)
>;

def S_USUBO_PSEUDO : SPseudoInstSI <
  (outs SReg_32:$sdst, SSrc_i1:$scc_out), (ins SSrc_b32:$src0, SSrc_b32:$src1)
>;

let OtherPredicates = [HasShaderCyclesHiLoRegisters] in
def GET_SHADERCYCLESHILO : SPseudoInstSI<
  (outs SReg_64:$sdst), (ins),
  [(set SReg_64:$sdst, (i64 (readcyclecounter)))]
>;

} // End usesCustomInserter = 1, Defs = [SCC]

let usesCustomInserter = 1 in {
def GET_GROUPSTATICSIZE : SPseudoInstSI <(outs SReg_32:$sdst), (ins),
  [(set SReg_32:$sdst, (int_amdgcn_groupstaticsize))]>;
} // End usesCustomInserter = 1

// Wrap an instruction by duplicating it, except for setting isTerminator.
class WrapTerminatorInst<SOP_Pseudo base_inst> : SPseudoInstSI<
      base_inst.OutOperandList,
      base_inst.InOperandList> {
  let Uses = base_inst.Uses;
  let Defs = base_inst.Defs;
  let isTerminator = 1;
  let isAsCheapAsAMove = base_inst.isAsCheapAsAMove;
  let hasSideEffects = base_inst.hasSideEffects;
  let UseNamedOperandTable = base_inst.UseNamedOperandTable;
  let CodeSize = base_inst.CodeSize;
  let SchedRW = base_inst.SchedRW;
}

let WaveSizePredicate = isWave64 in {
def S_MOV_B64_term : WrapTerminatorInst<S_MOV_B64>;
def S_XOR_B64_term : WrapTerminatorInst<S_XOR_B64>;
def S_OR_B64_term : WrapTerminatorInst<S_OR_B64>;
def S_ANDN2_B64_term : WrapTerminatorInst<S_ANDN2_B64>;
def S_AND_B64_term : WrapTerminatorInst<S_AND_B64>;
def S_AND_SAVEEXEC_B64_term : WrapTerminatorInst<S_AND_SAVEEXEC_B64>;
}

let WaveSizePredicate = isWave32 in {
def S_MOV_B32_term : WrapTerminatorInst<S_MOV_B32>;
def S_XOR_B32_term : WrapTerminatorInst<S_XOR_B32>;
def S_OR_B32_term : WrapTerminatorInst<S_OR_B32>;
def S_ANDN2_B32_term : WrapTerminatorInst<S_ANDN2_B32>;
def S_AND_B32_term : WrapTerminatorInst<S_AND_B32>;
def S_AND_SAVEEXEC_B32_term : WrapTerminatorInst<S_AND_SAVEEXEC_B32>;
}


def WAVE_BARRIER : SPseudoInstSI<(outs), (ins),
  [(int_amdgcn_wave_barrier)]> {
  let SchedRW = [];
  let hasNoSchedulingInfo = 1;
  let hasSideEffects = 1;
  let mayLoad = 0;
  let mayStore = 0;
  let isConvergent = 1;
  let FixedSize = 1;
  let Size = 0;
  let isMeta = 1;
}

def SCHED_BARRIER : SPseudoInstSI<(outs), (ins i32imm:$mask),
  [(int_amdgcn_sched_barrier (i32 timm:$mask))]> {
  let SchedRW = [];
  let hasNoSchedulingInfo = 1;
  let hasSideEffects = 1;
  let mayLoad = 0;
  let mayStore = 0;
  let isConvergent = 1;
  let FixedSize = 1;
  let Size = 0;
  let isMeta = 1;
}

def SCHED_GROUP_BARRIER : SPseudoInstSI<
  (outs),
  (ins i32imm:$mask, i32imm:$size, i32imm:$syncid),
  [(int_amdgcn_sched_group_barrier (i32 timm:$mask), (i32 timm:$size), (i32 timm:$syncid))]> {
  let SchedRW = [];
  let hasNoSchedulingInfo = 1;
  let hasSideEffects = 1;
  let mayLoad = 0;
  let mayStore = 0;
  let isConvergent = 1;
  let FixedSize = 1;
  let Size = 0;
  let isMeta = 1;
}

def IGLP_OPT : SPseudoInstSI<(outs), (ins i32imm:$mask),
  [(int_amdgcn_iglp_opt (i32 timm:$mask))]> {
  let SchedRW = [];
  let hasNoSchedulingInfo = 1;
  let hasSideEffects = 1;
  let mayLoad = 0;
  let mayStore = 0;
  let isConvergent = 1;
  let FixedSize = 1;
  let Size = 0;
  let isMeta = 1;
}

// SI pseudo instructions. These are used by the CFG structurizer pass
// and should be lowered to ISA instructions prior to codegen.

// The control flow intrinsics have been enhanced to work under an unstructured
// CFG, so duplicating them is in fact legal. However, marking them
// non-duplicable has been observed to give better code generation, so we keep
// them non-duplicable in the hope of better code and a simpler CFG during the
// Machine IR optimization stage.

let isTerminator = 1, isNotDuplicable = 1 in {

let OtherPredicates = [EnableLateCFGStructurize] in {
 def SI_NON_UNIFORM_BRCOND_PSEUDO : CFPseudoInstSI <
  (outs),
  (ins SReg_1:$vcc, brtarget:$target),
  [(brcond i1:$vcc, bb:$target)]> {
  let Size = 12;
}
}

def SI_IF: CFPseudoInstSI <
  (outs SReg_1:$dst), (ins SReg_1:$vcc, brtarget:$target),
  [(set i1:$dst, (AMDGPUif i1:$vcc, bb:$target))], 1, 1> {
  let Constraints = "";
  let Size = 12;
  let hasSideEffects = 1;
  let IsNeverUniform = 1;
}

def SI_ELSE : CFPseudoInstSI <
  (outs SReg_1:$dst),
  (ins SReg_1:$src, brtarget:$target), [], 1, 1> {
  let Size = 12;
  let hasSideEffects = 1;
  let IsNeverUniform = 1;
}

def SI_WATERFALL_LOOP : CFPseudoInstSI <
  (outs),
  (ins brtarget:$target), [], 1> {
  let Size = 8;
  let isBranch = 1;
  let Defs = [];
}

def SI_LOOP : CFPseudoInstSI <
  (outs), (ins SReg_1:$saved, brtarget:$target),
  [(AMDGPUloop i1:$saved, bb:$target)], 1, 1> {
  let Size = 8;
  let isBranch = 1;
  let hasSideEffects = 1;
  let IsNeverUniform = 1;
}

} // End isTerminator = 1, isNotDuplicable = 1

def SI_END_CF : CFPseudoInstSI <
  (outs), (ins SReg_1:$saved), [], 1, 1> {
  let Size = 4;
  let isAsCheapAsAMove = 1;
  let isReMaterializable = 1;
  let hasSideEffects = 1;
  let isNotDuplicable = 1; // Not a hard requirement, see long comments above for details.
  let mayLoad = 1; // FIXME: Should not need memory flags
  let mayStore = 1;
}

def SI_IF_BREAK : CFPseudoInstSI <
  (outs SReg_1:$dst), (ins SReg_1:$vcc, SReg_1:$src), []> {
  let Size = 4;
  let isNotDuplicable = 1; // Not a hard requirement, see long comments above for details.
  let isAsCheapAsAMove = 1;
  let isReMaterializable = 1;
}

// Branch to the early termination block of the shader if SCC is 0.
// This uses SCC from a previous SALU operation, i.e. the update of
// a mask of live lanes after a kill/demote operation.
// Only valid in pixel shaders.
def SI_EARLY_TERMINATE_SCC0 : SPseudoInstSI <(outs), (ins)> {
  let Uses = [EXEC,SCC];
}

let Uses = [EXEC] in {

multiclass PseudoInstKill <dag ins> {
  // Even though this pseudo can usually be expanded without an SCC def, we
  // conservatively assume that it has an SCC def, both because it is sometimes
  // required in degenerate cases (when V_CMPX cannot be used due to constant
  // bus limitations) and because it allows us to avoid having to track SCC
  // liveness across basic blocks.
  let Defs = [EXEC,SCC] in
  def _PSEUDO : PseudoInstSI <(outs), ins> {
    let isConvergent = 1;
    let usesCustomInserter = 1;
  }

  let Defs = [EXEC,SCC] in
  def _TERMINATOR : SPseudoInstSI <(outs), ins> {
    let isTerminator = 1;
  }
}

defm SI_KILL_I1 : PseudoInstKill <(ins SCSrc_i1:$src, i1imm:$killvalue)>;
let Defs = [VCC] in
defm SI_KILL_F32_COND_IMM : PseudoInstKill <(ins VSrc_b32:$src0, i32imm:$src1, i32imm:$cond)>;

let Defs = [EXEC,VCC] in
def SI_ILLEGAL_COPY : SPseudoInstSI <
  (outs unknown:$dst), (ins unknown:$src),
  [], " ; illegal copy $src to $dst">;

} // End Uses = [EXEC]

// Branch on undef scc. Used to avoid intermediate copy from
// IMPLICIT_DEF to SCC.
def SI_BR_UNDEF : SPseudoInstSI <(outs), (ins SOPPBrTarget:$simm16)> {
  let isTerminator = 1;
  let usesCustomInserter = 1;
  let isBranch = 1;
}

def SI_PS_LIVE : PseudoInstSI <
  (outs SReg_1:$dst), (ins),
  [(set i1:$dst, (int_amdgcn_ps_live))]> {
  let SALU = 1;
}

let Uses = [EXEC] in {
def SI_LIVE_MASK : PseudoInstSI <
  (outs SReg_1:$dst), (ins),
  [(set i1:$dst, (int_amdgcn_live_mask))]> {
  let SALU = 1;
}
let Defs = [EXEC,SCC] in {
// Demote: Turn a pixel shader thread into a helper lane.
def SI_DEMOTE_I1 : SPseudoInstSI <(outs), (ins SCSrc_i1:$src, i1imm:$killvalue)>;
} // End Defs = [EXEC,SCC]
} // End Uses = [EXEC]

def SI_MASKED_UNREACHABLE : SPseudoInstSI <(outs), (ins),
  [(int_amdgcn_unreachable)],
  "; divergent unreachable"> {
  let Size = 0;
  let hasNoSchedulingInfo = 1;
  let FixedSize = 1;
  let isMeta = 1;
  let maybeAtomic = 0;
}

// Used as an isel pseudo to directly emit initialization with an
// s_mov_b32 rather than a copy of another initialized
// register. MachineCSE skips copies, and we don't want to have to
// fold operands before it runs.
def SI_INIT_M0 : SPseudoInstSI <(outs), (ins SSrc_b32:$src)> {
  let Defs = [M0];
  let usesCustomInserter = 1;
  let isAsCheapAsAMove = 1;
  let isReMaterializable = 1;
}

def SI_INIT_EXEC : SPseudoInstSI <
  (outs), (ins i64imm:$src),
  [(int_amdgcn_init_exec (i64 timm:$src))]> {
  let Defs = [EXEC];
  let isAsCheapAsAMove = 1;
}

def SI_INIT_EXEC_FROM_INPUT : SPseudoInstSI <
  (outs), (ins SSrc_b32:$input, i32imm:$shift),
  [(int_amdgcn_init_exec_from_input i32:$input, (i32 timm:$shift))]> {
  let Defs = [EXEC];
}

// Return for returning shaders to a shader variant epilog.
def SI_RETURN_TO_EPILOG : SPseudoInstSI <
  (outs), (ins variable_ops), [(AMDGPUreturn_to_epilog)]> {
  let isTerminator = 1;
  let isBarrier = 1;
  let isReturn = 1;
  let hasNoSchedulingInfo = 1;
  let DisableWQM = 1;
  let FixedSize = 1;

  // TODO: Should this be true?
  let isMeta = 0;
}

// Return for returning function calls.
def SI_RETURN : SPseudoInstSI <
  (outs), (ins), [(AMDGPUret_glue)],
  "; return"> {
  let isTerminator = 1;
  let isBarrier = 1;
  let isReturn = 1;
  let SchedRW = [WriteBranch];
}

// Call pseudo without an output register, used during instruction selection.
//
// This version is only needed so we can fill in the output register
// in the custom inserter.
def SI_CALL_ISEL : SPseudoInstSI <
  (outs), (ins SSrc_b64:$src0, unknown:$callee),
  [(AMDGPUcall i64:$src0, tglobaladdr:$callee)]> {
  let Size = 4;
  let isCall = 1;
  let SchedRW = [WriteBranch];
  let usesCustomInserter = 1;
  // TODO: Should really base this on the call target
  let isConvergent = 1;
}

def : GCNPat<
  (AMDGPUcall i64:$src0, (i64 0)),
  (SI_CALL_ISEL $src0, (i64 0))
>;
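
// The (i64 0) callee operand in the pattern above stands in for an unknown
// callee, so indirect calls also select SI_CALL_ISEL; as noted above, the
// custom inserter then fills in the output register and re-emits the call as
// SI_CALL below.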

// Wrapper around s_swappc_b64 with extra $callee parameter to track
// the called function after regalloc.
def SI_CALL : SPseudoInstSI <
  (outs SReg_64:$dst), (ins SSrc_b64:$src0, unknown:$callee)> {
  let Size = 4;
  let FixedSize = 1;
  let isCall = 1;
  let UseNamedOperandTable = 1;
  let SchedRW = [WriteBranch];
  // TODO: Should really base this on the call target
  let isConvergent = 1;
}

class SI_TCRETURN_Pseudo<RegisterClass rc, SDNode sd> : SPseudoInstSI <(outs),
  (ins rc:$src0, unknown:$callee, i32imm:$fpdiff),
  [(sd i64:$src0, tglobaladdr:$callee, i32:$fpdiff)]> {
  let Size = 4;
  let FixedSize = 1;
  let isCall = 1;
  let isTerminator = 1;
  let isReturn = 1;
  let isBarrier = 1;
  let UseNamedOperandTable = 1;
  let SchedRW = [WriteBranch];
  // TODO: Should really base this on the call target
  let isConvergent = 1;
}

// Tail call handling pseudo
def SI_TCRETURN : SI_TCRETURN_Pseudo<CCR_SGPR_64, AMDGPUtc_return>;
def SI_TCRETURN_GFX : SI_TCRETURN_Pseudo<Gfx_CCR_SGPR_64, AMDGPUtc_return_gfx>;

// Handle selecting indirect tail calls
def : GCNPat<
  (AMDGPUtc_return i64:$src0, (i64 0), (i32 timm:$fpdiff)),
  (SI_TCRETURN CCR_SGPR_64:$src0, (i64 0), i32imm:$fpdiff)
>;

// Handle selecting indirect tail calls for AMDGPU_gfx
def : GCNPat<
  (AMDGPUtc_return_gfx i64:$src0, (i64 0), (i32 timm:$fpdiff)),
  (SI_TCRETURN_GFX Gfx_CCR_SGPR_64:$src0, (i64 0), i32imm:$fpdiff)
>;

// Pseudo for the llvm.amdgcn.cs.chain intrinsic.
// This is essentially a tail call, but it also takes a mask to put in EXEC
// right before jumping to the callee.
class SI_CS_CHAIN_TC<
    ValueType execvt, Predicate wavesizepred,
    RegisterOperand execrc = getSOPSrcForVT<execvt>.ret>
    : SPseudoInstSI <(outs),
      (ins CCR_SGPR_64:$src0, unknown:$callee, i32imm:$fpdiff, execrc:$exec)> {
  let FixedSize = 0;
  let isCall = 1;
  let isTerminator = 1;
  let isBarrier = 1;
  let isReturn = 1;
  let UseNamedOperandTable = 1;
  let SchedRW = [WriteBranch];
  let isConvergent = 1;

  let WaveSizePredicate = wavesizepred;
}

def SI_CS_CHAIN_TC_W32 : SI_CS_CHAIN_TC<i32, isWave32>;
def SI_CS_CHAIN_TC_W64 : SI_CS_CHAIN_TC<i64, isWave64>;

// Handle selecting direct & indirect calls via SI_CS_CHAIN_TC_W32/64
multiclass si_cs_chain_tc_pattern<
  dag callee, ValueType execvt, RegisterOperand execrc, Instruction tc> {
def : GCNPat<
  (AMDGPUtc_return_chain i64:$src0, callee, (i32 timm:$fpdiff), execvt:$exec),
  (tc CCR_SGPR_64:$src0, callee, i32imm:$fpdiff, execrc:$exec)
>;
}

multiclass si_cs_chain_tc_patterns<
  ValueType execvt,
  RegisterOperand execrc = getSOPSrcForVT<execvt>.ret,
  Instruction tc = !if(!eq(execvt, i32), SI_CS_CHAIN_TC_W32, SI_CS_CHAIN_TC_W64)
  > {
  defm direct: si_cs_chain_tc_pattern<(tglobaladdr:$callee), execvt, execrc, tc>;
  defm indirect: si_cs_chain_tc_pattern<(i64 0), execvt, execrc, tc>;
}

defm : si_cs_chain_tc_patterns<i32>;
defm : si_cs_chain_tc_patterns<i64>;
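
// For reference, si_cs_chain_tc_patterns<i32> above expands to two GCNPats:
// one matching a direct callee (tglobaladdr) and one matching the
// indirect-call placeholder (i64 0), both selecting SI_CS_CHAIN_TC_W32 with a
// 32-bit EXEC mask operand. The i64 instantiation does the same for
// SI_CS_CHAIN_TC_W64 with a 64-bit mask.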

def ADJCALLSTACKUP : SPseudoInstSI<
  (outs), (ins i32imm:$amt0, i32imm:$amt1),
  [(callseq_start timm:$amt0, timm:$amt1)],
  "; adjcallstackup $amt0 $amt1"> {
  let Size = 8; // Worst case. (s_add_u32 + constant)
  let FixedSize = 1;
  let hasSideEffects = 1;
  let usesCustomInserter = 1;
  let SchedRW = [WriteSALU];
  let Defs = [SCC];
}

def ADJCALLSTACKDOWN : SPseudoInstSI<
  (outs), (ins i32imm:$amt1, i32imm:$amt2),
  [(callseq_end timm:$amt1, timm:$amt2)],
  "; adjcallstackdown $amt1"> {
  let Size = 8; // Worst case. (s_add_u32 + constant)
  let hasSideEffects = 1;
  let usesCustomInserter = 1;
  let SchedRW = [WriteSALU];
  let Defs = [SCC];
}

let Defs = [M0, EXEC, SCC],
    UseNamedOperandTable = 1 in {

// SI_INDIRECT_SRC/DST are only used by the legacy SelectionDAG indirect
// addressing implementation.
class SI_INDIRECT_SRC<RegisterClass rc> : VPseudoInstSI <
  (outs VGPR_32:$vdst),
  (ins rc:$src, VS_32:$idx, i32imm:$offset)> {
  let usesCustomInserter = 1;
}

class SI_INDIRECT_DST<RegisterClass rc> : VPseudoInstSI <
  (outs rc:$vdst),
  (ins rc:$src, VS_32:$idx, i32imm:$offset, VGPR_32:$val)> {
  let Constraints = "$src = $vdst";
  let usesCustomInserter = 1;
}

def SI_INDIRECT_SRC_V1 : SI_INDIRECT_SRC<VGPR_32>;
def SI_INDIRECT_SRC_V2 : SI_INDIRECT_SRC<VReg_64>;
def SI_INDIRECT_SRC_V4 : SI_INDIRECT_SRC<VReg_128>;
def SI_INDIRECT_SRC_V8 : SI_INDIRECT_SRC<VReg_256>;
def SI_INDIRECT_SRC_V9 : SI_INDIRECT_SRC<VReg_288>;
def SI_INDIRECT_SRC_V10 : SI_INDIRECT_SRC<VReg_320>;
def SI_INDIRECT_SRC_V11 : SI_INDIRECT_SRC<VReg_352>;
def SI_INDIRECT_SRC_V12 : SI_INDIRECT_SRC<VReg_384>;
def SI_INDIRECT_SRC_V16 : SI_INDIRECT_SRC<VReg_512>;
def SI_INDIRECT_SRC_V32 : SI_INDIRECT_SRC<VReg_1024>;

def SI_INDIRECT_DST_V1 : SI_INDIRECT_DST<VGPR_32>;
def SI_INDIRECT_DST_V2 : SI_INDIRECT_DST<VReg_64>;
def SI_INDIRECT_DST_V4 : SI_INDIRECT_DST<VReg_128>;
def SI_INDIRECT_DST_V8 : SI_INDIRECT_DST<VReg_256>;
def SI_INDIRECT_DST_V9 : SI_INDIRECT_DST<VReg_288>;
def SI_INDIRECT_DST_V10 : SI_INDIRECT_DST<VReg_320>;
def SI_INDIRECT_DST_V11 : SI_INDIRECT_DST<VReg_352>;
def SI_INDIRECT_DST_V12 : SI_INDIRECT_DST<VReg_384>;
def SI_INDIRECT_DST_V16 : SI_INDIRECT_DST<VReg_512>;
def SI_INDIRECT_DST_V32 : SI_INDIRECT_DST<VReg_1024>;

} // End Defs = [M0, EXEC, SCC], UseNamedOperandTable = 1

// This is a pseudo variant of the v_movreld_b32 instruction in which the
// vector operand appears only twice, once as def and once as use. Using this
// pseudo avoids problems with the Two Address instructions pass.
class INDIRECT_REG_WRITE_MOVREL_pseudo<RegisterClass rc,
                                       RegisterOperand val_ty> : PseudoInstSI <
  (outs rc:$vdst), (ins rc:$vsrc, val_ty:$val, i32imm:$subreg)> {
  let Constraints = "$vsrc = $vdst";
  let Uses = [M0];
}

class V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<RegisterClass rc> :
  INDIRECT_REG_WRITE_MOVREL_pseudo<rc, VSrc_b32> {
  let VALU = 1;
  let VOP1 = 1;
  let Uses = [M0, EXEC];
}

class S_INDIRECT_REG_WRITE_MOVREL_pseudo<RegisterClass rc,
                                         RegisterOperand val_ty> :
  INDIRECT_REG_WRITE_MOVREL_pseudo<rc, val_ty> {
  let SALU = 1;
  let SOP1 = 1;
  let Uses = [M0];
}

class S_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<RegisterClass rc> :
  S_INDIRECT_REG_WRITE_MOVREL_pseudo<rc, SSrc_b32>;
class S_INDIRECT_REG_WRITE_MOVREL_B64_pseudo<RegisterClass rc> :
  S_INDIRECT_REG_WRITE_MOVREL_pseudo<rc, SSrc_b64>;

def V_INDIRECT_REG_WRITE_MOVREL_B32_V1 : V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<VGPR_32>;
def V_INDIRECT_REG_WRITE_MOVREL_B32_V2 : V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<VReg_64>;
def V_INDIRECT_REG_WRITE_MOVREL_B32_V3 : V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<VReg_96>;
def V_INDIRECT_REG_WRITE_MOVREL_B32_V4 : V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<VReg_128>;
def V_INDIRECT_REG_WRITE_MOVREL_B32_V5 : V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<VReg_160>;
def V_INDIRECT_REG_WRITE_MOVREL_B32_V8 : V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<VReg_256>;
def V_INDIRECT_REG_WRITE_MOVREL_B32_V9 : V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<VReg_288>;
def V_INDIRECT_REG_WRITE_MOVREL_B32_V10 : V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<VReg_320>;
def V_INDIRECT_REG_WRITE_MOVREL_B32_V11 : V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<VReg_352>;
def V_INDIRECT_REG_WRITE_MOVREL_B32_V12 : V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<VReg_384>;
def V_INDIRECT_REG_WRITE_MOVREL_B32_V16 : V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<VReg_512>;
def V_INDIRECT_REG_WRITE_MOVREL_B32_V32 : V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<VReg_1024>;

def S_INDIRECT_REG_WRITE_MOVREL_B32_V1 : S_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<SReg_32>;
def S_INDIRECT_REG_WRITE_MOVREL_B32_V2 : S_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<SReg_64>;
def S_INDIRECT_REG_WRITE_MOVREL_B32_V3 : S_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<SReg_96>;
def S_INDIRECT_REG_WRITE_MOVREL_B32_V4 : S_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<SReg_128>;
def S_INDIRECT_REG_WRITE_MOVREL_B32_V5 : S_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<SReg_160>;
def S_INDIRECT_REG_WRITE_MOVREL_B32_V8 : S_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<SReg_256>;
def S_INDIRECT_REG_WRITE_MOVREL_B32_V9 : S_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<SReg_288>;
def S_INDIRECT_REG_WRITE_MOVREL_B32_V10 : S_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<SReg_320>;
def S_INDIRECT_REG_WRITE_MOVREL_B32_V11 : S_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<SReg_352>;
def S_INDIRECT_REG_WRITE_MOVREL_B32_V12 : S_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<SReg_384>;
def S_INDIRECT_REG_WRITE_MOVREL_B32_V16 : S_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<SReg_512>;
def S_INDIRECT_REG_WRITE_MOVREL_B32_V32 : S_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<SReg_1024>;

def S_INDIRECT_REG_WRITE_MOVREL_B64_V1 : S_INDIRECT_REG_WRITE_MOVREL_B64_pseudo<SReg_64>;
def S_INDIRECT_REG_WRITE_MOVREL_B64_V2 : S_INDIRECT_REG_WRITE_MOVREL_B64_pseudo<SReg_128>;
def S_INDIRECT_REG_WRITE_MOVREL_B64_V4 : S_INDIRECT_REG_WRITE_MOVREL_B64_pseudo<SReg_256>;
def S_INDIRECT_REG_WRITE_MOVREL_B64_V8 : S_INDIRECT_REG_WRITE_MOVREL_B64_pseudo<SReg_512>;
def S_INDIRECT_REG_WRITE_MOVREL_B64_V16 : S_INDIRECT_REG_WRITE_MOVREL_B64_pseudo<SReg_1024>;

// These variants of V_INDIRECT_REG_READ/WRITE use VGPR indexing. By using these
// pseudos we avoid spills or copies being inserted within indirect sequences
// that switch the VGPR indexing mode. Spills to accvgprs could be affected by
// this mode switching.

class V_INDIRECT_REG_WRITE_GPR_IDX_pseudo<RegisterClass rc> : PseudoInstSI <
  (outs rc:$vdst), (ins rc:$vsrc, VSrc_b32:$val, SSrc_b32:$idx, i32imm:$subreg)> {
  let Constraints = "$vsrc = $vdst";
  let VALU = 1;
  let Uses = [M0, EXEC];
  let Defs = [M0];
}

def V_INDIRECT_REG_WRITE_GPR_IDX_B32_V1 : V_INDIRECT_REG_WRITE_GPR_IDX_pseudo<VGPR_32>;
def V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2 : V_INDIRECT_REG_WRITE_GPR_IDX_pseudo<VReg_64>;
def V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3 : V_INDIRECT_REG_WRITE_GPR_IDX_pseudo<VReg_96>;
def V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4 : V_INDIRECT_REG_WRITE_GPR_IDX_pseudo<VReg_128>;
def V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5 : V_INDIRECT_REG_WRITE_GPR_IDX_pseudo<VReg_160>;
def V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8 : V_INDIRECT_REG_WRITE_GPR_IDX_pseudo<VReg_256>;
def V_INDIRECT_REG_WRITE_GPR_IDX_B32_V9 : V_INDIRECT_REG_WRITE_GPR_IDX_pseudo<VReg_288>;
def V_INDIRECT_REG_WRITE_GPR_IDX_B32_V10 : V_INDIRECT_REG_WRITE_GPR_IDX_pseudo<VReg_320>;
def V_INDIRECT_REG_WRITE_GPR_IDX_B32_V11 : V_INDIRECT_REG_WRITE_GPR_IDX_pseudo<VReg_352>;
def V_INDIRECT_REG_WRITE_GPR_IDX_B32_V12 : V_INDIRECT_REG_WRITE_GPR_IDX_pseudo<VReg_384>;
def V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16 : V_INDIRECT_REG_WRITE_GPR_IDX_pseudo<VReg_512>;
def V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32 : V_INDIRECT_REG_WRITE_GPR_IDX_pseudo<VReg_1024>;

class V_INDIRECT_REG_READ_GPR_IDX_pseudo<RegisterClass rc> : PseudoInstSI <
  (outs VGPR_32:$vdst), (ins rc:$vsrc, SSrc_b32:$idx, i32imm:$subreg)> {
  let VALU = 1;
  let Uses = [M0, EXEC];
  let Defs = [M0];
}

def V_INDIRECT_REG_READ_GPR_IDX_B32_V1 : V_INDIRECT_REG_READ_GPR_IDX_pseudo<VGPR_32>;
def V_INDIRECT_REG_READ_GPR_IDX_B32_V2 : V_INDIRECT_REG_READ_GPR_IDX_pseudo<VReg_64>;
def V_INDIRECT_REG_READ_GPR_IDX_B32_V3 : V_INDIRECT_REG_READ_GPR_IDX_pseudo<VReg_96>;
def V_INDIRECT_REG_READ_GPR_IDX_B32_V4 : V_INDIRECT_REG_READ_GPR_IDX_pseudo<VReg_128>;
def V_INDIRECT_REG_READ_GPR_IDX_B32_V5 : V_INDIRECT_REG_READ_GPR_IDX_pseudo<VReg_160>;
def V_INDIRECT_REG_READ_GPR_IDX_B32_V8 : V_INDIRECT_REG_READ_GPR_IDX_pseudo<VReg_256>;
def V_INDIRECT_REG_READ_GPR_IDX_B32_V9 : V_INDIRECT_REG_READ_GPR_IDX_pseudo<VReg_288>;
def V_INDIRECT_REG_READ_GPR_IDX_B32_V10 : V_INDIRECT_REG_READ_GPR_IDX_pseudo<VReg_320>;
def V_INDIRECT_REG_READ_GPR_IDX_B32_V11 : V_INDIRECT_REG_READ_GPR_IDX_pseudo<VReg_352>;
def V_INDIRECT_REG_READ_GPR_IDX_B32_V12 : V_INDIRECT_REG_READ_GPR_IDX_pseudo<VReg_384>;
def V_INDIRECT_REG_READ_GPR_IDX_B32_V16 : V_INDIRECT_REG_READ_GPR_IDX_pseudo<VReg_512>;
def V_INDIRECT_REG_READ_GPR_IDX_B32_V32 : V_INDIRECT_REG_READ_GPR_IDX_pseudo<VReg_1024>;

multiclass SI_SPILL_SGPR <RegisterClass sgpr_class> {
  let UseNamedOperandTable = 1, Spill = 1, SALU = 1, Uses = [EXEC] in {
    def _SAVE : PseudoInstSI <
      (outs),
      (ins sgpr_class:$data, i32imm:$addr)> {
      let mayStore = 1;
      let mayLoad = 0;
    }

    def _RESTORE : PseudoInstSI <
      (outs sgpr_class:$data),
      (ins i32imm:$addr)> {
      let mayStore = 0;
      let mayLoad = 1;
    }
  } // End UseNamedOperandTable = 1
}

// You cannot use M0 as the output of v_readlane_b32 instructions or
// use it in the sdata operand of SMEM instructions. We still need to
// be able to spill the physical register m0, so allow it for
// SI_SPILL_S32_* instructions.
defm SI_SPILL_S32 : SI_SPILL_SGPR <SReg_32>;
defm SI_SPILL_S64 : SI_SPILL_SGPR <SReg_64>;
defm SI_SPILL_S96 : SI_SPILL_SGPR <SReg_96>;
defm SI_SPILL_S128 : SI_SPILL_SGPR <SReg_128>;
defm SI_SPILL_S160 : SI_SPILL_SGPR <SReg_160>;
defm SI_SPILL_S192 : SI_SPILL_SGPR <SReg_192>;
defm SI_SPILL_S224 : SI_SPILL_SGPR <SReg_224>;
defm SI_SPILL_S256 : SI_SPILL_SGPR <SReg_256>;
defm SI_SPILL_S288 : SI_SPILL_SGPR <SReg_288>;
defm SI_SPILL_S320 : SI_SPILL_SGPR <SReg_320>;
defm SI_SPILL_S352 : SI_SPILL_SGPR <SReg_352>;
defm SI_SPILL_S384 : SI_SPILL_SGPR <SReg_384>;
defm SI_SPILL_S512 : SI_SPILL_SGPR <SReg_512>;
defm SI_SPILL_S1024 : SI_SPILL_SGPR <SReg_1024>;

let Spill = 1, VALU = 1, isConvergent = 1 in {
def SI_SPILL_S32_TO_VGPR : PseudoInstSI <(outs VGPR_32:$vdst),
  (ins SReg_32:$src0, i32imm:$src1, VGPR_32:$vdst_in)> {
  let Size = 4;
  let FixedSize = 1;
  let IsNeverUniform = 1;
  let hasSideEffects = 0;
  let mayLoad = 0;
  let mayStore = 0;
  let Constraints = "$vdst = $vdst_in";
}

def SI_RESTORE_S32_FROM_VGPR : PseudoInstSI <(outs SReg_32:$sdst),
  (ins VGPR_32:$src0, i32imm:$src1)> {
  let Size = 4;
  let FixedSize = 1;
  let hasSideEffects = 0;
  let mayLoad = 0;
  let mayStore = 0;
}
} // End Spill = 1, VALU = 1, isConvergent = 1

// VGPR or AGPR spill instructions. In the case of AGPR spilling, a temp
// register is needed along with an extra instruction to move between VGPR and
// AGPR. UsesTmp adds to the total size of an expanded spill in this case.
multiclass SI_SPILL_VGPR <RegisterClass vgpr_class, bit UsesTmp = 0> {
  let UseNamedOperandTable = 1, Spill = 1, VALU = 1,
      SchedRW = [WriteVMEM] in {
    def _SAVE : VPseudoInstSI <
      (outs),
      (ins vgpr_class:$vdata, i32imm:$vaddr,
           SReg_32:$soffset, i32imm:$offset)> {
      let mayStore = 1;
      let mayLoad = 0;
      // (2 * 4) + (8 * num_subregs) bytes maximum
      int MaxSize = !add(!shl(!srl(vgpr_class.Size, 5), !add(UsesTmp, 3)), 8);
      // Size field is unsigned char and cannot fit more.
      let Size = !if(!le(MaxSize, 256), MaxSize, 252);
    }

    def _RESTORE : VPseudoInstSI <
      (outs vgpr_class:$vdata),
      (ins i32imm:$vaddr,
           SReg_32:$soffset, i32imm:$offset)> {
      let mayStore = 0;
      let mayLoad = 1;

      // (2 * 4) + (8 * num_subregs) bytes maximum
      int MaxSize = !add(!shl(!srl(vgpr_class.Size, 5), !add(UsesTmp, 3)), 8);
      // Size field is unsigned char and cannot fit more.
      let Size = !if(!le(MaxSize, 256), MaxSize, 252);
    }
  } // End UseNamedOperandTable = 1, Spill = 1, VALU = 1, SchedRW = [WriteVMEM]
}
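
// Worked example of the Size bound above: a 128-bit register class has 4
// sub-registers, so SI_SPILL_V128_SAVE (UsesTmp = 0) gets
// MaxSize = (2 * 4) + (8 * 4) = 40 bytes, while the AGPR form
// SI_SPILL_A128_SAVE (UsesTmp = 1) doubles the per-subreg cost:
// (2 * 4) + (16 * 4) = 72 bytes. Values above 256 are clamped to 252 since
// the Size field is an unsigned char.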

defm SI_SPILL_V32 : SI_SPILL_VGPR <VGPR_32>;
defm SI_SPILL_V64 : SI_SPILL_VGPR <VReg_64>;
defm SI_SPILL_V96 : SI_SPILL_VGPR <VReg_96>;
defm SI_SPILL_V128 : SI_SPILL_VGPR <VReg_128>;
defm SI_SPILL_V160 : SI_SPILL_VGPR <VReg_160>;
defm SI_SPILL_V192 : SI_SPILL_VGPR <VReg_192>;
defm SI_SPILL_V224 : SI_SPILL_VGPR <VReg_224>;
defm SI_SPILL_V256 : SI_SPILL_VGPR <VReg_256>;
defm SI_SPILL_V288 : SI_SPILL_VGPR <VReg_288>;
defm SI_SPILL_V320 : SI_SPILL_VGPR <VReg_320>;
defm SI_SPILL_V352 : SI_SPILL_VGPR <VReg_352>;
defm SI_SPILL_V384 : SI_SPILL_VGPR <VReg_384>;
defm SI_SPILL_V512 : SI_SPILL_VGPR <VReg_512>;
defm SI_SPILL_V1024 : SI_SPILL_VGPR <VReg_1024>;

defm SI_SPILL_A32 : SI_SPILL_VGPR <AGPR_32, 1>;
defm SI_SPILL_A64 : SI_SPILL_VGPR <AReg_64, 1>;
defm SI_SPILL_A96 : SI_SPILL_VGPR <AReg_96, 1>;
defm SI_SPILL_A128 : SI_SPILL_VGPR <AReg_128, 1>;
defm SI_SPILL_A160 : SI_SPILL_VGPR <AReg_160, 1>;
defm SI_SPILL_A192 : SI_SPILL_VGPR <AReg_192, 1>;
defm SI_SPILL_A224 : SI_SPILL_VGPR <AReg_224, 1>;
defm SI_SPILL_A256 : SI_SPILL_VGPR <AReg_256, 1>;
defm SI_SPILL_A288 : SI_SPILL_VGPR <AReg_288, 1>;
defm SI_SPILL_A320 : SI_SPILL_VGPR <AReg_320, 1>;
defm SI_SPILL_A352 : SI_SPILL_VGPR <AReg_352, 1>;
defm SI_SPILL_A384 : SI_SPILL_VGPR <AReg_384, 1>;
defm SI_SPILL_A512 : SI_SPILL_VGPR <AReg_512, 1>;
defm SI_SPILL_A1024 : SI_SPILL_VGPR <AReg_1024, 1>;

defm SI_SPILL_AV32 : SI_SPILL_VGPR <AV_32, 1>;
defm SI_SPILL_AV64 : SI_SPILL_VGPR <AV_64, 1>;
defm SI_SPILL_AV96 : SI_SPILL_VGPR <AV_96, 1>;
defm SI_SPILL_AV128 : SI_SPILL_VGPR <AV_128, 1>;
defm SI_SPILL_AV160 : SI_SPILL_VGPR <AV_160, 1>;
defm SI_SPILL_AV192 : SI_SPILL_VGPR <AV_192, 1>;
defm SI_SPILL_AV224 : SI_SPILL_VGPR <AV_224, 1>;
defm SI_SPILL_AV256 : SI_SPILL_VGPR <AV_256, 1>;
defm SI_SPILL_AV288 : SI_SPILL_VGPR <AV_288, 1>;
defm SI_SPILL_AV320 : SI_SPILL_VGPR <AV_320, 1>;
defm SI_SPILL_AV352 : SI_SPILL_VGPR <AV_352, 1>;
defm SI_SPILL_AV384 : SI_SPILL_VGPR <AV_384, 1>;
defm SI_SPILL_AV512 : SI_SPILL_VGPR <AV_512, 1>;
defm SI_SPILL_AV1024 : SI_SPILL_VGPR <AV_1024, 1>;

let isConvergent = 1 in {
  defm SI_SPILL_WWM_V32 : SI_SPILL_VGPR <VGPR_32>;
  defm SI_SPILL_WWM_AV32 : SI_SPILL_VGPR <AV_32, 1>;
}

let isReMaterializable = 1, isAsCheapAsAMove = 1 in
def SI_PC_ADD_REL_OFFSET : SPseudoInstSI <
  (outs SReg_64:$dst),
  (ins si_ga:$ptr_lo, si_ga:$ptr_hi),
  [(set SReg_64:$dst,
      (i64 (SIpc_add_rel_offset tglobaladdr:$ptr_lo, tglobaladdr:$ptr_hi)))]> {
  let Defs = [SCC];
}

def : GCNPat <
  (SIpc_add_rel_offset tglobaladdr:$ptr_lo, 0),
  (SI_PC_ADD_REL_OFFSET $ptr_lo, (i32 0))
>;

def : GCNPat<
  (AMDGPUtrap timm:$trapid),
  (S_TRAP $trapid)
>;

def : GCNPat<
  (AMDGPUelse i1:$src, bb:$target),
  (SI_ELSE $src, $target)
>;

def : Pat <
  (int_amdgcn_kill i1:$src),
  (SI_KILL_I1_PSEUDO SCSrc_i1:$src, 0)
>;

def : Pat <
  (int_amdgcn_kill (i1 (not i1:$src))),
  (SI_KILL_I1_PSEUDO SCSrc_i1:$src, -1)
>;

def : Pat <
  (int_amdgcn_kill (i1 (setcc f32:$src, InlineImmFP32:$imm, cond:$cond))),
  (SI_KILL_F32_COND_IMM_PSEUDO VSrc_b32:$src, (bitcast_fpimm_to_i32 $imm), (cond_as_i32imm $cond))
>;
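
// As in the SI_KILL_I1 patterns above, the demote patterns below fold an i1
// negation into the killvalue immediate (0 for the plain source, -1 for the
// negated source), so the inverted form does not need to materialize the
// negation separately.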

def : Pat <
  (int_amdgcn_wqm_demote i1:$src),
  (SI_DEMOTE_I1 SCSrc_i1:$src, 0)
>;

def : Pat <
  (int_amdgcn_wqm_demote (i1 (not i1:$src))),
  (SI_DEMOTE_I1 SCSrc_i1:$src, -1)
>;

// TODO: we could add more variants for other types of conditionals

def : Pat <
  (i64 (int_amdgcn_icmp i1:$src, (i1 0), (i32 33))),
  (COPY $src) // Return the SGPRs representing i1 src
>;

def : Pat <
  (i32 (int_amdgcn_icmp i1:$src, (i1 0), (i32 33))),
  (COPY $src) // Return the SGPRs representing i1 src
>;

//===----------------------------------------------------------------------===//
// VOP1 Patterns
//===----------------------------------------------------------------------===//

multiclass f16_fp_Pats<Instruction cvt_f16_f32_inst_e64, Instruction cvt_f32_f16_inst_e64> {
  // f16_to_fp patterns
  def : GCNPat <
    (f32 (any_f16_to_fp i32:$src0)),
    (cvt_f32_f16_inst_e64 SRCMODS.NONE, $src0)
  >;

  def : GCNPat <
    (f32 (f16_to_fp (and_oneuse i32:$src0, 0x7fff))),
    (cvt_f32_f16_inst_e64 SRCMODS.ABS, $src0)
  >;

  def : GCNPat <
    (f32 (f16_to_fp (i32 (srl_oneuse (and_oneuse i32:$src0, 0x7fff0000), (i32 16))))),
    (cvt_f32_f16_inst_e64 SRCMODS.ABS, (i32 (V_LSHRREV_B32_e64 (i32 16), i32:$src0)))
  >;

  def : GCNPat <
    (f32 (f16_to_fp (or_oneuse i32:$src0, 0x8000))),
    (cvt_f32_f16_inst_e64 SRCMODS.NEG_ABS, $src0)
  >;

  def : GCNPat <
    (f32 (f16_to_fp (xor_oneuse i32:$src0, 0x8000))),
    (cvt_f32_f16_inst_e64 SRCMODS.NEG, $src0)
  >;

  def : GCNPat <
    (f64 (any_fpextend f16:$src)),
    (V_CVT_F64_F32_e32 (cvt_f32_f16_inst_e64 SRCMODS.NONE, $src))
  >;

  // fp_to_fp16 patterns
  def : GCNPat <
    (i32 (AMDGPUfp_to_f16 (f32 (VOP3Mods f32:$src0, i32:$src0_modifiers)))),
    (cvt_f16_f32_inst_e64 $src0_modifiers, f32:$src0)
  >;

  def : GCNPat <
    (i32 (fp_to_sint f16:$src)),
    (V_CVT_I32_F32_e32 (cvt_f32_f16_inst_e64 SRCMODS.NONE, VSrc_b32:$src))
  >;

  def : GCNPat <
    (i32 (fp_to_uint f16:$src)),
    (V_CVT_U32_F32_e32 (cvt_f32_f16_inst_e64 SRCMODS.NONE, VSrc_b32:$src))
  >;

  def : GCNPat <
    (f16 (sint_to_fp i32:$src)),
    (cvt_f16_f32_inst_e64 SRCMODS.NONE, (V_CVT_F32_I32_e32 VSrc_b32:$src))
  >;

  def : GCNPat <
    (f16 (uint_to_fp i32:$src)),
    (cvt_f16_f32_inst_e64 SRCMODS.NONE, (V_CVT_F32_U32_e32 VSrc_b32:$src))
  >;

  // This is only used on targets without half support
  // TODO: Introduce strict variant of AMDGPUfp_to_f16 and share custom lowering
  def : GCNPat <
    (i32 (strict_fp_to_f16 (f32 (VOP3Mods f32:$src0, i32:$src0_modifiers)))),
    (cvt_f16_f32_inst_e64 $src0_modifiers, f32:$src0)
  >;
}

let SubtargetPredicate = NotHasTrue16BitInsts in
defm : f16_fp_Pats<V_CVT_F16_F32_e64, V_CVT_F32_F16_e64>;

let SubtargetPredicate = HasTrue16BitInsts in
defm : f16_fp_Pats<V_CVT_F16_F32_t16_e64, V_CVT_F32_F16_t16_e64>;

//===----------------------------------------------------------------------===//
// VOP2 Patterns
//===----------------------------------------------------------------------===//

// NoMods pattern used for mac. If there are any source modifiers then it's
// better to select mad instead of mac.
class FMADPat <ValueType vt, Instruction inst>
  : GCNPat <(vt (any_fmad (vt (VOP3NoMods vt:$src0)),
                          (vt (VOP3NoMods vt:$src1)),
                          (vt (VOP3NoMods vt:$src2)))),
            (inst SRCMODS.NONE, $src0, SRCMODS.NONE, $src1,
                  SRCMODS.NONE, $src2, DSTCLAMP.NONE, DSTOMOD.NONE)
>;

// Prefer mac form when there are no modifiers.
let AddedComplexity = 9 in {
let OtherPredicates = [HasMadMacF32Insts] in
def : FMADPat <f32, V_MAC_F32_e64>;

// Don't allow source modifiers. If there are any source modifiers then it's
// better to select mad instead of mac.
let SubtargetPredicate = isGFX6GFX7GFX10,
    OtherPredicates = [HasMadMacF32Insts, NoFP32Denormals] in
def : GCNPat <
      (f32 (fadd (AMDGPUfmul_legacy (VOP3NoMods f32:$src0),
                                    (VOP3NoMods f32:$src1)),
                 (VOP3NoMods f32:$src2))),
      (V_MAC_LEGACY_F32_e64 SRCMODS.NONE, $src0, SRCMODS.NONE, $src1,
                            SRCMODS.NONE, $src2, DSTCLAMP.NONE, DSTOMOD.NONE)
>;

// Don't allow source modifiers. If there are any source modifiers then it's
// better to select fma instead of fmac.
let SubtargetPredicate = HasFmaLegacy32 in
def : GCNPat <
      (f32 (int_amdgcn_fma_legacy (VOP3NoMods f32:$src0),
                                  (VOP3NoMods f32:$src1),
                                  (VOP3NoMods f32:$src2))),
      (V_FMAC_LEGACY_F32_e64 SRCMODS.NONE, $src0, SRCMODS.NONE, $src1,
                             SRCMODS.NONE, $src2, DSTCLAMP.NONE, DSTOMOD.NONE)
>;

let SubtargetPredicate = Has16BitInsts in
def : FMADPat <f16, V_MAC_F16_e64>;
} // AddedComplexity = 9

let OtherPredicates = [HasMadMacF32Insts, NoFP32Denormals] in
def : GCNPat <
      (f32 (fadd (AMDGPUfmul_legacy (VOP3Mods f32:$src0, i32:$src0_mod),
                                    (VOP3Mods f32:$src1, i32:$src1_mod)),
                 (VOP3Mods f32:$src2, i32:$src2_mod))),
      (V_MAD_LEGACY_F32_e64 $src0_mod, $src0, $src1_mod, $src1,
                            $src2_mod, $src2, DSTCLAMP.NONE, DSTOMOD.NONE)
>;

class VOPSelectModsPat <ValueType vt> : GCNPat <
  (vt (select i1:$src0, (VOP3ModsNonCanonicalizing vt:$src1, i32:$src1_mods),
                        (VOP3ModsNonCanonicalizing vt:$src2, i32:$src2_mods))),
  (V_CNDMASK_B32_e64 FP32InputMods:$src2_mods, VSrc_b32:$src2,
                     FP32InputMods:$src1_mods, VSrc_b32:$src1, SSrc_i1:$src0)
>;

class VOPSelectPat <ValueType vt> : GCNPat <
  (vt (select i1:$src0, vt:$src1, vt:$src2)),
  (V_CNDMASK_B32_e64 0, VSrc_b32:$src2, 0, VSrc_b32:$src1, SSrc_i1:$src0)
>;

def : VOPSelectModsPat <i32>;
def : VOPSelectModsPat <f32>;
def : VOPSelectPat <f16>;
def : VOPSelectPat <i16>;

let AddedComplexity = 1 in {
def : GCNPat <
  (i32 (add (i32 (DivergentUnaryFrag<ctpop> i32:$popcnt)), i32:$val)),
  (V_BCNT_U32_B32_e64 $popcnt, $val)
>;
}

def : GCNPat <
  (i32 (DivergentUnaryFrag<ctpop> i32:$popcnt)),
  (V_BCNT_U32_B32_e64 VSrc_b32:$popcnt, (i32 0))
>;

def : GCNPat <
  (i16 (add (i16 (trunc (i32 (DivergentUnaryFrag<ctpop> i32:$popcnt)))), i16:$val)),
  (V_BCNT_U32_B32_e64 $popcnt, $val)
>;

def : GCNPat <
  (i64 (DivergentUnaryFrag<ctpop> i64:$src)),
  (REG_SEQUENCE VReg_64,
    (V_BCNT_U32_B32_e64 (i32 (EXTRACT_SUBREG i64:$src, sub1)),
      (i32 (V_BCNT_U32_B32_e64 (i32 (EXTRACT_SUBREG i64:$src, sub0)), (i32 0)))), sub0,
    (i32 (V_MOV_B32_e32 (i32 0))), sub1)
>;
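
// In the i64 pattern above, the low half is counted first, that count is then
// fed as the add operand of the bcnt of the high half, and the combined
// 32-bit result is widened to 64 bits by pairing it with a zero in the high
// sub-register of the REG_SEQUENCE.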

/********** ============================================ **********/
/********** Extraction, Insertion, Building and Casting  **********/
/********** ============================================ **********/

// Special case for 2 element vectors. REG_SEQUENCE produces better code
// than an INSERT_SUBREG.
multiclass Insert_Element_V2<RegisterClass RC, ValueType elem_type, ValueType vec_type> {
  def : GCNPat <
    (insertelt vec_type:$vec, elem_type:$elem, 0),
    (REG_SEQUENCE RC, $elem, sub0, (elem_type (EXTRACT_SUBREG $vec, sub1)), sub1)
  >;

  def : GCNPat <
    (insertelt vec_type:$vec, elem_type:$elem, 1),
    (REG_SEQUENCE RC, (elem_type (EXTRACT_SUBREG $vec, sub0)), sub0, $elem, sub1)
  >;
}

foreach Index = 0-1 in {
  def Extract_Element_v2i32_#Index : Extract_Element <
    i32, v2i32, Index, !cast<SubRegIndex>(sub#Index)
  >;

  def Extract_Element_v2f32_#Index : Extract_Element <
    f32, v2f32, Index, !cast<SubRegIndex>(sub#Index)
  >;
}

defm : Insert_Element_V2 <SReg_64, i32, v2i32>;
defm : Insert_Element_V2 <SReg_64, f32, v2f32>;

foreach Index = 0-2 in {
  def Extract_Element_v3i32_#Index : Extract_Element <
    i32, v3i32, Index, !cast<SubRegIndex>(sub#Index)
  >;
  def Insert_Element_v3i32_#Index : Insert_Element <
    i32, v3i32, Index, !cast<SubRegIndex>(sub#Index)
  >;

  def Extract_Element_v3f32_#Index : Extract_Element <
    f32, v3f32, Index, !cast<SubRegIndex>(sub#Index)
  >;
  def Insert_Element_v3f32_#Index : Insert_Element <
    f32, v3f32, Index, !cast<SubRegIndex>(sub#Index)
  >;
}

foreach Index = 0-3 in {
  def Extract_Element_v4i32_#Index : Extract_Element <
    i32, v4i32, Index, !cast<SubRegIndex>(sub#Index)
  >;
  def Insert_Element_v4i32_#Index : Insert_Element <
    i32, v4i32, Index, !cast<SubRegIndex>(sub#Index)
  >;

  def Extract_Element_v4f32_#Index : Extract_Element <
    f32, v4f32, Index, !cast<SubRegIndex>(sub#Index)
  >;
  def Insert_Element_v4f32_#Index : Insert_Element <
    f32, v4f32, Index, !cast<SubRegIndex>(sub#Index)
  >;
}

foreach Index = 0-4 in {
  def Extract_Element_v5i32_#Index : Extract_Element <
    i32, v5i32, Index, !cast<SubRegIndex>(sub#Index)
  >;
  def Insert_Element_v5i32_#Index : Insert_Element <
    i32, v5i32, Index, !cast<SubRegIndex>(sub#Index)
  >;

  def Extract_Element_v5f32_#Index : Extract_Element <
    f32, v5f32, Index, !cast<SubRegIndex>(sub#Index)
  >;
  def Insert_Element_v5f32_#Index : Insert_Element <
    f32, v5f32, Index, !cast<SubRegIndex>(sub#Index)
  >;
}

foreach Index = 0-5 in {
  def Extract_Element_v6i32_#Index : Extract_Element <
    i32, v6i32, Index, !cast<SubRegIndex>(sub#Index)
  >;
  def Insert_Element_v6i32_#Index : Insert_Element <
    i32, v6i32, Index, !cast<SubRegIndex>(sub#Index)
  >;

  def Extract_Element_v6f32_#Index : Extract_Element <
    f32, v6f32, Index, !cast<SubRegIndex>(sub#Index)
  >;
  def Insert_Element_v6f32_#Index : Insert_Element <
    f32, v6f32, Index, !cast<SubRegIndex>(sub#Index)
  >;
}

foreach Index = 0-6 in {
  def Extract_Element_v7i32_#Index : Extract_Element <
    i32, v7i32, Index, !cast<SubRegIndex>(sub#Index)
  >;
  def Insert_Element_v7i32_#Index : Insert_Element <
    i32, v7i32, Index, !cast<SubRegIndex>(sub#Index)
  >;

  def Extract_Element_v7f32_#Index : Extract_Element <
    f32, v7f32, Index, !cast<SubRegIndex>(sub#Index)
  >;
  def Insert_Element_v7f32_#Index : Insert_Element <
    f32, v7f32, Index, !cast<SubRegIndex>(sub#Index)
  >;
}
foreach Index = 0-7 in {
  def Extract_Element_v8i32_#Index : Extract_Element <
    i32, v8i32, Index, !cast<SubRegIndex>(sub#Index)
  >;
  def Insert_Element_v8i32_#Index : Insert_Element <
    i32, v8i32, Index, !cast<SubRegIndex>(sub#Index)
  >;

  def Extract_Element_v8f32_#Index : Extract_Element <
    f32, v8f32, Index, !cast<SubRegIndex>(sub#Index)
  >;
  def Insert_Element_v8f32_#Index : Insert_Element <
    f32, v8f32, Index, !cast<SubRegIndex>(sub#Index)
  >;
}

foreach Index = 0-8 in {
  def Extract_Element_v9i32_#Index : Extract_Element <
    i32, v9i32, Index, !cast<SubRegIndex>(sub#Index)
  >;
  def Insert_Element_v9i32_#Index : Insert_Element <
    i32, v9i32, Index, !cast<SubRegIndex>(sub#Index)
  >;

  def Extract_Element_v9f32_#Index : Extract_Element <
    f32, v9f32, Index, !cast<SubRegIndex>(sub#Index)
  >;
  def Insert_Element_v9f32_#Index : Insert_Element <
    f32, v9f32, Index, !cast<SubRegIndex>(sub#Index)
  >;
}

foreach Index = 0-9 in {
  def Extract_Element_v10i32_#Index : Extract_Element <
    i32, v10i32, Index, !cast<SubRegIndex>(sub#Index)
  >;
  def Insert_Element_v10i32_#Index : Insert_Element <
    i32, v10i32, Index, !cast<SubRegIndex>(sub#Index)
  >;

  def Extract_Element_v10f32_#Index : Extract_Element <
    f32, v10f32, Index, !cast<SubRegIndex>(sub#Index)
  >;
  def Insert_Element_v10f32_#Index : Insert_Element <
    f32, v10f32, Index, !cast<SubRegIndex>(sub#Index)
  >;
}

foreach Index = 0-10 in {
  def Extract_Element_v11i32_#Index : Extract_Element <
    i32, v11i32, Index, !cast<SubRegIndex>(sub#Index)
  >;
  def Insert_Element_v11i32_#Index : Insert_Element <
    i32, v11i32, Index, !cast<SubRegIndex>(sub#Index)
  >;

  def Extract_Element_v11f32_#Index : Extract_Element <
    f32, v11f32, Index, !cast<SubRegIndex>(sub#Index)
  >;
  def Insert_Element_v11f32_#Index : Insert_Element <
    f32, v11f32, Index, !cast<SubRegIndex>(sub#Index)
  >;
}

foreach Index = 0-11 in {
  def Extract_Element_v12i32_#Index : Extract_Element <
    i32, v12i32, Index, !cast<SubRegIndex>(sub#Index)
  >;
  def Insert_Element_v12i32_#Index : Insert_Element <
    i32, v12i32, Index, !cast<SubRegIndex>(sub#Index)
  >;

  def Extract_Element_v12f32_#Index : Extract_Element <
    f32, v12f32, Index, !cast<SubRegIndex>(sub#Index)
  >;
  def Insert_Element_v12f32_#Index : Insert_Element <
    f32, v12f32, Index, !cast<SubRegIndex>(sub#Index)
  >;
}

foreach Index = 0-15 in {
  def Extract_Element_v16i32_#Index : Extract_Element <
    i32, v16i32, Index, !cast<SubRegIndex>(sub#Index)
  >;
  def Insert_Element_v16i32_#Index : Insert_Element <
    i32, v16i32, Index, !cast<SubRegIndex>(sub#Index)
  >;

  def Extract_Element_v16f32_#Index : Extract_Element <
    f32, v16f32, Index, !cast<SubRegIndex>(sub#Index)
  >;
  def Insert_Element_v16f32_#Index : Insert_Element <
    f32, v16f32, Index, !cast<SubRegIndex>(sub#Index)
  >;
}


foreach Index = 0-31 in {
  def Extract_Element_v32i32_#Index : Extract_Element <
    i32, v32i32, Index, !cast<SubRegIndex>(sub#Index)
  >;

  def Insert_Element_v32i32_#Index : Insert_Element <
    i32, v32i32, Index, !cast<SubRegIndex>(sub#Index)
  >;

  def Extract_Element_v32f32_#Index : Extract_Element <
    f32, v32f32, Index, !cast<SubRegIndex>(sub#Index)
  >;

  def Insert_Element_v32f32_#Index : Insert_Element <
    f32, v32f32, Index, !cast<SubRegIndex>(sub#Index)
  >;
}

// FIXME: Why are only some of these type combinations defined for SReg and
// VReg?
// 16-bit bitcast
def : BitConvert <i16, f16, VGPR_32>;
def : BitConvert <f16, i16, VGPR_32>;
def : BitConvert <f16, bf16, VGPR_32>;
def : BitConvert <bf16, f16, VGPR_32>;

def : BitConvert <i16, f16, SReg_32>;
def : BitConvert <f16, i16, SReg_32>;
def : BitConvert <f16, bf16, SReg_32>;
def : BitConvert <bf16, f16, SReg_32>;

def : BitConvert <i16, bf16, VGPR_32>;
def : BitConvert <bf16, i16, VGPR_32>;
def : BitConvert <i16, bf16, SReg_32>;
def : BitConvert <bf16, i16, SReg_32>;

// 32-bit bitcast
def : BitConvert <i32, f32, VGPR_32>;
def : BitConvert <f32, i32, VGPR_32>;
def : BitConvert <i32, f32, SReg_32>;
def : BitConvert <f32, i32, SReg_32>;
def : BitConvert <v2i16, i32, SReg_32>;
def : BitConvert <i32, v2i16, SReg_32>;
def : BitConvert <v2f16, i32, SReg_32>;
def : BitConvert <i32, v2f16, SReg_32>;
def : BitConvert <v2i16, v2f16, SReg_32>;
def : BitConvert <v2f16, v2i16, SReg_32>;
def : BitConvert <v2f16, f32, SReg_32>;
def : BitConvert <f32, v2f16, SReg_32>;
def : BitConvert <v2i16, f32, SReg_32>;
def : BitConvert <f32, v2i16, SReg_32>;
def : BitConvert <v2bf16, i32, SReg_32>;
def : BitConvert <i32, v2bf16, SReg_32>;
def : BitConvert <v2bf16, i32, VGPR_32>;
def : BitConvert <i32, v2bf16, VGPR_32>;
def : BitConvert <v2bf16, v2i16, SReg_32>;
def : BitConvert <v2i16, v2bf16, SReg_32>;
def : BitConvert <v2bf16, v2i16, VGPR_32>;
def : BitConvert <v2i16, v2bf16, VGPR_32>;
def : BitConvert <v2bf16, v2f16, SReg_32>;
def : BitConvert <v2f16, v2bf16, SReg_32>;
def : BitConvert <v2bf16, v2f16, VGPR_32>;
def : BitConvert <v2f16, v2bf16, VGPR_32>;
def : BitConvert <f32, v2bf16, VGPR_32>;
def : BitConvert <v2bf16, f32, VGPR_32>;
def : BitConvert <f32, v2bf16, SReg_32>;
def : BitConvert <v2bf16, f32, SReg_32>;


// 64-bit bitcast
def : BitConvert <i64, f64, VReg_64>;
def : BitConvert <f64, i64, VReg_64>;
def : BitConvert <v2i32, v2f32, VReg_64>;
def : BitConvert <v2f32, v2i32, VReg_64>;
def : BitConvert <i64, v2i32, VReg_64>;
def : BitConvert <v2i32, i64, VReg_64>;
def : BitConvert <i64, v2f32, VReg_64>;
def : BitConvert <v2f32, i64, VReg_64>;
def : BitConvert <f64, v2f32, VReg_64>;
def : BitConvert <v2f32, f64, VReg_64>;
def : BitConvert <f64, v2i32, VReg_64>;
def : BitConvert <v2i32, f64, VReg_64>;
def : BitConvert <v4i16, v4f16, VReg_64>;
def : BitConvert <v4f16, v4i16, VReg_64>;
def : BitConvert <v4bf16, v2i32, VReg_64>;
def : BitConvert <v2i32, v4bf16, VReg_64>;
def : BitConvert <v4bf16, i64, VReg_64>;
def : BitConvert <i64, v4bf16, VReg_64>;
def : BitConvert <v4bf16, v4i16, VReg_64>;
def : BitConvert <v4i16, v4bf16, VReg_64>;
def : BitConvert <v4bf16, v4f16, VReg_64>;
def : BitConvert <v4f16, v4bf16, VReg_64>;
def : BitConvert <v4bf16, v2f32, VReg_64>;
def : BitConvert <v2f32, v4bf16, VReg_64>;
def : BitConvert <v4bf16, f64, VReg_64>;
def : BitConvert <f64, v4bf16, VReg_64>;


// FIXME: Make SGPR
def : BitConvert <v2i32, v4f16, VReg_64>;
def : BitConvert <v4f16, v2i32, VReg_64>;
BitConvert <v2i32, v4f16, VReg_64>; 1586def : BitConvert <v2i32, v4i16, VReg_64>; 1587def : BitConvert <v4i16, v2i32, VReg_64>; 1588def : BitConvert <v2f32, v4f16, VReg_64>; 1589def : BitConvert <v4f16, v2f32, VReg_64>; 1590def : BitConvert <v2f32, v4i16, VReg_64>; 1591def : BitConvert <v4i16, v2f32, VReg_64>; 1592def : BitConvert <v4i16, f64, VReg_64>; 1593def : BitConvert <v4f16, f64, VReg_64>; 1594def : BitConvert <f64, v4i16, VReg_64>; 1595def : BitConvert <f64, v4f16, VReg_64>; 1596def : BitConvert <v4i16, i64, VReg_64>; 1597def : BitConvert <v4f16, i64, VReg_64>; 1598def : BitConvert <i64, v4i16, VReg_64>; 1599def : BitConvert <i64, v4f16, VReg_64>; 1600 1601def : BitConvert <v4i32, v4f32, VReg_128>; 1602def : BitConvert <v4f32, v4i32, VReg_128>; 1603 1604// 96-bit bitcast 1605def : BitConvert <v3i32, v3f32, SGPR_96>; 1606def : BitConvert <v3f32, v3i32, SGPR_96>; 1607 1608// 128-bit bitcast 1609def : BitConvert <v2i64, v4i32, SReg_128>; 1610def : BitConvert <v4i32, v2i64, SReg_128>; 1611def : BitConvert <v2f64, v4f32, VReg_128>; 1612def : BitConvert <v2f64, v4i32, VReg_128>; 1613def : BitConvert <v4f32, v2f64, VReg_128>; 1614def : BitConvert <v4i32, v2f64, VReg_128>; 1615def : BitConvert <v2i64, v2f64, VReg_128>; 1616def : BitConvert <v2f64, v2i64, VReg_128>; 1617def : BitConvert <v4f32, v2i64, VReg_128>; 1618def : BitConvert <v2i64, v4f32, VReg_128>; 1619def : BitConvert <v8i16, v4i32, SReg_128>; 1620def : BitConvert <v4i32, v8i16, SReg_128>; 1621def : BitConvert <v8f16, v4f32, VReg_128>; 1622def : BitConvert <v8f16, v4i32, VReg_128>; 1623def : BitConvert <v4f32, v8f16, VReg_128>; 1624def : BitConvert <v4i32, v8f16, VReg_128>; 1625def : BitConvert <v8i16, v8f16, VReg_128>; 1626def : BitConvert <v8f16, v8i16, VReg_128>; 1627def : BitConvert <v4f32, v8i16, VReg_128>; 1628def : BitConvert <v8i16, v4f32, VReg_128>; 1629def : BitConvert <v8i16, v8f16, SReg_128>; 1630def : BitConvert <v8i16, v2i64, SReg_128>; 1631def : BitConvert <v8i16, v2f64, SReg_128>; 1632def : BitConvert <v8f16, v2i64, SReg_128>; 1633def : BitConvert <v8f16, v2f64, SReg_128>; 1634def : BitConvert <v8f16, v8i16, SReg_128>; 1635def : BitConvert <v2i64, v8i16, SReg_128>; 1636def : BitConvert <v2f64, v8i16, SReg_128>; 1637def : BitConvert <v2i64, v8f16, SReg_128>; 1638def : BitConvert <v2f64, v8f16, SReg_128>; 1639 1640def : BitConvert <v4i32, v8bf16, SReg_128>; 1641def : BitConvert <v8bf16, v4i32, SReg_128>; 1642def : BitConvert <v4i32, v8bf16, VReg_128>; 1643def : BitConvert <v8bf16, v4i32, VReg_128>; 1644 1645def : BitConvert <v4f32, v8bf16, SReg_128>; 1646def : BitConvert <v8bf16, v4f32, SReg_128>; 1647def : BitConvert <v4f32, v8bf16, VReg_128>; 1648def : BitConvert <v8bf16, v4f32, VReg_128>; 1649 1650def : BitConvert <v8i16, v8bf16, SReg_128>; 1651def : BitConvert <v8bf16, v8i16, SReg_128>; 1652def : BitConvert <v8i16, v8bf16, VReg_128>; 1653def : BitConvert <v8bf16, v8i16, VReg_128>; 1654 1655def : BitConvert <v8f16, v8bf16, SReg_128>; 1656def : BitConvert <v8bf16, v8f16, SReg_128>; 1657def : BitConvert <v8f16, v8bf16, VReg_128>; 1658def : BitConvert <v8bf16, v8f16, VReg_128>; 1659 1660def : BitConvert <v2f64, v8bf16, SReg_128>; 1661def : BitConvert <v8bf16, v2f64, SReg_128>; 1662def : BitConvert <v2f64, v8bf16, VReg_128>; 1663def : BitConvert <v8bf16, v2f64, VReg_128>; 1664 1665def : BitConvert <v2i64, v8bf16, SReg_128>; 1666def : BitConvert <v8bf16, v2i64, SReg_128>; 1667def : BitConvert <v2i64, v8bf16, VReg_128>; 1668def : BitConvert <v8bf16, v2i64, VReg_128>; 1669 1670 1671// 160-bit bitcast 1672def : 
BitConvert <v5i32, v5f32, SReg_160>; 1673def : BitConvert <v5f32, v5i32, SReg_160>; 1674def : BitConvert <v5i32, v5f32, VReg_160>; 1675def : BitConvert <v5f32, v5i32, VReg_160>; 1676 1677// 192-bit bitcast 1678def : BitConvert <v6i32, v6f32, SReg_192>; 1679def : BitConvert <v6f32, v6i32, SReg_192>; 1680def : BitConvert <v6i32, v6f32, VReg_192>; 1681def : BitConvert <v6f32, v6i32, VReg_192>; 1682def : BitConvert <v3i64, v3f64, VReg_192>; 1683def : BitConvert <v3f64, v3i64, VReg_192>; 1684def : BitConvert <v3i64, v6i32, VReg_192>; 1685def : BitConvert <v3i64, v6f32, VReg_192>; 1686def : BitConvert <v3f64, v6i32, VReg_192>; 1687def : BitConvert <v3f64, v6f32, VReg_192>; 1688def : BitConvert <v6i32, v3i64, VReg_192>; 1689def : BitConvert <v6f32, v3i64, VReg_192>; 1690def : BitConvert <v6i32, v3f64, VReg_192>; 1691def : BitConvert <v6f32, v3f64, VReg_192>; 1692 1693// 224-bit bitcast 1694def : BitConvert <v7i32, v7f32, SReg_224>; 1695def : BitConvert <v7f32, v7i32, SReg_224>; 1696def : BitConvert <v7i32, v7f32, VReg_224>; 1697def : BitConvert <v7f32, v7i32, VReg_224>; 1698 1699// 256-bit bitcast 1700def : BitConvert <v8i32, v8f32, SReg_256>; 1701def : BitConvert <v8f32, v8i32, SReg_256>; 1702def : BitConvert <v8i32, v8f32, VReg_256>; 1703def : BitConvert <v8f32, v8i32, VReg_256>; 1704def : BitConvert <v4i64, v4f64, VReg_256>; 1705def : BitConvert <v4f64, v4i64, VReg_256>; 1706def : BitConvert <v4i64, v8i32, VReg_256>; 1707def : BitConvert <v4i64, v8f32, VReg_256>; 1708def : BitConvert <v4f64, v8i32, VReg_256>; 1709def : BitConvert <v4f64, v8f32, VReg_256>; 1710def : BitConvert <v8i32, v4i64, VReg_256>; 1711def : BitConvert <v8f32, v4i64, VReg_256>; 1712def : BitConvert <v8i32, v4f64, VReg_256>; 1713def : BitConvert <v8f32, v4f64, VReg_256>; 1714def : BitConvert <v16i16, v16f16, SReg_256>; 1715def : BitConvert <v16f16, v16i16, SReg_256>; 1716def : BitConvert <v16i16, v16f16, VReg_256>; 1717def : BitConvert <v16f16, v16i16, VReg_256>; 1718def : BitConvert <v16f16, v8i32, VReg_256>; 1719def : BitConvert <v16i16, v8i32, VReg_256>; 1720def : BitConvert <v16f16, v8f32, VReg_256>; 1721def : BitConvert <v16i16, v8f32, VReg_256>; 1722def : BitConvert <v8i32, v16f16, VReg_256>; 1723def : BitConvert <v8i32, v16i16, VReg_256>; 1724def : BitConvert <v8f32, v16f16, VReg_256>; 1725def : BitConvert <v8f32, v16i16, VReg_256>; 1726def : BitConvert <v16f16, v4i64, VReg_256>; 1727def : BitConvert <v16i16, v4i64, VReg_256>; 1728def : BitConvert <v16f16, v4f64, VReg_256>; 1729def : BitConvert <v16i16, v4f64, VReg_256>; 1730def : BitConvert <v4i64, v16f16, VReg_256>; 1731def : BitConvert <v4i64, v16i16, VReg_256>; 1732def : BitConvert <v4f64, v16f16, VReg_256>; 1733def : BitConvert <v4f64, v16i16, VReg_256>; 1734 1735 1736def : BitConvert <v8i32, v16bf16, VReg_256>; 1737def : BitConvert <v16bf16, v8i32, VReg_256>; 1738def : BitConvert <v8f32, v16bf16, VReg_256>; 1739def : BitConvert <v16bf16, v8f32, VReg_256>; 1740def : BitConvert <v4i64, v16bf16, VReg_256>; 1741def : BitConvert <v16bf16, v4i64, VReg_256>; 1742def : BitConvert <v4f64, v16bf16, VReg_256>; 1743def : BitConvert <v16bf16, v4f64, VReg_256>; 1744 1745 1746 1747def : BitConvert <v16i16, v16bf16, SReg_256>; 1748def : BitConvert <v16bf16, v16i16, SReg_256>; 1749def : BitConvert <v16i16, v16bf16, VReg_256>; 1750def : BitConvert <v16bf16, v16i16, VReg_256>; 1751 1752def : BitConvert <v16f16, v16bf16, SReg_256>; 1753def : BitConvert <v16bf16, v16f16, SReg_256>; 1754def : BitConvert <v16f16, v16bf16, VReg_256>; 1755def : BitConvert <v16bf16, v16f16, VReg_256>; 
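
// Note: each BitConvert pattern is a pure reinterpretation between two types
// of the same bit width; it is expected to lower to nothing more than a copy
// on the listed register class, so e.g. (v16f16 (bitconvert v8i32:$src))
// keeps the value in the same register tuple and only changes the type.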

// 288-bit bitcast
def : BitConvert <v9i32, v9f32, SReg_288>;
def : BitConvert <v9f32, v9i32, SReg_288>;
def : BitConvert <v9i32, v9f32, VReg_288>;
def : BitConvert <v9f32, v9i32, VReg_288>;

// 320-bit bitcast
def : BitConvert <v10i32, v10f32, SReg_320>;
def : BitConvert <v10f32, v10i32, SReg_320>;
def : BitConvert <v10i32, v10f32, VReg_320>;
def : BitConvert <v10f32, v10i32, VReg_320>;

// 352-bit bitcast
def : BitConvert <v11i32, v11f32, SReg_352>;
def : BitConvert <v11f32, v11i32, SReg_352>;
def : BitConvert <v11i32, v11f32, VReg_352>;
def : BitConvert <v11f32, v11i32, VReg_352>;

// 384-bit bitcast
def : BitConvert <v12i32, v12f32, SReg_384>;
def : BitConvert <v12f32, v12i32, SReg_384>;
def : BitConvert <v12i32, v12f32, VReg_384>;
def : BitConvert <v12f32, v12i32, VReg_384>;

// 512-bit bitcast
def : BitConvert <v32f16, v32i16, VReg_512>;
def : BitConvert <v32i16, v32f16, VReg_512>;
def : BitConvert <v32f16, v16i32, VReg_512>;
def : BitConvert <v32f16, v16f32, VReg_512>;
def : BitConvert <v16f32, v32f16, VReg_512>;
def : BitConvert <v16i32, v32f16, VReg_512>;
def : BitConvert <v32i16, v16i32, VReg_512>;
def : BitConvert <v32i16, v16f32, VReg_512>;
def : BitConvert <v16f32, v32i16, VReg_512>;
def : BitConvert <v16i32, v32i16, VReg_512>;
def : BitConvert <v16i32, v16f32, VReg_512>;
def : BitConvert <v16f32, v16i32, VReg_512>;
def : BitConvert <v8i64, v8f64, VReg_512>;
def : BitConvert <v8f64, v8i64, VReg_512>;
def : BitConvert <v8i64, v16i32, VReg_512>;
def : BitConvert <v8f64, v16i32, VReg_512>;
def : BitConvert <v16i32, v8i64, VReg_512>;
def : BitConvert <v16i32, v8f64, VReg_512>;
def : BitConvert <v8i64, v16f32, VReg_512>;
def : BitConvert <v8f64, v16f32, VReg_512>;
def : BitConvert <v16f32, v8i64, VReg_512>;
def : BitConvert <v16f32, v8f64, VReg_512>;

def : BitConvert <v32bf16, v32i16, VReg_512>;
def : BitConvert <v32i16, v32bf16, VReg_512>;
def : BitConvert <v32bf16, v32i16, SReg_512>;
def : BitConvert <v32i16, v32bf16, SReg_512>;

def : BitConvert <v32bf16, v32f16, VReg_512>;
def : BitConvert <v32f16, v32bf16, VReg_512>;
def : BitConvert <v32bf16, v32f16, SReg_512>;
def : BitConvert <v32f16, v32bf16, SReg_512>;

def : BitConvert <v32bf16, v16i32, VReg_512>;
def : BitConvert <v16i32, v32bf16, VReg_512>;
def : BitConvert <v32bf16, v16i32, SReg_512>;
def : BitConvert <v16i32, v32bf16, SReg_512>;

def : BitConvert <v32bf16, v16f32, VReg_512>;
def : BitConvert <v16f32, v32bf16, VReg_512>;
def : BitConvert <v32bf16, v16f32, SReg_512>;
def : BitConvert <v16f32, v32bf16, SReg_512>;

def : BitConvert <v32bf16, v8f64, VReg_512>;
def : BitConvert <v8f64, v32bf16, VReg_512>;
def : BitConvert <v32bf16, v8f64, SReg_512>;
def : BitConvert <v8f64, v32bf16, SReg_512>;

def : BitConvert <v32bf16, v8i64, VReg_512>;
def : BitConvert <v8i64, v32bf16, VReg_512>;
def : BitConvert <v32bf16, v8i64, SReg_512>;
def : BitConvert <v8i64, v32bf16, SReg_512>;

// 1024-bit bitcast
def : BitConvert <v32i32, v32f32, VReg_1024>;
def : BitConvert <v32f32, v32i32, VReg_1024>;
def : BitConvert <v16i64, v16f64, VReg_1024>;
def : BitConvert <v16f64, v16i64, VReg_1024>;
def : BitConvert <v16i64, v32i32, VReg_1024>;
def : BitConvert <v32i32, v16i64, VReg_1024>;
def : BitConvert <v16f64, v32f32, VReg_1024>;
def : BitConvert <v32f32, v16f64, VReg_1024>;
def : BitConvert <v16i64, v32f32, VReg_1024>;
def : BitConvert <v32i32, v16f64, VReg_1024>;
def : BitConvert <v16f64, v32i32, VReg_1024>;
def : BitConvert <v32f32, v16i64, VReg_1024>;

/********** =================== **********/
/********** Src & Dst modifiers **********/
/********** =================== **********/

// If denormals are not enabled, it only impacts the compare of the
// inputs. The output result is not flushed.
class ClampPat<Instruction inst, ValueType vt> : GCNPat <
  (vt (AMDGPUclamp (VOP3Mods vt:$src0, i32:$src0_modifiers))),
  (inst i32:$src0_modifiers, vt:$src0,
        i32:$src0_modifiers, vt:$src0, DSTCLAMP.ENABLE, DSTOMOD.NONE)
>;

def : ClampPat<V_MAX_F32_e64, f32>;
let SubtargetPredicate = isNotGFX12Plus in
def : ClampPat<V_MAX_F64_e64, f64>;
let SubtargetPredicate = isGFX12Plus in
def : ClampPat<V_MAX_NUM_F64_e64, f64>;
let SubtargetPredicate = NotHasTrue16BitInsts in
def : ClampPat<V_MAX_F16_e64, f16>;
let SubtargetPredicate = UseRealTrue16Insts in
def : ClampPat<V_MAX_F16_t16_e64, f16>;
let SubtargetPredicate = UseFakeTrue16Insts in
def : ClampPat<V_MAX_F16_fake16_e64, f16>;

let SubtargetPredicate = HasVOP3PInsts in {
def : GCNPat <
  (v2f16 (AMDGPUclamp (VOP3PMods v2f16:$src0, i32:$src0_modifiers))),
  (V_PK_MAX_F16 $src0_modifiers, $src0,
                $src0_modifiers, $src0, DSTCLAMP.ENABLE)
>;
}

/********** ================================ **********/
/********** Floating point absolute/negative **********/
/********** ================================ **********/

def : GCNPat <
  (UniformUnaryFrag<fneg> (fabs (f32 SReg_32:$src))),
  (S_OR_B32 SReg_32:$src, (S_MOV_B32 (i32 0x80000000))) // Set sign bit
>;

def : GCNPat <
  (UniformUnaryFrag<fabs> (f32 SReg_32:$src)),
  (S_AND_B32 SReg_32:$src, (S_MOV_B32 (i32 0x7fffffff)))
>;

def : GCNPat <
  (UniformUnaryFrag<fneg> (f32 SReg_32:$src)),
  (S_XOR_B32 SReg_32:$src, (S_MOV_B32 (i32 0x80000000)))
>;

foreach fp16vt = [f16, bf16] in {
def : GCNPat <
  (UniformUnaryFrag<fneg> (fp16vt SReg_32:$src)),
  (S_XOR_B32 SReg_32:$src, (S_MOV_B32 (i32 0x00008000)))
>;

def : GCNPat <
  (UniformUnaryFrag<fabs> (fp16vt SReg_32:$src)),
  (S_AND_B32 SReg_32:$src, (S_MOV_B32 (i32 0x00007fff)))
>;

def : GCNPat <
  (UniformUnaryFrag<fneg> (fabs (fp16vt SReg_32:$src))),
  (S_OR_B32 SReg_32:$src, (S_MOV_B32 (i32 0x00008000))) // Set sign bit
>;
} // End foreach fp16vt = ...

def : GCNPat <
  (UniformUnaryFrag<fneg> (v2f16 SReg_32:$src)),
  (S_XOR_B32 SReg_32:$src, (S_MOV_B32 (i32 0x80008000)))
>;

def : GCNPat <
  (UniformUnaryFrag<fabs> (v2f16 SReg_32:$src)),
  (S_AND_B32 SReg_32:$src, (S_MOV_B32 (i32 0x7fff7fff)))
>;

// This is really (fneg (fabs v2f16:$src))
//
// fabs is not reported as free because there is no modifier for it in
// VOP3P instructions, so it is turned into the bit op.
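// For a packed v2f16 value this works out bitwise as:
//   fabs(x)       -> x & 0x7fff7fff   (clear both per-half sign bits)
//   fneg(fabs(x)) -> x | 0x80008000   (then set both sign bits)
// so the and_oneuse of 0x7fff7fff below is recognized as the already-expanded
// fabs and the combined operation folds to a single s_or_b32.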
def : GCNPat <
  (UniformUnaryFrag<fneg> (v2f16 (bitconvert (and_oneuse (i32 SReg_32:$src), 0x7fff7fff)))),
  (S_OR_B32 SReg_32:$src, (S_MOV_B32 (i32 0x80008000))) // Set sign bit
>;

def : GCNPat <
  (UniformUnaryFrag<fneg> (v2f16 (fabs SReg_32:$src))),
  (S_OR_B32 SReg_32:$src, (S_MOV_B32 (i32 0x80008000))) // Set sign bit
>;

// COPY_TO_REGCLASS is needed to avoid using SCC from S_XOR_B32 instead
// of the real value.
def : GCNPat <
  (UniformUnaryFrag<fneg> (v2f32 SReg_64:$src)),
  (v2f32 (REG_SEQUENCE SReg_64,
         (f32 (COPY_TO_REGCLASS (S_XOR_B32 (i32 (EXTRACT_SUBREG $src, sub0)),
                                           (i32 (S_MOV_B32 (i32 0x80000000)))),
                                SReg_32)), sub0,
         (f32 (COPY_TO_REGCLASS (S_XOR_B32 (i32 (EXTRACT_SUBREG $src, sub1)),
                                           (i32 (S_MOV_B32 (i32 0x80000000)))),
                                SReg_32)), sub1))
>;

def : GCNPat <
  (UniformUnaryFrag<fabs> (v2f32 SReg_64:$src)),
  (v2f32 (REG_SEQUENCE SReg_64,
         (f32 (COPY_TO_REGCLASS (S_AND_B32 (i32 (EXTRACT_SUBREG $src, sub0)),
                                           (i32 (S_MOV_B32 (i32 0x7fffffff)))),
                                SReg_32)), sub0,
         (f32 (COPY_TO_REGCLASS (S_AND_B32 (i32 (EXTRACT_SUBREG $src, sub1)),
                                           (i32 (S_MOV_B32 (i32 0x7fffffff)))),
                                SReg_32)), sub1))
>;

def : GCNPat <
  (UniformUnaryFrag<fneg> (fabs (v2f32 SReg_64:$src))),
  (v2f32 (REG_SEQUENCE SReg_64,
         (f32 (COPY_TO_REGCLASS (S_OR_B32 (i32 (EXTRACT_SUBREG $src, sub0)),
                                          (i32 (S_MOV_B32 (i32 0x80000000)))),
                                SReg_32)), sub0,
         (f32 (COPY_TO_REGCLASS (S_OR_B32 (i32 (EXTRACT_SUBREG $src, sub1)),
                                          (i32 (S_MOV_B32 (i32 0x80000000)))),
                                SReg_32)), sub1))
>;

// FIXME: Use S_BITSET0_B32/B64?
def : GCNPat <
  (UniformUnaryFrag<fabs> (f64 SReg_64:$src)),
  (REG_SEQUENCE SReg_64,
    (i32 (EXTRACT_SUBREG SReg_64:$src, sub0)),
    sub0,
    (i32 (COPY_TO_REGCLASS (S_AND_B32 (i32 (EXTRACT_SUBREG SReg_64:$src, sub1)),
                                      (S_MOV_B32 (i32 0x7fffffff))), SReg_32)), // Clear sign bit.
    sub1)
>;

def : GCNPat <
  (UniformUnaryFrag<fneg> (f64 SReg_64:$src)),
  (REG_SEQUENCE SReg_64,
    (i32 (EXTRACT_SUBREG SReg_64:$src, sub0)),
    sub0,
    (i32 (COPY_TO_REGCLASS (S_XOR_B32 (i32 (EXTRACT_SUBREG SReg_64:$src, sub1)),
                                      (i32 (S_MOV_B32 (i32 0x80000000)))), SReg_32)),
    sub1)
>;

def : GCNPat <
  (UniformUnaryFrag<fneg> (fabs (f64 SReg_64:$src))),
  (REG_SEQUENCE SReg_64,
    (i32 (EXTRACT_SUBREG SReg_64:$src, sub0)),
    sub0,
    (i32 (COPY_TO_REGCLASS (S_OR_B32 (i32 (EXTRACT_SUBREG SReg_64:$src, sub1)),
                                     (S_MOV_B32 (i32 0x80000000))), SReg_32)), // Set sign bit.
    sub1)
>;

def : GCNPat <
  (fneg (fabs (f32 VGPR_32:$src))),
  (V_OR_B32_e64 (S_MOV_B32 (i32 0x80000000)), VGPR_32:$src) // Set sign bit
>;

def : GCNPat <
  (fabs (f32 VGPR_32:$src)),
  (V_AND_B32_e64 (S_MOV_B32 (i32 0x7fffffff)), VGPR_32:$src)
>;

def : GCNPat <
  (fneg (f32 VGPR_32:$src)),
  (V_XOR_B32_e64 (S_MOV_B32 (i32 0x80000000)), VGPR_32:$src)
>;

foreach fp16vt = [f16, bf16] in {
def : GCNPat <
  (fabs (fp16vt VGPR_32:$src)),
  (V_AND_B32_e64 (S_MOV_B32 (i32 0x00007fff)), VGPR_32:$src)
>;

def : GCNPat <
  (fneg (fp16vt VGPR_32:$src)),
  (V_XOR_B32_e64 (S_MOV_B32 (i32 0x00008000)), VGPR_32:$src)
>;

def : GCNPat <
  (fneg (fabs (fp16vt VGPR_32:$src))),
  (V_OR_B32_e64 (S_MOV_B32 (i32 0x00008000)), VGPR_32:$src) // Set sign bit
>;
} // End foreach fp16vt = ...

def : GCNPat <
  (fneg (v2f16 VGPR_32:$src)),
  (V_XOR_B32_e64 (S_MOV_B32 (i32 0x80008000)), VGPR_32:$src)
>;

def : GCNPat <
  (fabs (v2f16 VGPR_32:$src)),
  (V_AND_B32_e64 (S_MOV_B32 (i32 0x7fff7fff)), VGPR_32:$src)
>;

def : GCNPat <
  (fneg (v2f16 (fabs VGPR_32:$src))),
  (V_OR_B32_e64 (S_MOV_B32 (i32 0x80008000)), VGPR_32:$src)
>;

def : GCNPat <
  (fabs (f64 VReg_64:$src)),
  (REG_SEQUENCE VReg_64,
    (i32 (EXTRACT_SUBREG VReg_64:$src, sub0)),
    sub0,
    (V_AND_B32_e64 (i32 (S_MOV_B32 (i32 0x7fffffff))),
                   (i32 (EXTRACT_SUBREG VReg_64:$src, sub1))),
    sub1)
>;

def : GCNPat <
  (fneg (f64 VReg_64:$src)),
  (REG_SEQUENCE VReg_64,
    (i32 (EXTRACT_SUBREG VReg_64:$src, sub0)),
    sub0,
    (V_XOR_B32_e64 (i32 (S_MOV_B32 (i32 0x80000000))),
                   (i32 (EXTRACT_SUBREG VReg_64:$src, sub1))),
    sub1)
>;

def : GCNPat <
  (fneg (fabs (f64 VReg_64:$src))),
  (REG_SEQUENCE VReg_64,
    (i32 (EXTRACT_SUBREG VReg_64:$src, sub0)),
    sub0,
    (V_OR_B32_e64 (i32 (S_MOV_B32 (i32 0x80000000))),
                  (i32 (EXTRACT_SUBREG VReg_64:$src, sub1))),
    sub1)
>;

def : GCNPat <
  (DivergentUnaryFrag<fneg> (v2f32 VReg_64:$src)),
  (V_PK_ADD_F32 11 /* OP_SEL_1 | NEG_LO | NEG_HI */, VReg_64:$src,
                11 /* OP_SEL_1 | NEG_LO | NEG_HI */, (i64 0),
                0, 0, 0, 0, 0)
> {
  let SubtargetPredicate = HasPackedFP32Ops;
}

foreach fp16vt = [f16, bf16] in {

def : GCNPat <
  (fcopysign fp16vt:$src0, fp16vt:$src1),
  (V_BFI_B32_e64 (S_MOV_B32 (i32 0x00007fff)), $src0, $src1)
>;

def : GCNPat <
  (fcopysign f32:$src0, fp16vt:$src1),
  (V_BFI_B32_e64 (S_MOV_B32 (i32 0x7fffffff)), $src0,
                 (V_LSHLREV_B32_e64 (i32 16), $src1))
>;

def : GCNPat <
  (fcopysign f64:$src0, fp16vt:$src1),
  (REG_SEQUENCE SReg_64,
    (i32 (EXTRACT_SUBREG $src0, sub0)), sub0,
    (V_BFI_B32_e64 (S_MOV_B32 (i32 0x7fffffff)), (i32 (EXTRACT_SUBREG $src0, sub1)),
                   (V_LSHLREV_B32_e64 (i32 16), $src1)), sub1)
>;

def : GCNPat <
  (fcopysign fp16vt:$src0, f32:$src1),
  (V_BFI_B32_e64 (S_MOV_B32 (i32 0x00007fff)), $src0,
                 (V_LSHRREV_B32_e64 (i32 16), $src1))
>;

def : GCNPat <
  (fcopysign fp16vt:$src0, f64:$src1),
  (V_BFI_B32_e64 (S_MOV_B32 (i32 0x00007fff)), $src0,
                 (V_LSHRREV_B32_e64 (i32 16), (EXTRACT_SUBREG $src1, sub1)))
>;
} // End foreach fp16vt = [f16, bf16]

/********** ================== **********/
/********** Immediate Patterns **********/
/********** ================== **********/

def : GCNPat <
  (VGPRImm<(i32 imm)>:$imm),
  (V_MOV_B32_e32 imm:$imm)
>;

def : GCNPat <
  (VGPRImm<(f32 fpimm)>:$imm),
  (V_MOV_B32_e32 (f32 (bitcast_fpimm_to_i32 $imm)))
>;

def : GCNPat <
  (i32 imm:$imm),
  (S_MOV_B32 imm:$imm)
>;

def : GCNPat <
  (VGPRImm<(SIlds tglobaladdr:$ga)>),
  (V_MOV_B32_e32 $ga)
>;

def : GCNPat <
  (SIlds tglobaladdr:$ga),
  (S_MOV_B32 $ga)
>;

// FIXME: Workaround for ordering issue with peephole optimizer where
// a register class copy interferes with immediate folding. Should
// use s_mov_b32, which can be shrunk to s_movk_i32
def : GCNPat <
  (VGPRImm<(f16 fpimm)>:$imm),
  (V_MOV_B32_e32 (f16 (bitcast_fpimm_to_i32 $imm)))
>;

def : GCNPat <
  (VGPRImm<(bf16 fpimm)>:$imm),
  (V_MOV_B32_e32 (bf16 (bitcast_fpimm_to_i32 $imm)))
>;

// V_MOV_B64_PSEUDO and S_MOV_B64_IMM_PSEUDO can be used with any 64-bit
// immediate and will be expanded as needed, but we will only use these
// patterns for values which can be encoded.
def : GCNPat <
  (VGPRImm<(i64 imm)>:$imm),
  (V_MOV_B64_PSEUDO imm:$imm)
>;

def : GCNPat <
  (VGPRImm<(f64 fpimm)>:$imm),
  (V_MOV_B64_PSEUDO (f64 (bitcast_fpimm_to_i64 $imm)))
>;

def : GCNPat <
  (i64 imm:$imm),
  (S_MOV_B64_IMM_PSEUDO imm:$imm)
>;

def : GCNPat <
  (f64 fpimm:$imm),
  (S_MOV_B64_IMM_PSEUDO (i64 (bitcast_fpimm_to_i64 fpimm:$imm)))
>;

def : GCNPat <
  (f32 fpimm:$imm),
  (S_MOV_B32 (f32 (bitcast_fpimm_to_i32 $imm)))
>;

def : GCNPat <
  (f16 fpimm:$imm),
  (S_MOV_B32 (i32 (bitcast_fpimm_to_i32 $imm)))
>;

def : GCNPat <
  (bf16 fpimm:$imm),
  (S_MOV_B32 (i32 (bitcast_fpimm_to_i32 $imm)))
>;

def : GCNPat <
  (p5 frameindex:$fi),
  (V_MOV_B32_e32 (p5 (frameindex_to_targetframeindex $fi)))
>;

def : GCNPat <
  (p5 frameindex:$fi),
  (S_MOV_B32 (p5 (frameindex_to_targetframeindex $fi)))
>;

def : GCNPat <
  (i64 InlineImm64:$imm),
  (S_MOV_B64 InlineImm64:$imm)
>;

// XXX - Should this use an s_cmp to set SCC?
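
// Booleans (i1) are typically held in wave-sized lane masks here, so a
// constant i1 is materialized as an all-ones or all-zero mask: s_mov_b64
// under wave64 and s_mov_b32 under wave32, as the two patterns below show.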
2233 2234// Set to sign-extended 64-bit value (true = -1, false = 0) 2235def : GCNPat < 2236 (i1 imm:$imm), 2237 (S_MOV_B64 (i64 (as_i64imm $imm))) 2238> { 2239 let WaveSizePredicate = isWave64; 2240} 2241 2242def : GCNPat < 2243 (i1 imm:$imm), 2244 (S_MOV_B32 (i32 (as_i32imm $imm))) 2245> { 2246 let WaveSizePredicate = isWave32; 2247} 2248 2249def : GCNPat < 2250 (f64 InlineImmFP64:$imm), 2251 (S_MOV_B64 (f64 (bitcast_fpimm_to_i64 InlineImmFP64:$imm))) 2252>; 2253 2254/********** ================== **********/ 2255/********** Intrinsic Patterns **********/ 2256/********** ================== **********/ 2257 2258def : GCNPat < 2259 (f32 (fpow (VOP3Mods f32:$src0, i32:$src0_mods), (VOP3Mods f32:$src1, i32:$src1_mods))), 2260 (V_EXP_F32_e64 SRCMODS.NONE, (V_MUL_LEGACY_F32_e64 $src1_mods, $src1, SRCMODS.NONE, (V_LOG_F32_e64 $src0_mods, $src0), 0, 0)) 2261>; 2262 2263def : GCNPat < 2264 (i32 (sext i1:$src0)), 2265 (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0), 2266 /*src1mod*/(i32 0), /*src1*/(i32 -1), i1:$src0) 2267>; 2268 2269class Ext32Pat <SDNode ext> : GCNPat < 2270 (i32 (ext i1:$src0)), 2271 (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0), 2272 /*src1mod*/(i32 0), /*src1*/(i32 1), i1:$src0) 2273>; 2274 2275def : Ext32Pat <zext>; 2276def : Ext32Pat <anyext>; 2277 2278// The multiplication scales from [0,1) to the unsigned integer range, 2279// rounding down a bit to avoid unwanted overflow. 2280def : GCNPat < 2281 (AMDGPUurecip i32:$src0), 2282 (V_CVT_U32_F32_e32 2283 (V_MUL_F32_e32 (i32 CONST.FP_4294966784), 2284 (V_RCP_IFLAG_F32_e32 (V_CVT_F32_U32_e32 $src0)))) 2285>; 2286 2287//===----------------------------------------------------------------------===// 2288// VOP3 Patterns 2289//===----------------------------------------------------------------------===// 2290 2291def : IMad24Pat<V_MAD_I32_I24_e64, 1>; 2292def : UMad24Pat<V_MAD_U32_U24_e64, 1>; 2293 2294// BFI patterns 2295 2296def BFIImm32 : PatFrag< 2297 (ops node:$x, node:$y, node:$z), 2298 (i32 (DivergentBinFrag<or> (and node:$y, node:$x), (and node:$z, imm))), 2299 [{ 2300 auto *X = dyn_cast<ConstantSDNode>(N->getOperand(0)->getOperand(1)); 2301 auto *NotX = dyn_cast<ConstantSDNode>(N->getOperand(1)->getOperand(1)); 2302 return X && NotX && 2303 ~(unsigned)X->getZExtValue() == (unsigned)NotX->getZExtValue(); 2304 }] 2305>; 2306 2307 2308// Definition from ISA doc: 2309// (y & x) | (z & ~x) 2310def : AMDGPUPatIgnoreCopies < 2311 (DivergentBinFrag<or> (and i32:$y, i32:$x), (and i32:$z, (not i32:$x))), 2312 (V_BFI_B32_e64 (COPY_TO_REGCLASS VSrc_b32:$x, VGPR_32), 2313 (COPY_TO_REGCLASS VSrc_b32:$y, VGPR_32), 2314 (COPY_TO_REGCLASS VSrc_b32:$z, VGPR_32)) 2315>; 2316 2317// (y & C) | (z & ~C) 2318def : AMDGPUPatIgnoreCopies < 2319 (BFIImm32 i32:$x, i32:$y, i32:$z), 2320 (V_BFI_B32_e64 VSrc_b32:$x, VSrc_b32:$y, VSrc_b32:$z) 2321>; 2322 2323// 64-bit version 2324def : AMDGPUPatIgnoreCopies < 2325 (DivergentBinFrag<or> (and i64:$y, i64:$x), (and i64:$z, (not i64:$x))), 2326 (REG_SEQUENCE VReg_64, 2327 (V_BFI_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$x, sub0)), 2328 (i32 (EXTRACT_SUBREG VReg_64:$y, sub0)), 2329 (i32 (EXTRACT_SUBREG VReg_64:$z, sub0))), sub0, 2330 (V_BFI_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$x, sub1)), 2331 (i32 (EXTRACT_SUBREG VReg_64:$y, sub1)), 2332 (i32 (EXTRACT_SUBREG VReg_64:$z, sub1))), sub1) 2333>; 2334 2335// SHA-256 Ch function 2336// z ^ (x & (y ^ z)) 2337def : AMDGPUPatIgnoreCopies < 2338 (DivergentBinFrag<xor> i32:$z, (and i32:$x, (xor i32:$y, i32:$z))), 2339 (V_BFI_B32_e64 (COPY_TO_REGCLASS 
VSrc_b32:$x, VGPR_32), 2340 (COPY_TO_REGCLASS VSrc_b32:$y, VGPR_32), 2341 (COPY_TO_REGCLASS VSrc_b32:$z, VGPR_32)) 2342>; 2343 2344// 64-bit version 2345def : AMDGPUPatIgnoreCopies < 2346 (DivergentBinFrag<xor> i64:$z, (and i64:$x, (xor i64:$y, i64:$z))), 2347 (REG_SEQUENCE VReg_64, 2348 (V_BFI_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$x, sub0)), 2349 (i32 (EXTRACT_SUBREG VReg_64:$y, sub0)), 2350 (i32 (EXTRACT_SUBREG VReg_64:$z, sub0))), sub0, 2351 (V_BFI_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$x, sub1)), 2352 (i32 (EXTRACT_SUBREG VReg_64:$y, sub1)), 2353 (i32 (EXTRACT_SUBREG VReg_64:$z, sub1))), sub1) 2354>; 2355 2356def : AMDGPUPat < 2357 (fcopysign f32:$src0, f32:$src1), 2358 (V_BFI_B32_e64 (S_MOV_B32 (i32 0x7fffffff)), $src0, $src1) 2359>; 2360 2361def : AMDGPUPat < 2362 (fcopysign f32:$src0, f64:$src1), 2363 (V_BFI_B32_e64 (S_MOV_B32 (i32 0x7fffffff)), $src0, 2364 (i32 (EXTRACT_SUBREG SReg_64:$src1, sub1))) 2365>; 2366 2367def : AMDGPUPat < 2368 (fcopysign f64:$src0, f64:$src1), 2369 (REG_SEQUENCE SReg_64, 2370 (i32 (EXTRACT_SUBREG $src0, sub0)), sub0, 2371 (V_BFI_B32_e64 (S_MOV_B32 (i32 0x7fffffff)), 2372 (i32 (EXTRACT_SUBREG SReg_64:$src0, sub1)), 2373 (i32 (EXTRACT_SUBREG SReg_64:$src1, sub1))), sub1) 2374>; 2375 2376def : AMDGPUPat < 2377 (fcopysign f64:$src0, f32:$src1), 2378 (REG_SEQUENCE SReg_64, 2379 (i32 (EXTRACT_SUBREG $src0, sub0)), sub0, 2380 (V_BFI_B32_e64 (S_MOV_B32 (i32 0x7fffffff)), 2381 (i32 (EXTRACT_SUBREG SReg_64:$src0, sub1)), 2382 $src1), sub1) 2383>; 2384 2385def : ROTRPattern <V_ALIGNBIT_B32_e64>; 2386 2387def : GCNPat<(i32 (trunc (srl i64:$src0, (and i32:$src1, (i32 31))))), 2388 (V_ALIGNBIT_B32_e64 (i32 (EXTRACT_SUBREG (i64 $src0), sub1)), 2389 (i32 (EXTRACT_SUBREG (i64 $src0), sub0)), $src1)>; 2390 2391def : GCNPat<(i32 (trunc (srl i64:$src0, (i32 ShiftAmt32Imm:$src1)))), 2392 (V_ALIGNBIT_B32_e64 (i32 (EXTRACT_SUBREG (i64 $src0), sub1)), 2393 (i32 (EXTRACT_SUBREG (i64 $src0), sub0)), $src1)>; 2394 2395/********** ====================== **********/ 2396/********** Indirect addressing **********/ 2397/********** ====================== **********/ 2398 2399multiclass SI_INDIRECT_Pattern <ValueType vt, ValueType eltvt, string VecSize> { 2400 // Extract with offset 2401 def : GCNPat< 2402 (eltvt (extractelt vt:$src, (MOVRELOffset i32:$idx, (i32 imm:$offset)))), 2403 (!cast<Instruction>("SI_INDIRECT_SRC_"#VecSize) $src, $idx, imm:$offset) 2404 >; 2405 2406 // Insert with offset 2407 def : GCNPat< 2408 (insertelt vt:$src, eltvt:$val, (MOVRELOffset i32:$idx, (i32 imm:$offset))), 2409 (!cast<Instruction>("SI_INDIRECT_DST_"#VecSize) $src, $idx, imm:$offset, $val) 2410 >; 2411} 2412 2413defm : SI_INDIRECT_Pattern <v2f32, f32, "V2">; 2414defm : SI_INDIRECT_Pattern <v4f32, f32, "V4">; 2415defm : SI_INDIRECT_Pattern <v8f32, f32, "V8">; 2416defm : SI_INDIRECT_Pattern <v9f32, f32, "V9">; 2417defm : SI_INDIRECT_Pattern <v10f32, f32, "V10">; 2418defm : SI_INDIRECT_Pattern <v11f32, f32, "V11">; 2419defm : SI_INDIRECT_Pattern <v12f32, f32, "V12">; 2420defm : SI_INDIRECT_Pattern <v16f32, f32, "V16">; 2421defm : SI_INDIRECT_Pattern <v32f32, f32, "V32">; 2422 2423defm : SI_INDIRECT_Pattern <v2i32, i32, "V2">; 2424defm : SI_INDIRECT_Pattern <v4i32, i32, "V4">; 2425defm : SI_INDIRECT_Pattern <v8i32, i32, "V8">; 2426defm : SI_INDIRECT_Pattern <v9i32, i32, "V9">; 2427defm : SI_INDIRECT_Pattern <v10i32, i32, "V10">; 2428defm : SI_INDIRECT_Pattern <v11i32, i32, "V11">; 2429defm : SI_INDIRECT_Pattern <v12i32, i32, "V12">; 2430defm : SI_INDIRECT_Pattern <v16i32, i32, "V16">; 2431defm : 
SI_INDIRECT_Pattern <v32i32, i32, "V32">; 2432 2433//===----------------------------------------------------------------------===// 2434// SAD Patterns 2435//===----------------------------------------------------------------------===// 2436 2437def : GCNPat < 2438 (add (sub_oneuse (umax i32:$src0, i32:$src1), 2439 (umin i32:$src0, i32:$src1)), 2440 i32:$src2), 2441 (V_SAD_U32_e64 $src0, $src1, $src2, (i1 0)) 2442>; 2443 2444def : GCNPat < 2445 (add (select_oneuse (i1 (setugt i32:$src0, i32:$src1)), 2446 (sub i32:$src0, i32:$src1), 2447 (sub i32:$src1, i32:$src0)), 2448 i32:$src2), 2449 (V_SAD_U32_e64 $src0, $src1, $src2, (i1 0)) 2450>; 2451 2452//===----------------------------------------------------------------------===// 2453// Conversion Patterns 2454//===----------------------------------------------------------------------===// 2455def : GCNPat<(i32 (UniformSextInreg<i1> i32:$src)), 2456 (S_BFE_I32 i32:$src, (i32 65536))>; // 0 | 1 << 16 2457 2458// Handle sext_inreg in i64 2459def : GCNPat < 2460 (i64 (UniformSextInreg<i1> i64:$src)), 2461 (S_BFE_I64 i64:$src, (i32 0x10000)) // 0 | 1 << 16 2462>; 2463 2464def : GCNPat < 2465 (i16 (UniformSextInreg<i1> i16:$src)), 2466 (S_BFE_I32 $src, (i32 0x00010000)) // 0 | 1 << 16 2467>; 2468 2469def : GCNPat < 2470 (i16 (UniformSextInreg<i8> i16:$src)), 2471 (S_BFE_I32 $src, (i32 0x80000)) // 0 | 8 << 16 2472>; 2473 2474def : GCNPat < 2475 (i64 (UniformSextInreg<i8> i64:$src)), 2476 (S_BFE_I64 i64:$src, (i32 0x80000)) // 0 | 8 << 16 2477>; 2478 2479def : GCNPat < 2480 (i64 (UniformSextInreg<i16> i64:$src)), 2481 (S_BFE_I64 i64:$src, (i32 0x100000)) // 0 | 16 << 16 2482>; 2483 2484def : GCNPat < 2485 (i64 (UniformSextInreg<i32> i64:$src)), 2486 (S_BFE_I64 i64:$src, (i32 0x200000)) // 0 | 32 << 16 2487>; 2488 2489def : GCNPat< 2490 (i32 (DivergentSextInreg<i1> i32:$src)), 2491 (V_BFE_I32_e64 i32:$src, (i32 0), (i32 1))>; 2492 2493def : GCNPat < 2494 (i16 (DivergentSextInreg<i1> i16:$src)), 2495 (V_BFE_I32_e64 $src, (i32 0), (i32 1)) 2496>; 2497 2498def : GCNPat < 2499 (i16 (DivergentSextInreg<i8> i16:$src)), 2500 (V_BFE_I32_e64 $src, (i32 0), (i32 8)) 2501>; 2502 2503def : GCNPat< 2504 (i32 (DivergentSextInreg<i8> i32:$src)), 2505 (V_BFE_I32_e64 i32:$src, (i32 0), (i32 8)) 2506>; 2507 2508def : GCNPat < 2509 (i32 (DivergentSextInreg<i16> i32:$src)), 2510 (V_BFE_I32_e64 $src, (i32 0), (i32 16)) 2511>; 2512 2513def : GCNPat < 2514 (i64 (DivergentSextInreg<i1> i64:$src)), 2515 (REG_SEQUENCE VReg_64, 2516 (V_BFE_I32_e64 (i32 (EXTRACT_SUBREG i64:$src, sub0)), (i32 0), (i32 1)), sub0, 2517 (V_ASHRREV_I32_e32 (i32 31), (V_BFE_I32_e64 (i32 (EXTRACT_SUBREG i64:$src, sub0)), (i32 0), (i32 1))), sub1) 2518>; 2519 2520def : GCNPat < 2521 (i64 (DivergentSextInreg<i8> i64:$src)), 2522 (REG_SEQUENCE VReg_64, 2523 (V_BFE_I32_e64 (i32 (EXTRACT_SUBREG i64:$src, sub0)), (i32 0), (i32 8)), sub0, 2524 (V_ASHRREV_I32_e32 (i32 31), (V_BFE_I32_e64 (i32 (EXTRACT_SUBREG i64:$src, sub0)), (i32 0), (i32 8))), sub1) 2525>; 2526 2527def : GCNPat < 2528 (i64 (DivergentSextInreg<i16> i64:$src)), 2529 (REG_SEQUENCE VReg_64, 2530 (V_BFE_I32_e64 (i32 (EXTRACT_SUBREG i64:$src, sub0)), (i32 0), (i32 16)), sub0, 2531 (V_ASHRREV_I32_e32 (i32 31), (V_BFE_I32_e64 (i32 (EXTRACT_SUBREG i64:$src, sub0)), (i32 0), (i32 16))), sub1) 2532>; 2533 2534def : GCNPat < 2535 (i64 (DivergentSextInreg<i32> i64:$src)), 2536 (REG_SEQUENCE VReg_64, 2537 (i32 (EXTRACT_SUBREG i64:$src, sub0)), sub0, 2538 (V_ASHRREV_I32_e32 (i32 31), (i32 (EXTRACT_SUBREG i64:$src, sub0))), sub1) 2539>; 2540 2541def : 
GCNPat < 2542 (i64 (zext i32:$src)), 2543 (REG_SEQUENCE SReg_64, $src, sub0, (S_MOV_B32 (i32 0)), sub1) 2544>; 2545 2546def : GCNPat < 2547 (i64 (anyext i32:$src)), 2548 (REG_SEQUENCE SReg_64, $src, sub0, (i32 (IMPLICIT_DEF)), sub1) 2549>; 2550 2551class ZExt_i64_i1_Pat <SDNode ext> : GCNPat < 2552 (i64 (ext i1:$src)), 2553 (REG_SEQUENCE VReg_64, 2554 (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0), 2555 /*src1mod*/(i32 0), /*src1*/(i32 1), $src), 2556 sub0, (S_MOV_B32 (i32 0)), sub1) 2557>; 2558 2559 2560def : ZExt_i64_i1_Pat<zext>; 2561def : ZExt_i64_i1_Pat<anyext>; 2562 2563// FIXME: We need to use COPY_TO_REGCLASS to work-around the fact that 2564// REG_SEQUENCE patterns don't support instructions with multiple outputs. 2565def : GCNPat < 2566 (i64 (UniformUnaryFrag<sext> i32:$src)), 2567 (REG_SEQUENCE SReg_64, $src, sub0, 2568 (i32 (COPY_TO_REGCLASS (S_ASHR_I32 $src, (i32 31)), SReg_32_XM0)), sub1) 2569>; 2570 2571def : GCNPat < 2572 (i64 (DivergentUnaryFrag<sext> i32:$src)), 2573 (REG_SEQUENCE VReg_64, $src, sub0, 2574 (i32 (COPY_TO_REGCLASS (V_ASHRREV_I32_e64 (i32 31), $src), VGPR_32)), sub1) 2575>; 2576 2577def : GCNPat < 2578 (i64 (sext i1:$src)), 2579 (REG_SEQUENCE VReg_64, 2580 (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0), 2581 /*src1mod*/(i32 0), /*src1*/(i32 -1), $src), sub0, 2582 (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0), 2583 /*src1mod*/(i32 0), /*src1*/(i32 -1), $src), sub1) 2584>; 2585 2586class FPToI1Pat<Instruction Inst, int KOne, ValueType kone_type, ValueType vt, SDPatternOperator fp_to_int> : GCNPat < 2587 (i1 (fp_to_int (vt (VOP3Mods vt:$src0, i32:$src0_modifiers)))), 2588 (i1 (Inst 0, (kone_type KOne), $src0_modifiers, $src0, DSTCLAMP.NONE)) 2589>; 2590 2591let OtherPredicates = [NotHasTrue16BitInsts] in { 2592 def : FPToI1Pat<V_CMP_EQ_F16_e64, CONST.FP16_ONE, i16, f16, fp_to_uint>; 2593 def : FPToI1Pat<V_CMP_EQ_F16_e64, CONST.FP16_NEG_ONE, i16, f16, fp_to_sint>; 2594} // end OtherPredicates = [NotHasTrue16BitInsts] 2595 2596let OtherPredicates = [HasTrue16BitInsts] in { 2597 def : FPToI1Pat<V_CMP_EQ_F16_t16_e64, CONST.FP16_ONE, i16, f16, fp_to_uint>; 2598 def : FPToI1Pat<V_CMP_EQ_F16_t16_e64, CONST.FP16_NEG_ONE, i16, f16, fp_to_sint>; 2599} // end OtherPredicates = [HasTrue16BitInsts] 2600 2601def : FPToI1Pat<V_CMP_EQ_F32_e64, CONST.FP32_ONE, i32, f32, fp_to_uint>; 2602def : FPToI1Pat<V_CMP_EQ_F32_e64, CONST.FP32_NEG_ONE, i32, f32, fp_to_sint>; 2603def : FPToI1Pat<V_CMP_EQ_F64_e64, CONST.FP64_ONE, i64, f64, fp_to_uint>; 2604def : FPToI1Pat<V_CMP_EQ_F64_e64, CONST.FP64_NEG_ONE, i64, f64, fp_to_sint>; 2605 2606// If we need to perform a logical operation on i1 values, we need to 2607// use vector comparisons since there is only one SCC register. Vector 2608// comparisons may write to a pair of SGPRs or a single SGPR, so treat 2609// these as 32 or 64-bit comparisons. When legalizing SGPR copies, 2610// instructions resulting in the copies from SCC to these instructions 2611// will be moved to the VALU. 
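
// For example, (i1 (and i1:$src0, i1:$src1)) below becomes s_and_b64 of the
// two lane masks under wave64 (s_and_b32 under wave32); i1 add/sub reduce to
// xor, and adding or subtracting the constant true (-1) is just a mask
// negation (s_not).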
2612 2613let WaveSizePredicate = isWave64 in { 2614def : GCNPat < 2615 (i1 (and i1:$src0, i1:$src1)), 2616 (S_AND_B64 $src0, $src1) 2617>; 2618 2619def : GCNPat < 2620 (i1 (or i1:$src0, i1:$src1)), 2621 (S_OR_B64 $src0, $src1) 2622>; 2623 2624def : GCNPat < 2625 (i1 (xor i1:$src0, i1:$src1)), 2626 (S_XOR_B64 $src0, $src1) 2627>; 2628 2629def : GCNPat < 2630 (i1 (add i1:$src0, i1:$src1)), 2631 (S_XOR_B64 $src0, $src1) 2632>; 2633 2634def : GCNPat < 2635 (i1 (sub i1:$src0, i1:$src1)), 2636 (S_XOR_B64 $src0, $src1) 2637>; 2638 2639let AddedComplexity = 1 in { 2640def : GCNPat < 2641 (i1 (add i1:$src0, (i1 -1))), 2642 (S_NOT_B64 $src0) 2643>; 2644 2645def : GCNPat < 2646 (i1 (sub i1:$src0, (i1 -1))), 2647 (S_NOT_B64 $src0) 2648>; 2649} 2650} // end isWave64 2651 2652let WaveSizePredicate = isWave32 in { 2653def : GCNPat < 2654 (i1 (and i1:$src0, i1:$src1)), 2655 (S_AND_B32 $src0, $src1) 2656>; 2657 2658def : GCNPat < 2659 (i1 (or i1:$src0, i1:$src1)), 2660 (S_OR_B32 $src0, $src1) 2661>; 2662 2663def : GCNPat < 2664 (i1 (xor i1:$src0, i1:$src1)), 2665 (S_XOR_B32 $src0, $src1) 2666>; 2667 2668def : GCNPat < 2669 (i1 (add i1:$src0, i1:$src1)), 2670 (S_XOR_B32 $src0, $src1) 2671>; 2672 2673def : GCNPat < 2674 (i1 (sub i1:$src0, i1:$src1)), 2675 (S_XOR_B32 $src0, $src1) 2676>; 2677 2678let AddedComplexity = 1 in { 2679def : GCNPat < 2680 (i1 (add i1:$src0, (i1 -1))), 2681 (S_NOT_B32 $src0) 2682>; 2683 2684def : GCNPat < 2685 (i1 (sub i1:$src0, (i1 -1))), 2686 (S_NOT_B32 $src0) 2687>; 2688} 2689} // end isWave32 2690 2691def : GCNPat < 2692 (i32 (DivergentBinFrag<xor> i32:$src0, (i32 -1))), 2693 (V_NOT_B32_e32 $src0) 2694>; 2695 2696def : GCNPat < 2697 (i64 (DivergentBinFrag<xor> i64:$src0, (i64 -1))), 2698 (REG_SEQUENCE VReg_64, 2699 (V_NOT_B32_e32 (i32 (EXTRACT_SUBREG i64:$src0, sub0))), sub0, 2700 (V_NOT_B32_e32 (i32 (EXTRACT_SUBREG i64:$src0, sub1))), sub1 2701 ) 2702>; 2703 2704let SubtargetPredicate = NotHasTrue16BitInsts in 2705def : GCNPat < 2706 (f16 (sint_to_fp i1:$src)), 2707 (V_CVT_F16_F32_e32 ( 2708 V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0), 2709 /*src1mod*/(i32 0), /*src1*/(i32 CONST.FP32_NEG_ONE), 2710 SSrc_i1:$src)) 2711>; 2712 2713let SubtargetPredicate = HasTrue16BitInsts in 2714def : GCNPat < 2715 (f16 (sint_to_fp i1:$src)), 2716 (V_CVT_F16_F32_t16_e32 ( 2717 V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0), 2718 /*src1mod*/(i32 0), /*src1*/(i32 CONST.FP32_NEG_ONE), 2719 SSrc_i1:$src)) 2720>; 2721 2722let SubtargetPredicate = NotHasTrue16BitInsts in 2723def : GCNPat < 2724 (f16 (uint_to_fp i1:$src)), 2725 (V_CVT_F16_F32_e32 ( 2726 V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0), 2727 /*src1mod*/(i32 0), /*src1*/(i32 CONST.FP32_ONE), 2728 SSrc_i1:$src)) 2729>; 2730let SubtargetPredicate = HasTrue16BitInsts in 2731def : GCNPat < 2732 (f16 (uint_to_fp i1:$src)), 2733 (V_CVT_F16_F32_t16_e32 ( 2734 V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0), 2735 /*src1mod*/(i32 0), /*src1*/(i32 CONST.FP32_ONE), 2736 SSrc_i1:$src)) 2737>; 2738 2739def : GCNPat < 2740 (f32 (sint_to_fp i1:$src)), 2741 (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0), 2742 /*src1mod*/(i32 0), /*src1*/(i32 CONST.FP32_NEG_ONE), 2743 SSrc_i1:$src) 2744>; 2745 2746def : GCNPat < 2747 (f32 (uint_to_fp i1:$src)), 2748 (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0), 2749 /*src1mod*/(i32 0), /*src1*/(i32 CONST.FP32_ONE), 2750 SSrc_i1:$src) 2751>; 2752 2753def : GCNPat < 2754 (f64 (sint_to_fp i1:$src)), 2755 (V_CVT_F64_I32_e32 (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0), 
2756 /*src1mod*/(i32 0), /*src1*/(i32 -1), 2757 SSrc_i1:$src)) 2758>; 2759 2760def : GCNPat < 2761 (f64 (uint_to_fp i1:$src)), 2762 (V_CVT_F64_U32_e32 (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0), 2763 /*src1mod*/(i32 0), /*src1*/(i32 1), 2764 SSrc_i1:$src)) 2765>; 2766 2767//===----------------------------------------------------------------------===// 2768// Miscellaneous Patterns 2769//===----------------------------------------------------------------------===// 2770 2771// Eliminate a zero extension from an fp16 operation if it already 2772// zeros the high bits of the 32-bit register. 2773// 2774// This is complicated on gfx9+. Some instructions maintain the legacy 2775// zeroing behavior, but others preserve the high bits. Some have a 2776// control bit to change the behavior. We can't simply say with 2777// certainty what the source behavior is without more context on how 2778// the src is lowered. e.g. fptrunc + fma may be lowered to a 2779// v_fma_mix* instruction which does not zero, or may not. 2780def : GCNPat< 2781 (i32 (DivergentUnaryFrag<abs> i32:$src)), 2782 (V_MAX_I32_e64 (V_SUB_CO_U32_e32 (i32 0), $src), $src)>; 2783 2784let AddedComplexity = 1 in { 2785def : GCNPat< 2786 (i32 (DivergentUnaryFrag<abs> i32:$src)), 2787 (V_MAX_I32_e64 (V_SUB_U32_e32 (i32 0), $src), $src)>{ 2788 let SubtargetPredicate = HasAddNoCarryInsts; 2789} 2790} // AddedComplexity = 1 2791 2792def : GCNPat< 2793 (i32 (DivergentUnaryFrag<zext> i16:$src)), 2794 (V_AND_B32_e64 (S_MOV_B32 (i32 0xffff)), $src) 2795>; 2796 2797def : GCNPat< 2798 (i64 (DivergentUnaryFrag<zext> i16:$src)), 2799 (REG_SEQUENCE VReg_64, 2800 (V_AND_B32_e64 (S_MOV_B32 (i32 0xffff)), $src), sub0, 2801 (S_MOV_B32 (i32 0)), sub1) 2802>; 2803 2804def : GCNPat< 2805 (i32 (zext (i16 (bitconvert fp16_zeros_high_16bits:$src)))), 2806 (COPY VSrc_b16:$src)>; 2807 2808def : GCNPat < 2809 (i32 (trunc i64:$a)), 2810 (EXTRACT_SUBREG $a, sub0) 2811>; 2812 2813def : GCNPat < 2814 (i1 (UniformUnaryFrag<trunc> i32:$a)), 2815 (S_CMP_EQ_U32 (S_AND_B32 (i32 1), $a), (i32 1)) 2816>; 2817 2818def : GCNPat < 2819 (i1 (UniformUnaryFrag<trunc> i16:$a)), 2820 (S_CMP_EQ_U32 (S_AND_B32 (i32 1), $a), (i32 1)) 2821>; 2822 2823def : GCNPat < 2824 (i1 (UniformUnaryFrag<trunc> i64:$a)), 2825 (S_CMP_EQ_U32 (S_AND_B32 (i32 1), 2826 (i32 (EXTRACT_SUBREG $a, sub0))), (i32 1)) 2827>; 2828 2829def : GCNPat < 2830 (i1 (DivergentUnaryFrag<trunc> i32:$a)), 2831 (V_CMP_EQ_U32_e64 (V_AND_B32_e64 (i32 1), $a), (i32 1)) 2832>; 2833 2834def : GCNPat < 2835 (i1 (DivergentUnaryFrag<trunc> i16:$a)), 2836 (V_CMP_EQ_U32_e64 (V_AND_B32_e64 (i32 1), $a), (i32 1)) 2837>; 2838 2839def IMMBitSelConst : SDNodeXForm<imm, [{ 2840 return CurDAG->getTargetConstant(1ULL << N->getZExtValue(), SDLoc(N), 2841 MVT::i32); 2842}]>; 2843 2844// Matching separate SRL and TRUNC instructions 2845// with dependent operands (SRL dest is source of TRUNC) 2846// generates three instructions. However, by using bit shifts, 2847// the V_LSHRREV_B32_e64 result can be directly used in the 2848// operand of the V_AND_B32_e64 instruction: 2849// (trunc i32 (srl i32 $a, i32 $b)) -> 2850// v_and_b32_e64 $a, (1 << $b), $a 2851// v_cmp_ne_u32_e64 $a, 0, $a 2852 2853// Handle the VALU case. 2854def : GCNPat < 2855 (i1 (DivergentUnaryFrag<trunc> (i32 (srl i32:$a, (i32 imm:$b))))), 2856 (V_CMP_NE_U32_e64 (V_AND_B32_e64 (i32 (IMMBitSelConst $b)), $a), 2857 (i32 0)) 2858>; 2859 2860// Handle the scalar case. 
2861def : GCNPat < 2862 (i1 (UniformUnaryFrag<trunc> (i32 (srl i32:$a, (i32 imm:$b))))), 2863 (S_CMP_LG_U32 (S_AND_B32 (i32 (IMMBitSelConst $b)), $a), 2864 (i32 0)) 2865>; 2866 2867def : GCNPat < 2868 (i1 (DivergentUnaryFrag<trunc> i64:$a)), 2869 (V_CMP_EQ_U32_e64 (V_AND_B32_e64 (i32 1), 2870 (i32 (EXTRACT_SUBREG $a, sub0))), (i32 1)) 2871>; 2872 2873def : GCNPat < 2874 (i32 (bswap i32:$a)), 2875 (V_BFI_B32_e64 (S_MOV_B32 (i32 0x00ff00ff)), 2876 (V_ALIGNBIT_B32_e64 VSrc_b32:$a, VSrc_b32:$a, (i32 24)), 2877 (V_ALIGNBIT_B32_e64 VSrc_b32:$a, VSrc_b32:$a, (i32 8))) 2878>; 2879 2880// FIXME: This should have been narrowed to i32 during legalization. 2881// This pattern should also be skipped for GlobalISel 2882def : GCNPat < 2883 (i64 (bswap i64:$a)), 2884 (REG_SEQUENCE VReg_64, 2885 (V_BFI_B32_e64 (S_MOV_B32 (i32 0x00ff00ff)), 2886 (V_ALIGNBIT_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$a, sub1)), 2887 (i32 (EXTRACT_SUBREG VReg_64:$a, sub1)), 2888 (i32 24)), 2889 (V_ALIGNBIT_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$a, sub1)), 2890 (i32 (EXTRACT_SUBREG VReg_64:$a, sub1)), 2891 (i32 8))), 2892 sub0, 2893 (V_BFI_B32_e64 (S_MOV_B32 (i32 0x00ff00ff)), 2894 (V_ALIGNBIT_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$a, sub0)), 2895 (i32 (EXTRACT_SUBREG VReg_64:$a, sub0)), 2896 (i32 24)), 2897 (V_ALIGNBIT_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$a, sub0)), 2898 (i32 (EXTRACT_SUBREG VReg_64:$a, sub0)), 2899 (i32 8))), 2900 sub1) 2901>; 2902 2903// FIXME: The AddedComplexity should not be needed, but in GlobalISel 2904// the BFI pattern ends up taking precedence without it. 2905let SubtargetPredicate = isGFX8Plus, AddedComplexity = 1 in { 2906// Magic number: 3 | (2 << 8) | (1 << 16) | (0 << 24) 2907// 2908// My reading of the manual suggests we should be using src0 for the 2909// register value, but this is what seems to work. 2910def : GCNPat < 2911 (i32 (bswap i32:$a)), 2912 (V_PERM_B32_e64 (i32 0), VSrc_b32:$a, (S_MOV_B32 (i32 0x00010203))) 2913>; 2914 2915// FIXME: This should have been narrowed to i32 during legalization. 2916// This pattern should also be skipped for GlobalISel 2917def : GCNPat < 2918 (i64 (bswap i64:$a)), 2919 (REG_SEQUENCE VReg_64, 2920 (V_PERM_B32_e64 (i32 0), (EXTRACT_SUBREG VReg_64:$a, sub1), 2921 (S_MOV_B32 (i32 0x00010203))), 2922 sub0, 2923 (V_PERM_B32_e64 (i32 0), (EXTRACT_SUBREG VReg_64:$a, sub0), 2924 (S_MOV_B32 (i32 0x00010203))), 2925 sub1) 2926>; 2927 2928// Magic number: 1 | (0 << 8) | (12 << 16) | (12 << 24) 2929// The 12s emit 0s. 2930def : GCNPat < 2931 (i16 (bswap i16:$a)), 2932 (V_PERM_B32_e64 (i32 0), VSrc_b32:$a, (S_MOV_B32 (i32 0x0c0c0001))) 2933>; 2934 2935def : GCNPat < 2936 (i32 (zext (bswap i16:$a))), 2937 (V_PERM_B32_e64 (i32 0), VSrc_b32:$a, (S_MOV_B32 (i32 0x0c0c0001))) 2938>; 2939 2940// Magic number: 1 | (0 << 8) | (3 << 16) | (2 << 24) 2941def : GCNPat < 2942 (v2i16 (bswap v2i16:$a)), 2943 (V_PERM_B32_e64 (i32 0), VSrc_b32:$a, (S_MOV_B32 (i32 0x02030001))) 2944>; 2945 2946} 2947 2948def : GCNPat< 2949 (i64 (DivergentUnaryFrag<bitreverse> i64:$a)), 2950 (REG_SEQUENCE VReg_64, 2951 (V_BFREV_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$a, sub1))), sub0, 2952 (V_BFREV_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$a, sub0))), sub1)>; 2953 2954// If fcanonicalize's operand is implicitly canonicalized, we only need a copy. 
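// Here "implicitly canonicalized" means the source is already known to
// produce a canonical value (for instance, the result of an ordinary VALU
// float op, which quiets signaling NaNs and applies the current denormal
// mode); the is_canonicalized fragment below encodes that check, so the
// fcanonicalize folds to a plain COPY.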
2955let AddedComplexity = 1000 in { 2956foreach vt = [f16, v2f16, f32, v2f32, f64] in { 2957 def : GCNPat< 2958 (fcanonicalize (vt is_canonicalized:$src)), 2959 (COPY vt:$src) 2960 >; 2961} 2962} 2963 2964// Prefer selecting to max when legal, but using mul is always valid. 2965let AddedComplexity = -5 in { 2966 2967let OtherPredicates = [NotHasTrue16BitInsts] in { 2968def : GCNPat< 2969 (fcanonicalize (f16 (VOP3Mods f16:$src, i32:$src_mods))), 2970 (V_MUL_F16_e64 0, (i32 CONST.FP16_ONE), $src_mods, $src) 2971>; 2972 2973def : GCNPat< 2974 (fcanonicalize (f16 (fneg (VOP3Mods f16:$src, i32:$src_mods)))), 2975 (V_MUL_F16_e64 0, (i32 CONST.FP16_NEG_ONE), $src_mods, $src) 2976>; 2977} // End OtherPredicates 2978 2979let OtherPredicates = [HasTrue16BitInsts] in { 2980def : GCNPat< 2981 (fcanonicalize (f16 (VOP3Mods f16:$src, i32:$src_mods))), 2982 (V_MUL_F16_fake16_e64 0, (i32 CONST.FP16_ONE), $src_mods, $src) 2983>; 2984 2985def : GCNPat< 2986 (fcanonicalize (f16 (fneg (VOP3Mods f16:$src, i32:$src_mods)))), 2987 (V_MUL_F16_fake16_e64 0, (i32 CONST.FP16_NEG_ONE), $src_mods, $src) 2988>; 2989} // End OtherPredicates 2990 2991def : GCNPat< 2992 (fcanonicalize (v2f16 (VOP3PMods v2f16:$src, i32:$src_mods))), 2993 (V_PK_MUL_F16 0, (i32 CONST.FP16_ONE), $src_mods, $src, DSTCLAMP.NONE) 2994>; 2995 2996def : GCNPat< 2997 (fcanonicalize (f32 (VOP3Mods f32:$src, i32:$src_mods))), 2998 (V_MUL_F32_e64 0, (i32 CONST.FP32_ONE), $src_mods, $src) 2999>; 3000 3001def : GCNPat< 3002 (fcanonicalize (f32 (fneg (VOP3Mods f32:$src, i32:$src_mods)))), 3003 (V_MUL_F32_e64 0, (i32 CONST.FP32_NEG_ONE), $src_mods, $src) 3004>; 3005 3006let SubtargetPredicate = HasPackedFP32Ops in { 3007def : GCNPat< 3008 (fcanonicalize (v2f32 (VOP3PMods v2f32:$src, i32:$src_mods))), 3009 (V_PK_MUL_F32 0, (i64 CONST.FP32_ONE), $src_mods, $src) 3010>; 3011} 3012 3013// TODO: Handle fneg like other types. 
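// The f64 fallback below multiplies by +1.0 (CONST.FP64_ONE); the multiply
// quiets signaling NaNs and applies the current denormal mode, which is all
// fcanonicalize has to guarantee.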
let SubtargetPredicate = isNotGFX12Plus in {
def : GCNPat<
  (fcanonicalize (f64 (VOP3Mods f64:$src, i32:$src_mods))),
  (V_MUL_F64_e64 0, (i64 CONST.FP64_ONE), $src_mods, $src)
>;
}
} // End AddedComplexity = -5

multiclass SelectCanonicalizeAsMax<
  list<Predicate> f32_preds = [],
  list<Predicate> f64_preds = [],
  list<Predicate> f16_preds = []> {
  def : GCNPat<
    (fcanonicalize (f32 (VOP3Mods f32:$src, i32:$src_mods))),
    (V_MAX_F32_e64 $src_mods, $src, $src_mods, $src)> {
    let OtherPredicates = f32_preds;
  }

  def : GCNPat<
    (fcanonicalize (f64 (VOP3Mods f64:$src, i32:$src_mods))),
    (V_MAX_F64_e64 $src_mods, $src, $src_mods, $src)> {
    let OtherPredicates = !listconcat(f64_preds, [isNotGFX12Plus]);
  }

  def : GCNPat<
    (fcanonicalize (f64 (VOP3Mods f64:$src, i32:$src_mods))),
    (V_MAX_NUM_F64_e64 $src_mods, $src, $src_mods, $src)> {
    let OtherPredicates = !listconcat(f64_preds, [isGFX12Plus]);
  }

  def : GCNPat<
    (fcanonicalize (f16 (VOP3Mods f16:$src, i32:$src_mods))),
    (V_MAX_F16_e64 $src_mods, $src, $src_mods, $src, 0, 0)> {
    let OtherPredicates = !listconcat(f16_preds, [Has16BitInsts, NotHasTrue16BitInsts]);
  }

  def : GCNPat<
    (fcanonicalize (f16 (VOP3Mods f16:$src, i32:$src_mods))),
    (V_MAX_F16_fake16_e64 $src_mods, $src, $src_mods, $src, 0, 0)> {
    let OtherPredicates = !listconcat(f16_preds, [Has16BitInsts, HasTrue16BitInsts]);
  }

  def : GCNPat<
    (fcanonicalize (v2f16 (VOP3PMods v2f16:$src, i32:$src_mods))),
    (V_PK_MAX_F16 $src_mods, $src, $src_mods, $src, DSTCLAMP.NONE)> {
    // FIXME: Should have VOP3P subtarget predicate
    let OtherPredicates = f16_preds;
  }
}

// On pre-gfx9 targets, v_max_*/v_min_* did not respect the denormal
// mode, and would never flush. For f64, it's faster to implement
// this with a max. For f16/f32 it's a wash, but prefer max when
// valid.
//
// FIXME: Lowering f32/f16 with max is worse since we can use a
// smaller encoding if the input is fneg'd. It also adds an extra
// register use.
let SubtargetPredicate = HasMinMaxDenormModes in {
  defm : SelectCanonicalizeAsMax<[], [], []>;
} // End SubtargetPredicate = HasMinMaxDenormModes

let SubtargetPredicate = NotHasMinMaxDenormModes in {
  // Use the max lowering if we don't need to flush.

  // FIXME: We don't use this for f32 as a workaround for the
  // library being compiled with the default ieee mode, but
  // potentially being called from flushing kernels. Really we should
  // not be mixing code expecting different default FP modes, but mul
  // works in any FP environment.
  defm : SelectCanonicalizeAsMax<[FalsePredicate], [FP64Denormals], [FP16Denormals]>;
} // End SubtargetPredicate = NotHasMinMaxDenormModes


let OtherPredicates = [HasDLInsts] in {
// Don't allow source modifiers. If there are any source modifiers then it's
// better to select fma instead of fmac.
def : GCNPat <
  (fma (f32 (VOP3NoMods f32:$src0)),
       (f32 (VOP3NoMods f32:$src1)),
       (f32 (VOP3NoMods f32:$src2))),
  (V_FMAC_F32_e64 SRCMODS.NONE, $src0, SRCMODS.NONE, $src1,
                  SRCMODS.NONE, $src2)
>;
} // End OtherPredicates = [HasDLInsts]

let SubtargetPredicate = isGFX10Plus in {
// Don't allow source modifiers.
If there are any source modifiers then it's 3102// better to select fma instead of fmac. 3103let OtherPredicates = [NotHasTrue16BitInsts] in 3104def : GCNPat < 3105 (fma (f16 (VOP3NoMods f32:$src0)), 3106 (f16 (VOP3NoMods f32:$src1)), 3107 (f16 (VOP3NoMods f32:$src2))), 3108 (V_FMAC_F16_e64 SRCMODS.NONE, $src0, SRCMODS.NONE, $src1, 3109 SRCMODS.NONE, $src2) 3110>; 3111let OtherPredicates = [HasTrue16BitInsts] in 3112def : GCNPat < 3113 (fma (f16 (VOP3NoMods f32:$src0)), 3114 (f16 (VOP3NoMods f32:$src1)), 3115 (f16 (VOP3NoMods f32:$src2))), 3116 (V_FMAC_F16_t16_e64 SRCMODS.NONE, $src0, SRCMODS.NONE, $src1, 3117 SRCMODS.NONE, $src2) 3118>; 3119} 3120 3121let OtherPredicates = [HasFmacF64Inst] in 3122// Don't allow source modifiers. If there are any source modifiers then it's 3123// better to select fma instead of fmac. 3124def : GCNPat < 3125 (fma (f64 (VOP3NoMods f64:$src0)), 3126 (f64 (VOP3NoMods f64:$src1)), 3127 (f64 (VOP3NoMods f64:$src2))), 3128 (V_FMAC_F64_e64 SRCMODS.NONE, $src0, SRCMODS.NONE, $src1, 3129 SRCMODS.NONE, $src2) 3130>; 3131 3132// COPY is workaround tablegen bug from multiple outputs 3133// from S_LSHL_B32's multiple outputs from implicit scc def. 3134let AddedComplexity = 1 in { 3135def : GCNPat < 3136 (v2i16 (UniformBinFrag<build_vector> (i16 0), (i16 SReg_32:$src1))), 3137 (S_LSHL_B32 SReg_32:$src1, (i16 16)) 3138>; 3139 3140def : GCNPat < 3141 (v2i16 (DivergentBinFrag<build_vector> (i16 0), (i16 VGPR_32:$src1))), 3142 (v2i16 (V_LSHLREV_B32_e64 (i16 16), VGPR_32:$src1)) 3143>; 3144 3145 3146def : GCNPat < 3147 (v2i16 (UniformBinFrag<build_vector> (i16 SReg_32:$src1), (i16 0))), 3148 (S_AND_B32 (S_MOV_B32 (i32 0xffff)), SReg_32:$src1) 3149>; 3150 3151def : GCNPat < 3152 (v2i16 (DivergentBinFrag<build_vector> (i16 VGPR_32:$src1), (i16 0))), 3153 (v2i16 (V_AND_B32_e64 (i32 (V_MOV_B32_e32 (i32 0xffff))), VGPR_32:$src1)) 3154>; 3155 3156def : GCNPat < 3157 (v2f16 (UniformBinFrag<build_vector> (f16 SReg_32:$src1), (f16 FP_ZERO))), 3158 (S_AND_B32 (S_MOV_B32 (i32 0xffff)), SReg_32:$src1) 3159>; 3160 3161def : GCNPat < 3162 (v2f16 (DivergentBinFrag<build_vector> (f16 VGPR_32:$src1), (f16 FP_ZERO))), 3163 (v2f16 (V_AND_B32_e64 (i32 (V_MOV_B32_e32 (i32 0xffff))), VGPR_32:$src1)) 3164>; 3165 3166foreach vecTy = [v2i16, v2f16, v2bf16] in { 3167 3168defvar Ty = vecTy.ElementType; 3169 3170def : GCNPat < 3171 (vecTy (UniformBinFrag<build_vector> (Ty SReg_32:$src0), (Ty undef))), 3172 (COPY_TO_REGCLASS SReg_32:$src0, SReg_32) 3173>; 3174 3175def : GCNPat < 3176 (vecTy (DivergentBinFrag<build_vector> (Ty VGPR_32:$src0), (Ty undef))), 3177 (COPY_TO_REGCLASS VGPR_32:$src0, VGPR_32) 3178>; 3179 3180def : GCNPat < 3181 (vecTy (UniformBinFrag<build_vector> (Ty undef), (Ty SReg_32:$src1))), 3182 (S_LSHL_B32 SReg_32:$src1, (i32 16)) 3183>; 3184 3185def : GCNPat < 3186 (vecTy (DivergentBinFrag<build_vector> (Ty undef), (Ty VGPR_32:$src1))), 3187 (vecTy (V_LSHLREV_B32_e64 (i32 16), VGPR_32:$src1)) 3188>; 3189} // End foreach Ty = ... 3190} 3191 3192let SubtargetPredicate = HasVOP3PInsts in { 3193def : GCNPat < 3194 (v2i16 (DivergentBinFrag<build_vector> (i16 VGPR_32:$src0), (i16 VGPR_32:$src1))), 3195 (v2i16 (V_LSHL_OR_B32_e64 $src1, (i32 16), (i32 (V_AND_B32_e64 (i32 (V_MOV_B32_e32 (i32 0xffff))), $src0)))) 3196>; 3197 3198// With multiple uses of the shift, this will duplicate the shift and 3199// increase register pressure. 
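
// s_pack_lh_b32_b16 combines the low 16 bits of src0 with the high 16 bits
// of src1, so a uniform build_vector of (a, hi16(b)) folds the shift away;
// the srl_oneuse guard keeps the pattern from firing when the shifted value
// has other uses (see the note above).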
3200def : GCNPat < 3201 (v2i16 (UniformBinFrag<build_vector> (i16 SReg_32:$src0), (i16 (trunc (srl_oneuse SReg_32:$src1, (i32 16)))))), 3202 (v2i16 (S_PACK_LH_B32_B16 SReg_32:$src0, SReg_32:$src1)) 3203>; 3204 3205def : GCNPat < 3206 (v2i16 (UniformBinFrag<build_vector> (i16 (trunc (srl_oneuse SReg_32:$src0, (i32 16)))), 3207 (i16 (trunc (srl_oneuse SReg_32:$src1, (i32 16)))))), 3208 (S_PACK_HH_B32_B16 SReg_32:$src0, SReg_32:$src1) 3209>; 3210 3211 3212foreach vecTy = [v2i16, v2f16, v2bf16] in { 3213 3214defvar Ty = vecTy.ElementType; 3215defvar immzeroTy = !if(!eq(Ty, i16), immzero, fpimmzero); 3216 3217def : GCNPat < 3218 (vecTy (UniformBinFrag<build_vector> (Ty SReg_32:$src0), (Ty SReg_32:$src1))), 3219 (S_PACK_LL_B32_B16 SReg_32:$src0, SReg_32:$src1) 3220>; 3221 3222// Take the lower 16 bits from each VGPR_32 and concat them 3223def : GCNPat < 3224 (vecTy (DivergentBinFrag<build_vector> (Ty VGPR_32:$a), (Ty VGPR_32:$b))), 3225 (V_PERM_B32_e64 VGPR_32:$b, VGPR_32:$a, (S_MOV_B32 (i32 0x05040100))) 3226>; 3227 3228 3229// Take the lower 16 bits from V[0] and the upper 16 bits from V[1] 3230// Special case, can use V_BFI (0xffff literal likely more reusable than 0x70601000) 3231def : GCNPat < 3232 (vecTy (DivergentBinFrag<build_vector> (Ty (immzeroTy)), 3233 (Ty !if(!eq(Ty, i16), 3234 (Ty (trunc (srl VGPR_32:$b, (i32 16)))), 3235 (Ty (bitconvert (i16 (trunc (srl VGPR_32:$b, (i32 16)))))))))), 3236 (V_AND_B32_e64 (S_MOV_B32 (i32 0xffff0000)), VGPR_32:$b) 3237>; 3238 3239 3240// Take the lower 16 bits from V[0] and the upper 16 bits from V[1] 3241// Special case, can use V_BFI (0xffff literal likely more reusable than 0x70601000) 3242def : GCNPat < 3243 (vecTy (DivergentBinFrag<build_vector> (Ty VGPR_32:$a), 3244 (Ty !if(!eq(Ty, i16), 3245 (Ty (trunc (srl VGPR_32:$b, (i32 16)))), 3246 (Ty (bitconvert (i16 (trunc (srl VGPR_32:$b, (i32 16)))))))))), 3247 (V_BFI_B32_e64 (S_MOV_B32 (i32 0x0000ffff)), VGPR_32:$a, VGPR_32:$b) 3248>; 3249 3250 3251// Take the upper 16 bits from V[0] and the lower 16 bits from V[1] 3252// Special case, can use V_ALIGNBIT (always uses encoded literal) 3253def : GCNPat < 3254 (vecTy (DivergentBinFrag<build_vector> 3255 (Ty !if(!eq(Ty, i16), 3256 (Ty (trunc (srl VGPR_32:$a, (i32 16)))), 3257 (Ty (bitconvert (i16 (trunc (srl VGPR_32:$a, (i32 16)))))))), 3258 (Ty VGPR_32:$b))), 3259 (V_ALIGNBIT_B32_e64 VGPR_32:$b, VGPR_32:$a, (i32 16)) 3260>; 3261 3262// Take the upper 16 bits from each VGPR_32 and concat them 3263def : GCNPat < 3264 (vecTy (DivergentBinFrag<build_vector> 3265 (Ty !if(!eq(Ty, i16), 3266 (Ty (trunc (srl VGPR_32:$a, (i32 16)))), 3267 (Ty (bitconvert (i16 (trunc (srl VGPR_32:$a, (i32 16)))))))), 3268 (Ty !if(!eq(Ty, i16), 3269 (Ty (trunc (srl VGPR_32:$b, (i32 16)))), 3270 (Ty (bitconvert (i16 (trunc (srl VGPR_32:$b, (i32 16)))))))))), 3271 (V_PERM_B32_e64 VGPR_32:$b, VGPR_32:$a, (S_MOV_B32 (i32 0x07060302))) 3272>; 3273 3274 3275} // end foreach Ty 3276 3277 3278let AddedComplexity = 5 in { 3279def : GCNPat < 3280 (v2f16 (is_canonicalized_2<build_vector> (f16 (VOP3Mods (f16 VGPR_32:$src0), i32:$src0_mods)), 3281 (f16 (VOP3Mods (f16 VGPR_32:$src1), i32:$src1_mods)))), 3282 (V_PACK_B32_F16_e64 $src0_mods, VGPR_32:$src0, $src1_mods, VGPR_32:$src1) 3283>; 3284} 3285} // End SubtargetPredicate = HasVOP3PInsts 3286 3287// With multiple uses of the shift, this will duplicate the shift and 3288// increase register pressure. 
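
// gfx11+ also has s_pack_hl_b32_b16, which pairs the high 16 bits of src0
// with the low 16 bits of src1, so the (hi16(a), b) form below needs no
// shift either.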
// With multiple uses of the shift, this will duplicate the shift and
// increase register pressure.
let SubtargetPredicate = isGFX11Plus in
def : GCNPat <
  (v2i16 (build_vector (i16 (trunc (srl_oneuse SReg_32:$src0, (i32 16)))), (i16 SReg_32:$src1))),
  (v2i16 (S_PACK_HL_B32_B16 SReg_32:$src0, SReg_32:$src1))
>;


def : GCNPat <
  (v2f16 (scalar_to_vector f16:$src0)),
  (COPY $src0)
>;

def : GCNPat <
  (v2i16 (scalar_to_vector i16:$src0)),
  (COPY $src0)
>;

def : GCNPat <
  (v4i16 (scalar_to_vector i16:$src0)),
  (INSERT_SUBREG (IMPLICIT_DEF), $src0, sub0)
>;

def : GCNPat <
  (v4f16 (scalar_to_vector f16:$src0)),
  (INSERT_SUBREG (IMPLICIT_DEF), $src0, sub0)
>;

def : GCNPat <
  (i64 (int_amdgcn_mov_dpp i64:$src, timm:$dpp_ctrl, timm:$row_mask,
                           timm:$bank_mask, timm:$bound_ctrl)),
  (V_MOV_B64_DPP_PSEUDO VReg_64_Align2:$src, VReg_64_Align2:$src,
                        (as_i32timm $dpp_ctrl), (as_i32timm $row_mask),
                        (as_i32timm $bank_mask),
                        (as_i1timm $bound_ctrl))
>;

foreach vt = Reg64Types.types in {
def : GCNPat <
  (vt (int_amdgcn_update_dpp vt:$old, vt:$src, timm:$dpp_ctrl, timm:$row_mask,
                              timm:$bank_mask, timm:$bound_ctrl)),
  (V_MOV_B64_DPP_PSEUDO VReg_64_Align2:$old, VReg_64_Align2:$src, (as_i32timm $dpp_ctrl),
                        (as_i32timm $row_mask), (as_i32timm $bank_mask),
                        (as_i1timm $bound_ctrl))
>;
}

//===----------------------------------------------------------------------===//
// Fract Patterns
//===----------------------------------------------------------------------===//

let SubtargetPredicate = isGFX6 in {

// V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) is
// used instead. However, SI doesn't have V_FLOOR_F64, so the most efficient
// way to implement it is using V_FRACT_F64.
// The workaround for the V_FRACT bug is:
// fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)

// Convert floor(x) to (x - fract(x))

// Don't bother handling this for GlobalISel, it's handled during
// lowering.
//
// FIXME: DAG should also custom lower this.
def : GCNPat <
  (f64 (ffloor (f64 (VOP3Mods f64:$x, i32:$mods)))),
  (V_ADD_F64_e64
      $mods,
      $x,
      SRCMODS.NEG,
      (V_CNDMASK_B64_PSEUDO
         (V_MIN_F64_e64
             SRCMODS.NONE,
             (V_FRACT_F64_e64 $mods, $x),
             SRCMODS.NONE,
             (V_MOV_B64_PSEUDO (i64 0x3fefffffffffffff))),
         $x,
         (V_CMP_CLASS_F64_e64 SRCMODS.NONE, $x, (i32 3 /*NaN*/))))
>;

} // End SubtargetPredicate = isGFX6

//============================================================================//
// Miscellaneous Optimization Patterns
//============================================================================//

// Undo sub x, c -> add x, -c canonicalization since c is more likely
// an inline immediate than -c.
// TODO: Also do for 64-bit.
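//
// For example (assuming NegSubInlineConst32 matches constants whose negation
// fits the inline-immediate range): "add x, -64" is selected below as
// "s_sub_i32 x, 64" (or a v_sub for the divergent case), since 64 is an
// inline immediate while -64 would need a literal.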
def : GCNPat<
  (UniformBinFrag<add> i32:$src0, (i32 NegSubInlineConst32:$src1)),
  (S_SUB_I32 SReg_32:$src0, NegSubInlineConst32:$src1)
>;

def : GCNPat<
  (DivergentBinFrag<add> i32:$src0, (i32 NegSubInlineConst32:$src1)),
  (V_SUB_U32_e64 VS_32:$src0, NegSubInlineConst32:$src1)> {
  let SubtargetPredicate = HasAddNoCarryInsts;
}

def : GCNPat<
  (DivergentBinFrag<add> i32:$src0, (i32 NegSubInlineConst32:$src1)),
  (V_SUB_CO_U32_e64 VS_32:$src0, NegSubInlineConst32:$src1)> {
  let SubtargetPredicate = NotHasAddNoCarryInsts;
}


// Avoid pointlessly materializing a constant in VGPR.
// FIXME: Should also do this for readlane, but tablegen crashes on
// the ignored src1.
def : GCNPat<
  (i32 (int_amdgcn_readfirstlane (i32 imm:$src))),
  (S_MOV_B32 SReg_32:$src)
>;

multiclass BFMPatterns <ValueType vt, PatFrag SHL, PatFrag ADD, InstSI BFM> {
  def : GCNPat <
    (vt (SHL (vt (add (vt (shl 1, vt:$a)), -1)), vt:$b)),
    (BFM $a, $b)
  >;

  def : GCNPat <
    (vt (ADD (vt (shl 1, vt:$a)), -1)),
    (BFM $a, (i32 0))
  >;
}

defm : BFMPatterns <i32, UniformBinFrag<shl>, UniformBinFrag<add>, S_BFM_B32>;
// FIXME: defm : BFMPatterns <i64, UniformBinFrag<shl>, UniformBinFrag<add>, S_BFM_B64>;
defm : BFMPatterns <i32, DivergentBinFrag<shl>, DivergentBinFrag<add>, V_BFM_B32_e64>;

// Bitfield extract patterns

def IMMZeroBasedBitfieldMask : ImmLeaf <i32, [{
  return isMask_32(Imm);
}]>;

def IMMPopCount : SDNodeXForm<imm, [{
  return CurDAG->getTargetConstant(llvm::popcount(N->getZExtValue()), SDLoc(N),
                                   MVT::i32);
}]>;

def : AMDGPUPat <
  (DivergentBinFrag<and> (i32 (srl i32:$src, i32:$rshift)),
                         IMMZeroBasedBitfieldMask:$mask),
  (V_BFE_U32_e64 $src, $rshift, (i32 (IMMPopCount $mask)))
>;

// x & ((1 << y) - 1)
def : AMDGPUPat <
  (DivergentBinFrag<and> i32:$src, (add_oneuse (shl_oneuse 1, i32:$width), -1)),
  (V_BFE_U32_e64 $src, (i32 0), $width)
>;

// x & ~(-1 << y)
def : AMDGPUPat <
  (DivergentBinFrag<and> i32:$src,
                         (xor_oneuse (shl_oneuse -1, i32:$width), -1)),
  (V_BFE_U32_e64 $src, (i32 0), $width)
>;

// x & (-1 >> (bitwidth - y))
def : AMDGPUPat <
  (DivergentBinFrag<and> i32:$src, (srl_oneuse -1, (sub 32, i32:$width))),
  (V_BFE_U32_e64 $src, (i32 0), $width)
>;

// x << (bitwidth - y) >> (bitwidth - y)
def : AMDGPUPat <
  (DivergentBinFrag<srl> (shl_oneuse i32:$src, (sub 32, i32:$width)),
                         (sub 32, i32:$width)),
  (V_BFE_U32_e64 $src, (i32 0), $width)
>;

def : AMDGPUPat <
  (DivergentBinFrag<sra> (shl_oneuse i32:$src, (sub 32, i32:$width)),
                         (sub 32, i32:$width)),
  (V_BFE_I32_e64 $src, (i32 0), $width)
>;

// SHA-256 Ma patterns

// ((x & z) | (y & (x | z))) -> BFI (XOR x, y), z, y
def : AMDGPUPatIgnoreCopies <
  (DivergentBinFrag<or> (and i32:$x, i32:$z),
                        (and i32:$y, (or i32:$x, i32:$z))),
  (V_BFI_B32_e64 (V_XOR_B32_e64 (COPY_TO_REGCLASS VSrc_b32:$x, VGPR_32),
                                (COPY_TO_REGCLASS VSrc_b32:$y, VGPR_32)),
                 (COPY_TO_REGCLASS VSrc_b32:$z, VGPR_32),
                 (COPY_TO_REGCLASS VSrc_b32:$y, VGPR_32))
>;

def : AMDGPUPatIgnoreCopies <
  (DivergentBinFrag<or> (and i64:$x, i64:$z),
                        (and i64:$y, (or i64:$x, i64:$z))),
  (REG_SEQUENCE VReg_64,
    (V_BFI_B32_e64 (V_XOR_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$x, sub0)),
                                  (i32 (EXTRACT_SUBREG VReg_64:$y, sub0))),
                   (i32 (EXTRACT_SUBREG VReg_64:$z, sub0)),
                   (i32 (EXTRACT_SUBREG VReg_64:$y, sub0))), sub0,
    (V_BFI_B32_e64 (V_XOR_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$x, sub1)),
                                  (i32 (EXTRACT_SUBREG VReg_64:$y, sub1))),
                   (i32 (EXTRACT_SUBREG VReg_64:$z, sub1)),
                   (i32 (EXTRACT_SUBREG VReg_64:$y, sub1))), sub1)
>;

multiclass IntMed3Pat<Instruction med3Inst,
                      SDPatternOperator min,
                      SDPatternOperator max> {

  // This matches 16 permutations of
  // min(max(a, b), max(min(a, b), c))
  def : AMDGPUPat <
    (min (max i32:$src0, i32:$src1),
         (max (min i32:$src0, i32:$src1), i32:$src2)),
    (med3Inst VSrc_b32:$src0, VSrc_b32:$src1, VSrc_b32:$src2)
  >;

  // This matches 16 permutations of
  // max(min(x, y), min(max(x, y), z))
  def : AMDGPUPat <
    (max (min i32:$src0, i32:$src1),
         (min (max i32:$src0, i32:$src1), i32:$src2)),
    (med3Inst VSrc_b32:$src0, VSrc_b32:$src1, VSrc_b32:$src2)
  >;
}

defm : IntMed3Pat<V_MED3_I32_e64, smin, smax>;
defm : IntMed3Pat<V_MED3_U32_e64, umin, umax>;

multiclass FPMed3Pat<ValueType vt,
                     Instruction med3Inst> {
  // This matches 16 permutations of max(min(x, y), min(max(x, y), z))
  def : GCNPat<
    (fmaxnum_like_nnan
        (fminnum_like (VOP3Mods vt:$src0, i32:$src0_mods),
                      (VOP3Mods vt:$src1, i32:$src1_mods)),
        (fminnum_like (fmaxnum_like (VOP3Mods vt:$src0, i32:$src0_mods),
                                    (VOP3Mods vt:$src1, i32:$src1_mods)),
                      (vt (VOP3Mods vt:$src2, i32:$src2_mods)))),
    (med3Inst $src0_mods, $src0, $src1_mods, $src1, $src2_mods, $src2,
              DSTCLAMP.NONE, DSTOMOD.NONE)>;


  // This matches 16 permutations of min(max(x, y), max(min(x, y), z))
  def : GCNPat<
    (fminnum_like_nnan
        (fmaxnum_like (VOP3Mods vt:$src0, i32:$src0_mods),
                      (VOP3Mods vt:$src1, i32:$src1_mods)),
        (fmaxnum_like (fminnum_like (VOP3Mods vt:$src0, i32:$src0_mods),
                                    (VOP3Mods vt:$src1, i32:$src1_mods)),
                      (vt (VOP3Mods vt:$src2, i32:$src2_mods)))),
    (med3Inst $src0_mods, $src0, $src1_mods, $src1, $src2_mods, $src2,
              DSTCLAMP.NONE, DSTOMOD.NONE)>;
}

multiclass Int16Med3Pat<Instruction med3Inst,
                        SDPatternOperator min,
                        SDPatternOperator max> {
  // This matches 16 permutations of
  // max(min(x, y), min(max(x, y), z))
  def : GCNPat <
    (max (min i16:$src0, i16:$src1),
         (min (max i16:$src0, i16:$src1), i16:$src2)),
    (med3Inst SRCMODS.NONE, VSrc_b16:$src0, SRCMODS.NONE, VSrc_b16:$src1,
              SRCMODS.NONE, VSrc_b16:$src2, DSTCLAMP.NONE)
  >;

  // This matches 16 permutations of
  // min(max(a, b), max(min(a, b), c))
  def : GCNPat <
    (min (max i16:$src0, i16:$src1),
         (max (min i16:$src0, i16:$src1), i16:$src2)),
    (med3Inst SRCMODS.NONE, VSrc_b16:$src0, SRCMODS.NONE, VSrc_b16:$src1,
              SRCMODS.NONE, VSrc_b16:$src2, DSTCLAMP.NONE)
  >;
}

defm : FPMed3Pat<f32, V_MED3_F32_e64>;

let SubtargetPredicate = HasMed3_16 in {
defm : FPMed3Pat<f16, V_MED3_F16_e64>;
}

class
IntMinMaxPat<Instruction minmaxInst, SDPatternOperator min_or_max,
             SDPatternOperator max_or_min_oneuse> : AMDGPUPat <
  (DivergentBinFrag<min_or_max> (max_or_min_oneuse i32:$src0, i32:$src1),
                                i32:$src2),
  (minmaxInst VSrc_b32:$src0, VSrc_b32:$src1, VSrc_b32:$src2)
>;

class
FPMinMaxPat<Instruction minmaxInst, ValueType vt, SDPatternOperator min_or_max,
            SDPatternOperator max_or_min_oneuse>
  : GCNPat <
  (min_or_max (max_or_min_oneuse (VOP3Mods vt:$src0, i32:$src0_mods),
                                 (VOP3Mods vt:$src1, i32:$src1_mods)),
              (vt (VOP3Mods vt:$src2, i32:$src2_mods))),
  (minmaxInst $src0_mods, $src0, $src1_mods, $src1, $src2_mods, $src2,
              DSTCLAMP.NONE, DSTOMOD.NONE)
>;

class
FPMinCanonMaxPat<Instruction minmaxInst, ValueType vt, SDPatternOperator min_or_max,
                 SDPatternOperator max_or_min_oneuse> : GCNPat <
  (min_or_max (is_canonicalized_1<fcanonicalize>
                  (max_or_min_oneuse (VOP3Mods vt:$src0, i32:$src0_mods),
                                     (VOP3Mods vt:$src1, i32:$src1_mods))),
              (vt (VOP3Mods vt:$src2, i32:$src2_mods))),
  (minmaxInst $src0_mods, $src0, $src1_mods, $src1, $src2_mods, $src2,
              DSTCLAMP.NONE, DSTOMOD.NONE)
>;

let OtherPredicates = [isGFX11Plus] in {
def : IntMinMaxPat<V_MAXMIN_I32_e64, smin, smax_oneuse>;
def : IntMinMaxPat<V_MINMAX_I32_e64, smax, smin_oneuse>;
def : IntMinMaxPat<V_MAXMIN_U32_e64, umin, umax_oneuse>;
def : IntMinMaxPat<V_MINMAX_U32_e64, umax, umin_oneuse>;
def : FPMinMaxPat<V_MINMAX_F32_e64, f32, fmaxnum_like, fminnum_like_oneuse>;
def : FPMinMaxPat<V_MAXMIN_F32_e64, f32, fminnum_like, fmaxnum_like_oneuse>;
def : FPMinMaxPat<V_MINMAX_F16_e64, f16, fmaxnum_like, fminnum_like_oneuse>;
def : FPMinMaxPat<V_MAXMIN_F16_e64, f16, fminnum_like, fmaxnum_like_oneuse>;
def : FPMinCanonMaxPat<V_MINMAX_F32_e64, f32, fmaxnum_like, fminnum_like_oneuse>;
def : FPMinCanonMaxPat<V_MAXMIN_F32_e64, f32, fminnum_like, fmaxnum_like_oneuse>;
def : FPMinCanonMaxPat<V_MINMAX_F16_e64, f16, fmaxnum_like, fminnum_like_oneuse>;
def : FPMinCanonMaxPat<V_MAXMIN_F16_e64, f16, fminnum_like, fmaxnum_like_oneuse>;
}

let OtherPredicates = [isGFX9Plus] in {
defm : Int16Med3Pat<V_MED3_I16_e64, smin, smax>;
defm : Int16Med3Pat<V_MED3_U16_e64, umin, umax>;
} // End OtherPredicates = [isGFX9Plus]

let OtherPredicates = [isGFX12Plus] in {
def : FPMinMaxPat<V_MINIMUMMAXIMUM_F32_e64, f32, DivergentBinFrag<fmaximum>, fminimum_oneuse>;
def : FPMinMaxPat<V_MAXIMUMMINIMUM_F32_e64, f32, DivergentBinFrag<fminimum>, fmaximum_oneuse>;
def : FPMinMaxPat<V_MINIMUMMAXIMUM_F16_e64, f16, DivergentBinFrag<fmaximum>, fminimum_oneuse>;
def : FPMinMaxPat<V_MAXIMUMMINIMUM_F16_e64, f16, DivergentBinFrag<fminimum>, fmaximum_oneuse>;
def : FPMinCanonMaxPat<V_MINIMUMMAXIMUM_F32_e64, f32, DivergentBinFrag<fmaximum>, fminimum_oneuse>;
def : FPMinCanonMaxPat<V_MAXIMUMMINIMUM_F32_e64, f32, DivergentBinFrag<fminimum>, fmaximum_oneuse>;
def : FPMinCanonMaxPat<V_MINIMUMMAXIMUM_F16_e64, f16, DivergentBinFrag<fmaximum>, fminimum_oneuse>;
def : FPMinCanonMaxPat<V_MAXIMUMMINIMUM_F16_e64, f16, DivergentBinFrag<fminimum>, fmaximum_oneuse>;
}

// Convert a floating-point power of 2 to the integer exponent.
def FPPow2ToExponentXForm : SDNodeXForm<fpimm, [{
  const auto &APF = N->getValueAPF();
  int Log2 = APF.getExactLog2Abs();
  assert(Log2 != INT_MIN);
  return CurDAG->getTargetConstant(Log2, SDLoc(N), MVT::i32);
}]>;

// Check if a floating point value is a power of 2 floating-point
// immediate where it's preferable to emit a multiply by it as an
// ldexp. We skip over 0.5 to 4.0 as those are inline immediates
// anyway.
def fpimm_pos_pow2_prefer_ldexp_f64 : FPImmLeaf<f64, [{
    if (Imm.isNegative())
      return false;

    int Exp = Imm.getExactLog2Abs();
    // Prefer leaving the FP inline immediates as they are.
3649 // 0.5, 1.0, 2.0, 4.0 3650 3651 // For f64 ldexp is always better than materializing a 64-bit 3652 // constant. 3653 return Exp != INT_MIN && (Exp < -1 || Exp > 2); 3654 }], FPPow2ToExponentXForm 3655>; 3656 3657def fpimm_neg_pow2_prefer_ldexp_f64 : FPImmLeaf<f64, [{ 3658 if (!Imm.isNegative()) 3659 return false; 3660 int Exp = Imm.getExactLog2Abs(); 3661 // Prefer leaving the FP inline immediates as they are. 3662 // 0.5, 1.0, 2.0, 4.0 3663 3664 // For f64 ldexp is always better than materializing a 64-bit 3665 // constant. 3666 return Exp != INT_MIN && (Exp < -1 || Exp > 2); 3667 }], FPPow2ToExponentXForm 3668>; 3669 3670// f64 is different because we also want to handle cases that may 3671// require materialization of the exponent. 3672// TODO: If we know f64 ops are fast, prefer add (ldexp x, N), y over fma 3673// TODO: For f32/f16, it's not a clear win on code size to use ldexp 3674// in place of mul since we have to use the vop3 form. Are there power 3675// savings or some other reason to prefer ldexp over mul? 3676def : GCNPat< 3677 (any_fmul (f64 (VOP3Mods f64:$src0, i32:$src0_mods)), 3678 fpimm_pos_pow2_prefer_ldexp_f64:$src1), 3679 (V_LDEXP_F64_e64 i32:$src0_mods, VSrc_b64:$src0, 3680 0, (S_MOV_B32 (i32 (FPPow2ToExponentXForm $src1)))) 3681>; 3682 3683def : GCNPat< 3684 (any_fmul f64:$src0, fpimm_neg_pow2_prefer_ldexp_f64:$src1), 3685 (V_LDEXP_F64_e64 SRCMODS.NEG, VSrc_b64:$src0, 3686 0, (S_MOV_B32 (i32 (FPPow2ToExponentXForm $src1)))) 3687>; 3688 3689// We want to avoid using VOP3Mods which could pull in another fneg 3690// which we would need to be re-negated (which should never happen in 3691// practice). I don't see a way to apply an SDNodeXForm that accounts 3692// for a second operand. 3693def : GCNPat< 3694 (any_fmul (fabs f64:$src0), fpimm_neg_pow2_prefer_ldexp_f64:$src1), 3695 (V_LDEXP_F64_e64 SRCMODS.NEG_ABS, VSrc_b64:$src0, 3696 0, (S_MOV_B32 (i32 (FPPow2ToExponentXForm $src1)))) 3697>; 3698 3699class AMDGPUGenericInstruction : GenericInstruction { 3700 let Namespace = "AMDGPU"; 3701} 3702 3703// Convert a wave address to a swizzled vector address (i.e. this is 3704// for copying the stack pointer to a vector address appropriate to 3705// use in the offset field of mubuf instructions). 3706def G_AMDGPU_WAVE_ADDRESS : AMDGPUGenericInstruction { 3707 let OutOperandList = (outs type0:$dst); 3708 let InOperandList = (ins type0:$src); 3709 let hasSideEffects = 0; 3710} 3711 3712// Returns -1 if the input is zero. 3713def G_AMDGPU_FFBH_U32 : AMDGPUGenericInstruction { 3714 let OutOperandList = (outs type0:$dst); 3715 let InOperandList = (ins type1:$src); 3716 let hasSideEffects = 0; 3717} 3718 3719// Returns -1 if the input is zero. 
def G_AMDGPU_FFBL_B32 : AMDGPUGenericInstruction {
  let OutOperandList = (outs type0:$dst);
  let InOperandList = (ins type1:$src);
  let hasSideEffects = 0;
}

def G_AMDGPU_RCP_IFLAG : AMDGPUGenericInstruction {
  let OutOperandList = (outs type0:$dst);
  let InOperandList = (ins type1:$src);
  let hasSideEffects = 0;
}

class BufferLoadGenericInstruction : AMDGPUGenericInstruction {
  let OutOperandList = (outs type0:$dst);
  let InOperandList = (ins type1:$rsrc, type2:$vindex, type2:$voffset,
                           type2:$soffset, untyped_imm_0:$offset,
                           untyped_imm_0:$cachepolicy, untyped_imm_0:$idxen);
  let hasSideEffects = 0;
  let mayLoad = 1;
}

class TBufferLoadGenericInstruction : AMDGPUGenericInstruction {
  let OutOperandList = (outs type0:$dst);
  let InOperandList = (ins type1:$rsrc, type2:$vindex, type2:$voffset,
                           type2:$soffset, untyped_imm_0:$offset, untyped_imm_0:$format,
                           untyped_imm_0:$cachepolicy, untyped_imm_0:$idxen);
  let hasSideEffects = 0;
  let mayLoad = 1;
}

def G_AMDGPU_BUFFER_LOAD_UBYTE : BufferLoadGenericInstruction;
def G_AMDGPU_BUFFER_LOAD_SBYTE : BufferLoadGenericInstruction;
def G_AMDGPU_BUFFER_LOAD_USHORT : BufferLoadGenericInstruction;
def G_AMDGPU_BUFFER_LOAD_SSHORT : BufferLoadGenericInstruction;
def G_AMDGPU_BUFFER_LOAD : BufferLoadGenericInstruction;
def G_AMDGPU_BUFFER_LOAD_UBYTE_TFE : BufferLoadGenericInstruction;
def G_AMDGPU_BUFFER_LOAD_SBYTE_TFE : BufferLoadGenericInstruction;
def G_AMDGPU_BUFFER_LOAD_USHORT_TFE : BufferLoadGenericInstruction;
def G_AMDGPU_BUFFER_LOAD_SSHORT_TFE : BufferLoadGenericInstruction;
def G_AMDGPU_BUFFER_LOAD_TFE : BufferLoadGenericInstruction;
def G_AMDGPU_BUFFER_LOAD_FORMAT : BufferLoadGenericInstruction;
def G_AMDGPU_BUFFER_LOAD_FORMAT_TFE : BufferLoadGenericInstruction;
def G_AMDGPU_BUFFER_LOAD_FORMAT_D16 : BufferLoadGenericInstruction;
def G_AMDGPU_TBUFFER_LOAD_FORMAT : TBufferLoadGenericInstruction;
def G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 : TBufferLoadGenericInstruction;

class BufferStoreGenericInstruction : AMDGPUGenericInstruction {
  let OutOperandList = (outs);
  let InOperandList = (ins type0:$vdata, type1:$rsrc, type2:$vindex, type2:$voffset,
                           type2:$soffset, untyped_imm_0:$offset,
                           untyped_imm_0:$cachepolicy, untyped_imm_0:$idxen);
  let hasSideEffects = 0;
  let mayStore = 1;
}

class TBufferStoreGenericInstruction : AMDGPUGenericInstruction {
  let OutOperandList = (outs);
  let InOperandList = (ins type0:$vdata, type1:$rsrc, type2:$vindex, type2:$voffset,
                           type2:$soffset, untyped_imm_0:$offset,
                           untyped_imm_0:$format,
                           untyped_imm_0:$cachepolicy, untyped_imm_0:$idxen);
  let hasSideEffects = 0;
  let mayStore = 1;
}

def G_AMDGPU_BUFFER_STORE : BufferStoreGenericInstruction;
def G_AMDGPU_BUFFER_STORE_BYTE : BufferStoreGenericInstruction;
def G_AMDGPU_BUFFER_STORE_SHORT : BufferStoreGenericInstruction;
def G_AMDGPU_BUFFER_STORE_FORMAT : BufferStoreGenericInstruction;
def G_AMDGPU_BUFFER_STORE_FORMAT_D16 : BufferStoreGenericInstruction;
def G_AMDGPU_TBUFFER_STORE_FORMAT : TBufferStoreGenericInstruction;
def G_AMDGPU_TBUFFER_STORE_FORMAT_D16 : TBufferStoreGenericInstruction;

def G_AMDGPU_FMIN_LEGACY : AMDGPUGenericInstruction {
  let OutOperandList = (outs type0:$dst);
  let InOperandList = (ins type0:$src0, type0:$src1);
  let hasSideEffects = 0;
3797} 3798 3799def G_AMDGPU_FMAX_LEGACY : AMDGPUGenericInstruction { 3800 let OutOperandList = (outs type0:$dst); 3801 let InOperandList = (ins type0:$src0, type0:$src1); 3802 let hasSideEffects = 0; 3803} 3804 3805foreach N = 0-3 in { 3806def G_AMDGPU_CVT_F32_UBYTE#N : AMDGPUGenericInstruction { 3807 let OutOperandList = (outs type0:$dst); 3808 let InOperandList = (ins type0:$src0); 3809 let hasSideEffects = 0; 3810} 3811} 3812 3813def G_AMDGPU_CVT_PK_I16_I32 : AMDGPUGenericInstruction { 3814 let OutOperandList = (outs type0:$dst); 3815 let InOperandList = (ins type0:$src0, type0:$src1); 3816 let hasSideEffects = 0; 3817} 3818 3819def G_AMDGPU_SMED3 : AMDGPUGenericInstruction { 3820 let OutOperandList = (outs type0:$dst); 3821 let InOperandList = (ins type0:$src0, type0:$src1, type0:$src2); 3822 let hasSideEffects = 0; 3823} 3824 3825def G_AMDGPU_UMED3 : AMDGPUGenericInstruction { 3826 let OutOperandList = (outs type0:$dst); 3827 let InOperandList = (ins type0:$src0, type0:$src1, type0:$src2); 3828 let hasSideEffects = 0; 3829} 3830 3831def G_AMDGPU_FMED3 : AMDGPUGenericInstruction { 3832 let OutOperandList = (outs type0:$dst); 3833 let InOperandList = (ins type0:$src0, type0:$src1, type0:$src2); 3834 let hasSideEffects = 0; 3835} 3836 3837def G_AMDGPU_CLAMP : AMDGPUGenericInstruction { 3838 let OutOperandList = (outs type0:$dst); 3839 let InOperandList = (ins type0:$src); 3840 let hasSideEffects = 0; 3841} 3842 3843// Integer multiply-add: arg0 * arg1 + arg2. 3844// 3845// arg0 and arg1 are 32-bit integers (interpreted as signed or unsigned), 3846// arg2 is a 64-bit integer. Result is a 64-bit integer and a 1-bit carry-out. 3847class G_AMDGPU_MAD_64_32 : AMDGPUGenericInstruction { 3848 let OutOperandList = (outs type0:$dst, type1:$carry_out); 3849 let InOperandList = (ins type2:$arg0, type2:$arg1, type0:$arg2); 3850 let hasSideEffects = 0; 3851} 3852 3853def G_AMDGPU_MAD_U64_U32 : G_AMDGPU_MAD_64_32; 3854def G_AMDGPU_MAD_I64_I32 : G_AMDGPU_MAD_64_32; 3855 3856// Atomic cmpxchg. $cmpval ad $newval are packed in a single vector 3857// operand Expects a MachineMemOperand in addition to explicit 3858// operands. 
// Atomic cmpxchg. $cmpval and $newval are packed in a single vector
// operand. Expects a MachineMemOperand in addition to the explicit
// operands.
def G_AMDGPU_ATOMIC_CMPXCHG : AMDGPUGenericInstruction {
  let OutOperandList = (outs type0:$oldval);
  let InOperandList = (ins ptype1:$addr, type0:$cmpval_newval);
  let hasSideEffects = 0;
  let mayLoad = 1;
  let mayStore = 1;
}

class BufferAtomicGenericInstruction : AMDGPUGenericInstruction {
  let OutOperandList = (outs type0:$dst);
  let InOperandList = (ins type0:$vdata, type1:$rsrc, type2:$vindex, type2:$voffset,
                           type2:$soffset, untyped_imm_0:$offset,
                           untyped_imm_0:$cachepolicy, untyped_imm_0:$idxen);
  let hasSideEffects = 0;
  let mayLoad = 1;
  let mayStore = 1;
}

def G_AMDGPU_BUFFER_ATOMIC_SWAP : BufferAtomicGenericInstruction;
def G_AMDGPU_BUFFER_ATOMIC_ADD : BufferAtomicGenericInstruction;
def G_AMDGPU_BUFFER_ATOMIC_SUB : BufferAtomicGenericInstruction;
def G_AMDGPU_BUFFER_ATOMIC_SMIN : BufferAtomicGenericInstruction;
def G_AMDGPU_BUFFER_ATOMIC_UMIN : BufferAtomicGenericInstruction;
def G_AMDGPU_BUFFER_ATOMIC_SMAX : BufferAtomicGenericInstruction;
def G_AMDGPU_BUFFER_ATOMIC_UMAX : BufferAtomicGenericInstruction;
def G_AMDGPU_BUFFER_ATOMIC_AND : BufferAtomicGenericInstruction;
def G_AMDGPU_BUFFER_ATOMIC_COND_SUB_U32 : BufferAtomicGenericInstruction;
def G_AMDGPU_BUFFER_ATOMIC_OR : BufferAtomicGenericInstruction;
def G_AMDGPU_BUFFER_ATOMIC_XOR : BufferAtomicGenericInstruction;
def G_AMDGPU_BUFFER_ATOMIC_INC : BufferAtomicGenericInstruction;
def G_AMDGPU_BUFFER_ATOMIC_DEC : BufferAtomicGenericInstruction;
def G_AMDGPU_BUFFER_ATOMIC_FADD : BufferAtomicGenericInstruction;
def G_AMDGPU_BUFFER_ATOMIC_FMIN : BufferAtomicGenericInstruction;
def G_AMDGPU_BUFFER_ATOMIC_FMAX : BufferAtomicGenericInstruction;

def G_AMDGPU_BUFFER_ATOMIC_CMPSWAP : AMDGPUGenericInstruction {
  let OutOperandList = (outs type0:$dst);
  let InOperandList = (ins type0:$vdata, type0:$cmp, type1:$rsrc, type2:$vindex,
                           type2:$voffset, type2:$soffset, untyped_imm_0:$offset,
                           untyped_imm_0:$cachepolicy, untyped_imm_0:$idxen);
  let hasSideEffects = 0;
  let mayLoad = 1;
  let mayStore = 1;
}

// Wrapper around llvm.amdgcn.s.buffer.load. This is mostly needed as
// a workaround for the intrinsic being defined as readnone, but
// it really needs a memory operand.

class SBufferLoadInstruction : AMDGPUGenericInstruction {
  let OutOperandList = (outs type0:$dst);
  let InOperandList = (ins type1:$rsrc, type2:$offset, untyped_imm_0:$cachepolicy);
  let hasSideEffects = 0;
  let mayLoad = 1;
  let mayStore = 0;
}

def G_AMDGPU_S_BUFFER_LOAD : SBufferLoadInstruction;
def G_AMDGPU_S_BUFFER_LOAD_SBYTE : SBufferLoadInstruction;
def G_AMDGPU_S_BUFFER_LOAD_UBYTE : SBufferLoadInstruction;
def G_AMDGPU_S_BUFFER_LOAD_SSHORT : SBufferLoadInstruction;
def G_AMDGPU_S_BUFFER_LOAD_USHORT : SBufferLoadInstruction;

def G_AMDGPU_S_MUL_U64_U32 : AMDGPUGenericInstruction {
  let OutOperandList = (outs type0:$dst);
  let InOperandList = (ins type0:$src0, type0:$src1);
  let hasSideEffects = 0;
}

def G_AMDGPU_S_MUL_I64_I32 : AMDGPUGenericInstruction {
  let OutOperandList = (outs type0:$dst);
  let InOperandList = (ins type0:$src0, type0:$src1);
  let hasSideEffects = 0;
}

// This is equivalent to the G_INTRINSIC*, but the operands may have
// been legalized depending on the subtarget requirements.
def G_AMDGPU_INTRIN_IMAGE_LOAD : AMDGPUGenericInstruction {
  let OutOperandList = (outs type0:$dst);
  let InOperandList = (ins unknown:$intrin, variable_ops);
  let hasSideEffects = 0;
  let mayLoad = 1;

  // FIXME: Use separate opcode for atomics.
  let mayStore = 1;
}

def G_AMDGPU_INTRIN_IMAGE_LOAD_D16 : AMDGPUGenericInstruction {
  let OutOperandList = (outs type0:$dst);
  let InOperandList = (ins unknown:$intrin, variable_ops);
  let hasSideEffects = 0;
  let mayLoad = 1;

  // FIXME: Use separate opcode for atomics.
  let mayStore = 1;
}

def G_AMDGPU_INTRIN_IMAGE_LOAD_NORET : AMDGPUGenericInstruction {
  let OutOperandList = (outs);
  let InOperandList = (ins unknown:$intrin, variable_ops);
  let hasSideEffects = 0;
  let mayLoad = 1;
  let mayStore = 1;
}

// This is equivalent to the G_INTRINSIC*, but the operands may have
// been legalized depending on the subtarget requirements.
def G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPUGenericInstruction {
  let OutOperandList = (outs);
  let InOperandList = (ins unknown:$intrin, variable_ops);
  let hasSideEffects = 0;
  let mayStore = 1;
}

def G_AMDGPU_INTRIN_IMAGE_STORE_D16 : AMDGPUGenericInstruction {
  let OutOperandList = (outs);
  let InOperandList = (ins unknown:$intrin, variable_ops);
  let hasSideEffects = 0;
  let mayStore = 1;
}

def G_AMDGPU_INTRIN_BVH_INTERSECT_RAY : AMDGPUGenericInstruction {
  let OutOperandList = (outs type0:$dst);
  let InOperandList = (ins unknown:$intrin, variable_ops);
  let hasSideEffects = 0;
  let mayLoad = 1;
  let mayStore = 0;
}

// Generic instruction for SI_CALL, so we can select the register bank and insert a waterfall loop
// if necessary.
def G_SI_CALL : AMDGPUGenericInstruction {
  let OutOperandList = (outs SReg_64:$dst);
  let InOperandList = (ins type0:$src0, unknown:$callee);
  let Size = 4;
  let isCall = 1;
  let UseNamedOperandTable = 1;
  let SchedRW = [WriteBranch];
  // TODO: Should really base this on the call target
  let isConvergent = 1;
}

def G_FPTRUNC_ROUND_UPWARD : AMDGPUGenericInstruction {
  let OutOperandList = (outs type0:$vdst);
  let InOperandList = (ins type1:$src0);
  let hasSideEffects = 0;
}

def G_FPTRUNC_ROUND_DOWNWARD : AMDGPUGenericInstruction {
  let OutOperandList = (outs type0:$vdst);
  let InOperandList = (ins type1:$src0);
  let hasSideEffects = 0;
}

//============================================================================//
// Dummy Instructions
//============================================================================//

def V_ILLEGAL : Enc32, InstSI<(outs), (ins), "v_illegal"> {
  let Inst{31-0} = 0x00000000;
  let FixedSize = 1;
  let Size = 4;
  let Uses = [EXEC];
  let hasSideEffects = 1;
  let SubtargetPredicate = isGFX10Plus;
}