//===-- SIInstructions.td - SI Instruction Definitions --------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// This file was originally auto-generated from a GPU register header file and
// all the instruction definitions were originally commented out. Instructions
// that are not yet supported remain commented out.
//===----------------------------------------------------------------------===//

class GCNPat<dag pattern, dag result> : Pat<pattern, result>, GCNPredicateControl {
}

class UniformSextInreg<ValueType VT> : PatFrag<
  (ops node:$src),
  (sext_inreg $src, VT),
  [{ return !N->isDivergent(); }]>;

class DivergentSextInreg<ValueType VT> : PatFrag<
  (ops node:$src),
  (sext_inreg $src, VT),
  [{ return N->isDivergent(); }]>;

include "SOPInstructions.td"
include "VOPInstructions.td"
include "SMInstructions.td"
include "FLATInstructions.td"
include "BUFInstructions.td"
include "EXPInstructions.td"
include "LDSDIRInstructions.td"
include "VINTERPInstructions.td"

//===----------------------------------------------------------------------===//
// VINTRP Instructions
//===----------------------------------------------------------------------===//

// Used to inject printing of "_e32" suffix for VI (there are "_e64" variants for VI)
def VINTRPDst : VINTRPDstOperand <VGPR_32>;

let Uses = [MODE, M0, EXEC] in {

// FIXME: Specify SchedRW for VINTRP instructions.

multiclass V_INTERP_P1_F32_m : VINTRP_m <
  0x00000000,
  (outs VINTRPDst:$vdst),
  (ins VGPR_32:$vsrc, InterpAttr:$attr, InterpAttrChan:$attrchan),
  "v_interp_p1_f32$vdst, $vsrc, $attr$attrchan",
  [(set f32:$vdst, (int_amdgcn_interp_p1 f32:$vsrc,
                    (i32 timm:$attrchan), (i32 timm:$attr), M0))]
>;

let OtherPredicates = [has32BankLDS, isNotGFX90APlus] in {

defm V_INTERP_P1_F32 : V_INTERP_P1_F32_m;

} // End OtherPredicates = [has32BankLDS, isNotGFX90APlus]

let OtherPredicates = [has16BankLDS, isNotGFX90APlus],
    Constraints = "@earlyclobber $vdst", isAsmParserOnly=1 in {

defm V_INTERP_P1_F32_16bank : V_INTERP_P1_F32_m;

} // End OtherPredicates = [has16BankLDS, isNotGFX90APlus],
  //     Constraints = "@earlyclobber $vdst", isAsmParserOnly=1

let OtherPredicates = [isNotGFX90APlus] in {
let DisableEncoding = "$src0", Constraints = "$src0 = $vdst" in {

defm V_INTERP_P2_F32 : VINTRP_m <
  0x00000001,
  (outs VINTRPDst:$vdst),
  (ins VGPR_32:$src0, VGPR_32:$vsrc, InterpAttr:$attr,
       InterpAttrChan:$attrchan),
  "v_interp_p2_f32$vdst, $vsrc, $attr$attrchan",
  [(set f32:$vdst, (int_amdgcn_interp_p2 f32:$src0, f32:$vsrc,
                    (i32 timm:$attrchan), (i32 timm:$attr), M0))]>;

} // End DisableEncoding = "$src0", Constraints = "$src0 = $vdst"

defm V_INTERP_MOV_F32 : VINTRP_m <
  0x00000002,
  (outs VINTRPDst:$vdst),
  (ins InterpSlot:$vsrc, InterpAttr:$attr, InterpAttrChan:$attrchan),
  "v_interp_mov_f32$vdst, $vsrc, $attr$attrchan",
  [(set f32:$vdst, (int_amdgcn_interp_mov (i32 timm:$vsrc),
                    (i32 timm:$attrchan), (i32 timm:$attr), M0))]>;

} // End OtherPredicates = [isNotGFX90APlus]

} // End Uses = [MODE, M0, EXEC]

//===----------------------------------------------------------------------===//
// Pseudo Instructions
//===----------------------------------------------------------------------===//

// Insert a branch to an endpgm block to use as a fallback trap.
def ENDPGM_TRAP : SPseudoInstSI<
  (outs), (ins),
  [(AMDGPUendpgm_trap)],
  "ENDPGM_TRAP"> {
  let hasSideEffects = 1;
  let usesCustomInserter = 1;
}

def ATOMIC_FENCE : SPseudoInstSI<
  (outs), (ins i32imm:$ordering, i32imm:$scope),
  [(atomic_fence (i32 timm:$ordering), (i32 timm:$scope))],
  "ATOMIC_FENCE $ordering, $scope"> {
  let hasSideEffects = 1;
  let maybeAtomic = 1;
}

let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] in {

// For use in patterns
def V_CNDMASK_B64_PSEUDO : VOP3Common <(outs VReg_64:$vdst),
  (ins VSrc_b64:$src0, VSrc_b64:$src1, SSrc_b64:$src2), "", []> {
  let isPseudo = 1;
  let isCodeGenOnly = 1;
  let usesCustomInserter = 1;
}

// 64-bit vector move instruction. This is mainly used by the
// SIFoldOperands pass to enable folding of inline immediates.
def V_MOV_B64_PSEUDO : VPseudoInstSI <(outs VReg_64:$vdst),
                                      (ins VSrc_b64:$src0)> {
  let isReMaterializable = 1;
  let isAsCheapAsAMove = 1;
  let isMoveImm = 1;
  let SchedRW = [Write64Bit];
  let Size = 16; // Needs at most two v_mov_b32 instructions, 8 bytes each.
  let UseNamedOperandTable = 1;
}

// 64-bit vector move with dpp. Expanded post-RA.
def V_MOV_B64_DPP_PSEUDO : VOP_DPP_Pseudo <"v_mov_b64_dpp", VOP_I64_I64> {
  let Size = 16; // Requires two 8-byte v_mov_b32_dpp instructions to complete.
}

// 64-bit scalar move immediate instruction. This is used to avoid separate
// subregister initialization and to allow rematerialization.
def S_MOV_B64_IMM_PSEUDO : SPseudoInstSI <(outs SReg_64:$sdst),
                                          (ins i64imm:$src0)> {
  let isReMaterializable = 1;
  let isAsCheapAsAMove = 1;
  let isMoveImm = 1;
  let SchedRW = [WriteSALU, Write64Bit];
  let Size = 16; // Needs at most two s_mov_b32 instructions, 8 bytes each.
  let Uses = [];
}

// Pseudoinstruction for @llvm.amdgcn.wqm. It is turned into a copy after the
// WQM pass processes it.
def WQM : PseudoInstSI <(outs unknown:$vdst), (ins unknown:$src0)>;

// Pseudoinstruction for @llvm.amdgcn.softwqm. Like @llvm.amdgcn.wqm it is
// turned into a copy by the WQM pass, but it does not seed WQM requirements.
def SOFT_WQM : PseudoInstSI <(outs unknown:$vdst), (ins unknown:$src0)>;

// Pseudoinstruction for @llvm.amdgcn.strict.wwm. It is turned into a copy post-RA, so
// that the @earlyclobber is respected. The @earlyclobber is to make sure that
// the instruction that defines $src0 (which is run in Whole Wave Mode) doesn't
// accidentally clobber inactive channels of $vdst.
let Constraints = "@earlyclobber $vdst" in {
def STRICT_WWM : PseudoInstSI <(outs unknown:$vdst), (ins unknown:$src0)>;
def STRICT_WQM : PseudoInstSI <(outs unknown:$vdst), (ins unknown:$src0)>;
}

} // End let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC]

def WWM_COPY : SPseudoInstSI <
  (outs unknown:$dst), (ins unknown:$src)> {
  let hasSideEffects = 0;
  let isAsCheapAsAMove = 1;
  let isConvergent = 1;
}

def ENTER_STRICT_WWM : SPseudoInstSI <(outs SReg_1:$sdst), (ins i64imm:$src0)> {
  let Uses = [EXEC];
  let Defs = [EXEC, SCC];
  let hasSideEffects = 0;
  let mayLoad = 0;
  let mayStore = 0;
}

def EXIT_STRICT_WWM : SPseudoInstSI <(outs SReg_1:$sdst), (ins SReg_1:$src0)> {
  let hasSideEffects = 0;
  let mayLoad = 0;
  let mayStore = 0;
}

def ENTER_STRICT_WQM : SPseudoInstSI <(outs SReg_1:$sdst), (ins i64imm:$src0)> {
  let Uses = [EXEC];
  let Defs = [EXEC, SCC];
  let hasSideEffects = 0;
  let mayLoad = 0;
  let mayStore = 0;
}

def EXIT_STRICT_WQM : SPseudoInstSI <(outs SReg_1:$sdst), (ins SReg_1:$src0)> {
  let hasSideEffects = 0;
  let mayLoad = 0;
  let mayStore = 0;
}

let usesCustomInserter = 1 in {
def S_INVERSE_BALLOT_U32 : SPseudoInstSI <(outs SReg_32:$sdst), (ins SSrc_b32:$mask)>;

def S_INVERSE_BALLOT_U64 : SPseudoInstSI <(outs SReg_64:$sdst), (ins SSrc_b64:$mask)>;
} // End usesCustomInserter = 1

// PSEUDO_WM is treated like STRICT_WWM/STRICT_WQM without exec changes.
def ENTER_PSEUDO_WM : SPseudoInstSI <(outs), (ins)> {
  let Uses = [EXEC];
  let Defs = [EXEC];
  let hasSideEffects = 0;
  let mayLoad = 0;
  let mayStore = 0;
}

def EXIT_PSEUDO_WM : SPseudoInstSI <(outs), (ins)> {
  let hasSideEffects = 0;
  let mayLoad = 0;
  let mayStore = 0;
}

// Pseudo instructions used for @llvm.fptrunc.round upward
// and @llvm.fptrunc.round downward.
// These intrinsics will be legalized to G_FPTRUNC_ROUND_UPWARD
// and G_FPTRUNC_ROUND_DOWNWARD before being lowered to
// FPTRUNC_UPWARD_PSEUDO and FPTRUNC_DOWNWARD_PSEUDO.
// The final codegen is done in the ModeRegister pass.
let Uses = [MODE, EXEC] in {
def FPTRUNC_UPWARD_PSEUDO : VPseudoInstSI <(outs VGPR_32:$vdst),
  (ins VGPR_32:$src0),
  [(set f16:$vdst, (SIfptrunc_round_upward f32:$src0))]>;

def FPTRUNC_DOWNWARD_PSEUDO : VPseudoInstSI <(outs VGPR_32:$vdst),
  (ins VGPR_32:$src0),
  [(set f16:$vdst, (SIfptrunc_round_downward f32:$src0))]>;
} // End Uses = [MODE, EXEC]

// Invert the exec mask and overwrite the inactive lanes of dst with inactive,
// restoring it after we're done.
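// Illustrative sketch of the expansion (an assumption for clarity, not taken
// from this file): the pseudo is expanded by copying $src into $vdst,
// inverting exec with s_not, copying $inactive into the lanes that are now
// active, and then inverting exec again to restore the original mask.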
let Defs = [SCC], isConvergent = 1 in {
def V_SET_INACTIVE_B32 : VPseudoInstSI <(outs VGPR_32:$vdst),
  (ins VSrc_b32: $src, VSrc_b32:$inactive),
  [(set i32:$vdst, (int_amdgcn_set_inactive i32:$src, i32:$inactive))]> {
}

def V_SET_INACTIVE_B64 : VPseudoInstSI <(outs VReg_64:$vdst),
  (ins VSrc_b64: $src, VSrc_b64:$inactive),
  [(set i64:$vdst, (int_amdgcn_set_inactive i64:$src, i64:$inactive))]> {
}
} // End Defs = [SCC], isConvergent = 1

let usesCustomInserter = 1, hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] in {
  def WAVE_REDUCE_UMIN_PSEUDO_U32 : VPseudoInstSI <(outs SGPR_32:$sdst),
    (ins VSrc_b32: $src, VSrc_b32:$strategy),
    [(set i32:$sdst, (int_amdgcn_wave_reduce_umin i32:$src, i32:$strategy))]> {
  }

  def WAVE_REDUCE_UMAX_PSEUDO_U32 : VPseudoInstSI <(outs SGPR_32:$sdst),
    (ins VSrc_b32: $src, VSrc_b32:$strategy),
    [(set i32:$sdst, (int_amdgcn_wave_reduce_umax i32:$src, i32:$strategy))]> {
  }
}

let usesCustomInserter = 1, Defs = [VCC, EXEC] in {
def V_ADD_U64_PSEUDO : VPseudoInstSI <
  (outs VReg_64:$vdst), (ins VSrc_b64:$src0, VSrc_b64:$src1),
  [(set VReg_64:$vdst, (DivergentBinFrag<add> i64:$src0, i64:$src1))]
>;

def V_SUB_U64_PSEUDO : VPseudoInstSI <
  (outs VReg_64:$vdst), (ins VSrc_b64:$src0, VSrc_b64:$src1),
  [(set VReg_64:$vdst, (DivergentBinFrag<sub> i64:$src0, i64:$src1))]
>;
} // End usesCustomInserter = 1, Defs = [VCC, EXEC]

let usesCustomInserter = 1, Defs = [SCC] in {
def S_ADD_U64_PSEUDO : SPseudoInstSI <
  (outs SReg_64:$sdst), (ins SSrc_b64:$src0, SSrc_b64:$src1),
  [(set SReg_64:$sdst, (UniformBinFrag<add> i64:$src0, i64:$src1))]
>;

def S_SUB_U64_PSEUDO : SPseudoInstSI <
  (outs SReg_64:$sdst), (ins SSrc_b64:$src0, SSrc_b64:$src1),
  [(set SReg_64:$sdst, (UniformBinFrag<sub> i64:$src0, i64:$src1))]
>;

def S_ADD_CO_PSEUDO : SPseudoInstSI <
  (outs SReg_32:$sdst, SSrc_i1:$scc_out), (ins SSrc_b32:$src0, SSrc_b32:$src1, SSrc_i1:$scc_in)
>;

def S_SUB_CO_PSEUDO : SPseudoInstSI <
  (outs SReg_32:$sdst, SSrc_i1:$scc_out), (ins SSrc_b32:$src0, SSrc_b32:$src1, SSrc_i1:$scc_in)
>;

def S_UADDO_PSEUDO : SPseudoInstSI <
  (outs SReg_32:$sdst, SSrc_i1:$scc_out), (ins SSrc_b32:$src0, SSrc_b32:$src1)
>;

def S_USUBO_PSEUDO : SPseudoInstSI <
  (outs SReg_32:$sdst, SSrc_i1:$scc_out), (ins SSrc_b32:$src0, SSrc_b32:$src1)
>;

} // End usesCustomInserter = 1, Defs = [SCC]

let usesCustomInserter = 1 in {
def GET_GROUPSTATICSIZE : SPseudoInstSI <(outs SReg_32:$sdst), (ins),
  [(set SReg_32:$sdst, (int_amdgcn_groupstaticsize))]>;
} // End usesCustomInserter = 1

// Wrap an instruction by duplicating it, except for setting isTerminator.
class WrapTerminatorInst<SOP_Pseudo base_inst> : SPseudoInstSI<
      base_inst.OutOperandList,
      base_inst.InOperandList> {
  let Uses = base_inst.Uses;
  let Defs = base_inst.Defs;
  let isTerminator = 1;
  let isAsCheapAsAMove = base_inst.isAsCheapAsAMove;
  let hasSideEffects = base_inst.hasSideEffects;
  let UseNamedOperandTable = base_inst.UseNamedOperandTable;
  let CodeSize = base_inst.CodeSize;
  let SchedRW = base_inst.SchedRW;
}

let WaveSizePredicate = isWave64 in {
def S_MOV_B64_term : WrapTerminatorInst<S_MOV_B64>;
def S_XOR_B64_term : WrapTerminatorInst<S_XOR_B64>;
def S_OR_B64_term : WrapTerminatorInst<S_OR_B64>;
def S_ANDN2_B64_term : WrapTerminatorInst<S_ANDN2_B64>;
def S_AND_B64_term : WrapTerminatorInst<S_AND_B64>;
def S_AND_SAVEEXEC_B64_term : WrapTerminatorInst<S_AND_SAVEEXEC_B64>;
}

let WaveSizePredicate = isWave32 in {
def S_MOV_B32_term : WrapTerminatorInst<S_MOV_B32>;
def S_XOR_B32_term : WrapTerminatorInst<S_XOR_B32>;
def S_OR_B32_term : WrapTerminatorInst<S_OR_B32>;
def S_ANDN2_B32_term : WrapTerminatorInst<S_ANDN2_B32>;
def S_AND_B32_term : WrapTerminatorInst<S_AND_B32>;
def S_AND_SAVEEXEC_B32_term : WrapTerminatorInst<S_AND_SAVEEXEC_B32>;
}


def WAVE_BARRIER : SPseudoInstSI<(outs), (ins),
  [(int_amdgcn_wave_barrier)]> {
  let SchedRW = [];
  let hasNoSchedulingInfo = 1;
  let hasSideEffects = 1;
  let mayLoad = 0;
  let mayStore = 0;
  let isConvergent = 1;
  let FixedSize = 1;
  let Size = 0;
  let isMeta = 1;
}

def SCHED_BARRIER : SPseudoInstSI<(outs), (ins i32imm:$mask),
  [(int_amdgcn_sched_barrier (i32 timm:$mask))]> {
  let SchedRW = [];
  let hasNoSchedulingInfo = 1;
  let hasSideEffects = 1;
  let mayLoad = 0;
  let mayStore = 0;
  let isConvergent = 1;
  let FixedSize = 1;
  let Size = 0;
  let isMeta = 1;
}

def SCHED_GROUP_BARRIER : SPseudoInstSI<
  (outs),
  (ins i32imm:$mask, i32imm:$size, i32imm:$syncid),
  [(int_amdgcn_sched_group_barrier (i32 timm:$mask), (i32 timm:$size), (i32 timm:$syncid))]> {
  let SchedRW = [];
  let hasNoSchedulingInfo = 1;
  let hasSideEffects = 1;
  let mayLoad = 0;
  let mayStore = 0;
  let isConvergent = 1;
  let FixedSize = 1;
  let Size = 0;
  let isMeta = 1;
}

def IGLP_OPT : SPseudoInstSI<(outs), (ins i32imm:$mask),
  [(int_amdgcn_iglp_opt (i32 timm:$mask))]> {
  let SchedRW = [];
  let hasNoSchedulingInfo = 1;
  let hasSideEffects = 1;
  let mayLoad = 0;
  let mayStore = 0;
  let isConvergent = 1;
  let FixedSize = 1;
  let Size = 0;
  let isMeta = 1;
}

// SI pseudo instructions. These are used by the CFG structurizer pass
// and should be lowered to ISA instructions prior to codegen.

// Since the control flow intrinsics have been enhanced to work under an
// unstructured CFG, duplicating them is actually legal. However, marking them
// non-duplicable has been observed to produce better code, so we do so in the
// hope of better code generation and a simplified CFG during the Machine IR
// optimization stage.

let isTerminator = 1, isNotDuplicable = 1 in {

let OtherPredicates = [EnableLateCFGStructurize] in {
def SI_NON_UNIFORM_BRCOND_PSEUDO : CFPseudoInstSI <
  (outs),
  (ins SReg_1:$vcc, brtarget:$target),
  [(brcond i1:$vcc, bb:$target)]> {
  let Size = 12;
}
}

def SI_IF: CFPseudoInstSI <
  (outs SReg_1:$dst), (ins SReg_1:$vcc, brtarget:$target),
  [(set i1:$dst, (AMDGPUif i1:$vcc, bb:$target))], 1, 1> {
  let Constraints = "";
  let Size = 12;
  let hasSideEffects = 1;
  let IsNeverUniform = 1;
}

def SI_ELSE : CFPseudoInstSI <
  (outs SReg_1:$dst),
  (ins SReg_1:$src, brtarget:$target), [], 1, 1> {
  let Size = 12;
  let hasSideEffects = 1;
  let IsNeverUniform = 1;
}

def SI_WATERFALL_LOOP : CFPseudoInstSI <
  (outs),
  (ins brtarget:$target), [], 1> {
  let Size = 8;
  let isBranch = 1;
  let Defs = [];
}

def SI_LOOP : CFPseudoInstSI <
  (outs), (ins SReg_1:$saved, brtarget:$target),
  [(AMDGPUloop i1:$saved, bb:$target)], 1, 1> {
  let Size = 8;
  let isBranch = 1;
  let hasSideEffects = 1;
  let IsNeverUniform = 1;
}

} // End isTerminator = 1, isNotDuplicable = 1

def SI_END_CF : CFPseudoInstSI <
  (outs), (ins SReg_1:$saved), [], 1, 1> {
  let Size = 4;
  let isAsCheapAsAMove = 1;
  let isReMaterializable = 1;
  let hasSideEffects = 1;
  let isNotDuplicable = 1; // Not a hard requirement, see long comments above for details.
  let mayLoad = 1; // FIXME: Should not need memory flags
  let mayStore = 1;
}

def SI_IF_BREAK : CFPseudoInstSI <
  (outs SReg_1:$dst), (ins SReg_1:$vcc, SReg_1:$src), []> {
  let Size = 4;
  let isNotDuplicable = 1; // Not a hard requirement, see long comments above for details.
  let isAsCheapAsAMove = 1;
  let isReMaterializable = 1;
}

// Branch to the early termination block of the shader if SCC is 0.
// This uses SCC from a previous SALU operation, i.e. the update of
// a mask of live lanes after a kill/demote operation.
// Only valid in pixel shaders.
def SI_EARLY_TERMINATE_SCC0 : SPseudoInstSI <(outs), (ins)> {
  let Uses = [EXEC, SCC];
}

let Uses = [EXEC] in {

multiclass PseudoInstKill <dag ins> {
  // Even though this pseudo can usually be expanded without an SCC def, we
  // conservatively assume that it has an SCC def, both because it is sometimes
  // required in degenerate cases (when V_CMPX cannot be used due to constant
  // bus limitations) and because it allows us to avoid having to track SCC
  // liveness across basic blocks.
  let Defs = [EXEC, SCC] in
  def _PSEUDO : PseudoInstSI <(outs), ins> {
    let isConvergent = 1;
    let usesCustomInserter = 1;
  }

  let Defs = [EXEC, SCC] in
  def _TERMINATOR : SPseudoInstSI <(outs), ins> {
    let isTerminator = 1;
  }
}

defm SI_KILL_I1 : PseudoInstKill <(ins SCSrc_i1:$src, i1imm:$killvalue)>;
let Defs = [VCC] in
defm SI_KILL_F32_COND_IMM : PseudoInstKill <(ins VSrc_b32:$src0, i32imm:$src1, i32imm:$cond)>;

let Defs = [EXEC, VCC] in
def SI_ILLEGAL_COPY : SPseudoInstSI <
  (outs unknown:$dst), (ins unknown:$src),
  [], " ; illegal copy $src to $dst">;

} // End Uses = [EXEC]

// Branch on undef scc. Used to avoid intermediate copy from
// IMPLICIT_DEF to SCC.
def SI_BR_UNDEF : SPseudoInstSI <(outs), (ins SOPPBrTarget:$simm16)> {
  let isTerminator = 1;
  let usesCustomInserter = 1;
  let isBranch = 1;
}

def SI_PS_LIVE : PseudoInstSI <
  (outs SReg_1:$dst), (ins),
  [(set i1:$dst, (int_amdgcn_ps_live))]> {
  let SALU = 1;
}

let Uses = [EXEC] in {
def SI_LIVE_MASK : PseudoInstSI <
  (outs SReg_1:$dst), (ins),
  [(set i1:$dst, (int_amdgcn_live_mask))]> {
  let SALU = 1;
}
let Defs = [EXEC, SCC] in {
// Demote: Turn a pixel shader thread into a helper lane.
def SI_DEMOTE_I1 : SPseudoInstSI <(outs), (ins SCSrc_i1:$src, i1imm:$killvalue)>;
} // End Defs = [EXEC, SCC]
} // End Uses = [EXEC]

def SI_MASKED_UNREACHABLE : SPseudoInstSI <(outs), (ins),
  [(int_amdgcn_unreachable)],
  "; divergent unreachable"> {
  let Size = 0;
  let hasNoSchedulingInfo = 1;
  let FixedSize = 1;
  let isMeta = 1;
}

// Used as an isel pseudo to directly emit initialization with an
// s_mov_b32 rather than a copy of another initialized
// register. MachineCSE skips copies, and we don't want to have to
// fold operands before it runs.
def SI_INIT_M0 : SPseudoInstSI <(outs), (ins SSrc_b32:$src)> {
  let Defs = [M0];
  let usesCustomInserter = 1;
  let isAsCheapAsAMove = 1;
  let isReMaterializable = 1;
}

def SI_INIT_EXEC : SPseudoInstSI <
  (outs), (ins i64imm:$src),
  [(int_amdgcn_init_exec (i64 timm:$src))]> {
  let Defs = [EXEC];
  let isAsCheapAsAMove = 1;
}

def SI_INIT_EXEC_FROM_INPUT : SPseudoInstSI <
  (outs), (ins SSrc_b32:$input, i32imm:$shift),
  [(int_amdgcn_init_exec_from_input i32:$input, (i32 timm:$shift))]> {
  let Defs = [EXEC];
}

// Return for returning shaders to a shader variant epilog.
def SI_RETURN_TO_EPILOG : SPseudoInstSI <
  (outs), (ins variable_ops), [(AMDGPUreturn_to_epilog)]> {
  let isTerminator = 1;
  let isBarrier = 1;
  let isReturn = 1;
  let hasNoSchedulingInfo = 1;
  let DisableWQM = 1;
  let FixedSize = 1;

  // TODO: Should this be true?
  let isMeta = 0;
}

// Return for returning function calls.
def SI_RETURN : SPseudoInstSI <
  (outs), (ins), [(AMDGPUret_glue)],
  "; return"> {
  let isTerminator = 1;
  let isBarrier = 1;
  let isReturn = 1;
  let SchedRW = [WriteBranch];
}

// Call pseudo used during instruction selection; it has no output register.
//
// This version is only needed so we can fill in the output register
// in the custom inserter.
def SI_CALL_ISEL : SPseudoInstSI <
  (outs), (ins SSrc_b64:$src0, unknown:$callee),
  [(AMDGPUcall i64:$src0, tglobaladdr:$callee)]> {
  let Size = 4;
  let isCall = 1;
  let SchedRW = [WriteBranch];
  let usesCustomInserter = 1;
  // TODO: Should really base this on the call target
  let isConvergent = 1;
}

def : GCNPat<
  (AMDGPUcall i64:$src0, (i64 0)),
  (SI_CALL_ISEL $src0, (i64 0))
>;

// Wrapper around s_swappc_b64 with extra $callee parameter to track
// the called function after regalloc.
def SI_CALL : SPseudoInstSI <
  (outs SReg_64:$dst), (ins SSrc_b64:$src0, unknown:$callee)> {
  let Size = 4;
  let FixedSize = 1;
  let isCall = 1;
  let UseNamedOperandTable = 1;
  let SchedRW = [WriteBranch];
  // TODO: Should really base this on the call target
  let isConvergent = 1;
}

class SI_TCRETURN_Pseudo<RegisterClass rc, SDNode sd> : SPseudoInstSI <(outs),
  (ins rc:$src0, unknown:$callee, i32imm:$fpdiff),
  [(sd i64:$src0, tglobaladdr:$callee, i32:$fpdiff)]> {
  let Size = 4;
  let FixedSize = 1;
  let isCall = 1;
  let isTerminator = 1;
  let isReturn = 1;
  let isBarrier = 1;
  let UseNamedOperandTable = 1;
  let SchedRW = [WriteBranch];
  // TODO: Should really base this on the call target
  let isConvergent = 1;
}

// Tail call handling pseudo
def SI_TCRETURN : SI_TCRETURN_Pseudo<CCR_SGPR_64, AMDGPUtc_return>;
def SI_TCRETURN_GFX : SI_TCRETURN_Pseudo<Gfx_CCR_SGPR_64, AMDGPUtc_return_gfx>;

// Handle selecting indirect tail calls
def : GCNPat<
  (AMDGPUtc_return i64:$src0, (i64 0), (i32 timm:$fpdiff)),
  (SI_TCRETURN CCR_SGPR_64:$src0, (i64 0), i32imm:$fpdiff)
>;

// Handle selecting indirect tail calls for AMDGPU_gfx
def : GCNPat<
  (AMDGPUtc_return_gfx i64:$src0, (i64 0), (i32 timm:$fpdiff)),
  (SI_TCRETURN_GFX Gfx_CCR_SGPR_64:$src0, (i64 0), i32imm:$fpdiff)
>;

def ADJCALLSTACKUP : SPseudoInstSI<
  (outs), (ins i32imm:$amt0, i32imm:$amt1),
  [(callseq_start timm:$amt0, timm:$amt1)],
  "; adjcallstackup $amt0 $amt1"> {
  let Size = 8; // Worst case. (s_add_u32 + constant)
  let FixedSize = 1;
  let hasSideEffects = 1;
  let usesCustomInserter = 1;
  let SchedRW = [WriteSALU];
  let Defs = [SCC];
}

def ADJCALLSTACKDOWN : SPseudoInstSI<
  (outs), (ins i32imm:$amt1, i32imm:$amt2),
  [(callseq_end timm:$amt1, timm:$amt2)],
  "; adjcallstackdown $amt1"> {
  let Size = 8; // Worst case. (s_add_u32 + constant)
  let hasSideEffects = 1;
  let usesCustomInserter = 1;
  let SchedRW = [WriteSALU];
  let Defs = [SCC];
}

let Defs = [M0, EXEC, SCC],
    UseNamedOperandTable = 1 in {

// SI_INDIRECT_SRC/DST are only used by legacy SelectionDAG indirect
// addressing implementation.
class SI_INDIRECT_SRC<RegisterClass rc> : VPseudoInstSI <
  (outs VGPR_32:$vdst),
  (ins rc:$src, VS_32:$idx, i32imm:$offset)> {
  let usesCustomInserter = 1;
}

class SI_INDIRECT_DST<RegisterClass rc> : VPseudoInstSI <
  (outs rc:$vdst),
  (ins rc:$src, VS_32:$idx, i32imm:$offset, VGPR_32:$val)> {
  let Constraints = "$src = $vdst";
  let usesCustomInserter = 1;
}

def SI_INDIRECT_SRC_V1 : SI_INDIRECT_SRC<VGPR_32>;
def SI_INDIRECT_SRC_V2 : SI_INDIRECT_SRC<VReg_64>;
def SI_INDIRECT_SRC_V4 : SI_INDIRECT_SRC<VReg_128>;
def SI_INDIRECT_SRC_V8 : SI_INDIRECT_SRC<VReg_256>;
def SI_INDIRECT_SRC_V9 : SI_INDIRECT_SRC<VReg_288>;
def SI_INDIRECT_SRC_V10 : SI_INDIRECT_SRC<VReg_320>;
def SI_INDIRECT_SRC_V11 : SI_INDIRECT_SRC<VReg_352>;
def SI_INDIRECT_SRC_V12 : SI_INDIRECT_SRC<VReg_384>;
def SI_INDIRECT_SRC_V16 : SI_INDIRECT_SRC<VReg_512>;
def SI_INDIRECT_SRC_V32 : SI_INDIRECT_SRC<VReg_1024>;

def SI_INDIRECT_DST_V1 : SI_INDIRECT_DST<VGPR_32>;
def SI_INDIRECT_DST_V2 : SI_INDIRECT_DST<VReg_64>;
def SI_INDIRECT_DST_V4 : SI_INDIRECT_DST<VReg_128>;
def SI_INDIRECT_DST_V8 : SI_INDIRECT_DST<VReg_256>;
def SI_INDIRECT_DST_V9 : SI_INDIRECT_DST<VReg_288>;
def SI_INDIRECT_DST_V10 : SI_INDIRECT_DST<VReg_320>;
def SI_INDIRECT_DST_V11 : SI_INDIRECT_DST<VReg_352>;
def SI_INDIRECT_DST_V12 : SI_INDIRECT_DST<VReg_384>;
def SI_INDIRECT_DST_V16 : SI_INDIRECT_DST<VReg_512>;
def SI_INDIRECT_DST_V32 : SI_INDIRECT_DST<VReg_1024>;

} // End Defs = [M0, EXEC, SCC], UseNamedOperandTable = 1

// This is a pseudo variant of the v_movreld_b32 instruction in which the
// vector operand appears only twice, once as def and once as use. Using this
// pseudo avoids problems with the Two Address instructions pass.
class INDIRECT_REG_WRITE_MOVREL_pseudo<RegisterClass rc,
                                       RegisterOperand val_ty> : PseudoInstSI <
  (outs rc:$vdst), (ins rc:$vsrc, val_ty:$val, i32imm:$subreg)> {
  let Constraints = "$vsrc = $vdst";
  let Uses = [M0];
}

class V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<RegisterClass rc> :
  INDIRECT_REG_WRITE_MOVREL_pseudo<rc, VSrc_b32> {
  let VALU = 1;
  let VOP1 = 1;
  let Uses = [M0, EXEC];
}

class S_INDIRECT_REG_WRITE_MOVREL_pseudo<RegisterClass rc,
                                         RegisterOperand val_ty> :
  INDIRECT_REG_WRITE_MOVREL_pseudo<rc, val_ty> {
  let SALU = 1;
  let SOP1 = 1;
  let Uses = [M0];
}

class S_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<RegisterClass rc> :
  S_INDIRECT_REG_WRITE_MOVREL_pseudo<rc, SSrc_b32>;
class S_INDIRECT_REG_WRITE_MOVREL_B64_pseudo<RegisterClass rc> :
  S_INDIRECT_REG_WRITE_MOVREL_pseudo<rc, SSrc_b64>;

def V_INDIRECT_REG_WRITE_MOVREL_B32_V1 : V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<VGPR_32>;
def V_INDIRECT_REG_WRITE_MOVREL_B32_V2 : V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<VReg_64>;
def V_INDIRECT_REG_WRITE_MOVREL_B32_V3 : V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<VReg_96>;
def V_INDIRECT_REG_WRITE_MOVREL_B32_V4 : V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<VReg_128>;
def V_INDIRECT_REG_WRITE_MOVREL_B32_V5 : V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<VReg_160>;
def V_INDIRECT_REG_WRITE_MOVREL_B32_V8 : V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<VReg_256>;
def V_INDIRECT_REG_WRITE_MOVREL_B32_V9 : V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<VReg_288>;
def V_INDIRECT_REG_WRITE_MOVREL_B32_V10 : V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<VReg_320>;
def V_INDIRECT_REG_WRITE_MOVREL_B32_V11 : V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<VReg_352>;
def V_INDIRECT_REG_WRITE_MOVREL_B32_V12 : V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<VReg_384>;
def V_INDIRECT_REG_WRITE_MOVREL_B32_V16 : V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<VReg_512>;
def V_INDIRECT_REG_WRITE_MOVREL_B32_V32 : V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<VReg_1024>;

def S_INDIRECT_REG_WRITE_MOVREL_B32_V1 : S_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<SReg_32>;
def S_INDIRECT_REG_WRITE_MOVREL_B32_V2 : S_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<SReg_64>;
def S_INDIRECT_REG_WRITE_MOVREL_B32_V3 : S_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<SReg_96>;
def S_INDIRECT_REG_WRITE_MOVREL_B32_V4 : S_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<SReg_128>;
def S_INDIRECT_REG_WRITE_MOVREL_B32_V5 : S_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<SReg_160>;
def S_INDIRECT_REG_WRITE_MOVREL_B32_V8 : S_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<SReg_256>;
def S_INDIRECT_REG_WRITE_MOVREL_B32_V9 : S_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<SReg_288>;
def S_INDIRECT_REG_WRITE_MOVREL_B32_V10 : S_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<SReg_320>;
def S_INDIRECT_REG_WRITE_MOVREL_B32_V11 : S_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<SReg_352>;
def S_INDIRECT_REG_WRITE_MOVREL_B32_V12 : S_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<SReg_384>;
def S_INDIRECT_REG_WRITE_MOVREL_B32_V16 : S_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<SReg_512>;
def S_INDIRECT_REG_WRITE_MOVREL_B32_V32 : S_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<SReg_1024>;

def S_INDIRECT_REG_WRITE_MOVREL_B64_V1 : S_INDIRECT_REG_WRITE_MOVREL_B64_pseudo<SReg_64>;
def S_INDIRECT_REG_WRITE_MOVREL_B64_V2 : S_INDIRECT_REG_WRITE_MOVREL_B64_pseudo<SReg_128>;
def S_INDIRECT_REG_WRITE_MOVREL_B64_V4 : S_INDIRECT_REG_WRITE_MOVREL_B64_pseudo<SReg_256>;
def S_INDIRECT_REG_WRITE_MOVREL_B64_V8 : S_INDIRECT_REG_WRITE_MOVREL_B64_pseudo<SReg_512>;
def S_INDIRECT_REG_WRITE_MOVREL_B64_V16 : S_INDIRECT_REG_WRITE_MOVREL_B64_pseudo<SReg_1024>;

// These variants of V_INDIRECT_REG_READ/WRITE use VGPR indexing. By using these
// pseudos we avoid spills or copies being inserted within indirect sequences
// that switch the VGPR indexing mode. Spills to accvgprs could be affected by
// this mode switching.

class V_INDIRECT_REG_WRITE_GPR_IDX_pseudo<RegisterClass rc> : PseudoInstSI <
  (outs rc:$vdst), (ins rc:$vsrc, VSrc_b32:$val, SSrc_b32:$idx, i32imm:$subreg)> {
  let Constraints = "$vsrc = $vdst";
  let VALU = 1;
  let Uses = [M0, EXEC];
  let Defs = [M0];
}

def V_INDIRECT_REG_WRITE_GPR_IDX_B32_V1 : V_INDIRECT_REG_WRITE_GPR_IDX_pseudo<VGPR_32>;
def V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2 : V_INDIRECT_REG_WRITE_GPR_IDX_pseudo<VReg_64>;
def V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3 : V_INDIRECT_REG_WRITE_GPR_IDX_pseudo<VReg_96>;
def V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4 : V_INDIRECT_REG_WRITE_GPR_IDX_pseudo<VReg_128>;
def V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5 : V_INDIRECT_REG_WRITE_GPR_IDX_pseudo<VReg_160>;
def V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8 : V_INDIRECT_REG_WRITE_GPR_IDX_pseudo<VReg_256>;
def V_INDIRECT_REG_WRITE_GPR_IDX_B32_V9 : V_INDIRECT_REG_WRITE_GPR_IDX_pseudo<VReg_288>;
def V_INDIRECT_REG_WRITE_GPR_IDX_B32_V10 : V_INDIRECT_REG_WRITE_GPR_IDX_pseudo<VReg_320>;
def V_INDIRECT_REG_WRITE_GPR_IDX_B32_V11 : V_INDIRECT_REG_WRITE_GPR_IDX_pseudo<VReg_352>;
def V_INDIRECT_REG_WRITE_GPR_IDX_B32_V12 : V_INDIRECT_REG_WRITE_GPR_IDX_pseudo<VReg_384>;
def V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16 : V_INDIRECT_REG_WRITE_GPR_IDX_pseudo<VReg_512>;
def V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32 : V_INDIRECT_REG_WRITE_GPR_IDX_pseudo<VReg_1024>;

class V_INDIRECT_REG_READ_GPR_IDX_pseudo<RegisterClass rc> : PseudoInstSI <
  (outs VGPR_32:$vdst), (ins rc:$vsrc, SSrc_b32:$idx, i32imm:$subreg)> {
  let VALU = 1;
  let Uses = [M0, EXEC];
  let Defs = [M0];
}

def V_INDIRECT_REG_READ_GPR_IDX_B32_V1 : V_INDIRECT_REG_READ_GPR_IDX_pseudo<VGPR_32>;
def V_INDIRECT_REG_READ_GPR_IDX_B32_V2 : V_INDIRECT_REG_READ_GPR_IDX_pseudo<VReg_64>;
def V_INDIRECT_REG_READ_GPR_IDX_B32_V3 : V_INDIRECT_REG_READ_GPR_IDX_pseudo<VReg_96>;
def V_INDIRECT_REG_READ_GPR_IDX_B32_V4 : V_INDIRECT_REG_READ_GPR_IDX_pseudo<VReg_128>;
def V_INDIRECT_REG_READ_GPR_IDX_B32_V5 : V_INDIRECT_REG_READ_GPR_IDX_pseudo<VReg_160>;
def V_INDIRECT_REG_READ_GPR_IDX_B32_V8 : V_INDIRECT_REG_READ_GPR_IDX_pseudo<VReg_256>;
def V_INDIRECT_REG_READ_GPR_IDX_B32_V9 : V_INDIRECT_REG_READ_GPR_IDX_pseudo<VReg_288>;
def V_INDIRECT_REG_READ_GPR_IDX_B32_V10 : V_INDIRECT_REG_READ_GPR_IDX_pseudo<VReg_320>;
def V_INDIRECT_REG_READ_GPR_IDX_B32_V11 : V_INDIRECT_REG_READ_GPR_IDX_pseudo<VReg_352>;
def V_INDIRECT_REG_READ_GPR_IDX_B32_V12 : V_INDIRECT_REG_READ_GPR_IDX_pseudo<VReg_384>;
def V_INDIRECT_REG_READ_GPR_IDX_B32_V16 : V_INDIRECT_REG_READ_GPR_IDX_pseudo<VReg_512>;
def V_INDIRECT_REG_READ_GPR_IDX_B32_V32 : V_INDIRECT_REG_READ_GPR_IDX_pseudo<VReg_1024>;

multiclass SI_SPILL_SGPR <RegisterClass sgpr_class> {
  let UseNamedOperandTable = 1, SGPRSpill = 1, Uses = [EXEC] in {
    def _SAVE : PseudoInstSI <
      (outs),
      (ins sgpr_class:$data, i32imm:$addr)> {
      let mayStore = 1;
      let mayLoad = 0;
    }

    def _RESTORE : PseudoInstSI <
      (outs sgpr_class:$data),
      (ins i32imm:$addr)> {
      let mayStore = 0;
      let mayLoad = 1;
    }
  } // End UseNamedOperandTable = 1
}

// You cannot use M0 as the output of v_readlane_b32 instructions or
// use it in the sdata operand of SMEM instructions. We still need to
// be able to spill the physical register m0, so allow it for
// SI_SPILL_32_* instructions.
defm SI_SPILL_S32   : SI_SPILL_SGPR <SReg_32>;
defm SI_SPILL_S64   : SI_SPILL_SGPR <SReg_64>;
defm SI_SPILL_S96   : SI_SPILL_SGPR <SReg_96>;
defm SI_SPILL_S128  : SI_SPILL_SGPR <SReg_128>;
defm SI_SPILL_S160  : SI_SPILL_SGPR <SReg_160>;
defm SI_SPILL_S192  : SI_SPILL_SGPR <SReg_192>;
defm SI_SPILL_S224  : SI_SPILL_SGPR <SReg_224>;
defm SI_SPILL_S256  : SI_SPILL_SGPR <SReg_256>;
defm SI_SPILL_S288  : SI_SPILL_SGPR <SReg_288>;
defm SI_SPILL_S320  : SI_SPILL_SGPR <SReg_320>;
defm SI_SPILL_S352  : SI_SPILL_SGPR <SReg_352>;
defm SI_SPILL_S384  : SI_SPILL_SGPR <SReg_384>;
defm SI_SPILL_S512  : SI_SPILL_SGPR <SReg_512>;
defm SI_SPILL_S1024 : SI_SPILL_SGPR <SReg_1024>;

// VGPR or AGPR spill instructions. When spilling an AGPR, a temporary register
// and an extra instruction are needed to move between VGPR and AGPR;
// UsesTmp accounts for the larger size of an expanded spill in this case.
multiclass SI_SPILL_VGPR <RegisterClass vgpr_class, bit UsesTmp = 0> {
  let UseNamedOperandTable = 1, VGPRSpill = 1,
      SchedRW = [WriteVMEM] in {
    def _SAVE : VPseudoInstSI <
      (outs),
      (ins vgpr_class:$vdata, i32imm:$vaddr,
           SReg_32:$soffset, i32imm:$offset)> {
      let mayStore = 1;
      let mayLoad = 0;
      // (2 * 4) + (8 * num_subregs) bytes maximum
      int MaxSize = !add(!shl(!srl(vgpr_class.Size, 5), !add(UsesTmp, 3)), 8);
      // Size field is unsigned char and cannot fit more.
      let Size = !if(!le(MaxSize, 256), MaxSize, 252);
    }

    def _RESTORE : VPseudoInstSI <
      (outs vgpr_class:$vdata),
      (ins i32imm:$vaddr,
           SReg_32:$soffset, i32imm:$offset)> {
      let mayStore = 0;
      let mayLoad = 1;

      // (2 * 4) + (8 * num_subregs) bytes maximum
      int MaxSize = !add(!shl(!srl(vgpr_class.Size, 5), !add(UsesTmp, 3)), 8);
      // Size field is unsigned char and cannot fit more.
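      // Worked example of the MaxSize formula above, for illustration only
      // (not taken from the original source): a VReg_128 spill with no temp
      // (UsesTmp = 0) gives ((128 >> 5) << 3) + 8 = (2 * 4) + (8 * 4) = 40
      // bytes, comfortably below the 256-byte cap checked below.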
      let Size = !if(!le(MaxSize, 256), MaxSize, 252);
    }
  } // End UseNamedOperandTable = 1, VGPRSpill = 1, SchedRW = [WriteVMEM]
}

defm SI_SPILL_V32   : SI_SPILL_VGPR <VGPR_32>;
defm SI_SPILL_V64   : SI_SPILL_VGPR <VReg_64>;
defm SI_SPILL_V96   : SI_SPILL_VGPR <VReg_96>;
defm SI_SPILL_V128  : SI_SPILL_VGPR <VReg_128>;
defm SI_SPILL_V160  : SI_SPILL_VGPR <VReg_160>;
defm SI_SPILL_V192  : SI_SPILL_VGPR <VReg_192>;
defm SI_SPILL_V224  : SI_SPILL_VGPR <VReg_224>;
defm SI_SPILL_V256  : SI_SPILL_VGPR <VReg_256>;
defm SI_SPILL_V288  : SI_SPILL_VGPR <VReg_288>;
defm SI_SPILL_V320  : SI_SPILL_VGPR <VReg_320>;
defm SI_SPILL_V352  : SI_SPILL_VGPR <VReg_352>;
defm SI_SPILL_V384  : SI_SPILL_VGPR <VReg_384>;
defm SI_SPILL_V512  : SI_SPILL_VGPR <VReg_512>;
defm SI_SPILL_V1024 : SI_SPILL_VGPR <VReg_1024>;

defm SI_SPILL_A32   : SI_SPILL_VGPR <AGPR_32, 1>;
defm SI_SPILL_A64   : SI_SPILL_VGPR <AReg_64, 1>;
defm SI_SPILL_A96   : SI_SPILL_VGPR <AReg_96, 1>;
defm SI_SPILL_A128  : SI_SPILL_VGPR <AReg_128, 1>;
defm SI_SPILL_A160  : SI_SPILL_VGPR <AReg_160, 1>;
defm SI_SPILL_A192  : SI_SPILL_VGPR <AReg_192, 1>;
defm SI_SPILL_A224  : SI_SPILL_VGPR <AReg_224, 1>;
defm SI_SPILL_A256  : SI_SPILL_VGPR <AReg_256, 1>;
defm SI_SPILL_A288  : SI_SPILL_VGPR <AReg_288, 1>;
defm SI_SPILL_A320  : SI_SPILL_VGPR <AReg_320, 1>;
defm SI_SPILL_A352  : SI_SPILL_VGPR <AReg_352, 1>;
defm SI_SPILL_A384  : SI_SPILL_VGPR <AReg_384, 1>;
defm SI_SPILL_A512  : SI_SPILL_VGPR <AReg_512, 1>;
defm SI_SPILL_A1024 : SI_SPILL_VGPR <AReg_1024, 1>;

defm SI_SPILL_AV32   : SI_SPILL_VGPR <AV_32, 1>;
defm SI_SPILL_AV64   : SI_SPILL_VGPR <AV_64, 1>;
defm SI_SPILL_AV96   : SI_SPILL_VGPR <AV_96, 1>;
defm SI_SPILL_AV128  : SI_SPILL_VGPR <AV_128, 1>;
defm SI_SPILL_AV160  : SI_SPILL_VGPR <AV_160, 1>;
defm SI_SPILL_AV192  : SI_SPILL_VGPR <AV_192, 1>;
defm SI_SPILL_AV224  : SI_SPILL_VGPR <AV_224, 1>;
defm SI_SPILL_AV256  : SI_SPILL_VGPR <AV_256, 1>;
defm SI_SPILL_AV288  : SI_SPILL_VGPR <AV_288, 1>;
defm SI_SPILL_AV320  : SI_SPILL_VGPR <AV_320, 1>;
defm SI_SPILL_AV352  : SI_SPILL_VGPR <AV_352, 1>;
defm SI_SPILL_AV384  : SI_SPILL_VGPR <AV_384, 1>;
defm SI_SPILL_AV512  : SI_SPILL_VGPR <AV_512, 1>;
defm SI_SPILL_AV1024 : SI_SPILL_VGPR <AV_1024, 1>;

let isConvergent = 1 in
defm SI_SPILL_WWM_V32 : SI_SPILL_VGPR <VGPR_32>;

def SI_PC_ADD_REL_OFFSET : SPseudoInstSI <
  (outs SReg_64:$dst),
  (ins si_ga:$ptr_lo, si_ga:$ptr_hi),
  [(set SReg_64:$dst,
      (i64 (SIpc_add_rel_offset tglobaladdr:$ptr_lo, tglobaladdr:$ptr_hi)))]> {
  let Defs = [SCC];
}

def : GCNPat <
  (SIpc_add_rel_offset tglobaladdr:$ptr_lo, 0),
  (SI_PC_ADD_REL_OFFSET $ptr_lo, (i32 0))
>;

def : GCNPat<
  (AMDGPUtrap timm:$trapid),
  (S_TRAP $trapid)
>;

def : GCNPat<
  (AMDGPUelse i1:$src, bb:$target),
  (SI_ELSE $src, $target)
>;

def : Pat <
  (int_amdgcn_kill i1:$src),
  (SI_KILL_I1_PSEUDO SCSrc_i1:$src, 0)
>;

def : Pat <
  (int_amdgcn_kill (i1 (not i1:$src))),
  (SI_KILL_I1_PSEUDO SCSrc_i1:$src, -1)
>;

def : Pat <
  (int_amdgcn_kill (i1 (setcc f32:$src, InlineImmFP32:$imm, cond:$cond))),
  (SI_KILL_F32_COND_IMM_PSEUDO VSrc_b32:$src, (bitcast_fpimm_to_i32 $imm), (cond_as_i32imm $cond))
>;

def : Pat <
  (int_amdgcn_wqm_demote i1:$src),
  (SI_DEMOTE_I1 SCSrc_i1:$src, 0)
>;

def : Pat <
  (int_amdgcn_wqm_demote (i1 (not i1:$src))),
  (SI_DEMOTE_I1 SCSrc_i1:$src, -1)
>;

// TODO: we could add more variants for other types of conditionals

def : Pat <
  (i64 (int_amdgcn_icmp i1:$src, (i1 0), (i32 33))),
  (COPY $src) // Return the SGPRs representing i1 src
>;

def : Pat <
  (i32 (int_amdgcn_icmp i1:$src, (i1 0), (i32 33))),
  (COPY $src) // Return the SGPRs representing i1 src
>;

//===----------------------------------------------------------------------===//
// VOP1 Patterns
//===----------------------------------------------------------------------===//

multiclass f16_fp_Pats<Instruction cvt_f16_f32_inst_e64, Instruction cvt_f32_f16_inst_e64> {
  // f16_to_fp patterns
  def : GCNPat <
    (f32 (f16_to_fp i32:$src0)),
    (cvt_f32_f16_inst_e64 SRCMODS.NONE, $src0)
  >;

  def : GCNPat <
    (f32 (f16_to_fp (and_oneuse i32:$src0, 0x7fff))),
    (cvt_f32_f16_inst_e64 SRCMODS.ABS, $src0)
  >;

  def : GCNPat <
    (f32 (f16_to_fp (i32 (srl_oneuse (and_oneuse i32:$src0, 0x7fff0000), (i32 16))))),
    (cvt_f32_f16_inst_e64 SRCMODS.ABS, (i32 (V_LSHRREV_B32_e64 (i32 16), i32:$src0)))
  >;

  def : GCNPat <
    (f32 (f16_to_fp (or_oneuse i32:$src0, 0x8000))),
    (cvt_f32_f16_inst_e64 SRCMODS.NEG_ABS, $src0)
  >;

  def : GCNPat <
    (f32 (f16_to_fp (xor_oneuse i32:$src0, 0x8000))),
    (cvt_f32_f16_inst_e64 SRCMODS.NEG, $src0)
  >;

  def : GCNPat <
    (f64 (fpextend f16:$src)),
    (V_CVT_F64_F32_e32 (cvt_f32_f16_inst_e64 SRCMODS.NONE, $src))
  >;

  // fp_to_fp16 patterns
  def : GCNPat <
    (i32 (AMDGPUfp_to_f16 (f32 (VOP3Mods f32:$src0, i32:$src0_modifiers)))),
    (cvt_f16_f32_inst_e64 $src0_modifiers, f32:$src0)
  >;

  def : GCNPat <
    (i32 (fp_to_sint f16:$src)),
    (V_CVT_I32_F32_e32 (cvt_f32_f16_inst_e64 SRCMODS.NONE, VSrc_b32:$src))
  >;

  def : GCNPat <
    (i32 (fp_to_uint f16:$src)),
    (V_CVT_U32_F32_e32 (cvt_f32_f16_inst_e64 SRCMODS.NONE, VSrc_b32:$src))
  >;

  def : GCNPat <
    (f16 (sint_to_fp i32:$src)),
    (cvt_f16_f32_inst_e64 SRCMODS.NONE, (V_CVT_F32_I32_e32 VSrc_b32:$src))
  >;

  def : GCNPat <
    (f16 (uint_to_fp i32:$src)),
    (cvt_f16_f32_inst_e64 SRCMODS.NONE, (V_CVT_F32_U32_e32 VSrc_b32:$src))
  >;
}

let SubtargetPredicate = NotHasTrue16BitInsts in
defm : f16_fp_Pats<V_CVT_F16_F32_e64, V_CVT_F32_F16_e64>;

let SubtargetPredicate = HasTrue16BitInsts in
defm : f16_fp_Pats<V_CVT_F16_F32_t16_e64, V_CVT_F32_F16_t16_e64>;

//===----------------------------------------------------------------------===//
// VOP2 Patterns
//===----------------------------------------------------------------------===//

// NoMods pattern used for mac. If there are any source modifiers then it's
// better to select mad instead of mac.
class FMADPat <ValueType vt, Instruction inst>
  : GCNPat <(vt (any_fmad (vt (VOP3NoMods vt:$src0)),
                          (vt (VOP3NoMods vt:$src1)),
                          (vt (VOP3NoMods vt:$src2)))),
            (inst SRCMODS.NONE, $src0, SRCMODS.NONE, $src1,
                  SRCMODS.NONE, $src2, DSTCLAMP.NONE, DSTOMOD.NONE)
>;

// Prefer mac form when there are no modifiers.
let AddedComplexity = 9 in {
let OtherPredicates = [HasMadMacF32Insts] in
def : FMADPat <f32, V_MAC_F32_e64>;

// Don't allow source modifiers. If there are any source modifiers then it's
// better to select mad instead of mac.
let SubtargetPredicate = isGFX6GFX7GFX10,
    OtherPredicates = [HasMadMacF32Insts, NoFP32Denormals] in
def : GCNPat <
  (f32 (fadd (AMDGPUfmul_legacy (VOP3NoMods f32:$src0),
                                (VOP3NoMods f32:$src1)),
             (VOP3NoMods f32:$src2))),
  (V_MAC_LEGACY_F32_e64 SRCMODS.NONE, $src0, SRCMODS.NONE, $src1,
                        SRCMODS.NONE, $src2, DSTCLAMP.NONE, DSTOMOD.NONE)
>;

// Don't allow source modifiers. If there are any source modifiers then it's
// better to select fma instead of fmac.
let SubtargetPredicate = HasFmaLegacy32 in
def : GCNPat <
  (f32 (int_amdgcn_fma_legacy (VOP3NoMods f32:$src0),
                              (VOP3NoMods f32:$src1),
                              (VOP3NoMods f32:$src2))),
  (V_FMAC_LEGACY_F32_e64 SRCMODS.NONE, $src0, SRCMODS.NONE, $src1,
                         SRCMODS.NONE, $src2, DSTCLAMP.NONE, DSTOMOD.NONE)
>;

let SubtargetPredicate = Has16BitInsts in
def : FMADPat <f16, V_MAC_F16_e64>;
} // AddedComplexity = 9

let OtherPredicates = [HasMadMacF32Insts, NoFP32Denormals] in
def : GCNPat <
  (f32 (fadd (AMDGPUfmul_legacy (VOP3Mods f32:$src0, i32:$src0_mod),
                                (VOP3Mods f32:$src1, i32:$src1_mod)),
             (VOP3Mods f32:$src2, i32:$src2_mod))),
  (V_MAD_LEGACY_F32_e64 $src0_mod, $src0, $src1_mod, $src1,
                        $src2_mod, $src2, DSTCLAMP.NONE, DSTOMOD.NONE)
>;

class VOPSelectModsPat <ValueType vt> : GCNPat <
  (vt (select i1:$src0, (VOP3ModsNonCanonicalizing vt:$src1, i32:$src1_mods),
                        (VOP3ModsNonCanonicalizing vt:$src2, i32:$src2_mods))),
  (V_CNDMASK_B32_e64 FP32InputMods:$src2_mods, VSrc_b32:$src2,
                     FP32InputMods:$src1_mods, VSrc_b32:$src1, SSrc_i1:$src0)
>;

class VOPSelectPat <ValueType vt> : GCNPat <
  (vt (select i1:$src0, vt:$src1, vt:$src2)),
  (V_CNDMASK_B32_e64 0, VSrc_b32:$src2, 0, VSrc_b32:$src1, SSrc_i1:$src0)
>;

def : VOPSelectModsPat <i32>;
def : VOPSelectModsPat <f32>;
def : VOPSelectPat <f16>;
def : VOPSelectPat <i16>;

let AddedComplexity = 1 in {
def : GCNPat <
  (i32 (add (i32 (DivergentUnaryFrag<ctpop> i32:$popcnt)), i32:$val)),
  (V_BCNT_U32_B32_e64 $popcnt, $val)
>;
}

def : GCNPat <
  (i32 (DivergentUnaryFrag<ctpop> i32:$popcnt)),
  (V_BCNT_U32_B32_e64 VSrc_b32:$popcnt, (i32 0))
>;

def : GCNPat <
  (i16 (add (i16 (trunc (i32 (DivergentUnaryFrag<ctpop> i32:$popcnt)))), i16:$val)),
  (V_BCNT_U32_B32_e64 $popcnt, $val)
>;

def : GCNPat <
  (i64 (DivergentUnaryFrag<ctpop> i64:$src)),
  (REG_SEQUENCE VReg_64,
    (V_BCNT_U32_B32_e64 (i32 (EXTRACT_SUBREG i64:$src, sub1)),
      (i32 (V_BCNT_U32_B32_e64 (i32 (EXTRACT_SUBREG i64:$src, sub0)), (i32 0)))), sub0,
    (i32 (V_MOV_B32_e32 (i32 0))), sub1)
>;

/********** ============================================ **********/
/********** Extraction, Insertion, Building and Casting  **********/
/********** ============================================ **********/

// Special case for 2 element vectors. REG_SEQUENCE produces better code
// than an INSERT_SUBREG.
1190multiclass Insert_Element_V2<RegisterClass RC, ValueType elem_type, ValueType vec_type> { 1191 def : GCNPat < 1192 (insertelt vec_type:$vec, elem_type:$elem, 0), 1193 (REG_SEQUENCE RC, $elem, sub0, (elem_type (EXTRACT_SUBREG $vec, sub1)), sub1) 1194 >; 1195 1196 def : GCNPat < 1197 (insertelt vec_type:$vec, elem_type:$elem, 1), 1198 (REG_SEQUENCE RC, (elem_type (EXTRACT_SUBREG $vec, sub0)), sub0, $elem, sub1) 1199 >; 1200} 1201 1202foreach Index = 0-1 in { 1203 def Extract_Element_v2i32_#Index : Extract_Element < 1204 i32, v2i32, Index, !cast<SubRegIndex>(sub#Index) 1205 >; 1206 1207 def Extract_Element_v2f32_#Index : Extract_Element < 1208 f32, v2f32, Index, !cast<SubRegIndex>(sub#Index) 1209 >; 1210} 1211 1212defm : Insert_Element_V2 <SReg_64, i32, v2i32>; 1213defm : Insert_Element_V2 <SReg_64, f32, v2f32>; 1214 1215foreach Index = 0-2 in { 1216 def Extract_Element_v3i32_#Index : Extract_Element < 1217 i32, v3i32, Index, !cast<SubRegIndex>(sub#Index) 1218 >; 1219 def Insert_Element_v3i32_#Index : Insert_Element < 1220 i32, v3i32, Index, !cast<SubRegIndex>(sub#Index) 1221 >; 1222 1223 def Extract_Element_v3f32_#Index : Extract_Element < 1224 f32, v3f32, Index, !cast<SubRegIndex>(sub#Index) 1225 >; 1226 def Insert_Element_v3f32_#Index : Insert_Element < 1227 f32, v3f32, Index, !cast<SubRegIndex>(sub#Index) 1228 >; 1229} 1230 1231foreach Index = 0-3 in { 1232 def Extract_Element_v4i32_#Index : Extract_Element < 1233 i32, v4i32, Index, !cast<SubRegIndex>(sub#Index) 1234 >; 1235 def Insert_Element_v4i32_#Index : Insert_Element < 1236 i32, v4i32, Index, !cast<SubRegIndex>(sub#Index) 1237 >; 1238 1239 def Extract_Element_v4f32_#Index : Extract_Element < 1240 f32, v4f32, Index, !cast<SubRegIndex>(sub#Index) 1241 >; 1242 def Insert_Element_v4f32_#Index : Insert_Element < 1243 f32, v4f32, Index, !cast<SubRegIndex>(sub#Index) 1244 >; 1245} 1246 1247foreach Index = 0-4 in { 1248 def Extract_Element_v5i32_#Index : Extract_Element < 1249 i32, v5i32, Index, !cast<SubRegIndex>(sub#Index) 1250 >; 1251 def Insert_Element_v5i32_#Index : Insert_Element < 1252 i32, v5i32, Index, !cast<SubRegIndex>(sub#Index) 1253 >; 1254 1255 def Extract_Element_v5f32_#Index : Extract_Element < 1256 f32, v5f32, Index, !cast<SubRegIndex>(sub#Index) 1257 >; 1258 def Insert_Element_v5f32_#Index : Insert_Element < 1259 f32, v5f32, Index, !cast<SubRegIndex>(sub#Index) 1260 >; 1261} 1262 1263foreach Index = 0-5 in { 1264 def Extract_Element_v6i32_#Index : Extract_Element < 1265 i32, v6i32, Index, !cast<SubRegIndex>(sub#Index) 1266 >; 1267 def Insert_Element_v6i32_#Index : Insert_Element < 1268 i32, v6i32, Index, !cast<SubRegIndex>(sub#Index) 1269 >; 1270 1271 def Extract_Element_v6f32_#Index : Extract_Element < 1272 f32, v6f32, Index, !cast<SubRegIndex>(sub#Index) 1273 >; 1274 def Insert_Element_v6f32_#Index : Insert_Element < 1275 f32, v6f32, Index, !cast<SubRegIndex>(sub#Index) 1276 >; 1277} 1278 1279foreach Index = 0-6 in { 1280 def Extract_Element_v7i32_#Index : Extract_Element < 1281 i32, v7i32, Index, !cast<SubRegIndex>(sub#Index) 1282 >; 1283 def Insert_Element_v7i32_#Index : Insert_Element < 1284 i32, v7i32, Index, !cast<SubRegIndex>(sub#Index) 1285 >; 1286 1287 def Extract_Element_v7f32_#Index : Extract_Element < 1288 f32, v7f32, Index, !cast<SubRegIndex>(sub#Index) 1289 >; 1290 def Insert_Element_v7f32_#Index : Insert_Element < 1291 f32, v7f32, Index, !cast<SubRegIndex>(sub#Index) 1292 >; 1293} 1294 1295foreach Index = 0-7 in { 1296 def Extract_Element_v8i32_#Index : Extract_Element < 1297 i32, v8i32, Index, 
!cast<SubRegIndex>(sub#Index) 1298 >; 1299 def Insert_Element_v8i32_#Index : Insert_Element < 1300 i32, v8i32, Index, !cast<SubRegIndex>(sub#Index) 1301 >; 1302 1303 def Extract_Element_v8f32_#Index : Extract_Element < 1304 f32, v8f32, Index, !cast<SubRegIndex>(sub#Index) 1305 >; 1306 def Insert_Element_v8f32_#Index : Insert_Element < 1307 f32, v8f32, Index, !cast<SubRegIndex>(sub#Index) 1308 >; 1309} 1310 1311foreach Index = 0-8 in { 1312 def Extract_Element_v9i32_#Index : Extract_Element < 1313 i32, v9i32, Index, !cast<SubRegIndex>(sub#Index) 1314 >; 1315 def Insert_Element_v9i32_#Index : Insert_Element < 1316 i32, v9i32, Index, !cast<SubRegIndex>(sub#Index) 1317 >; 1318 1319 def Extract_Element_v9f32_#Index : Extract_Element < 1320 f32, v9f32, Index, !cast<SubRegIndex>(sub#Index) 1321 >; 1322 def Insert_Element_v9f32_#Index : Insert_Element < 1323 f32, v9f32, Index, !cast<SubRegIndex>(sub#Index) 1324 >; 1325} 1326 1327foreach Index = 0-9 in { 1328 def Extract_Element_v10i32_#Index : Extract_Element < 1329 i32, v10i32, Index, !cast<SubRegIndex>(sub#Index) 1330 >; 1331 def Insert_Element_v10i32_#Index : Insert_Element < 1332 i32, v10i32, Index, !cast<SubRegIndex>(sub#Index) 1333 >; 1334 1335 def Extract_Element_v10f32_#Index : Extract_Element < 1336 f32, v10f32, Index, !cast<SubRegIndex>(sub#Index) 1337 >; 1338 def Insert_Element_v10f32_#Index : Insert_Element < 1339 f32, v10f32, Index, !cast<SubRegIndex>(sub#Index) 1340 >; 1341} 1342 1343foreach Index = 0-10 in { 1344 def Extract_Element_v11i32_#Index : Extract_Element < 1345 i32, v11i32, Index, !cast<SubRegIndex>(sub#Index) 1346 >; 1347 def Insert_Element_v11i32_#Index : Insert_Element < 1348 i32, v11i32, Index, !cast<SubRegIndex>(sub#Index) 1349 >; 1350 1351 def Extract_Element_v11f32_#Index : Extract_Element < 1352 f32, v11f32, Index, !cast<SubRegIndex>(sub#Index) 1353 >; 1354 def Insert_Element_v11f32_#Index : Insert_Element < 1355 f32, v11f32, Index, !cast<SubRegIndex>(sub#Index) 1356 >; 1357} 1358 1359foreach Index = 0-11 in { 1360 def Extract_Element_v12i32_#Index : Extract_Element < 1361 i32, v12i32, Index, !cast<SubRegIndex>(sub#Index) 1362 >; 1363 def Insert_Element_v12i32_#Index : Insert_Element < 1364 i32, v12i32, Index, !cast<SubRegIndex>(sub#Index) 1365 >; 1366 1367 def Extract_Element_v12f32_#Index : Extract_Element < 1368 f32, v12f32, Index, !cast<SubRegIndex>(sub#Index) 1369 >; 1370 def Insert_Element_v12f32_#Index : Insert_Element < 1371 f32, v12f32, Index, !cast<SubRegIndex>(sub#Index) 1372 >; 1373} 1374 1375foreach Index = 0-15 in { 1376 def Extract_Element_v16i32_#Index : Extract_Element < 1377 i32, v16i32, Index, !cast<SubRegIndex>(sub#Index) 1378 >; 1379 def Insert_Element_v16i32_#Index : Insert_Element < 1380 i32, v16i32, Index, !cast<SubRegIndex>(sub#Index) 1381 >; 1382 1383 def Extract_Element_v16f32_#Index : Extract_Element < 1384 f32, v16f32, Index, !cast<SubRegIndex>(sub#Index) 1385 >; 1386 def Insert_Element_v16f32_#Index : Insert_Element < 1387 f32, v16f32, Index, !cast<SubRegIndex>(sub#Index) 1388 >; 1389} 1390 1391 1392foreach Index = 0-31 in { 1393 def Extract_Element_v32i32_#Index : Extract_Element < 1394 i32, v32i32, Index, !cast<SubRegIndex>(sub#Index) 1395 >; 1396 1397 def Insert_Element_v32i32_#Index : Insert_Element < 1398 i32, v32i32, Index, !cast<SubRegIndex>(sub#Index) 1399 >; 1400 1401 def Extract_Element_v32f32_#Index : Extract_Element < 1402 f32, v32f32, Index, !cast<SubRegIndex>(sub#Index) 1403 >; 1404 1405 def Insert_Element_v32f32_#Index : Insert_Element < 1406 f32, v32f32, Index, 
!cast<SubRegIndex>(sub#Index) 1407 >; 1408} 1409 1410// FIXME: Why do only some of these type combinations for SReg and 1411// VReg? 1412// 16-bit bitcast 1413def : BitConvert <i16, f16, VGPR_32>; 1414def : BitConvert <f16, i16, VGPR_32>; 1415def : BitConvert <i16, f16, SReg_32>; 1416def : BitConvert <f16, i16, SReg_32>; 1417 1418// 32-bit bitcast 1419def : BitConvert <i32, f32, VGPR_32>; 1420def : BitConvert <f32, i32, VGPR_32>; 1421def : BitConvert <i32, f32, SReg_32>; 1422def : BitConvert <f32, i32, SReg_32>; 1423def : BitConvert <v2i16, i32, SReg_32>; 1424def : BitConvert <i32, v2i16, SReg_32>; 1425def : BitConvert <v2f16, i32, SReg_32>; 1426def : BitConvert <i32, v2f16, SReg_32>; 1427def : BitConvert <v2i16, v2f16, SReg_32>; 1428def : BitConvert <v2f16, v2i16, SReg_32>; 1429def : BitConvert <v2f16, f32, SReg_32>; 1430def : BitConvert <f32, v2f16, SReg_32>; 1431def : BitConvert <v2i16, f32, SReg_32>; 1432def : BitConvert <f32, v2i16, SReg_32>; 1433 1434// 64-bit bitcast 1435def : BitConvert <i64, f64, VReg_64>; 1436def : BitConvert <f64, i64, VReg_64>; 1437def : BitConvert <v2i32, v2f32, VReg_64>; 1438def : BitConvert <v2f32, v2i32, VReg_64>; 1439def : BitConvert <i64, v2i32, VReg_64>; 1440def : BitConvert <v2i32, i64, VReg_64>; 1441def : BitConvert <i64, v2f32, VReg_64>; 1442def : BitConvert <v2f32, i64, VReg_64>; 1443def : BitConvert <f64, v2f32, VReg_64>; 1444def : BitConvert <v2f32, f64, VReg_64>; 1445def : BitConvert <f64, v2i32, VReg_64>; 1446def : BitConvert <v2i32, f64, VReg_64>; 1447def : BitConvert <v4i16, v4f16, VReg_64>; 1448def : BitConvert <v4f16, v4i16, VReg_64>; 1449 1450// FIXME: Make SGPR 1451def : BitConvert <v2i32, v4f16, VReg_64>; 1452def : BitConvert <v4f16, v2i32, VReg_64>; 1453def : BitConvert <v2i32, v4f16, VReg_64>; 1454def : BitConvert <v2i32, v4i16, VReg_64>; 1455def : BitConvert <v4i16, v2i32, VReg_64>; 1456def : BitConvert <v2f32, v4f16, VReg_64>; 1457def : BitConvert <v4f16, v2f32, VReg_64>; 1458def : BitConvert <v2f32, v4i16, VReg_64>; 1459def : BitConvert <v4i16, v2f32, VReg_64>; 1460def : BitConvert <v4i16, f64, VReg_64>; 1461def : BitConvert <v4f16, f64, VReg_64>; 1462def : BitConvert <f64, v4i16, VReg_64>; 1463def : BitConvert <f64, v4f16, VReg_64>; 1464def : BitConvert <v4i16, i64, VReg_64>; 1465def : BitConvert <v4f16, i64, VReg_64>; 1466def : BitConvert <i64, v4i16, VReg_64>; 1467def : BitConvert <i64, v4f16, VReg_64>; 1468 1469def : BitConvert <v4i32, v4f32, VReg_128>; 1470def : BitConvert <v4f32, v4i32, VReg_128>; 1471 1472// 96-bit bitcast 1473def : BitConvert <v3i32, v3f32, SGPR_96>; 1474def : BitConvert <v3f32, v3i32, SGPR_96>; 1475 1476// 128-bit bitcast 1477def : BitConvert <v2i64, v4i32, SReg_128>; 1478def : BitConvert <v4i32, v2i64, SReg_128>; 1479def : BitConvert <v2f64, v4f32, VReg_128>; 1480def : BitConvert <v2f64, v4i32, VReg_128>; 1481def : BitConvert <v4f32, v2f64, VReg_128>; 1482def : BitConvert <v4i32, v2f64, VReg_128>; 1483def : BitConvert <v2i64, v2f64, VReg_128>; 1484def : BitConvert <v2f64, v2i64, VReg_128>; 1485def : BitConvert <v4f32, v2i64, VReg_128>; 1486def : BitConvert <v2i64, v4f32, VReg_128>; 1487def : BitConvert <v8i16, v4i32, SReg_128>; 1488def : BitConvert <v4i32, v8i16, SReg_128>; 1489def : BitConvert <v8f16, v4f32, VReg_128>; 1490def : BitConvert <v8f16, v4i32, VReg_128>; 1491def : BitConvert <v4f32, v8f16, VReg_128>; 1492def : BitConvert <v4i32, v8f16, VReg_128>; 1493def : BitConvert <v8i16, v8f16, VReg_128>; 1494def : BitConvert <v8f16, v8i16, VReg_128>; 1495def : BitConvert <v4f32, v8i16, VReg_128>; 1496def : 
BitConvert <v8i16, v4f32, VReg_128>; 1497def : BitConvert <v8i16, v8f16, SReg_128>; 1498def : BitConvert <v8i16, v2i64, SReg_128>; 1499def : BitConvert <v8i16, v2f64, SReg_128>; 1500def : BitConvert <v8f16, v2i64, SReg_128>; 1501def : BitConvert <v8f16, v2f64, SReg_128>; 1502def : BitConvert <v8f16, v8i16, SReg_128>; 1503def : BitConvert <v2i64, v8i16, SReg_128>; 1504def : BitConvert <v2f64, v8i16, SReg_128>; 1505def : BitConvert <v2i64, v8f16, SReg_128>; 1506def : BitConvert <v2f64, v8f16, SReg_128>; 1507 1508// 160-bit bitcast 1509def : BitConvert <v5i32, v5f32, SReg_160>; 1510def : BitConvert <v5f32, v5i32, SReg_160>; 1511def : BitConvert <v5i32, v5f32, VReg_160>; 1512def : BitConvert <v5f32, v5i32, VReg_160>; 1513 1514// 192-bit bitcast 1515def : BitConvert <v6i32, v6f32, SReg_192>; 1516def : BitConvert <v6f32, v6i32, SReg_192>; 1517def : BitConvert <v6i32, v6f32, VReg_192>; 1518def : BitConvert <v6f32, v6i32, VReg_192>; 1519def : BitConvert <v3i64, v3f64, VReg_192>; 1520def : BitConvert <v3f64, v3i64, VReg_192>; 1521def : BitConvert <v3i64, v6i32, VReg_192>; 1522def : BitConvert <v3i64, v6f32, VReg_192>; 1523def : BitConvert <v3f64, v6i32, VReg_192>; 1524def : BitConvert <v3f64, v6f32, VReg_192>; 1525def : BitConvert <v6i32, v3i64, VReg_192>; 1526def : BitConvert <v6f32, v3i64, VReg_192>; 1527def : BitConvert <v6i32, v3f64, VReg_192>; 1528def : BitConvert <v6f32, v3f64, VReg_192>; 1529 1530// 224-bit bitcast 1531def : BitConvert <v7i32, v7f32, SReg_224>; 1532def : BitConvert <v7f32, v7i32, SReg_224>; 1533def : BitConvert <v7i32, v7f32, VReg_224>; 1534def : BitConvert <v7f32, v7i32, VReg_224>; 1535 1536// 256-bit bitcast 1537def : BitConvert <v8i32, v8f32, SReg_256>; 1538def : BitConvert <v8f32, v8i32, SReg_256>; 1539def : BitConvert <v8i32, v8f32, VReg_256>; 1540def : BitConvert <v8f32, v8i32, VReg_256>; 1541def : BitConvert <v4i64, v4f64, VReg_256>; 1542def : BitConvert <v4f64, v4i64, VReg_256>; 1543def : BitConvert <v4i64, v8i32, VReg_256>; 1544def : BitConvert <v4i64, v8f32, VReg_256>; 1545def : BitConvert <v4f64, v8i32, VReg_256>; 1546def : BitConvert <v4f64, v8f32, VReg_256>; 1547def : BitConvert <v8i32, v4i64, VReg_256>; 1548def : BitConvert <v8f32, v4i64, VReg_256>; 1549def : BitConvert <v8i32, v4f64, VReg_256>; 1550def : BitConvert <v8f32, v4f64, VReg_256>; 1551def : BitConvert <v16i16, v16f16, SReg_256>; 1552def : BitConvert <v16f16, v16i16, SReg_256>; 1553def : BitConvert <v16i16, v16f16, VReg_256>; 1554def : BitConvert <v16f16, v16i16, VReg_256>; 1555def : BitConvert <v16f16, v8i32, VReg_256>; 1556def : BitConvert <v16i16, v8i32, VReg_256>; 1557def : BitConvert <v16f16, v8f32, VReg_256>; 1558def : BitConvert <v16i16, v8f32, VReg_256>; 1559def : BitConvert <v8i32, v16f16, VReg_256>; 1560def : BitConvert <v8i32, v16i16, VReg_256>; 1561def : BitConvert <v8f32, v16f16, VReg_256>; 1562def : BitConvert <v8f32, v16i16, VReg_256>; 1563def : BitConvert <v16f16, v4i64, VReg_256>; 1564def : BitConvert <v16i16, v4i64, VReg_256>; 1565def : BitConvert <v16f16, v4f64, VReg_256>; 1566def : BitConvert <v16i16, v4f64, VReg_256>; 1567def : BitConvert <v4i64, v16f16, VReg_256>; 1568def : BitConvert <v4i64, v16i16, VReg_256>; 1569def : BitConvert <v4f64, v16f16, VReg_256>; 1570def : BitConvert <v4f64, v16i16, VReg_256>; 1571 1572// 288-bit bitcast 1573def : BitConvert <v9i32, v9f32, SReg_288>; 1574def : BitConvert <v9f32, v9i32, SReg_288>; 1575def : BitConvert <v9i32, v9f32, VReg_288>; 1576def : BitConvert <v9f32, v9i32, VReg_288>; 1577 1578// 320-bit bitcast 1579def : BitConvert <v10i32, 
v10f32, SReg_320>;
def : BitConvert <v10f32, v10i32, SReg_320>;
def : BitConvert <v10i32, v10f32, VReg_320>;
def : BitConvert <v10f32, v10i32, VReg_320>;

// 352-bit bitcast
def : BitConvert <v11i32, v11f32, SReg_352>;
def : BitConvert <v11f32, v11i32, SReg_352>;
def : BitConvert <v11i32, v11f32, VReg_352>;
def : BitConvert <v11f32, v11i32, VReg_352>;

// 384-bit bitcast
def : BitConvert <v12i32, v12f32, SReg_384>;
def : BitConvert <v12f32, v12i32, SReg_384>;
def : BitConvert <v12i32, v12f32, VReg_384>;
def : BitConvert <v12f32, v12i32, VReg_384>;

// 512-bit bitcast
def : BitConvert <v16i32, v16f32, VReg_512>;
def : BitConvert <v16f32, v16i32, VReg_512>;
def : BitConvert <v8i64, v8f64, VReg_512>;
def : BitConvert <v8f64, v8i64, VReg_512>;
def : BitConvert <v8i64, v16i32, VReg_512>;
def : BitConvert <v8f64, v16i32, VReg_512>;
def : BitConvert <v16i32, v8i64, VReg_512>;
def : BitConvert <v16i32, v8f64, VReg_512>;
def : BitConvert <v8i64, v16f32, VReg_512>;
def : BitConvert <v8f64, v16f32, VReg_512>;
def : BitConvert <v16f32, v8i64, VReg_512>;
def : BitConvert <v16f32, v8f64, VReg_512>;

// 1024-bit bitcast
def : BitConvert <v32i32, v32f32, VReg_1024>;
def : BitConvert <v32f32, v32i32, VReg_1024>;
def : BitConvert <v16i64, v16f64, VReg_1024>;
def : BitConvert <v16f64, v16i64, VReg_1024>;
def : BitConvert <v16i64, v32i32, VReg_1024>;
def : BitConvert <v32i32, v16i64, VReg_1024>;
def : BitConvert <v16f64, v32f32, VReg_1024>;
def : BitConvert <v32f32, v16f64, VReg_1024>;
def : BitConvert <v16i64, v32f32, VReg_1024>;
def : BitConvert <v32i32, v16f64, VReg_1024>;
def : BitConvert <v16f64, v32i32, VReg_1024>;
def : BitConvert <v32f32, v16i64, VReg_1024>;


/********** =================== **********/
/********** Src & Dst modifiers **********/
/********** =================== **********/


// If denormals are not enabled, it only impacts the compare of the
// inputs. The output result is not flushed.
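//
// AMDGPUclamp is selected below as a max of the source with itself plus the
// DSTCLAMP.ENABLE output modifier, which clamps the result to [0.0, 1.0].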
1632class ClampPat<Instruction inst, ValueType vt> : GCNPat < 1633 (vt (AMDGPUclamp (VOP3Mods vt:$src0, i32:$src0_modifiers))), 1634 (inst i32:$src0_modifiers, vt:$src0, 1635 i32:$src0_modifiers, vt:$src0, DSTCLAMP.ENABLE, DSTOMOD.NONE) 1636>; 1637 1638def : ClampPat<V_MAX_F32_e64, f32>; 1639def : ClampPat<V_MAX_F64_e64, f64>; 1640let SubtargetPredicate = NotHasTrue16BitInsts in 1641def : ClampPat<V_MAX_F16_e64, f16>; 1642let SubtargetPredicate = HasTrue16BitInsts in 1643def : ClampPat<V_MAX_F16_t16_e64, f16>; 1644 1645let SubtargetPredicate = HasVOP3PInsts in { 1646def : GCNPat < 1647 (v2f16 (AMDGPUclamp (VOP3PMods v2f16:$src0, i32:$src0_modifiers))), 1648 (V_PK_MAX_F16 $src0_modifiers, $src0, 1649 $src0_modifiers, $src0, DSTCLAMP.ENABLE) 1650>; 1651} 1652 1653 1654/********** ================================ **********/ 1655/********** Floating point absolute/negative **********/ 1656/********** ================================ **********/ 1657 1658def : GCNPat < 1659 (UniformUnaryFrag<fneg> (fabs (f32 SReg_32:$src))), 1660 (S_OR_B32 SReg_32:$src, (S_MOV_B32 (i32 0x80000000))) // Set sign bit 1661>; 1662 1663def : GCNPat < 1664 (UniformUnaryFrag<fabs> (f32 SReg_32:$src)), 1665 (S_AND_B32 SReg_32:$src, (S_MOV_B32 (i32 0x7fffffff))) 1666>; 1667 1668def : GCNPat < 1669 (UniformUnaryFrag<fneg> (f32 SReg_32:$src)), 1670 (S_XOR_B32 SReg_32:$src, (S_MOV_B32 (i32 0x80000000))) 1671>; 1672 1673def : GCNPat < 1674 (UniformUnaryFrag<fneg> (f16 SReg_32:$src)), 1675 (S_XOR_B32 SReg_32:$src, (S_MOV_B32 (i32 0x00008000))) 1676>; 1677 1678def : GCNPat < 1679 (UniformUnaryFrag<fabs> (f16 SReg_32:$src)), 1680 (S_AND_B32 SReg_32:$src, (S_MOV_B32 (i32 0x00007fff))) 1681>; 1682 1683def : GCNPat < 1684 (UniformUnaryFrag<fneg> (fabs (f16 SReg_32:$src))), 1685 (S_OR_B32 SReg_32:$src, (S_MOV_B32 (i32 0x00008000))) // Set sign bit 1686>; 1687 1688def : GCNPat < 1689 (UniformUnaryFrag<fneg> (v2f16 SReg_32:$src)), 1690 (S_XOR_B32 SReg_32:$src, (S_MOV_B32 (i32 0x80008000))) 1691>; 1692 1693def : GCNPat < 1694 (UniformUnaryFrag<fabs> (v2f16 SReg_32:$src)), 1695 (S_AND_B32 SReg_32:$src, (S_MOV_B32 (i32 0x7fff7fff))) 1696>; 1697 1698// This is really (fneg (fabs v2f16:$src)) 1699// 1700// fabs is not reported as free because there is modifier for it in 1701// VOP3P instructions, so it is turned into the bit op. 1702def : GCNPat < 1703 (UniformUnaryFrag<fneg> (v2f16 (bitconvert (and_oneuse (i32 SReg_32:$src), 0x7fff7fff)))), 1704 (S_OR_B32 SReg_32:$src, (S_MOV_B32 (i32 0x80008000))) // Set sign bit 1705>; 1706 1707def : GCNPat < 1708 (UniformUnaryFrag<fneg> (v2f16 (fabs SReg_32:$src))), 1709 (S_OR_B32 SReg_32:$src, (S_MOV_B32 (i32 0x80008000))) // Set sign bit 1710>; 1711 1712 1713// COPY_TO_REGCLASS is needed to avoid using SCC from S_XOR_B32 instead 1714// of the real value. 
1715def : GCNPat < 1716 (UniformUnaryFrag<fneg> (v2f32 SReg_64:$src)), 1717 (v2f32 (REG_SEQUENCE SReg_64, 1718 (f32 (COPY_TO_REGCLASS (S_XOR_B32 (i32 (EXTRACT_SUBREG $src, sub0)), 1719 (i32 (S_MOV_B32 (i32 0x80000000)))), 1720 SReg_32)), sub0, 1721 (f32 (COPY_TO_REGCLASS (S_XOR_B32 (i32 (EXTRACT_SUBREG $src, sub1)), 1722 (i32 (S_MOV_B32 (i32 0x80000000)))), 1723 SReg_32)), sub1)) 1724>; 1725 1726def : GCNPat < 1727 (UniformUnaryFrag<fabs> (v2f32 SReg_64:$src)), 1728 (v2f32 (REG_SEQUENCE SReg_64, 1729 (f32 (COPY_TO_REGCLASS (S_AND_B32 (i32 (EXTRACT_SUBREG $src, sub0)), 1730 (i32 (S_MOV_B32 (i32 0x7fffffff)))), 1731 SReg_32)), sub0, 1732 (f32 (COPY_TO_REGCLASS (S_AND_B32 (i32 (EXTRACT_SUBREG $src, sub1)), 1733 (i32 (S_MOV_B32 (i32 0x7fffffff)))), 1734 SReg_32)), sub1)) 1735>; 1736 1737def : GCNPat < 1738 (UniformUnaryFrag<fneg> (fabs (v2f32 SReg_64:$src))), 1739 (v2f32 (REG_SEQUENCE SReg_64, 1740 (f32 (COPY_TO_REGCLASS (S_OR_B32 (i32 (EXTRACT_SUBREG $src, sub0)), 1741 (i32 (S_MOV_B32 (i32 0x80000000)))), 1742 SReg_32)), sub0, 1743 (f32 (COPY_TO_REGCLASS (S_OR_B32 (i32 (EXTRACT_SUBREG $src, sub1)), 1744 (i32 (S_MOV_B32 (i32 0x80000000)))), 1745 SReg_32)), sub1)) 1746>; 1747 1748// FIXME: Use S_BITSET0_B32/B64? 1749def : GCNPat < 1750 (UniformUnaryFrag<fabs> (f64 SReg_64:$src)), 1751 (REG_SEQUENCE SReg_64, 1752 (i32 (EXTRACT_SUBREG SReg_64:$src, sub0)), 1753 sub0, 1754 (i32 (COPY_TO_REGCLASS (S_AND_B32 (i32 (EXTRACT_SUBREG SReg_64:$src, sub1)), 1755 (S_MOV_B32 (i32 0x7fffffff))), SReg_32)), // Set sign bit. 1756 sub1) 1757>; 1758 1759def : GCNPat < 1760 (UniformUnaryFrag<fneg> (f64 SReg_64:$src)), 1761 (REG_SEQUENCE SReg_64, 1762 (i32 (EXTRACT_SUBREG SReg_64:$src, sub0)), 1763 sub0, 1764 (i32 (COPY_TO_REGCLASS (S_XOR_B32 (i32 (EXTRACT_SUBREG SReg_64:$src, sub1)), 1765 (i32 (S_MOV_B32 (i32 0x80000000)))), SReg_32)), 1766 sub1) 1767>; 1768 1769def : GCNPat < 1770 (UniformUnaryFrag<fneg> (fabs (f64 SReg_64:$src))), 1771 (REG_SEQUENCE SReg_64, 1772 (i32 (EXTRACT_SUBREG SReg_64:$src, sub0)), 1773 sub0, 1774 (i32 (COPY_TO_REGCLASS (S_OR_B32 (i32 (EXTRACT_SUBREG SReg_64:$src, sub1)), 1775 (S_MOV_B32 (i32 0x80000000))), SReg_32)),// Set sign bit. 
      sub1)
>;


def : GCNPat <
  (fneg (fabs (f32 VGPR_32:$src))),
  (V_OR_B32_e64 (S_MOV_B32 (i32 0x80000000)), VGPR_32:$src) // Set sign bit
>;

def : GCNPat <
  (fabs (f32 VGPR_32:$src)),
  (V_AND_B32_e64 (S_MOV_B32 (i32 0x7fffffff)), VGPR_32:$src)
>;

def : GCNPat <
  (fneg (f32 VGPR_32:$src)),
  (V_XOR_B32_e64 (S_MOV_B32 (i32 0x80000000)), VGPR_32:$src)
>;

def : GCNPat <
  (fabs (f16 VGPR_32:$src)),
  (V_AND_B32_e64 (S_MOV_B32 (i32 0x00007fff)), VGPR_32:$src)
>;

def : GCNPat <
  (fneg (f16 VGPR_32:$src)),
  (V_XOR_B32_e64 (S_MOV_B32 (i32 0x00008000)), VGPR_32:$src)
>;

def : GCNPat <
  (fneg (fabs (f16 VGPR_32:$src))),
  (V_OR_B32_e64 (S_MOV_B32 (i32 0x00008000)), VGPR_32:$src) // Set sign bit
>;

def : GCNPat <
  (fneg (v2f16 VGPR_32:$src)),
  (V_XOR_B32_e64 (S_MOV_B32 (i32 0x80008000)), VGPR_32:$src)
>;

def : GCNPat <
  (fabs (v2f16 VGPR_32:$src)),
  (V_AND_B32_e64 (S_MOV_B32 (i32 0x7fff7fff)), VGPR_32:$src)
>;

def : GCNPat <
  (fneg (v2f16 (fabs VGPR_32:$src))),
  (V_OR_B32_e64 (S_MOV_B32 (i32 0x80008000)), VGPR_32:$src)
>;

def : GCNPat <
  (fabs (f64 VReg_64:$src)),
  (REG_SEQUENCE VReg_64,
    (i32 (EXTRACT_SUBREG VReg_64:$src, sub0)),
    sub0,
    (V_AND_B32_e64 (i32 (S_MOV_B32 (i32 0x7fffffff))),
                   (i32 (EXTRACT_SUBREG VReg_64:$src, sub1))),
    sub1)
>;

def : GCNPat <
  (fneg (f64 VReg_64:$src)),
  (REG_SEQUENCE VReg_64,
    (i32 (EXTRACT_SUBREG VReg_64:$src, sub0)),
    sub0,
    (V_XOR_B32_e64 (i32 (S_MOV_B32 (i32 0x80000000))),
                   (i32 (EXTRACT_SUBREG VReg_64:$src, sub1))),
    sub1)
>;

def : GCNPat <
  (fneg (fabs (f64 VReg_64:$src))),
  (REG_SEQUENCE VReg_64,
    (i32 (EXTRACT_SUBREG VReg_64:$src, sub0)),
    sub0,
    (V_OR_B32_e64 (i32 (S_MOV_B32 (i32 0x80000000))),
                  (i32 (EXTRACT_SUBREG VReg_64:$src, sub1))),
    sub1)
>;

def : GCNPat <
  (DivergentUnaryFrag<fneg> (v2f32 VReg_64:$src)),
  (V_PK_ADD_F32 11 /* OP_SEL_1 | NEG_LO | NEG_HI */, VReg_64:$src,
                11 /* OP_SEL_1 | NEG_LO | NEG_HI */, 0,
                0, 0, 0, 0, 0)
> {
  let SubtargetPredicate = HasPackedFP32Ops;
}

def : GCNPat <
  (fcopysign f16:$src0, f16:$src1),
  (V_BFI_B32_e64 (S_MOV_B32 (i32 0x00007fff)), $src0, $src1)
>;

def : GCNPat <
  (fcopysign f32:$src0, f16:$src1),
  (V_BFI_B32_e64 (S_MOV_B32 (i32 0x7fffffff)), $src0,
                 (V_LSHLREV_B32_e64 (i32 16), $src1))
>;

def : GCNPat <
  (fcopysign f64:$src0, f16:$src1),
  (REG_SEQUENCE SReg_64,
    (i32 (EXTRACT_SUBREG $src0, sub0)), sub0,
    (V_BFI_B32_e64 (S_MOV_B32 (i32 0x7fffffff)), (i32 (EXTRACT_SUBREG $src0, sub1)),
                   (V_LSHLREV_B32_e64 (i32 16), $src1)), sub1)
>;

def : GCNPat <
  (fcopysign f16:$src0, f32:$src1),
  (V_BFI_B32_e64 (S_MOV_B32 (i32 0x00007fff)), $src0,
                 (V_LSHRREV_B32_e64 (i32 16), $src1))
>;

def : GCNPat <
  (fcopysign f16:$src0, f64:$src1),
  (V_BFI_B32_e64 (S_MOV_B32 (i32 0x00007fff)), $src0,
                 (V_LSHRREV_B32_e64 (i32 16), (EXTRACT_SUBREG $src1, sub1)))
>;

/********** ================== **********/
/********** Immediate Patterns **********/
/********** ================== **********/

def : GCNPat <
  (VGPRImm<(i32 imm)>:$imm),
  (V_MOV_B32_e32 imm:$imm)
>;

def : GCNPat <
  (VGPRImm<(f32 fpimm)>:$imm),
(V_MOV_B32_e32 (f32 (bitcast_fpimm_to_i32 $imm))) 1907>; 1908 1909def : GCNPat < 1910 (i32 imm:$imm), 1911 (S_MOV_B32 imm:$imm) 1912>; 1913 1914def : GCNPat < 1915 (VGPRImm<(SIlds tglobaladdr:$ga)>), 1916 (V_MOV_B32_e32 $ga) 1917>; 1918 1919def : GCNPat < 1920 (SIlds tglobaladdr:$ga), 1921 (S_MOV_B32 $ga) 1922>; 1923 1924// FIXME: Workaround for ordering issue with peephole optimizer where 1925// a register class copy interferes with immediate folding. Should 1926// use s_mov_b32, which can be shrunk to s_movk_i32 1927def : GCNPat < 1928 (VGPRImm<(f16 fpimm)>:$imm), 1929 (V_MOV_B32_e32 (f16 (bitcast_fpimm_to_i32 $imm))) 1930>; 1931 1932def : GCNPat < 1933 (f32 fpimm:$imm), 1934 (S_MOV_B32 (f32 (bitcast_fpimm_to_i32 $imm))) 1935>; 1936 1937def : GCNPat < 1938 (f16 fpimm:$imm), 1939 (S_MOV_B32 (i32 (bitcast_fpimm_to_i32 $imm))) 1940>; 1941 1942def : GCNPat < 1943 (p5 frameindex:$fi), 1944 (V_MOV_B32_e32 (p5 (frameindex_to_targetframeindex $fi))) 1945>; 1946 1947def : GCNPat < 1948 (p5 frameindex:$fi), 1949 (S_MOV_B32 (p5 (frameindex_to_targetframeindex $fi))) 1950>; 1951 1952def : GCNPat < 1953 (i64 InlineImm64:$imm), 1954 (S_MOV_B64 InlineImm64:$imm) 1955>; 1956 1957// XXX - Should this use a s_cmp to set SCC? 1958 1959// Set to sign-extended 64-bit value (true = -1, false = 0) 1960def : GCNPat < 1961 (i1 imm:$imm), 1962 (S_MOV_B64 (i64 (as_i64imm $imm))) 1963> { 1964 let WaveSizePredicate = isWave64; 1965} 1966 1967def : GCNPat < 1968 (i1 imm:$imm), 1969 (S_MOV_B32 (i32 (as_i32imm $imm))) 1970> { 1971 let WaveSizePredicate = isWave32; 1972} 1973 1974def : GCNPat < 1975 (f64 InlineImmFP64:$imm), 1976 (S_MOV_B64 (f64 (bitcast_fpimm_to_i64 InlineImmFP64:$imm))) 1977>; 1978 1979/********** ================== **********/ 1980/********** Intrinsic Patterns **********/ 1981/********** ================== **********/ 1982 1983def : GCNPat < 1984 (f32 (fpow (VOP3Mods f32:$src0, i32:$src0_mods), (VOP3Mods f32:$src1, i32:$src1_mods))), 1985 (V_EXP_F32_e64 SRCMODS.NONE, (V_MUL_LEGACY_F32_e64 $src1_mods, $src1, SRCMODS.NONE, (V_LOG_F32_e64 $src0_mods, $src0), 0, 0)) 1986>; 1987 1988def : GCNPat < 1989 (i32 (sext i1:$src0)), 1990 (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0), 1991 /*src1mod*/(i32 0), /*src1*/(i32 -1), i1:$src0) 1992>; 1993 1994class Ext32Pat <SDNode ext> : GCNPat < 1995 (i32 (ext i1:$src0)), 1996 (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0), 1997 /*src1mod*/(i32 0), /*src1*/(i32 1), i1:$src0) 1998>; 1999 2000def : Ext32Pat <zext>; 2001def : Ext32Pat <anyext>; 2002 2003// The multiplication scales from [0,1) to the unsigned integer range, 2004// rounding down a bit to avoid unwanted overflow. 
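//
// CONST.FP_4294966784 is 4294966784.0 = 2^32 - 512, i.e. just below 2^32, so
// the scaled result always fits in an unsigned 32-bit integer after the
// conversion.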
2005def : GCNPat < 2006 (AMDGPUurecip i32:$src0), 2007 (V_CVT_U32_F32_e32 2008 (V_MUL_F32_e32 (i32 CONST.FP_4294966784), 2009 (V_RCP_IFLAG_F32_e32 (V_CVT_F32_U32_e32 $src0)))) 2010>; 2011 2012//===----------------------------------------------------------------------===// 2013// VOP3 Patterns 2014//===----------------------------------------------------------------------===// 2015 2016def : IMad24Pat<V_MAD_I32_I24_e64, 1>; 2017def : UMad24Pat<V_MAD_U32_U24_e64, 1>; 2018 2019// BFI patterns 2020 2021def BFIImm32 : PatFrag< 2022 (ops node:$x, node:$y, node:$z), 2023 (i32 (DivergentBinFrag<or> (and node:$y, node:$x), (and node:$z, imm))), 2024 [{ 2025 auto *X = dyn_cast<ConstantSDNode>(N->getOperand(0)->getOperand(1)); 2026 auto *NotX = dyn_cast<ConstantSDNode>(N->getOperand(1)->getOperand(1)); 2027 return X && NotX && 2028 ~(unsigned)X->getZExtValue() == (unsigned)NotX->getZExtValue(); 2029 }] 2030>; 2031 2032 2033// Definition from ISA doc: 2034// (y & x) | (z & ~x) 2035def : AMDGPUPatIgnoreCopies < 2036 (DivergentBinFrag<or> (and i32:$y, i32:$x), (and i32:$z, (not i32:$x))), 2037 (V_BFI_B32_e64 (COPY_TO_REGCLASS VSrc_b32:$x, VGPR_32), 2038 (COPY_TO_REGCLASS VSrc_b32:$y, VGPR_32), 2039 (COPY_TO_REGCLASS VSrc_b32:$z, VGPR_32)) 2040>; 2041 2042// (y & C) | (z & ~C) 2043def : AMDGPUPatIgnoreCopies < 2044 (BFIImm32 i32:$x, i32:$y, i32:$z), 2045 (V_BFI_B32_e64 VSrc_b32:$x, VSrc_b32:$y, VSrc_b32:$z) 2046>; 2047 2048// 64-bit version 2049def : AMDGPUPatIgnoreCopies < 2050 (DivergentBinFrag<or> (and i64:$y, i64:$x), (and i64:$z, (not i64:$x))), 2051 (REG_SEQUENCE VReg_64, 2052 (V_BFI_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$x, sub0)), 2053 (i32 (EXTRACT_SUBREG VReg_64:$y, sub0)), 2054 (i32 (EXTRACT_SUBREG VReg_64:$z, sub0))), sub0, 2055 (V_BFI_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$x, sub1)), 2056 (i32 (EXTRACT_SUBREG VReg_64:$y, sub1)), 2057 (i32 (EXTRACT_SUBREG VReg_64:$z, sub1))), sub1) 2058>; 2059 2060// SHA-256 Ch function 2061// z ^ (x & (y ^ z)) 2062def : AMDGPUPatIgnoreCopies < 2063 (DivergentBinFrag<xor> i32:$z, (and i32:$x, (xor i32:$y, i32:$z))), 2064 (V_BFI_B32_e64 (COPY_TO_REGCLASS VSrc_b32:$x, VGPR_32), 2065 (COPY_TO_REGCLASS VSrc_b32:$y, VGPR_32), 2066 (COPY_TO_REGCLASS VSrc_b32:$z, VGPR_32)) 2067>; 2068 2069// 64-bit version 2070def : AMDGPUPatIgnoreCopies < 2071 (DivergentBinFrag<xor> i64:$z, (and i64:$x, (xor i64:$y, i64:$z))), 2072 (REG_SEQUENCE VReg_64, 2073 (V_BFI_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$x, sub0)), 2074 (i32 (EXTRACT_SUBREG VReg_64:$y, sub0)), 2075 (i32 (EXTRACT_SUBREG VReg_64:$z, sub0))), sub0, 2076 (V_BFI_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$x, sub1)), 2077 (i32 (EXTRACT_SUBREG VReg_64:$y, sub1)), 2078 (i32 (EXTRACT_SUBREG VReg_64:$z, sub1))), sub1) 2079>; 2080 2081def : AMDGPUPat < 2082 (fcopysign f32:$src0, f32:$src1), 2083 (V_BFI_B32_e64 (S_MOV_B32 (i32 0x7fffffff)), $src0, $src1) 2084>; 2085 2086def : AMDGPUPat < 2087 (fcopysign f32:$src0, f64:$src1), 2088 (V_BFI_B32_e64 (S_MOV_B32 (i32 0x7fffffff)), $src0, 2089 (i32 (EXTRACT_SUBREG SReg_64:$src1, sub1))) 2090>; 2091 2092def : AMDGPUPat < 2093 (fcopysign f64:$src0, f64:$src1), 2094 (REG_SEQUENCE SReg_64, 2095 (i32 (EXTRACT_SUBREG $src0, sub0)), sub0, 2096 (V_BFI_B32_e64 (S_MOV_B32 (i32 0x7fffffff)), 2097 (i32 (EXTRACT_SUBREG SReg_64:$src0, sub1)), 2098 (i32 (EXTRACT_SUBREG SReg_64:$src1, sub1))), sub1) 2099>; 2100 2101def : AMDGPUPat < 2102 (fcopysign f64:$src0, f32:$src1), 2103 (REG_SEQUENCE SReg_64, 2104 (i32 (EXTRACT_SUBREG $src0, sub0)), sub0, 2105 (V_BFI_B32_e64 (S_MOV_B32 (i32 0x7fffffff)), 2106 (i32 
(EXTRACT_SUBREG SReg_64:$src0, sub1)), 2107 $src1), sub1) 2108>; 2109 2110def : ROTRPattern <V_ALIGNBIT_B32_e64>; 2111 2112def : GCNPat<(i32 (trunc (srl i64:$src0, (and i32:$src1, (i32 31))))), 2113 (V_ALIGNBIT_B32_e64 (i32 (EXTRACT_SUBREG (i64 $src0), sub1)), 2114 (i32 (EXTRACT_SUBREG (i64 $src0), sub0)), $src1)>; 2115 2116def : GCNPat<(i32 (trunc (srl i64:$src0, (i32 ShiftAmt32Imm:$src1)))), 2117 (V_ALIGNBIT_B32_e64 (i32 (EXTRACT_SUBREG (i64 $src0), sub1)), 2118 (i32 (EXTRACT_SUBREG (i64 $src0), sub0)), $src1)>; 2119 2120/********** ====================== **********/ 2121/********** Indirect addressing **********/ 2122/********** ====================== **********/ 2123 2124multiclass SI_INDIRECT_Pattern <ValueType vt, ValueType eltvt, string VecSize> { 2125 // Extract with offset 2126 def : GCNPat< 2127 (eltvt (extractelt vt:$src, (MOVRELOffset i32:$idx, (i32 imm:$offset)))), 2128 (!cast<Instruction>("SI_INDIRECT_SRC_"#VecSize) $src, $idx, imm:$offset) 2129 >; 2130 2131 // Insert with offset 2132 def : GCNPat< 2133 (insertelt vt:$src, eltvt:$val, (MOVRELOffset i32:$idx, (i32 imm:$offset))), 2134 (!cast<Instruction>("SI_INDIRECT_DST_"#VecSize) $src, $idx, imm:$offset, $val) 2135 >; 2136} 2137 2138defm : SI_INDIRECT_Pattern <v2f32, f32, "V2">; 2139defm : SI_INDIRECT_Pattern <v4f32, f32, "V4">; 2140defm : SI_INDIRECT_Pattern <v8f32, f32, "V8">; 2141defm : SI_INDIRECT_Pattern <v9f32, f32, "V9">; 2142defm : SI_INDIRECT_Pattern <v10f32, f32, "V10">; 2143defm : SI_INDIRECT_Pattern <v11f32, f32, "V11">; 2144defm : SI_INDIRECT_Pattern <v12f32, f32, "V12">; 2145defm : SI_INDIRECT_Pattern <v16f32, f32, "V16">; 2146defm : SI_INDIRECT_Pattern <v32f32, f32, "V32">; 2147 2148defm : SI_INDIRECT_Pattern <v2i32, i32, "V2">; 2149defm : SI_INDIRECT_Pattern <v4i32, i32, "V4">; 2150defm : SI_INDIRECT_Pattern <v8i32, i32, "V8">; 2151defm : SI_INDIRECT_Pattern <v9i32, i32, "V9">; 2152defm : SI_INDIRECT_Pattern <v10i32, i32, "V10">; 2153defm : SI_INDIRECT_Pattern <v11i32, i32, "V11">; 2154defm : SI_INDIRECT_Pattern <v12i32, i32, "V12">; 2155defm : SI_INDIRECT_Pattern <v16i32, i32, "V16">; 2156defm : SI_INDIRECT_Pattern <v32i32, i32, "V32">; 2157 2158//===----------------------------------------------------------------------===// 2159// SAD Patterns 2160//===----------------------------------------------------------------------===// 2161 2162def : GCNPat < 2163 (add (sub_oneuse (umax i32:$src0, i32:$src1), 2164 (umin i32:$src0, i32:$src1)), 2165 i32:$src2), 2166 (V_SAD_U32_e64 $src0, $src1, $src2, (i1 0)) 2167>; 2168 2169def : GCNPat < 2170 (add (select_oneuse (i1 (setugt i32:$src0, i32:$src1)), 2171 (sub i32:$src0, i32:$src1), 2172 (sub i32:$src1, i32:$src0)), 2173 i32:$src2), 2174 (V_SAD_U32_e64 $src0, $src1, $src2, (i1 0)) 2175>; 2176 2177//===----------------------------------------------------------------------===// 2178// Conversion Patterns 2179//===----------------------------------------------------------------------===// 2180def : GCNPat<(i32 (UniformSextInreg<i1> i32:$src)), 2181 (S_BFE_I32 i32:$src, (i32 65536))>; // 0 | 1 << 16 2182 2183// Handle sext_inreg in i64 2184def : GCNPat < 2185 (i64 (UniformSextInreg<i1> i64:$src)), 2186 (S_BFE_I64 i64:$src, (i32 0x10000)) // 0 | 1 << 16 2187>; 2188 2189def : GCNPat < 2190 (i16 (UniformSextInreg<i1> i16:$src)), 2191 (S_BFE_I32 $src, (i32 0x00010000)) // 0 | 1 << 16 2192>; 2193 2194def : GCNPat < 2195 (i16 (UniformSextInreg<i8> i16:$src)), 2196 (S_BFE_I32 $src, (i32 0x80000)) // 0 | 8 << 16 2197>; 2198 2199def : GCNPat < 2200 (i64 (UniformSextInreg<i8> 
i64:$src)), 2201 (S_BFE_I64 i64:$src, (i32 0x80000)) // 0 | 8 << 16 2202>; 2203 2204def : GCNPat < 2205 (i64 (UniformSextInreg<i16> i64:$src)), 2206 (S_BFE_I64 i64:$src, (i32 0x100000)) // 0 | 16 << 16 2207>; 2208 2209def : GCNPat < 2210 (i64 (UniformSextInreg<i32> i64:$src)), 2211 (S_BFE_I64 i64:$src, (i32 0x200000)) // 0 | 32 << 16 2212>; 2213 2214def : GCNPat< 2215 (i32 (DivergentSextInreg<i1> i32:$src)), 2216 (V_BFE_I32_e64 i32:$src, (i32 0), (i32 1))>; 2217 2218def : GCNPat < 2219 (i16 (DivergentSextInreg<i1> i16:$src)), 2220 (V_BFE_I32_e64 $src, (i32 0), (i32 1)) 2221>; 2222 2223def : GCNPat < 2224 (i16 (DivergentSextInreg<i8> i16:$src)), 2225 (V_BFE_I32_e64 $src, (i32 0), (i32 8)) 2226>; 2227 2228def : GCNPat< 2229 (i32 (DivergentSextInreg<i8> i32:$src)), 2230 (V_BFE_I32_e64 i32:$src, (i32 0), (i32 8)) 2231>; 2232 2233def : GCNPat < 2234 (i32 (DivergentSextInreg<i16> i32:$src)), 2235 (V_BFE_I32_e64 $src, (i32 0), (i32 16)) 2236>; 2237 2238def : GCNPat < 2239 (i64 (DivergentSextInreg<i1> i64:$src)), 2240 (REG_SEQUENCE VReg_64, 2241 (V_BFE_I32_e64 (i32 (EXTRACT_SUBREG i64:$src, sub0)), (i32 0), (i32 1)), sub0, 2242 (V_ASHRREV_I32_e32 (i32 31), (V_BFE_I32_e64 (i32 (EXTRACT_SUBREG i64:$src, sub0)), (i32 0), (i32 1))), sub1) 2243>; 2244 2245def : GCNPat < 2246 (i64 (DivergentSextInreg<i8> i64:$src)), 2247 (REG_SEQUENCE VReg_64, 2248 (V_BFE_I32_e64 (i32 (EXTRACT_SUBREG i64:$src, sub0)), (i32 0), (i32 8)), sub0, 2249 (V_ASHRREV_I32_e32 (i32 31), (V_BFE_I32_e64 (i32 (EXTRACT_SUBREG i64:$src, sub0)), (i32 0), (i32 8))), sub1) 2250>; 2251 2252def : GCNPat < 2253 (i64 (DivergentSextInreg<i16> i64:$src)), 2254 (REG_SEQUENCE VReg_64, 2255 (V_BFE_I32_e64 (i32 (EXTRACT_SUBREG i64:$src, sub0)), (i32 0), (i32 16)), sub0, 2256 (V_ASHRREV_I32_e32 (i32 31), (V_BFE_I32_e64 (i32 (EXTRACT_SUBREG i64:$src, sub0)), (i32 0), (i32 16))), sub1) 2257>; 2258 2259def : GCNPat < 2260 (i64 (DivergentSextInreg<i32> i64:$src)), 2261 (REG_SEQUENCE VReg_64, 2262 (i32 (EXTRACT_SUBREG i64:$src, sub0)), sub0, 2263 (V_ASHRREV_I32_e32 (i32 31), (i32 (EXTRACT_SUBREG i64:$src, sub0))), sub1) 2264>; 2265 2266def : GCNPat < 2267 (i64 (zext i32:$src)), 2268 (REG_SEQUENCE SReg_64, $src, sub0, (S_MOV_B32 (i32 0)), sub1) 2269>; 2270 2271def : GCNPat < 2272 (i64 (anyext i32:$src)), 2273 (REG_SEQUENCE SReg_64, $src, sub0, (i32 (IMPLICIT_DEF)), sub1) 2274>; 2275 2276class ZExt_i64_i1_Pat <SDNode ext> : GCNPat < 2277 (i64 (ext i1:$src)), 2278 (REG_SEQUENCE VReg_64, 2279 (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0), 2280 /*src1mod*/(i32 0), /*src1*/(i32 1), $src), 2281 sub0, (S_MOV_B32 (i32 0)), sub1) 2282>; 2283 2284 2285def : ZExt_i64_i1_Pat<zext>; 2286def : ZExt_i64_i1_Pat<anyext>; 2287 2288// FIXME: We need to use COPY_TO_REGCLASS to work-around the fact that 2289// REG_SEQUENCE patterns don't support instructions with multiple outputs. 
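//
// Sign extending i32 to i64 replicates bit 31 of the source into every bit of
// the high word, so the high half below is just an arithmetic shift right of
// the source by 31.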
def : GCNPat <
  (i64 (UniformUnaryFrag<sext> i32:$src)),
  (REG_SEQUENCE SReg_64, $src, sub0,
    (i32 (COPY_TO_REGCLASS (S_ASHR_I32 $src, (i32 31)), SReg_32_XM0)), sub1)
>;

def : GCNPat <
  (i64 (DivergentUnaryFrag<sext> i32:$src)),
  (REG_SEQUENCE VReg_64, $src, sub0,
    (i32 (COPY_TO_REGCLASS (V_ASHRREV_I32_e64 (i32 31), $src), VGPR_32)), sub1)
>;

def : GCNPat <
  (i64 (sext i1:$src)),
  (REG_SEQUENCE VReg_64,
    (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
                       /*src1mod*/(i32 0), /*src1*/(i32 -1), $src), sub0,
    (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
                       /*src1mod*/(i32 0), /*src1*/(i32 -1), $src), sub1)
>;

class FPToI1Pat<Instruction Inst, int KOne, ValueType kone_type, ValueType vt, SDPatternOperator fp_to_int> : GCNPat <
  (i1 (fp_to_int (vt (VOP3Mods vt:$src0, i32:$src0_modifiers)))),
  (i1 (Inst 0, (kone_type KOne), $src0_modifiers, $src0, DSTCLAMP.NONE))
>;

def : FPToI1Pat<V_CMP_EQ_F16_e64, CONST.FP16_ONE, i16, f16, fp_to_uint>;
def : FPToI1Pat<V_CMP_EQ_F16_e64, CONST.FP16_NEG_ONE, i16, f16, fp_to_sint>;
def : FPToI1Pat<V_CMP_EQ_F32_e64, CONST.FP32_ONE, i32, f32, fp_to_uint>;
def : FPToI1Pat<V_CMP_EQ_F32_e64, CONST.FP32_NEG_ONE, i32, f32, fp_to_sint>;
def : FPToI1Pat<V_CMP_EQ_F64_e64, CONST.FP64_ONE, i64, f64, fp_to_uint>;
def : FPToI1Pat<V_CMP_EQ_F64_e64, CONST.FP64_NEG_ONE, i64, f64, fp_to_sint>;

// If we need to perform a logical operation on i1 values, we need to
// use vector comparisons since there is only one SCC register. Vector
// comparisons may write to a pair of SGPRs or a single SGPR, so treat
// these as 32 or 64-bit comparisons. When SGPR copies are legalized,
// instructions whose results are copied from SCC into these operations
// will be moved to the VALU.
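//
// For 1-bit values, both addition and subtraction are xor (arithmetic mod 2),
// and adding or subtracting the constant -1 (true) is a logical not, which is
// why the i1 add/sub patterns below select to S_XOR/S_NOT on the lane mask.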
2329 2330let WaveSizePredicate = isWave64 in { 2331def : GCNPat < 2332 (i1 (and i1:$src0, i1:$src1)), 2333 (S_AND_B64 $src0, $src1) 2334>; 2335 2336def : GCNPat < 2337 (i1 (or i1:$src0, i1:$src1)), 2338 (S_OR_B64 $src0, $src1) 2339>; 2340 2341def : GCNPat < 2342 (i1 (xor i1:$src0, i1:$src1)), 2343 (S_XOR_B64 $src0, $src1) 2344>; 2345 2346def : GCNPat < 2347 (i1 (add i1:$src0, i1:$src1)), 2348 (S_XOR_B64 $src0, $src1) 2349>; 2350 2351def : GCNPat < 2352 (i1 (sub i1:$src0, i1:$src1)), 2353 (S_XOR_B64 $src0, $src1) 2354>; 2355 2356let AddedComplexity = 1 in { 2357def : GCNPat < 2358 (i1 (add i1:$src0, (i1 -1))), 2359 (S_NOT_B64 $src0) 2360>; 2361 2362def : GCNPat < 2363 (i1 (sub i1:$src0, (i1 -1))), 2364 (S_NOT_B64 $src0) 2365>; 2366} 2367} // end isWave64 2368 2369let WaveSizePredicate = isWave32 in { 2370def : GCNPat < 2371 (i1 (and i1:$src0, i1:$src1)), 2372 (S_AND_B32 $src0, $src1) 2373>; 2374 2375def : GCNPat < 2376 (i1 (or i1:$src0, i1:$src1)), 2377 (S_OR_B32 $src0, $src1) 2378>; 2379 2380def : GCNPat < 2381 (i1 (xor i1:$src0, i1:$src1)), 2382 (S_XOR_B32 $src0, $src1) 2383>; 2384 2385def : GCNPat < 2386 (i1 (add i1:$src0, i1:$src1)), 2387 (S_XOR_B32 $src0, $src1) 2388>; 2389 2390def : GCNPat < 2391 (i1 (sub i1:$src0, i1:$src1)), 2392 (S_XOR_B32 $src0, $src1) 2393>; 2394 2395let AddedComplexity = 1 in { 2396def : GCNPat < 2397 (i1 (add i1:$src0, (i1 -1))), 2398 (S_NOT_B32 $src0) 2399>; 2400 2401def : GCNPat < 2402 (i1 (sub i1:$src0, (i1 -1))), 2403 (S_NOT_B32 $src0) 2404>; 2405} 2406} // end isWave32 2407 2408def : GCNPat < 2409 (i32 (DivergentBinFrag<xor> i32:$src0, (i32 -1))), 2410 (V_NOT_B32_e32 $src0) 2411>; 2412 2413def : GCNPat < 2414 (i64 (DivergentBinFrag<xor> i64:$src0, (i64 -1))), 2415 (REG_SEQUENCE VReg_64, 2416 (V_NOT_B32_e32 (i32 (EXTRACT_SUBREG i64:$src0, sub0))), sub0, 2417 (V_NOT_B32_e32 (i32 (EXTRACT_SUBREG i64:$src0, sub1))), sub1 2418 ) 2419>; 2420 2421let SubtargetPredicate = NotHasTrue16BitInsts in 2422def : GCNPat < 2423 (f16 (sint_to_fp i1:$src)), 2424 (V_CVT_F16_F32_e32 ( 2425 V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0), 2426 /*src1mod*/(i32 0), /*src1*/(i32 CONST.FP32_NEG_ONE), 2427 SSrc_i1:$src)) 2428>; 2429 2430let SubtargetPredicate = HasTrue16BitInsts in 2431def : GCNPat < 2432 (f16 (sint_to_fp i1:$src)), 2433 (V_CVT_F16_F32_t16_e32 ( 2434 V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0), 2435 /*src1mod*/(i32 0), /*src1*/(i32 CONST.FP32_NEG_ONE), 2436 SSrc_i1:$src)) 2437>; 2438 2439let SubtargetPredicate = NotHasTrue16BitInsts in 2440def : GCNPat < 2441 (f16 (uint_to_fp i1:$src)), 2442 (V_CVT_F16_F32_e32 ( 2443 V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0), 2444 /*src1mod*/(i32 0), /*src1*/(i32 CONST.FP32_ONE), 2445 SSrc_i1:$src)) 2446>; 2447let SubtargetPredicate = HasTrue16BitInsts in 2448def : GCNPat < 2449 (f16 (uint_to_fp i1:$src)), 2450 (V_CVT_F16_F32_t16_e32 ( 2451 V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0), 2452 /*src1mod*/(i32 0), /*src1*/(i32 CONST.FP32_ONE), 2453 SSrc_i1:$src)) 2454>; 2455 2456def : GCNPat < 2457 (f32 (sint_to_fp i1:$src)), 2458 (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0), 2459 /*src1mod*/(i32 0), /*src1*/(i32 CONST.FP32_NEG_ONE), 2460 SSrc_i1:$src) 2461>; 2462 2463def : GCNPat < 2464 (f32 (uint_to_fp i1:$src)), 2465 (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0), 2466 /*src1mod*/(i32 0), /*src1*/(i32 CONST.FP32_ONE), 2467 SSrc_i1:$src) 2468>; 2469 2470def : GCNPat < 2471 (f64 (sint_to_fp i1:$src)), 2472 (V_CVT_F64_I32_e32 (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0), 
2473 /*src1mod*/(i32 0), /*src1*/(i32 -1), 2474 SSrc_i1:$src)) 2475>; 2476 2477def : GCNPat < 2478 (f64 (uint_to_fp i1:$src)), 2479 (V_CVT_F64_U32_e32 (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0), 2480 /*src1mod*/(i32 0), /*src1*/(i32 1), 2481 SSrc_i1:$src)) 2482>; 2483 2484//===----------------------------------------------------------------------===// 2485// Miscellaneous Patterns 2486//===----------------------------------------------------------------------===// 2487 2488// Eliminate a zero extension from an fp16 operation if it already 2489// zeros the high bits of the 32-bit register. 2490// 2491// This is complicated on gfx9+. Some instructions maintain the legacy 2492// zeroing behavior, but others preserve the high bits. Some have a 2493// control bit to change the behavior. We can't simply say with 2494// certainty what the source behavior is without more context on how 2495// the src is lowered. e.g. fptrunc + fma may be lowered to a 2496// v_fma_mix* instruction which does not zero, or may not. 2497def : GCNPat< 2498 (i32 (DivergentUnaryFrag<abs> i32:$src)), 2499 (V_MAX_I32_e64 (V_SUB_CO_U32_e32 (i32 0), $src), $src)>; 2500 2501let AddedComplexity = 1 in { 2502def : GCNPat< 2503 (i32 (DivergentUnaryFrag<abs> i32:$src)), 2504 (V_MAX_I32_e64 (V_SUB_U32_e32 (i32 0), $src), $src)>{ 2505 let SubtargetPredicate = HasAddNoCarryInsts; 2506} 2507} // AddedComplexity = 1 2508 2509def : GCNPat< 2510 (i32 (DivergentUnaryFrag<zext> i16:$src)), 2511 (V_AND_B32_e64 (S_MOV_B32 (i32 0xffff)), $src) 2512>; 2513 2514def : GCNPat< 2515 (i64 (DivergentUnaryFrag<zext> i16:$src)), 2516 (REG_SEQUENCE VReg_64, 2517 (V_AND_B32_e64 (S_MOV_B32 (i32 0xffff)), $src), sub0, 2518 (S_MOV_B32 (i32 0)), sub1) 2519>; 2520 2521def : GCNPat< 2522 (i32 (zext (i16 (bitconvert fp16_zeros_high_16bits:$src)))), 2523 (COPY VSrc_b16:$src)>; 2524 2525def : GCNPat < 2526 (i32 (trunc i64:$a)), 2527 (EXTRACT_SUBREG $a, sub0) 2528>; 2529 2530def : GCNPat < 2531 (i1 (UniformUnaryFrag<trunc> i32:$a)), 2532 (S_CMP_EQ_U32 (S_AND_B32 (i32 1), $a), (i32 1)) 2533>; 2534 2535def : GCNPat < 2536 (i1 (UniformUnaryFrag<trunc> i16:$a)), 2537 (S_CMP_EQ_U32 (S_AND_B32 (i32 1), $a), (i32 1)) 2538>; 2539 2540def : GCNPat < 2541 (i1 (UniformUnaryFrag<trunc> i64:$a)), 2542 (S_CMP_EQ_U32 (S_AND_B32 (i32 1), 2543 (i32 (EXTRACT_SUBREG $a, sub0))), (i32 1)) 2544>; 2545 2546def : GCNPat < 2547 (i1 (DivergentUnaryFrag<trunc> i32:$a)), 2548 (V_CMP_EQ_U32_e64 (V_AND_B32_e64 (i32 1), $a), (i32 1)) 2549>; 2550 2551def : GCNPat < 2552 (i1 (DivergentUnaryFrag<trunc> i16:$a)), 2553 (V_CMP_EQ_U32_e64 (V_AND_B32_e64 (i32 1), $a), (i32 1)) 2554>; 2555 2556def IMMBitSelConst : SDNodeXForm<imm, [{ 2557 return CurDAG->getTargetConstant(1ULL << N->getZExtValue(), SDLoc(N), 2558 MVT::i32); 2559}]>; 2560 2561// Matching separate SRL and TRUNC instructions 2562// with dependent operands (SRL dest is source of TRUNC) 2563// generates three instructions. However, by using bit shifts, 2564// the V_LSHRREV_B32_e64 result can be directly used in the 2565// operand of the V_AND_B32_e64 instruction: 2566// (trunc i32 (srl i32 $a, i32 $b)) -> 2567// v_and_b32_e64 $a, (1 << $b), $a 2568// v_cmp_ne_u32_e64 $a, 0, $a 2569 2570// Handle the VALU case. 2571def : GCNPat < 2572 (i1 (DivergentUnaryFrag<trunc> (i32 (srl i32:$a, (i32 imm:$b))))), 2573 (V_CMP_NE_U32_e64 (V_AND_B32_e64 (i32 (IMMBitSelConst $b)), $a), 2574 (i32 0)) 2575>; 2576 2577// Handle the scalar case. 
2578def : GCNPat < 2579 (i1 (UniformUnaryFrag<trunc> (i32 (srl i32:$a, (i32 imm:$b))))), 2580 (S_CMP_LG_U32 (S_AND_B32 (i32 (IMMBitSelConst $b)), $a), 2581 (i32 0)) 2582>; 2583 2584def : GCNPat < 2585 (i1 (DivergentUnaryFrag<trunc> i64:$a)), 2586 (V_CMP_EQ_U32_e64 (V_AND_B32_e64 (i32 1), 2587 (i32 (EXTRACT_SUBREG $a, sub0))), (i32 1)) 2588>; 2589 2590def : GCNPat < 2591 (i32 (bswap i32:$a)), 2592 (V_BFI_B32_e64 (S_MOV_B32 (i32 0x00ff00ff)), 2593 (V_ALIGNBIT_B32_e64 VSrc_b32:$a, VSrc_b32:$a, (i32 24)), 2594 (V_ALIGNBIT_B32_e64 VSrc_b32:$a, VSrc_b32:$a, (i32 8))) 2595>; 2596 2597// FIXME: This should have been narrowed to i32 during legalization. 2598// This pattern should also be skipped for GlobalISel 2599def : GCNPat < 2600 (i64 (bswap i64:$a)), 2601 (REG_SEQUENCE VReg_64, 2602 (V_BFI_B32_e64 (S_MOV_B32 (i32 0x00ff00ff)), 2603 (V_ALIGNBIT_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$a, sub1)), 2604 (i32 (EXTRACT_SUBREG VReg_64:$a, sub1)), 2605 (i32 24)), 2606 (V_ALIGNBIT_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$a, sub1)), 2607 (i32 (EXTRACT_SUBREG VReg_64:$a, sub1)), 2608 (i32 8))), 2609 sub0, 2610 (V_BFI_B32_e64 (S_MOV_B32 (i32 0x00ff00ff)), 2611 (V_ALIGNBIT_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$a, sub0)), 2612 (i32 (EXTRACT_SUBREG VReg_64:$a, sub0)), 2613 (i32 24)), 2614 (V_ALIGNBIT_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$a, sub0)), 2615 (i32 (EXTRACT_SUBREG VReg_64:$a, sub0)), 2616 (i32 8))), 2617 sub1) 2618>; 2619 2620// FIXME: The AddedComplexity should not be needed, but in GlobalISel 2621// the BFI pattern ends up taking precedence without it. 2622let SubtargetPredicate = isGFX8Plus, AddedComplexity = 1 in { 2623// Magic number: 3 | (2 << 8) | (1 << 16) | (0 << 24) 2624// 2625// My reading of the manual suggests we should be using src0 for the 2626// register value, but this is what seems to work. 2627def : GCNPat < 2628 (i32 (bswap i32:$a)), 2629 (V_PERM_B32_e64 (i32 0), VSrc_b32:$a, (S_MOV_B32 (i32 0x00010203))) 2630>; 2631 2632// FIXME: This should have been narrowed to i32 during legalization. 2633// This pattern should also be skipped for GlobalISel 2634def : GCNPat < 2635 (i64 (bswap i64:$a)), 2636 (REG_SEQUENCE VReg_64, 2637 (V_PERM_B32_e64 (i32 0), (EXTRACT_SUBREG VReg_64:$a, sub1), 2638 (S_MOV_B32 (i32 0x00010203))), 2639 sub0, 2640 (V_PERM_B32_e64 (i32 0), (EXTRACT_SUBREG VReg_64:$a, sub0), 2641 (S_MOV_B32 (i32 0x00010203))), 2642 sub1) 2643>; 2644 2645// Magic number: 1 | (0 << 8) | (12 << 16) | (12 << 24) 2646// The 12s emit 0s. 2647def : GCNPat < 2648 (i16 (bswap i16:$a)), 2649 (V_PERM_B32_e64 (i32 0), VSrc_b32:$a, (S_MOV_B32 (i32 0x0c0c0001))) 2650>; 2651 2652def : GCNPat < 2653 (i32 (zext (bswap i16:$a))), 2654 (V_PERM_B32_e64 (i32 0), VSrc_b32:$a, (S_MOV_B32 (i32 0x0c0c0001))) 2655>; 2656 2657// Magic number: 1 | (0 << 8) | (3 << 16) | (2 << 24) 2658def : GCNPat < 2659 (v2i16 (bswap v2i16:$a)), 2660 (V_PERM_B32_e64 (i32 0), VSrc_b32:$a, (S_MOV_B32 (i32 0x02030001))) 2661>; 2662 2663} 2664 2665def : GCNPat< 2666 (i64 (DivergentUnaryFrag<bitreverse> i64:$a)), 2667 (REG_SEQUENCE VReg_64, 2668 (V_BFREV_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$a, sub1))), sub0, 2669 (V_BFREV_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$a, sub0))), sub1)>; 2670 2671// Prefer selecting to max when legal, but using mul is always valid. 
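//
// The mul forms below rely on a multiply by 1.0 (or -1.0 for the fneg
// variants) quieting signaling NaNs and applying the current denormal mode,
// which is what fcanonicalize requires in any FP environment.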
let AddedComplexity = -5 in {

let OtherPredicates = [NotHasTrue16BitInsts] in {
def : GCNPat<
  (fcanonicalize (f16 (VOP3Mods f16:$src, i32:$src_mods))),
  (V_MUL_F16_e64 0, (i32 CONST.FP16_ONE), $src_mods, $src)
>;

def : GCNPat<
  (fcanonicalize (f16 (fneg (VOP3Mods f16:$src, i32:$src_mods)))),
  (V_MUL_F16_e64 0, (i32 CONST.FP16_NEG_ONE), $src_mods, $src)
>;
} // End OtherPredicates

let OtherPredicates = [HasTrue16BitInsts] in {
def : GCNPat<
  (fcanonicalize (f16 (VOP3Mods f16:$src, i32:$src_mods))),
  (V_MUL_F16_t16_e64 0, (i32 CONST.FP16_ONE), $src_mods, $src)
>;

def : GCNPat<
  (fcanonicalize (f16 (fneg (VOP3Mods f16:$src, i32:$src_mods)))),
  (V_MUL_F16_t16_e64 0, (i32 CONST.FP16_NEG_ONE), $src_mods, $src)
>;
} // End OtherPredicates

def : GCNPat<
  (fcanonicalize (v2f16 (VOP3PMods v2f16:$src, i32:$src_mods))),
  (V_PK_MUL_F16 0, (i32 CONST.FP16_ONE), $src_mods, $src, DSTCLAMP.NONE)
>;

def : GCNPat<
  (fcanonicalize (f32 (VOP3Mods f32:$src, i32:$src_mods))),
  (V_MUL_F32_e64 0, (i32 CONST.FP32_ONE), $src_mods, $src)
>;

def : GCNPat<
  (fcanonicalize (f32 (fneg (VOP3Mods f32:$src, i32:$src_mods)))),
  (V_MUL_F32_e64 0, (i32 CONST.FP32_NEG_ONE), $src_mods, $src)
>;

// TODO: Handle fneg like other types.
def : GCNPat<
  (fcanonicalize (f64 (VOP3Mods f64:$src, i32:$src_mods))),
  (V_MUL_F64_e64 0, CONST.FP64_ONE, $src_mods, $src)
>;
} // End AddedComplexity = -5

multiclass SelectCanonicalizeAsMax<
  list<Predicate> f32_preds = [],
  list<Predicate> f64_preds = [],
  list<Predicate> f16_preds = []> {
  def : GCNPat<
    (fcanonicalize (f32 (VOP3Mods f32:$src, i32:$src_mods))),
    (V_MAX_F32_e64 $src_mods, $src, $src_mods, $src)> {
    let OtherPredicates = f32_preds;
  }

  def : GCNPat<
    (fcanonicalize (f64 (VOP3Mods f64:$src, i32:$src_mods))),
    (V_MAX_F64_e64 $src_mods, $src, $src_mods, $src)> {
    let OtherPredicates = f64_preds;
  }

  def : GCNPat<
    (fcanonicalize (f16 (VOP3Mods f16:$src, i32:$src_mods))),
    (V_MAX_F16_e64 $src_mods, $src, $src_mods, $src, 0, 0)> {
    let OtherPredicates = !listconcat(f16_preds, [Has16BitInsts, NotHasTrue16BitInsts]);
  }

  def : GCNPat<
    (fcanonicalize (f16 (VOP3Mods f16:$src, i32:$src_mods))),
    (V_MAX_F16_t16_e64 $src_mods, $src, $src_mods, $src, 0, 0)> {
    let OtherPredicates = !listconcat(f16_preds, [Has16BitInsts, HasTrue16BitInsts]);
  }

  def : GCNPat<
    (fcanonicalize (v2f16 (VOP3PMods v2f16:$src, i32:$src_mods))),
    (V_PK_MAX_F16 $src_mods, $src, $src_mods, $src, DSTCLAMP.NONE)> {
    // FIXME: Should have VOP3P subtarget predicate
    let OtherPredicates = f16_preds;
  }
}

// On pre-gfx9 targets, v_max_*/v_min_* did not respect the denormal
// mode, and would never flush. For f64, it's faster to implement
// this with a max. For f16/f32 it's a wash, but prefer max when
// valid.
//
// FIXME: Lowering f32/f16 with max is worse since we can use a
// smaller encoding if the input is fneg'd. It also adds an extra
// register use.
let SubtargetPredicate = HasMinMaxDenormModes in {
  defm : SelectCanonicalizeAsMax<[], [], []>;
} // End SubtargetPredicate = HasMinMaxDenormModes

let SubtargetPredicate = NotHasMinMaxDenormModes in {
  // Use the max lowering if we don't need to flush.

  // FIXME: We don't use this for f32 as a workaround for the
  // library being compiled with the default ieee mode, but
  // potentially being called from flushing kernels. Really we should
  // not be mixing code expecting different default FP modes, but mul
  // works in any FP environment.
  defm : SelectCanonicalizeAsMax<[FalsePredicate], [FP64Denormals], [FP16Denormals]>;
} // End SubtargetPredicate = NotHasMinMaxDenormModes


let OtherPredicates = [HasDLInsts] in {
// Don't allow source modifiers. If there are any source modifiers then it's
// better to select fma instead of fmac.
def : GCNPat <
  (fma (f32 (VOP3NoMods f32:$src0)),
       (f32 (VOP3NoMods f32:$src1)),
       (f32 (VOP3NoMods f32:$src2))),
  (V_FMAC_F32_e64 SRCMODS.NONE, $src0, SRCMODS.NONE, $src1,
                  SRCMODS.NONE, $src2)
>;
} // End OtherPredicates = [HasDLInsts]

let SubtargetPredicate = isGFX10Plus in {
// Don't allow source modifiers. If there are any source modifiers then it's
// better to select fma instead of fmac.
let OtherPredicates = [NotHasTrue16BitInsts] in
def : GCNPat <
  (fma (f16 (VOP3NoMods f32:$src0)),
       (f16 (VOP3NoMods f32:$src1)),
       (f16 (VOP3NoMods f32:$src2))),
  (V_FMAC_F16_e64 SRCMODS.NONE, $src0, SRCMODS.NONE, $src1,
                  SRCMODS.NONE, $src2)
>;
let OtherPredicates = [HasTrue16BitInsts] in
def : GCNPat <
  (fma (f16 (VOP3NoMods f32:$src0)),
       (f16 (VOP3NoMods f32:$src1)),
       (f16 (VOP3NoMods f32:$src2))),
  (V_FMAC_F16_t16_e64 SRCMODS.NONE, $src0, SRCMODS.NONE, $src1,
                      SRCMODS.NONE, $src2)
>;
}

let OtherPredicates = [HasFmacF64Inst] in
// Don't allow source modifiers. If there are any source modifiers then it's
// better to select fma instead of fmac.
def : GCNPat <
  (fma (f64 (VOP3NoMods f64:$src0)),
       (f64 (VOP3NoMods f64:$src1)),
       (f64 (VOP3NoMods f64:$src2))),
  (V_FMAC_F64_e64 SRCMODS.NONE, $src0, SRCMODS.NONE, $src1,
                  SRCMODS.NONE, $src2)
>;

// COPY is a workaround for a tablegen bug with instructions that have
// multiple outputs, here S_LSHL_B32's extra implicit SCC def.
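//
// In a 32-bit register, element 0 of a v2i16/v2f16 value occupies bits [15:0]
// and element 1 occupies bits [31:16], so (build_vector 0, x) is x shifted
// left by 16 and (build_vector x, 0) is a mask with 0xffff.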
2826let AddedComplexity = 1 in { 2827def : GCNPat < 2828 (v2i16 (UniformBinFrag<build_vector> (i16 0), (i16 SReg_32:$src1))), 2829 (S_LSHL_B32 SReg_32:$src1, (i16 16)) 2830>; 2831 2832def : GCNPat < 2833 (v2i16 (DivergentBinFrag<build_vector> (i16 0), (i16 VGPR_32:$src1))), 2834 (v2i16 (V_LSHLREV_B32_e64 (i16 16), VGPR_32:$src1)) 2835>; 2836 2837 2838def : GCNPat < 2839 (v2i16 (UniformBinFrag<build_vector> (i16 SReg_32:$src1), (i16 0))), 2840 (S_AND_B32 (S_MOV_B32 (i32 0xffff)), SReg_32:$src1) 2841>; 2842 2843def : GCNPat < 2844 (v2i16 (DivergentBinFrag<build_vector> (i16 VGPR_32:$src1), (i16 0))), 2845 (v2i16 (V_AND_B32_e64 (i32 (V_MOV_B32_e32 (i32 0xffff))), VGPR_32:$src1)) 2846>; 2847 2848def : GCNPat < 2849 (v2f16 (UniformBinFrag<build_vector> (f16 SReg_32:$src1), (f16 FP_ZERO))), 2850 (S_AND_B32 (S_MOV_B32 (i32 0xffff)), SReg_32:$src1) 2851>; 2852 2853def : GCNPat < 2854 (v2f16 (DivergentBinFrag<build_vector> (f16 VGPR_32:$src1), (f16 FP_ZERO))), 2855 (v2f16 (V_AND_B32_e64 (i32 (V_MOV_B32_e32 (i32 0xffff))), VGPR_32:$src1)) 2856>; 2857 2858def : GCNPat < 2859 (v2i16 (UniformBinFrag<build_vector> (i16 SReg_32:$src0), (i16 undef))), 2860 (COPY_TO_REGCLASS SReg_32:$src0, SReg_32) 2861>; 2862 2863def : GCNPat < 2864 (v2i16 (DivergentBinFrag<build_vector> (i16 VGPR_32:$src0), (i16 undef))), 2865 (COPY_TO_REGCLASS VGPR_32:$src0, VGPR_32) 2866>; 2867 2868def : GCNPat < 2869 (v2f16 (build_vector f16:$src0, (f16 undef))), 2870 (COPY $src0) 2871>; 2872 2873def : GCNPat < 2874 (v2i16 (UniformBinFrag<build_vector> (i16 undef), (i16 SReg_32:$src1))), 2875 (S_LSHL_B32 SReg_32:$src1, (i32 16)) 2876>; 2877 2878def : GCNPat < 2879 (v2i16 (DivergentBinFrag<build_vector> (i16 undef), (i16 VGPR_32:$src1))), 2880 (v2i16 (V_LSHLREV_B32_e64 (i32 16), VGPR_32:$src1)) 2881>; 2882 2883 2884def : GCNPat < 2885 (v2f16 (UniformBinFrag<build_vector> (f16 undef), (f16 SReg_32:$src1))), 2886 (S_LSHL_B32 SReg_32:$src1, (i32 16)) 2887>; 2888 2889def : GCNPat < 2890 (v2f16 (DivergentBinFrag<build_vector> (f16 undef), (f16 VGPR_32:$src1))), 2891 (v2f16 (V_LSHLREV_B32_e64 (i32 16), VGPR_32:$src1)) 2892>; 2893} 2894 2895let SubtargetPredicate = HasVOP3PInsts in { 2896def : GCNPat < 2897 (v2i16 (UniformBinFrag<build_vector> (i16 SReg_32:$src0), (i16 SReg_32:$src1))), 2898 (S_PACK_LL_B32_B16 SReg_32:$src0, SReg_32:$src1) 2899>; 2900 2901def : GCNPat < 2902 (v2i16 (DivergentBinFrag<build_vector> (i16 VGPR_32:$src0), (i16 VGPR_32:$src1))), 2903 (v2i16 (V_LSHL_OR_B32_e64 $src1, (i32 16), (i32 (V_AND_B32_e64 (i32 (V_MOV_B32_e32 (i32 0xffff))), $src0)))) 2904>; 2905 2906// With multiple uses of the shift, this will duplicate the shift and 2907// increase register pressure. 
2908def : GCNPat < 2909 (v2i16 (UniformBinFrag<build_vector> (i16 SReg_32:$src0), (i16 (trunc (srl_oneuse SReg_32:$src1, (i32 16)))))), 2910 (v2i16 (S_PACK_LH_B32_B16 SReg_32:$src0, SReg_32:$src1)) 2911>; 2912 2913def : GCNPat < 2914 (v2i16 (UniformBinFrag<build_vector> (i16 (trunc (srl_oneuse SReg_32:$src0, (i32 16)))), 2915 (i16 (trunc (srl_oneuse SReg_32:$src1, (i32 16)))))), 2916 (S_PACK_HH_B32_B16 SReg_32:$src0, SReg_32:$src1) 2917>; 2918 2919def : GCNPat < 2920 (v2f16 (UniformBinFrag<build_vector> (f16 SReg_32:$src0), (f16 SReg_32:$src1))), 2921 (S_PACK_LL_B32_B16 SReg_32:$src0, SReg_32:$src1) 2922>; 2923 2924 2925 2926foreach Ty = [i16, f16] in { 2927 2928defvar vecTy = !if(!eq(Ty, i16), v2i16, v2f16); 2929defvar immzeroTy = !if(!eq(Ty, i16), immzero, fpimmzero); 2930 2931// Take the lower 16 bits from each VGPR_32 and concat them 2932def : GCNPat < 2933 (vecTy (DivergentBinFrag<build_vector> (Ty VGPR_32:$a), (Ty VGPR_32:$b))), 2934 (V_PERM_B32_e64 VGPR_32:$b, VGPR_32:$a, (S_MOV_B32 (i32 0x05040100))) 2935>; 2936 2937 2938// Take the lower 16 bits from V[0] and the upper 16 bits from V[1] 2939// Special case, can use V_BFI (0xffff literal likely more reusable than 0x70601000) 2940def : GCNPat < 2941 (vecTy (DivergentBinFrag<build_vector> (Ty (immzeroTy)), 2942 (Ty !if(!eq(Ty, i16), 2943 (Ty (trunc (srl VGPR_32:$b, (i32 16)))), 2944 (Ty (bitconvert (i16 (trunc (srl VGPR_32:$b, (i32 16)))))))))), 2945 (V_AND_B32_e64 (S_MOV_B32 (i32 0xffff0000)), VGPR_32:$b) 2946>; 2947 2948 2949// Take the lower 16 bits from V[0] and the upper 16 bits from V[1] 2950// Special case, can use V_BFI (0xffff literal likely more reusable than 0x70601000) 2951def : GCNPat < 2952 (vecTy (DivergentBinFrag<build_vector> (Ty VGPR_32:$a), 2953 (Ty !if(!eq(Ty, i16), 2954 (Ty (trunc (srl VGPR_32:$b, (i32 16)))), 2955 (Ty (bitconvert (i16 (trunc (srl VGPR_32:$b, (i32 16)))))))))), 2956 (V_BFI_B32_e64 (S_MOV_B32 (i32 0x0000ffff)), VGPR_32:$a, VGPR_32:$b) 2957>; 2958 2959 2960// Take the upper 16 bits from V[0] and the lower 16 bits from V[1] 2961// Special case, can use V_ALIGNBIT (always uses encoded literal) 2962def : GCNPat < 2963 (vecTy (DivergentBinFrag<build_vector> 2964 (Ty !if(!eq(Ty, i16), 2965 (Ty (trunc (srl VGPR_32:$a, (i32 16)))), 2966 (Ty (bitconvert (i16 (trunc (srl VGPR_32:$a, (i32 16)))))))), 2967 (Ty VGPR_32:$b))), 2968 (V_ALIGNBIT_B32_e64 VGPR_32:$b, VGPR_32:$a, (i32 16)) 2969>; 2970 2971// Take the upper 16 bits from each VGPR_32 and concat them 2972def : GCNPat < 2973 (vecTy (DivergentBinFrag<build_vector> 2974 (Ty !if(!eq(Ty, i16), 2975 (Ty (trunc (srl VGPR_32:$a, (i32 16)))), 2976 (Ty (bitconvert (i16 (trunc (srl VGPR_32:$a, (i32 16)))))))), 2977 (Ty !if(!eq(Ty, i16), 2978 (Ty (trunc (srl VGPR_32:$b, (i32 16)))), 2979 (Ty (bitconvert (i16 (trunc (srl VGPR_32:$b, (i32 16)))))))))), 2980 (V_PERM_B32_e64 VGPR_32:$b, VGPR_32:$a, (S_MOV_B32 (i32 0x07060302))) 2981>; 2982 2983 2984} // end foreach Ty 2985 2986 2987let AddedComplexity = 5 in { 2988def : GCNPat < 2989 (v2f16 (is_canonicalized<build_vector> (f16 (VOP3Mods (f16 VGPR_32:$src0), i32:$src0_mods)), 2990 (f16 (VOP3Mods (f16 VGPR_32:$src1), i32:$src1_mods)))), 2991 (V_PACK_B32_F16_e64 $src0_mods, VGPR_32:$src0, $src1_mods, VGPR_32:$src1) 2992>; 2993} 2994} // End SubtargetPredicate = HasVOP3PInsts 2995 2996// With multiple uses of the shift, this will duplicate the shift and 2997// increase register pressure. 
2998let SubtargetPredicate = isGFX11Plus in 2999def : GCNPat < 3000 (v2i16 (build_vector (i16 (trunc (srl_oneuse SReg_32:$src0, (i32 16)))), (i16 SReg_32:$src1))), 3001 (v2i16 (S_PACK_HL_B32_B16 SReg_32:$src0, SReg_32:$src1)) 3002>; 3003 3004 3005def : GCNPat < 3006 (v2f16 (scalar_to_vector f16:$src0)), 3007 (COPY $src0) 3008>; 3009 3010def : GCNPat < 3011 (v2i16 (scalar_to_vector i16:$src0)), 3012 (COPY $src0) 3013>; 3014 3015def : GCNPat < 3016 (v4i16 (scalar_to_vector i16:$src0)), 3017 (INSERT_SUBREG (IMPLICIT_DEF), $src0, sub0) 3018>; 3019 3020def : GCNPat < 3021 (v4f16 (scalar_to_vector f16:$src0)), 3022 (INSERT_SUBREG (IMPLICIT_DEF), $src0, sub0) 3023>; 3024 3025def : GCNPat < 3026 (i64 (int_amdgcn_mov_dpp i64:$src, timm:$dpp_ctrl, timm:$row_mask, 3027 timm:$bank_mask, timm:$bound_ctrl)), 3028 (V_MOV_B64_DPP_PSEUDO VReg_64_Align2:$src, VReg_64_Align2:$src, 3029 (as_i32timm $dpp_ctrl), (as_i32timm $row_mask), 3030 (as_i32timm $bank_mask), 3031 (as_i1timm $bound_ctrl)) 3032>; 3033 3034def : GCNPat < 3035 (i64 (int_amdgcn_update_dpp i64:$old, i64:$src, timm:$dpp_ctrl, timm:$row_mask, 3036 timm:$bank_mask, timm:$bound_ctrl)), 3037 (V_MOV_B64_DPP_PSEUDO VReg_64_Align2:$old, VReg_64_Align2:$src, (as_i32timm $dpp_ctrl), 3038 (as_i32timm $row_mask), (as_i32timm $bank_mask), 3039 (as_i1timm $bound_ctrl)) 3040>; 3041 3042//===----------------------------------------------------------------------===// 3043// Fract Patterns 3044//===----------------------------------------------------------------------===// 3045 3046let SubtargetPredicate = isGFX6 in { 3047 3048// V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) is 3049// used instead. However, SI doesn't have V_FLOOR_F64, so the most efficient 3050// way to implement it is using V_FRACT_F64. 3051// The workaround for the V_FRACT bug is: 3052// fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999) 3053 3054// Convert floor(x) to (x - fract(x)) 3055 3056// Don't bother handling this for GlobalISel, it's handled during 3057// lowering. 3058// 3059// FIXME: DAG should also custom lower this. 3060def : GCNPat < 3061 (f64 (ffloor (f64 (VOP3Mods f64:$x, i32:$mods)))), 3062 (V_ADD_F64_e64 3063 $mods, 3064 $x, 3065 SRCMODS.NEG, 3066 (V_CNDMASK_B64_PSEUDO 3067 (V_MIN_F64_e64 3068 SRCMODS.NONE, 3069 (V_FRACT_F64_e64 $mods, $x), 3070 SRCMODS.NONE, 3071 (V_MOV_B64_PSEUDO 0x3fefffffffffffff)), 3072 $x, 3073 (V_CMP_CLASS_F64_e64 SRCMODS.NONE, $x, (i32 3 /*NaN*/)))) 3074>; 3075 3076} // End SubtargetPredicates = isGFX6 3077 3078//============================================================================// 3079// Miscellaneous Optimization Patterns 3080//============================================================================// 3081 3082// Undo sub x, c -> add x, -c canonicalization since c is more likely 3083// an inline immediate than -c. 3084// TODO: Also do for 64-bit. 3085def : GCNPat< 3086 (UniformBinFrag<add> i32:$src0, (i32 NegSubInlineConst32:$src1)), 3087 (S_SUB_I32 SReg_32:$src0, NegSubInlineConst32:$src1) 3088>; 3089 3090def : GCNPat< 3091 (DivergentBinFrag<add> i32:$src0, (i32 NegSubInlineConst32:$src1)), 3092 (V_SUB_U32_e64 VS_32:$src0, NegSubInlineConst32:$src1)> { 3093 let SubtargetPredicate = HasAddNoCarryInsts; 3094} 3095 3096def : GCNPat< 3097 (DivergentBinFrag<add> i32:$src0, (i32 NegSubInlineConst32:$src1)), 3098 (V_SUB_CO_U32_e64 VS_32:$src0, NegSubInlineConst32:$src1)> { 3099 let SubtargetPredicate = NotHasAddNoCarryInsts; 3100} 3101 3102 3103// Avoid pointlessly materializing a constant in VGPR. 
3104// FIXME: Should also do this for readlane, but tablegen crashes on 3105// the ignored src1. 3106def : GCNPat< 3107 (int_amdgcn_readfirstlane (i32 imm:$src)), 3108 (S_MOV_B32 SReg_32:$src) 3109>; 3110 3111multiclass BFMPatterns <ValueType vt, PatFrag SHL, PatFrag ADD, InstSI BFM> { 3112 def : GCNPat < 3113 (vt (SHL (vt (add (vt (shl 1, vt:$a)), -1)), vt:$b)), 3114 (BFM $a, $b) 3115 >; 3116 3117 def : GCNPat < 3118 (vt (ADD (vt (shl 1, vt:$a)), -1)), 3119 (BFM $a, (i32 0)) 3120 >; 3121} 3122 3123defm : BFMPatterns <i32, UniformBinFrag<shl>, UniformBinFrag<add>, S_BFM_B32>; 3124// FIXME: defm : BFMPatterns <i64, UniformBinFrag<shl>, UniformBinFrag<add>, S_BFM_B64>; 3125defm : BFMPatterns <i32, DivergentBinFrag<shl>, DivergentBinFrag<add>, V_BFM_B32_e64>; 3126 3127// Bitfield extract patterns 3128 3129def IMMZeroBasedBitfieldMask : ImmLeaf <i32, [{ 3130 return isMask_32(Imm); 3131}]>; 3132 3133def IMMPopCount : SDNodeXForm<imm, [{ 3134 return CurDAG->getTargetConstant(llvm::popcount(N->getZExtValue()), SDLoc(N), 3135 MVT::i32); 3136}]>; 3137 3138def : AMDGPUPat < 3139 (DivergentBinFrag<and> (i32 (srl i32:$src, i32:$rshift)), 3140 IMMZeroBasedBitfieldMask:$mask), 3141 (V_BFE_U32_e64 $src, $rshift, (i32 (IMMPopCount $mask))) 3142>; 3143 3144// x & ((1 << y) - 1) 3145def : AMDGPUPat < 3146 (DivergentBinFrag<and> i32:$src, (add_oneuse (shl_oneuse 1, i32:$width), -1)), 3147 (V_BFE_U32_e64 $src, (i32 0), $width) 3148>; 3149 3150// x & ~(-1 << y) 3151def : AMDGPUPat < 3152 (DivergentBinFrag<and> i32:$src, 3153 (xor_oneuse (shl_oneuse -1, i32:$width), -1)), 3154 (V_BFE_U32_e64 $src, (i32 0), $width) 3155>; 3156 3157// x & (-1 >> (bitwidth - y)) 3158def : AMDGPUPat < 3159 (DivergentBinFrag<and> i32:$src, (srl_oneuse -1, (sub 32, i32:$width))), 3160 (V_BFE_U32_e64 $src, (i32 0), $width) 3161>; 3162 3163// x << (bitwidth - y) >> (bitwidth - y) 3164def : AMDGPUPat < 3165 (DivergentBinFrag<srl> (shl_oneuse i32:$src, (sub 32, i32:$width)), 3166 (sub 32, i32:$width)), 3167 (V_BFE_U32_e64 $src, (i32 0), $width) 3168>; 3169 3170def : AMDGPUPat < 3171 (DivergentBinFrag<sra> (shl_oneuse i32:$src, (sub 32, i32:$width)), 3172 (sub 32, i32:$width)), 3173 (V_BFE_I32_e64 $src, (i32 0), $width) 3174>; 3175 3176// SHA-256 Ma patterns 3177 3178// ((x & z) | (y & (x | z))) -> BFI (XOR x, y), z, y 3179def : AMDGPUPatIgnoreCopies < 3180 (DivergentBinFrag<or> (and i32:$x, i32:$z), 3181 (and i32:$y, (or i32:$x, i32:$z))), 3182 (V_BFI_B32_e64 (V_XOR_B32_e64 (COPY_TO_REGCLASS VSrc_b32:$x, VGPR_32), 3183 (COPY_TO_REGCLASS VSrc_b32:$y, VGPR_32)), 3184 (COPY_TO_REGCLASS VSrc_b32:$z, VGPR_32), 3185 (COPY_TO_REGCLASS VSrc_b32:$y, VGPR_32)) 3186>; 3187 3188def : AMDGPUPatIgnoreCopies < 3189 (DivergentBinFrag<or> (and i64:$x, i64:$z), 3190 (and i64:$y, (or i64:$x, i64:$z))), 3191 (REG_SEQUENCE VReg_64, 3192 (V_BFI_B32_e64 (V_XOR_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$x, sub0)), 3193 (i32 (EXTRACT_SUBREG VReg_64:$y, sub0))), 3194 (i32 (EXTRACT_SUBREG VReg_64:$z, sub0)), 3195 (i32 (EXTRACT_SUBREG VReg_64:$y, sub0))), sub0, 3196 (V_BFI_B32_e64 (V_XOR_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$x, sub1)), 3197 (i32 (EXTRACT_SUBREG VReg_64:$y, sub1))), 3198 (i32 (EXTRACT_SUBREG VReg_64:$z, sub1)), 3199 (i32 (EXTRACT_SUBREG VReg_64:$y, sub1))), sub1) 3200>; 3201 3202multiclass IntMed3Pat<Instruction med3Inst, 3203 SDPatternOperator min, 3204 SDPatternOperator max> { 3205 3206 // This matches 16 permutations of 3207 // min(max(a, b), max(min(a, b), c)) 3208 def : AMDGPUPat < 3209 (min (max i32:$src0, i32:$src1), 3210 (max (min i32:$src0, 
i32:$src1), i32:$src2)), 3211 (med3Inst VSrc_b32:$src0, VSrc_b32:$src1, VSrc_b32:$src2) 3212>; 3213 3214 // This matches 16 permutations of 3215 // max(min(x, y), min(max(x, y), z)) 3216 def : AMDGPUPat < 3217 (max (min i32:$src0, i32:$src1), 3218 (min (max i32:$src0, i32:$src1), i32:$src2)), 3219 (med3Inst VSrc_b32:$src0, VSrc_b32:$src1, VSrc_b32:$src2) 3220>; 3221} 3222 3223defm : IntMed3Pat<V_MED3_I32_e64, smin, smax>; 3224defm : IntMed3Pat<V_MED3_U32_e64, umin, umax>; 3225 3226multiclass FPMed3Pat<ValueType vt, 3227 Instruction med3Inst> { 3228 // This matches 16 permutations of max(min(x, y), min(max(x, y), z)) 3229 def : GCNPat< 3230 (fmaxnum_like_nnan 3231 (fminnum_like (VOP3Mods vt:$src0, i32:$src0_mods), 3232 (VOP3Mods vt:$src1, i32:$src1_mods)), 3233 (fminnum_like (fmaxnum_like (VOP3Mods vt:$src0, i32:$src0_mods), 3234 (VOP3Mods vt:$src1, i32:$src1_mods)), 3235 (vt (VOP3Mods vt:$src2, i32:$src2_mods)))), 3236 (med3Inst $src0_mods, $src0, $src1_mods, $src1, $src2_mods, $src2, 3237 DSTCLAMP.NONE, DSTOMOD.NONE)>; 3238 3239 3240 // This matches 16 permutations of min(max(x, y), max(min(x, y), z)) 3241 def : GCNPat< 3242 (fminnum_like_nnan 3243 (fmaxnum_like (VOP3Mods vt:$src0, i32:$src0_mods), 3244 (VOP3Mods vt:$src1, i32:$src1_mods)), 3245 (fmaxnum_like (fminnum_like (VOP3Mods vt:$src0, i32:$src0_mods), 3246 (VOP3Mods vt:$src1, i32:$src1_mods)), 3247 (vt (VOP3Mods vt:$src2, i32:$src2_mods)))), 3248 (med3Inst $src0_mods, $src0, $src1_mods, $src1, $src2_mods, $src2, 3249 DSTCLAMP.NONE, DSTOMOD.NONE)>; 3250} 3251 3252class FP16Med3Pat<ValueType vt, 3253 Instruction med3Inst> : GCNPat< 3254 (fmaxnum_like_nnan (fminnum_like (VOP3Mods vt:$src0, i32:$src0_mods), 3255 (VOP3Mods vt:$src1, i32:$src1_mods)), 3256 (fminnum_like (fmaxnum_like (VOP3Mods vt:$src0, i32:$src0_mods), 3257 (VOP3Mods vt:$src1, i32:$src1_mods)), 3258 (vt (VOP3Mods vt:$src2, i32:$src2_mods)))), 3259 (med3Inst $src0_mods, $src0, $src1_mods, $src1, $src2_mods, $src2, DSTCLAMP.NONE) 3260>; 3261 3262multiclass Int16Med3Pat<Instruction med3Inst, 3263 SDPatternOperator min, 3264 SDPatternOperator max> { 3265 // This matches 16 permutations of 3266 // max(min(x, y), min(max(x, y), z)) 3267 def : GCNPat < 3268 (max (min i16:$src0, i16:$src1), 3269 (min (max i16:$src0, i16:$src1), i16:$src2)), 3270 (med3Inst SRCMODS.NONE, VSrc_b16:$src0, SRCMODS.NONE, VSrc_b16:$src1, SRCMODS.NONE, VSrc_b16:$src2, DSTCLAMP.NONE) 3271>; 3272 3273 // This matches 16 permutations of 3274 // min(max(a, b), max(min(a, b), c)) 3275 def : GCNPat < 3276 (min (max i16:$src0, i16:$src1), 3277 (max (min i16:$src0, i16:$src1), i16:$src2)), 3278 (med3Inst SRCMODS.NONE, VSrc_b16:$src0, SRCMODS.NONE, VSrc_b16:$src1, SRCMODS.NONE, VSrc_b16:$src2, DSTCLAMP.NONE) 3279>; 3280} 3281 3282defm : FPMed3Pat<f32, V_MED3_F32_e64>; 3283 3284class 3285IntMinMaxPat<Instruction minmaxInst, SDPatternOperator min_or_max, 3286 SDPatternOperator max_or_min_oneuse> : AMDGPUPat < 3287 (DivergentBinFrag<min_or_max> (max_or_min_oneuse i32:$src0, i32:$src1), 3288 i32:$src2), 3289 (minmaxInst VSrc_b32:$src0, VSrc_b32:$src1, VSrc_b32:$src2) 3290>; 3291 3292class 3293FPMinMaxPat<Instruction minmaxInst, ValueType vt, SDPatternOperator min_or_max, 3294 SDPatternOperator max_or_min_oneuse> : GCNPat < 3295 (min_or_max (max_or_min_oneuse (VOP3Mods vt:$src0, i32:$src0_mods), 3296 (VOP3Mods vt:$src1, i32:$src1_mods)), 3297 (vt (VOP3Mods vt:$src2, i32:$src2_mods))), 3298 (minmaxInst $src0_mods, $src0, $src1_mods, $src1, $src2_mods, $src2, 3299 DSTCLAMP.NONE, DSTOMOD.NONE) 3300>; 3301 3302let 
let OtherPredicates = [isGFX11Plus] in {
def : IntMinMaxPat<V_MAXMIN_I32_e64, smin, smax_oneuse>;
def : IntMinMaxPat<V_MINMAX_I32_e64, smax, smin_oneuse>;
def : IntMinMaxPat<V_MAXMIN_U32_e64, umin, umax_oneuse>;
def : IntMinMaxPat<V_MINMAX_U32_e64, umax, umin_oneuse>;
def : FPMinMaxPat<V_MINMAX_F32_e64, f32, fmaxnum_like, fminnum_like_oneuse>;
def : FPMinMaxPat<V_MAXMIN_F32_e64, f32, fminnum_like, fmaxnum_like_oneuse>;
def : FPMinMaxPat<V_MINMAX_F16_e64, f16, fmaxnum_like, fminnum_like_oneuse>;
def : FPMinMaxPat<V_MAXMIN_F16_e64, f16, fminnum_like, fmaxnum_like_oneuse>;
}

let OtherPredicates = [isGFX9Plus] in {
def : FP16Med3Pat<f16, V_MED3_F16_e64>;
defm : Int16Med3Pat<V_MED3_I16_e64, smin, smax>;
defm : Int16Med3Pat<V_MED3_U16_e64, umin, umax>;
} // End OtherPredicates = [isGFX9Plus]

class AMDGPUGenericInstruction : GenericInstruction {
  let Namespace = "AMDGPU";
}

// Convert a wave address to a swizzled vector address (i.e. this is
// for copying the stack pointer to a vector address appropriate to
// use in the offset field of mubuf instructions).
def G_AMDGPU_WAVE_ADDRESS : AMDGPUGenericInstruction {
  let OutOperandList = (outs type0:$dst);
  let InOperandList = (ins type0:$src);
  let hasSideEffects = 0;
}

// Returns -1 if the input is zero.
def G_AMDGPU_FFBH_U32 : AMDGPUGenericInstruction {
  let OutOperandList = (outs type0:$dst);
  let InOperandList = (ins type1:$src);
  let hasSideEffects = 0;
}

// Returns -1 if the input is zero.
def G_AMDGPU_FFBL_B32 : AMDGPUGenericInstruction {
  let OutOperandList = (outs type0:$dst);
  let InOperandList = (ins type1:$src);
  let hasSideEffects = 0;
}

def G_AMDGPU_RCP_IFLAG : AMDGPUGenericInstruction {
  let OutOperandList = (outs type0:$dst);
  let InOperandList = (ins type1:$src);
  let hasSideEffects = 0;
}

class BufferLoadGenericInstruction : AMDGPUGenericInstruction {
  let OutOperandList = (outs type0:$dst);
  let InOperandList = (ins type1:$rsrc, type2:$vindex, type2:$voffset,
                           type2:$soffset, untyped_imm_0:$offset,
                           untyped_imm_0:$cachepolicy, untyped_imm_0:$idxen);
  let hasSideEffects = 0;
  let mayLoad = 1;
}

class TBufferLoadGenericInstruction : AMDGPUGenericInstruction {
  let OutOperandList = (outs type0:$dst);
  let InOperandList = (ins type1:$rsrc, type2:$vindex, type2:$voffset,
                           type2:$soffset, untyped_imm_0:$offset, untyped_imm_0:$format,
                           untyped_imm_0:$cachepolicy, untyped_imm_0:$idxen);
  let hasSideEffects = 0;
  let mayLoad = 1;
}

def G_AMDGPU_BUFFER_LOAD_UBYTE : BufferLoadGenericInstruction;
def G_AMDGPU_BUFFER_LOAD_SBYTE : BufferLoadGenericInstruction;
def G_AMDGPU_BUFFER_LOAD_USHORT : BufferLoadGenericInstruction;
def G_AMDGPU_BUFFER_LOAD_SSHORT : BufferLoadGenericInstruction;
def G_AMDGPU_BUFFER_LOAD : BufferLoadGenericInstruction;
def G_AMDGPU_BUFFER_LOAD_FORMAT : BufferLoadGenericInstruction;
def G_AMDGPU_BUFFER_LOAD_FORMAT_TFE : BufferLoadGenericInstruction;
def G_AMDGPU_BUFFER_LOAD_FORMAT_D16 : BufferLoadGenericInstruction;
def G_AMDGPU_TBUFFER_LOAD_FORMAT : TBufferLoadGenericInstruction;
def G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 : TBufferLoadGenericInstruction;
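
// Operand conventions for the generic buffer instructions above and below:
// $rsrc is the 128-bit buffer resource descriptor, $vindex is only meaningful
// when the $idxen immediate is set, $voffset/$soffset/$offset are the usual
// MUBUF address components, $format selects the data format for the typed
// (TBUFFER) variants, and $cachepolicy carries the intrinsic's aux bits,
// which selection later maps onto the target's cache-control fields.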
class BufferStoreGenericInstruction : AMDGPUGenericInstruction {
  let OutOperandList = (outs);
  let InOperandList = (ins type0:$vdata, type1:$rsrc, type2:$vindex, type2:$voffset,
                           type2:$soffset, untyped_imm_0:$offset,
                           untyped_imm_0:$cachepolicy, untyped_imm_0:$idxen);
  let hasSideEffects = 0;
  let mayStore = 1;
}

class TBufferStoreGenericInstruction : AMDGPUGenericInstruction {
  let OutOperandList = (outs);
  let InOperandList = (ins type0:$vdata, type1:$rsrc, type2:$vindex, type2:$voffset,
                           type2:$soffset, untyped_imm_0:$offset,
                           untyped_imm_0:$format,
                           untyped_imm_0:$cachepolicy, untyped_imm_0:$idxen);
  let hasSideEffects = 0;
  let mayStore = 1;
}

def G_AMDGPU_BUFFER_STORE : BufferStoreGenericInstruction;
def G_AMDGPU_BUFFER_STORE_BYTE : BufferStoreGenericInstruction;
def G_AMDGPU_BUFFER_STORE_SHORT : BufferStoreGenericInstruction;
def G_AMDGPU_BUFFER_STORE_FORMAT : BufferStoreGenericInstruction;
def G_AMDGPU_BUFFER_STORE_FORMAT_D16 : BufferStoreGenericInstruction;
def G_AMDGPU_TBUFFER_STORE_FORMAT : TBufferStoreGenericInstruction;
def G_AMDGPU_TBUFFER_STORE_FORMAT_D16 : TBufferStoreGenericInstruction;

def G_AMDGPU_FMIN_LEGACY : AMDGPUGenericInstruction {
  let OutOperandList = (outs type0:$dst);
  let InOperandList = (ins type0:$src0, type0:$src1);
  let hasSideEffects = 0;
}

def G_AMDGPU_FMAX_LEGACY : AMDGPUGenericInstruction {
  let OutOperandList = (outs type0:$dst);
  let InOperandList = (ins type0:$src0, type0:$src1);
  let hasSideEffects = 0;
}

foreach N = 0-3 in {
def G_AMDGPU_CVT_F32_UBYTE#N : AMDGPUGenericInstruction {
  let OutOperandList = (outs type0:$dst);
  let InOperandList = (ins type0:$src0);
  let hasSideEffects = 0;
}
}

def G_AMDGPU_CVT_PK_I16_I32 : AMDGPUGenericInstruction {
  let OutOperandList = (outs type0:$dst);
  let InOperandList = (ins type0:$src0, type0:$src1);
  let hasSideEffects = 0;
}

def G_AMDGPU_SMED3 : AMDGPUGenericInstruction {
  let OutOperandList = (outs type0:$dst);
  let InOperandList = (ins type0:$src0, type0:$src1, type0:$src2);
  let hasSideEffects = 0;
}

def G_AMDGPU_UMED3 : AMDGPUGenericInstruction {
  let OutOperandList = (outs type0:$dst);
  let InOperandList = (ins type0:$src0, type0:$src1, type0:$src2);
  let hasSideEffects = 0;
}

def G_AMDGPU_FMED3 : AMDGPUGenericInstruction {
  let OutOperandList = (outs type0:$dst);
  let InOperandList = (ins type0:$src0, type0:$src1, type0:$src2);
  let hasSideEffects = 0;
}

def G_AMDGPU_CLAMP : AMDGPUGenericInstruction {
  let OutOperandList = (outs type0:$dst);
  let InOperandList = (ins type0:$src);
  let hasSideEffects = 0;
}

// Integer multiply-add: arg0 * arg1 + arg2.
//
// arg0 and arg1 are 32-bit integers (interpreted as signed or unsigned),
// arg2 is a 64-bit integer. Result is a 64-bit integer and a 1-bit carry-out.
class G_AMDGPU_MAD_64_32 : AMDGPUGenericInstruction {
  let OutOperandList = (outs type0:$dst, type1:$carry_out);
  let InOperandList = (ins type2:$arg0, type2:$arg1, type0:$arg2);
  let hasSideEffects = 0;
}

def G_AMDGPU_MAD_U64_U32 : G_AMDGPU_MAD_64_32;
def G_AMDGPU_MAD_I64_I32 : G_AMDGPU_MAD_64_32;
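
// For example, G_AMDGPU_MAD_U64_U32 with arg0 = arg1 = 0xffffffff and arg2 = 0
// yields 0xfffffffe00000001: the full 64-bit product always fits, so the
// carry-out can only be set when adding arg2 overflows the 64-bit result.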

// Atomic cmpxchg. $cmpval and $newval are packed in a single vector
// operand. Expects a MachineMemOperand in addition to explicit
// operands.
def G_AMDGPU_ATOMIC_CMPXCHG : AMDGPUGenericInstruction {
  let OutOperandList = (outs type0:$oldval);
  let InOperandList = (ins ptype1:$addr, type0:$cmpval_newval);
  let hasSideEffects = 0;
  let mayLoad = 1;
  let mayStore = 1;
}

let Namespace = "AMDGPU" in {
def G_AMDGPU_ATOMIC_FMIN : G_ATOMICRMW_OP;
def G_AMDGPU_ATOMIC_FMAX : G_ATOMICRMW_OP;
}

class BufferAtomicGenericInstruction<bit NoRtn = 0> : AMDGPUGenericInstruction {
  let OutOperandList = !if(NoRtn, (outs), (outs type0:$dst));
  let InOperandList = (ins type0:$vdata, type1:$rsrc, type2:$vindex, type2:$voffset,
                           type2:$soffset, untyped_imm_0:$offset,
                           untyped_imm_0:$cachepolicy, untyped_imm_0:$idxen);
  let hasSideEffects = 0;
  let mayLoad = 1;
  let mayStore = 1;
}

def G_AMDGPU_BUFFER_ATOMIC_SWAP : BufferAtomicGenericInstruction;
def G_AMDGPU_BUFFER_ATOMIC_ADD : BufferAtomicGenericInstruction;
def G_AMDGPU_BUFFER_ATOMIC_SUB : BufferAtomicGenericInstruction;
def G_AMDGPU_BUFFER_ATOMIC_SMIN : BufferAtomicGenericInstruction;
def G_AMDGPU_BUFFER_ATOMIC_UMIN : BufferAtomicGenericInstruction;
def G_AMDGPU_BUFFER_ATOMIC_SMAX : BufferAtomicGenericInstruction;
def G_AMDGPU_BUFFER_ATOMIC_UMAX : BufferAtomicGenericInstruction;
def G_AMDGPU_BUFFER_ATOMIC_AND : BufferAtomicGenericInstruction;
def G_AMDGPU_BUFFER_ATOMIC_OR : BufferAtomicGenericInstruction;
def G_AMDGPU_BUFFER_ATOMIC_XOR : BufferAtomicGenericInstruction;
def G_AMDGPU_BUFFER_ATOMIC_INC : BufferAtomicGenericInstruction;
def G_AMDGPU_BUFFER_ATOMIC_DEC : BufferAtomicGenericInstruction;
def G_AMDGPU_BUFFER_ATOMIC_FADD : BufferAtomicGenericInstruction;
def G_AMDGPU_BUFFER_ATOMIC_FMIN : BufferAtomicGenericInstruction;
def G_AMDGPU_BUFFER_ATOMIC_FMAX : BufferAtomicGenericInstruction;

def G_AMDGPU_BUFFER_ATOMIC_CMPSWAP : AMDGPUGenericInstruction {
  let OutOperandList = (outs type0:$dst);
  let InOperandList = (ins type0:$vdata, type0:$cmp, type1:$rsrc, type2:$vindex,
                           type2:$voffset, type2:$soffset, untyped_imm_0:$offset,
                           untyped_imm_0:$cachepolicy, untyped_imm_0:$idxen);
  let hasSideEffects = 0;
  let mayLoad = 1;
  let mayStore = 1;
}

// Wrapper around llvm.amdgcn.s.buffer.load. This is mostly needed as
// a workaround for the intrinsic being defined as readnone, but it
// really needs a memory operand.
def G_AMDGPU_S_BUFFER_LOAD : AMDGPUGenericInstruction {
  let OutOperandList = (outs type0:$dst);
  let InOperandList = (ins type1:$rsrc, type2:$offset, untyped_imm_0:$cachepolicy);
  let hasSideEffects = 0;
  let mayLoad = 1;
  let mayStore = 0;
}

// This is equivalent to the G_INTRINSIC*, but the operands may have
// been legalized depending on the subtarget requirements.
def G_AMDGPU_INTRIN_IMAGE_LOAD : AMDGPUGenericInstruction {
  let OutOperandList = (outs type0:$dst);
  let InOperandList = (ins unknown:$intrin, variable_ops);
  let hasSideEffects = 0;
  let mayLoad = 1;

  // FIXME: Use separate opcode for atomics.
  let mayStore = 1;
}
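
// The _D16 image variants below differ only in that the data components are
// 16 bits wide (packed two per 32-bit register on targets with packed-D16
// support).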
def G_AMDGPU_INTRIN_IMAGE_LOAD_D16 : AMDGPUGenericInstruction {
  let OutOperandList = (outs type0:$dst);
  let InOperandList = (ins unknown:$intrin, variable_ops);
  let hasSideEffects = 0;
  let mayLoad = 1;

  // FIXME: Use separate opcode for atomics.
  let mayStore = 1;
}

// This is equivalent to the G_INTRINSIC*, but the operands may have
// been legalized depending on the subtarget requirements.
def G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPUGenericInstruction {
  let OutOperandList = (outs);
  let InOperandList = (ins unknown:$intrin, variable_ops);
  let hasSideEffects = 0;
  let mayStore = 1;
}

def G_AMDGPU_INTRIN_IMAGE_STORE_D16 : AMDGPUGenericInstruction {
  let OutOperandList = (outs);
  let InOperandList = (ins unknown:$intrin, variable_ops);
  let hasSideEffects = 0;
  let mayStore = 1;
}

def G_AMDGPU_INTRIN_BVH_INTERSECT_RAY : AMDGPUGenericInstruction {
  let OutOperandList = (outs type0:$dst);
  let InOperandList = (ins unknown:$intrin, variable_ops);
  let hasSideEffects = 0;
  let mayLoad = 1;
  let mayStore = 0;
}

// Generic instruction for SI_CALL, so we can select the register bank and
// insert a waterfall loop if necessary.
def G_SI_CALL : AMDGPUGenericInstruction {
  let OutOperandList = (outs SReg_64:$dst);
  let InOperandList = (ins type0:$src0, unknown:$callee);
  let Size = 4;
  let isCall = 1;
  let UseNamedOperandTable = 1;
  let SchedRW = [WriteBranch];
  // TODO: Should really base this on the call target
  let isConvergent = 1;
}

def G_FPTRUNC_ROUND_UPWARD : AMDGPUGenericInstruction {
  let OutOperandList = (outs type0:$vdst);
  let InOperandList = (ins type1:$src0);
  let hasSideEffects = 0;
}

def G_FPTRUNC_ROUND_DOWNWARD : AMDGPUGenericInstruction {
  let OutOperandList = (outs type0:$vdst);
  let InOperandList = (ins type1:$src0);
  let hasSideEffects = 0;
}

//============================================================================//
// Dummy Instructions
//============================================================================//

def V_ILLEGAL : Enc32, InstSI<(outs), (ins), "v_illegal"> {
  let Inst{31-0} = 0x00000000;
  let FixedSize = 1;
  let Size = 4;
  let Uses = [EXEC];
  let hasSideEffects = 1;
  let SubtargetPredicate = isGFX10Plus;
}