//===-- SIInstructions.td - SI Instruction Definitions --------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// This file was originally auto-generated from a GPU register header file and
// all the instruction definitions were originally commented out. Instructions
// that are not yet supported remain commented out.
//===----------------------------------------------------------------------===//

class GCNPat<dag pattern, dag result> : Pat<pattern, result>, GCNPredicateControl {
}

class UniformSextInreg<ValueType VT> : PatFrag<
  (ops node:$src),
  (sext_inreg $src, VT),
  [{ return !N->isDivergent(); }]>;

class DivergentSextInreg<ValueType VT> : PatFrag<
  (ops node:$src),
  (sext_inreg $src, VT),
  [{ return N->isDivergent(); }]>;

include "SOPInstructions.td"
include "VOPInstructions.td"
include "SMInstructions.td"
include "FLATInstructions.td"
include "BUFInstructions.td"
include "EXPInstructions.td"
include "LDSDIRInstructions.td"
include "VINTERPInstructions.td"

//===----------------------------------------------------------------------===//
// VINTRP Instructions
//===----------------------------------------------------------------------===//

// Used to inject printing of "_e32" suffix for VI (there are "_e64" variants for VI)
def VINTRPDst : VINTRPDstOperand <VGPR_32>;

let Uses = [MODE, M0, EXEC] in {

// FIXME: Specify SchedRW for VINTRP instructions.

multiclass V_INTERP_P1_F32_m : VINTRP_m <
  0x00000000,
  (outs VINTRPDst:$vdst),
  (ins VGPR_32:$vsrc, Attr:$attr, AttrChan:$attrchan),
  "v_interp_p1_f32$vdst, $vsrc, $attr$attrchan",
  [(set f32:$vdst, (int_amdgcn_interp_p1 f32:$vsrc,
                    (i32 timm:$attrchan), (i32 timm:$attr), M0))]
>;

let OtherPredicates = [has32BankLDS, isNotGFX90APlus] in {

defm V_INTERP_P1_F32 : V_INTERP_P1_F32_m;

} // End OtherPredicates = [has32BankLDS, isNotGFX90APlus]

let OtherPredicates = [has16BankLDS, isNotGFX90APlus],
    Constraints = "@earlyclobber $vdst", isAsmParserOnly=1 in {

defm V_INTERP_P1_F32_16bank : V_INTERP_P1_F32_m;

} // End OtherPredicates = [has16BankLDS, isNotGFX90APlus],
  //     Constraints = "@earlyclobber $vdst", isAsmParserOnly=1

let OtherPredicates = [isNotGFX90APlus] in {
let DisableEncoding = "$src0", Constraints = "$src0 = $vdst" in {

defm V_INTERP_P2_F32 : VINTRP_m <
  0x00000001,
  (outs VINTRPDst:$vdst),
  (ins VGPR_32:$src0, VGPR_32:$vsrc, Attr:$attr, AttrChan:$attrchan),
  "v_interp_p2_f32$vdst, $vsrc, $attr$attrchan",
  [(set f32:$vdst, (int_amdgcn_interp_p2 f32:$src0, f32:$vsrc,
                    (i32 timm:$attrchan), (i32 timm:$attr), M0))]>;

} // End DisableEncoding = "$src0", Constraints = "$src0 = $vdst"

defm V_INTERP_MOV_F32 : VINTRP_m <
  0x00000002,
  (outs VINTRPDst:$vdst),
  (ins InterpSlot:$vsrc, Attr:$attr, AttrChan:$attrchan),
  "v_interp_mov_f32$vdst, $vsrc, $attr$attrchan",
  [(set f32:$vdst, (int_amdgcn_interp_mov (i32 timm:$vsrc),
                    (i32 timm:$attrchan), (i32 timm:$attr), M0))]>;

} // End OtherPredicates = [isNotGFX90APlus]

} // End Uses = [MODE, M0, EXEC]

//===----------------------------------------------------------------------===//
// Pseudo Instructions
//===----------------------------------------------------------------------===//
def ATOMIC_FENCE : SPseudoInstSI<
  (outs), (ins i32imm:$ordering, i32imm:$scope),
  [(atomic_fence (i32 timm:$ordering), (i32 timm:$scope))],
  "ATOMIC_FENCE $ordering, $scope"> {
  let hasSideEffects = 1;
  let maybeAtomic = 1;
}

let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] in {

// For use in patterns
def V_CNDMASK_B64_PSEUDO : VOP3Common <(outs VReg_64:$vdst),
  (ins VSrc_b64:$src0, VSrc_b64:$src1, SSrc_b64:$src2), "", []> {
  let isPseudo = 1;
  let isCodeGenOnly = 1;
  let usesCustomInserter = 1;
}

// 64-bit vector move instruction. This is mainly used by the
// SIFoldOperands pass to enable folding of inline immediates.
def V_MOV_B64_PSEUDO : VPseudoInstSI <(outs VReg_64:$vdst),
                                      (ins VSrc_b64:$src0)> {
  let isReMaterializable = 1;
  let isAsCheapAsAMove = 1;
  let isMoveImm = 1;
  let SchedRW = [Write64Bit];
  let Size = 16; // Needs at most two v_mov_b32 instructions, 8 bytes each.
  let UseNamedOperandTable = 1;
}

// 64-bit vector move with dpp. Expanded post-RA.
def V_MOV_B64_DPP_PSEUDO : VOP_DPP_Pseudo <"v_mov_b64_dpp", VOP_I64_I64> {
  let Size = 16; // Requires two 8-byte v_mov_b32_dpp to complete.
}

// 64-bit scalar move immediate instruction. This is used to avoid subregister
// initialization and allow rematerialization.
def S_MOV_B64_IMM_PSEUDO : SPseudoInstSI <(outs SReg_64:$sdst),
                                          (ins i64imm:$src0)> {
  let isReMaterializable = 1;
  let isAsCheapAsAMove = 1;
  let isMoveImm = 1;
  let SchedRW = [WriteSALU, Write64Bit];
  let Size = 16; // Needs at most two s_mov_b32 instructions, 8 bytes each.
  let Uses = [];
}

// Pseudoinstruction for @llvm.amdgcn.wqm. It is turned into a copy after the
// WQM pass processes it.
def WQM : PseudoInstSI <(outs unknown:$vdst), (ins unknown:$src0)>;

// Pseudoinstruction for @llvm.amdgcn.softwqm. Like @llvm.amdgcn.wqm it is
// turned into a copy by the WQM pass, but does not seed WQM requirements.
def SOFT_WQM : PseudoInstSI <(outs unknown:$vdst), (ins unknown:$src0)>;

// Pseudoinstruction for @llvm.amdgcn.strict.wwm. It is turned into a copy post-RA, so
// that the @earlyclobber is respected. The @earlyclobber is to make sure that
// the instruction that defines $src0 (which is run in Whole Wave Mode) doesn't
// accidentally clobber inactive channels of $vdst.
let Constraints = "@earlyclobber $vdst" in {
def STRICT_WWM : PseudoInstSI <(outs unknown:$vdst), (ins unknown:$src0)>;
def STRICT_WQM : PseudoInstSI <(outs unknown:$vdst), (ins unknown:$src0)>;
}

} // End let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC]

def ENTER_STRICT_WWM : SPseudoInstSI <(outs SReg_1:$sdst), (ins i64imm:$src0)> {
  let Uses = [EXEC];
  let Defs = [EXEC, SCC];
  let hasSideEffects = 0;
  let mayLoad = 0;
  let mayStore = 0;
}

def EXIT_STRICT_WWM : SPseudoInstSI <(outs SReg_1:$sdst), (ins SReg_1:$src0)> {
  let hasSideEffects = 0;
  let mayLoad = 0;
  let mayStore = 0;
}

def ENTER_STRICT_WQM : SPseudoInstSI <(outs SReg_1:$sdst), (ins i64imm:$src0)> {
  let Uses = [EXEC];
  let Defs = [EXEC, SCC];
  let hasSideEffects = 0;
  let mayLoad = 0;
  let mayStore = 0;
}

def EXIT_STRICT_WQM : SPseudoInstSI <(outs SReg_1:$sdst), (ins SReg_1:$src0)> {
  let hasSideEffects = 0;
  let mayLoad = 0;
  let mayStore = 0;
}

// PSEUDO_WM is treated like STRICT_WWM/STRICT_WQM without exec changes.
def ENTER_PSEUDO_WM : SPseudoInstSI <(outs), (ins)> {
  let Uses = [EXEC];
  let Defs = [EXEC];
  let hasSideEffects = 0;
  let mayLoad = 0;
  let mayStore = 0;
}

def EXIT_PSEUDO_WM : SPseudoInstSI <(outs), (ins)> {
  let hasSideEffects = 0;
  let mayLoad = 0;
  let mayStore = 0;
}

// Pseudo instructions used for @llvm.fptrunc.round upward
// and @llvm.fptrunc.round downward.
// These intrinsics will be legalized to G_FPTRUNC_ROUND_UPWARD
// and G_FPTRUNC_ROUND_DOWNWARD before being lowered to
// FPTRUNC_UPWARD_PSEUDO and FPTRUNC_DOWNWARD_PSEUDO.
// The final codegen is done in the ModeRegister pass.
let Uses = [MODE, EXEC] in {
def FPTRUNC_UPWARD_PSEUDO : VPseudoInstSI <(outs VGPR_32:$vdst),
  (ins VGPR_32:$src0),
  [(set f16:$vdst, (SIfptrunc_round_upward f32:$src0))]>;

def FPTRUNC_DOWNWARD_PSEUDO : VPseudoInstSI <(outs VGPR_32:$vdst),
  (ins VGPR_32:$src0),
  [(set f16:$vdst, (SIfptrunc_round_downward f32:$src0))]>;
} // End Uses = [MODE, EXEC]

// Invert the exec mask and overwrite the inactive lanes of dst with the
// $inactive input, restoring exec after we're done.
let Defs = [SCC] in {
def V_SET_INACTIVE_B32 : VPseudoInstSI <(outs VGPR_32:$vdst),
  (ins VSrc_b32:$src, VSrc_b32:$inactive),
  [(set i32:$vdst, (int_amdgcn_set_inactive i32:$src, i32:$inactive))]> {
}

def V_SET_INACTIVE_B64 : VPseudoInstSI <(outs VReg_64:$vdst),
  (ins VSrc_b64:$src, VSrc_b64:$inactive),
  [(set i64:$vdst, (int_amdgcn_set_inactive i64:$src, i64:$inactive))]> {
}
} // End Defs = [SCC]

let usesCustomInserter = 1, Defs = [VCC, EXEC] in {
def V_ADD_U64_PSEUDO : VPseudoInstSI <
  (outs VReg_64:$vdst), (ins VSrc_b64:$src0, VSrc_b64:$src1),
  [(set VReg_64:$vdst, (DivergentBinFrag<add> i64:$src0, i64:$src1))]
>;

def V_SUB_U64_PSEUDO : VPseudoInstSI <
  (outs VReg_64:$vdst), (ins VSrc_b64:$src0, VSrc_b64:$src1),
  [(set VReg_64:$vdst, (DivergentBinFrag<sub> i64:$src0, i64:$src1))]
>;
} // End usesCustomInserter = 1, Defs = [VCC, EXEC]

let usesCustomInserter = 1, Defs = [SCC] in {
def S_ADD_U64_PSEUDO : SPseudoInstSI <
  (outs SReg_64:$sdst), (ins SSrc_b64:$src0, SSrc_b64:$src1),
  [(set SReg_64:$sdst, (UniformBinFrag<add> i64:$src0, i64:$src1))]
>;

def S_SUB_U64_PSEUDO : SPseudoInstSI <
  (outs SReg_64:$sdst), (ins SSrc_b64:$src0, SSrc_b64:$src1),
  [(set SReg_64:$sdst, (UniformBinFrag<sub> i64:$src0, i64:$src1))]
>;

def S_ADD_CO_PSEUDO : SPseudoInstSI <
  (outs SReg_32:$sdst, SSrc_i1:$scc_out), (ins SSrc_b32:$src0, SSrc_b32:$src1, SSrc_i1:$scc_in)
>;

def S_SUB_CO_PSEUDO : SPseudoInstSI <
  (outs SReg_32:$sdst, SSrc_i1:$scc_out), (ins SSrc_b32:$src0, SSrc_b32:$src1, SSrc_i1:$scc_in)
>;

def S_UADDO_PSEUDO : SPseudoInstSI <
  (outs SReg_32:$sdst, SSrc_i1:$scc_out), (ins SSrc_b32:$src0, SSrc_b32:$src1)
>;

def S_USUBO_PSEUDO : SPseudoInstSI <
  (outs SReg_32:$sdst, SSrc_i1:$scc_out), (ins SSrc_b32:$src0, SSrc_b32:$src1)
>;

} // End usesCustomInserter = 1, Defs = [SCC]

let usesCustomInserter = 1 in {
def GET_GROUPSTATICSIZE : SPseudoInstSI <(outs SReg_32:$sdst), (ins),
  [(set SReg_32:$sdst, (int_amdgcn_groupstaticsize))]>;
} // End usesCustomInserter = 1

// Wrap an instruction by duplicating it, except for setting isTerminator.
class WrapTerminatorInst<SOP_Pseudo base_inst> : SPseudoInstSI<
      base_inst.OutOperandList,
      base_inst.InOperandList> {
  let Uses = base_inst.Uses;
  let Defs = base_inst.Defs;
  let isTerminator = 1;
  let isAsCheapAsAMove = base_inst.isAsCheapAsAMove;
  let hasSideEffects = base_inst.hasSideEffects;
  let UseNamedOperandTable = base_inst.UseNamedOperandTable;
  let CodeSize = base_inst.CodeSize;
  let SchedRW = base_inst.SchedRW;
}

let WaveSizePredicate = isWave64 in {
def S_MOV_B64_term : WrapTerminatorInst<S_MOV_B64>;
def S_XOR_B64_term : WrapTerminatorInst<S_XOR_B64>;
def S_OR_B64_term : WrapTerminatorInst<S_OR_B64>;
def S_ANDN2_B64_term : WrapTerminatorInst<S_ANDN2_B64>;
def S_AND_B64_term : WrapTerminatorInst<S_AND_B64>;
}

let WaveSizePredicate = isWave32 in {
def S_MOV_B32_term : WrapTerminatorInst<S_MOV_B32>;
def S_XOR_B32_term : WrapTerminatorInst<S_XOR_B32>;
def S_OR_B32_term : WrapTerminatorInst<S_OR_B32>;
def S_ANDN2_B32_term : WrapTerminatorInst<S_ANDN2_B32>;
def S_AND_B32_term : WrapTerminatorInst<S_AND_B32>;
}


def WAVE_BARRIER : SPseudoInstSI<(outs), (ins),
  [(int_amdgcn_wave_barrier)]> {
  let SchedRW = [];
  let hasNoSchedulingInfo = 1;
  let hasSideEffects = 1;
  let mayLoad = 0;
  let mayStore = 0;
  let isConvergent = 1;
  let FixedSize = 1;
  let Size = 0;
  let isMeta = 1;
}

def SCHED_BARRIER : SPseudoInstSI<(outs), (ins i32imm:$mask),
  [(int_amdgcn_sched_barrier (i32 timm:$mask))]> {
  let SchedRW = [];
  let hasNoSchedulingInfo = 1;
  let hasSideEffects = 1;
  let mayLoad = 0;
  let mayStore = 0;
  let isConvergent = 1;
  let FixedSize = 1;
  let Size = 0;
  let isMeta = 1;
}

def SCHED_GROUP_BARRIER : SPseudoInstSI<
  (outs),
  (ins i32imm:$mask, i32imm:$size, i32imm:$syncid),
  [(int_amdgcn_sched_group_barrier (i32 timm:$mask), (i32 timm:$size), (i32 timm:$syncid))]> {
  let SchedRW = [];
  let hasNoSchedulingInfo = 1;
  let hasSideEffects = 1;
  let mayLoad = 0;
  let mayStore = 0;
  let isConvergent = 1;
  let FixedSize = 1;
  let Size = 0;
  let isMeta = 1;
}

def IGLP_OPT : SPseudoInstSI<(outs), (ins i32imm:$mask),
  [(int_amdgcn_iglp_opt (i32 timm:$mask))]> {
  let SchedRW = [];
  let hasNoSchedulingInfo = 1;
  let hasSideEffects = 1;
  let mayLoad = 0;
  let mayStore = 0;
  let isConvergent = 1;
  let FixedSize = 1;
  let Size = 0;
  let isMeta = 1;
}

// SI pseudo instructions. These are used by the CFG structurizer pass
// and should be lowered to ISA instructions prior to codegen.

let isTerminator = 1 in {

let OtherPredicates = [EnableLateCFGStructurize] in {
 def SI_NON_UNIFORM_BRCOND_PSEUDO : CFPseudoInstSI <
  (outs),
  (ins SReg_1:$vcc, brtarget:$target),
  [(brcond i1:$vcc, bb:$target)]> {
  let Size = 12;
}
}

def SI_IF: CFPseudoInstSI <
  (outs SReg_1:$dst), (ins SReg_1:$vcc, brtarget:$target),
  [(set i1:$dst, (AMDGPUif i1:$vcc, bb:$target))], 1, 1> {
  let Constraints = "";
  let Size = 12;
  let hasSideEffects = 1;
}

def SI_ELSE : CFPseudoInstSI <
  (outs SReg_1:$dst),
  (ins SReg_1:$src, brtarget:$target), [], 1, 1> {
  let Size = 12;
  let hasSideEffects = 1;
}

def SI_WATERFALL_LOOP : CFPseudoInstSI <
  (outs),
  (ins brtarget:$target), [], 1> {
  let Size = 8;
  let isBranch = 1;
  let Defs = [];
}

def SI_LOOP : CFPseudoInstSI <
  (outs), (ins SReg_1:$saved, brtarget:$target),
  [(AMDGPUloop i1:$saved, bb:$target)], 1, 1> {
  let Size = 8;
  let isBranch = 1;
  let hasSideEffects = 1;
}

} // End isTerminator = 1

def SI_END_CF : CFPseudoInstSI <
  (outs), (ins SReg_1:$saved), [], 1, 1> {
  let Size = 4;
  let isAsCheapAsAMove = 1;
  let isReMaterializable = 1;
  let hasSideEffects = 1;
  let mayLoad = 1; // FIXME: Should not need memory flags
  let mayStore = 1;
}

def SI_IF_BREAK : CFPseudoInstSI <
  (outs SReg_1:$dst), (ins SReg_1:$vcc, SReg_1:$src), []> {
  let Size = 4;
  let isAsCheapAsAMove = 1;
  let isReMaterializable = 1;
}

// Branch to the early termination block of the shader if SCC is 0.
// This uses SCC from a previous SALU operation, i.e. the update of
// a mask of live lanes after a kill/demote operation.
// Only valid in pixel shaders.
def SI_EARLY_TERMINATE_SCC0 : SPseudoInstSI <(outs), (ins)> {
  let Uses = [EXEC, SCC];
}

let Uses = [EXEC] in {

multiclass PseudoInstKill <dag ins> {
  // Even though this pseudo can usually be expanded without an SCC def, we
  // conservatively assume that it has an SCC def, both because it is sometimes
  // required in degenerate cases (when V_CMPX cannot be used due to constant
  // bus limitations) and because it allows us to avoid having to track SCC
  // liveness across basic blocks.
  let Defs = [EXEC, SCC] in
  def _PSEUDO : PseudoInstSI <(outs), ins> {
    let isConvergent = 1;
    let usesCustomInserter = 1;
  }

  let Defs = [EXEC, SCC] in
  def _TERMINATOR : SPseudoInstSI <(outs), ins> {
    let isTerminator = 1;
  }
}

defm SI_KILL_I1 : PseudoInstKill <(ins SCSrc_i1:$src, i1imm:$killvalue)>;
let Defs = [VCC] in
defm SI_KILL_F32_COND_IMM : PseudoInstKill <(ins VSrc_b32:$src0, i32imm:$src1, i32imm:$cond)>;

let Defs = [EXEC, VCC] in
def SI_ILLEGAL_COPY : SPseudoInstSI <
  (outs unknown:$dst), (ins unknown:$src),
  [], " ; illegal copy $src to $dst">;

} // End Uses = [EXEC]

// Branch on undef scc. Used to avoid an intermediate copy from
// IMPLICIT_DEF to SCC.
def SI_BR_UNDEF : SPseudoInstSI <(outs), (ins sopp_brtarget:$simm16)> {
  let isTerminator = 1;
  let usesCustomInserter = 1;
  let isBranch = 1;
}

def SI_PS_LIVE : PseudoInstSI <
  (outs SReg_1:$dst), (ins),
  [(set i1:$dst, (int_amdgcn_ps_live))]> {
  let SALU = 1;
}

let Uses = [EXEC] in {
def SI_LIVE_MASK : PseudoInstSI <
  (outs SReg_1:$dst), (ins),
  [(set i1:$dst, (int_amdgcn_live_mask))]> {
  let SALU = 1;
}
let Defs = [EXEC, SCC] in {
// Demote: Turn a pixel shader thread into a helper lane.
def SI_DEMOTE_I1 : SPseudoInstSI <(outs), (ins SCSrc_i1:$src, i1imm:$killvalue)>;
} // End Defs = [EXEC, SCC]
} // End Uses = [EXEC]

def SI_MASKED_UNREACHABLE : SPseudoInstSI <(outs), (ins),
  [(int_amdgcn_unreachable)],
  "; divergent unreachable"> {
  let Size = 0;
  let hasNoSchedulingInfo = 1;
  let FixedSize = 1;
  let isMeta = 1;
}

// Used as an isel pseudo to directly emit initialization with an
// s_mov_b32 rather than a copy of another initialized
// register. MachineCSE skips copies, and we don't want to have to
// fold operands before it runs.
def SI_INIT_M0 : SPseudoInstSI <(outs), (ins SSrc_b32:$src)> {
  let Defs = [M0];
  let usesCustomInserter = 1;
  let isAsCheapAsAMove = 1;
  let isReMaterializable = 1;
}

def SI_INIT_EXEC : SPseudoInstSI <
  (outs), (ins i64imm:$src),
  [(int_amdgcn_init_exec (i64 timm:$src))]> {
  let Defs = [EXEC];
  let isAsCheapAsAMove = 1;
}

def SI_INIT_EXEC_FROM_INPUT : SPseudoInstSI <
  (outs), (ins SSrc_b32:$input, i32imm:$shift),
  [(int_amdgcn_init_exec_from_input i32:$input, (i32 timm:$shift))]> {
  let Defs = [EXEC];
}

// Return for returning shaders to a shader variant epilog.
def SI_RETURN_TO_EPILOG : SPseudoInstSI <
  (outs), (ins variable_ops), [(AMDGPUreturn_to_epilog)]> {
  let isTerminator = 1;
  let isBarrier = 1;
  let isReturn = 1;
  let hasNoSchedulingInfo = 1;
  let DisableWQM = 1;
  let FixedSize = 1;

  // TODO: Should this be true?
  let isMeta = 0;
}

// Return for returning function calls.
def SI_RETURN : SPseudoInstSI <
  (outs), (ins), [(AMDGPUret_flag)],
  "; return"> {
  let isTerminator = 1;
  let isBarrier = 1;
  let isReturn = 1;
  let SchedRW = [WriteBranch];
}

// Call pseudo for function calls, without the output register.
//
// This version is only needed so we can fill in the output register
// in the custom inserter.
def SI_CALL_ISEL : SPseudoInstSI <
  (outs), (ins SSrc_b64:$src0, unknown:$callee),
  [(AMDGPUcall i64:$src0, tglobaladdr:$callee)]> {
  let Size = 4;
  let isCall = 1;
  let SchedRW = [WriteBranch];
  let usesCustomInserter = 1;
  // TODO: Should really base this on the call target
  let isConvergent = 1;
}

def : GCNPat<
  (AMDGPUcall i64:$src0, (i64 0)),
  (SI_CALL_ISEL $src0, (i64 0))
>;

// Wrapper around s_swappc_b64 with extra $callee parameter to track
// the called function after regalloc.
def SI_CALL : SPseudoInstSI <
  (outs SReg_64:$dst), (ins SSrc_b64:$src0, unknown:$callee)> {
  let Size = 4;
  let FixedSize = 1;
  let isCall = 1;
  let UseNamedOperandTable = 1;
  let SchedRW = [WriteBranch];
  // TODO: Should really base this on the call target
  let isConvergent = 1;
}

// Tail call handling pseudo
def SI_TCRETURN : SPseudoInstSI <(outs),
  (ins SReg_64:$src0, unknown:$callee, i32imm:$fpdiff),
  [(AMDGPUtc_return i64:$src0, tglobaladdr:$callee, i32:$fpdiff)]> {
  let Size = 4;
  let FixedSize = 1;
  let isCall = 1;
  let isTerminator = 1;
  let isReturn = 1;
  let isBarrier = 1;
  let UseNamedOperandTable = 1;
  let SchedRW = [WriteBranch];
  // TODO: Should really base this on the call target
  let isConvergent = 1;
}

// Handle selecting indirect tail calls
def : GCNPat<
  (AMDGPUtc_return i64:$src0, (i64 0), (i32 timm:$fpdiff)),
  (SI_TCRETURN SReg_64:$src0, (i64 0), i32imm:$fpdiff)
>;

def ADJCALLSTACKUP : SPseudoInstSI<
  (outs), (ins i32imm:$amt0, i32imm:$amt1),
  [(callseq_start timm:$amt0, timm:$amt1)],
  "; adjcallstackup $amt0 $amt1"> {
  let Size = 8; // Worst case. (s_add_u32 + constant)
  let FixedSize = 1;
  let hasSideEffects = 1;
  let usesCustomInserter = 1;
  let SchedRW = [WriteSALU];
  let Defs = [SCC];
}

def ADJCALLSTACKDOWN : SPseudoInstSI<
  (outs), (ins i32imm:$amt1, i32imm:$amt2),
  [(callseq_end timm:$amt1, timm:$amt2)],
  "; adjcallstackdown $amt1"> {
  let Size = 8; // Worst case. (s_add_u32 + constant)
  let hasSideEffects = 1;
  let usesCustomInserter = 1;
  let SchedRW = [WriteSALU];
  let Defs = [SCC];
}

let Defs = [M0, EXEC, SCC],
  UseNamedOperandTable = 1 in {

// SI_INDIRECT_SRC/DST are only used by the legacy SelectionDAG indirect
// addressing implementation.
class SI_INDIRECT_SRC<RegisterClass rc> : VPseudoInstSI <
  (outs VGPR_32:$vdst),
  (ins rc:$src, VS_32:$idx, i32imm:$offset)> {
  let usesCustomInserter = 1;
}

class SI_INDIRECT_DST<RegisterClass rc> : VPseudoInstSI <
  (outs rc:$vdst),
  (ins rc:$src, VS_32:$idx, i32imm:$offset, VGPR_32:$val)> {
  let Constraints = "$src = $vdst";
  let usesCustomInserter = 1;
}

def SI_INDIRECT_SRC_V1 : SI_INDIRECT_SRC<VGPR_32>;
def SI_INDIRECT_SRC_V2 : SI_INDIRECT_SRC<VReg_64>;
def SI_INDIRECT_SRC_V4 : SI_INDIRECT_SRC<VReg_128>;
def SI_INDIRECT_SRC_V8 : SI_INDIRECT_SRC<VReg_256>;
def SI_INDIRECT_SRC_V9 : SI_INDIRECT_SRC<VReg_288>;
def SI_INDIRECT_SRC_V10 : SI_INDIRECT_SRC<VReg_320>;
def SI_INDIRECT_SRC_V11 : SI_INDIRECT_SRC<VReg_352>;
def SI_INDIRECT_SRC_V12 : SI_INDIRECT_SRC<VReg_384>;
def SI_INDIRECT_SRC_V16 : SI_INDIRECT_SRC<VReg_512>;
def SI_INDIRECT_SRC_V32 : SI_INDIRECT_SRC<VReg_1024>;

def SI_INDIRECT_DST_V1 : SI_INDIRECT_DST<VGPR_32>;
def SI_INDIRECT_DST_V2 : SI_INDIRECT_DST<VReg_64>;
def SI_INDIRECT_DST_V4 : SI_INDIRECT_DST<VReg_128>;
def SI_INDIRECT_DST_V8 : SI_INDIRECT_DST<VReg_256>;
def SI_INDIRECT_DST_V9 : SI_INDIRECT_DST<VReg_288>;
def SI_INDIRECT_DST_V10 : SI_INDIRECT_DST<VReg_320>;
def SI_INDIRECT_DST_V11 : SI_INDIRECT_DST<VReg_352>;
def SI_INDIRECT_DST_V12 : SI_INDIRECT_DST<VReg_384>;
def SI_INDIRECT_DST_V16 : SI_INDIRECT_DST<VReg_512>;
def SI_INDIRECT_DST_V32 : SI_INDIRECT_DST<VReg_1024>;

} // End Defs = [M0, EXEC, SCC], UseNamedOperandTable = 1

// This is a pseudo variant of the v_movreld_b32 instruction in which the
// vector operand appears only twice, once as def and once as use. Using this
// pseudo avoids problems with the Two Address instructions pass.
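// The dynamic index is read from M0 (see the Uses list below) and the
// statically known base subregister is encoded in the $subreg immediate
// operand.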
class INDIRECT_REG_WRITE_MOVREL_pseudo<RegisterClass rc,
                                       RegisterOperand val_ty> : PseudoInstSI <
  (outs rc:$vdst), (ins rc:$vsrc, val_ty:$val, i32imm:$subreg)> {
  let Constraints = "$vsrc = $vdst";
  let Uses = [M0];
}

class V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<RegisterClass rc> :
  INDIRECT_REG_WRITE_MOVREL_pseudo<rc, VSrc_b32> {
  let VALU = 1;
  let VOP1 = 1;
  let Uses = [M0, EXEC];
}

class S_INDIRECT_REG_WRITE_MOVREL_pseudo<RegisterClass rc,
                                         RegisterOperand val_ty> :
  INDIRECT_REG_WRITE_MOVREL_pseudo<rc, val_ty> {
  let SALU = 1;
  let SOP1 = 1;
  let Uses = [M0];
}

class S_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<RegisterClass rc> :
  S_INDIRECT_REG_WRITE_MOVREL_pseudo<rc, SSrc_b32>;
class S_INDIRECT_REG_WRITE_MOVREL_B64_pseudo<RegisterClass rc> :
  S_INDIRECT_REG_WRITE_MOVREL_pseudo<rc, SSrc_b64>;

def V_INDIRECT_REG_WRITE_MOVREL_B32_V1 : V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<VGPR_32>;
def V_INDIRECT_REG_WRITE_MOVREL_B32_V2 : V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<VReg_64>;
def V_INDIRECT_REG_WRITE_MOVREL_B32_V3 : V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<VReg_96>;
def V_INDIRECT_REG_WRITE_MOVREL_B32_V4 : V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<VReg_128>;
def V_INDIRECT_REG_WRITE_MOVREL_B32_V5 : V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<VReg_160>;
def V_INDIRECT_REG_WRITE_MOVREL_B32_V8 : V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<VReg_256>;
def V_INDIRECT_REG_WRITE_MOVREL_B32_V9 : V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<VReg_288>;
def V_INDIRECT_REG_WRITE_MOVREL_B32_V10 : V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<VReg_320>;
def V_INDIRECT_REG_WRITE_MOVREL_B32_V11 : V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<VReg_352>;
def V_INDIRECT_REG_WRITE_MOVREL_B32_V12 : V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<VReg_384>;
def V_INDIRECT_REG_WRITE_MOVREL_B32_V16 : V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<VReg_512>;
def V_INDIRECT_REG_WRITE_MOVREL_B32_V32 : V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<VReg_1024>;

def S_INDIRECT_REG_WRITE_MOVREL_B32_V1 : S_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<SReg_32>;
def S_INDIRECT_REG_WRITE_MOVREL_B32_V2 : S_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<SReg_64>;
def S_INDIRECT_REG_WRITE_MOVREL_B32_V3 : S_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<SReg_96>;
def S_INDIRECT_REG_WRITE_MOVREL_B32_V4 : S_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<SReg_128>;
def S_INDIRECT_REG_WRITE_MOVREL_B32_V5 : S_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<SReg_160>;
def S_INDIRECT_REG_WRITE_MOVREL_B32_V8 : S_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<SReg_256>;
def S_INDIRECT_REG_WRITE_MOVREL_B32_V16 : S_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<SReg_512>;
def S_INDIRECT_REG_WRITE_MOVREL_B32_V32 : S_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<SReg_1024>;

def S_INDIRECT_REG_WRITE_MOVREL_B64_V1 : S_INDIRECT_REG_WRITE_MOVREL_B64_pseudo<SReg_64>;
def S_INDIRECT_REG_WRITE_MOVREL_B64_V2 : S_INDIRECT_REG_WRITE_MOVREL_B64_pseudo<SReg_128>;
def S_INDIRECT_REG_WRITE_MOVREL_B64_V4 : S_INDIRECT_REG_WRITE_MOVREL_B64_pseudo<SReg_256>;
def S_INDIRECT_REG_WRITE_MOVREL_B64_V8 : S_INDIRECT_REG_WRITE_MOVREL_B64_pseudo<SReg_512>;
def S_INDIRECT_REG_WRITE_MOVREL_B64_V16 : S_INDIRECT_REG_WRITE_MOVREL_B64_pseudo<SReg_1024>;

// These variants of V_INDIRECT_REG_READ/WRITE use VGPR indexing. By using these
// pseudos we avoid spills or copies being inserted within indirect sequences
// that switch the VGPR indexing mode. Spills to accvgprs could be affected by
// this mode switching.

class V_INDIRECT_REG_WRITE_GPR_IDX_pseudo<RegisterClass rc> : PseudoInstSI <
  (outs rc:$vdst), (ins rc:$vsrc, VSrc_b32:$val, SSrc_b32:$idx, i32imm:$subreg)> {
  let Constraints = "$vsrc = $vdst";
  let VALU = 1;
  let Uses = [M0, EXEC];
  let Defs = [M0];
}

def V_INDIRECT_REG_WRITE_GPR_IDX_B32_V1 : V_INDIRECT_REG_WRITE_GPR_IDX_pseudo<VGPR_32>;
def V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2 : V_INDIRECT_REG_WRITE_GPR_IDX_pseudo<VReg_64>;
def V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3 : V_INDIRECT_REG_WRITE_GPR_IDX_pseudo<VReg_96>;
def V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4 : V_INDIRECT_REG_WRITE_GPR_IDX_pseudo<VReg_128>;
def V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5 : V_INDIRECT_REG_WRITE_GPR_IDX_pseudo<VReg_160>;
def V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8 : V_INDIRECT_REG_WRITE_GPR_IDX_pseudo<VReg_256>;
def V_INDIRECT_REG_WRITE_GPR_IDX_B32_V9 : V_INDIRECT_REG_WRITE_GPR_IDX_pseudo<VReg_288>;
def V_INDIRECT_REG_WRITE_GPR_IDX_B32_V10 : V_INDIRECT_REG_WRITE_GPR_IDX_pseudo<VReg_320>;
def V_INDIRECT_REG_WRITE_GPR_IDX_B32_V11 : V_INDIRECT_REG_WRITE_GPR_IDX_pseudo<VReg_352>;
def V_INDIRECT_REG_WRITE_GPR_IDX_B32_V12 : V_INDIRECT_REG_WRITE_GPR_IDX_pseudo<VReg_384>;
def V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16 : V_INDIRECT_REG_WRITE_GPR_IDX_pseudo<VReg_512>;
def V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32 : V_INDIRECT_REG_WRITE_GPR_IDX_pseudo<VReg_1024>;

class V_INDIRECT_REG_READ_GPR_IDX_pseudo<RegisterClass rc> : PseudoInstSI <
  (outs VGPR_32:$vdst), (ins rc:$vsrc, SSrc_b32:$idx, i32imm:$subreg)> {
  let VALU = 1;
  let Uses = [M0, EXEC];
  let Defs = [M0];
}

def V_INDIRECT_REG_READ_GPR_IDX_B32_V1 : V_INDIRECT_REG_READ_GPR_IDX_pseudo<VGPR_32>;
def V_INDIRECT_REG_READ_GPR_IDX_B32_V2 : V_INDIRECT_REG_READ_GPR_IDX_pseudo<VReg_64>;
def V_INDIRECT_REG_READ_GPR_IDX_B32_V3 : V_INDIRECT_REG_READ_GPR_IDX_pseudo<VReg_96>;
def V_INDIRECT_REG_READ_GPR_IDX_B32_V4 : V_INDIRECT_REG_READ_GPR_IDX_pseudo<VReg_128>;
def V_INDIRECT_REG_READ_GPR_IDX_B32_V5 : V_INDIRECT_REG_READ_GPR_IDX_pseudo<VReg_160>;
def V_INDIRECT_REG_READ_GPR_IDX_B32_V8 : V_INDIRECT_REG_READ_GPR_IDX_pseudo<VReg_256>;
def V_INDIRECT_REG_READ_GPR_IDX_B32_V9 : V_INDIRECT_REG_READ_GPR_IDX_pseudo<VReg_288>;
def V_INDIRECT_REG_READ_GPR_IDX_B32_V10 : V_INDIRECT_REG_READ_GPR_IDX_pseudo<VReg_320>;
def V_INDIRECT_REG_READ_GPR_IDX_B32_V11 : V_INDIRECT_REG_READ_GPR_IDX_pseudo<VReg_352>;
def V_INDIRECT_REG_READ_GPR_IDX_B32_V12 : V_INDIRECT_REG_READ_GPR_IDX_pseudo<VReg_384>;
def V_INDIRECT_REG_READ_GPR_IDX_B32_V16 : V_INDIRECT_REG_READ_GPR_IDX_pseudo<VReg_512>;
def V_INDIRECT_REG_READ_GPR_IDX_B32_V32 : V_INDIRECT_REG_READ_GPR_IDX_pseudo<VReg_1024>;

multiclass SI_SPILL_SGPR <RegisterClass sgpr_class> {
  let UseNamedOperandTable = 1, SGPRSpill = 1, Uses = [EXEC] in {
    def _SAVE : PseudoInstSI <
      (outs),
      (ins sgpr_class:$data, i32imm:$addr)> {
      let mayStore = 1;
      let mayLoad = 0;
    }

    def _RESTORE : PseudoInstSI <
      (outs sgpr_class:$data),
      (ins i32imm:$addr)> {
      let mayStore = 0;
      let mayLoad = 1;
    }
  } // End UseNamedOperandTable = 1, SGPRSpill = 1, Uses = [EXEC]
}

// You cannot use M0 as the output of v_readlane_b32 instructions or
// use it in the sdata operand of SMEM instructions. We still need to
// be able to spill the physical register m0, so allow it for
// SI_SPILL_32_* instructions.
defm SI_SPILL_S32 : SI_SPILL_SGPR <SReg_32>;
defm SI_SPILL_S64 : SI_SPILL_SGPR <SReg_64>;
defm SI_SPILL_S96 : SI_SPILL_SGPR <SReg_96>;
defm SI_SPILL_S128 : SI_SPILL_SGPR <SReg_128>;
defm SI_SPILL_S160 : SI_SPILL_SGPR <SReg_160>;
defm SI_SPILL_S192 : SI_SPILL_SGPR <SReg_192>;
defm SI_SPILL_S224 : SI_SPILL_SGPR <SReg_224>;
defm SI_SPILL_S256 : SI_SPILL_SGPR <SReg_256>;
defm SI_SPILL_S288 : SI_SPILL_SGPR <SReg_288>;
defm SI_SPILL_S320 : SI_SPILL_SGPR <SReg_320>;
defm SI_SPILL_S352 : SI_SPILL_SGPR <SReg_352>;
defm SI_SPILL_S384 : SI_SPILL_SGPR <SReg_384>;
defm SI_SPILL_S512 : SI_SPILL_SGPR <SReg_512>;
defm SI_SPILL_S1024 : SI_SPILL_SGPR <SReg_1024>;

// VGPR or AGPR spill instructions. In case of AGPR spilling a temp register
// needs to be used and an extra instruction to move between VGPR and AGPR.
// UsesTmp adds to the total size of an expanded spill in this case.
multiclass SI_SPILL_VGPR <RegisterClass vgpr_class, bit UsesTmp = 0> {
  let UseNamedOperandTable = 1, VGPRSpill = 1,
      SchedRW = [WriteVMEM] in {
    def _SAVE : VPseudoInstSI <
      (outs),
      (ins vgpr_class:$vdata, i32imm:$vaddr,
           SReg_32:$soffset, i32imm:$offset)> {
      let mayStore = 1;
      let mayLoad = 0;
      // (2 * 4) + (8 * num_subregs) bytes maximum
      int MaxSize = !add(!shl(!srl(vgpr_class.Size, 5), !add(UsesTmp, 3)), 8);
      // Size field is unsigned char and cannot fit more.
      let Size = !if(!le(MaxSize, 256), MaxSize, 252);
    }

    def _RESTORE : VPseudoInstSI <
      (outs vgpr_class:$vdata),
      (ins i32imm:$vaddr,
           SReg_32:$soffset, i32imm:$offset)> {
      let mayStore = 0;
      let mayLoad = 1;

      // (2 * 4) + (8 * num_subregs) bytes maximum
      int MaxSize = !add(!shl(!srl(vgpr_class.Size, 5), !add(UsesTmp, 3)), 8);
      // Size field is unsigned char and cannot fit more.
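      // vgpr_class.Size is in bits, so !srl(..., 5) is the number of 32-bit
      // subregisters; each one costs 8 bytes (16 when the AGPR<->VGPR temp
      // move is needed), plus the fixed 8 bytes counted above.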
      let Size = !if(!le(MaxSize, 256), MaxSize, 252);
    }
  } // End UseNamedOperandTable = 1, VGPRSpill = 1, SchedRW = [WriteVMEM]
}

defm SI_SPILL_V32 : SI_SPILL_VGPR <VGPR_32>;
defm SI_SPILL_V64 : SI_SPILL_VGPR <VReg_64>;
defm SI_SPILL_V96 : SI_SPILL_VGPR <VReg_96>;
defm SI_SPILL_V128 : SI_SPILL_VGPR <VReg_128>;
defm SI_SPILL_V160 : SI_SPILL_VGPR <VReg_160>;
defm SI_SPILL_V192 : SI_SPILL_VGPR <VReg_192>;
defm SI_SPILL_V224 : SI_SPILL_VGPR <VReg_224>;
defm SI_SPILL_V256 : SI_SPILL_VGPR <VReg_256>;
defm SI_SPILL_V288 : SI_SPILL_VGPR <VReg_288>;
defm SI_SPILL_V320 : SI_SPILL_VGPR <VReg_320>;
defm SI_SPILL_V352 : SI_SPILL_VGPR <VReg_352>;
defm SI_SPILL_V384 : SI_SPILL_VGPR <VReg_384>;
defm SI_SPILL_V512 : SI_SPILL_VGPR <VReg_512>;
defm SI_SPILL_V1024 : SI_SPILL_VGPR <VReg_1024>;

defm SI_SPILL_A32 : SI_SPILL_VGPR <AGPR_32, 1>;
defm SI_SPILL_A64 : SI_SPILL_VGPR <AReg_64, 1>;
defm SI_SPILL_A96 : SI_SPILL_VGPR <AReg_96, 1>;
defm SI_SPILL_A128 : SI_SPILL_VGPR <AReg_128, 1>;
defm SI_SPILL_A160 : SI_SPILL_VGPR <AReg_160, 1>;
defm SI_SPILL_A192 : SI_SPILL_VGPR <AReg_192, 1>;
defm SI_SPILL_A224 : SI_SPILL_VGPR <AReg_224, 1>;
defm SI_SPILL_A256 : SI_SPILL_VGPR <AReg_256, 1>;
defm SI_SPILL_A288 : SI_SPILL_VGPR <AReg_288, 1>;
defm SI_SPILL_A320 : SI_SPILL_VGPR <AReg_320, 1>;
defm SI_SPILL_A352 : SI_SPILL_VGPR <AReg_352, 1>;
defm SI_SPILL_A384 : SI_SPILL_VGPR <AReg_384, 1>;
defm SI_SPILL_A512 : SI_SPILL_VGPR <AReg_512, 1>;
defm SI_SPILL_A1024 : SI_SPILL_VGPR <AReg_1024, 1>;

defm SI_SPILL_AV32 : SI_SPILL_VGPR <AV_32, 1>;
defm SI_SPILL_AV64 : SI_SPILL_VGPR <AV_64, 1>;
defm SI_SPILL_AV96 : SI_SPILL_VGPR <AV_96, 1>;
defm SI_SPILL_AV128 : SI_SPILL_VGPR <AV_128, 1>;
defm SI_SPILL_AV160 : SI_SPILL_VGPR <AV_160, 1>;
defm SI_SPILL_AV192 : SI_SPILL_VGPR <AV_192, 1>;
defm SI_SPILL_AV224 : SI_SPILL_VGPR <AV_224, 1>;
defm SI_SPILL_AV256 : SI_SPILL_VGPR <AV_256, 1>;
defm SI_SPILL_AV288 : SI_SPILL_VGPR <AV_288, 1>;
defm SI_SPILL_AV320 : SI_SPILL_VGPR <AV_320, 1>;
defm SI_SPILL_AV352 : SI_SPILL_VGPR <AV_352, 1>;
defm SI_SPILL_AV384 : SI_SPILL_VGPR <AV_384, 1>;
defm SI_SPILL_AV512 : SI_SPILL_VGPR <AV_512, 1>;
defm SI_SPILL_AV1024 : SI_SPILL_VGPR <AV_1024, 1>;

def SI_PC_ADD_REL_OFFSET : SPseudoInstSI <
  (outs SReg_64:$dst),
  (ins si_ga:$ptr_lo, si_ga:$ptr_hi),
  [(set SReg_64:$dst,
      (i64 (SIpc_add_rel_offset tglobaladdr:$ptr_lo, tglobaladdr:$ptr_hi)))]> {
  let Defs = [SCC];
}

def : GCNPat <
  (SIpc_add_rel_offset tglobaladdr:$ptr_lo, 0),
  (SI_PC_ADD_REL_OFFSET $ptr_lo, (i32 0))
>;

def : GCNPat<
  (AMDGPUtrap timm:$trapid),
  (S_TRAP $trapid)
>;

def : GCNPat<
  (AMDGPUelse i1:$src, bb:$target),
  (SI_ELSE $src, $target)
>;

def : Pat <
  (int_amdgcn_kill i1:$src),
  (SI_KILL_I1_PSEUDO SCSrc_i1:$src, 0)
>;

def : Pat <
  (int_amdgcn_kill (i1 (not i1:$src))),
  (SI_KILL_I1_PSEUDO SCSrc_i1:$src, -1)
>;

def : Pat <
  (int_amdgcn_kill (i1 (setcc f32:$src, InlineImmFP32:$imm, cond:$cond))),
  (SI_KILL_F32_COND_IMM_PSEUDO VSrc_b32:$src, (bitcast_fpimm_to_i32 $imm), (cond_as_i32imm $cond))
>;

def : Pat <
  (int_amdgcn_wqm_demote i1:$src),
  (SI_DEMOTE_I1 SCSrc_i1:$src, 0)
>;

def : Pat <
  (int_amdgcn_wqm_demote (i1 (not i1:$src))),
  (SI_DEMOTE_I1 SCSrc_i1:$src, -1)
>;

// TODO: we could add more variants for other types of conditionals
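// Predicate value 33 is ICmpInst::ICMP_NE, so comparing the i1 mask against 0
// with "ne" simply yields the mask itself and these patterns fold to a COPY.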
def : Pat <
  (i64 (int_amdgcn_icmp i1:$src, (i1 0), (i32 33))),
  (COPY $src) // Return the SGPRs representing i1 src
>;

def : Pat <
  (i32 (int_amdgcn_icmp i1:$src, (i1 0), (i32 33))),
  (COPY $src) // Return the SGPRs representing i1 src
>;

//===----------------------------------------------------------------------===//
// VOP1 Patterns
//===----------------------------------------------------------------------===//

let OtherPredicates = [UnsafeFPMath] in {

// Convert (x - floor(x)) to fract(x)
def : GCNPat <
  (f32 (fsub (f32 (VOP3Mods f32:$x, i32:$mods)),
             (f32 (ffloor (f32 (VOP3Mods f32:$x, i32:$mods)))))),
  (V_FRACT_F32_e64 $mods, $x)
>;

// Convert (x + (-floor(x))) to fract(x)
def : GCNPat <
  (f64 (fadd (f64 (VOP3Mods f64:$x, i32:$mods)),
             (f64 (fneg (f64 (ffloor (f64 (VOP3Mods f64:$x, i32:$mods)))))))),
  (V_FRACT_F64_e64 $mods, $x)
>;

} // End OtherPredicates = [UnsafeFPMath]


multiclass f16_fp_Pats<Instruction cvt_f16_f32_inst_e64, Instruction cvt_f32_f16_inst_e64> {
  // f16_to_fp patterns
  def : GCNPat <
    (f32 (f16_to_fp i32:$src0)),
    (cvt_f32_f16_inst_e64 SRCMODS.NONE, $src0)
  >;

  def : GCNPat <
    (f32 (f16_to_fp (and_oneuse i32:$src0, 0x7fff))),
    (cvt_f32_f16_inst_e64 SRCMODS.ABS, $src0)
  >;

  def : GCNPat <
    (f32 (f16_to_fp (i32 (srl_oneuse (and_oneuse i32:$src0, 0x7fff0000), (i32 16))))),
    (cvt_f32_f16_inst_e64 SRCMODS.ABS, (i32 (V_LSHRREV_B32_e64 (i32 16), i32:$src0)))
  >;

  def : GCNPat <
    (f32 (f16_to_fp (or_oneuse i32:$src0, 0x8000))),
    (cvt_f32_f16_inst_e64 SRCMODS.NEG_ABS, $src0)
  >;

  def : GCNPat <
    (f32 (f16_to_fp (xor_oneuse i32:$src0, 0x8000))),
    (cvt_f32_f16_inst_e64 SRCMODS.NEG, $src0)
  >;

  def : GCNPat <
    (f64 (fpextend f16:$src)),
    (V_CVT_F64_F32_e32 (cvt_f32_f16_inst_e64 SRCMODS.NONE, $src))
  >;

  // fp_to_fp16 patterns
  def : GCNPat <
    (i32 (AMDGPUfp_to_f16 (f32 (VOP3Mods f32:$src0, i32:$src0_modifiers)))),
    (cvt_f16_f32_inst_e64 $src0_modifiers, f32:$src0)
  >;

  def : GCNPat <
    (i32 (fp_to_sint f16:$src)),
    (V_CVT_I32_F32_e32 (cvt_f32_f16_inst_e64 SRCMODS.NONE, VSrc_b32:$src))
  >;

  def : GCNPat <
    (i32 (fp_to_uint f16:$src)),
    (V_CVT_U32_F32_e32 (cvt_f32_f16_inst_e64 SRCMODS.NONE, VSrc_b32:$src))
  >;

  def : GCNPat <
    (f16 (sint_to_fp i32:$src)),
    (cvt_f16_f32_inst_e64 SRCMODS.NONE, (V_CVT_F32_I32_e32 VSrc_b32:$src))
  >;

  def : GCNPat <
    (f16 (uint_to_fp i32:$src)),
    (cvt_f16_f32_inst_e64 SRCMODS.NONE, (V_CVT_F32_U32_e32 VSrc_b32:$src))
  >;
}

let SubtargetPredicate = NotHasTrue16BitInsts in
defm : f16_fp_Pats<V_CVT_F16_F32_e64, V_CVT_F32_F16_e64>;

let SubtargetPredicate = HasTrue16BitInsts in
defm : f16_fp_Pats<V_CVT_F16_F32_t16_e64, V_CVT_F32_F16_t16_e64>;

//===----------------------------------------------------------------------===//
// VOP2 Patterns
//===----------------------------------------------------------------------===//

// NoMods pattern used for mac. If there are any source modifiers then it's
// better to select mad instead of mac.
class FMADPat <ValueType vt, Instruction inst>
  : GCNPat <(vt (any_fmad (vt (VOP3NoMods vt:$src0)),
                          (vt (VOP3NoMods vt:$src1)),
                          (vt (VOP3NoMods vt:$src2)))),
    (inst SRCMODS.NONE, $src0, SRCMODS.NONE, $src1,
          SRCMODS.NONE, $src2, DSTCLAMP.NONE, DSTOMOD.NONE)
>;

// Prefer mac form when there are no modifiers.
let AddedComplexity = 9 in {
let OtherPredicates = [HasMadMacF32Insts] in
def : FMADPat <f32, V_MAC_F32_e64>;

// Don't allow source modifiers. If there are any source modifiers then it's
// better to select mad instead of mac.
let SubtargetPredicate = isGFX6GFX7GFX10,
    OtherPredicates = [HasMadMacF32Insts, NoFP32Denormals] in
def : GCNPat <
      (f32 (fadd (AMDGPUfmul_legacy (VOP3NoMods f32:$src0),
                                    (VOP3NoMods f32:$src1)),
                 (VOP3NoMods f32:$src2))),
      (V_MAC_LEGACY_F32_e64 SRCMODS.NONE, $src0, SRCMODS.NONE, $src1,
                            SRCMODS.NONE, $src2, DSTCLAMP.NONE, DSTOMOD.NONE)
>;

// Don't allow source modifiers. If there are any source modifiers then it's
// better to select fma instead of fmac.
let SubtargetPredicate = HasFmaLegacy32 in
def : GCNPat <
      (f32 (int_amdgcn_fma_legacy (VOP3NoMods f32:$src0),
                                  (VOP3NoMods f32:$src1),
                                  (VOP3NoMods f32:$src2))),
      (V_FMAC_LEGACY_F32_e64 SRCMODS.NONE, $src0, SRCMODS.NONE, $src1,
                             SRCMODS.NONE, $src2, DSTCLAMP.NONE, DSTOMOD.NONE)
>;

let SubtargetPredicate = Has16BitInsts in
def : FMADPat <f16, V_MAC_F16_e64>;
} // AddedComplexity = 9

let OtherPredicates = [HasMadMacF32Insts, NoFP32Denormals] in
def : GCNPat <
      (f32 (fadd (AMDGPUfmul_legacy (VOP3Mods f32:$src0, i32:$src0_mod),
                                    (VOP3Mods f32:$src1, i32:$src1_mod)),
                 (VOP3Mods f32:$src2, i32:$src2_mod))),
      (V_MAD_LEGACY_F32_e64 $src0_mod, $src0, $src1_mod, $src1,
                            $src2_mod, $src2, DSTCLAMP.NONE, DSTOMOD.NONE)
>;

class VOPSelectModsPat <ValueType vt> : GCNPat <
  (vt (select i1:$src0, (VOP3Mods vt:$src1, i32:$src1_mods),
                        (VOP3Mods vt:$src2, i32:$src2_mods))),
  (V_CNDMASK_B32_e64 FP32InputMods:$src2_mods, VSrc_b32:$src2,
                     FP32InputMods:$src1_mods, VSrc_b32:$src1, SSrc_i1:$src0)
>;

class VOPSelectPat <ValueType vt> : GCNPat <
  (vt (select i1:$src0, vt:$src1, vt:$src2)),
  (V_CNDMASK_B32_e64 0, VSrc_b32:$src2, 0, VSrc_b32:$src1, SSrc_i1:$src0)
>;

def : VOPSelectModsPat <i32>;
def : VOPSelectModsPat <f32>;
def : VOPSelectPat <f16>;
def : VOPSelectPat <i16>;

let AddedComplexity = 1 in {
def : GCNPat <
  (i32 (add (i32 (DivergentUnaryFrag<ctpop> i32:$popcnt)), i32:$val)),
  (V_BCNT_U32_B32_e64 $popcnt, $val)
>;
}

def : GCNPat <
  (i32 (DivergentUnaryFrag<ctpop> i32:$popcnt)),
  (V_BCNT_U32_B32_e64 VSrc_b32:$popcnt, (i32 0))
>;

def : GCNPat <
  (i16 (add (i16 (trunc (i32 (DivergentUnaryFrag<ctpop> i32:$popcnt)))), i16:$val)),
  (V_BCNT_U32_B32_e64 $popcnt, $val)
>;

def : GCNPat <
  (i64 (DivergentUnaryFrag<ctpop> i64:$src)),
  (REG_SEQUENCE VReg_64,
    (V_BCNT_U32_B32_e64 (i32 (EXTRACT_SUBREG i64:$src, sub1)),
      (i32 (V_BCNT_U32_B32_e64 (i32 (EXTRACT_SUBREG i64:$src, sub0)), (i32 0)))), sub0,
    (i32 (V_MOV_B32_e32 (i32 0))), sub1)
>;

/********** ============================================ **********/
/********** Extraction, Insertion, Building and Casting  **********/
/********** ============================================ **********/

// Special case for 2 element vectors. REG_SEQUENCE produces better code
// than an INSERT_SUBREG.
multiclass Insert_Element_V2<RegisterClass RC, ValueType elem_type, ValueType vec_type> {
  def : GCNPat <
    (insertelt vec_type:$vec, elem_type:$elem, 0),
    (REG_SEQUENCE RC, $elem, sub0, (elem_type (EXTRACT_SUBREG $vec, sub1)), sub1)
  >;

  def : GCNPat <
    (insertelt vec_type:$vec, elem_type:$elem, 1),
    (REG_SEQUENCE RC, (elem_type (EXTRACT_SUBREG $vec, sub0)), sub0, $elem, sub1)
  >;
}

foreach Index = 0-1 in {
  def Extract_Element_v2i32_#Index : Extract_Element <
    i32, v2i32, Index, !cast<SubRegIndex>(sub#Index)
  >;

  def Extract_Element_v2f32_#Index : Extract_Element <
    f32, v2f32, Index, !cast<SubRegIndex>(sub#Index)
  >;
}

defm : Insert_Element_V2 <SReg_64, i32, v2i32>;
defm : Insert_Element_V2 <SReg_64, f32, v2f32>;

foreach Index = 0-2 in {
  def Extract_Element_v3i32_#Index : Extract_Element <
    i32, v3i32, Index, !cast<SubRegIndex>(sub#Index)
  >;
  def Insert_Element_v3i32_#Index : Insert_Element <
    i32, v3i32, Index, !cast<SubRegIndex>(sub#Index)
  >;

  def Extract_Element_v3f32_#Index : Extract_Element <
    f32, v3f32, Index, !cast<SubRegIndex>(sub#Index)
  >;
  def Insert_Element_v3f32_#Index : Insert_Element <
    f32, v3f32, Index, !cast<SubRegIndex>(sub#Index)
  >;
}

foreach Index = 0-3 in {
  def Extract_Element_v4i32_#Index : Extract_Element <
    i32, v4i32, Index, !cast<SubRegIndex>(sub#Index)
  >;
  def Insert_Element_v4i32_#Index : Insert_Element <
    i32, v4i32, Index, !cast<SubRegIndex>(sub#Index)
  >;

  def Extract_Element_v4f32_#Index : Extract_Element <
    f32, v4f32, Index, !cast<SubRegIndex>(sub#Index)
  >;
  def Insert_Element_v4f32_#Index : Insert_Element <
    f32, v4f32, Index, !cast<SubRegIndex>(sub#Index)
  >;
}

foreach Index = 0-4 in {
  def Extract_Element_v5i32_#Index : Extract_Element <
    i32, v5i32, Index, !cast<SubRegIndex>(sub#Index)
  >;
  def Insert_Element_v5i32_#Index : Insert_Element <
    i32, v5i32, Index, !cast<SubRegIndex>(sub#Index)
  >;

  def Extract_Element_v5f32_#Index : Extract_Element <
    f32, v5f32, Index, !cast<SubRegIndex>(sub#Index)
  >;
  def Insert_Element_v5f32_#Index : Insert_Element <
    f32, v5f32, Index, !cast<SubRegIndex>(sub#Index)
  >;
}

foreach Index = 0-5 in {
  def Extract_Element_v6i32_#Index : Extract_Element <
    i32, v6i32, Index, !cast<SubRegIndex>(sub#Index)
  >;
  def Insert_Element_v6i32_#Index : Insert_Element <
    i32, v6i32, Index, !cast<SubRegIndex>(sub#Index)
  >;

  def Extract_Element_v6f32_#Index : Extract_Element <
    f32, v6f32, Index, !cast<SubRegIndex>(sub#Index)
  >;
  def Insert_Element_v6f32_#Index : Insert_Element <
    f32, v6f32, Index, !cast<SubRegIndex>(sub#Index)
  >;
}

foreach Index = 0-6 in {
  def Extract_Element_v7i32_#Index : Extract_Element <
    i32, v7i32, Index, !cast<SubRegIndex>(sub#Index)
  >;
  def Insert_Element_v7i32_#Index : Insert_Element <
    i32, v7i32, Index, !cast<SubRegIndex>(sub#Index)
  >;

  def Extract_Element_v7f32_#Index : Extract_Element <
    f32, v7f32, Index, !cast<SubRegIndex>(sub#Index)
  >;
  def Insert_Element_v7f32_#Index : Insert_Element <
    f32, v7f32, Index, !cast<SubRegIndex>(sub#Index)
  >;
}

foreach Index = 0-7 in {
  def Extract_Element_v8i32_#Index : Extract_Element <
    i32, v8i32, Index, !cast<SubRegIndex>(sub#Index)
  >;
  def Insert_Element_v8i32_#Index : Insert_Element <
    i32, v8i32, Index, !cast<SubRegIndex>(sub#Index)
  >;

  def Extract_Element_v8f32_#Index : Extract_Element <
    f32, v8f32, Index, !cast<SubRegIndex>(sub#Index)
  >;
  def Insert_Element_v8f32_#Index : Insert_Element <
    f32, v8f32, Index, !cast<SubRegIndex>(sub#Index)
  >;
}

foreach Index = 0-8 in {
  def Extract_Element_v9i32_#Index : Extract_Element <
    i32, v9i32, Index, !cast<SubRegIndex>(sub#Index)
  >;
  def Insert_Element_v9i32_#Index : Insert_Element <
    i32, v9i32, Index, !cast<SubRegIndex>(sub#Index)
  >;

  def Extract_Element_v9f32_#Index : Extract_Element <
    f32, v9f32, Index, !cast<SubRegIndex>(sub#Index)
  >;
  def Insert_Element_v9f32_#Index : Insert_Element <
    f32, v9f32, Index, !cast<SubRegIndex>(sub#Index)
  >;
}

foreach Index = 0-9 in {
  def Extract_Element_v10i32_#Index : Extract_Element <
    i32, v10i32, Index, !cast<SubRegIndex>(sub#Index)
  >;
  def Insert_Element_v10i32_#Index : Insert_Element <
    i32, v10i32, Index, !cast<SubRegIndex>(sub#Index)
  >;

  def Extract_Element_v10f32_#Index : Extract_Element <
    f32, v10f32, Index, !cast<SubRegIndex>(sub#Index)
  >;
  def Insert_Element_v10f32_#Index : Insert_Element <
    f32, v10f32, Index, !cast<SubRegIndex>(sub#Index)
  >;
}

foreach Index = 0-10 in {
  def Extract_Element_v11i32_#Index : Extract_Element <
    i32, v11i32, Index, !cast<SubRegIndex>(sub#Index)
  >;
  def Insert_Element_v11i32_#Index : Insert_Element <
    i32, v11i32, Index, !cast<SubRegIndex>(sub#Index)
  >;

  def Extract_Element_v11f32_#Index : Extract_Element <
    f32, v11f32, Index, !cast<SubRegIndex>(sub#Index)
  >;
  def Insert_Element_v11f32_#Index : Insert_Element <
    f32, v11f32, Index, !cast<SubRegIndex>(sub#Index)
  >;
}

foreach Index = 0-11 in {
  def Extract_Element_v12i32_#Index : Extract_Element <
    i32, v12i32, Index, !cast<SubRegIndex>(sub#Index)
  >;
  def Insert_Element_v12i32_#Index : Insert_Element <
    i32, v12i32, Index, !cast<SubRegIndex>(sub#Index)
  >;

  def Extract_Element_v12f32_#Index : Extract_Element <
    f32, v12f32, Index, !cast<SubRegIndex>(sub#Index)
  >;
  def Insert_Element_v12f32_#Index : Insert_Element <
    f32, v12f32, Index, !cast<SubRegIndex>(sub#Index)
  >;
}

foreach Index = 0-15 in {
  def Extract_Element_v16i32_#Index : Extract_Element <
    i32, v16i32, Index, !cast<SubRegIndex>(sub#Index)
  >;
  def Insert_Element_v16i32_#Index : Insert_Element <
    i32, v16i32, Index, !cast<SubRegIndex>(sub#Index)
  >;

  def Extract_Element_v16f32_#Index : Extract_Element <
    f32, v16f32, Index, !cast<SubRegIndex>(sub#Index)
  >;
  def Insert_Element_v16f32_#Index : Insert_Element <
    f32, v16f32, Index, !cast<SubRegIndex>(sub#Index)
  >;
}


def : Pat <
  (extract_subvector v4i16:$vec, (i32 0)),
  (v2i16 (EXTRACT_SUBREG v4i16:$vec, sub0))
>;

def : Pat <
  (extract_subvector v4i16:$vec, (i32 2)),
  (v2i16 (EXTRACT_SUBREG v4i16:$vec, sub1))
>;

def : Pat <
  (extract_subvector v4f16:$vec, (i32 0)),
  (v2f16 (EXTRACT_SUBREG v4f16:$vec, sub0))
>;

def : Pat <
  (extract_subvector v4f16:$vec, (i32 2)),
  (v2f16 (EXTRACT_SUBREG v4f16:$vec, sub1))
>;

def : Pat <
  (extract_subvector v8i16:$vec, (i32 0)),
  (v4i16 (EXTRACT_SUBREG v8i16:$vec, sub0_sub1))
>;

def : Pat <
  (extract_subvector v8i16:$vec, (i32 4)),
  (v4i16 (EXTRACT_SUBREG v8i16:$vec, sub2_sub3))
>;

def : Pat <
  (extract_subvector v8f16:$vec, (i32 0)),
  (v4f16 (EXTRACT_SUBREG v8f16:$vec, sub0_sub1))
>;

def : Pat <
  (extract_subvector v8f16:$vec, (i32 4)),
  (v4f16 (EXTRACT_SUBREG v8f16:$vec, sub2_sub3))
>;

def : Pat <
  (extract_subvector v16i16:$vec, (i32 0)),
  (v8i16 (EXTRACT_SUBREG v16i16:$vec, sub0_sub1_sub2_sub3))
>;

def : Pat <
  (extract_subvector v16i16:$vec, (i32 8)),
  (v8i16 (EXTRACT_SUBREG v16i16:$vec, sub4_sub5_sub6_sub7))
>;

def : Pat <
  (extract_subvector v16f16:$vec, (i32 0)),
  (v8f16 (EXTRACT_SUBREG v16f16:$vec, sub0_sub1_sub2_sub3))
>;

def : Pat <
  (extract_subvector v16f16:$vec, (i32 8)),
  (v8f16 (EXTRACT_SUBREG v16f16:$vec, sub4_sub5_sub6_sub7))
>;

foreach Index = 0-31 in {
  def Extract_Element_v32i32_#Index : Extract_Element <
    i32, v32i32, Index, !cast<SubRegIndex>(sub#Index)
  >;

  def Insert_Element_v32i32_#Index : Insert_Element <
    i32, v32i32, Index, !cast<SubRegIndex>(sub#Index)
  >;

  def Extract_Element_v32f32_#Index : Extract_Element <
    f32, v32f32, Index, !cast<SubRegIndex>(sub#Index)
  >;

  def Insert_Element_v32f32_#Index : Insert_Element <
    f32, v32f32, Index, !cast<SubRegIndex>(sub#Index)
  >;
}

// FIXME: Why do only some of these type combinations exist for SReg and
// VReg?
// 16-bit bitcast
def : BitConvert <i16, f16, VGPR_32>;
def : BitConvert <f16, i16, VGPR_32>;
def : BitConvert <i16, f16, SReg_32>;
def : BitConvert <f16, i16, SReg_32>;

// 32-bit bitcast
def : BitConvert <i32, f32, VGPR_32>;
def : BitConvert <f32, i32, VGPR_32>;
def : BitConvert <i32, f32, SReg_32>;
def : BitConvert <f32, i32, SReg_32>;
def : BitConvert <v2i16, i32, SReg_32>;
def : BitConvert <i32, v2i16, SReg_32>;
def : BitConvert <v2f16, i32, SReg_32>;
def : BitConvert <i32, v2f16, SReg_32>;
def : BitConvert <v2i16, v2f16, SReg_32>;
def : BitConvert <v2f16, v2i16, SReg_32>;
def : BitConvert <v2f16, f32, SReg_32>;
def : BitConvert <f32, v2f16, SReg_32>;
def : BitConvert <v2i16, f32, SReg_32>;
def : BitConvert <f32, v2i16, SReg_32>;

// 64-bit bitcast
def : BitConvert <i64, f64, VReg_64>;
def : BitConvert <f64, i64, VReg_64>;
def : BitConvert <v2i32, v2f32, VReg_64>;
def : BitConvert <v2f32, v2i32, VReg_64>;
def : BitConvert <i64, v2i32, VReg_64>;
def : BitConvert <v2i32, i64, VReg_64>;
def : BitConvert <i64, v2f32, VReg_64>;
def : BitConvert <v2f32, i64, VReg_64>;
def : BitConvert <f64, v2f32, VReg_64>;
def : BitConvert <v2f32, f64, VReg_64>;
def : BitConvert <f64, v2i32, VReg_64>;
def : BitConvert <v2i32, f64, VReg_64>;
def : BitConvert <v4i16, v4f16, VReg_64>;
def : BitConvert <v4f16, v4i16, VReg_64>;

// FIXME: Make SGPR
def : BitConvert <v2i32, v4f16, VReg_64>;
def : BitConvert <v4f16, v2i32, VReg_64>;
def : BitConvert <v2i32, v4f16, VReg_64>;
def : BitConvert <v2i32, v4i16, VReg_64>;
def : BitConvert <v4i16, v2i32, VReg_64>;
def : BitConvert <v2f32, v4f16, VReg_64>;
def : BitConvert <v4f16, v2f32, VReg_64>;
def : BitConvert <v2f32, v4i16, VReg_64>;
def : BitConvert <v4i16, v2f32, VReg_64>;
def : BitConvert <v4i16, f64, VReg_64>;
def : BitConvert <v4f16, f64, VReg_64>;
def : BitConvert <f64, v4i16, VReg_64>;
def : BitConvert <f64, v4f16, VReg_64>;
def : BitConvert <v4i16, i64, VReg_64>;
def : BitConvert <v4f16, i64, VReg_64>;
def : BitConvert <i64, v4i16, VReg_64>;
def : BitConvert <i64, v4f16, VReg_64>;

def : BitConvert <v4i32, v4f32, VReg_128>;
def : BitConvert <v4f32, v4i32, VReg_128>;

// 96-bit bitcast
def : BitConvert <v3i32, v3f32, SGPR_96>;
def : BitConvert <v3f32, v3i32, SGPR_96>;

// 128-bit bitcast
def : BitConvert <v2i64, v4i32, SReg_128>;
def : BitConvert <v4i32, v2i64, SReg_128>;
def : BitConvert <v2f64, v4f32, VReg_128>;
def : BitConvert <v2f64, v4i32, VReg_128>;
def : BitConvert <v4f32, v2f64, VReg_128>;
def : BitConvert <v4i32, v2f64, VReg_128>;
def : BitConvert <v2i64, v2f64, VReg_128>;
def : BitConvert <v2f64, v2i64, VReg_128>;
def : BitConvert <v4f32, v2i64, VReg_128>;
def : BitConvert <v2i64, v4f32, VReg_128>;
def : BitConvert <v8i16, v4i32, SReg_128>;
def : BitConvert <v4i32, v8i16, SReg_128>;
def : BitConvert <v8f16, v4f32, VReg_128>;
def : BitConvert <v8f16, v4i32, VReg_128>;
def : BitConvert <v4f32, v8f16, VReg_128>;
def : BitConvert <v4i32, v8f16, VReg_128>;
def : BitConvert <v8i16, v8f16, VReg_128>;
def : BitConvert <v8f16, v8i16, VReg_128>;
def : BitConvert <v4f32, v8i16, VReg_128>;
def : BitConvert <v8i16, v4f32, VReg_128>;
def : BitConvert <v8i16, v8f16, SReg_128>;
def : BitConvert <v8i16, v2i64, SReg_128>;
def : BitConvert <v8i16, v2f64, SReg_128>;
def : BitConvert <v8f16, v2i64, SReg_128>;
def : BitConvert <v8f16, v2f64, SReg_128>;
def : BitConvert <v8f16, v8i16, SReg_128>;
def : BitConvert <v2i64, v8i16, SReg_128>;
def : BitConvert <v2f64, v8i16, SReg_128>;
def : BitConvert <v2i64, v8f16, SReg_128>;
def : BitConvert <v2f64, v8f16, SReg_128>;

// 160-bit bitcast
def : BitConvert <v5i32, v5f32, SReg_160>;
def : BitConvert <v5f32, v5i32, SReg_160>;
def : BitConvert <v5i32, v5f32, VReg_160>;
def : BitConvert <v5f32, v5i32, VReg_160>;

// 192-bit bitcast
def : BitConvert <v6i32, v6f32, SReg_192>;
def : BitConvert <v6f32, v6i32, SReg_192>;
def : BitConvert <v6i32, v6f32, VReg_192>;
def : BitConvert <v6f32, v6i32, VReg_192>;
def : BitConvert <v3i64, v3f64, VReg_192>;
def : BitConvert <v3f64, v3i64, VReg_192>;
def : BitConvert <v3i64, v6i32, VReg_192>;
def : BitConvert <v3i64, v6f32, VReg_192>;
def : BitConvert <v3f64, v6i32, VReg_192>;
def : BitConvert <v3f64, v6f32, VReg_192>;
def : BitConvert <v6i32, v3i64, VReg_192>;
def : BitConvert <v6f32, v3i64, VReg_192>;
def : BitConvert <v6i32, v3f64, VReg_192>;
def : BitConvert <v6f32, v3f64, VReg_192>;

// 224-bit bitcast
def : BitConvert <v7i32, v7f32, SReg_224>;
def : BitConvert <v7f32, v7i32, SReg_224>;
def : BitConvert <v7i32, v7f32, VReg_224>;
def : BitConvert <v7f32, v7i32, VReg_224>;

// 256-bit bitcast
def : BitConvert <v8i32, v8f32, SReg_256>;
def : BitConvert <v8f32, v8i32, SReg_256>;
def : BitConvert <v8i32, v8f32, VReg_256>;
def : BitConvert <v8f32, v8i32, VReg_256>;
def : BitConvert <v4i64, v4f64, VReg_256>;
def : BitConvert <v4f64, v4i64, VReg_256>;
def : BitConvert <v4i64, v8i32, VReg_256>;
def : BitConvert <v4i64, v8f32, VReg_256>;
def : BitConvert <v4f64, v8i32, VReg_256>;
def : BitConvert <v4f64, v8f32, VReg_256>;
def : BitConvert <v8i32, v4i64, VReg_256>;
def : BitConvert <v8f32, v4i64, VReg_256>;
def : BitConvert <v8i32, v4f64, VReg_256>;
def : BitConvert <v8f32, v4f64, VReg_256>;
def : BitConvert <v16i16, v16f16, SReg_256>;
def : BitConvert <v16f16, v16i16, SReg_256>;
def : BitConvert <v16i16, v16f16, VReg_256>;
def : BitConvert <v16f16, v16i16, VReg_256>;
def : BitConvert <v16f16, v8i32, VReg_256>;
def : BitConvert <v16i16, v8i32, VReg_256>;
def : BitConvert <v16f16, v8f32, VReg_256>;
def : BitConvert <v16i16, v8f32, VReg_256>;
def : BitConvert <v8i32, v16f16, VReg_256>;
def : BitConvert <v8i32, v16i16, VReg_256>;
def : BitConvert <v8f32, v16f16, VReg_256>;
def : BitConvert <v8f32, v16i16, VReg_256>;
def : BitConvert <v16f16, v4i64, VReg_256>;
def : BitConvert <v16i16, v4i64, VReg_256>;
def : BitConvert <v16f16, v4f64, VReg_256>;
def : BitConvert <v16i16, v4f64, VReg_256>;
def : BitConvert <v4i64, v16f16, VReg_256>;
def : BitConvert <v4i64, v16i16, VReg_256>;
def : BitConvert <v4f64, v16f16, VReg_256>;
def : BitConvert <v4f64, v16i16, VReg_256>;

// 288-bit bitcast
def : BitConvert <v9i32, v9f32, SReg_288>;
def : BitConvert <v9f32, v9i32, SReg_288>;
def : BitConvert <v9i32, v9f32, VReg_288>;
def : BitConvert <v9f32, v9i32, VReg_288>;

// 320-bit bitcast
def : BitConvert <v10i32, v10f32, SReg_320>;
def : BitConvert <v10f32, v10i32, SReg_320>;
def : BitConvert <v10i32, v10f32, VReg_320>;
def : BitConvert <v10f32, v10i32, VReg_320>;

// 352-bit bitcast
def : BitConvert <v11i32, v11f32, SReg_352>;
def : BitConvert <v11f32, v11i32, SReg_352>;
def : BitConvert <v11i32, v11f32, VReg_352>;
def : BitConvert <v11f32, v11i32, VReg_352>;

// 384-bit bitcast
def : BitConvert <v12i32, v12f32, SReg_384>;
def : BitConvert <v12f32, v12i32, SReg_384>;
def : BitConvert <v12i32, v12f32, VReg_384>;
def : BitConvert <v12f32, v12i32, VReg_384>;

// 512-bit bitcast
def : BitConvert <v16i32, v16f32, VReg_512>;
def : BitConvert <v16f32, v16i32, VReg_512>;
def : BitConvert <v8i64, v8f64, VReg_512>;
def : BitConvert <v8f64, v8i64, VReg_512>;
def : BitConvert <v8i64, v16i32, VReg_512>;
def : BitConvert <v8f64, v16i32, VReg_512>;
def : BitConvert <v16i32, v8i64, VReg_512>;
def : BitConvert <v16i32, v8f64, VReg_512>;
def : BitConvert <v8i64, v16f32, VReg_512>;
def : BitConvert <v8f64, v16f32, VReg_512>;
def : BitConvert <v16f32, v8i64, VReg_512>;
def : BitConvert <v16f32, v8f64, VReg_512>;

// 1024-bit bitcast
def : BitConvert <v32i32, v32f32, VReg_1024>;
def : BitConvert <v32f32, v32i32, VReg_1024>;
def : BitConvert <v16i64, v16f64, VReg_1024>;
def : BitConvert <v16f64, v16i64, VReg_1024>;
def : BitConvert <v16i64, v32i32, VReg_1024>;
def : BitConvert <v32i32, v16i64, VReg_1024>;
def : BitConvert <v16f64, v32f32, VReg_1024>;
def : BitConvert <v32f32, v16f64, VReg_1024>;
def : BitConvert <v16i64, v32f32, VReg_1024>;
def : BitConvert <v32i32, v16f64, VReg_1024>;
def : BitConvert <v16f64, v32i32, VReg_1024>;
def : BitConvert <v32f32, v16i64, VReg_1024>;

Dst modifiers **********/ 1641/********** =================== **********/ 1642 1643 1644// If denormals are not enabled, it only impacts the compare of the 1645// inputs. The output result is not flushed. 1646class ClampPat<Instruction inst, ValueType vt> : GCNPat < 1647 (vt (AMDGPUclamp (VOP3Mods vt:$src0, i32:$src0_modifiers))), 1648 (inst i32:$src0_modifiers, vt:$src0, 1649 i32:$src0_modifiers, vt:$src0, DSTCLAMP.ENABLE, DSTOMOD.NONE) 1650>; 1651 1652def : ClampPat<V_MAX_F32_e64, f32>; 1653def : ClampPat<V_MAX_F64_e64, f64>; 1654let SubtargetPredicate = NotHasTrue16BitInsts in 1655def : ClampPat<V_MAX_F16_e64, f16>; 1656let SubtargetPredicate = HasTrue16BitInsts in 1657def : ClampPat<V_MAX_F16_t16_e64, f16>; 1658 1659let SubtargetPredicate = HasVOP3PInsts in { 1660def : GCNPat < 1661 (v2f16 (AMDGPUclamp (VOP3PMods v2f16:$src0, i32:$src0_modifiers))), 1662 (V_PK_MAX_F16 $src0_modifiers, $src0, 1663 $src0_modifiers, $src0, DSTCLAMP.ENABLE) 1664>; 1665} 1666 1667 1668/********** ================================ **********/ 1669/********** Floating point absolute/negative **********/ 1670/********** ================================ **********/ 1671 1672def : GCNPat < 1673 (UniformUnaryFrag<fneg> (fabs (f32 SReg_32:$src))), 1674 (S_OR_B32 SReg_32:$src, (S_MOV_B32 (i32 0x80000000))) // Set sign bit 1675>; 1676 1677def : GCNPat < 1678 (UniformUnaryFrag<fabs> (f32 SReg_32:$src)), 1679 (S_AND_B32 SReg_32:$src, (S_MOV_B32 (i32 0x7fffffff))) 1680>; 1681 1682def : GCNPat < 1683 (UniformUnaryFrag<fneg> (f32 SReg_32:$src)), 1684 (S_XOR_B32 SReg_32:$src, (S_MOV_B32 (i32 0x80000000))) 1685>; 1686 1687def : GCNPat < 1688 (UniformUnaryFrag<fneg> (f16 SReg_32:$src)), 1689 (S_XOR_B32 SReg_32:$src, (S_MOV_B32 (i32 0x00008000))) 1690>; 1691 1692def : GCNPat < 1693 (UniformUnaryFrag<fabs> (f16 SReg_32:$src)), 1694 (S_AND_B32 SReg_32:$src, (S_MOV_B32 (i32 0x00007fff))) 1695>; 1696 1697def : GCNPat < 1698 (UniformUnaryFrag<fneg> (fabs (f16 SReg_32:$src))), 1699 (S_OR_B32 SReg_32:$src, (S_MOV_B32 (i32 0x00008000))) // Set sign bit 1700>; 1701 1702def : GCNPat < 1703 (UniformUnaryFrag<fneg> (v2f16 SReg_32:$src)), 1704 (S_XOR_B32 SReg_32:$src, (S_MOV_B32 (i32 0x80008000))) 1705>; 1706 1707def : GCNPat < 1708 (UniformUnaryFrag<fabs> (v2f16 SReg_32:$src)), 1709 (S_AND_B32 SReg_32:$src, (S_MOV_B32 (i32 0x7fff7fff))) 1710>; 1711 1712// This is really (fneg (fabs v2f16:$src)) 1713// 1714// fabs is not reported as free because there is modifier for it in 1715// VOP3P instructions, so it is turned into the bit op. 1716def : GCNPat < 1717 (UniformUnaryFrag<fneg> (v2f16 (bitconvert (and_oneuse (i32 SReg_32:$src), 0x7fff7fff)))), 1718 (S_OR_B32 SReg_32:$src, (S_MOV_B32 (i32 0x80008000))) // Set sign bit 1719>; 1720 1721def : GCNPat < 1722 (UniformUnaryFrag<fneg> (v2f16 (fabs SReg_32:$src))), 1723 (S_OR_B32 SReg_32:$src, (S_MOV_B32 (i32 0x80008000))) // Set sign bit 1724>; 1725 1726 1727// COPY_TO_REGCLASS is needed to avoid using SCC from S_XOR_B32 instead 1728// of the real value. 
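// (S_XOR_B32/S_AND_B32/S_OR_B32 also define SCC, and REG_SEQUENCE patterns
// cannot refer to just one result of a multi-output instruction, so the
// explicit COPY_TO_REGCLASS pins the 32-bit data result; see the FIXME on
// the sext patterns further down.)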
def : GCNPat <
  (UniformUnaryFrag<fneg> (v2f32 SReg_64:$src)),
  (v2f32 (REG_SEQUENCE SReg_64,
    (f32 (COPY_TO_REGCLASS (S_XOR_B32 (i32 (EXTRACT_SUBREG $src, sub0)),
                                      (i32 (S_MOV_B32 (i32 0x80000000)))),
                           SReg_32)), sub0,
    (f32 (COPY_TO_REGCLASS (S_XOR_B32 (i32 (EXTRACT_SUBREG $src, sub1)),
                                      (i32 (S_MOV_B32 (i32 0x80000000)))),
                           SReg_32)), sub1))
>;

def : GCNPat <
  (UniformUnaryFrag<fabs> (v2f32 SReg_64:$src)),
  (v2f32 (REG_SEQUENCE SReg_64,
    (f32 (COPY_TO_REGCLASS (S_AND_B32 (i32 (EXTRACT_SUBREG $src, sub0)),
                                      (i32 (S_MOV_B32 (i32 0x7fffffff)))),
                           SReg_32)), sub0,
    (f32 (COPY_TO_REGCLASS (S_AND_B32 (i32 (EXTRACT_SUBREG $src, sub1)),
                                      (i32 (S_MOV_B32 (i32 0x7fffffff)))),
                           SReg_32)), sub1))
>;

def : GCNPat <
  (UniformUnaryFrag<fneg> (fabs (v2f32 SReg_64:$src))),
  (v2f32 (REG_SEQUENCE SReg_64,
    (f32 (COPY_TO_REGCLASS (S_OR_B32 (i32 (EXTRACT_SUBREG $src, sub0)),
                                     (i32 (S_MOV_B32 (i32 0x80000000)))),
                           SReg_32)), sub0,
    (f32 (COPY_TO_REGCLASS (S_OR_B32 (i32 (EXTRACT_SUBREG $src, sub1)),
                                     (i32 (S_MOV_B32 (i32 0x80000000)))),
                           SReg_32)), sub1))
>;

// FIXME: Use S_BITSET0_B32/B64?
def : GCNPat <
  (UniformUnaryFrag<fabs> (f64 SReg_64:$src)),
  (REG_SEQUENCE SReg_64,
    (i32 (EXTRACT_SUBREG SReg_64:$src, sub0)),
    sub0,
    (i32 (COPY_TO_REGCLASS (S_AND_B32 (i32 (EXTRACT_SUBREG SReg_64:$src, sub1)),
                           (S_MOV_B32 (i32 0x7fffffff))), SReg_32)), // Clear sign bit.
    sub1)
>;

def : GCNPat <
  (UniformUnaryFrag<fneg> (f64 SReg_64:$src)),
  (REG_SEQUENCE SReg_64,
    (i32 (EXTRACT_SUBREG SReg_64:$src, sub0)),
    sub0,
    (i32 (COPY_TO_REGCLASS (S_XOR_B32 (i32 (EXTRACT_SUBREG SReg_64:$src, sub1)),
                           (i32 (S_MOV_B32 (i32 0x80000000)))), SReg_32)),
    sub1)
>;

def : GCNPat <
  (UniformUnaryFrag<fneg> (fabs (f64 SReg_64:$src))),
  (REG_SEQUENCE SReg_64,
    (i32 (EXTRACT_SUBREG SReg_64:$src, sub0)),
    sub0,
    (i32 (COPY_TO_REGCLASS (S_OR_B32 (i32 (EXTRACT_SUBREG SReg_64:$src, sub1)),
                           (S_MOV_B32 (i32 0x80000000))), SReg_32)), // Set sign bit.
1790 sub1) 1791>; 1792 1793 1794def : GCNPat < 1795 (fneg (fabs (f32 VGPR_32:$src))), 1796 (V_OR_B32_e64 (S_MOV_B32 (i32 0x80000000)), VGPR_32:$src) // Set sign bit 1797>; 1798 1799def : GCNPat < 1800 (fabs (f32 VGPR_32:$src)), 1801 (V_AND_B32_e64 (S_MOV_B32 (i32 0x7fffffff)), VGPR_32:$src) 1802>; 1803 1804def : GCNPat < 1805 (fneg (f32 VGPR_32:$src)), 1806 (V_XOR_B32_e64 (S_MOV_B32 (i32 0x80000000)), VGPR_32:$src) 1807>; 1808 1809def : GCNPat < 1810 (fabs (f16 VGPR_32:$src)), 1811 (V_AND_B32_e64 (S_MOV_B32 (i32 0x00007fff)), VGPR_32:$src) 1812>; 1813 1814def : GCNPat < 1815 (fneg (f16 VGPR_32:$src)), 1816 (V_XOR_B32_e64 (S_MOV_B32 (i32 0x00008000)), VGPR_32:$src) 1817>; 1818 1819def : GCNPat < 1820 (fneg (fabs (f16 VGPR_32:$src))), 1821 (V_OR_B32_e64 (S_MOV_B32 (i32 0x00008000)), VGPR_32:$src) // Set sign bit 1822>; 1823 1824def : GCNPat < 1825 (fneg (v2f16 VGPR_32:$src)), 1826 (V_XOR_B32_e64 (S_MOV_B32 (i32 0x80008000)), VGPR_32:$src) 1827>; 1828 1829def : GCNPat < 1830 (fabs (v2f16 VGPR_32:$src)), 1831 (V_AND_B32_e64 (S_MOV_B32 (i32 0x7fff7fff)), VGPR_32:$src) 1832>; 1833 1834def : GCNPat < 1835 (fneg (v2f16 (fabs VGPR_32:$src))), 1836 (V_OR_B32_e64 (S_MOV_B32 (i32 0x80008000)), VGPR_32:$src) 1837>; 1838 1839def : GCNPat < 1840 (fabs (f64 VReg_64:$src)), 1841 (REG_SEQUENCE VReg_64, 1842 (i32 (EXTRACT_SUBREG VReg_64:$src, sub0)), 1843 sub0, 1844 (V_AND_B32_e64 (i32 (S_MOV_B32 (i32 0x7fffffff))), 1845 (i32 (EXTRACT_SUBREG VReg_64:$src, sub1))), 1846 sub1) 1847>; 1848 1849def : GCNPat < 1850 (fneg (f64 VReg_64:$src)), 1851 (REG_SEQUENCE VReg_64, 1852 (i32 (EXTRACT_SUBREG VReg_64:$src, sub0)), 1853 sub0, 1854 (V_XOR_B32_e64 (i32 (S_MOV_B32 (i32 0x80000000))), 1855 (i32 (EXTRACT_SUBREG VReg_64:$src, sub1))), 1856 sub1) 1857>; 1858 1859def : GCNPat < 1860 (fneg (fabs (f64 VReg_64:$src))), 1861 (REG_SEQUENCE VReg_64, 1862 (i32 (EXTRACT_SUBREG VReg_64:$src, sub0)), 1863 sub0, 1864 (V_OR_B32_e64 (i32 (S_MOV_B32 (i32 0x80000000))), 1865 (i32 (EXTRACT_SUBREG VReg_64:$src, sub1))), 1866 sub1) 1867>; 1868 1869def : GCNPat < 1870 (DivergentUnaryFrag<fneg> (v2f32 VReg_64:$src)), 1871 (V_PK_ADD_F32 11 /* OP_SEL_1 | NEG_LO | HEG_HI */, VReg_64:$src, 1872 11 /* OP_SEL_1 | NEG_LO | HEG_HI */, 0, 1873 0, 0, 0, 0, 0) 1874> { 1875 let SubtargetPredicate = HasPackedFP32Ops; 1876} 1877 1878def : GCNPat < 1879 (fcopysign f16:$src0, f16:$src1), 1880 (V_BFI_B32_e64 (S_MOV_B32 (i32 0x00007fff)), $src0, $src1) 1881>; 1882 1883def : GCNPat < 1884 (fcopysign f32:$src0, f16:$src1), 1885 (V_BFI_B32_e64 (S_MOV_B32 (i32 0x7fffffff)), $src0, 1886 (V_LSHLREV_B32_e64 (i32 16), $src1)) 1887>; 1888 1889def : GCNPat < 1890 (fcopysign f64:$src0, f16:$src1), 1891 (REG_SEQUENCE SReg_64, 1892 (i32 (EXTRACT_SUBREG $src0, sub0)), sub0, 1893 (V_BFI_B32_e64 (S_MOV_B32 (i32 0x7fffffff)), (i32 (EXTRACT_SUBREG $src0, sub1)), 1894 (V_LSHLREV_B32_e64 (i32 16), $src1)), sub1) 1895>; 1896 1897def : GCNPat < 1898 (fcopysign f16:$src0, f32:$src1), 1899 (V_BFI_B32_e64 (S_MOV_B32 (i32 0x00007fff)), $src0, 1900 (V_LSHRREV_B32_e64 (i32 16), $src1)) 1901>; 1902 1903def : GCNPat < 1904 (fcopysign f16:$src0, f64:$src1), 1905 (V_BFI_B32_e64 (S_MOV_B32 (i32 0x00007fff)), $src0, 1906 (V_LSHRREV_B32_e64 (i32 16), (EXTRACT_SUBREG $src1, sub1))) 1907>; 1908 1909/********** ================== **********/ 1910/********** Immediate Patterns **********/ 1911/********** ================== **********/ 1912 1913def : GCNPat < 1914 (VGPRImm<(i32 imm)>:$imm), 1915 (V_MOV_B32_e32 imm:$imm) 1916>; 1917 1918def : GCNPat < 1919 (VGPRImm<(f32 fpimm)>:$imm), 1920 
(V_MOV_B32_e32 (f32 (bitcast_fpimm_to_i32 $imm))) 1921>; 1922 1923def : GCNPat < 1924 (i32 imm:$imm), 1925 (S_MOV_B32 imm:$imm) 1926>; 1927 1928def : GCNPat < 1929 (VGPRImm<(SIlds tglobaladdr:$ga)>), 1930 (V_MOV_B32_e32 $ga) 1931>; 1932 1933def : GCNPat < 1934 (SIlds tglobaladdr:$ga), 1935 (S_MOV_B32 $ga) 1936>; 1937 1938// FIXME: Workaround for ordering issue with peephole optimizer where 1939// a register class copy interferes with immediate folding. Should 1940// use s_mov_b32, which can be shrunk to s_movk_i32 1941def : GCNPat < 1942 (VGPRImm<(f16 fpimm)>:$imm), 1943 (V_MOV_B32_e32 (f16 (bitcast_fpimm_to_i32 $imm))) 1944>; 1945 1946def : GCNPat < 1947 (f32 fpimm:$imm), 1948 (S_MOV_B32 (f32 (bitcast_fpimm_to_i32 $imm))) 1949>; 1950 1951def : GCNPat < 1952 (f16 fpimm:$imm), 1953 (S_MOV_B32 (i32 (bitcast_fpimm_to_i32 $imm))) 1954>; 1955 1956def : GCNPat < 1957 (p5 frameindex:$fi), 1958 (V_MOV_B32_e32 (p5 (frameindex_to_targetframeindex $fi))) 1959>; 1960 1961def : GCNPat < 1962 (p5 frameindex:$fi), 1963 (S_MOV_B32 (p5 (frameindex_to_targetframeindex $fi))) 1964>; 1965 1966def : GCNPat < 1967 (i64 InlineImm64:$imm), 1968 (S_MOV_B64 InlineImm64:$imm) 1969>; 1970 1971// XXX - Should this use a s_cmp to set SCC? 1972 1973// Set to sign-extended 64-bit value (true = -1, false = 0) 1974def : GCNPat < 1975 (i1 imm:$imm), 1976 (S_MOV_B64 (i64 (as_i64imm $imm))) 1977> { 1978 let WaveSizePredicate = isWave64; 1979} 1980 1981def : GCNPat < 1982 (i1 imm:$imm), 1983 (S_MOV_B32 (i32 (as_i32imm $imm))) 1984> { 1985 let WaveSizePredicate = isWave32; 1986} 1987 1988def : GCNPat < 1989 (f64 InlineImmFP64:$imm), 1990 (S_MOV_B64 (f64 (bitcast_fpimm_to_i64 InlineImmFP64:$imm))) 1991>; 1992 1993/********** ================== **********/ 1994/********** Intrinsic Patterns **********/ 1995/********** ================== **********/ 1996 1997def : GCNPat < 1998 (f32 (fpow (VOP3Mods f32:$src0, i32:$src0_mods), (VOP3Mods f32:$src1, i32:$src1_mods))), 1999 (V_EXP_F32_e64 SRCMODS.NONE, (V_MUL_LEGACY_F32_e64 $src1_mods, $src1, SRCMODS.NONE, (V_LOG_F32_e64 $src0_mods, $src0), 0, 0)) 2000>; 2001 2002def : GCNPat < 2003 (i32 (sext i1:$src0)), 2004 (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0), 2005 /*src1mod*/(i32 0), /*src1*/(i32 -1), $src0) 2006>; 2007 2008class Ext32Pat <SDNode ext> : GCNPat < 2009 (i32 (ext i1:$src0)), 2010 (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0), 2011 /*src1mod*/(i32 0), /*src1*/(i32 1), $src0) 2012>; 2013 2014def : Ext32Pat <zext>; 2015def : Ext32Pat <anyext>; 2016 2017// The multiplication scales from [0,1) to the unsigned integer range, 2018// rounding down a bit to avoid unwanted overflow. 
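// CONST.FP_4294966784 below is 2^32 - 512 = (2^23 - 1) * 2^9, an exactly
// representable f32 just below 2^32, which leaves headroom so the scaled
// value fed to V_CVT_U32_F32 does not overflow past 2^32.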
2019def : GCNPat < 2020 (AMDGPUurecip i32:$src0), 2021 (V_CVT_U32_F32_e32 2022 (V_MUL_F32_e32 (i32 CONST.FP_4294966784), 2023 (V_RCP_IFLAG_F32_e32 (V_CVT_F32_U32_e32 $src0)))) 2024>; 2025 2026//===----------------------------------------------------------------------===// 2027// VOP3 Patterns 2028//===----------------------------------------------------------------------===// 2029 2030def : IMad24Pat<V_MAD_I32_I24_e64, 1>; 2031def : UMad24Pat<V_MAD_U32_U24_e64, 1>; 2032 2033// BFI patterns 2034 2035def BFIImm32 : PatFrag< 2036 (ops node:$x, node:$y, node:$z), 2037 (i32 (DivergentBinFrag<or> (and node:$y, node:$x), (and node:$z, imm))), 2038 [{ 2039 auto *X = dyn_cast<ConstantSDNode>(N->getOperand(0)->getOperand(1)); 2040 auto *NotX = dyn_cast<ConstantSDNode>(N->getOperand(1)->getOperand(1)); 2041 return X && NotX && 2042 ~(unsigned)X->getZExtValue() == (unsigned)NotX->getZExtValue(); 2043 }] 2044>; 2045 2046// Definition from ISA doc: 2047// (y & x) | (z & ~x) 2048def : AMDGPUPat < 2049 (DivergentBinFrag<or> (and i32:$y, i32:$x), (and i32:$z, (not i32:$x))), 2050 (V_BFI_B32_e64 VSrc_b32:$x, VSrc_b32:$y, VSrc_b32:$z) 2051>; 2052 2053// (y & C) | (z & ~C) 2054def : AMDGPUPat < 2055 (BFIImm32 i32:$x, i32:$y, i32:$z), 2056 (V_BFI_B32_e64 VSrc_b32:$x, VSrc_b32:$y, VSrc_b32:$z) 2057>; 2058 2059// 64-bit version 2060def : AMDGPUPat < 2061 (DivergentBinFrag<or> (and i64:$y, i64:$x), (and i64:$z, (not i64:$x))), 2062 (REG_SEQUENCE VReg_64, 2063 (V_BFI_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$x, sub0)), 2064 (i32 (EXTRACT_SUBREG VReg_64:$y, sub0)), 2065 (i32 (EXTRACT_SUBREG VReg_64:$z, sub0))), sub0, 2066 (V_BFI_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$x, sub1)), 2067 (i32 (EXTRACT_SUBREG VReg_64:$y, sub1)), 2068 (i32 (EXTRACT_SUBREG VReg_64:$z, sub1))), sub1) 2069>; 2070 2071// SHA-256 Ch function 2072// z ^ (x & (y ^ z)) 2073def : AMDGPUPat < 2074 (DivergentBinFrag<xor> i32:$z, (and i32:$x, (xor i32:$y, i32:$z))), 2075 (V_BFI_B32_e64 VSrc_b32:$x, VSrc_b32:$y, VSrc_b32:$z) 2076>; 2077 2078// 64-bit version 2079def : AMDGPUPat < 2080 (DivergentBinFrag<xor> i64:$z, (and i64:$x, (xor i64:$y, i64:$z))), 2081 (REG_SEQUENCE VReg_64, 2082 (V_BFI_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$x, sub0)), 2083 (i32 (EXTRACT_SUBREG VReg_64:$y, sub0)), 2084 (i32 (EXTRACT_SUBREG VReg_64:$z, sub0))), sub0, 2085 (V_BFI_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$x, sub1)), 2086 (i32 (EXTRACT_SUBREG VReg_64:$y, sub1)), 2087 (i32 (EXTRACT_SUBREG VReg_64:$z, sub1))), sub1) 2088>; 2089 2090def : AMDGPUPat < 2091 (fcopysign f32:$src0, f32:$src1), 2092 (V_BFI_B32_e64 (S_MOV_B32 (i32 0x7fffffff)), $src0, $src1) 2093>; 2094 2095def : AMDGPUPat < 2096 (fcopysign f32:$src0, f64:$src1), 2097 (V_BFI_B32_e64 (S_MOV_B32 (i32 0x7fffffff)), $src0, 2098 (i32 (EXTRACT_SUBREG SReg_64:$src1, sub1))) 2099>; 2100 2101def : AMDGPUPat < 2102 (fcopysign f64:$src0, f64:$src1), 2103 (REG_SEQUENCE SReg_64, 2104 (i32 (EXTRACT_SUBREG $src0, sub0)), sub0, 2105 (V_BFI_B32_e64 (S_MOV_B32 (i32 0x7fffffff)), 2106 (i32 (EXTRACT_SUBREG SReg_64:$src0, sub1)), 2107 (i32 (EXTRACT_SUBREG SReg_64:$src1, sub1))), sub1) 2108>; 2109 2110def : AMDGPUPat < 2111 (fcopysign f64:$src0, f32:$src1), 2112 (REG_SEQUENCE SReg_64, 2113 (i32 (EXTRACT_SUBREG $src0, sub0)), sub0, 2114 (V_BFI_B32_e64 (S_MOV_B32 (i32 0x7fffffff)), 2115 (i32 (EXTRACT_SUBREG SReg_64:$src0, sub1)), 2116 $src1), sub1) 2117>; 2118 2119def : ROTRPattern <V_ALIGNBIT_B32_e64>; 2120 2121def : GCNPat<(i32 (trunc (srl i64:$src0, (and i32:$src1, (i32 31))))), 2122 (V_ALIGNBIT_B32_e64 (i32 (EXTRACT_SUBREG (i64 $src0), 
sub1)), 2123 (i32 (EXTRACT_SUBREG (i64 $src0), sub0)), $src1)>; 2124 2125def : GCNPat<(i32 (trunc (srl i64:$src0, (i32 ShiftAmt32Imm:$src1)))), 2126 (V_ALIGNBIT_B32_e64 (i32 (EXTRACT_SUBREG (i64 $src0), sub1)), 2127 (i32 (EXTRACT_SUBREG (i64 $src0), sub0)), $src1)>; 2128 2129/********** ====================== **********/ 2130/********** Indirect addressing **********/ 2131/********** ====================== **********/ 2132 2133multiclass SI_INDIRECT_Pattern <ValueType vt, ValueType eltvt, string VecSize> { 2134 // Extract with offset 2135 def : GCNPat< 2136 (eltvt (extractelt vt:$src, (MOVRELOffset i32:$idx, (i32 imm:$offset)))), 2137 (!cast<Instruction>("SI_INDIRECT_SRC_"#VecSize) $src, $idx, imm:$offset) 2138 >; 2139 2140 // Insert with offset 2141 def : GCNPat< 2142 (insertelt vt:$src, eltvt:$val, (MOVRELOffset i32:$idx, (i32 imm:$offset))), 2143 (!cast<Instruction>("SI_INDIRECT_DST_"#VecSize) $src, $idx, imm:$offset, $val) 2144 >; 2145} 2146 2147defm : SI_INDIRECT_Pattern <v2f32, f32, "V2">; 2148defm : SI_INDIRECT_Pattern <v4f32, f32, "V4">; 2149defm : SI_INDIRECT_Pattern <v8f32, f32, "V8">; 2150defm : SI_INDIRECT_Pattern <v9f32, f32, "V9">; 2151defm : SI_INDIRECT_Pattern <v10f32, f32, "V10">; 2152defm : SI_INDIRECT_Pattern <v11f32, f32, "V11">; 2153defm : SI_INDIRECT_Pattern <v12f32, f32, "V12">; 2154defm : SI_INDIRECT_Pattern <v16f32, f32, "V16">; 2155defm : SI_INDIRECT_Pattern <v32f32, f32, "V32">; 2156 2157defm : SI_INDIRECT_Pattern <v2i32, i32, "V2">; 2158defm : SI_INDIRECT_Pattern <v4i32, i32, "V4">; 2159defm : SI_INDIRECT_Pattern <v8i32, i32, "V8">; 2160defm : SI_INDIRECT_Pattern <v9i32, i32, "V9">; 2161defm : SI_INDIRECT_Pattern <v10i32, i32, "V10">; 2162defm : SI_INDIRECT_Pattern <v11i32, i32, "V11">; 2163defm : SI_INDIRECT_Pattern <v12i32, i32, "V12">; 2164defm : SI_INDIRECT_Pattern <v16i32, i32, "V16">; 2165defm : SI_INDIRECT_Pattern <v32i32, i32, "V32">; 2166 2167//===----------------------------------------------------------------------===// 2168// SAD Patterns 2169//===----------------------------------------------------------------------===// 2170 2171def : GCNPat < 2172 (add (sub_oneuse (umax i32:$src0, i32:$src1), 2173 (umin i32:$src0, i32:$src1)), 2174 i32:$src2), 2175 (V_SAD_U32_e64 $src0, $src1, $src2, (i1 0)) 2176>; 2177 2178def : GCNPat < 2179 (add (select_oneuse (i1 (setugt i32:$src0, i32:$src1)), 2180 (sub i32:$src0, i32:$src1), 2181 (sub i32:$src1, i32:$src0)), 2182 i32:$src2), 2183 (V_SAD_U32_e64 $src0, $src1, $src2, (i1 0)) 2184>; 2185 2186//===----------------------------------------------------------------------===// 2187// Conversion Patterns 2188//===----------------------------------------------------------------------===// 2189def : GCNPat<(i32 (UniformSextInreg<i1> i32:$src)), 2190 (S_BFE_I32 i32:$src, (i32 65536))>; // 0 | 1 << 16 2191 2192// Handle sext_inreg in i64 2193def : GCNPat < 2194 (i64 (UniformSextInreg<i1> i64:$src)), 2195 (S_BFE_I64 i64:$src, (i32 0x10000)) // 0 | 1 << 16 2196>; 2197 2198def : GCNPat < 2199 (i16 (UniformSextInreg<i1> i16:$src)), 2200 (S_BFE_I32 $src, (i32 0x00010000)) // 0 | 1 << 16 2201>; 2202 2203def : GCNPat < 2204 (i16 (UniformSextInreg<i8> i16:$src)), 2205 (S_BFE_I32 $src, (i32 0x80000)) // 0 | 8 << 16 2206>; 2207 2208def : GCNPat < 2209 (i64 (UniformSextInreg<i8> i64:$src)), 2210 (S_BFE_I64 i64:$src, (i32 0x80000)) // 0 | 8 << 16 2211>; 2212 2213def : GCNPat < 2214 (i64 (UniformSextInreg<i16> i64:$src)), 2215 (S_BFE_I64 i64:$src, (i32 0x100000)) // 0 | 16 << 16 2216>; 2217 2218def : GCNPat < 2219 (i64 
(UniformSextInreg<i32> i64:$src)), 2220 (S_BFE_I64 i64:$src, (i32 0x200000)) // 0 | 32 << 16 2221>; 2222 2223def : GCNPat< 2224 (i32 (DivergentSextInreg<i1> i32:$src)), 2225 (V_BFE_I32_e64 i32:$src, (i32 0), (i32 1))>; 2226 2227def : GCNPat < 2228 (i16 (DivergentSextInreg<i1> i16:$src)), 2229 (V_BFE_I32_e64 $src, (i32 0), (i32 1)) 2230>; 2231 2232def : GCNPat < 2233 (i16 (DivergentSextInreg<i8> i16:$src)), 2234 (V_BFE_I32_e64 $src, (i32 0), (i32 8)) 2235>; 2236 2237def : GCNPat< 2238 (i32 (DivergentSextInreg<i8> i32:$src)), 2239 (V_BFE_I32_e64 i32:$src, (i32 0), (i32 8)) 2240>; 2241 2242def : GCNPat < 2243 (i32 (DivergentSextInreg<i16> i32:$src)), 2244 (V_BFE_I32_e64 $src, (i32 0), (i32 16)) 2245>; 2246 2247def : GCNPat < 2248 (i64 (DivergentSextInreg<i1> i64:$src)), 2249 (REG_SEQUENCE VReg_64, 2250 (V_BFE_I32_e64 (i32 (EXTRACT_SUBREG i64:$src, sub0)), (i32 0), (i32 1)), sub0, 2251 (V_ASHRREV_I32_e32 (i32 31), (V_BFE_I32_e64 (i32 (EXTRACT_SUBREG i64:$src, sub0)), (i32 0), (i32 1))), sub1) 2252>; 2253 2254def : GCNPat < 2255 (i64 (DivergentSextInreg<i8> i64:$src)), 2256 (REG_SEQUENCE VReg_64, 2257 (V_BFE_I32_e64 (i32 (EXTRACT_SUBREG i64:$src, sub0)), (i32 0), (i32 8)), sub0, 2258 (V_ASHRREV_I32_e32 (i32 31), (V_BFE_I32_e64 (i32 (EXTRACT_SUBREG i64:$src, sub0)), (i32 0), (i32 8))), sub1) 2259>; 2260 2261def : GCNPat < 2262 (i64 (DivergentSextInreg<i16> i64:$src)), 2263 (REG_SEQUENCE VReg_64, 2264 (V_BFE_I32_e64 (i32 (EXTRACT_SUBREG i64:$src, sub0)), (i32 0), (i32 16)), sub0, 2265 (V_ASHRREV_I32_e32 (i32 31), (V_BFE_I32_e64 (i32 (EXTRACT_SUBREG i64:$src, sub0)), (i32 0), (i32 16))), sub1) 2266>; 2267 2268def : GCNPat < 2269 (i64 (DivergentSextInreg<i32> i64:$src)), 2270 (REG_SEQUENCE VReg_64, 2271 (i32 (EXTRACT_SUBREG i64:$src, sub0)), sub0, 2272 (V_ASHRREV_I32_e32 (i32 31), (i32 (EXTRACT_SUBREG i64:$src, sub0))), sub1) 2273>; 2274 2275def : GCNPat < 2276 (i64 (zext i32:$src)), 2277 (REG_SEQUENCE SReg_64, $src, sub0, (S_MOV_B32 (i32 0)), sub1) 2278>; 2279 2280def : GCNPat < 2281 (i64 (anyext i32:$src)), 2282 (REG_SEQUENCE SReg_64, $src, sub0, (i32 (IMPLICIT_DEF)), sub1) 2283>; 2284 2285class ZExt_i64_i1_Pat <SDNode ext> : GCNPat < 2286 (i64 (ext i1:$src)), 2287 (REG_SEQUENCE VReg_64, 2288 (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0), 2289 /*src1mod*/(i32 0), /*src1*/(i32 1), $src), 2290 sub0, (S_MOV_B32 (i32 0)), sub1) 2291>; 2292 2293 2294def : ZExt_i64_i1_Pat<zext>; 2295def : ZExt_i64_i1_Pat<anyext>; 2296 2297// FIXME: We need to use COPY_TO_REGCLASS to work-around the fact that 2298// REG_SEQUENCE patterns don't support instructions with multiple outputs. 
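// The high half of a sign-extended value is just the sign bit replicated,
// e.g. sign-extending i32 0x80000000 to i64 gives 0xffffffff80000000, hence
// the arithmetic shift right by 31 for the sub1 half in the patterns below.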
2299def : GCNPat < 2300 (i64 (UniformUnaryFrag<sext> i32:$src)), 2301 (REG_SEQUENCE SReg_64, $src, sub0, 2302 (i32 (COPY_TO_REGCLASS (S_ASHR_I32 $src, (i32 31)), SReg_32_XM0)), sub1) 2303>; 2304 2305def : GCNPat < 2306 (i64 (DivergentUnaryFrag<sext> i32:$src)), 2307 (REG_SEQUENCE VReg_64, $src, sub0, 2308 (i32 (COPY_TO_REGCLASS (V_ASHRREV_I32_e64 (i32 31), $src), VGPR_32)), sub1) 2309>; 2310 2311def : GCNPat < 2312 (i64 (sext i1:$src)), 2313 (REG_SEQUENCE VReg_64, 2314 (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0), 2315 /*src1mod*/(i32 0), /*src1*/(i32 -1), $src), sub0, 2316 (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0), 2317 /*src1mod*/(i32 0), /*src1*/(i32 -1), $src), sub1) 2318>; 2319 2320class FPToI1Pat<Instruction Inst, int KOne, ValueType kone_type, ValueType vt, SDPatternOperator fp_to_int> : GCNPat < 2321 (i1 (fp_to_int (vt (VOP3Mods vt:$src0, i32:$src0_modifiers)))), 2322 (i1 (Inst 0, (kone_type KOne), $src0_modifiers, $src0, DSTCLAMP.NONE)) 2323>; 2324 2325def : FPToI1Pat<V_CMP_EQ_F16_e64, CONST.FP16_ONE, i16, f16, fp_to_uint>; 2326def : FPToI1Pat<V_CMP_EQ_F16_e64, CONST.FP16_NEG_ONE, i16, f16, fp_to_sint>; 2327def : FPToI1Pat<V_CMP_EQ_F32_e64, CONST.FP32_ONE, i32, f32, fp_to_uint>; 2328def : FPToI1Pat<V_CMP_EQ_F32_e64, CONST.FP32_NEG_ONE, i32, f32, fp_to_sint>; 2329def : FPToI1Pat<V_CMP_EQ_F64_e64, CONST.FP64_ONE, i64, f64, fp_to_uint>; 2330def : FPToI1Pat<V_CMP_EQ_F64_e64, CONST.FP64_NEG_ONE, i64, f64, fp_to_sint>; 2331 2332// If we need to perform a logical operation on i1 values, we need to 2333// use vector comparisons since there is only one SCC register. Vector 2334// comparisons may write to a pair of SGPRs or a single SGPR, so treat 2335// these as 32 or 64-bit comparisons. When legalizing SGPR copies, 2336// instructions resulting in the copies from SCC to these instructions 2337// will be moved to the VALU. 
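// Note that 1-bit add and sub are both the same as xor (mod-2 arithmetic),
// and adding the 1-bit constant -1 (i.e. true) is a logical not, which is
// why the patterns below select S_XOR_* and S_NOT_*.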
2338 2339let WaveSizePredicate = isWave64 in { 2340def : GCNPat < 2341 (i1 (and i1:$src0, i1:$src1)), 2342 (S_AND_B64 $src0, $src1) 2343>; 2344 2345def : GCNPat < 2346 (i1 (or i1:$src0, i1:$src1)), 2347 (S_OR_B64 $src0, $src1) 2348>; 2349 2350def : GCNPat < 2351 (i1 (xor i1:$src0, i1:$src1)), 2352 (S_XOR_B64 $src0, $src1) 2353>; 2354 2355def : GCNPat < 2356 (i1 (add i1:$src0, i1:$src1)), 2357 (S_XOR_B64 $src0, $src1) 2358>; 2359 2360def : GCNPat < 2361 (i1 (sub i1:$src0, i1:$src1)), 2362 (S_XOR_B64 $src0, $src1) 2363>; 2364 2365let AddedComplexity = 1 in { 2366def : GCNPat < 2367 (i1 (add i1:$src0, (i1 -1))), 2368 (S_NOT_B64 $src0) 2369>; 2370 2371def : GCNPat < 2372 (i1 (sub i1:$src0, (i1 -1))), 2373 (S_NOT_B64 $src0) 2374>; 2375} 2376} // end isWave64 2377 2378let WaveSizePredicate = isWave32 in { 2379def : GCNPat < 2380 (i1 (and i1:$src0, i1:$src1)), 2381 (S_AND_B32 $src0, $src1) 2382>; 2383 2384def : GCNPat < 2385 (i1 (or i1:$src0, i1:$src1)), 2386 (S_OR_B32 $src0, $src1) 2387>; 2388 2389def : GCNPat < 2390 (i1 (xor i1:$src0, i1:$src1)), 2391 (S_XOR_B32 $src0, $src1) 2392>; 2393 2394def : GCNPat < 2395 (i1 (add i1:$src0, i1:$src1)), 2396 (S_XOR_B32 $src0, $src1) 2397>; 2398 2399def : GCNPat < 2400 (i1 (sub i1:$src0, i1:$src1)), 2401 (S_XOR_B32 $src0, $src1) 2402>; 2403 2404let AddedComplexity = 1 in { 2405def : GCNPat < 2406 (i1 (add i1:$src0, (i1 -1))), 2407 (S_NOT_B32 $src0) 2408>; 2409 2410def : GCNPat < 2411 (i1 (sub i1:$src0, (i1 -1))), 2412 (S_NOT_B32 $src0) 2413>; 2414} 2415} // end isWave32 2416 2417def : GCNPat < 2418 (i32 (DivergentBinFrag<xor> i32:$src0, (i32 -1))), 2419 (V_NOT_B32_e32 $src0) 2420>; 2421 2422def : GCNPat < 2423 (i64 (DivergentBinFrag<xor> i64:$src0, (i64 -1))), 2424 (REG_SEQUENCE VReg_64, 2425 (V_NOT_B32_e32 (i32 (EXTRACT_SUBREG i64:$src0, sub0))), sub0, 2426 (V_NOT_B32_e32 (i32 (EXTRACT_SUBREG i64:$src0, sub1))), sub1 2427 ) 2428>; 2429 2430let SubtargetPredicate = NotHasTrue16BitInsts in 2431def : GCNPat < 2432 (f16 (sint_to_fp i1:$src)), 2433 (V_CVT_F16_F32_e32 ( 2434 V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0), 2435 /*src1mod*/(i32 0), /*src1*/(i32 CONST.FP32_NEG_ONE), 2436 SSrc_i1:$src)) 2437>; 2438 2439let SubtargetPredicate = HasTrue16BitInsts in 2440def : GCNPat < 2441 (f16 (sint_to_fp i1:$src)), 2442 (V_CVT_F16_F32_t16_e32 ( 2443 V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0), 2444 /*src1mod*/(i32 0), /*src1*/(i32 CONST.FP32_NEG_ONE), 2445 SSrc_i1:$src)) 2446>; 2447 2448let SubtargetPredicate = NotHasTrue16BitInsts in 2449def : GCNPat < 2450 (f16 (uint_to_fp i1:$src)), 2451 (V_CVT_F16_F32_e32 ( 2452 V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0), 2453 /*src1mod*/(i32 0), /*src1*/(i32 CONST.FP32_ONE), 2454 SSrc_i1:$src)) 2455>; 2456let SubtargetPredicate = HasTrue16BitInsts in 2457def : GCNPat < 2458 (f16 (uint_to_fp i1:$src)), 2459 (V_CVT_F16_F32_t16_e32 ( 2460 V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0), 2461 /*src1mod*/(i32 0), /*src1*/(i32 CONST.FP32_ONE), 2462 SSrc_i1:$src)) 2463>; 2464 2465def : GCNPat < 2466 (f32 (sint_to_fp i1:$src)), 2467 (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0), 2468 /*src1mod*/(i32 0), /*src1*/(i32 CONST.FP32_NEG_ONE), 2469 SSrc_i1:$src) 2470>; 2471 2472def : GCNPat < 2473 (f32 (uint_to_fp i1:$src)), 2474 (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0), 2475 /*src1mod*/(i32 0), /*src1*/(i32 CONST.FP32_ONE), 2476 SSrc_i1:$src) 2477>; 2478 2479def : GCNPat < 2480 (f64 (sint_to_fp i1:$src)), 2481 (V_CVT_F64_I32_e32 (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0), 
2482 /*src1mod*/(i32 0), /*src1*/(i32 -1), 2483 SSrc_i1:$src)) 2484>; 2485 2486def : GCNPat < 2487 (f64 (uint_to_fp i1:$src)), 2488 (V_CVT_F64_U32_e32 (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0), 2489 /*src1mod*/(i32 0), /*src1*/(i32 1), 2490 SSrc_i1:$src)) 2491>; 2492 2493//===----------------------------------------------------------------------===// 2494// Miscellaneous Patterns 2495//===----------------------------------------------------------------------===// 2496 2497// Eliminate a zero extension from an fp16 operation if it already 2498// zeros the high bits of the 32-bit register. 2499// 2500// This is complicated on gfx9+. Some instructions maintain the legacy 2501// zeroing behavior, but others preserve the high bits. Some have a 2502// control bit to change the behavior. We can't simply say with 2503// certainty what the source behavior is without more context on how 2504// the src is lowered. e.g. fptrunc + fma may be lowered to a 2505// v_fma_mix* instruction which does not zero, or may not. 2506def : GCNPat< 2507 (i32 (DivergentUnaryFrag<abs> i32:$src)), 2508 (V_MAX_I32_e64 (V_SUB_CO_U32_e32 (i32 0), $src), $src)>; 2509 2510let AddedComplexity = 1 in { 2511def : GCNPat< 2512 (i32 (DivergentUnaryFrag<abs> i32:$src)), 2513 (V_MAX_I32_e64 (V_SUB_U32_e32 (i32 0), $src), $src)>{ 2514 let SubtargetPredicate = HasAddNoCarryInsts; 2515} 2516} // AddedComplexity = 1 2517 2518def : GCNPat< 2519 (i32 (DivergentUnaryFrag<zext> i16:$src)), 2520 (V_AND_B32_e64 (S_MOV_B32 (i32 0xffff)), $src) 2521>; 2522 2523def : GCNPat< 2524 (i64 (DivergentUnaryFrag<zext> i16:$src)), 2525 (REG_SEQUENCE VReg_64, 2526 (V_AND_B32_e64 (S_MOV_B32 (i32 0xffff)), $src), sub0, 2527 (S_MOV_B32 (i32 0)), sub1) 2528>; 2529 2530def : GCNPat< 2531 (i32 (zext (i16 (bitconvert fp16_zeros_high_16bits:$src)))), 2532 (COPY VSrc_b16:$src)>; 2533 2534def : GCNPat < 2535 (i32 (trunc i64:$a)), 2536 (EXTRACT_SUBREG $a, sub0) 2537>; 2538 2539def : GCNPat < 2540 (i1 (UniformUnaryFrag<trunc> i32:$a)), 2541 (S_CMP_EQ_U32 (S_AND_B32 (i32 1), $a), (i32 1)) 2542>; 2543 2544def : GCNPat < 2545 (i1 (UniformUnaryFrag<trunc> i16:$a)), 2546 (S_CMP_EQ_U32 (S_AND_B32 (i32 1), $a), (i32 1)) 2547>; 2548 2549def : GCNPat < 2550 (i1 (UniformUnaryFrag<trunc> i64:$a)), 2551 (S_CMP_EQ_U32 (S_AND_B32 (i32 1), 2552 (i32 (EXTRACT_SUBREG $a, sub0))), (i32 1)) 2553>; 2554 2555def : GCNPat < 2556 (i1 (DivergentUnaryFrag<trunc> i32:$a)), 2557 (V_CMP_EQ_U32_e64 (V_AND_B32_e64 (i32 1), $a), (i32 1)) 2558>; 2559 2560def : GCNPat < 2561 (i1 (DivergentUnaryFrag<trunc> i16:$a)), 2562 (V_CMP_EQ_U32_e64 (V_AND_B32_e64 (i32 1), $a), (i32 1)) 2563>; 2564 2565def IMMBitSelConst : SDNodeXForm<imm, [{ 2566 return CurDAG->getTargetConstant(1ULL << N->getZExtValue(), SDLoc(N), 2567 MVT::i32); 2568}]>; 2569 2570// Matching separate SRL and TRUNC instructions 2571// with dependent operands (SRL dest is source of TRUNC) 2572// generates three instructions. However, by using bit shifts, 2573// the V_LSHRREV_B32_e64 result can be directly used in the 2574// operand of the V_AND_B32_e64 instruction: 2575// (trunc i32 (srl i32 $a, i32 $b)) -> 2576// v_and_b32_e64 $a, (1 << $b), $a 2577// v_cmp_ne_u32_e64 $a, 0, $a 2578 2579// Handle the VALU case. 2580def : GCNPat < 2581 (i1 (DivergentUnaryFrag<trunc> (i32 (srl i32:$a, (i32 imm:$b))))), 2582 (V_CMP_NE_U32_e64 (V_AND_B32_e64 (i32 (IMMBitSelConst $b)), $a), 2583 (i32 0)) 2584>; 2585 2586// Handle the scalar case. 
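// IMMBitSelConst materializes the single-bit mask (1 << $b), so the uniform
// form only needs an S_AND_B32 with that mask followed by a compare against
// zero.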
2587def : GCNPat < 2588 (i1 (UniformUnaryFrag<trunc> (i32 (srl i32:$a, (i32 imm:$b))))), 2589 (S_CMP_LG_U32 (S_AND_B32 (i32 (IMMBitSelConst $b)), $a), 2590 (i32 0)) 2591>; 2592 2593def : GCNPat < 2594 (i1 (DivergentUnaryFrag<trunc> i64:$a)), 2595 (V_CMP_EQ_U32_e64 (V_AND_B32_e64 (i32 1), 2596 (i32 (EXTRACT_SUBREG $a, sub0))), (i32 1)) 2597>; 2598 2599def : GCNPat < 2600 (i32 (bswap i32:$a)), 2601 (V_BFI_B32_e64 (S_MOV_B32 (i32 0x00ff00ff)), 2602 (V_ALIGNBIT_B32_e64 VSrc_b32:$a, VSrc_b32:$a, (i32 24)), 2603 (V_ALIGNBIT_B32_e64 VSrc_b32:$a, VSrc_b32:$a, (i32 8))) 2604>; 2605 2606// FIXME: This should have been narrowed to i32 during legalization. 2607// This pattern should also be skipped for GlobalISel 2608def : GCNPat < 2609 (i64 (bswap i64:$a)), 2610 (REG_SEQUENCE VReg_64, 2611 (V_BFI_B32_e64 (S_MOV_B32 (i32 0x00ff00ff)), 2612 (V_ALIGNBIT_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$a, sub1)), 2613 (i32 (EXTRACT_SUBREG VReg_64:$a, sub1)), 2614 (i32 24)), 2615 (V_ALIGNBIT_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$a, sub1)), 2616 (i32 (EXTRACT_SUBREG VReg_64:$a, sub1)), 2617 (i32 8))), 2618 sub0, 2619 (V_BFI_B32_e64 (S_MOV_B32 (i32 0x00ff00ff)), 2620 (V_ALIGNBIT_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$a, sub0)), 2621 (i32 (EXTRACT_SUBREG VReg_64:$a, sub0)), 2622 (i32 24)), 2623 (V_ALIGNBIT_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$a, sub0)), 2624 (i32 (EXTRACT_SUBREG VReg_64:$a, sub0)), 2625 (i32 8))), 2626 sub1) 2627>; 2628 2629// FIXME: The AddedComplexity should not be needed, but in GlobalISel 2630// the BFI pattern ends up taking precedence without it. 2631let SubtargetPredicate = isGFX8Plus, AddedComplexity = 1 in { 2632// Magic number: 3 | (2 << 8) | (1 << 16) | (0 << 24) 2633// 2634// My reading of the manual suggests we should be using src0 for the 2635// register value, but this is what seems to work. 2636def : GCNPat < 2637 (i32 (bswap i32:$a)), 2638 (V_PERM_B32_e64 (i32 0), VSrc_b32:$a, (S_MOV_B32 (i32 0x00010203))) 2639>; 2640 2641// FIXME: This should have been narrowed to i32 during legalization. 2642// This pattern should also be skipped for GlobalISel 2643def : GCNPat < 2644 (i64 (bswap i64:$a)), 2645 (REG_SEQUENCE VReg_64, 2646 (V_PERM_B32_e64 (i32 0), (EXTRACT_SUBREG VReg_64:$a, sub1), 2647 (S_MOV_B32 (i32 0x00010203))), 2648 sub0, 2649 (V_PERM_B32_e64 (i32 0), (EXTRACT_SUBREG VReg_64:$a, sub0), 2650 (S_MOV_B32 (i32 0x00010203))), 2651 sub1) 2652>; 2653 2654// Magic number: 1 | (0 << 8) | (12 << 16) | (12 << 24) 2655// The 12s emit 0s. 2656def : GCNPat < 2657 (i16 (bswap i16:$a)), 2658 (V_PERM_B32_e64 (i32 0), VSrc_b32:$a, (S_MOV_B32 (i32 0x0c0c0001))) 2659>; 2660 2661def : GCNPat < 2662 (i32 (zext (bswap i16:$a))), 2663 (V_PERM_B32_e64 (i32 0), VSrc_b32:$a, (S_MOV_B32 (i32 0x0c0c0001))) 2664>; 2665 2666// Magic number: 1 | (0 << 8) | (3 << 16) | (2 << 24) 2667def : GCNPat < 2668 (v2i16 (bswap v2i16:$a)), 2669 (V_PERM_B32_e64 (i32 0), VSrc_b32:$a, (S_MOV_B32 (i32 0x02030001))) 2670>; 2671 2672} 2673 2674def : GCNPat< 2675 (i64 (DivergentUnaryFrag<bitreverse> i64:$a)), 2676 (REG_SEQUENCE VReg_64, 2677 (V_BFREV_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$a, sub1))), sub0, 2678 (V_BFREV_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$a, sub0))), sub1)>; 2679 2680// Prefer selecting to max when legal, but using mul is always valid. 
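// Multiplying by 1.0 (below) or taking max(x, x) (further down) quiets
// signaling NaNs and flushes denormals according to the current mode, which
// matches what fcanonicalize requires.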
2681let AddedComplexity = -5 in { 2682 2683let OtherPredicates = [NotHasTrue16BitInsts] in { 2684def : GCNPat< 2685 (fcanonicalize (f16 (VOP3Mods f16:$src, i32:$src_mods))), 2686 (V_MUL_F16_e64 0, (i32 CONST.FP16_ONE), $src_mods, $src) 2687>; 2688 2689def : GCNPat< 2690 (fcanonicalize (f16 (fneg (VOP3Mods f16:$src, i32:$src_mods)))), 2691 (V_MUL_F16_e64 0, (i32 CONST.FP16_NEG_ONE), $src_mods, $src) 2692>; 2693} // End OtherPredicates 2694 2695let OtherPredicates = [HasTrue16BitInsts] in { 2696def : GCNPat< 2697 (fcanonicalize (f16 (VOP3Mods f16:$src, i32:$src_mods))), 2698 (V_MUL_F16_t16_e64 0, (i32 CONST.FP16_ONE), $src_mods, $src) 2699>; 2700 2701def : GCNPat< 2702 (fcanonicalize (f16 (fneg (VOP3Mods f16:$src, i32:$src_mods)))), 2703 (V_MUL_F16_t16_e64 0, (i32 CONST.FP16_NEG_ONE), $src_mods, $src) 2704>; 2705} // End OtherPredicates 2706 2707def : GCNPat< 2708 (fcanonicalize (v2f16 (VOP3PMods v2f16:$src, i32:$src_mods))), 2709 (V_PK_MUL_F16 0, (i32 CONST.FP16_ONE), $src_mods, $src, DSTCLAMP.NONE) 2710>; 2711 2712def : GCNPat< 2713 (fcanonicalize (f32 (VOP3Mods f32:$src, i32:$src_mods))), 2714 (V_MUL_F32_e64 0, (i32 CONST.FP32_ONE), $src_mods, $src) 2715>; 2716 2717def : GCNPat< 2718 (fcanonicalize (f32 (fneg (VOP3Mods f32:$src, i32:$src_mods)))), 2719 (V_MUL_F32_e64 0, (i32 CONST.FP32_NEG_ONE), $src_mods, $src) 2720>; 2721 2722// TODO: Handle fneg like other types. 2723def : GCNPat< 2724 (fcanonicalize (f64 (VOP3Mods f64:$src, i32:$src_mods))), 2725 (V_MUL_F64_e64 0, CONST.FP64_ONE, $src_mods, $src) 2726>; 2727} // End AddedComplexity = -5 2728 2729multiclass SelectCanonicalizeAsMax< 2730 list<Predicate> f32_preds = [], 2731 list<Predicate> f64_preds = [], 2732 list<Predicate> f16_preds = []> { 2733 def : GCNPat< 2734 (fcanonicalize (f32 (VOP3Mods f32:$src, i32:$src_mods))), 2735 (V_MAX_F32_e64 $src_mods, $src, $src_mods, $src)> { 2736 let OtherPredicates = f32_preds; 2737 } 2738 2739 def : GCNPat< 2740 (fcanonicalize (f64 (VOP3Mods f64:$src, i32:$src_mods))), 2741 (V_MAX_F64_e64 $src_mods, $src, $src_mods, $src)> { 2742 let OtherPredicates = f64_preds; 2743 } 2744 2745 def : GCNPat< 2746 (fcanonicalize (f16 (VOP3Mods f16:$src, i32:$src_mods))), 2747 (V_MAX_F16_e64 $src_mods, $src, $src_mods, $src, 0, 0)> { 2748 let OtherPredicates = !listconcat(f16_preds, [Has16BitInsts, NotHasTrue16BitInsts]); 2749 } 2750 2751 def : GCNPat< 2752 (fcanonicalize (f16 (VOP3Mods f16:$src, i32:$src_mods))), 2753 (V_MAX_F16_t16_e64 $src_mods, $src, $src_mods, $src, 0, 0)> { 2754 let OtherPredicates = !listconcat(f16_preds, [Has16BitInsts, HasTrue16BitInsts]); 2755 } 2756 2757 def : GCNPat< 2758 (fcanonicalize (v2f16 (VOP3PMods v2f16:$src, i32:$src_mods))), 2759 (V_PK_MAX_F16 $src_mods, $src, $src_mods, $src, DSTCLAMP.NONE)> { 2760 // FIXME: Should have VOP3P subtarget predicate 2761 let OtherPredicates = f16_preds; 2762 } 2763} 2764 2765// On pre-gfx9 targets, v_max_*/v_min_* did not respect the denormal 2766// mode, and would never flush. For f64, it's faster to do implement 2767// this with a max. For f16/f32 it's a wash, but prefer max when 2768// valid. 2769// 2770// FIXME: Lowering f32/f16 with max is worse since we can use a 2771// smaller encoding if the input is fneg'd. It also adds an extra 2772// register use. 2773let SubtargetPredicate = HasMinMaxDenormModes in { 2774 defm : SelectCanonicalizeAsMax<[], [], []>; 2775} // End SubtargetPredicate = HasMinMaxDenormModes 2776 2777let SubtargetPredicate = NotHasMinMaxDenormModes in { 2778 // Use the max lowering if we don't need to flush. 
2779 2780 // FIXME: We don't do use this for f32 as a workaround for the 2781 // library being compiled with the default ieee mode, but 2782 // potentially being called from flushing kernels. Really we should 2783 // not be mixing code expecting different default FP modes, but mul 2784 // works in any FP environment. 2785 defm : SelectCanonicalizeAsMax<[FalsePredicate], [FP64Denormals], [FP16Denormals]>; 2786} // End SubtargetPredicate = NotHasMinMaxDenormModes 2787 2788 2789let OtherPredicates = [HasDLInsts] in { 2790// Don't allow source modifiers. If there are any source modifiers then it's 2791// better to select fma instead of fmac. 2792def : GCNPat < 2793 (fma (f32 (VOP3NoMods f32:$src0)), 2794 (f32 (VOP3NoMods f32:$src1)), 2795 (f32 (VOP3NoMods f32:$src2))), 2796 (V_FMAC_F32_e64 SRCMODS.NONE, $src0, SRCMODS.NONE, $src1, 2797 SRCMODS.NONE, $src2) 2798>; 2799} // End OtherPredicates = [HasDLInsts] 2800 2801let SubtargetPredicate = isGFX10Plus in { 2802// Don't allow source modifiers. If there are any source modifiers then it's 2803// better to select fma instead of fmac. 2804let OtherPredicates = [NotHasTrue16BitInsts] in 2805def : GCNPat < 2806 (fma (f16 (VOP3NoMods f32:$src0)), 2807 (f16 (VOP3NoMods f32:$src1)), 2808 (f16 (VOP3NoMods f32:$src2))), 2809 (V_FMAC_F16_e64 SRCMODS.NONE, $src0, SRCMODS.NONE, $src1, 2810 SRCMODS.NONE, $src2) 2811>; 2812let OtherPredicates = [HasTrue16BitInsts] in 2813def : GCNPat < 2814 (fma (f16 (VOP3NoMods f32:$src0)), 2815 (f16 (VOP3NoMods f32:$src1)), 2816 (f16 (VOP3NoMods f32:$src2))), 2817 (V_FMAC_F16_t16_e64 SRCMODS.NONE, $src0, SRCMODS.NONE, $src1, 2818 SRCMODS.NONE, $src2) 2819>; 2820} 2821 2822let OtherPredicates = [HasFmacF64Inst] in 2823// Don't allow source modifiers. If there are any source modifiers then it's 2824// better to select fma instead of fmac. 2825def : GCNPat < 2826 (fma (f64 (VOP3NoMods f64:$src0)), 2827 (f64 (VOP3NoMods f64:$src1)), 2828 (f64 (VOP3NoMods f64:$src2))), 2829 (V_FMAC_F64_e64 SRCMODS.NONE, $src0, SRCMODS.NONE, $src1, 2830 SRCMODS.NONE, $src2) 2831>; 2832 2833// COPY is workaround tablegen bug from multiple outputs 2834// from S_LSHL_B32's multiple outputs from implicit scc def. 
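// For the build_vector patterns below: packing (0, x) is a left shift by 16,
// packing (x, 0) is a mask of the low 16 bits, and packing with an undef
// half only needs a copy or a shift since the other half is don't-care.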
2835let AddedComplexity = 1 in { 2836def : GCNPat < 2837 (v2i16 (UniformBinFrag<build_vector> (i16 0), (i16 SReg_32:$src1))), 2838 (S_LSHL_B32 SReg_32:$src1, (i16 16)) 2839>; 2840 2841def : GCNPat < 2842 (v2i16 (DivergentBinFrag<build_vector> (i16 0), (i16 VGPR_32:$src1))), 2843 (v2i16 (V_LSHLREV_B32_e64 (i16 16), VGPR_32:$src1)) 2844>; 2845 2846 2847def : GCNPat < 2848 (v2i16 (UniformBinFrag<build_vector> (i16 SReg_32:$src1), (i16 0))), 2849 (S_AND_B32 (S_MOV_B32 (i32 0xffff)), SReg_32:$src1) 2850>; 2851 2852def : GCNPat < 2853 (v2i16 (DivergentBinFrag<build_vector> (i16 VGPR_32:$src1), (i16 0))), 2854 (v2i16 (V_AND_B32_e64 (i32 (V_MOV_B32_e32 (i32 0xffff))), VGPR_32:$src1)) 2855>; 2856 2857def : GCNPat < 2858 (v2f16 (UniformBinFrag<build_vector> (f16 SReg_32:$src1), (f16 FP_ZERO))), 2859 (S_AND_B32 (S_MOV_B32 (i32 0xffff)), SReg_32:$src1) 2860>; 2861 2862def : GCNPat < 2863 (v2f16 (DivergentBinFrag<build_vector> (f16 VGPR_32:$src1), (f16 FP_ZERO))), 2864 (v2f16 (V_AND_B32_e64 (i32 (V_MOV_B32_e32 (i32 0xffff))), VGPR_32:$src1)) 2865>; 2866 2867def : GCNPat < 2868 (v2i16 (UniformBinFrag<build_vector> (i16 SReg_32:$src0), (i16 undef))), 2869 (COPY_TO_REGCLASS SReg_32:$src0, SReg_32) 2870>; 2871 2872def : GCNPat < 2873 (v2i16 (DivergentBinFrag<build_vector> (i16 VGPR_32:$src0), (i16 undef))), 2874 (COPY_TO_REGCLASS VGPR_32:$src0, VGPR_32) 2875>; 2876 2877def : GCNPat < 2878 (v2f16 (build_vector f16:$src0, (f16 undef))), 2879 (COPY $src0) 2880>; 2881 2882def : GCNPat < 2883 (v2i16 (UniformBinFrag<build_vector> (i16 undef), (i16 SReg_32:$src1))), 2884 (S_LSHL_B32 SReg_32:$src1, (i32 16)) 2885>; 2886 2887def : GCNPat < 2888 (v2i16 (DivergentBinFrag<build_vector> (i16 undef), (i16 VGPR_32:$src1))), 2889 (v2i16 (V_LSHLREV_B32_e64 (i32 16), VGPR_32:$src1)) 2890>; 2891 2892 2893def : GCNPat < 2894 (v2f16 (UniformBinFrag<build_vector> (f16 undef), (f16 SReg_32:$src1))), 2895 (S_LSHL_B32 SReg_32:$src1, (i32 16)) 2896>; 2897 2898def : GCNPat < 2899 (v2f16 (DivergentBinFrag<build_vector> (f16 undef), (f16 VGPR_32:$src1))), 2900 (v2f16 (V_LSHLREV_B32_e64 (i32 16), VGPR_32:$src1)) 2901>; 2902} 2903 2904let SubtargetPredicate = HasVOP3PInsts in { 2905def : GCNPat < 2906 (v2i16 (UniformBinFrag<build_vector> (i16 SReg_32:$src0), (i16 SReg_32:$src1))), 2907 (S_PACK_LL_B32_B16 SReg_32:$src0, SReg_32:$src1) 2908>; 2909 2910def : GCNPat < 2911 (v2i16 (DivergentBinFrag<build_vector> (i16 VGPR_32:$src0), (i16 VGPR_32:$src1))), 2912 (v2i16 (V_LSHL_OR_B32_e64 $src1, (i32 16), (i32 (V_AND_B32_e64 (i32 (V_MOV_B32_e32 (i32 0xffff))), $src0)))) 2913>; 2914 2915// With multiple uses of the shift, this will duplicate the shift and 2916// increase register pressure. 
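// S_PACK_LH_B32_B16 takes the low 16 bits of $src0 and the high 16 bits of
// $src1, so the explicit srl by 16 folds into the pack; the _oneuse check
// avoids duplicating a shift that has other users.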
2917def : GCNPat < 2918 (v2i16 (UniformBinFrag<build_vector> (i16 SReg_32:$src0), (i16 (trunc (srl_oneuse SReg_32:$src1, (i32 16)))))), 2919 (v2i16 (S_PACK_LH_B32_B16 SReg_32:$src0, SReg_32:$src1)) 2920>; 2921 2922def : GCNPat < 2923 (v2i16 (UniformBinFrag<build_vector> (i16 (trunc (srl_oneuse SReg_32:$src0, (i32 16)))), 2924 (i16 (trunc (srl_oneuse SReg_32:$src1, (i32 16)))))), 2925 (S_PACK_HH_B32_B16 SReg_32:$src0, SReg_32:$src1) 2926>; 2927 2928def : GCNPat < 2929 (v2f16 (UniformBinFrag<build_vector> (f16 SReg_32:$src0), (f16 SReg_32:$src1))), 2930 (S_PACK_LL_B32_B16 SReg_32:$src0, SReg_32:$src1) 2931>; 2932 2933 2934 2935foreach Ty = [i16, f16] in { 2936 2937defvar vecTy = !if(!eq(Ty, i16), v2i16, v2f16); 2938defvar immzeroTy = !if(!eq(Ty, i16), immzero, fpimmzero); 2939 2940// Take the lower 16 bits from each VGPR_32 and concat them 2941def : GCNPat < 2942 (vecTy (DivergentBinFrag<build_vector> (Ty VGPR_32:$a), (Ty VGPR_32:$b))), 2943 (V_PERM_B32_e64 VGPR_32:$b, VGPR_32:$a, (S_MOV_B32 (i32 0x05040100))) 2944>; 2945 2946 2947// Take the lower 16 bits from V[0] and the upper 16 bits from V[1] 2948// Special case, can use V_BFI (0xffff literal likely more reusable than 0x70601000) 2949def : GCNPat < 2950 (vecTy (DivergentBinFrag<build_vector> (Ty (immzeroTy)), 2951 (Ty !if(!eq(Ty, i16), 2952 (Ty (trunc (srl VGPR_32:$b, (i32 16)))), 2953 (Ty (bitconvert (i16 (trunc (srl VGPR_32:$b, (i32 16)))))))))), 2954 (V_AND_B32_e64 (S_MOV_B32 (i32 0xffff0000)), VGPR_32:$b) 2955>; 2956 2957 2958// Take the lower 16 bits from V[0] and the upper 16 bits from V[1] 2959// Special case, can use V_BFI (0xffff literal likely more reusable than 0x70601000) 2960def : GCNPat < 2961 (vecTy (DivergentBinFrag<build_vector> (Ty VGPR_32:$a), 2962 (Ty !if(!eq(Ty, i16), 2963 (Ty (trunc (srl VGPR_32:$b, (i32 16)))), 2964 (Ty (bitconvert (i16 (trunc (srl VGPR_32:$b, (i32 16)))))))))), 2965 (V_BFI_B32_e64 (S_MOV_B32 (i32 0x0000ffff)), VGPR_32:$a, VGPR_32:$b) 2966>; 2967 2968 2969// Take the upper 16 bits from V[0] and the lower 16 bits from V[1] 2970// Special case, can use V_ALIGNBIT (always uses encoded literal) 2971def : GCNPat < 2972 (vecTy (DivergentBinFrag<build_vector> 2973 (Ty !if(!eq(Ty, i16), 2974 (Ty (trunc (srl VGPR_32:$a, (i32 16)))), 2975 (Ty (bitconvert (i16 (trunc (srl VGPR_32:$a, (i32 16)))))))), 2976 (Ty VGPR_32:$b))), 2977 (V_ALIGNBIT_B32_e64 VGPR_32:$b, VGPR_32:$a, (i32 16)) 2978>; 2979 2980// Take the upper 16 bits from each VGPR_32 and concat them 2981def : GCNPat < 2982 (vecTy (DivergentBinFrag<build_vector> 2983 (Ty !if(!eq(Ty, i16), 2984 (Ty (trunc (srl VGPR_32:$a, (i32 16)))), 2985 (Ty (bitconvert (i16 (trunc (srl VGPR_32:$a, (i32 16)))))))), 2986 (Ty !if(!eq(Ty, i16), 2987 (Ty (trunc (srl VGPR_32:$b, (i32 16)))), 2988 (Ty (bitconvert (i16 (trunc (srl VGPR_32:$b, (i32 16)))))))))), 2989 (V_PERM_B32_e64 VGPR_32:$b, VGPR_32:$a, (S_MOV_B32 (i32 0x07060302))) 2990>; 2991 2992 2993} // end foreach Ty 2994 2995 2996let AddedComplexity = 5 in { 2997def : GCNPat < 2998 (v2f16 (is_canonicalized<build_vector> (f16 (VOP3Mods (f16 VGPR_32:$src0), i32:$src0_mods)), 2999 (f16 (VOP3Mods (f16 VGPR_32:$src1), i32:$src1_mods)))), 3000 (V_PACK_B32_F16_e64 $src0_mods, VGPR_32:$src0, $src1_mods, VGPR_32:$src1) 3001>; 3002} 3003} // End SubtargetPredicate = HasVOP3PInsts 3004 3005// With multiple uses of the shift, this will duplicate the shift and 3006// increase register pressure. 
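// S_PACK_HL_B32_B16 (gfx11+) is the converse case: the high 16 bits of
// $src0 packed with the low 16 bits of $src1.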
3007let SubtargetPredicate = isGFX11Plus in 3008def : GCNPat < 3009 (v2i16 (build_vector (i16 (trunc (srl_oneuse SReg_32:$src0, (i32 16)))), (i16 SReg_32:$src1))), 3010 (v2i16 (S_PACK_HL_B32_B16 SReg_32:$src0, SReg_32:$src1)) 3011>; 3012 3013 3014def : GCNPat < 3015 (v2f16 (scalar_to_vector f16:$src0)), 3016 (COPY $src0) 3017>; 3018 3019def : GCNPat < 3020 (v2i16 (scalar_to_vector i16:$src0)), 3021 (COPY $src0) 3022>; 3023 3024def : GCNPat < 3025 (v4i16 (scalar_to_vector i16:$src0)), 3026 (INSERT_SUBREG (IMPLICIT_DEF), $src0, sub0) 3027>; 3028 3029def : GCNPat < 3030 (v4f16 (scalar_to_vector f16:$src0)), 3031 (INSERT_SUBREG (IMPLICIT_DEF), $src0, sub0) 3032>; 3033 3034def : GCNPat < 3035 (i64 (int_amdgcn_mov_dpp i64:$src, timm:$dpp_ctrl, timm:$row_mask, 3036 timm:$bank_mask, timm:$bound_ctrl)), 3037 (V_MOV_B64_DPP_PSEUDO VReg_64_Align2:$src, VReg_64_Align2:$src, 3038 (as_i32timm $dpp_ctrl), (as_i32timm $row_mask), 3039 (as_i32timm $bank_mask), 3040 (as_i1timm $bound_ctrl)) 3041>; 3042 3043def : GCNPat < 3044 (i64 (int_amdgcn_update_dpp i64:$old, i64:$src, timm:$dpp_ctrl, timm:$row_mask, 3045 timm:$bank_mask, timm:$bound_ctrl)), 3046 (V_MOV_B64_DPP_PSEUDO VReg_64_Align2:$old, VReg_64_Align2:$src, (as_i32timm $dpp_ctrl), 3047 (as_i32timm $row_mask), (as_i32timm $bank_mask), 3048 (as_i1timm $bound_ctrl)) 3049>; 3050 3051//===----------------------------------------------------------------------===// 3052// Fract Patterns 3053//===----------------------------------------------------------------------===// 3054 3055let SubtargetPredicate = isGFX6 in { 3056 3057// V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) is 3058// used instead. However, SI doesn't have V_FLOOR_F64, so the most efficient 3059// way to implement it is using V_FRACT_F64. 3060// The workaround for the V_FRACT bug is: 3061// fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999) 3062 3063// Convert floor(x) to (x - fract(x)) 3064 3065// Don't bother handling this for GlobalISel, it's handled during 3066// lowering. 3067// 3068// FIXME: DAG should also custom lower this. 3069def : GCNPat < 3070 (f64 (ffloor (f64 (VOP3Mods f64:$x, i32:$mods)))), 3071 (V_ADD_F64_e64 3072 $mods, 3073 $x, 3074 SRCMODS.NEG, 3075 (V_CNDMASK_B64_PSEUDO 3076 (V_MIN_F64_e64 3077 SRCMODS.NONE, 3078 (V_FRACT_F64_e64 $mods, $x), 3079 SRCMODS.NONE, 3080 (V_MOV_B64_PSEUDO 0x3fefffffffffffff)), 3081 $x, 3082 (V_CMP_CLASS_F64_e64 SRCMODS.NONE, $x, (i32 3 /*NaN*/)))) 3083>; 3084 3085} // End SubtargetPredicates = isGFX6 3086 3087//============================================================================// 3088// Miscellaneous Optimization Patterns 3089//============================================================================// 3090 3091// Undo sub x, c -> add x, -c canonicalization since c is more likely 3092// an inline immediate than -c. 3093// TODO: Also do for 64-bit. 3094def : GCNPat< 3095 (UniformBinFrag<add> i32:$src0, (i32 NegSubInlineConst32:$src1)), 3096 (S_SUB_I32 SReg_32:$src0, NegSubInlineConst32:$src1) 3097>; 3098 3099def : GCNPat< 3100 (DivergentBinFrag<add> i32:$src0, (i32 NegSubInlineConst32:$src1)), 3101 (V_SUB_U32_e64 VS_32:$src0, NegSubInlineConst32:$src1)> { 3102 let SubtargetPredicate = HasAddNoCarryInsts; 3103} 3104 3105def : GCNPat< 3106 (DivergentBinFrag<add> i32:$src0, (i32 NegSubInlineConst32:$src1)), 3107 (V_SUB_CO_U32_e64 VS_32:$src0, NegSubInlineConst32:$src1)> { 3108 let SubtargetPredicate = NotHasAddNoCarryInsts; 3109} 3110 3111 3112// Avoid pointlessly materializing a constant in VGPR. 
3113// FIXME: Should also do this for readlane, but tablegen crashes on 3114// the ignored src1. 3115def : GCNPat< 3116 (int_amdgcn_readfirstlane (i32 imm:$src)), 3117 (S_MOV_B32 SReg_32:$src) 3118>; 3119 3120multiclass BFMPatterns <ValueType vt, PatFrag SHL, PatFrag ADD, InstSI BFM> { 3121 def : GCNPat < 3122 (vt (SHL (vt (add (vt (shl 1, vt:$a)), -1)), vt:$b)), 3123 (BFM $a, $b) 3124 >; 3125 3126 def : GCNPat < 3127 (vt (ADD (vt (shl 1, vt:$a)), -1)), 3128 (BFM $a, (i32 0)) 3129 >; 3130} 3131 3132defm : BFMPatterns <i32, UniformBinFrag<shl>, UniformBinFrag<add>, S_BFM_B32>; 3133// FIXME: defm : BFMPatterns <i64, UniformBinFrag<shl>, UniformBinFrag<add>, S_BFM_B64>; 3134defm : BFMPatterns <i32, DivergentBinFrag<shl>, DivergentBinFrag<add>, V_BFM_B32_e64>; 3135 3136// Bitfield extract patterns 3137 3138def IMMZeroBasedBitfieldMask : ImmLeaf <i32, [{ 3139 return isMask_32(Imm); 3140}]>; 3141 3142def IMMPopCount : SDNodeXForm<imm, [{ 3143 return CurDAG->getTargetConstant(llvm::popcount(N->getZExtValue()), SDLoc(N), 3144 MVT::i32); 3145}]>; 3146 3147def : AMDGPUPat < 3148 (DivergentBinFrag<and> (i32 (srl i32:$src, i32:$rshift)), 3149 IMMZeroBasedBitfieldMask:$mask), 3150 (V_BFE_U32_e64 $src, $rshift, (i32 (IMMPopCount $mask))) 3151>; 3152 3153// x & ((1 << y) - 1) 3154def : AMDGPUPat < 3155 (DivergentBinFrag<and> i32:$src, (add_oneuse (shl_oneuse 1, i32:$width), -1)), 3156 (V_BFE_U32_e64 $src, (i32 0), $width) 3157>; 3158 3159// x & ~(-1 << y) 3160def : AMDGPUPat < 3161 (DivergentBinFrag<and> i32:$src, 3162 (xor_oneuse (shl_oneuse -1, i32:$width), -1)), 3163 (V_BFE_U32_e64 $src, (i32 0), $width) 3164>; 3165 3166// x & (-1 >> (bitwidth - y)) 3167def : AMDGPUPat < 3168 (DivergentBinFrag<and> i32:$src, (srl_oneuse -1, (sub 32, i32:$width))), 3169 (V_BFE_U32_e64 $src, (i32 0), $width) 3170>; 3171 3172// x << (bitwidth - y) >> (bitwidth - y) 3173def : AMDGPUPat < 3174 (DivergentBinFrag<srl> (shl_oneuse i32:$src, (sub 32, i32:$width)), 3175 (sub 32, i32:$width)), 3176 (V_BFE_U32_e64 $src, (i32 0), $width) 3177>; 3178 3179def : AMDGPUPat < 3180 (DivergentBinFrag<sra> (shl_oneuse i32:$src, (sub 32, i32:$width)), 3181 (sub 32, i32:$width)), 3182 (V_BFE_I32_e64 $src, (i32 0), $width) 3183>; 3184 3185// SHA-256 Ma patterns 3186 3187// ((x & z) | (y & (x | z))) -> BFI (XOR x, y), z, y 3188def : AMDGPUPat < 3189 (DivergentBinFrag<or> (and i32:$x, i32:$z), 3190 (and i32:$y, (or i32:$x, i32:$z))), 3191 (V_BFI_B32_e64 (V_XOR_B32_e64 VSrc_b32:$x, VSrc_b32:$y), VSrc_b32:$z, VSrc_b32:$y) 3192>; 3193 3194def : AMDGPUPat < 3195 (DivergentBinFrag<or> (and i64:$x, i64:$z), 3196 (and i64:$y, (or i64:$x, i64:$z))), 3197 (REG_SEQUENCE VReg_64, 3198 (V_BFI_B32_e64 (V_XOR_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$x, sub0)), 3199 (i32 (EXTRACT_SUBREG VReg_64:$y, sub0))), 3200 (i32 (EXTRACT_SUBREG VReg_64:$z, sub0)), 3201 (i32 (EXTRACT_SUBREG VReg_64:$y, sub0))), sub0, 3202 (V_BFI_B32_e64 (V_XOR_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$x, sub1)), 3203 (i32 (EXTRACT_SUBREG VReg_64:$y, sub1))), 3204 (i32 (EXTRACT_SUBREG VReg_64:$z, sub1)), 3205 (i32 (EXTRACT_SUBREG VReg_64:$y, sub1))), sub1) 3206>; 3207 3208multiclass IntMed3Pat<Instruction med3Inst, 3209 SDPatternOperator min, 3210 SDPatternOperator max> { 3211 3212 // This matches 16 permutations of 3213 // min(max(a, b), max(min(a, b), c)) 3214 def : AMDGPUPat < 3215 (min (max i32:$src0, i32:$src1), 3216 (max (min i32:$src0, i32:$src1), i32:$src2)), 3217 (med3Inst VSrc_b32:$src0, VSrc_b32:$src1, VSrc_b32:$src2) 3218>; 3219 3220 // This matches 16 permutations of 3221 // 
max(min(x, y), min(max(x, y), z)) 3222 def : AMDGPUPat < 3223 (max (min i32:$src0, i32:$src1), 3224 (min (max i32:$src0, i32:$src1), i32:$src2)), 3225 (med3Inst VSrc_b32:$src0, VSrc_b32:$src1, VSrc_b32:$src2) 3226>; 3227} 3228 3229defm : IntMed3Pat<V_MED3_I32_e64, smin, smax>; 3230defm : IntMed3Pat<V_MED3_U32_e64, umin, umax>; 3231 3232multiclass FPMed3Pat<ValueType vt, 3233 Instruction med3Inst> { 3234 // This matches 16 permutations of max(min(x, y), min(max(x, y), z)) 3235 def : GCNPat< 3236 (fmaxnum_like_nnan 3237 (fminnum_like (VOP3Mods vt:$src0, i32:$src0_mods), 3238 (VOP3Mods vt:$src1, i32:$src1_mods)), 3239 (fminnum_like (fmaxnum_like (VOP3Mods vt:$src0, i32:$src0_mods), 3240 (VOP3Mods vt:$src1, i32:$src1_mods)), 3241 (vt (VOP3Mods vt:$src2, i32:$src2_mods)))), 3242 (med3Inst $src0_mods, $src0, $src1_mods, $src1, $src2_mods, $src2, 3243 DSTCLAMP.NONE, DSTOMOD.NONE)>; 3244 3245 3246 // This matches 16 permutations of min(max(x, y), max(min(x, y), z)) 3247 def : GCNPat< 3248 (fminnum_like_nnan 3249 (fmaxnum_like (VOP3Mods vt:$src0, i32:$src0_mods), 3250 (VOP3Mods vt:$src1, i32:$src1_mods)), 3251 (fmaxnum_like (fminnum_like (VOP3Mods vt:$src0, i32:$src0_mods), 3252 (VOP3Mods vt:$src1, i32:$src1_mods)), 3253 (vt (VOP3Mods vt:$src2, i32:$src2_mods)))), 3254 (med3Inst $src0_mods, $src0, $src1_mods, $src1, $src2_mods, $src2, 3255 DSTCLAMP.NONE, DSTOMOD.NONE)>; 3256} 3257 3258class FP16Med3Pat<ValueType vt, 3259 Instruction med3Inst> : GCNPat< 3260 (fmaxnum_like_nnan (fminnum_like (VOP3Mods vt:$src0, i32:$src0_mods), 3261 (VOP3Mods vt:$src1, i32:$src1_mods)), 3262 (fminnum_like (fmaxnum_like (VOP3Mods vt:$src0, i32:$src0_mods), 3263 (VOP3Mods vt:$src1, i32:$src1_mods)), 3264 (vt (VOP3Mods vt:$src2, i32:$src2_mods)))), 3265 (med3Inst $src0_mods, $src0, $src1_mods, $src1, $src2_mods, $src2, DSTCLAMP.NONE) 3266>; 3267 3268multiclass Int16Med3Pat<Instruction med3Inst, 3269 SDPatternOperator min, 3270 SDPatternOperator max> { 3271 // This matches 16 permutations of 3272 // max(min(x, y), min(max(x, y), z)) 3273 def : GCNPat < 3274 (max (min i16:$src0, i16:$src1), 3275 (min (max i16:$src0, i16:$src1), i16:$src2)), 3276 (med3Inst SRCMODS.NONE, VSrc_b16:$src0, SRCMODS.NONE, VSrc_b16:$src1, SRCMODS.NONE, VSrc_b16:$src2, DSTCLAMP.NONE) 3277>; 3278 3279 // This matches 16 permutations of 3280 // min(max(a, b), max(min(a, b), c)) 3281 def : GCNPat < 3282 (min (max i16:$src0, i16:$src1), 3283 (max (min i16:$src0, i16:$src1), i16:$src2)), 3284 (med3Inst SRCMODS.NONE, VSrc_b16:$src0, SRCMODS.NONE, VSrc_b16:$src1, SRCMODS.NONE, VSrc_b16:$src2, DSTCLAMP.NONE) 3285>; 3286} 3287 3288defm : FPMed3Pat<f32, V_MED3_F32_e64>; 3289 3290class 3291IntMinMaxPat<Instruction minmaxInst, SDPatternOperator min_or_max, 3292 SDPatternOperator max_or_min_oneuse> : AMDGPUPat < 3293 (DivergentBinFrag<min_or_max> (max_or_min_oneuse i32:$src0, i32:$src1), 3294 i32:$src2), 3295 (minmaxInst VSrc_b32:$src0, VSrc_b32:$src1, VSrc_b32:$src2) 3296>; 3297 3298class 3299FPMinMaxPat<Instruction minmaxInst, ValueType vt, SDPatternOperator min_or_max, 3300 SDPatternOperator max_or_min_oneuse> : GCNPat < 3301 (min_or_max (max_or_min_oneuse (VOP3Mods vt:$src0, i32:$src0_mods), 3302 (VOP3Mods vt:$src1, i32:$src1_mods)), 3303 (vt (VOP3Mods vt:$src2, i32:$src2_mods))), 3304 (minmaxInst $src0_mods, $src0, $src1_mods, $src1, $src2_mods, $src2, 3305 DSTCLAMP.NONE, DSTOMOD.NONE) 3306>; 3307 3308let OtherPredicates = [isGFX11Plus] in { 3309def : IntMinMaxPat<V_MAXMIN_I32_e64, smin, smax_oneuse>; 3310def : IntMinMaxPat<V_MINMAX_I32_e64, smax, 
smin_oneuse>; 3311def : IntMinMaxPat<V_MAXMIN_U32_e64, umin, umax_oneuse>; 3312def : IntMinMaxPat<V_MINMAX_U32_e64, umax, umin_oneuse>; 3313def : FPMinMaxPat<V_MINMAX_F32_e64, f32, fmaxnum_like, fminnum_like_oneuse>; 3314def : FPMinMaxPat<V_MAXMIN_F32_e64, f32, fminnum_like, fmaxnum_like_oneuse>; 3315def : FPMinMaxPat<V_MINMAX_F16_e64, f16, fmaxnum_like, fminnum_like_oneuse>; 3316def : FPMinMaxPat<V_MAXMIN_F16_e64, f16, fminnum_like, fmaxnum_like_oneuse>; 3317} 3318 3319let OtherPredicates = [isGFX9Plus] in { 3320def : FP16Med3Pat<f16, V_MED3_F16_e64>; 3321defm : Int16Med3Pat<V_MED3_I16_e64, smin, smax>; 3322defm : Int16Med3Pat<V_MED3_U16_e64, umin, umax>; 3323} // End Predicates = [isGFX9Plus] 3324 3325class AMDGPUGenericInstruction : GenericInstruction { 3326 let Namespace = "AMDGPU"; 3327} 3328 3329// Convert a wave address to a swizzled vector address (i.e. this is 3330// for copying the stack pointer to a vector address appropriate to 3331// use in the offset field of mubuf instructions). 3332def G_AMDGPU_WAVE_ADDRESS : AMDGPUGenericInstruction { 3333 let OutOperandList = (outs type0:$dst); 3334 let InOperandList = (ins type0:$src); 3335 let hasSideEffects = 0; 3336} 3337 3338// Returns -1 if the input is zero. 3339def G_AMDGPU_FFBH_U32 : AMDGPUGenericInstruction { 3340 let OutOperandList = (outs type0:$dst); 3341 let InOperandList = (ins type1:$src); 3342 let hasSideEffects = 0; 3343} 3344 3345// Returns -1 if the input is zero. 3346def G_AMDGPU_FFBL_B32 : AMDGPUGenericInstruction { 3347 let OutOperandList = (outs type0:$dst); 3348 let InOperandList = (ins type1:$src); 3349 let hasSideEffects = 0; 3350} 3351 3352def G_AMDGPU_RCP_IFLAG : AMDGPUGenericInstruction { 3353 let OutOperandList = (outs type0:$dst); 3354 let InOperandList = (ins type1:$src); 3355 let hasSideEffects = 0; 3356} 3357 3358class BufferLoadGenericInstruction : AMDGPUGenericInstruction { 3359 let OutOperandList = (outs type0:$dst); 3360 let InOperandList = (ins type1:$rsrc, type2:$vindex, type2:$voffset, 3361 type2:$soffset, untyped_imm_0:$offset, 3362 untyped_imm_0:$cachepolicy, untyped_imm_0:$idxen); 3363 let hasSideEffects = 0; 3364 let mayLoad = 1; 3365} 3366 3367class TBufferLoadGenericInstruction : AMDGPUGenericInstruction { 3368 let OutOperandList = (outs type0:$dst); 3369 let InOperandList = (ins type1:$rsrc, type2:$vindex, type2:$voffset, 3370 type2:$soffset, untyped_imm_0:$offset, untyped_imm_0:$format, 3371 untyped_imm_0:$cachepolicy, untyped_imm_0:$idxen); 3372 let hasSideEffects = 0; 3373 let mayLoad = 1; 3374} 3375 3376def G_AMDGPU_BUFFER_LOAD_UBYTE : BufferLoadGenericInstruction; 3377def G_AMDGPU_BUFFER_LOAD_SBYTE : BufferLoadGenericInstruction; 3378def G_AMDGPU_BUFFER_LOAD_USHORT : BufferLoadGenericInstruction; 3379def G_AMDGPU_BUFFER_LOAD_SSHORT : BufferLoadGenericInstruction; 3380def G_AMDGPU_BUFFER_LOAD : BufferLoadGenericInstruction; 3381def G_AMDGPU_BUFFER_LOAD_FORMAT : BufferLoadGenericInstruction; 3382def G_AMDGPU_BUFFER_LOAD_FORMAT_TFE : BufferLoadGenericInstruction; 3383def G_AMDGPU_BUFFER_LOAD_FORMAT_D16 : BufferLoadGenericInstruction; 3384def G_AMDGPU_TBUFFER_LOAD_FORMAT : TBufferLoadGenericInstruction; 3385def G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 : TBufferLoadGenericInstruction; 3386 3387class BufferStoreGenericInstruction : AMDGPUGenericInstruction { 3388 let OutOperandList = (outs); 3389 let InOperandList = (ins type0:$vdata, type1:$rsrc, type2:$vindex, type2:$voffset, 3390 type2:$soffset, untyped_imm_0:$offset, 3391 untyped_imm_0:$cachepolicy, untyped_imm_0:$idxen); 3392 let hasSideEffects 
class BufferStoreGenericInstruction : AMDGPUGenericInstruction {
  let OutOperandList = (outs);
  let InOperandList = (ins type0:$vdata, type1:$rsrc, type2:$vindex, type2:$voffset,
                           type2:$soffset, untyped_imm_0:$offset,
                           untyped_imm_0:$cachepolicy, untyped_imm_0:$idxen);
  let hasSideEffects = 0;
  let mayStore = 1;
}

class TBufferStoreGenericInstruction : AMDGPUGenericInstruction {
  let OutOperandList = (outs);
  let InOperandList = (ins type0:$vdata, type1:$rsrc, type2:$vindex, type2:$voffset,
                           type2:$soffset, untyped_imm_0:$offset,
                           untyped_imm_0:$format,
                           untyped_imm_0:$cachepolicy, untyped_imm_0:$idxen);
  let hasSideEffects = 0;
  let mayStore = 1;
}

def G_AMDGPU_BUFFER_STORE : BufferStoreGenericInstruction;
def G_AMDGPU_BUFFER_STORE_BYTE : BufferStoreGenericInstruction;
def G_AMDGPU_BUFFER_STORE_SHORT : BufferStoreGenericInstruction;
def G_AMDGPU_BUFFER_STORE_FORMAT : BufferStoreGenericInstruction;
def G_AMDGPU_BUFFER_STORE_FORMAT_D16 : BufferStoreGenericInstruction;
def G_AMDGPU_TBUFFER_STORE_FORMAT : TBufferStoreGenericInstruction;
def G_AMDGPU_TBUFFER_STORE_FORMAT_D16 : TBufferStoreGenericInstruction;

def G_AMDGPU_FMIN_LEGACY : AMDGPUGenericInstruction {
  let OutOperandList = (outs type0:$dst);
  let InOperandList = (ins type0:$src0, type0:$src1);
  let hasSideEffects = 0;
}

def G_AMDGPU_FMAX_LEGACY : AMDGPUGenericInstruction {
  let OutOperandList = (outs type0:$dst);
  let InOperandList = (ins type0:$src0, type0:$src1);
  let hasSideEffects = 0;
}

foreach N = 0-3 in {
def G_AMDGPU_CVT_F32_UBYTE#N : AMDGPUGenericInstruction {
  let OutOperandList = (outs type0:$dst);
  let InOperandList = (ins type0:$src0);
  let hasSideEffects = 0;
}
}

def G_AMDGPU_CVT_PK_I16_I32 : AMDGPUGenericInstruction {
  let OutOperandList = (outs type0:$dst);
  let InOperandList = (ins type0:$src0, type0:$src1);
  let hasSideEffects = 0;
}

def G_AMDGPU_SMED3 : AMDGPUGenericInstruction {
  let OutOperandList = (outs type0:$dst);
  let InOperandList = (ins type0:$src0, type0:$src1, type0:$src2);
  let hasSideEffects = 0;
}

def G_AMDGPU_UMED3 : AMDGPUGenericInstruction {
  let OutOperandList = (outs type0:$dst);
  let InOperandList = (ins type0:$src0, type0:$src1, type0:$src2);
  let hasSideEffects = 0;
}

def G_AMDGPU_FMED3 : AMDGPUGenericInstruction {
  let OutOperandList = (outs type0:$dst);
  let InOperandList = (ins type0:$src0, type0:$src1, type0:$src2);
  let hasSideEffects = 0;
}

def G_AMDGPU_CLAMP : AMDGPUGenericInstruction {
  let OutOperandList = (outs type0:$dst);
  let InOperandList = (ins type0:$src);
  let hasSideEffects = 0;
}

// Integer multiply-add: arg0 * arg1 + arg2.
//
// arg0 and arg1 are 32-bit integers (interpreted as signed or unsigned),
// arg2 is a 64-bit integer. Result is a 64-bit integer and a 1-bit carry-out.
class G_AMDGPU_MAD_64_32 : AMDGPUGenericInstruction {
  let OutOperandList = (outs type0:$dst, type1:$carry_out);
  let InOperandList = (ins type2:$arg0, type2:$arg1, type0:$arg2);
  let hasSideEffects = 0;
}

def G_AMDGPU_MAD_U64_U32 : G_AMDGPU_MAD_64_32;
def G_AMDGPU_MAD_I64_I32 : G_AMDGPU_MAD_64_32;
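// Illustrative example (not taken from the source): with arg0 = 0xFFFFFFFF,
// arg1 = 2 and arg2 = 1, G_AMDGPU_MAD_U64_U32 computes
//   0xFFFFFFFF * 2 + 1 = 0x1FFFFFFFF,
// which still fits in the 64-bit result, so the carry-out bit is 0.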
// Atomic cmpxchg. $cmpval and $newval are packed in a single vector
// operand. Expects a MachineMemOperand in addition to explicit
// operands.
def G_AMDGPU_ATOMIC_CMPXCHG : AMDGPUGenericInstruction {
  let OutOperandList = (outs type0:$oldval);
  let InOperandList = (ins ptype1:$addr, type0:$cmpval_newval);
  let hasSideEffects = 0;
  let mayLoad = 1;
  let mayStore = 1;
}

let Namespace = "AMDGPU" in {
def G_AMDGPU_ATOMIC_INC : G_ATOMICRMW_OP;
def G_AMDGPU_ATOMIC_DEC : G_ATOMICRMW_OP;
def G_AMDGPU_ATOMIC_FMIN : G_ATOMICRMW_OP;
def G_AMDGPU_ATOMIC_FMAX : G_ATOMICRMW_OP;
}

class BufferAtomicGenericInstruction<bit NoRtn = 0> : AMDGPUGenericInstruction {
  let OutOperandList = !if(NoRtn, (outs), (outs type0:$dst));
  let InOperandList = (ins type0:$vdata, type1:$rsrc, type2:$vindex, type2:$voffset,
                           type2:$soffset, untyped_imm_0:$offset,
                           untyped_imm_0:$cachepolicy, untyped_imm_0:$idxen);
  let hasSideEffects = 0;
  let mayLoad = 1;
  let mayStore = 1;
}

def G_AMDGPU_BUFFER_ATOMIC_SWAP : BufferAtomicGenericInstruction;
def G_AMDGPU_BUFFER_ATOMIC_ADD : BufferAtomicGenericInstruction;
def G_AMDGPU_BUFFER_ATOMIC_SUB : BufferAtomicGenericInstruction;
def G_AMDGPU_BUFFER_ATOMIC_SMIN : BufferAtomicGenericInstruction;
def G_AMDGPU_BUFFER_ATOMIC_UMIN : BufferAtomicGenericInstruction;
def G_AMDGPU_BUFFER_ATOMIC_SMAX : BufferAtomicGenericInstruction;
def G_AMDGPU_BUFFER_ATOMIC_UMAX : BufferAtomicGenericInstruction;
def G_AMDGPU_BUFFER_ATOMIC_AND : BufferAtomicGenericInstruction;
def G_AMDGPU_BUFFER_ATOMIC_OR : BufferAtomicGenericInstruction;
def G_AMDGPU_BUFFER_ATOMIC_XOR : BufferAtomicGenericInstruction;
def G_AMDGPU_BUFFER_ATOMIC_INC : BufferAtomicGenericInstruction;
def G_AMDGPU_BUFFER_ATOMIC_DEC : BufferAtomicGenericInstruction;
def G_AMDGPU_BUFFER_ATOMIC_FADD : BufferAtomicGenericInstruction;
def G_AMDGPU_BUFFER_ATOMIC_FMIN : BufferAtomicGenericInstruction;
def G_AMDGPU_BUFFER_ATOMIC_FMAX : BufferAtomicGenericInstruction;

def G_AMDGPU_BUFFER_ATOMIC_CMPSWAP : AMDGPUGenericInstruction {
  let OutOperandList = (outs type0:$dst);
  let InOperandList = (ins type0:$vdata, type0:$cmp, type1:$rsrc, type2:$vindex,
                           type2:$voffset, type2:$soffset, untyped_imm_0:$offset,
                           untyped_imm_0:$cachepolicy, untyped_imm_0:$idxen);
  let hasSideEffects = 0;
  let mayLoad = 1;
  let mayStore = 1;
}
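// Note that, unlike G_AMDGPU_ATOMIC_CMPXCHG above, the compare value and the
// new value are passed here as two separate operands ($cmp and $vdata) rather
// than packed into a single vector operand.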
// Wrapper around llvm.amdgcn.s.buffer.load. This is mostly needed as a
// workaround for the intrinsic being defined as readnone even though it
// really needs a memory operand.
def G_AMDGPU_S_BUFFER_LOAD : AMDGPUGenericInstruction {
  let OutOperandList = (outs type0:$dst);
  let InOperandList = (ins type1:$rsrc, type2:$offset, untyped_imm_0:$cachepolicy);
  let hasSideEffects = 0;
  let mayLoad = 1;
  let mayStore = 0;
}

// This is equivalent to the G_INTRINSIC*, but the operands may have
// been legalized depending on the subtarget requirements.
def G_AMDGPU_INTRIN_IMAGE_LOAD : AMDGPUGenericInstruction {
  let OutOperandList = (outs type0:$dst);
  let InOperandList = (ins unknown:$intrin, variable_ops);
  let hasSideEffects = 0;
  let mayLoad = 1;

  // FIXME: Use separate opcode for atomics.
  let mayStore = 1;
}

def G_AMDGPU_INTRIN_IMAGE_LOAD_D16 : AMDGPUGenericInstruction {
  let OutOperandList = (outs type0:$dst);
  let InOperandList = (ins unknown:$intrin, variable_ops);
  let hasSideEffects = 0;
  let mayLoad = 1;

  // FIXME: Use separate opcode for atomics.
  let mayStore = 1;
}

// This is equivalent to the G_INTRINSIC*, but the operands may have
// been legalized depending on the subtarget requirements.
def G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPUGenericInstruction {
  let OutOperandList = (outs);
  let InOperandList = (ins unknown:$intrin, variable_ops);
  let hasSideEffects = 0;
  let mayStore = 1;
}

def G_AMDGPU_INTRIN_IMAGE_STORE_D16 : AMDGPUGenericInstruction {
  let OutOperandList = (outs);
  let InOperandList = (ins unknown:$intrin, variable_ops);
  let hasSideEffects = 0;
  let mayStore = 1;
}

def G_AMDGPU_INTRIN_BVH_INTERSECT_RAY : AMDGPUGenericInstruction {
  let OutOperandList = (outs type0:$dst);
  let InOperandList = (ins unknown:$intrin, variable_ops);
  let hasSideEffects = 0;
  let mayLoad = 1;
  let mayStore = 0;
}

// Generic instruction for SI_CALL, so we can select the register bank and
// insert a waterfall loop if necessary.
def G_SI_CALL : AMDGPUGenericInstruction {
  let OutOperandList = (outs SReg_64:$dst);
  let InOperandList = (ins type0:$src0, unknown:$callee);
  let Size = 4;
  let isCall = 1;
  let UseNamedOperandTable = 1;
  let SchedRW = [WriteBranch];
  // TODO: Should really base this on the call target.
  let isConvergent = 1;
}

def G_FPTRUNC_ROUND_UPWARD : AMDGPUGenericInstruction {
  let OutOperandList = (outs type0:$vdst);
  let InOperandList = (ins type1:$src0);
  let hasSideEffects = 0;
}

def G_FPTRUNC_ROUND_DOWNWARD : AMDGPUGenericInstruction {
  let OutOperandList = (outs type0:$vdst);
  let InOperandList = (ins type1:$src0);
  let hasSideEffects = 0;
}

//===----------------------------------------------------------------------===//
// Dummy Instructions
//===----------------------------------------------------------------------===//

def V_ILLEGAL_gfx6_gfx7_gfx8_gfx9 : Enc32, InstSI<(outs), (ins), "v_illegal"> {
  let Inst{31-0} = 0xFFFFFFFF;
  let FixedSize = 1;
  let Size = 4;
  let Uses = [EXEC];
  let hasSideEffects = 1;
  let SubtargetPredicate = isGFX6GFX7GFX8GFX9;
}

def V_ILLEGAL : Enc32, InstSI<(outs), (ins), "v_illegal"> {
  let Inst{31-0} = 0x00000000;
  let FixedSize = 1;
  let Size = 4;
  let Uses = [EXEC];
  let hasSideEffects = 1;
  let SubtargetPredicate = isGFX10Plus;
}