1//===-- SIInstructions.td - SI Instruction Definitions --------------------===// 2// 3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4// See https://llvm.org/LICENSE.txt for license information. 5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6// 7//===----------------------------------------------------------------------===// 8// This file was originally auto-generated from a GPU register header file and 9// all the instruction definitions were originally commented out. Instructions 10// that are not yet supported remain commented out. 11//===----------------------------------------------------------------------===// 12 13class GCNPat<dag pattern, dag result> : Pat<pattern, result>, GCNPredicateControl { 14 15} 16 17include "SOPInstructions.td" 18include "VOPInstructions.td" 19include "SMInstructions.td" 20include "FLATInstructions.td" 21include "BUFInstructions.td" 22 23//===----------------------------------------------------------------------===// 24// EXP Instructions 25//===----------------------------------------------------------------------===// 26 27defm EXP : EXP_m<0>; 28defm EXP_DONE : EXP_m<1>; 29 30class ExpPattern<ValueType vt, Instruction Inst, int done_val> : GCNPat< 31 (int_amdgcn_exp timm:$tgt, timm:$en, 32 (vt ExpSrc0:$src0), (vt ExpSrc1:$src1), 33 (vt ExpSrc2:$src2), (vt ExpSrc3:$src3), 34 done_val, timm:$vm), 35 (Inst timm:$tgt, ExpSrc0:$src0, ExpSrc1:$src1, 36 ExpSrc2:$src2, ExpSrc3:$src3, timm:$vm, 0, timm:$en) 37>; 38 39class ExpComprPattern<ValueType vt, Instruction Inst, int done_val> : GCNPat< 40 (int_amdgcn_exp_compr timm:$tgt, timm:$en, 41 (vt ExpSrc0:$src0), (vt ExpSrc1:$src1), 42 done_val, timm:$vm), 43 (Inst timm:$tgt, ExpSrc0:$src0, ExpSrc1:$src1, 44 (IMPLICIT_DEF), (IMPLICIT_DEF), timm:$vm, 1, timm:$en) 45>; 46 47// FIXME: The generated DAG matcher seems to have strange behavior 48// with a 1-bit literal to match, so use a -1 for checking a true 49// 1-bit value. 50def : ExpPattern<i32, EXP, 0>; 51def : ExpPattern<i32, EXP_DONE, -1>; 52def : ExpPattern<f32, EXP, 0>; 53def : ExpPattern<f32, EXP_DONE, -1>; 54 55def : ExpComprPattern<v2i16, EXP, 0>; 56def : ExpComprPattern<v2i16, EXP_DONE, -1>; 57def : ExpComprPattern<v2f16, EXP, 0>; 58def : ExpComprPattern<v2f16, EXP_DONE, -1>; 59 60//===----------------------------------------------------------------------===// 61// VINTRP Instructions 62//===----------------------------------------------------------------------===// 63 64// Used to inject printing of "_e32" suffix for VI (there are "_e64" variants for VI) 65def VINTRPDst : VINTRPDstOperand <VGPR_32>; 66 67let Uses = [MODE, M0, EXEC] in { 68 69// FIXME: Specify SchedRW for VINTRP instructions. 70 71multiclass V_INTERP_P1_F32_m : VINTRP_m < 72 0x00000000, 73 (outs VINTRPDst:$vdst), 74 (ins VGPR_32:$vsrc, Attr:$attr, AttrChan:$attrchan), 75 "v_interp_p1_f32$vdst, $vsrc, $attr$attrchan", 76 [(set f32:$vdst, (int_amdgcn_interp_p1 f32:$vsrc, 77 (i32 timm:$attrchan), (i32 timm:$attr), M0))] 78>; 79 80let OtherPredicates = [has32BankLDS] in { 81 82defm V_INTERP_P1_F32 : V_INTERP_P1_F32_m; 83 84} // End OtherPredicates = [has32BankLDS] 85 86let OtherPredicates = [has16BankLDS], Constraints = "@earlyclobber $vdst", isAsmParserOnly=1 in { 87 88defm V_INTERP_P1_F32_16bank : V_INTERP_P1_F32_m; 89 90} // End OtherPredicates = [has32BankLDS], Constraints = "@earlyclobber $vdst", isAsmParserOnly=1 91 92let DisableEncoding = "$src0", Constraints = "$src0 = $vdst" in { 93 94defm V_INTERP_P2_F32 : VINTRP_m < 95 0x00000001, 96 (outs VINTRPDst:$vdst), 97 (ins VGPR_32:$src0, VGPR_32:$vsrc, Attr:$attr, AttrChan:$attrchan), 98 "v_interp_p2_f32$vdst, $vsrc, $attr$attrchan", 99 [(set f32:$vdst, (int_amdgcn_interp_p2 f32:$src0, f32:$vsrc, 100 (i32 timm:$attrchan), (i32 timm:$attr), M0))]>; 101 102} // End DisableEncoding = "$src0", Constraints = "$src0 = $vdst" 103 104defm V_INTERP_MOV_F32 : VINTRP_m < 105 0x00000002, 106 (outs VINTRPDst:$vdst), 107 (ins InterpSlot:$vsrc, Attr:$attr, AttrChan:$attrchan), 108 "v_interp_mov_f32$vdst, $vsrc, $attr$attrchan", 109 [(set f32:$vdst, (int_amdgcn_interp_mov (i32 timm:$vsrc), 110 (i32 timm:$attrchan), (i32 timm:$attr), M0))]>; 111 112} // End Uses = [MODE, M0, EXEC] 113 114//===----------------------------------------------------------------------===// 115// Pseudo Instructions 116//===----------------------------------------------------------------------===// 117def ATOMIC_FENCE : SPseudoInstSI< 118 (outs), (ins i32imm:$ordering, i32imm:$scope), 119 [(atomic_fence (i32 timm:$ordering), (i32 timm:$scope))], 120 "ATOMIC_FENCE $ordering, $scope"> { 121 let hasSideEffects = 1; 122 let maybeAtomic = 1; 123} 124 125def VOP_I64_I64_DPP : VOPProfile <[i64, i64, untyped, untyped]> { 126 let HasExt = 1; 127 let HasExtDPP = 1; 128} 129 130let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] in { 131 132// For use in patterns 133def V_CNDMASK_B64_PSEUDO : VOP3Common <(outs VReg_64:$vdst), 134 (ins VSrc_b64:$src0, VSrc_b64:$src1, SSrc_b64:$src2), "", []> { 135 let isPseudo = 1; 136 let isCodeGenOnly = 1; 137 let usesCustomInserter = 1; 138} 139 140// 64-bit vector move instruction. This is mainly used by the 141// SIFoldOperands pass to enable folding of inline immediates. 142def V_MOV_B64_PSEUDO : VPseudoInstSI <(outs VReg_64:$vdst), 143 (ins VSrc_b64:$src0)>; 144 145// 64-bit vector move with dpp. Expanded post-RA. 146def V_MOV_B64_DPP_PSEUDO : VOP_DPP_Pseudo <"v_mov_b64_dpp", VOP_I64_I64_DPP> { 147 let Size = 16; // Requires two 8-byte v_mov_b32_dpp to complete. 148} 149 150// Pseudoinstruction for @llvm.amdgcn.wqm. It is turned into a copy after the 151// WQM pass processes it. 152def WQM : PseudoInstSI <(outs unknown:$vdst), (ins unknown:$src0)>; 153 154// Pseudoinstruction for @llvm.amdgcn.softwqm. Like @llvm.amdgcn.wqm it is 155// turned into a copy by WQM pass, but does not seed WQM requirements. 156def SOFT_WQM : PseudoInstSI <(outs unknown:$vdst), (ins unknown:$src0)>; 157 158// Pseudoinstruction for @llvm.amdgcn.wwm. It is turned into a copy post-RA, so 159// that the @earlyclobber is respected. The @earlyclobber is to make sure that 160// the instruction that defines $src0 (which is run in WWM) doesn't 161// accidentally clobber inactive channels of $vdst. 162let Constraints = "@earlyclobber $vdst" in { 163def WWM : PseudoInstSI <(outs unknown:$vdst), (ins unknown:$src0)>; 164} 165 166} // End let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] 167 168def ENTER_WWM : SPseudoInstSI <(outs SReg_1:$sdst), (ins i64imm:$src0)> { 169 let Uses = [EXEC]; 170 let Defs = [EXEC, SCC]; 171 let hasSideEffects = 0; 172 let mayLoad = 0; 173 let mayStore = 0; 174} 175 176def EXIT_WWM : SPseudoInstSI <(outs SReg_1:$sdst), (ins SReg_1:$src0)> { 177 let hasSideEffects = 0; 178 let mayLoad = 0; 179 let mayStore = 0; 180} 181 182// Invert the exec mask and overwrite the inactive lanes of dst with inactive, 183// restoring it after we're done. 184def V_SET_INACTIVE_B32 : VPseudoInstSI <(outs VGPR_32:$vdst), 185 (ins VGPR_32: $src, VSrc_b32:$inactive), 186 [(set i32:$vdst, (int_amdgcn_set_inactive i32:$src, i32:$inactive))]> { 187 let Constraints = "$src = $vdst"; 188} 189 190def V_SET_INACTIVE_B64 : VPseudoInstSI <(outs VReg_64:$vdst), 191 (ins VReg_64: $src, VSrc_b64:$inactive), 192 [(set i64:$vdst, (int_amdgcn_set_inactive i64:$src, i64:$inactive))]> { 193 let Constraints = "$src = $vdst"; 194} 195 196let usesCustomInserter = 1, Defs = [VCC, EXEC] in { 197def V_ADD_U64_PSEUDO : VPseudoInstSI < 198 (outs VReg_64:$vdst), (ins VSrc_b64:$src0, VSrc_b64:$src1), 199 [(set VReg_64:$vdst, (getDivergentFrag<add>.ret i64:$src0, i64:$src1))] 200>; 201 202def V_SUB_U64_PSEUDO : VPseudoInstSI < 203 (outs VReg_64:$vdst), (ins VSrc_b64:$src0, VSrc_b64:$src1), 204 [(set VReg_64:$vdst, (getDivergentFrag<sub>.ret i64:$src0, i64:$src1))] 205>; 206} // End usesCustomInserter = 1, Defs = [VCC, EXEC] 207 208let usesCustomInserter = 1, Defs = [SCC] in { 209def S_ADD_U64_PSEUDO : SPseudoInstSI < 210 (outs SReg_64:$sdst), (ins SSrc_b64:$src0, SSrc_b64:$src1), 211 [(set SReg_64:$sdst, (UniformBinFrag<add> i64:$src0, i64:$src1))] 212>; 213 214def S_SUB_U64_PSEUDO : SPseudoInstSI < 215 (outs SReg_64:$sdst), (ins SSrc_b64:$src0, SSrc_b64:$src1), 216 [(set SReg_64:$sdst, (UniformBinFrag<sub> i64:$src0, i64:$src1))] 217>; 218 219def S_ADD_U64_CO_PSEUDO : SPseudoInstSI < 220 (outs SReg_64:$vdst, VOPDstS64orS32:$sdst), (ins SSrc_b64:$src0, SSrc_b64:$src1) 221>; 222 223def S_SUB_U64_CO_PSEUDO : SPseudoInstSI < 224 (outs SReg_64:$vdst, VOPDstS64orS32:$sdst), (ins SSrc_b64:$src0, SSrc_b64:$src1) 225>; 226 227def S_ADD_CO_PSEUDO : SPseudoInstSI < 228 (outs SReg_32:$sdst, SSrc_i1:$scc_out), (ins SSrc_b32:$src0, SSrc_b32:$src1, SSrc_i1:$scc_in) 229>; 230 231def S_SUB_CO_PSEUDO : SPseudoInstSI < 232 (outs SReg_32:$sdst, SSrc_i1:$scc_out), (ins SSrc_b32:$src0, SSrc_b32:$src1, SSrc_i1:$scc_in) 233>; 234 235def S_UADDO_PSEUDO : SPseudoInstSI < 236 (outs SReg_32:$sdst, SSrc_i1:$scc_out), (ins SSrc_b32:$src0, SSrc_b32:$src1) 237>; 238 239def S_USUBO_PSEUDO : SPseudoInstSI < 240 (outs SReg_32:$sdst, SSrc_i1:$scc_out), (ins SSrc_b32:$src0, SSrc_b32:$src1) 241>; 242 243} // End usesCustomInserter = 1, Defs = [SCC] 244 245let usesCustomInserter = 1 in { 246def GET_GROUPSTATICSIZE : SPseudoInstSI <(outs SReg_32:$sdst), (ins), 247 [(set SReg_32:$sdst, (int_amdgcn_groupstaticsize))]>; 248} // End let usesCustomInserter = 1, SALU = 1 249 250// Wrap an instruction by duplicating it, except for setting isTerminator. 251class WrapTerminatorInst<SOP_Pseudo base_inst> : SPseudoInstSI< 252 base_inst.OutOperandList, 253 base_inst.InOperandList> { 254 let Uses = base_inst.Uses; 255 let Defs = base_inst.Defs; 256 let isTerminator = 1; 257 let isAsCheapAsAMove = base_inst.isAsCheapAsAMove; 258 let hasSideEffects = base_inst.hasSideEffects; 259 let UseNamedOperandTable = base_inst.UseNamedOperandTable; 260 let CodeSize = base_inst.CodeSize; 261 let SchedRW = base_inst.SchedRW; 262} 263 264let WaveSizePredicate = isWave64 in { 265def S_MOV_B64_term : WrapTerminatorInst<S_MOV_B64>; 266def S_XOR_B64_term : WrapTerminatorInst<S_XOR_B64>; 267def S_ANDN2_B64_term : WrapTerminatorInst<S_ANDN2_B64>; 268} 269 270let WaveSizePredicate = isWave32 in { 271def S_MOV_B32_term : WrapTerminatorInst<S_MOV_B32>; 272def S_XOR_B32_term : WrapTerminatorInst<S_XOR_B32>; 273def S_OR_B32_term : WrapTerminatorInst<S_OR_B32>; 274def S_ANDN2_B32_term : WrapTerminatorInst<S_ANDN2_B32>; 275} 276 277 278def WAVE_BARRIER : SPseudoInstSI<(outs), (ins), 279 [(int_amdgcn_wave_barrier)]> { 280 let SchedRW = []; 281 let hasNoSchedulingInfo = 1; 282 let hasSideEffects = 1; 283 let mayLoad = 0; 284 let mayStore = 0; 285 let isConvergent = 1; 286 let FixedSize = 1; 287 let Size = 0; 288} 289 290// SI pseudo instructions. These are used by the CFG structurizer pass 291// and should be lowered to ISA instructions prior to codegen. 292 293// Dummy terminator instruction to use after control flow instructions 294// replaced with exec mask operations. 295def SI_MASK_BRANCH : VPseudoInstSI < 296 (outs), (ins brtarget:$target)> { 297 let isBranch = 0; 298 let isTerminator = 1; 299 let isBarrier = 0; 300 let SchedRW = []; 301 let hasNoSchedulingInfo = 1; 302 let FixedSize = 1; 303 let Size = 0; 304} 305 306let isTerminator = 1 in { 307 308let OtherPredicates = [EnableLateCFGStructurize] in { 309 def SI_NON_UNIFORM_BRCOND_PSEUDO : CFPseudoInstSI < 310 (outs), 311 (ins SReg_1:$vcc, brtarget:$target), 312 [(brcond i1:$vcc, bb:$target)]> { 313 let Size = 12; 314} 315} 316 317def SI_IF: CFPseudoInstSI < 318 (outs SReg_1:$dst), (ins SReg_1:$vcc, brtarget:$target), 319 [(set i1:$dst, (AMDGPUif i1:$vcc, bb:$target))], 1, 1> { 320 let Constraints = ""; 321 let Size = 12; 322 let hasSideEffects = 1; 323} 324 325def SI_ELSE : CFPseudoInstSI < 326 (outs SReg_1:$dst), 327 (ins SReg_1:$src, brtarget:$target, i1imm:$execfix), [], 1, 1> { 328 let Size = 12; 329 let hasSideEffects = 1; 330} 331 332def SI_LOOP : CFPseudoInstSI < 333 (outs), (ins SReg_1:$saved, brtarget:$target), 334 [(AMDGPUloop i1:$saved, bb:$target)], 1, 1> { 335 let Size = 8; 336 let isBranch = 1; 337 let hasSideEffects = 1; 338} 339 340} // End isTerminator = 1 341 342def SI_END_CF : CFPseudoInstSI < 343 (outs), (ins SReg_1:$saved), [], 1, 1> { 344 let Size = 4; 345 let isAsCheapAsAMove = 1; 346 let isReMaterializable = 1; 347 let hasSideEffects = 1; 348 let mayLoad = 1; // FIXME: Should not need memory flags 349 let mayStore = 1; 350} 351 352def SI_IF_BREAK : CFPseudoInstSI < 353 (outs SReg_1:$dst), (ins SReg_1:$vcc, SReg_1:$src), []> { 354 let Size = 4; 355 let isAsCheapAsAMove = 1; 356 let isReMaterializable = 1; 357} 358 359let Uses = [EXEC] in { 360 361multiclass PseudoInstKill <dag ins> { 362 // Even though this pseudo can usually be expanded without an SCC def, we 363 // conservatively assume that it has an SCC def, both because it is sometimes 364 // required in degenerate cases (when V_CMPX cannot be used due to constant 365 // bus limitations) and because it allows us to avoid having to track SCC 366 // liveness across basic blocks. 367 let Defs = [EXEC,VCC,SCC] in 368 def _PSEUDO : PseudoInstSI <(outs), ins> { 369 let isConvergent = 1; 370 let usesCustomInserter = 1; 371 } 372 373 let Defs = [EXEC,VCC,SCC] in 374 def _TERMINATOR : SPseudoInstSI <(outs), ins> { 375 let isTerminator = 1; 376 } 377} 378 379defm SI_KILL_I1 : PseudoInstKill <(ins SCSrc_i1:$src, i1imm:$killvalue)>; 380defm SI_KILL_F32_COND_IMM : PseudoInstKill <(ins VSrc_b32:$src0, i32imm:$src1, i32imm:$cond)>; 381 382let Defs = [EXEC] in 383def SI_KILL_CLEANUP : SPseudoInstSI <(outs), (ins)>; 384 385let Defs = [EXEC,VCC] in 386def SI_ILLEGAL_COPY : SPseudoInstSI < 387 (outs unknown:$dst), (ins unknown:$src), 388 [], " ; illegal copy $src to $dst">; 389 390} // End Uses = [EXEC], Defs = [EXEC,VCC] 391 392// Branch on undef scc. Used to avoid intermediate copy from 393// IMPLICIT_DEF to SCC. 394def SI_BR_UNDEF : SPseudoInstSI <(outs), (ins sopp_brtarget:$simm16)> { 395 let isTerminator = 1; 396 let usesCustomInserter = 1; 397 let isBranch = 1; 398} 399 400def SI_PS_LIVE : PseudoInstSI < 401 (outs SReg_1:$dst), (ins), 402 [(set i1:$dst, (int_amdgcn_ps_live))]> { 403 let SALU = 1; 404} 405 406def SI_MASKED_UNREACHABLE : SPseudoInstSI <(outs), (ins), 407 [(int_amdgcn_unreachable)], 408 "; divergent unreachable"> { 409 let Size = 0; 410 let hasNoSchedulingInfo = 1; 411 let FixedSize = 1; 412} 413 414// Used as an isel pseudo to directly emit initialization with an 415// s_mov_b32 rather than a copy of another initialized 416// register. MachineCSE skips copies, and we don't want to have to 417// fold operands before it runs. 418def SI_INIT_M0 : SPseudoInstSI <(outs), (ins SSrc_b32:$src)> { 419 let Defs = [M0]; 420 let usesCustomInserter = 1; 421 let isAsCheapAsAMove = 1; 422 let isReMaterializable = 1; 423} 424 425def SI_INIT_EXEC : SPseudoInstSI < 426 (outs), (ins i64imm:$src), 427 [(int_amdgcn_init_exec (i64 timm:$src))]> { 428 let Defs = [EXEC]; 429 let usesCustomInserter = 1; 430 let isAsCheapAsAMove = 1; 431 let WaveSizePredicate = isWave64; 432} 433 434// FIXME: Intrinsic should be mangled for wave size. 435def SI_INIT_EXEC_LO : SPseudoInstSI < 436 (outs), (ins i32imm:$src), []> { 437 let Defs = [EXEC_LO]; 438 let usesCustomInserter = 1; 439 let isAsCheapAsAMove = 1; 440 let WaveSizePredicate = isWave32; 441} 442 443// FIXME: Wave32 version 444def SI_INIT_EXEC_FROM_INPUT : SPseudoInstSI < 445 (outs), (ins SSrc_b32:$input, i32imm:$shift), 446 [(int_amdgcn_init_exec_from_input i32:$input, (i32 timm:$shift))]> { 447 let Defs = [EXEC]; 448 let usesCustomInserter = 1; 449} 450 451def : GCNPat < 452 (int_amdgcn_init_exec timm:$src), 453 (SI_INIT_EXEC_LO (as_i32timm timm:$src))> { 454 let WaveSizePredicate = isWave32; 455} 456 457// Return for returning shaders to a shader variant epilog. 458def SI_RETURN_TO_EPILOG : SPseudoInstSI < 459 (outs), (ins variable_ops), [(AMDGPUreturn_to_epilog)]> { 460 let isTerminator = 1; 461 let isBarrier = 1; 462 let isReturn = 1; 463 let hasNoSchedulingInfo = 1; 464 let DisableWQM = 1; 465 let FixedSize = 1; 466} 467 468// Return for returning function calls. 469def SI_RETURN : SPseudoInstSI < 470 (outs), (ins), [], 471 "; return"> { 472 let isTerminator = 1; 473 let isBarrier = 1; 474 let isReturn = 1; 475 let SchedRW = [WriteBranch]; 476} 477 478// Return for returning function calls without output register. 479// 480// This version is only needed so we can fill in the output register 481// in the custom inserter. 482def SI_CALL_ISEL : SPseudoInstSI < 483 (outs), (ins SSrc_b64:$src0, unknown:$callee), 484 [(AMDGPUcall i64:$src0, tglobaladdr:$callee)]> { 485 let Size = 4; 486 let isCall = 1; 487 let SchedRW = [WriteBranch]; 488 let usesCustomInserter = 1; 489 // TODO: Should really base this on the call target 490 let isConvergent = 1; 491} 492 493def : GCNPat< 494 (AMDGPUcall i64:$src0, (i64 0)), 495 (SI_CALL_ISEL $src0, (i64 0)) 496>; 497 498// Wrapper around s_swappc_b64 with extra $callee parameter to track 499// the called function after regalloc. 500def SI_CALL : SPseudoInstSI < 501 (outs SReg_64:$dst), (ins SSrc_b64:$src0, unknown:$callee)> { 502 let Size = 4; 503 let isCall = 1; 504 let UseNamedOperandTable = 1; 505 let SchedRW = [WriteBranch]; 506 // TODO: Should really base this on the call target 507 let isConvergent = 1; 508} 509 510// Tail call handling pseudo 511def SI_TCRETURN : SPseudoInstSI <(outs), 512 (ins SSrc_b64:$src0, unknown:$callee, i32imm:$fpdiff), 513 [(AMDGPUtc_return i64:$src0, tglobaladdr:$callee, i32:$fpdiff)]> { 514 let Size = 4; 515 let isCall = 1; 516 let isTerminator = 1; 517 let isReturn = 1; 518 let isBarrier = 1; 519 let UseNamedOperandTable = 1; 520 let SchedRW = [WriteBranch]; 521 // TODO: Should really base this on the call target 522 let isConvergent = 1; 523} 524 525 526def ADJCALLSTACKUP : SPseudoInstSI< 527 (outs), (ins i32imm:$amt0, i32imm:$amt1), 528 [(callseq_start timm:$amt0, timm:$amt1)], 529 "; adjcallstackup $amt0 $amt1"> { 530 let Size = 8; // Worst case. (s_add_u32 + constant) 531 let FixedSize = 1; 532 let hasSideEffects = 1; 533 let usesCustomInserter = 1; 534 let SchedRW = [WriteSALU]; 535 let Defs = [SCC]; 536} 537 538def ADJCALLSTACKDOWN : SPseudoInstSI< 539 (outs), (ins i32imm:$amt1, i32imm:$amt2), 540 [(callseq_end timm:$amt1, timm:$amt2)], 541 "; adjcallstackdown $amt1"> { 542 let Size = 8; // Worst case. (s_add_u32 + constant) 543 let hasSideEffects = 1; 544 let usesCustomInserter = 1; 545 let SchedRW = [WriteSALU]; 546 let Defs = [SCC]; 547} 548 549let Defs = [M0, EXEC, SCC], 550 UseNamedOperandTable = 1 in { 551 552// SI_INDIRECT_SRC/DST are only used by legacy SelectionDAG indirect 553// addressing implementation. 554class SI_INDIRECT_SRC<RegisterClass rc> : VPseudoInstSI < 555 (outs VGPR_32:$vdst), 556 (ins rc:$src, VS_32:$idx, i32imm:$offset)> { 557 let usesCustomInserter = 1; 558} 559 560class SI_INDIRECT_DST<RegisterClass rc> : VPseudoInstSI < 561 (outs rc:$vdst), 562 (ins rc:$src, VS_32:$idx, i32imm:$offset, VGPR_32:$val)> { 563 let Constraints = "$src = $vdst"; 564 let usesCustomInserter = 1; 565} 566 567def SI_INDIRECT_SRC_V1 : SI_INDIRECT_SRC<VGPR_32>; 568def SI_INDIRECT_SRC_V2 : SI_INDIRECT_SRC<VReg_64>; 569def SI_INDIRECT_SRC_V4 : SI_INDIRECT_SRC<VReg_128>; 570def SI_INDIRECT_SRC_V8 : SI_INDIRECT_SRC<VReg_256>; 571def SI_INDIRECT_SRC_V16 : SI_INDIRECT_SRC<VReg_512>; 572def SI_INDIRECT_SRC_V32 : SI_INDIRECT_SRC<VReg_1024>; 573 574def SI_INDIRECT_DST_V1 : SI_INDIRECT_DST<VGPR_32>; 575def SI_INDIRECT_DST_V2 : SI_INDIRECT_DST<VReg_64>; 576def SI_INDIRECT_DST_V4 : SI_INDIRECT_DST<VReg_128>; 577def SI_INDIRECT_DST_V8 : SI_INDIRECT_DST<VReg_256>; 578def SI_INDIRECT_DST_V16 : SI_INDIRECT_DST<VReg_512>; 579def SI_INDIRECT_DST_V32 : SI_INDIRECT_DST<VReg_1024>; 580 581} // End Uses = [EXEC], Defs = [M0, EXEC] 582 583 584// This is a pseudo variant of the v_movreld_b32 (or v_mov_b32 585// expecting to be executed with gpr indexing mode enabled) 586// instruction in which the vector operand appears only twice, once as 587// def and once as use. Using this pseudo avoids problems with the Two 588// Address instructions pass. 589class INDIRECT_REG_WRITE_pseudo<RegisterClass rc, 590 RegisterOperand val_ty> : PseudoInstSI < 591 (outs rc:$vdst), (ins rc:$vsrc, val_ty:$val, i32imm:$subreg)> { 592 let Constraints = "$vsrc = $vdst"; 593 let Uses = [M0]; 594} 595 596class V_INDIRECT_REG_WRITE_B32_pseudo<RegisterClass rc> : 597 INDIRECT_REG_WRITE_pseudo<rc, VSrc_b32> { 598 let VALU = 1; 599 let VOP1 = 1; 600 let Uses = [M0, EXEC]; 601} 602 603class S_INDIRECT_REG_WRITE_pseudo<RegisterClass rc, 604 RegisterOperand val_ty> : 605 INDIRECT_REG_WRITE_pseudo<rc, val_ty> { 606 let SALU = 1; 607 let SOP1 = 1; 608 let Uses = [M0]; 609} 610 611class S_INDIRECT_REG_WRITE_B32_pseudo<RegisterClass rc> : 612 S_INDIRECT_REG_WRITE_pseudo<rc, SSrc_b32>; 613class S_INDIRECT_REG_WRITE_B64_pseudo<RegisterClass rc> : 614 S_INDIRECT_REG_WRITE_pseudo<rc, SSrc_b64>; 615 616 617def V_INDIRECT_REG_WRITE_B32_V1 : V_INDIRECT_REG_WRITE_B32_pseudo<VGPR_32>; 618def V_INDIRECT_REG_WRITE_B32_V2 : V_INDIRECT_REG_WRITE_B32_pseudo<VReg_64>; 619def V_INDIRECT_REG_WRITE_B32_V3 : V_INDIRECT_REG_WRITE_B32_pseudo<VReg_96>; 620def V_INDIRECT_REG_WRITE_B32_V4 : V_INDIRECT_REG_WRITE_B32_pseudo<VReg_128>; 621def V_INDIRECT_REG_WRITE_B32_V5 : V_INDIRECT_REG_WRITE_B32_pseudo<VReg_160>; 622def V_INDIRECT_REG_WRITE_B32_V8 : V_INDIRECT_REG_WRITE_B32_pseudo<VReg_256>; 623def V_INDIRECT_REG_WRITE_B32_V16 : V_INDIRECT_REG_WRITE_B32_pseudo<VReg_512>; 624def V_INDIRECT_REG_WRITE_B32_V32 : V_INDIRECT_REG_WRITE_B32_pseudo<VReg_1024>; 625 626def S_INDIRECT_REG_WRITE_B32_V1 : S_INDIRECT_REG_WRITE_B32_pseudo<SReg_32>; 627def S_INDIRECT_REG_WRITE_B32_V2 : S_INDIRECT_REG_WRITE_B32_pseudo<SReg_64>; 628def S_INDIRECT_REG_WRITE_B32_V3 : S_INDIRECT_REG_WRITE_B32_pseudo<SReg_96>; 629def S_INDIRECT_REG_WRITE_B32_V4 : S_INDIRECT_REG_WRITE_B32_pseudo<SReg_128>; 630def S_INDIRECT_REG_WRITE_B32_V5 : S_INDIRECT_REG_WRITE_B32_pseudo<SReg_160>; 631def S_INDIRECT_REG_WRITE_B32_V8 : S_INDIRECT_REG_WRITE_B32_pseudo<SReg_256>; 632def S_INDIRECT_REG_WRITE_B32_V16 : S_INDIRECT_REG_WRITE_B32_pseudo<SReg_512>; 633def S_INDIRECT_REG_WRITE_B32_V32 : S_INDIRECT_REG_WRITE_B32_pseudo<SReg_1024>; 634 635def S_INDIRECT_REG_WRITE_B64_V1 : S_INDIRECT_REG_WRITE_B64_pseudo<SReg_64>; 636def S_INDIRECT_REG_WRITE_B64_V2 : S_INDIRECT_REG_WRITE_B64_pseudo<SReg_128>; 637def S_INDIRECT_REG_WRITE_B64_V4 : S_INDIRECT_REG_WRITE_B64_pseudo<SReg_256>; 638def S_INDIRECT_REG_WRITE_B64_V8 : S_INDIRECT_REG_WRITE_B64_pseudo<SReg_512>; 639def S_INDIRECT_REG_WRITE_B64_V16 : S_INDIRECT_REG_WRITE_B64_pseudo<SReg_1024>; 640 641 642multiclass SI_SPILL_SGPR <RegisterClass sgpr_class> { 643 let UseNamedOperandTable = 1, SGPRSpill = 1, Uses = [EXEC] in { 644 def _SAVE : PseudoInstSI < 645 (outs), 646 (ins sgpr_class:$data, i32imm:$addr)> { 647 let mayStore = 1; 648 let mayLoad = 0; 649 } 650 651 def _RESTORE : PseudoInstSI < 652 (outs sgpr_class:$data), 653 (ins i32imm:$addr)> { 654 let mayStore = 0; 655 let mayLoad = 1; 656 } 657 } // End UseNamedOperandTable = 1 658} 659 660// You cannot use M0 as the output of v_readlane_b32 instructions or 661// use it in the sdata operand of SMEM instructions. We still need to 662// be able to spill the physical register m0, so allow it for 663// SI_SPILL_32_* instructions. 664defm SI_SPILL_S32 : SI_SPILL_SGPR <SReg_32>; 665defm SI_SPILL_S64 : SI_SPILL_SGPR <SReg_64>; 666defm SI_SPILL_S96 : SI_SPILL_SGPR <SReg_96>; 667defm SI_SPILL_S128 : SI_SPILL_SGPR <SReg_128>; 668defm SI_SPILL_S160 : SI_SPILL_SGPR <SReg_160>; 669defm SI_SPILL_S192 : SI_SPILL_SGPR <SReg_192>; 670defm SI_SPILL_S256 : SI_SPILL_SGPR <SReg_256>; 671defm SI_SPILL_S512 : SI_SPILL_SGPR <SReg_512>; 672defm SI_SPILL_S1024 : SI_SPILL_SGPR <SReg_1024>; 673 674multiclass SI_SPILL_VGPR <RegisterClass vgpr_class> { 675 let UseNamedOperandTable = 1, VGPRSpill = 1, 676 SchedRW = [WriteVMEM] in { 677 def _SAVE : VPseudoInstSI < 678 (outs), 679 (ins vgpr_class:$vdata, i32imm:$vaddr, SReg_128:$srsrc, 680 SReg_32:$soffset, i32imm:$offset)> { 681 let mayStore = 1; 682 let mayLoad = 0; 683 // (2 * 4) + (8 * num_subregs) bytes maximum 684 int MaxSize = !add(!shl(!srl(vgpr_class.Size, 5), 3), 8); 685 // Size field is unsigned char and cannot fit more. 686 let Size = !if(!le(MaxSize, 256), MaxSize, 252); 687 } 688 689 def _RESTORE : VPseudoInstSI < 690 (outs vgpr_class:$vdata), 691 (ins i32imm:$vaddr, SReg_128:$srsrc, SReg_32:$soffset, 692 i32imm:$offset)> { 693 let mayStore = 0; 694 let mayLoad = 1; 695 696 // (2 * 4) + (8 * num_subregs) bytes maximum 697 int MaxSize = !add(!shl(!srl(vgpr_class.Size, 5), 3), 8); 698 // Size field is unsigned char and cannot fit more. 699 let Size = !if(!le(MaxSize, 256), MaxSize, 252); 700 } 701 } // End UseNamedOperandTable = 1, VGPRSpill = 1, SchedRW = [WriteVMEM] 702} 703 704defm SI_SPILL_V32 : SI_SPILL_VGPR <VGPR_32>; 705defm SI_SPILL_V64 : SI_SPILL_VGPR <VReg_64>; 706defm SI_SPILL_V96 : SI_SPILL_VGPR <VReg_96>; 707defm SI_SPILL_V128 : SI_SPILL_VGPR <VReg_128>; 708defm SI_SPILL_V160 : SI_SPILL_VGPR <VReg_160>; 709defm SI_SPILL_V192 : SI_SPILL_VGPR <VReg_192>; 710defm SI_SPILL_V256 : SI_SPILL_VGPR <VReg_256>; 711defm SI_SPILL_V512 : SI_SPILL_VGPR <VReg_512>; 712defm SI_SPILL_V1024 : SI_SPILL_VGPR <VReg_1024>; 713 714multiclass SI_SPILL_AGPR <RegisterClass vgpr_class> { 715 let UseNamedOperandTable = 1, VGPRSpill = 1, 716 Constraints = "@earlyclobber $tmp", 717 SchedRW = [WriteVMEM] in { 718 def _SAVE : VPseudoInstSI < 719 (outs VGPR_32:$tmp), 720 (ins vgpr_class:$vdata, i32imm:$vaddr, SReg_128:$srsrc, 721 SReg_32:$soffset, i32imm:$offset)> { 722 let mayStore = 1; 723 let mayLoad = 0; 724 // (2 * 4) + (16 * num_subregs) bytes maximum 725 int MaxSize = !add(!shl(!srl(vgpr_class.Size, 5), 4), 8); 726 // Size field is unsigned char and cannot fit more. 727 let Size = !if(!le(MaxSize, 256), MaxSize, 252); 728 } 729 730 def _RESTORE : VPseudoInstSI < 731 (outs vgpr_class:$vdata, VGPR_32:$tmp), 732 (ins i32imm:$vaddr, SReg_128:$srsrc, SReg_32:$soffset, 733 i32imm:$offset)> { 734 let mayStore = 0; 735 let mayLoad = 1; 736 737 // (2 * 4) + (16 * num_subregs) bytes maximum 738 int MaxSize = !add(!shl(!srl(vgpr_class.Size, 5), 4), 8); 739 // Size field is unsigned char and cannot fit more. 740 let Size = !if(!le(MaxSize, 256), MaxSize, 252); 741 } 742 } // End UseNamedOperandTable = 1, VGPRSpill = 1, SchedRW = [WriteVMEM] 743} 744 745defm SI_SPILL_A32 : SI_SPILL_AGPR <AGPR_32>; 746defm SI_SPILL_A64 : SI_SPILL_AGPR <AReg_64>; 747defm SI_SPILL_A128 : SI_SPILL_AGPR <AReg_128>; 748defm SI_SPILL_A512 : SI_SPILL_AGPR <AReg_512>; 749defm SI_SPILL_A1024 : SI_SPILL_AGPR <AReg_1024>; 750 751def SI_PC_ADD_REL_OFFSET : SPseudoInstSI < 752 (outs SReg_64:$dst), 753 (ins si_ga:$ptr_lo, si_ga:$ptr_hi), 754 [(set SReg_64:$dst, 755 (i64 (SIpc_add_rel_offset tglobaladdr:$ptr_lo, tglobaladdr:$ptr_hi)))]> { 756 let Defs = [SCC]; 757} 758 759def : GCNPat < 760 (SIpc_add_rel_offset tglobaladdr:$ptr_lo, 0), 761 (SI_PC_ADD_REL_OFFSET $ptr_lo, (i32 0)) 762>; 763 764def : GCNPat< 765 (AMDGPUtrap timm:$trapid), 766 (S_TRAP $trapid) 767>; 768 769def : GCNPat< 770 (AMDGPUelse i1:$src, bb:$target), 771 (SI_ELSE $src, $target, 0) 772>; 773 774def : Pat < 775 (int_amdgcn_kill i1:$src), 776 (SI_KILL_I1_PSEUDO SCSrc_i1:$src, 0) 777>; 778 779def : Pat < 780 (int_amdgcn_kill (i1 (not i1:$src))), 781 (SI_KILL_I1_PSEUDO SCSrc_i1:$src, -1) 782>; 783 784def : Pat < 785 (int_amdgcn_kill (i1 (setcc f32:$src, InlineImmFP32:$imm, cond:$cond))), 786 (SI_KILL_F32_COND_IMM_PSEUDO VSrc_b32:$src, (bitcast_fpimm_to_i32 $imm), (cond_as_i32imm $cond)) 787>; 788 789 // TODO: we could add more variants for other types of conditionals 790 791def : Pat < 792 (i64 (int_amdgcn_icmp i1:$src, (i1 0), (i32 33))), 793 (COPY $src) // Return the SGPRs representing i1 src 794>; 795 796def : Pat < 797 (i32 (int_amdgcn_icmp i1:$src, (i1 0), (i32 33))), 798 (COPY $src) // Return the SGPRs representing i1 src 799>; 800 801//===----------------------------------------------------------------------===// 802// VOP1 Patterns 803//===----------------------------------------------------------------------===// 804 805let OtherPredicates = [UnsafeFPMath] in { 806 807//def : RcpPat<V_RCP_F64_e32, f64>; 808//defm : RsqPat<V_RSQ_F64_e32, f64>; 809//defm : RsqPat<V_RSQ_F32_e32, f32>; 810 811def : RsqPat<V_RSQ_F32_e32, f32>; 812def : RsqPat<V_RSQ_F64_e32, f64>; 813 814// Convert (x - floor(x)) to fract(x) 815def : GCNPat < 816 (f32 (fsub (f32 (VOP3Mods f32:$x, i32:$mods)), 817 (f32 (ffloor (f32 (VOP3Mods f32:$x, i32:$mods)))))), 818 (V_FRACT_F32_e64 $mods, $x) 819>; 820 821// Convert (x + (-floor(x))) to fract(x) 822def : GCNPat < 823 (f64 (fadd (f64 (VOP3Mods f64:$x, i32:$mods)), 824 (f64 (fneg (f64 (ffloor (f64 (VOP3Mods f64:$x, i32:$mods)))))))), 825 (V_FRACT_F64_e64 $mods, $x) 826>; 827 828} // End OtherPredicates = [UnsafeFPMath] 829 830 831// f16_to_fp patterns 832def : GCNPat < 833 (f32 (f16_to_fp i32:$src0)), 834 (V_CVT_F32_F16_e64 SRCMODS.NONE, $src0) 835>; 836 837def : GCNPat < 838 (f32 (f16_to_fp (and_oneuse i32:$src0, 0x7fff))), 839 (V_CVT_F32_F16_e64 SRCMODS.ABS, $src0) 840>; 841 842def : GCNPat < 843 (f32 (f16_to_fp (i32 (srl_oneuse (and_oneuse i32:$src0, 0x7fff0000), (i32 16))))), 844 (V_CVT_F32_F16_e64 SRCMODS.ABS, (i32 (V_LSHRREV_B32_e64 (i32 16), i32:$src0))) 845>; 846 847def : GCNPat < 848 (f32 (f16_to_fp (or_oneuse i32:$src0, 0x8000))), 849 (V_CVT_F32_F16_e64 SRCMODS.NEG_ABS, $src0) 850>; 851 852def : GCNPat < 853 (f32 (f16_to_fp (xor_oneuse i32:$src0, 0x8000))), 854 (V_CVT_F32_F16_e64 SRCMODS.NEG, $src0) 855>; 856 857def : GCNPat < 858 (f64 (fpextend f16:$src)), 859 (V_CVT_F64_F32_e32 (V_CVT_F32_F16_e32 $src)) 860>; 861 862// fp_to_fp16 patterns 863def : GCNPat < 864 (i32 (AMDGPUfp_to_f16 (f32 (VOP3Mods f32:$src0, i32:$src0_modifiers)))), 865 (V_CVT_F16_F32_e64 $src0_modifiers, f32:$src0) 866>; 867 868def : GCNPat < 869 (i32 (fp_to_sint f16:$src)), 870 (V_CVT_I32_F32_e32 (V_CVT_F32_F16_e32 VSrc_b32:$src)) 871>; 872 873def : GCNPat < 874 (i32 (fp_to_uint f16:$src)), 875 (V_CVT_U32_F32_e32 (V_CVT_F32_F16_e32 VSrc_b32:$src)) 876>; 877 878def : GCNPat < 879 (f16 (sint_to_fp i32:$src)), 880 (V_CVT_F16_F32_e32 (V_CVT_F32_I32_e32 VSrc_b32:$src)) 881>; 882 883def : GCNPat < 884 (f16 (uint_to_fp i32:$src)), 885 (V_CVT_F16_F32_e32 (V_CVT_F32_U32_e32 VSrc_b32:$src)) 886>; 887 888//===----------------------------------------------------------------------===// 889// VOP2 Patterns 890//===----------------------------------------------------------------------===// 891 892// TODO: Check only no src2 mods? 893class FMADPat <ValueType vt, Instruction inst, SDPatternOperator node> 894 : GCNPat <(vt (node (vt (VOP3NoMods vt:$src0)), 895 (vt (VOP3NoMods vt:$src1)), 896 (vt (VOP3NoMods vt:$src2)))), 897 (inst SRCMODS.NONE, $src0, SRCMODS.NONE, $src1, 898 SRCMODS.NONE, $src2, DSTCLAMP.NONE, DSTOMOD.NONE) 899>; 900 901 902// Prefer mac form when there are no modifiers. 903let AddedComplexity = 9 in { 904def : FMADPat <f32, V_MAC_F32_e64, fmad>; 905def : FMADPat <f32, V_MAC_F32_e64, AMDGPUfmad_ftz>; 906 907let SubtargetPredicate = Has16BitInsts in { 908def : FMADPat <f16, V_MAC_F16_e64, fmad>; 909def : FMADPat <f16, V_MAC_F16_e64, AMDGPUfmad_ftz>; 910} 911 912} 913 914class FMADModsPat<ValueType Ty, Instruction inst, SDPatternOperator mad_opr> 915 : GCNPat< 916 (Ty (mad_opr (Ty (VOP3Mods Ty:$src0, i32:$src0_mod)), 917 (Ty (VOP3Mods Ty:$src1, i32:$src1_mod)), 918 (Ty (VOP3Mods Ty:$src2, i32:$src2_mod)))), 919 (inst $src0_mod, $src0, $src1_mod, $src1, 920 $src2_mod, $src2, DSTCLAMP.NONE, DSTOMOD.NONE) 921>; 922 923let SubtargetPredicate = HasMadMacF32Insts in 924def : FMADModsPat<f32, V_MAD_F32, AMDGPUfmad_ftz>; 925def : FMADModsPat<f16, V_MAD_F16, AMDGPUfmad_ftz> { 926 let SubtargetPredicate = Has16BitInsts; 927} 928 929class VOPSelectModsPat <ValueType vt> : GCNPat < 930 (vt (select i1:$src0, (VOP3Mods vt:$src1, i32:$src1_mods), 931 (VOP3Mods vt:$src2, i32:$src2_mods))), 932 (V_CNDMASK_B32_e64 FP32InputMods:$src2_mods, VSrc_b32:$src2, 933 FP32InputMods:$src1_mods, VSrc_b32:$src1, SSrc_i1:$src0) 934>; 935 936class VOPSelectPat <ValueType vt> : GCNPat < 937 (vt (select i1:$src0, vt:$src1, vt:$src2)), 938 (V_CNDMASK_B32_e64 0, VSrc_b32:$src2, 0, VSrc_b32:$src1, SSrc_i1:$src0) 939>; 940 941def : VOPSelectModsPat <i32>; 942def : VOPSelectModsPat <f32>; 943def : VOPSelectPat <f16>; 944def : VOPSelectPat <i16>; 945 946let AddedComplexity = 1 in { 947def : GCNPat < 948 (i32 (add (i32 (getDivergentFrag<ctpop>.ret i32:$popcnt)), i32:$val)), 949 (V_BCNT_U32_B32_e64 $popcnt, $val) 950>; 951} 952 953def : GCNPat < 954 (i32 (ctpop i32:$popcnt)), 955 (V_BCNT_U32_B32_e64 VSrc_b32:$popcnt, (i32 0)) 956>; 957 958def : GCNPat < 959 (i16 (add (i16 (trunc (i32 (getDivergentFrag<ctpop>.ret i32:$popcnt)))), i16:$val)), 960 (V_BCNT_U32_B32_e64 $popcnt, $val) 961>; 962 963/********** ============================================ **********/ 964/********** Extraction, Insertion, Building and Casting **********/ 965/********** ============================================ **********/ 966 967foreach Index = 0-2 in { 968 def Extract_Element_v2i32_#Index : Extract_Element < 969 i32, v2i32, Index, !cast<SubRegIndex>(sub#Index) 970 >; 971 def Insert_Element_v2i32_#Index : Insert_Element < 972 i32, v2i32, Index, !cast<SubRegIndex>(sub#Index) 973 >; 974 975 def Extract_Element_v2f32_#Index : Extract_Element < 976 f32, v2f32, Index, !cast<SubRegIndex>(sub#Index) 977 >; 978 def Insert_Element_v2f32_#Index : Insert_Element < 979 f32, v2f32, Index, !cast<SubRegIndex>(sub#Index) 980 >; 981} 982 983foreach Index = 0-2 in { 984 def Extract_Element_v3i32_#Index : Extract_Element < 985 i32, v3i32, Index, !cast<SubRegIndex>(sub#Index) 986 >; 987 def Insert_Element_v3i32_#Index : Insert_Element < 988 i32, v3i32, Index, !cast<SubRegIndex>(sub#Index) 989 >; 990 991 def Extract_Element_v3f32_#Index : Extract_Element < 992 f32, v3f32, Index, !cast<SubRegIndex>(sub#Index) 993 >; 994 def Insert_Element_v3f32_#Index : Insert_Element < 995 f32, v3f32, Index, !cast<SubRegIndex>(sub#Index) 996 >; 997} 998 999foreach Index = 0-3 in { 1000 def Extract_Element_v4i32_#Index : Extract_Element < 1001 i32, v4i32, Index, !cast<SubRegIndex>(sub#Index) 1002 >; 1003 def Insert_Element_v4i32_#Index : Insert_Element < 1004 i32, v4i32, Index, !cast<SubRegIndex>(sub#Index) 1005 >; 1006 1007 def Extract_Element_v4f32_#Index : Extract_Element < 1008 f32, v4f32, Index, !cast<SubRegIndex>(sub#Index) 1009 >; 1010 def Insert_Element_v4f32_#Index : Insert_Element < 1011 f32, v4f32, Index, !cast<SubRegIndex>(sub#Index) 1012 >; 1013} 1014 1015foreach Index = 0-4 in { 1016 def Extract_Element_v5i32_#Index : Extract_Element < 1017 i32, v5i32, Index, !cast<SubRegIndex>(sub#Index) 1018 >; 1019 def Insert_Element_v5i32_#Index : Insert_Element < 1020 i32, v5i32, Index, !cast<SubRegIndex>(sub#Index) 1021 >; 1022 1023 def Extract_Element_v5f32_#Index : Extract_Element < 1024 f32, v5f32, Index, !cast<SubRegIndex>(sub#Index) 1025 >; 1026 def Insert_Element_v5f32_#Index : Insert_Element < 1027 f32, v5f32, Index, !cast<SubRegIndex>(sub#Index) 1028 >; 1029} 1030 1031foreach Index = 0-7 in { 1032 def Extract_Element_v8i32_#Index : Extract_Element < 1033 i32, v8i32, Index, !cast<SubRegIndex>(sub#Index) 1034 >; 1035 def Insert_Element_v8i32_#Index : Insert_Element < 1036 i32, v8i32, Index, !cast<SubRegIndex>(sub#Index) 1037 >; 1038 1039 def Extract_Element_v8f32_#Index : Extract_Element < 1040 f32, v8f32, Index, !cast<SubRegIndex>(sub#Index) 1041 >; 1042 def Insert_Element_v8f32_#Index : Insert_Element < 1043 f32, v8f32, Index, !cast<SubRegIndex>(sub#Index) 1044 >; 1045} 1046 1047foreach Index = 0-15 in { 1048 def Extract_Element_v16i32_#Index : Extract_Element < 1049 i32, v16i32, Index, !cast<SubRegIndex>(sub#Index) 1050 >; 1051 def Insert_Element_v16i32_#Index : Insert_Element < 1052 i32, v16i32, Index, !cast<SubRegIndex>(sub#Index) 1053 >; 1054 1055 def Extract_Element_v16f32_#Index : Extract_Element < 1056 f32, v16f32, Index, !cast<SubRegIndex>(sub#Index) 1057 >; 1058 def Insert_Element_v16f32_#Index : Insert_Element < 1059 f32, v16f32, Index, !cast<SubRegIndex>(sub#Index) 1060 >; 1061} 1062 1063 1064def : Pat < 1065 (extract_subvector v4i16:$vec, (i32 0)), 1066 (v2i16 (EXTRACT_SUBREG v4i16:$vec, sub0)) 1067>; 1068 1069def : Pat < 1070 (extract_subvector v4i16:$vec, (i32 2)), 1071 (v2i16 (EXTRACT_SUBREG v4i16:$vec, sub1)) 1072>; 1073 1074def : Pat < 1075 (extract_subvector v4f16:$vec, (i32 0)), 1076 (v2f16 (EXTRACT_SUBREG v4f16:$vec, sub0)) 1077>; 1078 1079def : Pat < 1080 (extract_subvector v4f16:$vec, (i32 2)), 1081 (v2f16 (EXTRACT_SUBREG v4f16:$vec, sub1)) 1082>; 1083 1084foreach Index = 0-31 in { 1085 def Extract_Element_v32i32_#Index : Extract_Element < 1086 i32, v32i32, Index, !cast<SubRegIndex>(sub#Index) 1087 >; 1088 1089 def Insert_Element_v32i32_#Index : Insert_Element < 1090 i32, v32i32, Index, !cast<SubRegIndex>(sub#Index) 1091 >; 1092 1093 def Extract_Element_v32f32_#Index : Extract_Element < 1094 f32, v32f32, Index, !cast<SubRegIndex>(sub#Index) 1095 >; 1096 1097 def Insert_Element_v32f32_#Index : Insert_Element < 1098 f32, v32f32, Index, !cast<SubRegIndex>(sub#Index) 1099 >; 1100} 1101 1102// FIXME: Why do only some of these type combinations for SReg and 1103// VReg? 1104// 16-bit bitcast 1105def : BitConvert <i16, f16, VGPR_32>; 1106def : BitConvert <f16, i16, VGPR_32>; 1107def : BitConvert <i16, f16, SReg_32>; 1108def : BitConvert <f16, i16, SReg_32>; 1109 1110// 32-bit bitcast 1111def : BitConvert <i32, f32, VGPR_32>; 1112def : BitConvert <f32, i32, VGPR_32>; 1113def : BitConvert <i32, f32, SReg_32>; 1114def : BitConvert <f32, i32, SReg_32>; 1115def : BitConvert <v2i16, i32, SReg_32>; 1116def : BitConvert <i32, v2i16, SReg_32>; 1117def : BitConvert <v2f16, i32, SReg_32>; 1118def : BitConvert <i32, v2f16, SReg_32>; 1119def : BitConvert <v2i16, v2f16, SReg_32>; 1120def : BitConvert <v2f16, v2i16, SReg_32>; 1121def : BitConvert <v2f16, f32, SReg_32>; 1122def : BitConvert <f32, v2f16, SReg_32>; 1123def : BitConvert <v2i16, f32, SReg_32>; 1124def : BitConvert <f32, v2i16, SReg_32>; 1125 1126// 64-bit bitcast 1127def : BitConvert <i64, f64, VReg_64>; 1128def : BitConvert <f64, i64, VReg_64>; 1129def : BitConvert <v2i32, v2f32, VReg_64>; 1130def : BitConvert <v2f32, v2i32, VReg_64>; 1131def : BitConvert <i64, v2i32, VReg_64>; 1132def : BitConvert <v2i32, i64, VReg_64>; 1133def : BitConvert <i64, v2f32, VReg_64>; 1134def : BitConvert <v2f32, i64, VReg_64>; 1135def : BitConvert <f64, v2f32, VReg_64>; 1136def : BitConvert <v2f32, f64, VReg_64>; 1137def : BitConvert <f64, v2i32, VReg_64>; 1138def : BitConvert <v2i32, f64, VReg_64>; 1139def : BitConvert <v4i16, v4f16, VReg_64>; 1140def : BitConvert <v4f16, v4i16, VReg_64>; 1141 1142// FIXME: Make SGPR 1143def : BitConvert <v2i32, v4f16, VReg_64>; 1144def : BitConvert <v4f16, v2i32, VReg_64>; 1145def : BitConvert <v2i32, v4f16, VReg_64>; 1146def : BitConvert <v2i32, v4i16, VReg_64>; 1147def : BitConvert <v4i16, v2i32, VReg_64>; 1148def : BitConvert <v2f32, v4f16, VReg_64>; 1149def : BitConvert <v4f16, v2f32, VReg_64>; 1150def : BitConvert <v2f32, v4i16, VReg_64>; 1151def : BitConvert <v4i16, v2f32, VReg_64>; 1152def : BitConvert <v4i16, f64, VReg_64>; 1153def : BitConvert <v4f16, f64, VReg_64>; 1154def : BitConvert <f64, v4i16, VReg_64>; 1155def : BitConvert <f64, v4f16, VReg_64>; 1156def : BitConvert <v4i16, i64, VReg_64>; 1157def : BitConvert <v4f16, i64, VReg_64>; 1158def : BitConvert <i64, v4i16, VReg_64>; 1159def : BitConvert <i64, v4f16, VReg_64>; 1160 1161def : BitConvert <v4i32, v4f32, VReg_128>; 1162def : BitConvert <v4f32, v4i32, VReg_128>; 1163 1164// 96-bit bitcast 1165def : BitConvert <v3i32, v3f32, SGPR_96>; 1166def : BitConvert <v3f32, v3i32, SGPR_96>; 1167 1168// 128-bit bitcast 1169def : BitConvert <v2i64, v4i32, SReg_128>; 1170def : BitConvert <v4i32, v2i64, SReg_128>; 1171def : BitConvert <v2f64, v4f32, VReg_128>; 1172def : BitConvert <v2f64, v4i32, VReg_128>; 1173def : BitConvert <v4f32, v2f64, VReg_128>; 1174def : BitConvert <v4i32, v2f64, VReg_128>; 1175def : BitConvert <v2i64, v2f64, VReg_128>; 1176def : BitConvert <v2f64, v2i64, VReg_128>; 1177def : BitConvert <v4f32, v2i64, VReg_128>; 1178def : BitConvert <v2i64, v4f32, VReg_128>; 1179 1180// 160-bit bitcast 1181def : BitConvert <v5i32, v5f32, SGPR_160>; 1182def : BitConvert <v5f32, v5i32, SGPR_160>; 1183 1184// 256-bit bitcast 1185def : BitConvert <v8i32, v8f32, SReg_256>; 1186def : BitConvert <v8f32, v8i32, SReg_256>; 1187def : BitConvert <v8i32, v8f32, VReg_256>; 1188def : BitConvert <v8f32, v8i32, VReg_256>; 1189def : BitConvert <v4i64, v4f64, VReg_256>; 1190def : BitConvert <v4f64, v4i64, VReg_256>; 1191def : BitConvert <v4i64, v8i32, VReg_256>; 1192def : BitConvert <v4i64, v8f32, VReg_256>; 1193def : BitConvert <v4f64, v8i32, VReg_256>; 1194def : BitConvert <v4f64, v8f32, VReg_256>; 1195def : BitConvert <v8i32, v4i64, VReg_256>; 1196def : BitConvert <v8f32, v4i64, VReg_256>; 1197def : BitConvert <v8i32, v4f64, VReg_256>; 1198def : BitConvert <v8f32, v4f64, VReg_256>; 1199 1200 1201// 512-bit bitcast 1202def : BitConvert <v16i32, v16f32, VReg_512>; 1203def : BitConvert <v16f32, v16i32, VReg_512>; 1204def : BitConvert <v8i64, v8f64, VReg_512>; 1205def : BitConvert <v8f64, v8i64, VReg_512>; 1206def : BitConvert <v8i64, v16i32, VReg_512>; 1207def : BitConvert <v8f64, v16i32, VReg_512>; 1208def : BitConvert <v16i32, v8i64, VReg_512>; 1209def : BitConvert <v16i32, v8f64, VReg_512>; 1210def : BitConvert <v8i64, v16f32, VReg_512>; 1211def : BitConvert <v8f64, v16f32, VReg_512>; 1212def : BitConvert <v16f32, v8i64, VReg_512>; 1213def : BitConvert <v16f32, v8f64, VReg_512>; 1214 1215// 1024-bit bitcast 1216def : BitConvert <v32i32, v32f32, VReg_1024>; 1217def : BitConvert <v32f32, v32i32, VReg_1024>; 1218def : BitConvert <v16i64, v16f64, VReg_1024>; 1219def : BitConvert <v16f64, v16i64, VReg_1024>; 1220def : BitConvert <v16i64, v32i32, VReg_1024>; 1221def : BitConvert <v32i32, v16i64, VReg_1024>; 1222def : BitConvert <v16f64, v32f32, VReg_1024>; 1223def : BitConvert <v32f32, v16f64, VReg_1024>; 1224def : BitConvert <v16i64, v32f32, VReg_1024>; 1225def : BitConvert <v32i32, v16f64, VReg_1024>; 1226def : BitConvert <v16f64, v32i32, VReg_1024>; 1227def : BitConvert <v32f32, v16i64, VReg_1024>; 1228 1229 1230/********** =================== **********/ 1231/********** Src & Dst modifiers **********/ 1232/********** =================== **********/ 1233 1234 1235// If denormals are not enabled, it only impacts the compare of the 1236// inputs. The output result is not flushed. 1237class ClampPat<Instruction inst, ValueType vt> : GCNPat < 1238 (vt (AMDGPUclamp (VOP3Mods vt:$src0, i32:$src0_modifiers))), 1239 (inst i32:$src0_modifiers, vt:$src0, 1240 i32:$src0_modifiers, vt:$src0, DSTCLAMP.ENABLE, DSTOMOD.NONE) 1241>; 1242 1243def : ClampPat<V_MAX_F32_e64, f32>; 1244def : ClampPat<V_MAX_F64, f64>; 1245def : ClampPat<V_MAX_F16_e64, f16>; 1246 1247let SubtargetPredicate = HasVOP3PInsts in { 1248def : GCNPat < 1249 (v2f16 (AMDGPUclamp (VOP3PMods v2f16:$src0, i32:$src0_modifiers))), 1250 (V_PK_MAX_F16 $src0_modifiers, $src0, 1251 $src0_modifiers, $src0, DSTCLAMP.ENABLE) 1252>; 1253} 1254 1255/********** ================================ **********/ 1256/********** Floating point absolute/negative **********/ 1257/********** ================================ **********/ 1258 1259// Prevent expanding both fneg and fabs. 1260// TODO: Add IgnoredBySelectionDAG bit? 1261let AddedComplexity = 1 in { // Prefer SALU to VALU patterns for DAG 1262 1263def : GCNPat < 1264 (fneg (fabs (f32 SReg_32:$src))), 1265 (S_OR_B32 SReg_32:$src, (S_MOV_B32 (i32 0x80000000))) // Set sign bit 1266>; 1267 1268def : GCNPat < 1269 (fabs (f32 SReg_32:$src)), 1270 (S_AND_B32 SReg_32:$src, (S_MOV_B32 (i32 0x7fffffff))) 1271>; 1272 1273def : GCNPat < 1274 (fneg (f32 SReg_32:$src)), 1275 (S_XOR_B32 SReg_32:$src, (S_MOV_B32 (i32 0x80000000))) 1276>; 1277 1278def : GCNPat < 1279 (fneg (f16 SReg_32:$src)), 1280 (S_XOR_B32 SReg_32:$src, (S_MOV_B32 (i32 0x00008000))) 1281>; 1282 1283def : GCNPat < 1284 (fneg (f16 VGPR_32:$src)), 1285 (V_XOR_B32_e32 (S_MOV_B32 (i32 0x00008000)), VGPR_32:$src) 1286>; 1287 1288def : GCNPat < 1289 (fabs (f16 SReg_32:$src)), 1290 (S_AND_B32 SReg_32:$src, (S_MOV_B32 (i32 0x00007fff))) 1291>; 1292 1293def : GCNPat < 1294 (fneg (fabs (f16 SReg_32:$src))), 1295 (S_OR_B32 SReg_32:$src, (S_MOV_B32 (i32 0x00008000))) // Set sign bit 1296>; 1297 1298def : GCNPat < 1299 (fneg (fabs (f16 VGPR_32:$src))), 1300 (V_OR_B32_e32 (S_MOV_B32 (i32 0x00008000)), VGPR_32:$src) // Set sign bit 1301>; 1302 1303def : GCNPat < 1304 (fneg (v2f16 SReg_32:$src)), 1305 (S_XOR_B32 SReg_32:$src, (S_MOV_B32 (i32 0x80008000))) 1306>; 1307 1308def : GCNPat < 1309 (fabs (v2f16 SReg_32:$src)), 1310 (S_AND_B32 SReg_32:$src, (S_MOV_B32 (i32 0x7fff7fff))) 1311>; 1312 1313// This is really (fneg (fabs v2f16:$src)) 1314// 1315// fabs is not reported as free because there is modifier for it in 1316// VOP3P instructions, so it is turned into the bit op. 1317def : GCNPat < 1318 (fneg (v2f16 (bitconvert (and_oneuse (i32 SReg_32:$src), 0x7fff7fff)))), 1319 (S_OR_B32 SReg_32:$src, (S_MOV_B32 (i32 0x80008000))) // Set sign bit 1320>; 1321 1322def : GCNPat < 1323 (fneg (v2f16 (fabs SReg_32:$src))), 1324 (S_OR_B32 SReg_32:$src, (S_MOV_B32 (i32 0x80008000))) // Set sign bit 1325>; 1326 1327// FIXME: The implicit-def of scc from S_[X]OR/AND_B32 is mishandled 1328 // def : GCNPat < 1329// (fneg (f64 SReg_64:$src)), 1330// (REG_SEQUENCE SReg_64, 1331// (i32 (EXTRACT_SUBREG SReg_64:$src, sub0)), 1332// sub0, 1333// (S_XOR_B32 (i32 (EXTRACT_SUBREG SReg_64:$src, sub1)), 1334// (i32 (S_MOV_B32 (i32 0x80000000)))), 1335// sub1) 1336// >; 1337 1338// def : GCNPat < 1339// (fneg (fabs (f64 SReg_64:$src))), 1340// (REG_SEQUENCE SReg_64, 1341// (i32 (EXTRACT_SUBREG SReg_64:$src, sub0)), 1342// sub0, 1343// (S_OR_B32 (i32 (EXTRACT_SUBREG SReg_64:$src, sub1)), 1344// (S_MOV_B32 (i32 0x80000000))), // Set sign bit. 1345// sub1) 1346// >; 1347 1348// FIXME: Use S_BITSET0_B32/B64? 1349// def : GCNPat < 1350// (fabs (f64 SReg_64:$src)), 1351// (REG_SEQUENCE SReg_64, 1352// (i32 (EXTRACT_SUBREG SReg_64:$src, sub0)), 1353// sub0, 1354// (S_AND_B32 (i32 (EXTRACT_SUBREG SReg_64:$src, sub1)), 1355// (i32 (S_MOV_B32 (i32 0x7fffffff)))), 1356// sub1) 1357// >; 1358 1359} // End let AddedComplexity = 1 1360 1361def : GCNPat < 1362 (fabs (f32 VGPR_32:$src)), 1363 (V_AND_B32_e32 (S_MOV_B32 (i32 0x7fffffff)), VGPR_32:$src) 1364>; 1365 1366def : GCNPat < 1367 (fneg (f32 VGPR_32:$src)), 1368 (V_XOR_B32_e32 (S_MOV_B32 (i32 0x80000000)), VGPR_32:$src) 1369>; 1370 1371def : GCNPat < 1372 (fabs (f16 VGPR_32:$src)), 1373 (V_AND_B32_e32 (S_MOV_B32 (i32 0x00007fff)), VGPR_32:$src) 1374>; 1375 1376def : GCNPat < 1377 (fneg (v2f16 VGPR_32:$src)), 1378 (V_XOR_B32_e32 (S_MOV_B32 (i32 0x80008000)), VGPR_32:$src) 1379>; 1380 1381def : GCNPat < 1382 (fabs (v2f16 VGPR_32:$src)), 1383 (V_AND_B32_e32 (S_MOV_B32 (i32 0x7fff7fff)), VGPR_32:$src) 1384>; 1385 1386def : GCNPat < 1387 (fneg (v2f16 (fabs VGPR_32:$src))), 1388 (V_OR_B32_e32 (S_MOV_B32 (i32 0x80008000)), VGPR_32:$src) // Set sign bit 1389>; 1390 1391def : GCNPat < 1392 (fabs (f64 VReg_64:$src)), 1393 (REG_SEQUENCE VReg_64, 1394 (i32 (EXTRACT_SUBREG VReg_64:$src, sub0)), 1395 sub0, 1396 (V_AND_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$src, sub1)), 1397 (V_MOV_B32_e32 (i32 0x7fffffff))), // Set sign bit. 1398 sub1) 1399>; 1400 1401// TODO: Use SGPR for constant 1402def : GCNPat < 1403 (fneg (f64 VReg_64:$src)), 1404 (REG_SEQUENCE VReg_64, 1405 (i32 (EXTRACT_SUBREG VReg_64:$src, sub0)), 1406 sub0, 1407 (V_XOR_B32_e32 (i32 (EXTRACT_SUBREG VReg_64:$src, sub1)), 1408 (i32 (V_MOV_B32_e32 (i32 0x80000000)))), 1409 sub1) 1410>; 1411 1412// TODO: Use SGPR for constant 1413def : GCNPat < 1414 (fneg (fabs (f64 VReg_64:$src))), 1415 (REG_SEQUENCE VReg_64, 1416 (i32 (EXTRACT_SUBREG VReg_64:$src, sub0)), 1417 sub0, 1418 (V_OR_B32_e32 (i32 (EXTRACT_SUBREG VReg_64:$src, sub1)), 1419 (V_MOV_B32_e32 (i32 0x80000000))), // Set sign bit. 1420 sub1) 1421>; 1422 1423def : GCNPat < 1424 (fcopysign f16:$src0, f16:$src1), 1425 (V_BFI_B32 (S_MOV_B32 (i32 0x00007fff)), $src0, $src1) 1426>; 1427 1428def : GCNPat < 1429 (fcopysign f32:$src0, f16:$src1), 1430 (V_BFI_B32 (S_MOV_B32 (i32 0x7fffffff)), $src0, 1431 (V_LSHLREV_B32_e64 (i32 16), $src1)) 1432>; 1433 1434def : GCNPat < 1435 (fcopysign f64:$src0, f16:$src1), 1436 (REG_SEQUENCE SReg_64, 1437 (i32 (EXTRACT_SUBREG $src0, sub0)), sub0, 1438 (V_BFI_B32 (S_MOV_B32 (i32 0x7fffffff)), (i32 (EXTRACT_SUBREG $src0, sub1)), 1439 (V_LSHLREV_B32_e64 (i32 16), $src1)), sub1) 1440>; 1441 1442def : GCNPat < 1443 (fcopysign f16:$src0, f32:$src1), 1444 (V_BFI_B32 (S_MOV_B32 (i32 0x00007fff)), $src0, 1445 (V_LSHRREV_B32_e64 (i32 16), $src1)) 1446>; 1447 1448def : GCNPat < 1449 (fcopysign f16:$src0, f64:$src1), 1450 (V_BFI_B32 (S_MOV_B32 (i32 0x00007fff)), $src0, 1451 (V_LSHRREV_B32_e64 (i32 16), (EXTRACT_SUBREG $src1, sub1))) 1452>; 1453 1454/********** ================== **********/ 1455/********** Immediate Patterns **********/ 1456/********** ================== **********/ 1457 1458def : GCNPat < 1459 (VGPRImm<(i32 imm)>:$imm), 1460 (V_MOV_B32_e32 imm:$imm) 1461>; 1462 1463def : GCNPat < 1464 (VGPRImm<(f32 fpimm)>:$imm), 1465 (V_MOV_B32_e32 (f32 (bitcast_fpimm_to_i32 $imm))) 1466>; 1467 1468def : GCNPat < 1469 (i32 imm:$imm), 1470 (S_MOV_B32 imm:$imm) 1471>; 1472 1473def : GCNPat < 1474 (VGPRImm<(SIlds tglobaladdr:$ga)>), 1475 (V_MOV_B32_e32 $ga) 1476>; 1477 1478def : GCNPat < 1479 (SIlds tglobaladdr:$ga), 1480 (S_MOV_B32 $ga) 1481>; 1482 1483// FIXME: Workaround for ordering issue with peephole optimizer where 1484// a register class copy interferes with immediate folding. Should 1485// use s_mov_b32, which can be shrunk to s_movk_i32 1486def : GCNPat < 1487 (VGPRImm<(f16 fpimm)>:$imm), 1488 (V_MOV_B32_e32 (f16 (bitcast_fpimm_to_i32 $imm))) 1489>; 1490 1491def : GCNPat < 1492 (f32 fpimm:$imm), 1493 (S_MOV_B32 (f32 (bitcast_fpimm_to_i32 $imm))) 1494>; 1495 1496def : GCNPat < 1497 (f16 fpimm:$imm), 1498 (S_MOV_B32 (i32 (bitcast_fpimm_to_i32 $imm))) 1499>; 1500 1501def : GCNPat < 1502 (i32 frameindex:$fi), 1503 (V_MOV_B32_e32 (i32 (frameindex_to_targetframeindex $fi))) 1504>; 1505 1506def : GCNPat < 1507 (i64 InlineImm64:$imm), 1508 (S_MOV_B64 InlineImm64:$imm) 1509>; 1510 1511// XXX - Should this use a s_cmp to set SCC? 1512 1513// Set to sign-extended 64-bit value (true = -1, false = 0) 1514def : GCNPat < 1515 (i1 imm:$imm), 1516 (S_MOV_B64 (i64 (as_i64imm $imm))) 1517> { 1518 let WaveSizePredicate = isWave64; 1519} 1520 1521def : GCNPat < 1522 (i1 imm:$imm), 1523 (S_MOV_B32 (i32 (as_i32imm $imm))) 1524> { 1525 let WaveSizePredicate = isWave32; 1526} 1527 1528def : GCNPat < 1529 (f64 InlineImmFP64:$imm), 1530 (S_MOV_B64 (f64 (bitcast_fpimm_to_i64 InlineImmFP64:$imm))) 1531>; 1532 1533/********** ================== **********/ 1534/********** Intrinsic Patterns **********/ 1535/********** ================== **********/ 1536 1537// FIXME: Should use _e64 and select source modifiers. 1538def : POW_Common <V_LOG_F32_e32, V_EXP_F32_e32, V_MUL_LEGACY_F32_e32>; 1539 1540def : GCNPat < 1541 (i32 (sext i1:$src0)), 1542 (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0), 1543 /*src1mod*/(i32 0), /*src1*/(i32 -1), $src0) 1544>; 1545 1546class Ext32Pat <SDNode ext> : GCNPat < 1547 (i32 (ext i1:$src0)), 1548 (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0), 1549 /*src1mod*/(i32 0), /*src1*/(i32 1), $src0) 1550>; 1551 1552def : Ext32Pat <zext>; 1553def : Ext32Pat <anyext>; 1554 1555// The multiplication scales from [0,1) to the unsigned integer range, 1556// rounding down a bit to avoid unwanted overflow. 1557def : GCNPat < 1558 (AMDGPUurecip i32:$src0), 1559 (V_CVT_U32_F32_e32 1560 (V_MUL_F32_e32 (i32 CONST.FP_4294966784), 1561 (V_RCP_IFLAG_F32_e32 (V_CVT_F32_U32_e32 $src0)))) 1562>; 1563 1564//===----------------------------------------------------------------------===// 1565// VOP3 Patterns 1566//===----------------------------------------------------------------------===// 1567 1568def : IMad24Pat<V_MAD_I32_I24, 1>; 1569def : UMad24Pat<V_MAD_U32_U24, 1>; 1570 1571// FIXME: This should only be done for VALU inputs 1572defm : BFIPatterns <V_BFI_B32, S_MOV_B32, SReg_64>; 1573def : ROTRPattern <V_ALIGNBIT_B32>; 1574 1575def : GCNPat<(i32 (trunc (srl i64:$src0, (and i32:$src1, (i32 31))))), 1576 (V_ALIGNBIT_B32 (i32 (EXTRACT_SUBREG (i64 $src0), sub1)), 1577 (i32 (EXTRACT_SUBREG (i64 $src0), sub0)), $src1)>; 1578 1579def : GCNPat<(i32 (trunc (srl i64:$src0, (i32 ShiftAmt32Imm:$src1)))), 1580 (V_ALIGNBIT_B32 (i32 (EXTRACT_SUBREG (i64 $src0), sub1)), 1581 (i32 (EXTRACT_SUBREG (i64 $src0), sub0)), $src1)>; 1582 1583/********** ====================== **********/ 1584/********** Indirect addressing **********/ 1585/********** ====================== **********/ 1586 1587multiclass SI_INDIRECT_Pattern <ValueType vt, ValueType eltvt, string VecSize> { 1588 // Extract with offset 1589 def : GCNPat< 1590 (eltvt (extractelt vt:$src, (MOVRELOffset i32:$idx, (i32 imm:$offset)))), 1591 (!cast<Instruction>("SI_INDIRECT_SRC_"#VecSize) $src, $idx, imm:$offset) 1592 >; 1593 1594 // Insert with offset 1595 def : GCNPat< 1596 (insertelt vt:$src, eltvt:$val, (MOVRELOffset i32:$idx, (i32 imm:$offset))), 1597 (!cast<Instruction>("SI_INDIRECT_DST_"#VecSize) $src, $idx, imm:$offset, $val) 1598 >; 1599} 1600 1601defm : SI_INDIRECT_Pattern <v2f32, f32, "V2">; 1602defm : SI_INDIRECT_Pattern <v4f32, f32, "V4">; 1603defm : SI_INDIRECT_Pattern <v8f32, f32, "V8">; 1604defm : SI_INDIRECT_Pattern <v16f32, f32, "V16">; 1605defm : SI_INDIRECT_Pattern <v32f32, f32, "V32">; 1606 1607defm : SI_INDIRECT_Pattern <v2i32, i32, "V2">; 1608defm : SI_INDIRECT_Pattern <v4i32, i32, "V4">; 1609defm : SI_INDIRECT_Pattern <v8i32, i32, "V8">; 1610defm : SI_INDIRECT_Pattern <v16i32, i32, "V16">; 1611defm : SI_INDIRECT_Pattern <v32i32, i32, "V32">; 1612 1613//===----------------------------------------------------------------------===// 1614// SAD Patterns 1615//===----------------------------------------------------------------------===// 1616 1617def : GCNPat < 1618 (add (sub_oneuse (umax i32:$src0, i32:$src1), 1619 (umin i32:$src0, i32:$src1)), 1620 i32:$src2), 1621 (V_SAD_U32 $src0, $src1, $src2, (i1 0)) 1622>; 1623 1624def : GCNPat < 1625 (add (select_oneuse (i1 (setugt i32:$src0, i32:$src1)), 1626 (sub i32:$src0, i32:$src1), 1627 (sub i32:$src1, i32:$src0)), 1628 i32:$src2), 1629 (V_SAD_U32 $src0, $src1, $src2, (i1 0)) 1630>; 1631 1632//===----------------------------------------------------------------------===// 1633// Conversion Patterns 1634//===----------------------------------------------------------------------===// 1635 1636def : GCNPat<(i32 (sext_inreg i32:$src, i1)), 1637 (S_BFE_I32 i32:$src, (i32 65536))>; // 0 | 1 << 16 1638 1639// Handle sext_inreg in i64 1640def : GCNPat < 1641 (i64 (sext_inreg i64:$src, i1)), 1642 (S_BFE_I64 i64:$src, (i32 0x10000)) // 0 | 1 << 16 1643>; 1644 1645def : GCNPat < 1646 (i16 (sext_inreg i16:$src, i1)), 1647 (S_BFE_I32 $src, (i32 0x00010000)) // 0 | 1 << 16 1648>; 1649 1650def : GCNPat < 1651 (i16 (sext_inreg i16:$src, i8)), 1652 (S_BFE_I32 $src, (i32 0x80000)) // 0 | 8 << 16 1653>; 1654 1655def : GCNPat < 1656 (i64 (sext_inreg i64:$src, i8)), 1657 (S_BFE_I64 i64:$src, (i32 0x80000)) // 0 | 8 << 16 1658>; 1659 1660def : GCNPat < 1661 (i64 (sext_inreg i64:$src, i16)), 1662 (S_BFE_I64 i64:$src, (i32 0x100000)) // 0 | 16 << 16 1663>; 1664 1665def : GCNPat < 1666 (i64 (sext_inreg i64:$src, i32)), 1667 (S_BFE_I64 i64:$src, (i32 0x200000)) // 0 | 32 << 16 1668>; 1669 1670def : GCNPat < 1671 (i64 (zext i32:$src)), 1672 (REG_SEQUENCE SReg_64, $src, sub0, (S_MOV_B32 (i32 0)), sub1) 1673>; 1674 1675def : GCNPat < 1676 (i64 (anyext i32:$src)), 1677 (REG_SEQUENCE SReg_64, $src, sub0, (i32 (IMPLICIT_DEF)), sub1) 1678>; 1679 1680class ZExt_i64_i1_Pat <SDNode ext> : GCNPat < 1681 (i64 (ext i1:$src)), 1682 (REG_SEQUENCE VReg_64, 1683 (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0), 1684 /*src1mod*/(i32 0), /*src1*/(i32 1), $src), 1685 sub0, (S_MOV_B32 (i32 0)), sub1) 1686>; 1687 1688 1689def : ZExt_i64_i1_Pat<zext>; 1690def : ZExt_i64_i1_Pat<anyext>; 1691 1692// FIXME: We need to use COPY_TO_REGCLASS to work-around the fact that 1693// REG_SEQUENCE patterns don't support instructions with multiple outputs. 1694def : GCNPat < 1695 (i64 (sext i32:$src)), 1696 (REG_SEQUENCE SReg_64, $src, sub0, 1697 (i32 (COPY_TO_REGCLASS (S_ASHR_I32 $src, (i32 31)), SReg_32_XM0)), sub1) 1698>; 1699 1700def : GCNPat < 1701 (i64 (sext i1:$src)), 1702 (REG_SEQUENCE VReg_64, 1703 (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0), 1704 /*src1mod*/(i32 0), /*src1*/(i32 -1), $src), sub0, 1705 (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0), 1706 /*src1mod*/(i32 0), /*src1*/(i32 -1), $src), sub1) 1707>; 1708 1709class FPToI1Pat<Instruction Inst, int KOne, ValueType kone_type, ValueType vt, SDPatternOperator fp_to_int> : GCNPat < 1710 (i1 (fp_to_int (vt (VOP3Mods vt:$src0, i32:$src0_modifiers)))), 1711 (i1 (Inst 0, (kone_type KOne), $src0_modifiers, $src0, DSTCLAMP.NONE)) 1712>; 1713 1714def : FPToI1Pat<V_CMP_EQ_F32_e64, CONST.FP32_ONE, i32, f32, fp_to_uint>; 1715def : FPToI1Pat<V_CMP_EQ_F32_e64, CONST.FP32_NEG_ONE, i32, f32, fp_to_sint>; 1716def : FPToI1Pat<V_CMP_EQ_F64_e64, CONST.FP64_ONE, i64, f64, fp_to_uint>; 1717def : FPToI1Pat<V_CMP_EQ_F64_e64, CONST.FP64_NEG_ONE, i64, f64, fp_to_sint>; 1718 1719// If we need to perform a logical operation on i1 values, we need to 1720// use vector comparisons since there is only one SCC register. Vector 1721// comparisons may write to a pair of SGPRs or a single SGPR, so treat 1722// these as 32 or 64-bit comparisons. When legalizing SGPR copies, 1723// instructions resulting in the copies from SCC to these instructions 1724// will be moved to the VALU. 1725 1726let WaveSizePredicate = isWave64 in { 1727def : GCNPat < 1728 (i1 (and i1:$src0, i1:$src1)), 1729 (S_AND_B64 $src0, $src1) 1730>; 1731 1732def : GCNPat < 1733 (i1 (or i1:$src0, i1:$src1)), 1734 (S_OR_B64 $src0, $src1) 1735>; 1736 1737def : GCNPat < 1738 (i1 (xor i1:$src0, i1:$src1)), 1739 (S_XOR_B64 $src0, $src1) 1740>; 1741 1742def : GCNPat < 1743 (i1 (add i1:$src0, i1:$src1)), 1744 (S_XOR_B64 $src0, $src1) 1745>; 1746 1747def : GCNPat < 1748 (i1 (sub i1:$src0, i1:$src1)), 1749 (S_XOR_B64 $src0, $src1) 1750>; 1751 1752let AddedComplexity = 1 in { 1753def : GCNPat < 1754 (i1 (add i1:$src0, (i1 -1))), 1755 (S_NOT_B64 $src0) 1756>; 1757 1758def : GCNPat < 1759 (i1 (sub i1:$src0, (i1 -1))), 1760 (S_NOT_B64 $src0) 1761>; 1762} 1763} // end isWave64 1764 1765let WaveSizePredicate = isWave32 in { 1766def : GCNPat < 1767 (i1 (and i1:$src0, i1:$src1)), 1768 (S_AND_B32 $src0, $src1) 1769>; 1770 1771def : GCNPat < 1772 (i1 (or i1:$src0, i1:$src1)), 1773 (S_OR_B32 $src0, $src1) 1774>; 1775 1776def : GCNPat < 1777 (i1 (xor i1:$src0, i1:$src1)), 1778 (S_XOR_B32 $src0, $src1) 1779>; 1780 1781def : GCNPat < 1782 (i1 (add i1:$src0, i1:$src1)), 1783 (S_XOR_B32 $src0, $src1) 1784>; 1785 1786def : GCNPat < 1787 (i1 (sub i1:$src0, i1:$src1)), 1788 (S_XOR_B32 $src0, $src1) 1789>; 1790 1791let AddedComplexity = 1 in { 1792def : GCNPat < 1793 (i1 (add i1:$src0, (i1 -1))), 1794 (S_NOT_B32 $src0) 1795>; 1796 1797def : GCNPat < 1798 (i1 (sub i1:$src0, (i1 -1))), 1799 (S_NOT_B32 $src0) 1800>; 1801} 1802} // end isWave32 1803 1804def : GCNPat < 1805 (f16 (sint_to_fp i1:$src)), 1806 (V_CVT_F16_F32_e32 ( 1807 V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0), 1808 /*src1mod*/(i32 0), /*src1*/(i32 CONST.FP32_NEG_ONE), 1809 SSrc_i1:$src)) 1810>; 1811 1812def : GCNPat < 1813 (f16 (uint_to_fp i1:$src)), 1814 (V_CVT_F16_F32_e32 ( 1815 V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0), 1816 /*src1mod*/(i32 0), /*src1*/(i32 CONST.FP32_ONE), 1817 SSrc_i1:$src)) 1818>; 1819 1820def : GCNPat < 1821 (f32 (sint_to_fp i1:$src)), 1822 (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0), 1823 /*src1mod*/(i32 0), /*src1*/(i32 CONST.FP32_NEG_ONE), 1824 SSrc_i1:$src) 1825>; 1826 1827def : GCNPat < 1828 (f32 (uint_to_fp i1:$src)), 1829 (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0), 1830 /*src1mod*/(i32 0), /*src1*/(i32 CONST.FP32_ONE), 1831 SSrc_i1:$src) 1832>; 1833 1834def : GCNPat < 1835 (f64 (sint_to_fp i1:$src)), 1836 (V_CVT_F64_I32_e32 (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0), 1837 /*src1mod*/(i32 0), /*src1*/(i32 -1), 1838 SSrc_i1:$src)) 1839>; 1840 1841def : GCNPat < 1842 (f64 (uint_to_fp i1:$src)), 1843 (V_CVT_F64_U32_e32 (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0), 1844 /*src1mod*/(i32 0), /*src1*/(i32 1), 1845 SSrc_i1:$src)) 1846>; 1847 1848//===----------------------------------------------------------------------===// 1849// Miscellaneous Patterns 1850//===----------------------------------------------------------------------===// 1851def : GCNPat < 1852 (i32 (AMDGPUfp16_zext f16:$src)), 1853 (COPY $src) 1854>; 1855 1856 1857def : GCNPat < 1858 (i32 (trunc i64:$a)), 1859 (EXTRACT_SUBREG $a, sub0) 1860>; 1861 1862def : GCNPat < 1863 (i1 (trunc i32:$a)), 1864 (V_CMP_EQ_U32_e64 (S_AND_B32 (i32 1), $a), (i32 1)) 1865>; 1866 1867def : GCNPat < 1868 (i1 (trunc i16:$a)), 1869 (V_CMP_EQ_U32_e64 (S_AND_B32 (i32 1), $a), (i32 1)) 1870>; 1871 1872def : GCNPat < 1873 (i1 (trunc i64:$a)), 1874 (V_CMP_EQ_U32_e64 (S_AND_B32 (i32 1), 1875 (i32 (EXTRACT_SUBREG $a, sub0))), (i32 1)) 1876>; 1877 1878def : GCNPat < 1879 (i32 (bswap i32:$a)), 1880 (V_BFI_B32 (S_MOV_B32 (i32 0x00ff00ff)), 1881 (V_ALIGNBIT_B32 VSrc_b32:$a, VSrc_b32:$a, (i32 24)), 1882 (V_ALIGNBIT_B32 VSrc_b32:$a, VSrc_b32:$a, (i32 8))) 1883>; 1884 1885// FIXME: This should have been narrowed to i32 during legalization. 1886// This pattern should also be skipped for GlobalISel 1887def : GCNPat < 1888 (i64 (bswap i64:$a)), 1889 (REG_SEQUENCE VReg_64, 1890 (V_BFI_B32 (S_MOV_B32 (i32 0x00ff00ff)), 1891 (V_ALIGNBIT_B32 (i32 (EXTRACT_SUBREG VReg_64:$a, sub1)), 1892 (i32 (EXTRACT_SUBREG VReg_64:$a, sub1)), 1893 (i32 24)), 1894 (V_ALIGNBIT_B32 (i32 (EXTRACT_SUBREG VReg_64:$a, sub1)), 1895 (i32 (EXTRACT_SUBREG VReg_64:$a, sub1)), 1896 (i32 8))), 1897 sub0, 1898 (V_BFI_B32 (S_MOV_B32 (i32 0x00ff00ff)), 1899 (V_ALIGNBIT_B32 (i32 (EXTRACT_SUBREG VReg_64:$a, sub0)), 1900 (i32 (EXTRACT_SUBREG VReg_64:$a, sub0)), 1901 (i32 24)), 1902 (V_ALIGNBIT_B32 (i32 (EXTRACT_SUBREG VReg_64:$a, sub0)), 1903 (i32 (EXTRACT_SUBREG VReg_64:$a, sub0)), 1904 (i32 8))), 1905 sub1) 1906>; 1907 1908// FIXME: The AddedComplexity should not be needed, but in GlobalISel 1909// the BFI pattern ends up taking precedence without it. 1910let SubtargetPredicate = isGFX8Plus, AddedComplexity = 1 in { 1911// Magic number: 3 | (2 << 8) | (1 << 16) | (0 << 24) 1912// 1913// My reading of the manual suggests we should be using src0 for the 1914// register value, but this is what seems to work. 1915def : GCNPat < 1916 (i32 (bswap i32:$a)), 1917 (V_PERM_B32 (i32 0), VSrc_b32:$a, (S_MOV_B32 (i32 0x00010203))) 1918>; 1919 1920// FIXME: This should have been narrowed to i32 during legalization. 1921// This pattern should also be skipped for GlobalISel 1922def : GCNPat < 1923 (i64 (bswap i64:$a)), 1924 (REG_SEQUENCE VReg_64, 1925 (V_PERM_B32 (i32 0), (EXTRACT_SUBREG VReg_64:$a, sub1), 1926 (S_MOV_B32 (i32 0x00010203))), 1927 sub0, 1928 (V_PERM_B32 (i32 0), (EXTRACT_SUBREG VReg_64:$a, sub0), 1929 (S_MOV_B32 (i32 0x00010203))), 1930 sub1) 1931>; 1932 1933// Magic number: 1 | (0 << 8) | (12 << 16) | (12 << 24) 1934// The 12s emit 0s. 1935def : GCNPat < 1936 (i16 (bswap i16:$a)), 1937 (V_PERM_B32 (i32 0), VSrc_b32:$a, (S_MOV_B32 (i32 0x0c0c0001))) 1938>; 1939 1940def : GCNPat < 1941 (i32 (zext (bswap i16:$a))), 1942 (V_PERM_B32 (i32 0), VSrc_b32:$a, (S_MOV_B32 (i32 0x0c0c0001))) 1943>; 1944 1945// Magic number: 1 | (0 << 8) | (3 << 16) | (2 << 24) 1946def : GCNPat < 1947 (v2i16 (bswap v2i16:$a)), 1948 (V_PERM_B32 (i32 0), VSrc_b32:$a, (S_MOV_B32 (i32 0x02030001))) 1949>; 1950 1951} 1952 1953 1954// Prefer selecting to max when legal, but using mul is always valid. 1955let AddedComplexity = -5 in { 1956def : GCNPat< 1957 (fcanonicalize (f16 (VOP3Mods f16:$src, i32:$src_mods))), 1958 (V_MUL_F16_e64 0, (i32 CONST.FP16_ONE), $src_mods, $src) 1959>; 1960 1961def : GCNPat< 1962 (fcanonicalize (f16 (fneg (VOP3Mods f16:$src, i32:$src_mods)))), 1963 (V_MUL_F16_e64 0, (i32 CONST.FP16_NEG_ONE), $src_mods, $src) 1964>; 1965 1966def : GCNPat< 1967 (fcanonicalize (v2f16 (VOP3PMods v2f16:$src, i32:$src_mods))), 1968 (V_PK_MUL_F16 0, (i32 CONST.FP16_ONE), $src_mods, $src, DSTCLAMP.NONE) 1969>; 1970 1971def : GCNPat< 1972 (fcanonicalize (f32 (VOP3Mods f32:$src, i32:$src_mods))), 1973 (V_MUL_F32_e64 0, (i32 CONST.FP32_ONE), $src_mods, $src) 1974>; 1975 1976def : GCNPat< 1977 (fcanonicalize (f32 (fneg (VOP3Mods f32:$src, i32:$src_mods)))), 1978 (V_MUL_F32_e64 0, (i32 CONST.FP32_NEG_ONE), $src_mods, $src) 1979>; 1980 1981// TODO: Handle fneg like other types. 1982def : GCNPat< 1983 (fcanonicalize (f64 (VOP3Mods f64:$src, i32:$src_mods))), 1984 (V_MUL_F64 0, CONST.FP64_ONE, $src_mods, $src) 1985>; 1986} // End AddedComplexity = -5 1987 1988multiclass SelectCanonicalizeAsMax< 1989 list<Predicate> f32_preds = [], 1990 list<Predicate> f64_preds = [], 1991 list<Predicate> f16_preds = []> { 1992 def : GCNPat< 1993 (fcanonicalize (f32 (VOP3Mods f32:$src, i32:$src_mods))), 1994 (V_MAX_F32_e64 $src_mods, $src, $src_mods, $src)> { 1995 let OtherPredicates = f32_preds; 1996 } 1997 1998 def : GCNPat< 1999 (fcanonicalize (f64 (VOP3Mods f64:$src, i32:$src_mods))), 2000 (V_MAX_F64 $src_mods, $src, $src_mods, $src)> { 2001 let OtherPredicates = f64_preds; 2002 } 2003 2004 def : GCNPat< 2005 (fcanonicalize (f16 (VOP3Mods f16:$src, i32:$src_mods))), 2006 (V_MAX_F16_e64 $src_mods, $src, $src_mods, $src, 0, 0)> { 2007 // FIXME: Should have 16-bit inst subtarget predicate 2008 let OtherPredicates = f16_preds; 2009 } 2010 2011 def : GCNPat< 2012 (fcanonicalize (v2f16 (VOP3PMods v2f16:$src, i32:$src_mods))), 2013 (V_PK_MAX_F16 $src_mods, $src, $src_mods, $src, DSTCLAMP.NONE)> { 2014 // FIXME: Should have VOP3P subtarget predicate 2015 let OtherPredicates = f16_preds; 2016 } 2017} 2018 2019// On pre-gfx9 targets, v_max_*/v_min_* did not respect the denormal 2020// mode, and would never flush. For f64, it's faster to do implement 2021// this with a max. For f16/f32 it's a wash, but prefer max when 2022// valid. 2023// 2024// FIXME: Lowering f32/f16 with max is worse since we can use a 2025// smaller encoding if the input is fneg'd. It also adds an extra 2026// register use. 2027let SubtargetPredicate = HasMinMaxDenormModes in { 2028 defm : SelectCanonicalizeAsMax<[], [], []>; 2029} // End SubtargetPredicate = HasMinMaxDenormModes 2030 2031let SubtargetPredicate = NotHasMinMaxDenormModes in { 2032 // Use the max lowering if we don't need to flush. 2033 2034 // FIXME: We don't do use this for f32 as a workaround for the 2035 // library being compiled with the default ieee mode, but 2036 // potentially being called from flushing kernels. Really we should 2037 // not be mixing code expecting different default FP modes, but mul 2038 // works in any FP environment. 2039 defm : SelectCanonicalizeAsMax<[FalsePredicate], [FP64Denormals], [FP16Denormals]>; 2040} // End SubtargetPredicate = NotHasMinMaxDenormModes 2041 2042 2043let OtherPredicates = [HasDLInsts] in { 2044def : GCNPat < 2045 (fma (f32 (VOP3Mods f32:$src0, i32:$src0_modifiers)), 2046 (f32 (VOP3Mods f32:$src1, i32:$src1_modifiers)), 2047 (f32 (VOP3NoMods f32:$src2))), 2048 (V_FMAC_F32_e64 $src0_modifiers, $src0, $src1_modifiers, $src1, 2049 SRCMODS.NONE, $src2) 2050>; 2051} // End OtherPredicates = [HasDLInsts] 2052 2053let SubtargetPredicate = isGFX10Plus in 2054def : GCNPat < 2055 (fma (f16 (VOP3Mods f32:$src0, i32:$src0_modifiers)), 2056 (f16 (VOP3Mods f32:$src1, i32:$src1_modifiers)), 2057 (f16 (VOP3NoMods f32:$src2))), 2058 (V_FMAC_F16_e64 $src0_modifiers, $src0, $src1_modifiers, $src1, 2059 SRCMODS.NONE, $src2) 2060>; 2061 2062// COPY is workaround tablegen bug from multiple outputs 2063// from S_LSHL_B32's multiple outputs from implicit scc def. 2064def : GCNPat < 2065 (v2i16 (build_vector (i16 0), (i16 SReg_32:$src1))), 2066 (S_LSHL_B32 SReg_32:$src1, (i16 16)) 2067>; 2068 2069def : GCNPat < 2070 (v2i16 (build_vector (i16 SReg_32:$src0), (i16 undef))), 2071 (COPY_TO_REGCLASS SReg_32:$src0, SReg_32) 2072>; 2073 2074def : GCNPat < 2075 (v2i16 (build_vector (i16 VGPR_32:$src0), (i16 undef))), 2076 (COPY_TO_REGCLASS VGPR_32:$src0, VGPR_32) 2077>; 2078 2079def : GCNPat < 2080 (v2f16 (build_vector f16:$src0, (f16 undef))), 2081 (COPY $src0) 2082>; 2083 2084def : GCNPat < 2085 (v2i16 (build_vector (i16 undef), (i16 SReg_32:$src1))), 2086 (S_LSHL_B32 SReg_32:$src1, (i32 16)) 2087>; 2088 2089def : GCNPat < 2090 (v2f16 (build_vector (f16 undef), (f16 SReg_32:$src1))), 2091 (S_LSHL_B32 SReg_32:$src1, (i32 16)) 2092>; 2093 2094let SubtargetPredicate = HasVOP3PInsts in { 2095def : GCNPat < 2096 (v2i16 (build_vector (i16 SReg_32:$src0), (i16 SReg_32:$src1))), 2097 (S_PACK_LL_B32_B16 SReg_32:$src0, SReg_32:$src1) 2098>; 2099 2100// With multiple uses of the shift, this will duplicate the shift and 2101// increase register pressure. 2102def : GCNPat < 2103 (v2i16 (build_vector (i16 SReg_32:$src0), (i16 (trunc (srl_oneuse SReg_32:$src1, (i32 16)))))), 2104 (v2i16 (S_PACK_LH_B32_B16 SReg_32:$src0, SReg_32:$src1)) 2105>; 2106 2107 2108def : GCNPat < 2109 (v2i16 (build_vector (i16 (trunc (srl_oneuse SReg_32:$src0, (i32 16)))), 2110 (i16 (trunc (srl_oneuse SReg_32:$src1, (i32 16)))))), 2111 (S_PACK_HH_B32_B16 SReg_32:$src0, SReg_32:$src1) 2112>; 2113 2114// TODO: Should source modifiers be matched to v_pack_b32_f16? 2115def : GCNPat < 2116 (v2f16 (build_vector (f16 SReg_32:$src0), (f16 SReg_32:$src1))), 2117 (S_PACK_LL_B32_B16 SReg_32:$src0, SReg_32:$src1) 2118>; 2119 2120} // End SubtargetPredicate = HasVOP3PInsts 2121 2122 2123def : GCNPat < 2124 (v2f16 (scalar_to_vector f16:$src0)), 2125 (COPY $src0) 2126>; 2127 2128def : GCNPat < 2129 (v2i16 (scalar_to_vector i16:$src0)), 2130 (COPY $src0) 2131>; 2132 2133def : GCNPat < 2134 (v4i16 (scalar_to_vector i16:$src0)), 2135 (INSERT_SUBREG (IMPLICIT_DEF), $src0, sub0) 2136>; 2137 2138def : GCNPat < 2139 (v4f16 (scalar_to_vector f16:$src0)), 2140 (INSERT_SUBREG (IMPLICIT_DEF), $src0, sub0) 2141>; 2142 2143def : GCNPat < 2144 (i64 (int_amdgcn_mov_dpp i64:$src, timm:$dpp_ctrl, timm:$row_mask, 2145 timm:$bank_mask, timm:$bound_ctrl)), 2146 (V_MOV_B64_DPP_PSEUDO VReg_64:$src, VReg_64:$src, 2147 (as_i32timm $dpp_ctrl), (as_i32timm $row_mask), 2148 (as_i32timm $bank_mask), 2149 (as_i1timm $bound_ctrl)) 2150>; 2151 2152def : GCNPat < 2153 (i64 (int_amdgcn_update_dpp i64:$old, i64:$src, timm:$dpp_ctrl, timm:$row_mask, 2154 timm:$bank_mask, timm:$bound_ctrl)), 2155 (V_MOV_B64_DPP_PSEUDO VReg_64:$old, VReg_64:$src, (as_i32timm $dpp_ctrl), 2156 (as_i32timm $row_mask), (as_i32timm $bank_mask), 2157 (as_i1timm $bound_ctrl)) 2158>; 2159 2160//===----------------------------------------------------------------------===// 2161// Fract Patterns 2162//===----------------------------------------------------------------------===// 2163 2164let SubtargetPredicate = isGFX6 in { 2165 2166// V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) is 2167// used instead. However, SI doesn't have V_FLOOR_F64, so the most efficient 2168// way to implement it is using V_FRACT_F64. 2169// The workaround for the V_FRACT bug is: 2170// fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999) 2171 2172// Convert floor(x) to (x - fract(x)) 2173 2174// Don't bother handling this for GlobalISel, it's handled during 2175// lowering. 2176// 2177// FIXME: DAG should also custom lower this. 2178def : GCNPat < 2179 (f64 (ffloor (f64 (VOP3Mods f64:$x, i32:$mods)))), 2180 (V_ADD_F64 2181 $mods, 2182 $x, 2183 SRCMODS.NEG, 2184 (V_CNDMASK_B64_PSEUDO 2185 (V_MIN_F64 2186 SRCMODS.NONE, 2187 (V_FRACT_F64_e64 $mods, $x), 2188 SRCMODS.NONE, 2189 (V_MOV_B64_PSEUDO 0x3fefffffffffffff)), 2190 $x, 2191 (V_CMP_CLASS_F64_e64 SRCMODS.NONE, $x, (i32 3 /*NaN*/)))) 2192>; 2193 2194} // End SubtargetPredicates = isGFX6 2195 2196//============================================================================// 2197// Miscellaneous Optimization Patterns 2198//============================================================================// 2199 2200// Undo sub x, c -> add x, -c canonicalization since c is more likely 2201// an inline immediate than -c. 2202// TODO: Also do for 64-bit. 2203def : GCNPat< 2204 (add i32:$src0, (i32 NegSubInlineConst32:$src1)), 2205 (S_SUB_I32 SReg_32:$src0, NegSubInlineConst32:$src1) 2206>; 2207 2208def : GCNPat< 2209 (add i32:$src0, (i32 NegSubInlineConst32:$src1)), 2210 (V_SUB_U32_e64 VS_32:$src0, NegSubInlineConst32:$src1)> { 2211 let SubtargetPredicate = HasAddNoCarryInsts; 2212} 2213 2214def : GCNPat< 2215 (add i32:$src0, (i32 NegSubInlineConst32:$src1)), 2216 (V_SUB_I32_e64 VS_32:$src0, NegSubInlineConst32:$src1)> { 2217 let SubtargetPredicate = NotHasAddNoCarryInsts; 2218} 2219 2220 2221// Avoid pointlessly materializing a constant in VGPR. 2222// FIXME: Should also do this for readlane, but tablegen crashes on 2223// the ignored src1. 2224def : GCNPat< 2225 (int_amdgcn_readfirstlane (i32 imm:$src)), 2226 (S_MOV_B32 SReg_32:$src) 2227>; 2228 2229multiclass BFMPatterns <ValueType vt, InstSI BFM, InstSI MOV> { 2230 def : GCNPat < 2231 (vt (shl (vt (add (vt (shl 1, vt:$a)), -1)), vt:$b)), 2232 (BFM $a, $b) 2233 >; 2234 2235 def : GCNPat < 2236 (vt (add (vt (shl 1, vt:$a)), -1)), 2237 (BFM $a, (MOV (i32 0))) 2238 >; 2239} 2240 2241defm : BFMPatterns <i32, S_BFM_B32, S_MOV_B32>; 2242// FIXME: defm : BFMPatterns <i64, S_BFM_B64, S_MOV_B64>; 2243 2244defm : BFEPattern <V_BFE_U32, V_BFE_I32, S_MOV_B32>; 2245defm : SHA256MaPattern <V_BFI_B32, V_XOR_B32_e64, SReg_64>; 2246 2247multiclass IntMed3Pat<Instruction med3Inst, 2248 SDPatternOperator min, 2249 SDPatternOperator max, 2250 SDPatternOperator min_oneuse, 2251 SDPatternOperator max_oneuse> { 2252 2253 // This matches 16 permutations of 2254 // min(max(a, b), max(min(a, b), c)) 2255 def : AMDGPUPat < 2256 (min (max_oneuse i32:$src0, i32:$src1), 2257 (max_oneuse (min_oneuse i32:$src0, i32:$src1), i32:$src2)), 2258 (med3Inst VSrc_b32:$src0, VSrc_b32:$src1, VSrc_b32:$src2) 2259>; 2260 2261 // This matches 16 permutations of 2262 // max(min(x, y), min(max(x, y), z)) 2263 def : AMDGPUPat < 2264 (max (min_oneuse i32:$src0, i32:$src1), 2265 (min_oneuse (max_oneuse i32:$src0, i32:$src1), i32:$src2)), 2266 (med3Inst VSrc_b32:$src0, VSrc_b32:$src1, VSrc_b32:$src2) 2267>; 2268} 2269 2270defm : IntMed3Pat<V_MED3_I32, smin, smax, smin_oneuse, smax_oneuse>; 2271defm : IntMed3Pat<V_MED3_U32, umin, umax, umin_oneuse, umax_oneuse>; 2272 2273// This matches 16 permutations of 2274// max(min(x, y), min(max(x, y), z)) 2275class FPMed3Pat<ValueType vt, 2276 //SDPatternOperator max, SDPatternOperator min, 2277 Instruction med3Inst> : GCNPat< 2278 (fmaxnum_like (fminnum_like_oneuse (VOP3Mods_nnan vt:$src0, i32:$src0_mods), 2279 (VOP3Mods_nnan vt:$src1, i32:$src1_mods)), 2280 (fminnum_like_oneuse (fmaxnum_like_oneuse (VOP3Mods_nnan vt:$src0, i32:$src0_mods), 2281 (VOP3Mods_nnan vt:$src1, i32:$src1_mods)), 2282 (vt (VOP3Mods_nnan vt:$src2, i32:$src2_mods)))), 2283 (med3Inst $src0_mods, $src0, $src1_mods, $src1, $src2_mods, $src2, DSTCLAMP.NONE, DSTOMOD.NONE) 2284>; 2285 2286class FP16Med3Pat<ValueType vt, 2287 Instruction med3Inst> : GCNPat< 2288 (fmaxnum_like (fminnum_like_oneuse (VOP3Mods_nnan vt:$src0, i32:$src0_mods), 2289 (VOP3Mods_nnan vt:$src1, i32:$src1_mods)), 2290 (fminnum_like_oneuse (fmaxnum_like_oneuse (VOP3Mods_nnan vt:$src0, i32:$src0_mods), 2291 (VOP3Mods_nnan vt:$src1, i32:$src1_mods)), 2292 (vt (VOP3Mods_nnan vt:$src2, i32:$src2_mods)))), 2293 (med3Inst $src0_mods, $src0, $src1_mods, $src1, $src2_mods, $src2, DSTCLAMP.NONE) 2294>; 2295 2296multiclass Int16Med3Pat<Instruction med3Inst, 2297 SDPatternOperator min, 2298 SDPatternOperator max, 2299 SDPatternOperator max_oneuse, 2300 SDPatternOperator min_oneuse> { 2301 // This matches 16 permutations of 2302 // max(min(x, y), min(max(x, y), z)) 2303 def : GCNPat < 2304 (max (min_oneuse i16:$src0, i16:$src1), 2305 (min_oneuse (max_oneuse i16:$src0, i16:$src1), i16:$src2)), 2306 (med3Inst SRCMODS.NONE, VSrc_b16:$src0, SRCMODS.NONE, VSrc_b16:$src1, SRCMODS.NONE, VSrc_b16:$src2, DSTCLAMP.NONE) 2307>; 2308 2309 // This matches 16 permutations of 2310 // min(max(a, b), max(min(a, b), c)) 2311 def : GCNPat < 2312 (min (max_oneuse i16:$src0, i16:$src1), 2313 (max_oneuse (min_oneuse i16:$src0, i16:$src1), i16:$src2)), 2314 (med3Inst SRCMODS.NONE, VSrc_b16:$src0, SRCMODS.NONE, VSrc_b16:$src1, SRCMODS.NONE, VSrc_b16:$src2, DSTCLAMP.NONE) 2315>; 2316} 2317 2318def : FPMed3Pat<f32, V_MED3_F32>; 2319 2320let OtherPredicates = [isGFX9Plus] in { 2321def : FP16Med3Pat<f16, V_MED3_F16>; 2322defm : Int16Med3Pat<V_MED3_I16, smin, smax, smax_oneuse, smin_oneuse>; 2323defm : Int16Med3Pat<V_MED3_U16, umin, umax, umax_oneuse, umin_oneuse>; 2324} // End Predicates = [isGFX9Plus] 2325 2326class AMDGPUGenericInstruction : GenericInstruction { 2327 let Namespace = "AMDGPU"; 2328} 2329 2330def G_AMDGPU_FFBH_U32 : AMDGPUGenericInstruction { 2331 let OutOperandList = (outs type0:$dst); 2332 let InOperandList = (ins type1:$src); 2333 let hasSideEffects = 0; 2334} 2335 2336def G_AMDGPU_RCP_IFLAG : AMDGPUGenericInstruction { 2337 let OutOperandList = (outs type0:$dst); 2338 let InOperandList = (ins type1:$src); 2339 let hasSideEffects = 0; 2340} 2341 2342class BufferLoadGenericInstruction : AMDGPUGenericInstruction { 2343 let OutOperandList = (outs type0:$dst); 2344 let InOperandList = (ins type1:$rsrc, type2:$vindex, type2:$voffset, 2345 type2:$soffset, untyped_imm_0:$offset, 2346 untyped_imm_0:$cachepolicy, untyped_imm_0:$idxen); 2347 let hasSideEffects = 0; 2348 let mayLoad = 1; 2349} 2350 2351class TBufferLoadGenericInstruction : AMDGPUGenericInstruction { 2352 let OutOperandList = (outs type0:$dst); 2353 let InOperandList = (ins type1:$rsrc, type2:$vindex, type2:$voffset, 2354 type2:$soffset, untyped_imm_0:$offset, untyped_imm_0:$format, 2355 untyped_imm_0:$cachepolicy, untyped_imm_0:$idxen); 2356 let hasSideEffects = 0; 2357 let mayLoad = 1; 2358} 2359 2360def G_AMDGPU_BUFFER_LOAD_UBYTE : BufferLoadGenericInstruction; 2361def G_AMDGPU_BUFFER_LOAD_SBYTE : BufferLoadGenericInstruction; 2362def G_AMDGPU_BUFFER_LOAD_USHORT : BufferLoadGenericInstruction; 2363def G_AMDGPU_BUFFER_LOAD_SSHORT : BufferLoadGenericInstruction; 2364def G_AMDGPU_BUFFER_LOAD : BufferLoadGenericInstruction; 2365def G_AMDGPU_BUFFER_LOAD_FORMAT : BufferLoadGenericInstruction; 2366def G_AMDGPU_BUFFER_LOAD_FORMAT_D16 : BufferLoadGenericInstruction; 2367def G_AMDGPU_TBUFFER_LOAD_FORMAT : TBufferLoadGenericInstruction; 2368def G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 : TBufferLoadGenericInstruction; 2369 2370class BufferStoreGenericInstruction : AMDGPUGenericInstruction { 2371 let OutOperandList = (outs); 2372 let InOperandList = (ins type0:$vdata, type1:$rsrc, type2:$vindex, type2:$voffset, 2373 type2:$soffset, untyped_imm_0:$offset, 2374 untyped_imm_0:$cachepolicy, untyped_imm_0:$idxen); 2375 let hasSideEffects = 0; 2376 let mayStore = 1; 2377} 2378 2379class TBufferStoreGenericInstruction : AMDGPUGenericInstruction { 2380 let OutOperandList = (outs); 2381 let InOperandList = (ins type0:$vdata, type1:$rsrc, type2:$vindex, type2:$voffset, 2382 type2:$soffset, untyped_imm_0:$offset, 2383 untyped_imm_0:$format, 2384 untyped_imm_0:$cachepolicy, untyped_imm_0:$idxen); 2385 let hasSideEffects = 0; 2386 let mayStore = 1; 2387} 2388 2389def G_AMDGPU_BUFFER_STORE : BufferStoreGenericInstruction; 2390def G_AMDGPU_BUFFER_STORE_BYTE : BufferStoreGenericInstruction; 2391def G_AMDGPU_BUFFER_STORE_SHORT : BufferStoreGenericInstruction; 2392def G_AMDGPU_BUFFER_STORE_FORMAT : BufferStoreGenericInstruction; 2393def G_AMDGPU_BUFFER_STORE_FORMAT_D16 : BufferStoreGenericInstruction; 2394def G_AMDGPU_TBUFFER_STORE_FORMAT : TBufferStoreGenericInstruction; 2395def G_AMDGPU_TBUFFER_STORE_FORMAT_D16 : TBufferStoreGenericInstruction; 2396 2397def G_AMDGPU_FMIN_LEGACY : AMDGPUGenericInstruction { 2398 let OutOperandList = (outs type0:$dst); 2399 let InOperandList = (ins type0:$src0, type0:$src1); 2400 let hasSideEffects = 0; 2401} 2402 2403def G_AMDGPU_FMAX_LEGACY : AMDGPUGenericInstruction { 2404 let OutOperandList = (outs type0:$dst); 2405 let InOperandList = (ins type0:$src0, type0:$src1); 2406 let hasSideEffects = 0; 2407} 2408 2409foreach N = 0-3 in { 2410def G_AMDGPU_CVT_F32_UBYTE#N : AMDGPUGenericInstruction { 2411 let OutOperandList = (outs type0:$dst); 2412 let InOperandList = (ins type0:$src0); 2413 let hasSideEffects = 0; 2414} 2415} 2416 2417// Atomic cmpxchg. $cmpval ad $newval are packed in a single vector 2418// operand Expects a MachineMemOperand in addition to explicit 2419// operands. 2420def G_AMDGPU_ATOMIC_CMPXCHG : AMDGPUGenericInstruction { 2421 let OutOperandList = (outs type0:$oldval); 2422 let InOperandList = (ins ptype1:$addr, type0:$cmpval_newval); 2423 let hasSideEffects = 0; 2424 let mayLoad = 1; 2425 let mayStore = 1; 2426} 2427 2428let Namespace = "AMDGPU" in { 2429def G_AMDGPU_ATOMIC_INC : G_ATOMICRMW_OP; 2430def G_AMDGPU_ATOMIC_DEC : G_ATOMICRMW_OP; 2431} 2432 2433class BufferAtomicGenericInstruction : AMDGPUGenericInstruction { 2434 let OutOperandList = (outs type0:$dst); 2435 let InOperandList = (ins type0:$vdata, type1:$rsrc, type2:$vindex, type2:$voffset, 2436 type2:$soffset, untyped_imm_0:$offset, 2437 untyped_imm_0:$cachepolicy, untyped_imm_0:$idxen); 2438 let hasSideEffects = 0; 2439 let mayLoad = 1; 2440 let mayStore = 1; 2441} 2442 2443def G_AMDGPU_BUFFER_ATOMIC_SWAP : BufferAtomicGenericInstruction; 2444def G_AMDGPU_BUFFER_ATOMIC_ADD : BufferAtomicGenericInstruction; 2445def G_AMDGPU_BUFFER_ATOMIC_SUB : BufferAtomicGenericInstruction; 2446def G_AMDGPU_BUFFER_ATOMIC_SMIN : BufferAtomicGenericInstruction; 2447def G_AMDGPU_BUFFER_ATOMIC_UMIN : BufferAtomicGenericInstruction; 2448def G_AMDGPU_BUFFER_ATOMIC_SMAX : BufferAtomicGenericInstruction; 2449def G_AMDGPU_BUFFER_ATOMIC_UMAX : BufferAtomicGenericInstruction; 2450def G_AMDGPU_BUFFER_ATOMIC_AND : BufferAtomicGenericInstruction; 2451def G_AMDGPU_BUFFER_ATOMIC_OR : BufferAtomicGenericInstruction; 2452def G_AMDGPU_BUFFER_ATOMIC_XOR : BufferAtomicGenericInstruction; 2453def G_AMDGPU_BUFFER_ATOMIC_INC : BufferAtomicGenericInstruction; 2454def G_AMDGPU_BUFFER_ATOMIC_DEC : BufferAtomicGenericInstruction; 2455 2456def G_AMDGPU_BUFFER_ATOMIC_CMPSWAP : AMDGPUGenericInstruction { 2457 let OutOperandList = (outs type0:$dst); 2458 let InOperandList = (ins type0:$vdata, type0:$cmp, type1:$rsrc, type2:$vindex, 2459 type2:$voffset, type2:$soffset, untyped_imm_0:$offset, 2460 untyped_imm_0:$cachepolicy, untyped_imm_0:$idxen); 2461 let hasSideEffects = 0; 2462 let mayLoad = 1; 2463 let mayStore = 1; 2464} 2465 2466// Wrapper around llvm.amdgcn.s.buffer.load. This is mostly needed as 2467// a workaround for the intrinsic being defined as readnone, but 2468// really needs a memory operand. 2469def G_AMDGPU_S_BUFFER_LOAD : AMDGPUGenericInstruction { 2470 let OutOperandList = (outs type0:$dst); 2471 let InOperandList = (ins type1:$rsrc, type2:$offset, untyped_imm_0:$cachepolicy); 2472 let hasSideEffects = 0; 2473 let mayLoad = 1; 2474 let mayStore = 0; 2475} 2476 2477// This is equivalent to the G_INTRINSIC*, but the operands may have 2478// been legalized depending on the subtarget requirements. 2479def G_AMDGPU_INTRIN_IMAGE_LOAD : AMDGPUGenericInstruction { 2480 let OutOperandList = (outs type0:$dst); 2481 let InOperandList = (ins unknown:$intrin, variable_ops); 2482 let hasSideEffects = 0; 2483 let mayLoad = 1; 2484 2485 // FIXME: Use separate opcode for atomics. 2486 let mayStore = 1; 2487} 2488 2489// This is equivalent to the G_INTRINSIC*, but the operands may have 2490// been legalized depending on the subtarget requirements. 2491def G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPUGenericInstruction { 2492 let OutOperandList = (outs); 2493 let InOperandList = (ins unknown:$intrin, variable_ops); 2494 let hasSideEffects = 0; 2495 let mayStore = 1; 2496} 2497