1//===-- SIInstructions.td - SI Instruction Definitions --------------------===// 2// 3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4// See https://llvm.org/LICENSE.txt for license information. 5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6// 7//===----------------------------------------------------------------------===// 8// This file was originally auto-generated from a GPU register header file and 9// all the instruction definitions were originally commented out. Instructions 10// that are not yet supported remain commented out. 11//===----------------------------------------------------------------------===// 12 13class GCNPat<dag pattern, dag result> : Pat<pattern, result>, GCNPredicateControl { 14 15} 16 17include "SOPInstructions.td" 18include "VOPInstructions.td" 19include "SMInstructions.td" 20include "FLATInstructions.td" 21include "BUFInstructions.td" 22include "EXPInstructions.td" 23 24//===----------------------------------------------------------------------===// 25// VINTRP Instructions 26//===----------------------------------------------------------------------===// 27 28// Used to inject printing of "_e32" suffix for VI (there are "_e64" variants for VI) 29def VINTRPDst : VINTRPDstOperand <VGPR_32>; 30 31let Uses = [MODE, M0, EXEC] in { 32 33// FIXME: Specify SchedRW for VINTRP instructions. 34 35multiclass V_INTERP_P1_F32_m : VINTRP_m < 36 0x00000000, 37 (outs VINTRPDst:$vdst), 38 (ins VGPR_32:$vsrc, Attr:$attr, AttrChan:$attrchan), 39 "v_interp_p1_f32$vdst, $vsrc, $attr$attrchan", 40 [(set f32:$vdst, (int_amdgcn_interp_p1 f32:$vsrc, 41 (i32 timm:$attrchan), (i32 timm:$attr), M0))] 42>; 43 44let OtherPredicates = [has32BankLDS, isNotGFX90APlus] in { 45 46defm V_INTERP_P1_F32 : V_INTERP_P1_F32_m; 47 48} // End OtherPredicates = [has32BankLDS, isNotGFX90APlus] 49 50let OtherPredicates = [has16BankLDS, isNotGFX90APlus], 51 Constraints = "@earlyclobber $vdst", isAsmParserOnly=1 in { 52 53defm V_INTERP_P1_F32_16bank : V_INTERP_P1_F32_m; 54 55} // End OtherPredicates = [has32BankLDS, isNotGFX90APlus], 56 // Constraints = "@earlyclobber $vdst", isAsmParserOnly=1 57 58let OtherPredicates = [isNotGFX90APlus] in { 59let DisableEncoding = "$src0", Constraints = "$src0 = $vdst" in { 60 61defm V_INTERP_P2_F32 : VINTRP_m < 62 0x00000001, 63 (outs VINTRPDst:$vdst), 64 (ins VGPR_32:$src0, VGPR_32:$vsrc, Attr:$attr, AttrChan:$attrchan), 65 "v_interp_p2_f32$vdst, $vsrc, $attr$attrchan", 66 [(set f32:$vdst, (int_amdgcn_interp_p2 f32:$src0, f32:$vsrc, 67 (i32 timm:$attrchan), (i32 timm:$attr), M0))]>; 68 69} // End DisableEncoding = "$src0", Constraints = "$src0 = $vdst" 70 71defm V_INTERP_MOV_F32 : VINTRP_m < 72 0x00000002, 73 (outs VINTRPDst:$vdst), 74 (ins InterpSlot:$vsrc, Attr:$attr, AttrChan:$attrchan), 75 "v_interp_mov_f32$vdst, $vsrc, $attr$attrchan", 76 [(set f32:$vdst, (int_amdgcn_interp_mov (i32 timm:$vsrc), 77 (i32 timm:$attrchan), (i32 timm:$attr), M0))]>; 78 79} // End OtherPredicates = [isNotGFX90APlus] 80 81} // End Uses = [MODE, M0, EXEC] 82 83//===----------------------------------------------------------------------===// 84// Pseudo Instructions 85//===----------------------------------------------------------------------===// 86def ATOMIC_FENCE : SPseudoInstSI< 87 (outs), (ins i32imm:$ordering, i32imm:$scope), 88 [(atomic_fence (i32 timm:$ordering), (i32 timm:$scope))], 89 "ATOMIC_FENCE $ordering, $scope"> { 90 let hasSideEffects = 1; 91 let maybeAtomic = 1; 92} 93 94let hasSideEffects = 0, mayLoad = 0, mayStore = 0, 
Uses = [EXEC] in { 95 96// For use in patterns 97def V_CNDMASK_B64_PSEUDO : VOP3Common <(outs VReg_64:$vdst), 98 (ins VSrc_b64:$src0, VSrc_b64:$src1, SSrc_b64:$src2), "", []> { 99 let isPseudo = 1; 100 let isCodeGenOnly = 1; 101 let usesCustomInserter = 1; 102} 103 104// 64-bit vector move instruction. This is mainly used by the 105// SIFoldOperands pass to enable folding of inline immediates. 106def V_MOV_B64_PSEUDO : VPseudoInstSI <(outs VReg_64:$vdst), 107 (ins VSrc_b64:$src0)> { 108 let isReMaterializable = 1; 109 let isAsCheapAsAMove = 1; 110 let isMoveImm = 1; 111 let SchedRW = [Write64Bit]; 112 let Size = 16; // Needs maximum 2 v_mov_b32 instructions 8 byte long each. 113} 114 115// 64-bit vector move with dpp. Expanded post-RA. 116def V_MOV_B64_DPP_PSEUDO : VOP_DPP_Pseudo <"v_mov_b64_dpp", VOP_I64_I64> { 117 let Size = 16; // Requires two 8-byte v_mov_b32_dpp to complete. 118} 119 120// 64-bit scalar move immediate instruction. This is used to avoid subregs 121// initialization and allow rematerialization. 122def S_MOV_B64_IMM_PSEUDO : SPseudoInstSI <(outs SReg_64:$sdst), 123 (ins i64imm:$src0)> { 124 let isReMaterializable = 1; 125 let isAsCheapAsAMove = 1; 126 let isMoveImm = 1; 127 let SchedRW = [WriteSALU, Write64Bit]; 128 let Size = 16; // Needs maximum 2 s_mov_b32 instructions 8 byte long each. 129 let Uses = []; 130} 131 132// Pseudoinstruction for @llvm.amdgcn.wqm. It is turned into a copy after the 133// WQM pass processes it. 134def WQM : PseudoInstSI <(outs unknown:$vdst), (ins unknown:$src0)>; 135 136// Pseudoinstruction for @llvm.amdgcn.softwqm. Like @llvm.amdgcn.wqm it is 137// turned into a copy by WQM pass, but does not seed WQM requirements. 138def SOFT_WQM : PseudoInstSI <(outs unknown:$vdst), (ins unknown:$src0)>; 139 140// Pseudoinstruction for @llvm.amdgcn.strict.wwm. It is turned into a copy post-RA, so 141// that the @earlyclobber is respected. The @earlyclobber is to make sure that 142// the instruction that defines $src0 (which is run in Whole Wave Mode) doesn't 143// accidentally clobber inactive channels of $vdst. 144let Constraints = "@earlyclobber $vdst" in { 145def STRICT_WWM : PseudoInstSI <(outs unknown:$vdst), (ins unknown:$src0)>; 146def STRICT_WQM : PseudoInstSI <(outs unknown:$vdst), (ins unknown:$src0)>; 147} 148 149} // End let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] 150 151def ENTER_STRICT_WWM : SPseudoInstSI <(outs SReg_1:$sdst), (ins i64imm:$src0)> { 152 let Uses = [EXEC]; 153 let Defs = [EXEC, SCC]; 154 let hasSideEffects = 0; 155 let mayLoad = 0; 156 let mayStore = 0; 157} 158 159def EXIT_STRICT_WWM : SPseudoInstSI <(outs SReg_1:$sdst), (ins SReg_1:$src0)> { 160 let hasSideEffects = 0; 161 let mayLoad = 0; 162 let mayStore = 0; 163} 164 165def ENTER_STRICT_WQM : SPseudoInstSI <(outs SReg_1:$sdst), (ins i64imm:$src0)> { 166 let Uses = [EXEC]; 167 let Defs = [EXEC, SCC]; 168 let hasSideEffects = 0; 169 let mayLoad = 0; 170 let mayStore = 0; 171} 172 173def EXIT_STRICT_WQM : SPseudoInstSI <(outs SReg_1:$sdst), (ins SReg_1:$src0)> { 174 let hasSideEffects = 0; 175 let mayLoad = 0; 176 let mayStore = 0; 177} 178 179// Invert the exec mask and overwrite the inactive lanes of dst with inactive, 180// restoring it after we're done. 
181let Defs = [SCC] in { 182def V_SET_INACTIVE_B32 : VPseudoInstSI <(outs VGPR_32:$vdst), 183 (ins VGPR_32: $src, VSrc_b32:$inactive), 184 [(set i32:$vdst, (int_amdgcn_set_inactive i32:$src, i32:$inactive))]> { 185 let Constraints = "$src = $vdst"; 186} 187 188def V_SET_INACTIVE_B64 : VPseudoInstSI <(outs VReg_64:$vdst), 189 (ins VReg_64: $src, VSrc_b64:$inactive), 190 [(set i64:$vdst, (int_amdgcn_set_inactive i64:$src, i64:$inactive))]> { 191 let Constraints = "$src = $vdst"; 192} 193} // End Defs = [SCC] 194 195let usesCustomInserter = 1, Defs = [VCC, EXEC] in { 196def V_ADD_U64_PSEUDO : VPseudoInstSI < 197 (outs VReg_64:$vdst), (ins VSrc_b64:$src0, VSrc_b64:$src1), 198 [(set VReg_64:$vdst, (getDivergentFrag<add>.ret i64:$src0, i64:$src1))] 199>; 200 201def V_SUB_U64_PSEUDO : VPseudoInstSI < 202 (outs VReg_64:$vdst), (ins VSrc_b64:$src0, VSrc_b64:$src1), 203 [(set VReg_64:$vdst, (getDivergentFrag<sub>.ret i64:$src0, i64:$src1))] 204>; 205} // End usesCustomInserter = 1, Defs = [VCC, EXEC] 206 207let usesCustomInserter = 1, Defs = [SCC] in { 208def S_ADD_U64_PSEUDO : SPseudoInstSI < 209 (outs SReg_64:$sdst), (ins SSrc_b64:$src0, SSrc_b64:$src1), 210 [(set SReg_64:$sdst, (UniformBinFrag<add> i64:$src0, i64:$src1))] 211>; 212 213def S_SUB_U64_PSEUDO : SPseudoInstSI < 214 (outs SReg_64:$sdst), (ins SSrc_b64:$src0, SSrc_b64:$src1), 215 [(set SReg_64:$sdst, (UniformBinFrag<sub> i64:$src0, i64:$src1))] 216>; 217 218def S_ADD_U64_CO_PSEUDO : SPseudoInstSI < 219 (outs SReg_64:$vdst, VOPDstS64orS32:$sdst), (ins SSrc_b64:$src0, SSrc_b64:$src1) 220>; 221 222def S_SUB_U64_CO_PSEUDO : SPseudoInstSI < 223 (outs SReg_64:$vdst, VOPDstS64orS32:$sdst), (ins SSrc_b64:$src0, SSrc_b64:$src1) 224>; 225 226def S_ADD_CO_PSEUDO : SPseudoInstSI < 227 (outs SReg_32:$sdst, SSrc_i1:$scc_out), (ins SSrc_b32:$src0, SSrc_b32:$src1, SSrc_i1:$scc_in) 228>; 229 230def S_SUB_CO_PSEUDO : SPseudoInstSI < 231 (outs SReg_32:$sdst, SSrc_i1:$scc_out), (ins SSrc_b32:$src0, SSrc_b32:$src1, SSrc_i1:$scc_in) 232>; 233 234def S_UADDO_PSEUDO : SPseudoInstSI < 235 (outs SReg_32:$sdst, SSrc_i1:$scc_out), (ins SSrc_b32:$src0, SSrc_b32:$src1) 236>; 237 238def S_USUBO_PSEUDO : SPseudoInstSI < 239 (outs SReg_32:$sdst, SSrc_i1:$scc_out), (ins SSrc_b32:$src0, SSrc_b32:$src1) 240>; 241 242} // End usesCustomInserter = 1, Defs = [SCC] 243 244let usesCustomInserter = 1 in { 245def GET_GROUPSTATICSIZE : SPseudoInstSI <(outs SReg_32:$sdst), (ins), 246 [(set SReg_32:$sdst, (int_amdgcn_groupstaticsize))]>; 247} // End let usesCustomInserter = 1, SALU = 1 248 249// Wrap an instruction by duplicating it, except for setting isTerminator. 
250class WrapTerminatorInst<SOP_Pseudo base_inst> : SPseudoInstSI< 251 base_inst.OutOperandList, 252 base_inst.InOperandList> { 253 let Uses = base_inst.Uses; 254 let Defs = base_inst.Defs; 255 let isTerminator = 1; 256 let isAsCheapAsAMove = base_inst.isAsCheapAsAMove; 257 let hasSideEffects = base_inst.hasSideEffects; 258 let UseNamedOperandTable = base_inst.UseNamedOperandTable; 259 let CodeSize = base_inst.CodeSize; 260 let SchedRW = base_inst.SchedRW; 261} 262 263let WaveSizePredicate = isWave64 in { 264def S_MOV_B64_term : WrapTerminatorInst<S_MOV_B64>; 265def S_XOR_B64_term : WrapTerminatorInst<S_XOR_B64>; 266def S_OR_B64_term : WrapTerminatorInst<S_OR_B64>; 267def S_ANDN2_B64_term : WrapTerminatorInst<S_ANDN2_B64>; 268def S_AND_B64_term : WrapTerminatorInst<S_AND_B64>; 269} 270 271let WaveSizePredicate = isWave32 in { 272def S_MOV_B32_term : WrapTerminatorInst<S_MOV_B32>; 273def S_XOR_B32_term : WrapTerminatorInst<S_XOR_B32>; 274def S_OR_B32_term : WrapTerminatorInst<S_OR_B32>; 275def S_ANDN2_B32_term : WrapTerminatorInst<S_ANDN2_B32>; 276def S_AND_B32_term : WrapTerminatorInst<S_AND_B32>; 277} 278 279 280def WAVE_BARRIER : SPseudoInstSI<(outs), (ins), 281 [(int_amdgcn_wave_barrier)]> { 282 let SchedRW = []; 283 let hasNoSchedulingInfo = 1; 284 let hasSideEffects = 1; 285 let mayLoad = 0; 286 let mayStore = 0; 287 let isConvergent = 1; 288 let FixedSize = 1; 289 let Size = 0; 290} 291 292// SI pseudo instructions. These are used by the CFG structurizer pass 293// and should be lowered to ISA instructions prior to codegen. 294 295let isTerminator = 1 in { 296 297let OtherPredicates = [EnableLateCFGStructurize] in { 298 def SI_NON_UNIFORM_BRCOND_PSEUDO : CFPseudoInstSI < 299 (outs), 300 (ins SReg_1:$vcc, brtarget:$target), 301 [(brcond i1:$vcc, bb:$target)]> { 302 let Size = 12; 303} 304} 305 306def SI_IF: CFPseudoInstSI < 307 (outs SReg_1:$dst), (ins SReg_1:$vcc, brtarget:$target), 308 [(set i1:$dst, (AMDGPUif i1:$vcc, bb:$target))], 1, 1> { 309 let Constraints = ""; 310 let Size = 12; 311 let hasSideEffects = 1; 312} 313 314def SI_ELSE : CFPseudoInstSI < 315 (outs SReg_1:$dst), 316 (ins SReg_1:$src, brtarget:$target), [], 1, 1> { 317 let Size = 12; 318 let hasSideEffects = 1; 319} 320 321def SI_WATERFALL_LOOP : CFPseudoInstSI < 322 (outs), 323 (ins brtarget:$target), [], 1> { 324 let Size = 8; 325 let isBranch = 1; 326 let Defs = []; 327} 328 329def SI_LOOP : CFPseudoInstSI < 330 (outs), (ins SReg_1:$saved, brtarget:$target), 331 [(AMDGPUloop i1:$saved, bb:$target)], 1, 1> { 332 let Size = 8; 333 let isBranch = 1; 334 let hasSideEffects = 1; 335} 336 337} // End isTerminator = 1 338 339def SI_END_CF : CFPseudoInstSI < 340 (outs), (ins SReg_1:$saved), [], 1, 1> { 341 let Size = 4; 342 let isAsCheapAsAMove = 1; 343 let isReMaterializable = 1; 344 let hasSideEffects = 1; 345 let mayLoad = 1; // FIXME: Should not need memory flags 346 let mayStore = 1; 347} 348 349def SI_IF_BREAK : CFPseudoInstSI < 350 (outs SReg_1:$dst), (ins SReg_1:$vcc, SReg_1:$src), []> { 351 let Size = 4; 352 let isAsCheapAsAMove = 1; 353 let isReMaterializable = 1; 354} 355 356// Branch to the early termination block of the shader if SCC is 0. 357// This uses SCC from a previous SALU operation, i.e. the update of 358// a mask of live lanes after a kill/demote operation. 359// Only valid in pixel shaders. 
360def SI_EARLY_TERMINATE_SCC0 : SPseudoInstSI <(outs), (ins)> { 361 let Uses = [EXEC,SCC]; 362} 363 364let Uses = [EXEC] in { 365 366multiclass PseudoInstKill <dag ins> { 367 // Even though this pseudo can usually be expanded without an SCC def, we 368 // conservatively assume that it has an SCC def, both because it is sometimes 369 // required in degenerate cases (when V_CMPX cannot be used due to constant 370 // bus limitations) and because it allows us to avoid having to track SCC 371 // liveness across basic blocks. 372 let Defs = [EXEC,SCC] in 373 def _PSEUDO : PseudoInstSI <(outs), ins> { 374 let isConvergent = 1; 375 let usesCustomInserter = 1; 376 } 377 378 let Defs = [EXEC,SCC] in 379 def _TERMINATOR : SPseudoInstSI <(outs), ins> { 380 let isTerminator = 1; 381 } 382} 383 384defm SI_KILL_I1 : PseudoInstKill <(ins SCSrc_i1:$src, i1imm:$killvalue)>; 385let Defs = [VCC] in 386defm SI_KILL_F32_COND_IMM : PseudoInstKill <(ins VSrc_b32:$src0, i32imm:$src1, i32imm:$cond)>; 387 388let Defs = [EXEC,VCC] in 389def SI_ILLEGAL_COPY : SPseudoInstSI < 390 (outs unknown:$dst), (ins unknown:$src), 391 [], " ; illegal copy $src to $dst">; 392 393} // End Uses = [EXEC], Defs = [EXEC,VCC] 394 395// Branch on undef scc. Used to avoid intermediate copy from 396// IMPLICIT_DEF to SCC. 397def SI_BR_UNDEF : SPseudoInstSI <(outs), (ins sopp_brtarget:$simm16)> { 398 let isTerminator = 1; 399 let usesCustomInserter = 1; 400 let isBranch = 1; 401} 402 403def SI_PS_LIVE : PseudoInstSI < 404 (outs SReg_1:$dst), (ins), 405 [(set i1:$dst, (int_amdgcn_ps_live))]> { 406 let SALU = 1; 407} 408 409let Uses = [EXEC] in { 410def SI_LIVE_MASK : PseudoInstSI < 411 (outs SReg_1:$dst), (ins), 412 [(set i1:$dst, (int_amdgcn_live_mask))]> { 413 let SALU = 1; 414} 415let Defs = [EXEC,SCC] in { 416// Demote: Turn a pixel shader thread into a helper lane. 417def SI_DEMOTE_I1 : SPseudoInstSI <(outs), (ins SCSrc_i1:$src, i1imm:$killvalue)>; 418} // End Defs = [EXEC,SCC] 419} // End Uses = [EXEC] 420 421def SI_MASKED_UNREACHABLE : SPseudoInstSI <(outs), (ins), 422 [(int_amdgcn_unreachable)], 423 "; divergent unreachable"> { 424 let Size = 0; 425 let hasNoSchedulingInfo = 1; 426 let FixedSize = 1; 427} 428 429// Used as an isel pseudo to directly emit initialization with an 430// s_mov_b32 rather than a copy of another initialized 431// register. MachineCSE skips copies, and we don't want to have to 432// fold operands before it runs. 433def SI_INIT_M0 : SPseudoInstSI <(outs), (ins SSrc_b32:$src)> { 434 let Defs = [M0]; 435 let usesCustomInserter = 1; 436 let isAsCheapAsAMove = 1; 437 let isReMaterializable = 1; 438} 439 440def SI_INIT_EXEC : SPseudoInstSI < 441 (outs), (ins i64imm:$src), 442 [(int_amdgcn_init_exec (i64 timm:$src))]> { 443 let Defs = [EXEC]; 444 let isAsCheapAsAMove = 1; 445} 446 447def SI_INIT_EXEC_FROM_INPUT : SPseudoInstSI < 448 (outs), (ins SSrc_b32:$input, i32imm:$shift), 449 [(int_amdgcn_init_exec_from_input i32:$input, (i32 timm:$shift))]> { 450 let Defs = [EXEC]; 451} 452 453// Return for returning shaders to a shader variant epilog. 454def SI_RETURN_TO_EPILOG : SPseudoInstSI < 455 (outs), (ins variable_ops), [(AMDGPUreturn_to_epilog)]> { 456 let isTerminator = 1; 457 let isBarrier = 1; 458 let isReturn = 1; 459 let hasNoSchedulingInfo = 1; 460 let DisableWQM = 1; 461 let FixedSize = 1; 462} 463 464// Return for returning function calls. 
465def SI_RETURN : SPseudoInstSI < 466 (outs), (ins), [], 467 "; return"> { 468 let isTerminator = 1; 469 let isBarrier = 1; 470 let isReturn = 1; 471 let SchedRW = [WriteBranch]; 472} 473 474// Return for returning function calls without output register. 475// 476// This version is only needed so we can fill in the output register 477// in the custom inserter. 478def SI_CALL_ISEL : SPseudoInstSI < 479 (outs), (ins SSrc_b64:$src0, unknown:$callee), 480 [(AMDGPUcall i64:$src0, tglobaladdr:$callee)]> { 481 let Size = 4; 482 let isCall = 1; 483 let SchedRW = [WriteBranch]; 484 let usesCustomInserter = 1; 485 // TODO: Should really base this on the call target 486 let isConvergent = 1; 487} 488 489def : GCNPat< 490 (AMDGPUcall i64:$src0, (i64 0)), 491 (SI_CALL_ISEL $src0, (i64 0)) 492>; 493 494// Wrapper around s_swappc_b64 with extra $callee parameter to track 495// the called function after regalloc. 496def SI_CALL : SPseudoInstSI < 497 (outs SReg_64:$dst), (ins SSrc_b64:$src0, unknown:$callee)> { 498 let Size = 4; 499 let isCall = 1; 500 let UseNamedOperandTable = 1; 501 let SchedRW = [WriteBranch]; 502 // TODO: Should really base this on the call target 503 let isConvergent = 1; 504} 505 506// Tail call handling pseudo 507def SI_TCRETURN : SPseudoInstSI <(outs), 508 (ins SReg_64:$src0, unknown:$callee, i32imm:$fpdiff), 509 [(AMDGPUtc_return i64:$src0, tglobaladdr:$callee, i32:$fpdiff)]> { 510 let Size = 4; 511 let isCall = 1; 512 let isTerminator = 1; 513 let isReturn = 1; 514 let isBarrier = 1; 515 let UseNamedOperandTable = 1; 516 let SchedRW = [WriteBranch]; 517 // TODO: Should really base this on the call target 518 let isConvergent = 1; 519} 520 521// Handle selecting indirect tail calls 522def : GCNPat< 523 (AMDGPUtc_return i64:$src0, (i64 0), (i32 timm:$fpdiff)), 524 (SI_TCRETURN SReg_64:$src0, (i64 0), i32imm:$fpdiff) 525>; 526 527def ADJCALLSTACKUP : SPseudoInstSI< 528 (outs), (ins i32imm:$amt0, i32imm:$amt1), 529 [(callseq_start timm:$amt0, timm:$amt1)], 530 "; adjcallstackup $amt0 $amt1"> { 531 let Size = 8; // Worst case. (s_add_u32 + constant) 532 let FixedSize = 1; 533 let hasSideEffects = 1; 534 let usesCustomInserter = 1; 535 let SchedRW = [WriteSALU]; 536 let Defs = [SCC]; 537} 538 539def ADJCALLSTACKDOWN : SPseudoInstSI< 540 (outs), (ins i32imm:$amt1, i32imm:$amt2), 541 [(callseq_end timm:$amt1, timm:$amt2)], 542 "; adjcallstackdown $amt1"> { 543 let Size = 8; // Worst case. (s_add_u32 + constant) 544 let hasSideEffects = 1; 545 let usesCustomInserter = 1; 546 let SchedRW = [WriteSALU]; 547 let Defs = [SCC]; 548} 549 550let Defs = [M0, EXEC, SCC], 551 UseNamedOperandTable = 1 in { 552 553// SI_INDIRECT_SRC/DST are only used by legacy SelectionDAG indirect 554// addressing implementation. 
555class SI_INDIRECT_SRC<RegisterClass rc> : VPseudoInstSI < 556 (outs VGPR_32:$vdst), 557 (ins rc:$src, VS_32:$idx, i32imm:$offset)> { 558 let usesCustomInserter = 1; 559} 560 561class SI_INDIRECT_DST<RegisterClass rc> : VPseudoInstSI < 562 (outs rc:$vdst), 563 (ins rc:$src, VS_32:$idx, i32imm:$offset, VGPR_32:$val)> { 564 let Constraints = "$src = $vdst"; 565 let usesCustomInserter = 1; 566} 567 568def SI_INDIRECT_SRC_V1 : SI_INDIRECT_SRC<VGPR_32>; 569def SI_INDIRECT_SRC_V2 : SI_INDIRECT_SRC<VReg_64>; 570def SI_INDIRECT_SRC_V4 : SI_INDIRECT_SRC<VReg_128>; 571def SI_INDIRECT_SRC_V8 : SI_INDIRECT_SRC<VReg_256>; 572def SI_INDIRECT_SRC_V16 : SI_INDIRECT_SRC<VReg_512>; 573def SI_INDIRECT_SRC_V32 : SI_INDIRECT_SRC<VReg_1024>; 574 575def SI_INDIRECT_DST_V1 : SI_INDIRECT_DST<VGPR_32>; 576def SI_INDIRECT_DST_V2 : SI_INDIRECT_DST<VReg_64>; 577def SI_INDIRECT_DST_V4 : SI_INDIRECT_DST<VReg_128>; 578def SI_INDIRECT_DST_V8 : SI_INDIRECT_DST<VReg_256>; 579def SI_INDIRECT_DST_V16 : SI_INDIRECT_DST<VReg_512>; 580def SI_INDIRECT_DST_V32 : SI_INDIRECT_DST<VReg_1024>; 581 582} // End Uses = [EXEC], Defs = [M0, EXEC] 583 584// This is a pseudo variant of the v_movreld_b32 instruction in which the 585// vector operand appears only twice, once as def and once as use. Using this 586// pseudo avoids problems with the Two Address instructions pass. 587class INDIRECT_REG_WRITE_MOVREL_pseudo<RegisterClass rc, 588 RegisterOperand val_ty> : PseudoInstSI < 589 (outs rc:$vdst), (ins rc:$vsrc, val_ty:$val, i32imm:$subreg)> { 590 let Constraints = "$vsrc = $vdst"; 591 let Uses = [M0]; 592} 593 594class V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<RegisterClass rc> : 595 INDIRECT_REG_WRITE_MOVREL_pseudo<rc, VSrc_b32> { 596 let VALU = 1; 597 let VOP1 = 1; 598 let Uses = [M0, EXEC]; 599} 600 601class S_INDIRECT_REG_WRITE_MOVREL_pseudo<RegisterClass rc, 602 RegisterOperand val_ty> : 603 INDIRECT_REG_WRITE_MOVREL_pseudo<rc, val_ty> { 604 let SALU = 1; 605 let SOP1 = 1; 606 let Uses = [M0]; 607} 608 609class S_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<RegisterClass rc> : 610 S_INDIRECT_REG_WRITE_MOVREL_pseudo<rc, SSrc_b32>; 611class S_INDIRECT_REG_WRITE_MOVREL_B64_pseudo<RegisterClass rc> : 612 S_INDIRECT_REG_WRITE_MOVREL_pseudo<rc, SSrc_b64>; 613 614def V_INDIRECT_REG_WRITE_MOVREL_B32_V1 : V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<VGPR_32>; 615def V_INDIRECT_REG_WRITE_MOVREL_B32_V2 : V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<VReg_64>; 616def V_INDIRECT_REG_WRITE_MOVREL_B32_V3 : V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<VReg_96>; 617def V_INDIRECT_REG_WRITE_MOVREL_B32_V4 : V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<VReg_128>; 618def V_INDIRECT_REG_WRITE_MOVREL_B32_V5 : V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<VReg_160>; 619def V_INDIRECT_REG_WRITE_MOVREL_B32_V8 : V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<VReg_256>; 620def V_INDIRECT_REG_WRITE_MOVREL_B32_V16 : V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<VReg_512>; 621def V_INDIRECT_REG_WRITE_MOVREL_B32_V32 : V_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<VReg_1024>; 622 623def S_INDIRECT_REG_WRITE_MOVREL_B32_V1 : S_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<SReg_32>; 624def S_INDIRECT_REG_WRITE_MOVREL_B32_V2 : S_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<SReg_64>; 625def S_INDIRECT_REG_WRITE_MOVREL_B32_V3 : S_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<SReg_96>; 626def S_INDIRECT_REG_WRITE_MOVREL_B32_V4 : S_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<SReg_128>; 627def S_INDIRECT_REG_WRITE_MOVREL_B32_V5 : S_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<SReg_160>; 628def S_INDIRECT_REG_WRITE_MOVREL_B32_V8 : 
S_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<SReg_256>; 629def S_INDIRECT_REG_WRITE_MOVREL_B32_V16 : S_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<SReg_512>; 630def S_INDIRECT_REG_WRITE_MOVREL_B32_V32 : S_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<SReg_1024>; 631 632def S_INDIRECT_REG_WRITE_MOVREL_B64_V1 : S_INDIRECT_REG_WRITE_MOVREL_B64_pseudo<SReg_64>; 633def S_INDIRECT_REG_WRITE_MOVREL_B64_V2 : S_INDIRECT_REG_WRITE_MOVREL_B64_pseudo<SReg_128>; 634def S_INDIRECT_REG_WRITE_MOVREL_B64_V4 : S_INDIRECT_REG_WRITE_MOVREL_B64_pseudo<SReg_256>; 635def S_INDIRECT_REG_WRITE_MOVREL_B64_V8 : S_INDIRECT_REG_WRITE_MOVREL_B64_pseudo<SReg_512>; 636def S_INDIRECT_REG_WRITE_MOVREL_B64_V16 : S_INDIRECT_REG_WRITE_MOVREL_B64_pseudo<SReg_1024>; 637 638// These variants of V_INDIRECT_REG_READ/WRITE use VGPR indexing. By using these 639// pseudos we avoid spills or copies being inserted within indirect sequences 640// that switch the VGPR indexing mode. Spills to accvgprs could be effected by 641// this mode switching. 642 643class V_INDIRECT_REG_WRITE_GPR_IDX_pseudo<RegisterClass rc> : PseudoInstSI < 644 (outs rc:$vdst), (ins rc:$vsrc, VSrc_b32:$val, SSrc_b32:$idx, i32imm:$subreg)> { 645 let Constraints = "$vsrc = $vdst"; 646 let VALU = 1; 647 let Uses = [M0, EXEC]; 648 let Defs = [M0]; 649} 650 651def V_INDIRECT_REG_WRITE_GPR_IDX_B32_V1 : V_INDIRECT_REG_WRITE_GPR_IDX_pseudo<VGPR_32>; 652def V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2 : V_INDIRECT_REG_WRITE_GPR_IDX_pseudo<VReg_64>; 653def V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3 : V_INDIRECT_REG_WRITE_GPR_IDX_pseudo<VReg_96>; 654def V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4 : V_INDIRECT_REG_WRITE_GPR_IDX_pseudo<VReg_128>; 655def V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5 : V_INDIRECT_REG_WRITE_GPR_IDX_pseudo<VReg_160>; 656def V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8 : V_INDIRECT_REG_WRITE_GPR_IDX_pseudo<VReg_256>; 657def V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16 : V_INDIRECT_REG_WRITE_GPR_IDX_pseudo<VReg_512>; 658def V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32 : V_INDIRECT_REG_WRITE_GPR_IDX_pseudo<VReg_1024>; 659 660class V_INDIRECT_REG_READ_GPR_IDX_pseudo<RegisterClass rc> : PseudoInstSI < 661 (outs VGPR_32:$vdst), (ins rc:$vsrc, SSrc_b32:$idx, i32imm:$subreg)> { 662 let VALU = 1; 663 let Uses = [M0, EXEC]; 664 let Defs = [M0]; 665} 666 667def V_INDIRECT_REG_READ_GPR_IDX_B32_V1 : V_INDIRECT_REG_READ_GPR_IDX_pseudo<VGPR_32>; 668def V_INDIRECT_REG_READ_GPR_IDX_B32_V2 : V_INDIRECT_REG_READ_GPR_IDX_pseudo<VReg_64>; 669def V_INDIRECT_REG_READ_GPR_IDX_B32_V3 : V_INDIRECT_REG_READ_GPR_IDX_pseudo<VReg_96>; 670def V_INDIRECT_REG_READ_GPR_IDX_B32_V4 : V_INDIRECT_REG_READ_GPR_IDX_pseudo<VReg_128>; 671def V_INDIRECT_REG_READ_GPR_IDX_B32_V5 : V_INDIRECT_REG_READ_GPR_IDX_pseudo<VReg_160>; 672def V_INDIRECT_REG_READ_GPR_IDX_B32_V8 : V_INDIRECT_REG_READ_GPR_IDX_pseudo<VReg_256>; 673def V_INDIRECT_REG_READ_GPR_IDX_B32_V16 : V_INDIRECT_REG_READ_GPR_IDX_pseudo<VReg_512>; 674def V_INDIRECT_REG_READ_GPR_IDX_B32_V32 : V_INDIRECT_REG_READ_GPR_IDX_pseudo<VReg_1024>; 675 676multiclass SI_SPILL_SGPR <RegisterClass sgpr_class> { 677 let UseNamedOperandTable = 1, SGPRSpill = 1, Uses = [EXEC] in { 678 def _SAVE : PseudoInstSI < 679 (outs), 680 (ins sgpr_class:$data, i32imm:$addr)> { 681 let mayStore = 1; 682 let mayLoad = 0; 683 } 684 685 def _RESTORE : PseudoInstSI < 686 (outs sgpr_class:$data), 687 (ins i32imm:$addr)> { 688 let mayStore = 0; 689 let mayLoad = 1; 690 } 691 } // End UseNamedOperandTable = 1 692} 693 694// You cannot use M0 as the output of v_readlane_b32 instructions or 695// use it in the sdata operand of SMEM instructions. 
We still need to 696// be able to spill the physical register m0, so allow it for 697// SI_SPILL_32_* instructions. 698defm SI_SPILL_S32 : SI_SPILL_SGPR <SReg_32>; 699defm SI_SPILL_S64 : SI_SPILL_SGPR <SReg_64>; 700defm SI_SPILL_S96 : SI_SPILL_SGPR <SReg_96>; 701defm SI_SPILL_S128 : SI_SPILL_SGPR <SReg_128>; 702defm SI_SPILL_S160 : SI_SPILL_SGPR <SReg_160>; 703defm SI_SPILL_S192 : SI_SPILL_SGPR <SReg_192>; 704defm SI_SPILL_S224 : SI_SPILL_SGPR <SReg_224>; 705defm SI_SPILL_S256 : SI_SPILL_SGPR <SReg_256>; 706defm SI_SPILL_S512 : SI_SPILL_SGPR <SReg_512>; 707defm SI_SPILL_S1024 : SI_SPILL_SGPR <SReg_1024>; 708 709// VGPR or AGPR spill instructions. In case of AGPR spilling a temp register 710// needs to be used and an extra instruction to move between VGPR and AGPR. 711// UsesTmp adds to the total size of an expanded spill in this case. 712multiclass SI_SPILL_VGPR <RegisterClass vgpr_class, bit UsesTmp = 0> { 713 let UseNamedOperandTable = 1, VGPRSpill = 1, 714 SchedRW = [WriteVMEM] in { 715 def _SAVE : VPseudoInstSI < 716 (outs), 717 (ins vgpr_class:$vdata, i32imm:$vaddr, 718 SReg_32:$soffset, i32imm:$offset)> { 719 let mayStore = 1; 720 let mayLoad = 0; 721 // (2 * 4) + (8 * num_subregs) bytes maximum 722 int MaxSize = !add(!shl(!srl(vgpr_class.Size, 5), !add(UsesTmp, 3)), 8); 723 // Size field is unsigned char and cannot fit more. 724 let Size = !if(!le(MaxSize, 256), MaxSize, 252); 725 } 726 727 def _RESTORE : VPseudoInstSI < 728 (outs vgpr_class:$vdata), 729 (ins i32imm:$vaddr, 730 SReg_32:$soffset, i32imm:$offset)> { 731 let mayStore = 0; 732 let mayLoad = 1; 733 734 // (2 * 4) + (8 * num_subregs) bytes maximum 735 int MaxSize = !add(!shl(!srl(vgpr_class.Size, 5), !add(UsesTmp, 3)), 8); 736 // Size field is unsigned char and cannot fit more. 737 let Size = !if(!le(MaxSize, 256), MaxSize, 252); 738 } 739 } // End UseNamedOperandTable = 1, VGPRSpill = 1, SchedRW = [WriteVMEM] 740} 741 742defm SI_SPILL_V32 : SI_SPILL_VGPR <VGPR_32>; 743defm SI_SPILL_V64 : SI_SPILL_VGPR <VReg_64>; 744defm SI_SPILL_V96 : SI_SPILL_VGPR <VReg_96>; 745defm SI_SPILL_V128 : SI_SPILL_VGPR <VReg_128>; 746defm SI_SPILL_V160 : SI_SPILL_VGPR <VReg_160>; 747defm SI_SPILL_V192 : SI_SPILL_VGPR <VReg_192>; 748defm SI_SPILL_V224 : SI_SPILL_VGPR <VReg_224>; 749defm SI_SPILL_V256 : SI_SPILL_VGPR <VReg_256>; 750defm SI_SPILL_V512 : SI_SPILL_VGPR <VReg_512>; 751defm SI_SPILL_V1024 : SI_SPILL_VGPR <VReg_1024>; 752 753defm SI_SPILL_A32 : SI_SPILL_VGPR <AGPR_32, 1>; 754defm SI_SPILL_A64 : SI_SPILL_VGPR <AReg_64, 1>; 755defm SI_SPILL_A96 : SI_SPILL_VGPR <AReg_96, 1>; 756defm SI_SPILL_A128 : SI_SPILL_VGPR <AReg_128, 1>; 757defm SI_SPILL_A160 : SI_SPILL_VGPR <AReg_160, 1>; 758defm SI_SPILL_A192 : SI_SPILL_VGPR <AReg_192, 1>; 759defm SI_SPILL_A224 : SI_SPILL_VGPR <AReg_224, 1>; 760defm SI_SPILL_A256 : SI_SPILL_VGPR <AReg_256, 1>; 761defm SI_SPILL_A512 : SI_SPILL_VGPR <AReg_512, 1>; 762defm SI_SPILL_A1024 : SI_SPILL_VGPR <AReg_1024, 1>; 763 764defm SI_SPILL_AV32 : SI_SPILL_VGPR <AV_32, 1>; 765defm SI_SPILL_AV64 : SI_SPILL_VGPR <AV_64, 1>; 766defm SI_SPILL_AV96 : SI_SPILL_VGPR <AV_96, 1>; 767defm SI_SPILL_AV128 : SI_SPILL_VGPR <AV_128, 1>; 768defm SI_SPILL_AV160 : SI_SPILL_VGPR <AV_160, 1>; 769defm SI_SPILL_AV192 : SI_SPILL_VGPR <AV_192, 1>; 770defm SI_SPILL_AV224 : SI_SPILL_VGPR <AV_224, 1>; 771defm SI_SPILL_AV256 : SI_SPILL_VGPR <AV_256, 1>; 772defm SI_SPILL_AV512 : SI_SPILL_VGPR <AV_512, 1>; 773defm SI_SPILL_AV1024 : SI_SPILL_VGPR <AV_1024, 1>; 774 775def SI_PC_ADD_REL_OFFSET : SPseudoInstSI < 776 (outs SReg_64:$dst), 777 (ins 
si_ga:$ptr_lo, si_ga:$ptr_hi), 778 [(set SReg_64:$dst, 779 (i64 (SIpc_add_rel_offset tglobaladdr:$ptr_lo, tglobaladdr:$ptr_hi)))]> { 780 let Defs = [SCC]; 781} 782 783def : GCNPat < 784 (SIpc_add_rel_offset tglobaladdr:$ptr_lo, 0), 785 (SI_PC_ADD_REL_OFFSET $ptr_lo, (i32 0)) 786>; 787 788def : GCNPat< 789 (AMDGPUtrap timm:$trapid), 790 (S_TRAP $trapid) 791>; 792 793def : GCNPat< 794 (AMDGPUelse i1:$src, bb:$target), 795 (SI_ELSE $src, $target) 796>; 797 798def : Pat < 799 (int_amdgcn_kill i1:$src), 800 (SI_KILL_I1_PSEUDO SCSrc_i1:$src, 0) 801>; 802 803def : Pat < 804 (int_amdgcn_kill (i1 (not i1:$src))), 805 (SI_KILL_I1_PSEUDO SCSrc_i1:$src, -1) 806>; 807 808def : Pat < 809 (int_amdgcn_kill (i1 (setcc f32:$src, InlineImmFP32:$imm, cond:$cond))), 810 (SI_KILL_F32_COND_IMM_PSEUDO VSrc_b32:$src, (bitcast_fpimm_to_i32 $imm), (cond_as_i32imm $cond)) 811>; 812 813def : Pat < 814 (int_amdgcn_wqm_demote i1:$src), 815 (SI_DEMOTE_I1 SCSrc_i1:$src, 0) 816>; 817 818def : Pat < 819 (int_amdgcn_wqm_demote (i1 (not i1:$src))), 820 (SI_DEMOTE_I1 SCSrc_i1:$src, -1) 821>; 822 823 // TODO: we could add more variants for other types of conditionals 824 825def : Pat < 826 (i64 (int_amdgcn_icmp i1:$src, (i1 0), (i32 33))), 827 (COPY $src) // Return the SGPRs representing i1 src 828>; 829 830def : Pat < 831 (i32 (int_amdgcn_icmp i1:$src, (i1 0), (i32 33))), 832 (COPY $src) // Return the SGPRs representing i1 src 833>; 834 835//===----------------------------------------------------------------------===// 836// VOP1 Patterns 837//===----------------------------------------------------------------------===// 838 839let OtherPredicates = [UnsafeFPMath] in { 840 841// Convert (x - floor(x)) to fract(x) 842def : GCNPat < 843 (f32 (fsub (f32 (VOP3Mods f32:$x, i32:$mods)), 844 (f32 (ffloor (f32 (VOP3Mods f32:$x, i32:$mods)))))), 845 (V_FRACT_F32_e64 $mods, $x) 846>; 847 848// Convert (x + (-floor(x))) to fract(x) 849def : GCNPat < 850 (f64 (fadd (f64 (VOP3Mods f64:$x, i32:$mods)), 851 (f64 (fneg (f64 (ffloor (f64 (VOP3Mods f64:$x, i32:$mods)))))))), 852 (V_FRACT_F64_e64 $mods, $x) 853>; 854 855} // End OtherPredicates = [UnsafeFPMath] 856 857 858// f16_to_fp patterns 859def : GCNPat < 860 (f32 (f16_to_fp i32:$src0)), 861 (V_CVT_F32_F16_e64 SRCMODS.NONE, $src0) 862>; 863 864def : GCNPat < 865 (f32 (f16_to_fp (and_oneuse i32:$src0, 0x7fff))), 866 (V_CVT_F32_F16_e64 SRCMODS.ABS, $src0) 867>; 868 869def : GCNPat < 870 (f32 (f16_to_fp (i32 (srl_oneuse (and_oneuse i32:$src0, 0x7fff0000), (i32 16))))), 871 (V_CVT_F32_F16_e64 SRCMODS.ABS, (i32 (V_LSHRREV_B32_e64 (i32 16), i32:$src0))) 872>; 873 874def : GCNPat < 875 (f32 (f16_to_fp (or_oneuse i32:$src0, 0x8000))), 876 (V_CVT_F32_F16_e64 SRCMODS.NEG_ABS, $src0) 877>; 878 879def : GCNPat < 880 (f32 (f16_to_fp (xor_oneuse i32:$src0, 0x8000))), 881 (V_CVT_F32_F16_e64 SRCMODS.NEG, $src0) 882>; 883 884def : GCNPat < 885 (f64 (fpextend f16:$src)), 886 (V_CVT_F64_F32_e32 (V_CVT_F32_F16_e32 $src)) 887>; 888 889// fp_to_fp16 patterns 890def : GCNPat < 891 (i32 (AMDGPUfp_to_f16 (f32 (VOP3Mods f32:$src0, i32:$src0_modifiers)))), 892 (V_CVT_F16_F32_e64 $src0_modifiers, f32:$src0) 893>; 894 895def : GCNPat < 896 (i32 (fp_to_sint f16:$src)), 897 (V_CVT_I32_F32_e32 (V_CVT_F32_F16_e32 VSrc_b32:$src)) 898>; 899 900def : GCNPat < 901 (i32 (fp_to_uint f16:$src)), 902 (V_CVT_U32_F32_e32 (V_CVT_F32_F16_e32 VSrc_b32:$src)) 903>; 904 905def : GCNPat < 906 (f16 (sint_to_fp i32:$src)), 907 (V_CVT_F16_F32_e32 (V_CVT_F32_I32_e32 VSrc_b32:$src)) 908>; 909 910def : GCNPat < 911 (f16 (uint_to_fp i32:$src)), 
912 (V_CVT_F16_F32_e32 (V_CVT_F32_U32_e32 VSrc_b32:$src)) 913>; 914 915//===----------------------------------------------------------------------===// 916// VOP2 Patterns 917//===----------------------------------------------------------------------===// 918 919// NoMods pattern used for mac. If there are any source modifiers then it's 920// better to select mad instead of mac. 921class FMADPat <ValueType vt, Instruction inst, SDPatternOperator node> 922 : GCNPat <(vt (node (vt (VOP3NoMods vt:$src0)), 923 (vt (VOP3NoMods vt:$src1)), 924 (vt (VOP3NoMods vt:$src2)))), 925 (inst SRCMODS.NONE, $src0, SRCMODS.NONE, $src1, 926 SRCMODS.NONE, $src2, DSTCLAMP.NONE, DSTOMOD.NONE) 927>; 928 929// Prefer mac form when there are no modifiers. 930let AddedComplexity = 9 in { 931let OtherPredicates = [HasMadMacF32Insts] in { 932def : FMADPat <f32, V_MAC_F32_e64, fmad>; 933def : FMADPat <f32, V_MAC_F32_e64, AMDGPUfmad_ftz>; 934} // OtherPredicates = [HasMadMacF32Insts] 935 936// Don't allow source modifiers. If there are any source modifiers then it's 937// better to select mad instead of mac. 938let SubtargetPredicate = isGFX6GFX7GFX10, 939 OtherPredicates = [HasMadMacF32Insts, NoFP32Denormals] in 940def : GCNPat < 941 (f32 (fadd (AMDGPUfmul_legacy (VOP3NoMods f32:$src0), 942 (VOP3NoMods f32:$src1)), 943 (VOP3NoMods f32:$src2))), 944 (V_MAC_LEGACY_F32_e64 SRCMODS.NONE, $src0, SRCMODS.NONE, $src1, 945 SRCMODS.NONE, $src2, DSTCLAMP.NONE, DSTOMOD.NONE) 946>; 947 948// Don't allow source modifiers. If there are any source modifiers then it's 949// better to select fma instead of fmac. 950let SubtargetPredicate = HasFmaLegacy32 in 951def : GCNPat < 952 (f32 (int_amdgcn_fma_legacy (VOP3NoMods f32:$src0), 953 (VOP3NoMods f32:$src1), 954 (VOP3NoMods f32:$src2))), 955 (V_FMAC_LEGACY_F32_e64 SRCMODS.NONE, $src0, SRCMODS.NONE, $src1, 956 SRCMODS.NONE, $src2, DSTCLAMP.NONE, DSTOMOD.NONE) 957>; 958 959let SubtargetPredicate = Has16BitInsts in { 960def : FMADPat <f16, V_MAC_F16_e64, fmad>; 961def : FMADPat <f16, V_MAC_F16_e64, AMDGPUfmad_ftz>; 962} // SubtargetPredicate = Has16BitInsts 963} // AddedComplexity = 9 964 965class FMADModsPat<ValueType Ty, Instruction inst, SDPatternOperator mad_opr> 966 : GCNPat< 967 (Ty (mad_opr (Ty (VOP3Mods Ty:$src0, i32:$src0_mod)), 968 (Ty (VOP3Mods Ty:$src1, i32:$src1_mod)), 969 (Ty (VOP3Mods Ty:$src2, i32:$src2_mod)))), 970 (inst $src0_mod, $src0, $src1_mod, $src1, 971 $src2_mod, $src2, DSTCLAMP.NONE, DSTOMOD.NONE) 972>; 973 974let OtherPredicates = [HasMadMacF32Insts] in 975def : FMADModsPat<f32, V_MAD_F32_e64, AMDGPUfmad_ftz>; 976 977let OtherPredicates = [HasMadMacF32Insts, NoFP32Denormals] in 978def : GCNPat < 979 (f32 (fadd (AMDGPUfmul_legacy (VOP3Mods f32:$src0, i32:$src0_mod), 980 (VOP3Mods f32:$src1, i32:$src1_mod)), 981 (VOP3Mods f32:$src2, i32:$src2_mod))), 982 (V_MAD_LEGACY_F32_e64 $src0_mod, $src0, $src1_mod, $src1, 983 $src2_mod, $src2, DSTCLAMP.NONE, DSTOMOD.NONE) 984>; 985 986let SubtargetPredicate = Has16BitInsts in 987def : FMADModsPat<f16, V_MAD_F16_e64, AMDGPUfmad_ftz>; 988 989class VOPSelectModsPat <ValueType vt> : GCNPat < 990 (vt (select i1:$src0, (VOP3Mods vt:$src1, i32:$src1_mods), 991 (VOP3Mods vt:$src2, i32:$src2_mods))), 992 (V_CNDMASK_B32_e64 FP32InputMods:$src2_mods, VSrc_b32:$src2, 993 FP32InputMods:$src1_mods, VSrc_b32:$src1, SSrc_i1:$src0) 994>; 995 996class VOPSelectPat <ValueType vt> : GCNPat < 997 (vt (select i1:$src0, vt:$src1, vt:$src2)), 998 (V_CNDMASK_B32_e64 0, VSrc_b32:$src2, 0, VSrc_b32:$src1, SSrc_i1:$src0) 999>; 1000 1001def : 
VOPSelectModsPat <i32>; 1002def : VOPSelectModsPat <f32>; 1003def : VOPSelectPat <f16>; 1004def : VOPSelectPat <i16>; 1005 1006let AddedComplexity = 1 in { 1007def : GCNPat < 1008 (i32 (add (i32 (getDivergentFrag<ctpop>.ret i32:$popcnt)), i32:$val)), 1009 (V_BCNT_U32_B32_e64 $popcnt, $val) 1010>; 1011} 1012 1013def : GCNPat < 1014 (i32 (DivergentUnaryFrag<ctpop> i32:$popcnt)), 1015 (V_BCNT_U32_B32_e64 VSrc_b32:$popcnt, (i32 0)) 1016>; 1017 1018def : GCNPat < 1019 (i16 (add (i16 (trunc (i32 (getDivergentFrag<ctpop>.ret i32:$popcnt)))), i16:$val)), 1020 (V_BCNT_U32_B32_e64 $popcnt, $val) 1021>; 1022 1023def : GCNPat < 1024 (i64 (DivergentUnaryFrag<ctpop> i64:$src)), 1025 (REG_SEQUENCE VReg_64, 1026 (V_BCNT_U32_B32_e64 (i32 (EXTRACT_SUBREG i64:$src, sub1)), 1027 (i32 (V_BCNT_U32_B32_e64 (i32 (EXTRACT_SUBREG i64:$src, sub0)), (i32 0)))), sub0, 1028 (i32 (V_MOV_B32_e32 (i32 0))), sub1) 1029>; 1030 1031/********** ============================================ **********/ 1032/********** Extraction, Insertion, Building and Casting **********/ 1033/********** ============================================ **********/ 1034 1035// Special case for 2 element vectors. REQ_SEQUENCE produces better code 1036// than an INSERT_SUBREG. 1037multiclass Insert_Element_V2<RegisterClass RC, ValueType elem_type, ValueType vec_type> { 1038 def : GCNPat < 1039 (insertelt vec_type:$vec, elem_type:$elem, 0), 1040 (REG_SEQUENCE RC, $elem, sub0, (elem_type (EXTRACT_SUBREG $vec, sub1)), sub1) 1041 >; 1042 1043 def : GCNPat < 1044 (insertelt vec_type:$vec, elem_type:$elem, 1), 1045 (REG_SEQUENCE RC, (elem_type (EXTRACT_SUBREG $vec, sub0)), sub0, $elem, sub1) 1046 >; 1047} 1048 1049foreach Index = 0-1 in { 1050 def Extract_Element_v2i32_#Index : Extract_Element < 1051 i32, v2i32, Index, !cast<SubRegIndex>(sub#Index) 1052 >; 1053 1054 def Extract_Element_v2f32_#Index : Extract_Element < 1055 f32, v2f32, Index, !cast<SubRegIndex>(sub#Index) 1056 >; 1057} 1058 1059defm : Insert_Element_V2 <SReg_64, i32, v2i32>; 1060defm : Insert_Element_V2 <SReg_64, f32, v2f32>; 1061 1062foreach Index = 0-2 in { 1063 def Extract_Element_v3i32_#Index : Extract_Element < 1064 i32, v3i32, Index, !cast<SubRegIndex>(sub#Index) 1065 >; 1066 def Insert_Element_v3i32_#Index : Insert_Element < 1067 i32, v3i32, Index, !cast<SubRegIndex>(sub#Index) 1068 >; 1069 1070 def Extract_Element_v3f32_#Index : Extract_Element < 1071 f32, v3f32, Index, !cast<SubRegIndex>(sub#Index) 1072 >; 1073 def Insert_Element_v3f32_#Index : Insert_Element < 1074 f32, v3f32, Index, !cast<SubRegIndex>(sub#Index) 1075 >; 1076} 1077 1078foreach Index = 0-3 in { 1079 def Extract_Element_v4i32_#Index : Extract_Element < 1080 i32, v4i32, Index, !cast<SubRegIndex>(sub#Index) 1081 >; 1082 def Insert_Element_v4i32_#Index : Insert_Element < 1083 i32, v4i32, Index, !cast<SubRegIndex>(sub#Index) 1084 >; 1085 1086 def Extract_Element_v4f32_#Index : Extract_Element < 1087 f32, v4f32, Index, !cast<SubRegIndex>(sub#Index) 1088 >; 1089 def Insert_Element_v4f32_#Index : Insert_Element < 1090 f32, v4f32, Index, !cast<SubRegIndex>(sub#Index) 1091 >; 1092} 1093 1094foreach Index = 0-4 in { 1095 def Extract_Element_v5i32_#Index : Extract_Element < 1096 i32, v5i32, Index, !cast<SubRegIndex>(sub#Index) 1097 >; 1098 def Insert_Element_v5i32_#Index : Insert_Element < 1099 i32, v5i32, Index, !cast<SubRegIndex>(sub#Index) 1100 >; 1101 1102 def Extract_Element_v5f32_#Index : Extract_Element < 1103 f32, v5f32, Index, !cast<SubRegIndex>(sub#Index) 1104 >; 1105 def Insert_Element_v5f32_#Index : Insert_Element 
< 1106 f32, v5f32, Index, !cast<SubRegIndex>(sub#Index) 1107 >; 1108} 1109 1110foreach Index = 0-5 in { 1111 def Extract_Element_v6i32_#Index : Extract_Element < 1112 i32, v6i32, Index, !cast<SubRegIndex>(sub#Index) 1113 >; 1114 def Insert_Element_v6i32_#Index : Insert_Element < 1115 i32, v6i32, Index, !cast<SubRegIndex>(sub#Index) 1116 >; 1117 1118 def Extract_Element_v6f32_#Index : Extract_Element < 1119 f32, v6f32, Index, !cast<SubRegIndex>(sub#Index) 1120 >; 1121 def Insert_Element_v6f32_#Index : Insert_Element < 1122 f32, v6f32, Index, !cast<SubRegIndex>(sub#Index) 1123 >; 1124} 1125 1126foreach Index = 0-6 in { 1127 def Extract_Element_v7i32_#Index : Extract_Element < 1128 i32, v7i32, Index, !cast<SubRegIndex>(sub#Index) 1129 >; 1130 def Insert_Element_v7i32_#Index : Insert_Element < 1131 i32, v7i32, Index, !cast<SubRegIndex>(sub#Index) 1132 >; 1133 1134 def Extract_Element_v7f32_#Index : Extract_Element < 1135 f32, v7f32, Index, !cast<SubRegIndex>(sub#Index) 1136 >; 1137 def Insert_Element_v7f32_#Index : Insert_Element < 1138 f32, v7f32, Index, !cast<SubRegIndex>(sub#Index) 1139 >; 1140} 1141 1142foreach Index = 0-7 in { 1143 def Extract_Element_v8i32_#Index : Extract_Element < 1144 i32, v8i32, Index, !cast<SubRegIndex>(sub#Index) 1145 >; 1146 def Insert_Element_v8i32_#Index : Insert_Element < 1147 i32, v8i32, Index, !cast<SubRegIndex>(sub#Index) 1148 >; 1149 1150 def Extract_Element_v8f32_#Index : Extract_Element < 1151 f32, v8f32, Index, !cast<SubRegIndex>(sub#Index) 1152 >; 1153 def Insert_Element_v8f32_#Index : Insert_Element < 1154 f32, v8f32, Index, !cast<SubRegIndex>(sub#Index) 1155 >; 1156} 1157 1158foreach Index = 0-15 in { 1159 def Extract_Element_v16i32_#Index : Extract_Element < 1160 i32, v16i32, Index, !cast<SubRegIndex>(sub#Index) 1161 >; 1162 def Insert_Element_v16i32_#Index : Insert_Element < 1163 i32, v16i32, Index, !cast<SubRegIndex>(sub#Index) 1164 >; 1165 1166 def Extract_Element_v16f32_#Index : Extract_Element < 1167 f32, v16f32, Index, !cast<SubRegIndex>(sub#Index) 1168 >; 1169 def Insert_Element_v16f32_#Index : Insert_Element < 1170 f32, v16f32, Index, !cast<SubRegIndex>(sub#Index) 1171 >; 1172} 1173 1174 1175def : Pat < 1176 (extract_subvector v4i16:$vec, (i32 0)), 1177 (v2i16 (EXTRACT_SUBREG v4i16:$vec, sub0)) 1178>; 1179 1180def : Pat < 1181 (extract_subvector v4i16:$vec, (i32 2)), 1182 (v2i16 (EXTRACT_SUBREG v4i16:$vec, sub1)) 1183>; 1184 1185def : Pat < 1186 (extract_subvector v4f16:$vec, (i32 0)), 1187 (v2f16 (EXTRACT_SUBREG v4f16:$vec, sub0)) 1188>; 1189 1190def : Pat < 1191 (extract_subvector v4f16:$vec, (i32 2)), 1192 (v2f16 (EXTRACT_SUBREG v4f16:$vec, sub1)) 1193>; 1194 1195def : Pat < 1196 (extract_subvector v8i16:$vec, (i32 0)), 1197 (v4i16 (EXTRACT_SUBREG v8i16:$vec, sub0_sub1)) 1198>; 1199 1200def : Pat < 1201 (extract_subvector v8i16:$vec, (i32 4)), 1202 (v4i16 (EXTRACT_SUBREG v8i16:$vec, sub2_sub3)) 1203>; 1204 1205def : Pat < 1206 (extract_subvector v8f16:$vec, (i32 0)), 1207 (v4f16 (EXTRACT_SUBREG v8f16:$vec, sub0_sub1)) 1208>; 1209 1210def : Pat < 1211 (extract_subvector v8f16:$vec, (i32 4)), 1212 (v4f16 (EXTRACT_SUBREG v8f16:$vec, sub2_sub3)) 1213>; 1214 1215foreach Index = 0-31 in { 1216 def Extract_Element_v32i32_#Index : Extract_Element < 1217 i32, v32i32, Index, !cast<SubRegIndex>(sub#Index) 1218 >; 1219 1220 def Insert_Element_v32i32_#Index : Insert_Element < 1221 i32, v32i32, Index, !cast<SubRegIndex>(sub#Index) 1222 >; 1223 1224 def Extract_Element_v32f32_#Index : Extract_Element < 1225 f32, v32f32, Index, 
!cast<SubRegIndex>(sub#Index) 1226 >; 1227 1228 def Insert_Element_v32f32_#Index : Insert_Element < 1229 f32, v32f32, Index, !cast<SubRegIndex>(sub#Index) 1230 >; 1231} 1232 1233// FIXME: Why do only some of these type combinations for SReg and 1234// VReg? 1235// 16-bit bitcast 1236def : BitConvert <i16, f16, VGPR_32>; 1237def : BitConvert <f16, i16, VGPR_32>; 1238def : BitConvert <i16, f16, SReg_32>; 1239def : BitConvert <f16, i16, SReg_32>; 1240 1241// 32-bit bitcast 1242def : BitConvert <i32, f32, VGPR_32>; 1243def : BitConvert <f32, i32, VGPR_32>; 1244def : BitConvert <i32, f32, SReg_32>; 1245def : BitConvert <f32, i32, SReg_32>; 1246def : BitConvert <v2i16, i32, SReg_32>; 1247def : BitConvert <i32, v2i16, SReg_32>; 1248def : BitConvert <v2f16, i32, SReg_32>; 1249def : BitConvert <i32, v2f16, SReg_32>; 1250def : BitConvert <v2i16, v2f16, SReg_32>; 1251def : BitConvert <v2f16, v2i16, SReg_32>; 1252def : BitConvert <v2f16, f32, SReg_32>; 1253def : BitConvert <f32, v2f16, SReg_32>; 1254def : BitConvert <v2i16, f32, SReg_32>; 1255def : BitConvert <f32, v2i16, SReg_32>; 1256 1257// 64-bit bitcast 1258def : BitConvert <i64, f64, VReg_64>; 1259def : BitConvert <f64, i64, VReg_64>; 1260def : BitConvert <v2i32, v2f32, VReg_64>; 1261def : BitConvert <v2f32, v2i32, VReg_64>; 1262def : BitConvert <i64, v2i32, VReg_64>; 1263def : BitConvert <v2i32, i64, VReg_64>; 1264def : BitConvert <i64, v2f32, VReg_64>; 1265def : BitConvert <v2f32, i64, VReg_64>; 1266def : BitConvert <f64, v2f32, VReg_64>; 1267def : BitConvert <v2f32, f64, VReg_64>; 1268def : BitConvert <f64, v2i32, VReg_64>; 1269def : BitConvert <v2i32, f64, VReg_64>; 1270def : BitConvert <v4i16, v4f16, VReg_64>; 1271def : BitConvert <v4f16, v4i16, VReg_64>; 1272 1273// FIXME: Make SGPR 1274def : BitConvert <v2i32, v4f16, VReg_64>; 1275def : BitConvert <v4f16, v2i32, VReg_64>; 1276def : BitConvert <v2i32, v4f16, VReg_64>; 1277def : BitConvert <v2i32, v4i16, VReg_64>; 1278def : BitConvert <v4i16, v2i32, VReg_64>; 1279def : BitConvert <v2f32, v4f16, VReg_64>; 1280def : BitConvert <v4f16, v2f32, VReg_64>; 1281def : BitConvert <v2f32, v4i16, VReg_64>; 1282def : BitConvert <v4i16, v2f32, VReg_64>; 1283def : BitConvert <v4i16, f64, VReg_64>; 1284def : BitConvert <v4f16, f64, VReg_64>; 1285def : BitConvert <f64, v4i16, VReg_64>; 1286def : BitConvert <f64, v4f16, VReg_64>; 1287def : BitConvert <v4i16, i64, VReg_64>; 1288def : BitConvert <v4f16, i64, VReg_64>; 1289def : BitConvert <i64, v4i16, VReg_64>; 1290def : BitConvert <i64, v4f16, VReg_64>; 1291 1292def : BitConvert <v4i32, v4f32, VReg_128>; 1293def : BitConvert <v4f32, v4i32, VReg_128>; 1294 1295// 96-bit bitcast 1296def : BitConvert <v3i32, v3f32, SGPR_96>; 1297def : BitConvert <v3f32, v3i32, SGPR_96>; 1298 1299// 128-bit bitcast 1300def : BitConvert <v2i64, v4i32, SReg_128>; 1301def : BitConvert <v4i32, v2i64, SReg_128>; 1302def : BitConvert <v2f64, v4f32, VReg_128>; 1303def : BitConvert <v2f64, v4i32, VReg_128>; 1304def : BitConvert <v4f32, v2f64, VReg_128>; 1305def : BitConvert <v4i32, v2f64, VReg_128>; 1306def : BitConvert <v2i64, v2f64, VReg_128>; 1307def : BitConvert <v2f64, v2i64, VReg_128>; 1308def : BitConvert <v4f32, v2i64, VReg_128>; 1309def : BitConvert <v2i64, v4f32, VReg_128>; 1310def : BitConvert <v8i16, v4i32, SReg_128>; 1311def : BitConvert <v4i32, v8i16, SReg_128>; 1312def : BitConvert <v8f16, v4f32, VReg_128>; 1313def : BitConvert <v8f16, v4i32, VReg_128>; 1314def : BitConvert <v4f32, v8f16, VReg_128>; 1315def : BitConvert <v4i32, v8f16, VReg_128>; 1316def : BitConvert 
<v8i16, v8f16, VReg_128>; 1317def : BitConvert <v8f16, v8i16, VReg_128>; 1318def : BitConvert <v4f32, v8i16, VReg_128>; 1319def : BitConvert <v8i16, v4f32, VReg_128>; 1320def : BitConvert <v8i16, v8f16, SReg_128>; 1321def : BitConvert <v8i16, v2i64, SReg_128>; 1322def : BitConvert <v8i16, v2f64, SReg_128>; 1323def : BitConvert <v8f16, v2i64, SReg_128>; 1324def : BitConvert <v8f16, v2f64, SReg_128>; 1325def : BitConvert <v8f16, v8i16, SReg_128>; 1326def : BitConvert <v2i64, v8i16, SReg_128>; 1327def : BitConvert <v2f64, v8i16, SReg_128>; 1328def : BitConvert <v2i64, v8f16, SReg_128>; 1329def : BitConvert <v2f64, v8f16, SReg_128>; 1330 1331// 160-bit bitcast 1332def : BitConvert <v5i32, v5f32, SReg_160>; 1333def : BitConvert <v5f32, v5i32, SReg_160>; 1334def : BitConvert <v5i32, v5f32, VReg_160>; 1335def : BitConvert <v5f32, v5i32, VReg_160>; 1336 1337// 192-bit bitcast 1338def : BitConvert <v6i32, v6f32, SReg_192>; 1339def : BitConvert <v6f32, v6i32, SReg_192>; 1340def : BitConvert <v6i32, v6f32, VReg_192>; 1341def : BitConvert <v6f32, v6i32, VReg_192>; 1342def : BitConvert <v3i64, v3f64, VReg_192>; 1343def : BitConvert <v3f64, v3i64, VReg_192>; 1344def : BitConvert <v3i64, v6i32, VReg_192>; 1345def : BitConvert <v3i64, v6f32, VReg_192>; 1346def : BitConvert <v3f64, v6i32, VReg_192>; 1347def : BitConvert <v3f64, v6f32, VReg_192>; 1348def : BitConvert <v6i32, v3i64, VReg_192>; 1349def : BitConvert <v6f32, v3i64, VReg_192>; 1350def : BitConvert <v6i32, v3f64, VReg_192>; 1351def : BitConvert <v6f32, v3f64, VReg_192>; 1352 1353// 224-bit bitcast 1354def : BitConvert <v7i32, v7f32, SReg_224>; 1355def : BitConvert <v7f32, v7i32, SReg_224>; 1356def : BitConvert <v7i32, v7f32, VReg_224>; 1357def : BitConvert <v7f32, v7i32, VReg_224>; 1358 1359// 256-bit bitcast 1360def : BitConvert <v8i32, v8f32, SReg_256>; 1361def : BitConvert <v8f32, v8i32, SReg_256>; 1362def : BitConvert <v8i32, v8f32, VReg_256>; 1363def : BitConvert <v8f32, v8i32, VReg_256>; 1364def : BitConvert <v4i64, v4f64, VReg_256>; 1365def : BitConvert <v4f64, v4i64, VReg_256>; 1366def : BitConvert <v4i64, v8i32, VReg_256>; 1367def : BitConvert <v4i64, v8f32, VReg_256>; 1368def : BitConvert <v4f64, v8i32, VReg_256>; 1369def : BitConvert <v4f64, v8f32, VReg_256>; 1370def : BitConvert <v8i32, v4i64, VReg_256>; 1371def : BitConvert <v8f32, v4i64, VReg_256>; 1372def : BitConvert <v8i32, v4f64, VReg_256>; 1373def : BitConvert <v8f32, v4f64, VReg_256>; 1374 1375 1376// 512-bit bitcast 1377def : BitConvert <v16i32, v16f32, VReg_512>; 1378def : BitConvert <v16f32, v16i32, VReg_512>; 1379def : BitConvert <v8i64, v8f64, VReg_512>; 1380def : BitConvert <v8f64, v8i64, VReg_512>; 1381def : BitConvert <v8i64, v16i32, VReg_512>; 1382def : BitConvert <v8f64, v16i32, VReg_512>; 1383def : BitConvert <v16i32, v8i64, VReg_512>; 1384def : BitConvert <v16i32, v8f64, VReg_512>; 1385def : BitConvert <v8i64, v16f32, VReg_512>; 1386def : BitConvert <v8f64, v16f32, VReg_512>; 1387def : BitConvert <v16f32, v8i64, VReg_512>; 1388def : BitConvert <v16f32, v8f64, VReg_512>; 1389 1390// 1024-bit bitcast 1391def : BitConvert <v32i32, v32f32, VReg_1024>; 1392def : BitConvert <v32f32, v32i32, VReg_1024>; 1393def : BitConvert <v16i64, v16f64, VReg_1024>; 1394def : BitConvert <v16f64, v16i64, VReg_1024>; 1395def : BitConvert <v16i64, v32i32, VReg_1024>; 1396def : BitConvert <v32i32, v16i64, VReg_1024>; 1397def : BitConvert <v16f64, v32f32, VReg_1024>; 1398def : BitConvert <v32f32, v16f64, VReg_1024>; 1399def : BitConvert <v16i64, v32f32, VReg_1024>; 1400def : BitConvert 
<v32i32, v16f64, VReg_1024>; 1401def : BitConvert <v16f64, v32i32, VReg_1024>; 1402def : BitConvert <v32f32, v16i64, VReg_1024>; 1403 1404 1405/********** =================== **********/ 1406/********** Src & Dst modifiers **********/ 1407/********** =================== **********/ 1408 1409 1410// If denormals are not enabled, it only impacts the compare of the 1411// inputs. The output result is not flushed. 1412class ClampPat<Instruction inst, ValueType vt> : GCNPat < 1413 (vt (AMDGPUclamp (VOP3Mods vt:$src0, i32:$src0_modifiers))), 1414 (inst i32:$src0_modifiers, vt:$src0, 1415 i32:$src0_modifiers, vt:$src0, DSTCLAMP.ENABLE, DSTOMOD.NONE) 1416>; 1417 1418def : ClampPat<V_MAX_F32_e64, f32>; 1419def : ClampPat<V_MAX_F64_e64, f64>; 1420def : ClampPat<V_MAX_F16_e64, f16>; 1421 1422let SubtargetPredicate = HasVOP3PInsts in { 1423def : GCNPat < 1424 (v2f16 (AMDGPUclamp (VOP3PMods v2f16:$src0, i32:$src0_modifiers))), 1425 (V_PK_MAX_F16 $src0_modifiers, $src0, 1426 $src0_modifiers, $src0, DSTCLAMP.ENABLE) 1427>; 1428} 1429 1430 1431/********** ================================ **********/ 1432/********** Floating point absolute/negative **********/ 1433/********** ================================ **********/ 1434 1435def : GCNPat < 1436 (UniformUnaryFrag<fneg> (fabs (f32 SReg_32:$src))), 1437 (S_OR_B32 SReg_32:$src, (S_MOV_B32 (i32 0x80000000))) // Set sign bit 1438>; 1439 1440def : GCNPat < 1441 (UniformUnaryFrag<fabs> (f32 SReg_32:$src)), 1442 (S_AND_B32 SReg_32:$src, (S_MOV_B32 (i32 0x7fffffff))) 1443>; 1444 1445def : GCNPat < 1446 (UniformUnaryFrag<fneg> (f32 SReg_32:$src)), 1447 (S_XOR_B32 SReg_32:$src, (S_MOV_B32 (i32 0x80000000))) 1448>; 1449 1450def : GCNPat < 1451 (UniformUnaryFrag<fneg> (f16 SReg_32:$src)), 1452 (S_XOR_B32 SReg_32:$src, (S_MOV_B32 (i32 0x00008000))) 1453>; 1454 1455def : GCNPat < 1456 (UniformUnaryFrag<fabs> (f16 SReg_32:$src)), 1457 (S_AND_B32 SReg_32:$src, (S_MOV_B32 (i32 0x00007fff))) 1458>; 1459 1460def : GCNPat < 1461 (UniformUnaryFrag<fneg> (fabs (f16 SReg_32:$src))), 1462 (S_OR_B32 SReg_32:$src, (S_MOV_B32 (i32 0x00008000))) // Set sign bit 1463>; 1464 1465def : GCNPat < 1466 (UniformUnaryFrag<fneg> (v2f16 SReg_32:$src)), 1467 (S_XOR_B32 SReg_32:$src, (S_MOV_B32 (i32 0x80008000))) 1468>; 1469 1470def : GCNPat < 1471 (UniformUnaryFrag<fabs> (v2f16 SReg_32:$src)), 1472 (S_AND_B32 SReg_32:$src, (S_MOV_B32 (i32 0x7fff7fff))) 1473>; 1474 1475// This is really (fneg (fabs v2f16:$src)) 1476// 1477// fabs is not reported as free because there is modifier for it in 1478// VOP3P instructions, so it is turned into the bit op. 1479def : GCNPat < 1480 (UniformUnaryFrag<fneg> (v2f16 (bitconvert (and_oneuse (i32 SReg_32:$src), 0x7fff7fff)))), 1481 (S_OR_B32 SReg_32:$src, (S_MOV_B32 (i32 0x80008000))) // Set sign bit 1482>; 1483 1484def : GCNPat < 1485 (UniformUnaryFrag<fneg> (v2f16 (fabs SReg_32:$src))), 1486 (S_OR_B32 SReg_32:$src, (S_MOV_B32 (i32 0x80008000))) // Set sign bit 1487>; 1488 1489 1490// COPY_TO_REGCLASS is needed to avoid using SCC from S_XOR_B32 instead 1491// of the real value. 
1492def : GCNPat < 1493 (UniformUnaryFrag<fneg> (v2f32 SReg_64:$src)), 1494 (v2f32 (REG_SEQUENCE SReg_64, 1495 (f32 (COPY_TO_REGCLASS (S_XOR_B32 (i32 (EXTRACT_SUBREG $src, sub0)), 1496 (i32 (S_MOV_B32 (i32 0x80000000)))), 1497 SReg_32)), sub0, 1498 (f32 (COPY_TO_REGCLASS (S_XOR_B32 (i32 (EXTRACT_SUBREG $src, sub1)), 1499 (i32 (S_MOV_B32 (i32 0x80000000)))), 1500 SReg_32)), sub1)) 1501>; 1502 1503def : GCNPat < 1504 (UniformUnaryFrag<fabs> (v2f32 SReg_64:$src)), 1505 (v2f32 (REG_SEQUENCE SReg_64, 1506 (f32 (COPY_TO_REGCLASS (S_AND_B32 (i32 (EXTRACT_SUBREG $src, sub0)), 1507 (i32 (S_MOV_B32 (i32 0x7fffffff)))), 1508 SReg_32)), sub0, 1509 (f32 (COPY_TO_REGCLASS (S_AND_B32 (i32 (EXTRACT_SUBREG $src, sub1)), 1510 (i32 (S_MOV_B32 (i32 0x7fffffff)))), 1511 SReg_32)), sub1)) 1512>; 1513 1514def : GCNPat < 1515 (UniformUnaryFrag<fneg> (fabs (v2f32 SReg_64:$src))), 1516 (v2f32 (REG_SEQUENCE SReg_64, 1517 (f32 (COPY_TO_REGCLASS (S_OR_B32 (i32 (EXTRACT_SUBREG $src, sub0)), 1518 (i32 (S_MOV_B32 (i32 0x80000000)))), 1519 SReg_32)), sub0, 1520 (f32 (COPY_TO_REGCLASS (S_OR_B32 (i32 (EXTRACT_SUBREG $src, sub1)), 1521 (i32 (S_MOV_B32 (i32 0x80000000)))), 1522 SReg_32)), sub1)) 1523>; 1524 1525// FIXME: Use S_BITSET0_B32/B64? 1526def : GCNPat < 1527 (UniformUnaryFrag<fabs> (f64 SReg_64:$src)), 1528 (REG_SEQUENCE SReg_64, 1529 (i32 (EXTRACT_SUBREG SReg_64:$src, sub0)), 1530 sub0, 1531 (i32 (COPY_TO_REGCLASS (S_AND_B32 (i32 (EXTRACT_SUBREG SReg_64:$src, sub1)), 1532 (S_MOV_B32 (i32 0x7fffffff))), SReg_32)), // Set sign bit. 1533 sub1) 1534>; 1535 1536def : GCNPat < 1537 (UniformUnaryFrag<fneg> (f64 SReg_64:$src)), 1538 (REG_SEQUENCE SReg_64, 1539 (i32 (EXTRACT_SUBREG SReg_64:$src, sub0)), 1540 sub0, 1541 (i32 (COPY_TO_REGCLASS (S_XOR_B32 (i32 (EXTRACT_SUBREG SReg_64:$src, sub1)), 1542 (i32 (S_MOV_B32 (i32 0x80000000)))), SReg_32)), 1543 sub1) 1544>; 1545 1546def : GCNPat < 1547 (UniformUnaryFrag<fneg> (fabs (f64 SReg_64:$src))), 1548 (REG_SEQUENCE SReg_64, 1549 (i32 (EXTRACT_SUBREG SReg_64:$src, sub0)), 1550 sub0, 1551 (i32 (COPY_TO_REGCLASS (S_OR_B32 (i32 (EXTRACT_SUBREG SReg_64:$src, sub1)), 1552 (S_MOV_B32 (i32 0x80000000))), SReg_32)),// Set sign bit. 
1553 sub1) 1554>; 1555 1556 1557def : GCNPat < 1558 (fneg (fabs (f32 VGPR_32:$src))), 1559 (V_OR_B32_e64 (S_MOV_B32 (i32 0x80000000)), VGPR_32:$src) // Set sign bit 1560>; 1561 1562def : GCNPat < 1563 (fabs (f32 VGPR_32:$src)), 1564 (V_AND_B32_e64 (S_MOV_B32 (i32 0x7fffffff)), VGPR_32:$src) 1565>; 1566 1567def : GCNPat < 1568 (fneg (f32 VGPR_32:$src)), 1569 (V_XOR_B32_e64 (S_MOV_B32 (i32 0x80000000)), VGPR_32:$src) 1570>; 1571 1572def : GCNPat < 1573 (fabs (f16 VGPR_32:$src)), 1574 (V_AND_B32_e64 (S_MOV_B32 (i32 0x00007fff)), VGPR_32:$src) 1575>; 1576 1577def : GCNPat < 1578 (fneg (f16 VGPR_32:$src)), 1579 (V_XOR_B32_e64 (S_MOV_B32 (i32 0x00008000)), VGPR_32:$src) 1580>; 1581 1582def : GCNPat < 1583 (fneg (fabs (f16 VGPR_32:$src))), 1584 (V_OR_B32_e64 (S_MOV_B32 (i32 0x00008000)), VGPR_32:$src) // Set sign bit 1585>; 1586 1587def : GCNPat < 1588 (fneg (v2f16 VGPR_32:$src)), 1589 (V_XOR_B32_e64 (S_MOV_B32 (i32 0x80008000)), VGPR_32:$src) 1590>; 1591 1592def : GCNPat < 1593 (fabs (v2f16 VGPR_32:$src)), 1594 (V_AND_B32_e64 (S_MOV_B32 (i32 0x7fff7fff)), VGPR_32:$src) 1595>; 1596 1597def : GCNPat < 1598 (fneg (v2f16 (fabs VGPR_32:$src))), 1599 (V_OR_B32_e64 (S_MOV_B32 (i32 0x80008000)), VGPR_32:$src) 1600>; 1601 1602def : GCNPat < 1603 (fabs (f64 VReg_64:$src)), 1604 (REG_SEQUENCE VReg_64, 1605 (i32 (EXTRACT_SUBREG VReg_64:$src, sub0)), 1606 sub0, 1607 (V_AND_B32_e64 (i32 (S_MOV_B32 (i32 0x7fffffff))), 1608 (i32 (EXTRACT_SUBREG VReg_64:$src, sub1))), 1609 sub1) 1610>; 1611 1612def : GCNPat < 1613 (fneg (f64 VReg_64:$src)), 1614 (REG_SEQUENCE VReg_64, 1615 (i32 (EXTRACT_SUBREG VReg_64:$src, sub0)), 1616 sub0, 1617 (V_XOR_B32_e64 (i32 (S_MOV_B32 (i32 0x80000000))), 1618 (i32 (EXTRACT_SUBREG VReg_64:$src, sub1))), 1619 sub1) 1620>; 1621 1622def : GCNPat < 1623 (fneg (fabs (f64 VReg_64:$src))), 1624 (REG_SEQUENCE VReg_64, 1625 (i32 (EXTRACT_SUBREG VReg_64:$src, sub0)), 1626 sub0, 1627 (V_OR_B32_e64 (i32 (S_MOV_B32 (i32 0x80000000))), 1628 (i32 (EXTRACT_SUBREG VReg_64:$src, sub1))), 1629 sub1) 1630>; 1631 1632def : GCNPat < 1633 (getDivergentFrag<fneg>.ret (v2f32 VReg_64:$src)), 1634 (V_PK_ADD_F32 11 /* OP_SEL_1 | NEG_LO | HEG_HI */, VReg_64:$src, 1635 11 /* OP_SEL_1 | NEG_LO | HEG_HI */, 0, 1636 0, 0, 0, 0, 0) 1637> { 1638 let SubtargetPredicate = HasPackedFP32Ops; 1639} 1640 1641def : GCNPat < 1642 (fcopysign f16:$src0, f16:$src1), 1643 (V_BFI_B32_e64 (S_MOV_B32 (i32 0x00007fff)), $src0, $src1) 1644>; 1645 1646def : GCNPat < 1647 (fcopysign f32:$src0, f16:$src1), 1648 (V_BFI_B32_e64 (S_MOV_B32 (i32 0x7fffffff)), $src0, 1649 (V_LSHLREV_B32_e64 (i32 16), $src1)) 1650>; 1651 1652def : GCNPat < 1653 (fcopysign f64:$src0, f16:$src1), 1654 (REG_SEQUENCE SReg_64, 1655 (i32 (EXTRACT_SUBREG $src0, sub0)), sub0, 1656 (V_BFI_B32_e64 (S_MOV_B32 (i32 0x7fffffff)), (i32 (EXTRACT_SUBREG $src0, sub1)), 1657 (V_LSHLREV_B32_e64 (i32 16), $src1)), sub1) 1658>; 1659 1660def : GCNPat < 1661 (fcopysign f16:$src0, f32:$src1), 1662 (V_BFI_B32_e64 (S_MOV_B32 (i32 0x00007fff)), $src0, 1663 (V_LSHRREV_B32_e64 (i32 16), $src1)) 1664>; 1665 1666def : GCNPat < 1667 (fcopysign f16:$src0, f64:$src1), 1668 (V_BFI_B32_e64 (S_MOV_B32 (i32 0x00007fff)), $src0, 1669 (V_LSHRREV_B32_e64 (i32 16), (EXTRACT_SUBREG $src1, sub1))) 1670>; 1671 1672/********** ================== **********/ 1673/********** Immediate Patterns **********/ 1674/********** ================== **********/ 1675 1676def : GCNPat < 1677 (VGPRImm<(i32 imm)>:$imm), 1678 (V_MOV_B32_e32 imm:$imm) 1679>; 1680 1681def : GCNPat < 1682 (VGPRImm<(f32 fpimm)>:$imm), 1683 
  (V_MOV_B32_e32 (f32 (bitcast_fpimm_to_i32 $imm)))
>;

def : GCNPat <
  (i32 imm:$imm),
  (S_MOV_B32 imm:$imm)
>;

def : GCNPat <
  (VGPRImm<(SIlds tglobaladdr:$ga)>),
  (V_MOV_B32_e32 $ga)
>;

def : GCNPat <
  (SIlds tglobaladdr:$ga),
  (S_MOV_B32 $ga)
>;

// FIXME: Workaround for ordering issue with peephole optimizer where
// a register class copy interferes with immediate folding. Should
// use s_mov_b32, which can be shrunk to s_movk_i32
def : GCNPat <
  (VGPRImm<(f16 fpimm)>:$imm),
  (V_MOV_B32_e32 (f16 (bitcast_fpimm_to_i32 $imm)))
>;

def : GCNPat <
  (f32 fpimm:$imm),
  (S_MOV_B32 (f32 (bitcast_fpimm_to_i32 $imm)))
>;

def : GCNPat <
  (f16 fpimm:$imm),
  (S_MOV_B32 (i32 (bitcast_fpimm_to_i32 $imm)))
>;

def : GCNPat <
  (p5 frameindex:$fi),
  (V_MOV_B32_e32 (p5 (frameindex_to_targetframeindex $fi)))
>;

def : GCNPat <
  (p5 frameindex:$fi),
  (S_MOV_B32 (p5 (frameindex_to_targetframeindex $fi)))
>;

def : GCNPat <
  (i64 InlineImm64:$imm),
  (S_MOV_B64 InlineImm64:$imm)
>;

// XXX - Should this use a s_cmp to set SCC?

// Set to sign-extended 64-bit value (true = -1, false = 0)
def : GCNPat <
  (i1 imm:$imm),
  (S_MOV_B64 (i64 (as_i64imm $imm)))
> {
  let WaveSizePredicate = isWave64;
}

def : GCNPat <
  (i1 imm:$imm),
  (S_MOV_B32 (i32 (as_i32imm $imm)))
> {
  let WaveSizePredicate = isWave32;
}

def : GCNPat <
  (f64 InlineImmFP64:$imm),
  (S_MOV_B64 (f64 (bitcast_fpimm_to_i64 InlineImmFP64:$imm)))
>;

/********** ================== **********/
/********** Intrinsic Patterns **********/
/********** ================== **********/

def : GCNPat <
  (f32 (fpow (VOP3Mods f32:$src0, i32:$src0_mods), (VOP3Mods f32:$src1, i32:$src1_mods))),
  (V_EXP_F32_e64 SRCMODS.NONE, (V_MUL_LEGACY_F32_e64 $src1_mods, $src1, SRCMODS.NONE, (V_LOG_F32_e64 $src0_mods, $src0), 0, 0))
>;

def : GCNPat <
  (i32 (sext i1:$src0)),
  (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
                     /*src1mod*/(i32 0), /*src1*/(i32 -1), $src0)
>;

class Ext32Pat <SDNode ext> : GCNPat <
  (i32 (ext i1:$src0)),
  (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
                     /*src1mod*/(i32 0), /*src1*/(i32 1), $src0)
>;

def : Ext32Pat <zext>;
def : Ext32Pat <anyext>;

// The multiplication scales from [0,1) to the unsigned integer range,
// rounding down a bit to avoid unwanted overflow.
def : GCNPat <
  (AMDGPUurecip i32:$src0),
  (V_CVT_U32_F32_e32
    (V_MUL_F32_e32 (i32 CONST.FP_4294966784),
                   (V_RCP_IFLAG_F32_e32 (V_CVT_F32_U32_e32 $src0))))
>;
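
// For reference, CONST.FP_4294966784 above is 2^32 - 512 encoded as an f32.
// Because it sits slightly below 2^32, scaling the approximate reciprocal
// (which is at most 1.0 for any non-zero operand) by it keeps the product
// below 2^32, so the final V_CVT_U32_F32 cannot overflow.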

//===----------------------------------------------------------------------===//
// VOP3 Patterns
//===----------------------------------------------------------------------===//

def : IMad24Pat<V_MAD_I32_I24_e64, 1>;
def : UMad24Pat<V_MAD_U32_U24_e64, 1>;

// BFI patterns

def BFIImm32 : PatFrag<
  (ops node:$x, node:$y, node:$z),
  (i32 (DivergentBinFrag<or> (and node:$y, node:$x), (and node:$z, imm))),
  [{
    auto *X = dyn_cast<ConstantSDNode>(N->getOperand(0)->getOperand(1));
    auto *NotX = dyn_cast<ConstantSDNode>(N->getOperand(1)->getOperand(1));
    return X && NotX &&
           ~(unsigned)X->getZExtValue() == (unsigned)NotX->getZExtValue();
  }]
>;

// Definition from ISA doc:
// (y & x) | (z & ~x)
def : AMDGPUPat <
  (DivergentBinFrag<or> (and i32:$y, i32:$x), (and i32:$z, (not i32:$x))),
  (V_BFI_B32_e64 VSrc_b32:$x, VSrc_b32:$y, VSrc_b32:$z)
>;

// (y & C) | (z & ~C)
def : AMDGPUPat <
  (BFIImm32 i32:$x, i32:$y, i32:$z),
  (V_BFI_B32_e64 VSrc_b32:$x, VSrc_b32:$y, VSrc_b32:$z)
>;

// 64-bit version
def : AMDGPUPat <
  (DivergentBinFrag<or> (and i64:$y, i64:$x), (and i64:$z, (not i64:$x))),
  (REG_SEQUENCE VReg_64,
    (V_BFI_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$x, sub0)),
                   (i32 (EXTRACT_SUBREG VReg_64:$y, sub0)),
                   (i32 (EXTRACT_SUBREG VReg_64:$z, sub0))), sub0,
    (V_BFI_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$x, sub1)),
                   (i32 (EXTRACT_SUBREG VReg_64:$y, sub1)),
                   (i32 (EXTRACT_SUBREG VReg_64:$z, sub1))), sub1)
>;

// SHA-256 Ch function
// z ^ (x & (y ^ z))
def : AMDGPUPat <
  (DivergentBinFrag<xor> i32:$z, (and i32:$x, (xor i32:$y, i32:$z))),
  (V_BFI_B32_e64 VSrc_b32:$x, VSrc_b32:$y, VSrc_b32:$z)
>;

// 64-bit version
def : AMDGPUPat <
  (DivergentBinFrag<xor> i64:$z, (and i64:$x, (xor i64:$y, i64:$z))),
  (REG_SEQUENCE VReg_64,
    (V_BFI_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$x, sub0)),
                   (i32 (EXTRACT_SUBREG VReg_64:$y, sub0)),
                   (i32 (EXTRACT_SUBREG VReg_64:$z, sub0))), sub0,
    (V_BFI_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$x, sub1)),
                   (i32 (EXTRACT_SUBREG VReg_64:$y, sub1)),
                   (i32 (EXTRACT_SUBREG VReg_64:$z, sub1))), sub1)
>;

def : AMDGPUPat <
  (fcopysign f32:$src0, f32:$src1),
  (V_BFI_B32_e64 (S_MOV_B32 (i32 0x7fffffff)), $src0, $src1)
>;

def : AMDGPUPat <
  (fcopysign f32:$src0, f64:$src1),
  (V_BFI_B32_e64 (S_MOV_B32 (i32 0x7fffffff)), $src0,
                 (i32 (EXTRACT_SUBREG SReg_64:$src1, sub1)))
>;

def : AMDGPUPat <
  (fcopysign f64:$src0, f64:$src1),
  (REG_SEQUENCE SReg_64,
    (i32 (EXTRACT_SUBREG $src0, sub0)), sub0,
    (V_BFI_B32_e64 (S_MOV_B32 (i32 0x7fffffff)),
                   (i32 (EXTRACT_SUBREG SReg_64:$src0, sub1)),
                   (i32 (EXTRACT_SUBREG SReg_64:$src1, sub1))), sub1)
>;

def : AMDGPUPat <
  (fcopysign f64:$src0, f32:$src1),
  (REG_SEQUENCE SReg_64,
    (i32 (EXTRACT_SUBREG $src0, sub0)), sub0,
    (V_BFI_B32_e64 (S_MOV_B32 (i32 0x7fffffff)),
                   (i32 (EXTRACT_SUBREG SReg_64:$src0, sub1)),
                   $src1), sub1)
>;

def : ROTRPattern <V_ALIGNBIT_B32_e64>;

def : GCNPat<(i32 (trunc (srl i64:$src0, (and i32:$src1, (i32 31))))),
          (V_ALIGNBIT_B32_e64 (i32 (EXTRACT_SUBREG (i64 $src0),
sub1)), 1886 (i32 (EXTRACT_SUBREG (i64 $src0), sub0)), $src1)>; 1887 1888def : GCNPat<(i32 (trunc (srl i64:$src0, (i32 ShiftAmt32Imm:$src1)))), 1889 (V_ALIGNBIT_B32_e64 (i32 (EXTRACT_SUBREG (i64 $src0), sub1)), 1890 (i32 (EXTRACT_SUBREG (i64 $src0), sub0)), $src1)>; 1891 1892/********** ====================== **********/ 1893/********** Indirect addressing **********/ 1894/********** ====================== **********/ 1895 1896multiclass SI_INDIRECT_Pattern <ValueType vt, ValueType eltvt, string VecSize> { 1897 // Extract with offset 1898 def : GCNPat< 1899 (eltvt (extractelt vt:$src, (MOVRELOffset i32:$idx, (i32 imm:$offset)))), 1900 (!cast<Instruction>("SI_INDIRECT_SRC_"#VecSize) $src, $idx, imm:$offset) 1901 >; 1902 1903 // Insert with offset 1904 def : GCNPat< 1905 (insertelt vt:$src, eltvt:$val, (MOVRELOffset i32:$idx, (i32 imm:$offset))), 1906 (!cast<Instruction>("SI_INDIRECT_DST_"#VecSize) $src, $idx, imm:$offset, $val) 1907 >; 1908} 1909 1910defm : SI_INDIRECT_Pattern <v2f32, f32, "V2">; 1911defm : SI_INDIRECT_Pattern <v4f32, f32, "V4">; 1912defm : SI_INDIRECT_Pattern <v8f32, f32, "V8">; 1913defm : SI_INDIRECT_Pattern <v16f32, f32, "V16">; 1914defm : SI_INDIRECT_Pattern <v32f32, f32, "V32">; 1915 1916defm : SI_INDIRECT_Pattern <v2i32, i32, "V2">; 1917defm : SI_INDIRECT_Pattern <v4i32, i32, "V4">; 1918defm : SI_INDIRECT_Pattern <v8i32, i32, "V8">; 1919defm : SI_INDIRECT_Pattern <v16i32, i32, "V16">; 1920defm : SI_INDIRECT_Pattern <v32i32, i32, "V32">; 1921 1922//===----------------------------------------------------------------------===// 1923// SAD Patterns 1924//===----------------------------------------------------------------------===// 1925 1926def : GCNPat < 1927 (add (sub_oneuse (umax i32:$src0, i32:$src1), 1928 (umin i32:$src0, i32:$src1)), 1929 i32:$src2), 1930 (V_SAD_U32_e64 $src0, $src1, $src2, (i1 0)) 1931>; 1932 1933def : GCNPat < 1934 (add (select_oneuse (i1 (setugt i32:$src0, i32:$src1)), 1935 (sub i32:$src0, i32:$src1), 1936 (sub i32:$src1, i32:$src0)), 1937 i32:$src2), 1938 (V_SAD_U32_e64 $src0, $src1, $src2, (i1 0)) 1939>; 1940 1941//===----------------------------------------------------------------------===// 1942// Conversion Patterns 1943//===----------------------------------------------------------------------===// 1944 1945class UniformSextInreg<ValueType VT> : PatFrag< 1946 (ops node:$src), 1947 (sext_inreg $src, VT), 1948 [{ return !N->isDivergent(); }]>; 1949 1950def : GCNPat<(i32 (UniformSextInreg<i1> i32:$src)), 1951 (S_BFE_I32 i32:$src, (i32 65536))>; // 0 | 1 << 16 1952 1953// Handle sext_inreg in i64 1954def : GCNPat < 1955 (i64 (UniformSextInreg<i1> i64:$src)), 1956 (S_BFE_I64 i64:$src, (i32 0x10000)) // 0 | 1 << 16 1957>; 1958 1959def : GCNPat < 1960 (i16 (UniformSextInreg<i1> i16:$src)), 1961 (S_BFE_I32 $src, (i32 0x00010000)) // 0 | 1 << 16 1962>; 1963 1964def : GCNPat < 1965 (i16 (UniformSextInreg<i8> i16:$src)), 1966 (S_BFE_I32 $src, (i32 0x80000)) // 0 | 8 << 16 1967>; 1968 1969def : GCNPat < 1970 (i64 (UniformSextInreg<i8> i64:$src)), 1971 (S_BFE_I64 i64:$src, (i32 0x80000)) // 0 | 8 << 16 1972>; 1973 1974def : GCNPat < 1975 (i64 (UniformSextInreg<i16> i64:$src)), 1976 (S_BFE_I64 i64:$src, (i32 0x100000)) // 0 | 16 << 16 1977>; 1978 1979def : GCNPat < 1980 (i64 (UniformSextInreg<i32> i64:$src)), 1981 (S_BFE_I64 i64:$src, (i32 0x200000)) // 0 | 32 << 16 1982>; 1983 1984 1985class DivergentSextInreg<ValueType VT> : PatFrag< 1986 (ops node:$src), 1987 (sext_inreg $src, VT), 1988 [{ return N->isDivergent(); }]>; 1989 1990def : GCNPat<(i32 
(DivergentSextInreg<i1> i32:$src)), 1991 (V_BFE_I32_e64 i32:$src, (i32 0), (i32 1))>; 1992 1993def : GCNPat < 1994 (i16 (DivergentSextInreg<i1> i16:$src)), 1995 (V_BFE_I32_e64 $src, (i32 0), (i32 1)) // 0 | 1 << 16 1996>; 1997 1998def : GCNPat < 1999 (i16 (DivergentSextInreg<i8> i16:$src)), 2000 (V_BFE_I32_e64 $src, (i32 0), (i32 8)) // 0 | 8 << 16 2001>; 2002 2003def : GCNPat < 2004 (i64 (DivergentSextInreg<i1> i64:$src)), 2005 (REG_SEQUENCE VReg_64, 2006 (V_BFE_I32_e64 (i32 (EXTRACT_SUBREG i64:$src, sub0)), (i32 0), (i32 1)), sub0, 2007 (V_ASHRREV_I32_e32 (i32 31), (V_BFE_I32_e64 (i32 (EXTRACT_SUBREG i64:$src, sub0)), (i32 0), (i32 1))), sub1) 2008>; 2009 2010def : GCNPat < 2011 (i64 (DivergentSextInreg<i8> i64:$src)), 2012 (REG_SEQUENCE VReg_64, 2013 (V_BFE_I32_e64 (i32 (EXTRACT_SUBREG i64:$src, sub0)), (i32 0), (i32 8)/* 0 | 8 << 16 */), sub0, 2014 (V_ASHRREV_I32_e32 (i32 31), (V_BFE_I32_e64 (i32 (EXTRACT_SUBREG i64:$src, sub0)), (i32 0), (i32 8))), sub1) 2015>; 2016 2017def : GCNPat < 2018 (i64 (DivergentSextInreg<i16> i64:$src)), 2019 (REG_SEQUENCE VReg_64, 2020 (V_BFE_I32_e64 (i32 (EXTRACT_SUBREG i64:$src, sub0)), (i32 0), (i32 16)/* 0 | 16 << 16 */), sub0, 2021 (V_ASHRREV_I32_e32 (i32 31), (V_BFE_I32_e64 (i32 (EXTRACT_SUBREG i64:$src, sub0)), (i32 0), (i32 16))), sub1) 2022>; 2023 2024def : GCNPat < 2025 (i64 (DivergentSextInreg<i32> i64:$src)), 2026 (REG_SEQUENCE VReg_64, 2027 (i32 (EXTRACT_SUBREG i64:$src, sub0)), sub0, 2028 (V_ASHRREV_I32_e32 (i32 31), (i32 (EXTRACT_SUBREG i64:$src, sub0))), sub1) 2029>; 2030 2031def : GCNPat < 2032 (i64 (zext i32:$src)), 2033 (REG_SEQUENCE SReg_64, $src, sub0, (S_MOV_B32 (i32 0)), sub1) 2034>; 2035 2036def : GCNPat < 2037 (i64 (anyext i32:$src)), 2038 (REG_SEQUENCE SReg_64, $src, sub0, (i32 (IMPLICIT_DEF)), sub1) 2039>; 2040 2041class ZExt_i64_i1_Pat <SDNode ext> : GCNPat < 2042 (i64 (ext i1:$src)), 2043 (REG_SEQUENCE VReg_64, 2044 (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0), 2045 /*src1mod*/(i32 0), /*src1*/(i32 1), $src), 2046 sub0, (S_MOV_B32 (i32 0)), sub1) 2047>; 2048 2049 2050def : ZExt_i64_i1_Pat<zext>; 2051def : ZExt_i64_i1_Pat<anyext>; 2052 2053// FIXME: We need to use COPY_TO_REGCLASS to work-around the fact that 2054// REG_SEQUENCE patterns don't support instructions with multiple outputs. 
2055def : GCNPat < 2056 (i64 (sext i32:$src)), 2057 (REG_SEQUENCE SReg_64, $src, sub0, 2058 (i32 (COPY_TO_REGCLASS (S_ASHR_I32 $src, (i32 31)), SReg_32_XM0)), sub1) 2059>; 2060 2061def : GCNPat < 2062 (i64 (sext i1:$src)), 2063 (REG_SEQUENCE VReg_64, 2064 (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0), 2065 /*src1mod*/(i32 0), /*src1*/(i32 -1), $src), sub0, 2066 (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0), 2067 /*src1mod*/(i32 0), /*src1*/(i32 -1), $src), sub1) 2068>; 2069 2070class FPToI1Pat<Instruction Inst, int KOne, ValueType kone_type, ValueType vt, SDPatternOperator fp_to_int> : GCNPat < 2071 (i1 (fp_to_int (vt (VOP3Mods vt:$src0, i32:$src0_modifiers)))), 2072 (i1 (Inst 0, (kone_type KOne), $src0_modifiers, $src0, DSTCLAMP.NONE)) 2073>; 2074 2075def : FPToI1Pat<V_CMP_EQ_F16_e64, CONST.FP16_ONE, i16, f16, fp_to_uint>; 2076def : FPToI1Pat<V_CMP_EQ_F16_e64, CONST.FP16_NEG_ONE, i16, f16, fp_to_sint>; 2077def : FPToI1Pat<V_CMP_EQ_F32_e64, CONST.FP32_ONE, i32, f32, fp_to_uint>; 2078def : FPToI1Pat<V_CMP_EQ_F32_e64, CONST.FP32_NEG_ONE, i32, f32, fp_to_sint>; 2079def : FPToI1Pat<V_CMP_EQ_F64_e64, CONST.FP64_ONE, i64, f64, fp_to_uint>; 2080def : FPToI1Pat<V_CMP_EQ_F64_e64, CONST.FP64_NEG_ONE, i64, f64, fp_to_sint>; 2081 2082// If we need to perform a logical operation on i1 values, we need to 2083// use vector comparisons since there is only one SCC register. Vector 2084// comparisons may write to a pair of SGPRs or a single SGPR, so treat 2085// these as 32 or 64-bit comparisons. When legalizing SGPR copies, 2086// instructions resulting in the copies from SCC to these instructions 2087// will be moved to the VALU. 2088 2089let WaveSizePredicate = isWave64 in { 2090def : GCNPat < 2091 (i1 (and i1:$src0, i1:$src1)), 2092 (S_AND_B64 $src0, $src1) 2093>; 2094 2095def : GCNPat < 2096 (i1 (or i1:$src0, i1:$src1)), 2097 (S_OR_B64 $src0, $src1) 2098>; 2099 2100def : GCNPat < 2101 (i1 (xor i1:$src0, i1:$src1)), 2102 (S_XOR_B64 $src0, $src1) 2103>; 2104 2105def : GCNPat < 2106 (i1 (add i1:$src0, i1:$src1)), 2107 (S_XOR_B64 $src0, $src1) 2108>; 2109 2110def : GCNPat < 2111 (i1 (sub i1:$src0, i1:$src1)), 2112 (S_XOR_B64 $src0, $src1) 2113>; 2114 2115let AddedComplexity = 1 in { 2116def : GCNPat < 2117 (i1 (add i1:$src0, (i1 -1))), 2118 (S_NOT_B64 $src0) 2119>; 2120 2121def : GCNPat < 2122 (i1 (sub i1:$src0, (i1 -1))), 2123 (S_NOT_B64 $src0) 2124>; 2125} 2126} // end isWave64 2127 2128let WaveSizePredicate = isWave32 in { 2129def : GCNPat < 2130 (i1 (and i1:$src0, i1:$src1)), 2131 (S_AND_B32 $src0, $src1) 2132>; 2133 2134def : GCNPat < 2135 (i1 (or i1:$src0, i1:$src1)), 2136 (S_OR_B32 $src0, $src1) 2137>; 2138 2139def : GCNPat < 2140 (i1 (xor i1:$src0, i1:$src1)), 2141 (S_XOR_B32 $src0, $src1) 2142>; 2143 2144def : GCNPat < 2145 (i1 (add i1:$src0, i1:$src1)), 2146 (S_XOR_B32 $src0, $src1) 2147>; 2148 2149def : GCNPat < 2150 (i1 (sub i1:$src0, i1:$src1)), 2151 (S_XOR_B32 $src0, $src1) 2152>; 2153 2154let AddedComplexity = 1 in { 2155def : GCNPat < 2156 (i1 (add i1:$src0, (i1 -1))), 2157 (S_NOT_B32 $src0) 2158>; 2159 2160def : GCNPat < 2161 (i1 (sub i1:$src0, (i1 -1))), 2162 (S_NOT_B32 $src0) 2163>; 2164} 2165} // end isWave32 2166 2167def : GCNPat < 2168 (i32 (DivergentBinFrag<xor> i32:$src0, (i32 -1))), 2169 (V_NOT_B32_e32 $src0) 2170>; 2171 2172def : GCNPat < 2173 (i64 (DivergentBinFrag<xor> i64:$src0, (i64 -1))), 2174 (REG_SEQUENCE VReg_64, 2175 (V_NOT_B32_e32 (i32 (EXTRACT_SUBREG i64:$src0, sub0))), sub0, 2176 (V_NOT_B32_e32 (i32 (EXTRACT_SUBREG i64:$src0, sub1))), sub1 2177 ) 
>;

def : GCNPat <
  (f16 (sint_to_fp i1:$src)),
  (V_CVT_F16_F32_e32 (
      V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
                        /*src1mod*/(i32 0), /*src1*/(i32 CONST.FP32_NEG_ONE),
                        SSrc_i1:$src))
>;

def : GCNPat <
  (f16 (uint_to_fp i1:$src)),
  (V_CVT_F16_F32_e32 (
      V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
                        /*src1mod*/(i32 0), /*src1*/(i32 CONST.FP32_ONE),
                        SSrc_i1:$src))
>;

def : GCNPat <
  (f32 (sint_to_fp i1:$src)),
  (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
                     /*src1mod*/(i32 0), /*src1*/(i32 CONST.FP32_NEG_ONE),
                     SSrc_i1:$src)
>;

def : GCNPat <
  (f32 (uint_to_fp i1:$src)),
  (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
                     /*src1mod*/(i32 0), /*src1*/(i32 CONST.FP32_ONE),
                     SSrc_i1:$src)
>;

def : GCNPat <
  (f64 (sint_to_fp i1:$src)),
  (V_CVT_F64_I32_e32 (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
                                        /*src1mod*/(i32 0), /*src1*/(i32 -1),
                                        SSrc_i1:$src))
>;

def : GCNPat <
  (f64 (uint_to_fp i1:$src)),
  (V_CVT_F64_U32_e32 (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
                                        /*src1mod*/(i32 0), /*src1*/(i32 1),
                                        SSrc_i1:$src))
>;

//===----------------------------------------------------------------------===//
// Miscellaneous Patterns
//===----------------------------------------------------------------------===//

// Eliminate a zero extension from an fp16 operation if it already
// zeros the high bits of the 32-bit register.
//
// This is complicated on gfx9+. Some instructions maintain the legacy
// zeroing behavior, but others preserve the high bits. Some have a
// control bit to change the behavior. We can't simply say with
// certainty what the source behavior is without more context on how
// the src is lowered. e.g. fptrunc + fma may be lowered to a
// v_fma_mix* instruction, which does not zero the high bits, or it
// may not be.
def : GCNPat<
  (i32 (zext (i16 (bitconvert fp16_zeros_high_16bits:$src)))),
  (COPY VSrc_b16:$src)>;

def : GCNPat <
  (i32 (trunc i64:$a)),
  (EXTRACT_SUBREG $a, sub0)
>;

def : GCNPat <
  (i1 (UniformUnaryFrag<trunc> i32:$a)),
  (S_CMP_EQ_U32 (S_AND_B32 (i32 1), $a), (i32 1))
>;

def : GCNPat <
  (i1 (UniformUnaryFrag<trunc> i16:$a)),
  (S_CMP_EQ_U32 (S_AND_B32 (i32 1), $a), (i32 1))
>;

def : GCNPat <
  (i1 (UniformUnaryFrag<trunc> i64:$a)),
  (S_CMP_EQ_U32 (S_AND_B32 (i32 1),
                 (i32 (EXTRACT_SUBREG $a, sub0))), (i32 1))
>;

def : GCNPat <
  (i1 (DivergentUnaryFrag<trunc> i32:$a)),
  (V_CMP_EQ_U32_e64 (V_AND_B32_e64 (i32 1), $a), (i32 1))
>;

def : GCNPat <
  (i1 (DivergentUnaryFrag<trunc> i16:$a)),
  (V_CMP_EQ_U32_e64 (V_AND_B32_e64 (i32 1), $a), (i32 1))
>;

def : GCNPat <
  (i1 (DivergentUnaryFrag<trunc> i64:$a)),
  (V_CMP_EQ_U32_e64 (V_AND_B32_e64 (i32 1),
                     (i32 (EXTRACT_SUBREG $a, sub0))), (i32 1))
>;

def : GCNPat <
  (i32 (bswap i32:$a)),
  (V_BFI_B32_e64 (S_MOV_B32 (i32 0x00ff00ff)),
                 (V_ALIGNBIT_B32_e64 VSrc_b32:$a, VSrc_b32:$a, (i32 24)),
                 (V_ALIGNBIT_B32_e64 VSrc_b32:$a, VSrc_b32:$a, (i32 8)))
>;
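
// A worked example of the BFI/ALIGNBIT expansion above, for $a = 0x12345678:
//   V_ALIGNBIT_B32(a, a, 24) = rotr(a, 24) = 0x34567812
//   V_ALIGNBIT_B32(a, a, 8)  = rotr(a, 8)  = 0x78123456
//   V_BFI_B32(0x00ff00ff, hi, lo) = (hi & 0x00ff00ff) | (lo & 0xff00ff00)
//                                 = 0x00560012 | 0x78003400 = 0x78563412
// which is bswap(0x12345678).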

// FIXME: This should have been narrowed to i32 during legalization.
// This pattern should also be skipped for GlobalISel
def : GCNPat <
  (i64 (bswap i64:$a)),
  (REG_SEQUENCE VReg_64,
  (V_BFI_B32_e64 (S_MOV_B32 (i32 0x00ff00ff)),
                 (V_ALIGNBIT_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$a, sub1)),
                                     (i32 (EXTRACT_SUBREG VReg_64:$a, sub1)),
                                     (i32 24)),
                 (V_ALIGNBIT_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$a, sub1)),
                                     (i32 (EXTRACT_SUBREG VReg_64:$a, sub1)),
                                     (i32 8))),
  sub0,
  (V_BFI_B32_e64 (S_MOV_B32 (i32 0x00ff00ff)),
                 (V_ALIGNBIT_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$a, sub0)),
                                     (i32 (EXTRACT_SUBREG VReg_64:$a, sub0)),
                                     (i32 24)),
                 (V_ALIGNBIT_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$a, sub0)),
                                     (i32 (EXTRACT_SUBREG VReg_64:$a, sub0)),
                                     (i32 8))),
  sub1)
>;

// FIXME: The AddedComplexity should not be needed, but in GlobalISel
// the BFI pattern ends up taking precedence without it.
let SubtargetPredicate = isGFX8Plus, AddedComplexity = 1 in {
// Magic number: 3 | (2 << 8) | (1 << 16) | (0 << 24)
//
// My reading of the manual suggests we should be using src0 for the
// register value, but this is what seems to work.
def : GCNPat <
  (i32 (bswap i32:$a)),
  (V_PERM_B32_e64 (i32 0), VSrc_b32:$a, (S_MOV_B32 (i32 0x00010203)))
>;

// FIXME: This should have been narrowed to i32 during legalization.
// This pattern should also be skipped for GlobalISel
def : GCNPat <
  (i64 (bswap i64:$a)),
  (REG_SEQUENCE VReg_64,
  (V_PERM_B32_e64 (i32 0), (EXTRACT_SUBREG VReg_64:$a, sub1),
                  (S_MOV_B32 (i32 0x00010203))),
  sub0,
  (V_PERM_B32_e64 (i32 0), (EXTRACT_SUBREG VReg_64:$a, sub0),
                  (S_MOV_B32 (i32 0x00010203))),
  sub1)
>;

// Magic number: 1 | (0 << 8) | (12 << 16) | (12 << 24)
// The 12s emit 0s.
def : GCNPat <
  (i16 (bswap i16:$a)),
  (V_PERM_B32_e64 (i32 0), VSrc_b32:$a, (S_MOV_B32 (i32 0x0c0c0001)))
>;

def : GCNPat <
  (i32 (zext (bswap i16:$a))),
  (V_PERM_B32_e64 (i32 0), VSrc_b32:$a, (S_MOV_B32 (i32 0x0c0c0001)))
>;

// Magic number: 1 | (0 << 8) | (3 << 16) | (2 << 24)
def : GCNPat <
  (v2i16 (bswap v2i16:$a)),
  (V_PERM_B32_e64 (i32 0), VSrc_b32:$a, (S_MOV_B32 (i32 0x02030001)))
>;

}
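
// For illustration, a V_PERM_B32 selector byte k names the source byte that
// lands in destination byte k (selector value 12 emits 0x00, per the comment
// above), so the magic numbers read:
//   0x00010203: dst bytes (0,1,2,3) <- $a bytes (3,2,1,0)            (32-bit bswap)
//   0x0c0c0001: dst bytes (0,1) <- $a bytes (1,0), bytes (2,3) <- 0x00
//   0x02030001: dst bytes (0,1) <- $a bytes (1,0), bytes (2,3) <- $a bytes (3,2)
// e.g. the first selector turns $a = 0x11223344 into 0x44332211.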

// Prefer selecting to max when legal, but using mul is always valid.
let AddedComplexity = -5 in {
def : GCNPat<
  (fcanonicalize (f16 (VOP3Mods f16:$src, i32:$src_mods))),
  (V_MUL_F16_e64 0, (i32 CONST.FP16_ONE), $src_mods, $src)
>;

def : GCNPat<
  (fcanonicalize (f16 (fneg (VOP3Mods f16:$src, i32:$src_mods)))),
  (V_MUL_F16_e64 0, (i32 CONST.FP16_NEG_ONE), $src_mods, $src)
>;

def : GCNPat<
  (fcanonicalize (v2f16 (VOP3PMods v2f16:$src, i32:$src_mods))),
  (V_PK_MUL_F16 0, (i32 CONST.FP16_ONE), $src_mods, $src, DSTCLAMP.NONE)
>;

def : GCNPat<
  (fcanonicalize (f32 (VOP3Mods f32:$src, i32:$src_mods))),
  (V_MUL_F32_e64 0, (i32 CONST.FP32_ONE), $src_mods, $src)
>;

def : GCNPat<
  (fcanonicalize (f32 (fneg (VOP3Mods f32:$src, i32:$src_mods)))),
  (V_MUL_F32_e64 0, (i32 CONST.FP32_NEG_ONE), $src_mods, $src)
>;

// TODO: Handle fneg like other types.
def : GCNPat<
  (fcanonicalize (f64 (VOP3Mods f64:$src, i32:$src_mods))),
  (V_MUL_F64_e64 0, CONST.FP64_ONE, $src_mods, $src)
>;
} // End AddedComplexity = -5

multiclass SelectCanonicalizeAsMax<
  list<Predicate> f32_preds = [],
  list<Predicate> f64_preds = [],
  list<Predicate> f16_preds = []> {
  def : GCNPat<
    (fcanonicalize (f32 (VOP3Mods f32:$src, i32:$src_mods))),
    (V_MAX_F32_e64 $src_mods, $src, $src_mods, $src)> {
    let OtherPredicates = f32_preds;
  }

  def : GCNPat<
    (fcanonicalize (f64 (VOP3Mods f64:$src, i32:$src_mods))),
    (V_MAX_F64_e64 $src_mods, $src, $src_mods, $src)> {
    let OtherPredicates = f64_preds;
  }

  def : GCNPat<
    (fcanonicalize (f16 (VOP3Mods f16:$src, i32:$src_mods))),
    (V_MAX_F16_e64 $src_mods, $src, $src_mods, $src, 0, 0)> {
    // FIXME: Should have 16-bit inst subtarget predicate
    let OtherPredicates = f16_preds;
  }

  def : GCNPat<
    (fcanonicalize (v2f16 (VOP3PMods v2f16:$src, i32:$src_mods))),
    (V_PK_MAX_F16 $src_mods, $src, $src_mods, $src, DSTCLAMP.NONE)> {
    // FIXME: Should have VOP3P subtarget predicate
    let OtherPredicates = f16_preds;
  }
}

// On pre-gfx9 targets, v_max_*/v_min_* did not respect the denormal
// mode, and would never flush. For f64, it's faster to implement
// this with a max. For f16/f32 it's a wash, but prefer max when
// valid.
//
// FIXME: Lowering f32/f16 with max is worse since we can use a
// smaller encoding if the input is fneg'd. It also adds an extra
// register use.
let SubtargetPredicate = HasMinMaxDenormModes in {
  defm : SelectCanonicalizeAsMax<[], [], []>;
} // End SubtargetPredicate = HasMinMaxDenormModes

let SubtargetPredicate = NotHasMinMaxDenormModes in {
  // Use the max lowering if we don't need to flush.

  // FIXME: We don't use this for f32 as a workaround for the
  // library being compiled with the default ieee mode, but
  // potentially being called from flushing kernels. Really we should
  // not be mixing code expecting different default FP modes, but mul
  // works in any FP environment.
  defm : SelectCanonicalizeAsMax<[FalsePredicate], [FP64Denormals], [FP16Denormals]>;
} // End SubtargetPredicate = NotHasMinMaxDenormModes


let OtherPredicates = [HasDLInsts] in {
// Don't allow source modifiers. If there are any source modifiers then it's
// better to select fma instead of fmac.
def : GCNPat <
  (fma (f32 (VOP3NoMods f32:$src0)),
       (f32 (VOP3NoMods f32:$src1)),
       (f32 (VOP3NoMods f32:$src2))),
  (V_FMAC_F32_e64 SRCMODS.NONE, $src0, SRCMODS.NONE, $src1,
                  SRCMODS.NONE, $src2)
>;
} // End OtherPredicates = [HasDLInsts]

let SubtargetPredicate = isGFX10Plus in
// Don't allow source modifiers. If there are any source modifiers then it's
// better to select fma instead of fmac.
def : GCNPat <
  (fma (f16 (VOP3NoMods f16:$src0)),
       (f16 (VOP3NoMods f16:$src1)),
       (f16 (VOP3NoMods f16:$src2))),
  (V_FMAC_F16_e64 SRCMODS.NONE, $src0, SRCMODS.NONE, $src1,
                  SRCMODS.NONE, $src2)
>;
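
// Illustration of the fmac restriction: V_FMAC_* reuses its destination as
// the addend, and its payoff is that it can later shrink to the 32-bit VOP2
// encoding, which has no source-modifier fields. Something like
// fma((fneg $a), $b, $c) carries a NEG modifier on src0, so the full
// three-address V_FMA_* form, which encodes the modifier directly, is the
// better choice there.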

let SubtargetPredicate = isGFX90APlus in
// Don't allow source modifiers. If there are any source modifiers then it's
// better to select fma instead of fmac.
def : GCNPat <
  (fma (f64 (VOP3NoMods f64:$src0)),
       (f64 (VOP3NoMods f64:$src1)),
       (f64 (VOP3NoMods f64:$src2))),
  (V_FMAC_F64_e64 SRCMODS.NONE, $src0, SRCMODS.NONE, $src1,
                  SRCMODS.NONE, $src2)
>;

// COPY is a workaround for a tablegen bug with the multiple outputs
// from S_LSHL_B32's implicit scc def.
def : GCNPat <
  (v2i16 (UniformBinFrag<build_vector> (i16 0), (i16 SReg_32:$src1))),
  (S_LSHL_B32 SReg_32:$src1, (i16 16))
>;

def : GCNPat <
  (v2i16 (DivergentBinFrag<build_vector> (i16 0), (i16 SReg_32:$src1))),
  (v2i16 (V_LSHLREV_B32_e64 (i16 16), SReg_32:$src1))
>;


def : GCNPat <
  (v2i16 (UniformBinFrag<build_vector> (i16 SReg_32:$src1), (i16 0))),
  (S_AND_B32 (S_MOV_B32 (i32 0xffff)), SReg_32:$src1)
>;

def : GCNPat <
  (v2i16 (DivergentBinFrag<build_vector> (i16 SReg_32:$src1), (i16 0))),
  (v2i16 (V_AND_B32_e64 (i32 (V_MOV_B32_e32 (i32 0xffff))), SReg_32:$src1))
>;

def : GCNPat <
  (v2f16 (UniformBinFrag<build_vector> (f16 SReg_32:$src1), (f16 FP_ZERO))),
  (S_AND_B32 (S_MOV_B32 (i32 0xffff)), SReg_32:$src1)
>;

def : GCNPat <
  (v2f16 (DivergentBinFrag<build_vector> (f16 SReg_32:$src1), (f16 FP_ZERO))),
  (v2f16 (V_AND_B32_e64 (i32 (V_MOV_B32_e32 (i32 0xffff))), SReg_32:$src1))
>;

def : GCNPat <
  (v2i16 (build_vector (i16 SReg_32:$src0), (i16 undef))),
  (COPY_TO_REGCLASS SReg_32:$src0, SReg_32)
>;

def : GCNPat <
  (v2i16 (build_vector (i16 VGPR_32:$src0), (i16 undef))),
  (COPY_TO_REGCLASS VGPR_32:$src0, VGPR_32)
>;

def : GCNPat <
  (v2f16 (build_vector f16:$src0, (f16 undef))),
  (COPY $src0)
>;

def : GCNPat <
  (v2i16 (UniformBinFrag<build_vector> (i16 undef), (i16 SReg_32:$src1))),
  (S_LSHL_B32 SReg_32:$src1, (i32 16))
>;

def : GCNPat <
  (v2i16 (DivergentBinFrag<build_vector> (i16 undef), (i16 SReg_32:$src1))),
  (v2i16 (V_LSHLREV_B32_e64 (i32 16), SReg_32:$src1))
>;


def : GCNPat <
  (v2f16 (UniformBinFrag<build_vector> (f16 undef), (f16 SReg_32:$src1))),
  (S_LSHL_B32 SReg_32:$src1, (i32 16))
>;

def : GCNPat <
  (v2f16 (DivergentBinFrag<build_vector> (f16 undef), (f16 SReg_32:$src1))),
  (v2f16 (V_LSHLREV_B32_e64 (i32 16), SReg_32:$src1))
>;

let SubtargetPredicate = HasVOP3PInsts in {
def : GCNPat <
  (v2i16 (UniformBinFrag<build_vector> (i16 SReg_32:$src0), (i16 SReg_32:$src1))),
  (S_PACK_LL_B32_B16 SReg_32:$src0, SReg_32:$src1)
>;

def : GCNPat <
  (v2i16 (DivergentBinFrag<build_vector> (i16 SReg_32:$src0), (i16 SReg_32:$src1))),
  (v2i16 (V_LSHL_OR_B32_e64 $src1, (i32 16), (i32 (V_AND_B32_e64 (i32 (V_MOV_B32_e32 (i32 0xffff))), $src0))))
>;
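
// For illustration, both build_vector expansions above produce the same
// packed 32-bit value, e.g. for $src0 = 0x1234 and $src1 = 0xABCD:
//   S_PACK_LL_B32_B16 0x1234, 0xABCD             -> 0xABCD1234
//   V_LSHL_OR_B32 0xABCD, 16, (0x1234 & 0xffff)  -> (0xABCD << 16) | 0x1234
// i.e. element 0 of the v2i16 occupies bits [15:0] and element 1 bits [31:16].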

// With multiple uses of the shift, this will duplicate the shift and
// increase register pressure.
def : GCNPat <
  (v2i16 (UniformBinFrag<build_vector> (i16 SReg_32:$src0), (i16 (trunc (srl_oneuse SReg_32:$src1, (i32 16)))))),
  (v2i16 (S_PACK_LH_B32_B16 SReg_32:$src0, SReg_32:$src1))
>;

def : GCNPat <
  (v2i16 (DivergentBinFrag<build_vector> (i16 SReg_32:$src0), (i16 (trunc (srl_oneuse SReg_32:$src1, (i32 16)))))),
  (v2i16 (V_BFI_B32_e64 (i32 (V_MOV_B32_e32 (i32 0xffff))), SReg_32:$src0, SReg_32:$src1))
>;


def : GCNPat <
  (v2i16 (UniformBinFrag<build_vector> (i16 (trunc (srl_oneuse SReg_32:$src0, (i32 16)))),
                                       (i16 (trunc (srl_oneuse SReg_32:$src1, (i32 16)))))),
  (S_PACK_HH_B32_B16 SReg_32:$src0, SReg_32:$src1)
>;

def : GCNPat <
  (v2i16 (DivergentBinFrag<build_vector> (i16 (trunc (srl_oneuse SReg_32:$src0, (i32 16)))),
                                         (i16 (trunc (srl_oneuse SReg_32:$src1, (i32 16)))))),
  (v2i16 (V_AND_OR_B32_e64 SReg_32:$src1, (i32 (V_MOV_B32_e32 (i32 0xffff0000))), (i32 (V_LSHRREV_B32_e64 (i32 16), SReg_32:$src0))))
>;

def : GCNPat <
  (v2f16 (UniformBinFrag<build_vector> (f16 SReg_32:$src0), (f16 SReg_32:$src1))),
  (S_PACK_LL_B32_B16 SReg_32:$src0, SReg_32:$src1)
>;

def : GCNPat <
  (v2f16 (DivergentBinFrag<build_vector> (f16 SReg_32:$src0), (f16 SReg_32:$src1))),
  (v2f16 (V_LSHL_OR_B32_e64 SReg_32:$src1, (i32 16), (i32 (V_AND_B32_e64 (i32 (V_MOV_B32_e32 (i32 0xffff))), SReg_32:$src0))))
>;


def : GCNPat <
  (v2f16 (is_canonicalized<build_vector> (f16 (VOP3Mods (f16 VGPR_32:$src0), i32:$src0_mods)),
                                         (f16 (VOP3Mods (f16 VGPR_32:$src1), i32:$src1_mods)))),
  (V_PACK_B32_F16_e64 $src0_mods, VGPR_32:$src0, $src1_mods, VGPR_32:$src1)
>;
} // End SubtargetPredicate = HasVOP3PInsts

def : GCNPat <
  (v2f16 (scalar_to_vector f16:$src0)),
  (COPY $src0)
>;

def : GCNPat <
  (v2i16 (scalar_to_vector i16:$src0)),
  (COPY $src0)
>;

def : GCNPat <
  (v4i16 (scalar_to_vector i16:$src0)),
  (INSERT_SUBREG (IMPLICIT_DEF), $src0, sub0)
>;

def : GCNPat <
  (v4f16 (scalar_to_vector f16:$src0)),
  (INSERT_SUBREG (IMPLICIT_DEF), $src0, sub0)
>;

def : GCNPat <
  (i64 (int_amdgcn_mov_dpp i64:$src, timm:$dpp_ctrl, timm:$row_mask,
                           timm:$bank_mask, timm:$bound_ctrl)),
  (V_MOV_B64_DPP_PSEUDO VReg_64_Align2:$src, VReg_64_Align2:$src,
                        (as_i32timm $dpp_ctrl), (as_i32timm $row_mask),
                        (as_i32timm $bank_mask),
                        (as_i1timm $bound_ctrl))
>;

def : GCNPat <
  (i64 (int_amdgcn_update_dpp i64:$old, i64:$src, timm:$dpp_ctrl, timm:$row_mask,
                              timm:$bank_mask, timm:$bound_ctrl)),
  (V_MOV_B64_DPP_PSEUDO VReg_64_Align2:$old, VReg_64_Align2:$src, (as_i32timm $dpp_ctrl),
                        (as_i32timm $row_mask), (as_i32timm $bank_mask),
                        (as_i1timm $bound_ctrl))
>;

//===----------------------------------------------------------------------===//
// Fract Patterns
//===----------------------------------------------------------------------===//

let SubtargetPredicate = isGFX6 in {

// V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) is
// used instead. However, SI doesn't have V_FLOOR_F64, so the most efficient
// way to implement it is using V_FRACT_F64.
// The workaround for the V_FRACT bug is:
// fract(x) = isnan(x) ?
x : min(V_FRACT(x), 0.99999999999999999) 2648 2649// Convert floor(x) to (x - fract(x)) 2650 2651// Don't bother handling this for GlobalISel, it's handled during 2652// lowering. 2653// 2654// FIXME: DAG should also custom lower this. 2655def : GCNPat < 2656 (f64 (ffloor (f64 (VOP3Mods f64:$x, i32:$mods)))), 2657 (V_ADD_F64_e64 2658 $mods, 2659 $x, 2660 SRCMODS.NEG, 2661 (V_CNDMASK_B64_PSEUDO 2662 (V_MIN_F64_e64 2663 SRCMODS.NONE, 2664 (V_FRACT_F64_e64 $mods, $x), 2665 SRCMODS.NONE, 2666 (V_MOV_B64_PSEUDO 0x3fefffffffffffff)), 2667 $x, 2668 (V_CMP_CLASS_F64_e64 SRCMODS.NONE, $x, (i32 3 /*NaN*/)))) 2669>; 2670 2671} // End SubtargetPredicates = isGFX6 2672 2673//============================================================================// 2674// Miscellaneous Optimization Patterns 2675//============================================================================// 2676 2677// Undo sub x, c -> add x, -c canonicalization since c is more likely 2678// an inline immediate than -c. 2679// TODO: Also do for 64-bit. 2680def : GCNPat< 2681 (add i32:$src0, (i32 NegSubInlineConst32:$src1)), 2682 (S_SUB_I32 SReg_32:$src0, NegSubInlineConst32:$src1) 2683>; 2684 2685def : GCNPat< 2686 (add i32:$src0, (i32 NegSubInlineConst32:$src1)), 2687 (V_SUB_U32_e64 VS_32:$src0, NegSubInlineConst32:$src1)> { 2688 let SubtargetPredicate = HasAddNoCarryInsts; 2689} 2690 2691def : GCNPat< 2692 (add i32:$src0, (i32 NegSubInlineConst32:$src1)), 2693 (V_SUB_CO_U32_e64 VS_32:$src0, NegSubInlineConst32:$src1)> { 2694 let SubtargetPredicate = NotHasAddNoCarryInsts; 2695} 2696 2697 2698// Avoid pointlessly materializing a constant in VGPR. 2699// FIXME: Should also do this for readlane, but tablegen crashes on 2700// the ignored src1. 2701def : GCNPat< 2702 (int_amdgcn_readfirstlane (i32 imm:$src)), 2703 (S_MOV_B32 SReg_32:$src) 2704>; 2705 2706multiclass BFMPatterns <ValueType vt, InstSI BFM, InstSI MOV> { 2707 def : GCNPat < 2708 (vt (shl (vt (add (vt (shl 1, vt:$a)), -1)), vt:$b)), 2709 (BFM $a, $b) 2710 >; 2711 2712 def : GCNPat < 2713 (vt (add (vt (shl 1, vt:$a)), -1)), 2714 (BFM $a, (MOV (i32 0))) 2715 >; 2716} 2717 2718defm : BFMPatterns <i32, S_BFM_B32, S_MOV_B32>; 2719// FIXME: defm : BFMPatterns <i64, S_BFM_B64, S_MOV_B64>; 2720 2721// Bitfield extract patterns 2722 2723def IMMZeroBasedBitfieldMask : ImmLeaf <i32, [{ 2724 return isMask_32(Imm); 2725}]>; 2726 2727def IMMPopCount : SDNodeXForm<imm, [{ 2728 return CurDAG->getTargetConstant(countPopulation(N->getZExtValue()), SDLoc(N), 2729 MVT::i32); 2730}]>; 2731 2732def : AMDGPUPat < 2733 (DivergentBinFrag<and> (i32 (srl i32:$src, i32:$rshift)), 2734 IMMZeroBasedBitfieldMask:$mask), 2735 (V_BFE_U32_e64 $src, $rshift, (i32 (IMMPopCount $mask))) 2736>; 2737 2738// x & ((1 << y) - 1) 2739def : AMDGPUPat < 2740 (DivergentBinFrag<and> i32:$src, (add_oneuse (shl_oneuse 1, i32:$width), -1)), 2741 (V_BFE_U32_e64 $src, (i32 0), $width) 2742>; 2743 2744// x & ~(-1 << y) 2745def : AMDGPUPat < 2746 (DivergentBinFrag<and> i32:$src, 2747 (xor_oneuse (shl_oneuse -1, i32:$width), -1)), 2748 (V_BFE_U32_e64 $src, (i32 0), $width) 2749>; 2750 2751// x & (-1 >> (bitwidth - y)) 2752def : AMDGPUPat < 2753 (DivergentBinFrag<and> i32:$src, (srl_oneuse -1, (sub 32, i32:$width))), 2754 (V_BFE_U32_e64 $src, (i32 0), $width) 2755>; 2756 2757// x << (bitwidth - y) >> (bitwidth - y) 2758def : AMDGPUPat < 2759 (DivergentBinFrag<srl> (shl_oneuse i32:$src, (sub 32, i32:$width)), 2760 (sub 32, i32:$width)), 2761 (V_BFE_U32_e64 $src, (i32 0), $width) 2762>; 2763 2764def : AMDGPUPat < 2765 
(DivergentBinFrag<sra> (shl_oneuse i32:$src, (sub 32, i32:$width)), 2766 (sub 32, i32:$width)), 2767 (V_BFE_I32_e64 $src, (i32 0), $width) 2768>; 2769 2770// SHA-256 Ma patterns 2771 2772// ((x & z) | (y & (x | z))) -> BFI (XOR x, y), z, y 2773def : AMDGPUPat < 2774 (DivergentBinFrag<or> (and i32:$x, i32:$z), 2775 (and i32:$y, (or i32:$x, i32:$z))), 2776 (V_BFI_B32_e64 (V_XOR_B32_e64 VSrc_b32:$x, VSrc_b32:$y), VSrc_b32:$z, VSrc_b32:$y) 2777>; 2778 2779def : AMDGPUPat < 2780 (DivergentBinFrag<or> (and i64:$x, i64:$z), 2781 (and i64:$y, (or i64:$x, i64:$z))), 2782 (REG_SEQUENCE VReg_64, 2783 (V_BFI_B32_e64 (V_XOR_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$x, sub0)), 2784 (i32 (EXTRACT_SUBREG VReg_64:$y, sub0))), 2785 (i32 (EXTRACT_SUBREG VReg_64:$z, sub0)), 2786 (i32 (EXTRACT_SUBREG VReg_64:$y, sub0))), sub0, 2787 (V_BFI_B32_e64 (V_XOR_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$x, sub1)), 2788 (i32 (EXTRACT_SUBREG VReg_64:$y, sub1))), 2789 (i32 (EXTRACT_SUBREG VReg_64:$z, sub1)), 2790 (i32 (EXTRACT_SUBREG VReg_64:$y, sub1))), sub1) 2791>; 2792 2793multiclass IntMed3Pat<Instruction med3Inst, 2794 SDPatternOperator min, 2795 SDPatternOperator max, 2796 SDPatternOperator min_oneuse, 2797 SDPatternOperator max_oneuse> { 2798 2799 // This matches 16 permutations of 2800 // min(max(a, b), max(min(a, b), c)) 2801 def : AMDGPUPat < 2802 (min (max_oneuse i32:$src0, i32:$src1), 2803 (max_oneuse (min_oneuse i32:$src0, i32:$src1), i32:$src2)), 2804 (med3Inst VSrc_b32:$src0, VSrc_b32:$src1, VSrc_b32:$src2) 2805>; 2806 2807 // This matches 16 permutations of 2808 // max(min(x, y), min(max(x, y), z)) 2809 def : AMDGPUPat < 2810 (max (min_oneuse i32:$src0, i32:$src1), 2811 (min_oneuse (max_oneuse i32:$src0, i32:$src1), i32:$src2)), 2812 (med3Inst VSrc_b32:$src0, VSrc_b32:$src1, VSrc_b32:$src2) 2813>; 2814} 2815 2816defm : IntMed3Pat<V_MED3_I32_e64, smin, smax, smin_oneuse, smax_oneuse>; 2817defm : IntMed3Pat<V_MED3_U32_e64, umin, umax, umin_oneuse, umax_oneuse>; 2818 2819// This matches 16 permutations of 2820// max(min(x, y), min(max(x, y), z)) 2821class FPMed3Pat<ValueType vt, 2822 //SDPatternOperator max, SDPatternOperator min, 2823 Instruction med3Inst> : GCNPat< 2824 (fmaxnum_like (fminnum_like_oneuse (VOP3Mods_nnan vt:$src0, i32:$src0_mods), 2825 (VOP3Mods_nnan vt:$src1, i32:$src1_mods)), 2826 (fminnum_like_oneuse (fmaxnum_like_oneuse (VOP3Mods_nnan vt:$src0, i32:$src0_mods), 2827 (VOP3Mods_nnan vt:$src1, i32:$src1_mods)), 2828 (vt (VOP3Mods_nnan vt:$src2, i32:$src2_mods)))), 2829 (med3Inst $src0_mods, $src0, $src1_mods, $src1, $src2_mods, $src2, DSTCLAMP.NONE, DSTOMOD.NONE) 2830>; 2831 2832class FP16Med3Pat<ValueType vt, 2833 Instruction med3Inst> : GCNPat< 2834 (fmaxnum_like (fminnum_like_oneuse (VOP3Mods_nnan vt:$src0, i32:$src0_mods), 2835 (VOP3Mods_nnan vt:$src1, i32:$src1_mods)), 2836 (fminnum_like_oneuse (fmaxnum_like_oneuse (VOP3Mods_nnan vt:$src0, i32:$src0_mods), 2837 (VOP3Mods_nnan vt:$src1, i32:$src1_mods)), 2838 (vt (VOP3Mods_nnan vt:$src2, i32:$src2_mods)))), 2839 (med3Inst $src0_mods, $src0, $src1_mods, $src1, $src2_mods, $src2, DSTCLAMP.NONE) 2840>; 2841 2842multiclass Int16Med3Pat<Instruction med3Inst, 2843 SDPatternOperator min, 2844 SDPatternOperator max, 2845 SDPatternOperator max_oneuse, 2846 SDPatternOperator min_oneuse> { 2847 // This matches 16 permutations of 2848 // max(min(x, y), min(max(x, y), z)) 2849 def : GCNPat < 2850 (max (min_oneuse i16:$src0, i16:$src1), 2851 (min_oneuse (max_oneuse i16:$src0, i16:$src1), i16:$src2)), 2852 (med3Inst SRCMODS.NONE, VSrc_b16:$src0, SRCMODS.NONE, 
VSrc_b16:$src1, SRCMODS.NONE, VSrc_b16:$src2, DSTCLAMP.NONE) 2853>; 2854 2855 // This matches 16 permutations of 2856 // min(max(a, b), max(min(a, b), c)) 2857 def : GCNPat < 2858 (min (max_oneuse i16:$src0, i16:$src1), 2859 (max_oneuse (min_oneuse i16:$src0, i16:$src1), i16:$src2)), 2860 (med3Inst SRCMODS.NONE, VSrc_b16:$src0, SRCMODS.NONE, VSrc_b16:$src1, SRCMODS.NONE, VSrc_b16:$src2, DSTCLAMP.NONE) 2861>; 2862} 2863 2864def : FPMed3Pat<f32, V_MED3_F32_e64>; 2865 2866let OtherPredicates = [isGFX9Plus] in { 2867def : FP16Med3Pat<f16, V_MED3_F16_e64>; 2868defm : Int16Med3Pat<V_MED3_I16_e64, smin, smax, smax_oneuse, smin_oneuse>; 2869defm : Int16Med3Pat<V_MED3_U16_e64, umin, umax, umax_oneuse, umin_oneuse>; 2870} // End Predicates = [isGFX9Plus] 2871 2872class AMDGPUGenericInstruction : GenericInstruction { 2873 let Namespace = "AMDGPU"; 2874} 2875 2876// Convert a wave address to a swizzled vector address (i.e. this is 2877// for copying the stack pointer to a vector address appropriate to 2878// use in the offset field of mubuf instructions). 2879def G_AMDGPU_WAVE_ADDRESS : AMDGPUGenericInstruction { 2880 let OutOperandList = (outs type0:$dst); 2881 let InOperandList = (ins type0:$src); 2882 let hasSideEffects = 0; 2883} 2884 2885// Returns -1 if the input is zero. 2886def G_AMDGPU_FFBH_U32 : AMDGPUGenericInstruction { 2887 let OutOperandList = (outs type0:$dst); 2888 let InOperandList = (ins type1:$src); 2889 let hasSideEffects = 0; 2890} 2891 2892// Returns -1 if the input is zero. 2893def G_AMDGPU_FFBL_B32 : AMDGPUGenericInstruction { 2894 let OutOperandList = (outs type0:$dst); 2895 let InOperandList = (ins type1:$src); 2896 let hasSideEffects = 0; 2897} 2898 2899def G_AMDGPU_RCP_IFLAG : AMDGPUGenericInstruction { 2900 let OutOperandList = (outs type0:$dst); 2901 let InOperandList = (ins type1:$src); 2902 let hasSideEffects = 0; 2903} 2904 2905class BufferLoadGenericInstruction : AMDGPUGenericInstruction { 2906 let OutOperandList = (outs type0:$dst); 2907 let InOperandList = (ins type1:$rsrc, type2:$vindex, type2:$voffset, 2908 type2:$soffset, untyped_imm_0:$offset, 2909 untyped_imm_0:$cachepolicy, untyped_imm_0:$idxen); 2910 let hasSideEffects = 0; 2911 let mayLoad = 1; 2912} 2913 2914class TBufferLoadGenericInstruction : AMDGPUGenericInstruction { 2915 let OutOperandList = (outs type0:$dst); 2916 let InOperandList = (ins type1:$rsrc, type2:$vindex, type2:$voffset, 2917 type2:$soffset, untyped_imm_0:$offset, untyped_imm_0:$format, 2918 untyped_imm_0:$cachepolicy, untyped_imm_0:$idxen); 2919 let hasSideEffects = 0; 2920 let mayLoad = 1; 2921} 2922 2923def G_AMDGPU_BUFFER_LOAD_UBYTE : BufferLoadGenericInstruction; 2924def G_AMDGPU_BUFFER_LOAD_SBYTE : BufferLoadGenericInstruction; 2925def G_AMDGPU_BUFFER_LOAD_USHORT : BufferLoadGenericInstruction; 2926def G_AMDGPU_BUFFER_LOAD_SSHORT : BufferLoadGenericInstruction; 2927def G_AMDGPU_BUFFER_LOAD : BufferLoadGenericInstruction; 2928def G_AMDGPU_BUFFER_LOAD_FORMAT : BufferLoadGenericInstruction; 2929def G_AMDGPU_BUFFER_LOAD_FORMAT_D16 : BufferLoadGenericInstruction; 2930def G_AMDGPU_TBUFFER_LOAD_FORMAT : TBufferLoadGenericInstruction; 2931def G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 : TBufferLoadGenericInstruction; 2932 2933class BufferStoreGenericInstruction : AMDGPUGenericInstruction { 2934 let OutOperandList = (outs); 2935 let InOperandList = (ins type0:$vdata, type1:$rsrc, type2:$vindex, type2:$voffset, 2936 type2:$soffset, untyped_imm_0:$offset, 2937 untyped_imm_0:$cachepolicy, untyped_imm_0:$idxen); 2938 let hasSideEffects = 0; 2939 let 
mayStore = 1;
}

class TBufferStoreGenericInstruction : AMDGPUGenericInstruction {
  let OutOperandList = (outs);
  let InOperandList = (ins type0:$vdata, type1:$rsrc, type2:$vindex, type2:$voffset,
                           type2:$soffset, untyped_imm_0:$offset,
                           untyped_imm_0:$format,
                           untyped_imm_0:$cachepolicy, untyped_imm_0:$idxen);
  let hasSideEffects = 0;
  let mayStore = 1;
}

def G_AMDGPU_BUFFER_STORE : BufferStoreGenericInstruction;
def G_AMDGPU_BUFFER_STORE_BYTE : BufferStoreGenericInstruction;
def G_AMDGPU_BUFFER_STORE_SHORT : BufferStoreGenericInstruction;
def G_AMDGPU_BUFFER_STORE_FORMAT : BufferStoreGenericInstruction;
def G_AMDGPU_BUFFER_STORE_FORMAT_D16 : BufferStoreGenericInstruction;
def G_AMDGPU_TBUFFER_STORE_FORMAT : TBufferStoreGenericInstruction;
def G_AMDGPU_TBUFFER_STORE_FORMAT_D16 : TBufferStoreGenericInstruction;

def G_AMDGPU_FMIN_LEGACY : AMDGPUGenericInstruction {
  let OutOperandList = (outs type0:$dst);
  let InOperandList = (ins type0:$src0, type0:$src1);
  let hasSideEffects = 0;
}

def G_AMDGPU_FMAX_LEGACY : AMDGPUGenericInstruction {
  let OutOperandList = (outs type0:$dst);
  let InOperandList = (ins type0:$src0, type0:$src1);
  let hasSideEffects = 0;
}

foreach N = 0-3 in {
def G_AMDGPU_CVT_F32_UBYTE#N : AMDGPUGenericInstruction {
  let OutOperandList = (outs type0:$dst);
  let InOperandList = (ins type0:$src0);
  let hasSideEffects = 0;
}
}

def G_AMDGPU_CVT_PK_I16_I32 : AMDGPUGenericInstruction {
  let OutOperandList = (outs type0:$dst);
  let InOperandList = (ins type0:$src0, type0:$src1);
  let hasSideEffects = 0;
}

def G_AMDGPU_SMED3 : AMDGPUGenericInstruction {
  let OutOperandList = (outs type0:$dst);
  let InOperandList = (ins type0:$src0, type0:$src1, type0:$src2);
  let hasSideEffects = 0;
}

def G_AMDGPU_UMED3 : AMDGPUGenericInstruction {
  let OutOperandList = (outs type0:$dst);
  let InOperandList = (ins type0:$src0, type0:$src1, type0:$src2);
  let hasSideEffects = 0;
}

def G_AMDGPU_FMED3 : AMDGPUGenericInstruction {
  let OutOperandList = (outs type0:$dst);
  let InOperandList = (ins type0:$src0, type0:$src1, type0:$src2);
  let hasSideEffects = 0;
}

def G_AMDGPU_CLAMP : AMDGPUGenericInstruction {
  let OutOperandList = (outs type0:$dst);
  let InOperandList = (ins type0:$src);
  let hasSideEffects = 0;
}

// Atomic cmpxchg. $cmpval and $newval are packed in a single vector
// operand. Expects a MachineMemOperand in addition to explicit
// operands.
3013def G_AMDGPU_ATOMIC_CMPXCHG : AMDGPUGenericInstruction { 3014 let OutOperandList = (outs type0:$oldval); 3015 let InOperandList = (ins ptype1:$addr, type0:$cmpval_newval); 3016 let hasSideEffects = 0; 3017 let mayLoad = 1; 3018 let mayStore = 1; 3019} 3020 3021let Namespace = "AMDGPU" in { 3022def G_AMDGPU_ATOMIC_INC : G_ATOMICRMW_OP; 3023def G_AMDGPU_ATOMIC_DEC : G_ATOMICRMW_OP; 3024def G_AMDGPU_ATOMIC_FMIN : G_ATOMICRMW_OP; 3025def G_AMDGPU_ATOMIC_FMAX : G_ATOMICRMW_OP; 3026} 3027 3028class BufferAtomicGenericInstruction<bit NoRtn = 0> : AMDGPUGenericInstruction { 3029 let OutOperandList = !if(NoRtn, (outs), (outs type0:$dst)); 3030 let InOperandList = (ins type0:$vdata, type1:$rsrc, type2:$vindex, type2:$voffset, 3031 type2:$soffset, untyped_imm_0:$offset, 3032 untyped_imm_0:$cachepolicy, untyped_imm_0:$idxen); 3033 let hasSideEffects = 0; 3034 let mayLoad = 1; 3035 let mayStore = 1; 3036} 3037 3038def G_AMDGPU_BUFFER_ATOMIC_SWAP : BufferAtomicGenericInstruction; 3039def G_AMDGPU_BUFFER_ATOMIC_ADD : BufferAtomicGenericInstruction; 3040def G_AMDGPU_BUFFER_ATOMIC_SUB : BufferAtomicGenericInstruction; 3041def G_AMDGPU_BUFFER_ATOMIC_SMIN : BufferAtomicGenericInstruction; 3042def G_AMDGPU_BUFFER_ATOMIC_UMIN : BufferAtomicGenericInstruction; 3043def G_AMDGPU_BUFFER_ATOMIC_SMAX : BufferAtomicGenericInstruction; 3044def G_AMDGPU_BUFFER_ATOMIC_UMAX : BufferAtomicGenericInstruction; 3045def G_AMDGPU_BUFFER_ATOMIC_AND : BufferAtomicGenericInstruction; 3046def G_AMDGPU_BUFFER_ATOMIC_OR : BufferAtomicGenericInstruction; 3047def G_AMDGPU_BUFFER_ATOMIC_XOR : BufferAtomicGenericInstruction; 3048def G_AMDGPU_BUFFER_ATOMIC_INC : BufferAtomicGenericInstruction; 3049def G_AMDGPU_BUFFER_ATOMIC_DEC : BufferAtomicGenericInstruction; 3050def G_AMDGPU_BUFFER_ATOMIC_FADD : BufferAtomicGenericInstruction; 3051def G_AMDGPU_BUFFER_ATOMIC_FMIN : BufferAtomicGenericInstruction; 3052def G_AMDGPU_BUFFER_ATOMIC_FMAX : BufferAtomicGenericInstruction; 3053 3054def G_AMDGPU_BUFFER_ATOMIC_CMPSWAP : AMDGPUGenericInstruction { 3055 let OutOperandList = (outs type0:$dst); 3056 let InOperandList = (ins type0:$vdata, type0:$cmp, type1:$rsrc, type2:$vindex, 3057 type2:$voffset, type2:$soffset, untyped_imm_0:$offset, 3058 untyped_imm_0:$cachepolicy, untyped_imm_0:$idxen); 3059 let hasSideEffects = 0; 3060 let mayLoad = 1; 3061 let mayStore = 1; 3062} 3063 3064// Wrapper around llvm.amdgcn.s.buffer.load. This is mostly needed as 3065// a workaround for the intrinsic being defined as readnone, but 3066// really needs a memory operand. 3067def G_AMDGPU_S_BUFFER_LOAD : AMDGPUGenericInstruction { 3068 let OutOperandList = (outs type0:$dst); 3069 let InOperandList = (ins type1:$rsrc, type2:$offset, untyped_imm_0:$cachepolicy); 3070 let hasSideEffects = 0; 3071 let mayLoad = 1; 3072 let mayStore = 0; 3073} 3074 3075// This is equivalent to the G_INTRINSIC*, but the operands may have 3076// been legalized depending on the subtarget requirements. 3077def G_AMDGPU_INTRIN_IMAGE_LOAD : AMDGPUGenericInstruction { 3078 let OutOperandList = (outs type0:$dst); 3079 let InOperandList = (ins unknown:$intrin, variable_ops); 3080 let hasSideEffects = 0; 3081 let mayLoad = 1; 3082 3083 // FIXME: Use separate opcode for atomics. 3084 let mayStore = 1; 3085} 3086 3087def G_AMDGPU_INTRIN_IMAGE_LOAD_D16 : AMDGPUGenericInstruction { 3088 let OutOperandList = (outs type0:$dst); 3089 let InOperandList = (ins unknown:$intrin, variable_ops); 3090 let hasSideEffects = 0; 3091 let mayLoad = 1; 3092 3093 // FIXME: Use separate opcode for atomics. 
3094 let mayStore = 1; 3095} 3096 3097// This is equivalent to the G_INTRINSIC*, but the operands may have 3098// been legalized depending on the subtarget requirements. 3099def G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPUGenericInstruction { 3100 let OutOperandList = (outs); 3101 let InOperandList = (ins unknown:$intrin, variable_ops); 3102 let hasSideEffects = 0; 3103 let mayStore = 1; 3104} 3105 3106def G_AMDGPU_INTRIN_IMAGE_STORE_D16 : AMDGPUGenericInstruction { 3107 let OutOperandList = (outs); 3108 let InOperandList = (ins unknown:$intrin, variable_ops); 3109 let hasSideEffects = 0; 3110 let mayStore = 1; 3111} 3112 3113def G_AMDGPU_INTRIN_BVH_INTERSECT_RAY : AMDGPUGenericInstruction { 3114 let OutOperandList = (outs type0:$dst); 3115 let InOperandList = (ins unknown:$intrin, variable_ops); 3116 let hasSideEffects = 0; 3117 let mayLoad = 1; 3118 let mayStore = 0; 3119} 3120 3121// Generic instruction for SI_CALL, so we can select the register bank and insert a waterfall loop 3122// if necessary. 3123def G_SI_CALL : AMDGPUGenericInstruction { 3124 let OutOperandList = (outs SReg_64:$dst); 3125 let InOperandList = (ins type0:$src0, unknown:$callee); 3126 let Size = 4; 3127 let isCall = 1; 3128 let UseNamedOperandTable = 1; 3129 let SchedRW = [WriteBranch]; 3130 // TODO: Should really base this on the call target 3131 let isConvergent = 1; 3132} 3133