//===-- AMDGPUISelLowering.h - AMDGPU Lowering Interface --------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Interface definition of the TargetLowering class that is common
/// to all AMD GPUs.
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUISELLOWERING_H
#define LLVM_LIB_TARGET_AMDGPU_AMDGPUISELLOWERING_H

#include "AMDGPU.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/TargetLowering.h"

namespace llvm {

class AMDGPUMachineFunction;
class AMDGPUSubtarget;
struct ArgDescriptor;

class AMDGPUTargetLowering : public TargetLowering {
private:
  const AMDGPUSubtarget *Subtarget;

  /// \returns AMDGPUISD::FFBH_U32 node if the incoming \p Op may have been
  /// legalized from a smaller type VT. Need to match pre-legalized type because
  /// the generic legalization inserts the add/sub between the select and
  /// compare.
  SDValue getFFBX_U32(SelectionDAG &DAG, SDValue Op, const SDLoc &DL,
                      unsigned Opc) const;

public:
  static unsigned numBitsUnsigned(SDValue Op, SelectionDAG &DAG);
  static unsigned numBitsSigned(SDValue Op, SelectionDAG &DAG);

protected:
  SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const;
  SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const;
  /// Split a vector store into multiple scalar stores.
  /// \returns The resulting chain.

  SDValue LowerFREM(SDValue Op, SelectionDAG &DAG) const;
  SDValue LowerFCEIL(SDValue Op, SelectionDAG &DAG) const;
  SDValue LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const;
  SDValue LowerFRINT(SDValue Op, SelectionDAG &DAG) const;
  SDValue LowerFNEARBYINT(SDValue Op, SelectionDAG &DAG) const;

  SDValue LowerFROUND32_16(SDValue Op, SelectionDAG &DAG) const;
  SDValue LowerFROUND64(SDValue Op, SelectionDAG &DAG) const;
  SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) const;
  SDValue LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const;
  SDValue LowerFLOG(SDValue Op, SelectionDAG &DAG,
                    double Log2BaseInverted) const;
  SDValue lowerFEXP(SDValue Op, SelectionDAG &DAG) const;

  SDValue LowerCTLZ_CTTZ(SDValue Op, SelectionDAG &DAG) const;

  SDValue LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG, bool Signed) const;
  SDValue LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG, bool Signed) const;
  SDValue LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
  SDValue LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;

  SDValue LowerFP64_TO_INT(SDValue Op, SelectionDAG &DAG, bool Signed) const;
  SDValue LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) const;
  SDValue LowerFP_TO_UINT(SDValue Op, SelectionDAG &DAG) const;
  SDValue LowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) const;

  SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const;

protected:
  bool shouldCombineMemoryType(EVT VT) const;
  SDValue performLoadCombine(SDNode *N, DAGCombinerInfo &DCI) const;
  SDValue performStoreCombine(SDNode *N, DAGCombinerInfo &DCI) const;
  SDValue performAssertSZExtCombine(SDNode *N, DAGCombinerInfo &DCI) const;

  SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL,
                                       unsigned Opc, SDValue LHS,
                                       uint32_t ValLo, uint32_t ValHi) const;
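
  // A sketch of how one of the shift/bitwise combines below might use this
  // helper to split a 64-bit bitwise operation with a constant RHS into two
  // 32-bit halves (Mask, LHS, and RHSConst are assumed names, not the actual
  // implementation):
  //   // (and x:i64, C) -> (build_pair (and lo(x), Lo_32(C)),
  //   //                               (and hi(x), Hi_32(C)))
  //   uint64_t Mask = RHSConst->getZExtValue();
  //   return splitBinaryBitConstantOpImpl(DCI, SL, ISD::AND, LHS,
  //                                       Lo_32(Mask), Hi_32(Mask));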

  SDValue performShlCombine(SDNode *N, DAGCombinerInfo &DCI) const;
  SDValue performSraCombine(SDNode *N, DAGCombinerInfo &DCI) const;
  SDValue performSrlCombine(SDNode *N, DAGCombinerInfo &DCI) const;
  SDValue performTruncateCombine(SDNode *N, DAGCombinerInfo &DCI) const;
  SDValue performMulCombine(SDNode *N, DAGCombinerInfo &DCI) const;
  SDValue performMulhsCombine(SDNode *N, DAGCombinerInfo &DCI) const;
  SDValue performMulhuCombine(SDNode *N, DAGCombinerInfo &DCI) const;
  SDValue performMulLoHi24Combine(SDNode *N, DAGCombinerInfo &DCI) const;
  SDValue performCtlz_CttzCombine(const SDLoc &SL, SDValue Cond, SDValue LHS,
                                  SDValue RHS, DAGCombinerInfo &DCI) const;
  SDValue performSelectCombine(SDNode *N, DAGCombinerInfo &DCI) const;

  bool isConstantCostlierToNegate(SDValue N) const;
  SDValue performFNegCombine(SDNode *N, DAGCombinerInfo &DCI) const;
  SDValue performFAbsCombine(SDNode *N, DAGCombinerInfo &DCI) const;
  SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const;

  static EVT getEquivalentMemType(LLVMContext &Context, EVT VT);

  virtual SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op,
                                     SelectionDAG &DAG) const;

  /// Return 64-bit value Op as two 32-bit integers.
  std::pair<SDValue, SDValue> split64BitValue(SDValue Op,
                                              SelectionDAG &DAG) const;
  SDValue getLoHalf64(SDValue Op, SelectionDAG &DAG) const;
  SDValue getHiHalf64(SDValue Op, SelectionDAG &DAG) const;

  /// Split a vector type into two parts. The first part is a power of two
  /// vector. The second part is whatever is left over, and is a scalar if it
  /// would otherwise be a 1-vector (e.g., v3i32 -> (v2i32, i32)).
  std::pair<EVT, EVT> getSplitDestVTs(const EVT &VT, SelectionDAG &DAG) const;

  /// Split a vector value into two parts of types LoVT and HiVT. HiVT could be
  /// scalar.
  std::pair<SDValue, SDValue> splitVector(const SDValue &N, const SDLoc &DL,
                                          const EVT &LoVT, const EVT &HighVT,
                                          SelectionDAG &DAG) const;

  /// Split a vector load into 2 loads of half the vector.
  SDValue SplitVectorLoad(SDValue Op, SelectionDAG &DAG) const;

  /// Widen a vector load from vec3 to vec4.
  SDValue WidenVectorLoad(SDValue Op, SelectionDAG &DAG) const;

  /// Split a vector store into 2 stores of half the vector.
  SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const;

  SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
  SDValue LowerSDIVREM(SDValue Op, SelectionDAG &DAG) const;
  SDValue LowerUDIVREM(SDValue Op, SelectionDAG &DAG) const;
  SDValue LowerDIVREM24(SDValue Op, SelectionDAG &DAG, bool sign) const;
  void LowerUDIVREM64(SDValue Op, SelectionDAG &DAG,
                      SmallVectorImpl<SDValue> &Results) const;

  void analyzeFormalArgumentsCompute(
    CCState &State,
    const SmallVectorImpl<ISD::InputArg> &Ins) const;

public:
  AMDGPUTargetLowering(const TargetMachine &TM, const AMDGPUSubtarget &STI);

  bool mayIgnoreSignedZero(SDValue Op) const {
    if (getTargetMachine().Options.NoSignedZerosFPMath)
      return true;

    const auto Flags = Op.getNode()->getFlags();
    if (Flags.isDefined())
      return Flags.hasNoSignedZeros();

    return false;
  }

  static inline SDValue stripBitcast(SDValue Val) {
    return Val.getOpcode() == ISD::BITCAST ? Val.getOperand(0) : Val;
  }
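
  // A small usage sketch (illustrative; X is assumed to be an i32 SDValue and
  // DAG/DL are in scope):
  //   SDValue F = DAG.getNode(ISD::BITCAST, DL, MVT::f32, X);
  //   stripBitcast(F); // yields X
  //   stripBitcast(X); // yields X unchanged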

  static bool allUsesHaveSourceMods(const SDNode *N,
                                    unsigned CostThreshold = 4);
  bool isFAbsFree(EVT VT) const override;
  bool isFNegFree(EVT VT) const override;
  bool isTruncateFree(EVT Src, EVT Dest) const override;
  bool isTruncateFree(Type *Src, Type *Dest) const override;

  bool isZExtFree(Type *Src, Type *Dest) const override;
  bool isZExtFree(EVT Src, EVT Dest) const override;
  bool isZExtFree(SDValue Val, EVT VT2) const override;

  bool isNarrowingProfitable(EVT VT1, EVT VT2) const override;

  MVT getVectorIdxTy(const DataLayout &) const override;
  bool isSelectSupported(SelectSupportKind) const override;

  bool isFPImmLegal(const APFloat &Imm, EVT VT,
                    bool ForCodeSize) const override;
  bool ShouldShrinkFPConstant(EVT VT) const override;
  bool shouldReduceLoadWidth(SDNode *Load,
                             ISD::LoadExtType ExtType,
                             EVT ExtVT) const override;

  bool isLoadBitCastBeneficial(EVT, EVT, const SelectionDAG &DAG,
                               const MachineMemOperand &MMO) const final;

  bool storeOfVectorConstantIsCheap(EVT MemVT,
                                    unsigned NumElem,
                                    unsigned AS) const override;
  bool aggressivelyPreferBuildVectorSources(EVT VecVT) const override;
  bool isCheapToSpeculateCttz() const override;
  bool isCheapToSpeculateCtlz() const override;

  bool isSDNodeAlwaysUniform(const SDNode *N) const override;
  static CCAssignFn *CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg);
  static CCAssignFn *CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg);

  SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
                      const SmallVectorImpl<ISD::OutputArg> &Outs,
                      const SmallVectorImpl<SDValue> &OutVals, const SDLoc &DL,
                      SelectionDAG &DAG) const override;

  SDValue addTokenForArgument(SDValue Chain,
                              SelectionDAG &DAG,
                              MachineFrameInfo &MFI,
                              int ClobberedFI) const;

  SDValue lowerUnhandledCall(CallLoweringInfo &CLI,
                             SmallVectorImpl<SDValue> &InVals,
                             StringRef Reason) const;
  SDValue LowerCall(CallLoweringInfo &CLI,
                    SmallVectorImpl<SDValue> &InVals) const override;

  SDValue LowerDYNAMIC_STACKALLOC(SDValue Op,
                                  SelectionDAG &DAG) const;

  SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
  SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
  void ReplaceNodeResults(SDNode *N,
                          SmallVectorImpl<SDValue> &Results,
                          SelectionDAG &DAG) const override;

  SDValue combineFMinMaxLegacy(const SDLoc &DL, EVT VT, SDValue LHS,
                               SDValue RHS, SDValue True, SDValue False,
                               SDValue CC, DAGCombinerInfo &DCI) const;
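
  // A sketch of the patterns this combine can form (DAG shorthand; operand
  // order and predicate handling depend on the fast-math flags in effect):
  //   (select_cc lhs, rhs, lhs, rhs, setolt) -> (FMIN_LEGACY lhs, rhs)
  //   (select_cc lhs, rhs, lhs, rhs, setogt) -> (FMAX_LEGACY lhs, rhs)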

  const char* getTargetNodeName(unsigned Opcode) const override;

  // FIXME: Turn off MergeConsecutiveStores() before Instruction Selection for
  // AMDGPU. Commit r319036
  // (https://github.com/llvm/llvm-project/commit/db77e57ea86d941a4262ef60261692f4cb6893e6)
  // turned on MergeConsecutiveStores() before Instruction Selection for all
  // targets. Enough AMDGPU compiles go into an infinite loop
  // (MergeConsecutiveStores() merges two stores; LegalizeStoreOps() un-merges;
  // MergeConsecutiveStores() re-merges, etc.) to warrant turning it off for
  // now.
  bool mergeStoresAfterLegalization(EVT) const override { return false; }

  bool isFsqrtCheap(SDValue Operand, SelectionDAG &DAG) const override {
    return true;
  }
  SDValue getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled,
                          int &RefinementSteps, bool &UseOneConstNR,
                          bool Reciprocal) const override;
  SDValue getRecipEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled,
                           int &RefinementSteps) const override;

  virtual SDNode *PostISelFolding(MachineSDNode *N,
                                  SelectionDAG &DAG) const = 0;

  /// Determine which of the demanded bits of \p Op are known to be either
  /// zero or one and return them in \p Known.
  void computeKnownBitsForTargetNode(const SDValue Op,
                                     KnownBits &Known,
                                     const APInt &DemandedElts,
                                     const SelectionDAG &DAG,
                                     unsigned Depth = 0) const override;

  unsigned ComputeNumSignBitsForTargetNode(SDValue Op,
                                           const APInt &DemandedElts,
                                           const SelectionDAG &DAG,
                                           unsigned Depth = 0) const override;

  bool isKnownNeverNaNForTargetNode(SDValue Op,
                                    const SelectionDAG &DAG,
                                    bool SNaN = false,
                                    unsigned Depth = 0) const override;

  /// Helper function that adds \p Reg to the LiveIn list of the DAG's
  /// MachineFunction.
  ///
  /// \returns a RegisterSDNode representing \p Reg if \p RawReg is true,
  /// otherwise a copy from the register.
  SDValue CreateLiveInRegister(SelectionDAG &DAG,
                               const TargetRegisterClass *RC,
                               unsigned Reg, EVT VT,
                               const SDLoc &SL,
                               bool RawReg = false) const;
  SDValue CreateLiveInRegister(SelectionDAG &DAG,
                               const TargetRegisterClass *RC,
                               unsigned Reg, EVT VT) const {
    return CreateLiveInRegister(DAG, RC, Reg, VT, SDLoc(DAG.getEntryNode()));
  }

  // Returns the raw live-in register rather than a copy from it.
  SDValue CreateLiveInRegisterRaw(SelectionDAG &DAG,
                                  const TargetRegisterClass *RC,
                                  unsigned Reg, EVT VT) const {
    return CreateLiveInRegister(DAG, RC, Reg, VT, SDLoc(DAG.getEntryNode()),
                                true);
  }

  /// Similar to CreateLiveInRegister, except the value may be loaded from a
  /// stack slot rather than passed in a register.
  SDValue loadStackInputValue(SelectionDAG &DAG,
                              EVT VT,
                              const SDLoc &SL,
                              int64_t Offset) const;

  SDValue storeStackInputValue(SelectionDAG &DAG,
                               const SDLoc &SL,
                               SDValue Chain,
                               SDValue ArgVal,
                               int64_t Offset) const;

  SDValue loadInputValue(SelectionDAG &DAG,
                         const TargetRegisterClass *RC,
                         EVT VT, const SDLoc &SL,
                         const ArgDescriptor &Arg) const;

  enum ImplicitParameter {
    FIRST_IMPLICIT,
    GRID_DIM = FIRST_IMPLICIT,
    GRID_OFFSET,
  };

  /// Helper function that returns the byte offset of the given
  /// type of implicit parameter.
  uint32_t getImplicitParameterOffset(const MachineFunction &MF,
                                      const ImplicitParameter Param) const;

  MVT getFenceOperandTy(const DataLayout &DL) const override {
    return MVT::i32;
  }

  AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override;

  bool SelectFlatOffset(bool IsSigned, SelectionDAG &DAG, SDNode *N,
                        SDValue Addr, SDValue &VAddr, SDValue &Offset,
                        SDValue &SLC) const;
};
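
// The target-specific opcodes defined below are created and combined like any
// other SelectionDAG node. A minimal creation sketch (the operands, types, and
// value names are illustrative, not taken from the implementation):
//
//   SDValue Fract = DAG.getNode(AMDGPUISD::FRACT, SL, MVT::f32, Src);
//   SDValue Med3 = DAG.getNode(AMDGPUISD::FMED3, SL, MVT::f32, X, Y, Z);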

namespace AMDGPUISD {

enum NodeType : unsigned {
  // AMDIL ISD Opcodes
  FIRST_NUMBER = ISD::BUILTIN_OP_END,
  UMUL, // 32-bit unsigned multiplication
  BRANCH_COND,
  // End AMDIL ISD Opcodes

  // Function call.
  CALL,
  TC_RETURN,
  TRAP,

  // Masked control flow nodes.
  IF,
  ELSE,
  LOOP,

  // A uniform kernel return that terminates the wavefront.
  ENDPGM,

  // Return to a shader part's epilog code.
  RETURN_TO_EPILOG,

  // Return with values from a non-entry function.
  RET_FLAG,

  DWORDADDR,
  FRACT,

  /// CLAMP value between 0.0 and 1.0. NaN clamped to 0, following clamp output
  /// modifier behavior with dx10_enable.
  CLAMP,

  // This is SETCC with the full mask result, which is used for a compare with
  // a result bit per item in the wavefront.
  SETCC,
  SETREG,
  // FP ops with input and output chain.
  FMA_W_CHAIN,
  FMUL_W_CHAIN,

  // SIN_HW, COS_HW - f32 for SI, 1 ULP max error, valid from -100 pi to
  // 100 pi.
  // Denormals handled on some parts.
  COS_HW,
  SIN_HW,
  FMAX_LEGACY,
  FMIN_LEGACY,

  FMAX3,
  SMAX3,
  UMAX3,
  FMIN3,
  SMIN3,
  UMIN3,
  FMED3,
  SMED3,
  UMED3,
  FDOT2,
  URECIP,
  DIV_SCALE,
  DIV_FMAS,
  DIV_FIXUP,
  // For emitting ISD::FMAD when f32 denormals are enabled because mac/mad is
  // treated as an illegal operation.
  FMAD_FTZ,
  TRIG_PREOP, // 1 ULP max error for f64

  // RCP, RSQ - For f32, 1 ULP max error, no denormal handling.
  // For f64, max error 2^29 ULP, handles denormals.
  RCP,
  RSQ,
  RCP_LEGACY,
  RSQ_LEGACY,
  RCP_IFLAG,
  FMUL_LEGACY,
  RSQ_CLAMP,
  LDEXP,
  FP_CLASS,
  DOT4,
  CARRY,
  BORROW,
  BFE_U32, // Extract range of bits with zero extension to 32-bits.
  BFE_I32, // Extract range of bits with sign extension to 32-bits.
  BFI, // (src0 & src1) | (~src0 & src2)
  BFM, // Insert a range of bits into a 32-bit word.
  FFBH_U32, // ctlz with -1 if input is zero.
  FFBH_I32,
  FFBL_B32, // cttz with -1 if input is zero.
  MUL_U24,
  MUL_I24,
  MULHI_U24,
  MULHI_I24,
  MAD_U24,
  MAD_I24,
  MAD_U64_U32,
  MAD_I64_I32,
  MUL_LOHI_I24,
  MUL_LOHI_U24,
  PERM,
  TEXTURE_FETCH,
  EXPORT, // exp on SI+
  EXPORT_DONE, // exp on SI+ with done bit set
  R600_EXPORT,
  CONST_ADDRESS,
  REGISTER_LOAD,
  REGISTER_STORE,
  SAMPLE,
  SAMPLEB,
  SAMPLED,
  SAMPLEL,

  // These cvt_f32_ubyte* nodes need to remain consecutive and in order.
  CVT_F32_UBYTE0,
  CVT_F32_UBYTE1,
  CVT_F32_UBYTE2,
  CVT_F32_UBYTE3,
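  // Because these four opcodes are consecutive, a combine can pick the variant
  // for a given byte by offsetting from the first opcode. Sketch (ByteOffset
  // is an assumed name, in the range [0, 3]):
  //   unsigned Opc = AMDGPUISD::CVT_F32_UBYTE0 + ByteOffset;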

  // Convert two f32 values into a single register holding two packed f16
  // with round to zero.
  CVT_PKRTZ_F16_F32,
  CVT_PKNORM_I16_F32,
  CVT_PKNORM_U16_F32,
  CVT_PK_I16_I32,
  CVT_PK_U16_U32,

  // Same as the standard node, except the high bits of the resulting integer
  // are known 0.
  FP_TO_FP16,

  // Wrapper around fp16 results that are known to zero the high bits.
  FP16_ZEXT,

  /// This node is for VLIW targets and it is used to represent a vector
  /// that is stored in consecutive registers with the same channel.
  /// For example:
  ///   |X  |Y|Z|W|
  /// T0|v.x| | | |
  /// T1|v.y| | | |
  /// T2|v.z| | | |
  /// T3|v.w| | | |
  BUILD_VERTICAL_VECTOR,
  /// Pointer to the start of the shader's constant data.
  CONST_DATA_PTR,
  INIT_EXEC,
  INIT_EXEC_FROM_INPUT,
  SENDMSG,
  SENDMSGHALT,
  INTERP_MOV,
  INTERP_P1,
  INTERP_P2,
  INTERP_P1LL_F16,
  INTERP_P1LV_F16,
  INTERP_P2_F16,
  PC_ADD_REL_OFFSET,
  LDS,
  KILL,
  DUMMY_CHAIN,
  FIRST_MEM_OPCODE_NUMBER = ISD::FIRST_TARGET_MEMORY_OPCODE,
  LOAD_D16_HI,
  LOAD_D16_LO,
  LOAD_D16_HI_I8,
  LOAD_D16_HI_U8,
  LOAD_D16_LO_I8,
  LOAD_D16_LO_U8,

  STORE_MSKOR,
  LOAD_CONSTANT,
  TBUFFER_STORE_FORMAT,
  TBUFFER_STORE_FORMAT_D16,
  TBUFFER_LOAD_FORMAT,
  TBUFFER_LOAD_FORMAT_D16,
  DS_ORDERED_COUNT,
  ATOMIC_CMP_SWAP,
  ATOMIC_INC,
  ATOMIC_DEC,
  ATOMIC_LOAD_FMIN,
  ATOMIC_LOAD_FMAX,
  BUFFER_LOAD,
  BUFFER_LOAD_UBYTE,
  BUFFER_LOAD_USHORT,
  BUFFER_LOAD_BYTE,
  BUFFER_LOAD_SHORT,
  BUFFER_LOAD_FORMAT,
  BUFFER_LOAD_FORMAT_D16,
  SBUFFER_LOAD,
  BUFFER_STORE,
  BUFFER_STORE_BYTE,
  BUFFER_STORE_SHORT,
  BUFFER_STORE_FORMAT,
  BUFFER_STORE_FORMAT_D16,
  BUFFER_ATOMIC_SWAP,
  BUFFER_ATOMIC_ADD,
  BUFFER_ATOMIC_SUB,
  BUFFER_ATOMIC_SMIN,
  BUFFER_ATOMIC_UMIN,
  BUFFER_ATOMIC_SMAX,
  BUFFER_ATOMIC_UMAX,
  BUFFER_ATOMIC_AND,
  BUFFER_ATOMIC_OR,
  BUFFER_ATOMIC_XOR,
  BUFFER_ATOMIC_CMPSWAP,
  BUFFER_ATOMIC_FADD,
  BUFFER_ATOMIC_PK_FADD,
  ATOMIC_FADD,
  ATOMIC_PK_FADD,

  LAST_AMDGPU_ISD_NUMBER
};

} // End namespace AMDGPUISD

} // End namespace llvm

#endif