Lines Matching +full:abs +full:- +full:flat

1 //===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===//
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
12 //===----------------------------------------------------------------------===//
49 #define DEBUG_TYPE "si-lower"
54 "amdgpu-disable-loop-alignment",
59 "amdgpu-use-divergent-register-indexing",
66 return Info->getMode().FP32Denormals == DenormalMode::getPreserveSign(); in denormalModeIsFlushAllF32()
71 return Info->getMode().FP64FP16Denormals == DenormalMode::getPreserveSign(); in denormalModeIsFlushAllF64F16()
97 const TargetRegisterClass *V64RegClass = TRI->getVGPR64Class(); in SITargetLowering()
104 addRegisterClass(MVT::v3f32, TRI->getVGPRClassForBitWidth(96)); in SITargetLowering()
110 addRegisterClass(MVT::v4f32, TRI->getVGPRClassForBitWidth(128)); in SITargetLowering()
113 addRegisterClass(MVT::v5f32, TRI->getVGPRClassForBitWidth(160)); in SITargetLowering()
116 addRegisterClass(MVT::v6f32, TRI->getVGPRClassForBitWidth(192)); in SITargetLowering()
119 addRegisterClass(MVT::v3f64, TRI->getVGPRClassForBitWidth(192)); in SITargetLowering()
122 addRegisterClass(MVT::v7f32, TRI->getVGPRClassForBitWidth(224)); in SITargetLowering()
125 addRegisterClass(MVT::v8f32, TRI->getVGPRClassForBitWidth(256)); in SITargetLowering()
128 addRegisterClass(MVT::v4f64, TRI->getVGPRClassForBitWidth(256)); in SITargetLowering()
131 addRegisterClass(MVT::v9f32, TRI->getVGPRClassForBitWidth(288)); in SITargetLowering()
134 addRegisterClass(MVT::v10f32, TRI->getVGPRClassForBitWidth(320)); in SITargetLowering()
137 addRegisterClass(MVT::v11f32, TRI->getVGPRClassForBitWidth(352)); in SITargetLowering()
140 addRegisterClass(MVT::v12f32, TRI->getVGPRClassForBitWidth(384)); in SITargetLowering()
143 addRegisterClass(MVT::v16f32, TRI->getVGPRClassForBitWidth(512)); in SITargetLowering()
146 addRegisterClass(MVT::v8f64, TRI->getVGPRClassForBitWidth(512)); in SITargetLowering()
149 addRegisterClass(MVT::v16f64, TRI->getVGPRClassForBitWidth(1024)); in SITargetLowering()
151 if (Subtarget->has16BitInsts()) { in SITargetLowering()
152 if (Subtarget->useRealTrue16Insts()) { in SITargetLowering()
181 addRegisterClass(MVT::v32f32, TRI->getVGPRClassForBitWidth(1024)); in SITargetLowering()
183 computeRegisterProperties(Subtarget->getRegisterInfo()); in SITargetLowering()
186 // really produce a 1-bit result. Any copy/extend from these will turn into a in SITargetLowering()
187 // select, and zext/1 or sext/-1 are equally cheap. Arbitrarily choose 0/1, as in SITargetLowering()
347 // TODO: For dynamic 64-bit vector inserts/extracts, should emit a pseudo that in SITargetLowering()
350 // Most operations are naturally 32-bit vector operations. We only support in SITargetLowering()
463 // FIXME: Should lower sub-i32 bswaps to bit-ops without v_perm_b32. in SITargetLowering()
469 if (Subtarget->hasSMemRealTime() || in SITargetLowering()
470 Subtarget->getGeneration() >= AMDGPUSubtarget::GFX11) in SITargetLowering()
474 if (Subtarget->has16BitInsts()) { in SITargetLowering()
481 if (Subtarget->hasMadMacF32Insts()) in SITargetLowering()
484 if (!Subtarget->hasBFI()) in SITargetLowering()
488 if (!Subtarget->hasBCNT(32)) in SITargetLowering()
491 if (!Subtarget->hasBCNT(64)) in SITargetLowering()
494 if (Subtarget->hasFFBH()) in SITargetLowering()
497 if (Subtarget->hasFFBL()) in SITargetLowering()
500 // We only really have 32-bit BFE instructions (and 16-bit on VI). in SITargetLowering()
502 // On SI+ there are 64-bit BFEs, but they are scalar only and there isn't any in SITargetLowering()
505 // have some pass reduce 64-bit extracts to 32-bit if possible. Extracts that in SITargetLowering()
508 if (Subtarget->hasBFE()) in SITargetLowering()
512 if (Subtarget->hasIntClamp()) in SITargetLowering()
515 if (Subtarget->hasAddNoCarry()) in SITargetLowering()
528 if (Subtarget->haveRoundOpsF64()) in SITargetLowering()
551 if (Subtarget->has16BitInsts()) { in SITargetLowering()
582 // F16 - Constant Actions. in SITargetLowering()
586 // F16 - Load/Store Actions. in SITargetLowering()
592 // BF16 - Load/Store Actions. in SITargetLowering()
598 // F16 - VOP1 Actions. in SITargetLowering()
606 // F16 - VOP2 Actions. in SITargetLowering()
613 // F16 - VOP3 Actions. in SITargetLowering()
650 // XXX - Do these do anything? Vector constants turn into build_vector. in SITargetLowering()
745 Subtarget->hasVOP3PInsts() ? Legal : Custom); in SITargetLowering()
773 if (Subtarget->hasVOP3PInsts()) { in SITargetLowering()
794 ISD::MUL, ISD::ABS, ISD::SMIN, ISD::SMAX, ISD::UMIN, in SITargetLowering()
811 if (Subtarget->hasPackedFP32Ops()) { in SITargetLowering()
822 if (Subtarget->has16BitInsts()) { in SITargetLowering()
843 if (Subtarget->hasScalarSMulU64()) in SITargetLowering()
846 if (Subtarget->hasMad64_32()) in SITargetLowering()
849 if (Subtarget->hasPrefetch()) in SITargetLowering()
852 if (Subtarget->hasIEEEMinMax()) { in SITargetLowering()
925 if (Subtarget->has16BitInsts() && !Subtarget->hasMed3_16()) in SITargetLowering()
955 // FIXME: In other contexts we pretend this is a per-function property. in SITargetLowering()
970 //===----------------------------------------------------------------------===//
972 //===----------------------------------------------------------------------===//
980 return ((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) || in isFPExtFoldable()
981 (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) && in isFPExtFoldable()
990 return ((Opcode == TargetOpcode::G_FMAD && Subtarget->hasMadMixInsts()) || in isFPExtFoldable()
991 (Opcode == TargetOpcode::G_FMA && Subtarget->hasFmaMixInsts())) && in isFPExtFoldable()
1014 if (Subtarget->has16BitInsts()) { in getRegisterTypeForCallingConv()
1023 return Subtarget->has16BitInsts() ? MVT::i16 : MVT::i32; in getRegisterTypeForCallingConv()
1044 // FIXME: Should probably promote 8-bit vectors to i16. in getNumRegistersForCallingConv()
1045 if (Size == 16 && Subtarget->has16BitInsts()) in getNumRegistersForCallingConv()
1067 // FIXME: We should fix the ABI to be the same on targets without 16-bit in getVectorTypeBreakdownForCallingConv()
1068 // support, but unless we can properly handle 3-vectors, it will still be in getVectorTypeBreakdownForCallingConv()
1070 if (Size == 16 && Subtarget->has16BitInsts()) { in getVectorTypeBreakdownForCallingConv()
1089 if (Size < 16 && Subtarget->has16BitInsts()) { in getVectorTypeBreakdownForCallingConv()
1122 LLVMContext &Ctx = Ty->getContext(); in memVTFromLoadIntrData()
1124 unsigned NumElts = std::min(MaxNumLanes, VT->getNumElements()); in memVTFromLoadIntrData()
1125 return EVT::getVectorVT(Ctx, TLI.getValueType(DL, VT->getElementType()), in memVTFromLoadIntrData()
1141 assert(ST->getNumContainedTypes() == 2 && in memVTFromLoadIntrReturn()
1142 ST->getContainedType(1)->isIntegerTy(32)); in memVTFromLoadIntrReturn()
1143 return memVTFromLoadIntrData(TLI, DL, ST->getContainedType(0), MaxNumLanes); in memVTFromLoadIntrReturn()
1146 /// Map address space 7 to MVT::v5i32 because that's its in-memory
1147 /// representation. This return value is vector-typed because there is no
1151 /// to allow pre-codegen passes that query TargetTransformInfo, often for cost
1161 /// Similarly, the in-memory representation of a p7 is {p8, i32}, aka
1163 /// The in-memory representation of a p9 is {p8, i32, i32}, which is
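The comments above describe how the buffer pointer address spaces are laid out in memory: p8 is the raw 128-bit buffer resource, p7 ({p8, i32}) adds a 32-bit offset and is therefore modeled as v5i32, and p9 ({p8, i32, i32}) adds one more word. As a rough host-side illustration only (the struct and field names below are invented, not LLVM types), the sizes work out like this:

    #include <cstdint>

    // Hypothetical mirror of the layouts described above.
    struct BufferRsrc       { uint32_t words[4]; };                        // p8: 128-bit buffer resource
    struct BufferFatPtr     { BufferRsrc rsrc; uint32_t offset; };         // p7: {p8, i32} -> five dwords (v5i32)
    struct BufferStridedPtr { BufferRsrc rsrc; uint32_t offset, index; };  // p9: {p8, i32, i32}

    static_assert(sizeof(BufferFatPtr) == 20, "p7 occupies five 32-bit words");
    static_assert(sizeof(BufferStridedPtr) == 24, "p9 occupies six 32-bit words");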
1194 if (RsrcIntr->IsImage) { in getTgtMemIntrinsic()
1197 BaseOpcode = AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode); in getTgtMemIntrinsic()
1201 Value *RsrcArg = CI.getArgOperand(RsrcIntr->RsrcArg); in getTgtMemIntrinsic()
1202 if (auto *RsrcPtrTy = dyn_cast<PointerType>(RsrcArg->getType())) { in getTgtMemIntrinsic()
1203 if (RsrcPtrTy->getAddressSpace() == AMDGPUAS::BUFFER_RESOURCE) in getTgtMemIntrinsic()
1212 auto *Aux = cast<ConstantInt>(CI.getArgOperand(CI.arg_size() - 1)); in getTgtMemIntrinsic()
1213 if (Aux->getZExtValue() & AMDGPU::CPol::VOLATILE) in getTgtMemIntrinsic()
1217 if (RsrcIntr->IsImage) { in getTgtMemIntrinsic()
1220 if (!BaseOpcode->Gather4) { in getTgtMemIntrinsic()
1224 = cast<ConstantInt>(CI.getArgOperand(0))->getZExtValue(); in getTgtMemIntrinsic()
1242 Type *DataTy = CI.getArgOperand(0)->getType(); in getTgtMemIntrinsic()
1243 if (RsrcIntr->IsImage) { in getTgtMemIntrinsic()
1244 unsigned DMask = cast<ConstantInt>(CI.getArgOperand(1))->getZExtValue(); in getTgtMemIntrinsic()
1254 Info.opc = CI.getType()->isVoidTy() ? ISD::INTRINSIC_VOID : in getTgtMemIntrinsic()
1262 if (RsrcIntr->IsImage && BaseOpcode->NoReturn) { in getTgtMemIntrinsic()
1266 // XXX - Should this be volatile without known ordering? in getTgtMemIntrinsic()
1268 Info.memVT = MVT::getVT(CI.getArgOperand(0)->getType()); in getTgtMemIntrinsic()
1275 unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue(); in getTgtMemIntrinsic()
1303 if (!Vol->isZero()) in getTgtMemIntrinsic()
1311 Info.memVT = MVT::getVT(CI.getOperand(0)->getType()); in getTgtMemIntrinsic()
1326 if (!Vol->isZero()) in getTgtMemIntrinsic()
1396 Info.ptrVal = MFI->getGWSPSV(TM); in getTgtMemIntrinsic()
1411 unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue(); in getTgtMemIntrinsic()
1424 Info.ptrVal = MFI->getGWSPSV(TM); in getTgtMemIntrinsic()
1445 unsigned SrcAS = I.getOperand(0)->getType()->getPointerAddressSpace(); in CollectTargetIntrinsicOperands()
1446 unsigned DstAS = I.getType()->getPointerAddressSpace(); in CollectTargetIntrinsicOperands()
1460 switch (II->getIntrinsicID()) { in getAddrModeArguments()
1482 Ptr = II->getArgOperand(0); in getAddrModeArguments()
1485 Ptr = II->getArgOperand(1); in getAddrModeArguments()
1490 AccessTy = II->getType(); in getAddrModeArguments()
1497 if (!Subtarget->hasFlatInstOffsets()) { in isLegalFlatAddressingMode()
1498 // Flat instructions do not have offsets, and only have the register in isLegalFlatAddressingMode()
1503 decltype(SIInstrFlags::FLAT) FlatVariant = in isLegalFlatAddressingMode()
1506 : SIInstrFlags::FLAT; in isLegalFlatAddressingMode()
1509 (AM.BaseOffs == 0 || Subtarget->getInstrInfo()->isLegalFLATOffset( in isLegalFlatAddressingMode()
1514 if (Subtarget->hasFlatGlobalInsts()) in isLegalGlobalAddressingMode()
1517 if (!Subtarget->hasAddr64() || Subtarget->useFlatForGlobal()) { in isLegalGlobalAddressingMode()
1518 // Assume that we will use FLAT for all global memory accesses in isLegalGlobalAddressingMode()
1534 // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and in isLegalMUBUFAddressingMode()
1535 // additionally can do r + r + i with addr64. 32-bit has more addressing in isLegalMUBUFAddressingMode()
1543 const SIInstrInfo *TII = Subtarget->getInstrInfo(); in isLegalMUBUFAddressingMode()
1544 if (!TII->isLegalMUBUFImmOffset(AM.BaseOffs)) in isLegalMUBUFAddressingMode()
1589 if (!Subtarget->hasScalarSubwordLoads()) { in isLegalAddressingMode()
1594 if (Ty->isSized() && DL.getTypeStoreSize(Ty) < 4) in isLegalAddressingMode()
1598 if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) { in isLegalAddressingMode()
1599 // SMRD instructions have an 8-bit, dword offset on SI. in isLegalAddressingMode()
1602 } else if (Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) { in isLegalAddressingMode()
1603 // On CI+, this can also be a 32-bit literal constant offset. If it fits in isLegalAddressingMode()
1604 // in 8-bits, it can use a smaller encoding. in isLegalAddressingMode()
1607 } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX9) { in isLegalAddressingMode()
1608 // On VI, these use the SMEM format and the offset is 20-bit in bytes. in isLegalAddressingMode()
1611 } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX12) { in isLegalAddressingMode()
1612 // On GFX9 the offset is signed 21-bit in bytes (but must not be negative in isLegalAddressingMode()
1617 // On GFX12, all offsets are signed 24-bit in bytes. in isLegalAddressingMode()
1625 // Scalar (non-buffer) loads can only use a negative offset if in isLegalAddressingMode()
1626 // soffset+offset is non-negative. Since the compiler can only prove that in isLegalAddressingMode()
1642 return Subtarget->enableFlatScratch() in isLegalAddressingMode()
1647 (AS == AMDGPUAS::REGION_ADDRESS && Subtarget->hasGDS())) { in isLegalAddressingMode()
1648 // Basic, single offset DS instructions allow a 16-bit unsigned immediate in isLegalAddressingMode()
1650 // XXX - If doing a 4-byte aligned 8-byte type access, we effectively have in isLegalAddressingMode()
1651 // an 8-bit dword offset but we don't know the alignment here. in isLegalAddressingMode()
1668 // addressing modes, so treat them as having no offset like flat in isLegalAddressingMode()
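The isLegalAddressingMode() hits above enumerate, per GPU generation, how wide the scalar-memory (SMRD/SMEM) immediate offset is. A minimal sketch of just the immediate-offset widths quoted in those comments follows; the enum and function names are invented, and the real code also checks dword scaling, register operands, and the soffset+offset non-negativity rule mentioned above, none of which are modeled here.

    #include <cstdint>

    enum class Gen { SouthernIslands, SeaIslands, VolcanicIslands, GFX9, GFX12 };

    // Rough per-generation model of the SMEM immediate-offset range.
    bool isLegalSMEMImmOffset(Gen G, int64_t ByteOffset) {
      switch (G) {
      case Gen::SouthernIslands:   // SI: 8-bit, dword-scaled offset
        return ByteOffset >= 0 && ByteOffset % 4 == 0 && (ByteOffset / 4) < (1 << 8);
      case Gen::SeaIslands:        // CI: also accepts a 32-bit literal byte offset
        return ByteOffset >= 0 && ByteOffset <= int64_t(UINT32_MAX);
      case Gen::VolcanicIslands:   // VI: SMEM format, 20-bit byte offset
        return ByteOffset >= 0 && ByteOffset < (1 << 20);
      case Gen::GFX9:              // GFX9+: signed 21-bit byte offset (negative offsets restricted, see above)
        return ByteOffset >= -(1 << 20) && ByteOffset < (1 << 20);
      case Gen::GFX12:             // GFX12: signed 24-bit byte offset
        return ByteOffset >= -(1 << 23) && ByteOffset < (1 << 23);
      }
      return false;
    }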
1682 unsigned MaxPrivateBits = 8 * getSubtarget()->getMaxPrivateElementSize(); in canMergeStoresTo()
1700 if (!Subtarget->hasUnalignedDSAccessEnabled() && Alignment < Align(4)) in allowsMisalignedMemoryAccessesImpl()
1704 if (Subtarget->hasLDSMisalignedBug() && Size > 32 && in allowsMisalignedMemoryAccessesImpl()
1717 // out-of-bounds even if base + offsets are in bounds. Split vectorized in allowsMisalignedMemoryAccessesImpl()
1718 // loads here to avoid emitting ds_read2_b32. We may re-combine the in allowsMisalignedMemoryAccessesImpl()
1720 if (!Subtarget->hasUsableDSOffset() && Alignment < Align(8)) in allowsMisalignedMemoryAccessesImpl()
1723 // 8 byte accessing via ds_read/write_b64 require 8-byte alignment, but we in allowsMisalignedMemoryAccessesImpl()
1728 if (Subtarget->hasUnalignedDSAccessEnabled()) { in allowsMisalignedMemoryAccessesImpl()
1737 // operates with a speed comparable to N-bit wide load". With the full in allowsMisalignedMemoryAccessesImpl()
1752 if (!Subtarget->hasDS96AndDS128()) in allowsMisalignedMemoryAccessesImpl()
1755 // 12 byte accessing via ds_read/write_b96 require 16-byte alignment on in allowsMisalignedMemoryAccessesImpl()
1758 if (Subtarget->hasUnalignedDSAccessEnabled()) { in allowsMisalignedMemoryAccessesImpl()
1775 if (!Subtarget->hasDS96AndDS128() || !Subtarget->useDS128()) in allowsMisalignedMemoryAccessesImpl()
1778 // 16 byte accessing via ds_read/write_b128 require 16-byte alignment on in allowsMisalignedMemoryAccessesImpl()
1783 if (Subtarget->hasUnalignedDSAccessEnabled()) { in allowsMisalignedMemoryAccessesImpl()
1807 // Note that we have a single-dword or sub-dword here, so if underaligned in allowsMisalignedMemoryAccessesImpl()
1813 Subtarget->hasUnalignedDSAccessEnabled(); in allowsMisalignedMemoryAccessesImpl()
1822 Subtarget->enableFlatScratch() || in allowsMisalignedMemoryAccessesImpl()
1823 Subtarget->hasUnalignedScratchAccess(); in allowsMisalignedMemoryAccessesImpl()
1826 // FIXME: We have to be conservative here and assume that flat operations in allowsMisalignedMemoryAccessesImpl()
1830 !Subtarget->hasUnalignedScratchAccess()) { in allowsMisalignedMemoryAccessesImpl()
1839 // than multiple smaller memory ops -- even when misaligned in allowsMisalignedMemoryAccessesImpl()
1845 Subtarget->hasUnalignedBufferAccessEnabled(); in allowsMisalignedMemoryAccessesImpl()
1852 // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the in allowsMisalignedMemoryAccessesImpl()
1853 // byte-address are ignored, thus forcing Dword alignment. in allowsMisalignedMemoryAccessesImpl()
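The allowsMisalignedMemoryAccessesImpl() hits above lay out the DS (LDS) alignment rules: 8-byte ds_read/write_b64 wants 8-byte alignment, the 12- and 16-byte b96/b128 forms want 16-byte alignment on the affected chips, and all of this is waived when unaligned DS access is enabled. A hedged model of just those checks (the function name is invented; the real code also weighs performance, subtarget bugs, and the split-into-dwords fallbacks mentioned above, which are omitted):

    // Illustrative only; not the backend's actual decision logic.
    bool dsAccessNaturallyAllowed(unsigned SizeInBytes, unsigned AlignInBytes,
                                  bool UnalignedDSAccessEnabled) {
      if (UnalignedDSAccessEnabled)
        return true;                        // hardware tolerates unaligned LDS access
      switch (SizeInBytes) {
      case 8:  return AlignInBytes >= 8;    // ds_read/write_b64
      case 12: return AlignInBytes >= 16;   // ds_read/write_b96 (on the chips noted above)
      case 16: return AlignInBytes >= 16;   // ds_read/write_b128
      default: return AlignInBytes >= 4;    // single-dword or sub-dword access
      }
    }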
1873 // use. Make sure we switch these to 64-bit accesses. in getOptimalMemOpType()
1888 return MemNode->getMemOperand()->getFlags() & MONoClobber; in isMemOpHasNoClobberedMemOperand()
1898 // Flat -> private/local is a simple truncate. in isFreeAddrSpaceCast()
1899 // Flat -> global is no-op in isFreeAddrSpaceCast()
1911 return AMDGPUInstrInfo::isUniformMMO(MemNode->getMemOperand()); in isMemOpUniform()
1938 if (Subtarget->has16BitInsts() && VT == MVT::i16) { in isTypeDesirableForOp()
1943 // These operations are done with 32-bit instructions anyway. in isTypeDesirableForOp()
1977 Info->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); in lowerKernArgParameterPtr()
1986 MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT); in lowerKernArgParameterPtr()
2024 if (Arg && (Arg->Flags.isSExt() || Arg->Flags.isZExt()) && in convertArgType()
2026 unsigned Opc = Arg->Flags.isZExt() ? ISD::AssertZext : ISD::AssertSext; in convertArgType()
2052 int64_t OffsetDiff = Offset - AlignDownOffset; in lowerKernargMemParameter()
2151 if (Subtarget->hasArchitectedSGPRs() && in getPreloadedValue()
2184 // It's undefined behavior if a function marked with the amdgpu-no-* in getPreloadedValue()
2200 assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) && in processPSInputArgs()
2205 !Arg->Flags.isInReg() && PSInputNum <= 15) { in processPSInputArgs()
2206 bool SkipArg = !Arg->Used && !Info->isPSInputAllocated(PSInputNum); in processPSInputArgs()
2211 if (Arg->Flags.isSplit()) { in processPSInputArgs()
2212 while (!Arg->Flags.isSplitEnd()) { in processPSInputArgs()
2213 assert((!Arg->VT.isVector() || in processPSInputArgs()
2214 Arg->VT.getScalarSizeInBits() == 16) && in processPSInputArgs()
2224 Skipped.set(Arg->getOrigArgIndex()); in processPSInputArgs()
2229 Info->markPSInputAllocated(PSInputNum); in processPSInputArgs()
2230 if (Arg->Used) in processPSInputArgs()
2231 Info->markPSInputEnabled(PSInputNum); in processPSInputArgs()
2253 unsigned Mask = (Subtarget->hasPackedTID() && in allocateSpecialEntryInputVGPRs()
2260 if (Subtarget->hasPackedTID()) { in allocateSpecialEntryInputVGPRs()
2274 if (Subtarget->hasPackedTID()) { in allocateSpecialEntryInputVGPRs()
2318 ArrayRef<MCPhysReg> ArgSGPRs = ArrayRef(RC->begin(), 32); in allocateSGPR32InputImpl()
2421 // flat_scratch_init is not applicable for non-kernel functions. in allocateSpecialInputSGPRs()
2484 if (UserSGPRInfo.hasFlatScratchInit() && !getSubtarget()->isAmdPalOS()) { in allocateHSAUserSGPRs()
2500 // Allocate pre-loaded kernel arguments. Arguments to be preloaded must be
2517 // Don't preload non-original args or parts not in the current preload in allocatePreloadKernArgSGPRs()
2537 Info.getArgInfo().PreloadKernArgs[InIdx - 1].Regs[0]); in allocatePreloadKernArgSGPRs()
2541 unsigned Padding = ArgOffset - LastExplicitArgOffset; in allocatePreloadKernArgSGPRs()
2556 if (PreloadRegs->size() > 1) in allocatePreloadKernArgSGPRs()
2580 // Allocate special input registers that are initialized per-wave.
2586 bool HasArchitectedSGPRs = Subtarget->hasArchitectedSGPRs(); in allocateSystemSGPRs()
2587 if (Subtarget->hasUserSGPRInit16Bug() && !IsShader) { in allocateSystemSGPRs()
2588 // Note: user SGPRs are handled by the front-end for graphics shaders in allocateSystemSGPRs()
2657 assert(!Subtarget->hasUserSGPRInit16Bug() || IsShader || in allocateSystemSGPRs()
2671 // Record that we know we have non-spill stack objects so we don't need to in reservePrivateMemoryRegs()
2712 // whereas non-entry functions get this "for free". This means there is no in reservePrivateMemoryRegs()
2746 if (ST.getFrameLowering()->hasFP(MF)) { in reservePrivateMemoryRegs()
2752 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>(); in supportSplitCSR()
2753 return !Info->isEntryFunction(); in supportSplitCSR()
2763 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo(); in insertCopiesSplitCSR()
2765 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent()); in insertCopiesSplitCSR()
2769 const TargetInstrInfo *TII = Subtarget->getInstrInfo(); in insertCopiesSplitCSR()
2770 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo(); in insertCopiesSplitCSR()
2771 MachineBasicBlock::iterator MBBI = Entry->begin(); in insertCopiesSplitCSR()
2781 Register NewVR = MRI->createVirtualRegister(RC); in insertCopiesSplitCSR()
2783 Entry->addLiveIn(*I); in insertCopiesSplitCSR()
2784 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR) in insertCopiesSplitCSR()
2787 // Insert the copy-back instructions right before the terminator. in insertCopiesSplitCSR()
2789 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(), in insertCopiesSplitCSR()
2790 TII->get(TargetOpcode::COPY), *I) in insertCopiesSplitCSR()
2799 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo(); in LowerFormalArguments()
2806 if (Subtarget->isAmdHsaOS() && AMDGPU::isGraphics(CallConv)) { in LowerFormalArguments()
2808 Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc()); in LowerFormalArguments()
2809 DAG.getContext()->diagnose(NoGraphicsHSA); in LowerFormalArguments()
2824 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info->getUserSGPRInfo(); in LowerFormalArguments()
2826 !UserSGPRInfo.hasKernargSegmentPtr() && !Info->hasWorkGroupInfo() && in LowerFormalArguments()
2827 !Info->hasLDSKernelId() && !Info->hasWorkItemIDX() && in LowerFormalArguments()
2828 !Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ()); in LowerFormalArguments()
2830 if (!Subtarget->enableFlatScratch()) in LowerFormalArguments()
2834 !Subtarget->hasArchitectedSGPRs()) in LowerFormalArguments()
2835 assert(!Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() && in LowerFormalArguments()
2836 !Info->hasWorkGroupIDZ()); in LowerFormalArguments()
2847 // based on run-time states. Since we can't know what the final PSInputEna in LowerFormalArguments()
2852 // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled. in LowerFormalArguments()
2853 // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be in LowerFormalArguments()
2855 if ((Info->getPSInputAddr() & 0x7F) == 0 || in LowerFormalArguments()
2856 ((Info->getPSInputAddr() & 0xF) == 0 && Info->isPSInputAllocated(11))) { in LowerFormalArguments()
2859 Info->markPSInputAllocated(0); in LowerFormalArguments()
2860 Info->markPSInputEnabled(0); in LowerFormalArguments()
2862 if (Subtarget->isAmdPalOS()) { in LowerFormalArguments()
2864 // based on run-time states; the register values being generated here are in LowerFormalArguments()
2871 unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable(); in LowerFormalArguments()
2874 Info->markPSInputEnabled(llvm::countr_zero(Info->getPSInputAddr())); in LowerFormalArguments()
2877 assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX()); in LowerFormalArguments()
2888 if (IsKernel && Subtarget->hasKernargPreload()) in LowerFormalArguments()
2897 if (!Subtarget->enableFlatScratch()) in LowerFormalArguments()
2898 CCInfo.AllocateReg(Info->getScratchRSrcReg()); in LowerFormalArguments()
2913 // FIXME: Alignment of explicit arguments totally broken with non-0 explicit in LowerFormalArguments()
2950 if (Arg.isOrigArg() && Info->getArgInfo().PreloadKernArgs.count(i)) { in LowerFormalArguments()
2954 int64_t OffsetDiff = Offset - AlignDownOffset; in LowerFormalArguments()
2961 Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs[0]; in LowerFormalArguments()
2981 Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs; in LowerFormalArguments()
2990 TRI->getRegSizeInBits(*RC))); in LowerFormalArguments()
3009 // If the argument was preloaded to multiple consecutive 32-bit in LowerFormalArguments()
3013 // MemVT and just do a bitcast. If MemVT is less than 32-bits we add a in LowerFormalArguments()
3034 dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex())); in LowerFormalArguments()
3035 if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS && in LowerFormalArguments()
3036 ParamTy && (ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS || in LowerFormalArguments()
3037 ParamTy->getAddressSpace() == AMDGPUAS::REGION_ADDRESS)) { in LowerFormalArguments()
3039 // less than 16-bits. On CI and newer they could potentially be in LowerFormalArguments()
3078 = 32 - getSubtarget()->getKnownHighZeroBitsForFrameIndex(); in LowerFormalArguments()
3083 // If this is an 8 or 16-bit value, it is really passed promoted in LowerFormalArguments()
3119 auto &ArgUsageInfo = DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>(); in LowerFormalArguments()
3120 ArgUsageInfo.setFuncArgInfo(Fn, Info->getArgInfo()); in LowerFormalArguments()
3124 Info->setBytesInStackArgArea(StackArgSize); in LowerFormalArguments()
3149 unsigned MaxNumVGPRs = Subtarget->getMaxNumVGPRs(MF); in CanLowerReturn()
3174 Info->setIfReturnsVoid(Outs.empty()); in LowerReturn()
3175 bool IsWaveEnd = Info->returnsVoid() && IsShader; in LowerReturn()
3177 // CCValAssign - represent the assignment of the return value to a location. in LowerReturn()
3181 // CCState - Info about the registers and stack slots. in LowerReturn()
3226 if (!Info->isEntryFunction()) { in LowerReturn()
3227 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo(); in LowerReturn()
3229 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction()); in LowerReturn()
3326 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo(); in passSpecialInputs()
3331 if (const Function *CalleeFunc = CLI.CB->getCalledFunction()) { in passSpecialInputs()
3336 DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>(); in passSpecialInputs()
3346 {AMDGPUFunctionArgInfo::DISPATCH_PTR, "amdgpu-no-dispatch-ptr"}, in passSpecialInputs()
3347 {AMDGPUFunctionArgInfo::QUEUE_PTR, "amdgpu-no-queue-ptr" }, in passSpecialInputs()
3348 {AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR, "amdgpu-no-implicitarg-ptr"}, in passSpecialInputs()
3349 {AMDGPUFunctionArgInfo::DISPATCH_ID, "amdgpu-no-dispatch-id"}, in passSpecialInputs()
3350 {AMDGPUFunctionArgInfo::WORKGROUP_ID_X, "amdgpu-no-workgroup-id-x"}, in passSpecialInputs()
3351 {AMDGPUFunctionArgInfo::WORKGROUP_ID_Y, "amdgpu-no-workgroup-id-y"}, in passSpecialInputs()
3352 {AMDGPUFunctionArgInfo::WORKGROUP_ID_Z, "amdgpu-no-workgroup-id-z"}, in passSpecialInputs()
3353 {AMDGPUFunctionArgInfo::LDS_KERNEL_ID, "amdgpu-no-lds-kernel-id"}, in passSpecialInputs()
3364 if (CLI.CB->hasFnAttr(Attr.second)) in passSpecialInputs()
3368 CalleeArgInfo->getPreloadedValue(InputID); in passSpecialInputs()
3380 EVT ArgVT = TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32; in passSpecialInputs()
3403 if (OutgoingArg->isRegister()) { in passSpecialInputs()
3404 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg); in passSpecialInputs()
3405 if (!CCInfo.AllocateReg(OutgoingArg->getRegister())) in passSpecialInputs()
3423 CalleeArgInfo->getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_X); in passSpecialInputs()
3426 CalleeArgInfo->getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Y); in passSpecialInputs()
3429 CalleeArgInfo->getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Z); in passSpecialInputs()
3443 const bool NeedWorkItemIDX = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-x"); in passSpecialInputs()
3444 const bool NeedWorkItemIDY = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-y"); in passSpecialInputs()
3445 const bool NeedWorkItemIDZ = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-z"); in passSpecialInputs()
3448 if (IncomingArgX && !IncomingArgX->isMasked() && CalleeArgInfo->WorkItemIDX && in passSpecialInputs()
3450 if (Subtarget->getMaxWorkitemID(F, 0) != 0) { in passSpecialInputs()
3457 if (IncomingArgY && !IncomingArgY->isMasked() && CalleeArgInfo->WorkItemIDY && in passSpecialInputs()
3458 NeedWorkItemIDY && Subtarget->getMaxWorkitemID(F, 1) != 0) { in passSpecialInputs()
3466 if (IncomingArgZ && !IncomingArgZ->isMasked() && CalleeArgInfo->WorkItemIDZ && in passSpecialInputs()
3467 NeedWorkItemIDZ && Subtarget->getMaxWorkitemID(F, 2) != 0) { in passSpecialInputs()
3493 if (OutgoingArg->isRegister()) { in passSpecialInputs()
3495 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg); in passSpecialInputs()
3497 CCInfo.AllocateReg(OutgoingArg->getRegister()); in passSpecialInputs()
3536 if (Callee->isDivergent()) in isEligibleForTailCallOptimization()
3542 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo(); in isEligibleForTailCallOptimization()
3543 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC); in isEligibleForTailCallOptimization()
3577 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC); in isEligibleForTailCallOptimization()
3578 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved)) in isEligibleForTailCallOptimization()
3595 if (CCInfo.getStackSize() > FuncInfo->getBytesInStackArgArea()) in isEligibleForTailCallOptimization()
3603 if (!CI->isTailCall()) in mayBeEmittedAsTailCall()
3606 const Function *ParentFn = CI->getParent()->getParent(); in mayBeEmittedAsTailCall()
3607 if (AMDGPU::isEntryFunctionCC(ParentFn->getCallingConv())) in mayBeEmittedAsTailCall()
3628 if (!RequestedExec.Ty->isIntegerTy(Subtarget->getWavefrontSize())) in LowerCall()
3635 if (RequestedExec.Ty->isIntegerTy(64)) { in LowerCall()
3682 ((CLI.CB && CLI.CB->isMustTailCall()) || IsChainCallConv)) { in LowerCall()
3727 // arguments to begin at SP+0. Completely unused for non-tail calls. in LowerCall()
3737 if (!Subtarget->enableFlatScratch()) { in LowerCall()
3742 = DAG.getCopyFromReg(Chain, DL, Info->getScratchRSrcReg(), MVT::v4i32); in LowerCall()
3805 : commonAlignment(Subtarget->getStackAlignment(), Offset); in LowerCall()
3823 SDValue SP = DAG.getCopyFromReg(Chain, DL, Info->getStackPtrOffsetReg(), in LowerCall()
3828 commonAlignment(Subtarget->getStackAlignment(), LocMemOffset); in LowerCall()
3853 // Build a sequence of copy-to-reg nodes chained together with token chain in LowerCall()
3863 // We don't usually want to end the call-sequence here because we would tidy in LowerCall()
3864 // the frame up *after* the call, however in the ABI-changing tail-call case in LowerCall()
3878 const GlobalValue *GV = GSD->getGlobal(); in LowerCall()
3901 // Add a register mask operand representing the call-preserved registers. in LowerCall()
3902 auto *TRI = static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo()); in LowerCall()
3903 const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv); in LowerCall()
3971 Register SPReg = Info->getStackPtrOffsetReg(); in lowerDYNAMIC_STACKALLOCImpl()
3980 MaybeAlign Alignment = cast<ConstantSDNode>(Tmp3)->getMaybeAlignValue(); in lowerDYNAMIC_STACKALLOCImpl()
3981 const TargetFrameLowering *TFL = Subtarget->getFrameLowering(); in lowerDYNAMIC_STACKALLOCImpl()
3983 TFL->getStackGrowthDirection() == TargetFrameLowering::StackGrowsUp ? in lowerDYNAMIC_STACKALLOCImpl()
3988 DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32)); in lowerDYNAMIC_STACKALLOCImpl()
3990 Align StackAlign = TFL->getStackAlign(); in lowerDYNAMIC_STACKALLOCImpl()
3994 DAG.getConstant(-(uint64_t)Alignment->value() in lowerDYNAMIC_STACKALLOCImpl()
3995 << Subtarget->getWavefrontSizeLog2(), in lowerDYNAMIC_STACKALLOCImpl()
4007 // We only handle constant sizes here to allow non-entry block, static sized in LowerDYNAMIC_STACKALLOC()
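The lowerDYNAMIC_STACKALLOCImpl() hits above show why the allocation is scaled: the scratch stack pointer is maintained in wave-scaled units, so the per-lane size is shifted left by log2(wavefront size) and the alignment mask is built as -(alignment << wave_size_log2) before being ANDed in. A small arithmetic sketch of that scaling only (names and values are invented, and the exact align/bump ordering and stack-growth-direction handling of the real lowering are simplified away):

    #include <cstdint>

    // Illustrative wave-scaled stack bump.
    uint32_t bumpWaveScratchSP(uint32_t SP, uint32_t PerLaneSize,
                               uint32_t PerLaneAlign, unsigned WaveSizeLog2) {
      uint32_t ScaledSize  = PerLaneSize  << WaveSizeLog2;  // size * wavefront size
      uint32_t ScaledAlign = PerLaneAlign << WaveSizeLog2;  // power of two
      uint32_t NewSP = SP + ScaledSize;                     // assume the stack grows up
      NewSP &= 0u - ScaledAlign;        // same mask as -(align << wave_size_log2) above
      return NewSP;
    }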
4026 SDValue CopyFromSP = DAG.getCopyFromReg(Op->getOperand(0), SL, SP, MVT::i32); in LowerSTACKSAVE()
4047 SDValue GetReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, Op->getVTList(), in lowerGET_ROUNDING()
4061 // [1:0] Single-precision round mode. in lowerGET_ROUNDING()
4062 // [3:2] Double/Half-precision round mode. in lowerGET_ROUNDING()
4064 // 0=nearest even; 1= +infinity; 2= -infinity, 3= toward zero. in lowerGET_ROUNDING()
4067 // Toward-0 3 0 in lowerGET_ROUNDING()
4070 // -Inf 2 3 in lowerGET_ROUNDING()
4073 // We have to handle 16 permutations of a 4-bit value, so we create a 64-bit in lowerGET_ROUNDING()
4085 // TODO: We could possibly avoid a 64-bit shift and use a simpler table if we in lowerGET_ROUNDING()
4095 // There's a gap in the 4-bit encoded table and actual enum values, so offset in lowerGET_ROUNDING()
4114 // Index a table of 4-bit entries mapping from the C FLT_ROUNDS values to the in lowerSET_ROUNDING()
4118 static_cast<uint32_t>(ConstMode->getZExtValue()), in lowerSET_ROUNDING()
4124 // the range 0-3, we can use a simplified mapping to hardware values. in lowerSET_ROUNDING()
4127 // The supported standard values are 0-3. The extended values start at 8. We in lowerSET_ROUNDING()
4131 // Truncate to the low 32-bits. in lowerSET_ROUNDING()
4145 // table_index = umin(value, value - 4) in lowerSET_ROUNDING()
4186 DAG.getNode(ISD::INTRINSIC_VOID, SL, Op->getVTList(), Op.getOperand(0), in lowerSET_ROUNDING()
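The lowerGET_ROUNDING()/lowerSET_ROUNDING() hits above translate between the 4-bit hardware rounding field (bits [1:0] single precision, bits [3:2] double/half precision; the two table rows visible above pair a hardware value with its FLT_ROUNDS value) and the C FLT_ROUNDS values by indexing a table of 4-bit entries packed into one 64-bit constant, and the umin(value, value - 4) trick compresses the standard FLT_ROUNDS range 0-3 and the extended range starting at 8 into one small table index. A host-side sketch of just those two techniques, with a placeholder table rather than the real hardware mapping:

    #include <cstdint>
    #include <algorithm>

    // Sixteen 4-bit entries packed into a 64-bit constant; entry i sits at bits [4*i+3 : 4*i].
    // The contents here are placeholders, not the backend's actual table.
    constexpr uint64_t PackedTable = 0xFEDCBA9876543210ull;

    unsigned lookup4Bit(uint64_t Table, unsigned Index) {
      return unsigned(Table >> (Index * 4)) & 0xF;   // one shift plus a mask
    }

    // table_index = umin(value, value - 4): 0-3 map to 0-3, and the extended values
    // 8-11 wrap to 4-7 (unsigned underflow keeps 0-3 unchanged).
    unsigned compressRoundingValue(unsigned Value) {
      return std::min(Value, Value - 4u);
    }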
4193 if (Op->isDivergent()) in lowerPREFETCH()
4196 switch (cast<MemSDNode>(Op)->getAddressSpace()) { in lowerPREFETCH()
4313 if (!Subtarget->hasFlatScrRegister() && in getRegisterByName()
4314 Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) { in getRegisterByName()
4346 MachineBasicBlock *SplitBB = BB->splitAt(MI, false /*UpdateLiveIns*/); in splitKillBlock()
4347 const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); in splitKillBlock()
4348 MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode())); in splitKillBlock()
4364 MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock(); in splitBlockForLoop()
4365 MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock(); in splitBlockForLoop()
4369 MF->insert(MBBI, LoopBB); in splitBlockForLoop()
4370 MF->insert(MBBI, RemainderBB); in splitBlockForLoop()
4372 LoopBB->addSuccessor(LoopBB); in splitBlockForLoop()
4373 LoopBB->addSuccessor(RemainderBB); in splitBlockForLoop()
4376 RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB); in splitBlockForLoop()
4382 LoopBB->splice(LoopBB->begin(), &MBB, I, Next); in splitBlockForLoop()
4385 RemainderBB->splice(RemainderBB->begin(), &MBB, Next, MBB.end()); in splitBlockForLoop()
4387 RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end()); in splitBlockForLoop()
4398 const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); in bundleInstWithWaitcnt()
4402 BuildMI(*MBB, E, MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT)) in bundleInstWithWaitcnt()
4414 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); in emitGWSMemViolTestLoop()
4418 const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); in emitGWSMemViolTestLoop()
4421 if (MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0)) in emitGWSMemViolTestLoop()
4422 Src->setIsKill(false); in emitGWSMemViolTestLoop()
4426 MachineBasicBlock::iterator I = LoopBB->end(); in emitGWSMemViolTestLoop()
4432 BuildMI(*LoopBB, LoopBB->begin(), DL, TII->get(AMDGPU::S_SETREG_IMM32_B32)) in emitGWSMemViolTestLoop()
4441 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_GETREG_B32), Reg) in emitGWSMemViolTestLoop()
4445 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CMP_LG_U32)) in emitGWSMemViolTestLoop()
4448 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1)) in emitGWSMemViolTestLoop()
4468 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); in emitLoadM0FromVGPRLoop()
4472 const TargetRegisterClass *BoolRC = TRI->getBoolRC(); in emitLoadM0FromVGPRLoop()
4478 BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiReg) in emitLoadM0FromVGPRLoop()
4484 BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiExec) in emitLoadM0FromVGPRLoop()
4490 // Read the next variant <- also loop target. in emitLoadM0FromVGPRLoop()
4491 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg) in emitLoadM0FromVGPRLoop()
4495 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg) in emitLoadM0FromVGPRLoop()
4500 BuildMI(LoopBB, I, DL, TII->get(ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32 in emitLoadM0FromVGPRLoop()
4512 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), SGPRIdxReg) in emitLoadM0FromVGPRLoop()
4519 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0) in emitLoadM0FromVGPRLoop()
4522 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0) in emitLoadM0FromVGPRLoop()
4531 BuildMI(LoopBB, I, DL, TII->get(ST.isWave32() ? AMDGPU::S_XOR_B32_term in emitLoadM0FromVGPRLoop()
4536 // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use in emitLoadM0FromVGPRLoop()
4540 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ)) in emitLoadM0FromVGPRLoop()
4543 return InsertPt->getIterator(); in emitLoadM0FromVGPRLoop()
4546 // This has slightly sub-optimal regalloc when the source vector is killed by
4548 // per-workitem, so is kept alive for the whole loop so we end up not re-using a
4556 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); in loadM0FromVGPR()
4558 MachineRegisterInfo &MRI = MF->getRegInfo(); in loadM0FromVGPR()
4562 const auto *BoolXExecRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID); in loadM0FromVGPR()
4569 BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), TmpExec); in loadM0FromVGPR()
4572 BuildMI(MBB, I, DL, TII->get(MovExecOpc), SaveExec) in loadM0FromVGPR()
4579 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx); in loadM0FromVGPR()
4585 MachineBasicBlock* LandingPad = MF->CreateMachineBasicBlock(); in loadM0FromVGPR()
4588 MF->insert(MBBI, LandingPad); in loadM0FromVGPR()
4589 LoopBB->removeSuccessor(RemainderBB); in loadM0FromVGPR()
4590 LandingPad->addSuccessor(RemainderBB); in loadM0FromVGPR()
4591 LoopBB->addSuccessor(LandingPad); in loadM0FromVGPR()
4592 MachineBasicBlock::iterator First = LandingPad->begin(); in loadM0FromVGPR()
4593 BuildMI(*LandingPad, First, DL, TII->get(MovExecOpc), Exec) in loadM0FromVGPR()
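emitLoadM0FromVGPRLoop()/loadM0FromVGPR() above build the classic waterfall loop for a divergent index: read the index from the first active lane (V_READFIRSTLANE_B32), select every lane holding that same index (V_CMP_EQ plus S_AND_SAVEEXEC), run the indexed operation for that subset with M0 set, then strip those lanes from EXEC (the S_XOR_*_term) and branch back while any remain. Below is a host-side scalar simulation of that control flow, assuming a 64-lane wave and C++20 <bit>; it models the loop's behavior, not the MIR the backend emits:

    #include <bit>
    #include <cassert>
    #include <cstdint>
    #include <vector>

    // Peel off one uniform index value per iteration until every active lane is serviced.
    void waterfall(const std::vector<uint32_t> &LaneIdx, uint64_t Exec,
                   void (*DoIndexedOp)(uint32_t UniformIdx, uint64_t ActiveLanes)) {
      assert(LaneIdx.size() == 64);
      while (Exec) {
        unsigned FirstLane = std::countr_zero(Exec);       // v_readfirstlane_b32
        uint32_t UniformIdx = LaneIdx[FirstLane];

        uint64_t SameIdx = 0;                              // v_cmp_eq_u32
        for (unsigned L = 0; L < 64; ++L)
          if (((Exec >> L) & 1) && LaneIdx[L] == UniformIdx)
            SameIdx |= 1ull << L;

        DoIndexedOp(UniformIdx, SameIdx);                  // body runs with EXEC = SameIdx, M0 = idx

        Exec &= ~SameIdx;                                  // s_xor_*_term; s_cbranch_execnz loops
      }
    }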
4622 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx); in setM0ToIndexFromSGPR()
4624 assert(Idx->getReg() != AMDGPU::NoRegister); in setM0ToIndexFromSGPR()
4627 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0).add(*Idx); in setM0ToIndexFromSGPR()
4629 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0) in setM0ToIndexFromSGPR()
4642 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx); in getIndirectSGPRIdx()
4645 return Idx->getReg(); in getIndirectSGPRIdx()
4648 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), Tmp) in getIndirectSGPRIdx()
4658 const SIRegisterInfo &TRI = TII->getRegisterInfo(); in emitIndirectSrc()
4660 MachineRegisterInfo &MRI = MF->getRegInfo(); in emitIndirectSrc()
4663 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx); in emitIndirectSrc()
4664 Register SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg(); in emitIndirectSrc()
4665 int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm(); in emitIndirectSrc()
4668 const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg()); in emitIndirectSrc()
4677 if (TII->getRegisterInfo().isSGPRClass(IdxRC)) { in emitIndirectSrc()
4688 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true); in emitIndirectSrc()
4696 BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst) in emitIndirectSrc()
4713 BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), InitReg); in emitIndirectSrc()
4719 MachineBasicBlock *LoopBB = InsPt->getParent(); in emitIndirectSrc()
4723 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true); in emitIndirectSrc()
4730 BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst) in emitIndirectSrc()
4744 const SIRegisterInfo &TRI = TII->getRegisterInfo(); in emitIndirectDst()
4746 MachineRegisterInfo &MRI = MF->getRegInfo(); in emitIndirectDst()
4749 const MachineOperand *SrcVec = TII->getNamedOperand(MI, AMDGPU::OpName::src); in emitIndirectDst()
4750 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx); in emitIndirectDst()
4751 const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val); in emitIndirectDst()
4752 int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm(); in emitIndirectDst()
4753 const TargetRegisterClass *VecRC = MRI.getRegClass(SrcVec->getReg()); in emitIndirectDst()
4754 const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg()); in emitIndirectDst()
4757 assert(Val->getReg()); in emitIndirectDst()
4761 SrcVec->getReg(), in emitIndirectDst()
4765 if (Idx->getReg() == AMDGPU::NoRegister) { in emitIndirectDst()
4771 BuildMI(MBB, I, DL, TII->get(TargetOpcode::INSERT_SUBREG), Dst) in emitIndirectDst()
4781 if (TII->getRegisterInfo().isSGPRClass(IdxRC)) { in emitIndirectDst()
4789 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false); in emitIndirectDst()
4791 .addReg(SrcVec->getReg()) in emitIndirectDst()
4798 const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo( in emitIndirectDst()
4801 .addReg(SrcVec->getReg()) in emitIndirectDst()
4810 if (Val->isReg()) in emitIndirectDst()
4811 MRI.clearKillFlags(Val->getReg()); in emitIndirectDst()
4818 auto InsPt = loadM0FromVGPR(TII, MBB, MI, SrcVec->getReg(), PhiReg, Offset, in emitIndirectDst()
4820 MachineBasicBlock *LoopBB = InsPt->getParent(); in emitIndirectDst()
4824 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false); in emitIndirectDst()
4832 const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo( in emitIndirectDst()
4848 MachineRegisterInfo &MRI = BB.getParent()->getRegInfo(); in lowerWaveReduce()
4855 bool isSGPR = TRI->isSGPRClass(MRI.getRegClass(SrcReg)); in lowerWaveReduce()
4861 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg).addReg(SrcReg); in lowerWaveReduce()
4881 const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass(); in lowerWaveReduce()
4902 BuildMI(BB, I, DL, TII->get(MovOpc), LoopIterator).addReg(ExecReg); in lowerWaveReduce()
4903 BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), InitalValReg) in lowerWaveReduce()
4905 BuildMI(BB, I, DL, TII->get(AMDGPU::S_BRANCH)).addMBB(ComputeLoop); in lowerWaveReduce()
4908 I = ComputeLoop->end(); in lowerWaveReduce()
4910 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), AccumulatorReg) in lowerWaveReduce()
4914 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), ActiveBitsReg) in lowerWaveReduce()
4915 .addReg(TmpSReg->getOperand(0).getReg()) in lowerWaveReduce()
4920 auto FF1 = BuildMI(*ComputeLoop, I, DL, TII->get(SFFOpc), FF1Reg) in lowerWaveReduce()
4921 .addReg(ActiveBits->getOperand(0).getReg()); in lowerWaveReduce()
4923 TII->get(AMDGPU::V_READLANE_B32), LaneValueReg) in lowerWaveReduce()
4925 .addReg(FF1->getOperand(0).getReg()); in lowerWaveReduce()
4926 auto NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg) in lowerWaveReduce()
4927 .addReg(Accumulator->getOperand(0).getReg()) in lowerWaveReduce()
4928 .addReg(LaneValue->getOperand(0).getReg()); in lowerWaveReduce()
4934 BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), NewActiveBitsReg) in lowerWaveReduce()
4935 .addReg(FF1->getOperand(0).getReg()) in lowerWaveReduce()
4936 .addReg(ActiveBits->getOperand(0).getReg()); in lowerWaveReduce()
4939 Accumulator.addReg(NewAccumulator->getOperand(0).getReg()) in lowerWaveReduce()
4941 ActiveBits.addReg(NewActiveBits->getOperand(0).getReg()) in lowerWaveReduce()
4946 BuildMI(*ComputeLoop, I, DL, TII->get(CMPOpc)) in lowerWaveReduce()
4947 .addReg(NewActiveBits->getOperand(0).getReg()) in lowerWaveReduce()
4949 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1)) in lowerWaveReduce()
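lowerWaveReduce() above has a uniform-source fast path (a plain S_MOV) and, for a divergent source, a ComputeLoop that walks the active lanes one at a time: a find-first-set (the SFFOpc) locates the lowest set bit of the remaining exec copy, V_READLANE_B32 pulls that lane's value, the accumulator is combined with it, and the BITSETOpc clears the bit until the mask is empty. A scalar model of the divergent path, assuming a 64-lane wave and using unsigned min as the combining op (identifiers below are mine, not the backend's):

    #include <algorithm>
    #include <bit>
    #include <cassert>
    #include <cstdint>
    #include <vector>

    uint32_t waveReduceUMin(const std::vector<uint32_t> &LaneValue, uint64_t Exec) {
      assert(LaneValue.size() == 64 && Exec != 0);
      uint32_t Accumulator = UINT32_MAX;                // identity value for umin
      uint64_t ActiveBits = Exec;                       // copy of EXEC consumed by the loop
      while (ActiveBits) {
        unsigned Lane = std::countr_zero(ActiveBits);   // find first set bit (s_ff1)
        uint32_t Value = LaneValue[Lane];               // v_readlane_b32
        Accumulator = std::min(Accumulator, Value);     // the reduction opcode
        ActiveBits &= ~(1ull << Lane);                  // clear the bit (s_bitset0)
      }                                                 // s_cmp_lg + s_cbranch_scc1
      return Accumulator;
    }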
4961 const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); in EmitInstrWithCustomInserter()
4962 MachineFunction *MF = BB->getParent(); in EmitInstrWithCustomInserter()
4963 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); in EmitInstrWithCustomInserter()
4981 BuildMI(*BB, MI, DL, TII->get(Opc), Dest0.getReg()).add(Src0).add(Src1); in EmitInstrWithCustomInserter()
4983 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CSELECT_B64), Dest1.getReg()) in EmitInstrWithCustomInserter()
4992 // For targets older than GFX12, we emit a sequence of 32-bit operations. in EmitInstrWithCustomInserter()
4994 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); in EmitInstrWithCustomInserter()
4995 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); in EmitInstrWithCustomInserter()
5001 if (Subtarget->hasScalarAddSub64()) { in EmitInstrWithCustomInserter()
5003 BuildMI(*BB, MI, DL, TII->get(Opc), Dest.getReg()) in EmitInstrWithCustomInserter()
5008 const TargetRegisterClass *BoolRC = TRI->getBoolRC(); in EmitInstrWithCustomInserter()
5013 MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm( in EmitInstrWithCustomInserter()
5015 MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm( in EmitInstrWithCustomInserter()
5018 MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm( in EmitInstrWithCustomInserter()
5020 MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm( in EmitInstrWithCustomInserter()
5025 BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0) in EmitInstrWithCustomInserter()
5028 BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1) in EmitInstrWithCustomInserter()
5031 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg()) in EmitInstrWithCustomInserter()
5042 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); in EmitInstrWithCustomInserter()
5043 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); in EmitInstrWithCustomInserter()
5054 auto Add = BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_LSHL_ADD_U64_e64), in EmitInstrWithCustomInserter()
5059 TII->legalizeOperands(*Add); in EmitInstrWithCustomInserter()
5064 const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID); in EmitInstrWithCustomInserter()
5080 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0); in EmitInstrWithCustomInserter()
5082 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1); in EmitInstrWithCustomInserter()
5084 MachineOperand SrcReg0Sub0 = TII->buildExtractSubRegOrImm( in EmitInstrWithCustomInserter()
5086 MachineOperand SrcReg1Sub0 = TII->buildExtractSubRegOrImm( in EmitInstrWithCustomInserter()
5089 MachineOperand SrcReg0Sub1 = TII->buildExtractSubRegOrImm( in EmitInstrWithCustomInserter()
5091 MachineOperand SrcReg1Sub1 = TII->buildExtractSubRegOrImm( in EmitInstrWithCustomInserter()
5095 MachineInstr *LoHalf = BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0) in EmitInstrWithCustomInserter()
5103 BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1) in EmitInstrWithCustomInserter()
5110 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg()) in EmitInstrWithCustomInserter()
5115 TII->legalizeOperands(*LoHalf); in EmitInstrWithCustomInserter()
5116 TII->legalizeOperands(*HiHalf); in EmitInstrWithCustomInserter()
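The U64 pseudo expansions above split a 64-bit add or sub into two 32-bit halves when no 64-bit scalar op is available: the sub0 halves are combined with the carry-out going to SCC (or a VCC-like carry register), the sub1 halves are combined with that carry folded in, and REG_SEQUENCE reassembles the pair. The same arithmetic in plain C++, as a correctness sketch rather than the backend's expansion:

    #include <cstdint>

    // 64-bit add via two 32-bit adds with an explicit carry.
    uint64_t add64ViaHalves(uint64_t A, uint64_t B) {
      uint32_t ALo = uint32_t(A), AHi = uint32_t(A >> 32);
      uint32_t BLo = uint32_t(B), BHi = uint32_t(B >> 32);
      uint32_t Lo    = ALo + BLo;              // s_add_u32 / v_add_co_u32 (carry out)
      uint32_t Carry = Lo < ALo;               // SCC / VCC
      uint32_t Hi    = AHi + BHi + Carry;      // s_addc_u32 / v_addc_co_u32 (carry in)
      return (uint64_t(Hi) << 32) | Lo;        // REG_SEQUENCE of sub0/sub1
    }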
5125 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); in EmitInstrWithCustomInserter()
5126 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); in EmitInstrWithCustomInserter()
5138 if (Src0.isReg() && TRI->isVectorRegister(MRI, Src0.getReg())) { in EmitInstrWithCustomInserter()
5140 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp0) in EmitInstrWithCustomInserter()
5144 if (Src1.isReg() && TRI->isVectorRegister(MRI, Src1.getReg())) { in EmitInstrWithCustomInserter()
5146 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp1) in EmitInstrWithCustomInserter()
5151 if (TRI->isVectorRegister(MRI, Src2.getReg())) { in EmitInstrWithCustomInserter()
5152 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp2) in EmitInstrWithCustomInserter()
5158 unsigned WaveSize = TRI->getRegSizeInBits(*Src2RC); in EmitInstrWithCustomInserter()
5163 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U64)) in EmitInstrWithCustomInserter()
5168 TRI->getSubRegisterClass(Src2RC, AMDGPU::sub0); in EmitInstrWithCustomInserter()
5169 MachineOperand Src2Sub0 = TII->buildExtractSubRegOrImm( in EmitInstrWithCustomInserter()
5171 MachineOperand Src2Sub1 = TII->buildExtractSubRegOrImm( in EmitInstrWithCustomInserter()
5175 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_OR_B32), Src2_32) in EmitInstrWithCustomInserter()
5179 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U32)) in EmitInstrWithCustomInserter()
5184 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U32)) in EmitInstrWithCustomInserter()
5189 BuildMI(*BB, MII, DL, TII->get(Opc), Dest.getReg()).add(Src0).add(Src1); in EmitInstrWithCustomInserter()
5194 BuildMI(*BB, MII, DL, TII->get(SelOpc), CarryDest.getReg()) in EmitInstrWithCustomInserter()
5195 .addImm(-1) in EmitInstrWithCustomInserter()
5203 TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0) in EmitInstrWithCustomInserter()
5212 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOV_B32)) in EmitInstrWithCustomInserter()
5214 .addImm(MFI->getLDSSize()); in EmitInstrWithCustomInserter()
5219 assert(MF->getSubtarget<GCNSubtarget>().hasShaderCyclesHiLoRegisters()); in EmitInstrWithCustomInserter()
5220 MachineRegisterInfo &MRI = MF->getRegInfo(); in EmitInstrWithCustomInserter()
5234 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegHi1) in EmitInstrWithCustomInserter()
5237 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegLo1) in EmitInstrWithCustomInserter()
5240 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegHi2) in EmitInstrWithCustomInserter()
5242 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CMP_EQ_U32)) in EmitInstrWithCustomInserter()
5246 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CSELECT_B32), RegLo) in EmitInstrWithCustomInserter()
5249 BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE)) in EmitInstrWithCustomInserter()
5284 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); in EmitInstrWithCustomInserter()
5285 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); in EmitInstrWithCustomInserter()
5296 const auto *CondRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID); in EmitInstrWithCustomInserter()
5307 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0); in EmitInstrWithCustomInserter()
5309 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1); in EmitInstrWithCustomInserter()
5311 MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm( in EmitInstrWithCustomInserter()
5313 MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm( in EmitInstrWithCustomInserter()
5316 MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm( in EmitInstrWithCustomInserter()
5318 MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm( in EmitInstrWithCustomInserter()
5321 BuildMI(*BB, MI, DL, TII->get(AMDGPU::COPY), SrcCondCopy) in EmitInstrWithCustomInserter()
5323 BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstLo) in EmitInstrWithCustomInserter()
5329 BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstHi) in EmitInstrWithCustomInserter()
5336 BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE), Dst) in EmitInstrWithCustomInserter()
5345 const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); in EmitInstrWithCustomInserter()
5347 MachineInstr *Br = BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CBRANCH_SCC1)) in EmitInstrWithCustomInserter()
5349 Br->getOperand(1).setIsUndef(); // read undef SCC in EmitInstrWithCustomInserter()
5355 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>(); in EmitInstrWithCustomInserter()
5357 MIB.addReg(Info->getStackPtrOffsetReg(), RegState::ImplicitDefine) in EmitInstrWithCustomInserter()
5358 .addReg(Info->getStackPtrOffsetReg(), RegState::Implicit); in EmitInstrWithCustomInserter()
5362 const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); in EmitInstrWithCustomInserter()
5365 unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(*MF); in EmitInstrWithCustomInserter()
5368 MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_CALL), ReturnAddrReg); in EmitInstrWithCustomInserter()
5385 if (TII->pseudoToMCOpcode(Opc) == -1) { in EmitInstrWithCustomInserter()
5390 auto I = BuildMI(*BB, MI, DL, TII->get(Opc), MI.getOperand(0).getReg()); in EmitInstrWithCustomInserter()
5391 if (TII->isVOP3(*I)) { in EmitInstrWithCustomInserter()
5392 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); in EmitInstrWithCustomInserter()
5394 I.addReg(TRI->getVCC(), RegState::Define); in EmitInstrWithCustomInserter()
5401 TII->legalizeOperands(*I); in EmitInstrWithCustomInserter()
5411 TII->legalizeOperands(MI); in EmitInstrWithCustomInserter()
5416 TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::data0); in EmitInstrWithCustomInserter()
5422 if (getSubtarget()->hasGWSAutoReplay()) { in EmitInstrWithCustomInserter()
5446 if (getSubtarget()->hasDenormModeInst()) { in EmitInstrWithCustomInserter()
5465 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); in EmitInstrWithCustomInserter()
5467 if (Def && Def->isMoveImmediate() && Def->getOperand(1).isImm()) { in EmitInstrWithCustomInserter()
5468 unsigned ImmVal = Def->getOperand(1).getImm(); in EmitInstrWithCustomInserter()
5470 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetRoundOp)) in EmitInstrWithCustomInserter()
5478 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetDenormOp)) in EmitInstrWithCustomInserter()
5491 MI.setDesc(TII->get(AMDGPU::S_SETREG_B32_mode)); in EmitInstrWithCustomInserter()
5499 MI.setDesc(TII->get(AMDGPU::COPY)); in EmitInstrWithCustomInserter()
5503 if (BB->succ_empty() && std::next(MI.getIterator()) == BB->end()) { in EmitInstrWithCustomInserter()
5504 MI.setDesc(TII->get(AMDGPU::S_ENDPGM)); in EmitInstrWithCustomInserter()
5513 MachineBasicBlock *SplitBB = BB->splitAt(MI, false /*UpdateLiveIns*/); in EmitInstrWithCustomInserter()
5514 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock(); in EmitInstrWithCustomInserter()
5515 MF->push_back(TrapBB); in EmitInstrWithCustomInserter()
5516 BuildMI(*TrapBB, TrapBB->end(), DL, TII->get(AMDGPU::S_ENDPGM)) in EmitInstrWithCustomInserter()
5518 BuildMI(*BB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ)) in EmitInstrWithCustomInserter()
5521 BB->addSuccessor(TrapBB); in EmitInstrWithCustomInserter()
5526 assert(Subtarget->hasPrivEnabledTrap2NopBug()); in EmitInstrWithCustomInserter()
5527 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); in EmitInstrWithCustomInserter()
5529 TII->insertSimulatedTrap(MRI, *BB, MI, MI.getDebugLoc()); in EmitInstrWithCustomInserter()
5534 if (TII->isImage(MI) || TII->isMUBUF(MI)) { in EmitInstrWithCustomInserter()
5571 return (Ty.getScalarSizeInBits() <= 16 && Subtarget->has16BitInsts()) in getPreferredShiftAmountTy()
5598 if (!Subtarget->hasMadMacF32Insts()) in isFMAFasterThanFMulAndFAdd()
5599 return Subtarget->hasFastFMAF32(); in isFMAFasterThanFMulAndFAdd()
5605 return Subtarget->hasFastFMAF32() || Subtarget->hasDLInsts(); in isFMAFasterThanFMulAndFAdd()
5608 return Subtarget->hasFastFMAF32() && Subtarget->hasDLInsts(); in isFMAFasterThanFMulAndFAdd()
5613 return Subtarget->has16BitInsts() && !denormalModeIsFlushAllF64F16(MF); in isFMAFasterThanFMulAndFAdd()
5642 return Subtarget->hasMadF16() && denormalModeIsFlushAllF64F16(*MI.getMF()); in isFMADLegal()
5644 return Subtarget->hasMadMacF32Insts() && in isFMADLegal()
5654 EVT VT = N->getValueType(0); in isFMADLegal()
5656 return Subtarget->hasMadMacF32Insts() && in isFMADLegal()
5659 return Subtarget->hasMadF16() && in isFMADLegal()
5666 //===----------------------------------------------------------------------===//
5668 //===----------------------------------------------------------------------===//
5686 Op->getFlags()); in splitUnaryVectorOp()
5688 Op->getFlags()); in splitUnaryVectorOp()
5712 Op->getFlags()); in splitBinaryVectorOp()
5714 Op->getFlags()); in splitBinaryVectorOp()
5744 Op->getFlags()); in splitTernaryVectorOp()
5746 Op->getFlags()); in splitTernaryVectorOp()
5760 Result.getNode()->getNumValues() == 2) && in LowerOperation()
5808 if (Op.getOperand(0)->getValueType(0) != MVT::f32) in LowerOperation()
5820 return DAG.getNode(Opc, DL, Op.getNode()->getVTList(), Op->getOperand(0)); in LowerOperation()
5826 case ISD::ABS: in LowerOperation()
5942 bool Unpacked = Subtarget->hasUnpackedD16VMem(); in adjustLoadValueType()
5943 EVT LoadVT = M->getValueType(0); in adjustLoadValueType()
5964 VTList, Ops, M->getMemoryVT(), in adjustLoadValueType()
5965 M->getMemOperand()); in adjustLoadValueType()
5976 EVT LoadVT = M->getValueType(0); in lowerIntrinsicLoad()
5982 assert(M->getNumValues() == 2 || M->getNumValues() == 3); in lowerIntrinsicLoad()
5983 bool IsTFE = M->getNumValues() == 3; in lowerIntrinsicLoad()
5996 return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M->getMemOperand(), in lowerIntrinsicLoad()
6000 return getMemIntrinsicNode(Opc, DL, M->getVTList(), Ops, IntVT, in lowerIntrinsicLoad()
6001 M->getMemOperand(), DAG); in lowerIntrinsicLoad()
6007 M->getMemOperand(), DAG); in lowerIntrinsicLoad()
6015 EVT VT = N->getValueType(0); in lowerICMPIntrinsic()
6016 unsigned CondCode = N->getConstantOperandVal(3); in lowerICMPIntrinsic()
6022 SDValue LHS = N->getOperand(1); in lowerICMPIntrinsic()
6023 SDValue RHS = N->getOperand(2); in lowerICMPIntrinsic()
6037 unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize(); in lowerICMPIntrinsic()
6049 EVT VT = N->getValueType(0); in lowerFCMPIntrinsic()
6051 unsigned CondCode = N->getConstantOperandVal(3); in lowerFCMPIntrinsic()
6055 SDValue Src0 = N->getOperand(1); in lowerFCMPIntrinsic()
6056 SDValue Src1 = N->getOperand(2); in lowerFCMPIntrinsic()
6067 unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize(); in lowerFCMPIntrinsic()
6078 EVT VT = N->getValueType(0); in lowerBALLOTIntrinsic()
6079 SDValue Src = N->getOperand(1); in lowerBALLOTIntrinsic()
6083 // (ballot (ISD::SETCC ...)) -> (AMDGPUISD::SETCC ...) in lowerBALLOTIntrinsic()
6088 // (ballot 0) -> 0 in lowerBALLOTIntrinsic()
6089 if (Arg->isZero()) in lowerBALLOTIntrinsic()
6092 // (ballot 1) -> EXEC/EXEC_LO in lowerBALLOTIntrinsic()
6093 if (Arg->isOne()) { in lowerBALLOTIntrinsic()
6106 // (ballot (i1 $src)) -> (AMDGPUISD::SETCC (i32 (zext $src)) (i32 0) in lowerBALLOTIntrinsic()
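// Note: a brief summary of why the folds above are valid (illustrative, based
// on the documented semantics of llvm.amdgcn.ballot): ballot returns a
// wave-wide bitmask with one bit per lane, set where that lane's condition is
// true. Hence ballot(false) is 0 in every lane, ballot(true) is exactly the
// current EXEC mask (EXEC_LO in wave32), and a variable i1 source is lowered
// to a compare against zero that materializes the per-lane mask.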
6115 EVT VT = N->getValueType(0); in lowerLaneOp()
6117 unsigned IID = N->getConstantOperandVal(0); in lowerLaneOp()
6124 SDValue Src2, MVT ValT) -> SDValue { in lowerLaneOp()
6129 Operands.push_back(N->getOperand(6)); in lowerLaneOp()
6130 Operands.push_back(N->getOperand(5)); in lowerLaneOp()
6131 Operands.push_back(N->getOperand(4)); in lowerLaneOp()
6150 if (SDNode *GL = N->getGluedNode()) { in lowerLaneOp()
6151 assert(GL->getOpcode() == ISD::CONVERGENCECTRL_GLUE); in lowerLaneOp()
6152 GL = GL->getOperand(0).getNode(); in lowerLaneOp()
6160 SDValue Src0 = N->getOperand(1); in lowerLaneOp()
6164 Src1 = N->getOperand(2); in lowerLaneOp()
6166 Src2 = N->getOperand(3); in lowerLaneOp()
6197 auto unrollLaneOp = [&DAG, &SL](SDNode *N) -> SDValue { in lowerLaneOp()
6198 EVT VT = N->getValueType(0); in lowerLaneOp()
6202 unsigned NumOperands = N->getNumOperands(); in lowerLaneOp()
6204 SDNode *GL = N->getGluedNode(); in lowerLaneOp()
6207 assert(!GL || GL->getOpcode() == ISD::CONVERGENCECTRL_GLUE); in lowerLaneOp()
6210 for (unsigned j = 0, e = GL ? NumOperands - 1 : NumOperands; j != e; in lowerLaneOp()
6212 SDValue Operand = N->getOperand(j); in lowerLaneOp()
6226 Operands[NumOperands - 1] = in lowerLaneOp()
6228 SDValue(GL->getOperand(0).getNode(), 0)); in lowerLaneOp()
6230 Scalars.push_back(DAG.getNode(N->getOpcode(), SL, EltVT, Operands)); in lowerLaneOp()
6294 switch (N->getOpcode()) { in ReplaceNodeResults()
6306 unsigned IID = N->getConstantOperandVal(0); in ReplaceNodeResults()
6312 SDValue Src0 = N->getOperand(1); in ReplaceNodeResults()
6313 SDValue Src1 = N->getOperand(2); in ReplaceNodeResults()
6324 SDValue Src0 = N->getOperand(1); in ReplaceNodeResults()
6325 SDValue Src1 = N->getOperand(2); in ReplaceNodeResults()
6338 EVT VT = N->getValueType(0); in ReplaceNodeResults()
6353 if (!Subtarget->hasScalarSubwordLoads()) in ReplaceNodeResults()
6360 assert(VT == MVT::i8 && "Expected 8-bit s_buffer_load intrinsics.\n"); in ReplaceNodeResults()
6372 if (!Offset->isDivergent()) { in ReplaceNodeResults()
6417 EVT VT = N->getValueType(0); in ReplaceNodeResults()
6419 SDValue LHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(1)); in ReplaceNodeResults()
6420 SDValue RHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(2)); in ReplaceNodeResults()
6430 N->getOperand(0), LHS, RHS); in ReplaceNodeResults()
6438 if (N->getValueType(0) != MVT::v2f16) in ReplaceNodeResults()
6442 SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0)); in ReplaceNodeResults()
6451 if (N->getValueType(0) != MVT::v2f16) in ReplaceNodeResults()
6455 SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0)); in ReplaceNodeResults()
6464 if (N->getValueType(0) != MVT::f16) in ReplaceNodeResults()
6479 for (SDNode::use_iterator I = Parent->use_begin(), E = Parent->use_end(); in findUser()
6485 if (I->getOpcode() == Opcode) in findUser()
6492 if (Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN) { in isCFIntrinsic()
6493 switch (Intr->getConstantOperandVal(1)) { in isCFIntrinsic()
6514 return (GV->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS || in shouldEmitFixup()
6515 GV->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) && in shouldEmitFixup()
6520 if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS()) in shouldEmitGOTReloc()
6525 return (GV->getValueType()->isFunctionTy() || in shouldEmitGOTReloc()
6526 !isNonGlobalAddrSpace(GV->getAddressSpace())) && in shouldEmitGOTReloc()
6535 if (!GV->hasExternalLinkage()) in shouldUseLDSConstAddress()
6553 if (Intr->getOpcode() == ISD::SETCC) { in LowerBRCOND()
6556 Intr = SetCC->getOperand(0).getNode(); in LowerBRCOND()
6562 Target = BR->getOperand(1); in LowerBRCOND()
6571 bool HaveChain = Intr->getOpcode() == ISD::INTRINSIC_VOID || in LowerBRCOND()
6572 Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN; in LowerBRCOND()
6575 (SetCC->getConstantOperandVal(1) == 1 && in LowerBRCOND()
6576 cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() == in LowerBRCOND()
6584 Ops.append(Intr->op_begin() + (HaveChain ? 2 : 1), Intr->op_end()); in LowerBRCOND()
6587 ArrayRef<EVT> Res(Intr->value_begin() + 1, Intr->value_end()); in LowerBRCOND()
6604 BR->getOperand(0), in LowerBRCOND()
6607 SDValue NewBR = DAG.getNode(ISD::BR, DL, BR->getVTList(), Ops); in LowerBRCOND()
6611 SDValue Chain = SDValue(Result, Result->getNumValues() - 1); in LowerBRCOND()
6614 for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) { in LowerBRCOND()
6621 CopyToReg->getOperand(1), in LowerBRCOND()
6622 SDValue(Result, i - 1), in LowerBRCOND()
6625 DAG.ReplaceAllUsesWith(SDValue(CopyToReg, 0), CopyToReg->getOperand(0)); in LowerBRCOND()
6630 SDValue(Intr, Intr->getNumValues() - 1), in LowerBRCOND()
6631 Intr->getOperand(0)); in LowerBRCOND()
6647 if (Info->isEntryFunction()) in LowerRETURNADDR()
6654 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo(); in LowerRETURNADDR()
6655 // Get the return address reg and mark it as an implicit live-in in LowerRETURNADDR()
6656 …Register Reg = MF.addLiveIn(TRI->getReturnAddressReg(MF), getRegClassFor(VT, Op.getNode()->isDiver… in LowerRETURNADDR()
6673 "Do not know how to custom lower FP_ROUND for non-f16 type"); in lowerFP_ROUND()
6696 bool IsIEEEMode = Info->getMode().IEEE; in lowerFMINNUM_FMAXNUM()
6699 // ieee_mode. Currently a combine can produce the ieee version for non-ieee in lowerFMINNUM_FMAXNUM()
6759 // 2. If the operands are divergent, then we have to split s_mul_u64 in 32-bit in lowerMUL()
6763 // registers, then we have to split s_mul_u64 in 32-bit multiplications. in lowerMUL()
6767 // operands are zero-extended/sign-extended from 32-bits, then we split the in lowerMUL()
6768 // s_mul_u64 in two 32-bit multiplications. The problem is that it is not in lowerMUL()
6769 // possible to check if the operands are zero-extended or sign-extended in in lowerMUL()
6771 // s_mul_u64_u32_pseudo if both operands are zero-extended and we replace in lowerMUL()
6772 // s_mul_u64 with s_mul_i64_i32_pseudo if both operands are sign-extended. in lowerMUL()
6780 if (Op->isDivergent()) in lowerMUL()
6785 // If all the operands are zero-extended to 32-bits, then we replace s_mul_u64 in lowerMUL()
6786 // with s_mul_u64_u32_pseudo. If all the operands are sign-extended to in lowerMUL()
6787 // 32-bits, then we replace s_mul_u64 with s_mul_i64_i32_pseudo. in lowerMUL()
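// Minimal scalar sketch of the decomposition the comment above relies on
// (illustrative only, not part of SIISelLowering.cpp; plain C++ using
// <cstdint> types). With A = 2^32*Hi(A) + Lo(A) and likewise for B:
//   A*B mod 2^64 = Lo(A)*Lo(B) + ((Lo(A)*Hi(B) + Hi(A)*Lo(B)) << 32)
// so if both operands are zero-extended from 32 bits the cross terms vanish
// and a single 32x32->64 multiply suffices (the sign-extended case is the
// analogous signed multiply).
static uint64_t mul64ViaMul32(uint64_t A, uint64_t B) {
  uint64_t ALo = A & 0xffffffffu, AHi = A >> 32;
  uint64_t BLo = B & 0xffffffffu, BHi = B >> 32;
  uint64_t Lo = ALo * BLo;                        // 32x32->64 partial product
  uint64_t Cross = (ALo * BHi + AHi * BLo) << 32; // AHi*BHi*2^64 wraps to 0
  return Lo + Cross;                              // == A*B (mod 2^64)
}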
6813 const APInt &C = RHSC->getAPIntValue(); in lowerXMULO()
6814 // mulo(X, 1 << S) -> { X << S, (X << S) >> S != X } in lowerXMULO()
6834 DAG.getConstant(VT.getScalarSizeInBits() - 1, SL, MVT::i32)) in lowerXMULO()
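// Worked example of the mulo fold above (illustrative, not from the source):
// smulo i32 X, 2 with X = 0x40000000 and S = 1:
//   X << 1                      = 0x80000000
//   (X << 1) >> 1 (arithmetic)  = 0xC0000000 != X  -> overflow reported,
// matching 2^30 * 2 = 2^31 not being representable in signed i32. The umulo
// form performs the same check with a logical shift right, and here
// 0x80000000 >> 1 == X, so no unsigned overflow is reported.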
6842 if (Op->isDivergent()) { in lowerXMUL_LOHI()
6846 if (Subtarget->hasSMulHi()) { in lowerXMUL_LOHI()
6857 if (!Subtarget->isTrapHandlerEnabled() || in lowerTRAP()
6858 Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) in lowerTRAP()
6861 return Subtarget->supportsGetDoorbellID() ? lowerTrapHsa(Op, DAG) : in lowerTRAP()
6897 Register UserSGPR = Info->getQueuePtrUserSGPR(); in lowerTrapHsaQueuePtr()
6901 // amdgpu-no-queue-ptr. This is undefined. We don't want to delete the in lowerTrapHsaQueuePtr()
6931 if (Subtarget->hasPrivEnabledTrap2NopBug()) in lowerTrapHsa()
6947 if (!Subtarget->isTrapHandlerEnabled() || in lowerDEBUGTRAP()
6948 Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) { in lowerDEBUGTRAP()
6968 if (Subtarget->hasApertureRegs()) { in getSegmentAperture()
6972 // Note: this feature (register) is broken. When used as a 32-bit operand, in getSegmentAperture()
7008 Register UserSGPR = Info->getQueuePtrUserSGPR(); in getSegmentAperture()
7011 // amdgpu-no-queue-ptr. This is undefined. in getSegmentAperture()
7044 return ConstVal->getSExtValue() != TM.getNullPointerValue(AddrSpace); in isKnownNonNull()
7062 SrcAS = ASC->getSrcAddressSpace(); in lowerADDRSPACECAST()
7063 Src = ASC->getOperand(0); in lowerADDRSPACECAST()
7064 DestAS = ASC->getDestAddressSpace(); in lowerADDRSPACECAST()
7069 Src = Op->getOperand(1); in lowerADDRSPACECAST()
7070 SrcAS = Op->getConstantOperandVal(2); in lowerADDRSPACECAST()
7071 DestAS = Op->getConstantOperandVal(3); in lowerADDRSPACECAST()
7077 // flat -> local/private in lowerADDRSPACECAST()
7095 // local/private -> flat in lowerADDRSPACECAST()
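// Summary of both directions (editorial note inferred from the surrounding
// code): flat -> local/private keeps only the low 32 bits of the flat pointer
// (plus a null-pointer select), while local/private -> flat rebuilds a 64-bit
// flat pointer by pairing the 32-bit offset with the segment aperture base
// from getSegmentAperture(), again guarded by a null-pointer select.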
7123 SDValue Hi = DAG.getConstant(Info->get32BitAddressHighBits(), SL, MVT::i32); in lowerADDRSPACECAST()
7132 // global <-> flat casts are no-ops and are never emitted. in lowerADDRSPACECAST()
7137 DAG.getContext()->diagnose(InvalidAddrSpaceCast); in lowerADDRSPACECAST()
7139 return DAG.getUNDEF(Op->getValueType(0)); in lowerADDRSPACECAST()
7156 unsigned IdxVal = Idx->getAsZExtVal(); in lowerINSERT_SUBVECTOR()
7160 // Insert 32-bit registers at a time. in lowerINSERT_SUBVECTOR()
7222 unsigned Idx = KIdx->getZExtValue(); in lowerINSERT_VECTOR_ELT()
7227 DAG.getConstant(InsertLo ? Idx : (Idx - 2), SL, MVT::i32)); in lowerINSERT_VECTOR_ELT()
7250 // Convert vector index to bit-index and get the required bit mask. in lowerINSERT_VECTOR_ELT()
7292 // XXX - Why doesn't this get called when vector_shuffle is expanded? in lowerEXTRACT_VECTOR_ELT()
7342 SDValue IdxMask = DAG.getConstant(NElem / 2 - 1, SL, IdxVT); in lowerEXTRACT_VECTOR_ELT()
7365 // Convert vector index to bit-index (* EltSize) in lowerEXTRACT_VECTOR_ELT()
7395 // -> concat_vectors (extract_subvector lhs, 0), (extract_subvector rhs, 2) in lowerVECTOR_SHUFFLE()
7398 // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 2) in lowerVECTOR_SHUFFLE()
7401 // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 0) in lowerVECTOR_SHUFFLE()
7406 if (elementPairIsContiguous(SVN->getMask(), I)) { in lowerVECTOR_SHUFFLE()
7407 const int Idx = SVN->getMaskElt(I); in lowerVECTOR_SHUFFLE()
7409 int EltIdx = Idx < SrcNumElts ? Idx : Idx - SrcNumElts; in lowerVECTOR_SHUFFLE()
7411 PackVT, SVN->getOperand(VecIdx), in lowerVECTOR_SHUFFLE()
7415 const int Idx0 = SVN->getMaskElt(I); in lowerVECTOR_SHUFFLE()
7416 const int Idx1 = SVN->getMaskElt(I + 1); in lowerVECTOR_SHUFFLE()
7419 int EltIdx0 = Idx0 < SrcNumElts ? Idx0 : Idx0 - SrcNumElts; in lowerVECTOR_SHUFFLE()
7420 int EltIdx1 = Idx1 < SrcNumElts ? Idx1 : Idx1 - SrcNumElts; in lowerVECTOR_SHUFFLE()
7422 SDValue Vec0 = SVN->getOperand(VecIdx0); in lowerVECTOR_SHUFFLE()
7426 SDValue Vec1 = SVN->getOperand(VecIdx1); in lowerVECTOR_SHUFFLE()
7524 assert(!Subtarget->hasVOP3PInsts() && "this should be legal"); in lowerBUILD_VECTOR()
7554 // 32-bit addend in the instruction, so it is not safe to allow offset folding in isOffsetFoldingLegal()
7555 // which can create arbitrary 64-bit addends. (This is only a problem for in isOffsetFoldingLegal()
7561 if (!Subtarget->isAmdHsaOS()) in isOffsetFoldingLegal()
7565 return (GA->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS || in isOffsetFoldingLegal()
7566 GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS || in isOffsetFoldingLegal()
7567 GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) && in isOffsetFoldingLegal()
7568 !shouldEmitGOTReloc(GA->getGlobal()); in isOffsetFoldingLegal()
7575 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!"); in buildPCRelGlobalAddress()
7576 // In order to support pc-relative addressing, the PC_ADD_REL_OFFSET SDNode is in buildPCRelGlobalAddress()
7586 // constant, which is a pc-relative offset from the encoding of the $symbol in buildPCRelGlobalAddress()
7597 // which is a 64-bit pc-relative offset from the encoding of the $symbol in buildPCRelGlobalAddress()
7615 const GlobalValue *GV = GSD->getGlobal(); in LowerGlobalAddress()
7616 if ((GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS && in LowerGlobalAddress()
7618 GSD->getAddressSpace() == AMDGPUAS::REGION_ADDRESS || in LowerGlobalAddress()
7619 GSD->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) { in LowerGlobalAddress()
7620 if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS && in LowerGlobalAddress()
7621 GV->hasExternalLinkage()) { in LowerGlobalAddress()
7622 Type *Ty = GV->getValueType(); in LowerGlobalAddress()
7624 // zero-sized type in other languages to declare the dynamic shared in LowerGlobalAddress()
7629 assert(PtrVT == MVT::i32 && "32-bit pointer is expected."); in LowerGlobalAddress()
7632 MFI->setDynLDSAlign(F, *cast<GlobalVariable>(GV)); in LowerGlobalAddress()
7633 MFI->setUsesDynamicLDS(true); in LowerGlobalAddress()
7641 if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) { in LowerGlobalAddress()
7642 SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, GSD->getOffset(), in LowerGlobalAddress()
7647 if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS()) { in LowerGlobalAddress()
7649 GV, DL, MVT::i32, GSD->getOffset(), SIInstrInfo::MO_ABS32_LO); in LowerGlobalAddress()
7653 GV, DL, MVT::i32, GSD->getOffset(), SIInstrInfo::MO_ABS32_HI); in LowerGlobalAddress()
7660 return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT); in LowerGlobalAddress()
7663 return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT, in LowerGlobalAddress()
7704 // The local size values will have the hi 16-bits as zero. in lowerImplicitZextParam()
7712 "non-hsa intrinsic with hsa target", in emitNonHSAIntrinsicError()
7714 DAG.getContext()->diagnose(BadIntrin); in emitNonHSAIntrinsicError()
7723 DAG.getContext()->diagnose(BadIntrin); in emitRemovedIntrinsicError()
7768 while (ExtraElts--) in padEltsToUndef()
7774 // Re-construct the required return value for an image load intrinsic.
7813 NumDataDwords - MaskPopDwords); in constructRetValue()
7843 if (Result->getNumValues() == 1) in constructRetValue()
7853 uint64_t Value = TexFailCtrlConst->getZExtValue(); in parseTexFail()
7882 ((NumGradients / 2) % 2 == 1 && (I == DimIdx + (NumGradients / 2) - 1 || in packImage16bitOpsToDwords()
7883 I == DimIdx + NumGradients - 1))) { in packImage16bitOpsToDwords()
7903 AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode); in lowerImage()
7904 const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim); in lowerImage()
7905 unsigned IntrOpcode = Intr->BaseOpcode; in lowerImage()
7910 SmallVector<EVT, 3> ResultTypes(Op->values()); in lowerImage()
7911 SmallVector<EVT, 3> OrigResultTypes(Op->values()); in lowerImage()
7926 if (BaseOpcode->Atomic) { in lowerImage()
7930 (Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 || in lowerImage()
7931 Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16); in lowerImage()
7934 if (BaseOpcode->AtomicX2) { in lowerImage()
7949 DMask = Op->getConstantOperandVal(ArgOffset + Intr->DMaskIndex); in lowerImage()
7950 DMaskLanes = BaseOpcode->Gather4 ? 4 : llvm::popcount(DMask); in lowerImage()
7952 if (BaseOpcode->Store) { in lowerImage()
7957 if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16) in lowerImage()
7965 } else if (!BaseOpcode->NoReturn) { in lowerImage()
7970 if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16) in lowerImage()
7984 if (IsD16 && !Subtarget->hasUnpackedD16VMem() && in lowerImage()
7985 !(BaseOpcode->Gather4 && Subtarget->hasImageGather4D16Bug())) in lowerImage()
7994 unsigned VAddrEnd = ArgOffset + Intr->VAddrEnd; in lowerImage()
7999 Op.getOperand(ArgOffset + Intr->GradientStart).getSimpleValueType(); in lowerImage()
8004 VAddrVT = Op.getOperand(ArgOffset + Intr->CoordStart).getSimpleValueType(); in lowerImage()
8010 for (unsigned I = Intr->VAddrStart; I < Intr->GradientStart; I++) { in lowerImage()
8012 assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument"); in lowerImage()
8014 // occupies a full 32 bits. in lowerImage()
8020 assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) && in lowerImage()
8026 if (BaseOpcode->Gradients && !ST->hasG16() && (IsA16 != IsG16)) { in lowerImage()
8036 if (!ST->hasA16()) { in lowerImage()
8048 if (BaseOpcode->Gradients && IsG16 && ST->hasG16()) { in lowerImage()
8051 AMDGPU::getMIMGG16MappingInfo(Intr->BaseOpcode); in lowerImage()
8052 IntrOpcode = G16MappingInfo->G16; // set new opcode to variant with _g16 in lowerImage()
8058 // const int PackEndIdx = IsA16 ? VAddrEnd : (ArgOffset + Intr->CoordStart); in lowerImage()
8060 ArgOffset + Intr->GradientStart, in lowerImage()
8061 ArgOffset + Intr->CoordStart, Intr->NumGradients); in lowerImage()
8063 for (unsigned I = ArgOffset + Intr->GradientStart; in lowerImage()
8064 I < ArgOffset + Intr->CoordStart; I++) in lowerImage()
8071 ArgOffset + Intr->CoordStart, VAddrEnd, in lowerImage()
8075 for (unsigned I = ArgOffset + Intr->CoordStart; I < VAddrEnd; I++) in lowerImage()
8080 // without introducing moves, then using the non-sequential address encoding in lowerImage()
8086 // so force non-NSA for the common 2-address case as a heuristic. in lowerImage()
8088 // SIShrinkInstructions will convert NSA encodings to non-NSA after register in lowerImage()
8093 const unsigned NSAMaxSize = ST->getNSAMaxSize(BaseOpcode->Sampler); in lowerImage()
8094 const bool HasPartialNSAEncoding = ST->hasPartialNSAEncoding(); in lowerImage()
8095 const bool UseNSA = ST->hasNSAEncoding() && in lowerImage()
8096 VAddrs.size() >= ST->getNSAThreshold(MF) && in lowerImage()
8104 ArrayRef(VAddrs).drop_front(NSAMaxSize - 1)); in lowerImage()
8113 if (!BaseOpcode->Sampler) { in lowerImage()
8117 Op.getConstantOperandVal(ArgOffset + Intr->UnormIndex); in lowerImage()
8124 SDValue TexFail = Op.getOperand(ArgOffset + Intr->TexFailCtrlIndex); in lowerImage()
8131 // Expecting to get an error flag since TFC is on - and dmask is 0 in lowerImage()
8145 if (DMaskLanes == 0 && !BaseOpcode->Store) { in lowerImage()
8146 // This is a no-op load. This can be eliminated in lowerImage()
8166 unsigned CPol = Op.getConstantOperandVal(ArgOffset + Intr->CachePolicyIndex); in lowerImage()
8167 if (BaseOpcode->Atomic) in lowerImage()
8168 CPol |= AMDGPU::CPol::GLC; // TODO no-return optimization in lowerImage()
8174 if (BaseOpcode->Store || BaseOpcode->Atomic) in lowerImage()
8177 append_range(Ops, ArrayRef(VAddrs).take_front(NSAMaxSize - 1)); in lowerImage()
8184 Ops.push_back(Op.getOperand(ArgOffset + Intr->RsrcIndex)); in lowerImage()
8185 if (BaseOpcode->Sampler) in lowerImage()
8186 Ops.push_back(Op.getOperand(ArgOffset + Intr->SampIndex)); in lowerImage()
8189 Ops.push_back(DAG.getTargetConstant(DimInfo->Encoding, DL, MVT::i32)); in lowerImage()
8190 if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA) in lowerImage()
8194 ST->hasFeature(AMDGPU::FeatureR128A16) ? True : False); in lowerImage()
8197 if (!Subtarget->hasGFX90AInsts()) { in lowerImage()
8199 } else if (TFE->getAsZExtVal()) { in lowerImage()
8202 if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA) in lowerImage()
8205 Ops.push_back(DimInfo->DA ? True : False); in lowerImage()
8206 if (BaseOpcode->HasD16) in lowerImage()
8213 int Opcode = -1; in lowerImage()
8229 if (Subtarget->hasGFX90AInsts()) { in lowerImage()
8232 if (Opcode == -1) in lowerImage()
8236 if (Opcode == -1 && in lowerImage()
8237 Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) in lowerImage()
8240 if (Opcode == -1) in lowerImage()
8244 if (Opcode == -1) in lowerImage()
8249 MachineMemOperand *MemRef = MemOp->getMemOperand(); in lowerImage()
8253 if (BaseOpcode->AtomicX2) { in lowerImage()
8258 if (BaseOpcode->NoReturn) in lowerImage()
8261 Subtarget->hasUnpackedD16VMem(), IsD16, DMaskLanes, in lowerImage()
8280 if (!Offset->isDivergent()) { in lowerSBuffer()
8287 if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) { in lowerSBuffer()
8296 !Subtarget->hasScalarDwordx3Loads()) { in lowerSBuffer()
8323 if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) { in lowerSBuffer()
8347 uint64_t InstOffset = Ops[5]->getAsZExtVal(); in lowerSBuffer()
8362 if (!Subtarget->hasArchitectedSGPRs()) in lowerWaveID()
8376 unsigned MaxID = Subtarget->getMaxWorkitemID(MF.getFunction(), Dim); in lowerWorkitemID()
8405 // TODO: Should this propagate fast-math-flags? in LowerINTRINSIC_WO_CHAIN()
8409 if (getSubtarget()->isAmdHsaOrMesa(MF.getFunction())) in LowerINTRINSIC_WO_CHAIN()
8416 if (!Subtarget->isAmdHsaOrMesa(MF.getFunction())) { in LowerINTRINSIC_WO_CHAIN()
8420 DAG.getContext()->diagnose(BadIntrin); in LowerINTRINSIC_WO_CHAIN()
8429 if (MFI->isEntryFunction()) in LowerINTRINSIC_WO_CHAIN()
8451 if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) in LowerINTRINSIC_WO_CHAIN()
8455 if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) in LowerINTRINSIC_WO_CHAIN()
8459 if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS) in LowerINTRINSIC_WO_CHAIN()
8463 APFloat Max = APFloat::getLargest(Type->getFltSemantics()); in LowerINTRINSIC_WO_CHAIN()
8464 APFloat Min = APFloat::getLargest(Type->getFltSemantics(), true); in LowerINTRINSIC_WO_CHAIN()
8473 if (Subtarget->isAmdHsaOS()) in LowerINTRINSIC_WO_CHAIN()
8480 if (Subtarget->isAmdHsaOS()) in LowerINTRINSIC_WO_CHAIN()
8487 if (Subtarget->isAmdHsaOS()) in LowerINTRINSIC_WO_CHAIN()
8494 if (Subtarget->isAmdHsaOS()) in LowerINTRINSIC_WO_CHAIN()
8501 if (Subtarget->isAmdHsaOS()) in LowerINTRINSIC_WO_CHAIN()
8508 if (Subtarget->isAmdHsaOS()) in LowerINTRINSIC_WO_CHAIN()
8515 if (Subtarget->isAmdHsaOS()) in LowerINTRINSIC_WO_CHAIN()
8521 if (Subtarget->isAmdHsaOS()) in LowerINTRINSIC_WO_CHAIN()
8527 if (Subtarget->isAmdHsaOS()) in LowerINTRINSIC_WO_CHAIN()
8544 if (MFI->isEntryFunction()) in LowerINTRINSIC_WO_CHAIN()
8550 return lowerWorkitemID(DAG, Op, 0, MFI->getArgInfo().WorkItemIDX); in LowerINTRINSIC_WO_CHAIN()
8552 return lowerWorkitemID(DAG, Op, 1, MFI->getArgInfo().WorkItemIDY); in LowerINTRINSIC_WO_CHAIN()
8554 return lowerWorkitemID(DAG, Op, 2, MFI->getArgInfo().WorkItemIDZ); in LowerINTRINSIC_WO_CHAIN()
8562 if (CPol & ~((Subtarget->getGeneration() >= AMDGPUSubtarget::GFX12) in LowerINTRINSIC_WO_CHAIN()
8583 if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS) in LowerINTRINSIC_WO_CHAIN()
8616 SDValue Src0 = Param->isAllOnes() ? Numerator : Denominator; in LowerINTRINSIC_WO_CHAIN()
8618 return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, Op->getVTList(), Src0, in LowerINTRINSIC_WO_CHAIN()
8622 // There is a Pat that handles this variant, so return it as-is. in LowerINTRINSIC_WO_CHAIN()
8685 Op->getOperand(1), Op->getOperand(2)), 0); in LowerINTRINSIC_WO_CHAIN()
8694 M->getNamedValue(Intrinsic::getName(Intrinsic::amdgcn_groupstaticsize)); in LowerINTRINSIC_WO_CHAIN()
8717 const MDNode *Metadata = cast<MDNodeSDNode>(Op.getOperand(1))->getMD(); in LowerINTRINSIC_WO_CHAIN()
8718 auto SymbolName = cast<MDString>(Metadata->getOperand(0))->getString(); in LowerINTRINSIC_WO_CHAIN()
8720 M->getOrInsertGlobal(SymbolName, Type::getInt32Ty(M->getContext()))); in LowerINTRINSIC_WO_CHAIN()
8777 if (Subtarget->hasRestrictedSOffset() && isNullConstant(SOffset)) in selectSOffset()
8806 return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops, MemVT, in lowerRawBufferAtomicIntrin()
8807 M->getMemOperand()); in lowerRawBufferAtomicIntrin()
8834 return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops, MemVT, in lowerStructBufferAtomicIntrin()
8835 M->getMemOperand()); in lowerStructBufferAtomicIntrin()
8847 SDValue Chain = M->getOperand(0); in LowerINTRINSIC_W_CHAIN()
8848 SDValue M0 = M->getOperand(2); in LowerINTRINSIC_W_CHAIN()
8849 SDValue Value = M->getOperand(3); in LowerINTRINSIC_W_CHAIN()
8850 unsigned IndexOperand = M->getConstantOperandVal(7); in LowerINTRINSIC_W_CHAIN()
8851 unsigned WaveRelease = M->getConstantOperandVal(8); in LowerINTRINSIC_W_CHAIN()
8852 unsigned WaveDone = M->getConstantOperandVal(9); in LowerINTRINSIC_W_CHAIN()
8858 if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10) { in LowerINTRINSIC_W_CHAIN()
8880 if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10) in LowerINTRINSIC_W_CHAIN()
8881 Offset1 |= (CountDw - 1) << 6; in LowerINTRINSIC_W_CHAIN()
8883 if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX11) in LowerINTRINSIC_W_CHAIN()
8895 M->getVTList(), Ops, M->getMemoryVT(), in LowerINTRINSIC_W_CHAIN()
8896 M->getMemOperand()); in LowerINTRINSIC_W_CHAIN()
8973 Op->getVTList(), Ops, LoadVT, M->getMemOperand(), in LowerINTRINSIC_W_CHAIN()
9000 Op->getVTList(), Ops, LoadVT, M->getMemOperand(), in LowerINTRINSIC_W_CHAIN()
9126 Op->getVTList(), Ops, VT, M->getMemOperand()); in LowerINTRINSIC_W_CHAIN()
9130 SDValue Rsrc = bufferRsrcPtrToVector(Op->getOperand(4), DAG); in LowerINTRINSIC_W_CHAIN()
9149 Op->getVTList(), Ops, VT, M->getMemOperand()); in LowerINTRINSIC_W_CHAIN()
9153 SDValue NodePtr = M->getOperand(2); in LowerINTRINSIC_W_CHAIN()
9154 SDValue RayExtent = M->getOperand(3); in LowerINTRINSIC_W_CHAIN()
9155 SDValue RayOrigin = M->getOperand(4); in LowerINTRINSIC_W_CHAIN()
9156 SDValue RayDir = M->getOperand(5); in LowerINTRINSIC_W_CHAIN()
9157 SDValue RayInvDir = M->getOperand(6); in LowerINTRINSIC_W_CHAIN()
9158 SDValue TDescr = M->getOperand(7); in LowerINTRINSIC_W_CHAIN()
9165 if (!Subtarget->hasGFX10_AEncoding()) { in LowerINTRINSIC_W_CHAIN()
9178 const bool UseNSA = (Subtarget->hasNSAEncoding() && in LowerINTRINSIC_W_CHAIN()
9179 NumVAddrs <= Subtarget->getNSAMaxSize()) || in LowerINTRINSIC_W_CHAIN()
9199 assert(Opcode != -1); in LowerINTRINSIC_W_CHAIN()
9265 Ops.append(16 - Ops.size(), Undef); in LowerINTRINSIC_W_CHAIN()
9276 Ops.push_back(M->getChain()); in LowerINTRINSIC_W_CHAIN()
9278 auto *NewNode = DAG.getMachineNode(Opcode, DL, M->getVTList(), Ops); in LowerINTRINSIC_W_CHAIN()
9279 MachineMemOperand *MemRef = M->getMemOperand(); in LowerINTRINSIC_W_CHAIN()
9293 M->getOperand(0), // Chain in LowerINTRINSIC_W_CHAIN()
9294 M->getOperand(2), // Ptr in LowerINTRINSIC_W_CHAIN()
9295 M->getOperand(3) // Value in LowerINTRINSIC_W_CHAIN()
9316 return DAG.getAtomic(Opcode, SDLoc(Op), M->getMemoryVT(), M->getVTList(), in LowerINTRINSIC_W_CHAIN()
9317 Ops, M->getMemOperand()); in LowerINTRINSIC_W_CHAIN()
9320 SDValue Chain = Op->getOperand(0); in LowerINTRINSIC_W_CHAIN()
9326 if (isa<ConstantSDNode>(Op->getOperand(2))) { in LowerINTRINSIC_W_CHAIN()
9327 BarID = cast<ConstantSDNode>(Op->getOperand(2))->getSExtValue(); in LowerINTRINSIC_W_CHAIN()
9341 auto NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops); in LowerINTRINSIC_W_CHAIN()
9389 if (!Subtarget->hasDwordx3LoadStores() && in getMemIntrinsicNode()
9416 if (Subtarget->hasUnpackedD16VMem()) { in handleD16VData()
9430 if (ImageStore && Subtarget->hasImageStoreD16Bug()) { in handleD16VData()
9491 if (!Subtarget->hasCompressedExport()) { in LowerINTRINSIC_VOID()
9495 DAG.getContext()->diagnose(BadIntrin); in LowerINTRINSIC_VOID()
9517 unsigned Opc = Done->isZero() ? AMDGPU::EXP : AMDGPU::EXP_DONE; in LowerINTRINSIC_VOID()
9518 return SDValue(DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops), 0); in LowerINTRINSIC_VOID()
9571 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, in LowerINTRINSIC_VOID()
9572 M->getMemoryVT(), M->getMemOperand()); in LowerINTRINSIC_VOID()
9599 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, in LowerINTRINSIC_VOID()
9600 M->getMemoryVT(), M->getMemOperand()); in LowerINTRINSIC_VOID()
9649 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, in LowerINTRINSIC_VOID()
9650 M->getMemoryVT(), M->getMemOperand()); in LowerINTRINSIC_VOID()
9701 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, in LowerINTRINSIC_VOID()
9702 M->getMemoryVT(), M->getMemOperand()); in LowerINTRINSIC_VOID()
9716 unsigned Size = Op->getConstantOperandVal(4); in LowerINTRINSIC_VOID()
9767 MachineMemOperand *LoadMMO = M->getMemOperand(); in LowerINTRINSIC_VOID()
9770 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo(); in LowerINTRINSIC_VOID()
9778 auto F = LoadMMO->getFlags() & in LowerINTRINSIC_VOID()
9782 LoadMMO->getBaseAlign(), LoadMMO->getAAInfo()); in LowerINTRINSIC_VOID()
9786 LoadMMO->getBaseAlign(), LoadMMO->getAAInfo()); in LowerINTRINSIC_VOID()
9788 auto Load = DAG.getMachineNode(Opc, DL, M->getVTList(), Ops); in LowerINTRINSIC_VOID()
9795 unsigned Size = Op->getConstantOperandVal(4); in LowerINTRINSIC_VOID()
9819 if (Addr->isDivergent() && Addr.getOpcode() == ISD::ADD) { in LowerINTRINSIC_VOID()
9823 if (LHS->isDivergent()) in LowerINTRINSIC_VOID()
9826 if (!LHS->isDivergent() && RHS.getOpcode() == ISD::ZERO_EXTEND && in LowerINTRINSIC_VOID()
9835 if (!Addr->isDivergent()) { in LowerINTRINSIC_VOID()
9849 MachineMemOperand *LoadMMO = M->getMemOperand(); in LowerINTRINSIC_VOID()
9850 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo(); in LowerINTRINSIC_VOID()
9851 LoadPtrI.Offset = Op->getConstantOperandVal(5); in LowerINTRINSIC_VOID()
9857 auto F = LoadMMO->getFlags() & in LowerINTRINSIC_VOID()
9861 LoadMMO->getBaseAlign(), LoadMMO->getAAInfo()); in LowerINTRINSIC_VOID()
9864 LoadMMO->getAAInfo()); in LowerINTRINSIC_VOID()
9866 auto Load = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops); in LowerINTRINSIC_VOID()
9873 Op->getOperand(2), Chain), 0); in LowerINTRINSIC_VOID()
9877 SDValue Chain = Op->getOperand(0); in LowerINTRINSIC_VOID()
9879 SDValue BarOp = Op->getOperand(2); in LowerINTRINSIC_VOID()
9885 BarVal = cast<ConstantSDNode>(BarOp)->getSExtValue(); in LowerINTRINSIC_VOID()
9941 auto NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops); in LowerINTRINSIC_VOID()
9975 unsigned ImmOffset = C1->getZExtValue(); in splitBufferOffsets()
9985 ImmOffset -= Overflow; in splitBufferOffsets()
10014 const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); in setBufferOffsets()
10017 uint32_t Imm = C->getZExtValue(); in setBufferOffsets()
10019 if (TII->splitMUBUFOffset(Imm, SOffset, ImmOffset, Alignment)) { in setBufferOffsets()
10030 int Offset = cast<ConstantSDNode>(N1)->getSExtValue(); in setBufferOffsets()
10032 TII->splitMUBUFOffset(Offset, SOffset, ImmOffset, Alignment)) { in setBufferOffsets()
10040 SDValue SOffsetZero = Subtarget->hasRestrictedSOffset() in setBufferOffsets()
10060 // Wrap a global or flat pointer into a buffer intrinsic using the flags
10066 SDValue Pointer = Op->getOperand(1); in lowerPointerAsRsrcIntrin()
10067 SDValue Stride = Op->getOperand(2); in lowerPointerAsRsrcIntrin()
10068 SDValue NumRecords = Op->getOperand(3); in lowerPointerAsRsrcIntrin()
10069 SDValue Flags = Op->getOperand(4); in lowerPointerAsRsrcIntrin()
10076 ConstStride = ConstNode->getZExtValue(); in lowerPointerAsRsrcIntrin()
10148 return DAG.getMemIntrinsicNode(Opc, DL, M->getVTList(), OpsRef, VDataType, in handleByteShortBufferStores()
10149 M->getMemOperand()); in handleByteShortBufferStores()
10172 // Try to turn 8 and 16-bit scalar loads into SMEM eligible 32-bit loads.
10173 // TODO: Skip this on GFX12 which does have scalar sub-dword loads.
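// Editorial summary of the transform below (assumptions drawn from the visible
// code): a uniform i8/i16 load that is at least dword-aligned, from a constant
// or invariant global address, is rebuilt as an i32 load of the same address;
// the original extension kind (sext/zext/any) is then re-applied to the low
// bits of the widened value so the load becomes eligible for SMEM.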
10176 if (Ld->getAlign() < Align(4) || Ld->isDivergent()) in widenLoad()
10180 unsigned AS = Ld->getAddressSpace(); in widenLoad()
10183 (AS != AMDGPUAS::GLOBAL_ADDRESS || !Ld->isInvariant())) in widenLoad()
10188 // pre-legalize. in widenLoad()
10189 EVT MemVT = Ld->getMemoryVT(); in widenLoad()
10196 assert((!MemVT.isVector() || Ld->getExtensionType() == ISD::NON_EXTLOAD) && in widenLoad()
10200 SDValue Ptr = Ld->getBasePtr(); in widenLoad()
10202 ISD::UNINDEXED, ISD::NON_EXTLOAD, MVT::i32, SL, Ld->getChain(), Ptr, in widenLoad()
10203 Ld->getOffset(), Ld->getPointerInfo(), MVT::i32, Ld->getAlign(), in widenLoad()
10204 Ld->getMemOperand()->getFlags(), Ld->getAAInfo(), in widenLoad()
10209 assert(Ld->getExtensionType() == ISD::NON_EXTLOAD && in widenLoad()
10215 if (Ld->getExtensionType() == ISD::SEXTLOAD) { in widenLoad()
10218 } else if (Ld->getExtensionType() == ISD::ZEXTLOAD || in widenLoad()
10219 Ld->getExtensionType() == ISD::NON_EXTLOAD) { in widenLoad()
10222 assert(Ld->getExtensionType() == ISD::EXTLOAD); in widenLoad()
10225 EVT VT = Ld->getValueType(0); in widenLoad()
10230 // We may need to handle exotic cases, such as i16->i64 extloads, so insert in widenLoad()
10231 // the appropriate extension from the 32-bit load. in widenLoad()
10232 Cvt = getLoadExtOrTrunc(DAG, Ld->getExtensionType(), Cvt, SL, IntVT); in widenLoad()
10252 ISD::LoadExtType ExtType = Load->getExtensionType(); in LowerLOAD()
10253 EVT MemVT = Load->getMemoryVT(); in LowerLOAD()
10262 SDValue Chain = Load->getChain(); in LowerLOAD()
10263 SDValue BasePtr = Load->getBasePtr(); in LowerLOAD()
10264 MachineMemOperand *MMO = Load->getMemOperand(); in LowerLOAD()
10300 "Custom lowering for non-i32 vectors hasn't been implemented."); in LowerLOAD()
10302 Align Alignment = Load->getAlign(); in LowerLOAD()
10303 unsigned AS = Load->getAddressSpace(); in LowerLOAD()
10304 if (Subtarget->hasLDSMisalignedBug() && AS == AMDGPUAS::FLAT_ADDRESS && in LowerLOAD()
10311 // If there is a possibility that a flat instruction accesses scratch memory in LowerLOAD()
10314 !Subtarget->hasMultiDwordFlatScratchAddressing()) in LowerLOAD()
10315 AS = addressMayBeAccessedAsPrivate(Load->getMemOperand(), *MFI) ? in LowerLOAD()
10322 if (!Op->isDivergent() && Alignment >= Align(4) && NumElements < 32) { in LowerLOAD()
10324 (Subtarget->hasScalarDwordx3Loads() && NumElements == 3)) in LowerLOAD()
10328 // Non-uniform loads will be selected to MUBUF instructions, so they in LowerLOAD()
10337 if (Subtarget->getScalarizeGlobalBehavior() && !Op->isDivergent() && in LowerLOAD()
10338 Load->isSimple() && isMemOpHasNoClobberedMemOperand(Load) && in LowerLOAD()
10341 (Subtarget->hasScalarDwordx3Loads() && NumElements == 3)) in LowerLOAD()
10345 // Non-uniform loads will be selected to MUBUF instructions, so they in LowerLOAD()
10357 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores()) in LowerLOAD()
10367 switch (Subtarget->getMaxPrivateElementSize()) { in LowerLOAD()
10378 // Same as global/flat in LowerLOAD()
10382 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores()) in LowerLOAD()
10391 auto Flags = Load->getMemOperand()->getFlags(); in LowerLOAD()
10393 Load->getAlign(), Flags, &Fast) && in LowerLOAD()
10402 MemVT, *Load->getMemOperand())) { in LowerLOAD()
10450 const SDNodeFlags Flags = Op->getFlags(); in lowerFastUnsafeFDIV()
10462 if (CLHS->isExactlyValue(1.0)) { in lowerFastUnsafeFDIV()
10470 // 1.0 / sqrt(x) -> rsq(x) in lowerFastUnsafeFDIV()
10472 // XXX - Is UnsafeFPMath sufficient to do this for f64? The maximum ULP in lowerFastUnsafeFDIV()
10474 // 1.0 / x -> rcp(x) in lowerFastUnsafeFDIV()
10479 if (CLHS->isExactlyValue(-1.0)) { in lowerFastUnsafeFDIV()
10480 // -1.0 / x -> rcp (fneg x) in lowerFastUnsafeFDIV()
10492 // x / y -> x * (1.0 / y) in lowerFastUnsafeFDIV()
10503 const SDNodeFlags Flags = Op->getFlags(); in lowerFastUnsafeFDIV64()
10527 if (GlueChain->getNumValues() <= 1) { in getFPBinOp()
10531 assert(GlueChain->getNumValues() == 3); in getFPBinOp()
10549 if (GlueChain->getNumValues() <= 1) { in getFPTernOp()
10553 assert(GlueChain->getNumValues() == 3); in getFPTernOp()
10590 SDNodeFlags Flags = Op->getFlags(); in lowerFDIV_FAST()
10600 const APFloat K1Val(0x1p-32f); in lowerFDIV_FAST()
10627 assert(ST->hasDenormModeInst() && "Requires S_DENORM_MODE"); in getSPDenormModeValue()
10628 uint32_t DPDenormModeDefault = Info->getMode().fpDenormModeDPValue(); in getSPDenormModeValue()
10641 SDNodeFlags Flags = Op->getFlags(); in LowerFDIV32()
10669 const DenormalMode DenormMode = Info->getMode().FP32Denormals; in LowerFDIV32()
10679 // Note we can't use the STRICT_FMA/STRICT_FMUL for the non-strict FDIV in LowerFDIV32()
10697 if (Subtarget->hasDenormModeInst()) { in LowerFDIV32()
10740 if (!HasDynamicDenormals && Subtarget->hasDenormModeInst()) { in LowerFDIV32()
10805 if (!Subtarget->hasUsableDivScaleConditionOutput()) { in LowerFDIV64()
10857 EVT ResultExpVT = Op->getValueType(1); in LowerFFREXP()
10868 if (Subtarget->hasFractBug()) { in LowerFFREXP()
10886 EVT VT = Store->getMemoryVT(); in LowerSTORE()
10889 return DAG.getTruncStore(Store->getChain(), DL, in LowerSTORE()
10890 DAG.getSExtOrTrunc(Store->getValue(), DL, MVT::i32), in LowerSTORE()
10891 Store->getBasePtr(), MVT::i1, Store->getMemOperand()); in LowerSTORE()
10895 Store->getValue().getValueType().getScalarType() == MVT::i32); in LowerSTORE()
10897 unsigned AS = Store->getAddressSpace(); in LowerSTORE()
10898 if (Subtarget->hasLDSMisalignedBug() && in LowerSTORE()
10900 Store->getAlign().value() < VT.getStoreSize() && VT.getSizeInBits() > 32) { in LowerSTORE()
10906 // If there is a possibility that a flat instruction accesses scratch memory in LowerSTORE()
10909 !Subtarget->hasMultiDwordFlatScratchAddressing()) in LowerSTORE()
10910 AS = addressMayBeAccessedAsPrivate(Store->getMemOperand(), *MFI) ? in LowerSTORE()
10919 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores()) in LowerSTORE()
10923 VT, *Store->getMemOperand())) in LowerSTORE()
10929 switch (Subtarget->getMaxPrivateElementSize()) { in LowerSTORE()
10938 (NumElements == 3 && !Subtarget->enableFlatScratch())) in LowerSTORE()
10946 auto Flags = Store->getMemOperand()->getFlags(); in LowerSTORE()
10948 Store->getAlign(), Flags, &Fast) && in LowerSTORE()
10965 assert(!Subtarget->has16BitInsts()); in lowerFSQRTF16()
10966 SDNodeFlags Flags = Op->getFlags(); in lowerFSQRTF16()
10980 SDNodeFlags Flags = Op->getFlags(); in lowerFSQRTF32()
10991 SDValue ScaleThreshold = DAG.getConstantFP(0x1.0p-96f, DL, VT); in lowerFSQRTF32()
11009 DAG.getConstant(-1, DL, MVT::i32)); in lowerFSQRTF32()
11054 SDValue ScaleDownFactor = DAG.getConstantFP(0x1.0p-16f, DL, VT); in lowerFSQRTF32()
11075 // r0 = 0.5 - h0 * g0 in lowerFSQRTF64()
11079 // r1 = 0.5 - h1 * g1 => d0 = x - g1 * g1 in lowerFSQRTF64()
11083 // r2 = 0.5 - h2 * g2 => d1 = x - g2 * g2 in lowerFSQRTF64()
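// Illustrative scalar model of the coupled iteration behind these residuals
// (editorial sketch, not the exact DAG built here): g tracks sqrt(x), h tracks
// 1/(2*sqrt(x)), r = 0.5 - h*g is the residual and d = x - g*g the error term.
static double sqrtCoupledIteration(double X, double RsqApprox) {
  double G = X * RsqApprox;   // g0 ~ sqrt(x)
  double H = 0.5 * RsqApprox; // h0 ~ 1/(2*sqrt(x))
  for (int I = 0; I < 2; ++I) {
    double R = 0.5 - H * G;   // r = 0.5 - h*g
    G += G * R;               // refine the sqrt estimate
    H += H * R;               // refine the half-reciprocal estimate
  }
  double D = X - G * G;       // d = x - g*g
  return G + D * H;           // final Newton-style correction
}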
11088 SDNodeFlags Flags = Op->getFlags(); in lowerFSQRTF64()
11093 SDValue ScaleConstant = DAG.getConstantFP(0x1.0p-767, DL, MVT::f64); in lowerFSQRTF64()
11130 SDValue ScaleDownFactor = DAG.getConstant(-128, DL, MVT::i32); in lowerFSQRTF64()
11136 // with finite only or nsz because rsq(+/-0) = +/-inf in lowerFSQRTF64()
11143 // If x is +INF, +0, or -0, use its original value in lowerFSQRTF64()
11154 // Propagate fast-math flags so that the multiply we introduce can be folded in LowerTrig()
11156 auto Flags = Op->getFlags(); in LowerTrig()
11160 if (Subtarget->hasTrigReducedRange()) { in LowerTrig()
11179 assert(AtomicNode->isCompareAndSwap()); in LowerATOMIC_CMP_SWAP()
11180 unsigned AS = AtomicNode->getAddressSpace(); in LowerATOMIC_CMP_SWAP()
11186 // Non-local address space requires custom lowering for atomic compare in LowerATOMIC_CMP_SWAP()
11200 return DAG.getMemIntrinsicNode(AMDGPUISD::ATOMIC_CMP_SWAP, DL, Op->getVTList(), in LowerATOMIC_CMP_SWAP()
11201 Ops, VT, AtomicNode->getMemOperand()); in LowerATOMIC_CMP_SWAP()
11204 //===----------------------------------------------------------------------===//
11206 //===----------------------------------------------------------------------===//
11210 EVT VT = N->getValueType(0); in performUCharToFloatCombine()
11218 SDValue Src = N->getOperand(0); in performUCharToFloatCombine()
11223 // types are legalized. v4i8 -> v4f32 is probably the only case to worry in performUCharToFloatCombine()
11244 SDValue MagnitudeOp = N->getOperand(0); in performFCopySignCombine()
11245 SDValue SignOp = N->getOperand(1); in performFCopySignCombine()
11251 // fcopysign f64:x, _:y -> x.lo32, (fcopysign (f32 x.hi32), _:y) in performFCopySignCombine()
11274 // fcopysign f64:x, f64:y -> in performFCopySignCombine()
11282 return DAG.getNode(ISD::FCOPYSIGN, DL, N->getValueType(0), N->getOperand(0), in performFCopySignCombine()
11286 // (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2)
11287 // (shl (or x, c1), c2) -> add (shl x, c2), (shl c1, c2) iff x and c1 share no
11291 // (mul (add x, c1), c2) -> add (mul x, c2), (mul c1, c2),
11306 SDValue N0 = N->getOperand(0); in performSHLPtrCombine()
11307 SDValue N1 = N->getOperand(1); in performSHLPtrCombine()
11312 N0->hasOneUse()) in performSHLPtrCombine()
11325 if (N0->getOpcode() == ISD::OR && in performSHLPtrCombine()
11331 APInt Offset = CAdd->getAPIntValue() << CN1->getAPIntValue(); in performSHLPtrCombine()
11341 EVT VT = N->getValueType(0); in performSHLPtrCombine()
11347 Flags.setNoUnsignedWrap(N->getFlags().hasNoUnsignedWrap() && in performSHLPtrCombine()
11349 N0->getFlags().hasNoUnsignedWrap())); in performSHLPtrCombine()
11358 switch (N->getOpcode()) { in getBasePtrIndex()
11374 SDValue Ptr = N->getOperand(PtrIdx); in performMemSDNodeCombine()
11378 SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), N->getAddressSpace(), in performMemSDNodeCombine()
11379 N->getMemoryVT(), DCI); in performMemSDNodeCombine()
11381 SmallVector<SDValue, 8> NewOps(N->op_begin(), N->op_end()); in performMemSDNodeCombine()
11397 // Break up 64-bit bit operation of a constant into two 32-bit and/or/xor. This
11398 // will typically happen anyway for a VALU 64-bit and. This exposes other 32-bit
11399 // integer combine opportunities since most 64-bit operations are decomposed
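// Editorial example of the split: a 64-bit bitwise op with a constant becomes
// two independent 32-bit ops on the value's halves,
//   and i64 X, C  ==>  build_pair (and Lo(X), Lo(C)), (and Hi(X), Hi(C))
// and likewise for or/xor, since bitwise ops carry nothing across the 32-bit
// boundary.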
11407 uint64_t Val = CRHS->getZExtValue(); in splitBinaryBitConstantOp()
11410 const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); in splitBinaryBitConstantOp()
11414 (CRHS->hasOneUse() && !TII->isInlineConstant(CRHS->getAPIntValue()))) { in splitBinaryBitConstantOp()
11415 // If we need to materialize a 64-bit immediate, it will be split up later in splitBinaryBitConstantOp()
11416 // anyway. Avoid creating the harder to understand 64-bit immediate in splitBinaryBitConstantOp()
11450 uint32_t NonZeroByteMask = ~ZeroByteMask; // 0xff for any non-zero byte in getConstantPermuteMask()
11458 // or -1 if not succeeded.
11460 // value 0-3 selects corresponding source byte;
11473 uint32_t C = N1->getZExtValue(); in getPermuteMask()
11510 EVT VT = N->getValueType(0); in performAndCombine()
11511 SDValue LHS = N->getOperand(0); in performAndCombine()
11512 SDValue RHS = N->getOperand(1); in performAndCombine()
11527 uint64_t Mask = CRHS->getZExtValue(); in performAndCombine()
11529 if (getSubtarget()->hasSDWA() && LHS->getOpcode() == ISD::SRL && in performAndCombine()
11531 if (auto *CShift = dyn_cast<ConstantSDNode>(LHS->getOperand(1))) { in performAndCombine()
11532 unsigned Shift = CShift->getZExtValue(); in performAndCombine()
11533 unsigned NB = CRHS->getAPIntValue().countr_zero(); in performAndCombine()
11535 if ((Offset & (Bits - 1)) == 0) { // Starts at a byte or word boundary. in performAndCombine()
11538 LHS->getOperand(0), in performAndCombine()
11551 // and (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2) in performAndCombine()
11566 // (and (fcmp ord x, x), (fcmp une (fabs x), inf)) -> in performAndCombine()
11569 ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get(); in performAndCombine()
11570 ISD::CondCode RCC = cast<CondCodeSDNode>(RHS.getOperand(2))->get(); in performAndCombine()
11584 if (!C1 || !C1->isInfinity() || C1->isNegative()) in performAndCombine()
11612 ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get(); in performAndCombine()
11613 // and (fcmp seto), (fp_class x, mask) -> fp_class x, mask & ~(p_nan | n_nan) in performAndCombine()
11614 // and (fcmp setuo), (fp_class x, mask) -> fp_class x, mask & (p_nan | n_nan) in performAndCombine()
11621 Mask->getZExtValue() & ~OrdMask : in performAndCombine()
11622 Mask->getZExtValue() & OrdMask; in performAndCombine()
11640 // and (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2) in performAndCombine()
11641 const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); in performAndCombine()
11643 N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) { in performAndCombine()
11655 // set, 0xff have 0xff in the mask, actual lanes are in the 0-3 range. in performAndCombine()
11664 // Each byte in each mask is either selector mask 0-3, or has higher in performAndCombine()
11714 // trunc* 255 srl -256
11718 // *In this example, the truncs are from i32->i16
11720 // calculateByteProvider would find t6, t7, t13, and t14 for bytes 0-3
11721 // respectively. calculateSrcByte would find (given node) -> ultimate src &
11722 // byteposition: t6 -> t15 & 1, t7 -> t16 & 0, t13 -> t15 & 0, t14 -> t15 & 3.
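// Editorial example of the byte-tracking idea (hypothetical values, not taken
// from the diagram above): for a 32-bit value built as
//   Op = or (shl (zext i8 A to i32), 24), (zext i8 B to i32)
// byte 3 of Op resolves to byte 0 of A, byte 0 of Op resolves to byte 0 of B,
// and bytes 1-2 resolve to constant-zero providers; matchPERM later turns such
// (source, byte-offset) pairs into a v_perm_b32 selector when profitable.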
11744 switch (Op->getOpcode()) { in calculateSrcByte()
11746 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1); in calculateSrcByte()
11752 SDValue NarrowOp = Op->getOperand(0); in calculateSrcByte()
11754 if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG) { in calculateSrcByte()
11755 auto *VTSign = cast<VTSDNode>(Op->getOperand(1)); in calculateSrcByte()
11756 NarrowVT = VTSign->getVT(); in calculateSrcByte()
11764 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1); in calculateSrcByte()
11769 auto ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1)); in calculateSrcByte()
11773 uint64_t BitShift = ShiftOp->getZExtValue(); in calculateSrcByte()
11780 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1); in calculateSrcByte()
11807 if (Index > BitWidth / 8 - 1) in calculateByteProvider()
11826 if (!LHS->isConstantZero() && !RHS->isConstantZero()) in calculateByteProvider()
11828 if (!LHS || LHS->isConstantZero()) in calculateByteProvider()
11830 if (!RHS || RHS->isConstantZero()) in calculateByteProvider()
11839 auto BitMaskOp = dyn_cast<ConstantSDNode>(Op->getOperand(1)); in calculateByteProvider()
11843 uint32_t BitMask = BitMaskOp->getZExtValue(); in calculateByteProvider()
11855 return calculateSrcByte(Op->getOperand(0), StartingIndex, Index); in calculateByteProvider()
11862 // fshr(X,Y,Z): (X << (BW - (Z % BW))) | (Y >> (Z % BW)) in calculateByteProvider()
11863 auto ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(2)); in calculateByteProvider()
11871 uint64_t BitShift = ShiftOp->getAPIntValue().urem(BitsProvided); in calculateByteProvider()
11890 auto ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1)); in calculateByteProvider()
11894 uint64_t BitShift = ShiftOp->getZExtValue(); in calculateByteProvider()
11904 // The dest of shift will have good [0 : (BytesProvided - ByteShift)] bytes. in calculateByteProvider()
11908 return BytesProvided - ByteShift > Index in calculateByteProvider()
11909 ? calculateSrcByte(Op->getOperand(0), StartingIndex, in calculateByteProvider()
11918 auto ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1)); in calculateByteProvider()
11922 uint64_t BitShift = ShiftOp->getZExtValue(); in calculateByteProvider()
11930 // of interest is Index - ByteShift of the src in calculateByteProvider()
11933 : calculateByteProvider(Op.getOperand(0), Index - ByteShift, in calculateByteProvider()
11945 SDValue NarrowOp = Op->getOperand(0); in calculateByteProvider()
11947 if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG || in calculateByteProvider()
11948 Op->getOpcode() == ISD::AssertZext || in calculateByteProvider()
11949 Op->getOpcode() == ISD::AssertSext) { in calculateByteProvider()
11950 auto *VTSign = cast<VTSDNode>(Op->getOperand(1)); in calculateByteProvider()
11951 NarrowBitWidth = VTSign->getVT().getSizeInBits(); in calculateByteProvider()
11989 unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits(); in calculateByteProvider()
11998 return L->getExtensionType() == ISD::ZEXTLOAD in calculateByteProvider()
12015 return calculateByteProvider(Op->getOperand(0), BitWidth / 8 - Index - 1, in calculateByteProvider()
12020 auto IdxOp = dyn_cast<ConstantSDNode>(Op->getOperand(1)); in calculateByteProvider()
12023 auto VecIdx = IdxOp->getZExtValue(); in calculateByteProvider()
12035 auto PermMask = dyn_cast<ConstantSDNode>(Op->getOperand(2)); in calculateByteProvider()
12040 (PermMask->getZExtValue() & (0xFF << (Index * 8))) >> (Index * 8); in calculateByteProvider()
12072 auto ExtType = cast<LoadSDNode>(L)->getExtensionType(); in isExtendedFrom16Bits()
12075 auto MemVT = L->getMemoryVT(); in isExtendedFrom16Bits()
12078 return L->getMemoryVT().getSizeInBits() == 16; in isExtendedFrom16Bits()
12093 bool IsConsecutive = (Hi8 - Low8 == 1); in addresses16Bits()
12164 : NumElements - NormalizedTrunc; in getDWordFromOffset()
12185 [[maybe_unused]] EVT VT = N->getValueType(0); in matchPERM()
12195 if (!P || P->isConstantZero()) in matchPERM()
12217 if (!PermOp.hasSameSrc(PermNodes[SecondSrc->first]) || in matchPERM()
12218 ((PermOp.SrcOffset / 4) != SecondSrc->second)) in matchPERM()
12223 assert(!(PermNodes[SecondSrc->first].Src->getValueSizeInBits() % 8)); in matchPERM()
12248 SDValue OtherOp = SecondSrc ? *PermNodes[SecondSrc->first].Src : Op; in matchPERM()
12251 OtherOp = getDWordFromOffset(DAG, DL, OtherOp, SecondSrc->second); in matchPERM()
12261 // using bytes 0: Op.getValueSizeInBytes() - 1 in the or. in matchPERM()
12264 // ANY_EXTEND as the extended bits are don't-cares. in matchPERM()
12277 SDValue LHS = N->getOperand(0); in performOrCombine()
12278 SDValue RHS = N->getOperand(1); in performOrCombine()
12280 EVT VT = N->getValueType(0); in performOrCombine()
12282 // or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2) in performOrCombine()
12297 uint32_t NewMask = (CLHS->getZExtValue() | CRHS->getZExtValue()) & MaxMask; in performOrCombine()
12306 // or (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2) in performOrCombine()
12310 uint32_t Sel = getConstantPermuteMask(N->getConstantOperandVal(1)); in performOrCombine()
12320 // or (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2) in performOrCombine()
12321 const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); in performOrCombine()
12323 N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) { in performOrCombine()
12328 // If we have any non-vectorized use, then it is a candidate for v_perm in performOrCombine()
12329 if (OrUse->getOpcode() != ISD::BITCAST || in performOrCombine()
12330 !OrUse->getValueType(0).isVector()) in performOrCombine()
12333 // If we have any non-vectorized use, then it is a candidate for v_perm in performOrCombine()
12334 for (auto VUse : OrUse->uses()) { in performOrCombine()
12335 if (!VUse->getValueType(0).isVector()) in performOrCombine()
12340 // TODO -- whitelist more uses in performOrCombine()
12342 if (VUse->getOpcode() == VectorwiseOp) in performOrCombine()
12348 if (!any_of(N->uses(), usesCombinedOperand)) in performOrCombine()
12363 // set, 0xff have 0xff in the mask, actual lanes are in the 0-3 range. in performOrCombine()
12398 // (or i64:x, (zero_extend i32:y)) -> in performOrCombine()
12422 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(N->getOperand(1)); in performOrCombine()
12426 N->getOperand(0), CRHS)) in performOrCombine()
12438 SDValue LHS = N->getOperand(0); in performXorCombine()
12439 SDValue RHS = N->getOperand(1); in performXorCombine()
12444 EVT VT = N->getValueType(0); in performXorCombine()
12451 // Make sure to apply the 64-bit constant splitting fold before trying to fold in performXorCombine()
12452 // fneg-like xors into 64-bit select. in performXorCombine()
12455 if (CRHS && CRHS->getAPIntValue().isSignMask() && in performXorCombine()
12457 // xor (select c, a, b), 0x80000000 -> in performXorCombine()
12461 DAG.getNode(ISD::BITCAST, DL, MVT::f32, LHS->getOperand(1)); in performXorCombine()
12463 DAG.getNode(ISD::BITCAST, DL, MVT::f32, LHS->getOperand(2)); in performXorCombine()
12467 LHS->getOperand(0), FNegLHS, FNegRHS); in performXorCombine()
12477 if (!Subtarget->has16BitInsts() || in performZeroExtendCombine()
12481 EVT VT = N->getValueType(0); in performZeroExtendCombine()
12485 SDValue Src = N->getOperand(0); in performZeroExtendCombine()
12495 SDValue Src = N->getOperand(0); in performSignExtendInRegCombine()
12496 auto *VTSign = cast<VTSDNode>(N->getOperand(1)); in performSignExtendInRegCombine()
12501 VTSign->getVT() == MVT::i8) || in performSignExtendInRegCombine()
12503 VTSign->getVT() == MVT::i16))) { in performSignExtendInRegCombine()
12504 assert(Subtarget->hasScalarSubwordLoads() && in performSignExtendInRegCombine()
12520 Opc, DL, ResList, Ops, M->getMemoryVT(), M->getMemOperand()); in performSignExtendInRegCombine()
12525 VTSign->getVT() == MVT::i8) || in performSignExtendInRegCombine()
12527 VTSign->getVT() == MVT::i16)) && in performSignExtendInRegCombine()
12547 Ops, M->getMemoryVT(), in performSignExtendInRegCombine()
12548 M->getMemOperand()); in performSignExtendInRegCombine()
12558 SDValue Mask = N->getOperand(1); in performClassCombine()
12560 // fp_class x, 0 -> false in performClassCombine()
12564 if (N->getOperand(0).isUndef()) in performClassCombine()
12572 EVT VT = N->getValueType(0); in performRcpCombine()
12573 SDValue N0 = N->getOperand(0); in performRcpCombine()
12584 N->getFlags()); in performRcpCombine()
12589 N->getFlags().hasAllowContract() && N0->getFlags().hasAllowContract()) { in performRcpCombine()
12591 N0.getOperand(0), N->getFlags()); in performRcpCombine()
12604 const auto &F = CFP->getValueAPF(); in isCanonicalized()
12667 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1); in isCanonicalized()
12676 if (RHS->getZExtValue() == 0xffff0000) { in isCanonicalized()
12677 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1); in isCanonicalized()
12705 if (Subtarget->supportsMinMaxDenormModes() || in isCanonicalized()
12711 // In pre-GFX9 targets V_MIN_F32 and others do not flush denorms. For such in isCanonicalized()
12716 if (!isCanonicalized(DAG, Op.getOperand(I), MaxDepth - 1)) in isCanonicalized()
12723 return isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1) && in isCanonicalized()
12724 isCanonicalized(DAG, Op.getOperand(2), MaxDepth - 1); in isCanonicalized()
12729 if (!isCanonicalized(DAG, SrcOp, MaxDepth - 1)) in isCanonicalized()
12737 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1); in isCanonicalized()
12740 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1) && in isCanonicalized()
12741 isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1); in isCanonicalized()
12751 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1); in isCanonicalized()
12759 return isCanonicalized(DAG, TruncSrc.getOperand(0), MaxDepth - 1); in isCanonicalized()
12801 unsigned Opcode = MI->getOpcode(); in isCanonicalized()
12809 if (FCR->Value.isSignaling()) in isCanonicalized()
12811 if (!FCR->Value.isDenormal()) in isCanonicalized()
12814 DenormalMode Mode = MF.getDenormalMode(FCR->Value.getSemantics()); in isCanonicalized()
12852 return isCanonicalized(MI->getOperand(1).getReg(), MF, MaxDepth - 1); in isCanonicalized()
12859 if (Subtarget->supportsMinMaxDenormModes() || in isCanonicalized()
12867 for (const MachineOperand &MO : llvm::drop_begin(MI->operands())) in isCanonicalized()
12868 if (!isCanonicalized(MO.getReg(), MF, MaxDepth - 1)) in isCanonicalized()
12873 switch (cast<GIntrinsic>(MI)->getIntrinsicID()) { in isCanonicalized()
12939 // TODO: Can we use -1 as the canonical NaN value since it's an inline in getCanonicalConstantFP()
12957 SDValue N0 = N->getOperand(0); in performFCanonicalizeCombine()
12958 EVT VT = N->getValueType(0); in performFCanonicalizeCombine()
12960 // fcanonicalize undef -> qnan in performFCanonicalizeCombine()
12967 EVT VT = N->getValueType(0); in performFCanonicalizeCombine()
12968 return getCanonicalConstantFP(DAG, SDLoc(N), VT, CFP->getValueAPF()); in performFCanonicalizeCombine()
12971 // fcanonicalize (build_vector x, k) -> build_vector (fcanonicalize x), in performFCanonicalizeCombine()
12974 // fcanonicalize (build_vector x, undef) -> build_vector (fcanonicalize x), 0 in performFCanonicalizeCombine()
12991 CFP->getValueAPF()); in performFCanonicalizeCombine()
13065 if (MaxK->getAPIntValue().sge(MinK->getAPIntValue())) in performIntMed3ImmCombine()
13068 if (MaxK->getAPIntValue().uge(MinK->getAPIntValue())) in performIntMed3ImmCombine()
13072 EVT VT = MinK->getValueType(0); in performIntMed3ImmCombine()
13074 if (VT == MVT::i32 || (VT == MVT::i16 && Subtarget->hasMed3_16())) in performIntMed3ImmCombine()
13080 // pre-GFX10 where VOP3 instructions couldn't take literal operands. in performIntMed3ImmCombine()
13089 if (ConstantFPSDNode *C = BV->getConstantFPSplatNode()) in getSplatConstantFP()
13109 if (K0->getValueAPF() > K1->getValueAPF()) in performFPMed3ImmCombine()
13117 if (Info->getMode().DX10Clamp) { in performFPMed3ImmCombine()
13120 // FIXME: Should this be allowing -0.0? in performFPMed3ImmCombine()
13121 if (K1->isExactlyValue(1.0) && K0->isExactlyValue(0.0)) in performFPMed3ImmCombine()
13126 if (VT == MVT::f32 || (VT == MVT::f16 && Subtarget->hasMed3_16())) { in performFPMed3ImmCombine()
13135 const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); in performFPMed3ImmCombine()
13137 if ((!K0->hasOneUse() || TII->isInlineConstant(K0->getValueAPF())) && in performFPMed3ImmCombine()
13138 (!K1->hasOneUse() || TII->isInlineConstant(K1->getValueAPF()))) { in performFPMed3ImmCombine()
13139 return DAG.getNode(AMDGPUISD::FMED3, SL, K0->getValueType(0), in performFPMed3ImmCombine()
13178 EVT VT = N->getValueType(0); in performMinMaxCombine()
13179 unsigned Opc = N->getOpcode(); in performMinMaxCombine()
13180 SDValue Op0 = N->getOperand(0); in performMinMaxCombine()
13181 SDValue Op1 = N->getOperand(1); in performMinMaxCombine()
13187 // max(max(a, b), c) -> max3(a, b, c) in performMinMaxCombine()
13188 // min(min(a, b), c) -> min3(a, b, c) in performMinMaxCombine()
13193 N->getValueType(0), in performMinMaxCombine()
13200 // max(a, max(b, c)) -> max3(a, b, c) in performMinMaxCombine()
13201 // min(a, min(b, c)) -> min3(a, b, c) in performMinMaxCombine()
13206 N->getValueType(0), in performMinMaxCombine()
13213 // min(max(x, K0), K1), K0 < K1 -> med3(x, K0, K1) in performMinMaxCombine()
13214 // max(min(x, K0), K1), K1 < K0 -> med3(x, K1, K0) in performMinMaxCombine()
13217 DAG, SDLoc(N), Op0->getOperand(0), Op1, Op0->getOperand(1), true)) in performMinMaxCombine()
13222 DAG, SDLoc(N), Op0->getOperand(0), Op0->getOperand(1), Op1, true)) in performMinMaxCombine()
13228 DAG, SDLoc(N), Op0->getOperand(0), Op1, Op0->getOperand(1), false)) in performMinMaxCombine()
13233 DAG, SDLoc(N), Op0->getOperand(0), Op0->getOperand(1), Op1, false)) in performMinMaxCombine()
13237 // fminnum(fmaxnum(x, K0), K1), K0 < K1 && !is_snan(x) -> fmed3(x, K0, K1) in performMinMaxCombine()
13243 (VT == MVT::f16 && Subtarget->has16BitInsts()) || in performMinMaxCombine()
13244 (VT == MVT::v2f16 && Subtarget->hasVOP3PInsts())) && in performMinMaxCombine()
13256 // FIXME: Should this be allowing -0.0? in isClampZeroToOne()
13257 return (CA->isExactlyValue(0.0) && CB->isExactlyValue(1.0)) || in isClampZeroToOne()
13258 (CA->isExactlyValue(1.0) && CB->isExactlyValue(0.0)); in isClampZeroToOne()
13268 EVT VT = N->getValueType(0); in performFMed3Combine()
13275 SDValue Src0 = N->getOperand(0); in performFMed3Combine()
13276 SDValue Src1 = N->getOperand(1); in performFMed3Combine()
13277 SDValue Src2 = N->getOperand(2); in performFMed3Combine()
13280 // const_a, const_b, x -> clamp is safe in all cases including signaling in performFMed3Combine()
13282 // FIXME: Should this be allowing -0.0? in performFMed3Combine()
13290 // handling no dx10-clamp? in performFMed3Combine()
13291 if (Info->getMode().DX10Clamp) { in performFMed3Combine()
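A hedged sketch of the K0 == 0.0 / K1 == 1.0 special case: under DX10Clamp the clamp modifier clamps to [0.0, 1.0] and maps NaN to +0.0 (compare performClampCombine further down). clampDX10 is an illustrative name, not a backend helper.

  #include <algorithm>
  #include <cassert>
  #include <cmath>

  // What the clamp modifier computes when DX10Clamp is enabled.
  static float clampDX10(float X) {
    if (std::isnan(X))
      return 0.0f;                              // NaN -> +0.0 in DX10 mode
    return std::min(std::max(X, 0.0f), 1.0f);
  }

  int main() {
    assert(clampDX10(1.75f) == 1.0f);
    assert(clampDX10(-0.5f) == 0.0f);
    assert(clampDX10(std::nanf("")) == 0.0f);
  }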
13312 SDValue Src0 = N->getOperand(0); in performCvtPkRTZCombine()
13313 SDValue Src1 = N->getOperand(1); in performCvtPkRTZCombine()
13315 return DCI.DAG.getUNDEF(N->getValueType(0)); in performCvtPkRTZCombine()
13319 // Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be
13330 // Sub-dword vectors of two dwords or less have a better implementation. in shouldExpandVectorDynExt()
13334 // Always expand the rest of sub-dword instructions, otherwise it will be in shouldExpandVectorDynExt()
13339 // Always do this if var-idx is divergent, otherwise it will become a loop. in shouldExpandVectorDynExt()
13349 if (!Subtarget->hasMovrel()) in shouldExpandVectorDynExt()
13358 SDValue Idx = N->getOperand(N->getNumOperands() - 1); in shouldExpandVectorDynExt()
13362 SDValue Vec = N->getOperand(0); in shouldExpandVectorDynExt()
13369 EltSize, NumElem, Idx->isDivergent(), getSubtarget()); in shouldExpandVectorDynExt()
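To illustrate the expansion this helper gates (a sketch, not backend code): a variable-index extract on a small vector becomes a chain of compares and selects instead of indirect register indexing. extractDyn is a made-up name.

  #include <array>
  #include <cassert>

  // n elements => (n - 1) compare+select pairs, no movrel-style indexing.
  static int extractDyn(const std::array<int, 4> &Vec, unsigned Idx) {
    int R = Vec[0];
    for (unsigned I = 1; I < Vec.size(); ++I)
      R = (Idx == I) ? Vec[I] : R;  // select (setcc Idx, I), Vec[I], R
    return R;
  }

  int main() {
    std::array<int, 4> Vec{10, 20, 30, 40};
    assert(extractDyn(Vec, 0) == 10 && extractDyn(Vec, 3) == 40);
  }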
13374 SDValue Vec = N->getOperand(0); in performExtractVectorEltCombine()
13379 EVT ResVT = N->getValueType(0); in performExtractVectorEltCombine()
13387 SDValue Idx = N->getOperand(1); in performExtractVectorEltCombine()
13393 // ScalarRes = EXTRACT_VECTOR_ELT ((vector-BINOP Vec1, Vec2), Idx) in performExtractVectorEltCombine()
13397 // ScalarRes = scalar-BINOP Vec1Elt, Vec2Elt in performExtractVectorEltCombine()
13400 SDValue Idx = N->getOperand(1); in performExtractVectorEltCombine()
13428 return DAG.getNode(Opc, SL, ResVT, Elt0, Elt1, Vec->getFlags()); in performExtractVectorEltCombine()
13433 // EXTRACT_VECTOR_ELT (<n x e>, var-idx) => n x select (e, const-idx) in performExtractVectorEltCombine()
13436 SDValue Idx = N->getOperand(1); in performExtractVectorEltCombine()
13452 // Try to turn sub-dword accesses of vectors into accesses of the same 32-bit in performExtractVectorEltCombine()
13454 // multiple small extract_vector_elements with a single 32-bit extract. in performExtractVectorEltCombine()
13455 auto *Idx = dyn_cast<ConstantSDNode>(N->getOperand(1)); in performExtractVectorEltCombine()
13460 unsigned BitIndex = Idx->getZExtValue() * VecEltSize; in performExtractVectorEltCombine()
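A minimal sketch of the sub-dword rewrite above, assuming byte elements packed little-endian: element Idx lives at bit Idx * EltSize, so the combine reads the containing dword and shifts/truncates. extractByteElt is illustrative only.

  #include <cassert>
  #include <cstdint>

  static uint8_t extractByteElt(uint64_t Packed, unsigned Idx) {
    unsigned BitIndex = Idx * 8;                            // Idx * VecEltSize
    uint32_t DWord = uint32_t(Packed >> (32 * (BitIndex / 32)));
    return (DWord >> (BitIndex % 32)) & 0xFF;               // shift + truncate
  }

  int main() {
    uint64_t Packed = 0x8877665544332211ull;                // a v8i8 in two dwords
    assert(extractByteElt(Packed, 0) == 0x11 && extractByteElt(Packed, 5) == 0x66);
  }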
13493 SDValue Vec = N->getOperand(0); in performInsertVectorEltCombine()
13494 SDValue Idx = N->getOperand(2); in performInsertVectorEltCombine()
13498 // INSERT_VECTOR_ELT (<n x e>, var-idx) in performInsertVectorEltCombine()
13499 // => BUILD_VECTOR n x select (e, const-idx) in performInsertVectorEltCombine()
13505 SDValue Ins = N->getOperand(1); in performInsertVectorEltCombine()
13528 APFloat Val = CFP->getValueAPF(); in strictFPExtFromF16()
13540 assert(Subtarget->has16BitInsts() && !Subtarget->hasMed3_16() && in performFPRoundCombine()
13543 SDValue TruncSrc = N->getOperand(0); in performFPRoundCombine()
13544 EVT VT = N->getValueType(0); in performFPRoundCombine()
13586 EVT VT = N0->getValueType(0); in getFusedOpcode()
13592 (VT == MVT::f16 && Subtarget->hasMadF16() && in getFusedOpcode()
13599 (N0->getFlags().hasAllowContract() && in getFusedOpcode()
13600 N1->getFlags().hasAllowContract())) && in getFusedOpcode()
13609 // op x, (op y, z) -> op (op x, z), y, if x and z are uniform
13612 EVT VT = N->getValueType(0); in reassociateScalarOps()
13619 unsigned Opc = N->getOpcode(); in reassociateScalarOps()
13620 SDValue Op0 = N->getOperand(0); in reassociateScalarOps()
13621 SDValue Op1 = N->getOperand(1); in reassociateScalarOps()
13623 if (!(Op0->isDivergent() ^ Op1->isDivergent())) in reassociateScalarOps()
13626 if (Op0->isDivergent()) in reassociateScalarOps()
13634 if (!(Op1->isDivergent() ^ Op2->isDivergent())) in reassociateScalarOps()
13637 if (Op1->isDivergent()) in reassociateScalarOps()
13655 // Fold (add (mul x, y), z) --> (mad_[iu]64_[iu]32 x, y, z) plus high
13658 // Full 64-bit multiplies that feed into an addition are lowered here instead
13665 assert(N->getOpcode() == ISD::ADD); in tryFoldToMad64_32()
13668 EVT VT = N->getValueType(0); in tryFoldToMad64_32()
13670 SDValue LHS = N->getOperand(0); in tryFoldToMad64_32()
13671 SDValue RHS = N->getOperand(1); in tryFoldToMad64_32()
13678 if (!N->isDivergent() && Subtarget->hasSMulHi()) in tryFoldToMad64_32()
13691 // multiple uses, except on hardware with full-rate multiply-add (which is in tryFoldToMad64_32()
13692 // part of full-rate 64-bit ops). in tryFoldToMad64_32()
13693 if (!Subtarget->hasFullRate64Ops()) { in tryFoldToMad64_32()
13695 for (SDNode *Use : LHS->uses()) { in tryFoldToMad64_32()
13698 if (Use->getOpcode() != ISD::ADD) in tryFoldToMad64_32()
13742 // are {sign,zero}-extended or not. in tryFoldToMad64_32()
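A scalar reference for the fold named above, modelling only the 64-bit result of the unsigned form (the lowering also folds the high part); madU64U32 is an illustrative name mirroring v_mad_u64_u32.

  #include <cassert>
  #include <cstdint>

  // (add (mul x, y), z) on zero-extended 32-bit factors with a 64-bit addend.
  static uint64_t madU64U32(uint32_t X, uint32_t Y, uint64_t Z) {
    return uint64_t(X) * uint64_t(Y) + Z;
  }

  int main() {
    assert(madU64U32(0xFFFFFFFFu, 2u, 5u) == 0x200000003ull);
  }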
13786 if (!Byte0 || Byte0->isConstantZero()) { in handleMulOperand()
13790 if (Byte1 && !Byte1->isConstantZero()) { in handleMulOperand()
13837 unsigned FMask = 0xFF << (8 * (3 - Step)); in placeSources()
13840 (BPP.first.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask); in placeSources()
13842 (BPP.second.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask); in placeSources()
13846 int FirstGroup = -1; in placeSources()
13856 Match->PermMask = addPermMasks(FirstMask, Match->PermMask); in placeSources()
13861 if (FirstGroup != -1) { in placeSources()
13869 Match->PermMask = addPermMasks(SecondMask, Match->PermMask); in placeSources()
13880 unsigned FMask = 0xFF << (8 * (3 - Step)); in placeSources()
13884 ((Src0.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)), in placeSources()
13888 ((Src1.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)), in placeSources()
13901 auto EltOp = getDWordFromOffset(DAG, SL, Elt->SrcOp, Elt->DWordOffset); in resolveSources()
13904 if (Elt->PermMask == 0x3020100) in resolveSources()
13908 DAG.getConstant(Elt->PermMask, SL, MVT::i32)); in resolveSources()
13919 auto FirstMask = FirstElt->PermMask; in resolveSources()
13920 auto SecondMask = SecondElt->PermMask; in resolveSources()
13930 getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset); in resolveSources()
13932 getDWordFromOffset(DAG, SL, SecondElt->SrcOp, SecondElt->DWordOffset); in resolveSources()
13947 getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset); in resolveSources()
13951 DAG.getConstant(FirstElt->PermMask, SL, MVT::i32))); in resolveSources()
13964 EntryMask = EntryMask >> ((4 - ChainLength) * 8); in fixMasks()
13981 // If both ops are i8s (pre legalize-dag), then the signedness semantics in checkDot4MulSignedness()
14043 EVT VT = N->getValueType(0); in performAddCombine()
14045 SDValue LHS = N->getOperand(0); in performAddCombine()
14046 SDValue RHS = N->getOperand(1); in performAddCombine()
14049 if (Subtarget->hasMad64_32()) { in performAddCombine()
14059 if ((isMul(LHS) || isMul(RHS)) && Subtarget->hasDot7Insts() && in performAddCombine()
14060 (Subtarget->hasDot1Insts() || Subtarget->hasDot8Insts())) { in performAddCombine()
14070 auto MulIdx = isMul(LHS) ? 0 : isMul(RHS) ? 1 : -1; in performAddCombine()
14071 if (MulIdx == -1) in performAddCombine()
14073 auto Src0 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(0)); in performAddCombine()
14076 auto Src1 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(1)); in performAddCombine()
14081 TempNode->getOperand(MulIdx), *Src0, *Src1, in performAddCombine()
14082 TempNode->getOperand(MulIdx)->getOperand(0), in performAddCombine()
14083 TempNode->getOperand(MulIdx)->getOperand(1), DAG); in performAddCombine()
14091 auto AddIdx = 1 - MulIdx; in performAddCombine()
14092 // Allow the special case where add (add (mul24, 0), mul24) became -> in performAddCombine()
14094 if (I == 2 && isMul(TempNode->getOperand(AddIdx))) { in performAddCombine()
14095 Src2s.push_back(TempNode->getOperand(AddIdx)); in performAddCombine()
14097 handleMulOperand(TempNode->getOperand(AddIdx)->getOperand(0)); in performAddCombine()
14101 handleMulOperand(TempNode->getOperand(AddIdx)->getOperand(1)); in performAddCombine()
14105 TempNode->getOperand(AddIdx), *Src0, *Src1, in performAddCombine()
14106 TempNode->getOperand(AddIdx)->getOperand(0), in performAddCombine()
14107 TempNode->getOperand(AddIdx)->getOperand(1), DAG); in performAddCombine()
14119 TempNode = TempNode->getOperand(AddIdx); in performAddCombine()
14122 if (TempNode->getNumOperands() < 2) in performAddCombine()
14124 LHS = TempNode->getOperand(0); in performAddCombine()
14125 RHS = TempNode->getOperand(1); in performAddCombine()
14146 Src0s.begin()->PermMask == Src1s.begin()->PermMask && in performAddCombine()
14147 Src0s.begin()->SrcOp.getValueSizeInBits() >= 32 && in performAddCombine()
14148 Src1s.begin()->SrcOp.getValueSizeInBits() >= 32) { in performAddCombine()
14150 auto Src0Mask = Src0s.begin()->PermMask; in performAddCombine()
14154 auto NextByte = Src0Mask & (0xFF << ((3 - I) * 8)); in performAddCombine()
14168 getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset); in performAddCombine()
14171 auto SecondEltOp = getDWordFromOffset(DAG, SL, SecondElt->SrcOp, in performAddCombine()
14172 SecondElt->DWordOffset); in performAddCombine()
14188 DAG.getExtOrTrunc(*IsSigned, Src2s[ChainLength - 1], SL, MVT::i32); in performAddCombine()
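For orientation, a scalar model of the byte dot product the add chain is matched into; the unsigned flavor and the name dot4U32U8 are chosen for illustration only, the signed case follows the same shape.

  #include <cassert>
  #include <cstdint>

  // Four byte-wise products accumulated into a 32-bit addend.
  static uint32_t dot4U32U8(uint32_t A, uint32_t B, uint32_t C) {
    uint32_t Acc = C;
    for (int I = 0; I < 4; ++I)
      Acc += ((A >> (8 * I)) & 0xFF) * ((B >> (8 * I)) & 0xFF);
    return Acc;
  }

  int main() {
    // 1*5 + 2*6 + 3*7 + 4*8 + 10 == 80
    assert(dot4U32U8(0x04030201u, 0x08070605u, 10u) == 80u);
  }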
14232 return DAG.getNode(ISD::UADDO_CARRY, SDLoc(N), RHS->getVTList(), Args); in performAddCombine()
14241 EVT VT = N->getValueType(0); in performSubCombine()
14247 SDValue LHS = N->getOperand(0); in performSubCombine()
14248 SDValue RHS = N->getOperand(1); in performSubCombine()
14275 return DAG.getNode(ISD::USUBO_CARRY, SDLoc(N), LHS->getVTList(), Args); in performSubCombine()
14283 if (N->getValueType(0) != MVT::i32) in performAddCarrySubCarryCombine()
14286 if (!isNullConstant(N->getOperand(1))) in performAddCarrySubCarryCombine()
14290 SDValue LHS = N->getOperand(0); in performAddCarrySubCarryCombine()
14295 unsigned Opc = N->getOpcode(); in performAddCarrySubCarryCombine()
14298 SDValue Args[] = { LHS.getOperand(0), LHS.getOperand(1), N->getOperand(2) }; in performAddCarrySubCarryCombine()
14299 return DAG.getNode(Opc, SDLoc(N), N->getVTList(), Args); in performAddCarrySubCarryCombine()
14310 EVT VT = N->getValueType(0); in performFAddCombine()
14313 SDValue LHS = N->getOperand(0); in performFAddCombine()
14314 SDValue RHS = N->getOperand(1); in performFAddCombine()
14319 // fadd (fadd (a, a), b) -> mad 2.0, a, b in performFAddCombine()
14331 // fadd (b, fadd (a, a)) -> mad 2.0, a, b in performFAddCombine()
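The identity behind this fold, checked with libm's fused multiply-add as a stand-in for the mad: a + a equals 2 * a exactly, so both sides round 2a + b once. Illustrative only.

  #include <cassert>
  #include <cmath>

  int main() {
    float A = 1.25e-3f, B = 7.5f;
    // fadd (fadd a, a), b  and  fadd b, (fadd a, a)  both equal mad 2.0, a, b.
    assert((A + A) + B == std::fma(2.0f, A, B));
    assert(B + (A + A) == std::fma(2.0f, A, B));
  }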
14353 EVT VT = N->getValueType(0); in performFSubCombine()
14361 SDValue LHS = N->getOperand(0); in performFSubCombine()
14362 SDValue RHS = N->getOperand(1); in performFSubCombine()
14364 // (fsub (fadd a, a), c) -> mad 2.0, a, (fneg c) in performFSubCombine()
14378 // (fsub c, (fadd a, a)) -> mad -2.0, a, c in performFSubCombine()
14384 const SDValue NegTwo = DAG.getConstantFP(-2.0, SL, VT); in performFSubCombine()
14397 EVT VT = N->getValueType(0); in performFDivCombine()
14398 if (VT != MVT::f16 || !Subtarget->has16BitInsts()) in performFDivCombine()
14401 SDValue LHS = N->getOperand(0); in performFDivCombine()
14402 SDValue RHS = N->getOperand(1); in performFDivCombine()
14404 SDNodeFlags Flags = N->getFlags(); in performFDivCombine()
14405 SDNodeFlags RHSFlags = RHS->getFlags(); in performFDivCombine()
14407 !RHS->hasOneUse()) in performFDivCombine()
14412 if (CLHS->isExactlyValue(1.0) || in performFDivCombine()
14413 (IsNegative = CLHS->isExactlyValue(-1.0))) { in performFDivCombine()
14414 // fdiv contract 1.0, (sqrt contract x) -> rsq for f16 in performFDivCombine()
14415 // fdiv contract -1.0, (sqrt contract x) -> fneg(rsq) for f16 in performFDivCombine()
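A sketch of the shape of this rewrite only: the hardware rsq is an approximate operation, while rsqRef below uses exact libm division and sqrt. The name is illustrative.

  #include <cassert>
  #include <cmath>

  static float rsqRef(float X) { return 1.0f / std::sqrt(X); }

  int main() {
    float X = 4.0f;
    assert(1.0f / std::sqrt(X) == rsqRef(X));    // fdiv 1.0, (sqrt x)  -> rsq x
    assert(-1.0f / std::sqrt(X) == -rsqRef(X));  // fdiv -1.0, (sqrt x) -> fneg (rsq x)
  }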
14431 EVT VT = N->getValueType(0); in performFMACombine()
14434 if (!Subtarget->hasDot7Insts() || VT != MVT::f32) in performFMACombine()
14437 // FMA((F32)S0.x, (F32)S1.x, FMA((F32)S0.y, (F32)S1.y, (F32)z)) -> in performFMACombine()
14439 SDValue Op1 = N->getOperand(0); in performFMACombine()
14440 SDValue Op2 = N->getOperand(1); in performFMACombine()
14441 SDValue FMA = N->getOperand(2); in performFMACombine()
14450 // unsafe-fp-math/fp-contract is sufficient to allow generating fdot2. in performFMACombine()
14453 (N->getFlags().hasAllowContract() && in performFMACombine()
14454 FMA->getFlags().hasAllowContract())) { in performFMACombine()
14508 SDValue LHS = N->getOperand(0); in performSetCCCombine()
14509 SDValue RHS = N->getOperand(1); in performSetCCCombine()
14511 ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get(); in performSetCCCombine()
14525 // setcc (sext from i1 cc), -1, ne|sgt|ult) => not cc => xor cc, -1 in performSetCCCombine()
14526 // setcc (sext from i1 cc), -1, eq|sle|uge) => cc in performSetCCCombine()
14527 // setcc (sext from i1 cc), 0, eq|sge|ule) => not cc => xor cc, -1 in performSetCCCombine()
14529 if ((CRHS->isAllOnes() && in performSetCCCombine()
14531 (CRHS->isZero() && in performSetCCCombine()
14534 DAG.getConstant(-1, SL, MVT::i1)); in performSetCCCombine()
14535 if ((CRHS->isAllOnes() && in performSetCCCombine()
14537 (CRHS->isZero() && in performSetCCCombine()
14542 const APInt &CRHSVal = CRHS->getAPIntValue(); in performSetCCCombine()
14550 // setcc (select cc, CT, CF), CF, eq => xor cc, -1 in performSetCCCombine()
14552 // setcc (select cc, CT, CF), CT, ne => xor cc, -1 in performSetCCCombine()
14560 DAG.getConstant(-1, SL, MVT::i1)); in performSetCCCombine()
14568 (!Subtarget->has16BitInsts() || VT != MVT::f16)) in performSetCCCombine()
14572 // (fcmp oeq (fabs x), inf) -> (fp_class x, (p_infinity | n_infinity)) in performSetCCCombine()
14573 // (fcmp one (fabs x), inf) -> (fp_class x, in performSetCCCombine()
14580 const APFloat &APF = CRHS->getValueAPF(); in performSetCCCombine()
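A quick truth check of the sext-from-i1 folds listed above (illustrative, not backend code):

  #include <cassert>
  #include <cstdint>

  int main() {
    for (bool CC : {false, true}) {
      int32_t S = CC ? -1 : 0;       // sext from i1 cc
      assert((S == -1) == CC);       // setcc ..., -1, eq  =>  cc
      assert((S != -1) == !CC);      // setcc ..., -1, ne  =>  xor cc, -1
      assert((S == 0) == !CC);       // setcc ...,  0, eq  =>  xor cc, -1
    }
  }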
14603 unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0; in performCvtF32UByteNCombine()
14605 SDValue Src = N->getOperand(0); in performCvtF32UByteNCombine()
14606 SDValue Shift = N->getOperand(0); in performCvtF32UByteNCombine()
14613 // cvt_f32_ubyte1 (shl x, 8) -> cvt_f32_ubyte0 x in performCvtF32UByteNCombine()
14614 // cvt_f32_ubyte3 (shl x, 16) -> cvt_f32_ubyte1 x in performCvtF32UByteNCombine()
14615 // cvt_f32_ubyte0 (srl x, 16) -> cvt_f32_ubyte2 x in performCvtF32UByteNCombine()
14616 // cvt_f32_ubyte1 (srl x, 16) -> cvt_f32_ubyte3 x in performCvtF32UByteNCombine()
14617 // cvt_f32_ubyte0 (srl x, 8) -> cvt_f32_ubyte1 x in performCvtF32UByteNCombine()
14624 ShiftOffset -= C->getZExtValue(); in performCvtF32UByteNCombine()
14626 ShiftOffset += C->getZExtValue(); in performCvtF32UByteNCombine()
14640 if (N->getOpcode() != ISD::DELETED_NODE) in performCvtF32UByteNCombine()
14648 return DAG.getNode(N->getOpcode(), SL, MVT::f32, DemandedSrc); in performCvtF32UByteNCombine()
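The byte-index arithmetic behind the folds listed above, as a scalar sketch: cvt_f32_ubyteN converts byte N of a 32-bit value, so a shift by 8*k on the source moves the byte index by k. cvtF32UByteN is an illustrative name.

  #include <cassert>
  #include <cstdint>

  static float cvtF32UByteN(uint32_t X, unsigned N) {
    return float((X >> (8 * N)) & 0xFF);
  }

  int main() {
    uint32_t X = 0xAABBCCDDu;
    assert(cvtF32UByteN(X >> 8, 0) == cvtF32UByteN(X, 1));   // ubyte0 (srl x, 8)
    assert(cvtF32UByteN(X << 8, 1) == cvtF32UByteN(X, 0));   // ubyte1 (shl x, 8)
    assert(cvtF32UByteN(X >> 16, 0) == cvtF32UByteN(X, 2));  // ubyte0 (srl x, 16)
  }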
14655 ConstantFPSDNode *CSrc = dyn_cast<ConstantFPSDNode>(N->getOperand(0)); in performClampCombine()
14660 const APFloat &F = CSrc->getValueAPF(); in performClampCombine()
14663 (F.isNaN() && MF.getInfo<SIMachineFunctionInfo>()->getMode().DX10Clamp)) { in performClampCombine()
14664 return DCI.DAG.getConstantFP(Zero, SDLoc(N), N->getValueType(0)); in performClampCombine()
14669 return DCI.DAG.getConstantFP(One, SDLoc(N), N->getValueType(0)); in performClampCombine()
14679 switch (N->getOpcode()) { in PerformDAGCombine()
14715 const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); in PerformDAGCombine()
14716 if (N->getValueType(0) == MVT::i32 && N->isDivergent() && in PerformDAGCombine()
14717 TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) { in PerformDAGCombine()
14741 SDValue Src = N->getOperand(0); in PerformDAGCombine()
14764 EVT VT = N->getValueType(0); in PerformDAGCombine()
14766 // v2i16 (scalar_to_vector i16:x) -> v2i16 (bitcast (any_extend i16:x)) in PerformDAGCombine()
14769 SDValue Src = N->getOperand(0); in PerformDAGCombine()
14819 unsigned Opcode = Node->getMachineOpcode(); in adjustWritemask()
14822 int D16Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::d16) - 1; in adjustWritemask()
14823 if (D16Idx >= 0 && Node->getConstantOperandVal(D16Idx)) in adjustWritemask()
14828 unsigned DmaskIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::dmask) - 1; in adjustWritemask()
14829 unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx); in adjustWritemask()
14831 unsigned TFEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::tfe) - 1; in adjustWritemask()
14832 unsigned LWEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::lwe) - 1; in adjustWritemask()
14833 bool UsesTFC = ((int(TFEIdx) >= 0 && Node->getConstantOperandVal(TFEIdx)) || in adjustWritemask()
14834 (int(LWEIdx) >= 0 && Node->getConstantOperandVal(LWEIdx))) in adjustWritemask()
14838 bool HasChain = Node->getNumValues() > 1; in adjustWritemask()
14852 for (SDNode::use_iterator I = Node->use_begin(), E = Node->use_end(); in adjustWritemask()
14860 if (!I->isMachineOpcode() || in adjustWritemask()
14861 I->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG) in adjustWritemask()
14868 Lane = SubIdx2Lane(I->getConstantOperandVal(1)); in adjustWritemask()
14899 // If the original dmask has one channel, then there is nothing to do in adjustWritemask()
14902 // Use an arbitrary dmask - required for the instruction to work in adjustWritemask()
14911 // Check for TFE or LWE - increase the number of channels by one to account in adjustWritemask()
14918 AMDGPU::getMaskedMIMGOp(Node->getMachineOpcode(), NewChannels); in adjustWritemask()
14919 assert(NewOpcode != -1 && in adjustWritemask()
14920 NewOpcode != static_cast<int>(Node->getMachineOpcode()) && in adjustWritemask()
14925 Ops.insert(Ops.end(), Node->op_begin(), Node->op_begin() + DmaskIdx); in adjustWritemask()
14927 Ops.insert(Ops.end(), Node->op_begin() + DmaskIdx + 1, Node->op_end()); in adjustWritemask()
14929 MVT SVT = Node->getValueType(0).getVectorElementType().getSimpleVT(); in adjustWritemask()
14943 DAG.setNodeMemRefs(NewNode, Node->memoperands()); in adjustWritemask()
14948 assert(Node->hasNUsesOfValue(1, 0)); in adjustWritemask()
14950 SDLoc(Node), Users[Lane]->getValueType(0), in adjustWritemask()
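A small model of the register-count bookkeeping above, assuming C++20's <bit> for popcount: one dmask bit per enabled channel, plus one extra register when TFE/LWE status is requested. resultRegs is an illustrative name.

  #include <bit>
  #include <cassert>

  static unsigned resultRegs(unsigned Dmask, bool UsesTFC) {
    return std::popcount(Dmask & 0xFu) + (UsesTFC ? 1u : 0u);
  }

  int main() {
    assert(resultRegs(0b0101u, /*UsesTFC=*/false) == 2u);
    assert(resultRegs(0b0001u, /*UsesTFC=*/true) == 2u);
  }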
14998 if (Node->getOpcode() == ISD::CopyToReg) { in legalizeTargetIndependentNode()
14999 RegisterSDNode *DestReg = cast<RegisterSDNode>(Node->getOperand(1)); in legalizeTargetIndependentNode()
15000 SDValue SrcVal = Node->getOperand(2); in legalizeTargetIndependentNode()
15004 if (SrcVal.getValueType() == MVT::i1 && DestReg->getReg().isPhysical()) { in legalizeTargetIndependentNode()
15010 SDNode *Glued = Node->getGluedNode(); in legalizeTargetIndependentNode()
15012 = DAG.getCopyToReg(Node->getOperand(0), SL, VReg, SrcVal, in legalizeTargetIndependentNode()
15013 SDValue(Glued, Glued ? Glued->getNumValues() - 1 : 0)); in legalizeTargetIndependentNode()
15024 for (unsigned i = 0; i < Node->getNumOperands(); ++i) { in legalizeTargetIndependentNode()
15025 if (!isFrameIndexOp(Node->getOperand(i))) { in legalizeTargetIndependentNode()
15026 Ops.push_back(Node->getOperand(i)); in legalizeTargetIndependentNode()
15032 Node->getOperand(i).getValueType(), in legalizeTargetIndependentNode()
15033 Node->getOperand(i)), 0)); in legalizeTargetIndependentNode()
15043 const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); in PostISelFolding()
15044 unsigned Opcode = Node->getMachineOpcode(); in PostISelFolding()
15046 if (TII->isImage(Opcode) && !TII->get(Opcode).mayStore() && in PostISelFolding()
15047 !TII->isGather4(Opcode) && in PostISelFolding()
15064 SDValue Src0 = Node->getOperand(1); in PostISelFolding()
15065 SDValue Src1 = Node->getOperand(3); in PostISelFolding()
15066 SDValue Src2 = Node->getOperand(5); in PostISelFolding()
15075 getRegClassFor(VT, Src0.getNode()->isDivergent()); in PostISelFolding()
15101 SmallVector<SDValue, 9> Ops(Node->op_begin(), Node->op_end()); in PostISelFolding()
15106 return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops); in PostISelFolding()
15120 const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); in AddMemOpInit()
15121 const SIRegisterInfo &TRI = TII->getRegisterInfo(); in AddMemOpInit()
15122 MachineRegisterInfo &MRI = MI.getMF()->getRegInfo(); in AddMemOpInit()
15129 if (TII->isImage(MI)) { in AddMemOpInit()
15130 MachineOperand *TFE = TII->getNamedOperand(MI, AMDGPU::OpName::tfe); in AddMemOpInit()
15131 MachineOperand *LWE = TII->getNamedOperand(MI, AMDGPU::OpName::lwe); in AddMemOpInit()
15132 MachineOperand *D16 = TII->getNamedOperand(MI, AMDGPU::OpName::d16); in AddMemOpInit()
15137 unsigned TFEVal = TFE ? TFE->getImm() : 0; in AddMemOpInit()
15138 unsigned LWEVal = LWE ? LWE->getImm() : 0; in AddMemOpInit()
15139 unsigned D16Val = D16 ? D16->getImm() : 0; in AddMemOpInit()
15144 // At least one of TFE or LWE is non-zero in AddMemOpInit()
15149 MachineOperand *MO_Dmask = TII->getNamedOperand(MI, AMDGPU::OpName::dmask); in AddMemOpInit()
15154 unsigned dmask = MO_Dmask->getImm(); in AddMemOpInit()
15157 unsigned ActiveLanes = TII->isGather4(MI) ? 4 : llvm::popcount(dmask); in AddMemOpInit()
15159 bool Packed = !Subtarget->hasUnpackedD16VMem(); in AddMemOpInit()
15164 // - this is in fact an error, but it is picked up elsewhere and in AddMemOpInit()
15167 TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32; in AddMemOpInit()
15170 } else if (TII->isMUBUF(MI) && AMDGPU::getMUBUFTfe(MI.getOpcode())) { in AddMemOpInit()
15171 InitIdx = TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32; in AddMemOpInit()
15185 unsigned SizeLeft = Subtarget->usePRTStrictNull() ? InitIdx : 1; in AddMemOpInit()
15186 unsigned CurrIdx = Subtarget->usePRTStrictNull() ? 0 : (InitIdx - 1); in AddMemOpInit()
15188 BuildMI(MBB, MI, DL, TII->get(AMDGPU::IMPLICIT_DEF), PrevDst); in AddMemOpInit()
15189 for (; SizeLeft; SizeLeft--, CurrIdx++) { in AddMemOpInit()
15190 NewDst = MRI.createVirtualRegister(TII->getOpRegClass(MI, DstIdx)); in AddMemOpInit()
15193 BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), SubReg) in AddMemOpInit()
15195 // Insert into the super-reg in AddMemOpInit()
15196 BuildMI(MBB, MI, DL, TII->get(TargetOpcode::INSERT_SUBREG), NewDst) in AddMemOpInit()
15208 MI.tieOperands(DstIdx, MI.getNumOperands() - 1); in AddMemOpInit()
15215 const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); in AdjustInstrPostInstrSelection()
15217 MachineFunction *MF = MI.getParent()->getParent(); in AdjustInstrPostInstrSelection()
15218 MachineRegisterInfo &MRI = MF->getRegInfo(); in AdjustInstrPostInstrSelection()
15219 SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>(); in AdjustInstrPostInstrSelection()
15221 if (TII->isVOP3(MI.getOpcode())) { in AdjustInstrPostInstrSelection()
15223 TII->legalizeOperandsVOP3(MRI, MI); in AdjustInstrPostInstrSelection()
15226 // This saves a chain-copy of registers and better balance register in AdjustInstrPostInstrSelection()
15230 bool HasAGPRs = Info->mayNeedAGPRs(); in AdjustInstrPostInstrSelection()
15231 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo(); in AdjustInstrPostInstrSelection()
15236 if (I == -1) in AdjustInstrPostInstrSelection()
15243 auto *RC = TRI->getRegClassForReg(MRI, Op.getReg()); in AdjustInstrPostInstrSelection()
15244 if (!TRI->hasAGPRs(RC)) in AdjustInstrPostInstrSelection()
15247 if (!Src || !Src->isCopy() || in AdjustInstrPostInstrSelection()
15248 !TRI->isSGPRReg(MRI, Src->getOperand(1).getReg())) in AdjustInstrPostInstrSelection()
15250 auto *NewRC = TRI->getEquivalentVGPRClass(RC); in AdjustInstrPostInstrSelection()
15261 if (auto *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2)) { in AdjustInstrPostInstrSelection()
15262 if (Src2->isReg() && Src2->getReg().isVirtual()) { in AdjustInstrPostInstrSelection()
15263 auto *RC = TRI->getRegClassForReg(MRI, Src2->getReg()); in AdjustInstrPostInstrSelection()
15264 if (TRI->isVectorSuperClass(RC)) { in AdjustInstrPostInstrSelection()
15265 auto *NewRC = TRI->getEquivalentAGPRClass(RC); in AdjustInstrPostInstrSelection()
15266 MRI.setRegClass(Src2->getReg(), NewRC); in AdjustInstrPostInstrSelection()
15267 if (Src2->isTied()) in AdjustInstrPostInstrSelection()
15277 if (TII->isImage(MI)) in AdjustInstrPostInstrSelection()
15278 TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::vaddr); in AdjustInstrPostInstrSelection()
15290 const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); in wrapAddr64Rsrc()
15293 // full 128-bit register. If we are building multiple resource descriptors, in wrapAddr64Rsrc()
15294 // this will allow CSEing of the 2-component register. in wrapAddr64Rsrc()
15299 buildSMovImm32(DAG, DL, TII->getDefaultRsrcDataFormat() >> 32), in wrapAddr64Rsrc()
15352 //===----------------------------------------------------------------------===//
15354 //===----------------------------------------------------------------------===//
15390 RC = TRI->getVGPRClassForBitWidth(BitWidth); in getRegForInlineAsmConstraint()
15397 if (!Subtarget->hasMAIInsts()) in getRegForInlineAsmConstraint()
15404 RC = TRI->getAGPRClassForBitWidth(BitWidth); in getRegForInlineAsmConstraint()
15419 StringRef RegName(Constraint.data() + 1, Constraint.size() - 2); in getRegForInlineAsmConstraint()
15437 uint32_t Width = (End - Idx + 1) * 32; in getRegForInlineAsmConstraint()
15438 MCRegister Reg = RC->getRegister(Idx); in getRegForInlineAsmConstraint()
15440 RC = TRI->getVGPRClassForBitWidth(Width); in getRegForInlineAsmConstraint()
15442 RC = TRI->getSGPRClassForBitWidth(Width); in getRegForInlineAsmConstraint()
15444 RC = TRI->getAGPRClassForBitWidth(Width); in getRegForInlineAsmConstraint()
15446 Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0, RC); in getRegForInlineAsmConstraint()
15452 if (!Failed && Idx < RC->getNumRegs()) in getRegForInlineAsmConstraint()
15453 return std::pair(RC->getRegister(Idx), RC); in getRegForInlineAsmConstraint()
15460 Ret.second = TRI->getPhysRegBaseClass(Ret.first); in getRegForInlineAsmConstraint()
15528 if (Size == 16 && !Subtarget->has16BitInsts()) in getAsmOperandConstVal()
15532 Val = C->getSExtValue(); in getAsmOperandConstVal()
15536 Val = C->getValueAPF().bitcastToAPInt().getSExtValue(); in getAsmOperandConstVal()
15544 if (ConstantSDNode *C = V->getConstantSplatNode()) { in getAsmOperandConstVal()
15545 Val = C->getSExtValue(); in getAsmOperandConstVal()
15548 if (ConstantFPSDNode *C = V->getConstantFPSplatNode()) { in getAsmOperandConstVal()
15549 Val = C->getValueAPF().bitcastToAPInt().getSExtValue(); in getAsmOperandConstVal()
15592 bool HasInv2Pi = Subtarget->hasInv2PiInlineImm(); in checkAsmConstraintValA()
15663 return -1; in getAlignedAGPRClassID()
15668 // the function is legalized do we know all of the non-spill stack objects or if
15674 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo(); in finalizeLowering()
15677 if (Info->isEntryFunction()) { in finalizeLowering()
15686 ? AMDGPU::SGPR_32RegClass.getRegister(MaxNumSGPRs - 1) in finalizeLowering()
15687 : TRI->getAlignedHighSGPRForRC(MF, /*Align=*/2, in finalizeLowering()
15689 Info->setSGPRForEXECCopy(SReg); in finalizeLowering()
15691 assert(!TRI->isSubRegister(Info->getScratchRSrcReg(), in finalizeLowering()
15692 Info->getStackPtrOffsetReg())); in finalizeLowering()
15693 if (Info->getStackPtrOffsetReg() != AMDGPU::SP_REG) in finalizeLowering()
15694 MRI.replaceRegWith(AMDGPU::SP_REG, Info->getStackPtrOffsetReg()); in finalizeLowering()
15698 if (Info->getScratchRSrcReg() != AMDGPU::PRIVATE_RSRC_REG) in finalizeLowering()
15699 MRI.replaceRegWith(AMDGPU::PRIVATE_RSRC_REG, Info->getScratchRSrcReg()); in finalizeLowering()
15701 if (Info->getFrameOffsetReg() != AMDGPU::FP_REG) in finalizeLowering()
15702 MRI.replaceRegWith(AMDGPU::FP_REG, Info->getFrameOffsetReg()); in finalizeLowering()
15704 Info->limitOccupancy(MF); in finalizeLowering()
15709 TII->fixImplicitOperands(MI); in finalizeLowering()
15716 // per-subtarget, but there's no easy way to achieve that right now. This is in finalizeLowering()
15725 int NewClassID = getAlignedAGPRClassID(RC->getID()); in finalizeLowering()
15726 if (NewClassID != -1) in finalizeLowering()
15727 MRI.setRegClass(Reg, TRI->getRegClass(NewClassID)); in finalizeLowering()
15749 // These return at most the (wavefront size - 1) + src1 in computeKnownBitsForTargetNode()
15758 Known.Zero.setHighBits(Size - MaxActiveBits); in computeKnownBitsForTargetNode()
15776 Known.Zero.setHighBits(getSubtarget()->getKnownHighZeroBitsForFrameIndex()); in computeKnownBitsForFrameIndex()
15790 switch (MI->getOpcode()) { in computeKnownBitsForTargetInstr()
15793 switch (cast<GIntrinsic>(MI)->getIntrinsicID()) { in computeKnownBitsForTargetInstr()
15805 // These return at most the wavefront size - 1. in computeKnownBitsForTargetInstr()
15807 Known.Zero.setHighBits(Size - getSubtarget()->getWavefrontSizeLog2()); in computeKnownBitsForTargetInstr()
15815 llvm::countl_zero(getSubtarget()->getAddressableLocalMemorySize())); in computeKnownBitsForTargetInstr()
15829 auto [Dst, Src0, Src1, Src2] = MI->getFirst4Regs(); in computeKnownBitsForTargetInstr()
15861 Intrinsic::ID IID = GI->getIntrinsicID(); in computeKnownAlignForTargetInstr()
15874 // Pre-GFX10 targets did not benefit from loop alignment in getPrefLoopAlignment()
15875 if (!ML || DisableLoopAlignment || !getSubtarget()->hasInstPrefetch() || in getPrefLoopAlignment()
15876 getSubtarget()->hasInstFwdPrefetchBug()) in getPrefLoopAlignment()
15889 const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); in getPrefLoopAlignment()
15890 const MachineBasicBlock *Header = ML->getHeader(); in getPrefLoopAlignment()
15891 if (Header->getAlignment() != PrefAlign) in getPrefLoopAlignment()
15892 return Header->getAlignment(); // Already processed. in getPrefLoopAlignment()
15895 for (const MachineBasicBlock *MBB : ML->blocks()) { in getPrefLoopAlignment()
15899 LoopSize += MBB->getAlignment().value() / 2; in getPrefLoopAlignment()
15902 LoopSize += TII->getInstSizeInBytes(MI); in getPrefLoopAlignment()
15916 for (MachineLoop *P = ML->getParentLoop(); P; P = P->getParentLoop()) { in getPrefLoopAlignment()
15917 if (MachineBasicBlock *Exit = P->getExitBlock()) { in getPrefLoopAlignment()
15918 auto I = Exit->getFirstNonDebugInstr(); in getPrefLoopAlignment()
15919 if (I != Exit->end() && I->getOpcode() == AMDGPU::S_INST_PREFETCH) in getPrefLoopAlignment()
15924 MachineBasicBlock *Pre = ML->getLoopPreheader(); in getPrefLoopAlignment()
15925 MachineBasicBlock *Exit = ML->getExitBlock(); in getPrefLoopAlignment()
15928 auto PreTerm = Pre->getFirstTerminator(); in getPrefLoopAlignment()
15929 if (PreTerm == Pre->begin() || in getPrefLoopAlignment()
15930 std::prev(PreTerm)->getOpcode() != AMDGPU::S_INST_PREFETCH) in getPrefLoopAlignment()
15931 BuildMI(*Pre, PreTerm, DebugLoc(), TII->get(AMDGPU::S_INST_PREFETCH)) in getPrefLoopAlignment()
15934 auto ExitHead = Exit->getFirstNonDebugInstr(); in getPrefLoopAlignment()
15935 if (ExitHead == Exit->end() || in getPrefLoopAlignment()
15936 ExitHead->getOpcode() != AMDGPU::S_INST_PREFETCH) in getPrefLoopAlignment()
15937 BuildMI(*Exit, ExitHead, DebugLoc(), TII->get(AMDGPU::S_INST_PREFETCH)) in getPrefLoopAlignment()
15946 assert(N->getOpcode() == ISD::CopyFromReg); in isCopyFromRegOfInlineAsm()
15949 N = N->getOperand(0).getNode(); in isCopyFromRegOfInlineAsm()
15950 if (N->getOpcode() == ISD::INLINEASM || in isCopyFromRegOfInlineAsm()
15951 N->getOpcode() == ISD::INLINEASM_BR) in isCopyFromRegOfInlineAsm()
15953 } while (N->getOpcode() == ISD::CopyFromReg); in isCopyFromRegOfInlineAsm()
15960 switch (N->getOpcode()) { in isSDNodeSourceOfDivergence()
15962 const RegisterSDNode *R = cast<RegisterSDNode>(N->getOperand(1)); in isSDNodeSourceOfDivergence()
15963 const MachineRegisterInfo &MRI = FLI->MF->getRegInfo(); in isSDNodeSourceOfDivergence()
15964 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo(); in isSDNodeSourceOfDivergence()
15965 Register Reg = R->getReg(); in isSDNodeSourceOfDivergence()
15969 return !TRI->isSGPRReg(MRI, Reg); in isSDNodeSourceOfDivergence()
15971 if (const Value *V = FLI->getValueFromVirtualReg(R->getReg())) in isSDNodeSourceOfDivergence()
15972 return UA->isDivergent(V); in isSDNodeSourceOfDivergence()
15974 assert(Reg == FLI->DemoteRegister || isCopyFromRegOfInlineAsm(N)); in isSDNodeSourceOfDivergence()
15975 return !TRI->isSGPRReg(MRI, Reg); in isSDNodeSourceOfDivergence()
15979 unsigned AS = L->getAddressSpace(); in isSDNodeSourceOfDivergence()
15980 // A flat load may access private memory. in isSDNodeSourceOfDivergence()
15986 return AMDGPU::isIntrinsicSourceOfDivergence(N->getConstantOperandVal(0)); in isSDNodeSourceOfDivergence()
15988 return AMDGPU::isIntrinsicSourceOfDivergence(N->getConstantOperandVal(1)); in isSDNodeSourceOfDivergence()
16007 // Target-specific read-modify-write atomics are sources of divergence. in isSDNodeSourceOfDivergence()
16011 // Generic read-modify-write atomics are sources of divergence. in isSDNodeSourceOfDivergence()
16012 return A->readMem() && A->writeMem(); in isSDNodeSourceOfDivergence()
16052 if (Info->getMode().DX10Clamp) in isKnownNeverNaNForTargetNode()
16066 const fltSemantics &Flt = RMW->getType()->getScalarType()->getFltSemantics();
16067 auto DenormMode = RMW->getParent()->getParent()->getDenormalMode(Flt);
16074 // The amdgpu-unsafe-fp-atomics attribute enables generation of unsafe
16079 return F->getFnAttribute("amdgpu-unsafe-fp-atomics").getValueAsString() != in unsafeFPAtomicsDisabled()
16084 LLVMContext &Ctx = RMW->getContext(); in emitAtomicRMWLegalRemark()
16087 StringRef MemScope = SSNs[RMW->getSyncScopeID()].empty() in emitAtomicRMWLegalRemark()
16089 : SSNs[RMW->getSyncScopeID()]; in emitAtomicRMWLegalRemark()
16093 << RMW->getOperationName(RMW->getOperation()) in emitAtomicRMWLegalRemark()
16099 Type *EltTy = VT->getElementType(); in isHalf2OrBFloat2()
16100 return VT->getNumElements() == 2 && in isHalf2OrBFloat2()
16101 (EltTy->isHalfTy() || EltTy->isBFloatTy()); in isHalf2OrBFloat2()
16109 return VT && VT->getNumElements() == 2 && VT->getElementType()->isHalfTy(); in isHalf2()
16114 return VT && VT->getNumElements() == 2 && VT->getElementType()->isBFloatTy(); in isBFloat2()
16119 unsigned AS = RMW->getPointerAddressSpace(); in shouldExpandAtomicRMWInIR()
16124 OptimizationRemarkEmitter ORE(RMW->getFunction()); in shouldExpandAtomicRMWInIR()
16131 auto SSID = RMW->getSyncScopeID(); in shouldExpandAtomicRMWInIR()
16134 SSID == RMW->getContext().getOrInsertSyncScopeID("one-as"); in shouldExpandAtomicRMWInIR()
16136 switch (RMW->getOperation()) { in shouldExpandAtomicRMWInIR()
16143 if (Constant *ConstVal = dyn_cast<Constant>(RMW->getValOperand()); in shouldExpandAtomicRMWInIR()
16144 ConstVal && ConstVal->isNullValue()) in shouldExpandAtomicRMWInIR()
16151 Type *Ty = RMW->getType(); in shouldExpandAtomicRMWInIR()
16156 // is fixed to round-to-nearest-even. in shouldExpandAtomicRMWInIR()
16159 // round-to-nearest-even. in shouldExpandAtomicRMWInIR()
16162 // suggests it is OK if the floating-point mode may not match the calling in shouldExpandAtomicRMWInIR()
16164 if (Ty->isFloatTy()) { in shouldExpandAtomicRMWInIR()
16165 return Subtarget->hasLDSFPAtomicAddF32() ? AtomicExpansionKind::None in shouldExpandAtomicRMWInIR()
16169 if (Ty->isDoubleTy()) { in shouldExpandAtomicRMWInIR()
16171 return Subtarget->hasLDSFPAtomicAddF64() ? AtomicExpansionKind::None in shouldExpandAtomicRMWInIR()
16175 if (Subtarget->hasAtomicDsPkAdd16Insts() && isHalf2OrBFloat2(Ty)) in shouldExpandAtomicRMWInIR()
16185 if (Subtarget->hasGFX940Insts() && (Ty->isFloatTy() || Ty->isDoubleTy())) in shouldExpandAtomicRMWInIR()
16190 // FIXME: Needs to account for no fine-grained memory in shouldExpandAtomicRMWInIR()
16191 if (Subtarget->hasAtomicFlatPkAdd16Insts() && isHalf2OrBFloat2(Ty)) in shouldExpandAtomicRMWInIR()
16195 // FIXME: Needs to account for no fine-grained memory in shouldExpandAtomicRMWInIR()
16196 if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isHalf2(Ty)) in shouldExpandAtomicRMWInIR()
16200 // FIXME: Needs to account for no fine-grained memory in shouldExpandAtomicRMWInIR()
16201 if (Subtarget->hasAtomicGlobalPkAddBF16Inst() && isBFloat2(Ty)) in shouldExpandAtomicRMWInIR()
16205 // FIXME: Needs to account for no fine-grained memory in shouldExpandAtomicRMWInIR()
16206 if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isHalf2(Ty)) in shouldExpandAtomicRMWInIR()
16209 // While gfx90a/gfx940 supports v2bf16 for global/flat, it does not for in shouldExpandAtomicRMWInIR()
16211 if (Subtarget->hasAtomicBufferPkAddBF16Inst() && isBFloat2(Ty)) in shouldExpandAtomicRMWInIR()
16215 if (unsafeFPAtomicsDisabled(RMW->getFunction())) in shouldExpandAtomicRMWInIR()
16222 // global and flat atomic fadd f64: gfx90a, gfx940. in shouldExpandAtomicRMWInIR()
16223 if (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() && Ty->isDoubleTy()) in shouldExpandAtomicRMWInIR()
16227 if (Ty->isFloatTy()) { in shouldExpandAtomicRMWInIR()
16228 // global/buffer atomic fadd f32 no-rtn: gfx908, gfx90a, gfx940, gfx11+. in shouldExpandAtomicRMWInIR()
16229 if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts()) in shouldExpandAtomicRMWInIR()
16232 if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts()) in shouldExpandAtomicRMWInIR()
16236 if (RMW->use_empty() && in shouldExpandAtomicRMWInIR()
16237 Subtarget->hasAtomicBufferGlobalPkAddF16NoRtnInsts() && isHalf2(Ty)) in shouldExpandAtomicRMWInIR()
16242 // flat atomic fadd f32: gfx940, gfx11+. in shouldExpandAtomicRMWInIR()
16243 if (AS == AMDGPUAS::FLAT_ADDRESS && Ty->isFloatTy()) { in shouldExpandAtomicRMWInIR()
16244 if (Subtarget->hasFlatAtomicFaddF32Inst()) in shouldExpandAtomicRMWInIR()
16247 // If it is in flat address space, and the type is float, we will try to in shouldExpandAtomicRMWInIR()
16252 if (Subtarget->hasLDSFPAtomicAddF32()) { in shouldExpandAtomicRMWInIR()
16253 if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts()) in shouldExpandAtomicRMWInIR()
16255 if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts()) in shouldExpandAtomicRMWInIR()
16264 Type *Ty = RMW->getType(); in shouldExpandAtomicRMWInIR()
16267 if (AS == AMDGPUAS::LOCAL_ADDRESS && (Ty->isFloatTy() || Ty->isDoubleTy())) in shouldExpandAtomicRMWInIR()
16270 if (unsafeFPAtomicsDisabled(RMW->getFunction())) in shouldExpandAtomicRMWInIR()
16277 // For flat and global cases: in shouldExpandAtomicRMWInIR()
16288 if (Subtarget->hasAtomicFMinFMaxF32FlatInsts() && Ty->isFloatTy()) in shouldExpandAtomicRMWInIR()
16290 if (Subtarget->hasAtomicFMinFMaxF64FlatInsts() && Ty->isDoubleTy()) in shouldExpandAtomicRMWInIR()
16294 if (Subtarget->hasAtomicFMinFMaxF32GlobalInsts() && Ty->isFloatTy()) in shouldExpandAtomicRMWInIR()
16296 if (Subtarget->hasAtomicFMinFMaxF64GlobalInsts() && Ty->isDoubleTy()) in shouldExpandAtomicRMWInIR()
16323 return LI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS in shouldExpandAtomicLoadInIR()
16330 return SI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS in shouldExpandAtomicStoreInIR()
16337 return CmpX->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS in shouldExpandAtomicCmpXchgInIR()
16345 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo(); in getRegClassFor()
16347 return Subtarget->getWavefrontSize() == 64 ? &AMDGPU::SReg_64RegClass in getRegClassFor()
16349 if (!TRI->isSGPRClass(RC) && !isDivergent) in getRegClassFor()
16350 return TRI->getEquivalentSGPRClass(RC); in getRegClassFor()
16351 if (TRI->isSGPRClass(RC) && isDivergent) in getRegClassFor()
16352 return TRI->getEquivalentVGPRClass(RC); in getRegClassFor()
16362 // FIXME: DA is no longer in use. Does this still apply to UniformityAnalysis?
16368 IntegerType *IT = dyn_cast<IntegerType>(V->getType()); in hasCFUser()
16369 if (!IT || IT->getBitWidth() != WaveSize) in hasCFUser()
16377 for (const auto *U : V->users()) { in hasCFUser()
16379 if (V == U->getOperand(1)) { in hasCFUser()
16380 switch (Intrinsic->getIntrinsicID()) { in hasCFUser()
16391 if (V == U->getOperand(0)) { in hasCFUser()
16392 switch (Intrinsic->getIntrinsicID()) { in hasCFUser()
16414 if (CI->isInlineAsm()) { in requiresUniformRegister()
16420 const SIRegisterInfo *SIRI = Subtarget->getRegisterInfo(); in requiresUniformRegister()
16422 MF.getDataLayout(), Subtarget->getRegisterInfo(), *CI); in requiresUniformRegister()
16428 if (RC && SIRI->isSGPRClass(RC)) in requiresUniformRegister()
16435 return hasCFUser(V, Visited, Subtarget->getWavefrontSize()); in requiresUniformRegister()
16439 SDNode::use_iterator I = N->use_begin(), E = N->use_end(); in hasMemSDNodeUser()
16454 if (N0->isDivergent() || !N1->isDivergent()) in isReassocProfitable()
16459 hasMemSDNodeUser(*N0->use_begin())); in isReassocProfitable()
16481 if (User->getOpcode() != ISD::CopyToReg) in checkForPhysRegDependency()
16483 if (!Def->isMachineOpcode()) in checkForPhysRegDependency()
16489 unsigned ResNo = User->getOperand(Op).getResNo(); in checkForPhysRegDependency()
16490 if (User->getOperand(Op)->getValueType(ResNo) != MVT::i1) in checkForPhysRegDependency()
16492 const MCInstrDesc &II = TII->get(MDef->getMachineOpcode()); in checkForPhysRegDependency()
16496 TRI->getMinimalPhysRegClass(PhysReg, Def->getSimpleValueType(ResNo)); in checkForPhysRegDependency()
16497 Cost = RC->getCopyCost(); in checkForPhysRegDependency()
16504 AtomicRMWInst::BinOp Op = AI->getOperation(); in emitExpandAtomicRMW()
16508 // atomicrmw or %ptr, 0 -> atomicrmw add %ptr, 0 in emitExpandAtomicRMW()
16509 assert(cast<Constant>(AI->getValOperand())->isNullValue() && in emitExpandAtomicRMW()
16511 AI->setOperation(AtomicRMWInst::Add); in emitExpandAtomicRMW()
16515 assert(Subtarget->hasAtomicFaddInsts() && in emitExpandAtomicRMW()
16517 assert(AI->getType()->isFloatTy() && in emitExpandAtomicRMW()
16518 AI->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS && in emitExpandAtomicRMW()
16519 "generic atomicrmw expansion only supports FP32 operand in flat " in emitExpandAtomicRMW()
16569 Function *F = BB->getParent(); in emitExpandAtomicRMW()
16571 BB->splitBasicBlock(Builder.GetInsertPoint(), "atomicrmw.end"); in emitExpandAtomicRMW()
16582 Value *Val = AI->getValOperand(); in emitExpandAtomicRMW()
16583 Type *ValTy = Val->getType(); in emitExpandAtomicRMW()
16584 Value *Addr = AI->getPointerOperand(); in emitExpandAtomicRMW()
16587 Value *Val) -> Value * { in emitExpandAtomicRMW()
16589 Builder.CreateAtomicRMW(AI->getOperation(), Addr, Val, AI->getAlign(), in emitExpandAtomicRMW()
16590 AI->getOrdering(), AI->getSyncScopeID()); in emitExpandAtomicRMW()
16592 AI->getAllMetadata(MDs); in emitExpandAtomicRMW()
16594 OldVal->setMetadata(P.first, P.second); in emitExpandAtomicRMW()
16598 std::prev(BB->end())->eraseFromParent(); in emitExpandAtomicRMW()
16635 Loaded->addIncoming(LoadedShared, SharedBB); in emitExpandAtomicRMW()
16636 Loaded->addIncoming(LoadedPrivate, PrivateBB); in emitExpandAtomicRMW()
16637 Loaded->addIncoming(LoadedGlobal, GlobalBB); in emitExpandAtomicRMW()
16640 AI->replaceAllUsesWith(Loaded); in emitExpandAtomicRMW()
16641 AI->eraseFromParent(); in emitExpandAtomicRMW()
16647 auto Order = AI->getOrdering(); in lowerIdempotentRMWIntoFencedLoad()
16657 AI->getType(), AI->getPointerOperand(), AI->getAlign()); in lowerIdempotentRMWIntoFencedLoad()
16658 LI->setAtomic(Order, AI->getSyncScopeID()); in lowerIdempotentRMWIntoFencedLoad()
16659 LI->copyMetadata(*AI); in lowerIdempotentRMWIntoFencedLoad()
16660 LI->takeName(AI); in lowerIdempotentRMWIntoFencedLoad()
16661 AI->replaceAllUsesWith(LI); in lowerIdempotentRMWIntoFencedLoad()
16662 AI->eraseFromParent(); in lowerIdempotentRMWIntoFencedLoad()