Lines Matching +full:vcc +full:- +full:p

1 //===- AMDGPURegisterBankInfo.cpp -------------------------------*- C++ -*-==//
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
16 /// VGPR (vector), and SGPR (scalar). Additionally the VCC register bank is a
17 /// sort of pseudo-register bank needed to represent SGPRs used in a vector
31 /// is naturally a bitmask with one bit per lane, in a 32 or 64-bit
32 /// register. These are represented with the VCC bank. During selection, we need
34 /// bank. To distinguish whether an SGPR should use the SGPR or VCC register
36 /// VCC bank value, otherwise it will be the SGPR bank. A scalar compare sets
37 /// SCC, which is a 1-bit unaddressable register. This will need to be copied to
38 /// a 32-bit virtual register. Taken together, this means we need to adjust the
40 /// widened to 32-bits, and all VALU booleans need to be s1 values.
42 /// A noteworthy exception to the s1-means-vcc rule is for legalization artifact
43 /// casts. G_TRUNC s1 results, and G_SEXT/G_ZEXT/G_ANYEXT sources are never vcc
44 /// bank. A non-boolean source (such as a truncate from a 1-bit load from
45 /// memory) will require a copy to the VCC bank which will require clearing the
61 /// VCC), inserting copies from any SGPR operands. This is the most trivial legal
67 /// picking the optimal operand combination from a post-isel optimization pass.
69 //===----------------------------------------------------------------------===//
127 // widening etc. We don't handle selection with vcc in artifact sources, in applyBank()
142 auto True = B.buildConstant(S32, Opc == AMDGPU::G_SEXT ? -1 : 1); in applyBank()
240 // and VCC banks are for the natural scalar and vector conditions produced by in copyCost()
277 // 32-bit extract of a 64-bit value is just access of a subregister, so free. in getBreakDownCost()
281 // TODO: 32-bit insert to a 64-bit SGPR may incur a non-free copy due to SGPR in getBreakDownCost()
293 // VCC-like use. in getRegBankFromRegClass()
294 if (TRI->isSGPRClass(&RC)) { in getRegBankFromRegClass()
296 // should be inferable from the copied to-type. We don't have many boolean in getRegBankFromRegClass()
304 return TRI->isAGPRClass(&RC) ? AMDGPU::AGPRRegBank : AMDGPU::VGPRRegBank; in getRegBankFromRegClass()
449 const unsigned AS = MMO->getAddrSpace(); in isScalarLoadLegal()
452 const unsigned MemSize = 8 * MMO->getSize().getValue(); in isScalarLoadLegal()
454 // Require 4-byte alignment. in isScalarLoadLegal()
455 return (MMO->getAlign() >= Align(4) || in isScalarLoadLegal()
457 ((MemSize == 16 && MMO->getAlign() >= Align(2)) || in isScalarLoadLegal()
458 (MemSize == 8 && MMO->getAlign() >= Align(1))))) && in isScalarLoadLegal()
460 !MMO->isAtomic() && in isScalarLoadLegal()
461 // Don't use scalar loads for volatile accesses to non-constant address in isScalarLoadLegal()
463 (IsConst || !MMO->isVolatile()) && in isScalarLoadLegal()
465 (IsConst || MMO->isInvariant() || (MMO->getFlags() & MONoClobber)) && in isScalarLoadLegal()
473 const MachineFunction &MF = *MI.getParent()->getParent(); in getInstrAlternativeMappings()
510 // s_{and|or|xor}_b32 set scc when the result of the 32-bit op is not 0. in getInstrAlternativeMappings()
666 Register LoLHS = MRI->createGenericVirtualRegister(HalfTy); in split64BitValueForMapping()
667 Register HiLHS = MRI->createGenericVirtualRegister(HalfTy); in split64BitValueForMapping()
669 MRI->setRegBank(LoLHS, *Bank); in split64BitValueForMapping()
670 MRI->setRegBank(HiLHS, *Bank); in split64BitValueForMapping()
681 /// Replace the current type each register in \p Regs has with \p NewTy
757 /// Legalize instruction \p MI where operands in \p OpIndices must be SGPRs. If
785 const TargetRegisterClass *WaveRC = TRI->getWaveMaskRegClass(); in executeInWaterfallLoop()
815 MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock(); in executeInWaterfallLoop()
816 MachineBasicBlock *BodyBB = MF->CreateMachineBasicBlock(); in executeInWaterfallLoop()
817 MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock(); in executeInWaterfallLoop()
818 MachineBasicBlock *RestoreExecBB = MF->CreateMachineBasicBlock(); in executeInWaterfallLoop()
821 MF->insert(MBBI, LoopBB); in executeInWaterfallLoop()
822 MF->insert(MBBI, BodyBB); in executeInWaterfallLoop()
823 MF->insert(MBBI, RestoreExecBB); in executeInWaterfallLoop()
824 MF->insert(MBBI, RemainderBB); in executeInWaterfallLoop()
826 LoopBB->addSuccessor(BodyBB); in executeInWaterfallLoop()
827 BodyBB->addSuccessor(RestoreExecBB); in executeInWaterfallLoop()
828 BodyBB->addSuccessor(LoopBB); in executeInWaterfallLoop()
831 RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB); in executeInWaterfallLoop()
832 RemainderBB->splice(RemainderBB->begin(), &MBB, Range.end(), MBB.end()); in executeInWaterfallLoop()
835 RestoreExecBB->addSuccessor(RemainderBB); in executeInWaterfallLoop()
837 B.setInsertPt(*LoopBB, LoopBB->end()); in executeInWaterfallLoop()
852 BodyBB->splice(BodyBB->end(), &MBB, Range.begin(), MBB.end()); in executeInWaterfallLoop()
856 auto NewEnd = BodyBB->end(); in executeInWaterfallLoop()
875 Op.setReg(OldVal->second); in executeInWaterfallLoop()
931 // Make sure we don't re-process this register again. in executeInWaterfallLoop()
936 // The ballot becomes a no-op during instruction selection. in executeInWaterfallLoop()
943 // Update EXEC, save the original EXEC value to VCC. in executeInWaterfallLoop()
950 B.setInsertPt(*BodyBB, BodyBB->end()); in executeInWaterfallLoop()
958 // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use in executeInWaterfallLoop()
965 BuildMI(MBB, MBB.end(), DL, TII->get(MovExecOpc), SaveExecReg) in executeInWaterfallLoop()
976 B.setInsertPt(*RemainderBB, RemainderBB->begin()); in executeInWaterfallLoop()
981 // Return any unique registers used by \p MI at \p OpIndices that need to be
982 // handled in a waterfall loop. Returns these registers in \p
992 if (OpBank->getID() != AMDGPU::SGPRRegBankID) in collectWaterfallOperands()
1027 /// Split \p Ty into 2 pieces. The first will have \p FirstSize bits, and the
1032 return {LLT::scalar(FirstSize), LLT::scalar(TotalSize - FirstSize)}; in splitUnequalType()
1039 unsigned RemainderElts = (TotalSize - FirstSize) / EltSize; in splitUnequalType()
1073 const unsigned MemSize = 8 * MMO->getSize().getValue(); in applyMappingLoad()
1083 ((MemSize == 8 && MMO->getAlign() >= Align(1)) || in applyMappingLoad()
1084 (MemSize == 16 && MMO->getAlign() >= Align(2))) && in applyMappingLoad()
1094 // This is an extending load from a sub-dword size. Widen the memory in applyMappingLoad()
1109 // 96-bit loads are only available for vector loads. We need to split this in applyMappingLoad()
1110 // into a 64-bit part, and 32 (unless we can widen to a 128-bit load). in applyMappingLoad()
1111 if (MMO->getAlign() < Align(16)) { in applyMappingLoad()
1134 // 128-bit loads are supported for all instruction types. in applyMappingLoad()
1197 Register SPReg = Info->getStackPtrOffsetReg(); in applyMappingDynStackAlloc()
1255 if (TII->splitMUBUFOffset(*Imm, SOffset, ImmOffset, Alignment)) { in setBufferOffsets()
1260 B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank); in setBufferOffsets()
1261 B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank); in setBufferOffsets()
1274 TII->splitMUBUFOffset(Offset, SOffset, ImmOffset, Alignment)) { in setBufferOffsets()
1278 B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank); in setBufferOffsets()
1280 return 0; // XXX - Why is this 0? in setBufferOffsets()
1286 B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank); in setBufferOffsets()
1289 return 0; // XXX - Why is this 0? in setBufferOffsets()
1296 Register Src0 = getSrcRegIgnoringCopies(Add->getOperand(1).getReg(), *MRI); in setBufferOffsets()
1297 Register Src1 = getSrcRegIgnoringCopies(Add->getOperand(2).getReg(), *MRI); in setBufferOffsets()
1321 B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank); in setBufferOffsets()
1325 B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank); in setBufferOffsets()
1346 // FIXME: 96-bit case was widened during legalize. We need to narrow it back in applyMappingSBufferLoad()
1369 // TODO: 96-bit loads were widened to 128-bit results. Shrink the result if we in applyMappingSBufferLoad()
1386 B.getMRI()->setRegBank(VIndex, AMDGPU::VGPRRegBank); in applyMappingSBufferLoad()
1472 // There are no 64-bit vgpr bitfield extract instructions so the operation in applyMappingBFE()
1482 // A 64-bit bitfield extract uses the 32-bit bitfield extract instructions in applyMappingBFE()
1485 // Use the 32-bit bitfield extract instruction if the width is a constant. in applyMappingBFE()
1486 // Depending on the width size, use either the low or high 32-bits. in applyMappingBFE()
1488 auto WidthImm = ConstWidth->Value.getZExtValue(); in applyMappingBFE()
1490 // Use bitfield extract on the lower 32-bit source, and then sign-extend in applyMappingBFE()
1491 // or clear the upper 32-bits. in applyMappingBFE()
1499 // Use bitfield extract on upper 32-bit source, and combine with lower in applyMappingBFE()
1500 // 32-bit source. in applyMappingBFE()
1501 auto UpperWidth = B.buildConstant(S32, WidthImm - 32); in applyMappingBFE()
1512 // Expand to Src >> Offset << (64 - Width) >> (64 - Width) using 64-bit in applyMappingBFE()
1611 // Accumulate and produce the "carry-out" bit. in applyMappingMAD_64_32()
1613 // The "carry-out" is defined as bit 64 of the result when computed as a in applyMappingMAD_64_32()
1614 // big integer. For unsigned multiply-add, this matches the usual definition in applyMappingMAD_64_32()
1615 // of carry-out. For signed multiply-add, bit 64 is the sign bit of the in applyMappingMAD_64_32()
1617 // sign(Src0 * Src1) + sign(Src2) + carry-out from unsigned 64-bit add in applyMappingMAD_64_32()
1718 // Emit a legalized extension from <2 x s16> to 2 32-bit components, avoiding
1771 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) in handleD16VData()
1818 ImmOffset -= Overflow; in splitBufferOffsets()
1893 /// Implement extending a 32-bit value to a 64-bit value. \p Lo32Reg is the
1894 /// original 32-bit source value (to be inserted in the low part of the combined
1895 /// 64-bit result), and \p Hi32Reg is the high half of the combined 64-bit
1910 // Replicate sign bit from 32-bit extended part. in extendLow32IntoHigh32()
1912 B.getMRI()->setRegBank(ShiftAmt.getReg(0), RegBank); in extendLow32IntoHigh32()
1957 Idx = B.buildCopy(S32, Idx)->getOperand(0).getReg(); in foldExtractEltToCmpSelect()
1976 MRI.setRegBank(IC->getOperand(0).getReg(), AMDGPU::SGPRRegBank); in foldExtractEltToCmpSelect()
1978 MRI.setRegBank(Cmp->getOperand(0).getReg(), CCBank); in foldExtractEltToCmpSelect()
1985 MRI.setRegBank(S->getOperand(N).getReg(), DstBank); in foldExtractEltToCmpSelect()
1987 Res[L] = S->getOperand(0).getReg(); in foldExtractEltToCmpSelect()
2058 Idx = B.buildCopy(S32, Idx)->getOperand(0).getReg(); in foldInsertEltToCmpSelect()
2077 MRI.setRegBank(IC->getOperand(0).getReg(), AMDGPU::SGPRRegBank); in foldInsertEltToCmpSelect()
2079 MRI.setRegBank(Cmp->getOperand(0).getReg(), CCBank); in foldInsertEltToCmpSelect()
2098 MRI.setRegBank(Vec->getOperand(0).getReg(), DstBank); in foldInsertEltToCmpSelect()
2108 // Break s_mul_u64 into 32-bit vector operations.
2151 // -------------------- in applyMappingSMULU64()
2154 // ----------------------------------------- in applyMappingSMULU64()
2157 // We drop Op1H*Op0H because the result of the multiplication is a 64-bit in applyMappingSMULU64()
2159 // The low 32-bit value is Op1L*Op0L. in applyMappingSMULU64()
2160 // The high 32-bit value is Op1H*Op0L + Op1L*Op0H + carry (from in applyMappingSMULU64()
2205 uint64_t ConstVal = MI.getOperand(1).getCImm()->getZExtValue(); in applyMappingImpl()
2226 // phis. For VCC, blindly inserting a copy when the phi is lowered will in applyMappingImpl()
2236 B.setInsertPt(*SrcMBB, SrcMBB->getFirstTerminator()); in applyMappingImpl()
2283 // will end up using a copy to a 32-bit vreg. in applyMappingImpl()
2299 // If we had a constrained VCC result register, a copy was inserted to VCC in applyMappingImpl()
2384 // 64-bit and is only available on the SALU, so split into 2 32-bit ops if in applyMappingImpl()
2395 MachineFunction *MF = MI.getParent()->getParent(); in applyMappingImpl()
2453 MachineFunction *MF = MI.getParent()->getParent(); in applyMappingImpl()
2477 // s_mul_u64. Hence, we have to break down s_mul_u64 into 32-bit vector in applyMappingImpl()
2484 // 16-bit operations are VALU only, but can be promoted to 32-bit SALU. in applyMappingImpl()
2485 // Packed 16-bit operations need to be scalarized and promoted. in applyMappingImpl()
2496 MachineFunction *MF = MBB->getParent(); in applyMappingImpl()
2545 // where the 33 higher bits are sign-extended and in applyMappingImpl()
2547 // where the 32 higher bits are zero-extended. In case scalar registers are in applyMappingImpl()
2561 "that handles only 64-bit operands."); in applyMappingImpl()
2568 MI.setDesc(TII->get(AMDGPU::S_MUL_U64)); in applyMappingImpl()
2638 B.buildSExtInReg(DstRegs[1], DstRegs[0], Amt - 32); in applyMappingImpl()
2684 // which return -1 when the input is zero: in applyMappingImpl()
2685 // (ctlz_zero_undef hi:lo) -> (umin (ffbh hi), (add (ffbh lo), 32)) in applyMappingImpl()
2686 // (cttz_zero_undef hi:lo) -> (umin (add (ffbl hi), 32), (ffbl lo)) in applyMappingImpl()
2687 // (ffbh hi:lo) -> (umin (ffbh hi), (uaddsat (ffbh lo), 32)) in applyMappingImpl()
2688 // (ffbl hi:lo) -> (umin (uaddsat (ffbl hi), 32), (ffbl lo)) in applyMappingImpl()
2732 // Extend to 32-bit, and then extend the low half. in applyMappingImpl()
2751 // It is not legal to have a legalization artifact with a VCC source. Rather in applyMappingImpl()
2760 // 64-bit select is SGPR only in applyMappingImpl()
2762 SrcBank->getID() == AMDGPU::SGPRRegBankID; in applyMappingImpl()
2766 auto True = B.buildConstant(SelType, Signed ? -1 : 1); in applyMappingImpl()
2826 // Move the base register. We'll re-insert the add later. in applyMappingImpl()
2852 // Re-insert the constant offset add inside the waterfall loop. in applyMappingImpl()
2868 // Split the vector index into 32-bit pieces. Prepare to move all of the in applyMappingImpl()
2874 // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1). in applyMappingImpl()
2901 MachineBasicBlock *LoopBB = Extract1->getParent(); in applyMappingImpl()
2907 Extract0->getOperand(0).setReg(TmpReg0); in applyMappingImpl()
2908 Extract1->getOperand(0).setReg(TmpReg1); in applyMappingImpl()
2910 B.setInsertPt(*LoopBB, ++Extract1->getIterator()); in applyMappingImpl()
2957 // Move the base register. We'll re-insert the add later. in applyMappingImpl()
2965 // Re-insert the constant offset add inside the waterfall loop. in applyMappingImpl()
2981 // Split the vector index into 32-bit pieces. Prepare to move all of the in applyMappingImpl()
2987 // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1). in applyMappingImpl()
3032 // Re-insert the constant offset add inside the waterfall loop. in applyMappingImpl()
3134 constrainOpWithReadfirstlane(B, MI, MI.getNumOperands() - 1); // Index in applyMappingImpl()
3168 // Use default handling and insert copy to vcc source. in applyMappingImpl()
3180 assert(RSrcIntrin && RSrcIntrin->IsImage); in applyMappingImpl()
3181 // Non-images can have complications from operands that allow both SGPR in applyMappingImpl()
3184 applyMappingImage(B, MI, OpdMapper, RSrcIntrin->RsrcArg); in applyMappingImpl()
3188 unsigned N = MI.getNumExplicitOperands() - 2; in applyMappingImpl()
3262 constrainOpWithReadfirstlane(B, MI, MI.getNumOperands() - 1); // Index in applyMappingImpl()
3292 // Non-images can have complications from operands that allow both SGPR in applyMappingImpl()
3295 if (RSrcIntrin->IsImage) { in applyMappingImpl()
3296 applyMappingImage(B, MI, OpdMapper, RSrcIntrin->RsrcArg); in applyMappingImpl()
3320 // Move all non-copies before the copies, so that a complete range can be in applyMappingImpl()
3329 MBB->getParent()->getInfo<SIMachineFunctionInfo>(); in applyMappingImpl()
3330 while (Start->getOpcode() != FrameSetupOpcode) { in applyMappingImpl()
3331 --Start; in applyMappingImpl()
3333 if (Start->getOpcode() == AMDGPU::COPY) { in applyMappingImpl()
3334 auto &Dst = Start->getOperand(0); in applyMappingImpl()
3342 auto &Src = Start->getOperand(1); in applyMappingImpl()
3345 IsCopy = Info->getScratchRSrcReg() == Reg; in applyMappingImpl()
3361 MBB->splice(LastCopy, MBB, NonCopy->getIterator()); in applyMappingImpl()
3370 while (End->getOpcode() != FrameDestroyOpcode) { in applyMappingImpl()
3373 if (End->getOpcode() == AMDGPU::COPY) { in applyMappingImpl()
3374 auto &Src = End->getOperand(1); in applyMappingImpl()
3393 MBB->splice(LastCopy, MBB, NonCopy->getIterator()); in applyMappingImpl()
3453 // vgpr, sgpr -> vgpr
3454 // vgpr, agpr -> vgpr
3455 // agpr, agpr -> agpr
3456 // agpr, sgpr -> vgpr
3478 // vcc, vcc -> vcc in regBankBoolUnion()
3479 // vcc, sgpr -> vcc in regBankBoolUnion()
3480 // vcc, vgpr -> vcc in regBankBoolUnion()
3484 // vcc, vgpr -> vgpr in regBankBoolUnion()
3497 RegBank = regBankUnion(RegBank, Bank->getID()); in getMappingType()
3507 const MachineFunction &MF = *MI.getParent()->getParent(); in isSALUMapping()
3514 if (Bank->getID() != AMDGPU::SGPRRegBankID) in isSALUMapping()
3523 const MachineFunction &MF = *MI.getParent()->getParent(); in getDefaultMappingSOP()
3541 const MachineFunction &MF = *MI.getParent()->getParent(); in getDefaultMappingVOP()
3546 // the constant bus restriction. Force all sources to VGPR (except for VCC). in getDefaultMappingVOP()
3565 const MachineFunction &MF = *MI.getParent()->getParent(); in getDefaultMappingAllVGPR()
3638 return AMDGPU::getValueMapping(PtrBank->getID(), Size); in getValueMappingForPtr()
3644 const MachineFunction &MF = *MI.getParent()->getParent(); in getInstrMappingForLoad()
3693 return Bank ? Bank->getID() : Default; in getRegBankID()
3734 const MachineFunction &MF = *MI.getParent()->getParent(); in getInstrMapping()
3772 // It doesn't make sense to use vcc or scc banks here, so just ignore in getInstrMapping()
3787 // The default handling is broken and doesn't handle illegal SGPR->VGPR copies in getInstrMapping()
3793 Register DstReg = PHI->getReg(0); in getInstrMapping()
3797 ResultBank = DstBank->getID(); in getInstrMapping()
3799 for (unsigned I = 0; I < PHI->getNumIncomingValues(); ++I) { in getInstrMapping()
3800 Register Reg = PHI->getIncomingValue(I); in getInstrMapping()
3804 if (!Bank || Bank->getID() == AMDGPU::VGPRRegBankID) { in getInstrMapping()
3810 unsigned OpBank = Bank->getID(); in getInstrMapping()
3848 TargetBankID = DstBank->getID(); in getInstrMapping()
4013 // - Default SOP in getInstrMapping()
4014 // - Default VOP in getInstrMapping()
4015 // - Scalar multiply: src0 and src1 are SGPRs, the rest is VOP. in getInstrMapping()
4024 if (Bank->getID() != AMDGPU::SGPRRegBankID) { in getInstrMapping()
4037 // If the multiply-add is full-rate in VALU, use that even if the in getInstrMapping()
4193 switch (SrcBank->getID()) { in getInstrMapping()
4202 // Scalar extend can use 64-bit BFE, but VGPRs require extending to in getInstrMapping()
4203 // 32-bits, and then to 64. in getInstrMapping()
4205 OpdsMapping[1] = AMDGPU::getValueMappingSGPR64Only(SrcBank->getID(), in getInstrMapping()
4233 // See if the result register has already been constrained to vcc, which may in getInstrMapping()
4261 // TODO: Use 32-bit for scalar output size. in getInstrMapping()
4262 // SCC results will need to be copied to a 32-bit SGPR virtual register. in getInstrMapping()
4440 unsigned RSrcBank = OpdsMapping[1]->BreakDown[0].RegBank->getID(); in getInstrMapping()
4441 unsigned OffsetBank = OpdsMapping[2]->BreakDown[0].RegBank->getID(); in getInstrMapping()
4735 Info->mayNeedAGPRs() in getInstrMapping()
4741 Info->mayNeedAGPRs() in getInstrMapping()
4774 const int M0Idx = MI.getNumOperands() - 1; in getInstrMapping()
4852 // Non-images can have complications from operands that allow both SGPR in getInstrMapping()
4855 assert(RSrcIntrin->IsImage); in getInstrMapping()
4856 return getImageMapping(MRI, MI, RSrcIntrin->RsrcArg); in getInstrMapping()
4859 unsigned N = MI.getNumExplicitOperands() - 2; in getInstrMapping()
5082 const int M0Idx = MI.getNumOperands() - 1; in getInstrMapping()
5175 // TODO: Should report 32-bit for scalar condition type. in getInstrMapping()