Lines Matching +full:auto +full:- +full:load
1 //===--- ExpandMemCmp.cpp - Expand memcmp() to load/stores ----------------===//
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
9 // This pass tries to expand memcmp() calls into optimally-sized loads and
12 //===----------------------------------------------------------------------===//
42 #define DEBUG_TYPE "expand-memcmp"
51 "memcmp-num-loads-per-block", cl::Hidden, cl::init(1),
56 "max-loads-per-memcmp", cl::Hidden,
60 "max-loads-per-memcmp-opt-size", cl::Hidden,
61 cl::desc("Set maximum number of loads used in expanded memcmp for -Os/Oz"));
91 // comparing 33 bytes on X86+sse can be done with 2x16-byte loads and
92 // 1x1-byte load, which would be represented as [{16, 0}, {16, 16}, {1, 32}.
98 // The size of the load for this block, in bytes.
100 // The offset of this load from the base pointer, in bytes.
190 // We try to do as many non-overlapping loads as possible starting from the in computeOverlappingLoadSequence()
193 assert(NumNonOverlappingLoads && "there must be at least one load"); in computeOverlappingLoadSequence()
194 // There remain 0 to (MaxLoadSize - 1) bytes to load, this will be done with in computeOverlappingLoadSequence()
195 // an overlapping load. in computeOverlappingLoadSequence()
196 Size = Size - NumNonOverlappingLoads * MaxLoadSize; in computeOverlappingLoadSequence()
201 // Bail if the number of loads (non-overlapping + potential overlapping one) in computeOverlappingLoadSequence()
206 // Add non-overlapping loads. in computeOverlappingLoadSequence()
214 // Add the last overlapping load. in computeOverlappingLoadSequence()
216 LoadSequence.push_back({MaxLoadSize, Offset - (MaxLoadSize - Size)}); in computeOverlappingLoadSequence()
233 auto Last = LoadSequence[LoadSequence.size() - 1]; in optimiseLoadSequence()
234 auto PreLast = LoadSequence[LoadSequence.size() - 2]; in optimiseLoadSequence()
240 auto LoadSize = Last.LoadSize + PreLast.LoadSize; in optimiseLoadSequence()
253 // with given maximum load size and memcmp size parameter.
255 // 1. A list of load compare blocks - LoadCmpBlocks.
269 // Scale the max size down if the target can load more bytes than we need. in MemCmpExpansion()
274 assert(!LoadSizes.empty() && "cannot load Size bytes"); in MemCmpExpansion()
282 // If we allow overlapping loads and the load sequence is not already optimal, in MemCmpExpansion()
287 auto OverlappingLoads = computeOverlappingLoadSequence( in MemCmpExpansion()
309 BasicBlock *BB = BasicBlock::Create(CI->getContext(), "loadbb", in createLoadCmpBlocks()
310 EndBlock->getParent(), EndBlock); in createLoadCmpBlocks()
316 ResBlock.BB = BasicBlock::Create(CI->getContext(), "res_block", in createResultBlock()
317 EndBlock->getParent(), EndBlock); in createResultBlock()
325 Value *LhsSource = CI->getArgOperand(0); in getLoadPair()
326 Value *RhsSource = CI->getArgOperand(1); in getLoadPair()
327 Align LhsAlign = LhsSource->getPointerAlignment(DL); in getLoadPair()
328 Align RhsAlign = RhsSource->getPointerAlignment(DL); in getLoadPair()
330 auto *ByteType = Type::getInt8Ty(CI->getContext()); in getLoadPair()
337 // Create a constant or a load from the source. in getLoadPair()
339 if (auto *C = dyn_cast<Constant>(LhsSource)) in getLoadPair()
345 if (auto *C = dyn_cast<Constant>(RhsSource)) in getLoadPair()
359 CI->getModule(), Intrinsic::bswap, BSwapSizeType); in getLoadPair()
365 if (CmpSizeType != nullptr && CmpSizeType != Lhs->getType()) { in getLoadPair()
381 getLoadPair(Type::getInt8Ty(CI->getContext()), nullptr, in emitLoadCompareByteBlock()
382 Type::getInt32Ty(CI->getContext()), OffsetBytes); in emitLoadCompareByteBlock()
385 PhiRes->addIncoming(Diff, BB); in emitLoadCompareByteBlock()
387 if (BlockIndex < (LoadCmpBlocks.size() - 1)) { in emitLoadCompareByteBlock()
391 ConstantInt::get(Diff->getType(), 0)); in emitLoadCompareByteBlock()
396 DTU->applyUpdates( in emitLoadCompareByteBlock()
404 DTU->applyUpdates({{DominatorTree::Insert, BB, EndBlock}}); in emitLoadCompareByteBlock()
419 std::min(getNumLoads() - LoadIndex, NumLoadsPerBlockForZeroCmp); in getCompareLoadPairs()
421 // For a single-block expansion, start inserting before the memcmp call. in getCompareLoadPairs()
429 // comparison using xor+or. The type for the combinations is the largest load in getCompareLoadPairs()
433 : IntegerType::get(CI->getContext(), MaxLoadSize * 8); in getCompareLoadPairs()
438 IntegerType::get(CI->getContext(), CurLoadEntry.LoadSize * 8), nullptr, in getCompareLoadPairs()
448 // If there's only one load per block, we just compare the loaded values. in getCompareLoadPairs()
453 auto pairWiseOr = [&](std::vector<Value *> &InList) -> std::vector<Value *> { in getCompareLoadPairs()
455 for (unsigned i = 0; i < InList.size() - 1; i = i + 2) { in getCompareLoadPairs()
474 Cmp = Builder.CreateICmpNE(OrList[0], ConstantInt::get(Diff->getType(), 0)); in getCompareLoadPairs()
484 BasicBlock *NextBB = (BlockIndex == (LoadCmpBlocks.size() - 1)) in emitLoadCompareBlockMultipleLoads()
493 DTU->applyUpdates({{DominatorTree::Insert, BB, ResBlock.BB}, in emitLoadCompareBlockMultipleLoads()
499 if (BlockIndex == LoadCmpBlocks.size() - 1) { in emitLoadCompareBlockMultipleLoads()
500 Value *Zero = ConstantInt::get(Type::getInt32Ty(CI->getContext()), 0); in emitLoadCompareBlockMultipleLoads()
501 PhiRes->addIncoming(Zero, LoadCmpBlocks[BlockIndex]); in emitLoadCompareBlockMultipleLoads()
515 // There is one load per block in this case, BlockIndex == LoadIndex. in emitLoadCompareBlock()
524 IntegerType::get(CI->getContext(), CurLoadEntry.LoadSize * 8); in emitLoadCompareBlock()
527 ? IntegerType::get(CI->getContext(), in emitLoadCompareBlock()
531 CI->getContext(), in emitLoadCompareBlock()
533 assert(CurLoadEntry.LoadSize <= MaxLoadSize && "Unexpected load type"); in emitLoadCompareBlock()
543 ResBlock.PhiSrc1->addIncoming(Loads.Lhs, LoadCmpBlocks[BlockIndex]); in emitLoadCompareBlock()
544 ResBlock.PhiSrc2->addIncoming(Loads.Rhs, LoadCmpBlocks[BlockIndex]); in emitLoadCompareBlock()
548 BasicBlock *NextBB = (BlockIndex == (LoadCmpBlocks.size() - 1)) in emitLoadCompareBlock()
557 DTU->applyUpdates({{DominatorTree::Insert, BB, NextBB}, in emitLoadCompareBlock()
563 if (BlockIndex == LoadCmpBlocks.size() - 1) { in emitLoadCompareBlock()
564 Value *Zero = ConstantInt::get(Type::getInt32Ty(CI->getContext()), 0); in emitLoadCompareBlock()
565 PhiRes->addIncoming(Zero, LoadCmpBlocks[BlockIndex]); in emitLoadCompareBlock()
570 // memcmp result. It compares the two loaded source values and returns -1 if
576 BasicBlock::iterator InsertPt = ResBlock.BB->getFirstInsertionPt(); in emitMemCmpResultBlock()
578 Value *Res = ConstantInt::get(Type::getInt32Ty(CI->getContext()), 1); in emitMemCmpResultBlock()
579 PhiRes->addIncoming(Res, ResBlock.BB); in emitMemCmpResultBlock()
583 DTU->applyUpdates({{DominatorTree::Insert, ResBlock.BB, EndBlock}}); in emitMemCmpResultBlock()
586 BasicBlock::iterator InsertPt = ResBlock.BB->getFirstInsertionPt(); in emitMemCmpResultBlock()
593 Builder.CreateSelect(Cmp, ConstantInt::get(Builder.getInt32Ty(), -1), in emitMemCmpResultBlock()
596 PhiRes->addIncoming(Res, ResBlock.BB); in emitMemCmpResultBlock()
600 DTU->applyUpdates({{DominatorTree::Insert, ResBlock.BB, EndBlock}}); in emitMemCmpResultBlock()
604 Type *MaxLoadType = IntegerType::get(CI->getContext(), MaxLoadSize * 8); in setupResultBlockPHINodes()
606 // Note: this assumes one load per block. in setupResultBlockPHINodes()
614 Builder.SetInsertPoint(EndBlock, EndBlock->begin()); in setupEndBlockPHINodes()
615 PhiRes = Builder.CreatePHI(Type::getInt32Ty(CI->getContext()), 2, "phi.res"); in setupEndBlockPHINodes()
631 /// load and compare can bypass the compare, branch, and phi IR that is required
637 return Builder.CreateZExt(Cmp, Type::getInt32Ty(CI->getContext())); in getMemCmpEqZeroOneBlock()
640 /// A memcmp expansion that only has one block of load and compare can bypass
647 Type *LoadSizeType = IntegerType::get(CI->getContext(), Size * 8); in getMemCmpOneBlock()
649 NeedsBSwap ? IntegerType::get(CI->getContext(), PowerOf2Ceil(Size * 8)) in getMemCmpOneBlock()
652 IntegerType::get(CI->getContext(), in getMemCmpOneBlock()
669 if (CI->hasOneUser()) { in getMemCmpOneBlock()
670 auto *UI = cast<Instruction>(*CI->user_begin()); in getMemCmpOneBlock()
680 Shift == (CI->getType()->getIntegerBitWidth() - 1)) { in getMemCmpOneBlock()
691 auto *Result = NeedsZExt ? Builder.CreateZExt(Cmp, UI->getType()) : Cmp; in getMemCmpOneBlock()
692 UI->replaceAllUsesWith(Result); in getMemCmpOneBlock()
693 UI->eraseFromParent(); in getMemCmpOneBlock()
694 CI->eraseFromParent(); in getMemCmpOneBlock()
701 // If a target prefers to use selects to get -1/0/1, they should be able in getMemCmpOneBlock()
715 // Create the basic block framework for a multi-block expansion. in getMemCmpExpansion()
717 BasicBlock *StartBlock = CI->getParent(); in getMemCmpExpansion()
725 // two loaded source values of each load compare block. in getMemCmpExpansion()
729 // Create the number of required load compare basic blocks. in getMemCmpExpansion()
734 StartBlock->getTerminator()->setSuccessor(0, LoadCmpBlocks[0]); in getMemCmpExpansion()
736 DTU->applyUpdates({{DominatorTree::Insert, StartBlock, LoadCmpBlocks[0]}, in getMemCmpExpansion()
740 Builder.SetCurrentDebugLocation(CI->getDebugLoc()); in getMemCmpExpansion()
769 /// %4 = load i64, i64* %2
770 /// %5 = load i64, i64* %3
781 /// %11 = select i1 %10, i32 -1, i32 1
790 /// %18 = load i32, i32* %16
791 /// %19 = load i32, i32* %17
806 /// %32 = load i16, i16* %30
807 /// %33 = load i16, i16* %31
820 /// %44 = load i8, i8* %42
821 /// %45 = load i8, i8* %43
836 // Early exit from expansion if -Oz. in expandMemCmp()
837 if (CI->getFunction()->hasMinSize()) in expandMemCmp()
841 ConstantInt *SizeCast = dyn_cast<ConstantInt>(CI->getArgOperand(2)); in expandMemCmp()
846 const uint64_t SizeVal = SizeCast->getZExtValue(); in expandMemCmp()
852 // available load sizes. in expandMemCmp()
855 bool OptForSize = CI->getFunction()->hasOptSize() || in expandMemCmp()
856 llvm::shouldOptimizeForSize(CI->getParent(), PSI, BFI); in expandMemCmp()
857 auto Options = TTI->enableMemCmpExpansion(OptForSize, in expandMemCmp()
883 CI->replaceAllUsesWith(Res); in expandMemCmp()
884 CI->eraseFromParent(); in expandMemCmp()
913 auto *TPC = getAnalysisIfAvailable<TargetPassConfig>(); in runOnFunction()
918 TPC->getTM<TargetMachine>().getSubtargetImpl(F)->getTargetLowering(); in runOnFunction()
924 auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); in runOnFunction()
925 auto *BFI = (PSI && PSI->hasProfileSummary()) ? in runOnFunction()
929 if (auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>()) in runOnFunction()
930 DT = &DTWP->getDomTree(); in runOnFunction()
931 auto PA = runImpl(F, TLI, TTI, TL, PSI, BFI, DT); in runOnFunction()
956 if (TLI->getLibFunc(*CI, Func) && in runOnBlock()
975 for (auto BBIt = F.begin(); BBIt != F.end();) { in runImpl()
999 const auto *TL = TM->getSubtargetImpl(F)->getTargetLowering(); in run()
1000 const auto &TLI = FAM.getResult<TargetLibraryAnalysis>(F); in run()
1001 const auto &TTI = FAM.getResult<TargetIRAnalysis>(F); in run()
1002 auto *PSI = FAM.getResult<ModuleAnalysisManagerFunctionProxy>(F) in run()
1004 BlockFrequencyInfo *BFI = (PSI && PSI->hasProfileSummary()) in run()
1007 auto *DT = FAM.getCachedResult<DominatorTreeAnalysis>(F); in run()
1014 "Expand memcmp() to load/stores", false, false)
1021 "Expand memcmp() to load/stores", false, false) in INITIALIZE_PASS_DEPENDENCY()