//===- LoopIdiomRecognize.cpp - Loop idiom recognition -------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass implements an idiom recognizer that transforms simple loops into a
// non-loop form.  In cases that this kicks in, it can be a significant
// performance win.
//
// If compiling for code size we avoid idiom recognition if the resulting
// code could be larger than the code for the original loop. One way this could
// happen is if the loop is not removable after idiom recognition due to the
// presence of non-idiom instructions. The initial implementation of the
// heuristics applies to idioms in multi-block loops.
//
//===----------------------------------------------------------------------===//
//
// TODO List:
//
// Future loop memory idioms to recognize:
//   memcmp, memmove, strlen, etc.
// Future floating point idioms to recognize in -ffast-math mode:
//   fpowi
// Future integer operation idioms to recognize:
//   ctpop
//
// Beware that isel's default lowering for ctpop is highly inefficient for
// i64 and larger types when i64 is legal and the value has few bits set.  It
// would be good to enhance isel to emit a loop for ctpop in this case.
//
// This could recognize common matrix multiplies and dot product idioms and
// replace them with calls to BLAS (if linked in??).
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Scalar/LoopIdiomRecognize.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopPass.h"
#include "llvm/Analysis/MemoryLocation.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpander.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/IR/Verifier.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Scalar/LoopPassManager.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/BuildLibCalls.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <utility>
#include <vector>

using namespace llvm;

#define DEBUG_TYPE "loop-idiom"

STATISTIC(NumMemSet, "Number of memset's formed from loop stores");
STATISTIC(NumMemCpy, "Number of memcpy's formed from loop load+stores");
STATISTIC(NumBCmp, "Number of memcmp's formed from loop 2xload+eq-compare");

static cl::opt<bool> UseLIRCodeSizeHeurs(
    "use-lir-code-size-heurs",
    cl::desc("Use loop idiom recognition code size heuristics when compiling "
             "with -Os/-Oz"),
    cl::init(true), cl::Hidden);

namespace {

// FIXME: reinventing the wheel much? Is there a cleaner solution?
struct PMAbstraction {
  virtual void markLoopAsDeleted(Loop *L) = 0;
  virtual ~PMAbstraction() = default;
};
struct LegacyPMAbstraction : PMAbstraction {
  LPPassManager &LPM;
  LegacyPMAbstraction(LPPassManager &LPM) : LPM(LPM) {}
  virtual ~LegacyPMAbstraction() = default;
  void markLoopAsDeleted(Loop *L) override { LPM.markLoopAsDeleted(*L); }
};
struct NewPMAbstraction : PMAbstraction {
  LPMUpdater &Updater;
  NewPMAbstraction(LPMUpdater &Updater) : Updater(Updater) {}
  virtual ~NewPMAbstraction() = default;
  void markLoopAsDeleted(Loop *L) override {
    Updater.markLoopAsDeleted(*L, L->getName());
  }
};

class LoopIdiomRecognize {
  Loop *CurLoop = nullptr;
  AliasAnalysis *AA;
  DominatorTree *DT;
  LoopInfo *LI;
  ScalarEvolution *SE;
  TargetLibraryInfo *TLI;
  const TargetTransformInfo *TTI;
  const DataLayout *DL;
  PMAbstraction &LoopDeleter;
  OptimizationRemarkEmitter &ORE;
  bool ApplyCodeSizeHeuristics;

public:
  explicit LoopIdiomRecognize(AliasAnalysis *AA, DominatorTree *DT,
                              LoopInfo *LI, ScalarEvolution *SE,
                              TargetLibraryInfo *TLI,
                              const TargetTransformInfo *TTI,
                              const DataLayout *DL, PMAbstraction &LoopDeleter,
                              OptimizationRemarkEmitter &ORE)
      : AA(AA), DT(DT), LI(LI), SE(SE), TLI(TLI), TTI(TTI), DL(DL),
        LoopDeleter(LoopDeleter), ORE(ORE) {}

  bool runOnLoop(Loop *L);

private:
  using StoreList = SmallVector<StoreInst *, 8>;
  using StoreListMap = MapVector<Value *, StoreList>;

  StoreListMap StoreRefsForMemset;
  StoreListMap StoreRefsForMemsetPattern;
  StoreList StoreRefsForMemcpy;
  bool HasMemset;
  bool HasMemsetPattern;
  bool HasMemcpy;
  bool HasMemCmp;
  bool HasBCmp;

  /// Return code for isLegalStore()
  enum LegalStoreKind {
    None = 0,
    Memset,
    MemsetPattern,
    Memcpy,
    UnorderedAtomicMemcpy,
    DontUse // Dummy retval never to be used. Allows catching errors in retval
            // handling.
  };

  /// \name Countable Loop Idiom Handling
  /// @{

  bool runOnCountableLoop();
  bool runOnLoopBlock(BasicBlock *BB, const SCEV *BECount,
                      SmallVectorImpl<BasicBlock *> &ExitBlocks);

  void collectStores(BasicBlock *BB);
  LegalStoreKind isLegalStore(StoreInst *SI);
  enum class ForMemset { No, Yes };
  bool processLoopStores(SmallVectorImpl<StoreInst *> &SL, const SCEV *BECount,
                         ForMemset For);
  bool processLoopMemSet(MemSetInst *MSI, const SCEV *BECount);

  bool processLoopStridedStore(Value *DestPtr, unsigned StoreSize,
                               unsigned StoreAlignment, Value *StoredVal,
                               Instruction *TheStore,
                               SmallPtrSetImpl<Instruction *> &Stores,
                               const SCEVAddRecExpr *Ev, const SCEV *BECount,
                               bool NegStride, bool IsLoopMemset = false);
  bool processLoopStoreOfLoopLoad(StoreInst *SI, const SCEV *BECount);
  bool avoidLIRForMultiBlockLoop(bool IsMemset = false,
                                 bool IsLoopMemset = false);

  /// @}
  /// \name Noncountable Loop Idiom Handling
  /// @{

  bool runOnNoncountableLoop();

  struct CmpLoopStructure {
    Value *BCmpValue, *LatchCmpValue;
    BasicBlock *HeaderBrEqualBB, *HeaderBrUnequalBB;
    BasicBlock *LatchBrFinishBB, *LatchBrContinueBB;
  };
  bool matchBCmpLoopStructure(CmpLoopStructure &CmpLoop) const;
  struct CmpOfLoads {
    ICmpInst::Predicate BCmpPred;
    Value *LoadSrcA, *LoadSrcB;
    Value *LoadA, *LoadB;
  };
  bool matchBCmpOfLoads(Value *BCmpValue, CmpOfLoads &CmpOfLoads) const;
  bool recognizeBCmpLoopControlFlow(const CmpOfLoads &CmpOfLoads,
                                    CmpLoopStructure &CmpLoop) const;
  bool recognizeBCmpLoopSCEV(uint64_t BCmpTyBytes, CmpOfLoads &CmpOfLoads,
                             const SCEV *&SrcA, const SCEV *&SrcB,
                             const SCEV *&Iterations) const;
  bool detectBCmpIdiom(ICmpInst *&BCmpInst, CmpInst *&LatchCmpInst,
                       LoadInst *&LoadA, LoadInst *&LoadB, const SCEV *&SrcA,
                       const SCEV *&SrcB, const SCEV *&NBytes) const;
  BasicBlock *transformBCmpControlFlow(ICmpInst *ComparedEqual);
  void transformLoopToBCmp(ICmpInst *BCmpInst, CmpInst *LatchCmpInst,
                           LoadInst *LoadA, LoadInst *LoadB, const SCEV *SrcA,
                           const SCEV *SrcB, const SCEV *NBytes);
  bool recognizeBCmp();

  bool recognizePopcount();
  void transformLoopToPopcount(BasicBlock *PreCondBB, Instruction *CntInst,
                               PHINode *CntPhi, Value *Var);
  bool recognizeAndInsertFFS(); /// Find First Set: ctlz or cttz
  void transformLoopToCountable(Intrinsic::ID IntrinID, BasicBlock *PreCondBB,
                                Instruction *CntInst, PHINode *CntPhi,
                                Value *Var, Instruction *DefX,
                                const DebugLoc &DL, bool ZeroCheck,
                                bool IsCntPhiUsedOutsideLoop);

  /// @}
};

class LoopIdiomRecognizeLegacyPass : public LoopPass {
public:
  static char ID;

  explicit LoopIdiomRecognizeLegacyPass() : LoopPass(ID) {
    initializeLoopIdiomRecognizeLegacyPassPass(
        *PassRegistry::getPassRegistry());
  }

  bool runOnLoop(Loop *L, LPPassManager &LPM) override {
    if (skipLoop(L))
      return false;

    AliasAnalysis *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
    DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
    LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
    ScalarEvolution *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
    TargetLibraryInfo *TLI =
        &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(
            *L->getHeader()->getParent());
    const TargetTransformInfo *TTI =
        &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
            *L->getHeader()->getParent());
    const DataLayout *DL = &L->getHeader()->getModule()->getDataLayout();
    LegacyPMAbstraction LoopDeleter(LPM);

    // For the old PM, we can't use OptimizationRemarkEmitter as an analysis
    // pass. Function analyses need to be preserved across loop transformations
    // but ORE cannot be preserved (see comment before the pass definition).
    OptimizationRemarkEmitter ORE(L->getHeader()->getParent());

    LoopIdiomRecognize LIR(AA, DT, LI, SE, TLI, TTI, DL, LoopDeleter, ORE);
    return LIR.runOnLoop(L);
  }

  /// This transformation requires natural loop information & requires that
  /// loop preheaders be inserted into the CFG.
  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<TargetLibraryInfoWrapperPass>();
    AU.addRequired<TargetTransformInfoWrapperPass>();
    getLoopAnalysisUsage(AU);
  }
};

} // end anonymous namespace

char LoopIdiomRecognizeLegacyPass::ID = 0;

PreservedAnalyses LoopIdiomRecognizePass::run(Loop &L, LoopAnalysisManager &AM,
                                              LoopStandardAnalysisResults &AR,
                                              LPMUpdater &Updater) {
  const auto *DL = &L.getHeader()->getModule()->getDataLayout();

  const auto &FAM =
      AM.getResult<FunctionAnalysisManagerLoopProxy>(L, AR).getManager();
  Function *F = L.getHeader()->getParent();

  auto *ORE = FAM.getCachedResult<OptimizationRemarkEmitterAnalysis>(*F);
  // FIXME: This should probably be optional rather than required.
  if (!ORE)
    report_fatal_error(
        "LoopIdiomRecognizePass: OptimizationRemarkEmitterAnalysis not cached "
        "at a higher level");

  NewPMAbstraction LoopDeleter(Updater);
  LoopIdiomRecognize LIR(&AR.AA, &AR.DT, &AR.LI, &AR.SE, &AR.TLI, &AR.TTI, DL,
                         LoopDeleter, *ORE);
  if (!LIR.runOnLoop(&L))
    return PreservedAnalyses::all();

  return getLoopPassPreservedAnalyses();
}

INITIALIZE_PASS_BEGIN(LoopIdiomRecognizeLegacyPass, "loop-idiom",
                      "Recognize loop idioms", false, false)
INITIALIZE_PASS_DEPENDENCY(LoopPass)
INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_END(LoopIdiomRecognizeLegacyPass, "loop-idiom",
                    "Recognize loop idioms", false, false)

Pass *llvm::createLoopIdiomPass() { return new LoopIdiomRecognizeLegacyPass(); }

static void deleteDeadInstruction(Instruction *I) {
  I->replaceAllUsesWith(UndefValue::get(I->getType()));
  I->eraseFromParent();
}

//===----------------------------------------------------------------------===//
//
//          Implementation of LoopIdiomRecognize
//
//===----------------------------------------------------------------------===//

bool LoopIdiomRecognize::runOnLoop(Loop *L) {
  CurLoop = L;
  // If the loop could not be converted to canonical form, it must have an
  // indirectbr in it; just give up.
  if (!L->getLoopPreheader())
    return false;

  // Disable loop idiom recognition if the function's name is a common idiom.
  StringRef Name = L->getHeader()->getParent()->getName();
  if (Name == "memset" || Name == "memcpy" || Name == "memcmp" ||
      Name == "bcmp")
    return false;

  // Determine if code size heuristics need to be applied.
  ApplyCodeSizeHeuristics =
      L->getHeader()->getParent()->hasOptSize() && UseLIRCodeSizeHeurs;

  HasMemset = TLI->has(LibFunc_memset);
  HasMemsetPattern = TLI->has(LibFunc_memset_pattern16);
  HasMemcpy = TLI->has(LibFunc_memcpy);
  HasMemCmp = TLI->has(LibFunc_memcmp);
  HasBCmp = TLI->has(LibFunc_bcmp);

  if (HasMemset || HasMemsetPattern || HasMemcpy || HasMemCmp || HasBCmp)
    if (SE->hasLoopInvariantBackedgeTakenCount(L))
      return runOnCountableLoop();

  return runOnNoncountableLoop();
}

bool LoopIdiomRecognize::runOnCountableLoop() {
  const SCEV *BECount = SE->getBackedgeTakenCount(CurLoop);
  assert(!isa<SCEVCouldNotCompute>(BECount) &&
         "runOnCountableLoop() called on a loop without a predictable "
         "backedge-taken count");

  // If this loop executes exactly one time, then it should be peeled, not
  // optimized by this pass.
  if (const SCEVConstant *BECst = dyn_cast<SCEVConstant>(BECount))
    if (BECst->getAPInt() == 0)
      return false;

  SmallVector<BasicBlock *, 8> ExitBlocks;
  CurLoop->getUniqueExitBlocks(ExitBlocks);

  LLVM_DEBUG(dbgs() << DEBUG_TYPE " Scanning: F["
                    << CurLoop->getHeader()->getParent()->getName()
                    << "] Countable Loop %" << CurLoop->getHeader()->getName()
                    << "\n");

  bool MadeChange = false;

  // The following transforms hoist stores/memsets into the loop pre-header.
  // Give up if the loop has instructions that may throw.
  SimpleLoopSafetyInfo SafetyInfo;
  SafetyInfo.computeLoopSafetyInfo(CurLoop);
  if (SafetyInfo.anyBlockMayThrow())
    return MadeChange;

  // Scan all the blocks in the loop that are not in subloops.
  for (auto *BB : CurLoop->getBlocks()) {
    // Ignore blocks in subloops.
    if (LI->getLoopFor(BB) != CurLoop)
      continue;

    MadeChange |= runOnLoopBlock(BB, BECount, ExitBlocks);
  }
  return MadeChange;
}

static APInt getStoreStride(const SCEVAddRecExpr *StoreEv) {
  const SCEVConstant *ConstStride = cast<SCEVConstant>(StoreEv->getOperand(1));
  return ConstStride->getAPInt();
}

/// getMemSetPatternValue - If a strided store of the specified value is safe
/// to turn into a memset_pattern16, return a ConstantArray of 16 bytes that
/// should be passed in.  Otherwise, return null.
///
/// Note that we don't ever attempt to use memset_pattern8 or 4, because these
/// just replicate their input array and then pass on to memset_pattern16.
static Constant *getMemSetPatternValue(Value *V, const DataLayout *DL) {
  // FIXME: This could check for UndefValue because it can be merged into any
  // other valid pattern.

  // If the value isn't a constant, we can't promote it to being in a constant
  // array.  We could theoretically do a store to an alloca or something, but
  // that doesn't seem worthwhile.
  Constant *C = dyn_cast<Constant>(V);
  if (!C)
    return nullptr;

  // Only handle simple values that are a power of two bytes in size.
  uint64_t Size = DL->getTypeSizeInBits(V->getType());
  if (Size == 0 || (Size & 7) || (Size & (Size - 1)))
    return nullptr;

  // Don't care enough about darwin/ppc to implement this.
  if (DL->isBigEndian())
    return nullptr;

  // Convert to size in bytes.
  Size /= 8;

  // TODO: If CI is larger than 16-bytes, we can try slicing it in half to see
  // if the top and bottom are the same (e.g. for vectors and large integers).
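  // memset_pattern16 takes exactly 16 bytes of pattern data, so a constant
  // wider than that cannot be used as-is and is rejected here.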
  if (Size > 16)
    return nullptr;

  // If the constant is exactly 16 bytes, just use it.
  if (Size == 16)
    return C;

  // Otherwise, we'll use an array of the constants.
  unsigned ArraySize = 16 / Size;
  ArrayType *AT = ArrayType::get(V->getType(), ArraySize);
  return ConstantArray::get(AT, std::vector<Constant *>(ArraySize, C));
}

LoopIdiomRecognize::LegalStoreKind
LoopIdiomRecognize::isLegalStore(StoreInst *SI) {
  // Don't touch volatile stores.
  if (SI->isVolatile())
    return LegalStoreKind::None;
  // We only want simple or unordered-atomic stores.
  if (!SI->isUnordered())
    return LegalStoreKind::None;

  // Don't convert stores of non-integral pointer types to memsets (which
  // store integers).
  if (DL->isNonIntegralPointerType(SI->getValueOperand()->getType()))
    return LegalStoreKind::None;

  // Avoid merging nontemporal stores.
  if (SI->getMetadata(LLVMContext::MD_nontemporal))
    return LegalStoreKind::None;

  Value *StoredVal = SI->getValueOperand();
  Value *StorePtr = SI->getPointerOperand();

  // Reject stores that are so large that they overflow an unsigned.
  uint64_t SizeInBits = DL->getTypeSizeInBits(StoredVal->getType());
  if ((SizeInBits & 7) || (SizeInBits >> 32) != 0)
    return LegalStoreKind::None;

  // See if the pointer expression is an AddRec like {base,+,1} on the current
  // loop, which indicates a strided store.  If we have something else, it's a
  // random store we can't handle.
  const SCEVAddRecExpr *StoreEv =
      dyn_cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
  if (!StoreEv || StoreEv->getLoop() != CurLoop || !StoreEv->isAffine())
    return LegalStoreKind::None;

  // Check to see if we have a constant stride.
  if (!isa<SCEVConstant>(StoreEv->getOperand(1)))
    return LegalStoreKind::None;

  // See if the store can be turned into a memset.

  // If the stored value is a byte-wise value (like i32 -1), then it may be
  // turned into a memset of i8 -1, assuming that all the consecutive bytes
  // are stored.  A store of i32 0x01020304 can never be turned into a memset,
  // but it can be turned into memset_pattern if the target supports it.
  Value *SplatValue = isBytewiseValue(StoredVal, *DL);
  Constant *PatternValue = nullptr;

  // Note: memset and memset_pattern on unordered-atomic stores is not yet
  // supported.
  bool UnorderedAtomic = SI->isUnordered() && !SI->isSimple();

  // If we're allowed to form a memset, and the stored value would be
  // acceptable for memset, use it.
  if (!UnorderedAtomic && HasMemset && SplatValue &&
      // Verify that the stored value is loop invariant.  If not, we can't
      // promote the memset.
      CurLoop->isLoopInvariant(SplatValue)) {
    // It looks like we can use SplatValue.
    return LegalStoreKind::Memset;
  } else if (!UnorderedAtomic && HasMemsetPattern &&
             // Don't create memset_pattern16s with address spaces.
             StorePtr->getType()->getPointerAddressSpace() == 0 &&
             (PatternValue = getMemSetPatternValue(StoredVal, DL))) {
    // It looks like we can use PatternValue!
    return LegalStoreKind::MemsetPattern;
  }

  // Otherwise, see if the store can be turned into a memcpy.
  if (HasMemcpy) {
    // Check to see if the stride matches the size of the store.  If so, then
    // we know that every byte is touched in the loop.
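    // E.g. an i32 store whose address advances by exactly 4 bytes each
    // iteration writes a contiguous region with no gaps or overlap.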
    APInt Stride = getStoreStride(StoreEv);
    unsigned StoreSize =
        DL->getTypeStoreSize(SI->getValueOperand()->getType());
    if (StoreSize != Stride && StoreSize != -Stride)
      return LegalStoreKind::None;

    // The store must be feeding a non-volatile load.
    LoadInst *LI = dyn_cast<LoadInst>(SI->getValueOperand());

    // Only allow non-volatile loads.
    if (!LI || LI->isVolatile())
      return LegalStoreKind::None;
    // Only allow simple or unordered-atomic loads.
    if (!LI->isUnordered())
      return LegalStoreKind::None;

    // See if the pointer expression is an AddRec like {base,+,1} on the
    // current loop, which indicates a strided load.  If we have something
    // else, it's a random load we can't handle.
    const SCEVAddRecExpr *LoadEv =
        dyn_cast<SCEVAddRecExpr>(SE->getSCEV(LI->getPointerOperand()));
    if (!LoadEv || LoadEv->getLoop() != CurLoop || !LoadEv->isAffine())
      return LegalStoreKind::None;

    // The store and load must share the same stride.
    if (StoreEv->getOperand(1) != LoadEv->getOperand(1))
      return LegalStoreKind::None;

    // Success.  This store can be converted into a memcpy.
    UnorderedAtomic = UnorderedAtomic || LI->isAtomic();
    return UnorderedAtomic ? LegalStoreKind::UnorderedAtomicMemcpy
                           : LegalStoreKind::Memcpy;
  }
  // This store can't be transformed into a memset/memcpy.
  return LegalStoreKind::None;
}

void LoopIdiomRecognize::collectStores(BasicBlock *BB) {
  StoreRefsForMemset.clear();
  StoreRefsForMemsetPattern.clear();
  StoreRefsForMemcpy.clear();
  for (Instruction &I : *BB) {
    StoreInst *SI = dyn_cast<StoreInst>(&I);
    if (!SI)
      continue;

    // Make sure this is a strided store with a constant stride.
    switch (isLegalStore(SI)) {
    case LegalStoreKind::None:
      // Nothing to do.
      break;
    case LegalStoreKind::Memset: {
      // Find the base pointer.
      Value *Ptr = GetUnderlyingObject(SI->getPointerOperand(), *DL);
      StoreRefsForMemset[Ptr].push_back(SI);
    } break;
    case LegalStoreKind::MemsetPattern: {
      // Find the base pointer.
      Value *Ptr = GetUnderlyingObject(SI->getPointerOperand(), *DL);
      StoreRefsForMemsetPattern[Ptr].push_back(SI);
    } break;
    case LegalStoreKind::Memcpy:
    case LegalStoreKind::UnorderedAtomicMemcpy:
      StoreRefsForMemcpy.push_back(SI);
      break;
    default:
      assert(false && "unhandled return value");
      break;
    }
  }
}

/// runOnLoopBlock - Process the specified block, which lives in a counted loop
/// with the specified backedge count.  This block is known to be in the
/// current loop and not in any subloops.
bool LoopIdiomRecognize::runOnLoopBlock(
    BasicBlock *BB, const SCEV *BECount,
    SmallVectorImpl<BasicBlock *> &ExitBlocks) {
  // We can only promote stores in this block if they are unconditionally
  // executed in the loop.  For a block to be unconditionally executed, it has
  // to dominate all the exit blocks of the loop.  Verify this now.
  for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i)
    if (!DT->dominates(BB, ExitBlocks[i]))
      return false;

  bool MadeChange = false;
  // Look for store instructions, which may be optimized to memset/memcpy.
  collectStores(BB);

  // Look for a single store or sets of stores with a common base, which can be
  // optimized into a memset (memset_pattern).  The latter most commonly
  // happens with structs and hand-unrolled loops.
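  // A sketch of the hand-unrolled case (illustrative only, not from a test):
  //   for (i) { A[2*i] = 0; A[2*i+1] = 0; }
  // Both stores form one consecutive chain and collapse into a single memset.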
  for (auto &SL : StoreRefsForMemset)
    MadeChange |= processLoopStores(SL.second, BECount, ForMemset::Yes);

  for (auto &SL : StoreRefsForMemsetPattern)
    MadeChange |= processLoopStores(SL.second, BECount, ForMemset::No);

  // Optimize the store into a memcpy, if it feeds a similarly strided load.
  for (auto &SI : StoreRefsForMemcpy)
    MadeChange |= processLoopStoreOfLoopLoad(SI, BECount);

  for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
    Instruction *Inst = &*I++;
    // Look for memset instructions, which may be optimized to a larger memset.
    if (MemSetInst *MSI = dyn_cast<MemSetInst>(Inst)) {
      WeakTrackingVH InstPtr(&*I);
      if (!processLoopMemSet(MSI, BECount))
        continue;
      MadeChange = true;

      // If processing the memset invalidated our iterator, start over from the
      // top of the block.
      if (!InstPtr)
        I = BB->begin();
      continue;
    }
  }

  return MadeChange;
}

/// See if these stores can be promoted to a memset.
bool LoopIdiomRecognize::processLoopStores(SmallVectorImpl<StoreInst *> &SL,
                                           const SCEV *BECount,
                                           ForMemset For) {
  // Try to find consecutive stores that can be transformed into memsets.
  SetVector<StoreInst *> Heads, Tails;
  SmallDenseMap<StoreInst *, StoreInst *> ConsecutiveChain;

  // Do a quadratic search on all of the given stores and find
  // all of the pairs of stores that follow each other.
  SmallVector<unsigned, 16> IndexQueue;
  for (unsigned i = 0, e = SL.size(); i < e; ++i) {
    assert(SL[i]->isSimple() && "Expected only non-volatile stores.");

    Value *FirstStoredVal = SL[i]->getValueOperand();
    Value *FirstStorePtr = SL[i]->getPointerOperand();
    const SCEVAddRecExpr *FirstStoreEv =
        cast<SCEVAddRecExpr>(SE->getSCEV(FirstStorePtr));
    APInt FirstStride = getStoreStride(FirstStoreEv);
    unsigned FirstStoreSize =
        DL->getTypeStoreSize(SL[i]->getValueOperand()->getType());

    // See if we can optimize just this store in isolation.
    if (FirstStride == FirstStoreSize || -FirstStride == FirstStoreSize) {
      Heads.insert(SL[i]);
      continue;
    }

    Value *FirstSplatValue = nullptr;
    Constant *FirstPatternValue = nullptr;

    if (For == ForMemset::Yes)
      FirstSplatValue = isBytewiseValue(FirstStoredVal, *DL);
    else
      FirstPatternValue = getMemSetPatternValue(FirstStoredVal, DL);

    assert((FirstSplatValue || FirstPatternValue) &&
           "Expected either splat value or pattern value.");

    IndexQueue.clear();
    // If a store has multiple consecutive store candidates, search the Stores
    // array according to the sequence: from i+1 to e, then from i-1 to 0.
    // This is because pairing with the immediately succeeding or preceding
    // candidate usually creates the best chance of finding a memset
    // opportunity.
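    // For an element i in the middle of the list, the visit order is
    // therefore i+1, i+2, ..., e-1, followed by i-1, i-2, ..., 0.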
    unsigned j = 0;
    for (j = i + 1; j < e; ++j)
      IndexQueue.push_back(j);
    for (j = i; j > 0; --j)
      IndexQueue.push_back(j - 1);

    for (auto &k : IndexQueue) {
      assert(SL[k]->isSimple() && "Expected only non-volatile stores.");
      Value *SecondStorePtr = SL[k]->getPointerOperand();
      const SCEVAddRecExpr *SecondStoreEv =
          cast<SCEVAddRecExpr>(SE->getSCEV(SecondStorePtr));
      APInt SecondStride = getStoreStride(SecondStoreEv);

      if (FirstStride != SecondStride)
        continue;

      Value *SecondStoredVal = SL[k]->getValueOperand();
      Value *SecondSplatValue = nullptr;
      Constant *SecondPatternValue = nullptr;

      if (For == ForMemset::Yes)
        SecondSplatValue = isBytewiseValue(SecondStoredVal, *DL);
      else
        SecondPatternValue = getMemSetPatternValue(SecondStoredVal, DL);

      assert((SecondSplatValue || SecondPatternValue) &&
             "Expected either splat value or pattern value.");

      if (isConsecutiveAccess(SL[i], SL[k], *DL, *SE, false)) {
        if (For == ForMemset::Yes) {
          if (isa<UndefValue>(FirstSplatValue))
            FirstSplatValue = SecondSplatValue;
          if (FirstSplatValue != SecondSplatValue)
            continue;
        } else {
          if (isa<UndefValue>(FirstPatternValue))
            FirstPatternValue = SecondPatternValue;
          if (FirstPatternValue != SecondPatternValue)
            continue;
        }
        Tails.insert(SL[k]);
        Heads.insert(SL[i]);
        ConsecutiveChain[SL[i]] = SL[k];
        break;
      }
    }
  }

  // We may run into multiple chains that merge into a single chain.  We mark
  // the stores that we transformed so that we don't visit the same store
  // twice.
  SmallPtrSet<Value *, 16> TransformedStores;
  bool Changed = false;

  // For stores that start but don't end a link in the chain:
  for (SetVector<StoreInst *>::iterator it = Heads.begin(), e = Heads.end();
       it != e; ++it) {
    if (Tails.count(*it))
      continue;

    // We found a store instr that starts a chain.  Now follow the chain and
    // try to transform it.
    SmallPtrSet<Instruction *, 8> AdjacentStores;
    StoreInst *I = *it;

    StoreInst *HeadStore = I;
    unsigned StoreSize = 0;

    // Collect the chain into a list.
    while (Tails.count(I) || Heads.count(I)) {
      if (TransformedStores.count(I))
        break;
      AdjacentStores.insert(I);

      StoreSize += DL->getTypeStoreSize(I->getValueOperand()->getType());
      // Move to the next value in the chain.
      I = ConsecutiveChain[I];
    }

    Value *StoredVal = HeadStore->getValueOperand();
    Value *StorePtr = HeadStore->getPointerOperand();
    const SCEVAddRecExpr *StoreEv = cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
    APInt Stride = getStoreStride(StoreEv);

    // Check to see if the stride matches the size of the stores.  If so, then
    // we know that every byte is touched in the loop.
    if (StoreSize != Stride && StoreSize != -Stride)
      continue;

    bool NegStride = StoreSize == -Stride;

    if (processLoopStridedStore(StorePtr, StoreSize, HeadStore->getAlignment(),
                                StoredVal, HeadStore, AdjacentStores, StoreEv,
                                BECount, NegStride)) {
      TransformedStores.insert(AdjacentStores.begin(), AdjacentStores.end());
      Changed = true;
    }
  }

  return Changed;
}

/// processLoopMemSet - See if this memset can be promoted to a large memset.
bool LoopIdiomRecognize::processLoopMemSet(MemSetInst *MSI,
                                           const SCEV *BECount) {
  // We can only handle non-volatile memsets with a constant size.
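  // For example (an illustrative sketch), a loop that performs
  //   memset(&A[i * 16], 0, 16)
  // on each iteration can be widened into a single memset over the whole
  // range, provided the stride checked below matches the per-iteration size.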
  if (MSI->isVolatile() || !isa<ConstantInt>(MSI->getLength()))
    return false;

  // If we're not allowed to hack on memset, we fail.
  if (!HasMemset)
    return false;

  Value *Pointer = MSI->getDest();

  // See if the pointer expression is an AddRec like {base,+,1} on the current
  // loop, which indicates a strided store.  If we have something else, it's a
  // random store we can't handle.
  const SCEVAddRecExpr *Ev = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Pointer));
  if (!Ev || Ev->getLoop() != CurLoop || !Ev->isAffine())
    return false;

  // Reject memsets that are so large that they overflow an unsigned.
  uint64_t SizeInBytes = cast<ConstantInt>(MSI->getLength())->getZExtValue();
  if ((SizeInBytes >> 32) != 0)
    return false;

  // Check to see if the stride matches the size of the memset.  If so, then we
  // know that every byte is touched in the loop.
  const SCEVConstant *ConstStride = dyn_cast<SCEVConstant>(Ev->getOperand(1));
  if (!ConstStride)
    return false;

  APInt Stride = ConstStride->getAPInt();
  if (SizeInBytes != Stride && SizeInBytes != -Stride)
    return false;

  // Verify that the memset value is loop invariant.  If not, we can't promote
  // the memset.
  Value *SplatValue = MSI->getValue();
  if (!SplatValue || !CurLoop->isLoopInvariant(SplatValue))
    return false;

  SmallPtrSet<Instruction *, 1> MSIs;
  MSIs.insert(MSI);
  bool NegStride = SizeInBytes == -Stride;
  return processLoopStridedStore(Pointer, (unsigned)SizeInBytes,
                                 MSI->getDestAlignment(), SplatValue, MSI, MSIs,
                                 Ev, BECount, NegStride, /*IsLoopMemset=*/true);
}

/// mayLoopAccessLocation - Return true if the specified loop might access the
/// specified pointer location, which is a loop-strided access.  The 'Access'
/// argument specifies what the verboten forms of access are (read or write).
static bool
mayLoopAccessLocation(Value *Ptr, ModRefInfo Access, Loop *L,
                      const SCEV *BECount, unsigned StoreSize,
                      AliasAnalysis &AA,
                      SmallPtrSetImpl<Instruction *> &IgnoredStores) {
  // Get the location that may be stored across the loop.  Since the access is
  // strided positively through memory, we say that the modified location
  // starts at the pointer and has infinite size.
  LocationSize AccessSize = LocationSize::unknown();

  // If the loop iterates a fixed number of times, we can refine the access
  // size to be exactly the size of the memset, which is (BECount+1)*StoreSize.
  if (const SCEVConstant *BECst = dyn_cast<SCEVConstant>(BECount))
    AccessSize = LocationSize::precise((BECst->getValue()->getZExtValue() + 1) *
                                       StoreSize);

  // TODO: For this to be really effective, we have to dive into the pointer
  // operand in the store.  A store to &A[i] over 100 iterations will always be
  // reported as may-alias with a store to &A[100]; we would need StoreLoc to
  // be "A" with a size of 100, which would then be no-alias with a store to
  // &A[100].
  MemoryLocation StoreLoc(Ptr, AccessSize);

  for (Loop::block_iterator BI = L->block_begin(), E = L->block_end(); BI != E;
       ++BI)
    for (Instruction &I : **BI)
      if (IgnoredStores.count(&I) == 0 &&
          isModOrRefSet(
              intersectModRef(AA.getModRefInfo(&I, StoreLoc), Access)))
        return true;

  return false;
}

// If we have a negative stride, Start refers to the end of the memory
// location we're trying to memset.
// Therefore, we need to recompute the base pointer, which is just
// Start - BECount*Size.
static const SCEV *getStartForNegStride(const SCEV *Start, const SCEV *BECount,
                                        Type *IntPtr, unsigned StoreSize,
                                        ScalarEvolution *SE) {
  const SCEV *Index = SE->getTruncateOrZeroExtend(BECount, IntPtr);
  if (StoreSize != 1)
    Index = SE->getMulExpr(Index, SE->getConstant(IntPtr, StoreSize),
                           SCEV::FlagNUW);
  return SE->getMinusSCEV(Start, Index);
}

/// Compute the number of bytes as a SCEV from the backedge taken count.
///
/// This also maps the SCEV into the provided type and tries to handle the
/// computation in a way that will fold cleanly.
static const SCEV *getNumBytes(const SCEV *BECount, Type *IntPtr,
                               unsigned StoreSize, Loop *CurLoop,
                               const DataLayout *DL, ScalarEvolution *SE) {
  const SCEV *NumBytesS;
  // The # stored bytes is (BECount+1)*Size.  Expand the trip count out to
  // pointer size if it isn't already.
  //
  // If we're going to need to zero extend the BE count, check if we can add
  // one to it prior to zero extending without overflow.  Provided this is
  // safe, it allows better simplification of the +1.
  if (DL->getTypeSizeInBits(BECount->getType()) <
          DL->getTypeSizeInBits(IntPtr) &&
      SE->isLoopEntryGuardedByCond(
          CurLoop, ICmpInst::ICMP_NE, BECount,
          SE->getNegativeSCEV(SE->getOne(BECount->getType())))) {
    NumBytesS = SE->getZeroExtendExpr(
        SE->getAddExpr(BECount, SE->getOne(BECount->getType()), SCEV::FlagNUW),
        IntPtr);
  } else {
    NumBytesS = SE->getAddExpr(SE->getTruncateOrZeroExtend(BECount, IntPtr),
                               SE->getOne(IntPtr), SCEV::FlagNUW);
  }

  // And scale it based on the store size.
  if (StoreSize != 1) {
    NumBytesS = SE->getMulExpr(NumBytesS, SE->getConstant(IntPtr, StoreSize),
                               SCEV::FlagNUW);
  }
  return NumBytesS;
}

/// processLoopStridedStore - We see a strided store of some value.  If we can
/// transform this into a memset or memset_pattern in the loop preheader, do
/// so.
bool LoopIdiomRecognize::processLoopStridedStore(
    Value *DestPtr, unsigned StoreSize, unsigned StoreAlignment,
    Value *StoredVal, Instruction *TheStore,
    SmallPtrSetImpl<Instruction *> &Stores, const SCEVAddRecExpr *Ev,
    const SCEV *BECount, bool NegStride, bool IsLoopMemset) {
  Value *SplatValue = isBytewiseValue(StoredVal, *DL);
  Constant *PatternValue = nullptr;

  if (!SplatValue)
    PatternValue = getMemSetPatternValue(StoredVal, DL);

  assert((SplatValue || PatternValue) &&
         "Expected either splat value or pattern value.");

  // The trip count of the loop and the base pointer of the addrec SCEV are
  // guaranteed to be loop invariant, which means that they should dominate
  // the header.  This allows us to insert code for them in the preheader.
  unsigned DestAS = DestPtr->getType()->getPointerAddressSpace();
  BasicBlock *Preheader = CurLoop->getLoopPreheader();
  IRBuilder<> Builder(Preheader->getTerminator());
  SCEVExpander Expander(*SE, *DL, "loop-idiom");

  Type *DestInt8PtrTy = Builder.getInt8PtrTy(DestAS);
  Type *IntPtr = Builder.getIntPtrTy(*DL, DestAS);

  const SCEV *Start = Ev->getStart();
  // Handle negative strided loops.
  if (NegStride)
    Start = getStartForNegStride(Start, BECount, IntPtr, StoreSize, SE);

  // TODO: ideally we should still be able to generate memset if SCEV expander
  // is taught to generate the dependencies at the latest point.
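  // (isSafeToExpand guards against expansions that could introduce faulting
  // instructions at this point, e.g. a division whose divisor is not known to
  // be non-zero.)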
  if (!isSafeToExpand(Start, *SE))
    return false;

  // Okay, we have a strided store "p[i]" of a splattable value.  We can turn
  // this into a memset in the loop preheader now if we want.  However, this
  // would be unsafe to do if there is anything else in the loop that may read
  // or write to the aliased location.  Check for any overlap by generating the
  // base pointer and checking the region.
  Value *BasePtr =
      Expander.expandCodeFor(Start, DestInt8PtrTy, Preheader->getTerminator());
  if (mayLoopAccessLocation(BasePtr, ModRefInfo::ModRef, CurLoop, BECount,
                            StoreSize, *AA, Stores)) {
    Expander.clear();
    // If we generated new code for the base pointer, clean up.
    RecursivelyDeleteTriviallyDeadInstructions(BasePtr, TLI);
    return false;
  }

  if (avoidLIRForMultiBlockLoop(/*IsMemset=*/true, IsLoopMemset))
    return false;

  // Okay, everything looks good, insert the memset.

  const SCEV *NumBytesS =
      getNumBytes(BECount, IntPtr, StoreSize, CurLoop, DL, SE);

  // TODO: ideally we should still be able to generate memset if SCEV expander
  // is taught to generate the dependencies at the latest point.
  if (!isSafeToExpand(NumBytesS, *SE))
    return false;

  Value *NumBytes =
      Expander.expandCodeFor(NumBytesS, IntPtr, Preheader->getTerminator());

  CallInst *NewCall;
  if (SplatValue) {
    NewCall =
        Builder.CreateMemSet(BasePtr, SplatValue, NumBytes, StoreAlignment);
  } else {
    // Everything is emitted in the default address space.
    Type *Int8PtrTy = DestInt8PtrTy;

    Module *M = TheStore->getModule();
    StringRef FuncName = "memset_pattern16";
    FunctionCallee MSP = M->getOrInsertFunction(FuncName, Builder.getVoidTy(),
                                                Int8PtrTy, Int8PtrTy, IntPtr);
    inferLibFuncAttributes(M, FuncName, *TLI);

    // Otherwise we should form a memset_pattern16.  PatternValue is known to
    // be a constant array of 16 bytes.  Plop the value into a mergable global.
    GlobalVariable *GV = new GlobalVariable(*M, PatternValue->getType(), true,
                                            GlobalValue::PrivateLinkage,
                                            PatternValue, ".memset_pattern");
    GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); // Ok to merge these.
    GV->setAlignment(Align(16));
    Value *PatternPtr = ConstantExpr::getBitCast(GV, Int8PtrTy);
    NewCall = Builder.CreateCall(MSP, {BasePtr, PatternPtr, NumBytes});
  }

  LLVM_DEBUG(dbgs() << "  Formed memset: " << *NewCall << "\n"
                    << "    from store to: " << *Ev << " at: " << *TheStore
                    << "\n");
  NewCall->setDebugLoc(TheStore->getDebugLoc());

  ORE.emit([&]() {
    return OptimizationRemark(DEBUG_TYPE, "ProcessLoopStridedStore",
                              NewCall->getDebugLoc(), Preheader)
           << "Transformed loop-strided store into a call to "
           << ore::NV("NewFunction", NewCall->getCalledFunction())
           << "() function";
  });

  // Okay, the memset has been formed.  Zap the original store and anything
  // that feeds into it.
  for (auto *I : Stores)
    deleteDeadInstruction(I);
  ++NumMemSet;
  return true;
}

/// If the stored value is a strided load in the same loop with the same
/// stride, this may be transformable into a memcpy.
/// This kicks in for stuff like
///   for (i) A[i] = B[i];
bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI,
                                                    const SCEV *BECount) {
  assert(SI->isUnordered() && "Expected only non-volatile non-ordered stores.");

  Value *StorePtr = SI->getPointerOperand();
  const SCEVAddRecExpr *StoreEv = cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
  APInt Stride = getStoreStride(StoreEv);
  unsigned StoreSize = DL->getTypeStoreSize(SI->getValueOperand()->getType());
  bool NegStride = StoreSize == -Stride;

  // The store must be feeding a non-volatile load.
  LoadInst *LI = cast<LoadInst>(SI->getValueOperand());
  assert(LI->isUnordered() && "Expected only non-volatile non-ordered loads.");

  // See if the pointer expression is an AddRec like {base,+,1} on the current
  // loop, which indicates a strided load.  If we have something else, it's a
  // random load we can't handle.
  const SCEVAddRecExpr *LoadEv =
      cast<SCEVAddRecExpr>(SE->getSCEV(LI->getPointerOperand()));

  // The trip count of the loop and the base pointer of the addrec SCEV are
  // guaranteed to be loop invariant, which means that they should dominate
  // the header.  This allows us to insert code for them in the preheader.
  BasicBlock *Preheader = CurLoop->getLoopPreheader();
  IRBuilder<> Builder(Preheader->getTerminator());
  SCEVExpander Expander(*SE, *DL, "loop-idiom");

  const SCEV *StrStart = StoreEv->getStart();
  unsigned StrAS = SI->getPointerAddressSpace();
  Type *IntPtrTy = Builder.getIntPtrTy(*DL, StrAS);

  // Handle negative strided loops.
  if (NegStride)
    StrStart = getStartForNegStride(StrStart, BECount, IntPtrTy, StoreSize, SE);

  // Okay, we have a strided store "p[i]" of a loaded value.  We can turn
  // this into a memcpy in the loop preheader now if we want.  However, this
  // would be unsafe to do if there is anything else in the loop that may read
  // or write the memory region we're storing to.  This includes the load that
  // feeds the stores.  Check for an alias by generating the base address and
  // checking everything.
  Value *StoreBasePtr = Expander.expandCodeFor(
      StrStart, Builder.getInt8PtrTy(StrAS), Preheader->getTerminator());

  SmallPtrSet<Instruction *, 1> Stores;
  Stores.insert(SI);
  if (mayLoopAccessLocation(StoreBasePtr, ModRefInfo::ModRef, CurLoop, BECount,
                            StoreSize, *AA, Stores)) {
    Expander.clear();
    // If we generated new code for the base pointer, clean up.
    RecursivelyDeleteTriviallyDeadInstructions(StoreBasePtr, TLI);
    return false;
  }

  const SCEV *LdStart = LoadEv->getStart();
  unsigned LdAS = LI->getPointerAddressSpace();

  // Handle negative strided loops.
  if (NegStride)
    LdStart = getStartForNegStride(LdStart, BECount, IntPtrTy, StoreSize, SE);

  // For a memcpy, we have to make sure that the input array is not being
  // mutated by the loop.
  Value *LoadBasePtr = Expander.expandCodeFor(
      LdStart, Builder.getInt8PtrTy(LdAS), Preheader->getTerminator());

  if (mayLoopAccessLocation(LoadBasePtr, ModRefInfo::Mod, CurLoop, BECount,
                            StoreSize, *AA, Stores)) {
    Expander.clear();
    // If we generated new code for the base pointer, clean up.
    RecursivelyDeleteTriviallyDeadInstructions(LoadBasePtr, TLI);
    RecursivelyDeleteTriviallyDeadInstructions(StoreBasePtr, TLI);
    return false;
  }

  if (avoidLIRForMultiBlockLoop())
    return false;

  // Okay, everything is safe, we can transform this!

  const SCEV *NumBytesS =
      getNumBytes(BECount, IntPtrTy, StoreSize, CurLoop, DL, SE);

  Value *NumBytes =
      Expander.expandCodeFor(NumBytesS, IntPtrTy, Preheader->getTerminator());

  CallInst *NewCall = nullptr;
  // Check whether to generate an unordered atomic memcpy:
  //  If the load or store is atomic, then it must necessarily be unordered
  //  by previous checks.
  if (!SI->isAtomic() && !LI->isAtomic())
    NewCall = Builder.CreateMemCpy(StoreBasePtr, SI->getAlignment(),
                                   LoadBasePtr, LI->getAlignment(), NumBytes);
  else {
    // We cannot allow unaligned ops for unordered load/store, so reject
    // anything where the alignment isn't at least the element size.
    unsigned Align = std::min(SI->getAlignment(), LI->getAlignment());
    if (Align < StoreSize)
      return false;

    // If the element.atomic memcpy is not lowered into explicit
    // loads/stores later, then it will be lowered into an element-size
    // specific lib call.  If the lib call doesn't exist for our store size,
    // then we shouldn't generate the memcpy.
    if (StoreSize > TTI->getAtomicMemIntrinsicMaxElementSize())
      return false;

    // Create the call.
    // Note that unordered atomic loads/stores are *required* by the spec to
    // have an alignment but non-atomic loads/stores may not.
    NewCall = Builder.CreateElementUnorderedAtomicMemCpy(
        StoreBasePtr, SI->getAlignment(), LoadBasePtr, LI->getAlignment(),
        NumBytes, StoreSize);
  }
  NewCall->setDebugLoc(SI->getDebugLoc());

  LLVM_DEBUG(dbgs() << "  Formed memcpy: " << *NewCall << "\n"
                    << "    from load ptr=" << *LoadEv << " at: " << *LI << "\n"
                    << "    from store ptr=" << *StoreEv << " at: " << *SI
                    << "\n");

  ORE.emit([&]() {
    return OptimizationRemark(DEBUG_TYPE, "ProcessLoopStoreOfLoopLoad",
                              NewCall->getDebugLoc(), Preheader)
           << "Formed a call to "
           << ore::NV("NewFunction", NewCall->getCalledFunction())
           << "() function";
  });

  // Okay, the memcpy has been formed.  Zap the original store and anything
  // that feeds into it.
  deleteDeadInstruction(SI);
  ++NumMemCpy;
  return true;
}

// When compiling for code size we avoid idiom recognition for a multi-block
// loop unless it is a loop_memset idiom or a memset/memcpy idiom in a nested
// loop.
bool LoopIdiomRecognize::avoidLIRForMultiBlockLoop(bool IsMemset,
                                                   bool IsLoopMemset) {
  if (ApplyCodeSizeHeuristics && CurLoop->getNumBlocks() > 1) {
    if (!CurLoop->getParentLoop() && (!IsMemset || !IsLoopMemset)) {
      LLVM_DEBUG(dbgs() << "  " << CurLoop->getHeader()->getParent()->getName()
                        << " : LIR " << (IsMemset ? "Memset" : "Memcpy")
                        << " avoided: multi-block top-level loop\n");
      return true;
    }
  }

  return false;
}

bool LoopIdiomRecognize::runOnNoncountableLoop() {
  LLVM_DEBUG(dbgs() << DEBUG_TYPE " Scanning: F["
                    << CurLoop->getHeader()->getParent()->getName()
                    << "] Noncountable Loop %"
                    << CurLoop->getHeader()->getName() << "\n");

  return recognizeBCmp() || recognizePopcount() || recognizeAndInsertFFS();
}

/// Check if the given conditional branch is based on a comparison between
/// a variable and zero, and if the variable is non-zero (or zero, when
/// JmpOnZero is true), control branches to the loop entry.  If the branch
/// matches this behavior, the variable involved in the comparison is
/// returned.  This function is called to see if the precondition and
/// postcondition of the loop are in desirable form.
static Value *matchCondition(BranchInst *BI, BasicBlock *LoopEntry,
                             bool JmpOnZero = false) {
  if (!BI || !BI->isConditional())
    return nullptr;

  ICmpInst *Cond = dyn_cast<ICmpInst>(BI->getCondition());
  if (!Cond)
    return nullptr;

  ConstantInt *CmpZero = dyn_cast<ConstantInt>(Cond->getOperand(1));
  if (!CmpZero || !CmpZero->isZero())
    return nullptr;

  BasicBlock *TrueSucc = BI->getSuccessor(0);
  BasicBlock *FalseSucc = BI->getSuccessor(1);
  if (JmpOnZero)
    std::swap(TrueSucc, FalseSucc);

  ICmpInst::Predicate Pred = Cond->getPredicate();
  if ((Pred == ICmpInst::ICMP_NE && TrueSucc == LoopEntry) ||
      (Pred == ICmpInst::ICMP_EQ && FalseSucc == LoopEntry))
    return Cond->getOperand(0);

  return nullptr;
}

// Check if the recurrence variable `VarX` is in the right form to create
// the idiom.  Returns the value coerced to a PHINode if so.
static PHINode *getRecurrenceVar(Value *VarX, Instruction *DefX,
                                 BasicBlock *LoopEntry) {
  auto *PhiX = dyn_cast<PHINode>(VarX);
  if (PhiX && PhiX->getParent() == LoopEntry &&
      (PhiX->getOperand(0) == DefX || PhiX->getOperand(1) == DefX))
    return PhiX;
  return nullptr;
}

/// Return true iff the idiom is detected in the loop.
///
/// Additionally:
/// 1) \p CntInst is set to the instruction counting the population bit.
/// 2) \p CntPhi is set to the corresponding phi node.
/// 3) \p Var is set to the value whose population bits are being counted.
///
/// The core idiom we are trying to detect is:
/// \code
///    if (x0 != 0)
///      goto loop-exit // the precondition of the loop
///    cnt0 = init-val;
///    do {
///       x1 = phi (x0, x2);
///       cnt1 = phi(cnt0, cnt2);
///
///       cnt2 = cnt1 + 1;
///        ...
///       x2 = x1 & (x1 - 1);
///        ...
///    } while(x != 0);
///
/// loop-exit:
/// \endcode
static bool detectPopcountIdiom(Loop *CurLoop, BasicBlock *PreCondBB,
                                Instruction *&CntInst, PHINode *&CntPhi,
                                Value *&Var) {
  // step 1: Check to see if the look-back branch matches this pattern:
  //  "if (a!=0) goto loop-entry".
  BasicBlock *LoopEntry;
  Instruction *DefX2, *CountInst;
  Value *VarX1, *VarX0;
  PHINode *PhiX, *CountPhi;

  DefX2 = CountInst = nullptr;
  VarX1 = VarX0 = nullptr;
  PhiX = CountPhi = nullptr;
  LoopEntry = *(CurLoop->block_begin());

  // step 1: Check if the loop-back branch is in desirable form.
  {
    if (Value *T = matchCondition(
            dyn_cast<BranchInst>(LoopEntry->getTerminator()), LoopEntry))
      DefX2 = dyn_cast<Instruction>(T);
    else
      return false;
  }

  // step 2: detect instructions corresponding to "x2 = x1 & (x1 - 1)"
  {
    if (!DefX2 || DefX2->getOpcode() != Instruction::And)
      return false;

    BinaryOperator *SubOneOp;

    if ((SubOneOp = dyn_cast<BinaryOperator>(DefX2->getOperand(0))))
      VarX1 = DefX2->getOperand(1);
    else {
      VarX1 = DefX2->getOperand(0);
      SubOneOp = dyn_cast<BinaryOperator>(DefX2->getOperand(1));
    }
    if (!SubOneOp || SubOneOp->getOperand(0) != VarX1)
      return false;

    ConstantInt *Dec = dyn_cast<ConstantInt>(SubOneOp->getOperand(1));
    if (!Dec ||
        !((SubOneOp->getOpcode() == Instruction::Sub && Dec->isOne()) ||
          (SubOneOp->getOpcode() == Instruction::Add &&
           Dec->isMinusOne()))) {
      return false;
    }
  }

  // step 3: Check the recurrence of variable X
  PhiX = getRecurrenceVar(VarX1, DefX2, LoopEntry);
  if (!PhiX)
    return false;

  // step 4: Find the instruction which counts the population: cnt2 = cnt1 + 1
  {
    CountInst = nullptr;
    for (BasicBlock::iterator Iter = LoopEntry->getFirstNonPHI()->getIterator(),
                              IterE = LoopEntry->end();
         Iter != IterE; Iter++) {
      Instruction *Inst = &*Iter;
      if (Inst->getOpcode() != Instruction::Add)
        continue;

      ConstantInt *Inc = dyn_cast<ConstantInt>(Inst->getOperand(1));
      if (!Inc || !Inc->isOne())
        continue;

      PHINode *Phi = getRecurrenceVar(Inst->getOperand(0), Inst, LoopEntry);
      if (!Phi)
        continue;

      // Check if the result of the instruction is live out of the loop.
      bool LiveOutLoop = false;
      for (User *U : Inst->users()) {
        if ((cast<Instruction>(U))->getParent() != LoopEntry) {
          LiveOutLoop = true;
          break;
        }
      }

      if (LiveOutLoop) {
        CountInst = Inst;
        CountPhi = Phi;
        break;
      }
    }

    if (!CountInst)
      return false;
  }

  // step 5: check if the precondition is in this form:
  //   "if (x != 0) goto loop-head ; else goto somewhere-we-don't-care;"
  {
    auto *PreCondBr = dyn_cast<BranchInst>(PreCondBB->getTerminator());
    Value *T = matchCondition(PreCondBr, CurLoop->getLoopPreheader());
    if (T != PhiX->getOperand(0) && T != PhiX->getOperand(1))
      return false;

    CntInst = CountInst;
    CntPhi = CountPhi;
    Var = T;
  }

  return true;
}

/// Return true if the idiom is detected in the loop.
///
/// Additionally:
/// 1) \p CntInst is set to the instruction counting the leading zeros (CTLZ),
///    or nullptr if there is no such instruction.
/// 2) \p CntPhi is set to the corresponding phi node, or nullptr if there is
///    no such node.
/// 3) \p Var is set to the value whose CTLZ could be used.
/// 4) \p DefX is set to the instruction calculating the loop exit condition.
///
/// The core idiom we are trying to detect is:
/// \code
///    if (x0 == 0)
///      goto loop-exit // the precondition of the loop
///    cnt0 = init-val;
///    do {
///       x = phi (x0, x.next);   //PhiX
///       cnt = phi(cnt0, cnt.next);
///
///       cnt.next = cnt + 1;
///        ...
///       x.next = x >> 1;   // DefX
///        ...
///    } while(x.next != 0);
///
/// loop-exit:
/// \endcode
static bool detectShiftUntilZeroIdiom(Loop *CurLoop, const DataLayout &DL,
                                      Intrinsic::ID &IntrinID, Value *&InitX,
                                      Instruction *&CntInst, PHINode *&CntPhi,
                                      Instruction *&DefX) {
  BasicBlock *LoopEntry;
  Value *VarX = nullptr;

  DefX = nullptr;
  CntInst = nullptr;
  CntPhi = nullptr;
  LoopEntry = *(CurLoop->block_begin());

  // step 1: Check if the loop-back branch is in desirable form.
  if (Value *T = matchCondition(
          dyn_cast<BranchInst>(LoopEntry->getTerminator()), LoopEntry))
    DefX = dyn_cast<Instruction>(T);
  else
    return false;

  // step 2: detect instructions corresponding to "x.next = x >> 1 or x << 1"
  if (!DefX || !DefX->isShift())
    return false;
  IntrinID = DefX->getOpcode() == Instruction::Shl ? Intrinsic::cttz :
                                                     Intrinsic::ctlz;
  ConstantInt *Shft = dyn_cast<ConstantInt>(DefX->getOperand(1));
  if (!Shft || !Shft->isOne())
    return false;
  VarX = DefX->getOperand(0);

  // step 3: Check the recurrence of variable X
  PHINode *PhiX = getRecurrenceVar(VarX, DefX, LoopEntry);
  if (!PhiX)
    return false;

  InitX = PhiX->getIncomingValueForBlock(CurLoop->getLoopPreheader());

  // Make sure the initial value can't be negative; otherwise the ashr in the
  // loop might never reach zero, which would make the loop infinite.
  if (DefX->getOpcode() == Instruction::AShr && !isKnownNonNegative(InitX, DL))
    return false;

  // step 4: Find the instruction which counts the CTLZ: cnt.next = cnt + 1
  // TODO: We can skip this step.  If the loop trip count is known (CTLZ),
  //       then all uses of "cnt.next" could be optimized to the trip count
  //       plus "cnt0".  Currently it is not optimized.
  //       This step could be used to detect the POPCNT instruction:
  //       cnt.next = cnt + (x.next & 1)
  for (BasicBlock::iterator Iter = LoopEntry->getFirstNonPHI()->getIterator(),
                            IterE = LoopEntry->end();
       Iter != IterE; Iter++) {
    Instruction *Inst = &*Iter;
    if (Inst->getOpcode() != Instruction::Add)
      continue;

    ConstantInt *Inc = dyn_cast<ConstantInt>(Inst->getOperand(1));
    if (!Inc || !Inc->isOne())
      continue;

    PHINode *Phi = getRecurrenceVar(Inst->getOperand(0), Inst, LoopEntry);
    if (!Phi)
      continue;

    CntInst = Inst;
    CntPhi = Phi;
    break;
  }
  if (!CntInst)
    return false;

  return true;
}

/// Recognize a CTLZ or CTTZ idiom in a non-countable loop and convert the loop
/// to a countable one (with a CTLZ / CTTZ trip count).  If CTLZ / CTTZ is
/// inserted as a new trip count, returns true; otherwise, returns false.
bool LoopIdiomRecognize::recognizeAndInsertFFS() {
  // Give up if the loop has multiple blocks or multiple backedges.
  if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1)
    return false;

  Intrinsic::ID IntrinID;
  Value *InitX;
  Instruction *DefX = nullptr;
  PHINode *CntPhi = nullptr;
  Instruction *CntInst = nullptr;
  // Help decide if the transformation is profitable.  For the ShiftUntilZero
  // idiom, this is always 6.
1506 size_t IdiomCanonicalSize = 6; 1507 1508 if (!detectShiftUntilZeroIdiom(CurLoop, *DL, IntrinID, InitX, 1509 CntInst, CntPhi, DefX)) 1510 return false; 1511 1512 bool IsCntPhiUsedOutsideLoop = false; 1513 for (User *U : CntPhi->users()) 1514 if (!CurLoop->contains(cast<Instruction>(U))) { 1515 IsCntPhiUsedOutsideLoop = true; 1516 break; 1517 } 1518 bool IsCntInstUsedOutsideLoop = false; 1519 for (User *U : CntInst->users()) 1520 if (!CurLoop->contains(cast<Instruction>(U))) { 1521 IsCntInstUsedOutsideLoop = true; 1522 break; 1523 } 1524 // If both CntInst and CntPhi are used outside the loop the profitability 1525 // is questionable. 1526 if (IsCntInstUsedOutsideLoop && IsCntPhiUsedOutsideLoop) 1527 return false; 1528 1529 // For some CPUs the result of the CTLZ(X) intrinsic is undefined 1530 // when X is 0. If we cannot guarantee X != 0, we need to check this 1531 // when expanding. 1532 bool ZeroCheck = false; 1533 // It is safe to assume the Preheader exists, as it was checked in 1534 // the parent function runOnLoop. 1535 BasicBlock *PH = CurLoop->getLoopPreheader(); 1536 1537 // If we are using the count instruction outside the loop, make sure we 1538 // have a zero check as a precondition. Without the check the loop would run 1539 // one iteration before any check of the input value. This means 0 and 1 1540 // would have identical behavior in the original loop, whereas the CTLZ/CTTZ-based count distinguishes them, so the zero check is required. 1541 if (!IsCntPhiUsedOutsideLoop) { 1542 auto *PreCondBB = PH->getSinglePredecessor(); 1543 if (!PreCondBB) 1544 return false; 1545 auto *PreCondBI = dyn_cast<BranchInst>(PreCondBB->getTerminator()); 1546 if (!PreCondBI) 1547 return false; 1548 if (matchCondition(PreCondBI, PH) != InitX) 1549 return false; 1550 ZeroCheck = true; 1551 } 1552 1553 // Check if CTLZ / CTTZ intrinsic is profitable. Assume it is always 1554 // profitable if we delete the loop. 1555 1556 // The loop has only 6 instructions: 1557 // %n.addr.0 = phi [ %n, %entry ], [ %shr, %while.cond ] 1558 // %i.0 = phi [ %i0, %entry ], [ %inc, %while.cond ] 1559 // %shr = ashr %n.addr.0, 1 1560 // %tobool = icmp eq %shr, 0 1561 // %inc = add nsw %i.0, 1 1562 // br i1 %tobool 1563 1564 const Value *Args[] = 1565 {InitX, ZeroCheck ? ConstantInt::getTrue(InitX->getContext()) 1566 : ConstantInt::getFalse(InitX->getContext())}; 1567 1568 // @llvm.dbg intrinsics don't count, as they have no semantic effect. 1569 auto InstWithoutDebugIt = CurLoop->getHeader()->instructionsWithoutDebug(); 1570 uint32_t HeaderSize = 1571 std::distance(InstWithoutDebugIt.begin(), InstWithoutDebugIt.end()); 1572 1573 if (HeaderSize != IdiomCanonicalSize && 1574 TTI->getIntrinsicCost(IntrinID, InitX->getType(), Args) > 1575 TargetTransformInfo::TCC_Basic) 1576 return false; 1577 1578 transformLoopToCountable(IntrinID, PH, CntInst, CntPhi, InitX, DefX, 1579 DefX->getDebugLoc(), ZeroCheck, 1580 IsCntPhiUsedOutsideLoop); 1581 return true; 1582 } 1583 1584 /// Recognizes a population count idiom in a non-countable loop. 1585 /// 1586 /// If detected, transforms the relevant code to issue the popcount intrinsic 1587 /// function call, and returns true; otherwise, returns false. 1588 bool LoopIdiomRecognize::recognizePopcount() { 1589 if (TTI->getPopcntSupport(32) != TargetTransformInfo::PSK_FastHardware) 1590 return false; 1591 1592 // Counting population is usually done with a few arithmetic instructions. 1593 // Such instructions can be easily "absorbed" by vacant slots in a 1594 // non-compact loop. Therefore, recognizing the popcount idiom only makes sense 1595 // in a compact loop.
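  // Purely as an illustrative sketch (this is the canonical form that
  // detectPopcountIdiom() looks for; see also transformLoopToPopcount()):
  //   int cnt = 0;
  //   if (x) do { cnt++; x &= x - 1; } while (x);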
1596 1597 // Give up if the loop has multiple blocks or multiple backedges. 1598 if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1) 1599 return false; 1600 1601 BasicBlock *LoopBody = *(CurLoop->block_begin()); 1602 if (LoopBody->size() >= 20) { 1603 // The loop is too big, bail out. 1604 return false; 1605 } 1606 1607 // It should have a preheader containing nothing but an unconditional branch. 1608 BasicBlock *PH = CurLoop->getLoopPreheader(); 1609 if (!PH || &PH->front() != PH->getTerminator()) 1610 return false; 1611 auto *EntryBI = dyn_cast<BranchInst>(PH->getTerminator()); 1612 if (!EntryBI || EntryBI->isConditional()) 1613 return false; 1614 1615 // It should have a precondition block where the generated popcount intrinsic 1616 // function can be inserted. 1617 auto *PreCondBB = PH->getSinglePredecessor(); 1618 if (!PreCondBB) 1619 return false; 1620 auto *PreCondBI = dyn_cast<BranchInst>(PreCondBB->getTerminator()); 1621 if (!PreCondBI || PreCondBI->isUnconditional()) 1622 return false; 1623 1624 Instruction *CntInst; 1625 PHINode *CntPhi; 1626 Value *Val; 1627 if (!detectPopcountIdiom(CurLoop, PreCondBB, CntInst, CntPhi, Val)) 1628 return false; 1629 1630 transformLoopToPopcount(PreCondBB, CntInst, CntPhi, Val); 1631 return true; 1632 } 1633 1634 static CallInst *createPopcntIntrinsic(IRBuilder<> &IRBuilder, Value *Val, 1635 const DebugLoc &DL) { 1636 Value *Ops[] = {Val}; 1637 Type *Tys[] = {Val->getType()}; 1638 1639 Module *M = IRBuilder.GetInsertBlock()->getParent()->getParent(); 1640 Function *Func = Intrinsic::getDeclaration(M, Intrinsic::ctpop, Tys); 1641 CallInst *CI = IRBuilder.CreateCall(Func, Ops); 1642 CI->setDebugLoc(DL); 1643 1644 return CI; 1645 } 1646 1647 static CallInst *createFFSIntrinsic(IRBuilder<> &IRBuilder, Value *Val, 1648 const DebugLoc &DL, bool ZeroCheck, 1649 Intrinsic::ID IID) { 1650 Value *Ops[] = {Val, ZeroCheck ? IRBuilder.getTrue() : IRBuilder.getFalse()}; 1651 Type *Tys[] = {Val->getType()}; 1652 1653 Module *M = IRBuilder.GetInsertBlock()->getParent()->getParent(); 1654 Function *Func = Intrinsic::getDeclaration(M, IID, Tys); 1655 CallInst *CI = IRBuilder.CreateCall(Func, Ops); 1656 CI->setDebugLoc(DL); 1657 1658 return CI; 1659 } 1660 1661 /// Transform the following loop (Using CTLZ, CTTZ is similar): 1662 /// loop: 1663 /// CntPhi = PHI [Cnt0, CntInst] 1664 /// PhiX = PHI [InitX, DefX] 1665 /// CntInst = CntPhi + 1 1666 /// DefX = PhiX >> 1 1667 /// LOOP_BODY 1668 /// Br: loop if (DefX != 0) 1669 /// Use(CntPhi) or Use(CntInst) 1670 /// 1671 /// Into: 1672 /// If CntPhi used outside the loop: 1673 /// CountPrev = BitWidth(InitX) - CTLZ(InitX >> 1) 1674 /// Count = CountPrev + 1 1675 /// else 1676 /// Count = BitWidth(InitX) - CTLZ(InitX) 1677 /// loop: 1678 /// CntPhi = PHI [Cnt0, CntInst] 1679 /// PhiX = PHI [InitX, DefX] 1680 /// PhiCount = PHI [Count, Dec] 1681 /// CntInst = CntPhi + 1 1682 /// DefX = PhiX >> 1 1683 /// Dec = PhiCount - 1 1684 /// LOOP_BODY 1685 /// Br: loop if (Dec != 0) 1686 /// Use(CountPrev + Cnt0) // Use(CntPhi) 1687 /// or 1688 /// Use(Count + Cnt0) // Use(CntInst) 1689 /// 1690 /// If LOOP_BODY is empty the loop will be deleted. 1691 /// If CntInst and DefX are not used in LOOP_BODY they will be removed. 
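/// As a rough source-level sketch (illustrative only; assuming an LShr-based
/// loop, a 32-bit InitX, and InitX known non-zero; not an exact rendering of
/// the emitted IR):
/// \code
///   cnt = cnt0;
///   do { cnt++; x >>= 1; } while (x != 0);
/// \endcode
/// conceptually becomes cnt = cnt0 + (32 - ctlz(InitX)), with the loop itself
/// kept (in countable form) only when LOOP_BODY is non-empty.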
1692 void LoopIdiomRecognize::transformLoopToCountable( 1693 Intrinsic::ID IntrinID, BasicBlock *Preheader, Instruction *CntInst, 1694 PHINode *CntPhi, Value *InitX, Instruction *DefX, const DebugLoc &DL, 1695 bool ZeroCheck, bool IsCntPhiUsedOutsideLoop) { 1696 BranchInst *PreheaderBr = cast<BranchInst>(Preheader->getTerminator()); 1697 1698 // Step 1: Insert the CTLZ/CTTZ instruction at the end of the preheader block 1699 IRBuilder<> Builder(PreheaderBr); 1700 Builder.SetCurrentDebugLocation(DL); 1701 Value *FFS, *Count, *CountPrev, *NewCount, *InitXNext; 1702 1703 // Count = BitWidth - CTLZ(InitX); 1704 // If there are uses of CntPhi create: 1705 // CountPrev = BitWidth - CTLZ(InitX >> 1); 1706 if (IsCntPhiUsedOutsideLoop) { 1707 if (DefX->getOpcode() == Instruction::AShr) 1708 InitXNext = 1709 Builder.CreateAShr(InitX, ConstantInt::get(InitX->getType(), 1)); 1710 else if (DefX->getOpcode() == Instruction::LShr) 1711 InitXNext = 1712 Builder.CreateLShr(InitX, ConstantInt::get(InitX->getType(), 1)); 1713 else if (DefX->getOpcode() == Instruction::Shl) // cttz 1714 InitXNext = 1715 Builder.CreateShl(InitX, ConstantInt::get(InitX->getType(), 1)); 1716 else 1717 llvm_unreachable("Unexpected opcode!"); 1718 } else 1719 InitXNext = InitX; 1720 FFS = createFFSIntrinsic(Builder, InitXNext, DL, ZeroCheck, IntrinID); 1721 Count = Builder.CreateSub( 1722 ConstantInt::get(FFS->getType(), 1723 FFS->getType()->getIntegerBitWidth()), 1724 FFS); 1725 if (IsCntPhiUsedOutsideLoop) { 1726 CountPrev = Count; 1727 Count = Builder.CreateAdd( 1728 CountPrev, 1729 ConstantInt::get(CountPrev->getType(), 1)); 1730 } 1731 1732 NewCount = Builder.CreateZExtOrTrunc( 1733 IsCntPhiUsedOutsideLoop ? CountPrev : Count, 1734 cast<IntegerType>(CntInst->getType())); 1735 1736 // If the counter's initial value is not zero, insert Add Inst. 1737 Value *CntInitVal = CntPhi->getIncomingValueForBlock(Preheader); 1738 ConstantInt *InitConst = dyn_cast<ConstantInt>(CntInitVal); 1739 if (!InitConst || !InitConst->isZero()) 1740 NewCount = Builder.CreateAdd(NewCount, CntInitVal); 1741 1742 // Step 2: Insert new IV and loop condition: 1743 // loop: 1744 // ... 1745 // PhiCount = PHI [Count, Dec] 1746 // ... 1747 // Dec = PhiCount - 1 1748 // ... 1749 // Br: loop if (Dec != 0) 1750 BasicBlock *Body = *(CurLoop->block_begin()); 1751 auto *LbBr = cast<BranchInst>(Body->getTerminator()); 1752 ICmpInst *LbCond = cast<ICmpInst>(LbBr->getCondition()); 1753 Type *Ty = Count->getType(); 1754 1755 PHINode *TcPhi = PHINode::Create(Ty, 2, "tcphi", &Body->front()); 1756 1757 Builder.SetInsertPoint(LbCond); 1758 Instruction *TcDec = cast<Instruction>( 1759 Builder.CreateSub(TcPhi, ConstantInt::get(Ty, 1), 1760 "tcdec", false, true)); 1761 1762 TcPhi->addIncoming(Count, Preheader); 1763 TcPhi->addIncoming(TcDec, Body); 1764 1765 CmpInst::Predicate Pred = 1766 (LbBr->getSuccessor(0) == Body) ? CmpInst::ICMP_NE : CmpInst::ICMP_EQ; 1767 LbCond->setPredicate(Pred); 1768 LbCond->setOperand(0, TcDec); 1769 LbCond->setOperand(1, ConstantInt::get(Ty, 0)); 1770 1771 // Step 3: All the references to the original counter outside 1772 // the loop are replaced with the NewCount 1773 if (IsCntPhiUsedOutsideLoop) 1774 CntPhi->replaceUsesOutsideBlock(NewCount, Body); 1775 else 1776 CntInst->replaceUsesOutsideBlock(NewCount, Body); 1777 1778 // step 4: Forget the "non-computable" trip-count SCEV associated with the 1779 // loop. The loop would otherwise not be deleted even if it becomes empty. 
1780 SE->forgetLoop(CurLoop); 1781 } 1782 1783 void LoopIdiomRecognize::transformLoopToPopcount(BasicBlock *PreCondBB, 1784 Instruction *CntInst, 1785 PHINode *CntPhi, Value *Var) { 1786 BasicBlock *PreHead = CurLoop->getLoopPreheader(); 1787 auto *PreCondBr = cast<BranchInst>(PreCondBB->getTerminator()); 1788 const DebugLoc &DL = CntInst->getDebugLoc(); 1789 1790 // Assuming before transformation, the loop is following: 1791 // if (x) // the precondition 1792 // do { cnt++; x &= x - 1; } while(x); 1793 1794 // Step 1: Insert the ctpop instruction at the end of the precondition block 1795 IRBuilder<> Builder(PreCondBr); 1796 Value *PopCnt, *PopCntZext, *NewCount, *TripCnt; 1797 { 1798 PopCnt = createPopcntIntrinsic(Builder, Var, DL); 1799 NewCount = PopCntZext = 1800 Builder.CreateZExtOrTrunc(PopCnt, cast<IntegerType>(CntPhi->getType())); 1801 1802 if (NewCount != PopCnt) 1803 (cast<Instruction>(NewCount))->setDebugLoc(DL); 1804 1805 // TripCnt is exactly the number of iterations the loop has 1806 TripCnt = NewCount; 1807 1808 // If the population counter's initial value is not zero, insert Add Inst. 1809 Value *CntInitVal = CntPhi->getIncomingValueForBlock(PreHead); 1810 ConstantInt *InitConst = dyn_cast<ConstantInt>(CntInitVal); 1811 if (!InitConst || !InitConst->isZero()) { 1812 NewCount = Builder.CreateAdd(NewCount, CntInitVal); 1813 (cast<Instruction>(NewCount))->setDebugLoc(DL); 1814 } 1815 } 1816 1817 // Step 2: Replace the precondition from "if (x == 0) goto loop-exit" to 1818 // "if (NewCount == 0) loop-exit". Without this change, the intrinsic 1819 // function would be partial dead code, and downstream passes will drag 1820 // it back from the precondition block to the preheader. 1821 { 1822 ICmpInst *PreCond = cast<ICmpInst>(PreCondBr->getCondition()); 1823 1824 Value *Opnd0 = PopCntZext; 1825 Value *Opnd1 = ConstantInt::get(PopCntZext->getType(), 0); 1826 if (PreCond->getOperand(0) != Var) 1827 std::swap(Opnd0, Opnd1); 1828 1829 ICmpInst *NewPreCond = cast<ICmpInst>( 1830 Builder.CreateICmp(PreCond->getPredicate(), Opnd0, Opnd1)); 1831 PreCondBr->setCondition(NewPreCond); 1832 1833 RecursivelyDeleteTriviallyDeadInstructions(PreCond, TLI); 1834 } 1835 1836 // Step 3: Note that the population count is exactly the trip count of the 1837 // loop in question, which enable us to convert the loop from noncountable 1838 // loop into a countable one. The benefit is twofold: 1839 // 1840 // - If the loop only counts population, the entire loop becomes dead after 1841 // the transformation. It is a lot easier to prove a countable loop dead 1842 // than to prove a noncountable one. (In some C dialects, an infinite loop 1843 // isn't dead even if it computes nothing useful. In general, DCE needs 1844 // to prove a noncountable loop finite before safely delete it.) 1845 // 1846 // - If the loop also performs something else, it remains alive. 1847 // Since it is transformed to countable form, it can be aggressively 1848 // optimized by some optimizations which are in general not applicable 1849 // to a noncountable loop. 
1850 // 1851 // After this step, this loop (conceptually) would look like following: 1852 // newcnt = __builtin_ctpop(x); 1853 // t = newcnt; 1854 // if (x) 1855 // do { cnt++; x &= x-1; t--) } while (t > 0); 1856 BasicBlock *Body = *(CurLoop->block_begin()); 1857 { 1858 auto *LbBr = cast<BranchInst>(Body->getTerminator()); 1859 ICmpInst *LbCond = cast<ICmpInst>(LbBr->getCondition()); 1860 Type *Ty = TripCnt->getType(); 1861 1862 PHINode *TcPhi = PHINode::Create(Ty, 2, "tcphi", &Body->front()); 1863 1864 Builder.SetInsertPoint(LbCond); 1865 Instruction *TcDec = cast<Instruction>( 1866 Builder.CreateSub(TcPhi, ConstantInt::get(Ty, 1), 1867 "tcdec", false, true)); 1868 1869 TcPhi->addIncoming(TripCnt, PreHead); 1870 TcPhi->addIncoming(TcDec, Body); 1871 1872 CmpInst::Predicate Pred = 1873 (LbBr->getSuccessor(0) == Body) ? CmpInst::ICMP_UGT : CmpInst::ICMP_SLE; 1874 LbCond->setPredicate(Pred); 1875 LbCond->setOperand(0, TcDec); 1876 LbCond->setOperand(1, ConstantInt::get(Ty, 0)); 1877 } 1878 1879 // Step 4: All the references to the original population counter outside 1880 // the loop are replaced with the NewCount -- the value returned from 1881 // __builtin_ctpop(). 1882 CntInst->replaceUsesOutsideBlock(NewCount, Body); 1883 1884 // step 5: Forget the "non-computable" trip-count SCEV associated with the 1885 // loop. The loop would otherwise not be deleted even if it becomes empty. 1886 SE->forgetLoop(CurLoop); 1887 } 1888 1889 bool LoopIdiomRecognize::matchBCmpLoopStructure( 1890 CmpLoopStructure &CmpLoop) const { 1891 ICmpInst::Predicate BCmpPred; 1892 1893 // We are looking for the following basic layout: 1894 // PreheaderBB: <preheader> ; preds = ??? 1895 // <...> 1896 // br label %LoopHeaderBB 1897 // LoopHeaderBB: <header,exiting> ; preds = %PreheaderBB,%LoopLatchBB 1898 // <...> 1899 // %BCmpValue = icmp <...> 1900 // br i1 %BCmpValue, label %LoopLatchBB, label %Successor0 1901 // LoopLatchBB: <latch,exiting> ; preds = %LoopHeaderBB 1902 // <...> 1903 // %LatchCmpValue = <are we done, or do next iteration?> 1904 // br i1 %LatchCmpValue, label %Successor1, label %LoopHeaderBB 1905 // Successor0: <exit> ; preds = %LoopHeaderBB 1906 // <...> 1907 // Successor1: <exit> ; preds = %LoopLatchBB 1908 // <...> 1909 // 1910 // Successor0 and Successor1 may or may not be the same basic block. 1911 1912 // Match basic frame-work of this supposedly-comparison loop. 1913 using namespace PatternMatch; 1914 if (!match(CurLoop->getHeader()->getTerminator(), 1915 m_Br(m_CombineAnd(m_ICmp(BCmpPred, m_Value(), m_Value()), 1916 m_Value(CmpLoop.BCmpValue)), 1917 CmpLoop.HeaderBrEqualBB, CmpLoop.HeaderBrUnequalBB)) || 1918 !match(CurLoop->getLoopLatch()->getTerminator(), 1919 m_Br(m_CombineAnd(m_Cmp(), m_Value(CmpLoop.LatchCmpValue)), 1920 CmpLoop.LatchBrFinishBB, CmpLoop.LatchBrContinueBB))) { 1921 LLVM_DEBUG(dbgs() << "Basic control-flow layout unrecognized.\n"); 1922 return false; 1923 } 1924 LLVM_DEBUG(dbgs() << "Recognized basic control-flow layout.\n"); 1925 return true; 1926 } 1927 1928 bool LoopIdiomRecognize::matchBCmpOfLoads(Value *BCmpValue, 1929 CmpOfLoads &CmpOfLoads) const { 1930 using namespace PatternMatch; 1931 LLVM_DEBUG(dbgs() << "Analyzing header icmp " << *BCmpValue 1932 << " as bcmp pattern.\n"); 1933 1934 // Match bcmp-style loop header cmp. It must be an eq-icmp of loads. 
Example: 1935 // %v0 = load <...>, <...>* %LoadSrcA 1936 // %v1 = load <...>, <...>* %LoadSrcB 1937 // %CmpLoop.BCmpValue = icmp eq <...> %v0, %v1 1938 // There won't be any no-op bitcasts between load and icmp, 1939 // they would have been transformed into a load of bitcast. 1940 // FIXME: {b,mem}cmp() calls have the same semantics as icmp. Match them too. 1941 if (!match(BCmpValue, 1942 m_ICmp(CmpOfLoads.BCmpPred, 1943 m_CombineAnd(m_Load(m_Value(CmpOfLoads.LoadSrcA)), 1944 m_Value(CmpOfLoads.LoadA)), 1945 m_CombineAnd(m_Load(m_Value(CmpOfLoads.LoadSrcB)), 1946 m_Value(CmpOfLoads.LoadB)))) || 1947 !ICmpInst::isEquality(CmpOfLoads.BCmpPred)) { 1948 LLVM_DEBUG(dbgs() << "Loop header icmp did not match bcmp pattern.\n"); 1949 return false; 1950 } 1951 LLVM_DEBUG(dbgs() << "Recognized header icmp as bcmp pattern with loads:\n\t" 1952 << *CmpOfLoads.LoadA << "\n\t" << *CmpOfLoads.LoadB 1953 << "\n"); 1954 // FIXME: handle memcmp pattern? 1955 return true; 1956 } 1957 1958 bool LoopIdiomRecognize::recognizeBCmpLoopControlFlow( 1959 const CmpOfLoads &CmpOfLoads, CmpLoopStructure &CmpLoop) const { 1960 BasicBlock *LoopHeaderBB = CurLoop->getHeader(); 1961 BasicBlock *LoopLatchBB = CurLoop->getLoopLatch(); 1962 1963 // Be wary, comparisons can be inverted, canonicalize order. 1964 // If this 'element' comparison passed, we expect to proceed to the next elt. 1965 if (CmpOfLoads.BCmpPred != ICmpInst::Predicate::ICMP_EQ) 1966 std::swap(CmpLoop.HeaderBrEqualBB, CmpLoop.HeaderBrUnequalBB); 1967 // The predicate on loop latch does not matter, just canonicalize some order. 1968 if (CmpLoop.LatchBrContinueBB != LoopHeaderBB) 1969 std::swap(CmpLoop.LatchBrFinishBB, CmpLoop.LatchBrContinueBB); 1970 1971 SmallVector<BasicBlock *, 2> ExitBlocks; 1972 1973 CurLoop->getUniqueExitBlocks(ExitBlocks); 1974 assert(ExitBlocks.size() <= 2U && "Can't have more than two exit blocks."); 1975 1976 // Check that control-flow between blocks is as expected. 1977 if (CmpLoop.HeaderBrEqualBB != LoopLatchBB || 1978 CmpLoop.LatchBrContinueBB != LoopHeaderBB || 1979 !is_contained(ExitBlocks, CmpLoop.HeaderBrUnequalBB) || 1980 !is_contained(ExitBlocks, CmpLoop.LatchBrFinishBB)) { 1981 LLVM_DEBUG(dbgs() << "Loop control-flow not recognized.\n"); 1982 return false; 1983 } 1984 1985 assert(!is_contained(ExitBlocks, CmpLoop.HeaderBrEqualBB) && 1986 !is_contained(ExitBlocks, CmpLoop.LatchBrContinueBB) && 1987 "Unexpected exit edges."); 1988 1989 LLVM_DEBUG(dbgs() << "Recognized loop control-flow.\n"); 1990 1991 LLVM_DEBUG(dbgs() << "Performing side-effect analysis on the loop.\n"); 1992 assert(CurLoop->isLCSSAForm(*DT) && "Should only get LCSSA-form loops here."); 1993 // No loop instructions must be used outside of the loop. Since we are in 1994 // LCSSA form, we only need to check successor block's PHI nodes's incoming 1995 // values for incoming blocks that are the loop basic blocks. 
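  // For illustration (a sketch, not taken from this pass): in LCSSA form an
  // escaping value shows up in an exit block as
  //   %v.lcssa = phi i32 [ %v, %LoopHeaderBB ]
  // so a loop-defined incoming value in such a PHI is exactly the
  // "used outside the loop" case that must be rejected below.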
1996 for (const BasicBlock *ExitBB : ExitBlocks) { 1997 for (const PHINode &PHI : ExitBB->phis()) { 1998 for (const BasicBlock *LoopBB : 1999 make_filter_range(PHI.blocks(), [this](BasicBlock *PredecessorBB) { 2000 return CurLoop->contains(PredecessorBB); 2001 })) { 2002 const auto *I = 2003 dyn_cast<Instruction>(PHI.getIncomingValueForBlock(LoopBB)); 2004 if (I && CurLoop->contains(I)) { 2005 LLVM_DEBUG(dbgs() 2006 << "Loop contains instruction " << *I 2007 << " which is used outside of the loop in basic block " 2008 << ExitBB->getName() << " in phi node " << PHI << "\n"); 2009 return false; 2010 } 2011 } 2012 } 2013 } 2014 // Similarly, the loop should not have any observable side-effects 2015 // other than the final comparison result. 2016 for (BasicBlock *LoopBB : CurLoop->blocks()) { 2017 for (Instruction &I : *LoopBB) { 2018 if (isa<DbgInfoIntrinsic>(I)) // Ignore dbginfo. 2019 continue; // FIXME: anything else? lifetime info? 2020 if ((I.mayHaveSideEffects() || I.isAtomic() || I.isFenceLike()) && 2021 &I != CmpOfLoads.LoadA && &I != CmpOfLoads.LoadB) { 2022 LLVM_DEBUG( 2023 dbgs() << "Loop contains instruction with potential side-effects: " 2024 << I << "\n"); 2025 return false; 2026 } 2027 } 2028 } 2029 LLVM_DEBUG(dbgs() << "No loop instructions deemed to have side-effects.\n"); 2030 return true; 2031 } 2032 2033 bool LoopIdiomRecognize::recognizeBCmpLoopSCEV(uint64_t BCmpTyBytes, 2034 CmpOfLoads &CmpOfLoads, 2035 const SCEV *&SrcA, 2036 const SCEV *&SrcB, 2037 const SCEV *&Iterations) const { 2038 // Try to compute SCEV of the loads, for this loop's scope. 2039 const auto *ScevForSrcA = dyn_cast<SCEVAddRecExpr>( 2040 SE->getSCEVAtScope(CmpOfLoads.LoadSrcA, CurLoop)); 2041 const auto *ScevForSrcB = dyn_cast<SCEVAddRecExpr>( 2042 SE->getSCEVAtScope(CmpOfLoads.LoadSrcB, CurLoop)); 2043 if (!ScevForSrcA || !ScevForSrcB) { 2044 LLVM_DEBUG(dbgs() << "Failed to get SCEV expressions for load sources.\n"); 2045 return false; 2046 } 2047 2048 LLVM_DEBUG(dbgs() << "Got SCEV expressions (at loop scope) for loads:\n\t" 2049 << *ScevForSrcA << "\n\t" << *ScevForSrcB << "\n"); 2050 2051 // Loads must have the following SCEV exprs: {%ptr,+,BCmpTyBytes}<%LoopHeaderBB> 2052 const SCEV *RecStepForA = ScevForSrcA->getStepRecurrence(*SE); 2053 const SCEV *RecStepForB = ScevForSrcB->getStepRecurrence(*SE); 2054 if (!ScevForSrcA->isAffine() || !ScevForSrcB->isAffine() || 2055 ScevForSrcA->getLoop() != CurLoop || ScevForSrcB->getLoop() != CurLoop || 2056 RecStepForA != RecStepForB || !isa<SCEVConstant>(RecStepForA) || 2057 cast<SCEVConstant>(RecStepForA)->getAPInt() != BCmpTyBytes) { 2058 LLVM_DEBUG(dbgs() << "Unsupported SCEV expressions for loads. Only support " 2059 "affine SCEV expressions originating in the loop we " 2060 "are analysing with identical constant positive step, " 2061 "equal to the count of bytes compared. Got:\n\t" 2062 << *RecStepForA << "\n\t" << *RecStepForB << "\n"); 2063 return false; 2064 // FIXME: can support BCmpTyBytes > Step. 2065 // But will need to account for the extra bytes compared at the end. 2066 } 2067 2068 SrcA = ScevForSrcA->getStart(); 2069 SrcB = ScevForSrcB->getStart(); 2070 LLVM_DEBUG(dbgs() << "Got SCEV expressions for load sources:\n\t" << *SrcA 2071 << "\n\t" << *SrcB << "\n"); 2072 2073 // The load sources must be loop invariants that dominate the loop header.
2074 if (SrcA == SE->getCouldNotCompute() || SrcB == SE->getCouldNotCompute() || 2075 !SE->isAvailableAtLoopEntry(SrcA, CurLoop) || 2076 !SE->isAvailableAtLoopEntry(SrcB, CurLoop)) { 2077 LLVM_DEBUG(dbgs() << "Unsupported SCEV expressions for loads, unavailable " 2078 "prior to loop header.\n"); 2079 return false; 2080 } 2081 2082 LLVM_DEBUG(dbgs() << "SCEV expressions for loads are acceptable.\n"); 2083 2084 // bcmp / memcmp take the length argument as size_t, so let's conservatively 2085 // assume that the iteration count should not be wider than that. 2086 Type *CmpFuncSizeTy = DL->getIntPtrType(SE->getContext()); 2087 2088 // For how many iterations is the loop guaranteed not to exit via the LoopLatch? 2089 // This is one less than the maximal number of comparisons, and is: n + -1 2090 const SCEV *LoopExitCount = 2091 SE->getExitCount(CurLoop, CurLoop->getLoopLatch()); 2092 LLVM_DEBUG(dbgs() << "Got SCEV expression for loop latch exit count: " 2093 << *LoopExitCount << "\n"); 2094 // The exit count, similarly, must be a loop invariant that dominates the loop header. 2095 if (LoopExitCount == SE->getCouldNotCompute() || 2096 !LoopExitCount->getType()->isIntOrPtrTy() || 2097 LoopExitCount->getType()->getScalarSizeInBits() > 2098 CmpFuncSizeTy->getScalarSizeInBits() || 2099 !SE->isAvailableAtLoopEntry(LoopExitCount, CurLoop)) { 2100 LLVM_DEBUG(dbgs() << "Unsupported SCEV expression for loop latch exit.\n"); 2101 return false; 2102 } 2103 2104 // LoopExitCount is always one less than the actual count of iterations. 2105 // Do this before the cast, else we will be stuck with 1 + zext(-1 + n) 2106 Iterations = SE->getAddExpr( 2107 LoopExitCount, SE->getOne(LoopExitCount->getType()), SCEV::FlagNUW); 2108 assert(Iterations != SE->getCouldNotCompute() && 2109 "Shouldn't fail to increment by one."); 2110 2111 LLVM_DEBUG(dbgs() << "Computed iteration count: " << *Iterations << "\n"); 2112 return true; 2113 }
2133 if (!CurLoop->isLoopSimplifyForm() || CurLoop->getNumBlocks() > 2) { 2134 LLVM_DEBUG(dbgs() << "Basic loop structure unrecognized.\n"); 2135 return false; 2136 } 2137 LLVM_DEBUG(dbgs() << "Recognized basic loop structure.\n"); 2138 2139 CmpLoopStructure CmpLoop; 2140 if (!matchBCmpLoopStructure(CmpLoop)) 2141 return false; 2142 2143 CmpOfLoads CmpOfLoads; 2144 if (!matchBCmpOfLoads(CmpLoop.BCmpValue, CmpOfLoads)) 2145 return false; 2146 2147 if (!recognizeBCmpLoopControlFlow(CmpOfLoads, CmpLoop)) 2148 return false; 2149 2150 BCmpInst = cast<ICmpInst>(CmpLoop.BCmpValue); // FIXME: is there no 2151 LatchCmpInst = cast<CmpInst>(CmpLoop.LatchCmpValue); // way to combine 2152 LoadA = cast<LoadInst>(CmpOfLoads.LoadA); // these cast with 2153 LoadB = cast<LoadInst>(CmpOfLoads.LoadB); // m_Value() matcher? 2154 2155 Type *BCmpValTy = BCmpInst->getOperand(0)->getType(); 2156 LLVMContext &Context = BCmpValTy->getContext(); 2157 uint64_t BCmpTyBits = DL->getTypeSizeInBits(BCmpValTy); 2158 static constexpr uint64_t ByteTyBits = 8; 2159 2160 LLVM_DEBUG(dbgs() << "Got comparison between values of type " << *BCmpValTy 2161 << " of size " << BCmpTyBits 2162 << " bits (while byte = " << ByteTyBits << " bits).\n"); 2163 // bcmp()/memcmp() minimal unit of work is a byte. Therefore we must check 2164 // that we are dealing with a multiple of a byte here. 2165 if (BCmpTyBits % ByteTyBits != 0) { 2166 LLVM_DEBUG(dbgs() << "Value size is not a multiple of byte.\n"); 2167 return false; 2168 // FIXME: could still be done under a run-time check that the total bit 2169 // count is a multiple of a byte i guess? Or handle remainder separately? 2170 } 2171 2172 // Each comparison is done on this many bytes. 2173 uint64_t BCmpTyBytes = BCmpTyBits / ByteTyBits; 2174 LLVM_DEBUG(dbgs() << "Size is exactly " << BCmpTyBytes 2175 << " bytes, eligible for bcmp conversion.\n"); 2176 2177 const SCEV *Iterations; 2178 if (!recognizeBCmpLoopSCEV(BCmpTyBytes, CmpOfLoads, SrcA, SrcB, Iterations)) 2179 return false; 2180 2181 // bcmp / memcmp take length argument as size_t, do promotion now. 2182 Type *CmpFuncSizeTy = DL->getIntPtrType(Context); 2183 Iterations = SE->getNoopOrZeroExtend(Iterations, CmpFuncSizeTy); 2184 assert(Iterations != SE->getCouldNotCompute() && "Promotion failed."); 2185 // Note that it didn't do ptrtoint cast, we will need to do it manually. 2186 2187 // We will be comparing *bytes*, not BCmpTy, we need to recalculate size. 2188 // It's a multiplication, and it *could* overflow. But for it to overflow 2189 // we'd want to compare more bytes than could be represented by size_t, But 2190 // allocation functions also take size_t. So how'd you produce such buffer? 2191 // FIXME: we likely need to actually check that we know this won't overflow, 2192 // via llvm::computeOverflowForUnsignedMul(). 
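  // Illustrative numbers only: comparing 100 elements of i32 would give
  // Iterations = 100, BCmpTyBytes = 4, and thus NBytes = 100 * 4 = 400.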
2193 NBytes = SE->getMulExpr( 2194 Iterations, SE->getConstant(CmpFuncSizeTy, BCmpTyBytes), SCEV::FlagNUW); 2195 assert(NBytes != SE->getCouldNotCompute() && 2196 "Shouldn't fail to increment by one."); 2197 2198 LLVM_DEBUG(dbgs() << "Computed total byte count: " << *NBytes << "\n"); 2199 2200 if (LoadA->getPointerAddressSpace() != LoadB->getPointerAddressSpace() || 2201 LoadA->getPointerAddressSpace() != 0 || !LoadA->isSimple() || 2202 !LoadB->isSimple()) { 2203 StringLiteral L("Unsupported loads in idiom - only support identical, " 2204 "simple loads from address space 0.\n"); 2205 LLVM_DEBUG(dbgs() << L); 2206 ORE.emit([&]() { 2207 return OptimizationRemarkMissed(DEBUG_TYPE, "BCmpIdiomUnsupportedLoads", 2208 BCmpInst->getDebugLoc(), 2209 CurLoop->getHeader()) 2210 << L; 2211 }); 2212 return false; // FIXME: support non-simple loads. 2213 } 2214 2215 LLVM_DEBUG(dbgs() << "Recognized bcmp idiom\n"); 2216 ORE.emit([&]() { 2217 return OptimizationRemarkAnalysis(DEBUG_TYPE, "RecognizedBCmpIdiom", 2218 CurLoop->getStartLoc(), 2219 CurLoop->getHeader()) 2220 << "Loop recognized as a bcmp idiom"; 2221 }); 2222 2223 return true; 2224 } 2225 2226 BasicBlock * 2227 LoopIdiomRecognize::transformBCmpControlFlow(ICmpInst *ComparedEqual) { 2228 LLVM_DEBUG(dbgs() << "Transforming control-flow.\n"); 2229 SmallVector<DominatorTree::UpdateType, 8> DTUpdates; 2230 2231 BasicBlock *PreheaderBB = CurLoop->getLoopPreheader(); 2232 BasicBlock *HeaderBB = CurLoop->getHeader(); 2233 BasicBlock *LoopLatchBB = CurLoop->getLoopLatch(); 2234 SmallString<32> LoopName = CurLoop->getName(); 2235 Function *Func = PreheaderBB->getParent(); 2236 LLVMContext &Context = Func->getContext(); 2237 2238 // Before doing anything, drop SCEV info. 2239 SE->forgetLoop(CurLoop); 2240 2241 // Here we start with: (0/6) 2242 // PreheaderBB: <preheader> ; preds = ??? 2243 // <...> 2244 // %memcmp = call i32 @memcmp(i8* %LoadSrcA, i8* %LoadSrcB, i64 %Nbytes) 2245 // %ComparedEqual = icmp eq <...> %memcmp, 0 2246 // br label %LoopHeaderBB 2247 // LoopHeaderBB: <header,exiting> ; preds = %PreheaderBB,%LoopLatchBB 2248 // <...> 2249 // br i1 %<...>, label %LoopLatchBB, label %Successor0BB 2250 // LoopLatchBB: <latch,exiting> ; preds = %LoopHeaderBB 2251 // <...> 2252 // br i1 %<...>, label %Successor1BB, label %LoopHeaderBB 2253 // Successor0BB: <exit> ; preds = %LoopHeaderBB 2254 // %S0PHI = phi <...> [ <...>, %LoopHeaderBB ] 2255 // <...> 2256 // Successor1BB: <exit> ; preds = %LoopLatchBB 2257 // %S1PHI = phi <...> [ <...>, %LoopLatchBB ] 2258 // <...> 2259 // 2260 // Successor0 and Successor1 may or may not be the same basic block. 2261 2262 // Decouple the edge between loop preheader basic block and loop header basic 2263 // block. Thus the loop has become unreachable. 2264 assert(cast<BranchInst>(PreheaderBB->getTerminator())->isUnconditional() && 2265 PreheaderBB->getTerminator()->getSuccessor(0) == HeaderBB && 2266 "Preheader bb must end with an unconditional branch to header bb."); 2267 PreheaderBB->getTerminator()->eraseFromParent(); 2268 DTUpdates.push_back({DominatorTree::Delete, PreheaderBB, HeaderBB}); 2269 2270 // Create a new preheader basic block before loop header basic block. 2271 auto *PhonyPreheaderBB = BasicBlock::Create( 2272 Context, LoopName + ".phonypreheaderbb", Func, HeaderBB); 2273 // And insert an unconditional branch from phony preheader basic block to 2274 // loop header basic block. 
2275 IRBuilder<>(PhonyPreheaderBB).CreateBr(HeaderBB); 2276 DTUpdates.push_back({DominatorTree::Insert, PhonyPreheaderBB, HeaderBB}); 2277 2278 // Create a *single* new empty block that we will substitute as a 2279 // successor basic block for the loop's exits. This one is temporary. 2280 // Much like the phony preheader basic block, it is not connected. 2281 auto *PhonySuccessorBB = 2282 BasicBlock::Create(Context, LoopName + ".phonysuccessorbb", Func, 2283 LoopLatchBB->getNextNode()); 2284 // That block must have *some* non-PHI instruction, or else deleteDeadLoop() 2285 // will mess up cleanup of dbginfo, and the verifier will complain. 2286 IRBuilder<>(PhonySuccessorBB).CreateUnreachable(); 2287 2288 // Create two new empty blocks that we will use to preserve the original 2289 // loop exit control-flow, and preserve the incoming values in the PHI nodes 2290 // in the loop's successor exit blocks. These will live on. 2291 auto *ComparedUnequalBB = 2292 BasicBlock::Create(Context, ComparedEqual->getName() + ".unequalbb", Func, 2293 PhonySuccessorBB->getNextNode()); 2294 auto *ComparedEqualBB = 2295 BasicBlock::Create(Context, ComparedEqual->getName() + ".equalbb", Func, 2296 PhonySuccessorBB->getNextNode()); 2297 2298 // By now we have: (1/6) 2299 // PreheaderBB: ; preds = ??? 2300 // <...> 2301 // %memcmp = call i32 @memcmp(i8* %LoadSrcA, i8* %LoadSrcB, i64 %Nbytes) 2302 // %ComparedEqual = icmp eq <...> %memcmp, 0 2303 // [no terminator instruction!] 2304 // PhonyPreheaderBB: <preheader> ; No preds, UNREACHABLE! 2305 // br label %LoopHeaderBB 2306 // LoopHeaderBB: <header,exiting> ; preds = %PhonyPreheaderBB, %LoopLatchBB 2307 // <...> 2308 // br i1 %<...>, label %LoopLatchBB, label %Successor0BB 2309 // LoopLatchBB: <latch,exiting> ; preds = %LoopHeaderBB 2310 // <...> 2311 // br i1 %<...>, label %Successor1BB, label %LoopHeaderBB 2312 // PhonySuccessorBB: ; No preds, UNREACHABLE! 2313 // unreachable 2314 // EqualBB: ; No preds, UNREACHABLE! 2315 // [no terminator instruction!] 2316 // UnequalBB: ; No preds, UNREACHABLE! 2317 // [no terminator instruction!] 2318 // Successor0BB: <exit> ; preds = %LoopHeaderBB 2319 // %S0PHI = phi <...> [ <...>, %LoopHeaderBB ] 2320 // <...> 2321 // Successor1BB: <exit> ; preds = %LoopLatchBB 2322 // %S1PHI = phi <...> [ <...>, %LoopLatchBB ] 2323 // <...> 2324 2325 // What is the mapping/replacement basic block for exiting out of the loop 2326 // from either of the old loop's basic blocks? 2327 auto GetReplacementBB = [this, ComparedEqualBB, 2328 ComparedUnequalBB](const BasicBlock *OldBB) { 2329 assert(CurLoop->contains(OldBB) && "Only for loop's basic blocks."); 2330 if (OldBB == CurLoop->getLoopLatch()) // "all elements compared equal". 2331 return ComparedEqualBB; 2332 if (OldBB == CurLoop->getHeader()) // "element compared unequal". 2333 return ComparedUnequalBB; 2334 llvm_unreachable("Only had two basic blocks in loop."); 2335 }; 2336 2337 // What are the exits out of this loop? 2338 SmallVector<Loop::Edge, 2> LoopExitEdges; 2339 CurLoop->getExitEdges(LoopExitEdges); 2340 assert(LoopExitEdges.size() == 2 && "Should have only two exit edges."); 2341 2342 // Populate new basic blocks, update the exiting control-flow, PHI nodes.
2343 for (const Loop::Edge &Edge : LoopExitEdges) { 2344 auto *OldLoopBB = const_cast<BasicBlock *>(Edge.first); 2345 auto *SuccessorBB = const_cast<BasicBlock *>(Edge.second); 2346 assert(CurLoop->contains(OldLoopBB) && !CurLoop->contains(SuccessorBB) && 2347 "Unexpected edge."); 2348 2349 // If we were to exit the loop from this loop basic block, 2350 // what would that mean semantically? Did the comparison succeed or fail? 2351 BasicBlock *NewBB = GetReplacementBB(OldLoopBB); 2352 assert(NewBB->empty() && "Should not get same new basic block here twice."); 2353 IRBuilder<> Builder(NewBB); 2354 Builder.SetCurrentDebugLocation(OldLoopBB->getTerminator()->getDebugLoc()); 2355 Builder.CreateBr(SuccessorBB); 2356 DTUpdates.push_back({DominatorTree::Insert, NewBB, SuccessorBB}); 2357 // Also, be *REALLY* careful with PHI nodes in the successor basic block, 2358 // updating them to receive the same incoming value, but from the new 2359 // basic block instead of the current loop's basic block. 2360 SuccessorBB->replacePhiUsesWith(OldLoopBB, NewBB); 2361 // Also, change loop control-flow. This loop basic block shall no longer 2362 // exit from the loop to its original successor basic block, but to our new 2363 // phony successor basic block. Note that the new successor will be the unique exit. 2364 OldLoopBB->getTerminator()->replaceSuccessorWith(SuccessorBB, 2365 PhonySuccessorBB); 2366 DTUpdates.push_back({DominatorTree::Delete, OldLoopBB, SuccessorBB}); 2367 DTUpdates.push_back({DominatorTree::Insert, OldLoopBB, PhonySuccessorBB}); 2368 } 2369 2370 // Inform DomTree about edge changes. Note that LoopInfo is still out-of-date. 2371 assert(DTUpdates.size() == 8 && "Update count prediction failed."); 2372 DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager); 2373 DTU.applyUpdates(DTUpdates); 2374 DTUpdates.clear(); 2375 2376 // By now we have: (2/6) 2377 // PreheaderBB: ; preds = ??? 2378 // <...> 2379 // %memcmp = call i32 @memcmp(i8* %LoadSrcA, i8* %LoadSrcB, i64 %Nbytes) 2380 // %ComparedEqual = icmp eq <...> %memcmp, 0 2381 // [no terminator instruction!] 2382 // PhonyPreheaderBB: <preheader> ; No preds, UNREACHABLE! 2383 // br label %LoopHeaderBB 2384 // LoopHeaderBB: <header,exiting> ; preds = %PhonyPreheaderBB, %LoopLatchBB 2385 // <...> 2386 // br i1 %<...>, label %LoopLatchBB, label %PhonySuccessorBB 2387 // LoopLatchBB: <latch,exiting> ; preds = %LoopHeaderBB 2388 // <...> 2389 // br i1 %<...>, label %PhonySuccessorBB, label %LoopHeaderBB 2390 // PhonySuccessorBB: <uniq. exit> ; preds = %LoopHeaderBB, %LoopLatchBB 2391 // unreachable 2392 // EqualBB: ; No preds, UNREACHABLE! 2393 // br label %Successor1BB 2394 // UnequalBB: ; No preds, UNREACHABLE! 2395 // br label %Successor0BB 2396 // Successor0BB: ; preds = %UnequalBB 2397 // %S0PHI = phi <...> [ <...>, %UnequalBB ] 2398 // <...> 2399 // Successor1BB: ; preds = %EqualBB 2400 // %S0PHI = phi <...> [ <...>, %EqualBB ] 2401 // <...> 2402 2403 // *Finally*, zap the original loop. Record its parent loop though. 2404 Loop *ParentLoop = CurLoop->getParentLoop(); 2405 LLVM_DEBUG(dbgs() << "Deleting old loop.\n"); 2406 LoopDeleter.markLoopAsDeleted(CurLoop); // Mark as deleted *BEFORE* deleting! 2407 deleteDeadLoop(CurLoop, DT, SE, LI); // And actually delete the loop. 2408 CurLoop = nullptr; 2409 2410 // By now we have: (3/6) 2411 // PreheaderBB: ; preds = ??? 2412 // <...> 2413 // %memcmp = call i32 @memcmp(i8* %LoadSrcA, i8* %LoadSrcB, i64 %Nbytes) 2414 // %ComparedEqual = icmp eq <...> %memcmp, 0 2415 // [no terminator instruction!]
2416 // PhonyPreheaderBB: ; No preds, UNREACHABLE! 2417 // br label %PhonySuccessorBB 2418 // PhonySuccessorBB: ; preds = %PhonyPreheaderBB 2419 // unreachable 2420 // EqualBB: ; No preds, UNREACHABLE! 2421 // br label %Successor1BB 2422 // UnequalBB: ; No preds, UNREACHABLE! 2423 // br label %Successor0BB 2424 // Successor0BB: ; preds = %UnequalBB 2425 // %S0PHI = phi <...> [ <...>, %UnequalBB ] 2426 // <...> 2427 // Successor1BB: ; preds = %EqualBB 2428 // %S0PHI = phi <...> [ <...>, %EqualBB ] 2429 // <...> 2430 2431 // Now, actually restore the CFG. 2432 2433 // Insert an unconditional branch from an actual preheader basic block to 2434 // phony preheader basic block. 2435 IRBuilder<>(PreheaderBB).CreateBr(PhonyPreheaderBB); 2436 DTUpdates.push_back({DominatorTree::Insert, PhonyPreheaderBB, HeaderBB}); 2437 // Insert proper conditional branch from phony successor basic block to the 2438 // "dispatch" basic blocks, which were used to preserve incoming values in 2439 // original loop's successor basic blocks. 2440 assert(isa<UnreachableInst>(PhonySuccessorBB->getTerminator()) && 2441 "Yep, that's the one we created to keep deleteDeadLoop() happy."); 2442 PhonySuccessorBB->getTerminator()->eraseFromParent(); 2443 { 2444 IRBuilder<> Builder(PhonySuccessorBB); 2445 Builder.SetCurrentDebugLocation(ComparedEqual->getDebugLoc()); 2446 Builder.CreateCondBr(ComparedEqual, ComparedEqualBB, ComparedUnequalBB); 2447 } 2448 DTUpdates.push_back( 2449 {DominatorTree::Insert, PhonySuccessorBB, ComparedEqualBB}); 2450 DTUpdates.push_back( 2451 {DominatorTree::Insert, PhonySuccessorBB, ComparedUnequalBB}); 2452 2453 BasicBlock *DispatchBB = PhonySuccessorBB; 2454 DispatchBB->setName(LoopName + ".bcmpdispatchbb"); 2455 2456 assert(DTUpdates.size() == 3 && "Update count prediction failed."); 2457 DTU.applyUpdates(DTUpdates); 2458 DTUpdates.clear(); 2459 2460 // By now we have: (4/6) 2461 // PreheaderBB: ; preds = ??? 2462 // <...> 2463 // %memcmp = call i32 @memcmp(i8* %LoadSrcA, i8* %LoadSrcB, i64 %Nbytes) 2464 // %ComparedEqual = icmp eq <...> %memcmp, 0 2465 // br label %PhonyPreheaderBB 2466 // PhonyPreheaderBB: ; preds = %PreheaderBB 2467 // br label %DispatchBB 2468 // DispatchBB: ; preds = %PhonyPreheaderBB 2469 // br i1 %ComparedEqual, label %EqualBB, label %UnequalBB 2470 // EqualBB: ; preds = %DispatchBB 2471 // br label %Successor1BB 2472 // UnequalBB: ; preds = %DispatchBB 2473 // br label %Successor0BB 2474 // Successor0BB: ; preds = %UnequalBB 2475 // %S0PHI = phi <...> [ <...>, %UnequalBB ] 2476 // <...> 2477 // Successor1BB: ; preds = %EqualBB 2478 // %S0PHI = phi <...> [ <...>, %EqualBB ] 2479 // <...> 2480 2481 // The basic CFG has been restored! Now let's merge redundant basic blocks. 2482 2483 // Merge phony successor basic block into it's only predecessor, 2484 // phony preheader basic block. It is fully pointlessly redundant. 2485 MergeBasicBlockIntoOnlyPred(DispatchBB, &DTU); 2486 2487 // By now we have: (5/6) 2488 // PreheaderBB: ; preds = ??? 
2489 // <...> 2490 // %memcmp = call i32 @memcmp(i8* %LoadSrcA, i8* %LoadSrcB, i64 %Nbytes) 2491 // %ComparedEqual = icmp eq <...> %memcmp, 0 2492 // br label %DispatchBB 2493 // DispatchBB: ; preds = %PreheaderBB 2494 // br i1 %ComparedEqual, label %EqualBB, label %UnequalBB 2495 // EqualBB: ; preds = %DispatchBB 2496 // br label %Successor1BB 2497 // UnequalBB: ; preds = %DispatchBB 2498 // br label %Successor0BB 2499 // Successor0BB: ; preds = %UnequalBB 2500 // %S0PHI = phi <...> [ <...>, %UnequalBB ] 2501 // <...> 2502 // Successor1BB: ; preds = %EqualBB 2503 // %S0PHI = phi <...> [ <...>, %EqualBB ] 2504 // <...> 2505 2506 // Was this loop nested? 2507 if (!ParentLoop) { 2508 // If the loop was *NOT* nested, then let's also merge the phony successor 2509 // basic block into its only predecessor, the preheader basic block. 2510 // Also, here we need to update LoopInfo. 2511 LI->removeBlock(PreheaderBB); 2512 MergeBasicBlockIntoOnlyPred(DispatchBB, &DTU); 2513 2514 // By now we have: (6/6) 2515 // DispatchBB: ; preds = ??? 2516 // <...> 2517 // %memcmp = call i32 @memcmp(i8* %LoadSrcA, i8* %LoadSrcB, i64 %Nbytes) 2518 // %ComparedEqual = icmp eq <...> %memcmp, 0 2519 // br i1 %ComparedEqual, label %EqualBB, label %UnequalBB 2520 // EqualBB: ; preds = %DispatchBB 2521 // br label %Successor1BB 2522 // UnequalBB: ; preds = %DispatchBB 2523 // br label %Successor0BB 2524 // Successor0BB: ; preds = %UnequalBB 2525 // %S0PHI = phi <...> [ <...>, %UnequalBB ] 2526 // <...> 2527 // Successor1BB: ; preds = %EqualBB 2528 // %S0PHI = phi <...> [ <...>, %EqualBB ] 2529 // <...> 2530 2531 return DispatchBB; 2532 } 2533 2534 // Otherwise, we need to "preserve" the LoopSimplify form of the deleted loop. 2535 // To achieve that, we shall keep the preheader basic block (mainly so that 2536 // the loop header block will be guaranteed to have a predecessor outside of 2537 // the loop), and create a phony loop with all three of these new basic blocks. 2538 Loop *PhonyLoop = LI->AllocateLoop(); 2539 ParentLoop->addChildLoop(PhonyLoop); 2540 PhonyLoop->addBasicBlockToLoop(DispatchBB, *LI); 2541 PhonyLoop->addBasicBlockToLoop(ComparedEqualBB, *LI); 2542 PhonyLoop->addBasicBlockToLoop(ComparedUnequalBB, *LI); 2543 2544 // But we only have a preheader basic block, a header basic block and 2545 // two exiting basic blocks. For a proper loop we also need a backedge from 2546 // a non-header basic block to the header bb. 2547 // Let's just add a never-taken branch from both of the exiting basic blocks. 2548 for (BasicBlock *BB : {ComparedEqualBB, ComparedUnequalBB}) { 2549 BranchInst *OldTerminator = cast<BranchInst>(BB->getTerminator()); 2550 assert(OldTerminator->isUnconditional() && "That's the one we created."); 2551 BasicBlock *SuccessorBB = OldTerminator->getSuccessor(0); 2552 2553 IRBuilder<> Builder(OldTerminator); 2554 Builder.SetCurrentDebugLocation(OldTerminator->getDebugLoc()); 2555 Builder.CreateCondBr(ConstantInt::getTrue(Context), SuccessorBB, 2556 DispatchBB); 2557 OldTerminator->eraseFromParent(); 2558 // Yes, the backedge will never be taken. The control-flow is redundant. 2559 // If it can be simplified further, other passes will take care of it.
2560 DTUpdates.push_back({DominatorTree::Delete, BB, SuccessorBB}); 2561 DTUpdates.push_back({DominatorTree::Insert, BB, SuccessorBB}); 2562 DTUpdates.push_back({DominatorTree::Insert, BB, DispatchBB}); 2563 } 2564 assert(DTUpdates.size() == 6 && "Update count prediction failed."); 2565 DTU.applyUpdates(DTUpdates); 2566 DTUpdates.clear(); 2567 2568 // By now we have: (6/6) 2569 // PreheaderBB: <preheader> ; preds = ??? 2570 // <...> 2571 // %memcmp = call i32 @memcmp(i8* %LoadSrcA, i8* %LoadSrcB, i64 %Nbytes) 2572 // %ComparedEqual = icmp eq <...> %memcmp, 0 2573 // br label %BCmpDispatchBB 2574 // BCmpDispatchBB: <header> ; preds = %PreheaderBB 2575 // br i1 %ComparedEqual, label %EqualBB, label %UnequalBB 2576 // EqualBB: <latch,exiting> ; preds = %BCmpDispatchBB 2577 // br i1 %true, label %Successor1BB, label %BCmpDispatchBB 2578 // UnequalBB: <latch,exiting> ; preds = %BCmpDispatchBB 2579 // br i1 %true, label %Successor0BB, label %BCmpDispatchBB 2580 // Successor0BB: ; preds = %UnequalBB 2581 // %S0PHI = phi <...> [ <...>, %UnequalBB ] 2582 // <...> 2583 // Successor1BB: ; preds = %EqualBB 2584 // %S0PHI = phi <...> [ <...>, %EqualBB ] 2585 // <...> 2586 2587 // Finally fully DONE! 2588 return DispatchBB; 2589 } 2590 2591 void LoopIdiomRecognize::transformLoopToBCmp(ICmpInst *BCmpInst, 2592 CmpInst *LatchCmpInst, 2593 LoadInst *LoadA, LoadInst *LoadB, 2594 const SCEV *SrcA, const SCEV *SrcB, 2595 const SCEV *NBytes) { 2596 // We will be inserting before the terminator instruction of preheader block. 2597 IRBuilder<> Builder(CurLoop->getLoopPreheader()->getTerminator()); 2598 2599 LLVM_DEBUG(dbgs() << "Transforming bcmp loop idiom into a call.\n"); 2600 LLVM_DEBUG(dbgs() << "Emitting new instructions.\n"); 2601 2602 // Expand the SCEV expressions for both sources to compare, and produce value 2603 // for the byte len (beware of Iterations potentially being a pointer, and 2604 // account for element size being BCmpTyBytes bytes, which may be not 1 byte) 2605 Value *PtrA, *PtrB, *Len; 2606 { 2607 SCEVExpander SExp(*SE, *DL, "LoopToBCmp"); 2608 SExp.setInsertPoint(&*Builder.GetInsertPoint()); 2609 2610 auto HandlePtr = [&SExp](LoadInst *Load, const SCEV *Src) { 2611 SExp.SetCurrentDebugLocation(DebugLoc()); 2612 // If the pointer operand of original load had dbgloc - use it. 2613 if (const auto *I = dyn_cast<Instruction>(Load->getPointerOperand())) 2614 SExp.SetCurrentDebugLocation(I->getDebugLoc()); 2615 return SExp.expandCodeFor(Src); 2616 }; 2617 PtrA = HandlePtr(LoadA, SrcA); 2618 PtrB = HandlePtr(LoadB, SrcB); 2619 2620 // For len calculation let's use dbgloc for the loop's latch condition. 2621 Builder.SetCurrentDebugLocation(LatchCmpInst->getDebugLoc()); 2622 SExp.SetCurrentDebugLocation(LatchCmpInst->getDebugLoc()); 2623 Len = SExp.expandCodeFor(NBytes); 2624 2625 Type *CmpFuncSizeTy = DL->getIntPtrType(Builder.getContext()); 2626 assert(SE->getTypeSizeInBits(Len->getType()) == 2627 DL->getTypeSizeInBits(CmpFuncSizeTy) && 2628 "Len should already have the correct size."); 2629 2630 // Make sure that iteration count is a number, insert ptrtoint cast if not. 2631 if (Len->getType()->isPointerTy()) 2632 Len = Builder.CreatePtrToInt(Len, CmpFuncSizeTy); 2633 assert(Len->getType() == CmpFuncSizeTy && "Should have correct type now."); 2634 2635 Len->setName(Len->getName() + ".bytecount"); 2636 2637 // There is no legality check needed. We want to compare that the memory 2638 // regions [PtrA, PtrA+Len) and [PtrB, PtrB+Len) are fully identical, equal. 
2639 // For them to be fully equal, they must match bit-by-bit. And likewise, 2640 // for them to *NOT* be fully equal, they have to differ just by one bit. 2641 // The step of comparison (bits compared at once) simply does not matter. 2642 } 2643 2644 // For the rest of new instructions, dbgloc should point at the value cmp. 2645 Builder.SetCurrentDebugLocation(BCmpInst->getDebugLoc()); 2646 2647 // Emit the comparison itself. 2648 auto *CmpCall = 2649 cast<CallInst>(HasBCmp ? emitBCmp(PtrA, PtrB, Len, Builder, *DL, TLI) 2650 : emitMemCmp(PtrA, PtrB, Len, Builder, *DL, TLI)); 2651 // FIXME: add {B,Mem}CmpInst with MemoryCompareInst 2652 // (based on MemIntrinsicBase) as base? 2653 // FIXME: propagate metadata from loads? (alignments, AS, TBAA, ...) 2654 2655 // {b,mem}cmp returned 0 if they were equal, or non-zero if not equal. 2656 auto *ComparedEqual = cast<ICmpInst>(Builder.CreateICmpEQ( 2657 CmpCall, ConstantInt::get(CmpCall->getType(), 0), 2658 PtrA->getName() + ".vs." + PtrB->getName() + ".eqcmp")); 2659 2660 BasicBlock *BB = transformBCmpControlFlow(ComparedEqual); 2661 Builder.ClearInsertionPoint(); 2662 2663 // We're done. 2664 LLVM_DEBUG(dbgs() << "Transformed loop bcmp idiom into a call.\n"); 2665 ORE.emit([&]() { 2666 return OptimizationRemark(DEBUG_TYPE, "TransformedBCmpIdiomToCall", 2667 CmpCall->getDebugLoc(), BB) 2668 << "Transformed bcmp idiom into a call to " 2669 << ore::NV("NewFunction", CmpCall->getCalledFunction()) 2670 << "() function"; 2671 }); 2672 ++NumBCmp; 2673 } 2674 2675 /// Recognizes a bcmp idiom in a non-countable loop. 2676 /// 2677 /// If detected, transforms the relevant code to issue the bcmp (or memcmp) 2678 /// intrinsic function call, and returns true; otherwise, returns false. 2679 bool LoopIdiomRecognize::recognizeBCmp() { 2680 if (!HasMemCmp && !HasBCmp) 2681 return false; 2682 2683 ICmpInst *BCmpInst; 2684 CmpInst *LatchCmpInst; 2685 LoadInst *LoadA, *LoadB; 2686 const SCEV *SrcA, *SrcB, *NBytes; 2687 if (!detectBCmpIdiom(BCmpInst, LatchCmpInst, LoadA, LoadB, SrcA, SrcB, 2688 NBytes)) { 2689 LLVM_DEBUG(dbgs() << "bcmp idiom recognition failed.\n"); 2690 return false; 2691 } 2692 2693 transformLoopToBCmp(BCmpInst, LatchCmpInst, LoadA, LoadB, SrcA, SrcB, NBytes); 2694 return true; 2695 } 2696