//===- LoopAccessAnalysis.cpp - Loop Access Analysis Implementation ------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// The implementation for the loop memory dependence analysis that was
// originally developed for the loop vectorizer.
//
//===----------------------------------------------------------------------===//

#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/EquivalenceClasses.h"
#include "llvm/ADT/PointerIntPair.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/AliasSetTracker.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/MemoryLocation.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/ScalarEvolutionPatternMatch.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <iterator>
#include <utility>
#include <variant>
#include <vector>

using namespace llvm;
using namespace llvm::SCEVPatternMatch;

#define DEBUG_TYPE "loop-accesses"

static cl::opt<unsigned, true>
    VectorizationFactor("force-vector-width", cl::Hidden,
                        cl::desc("Sets the SIMD width. Zero is autoselect."),
                        cl::location(VectorizerParams::VectorizationFactor));
unsigned VectorizerParams::VectorizationFactor;

static cl::opt<unsigned, true>
    VectorizationInterleave("force-vector-interleave", cl::Hidden,
                            cl::desc("Sets the vectorization interleave count. "
" 81 "Zero is autoselect."), 82 cl::location( 83 VectorizerParams::VectorizationInterleave)); 84 unsigned VectorizerParams::VectorizationInterleave; 85 86 static cl::opt<unsigned, true> RuntimeMemoryCheckThreshold( 87 "runtime-memory-check-threshold", cl::Hidden, 88 cl::desc("When performing memory disambiguation checks at runtime do not " 89 "generate more than this number of comparisons (default = 8)."), 90 cl::location(VectorizerParams::RuntimeMemoryCheckThreshold), cl::init(8)); 91 unsigned VectorizerParams::RuntimeMemoryCheckThreshold; 92 93 /// The maximum iterations used to merge memory checks 94 static cl::opt<unsigned> MemoryCheckMergeThreshold( 95 "memory-check-merge-threshold", cl::Hidden, 96 cl::desc("Maximum number of comparisons done when trying to merge " 97 "runtime memory checks. (default = 100)"), 98 cl::init(100)); 99 100 /// Maximum SIMD width. 101 const unsigned VectorizerParams::MaxVectorWidth = 64; 102 103 /// We collect dependences up to this threshold. 104 static cl::opt<unsigned> 105 MaxDependences("max-dependences", cl::Hidden, 106 cl::desc("Maximum number of dependences collected by " 107 "loop-access analysis (default = 100)"), 108 cl::init(100)); 109 110 /// This enables versioning on the strides of symbolically striding memory 111 /// accesses in code like the following. 112 /// for (i = 0; i < N; ++i) 113 /// A[i * Stride1] += B[i * Stride2] ... 114 /// 115 /// Will be roughly translated to 116 /// if (Stride1 == 1 && Stride2 == 1) { 117 /// for (i = 0; i < N; i+=4) 118 /// A[i:i+3] += ... 119 /// } else 120 /// ... 121 static cl::opt<bool> EnableMemAccessVersioning( 122 "enable-mem-access-versioning", cl::init(true), cl::Hidden, 123 cl::desc("Enable symbolic stride memory access versioning")); 124 125 /// Enable store-to-load forwarding conflict detection. This option can 126 /// be disabled for correctness testing. 127 static cl::opt<bool> EnableForwardingConflictDetection( 128 "store-to-load-forwarding-conflict-detection", cl::Hidden, 129 cl::desc("Enable conflict detection in loop-access analysis"), 130 cl::init(true)); 131 132 static cl::opt<unsigned> MaxForkedSCEVDepth( 133 "max-forked-scev-depth", cl::Hidden, 134 cl::desc("Maximum recursion depth when finding forked SCEVs (default = 5)"), 135 cl::init(5)); 136 137 static cl::opt<bool> SpeculateUnitStride( 138 "laa-speculate-unit-stride", cl::Hidden, 139 cl::desc("Speculate that non-constant strides are unit in LAA"), 140 cl::init(true)); 141 142 static cl::opt<bool, true> HoistRuntimeChecks( 143 "hoist-runtime-checks", cl::Hidden, 144 cl::desc( 145 "Hoist inner loop runtime memory checks to outer loop if possible"), 146 cl::location(VectorizerParams::HoistRuntimeChecks), cl::init(true)); 147 bool VectorizerParams::HoistRuntimeChecks; 148 149 bool VectorizerParams::isInterleaveForced() { 150 return ::VectorizationInterleave.getNumOccurrences() > 0; 151 } 152 153 const SCEV *llvm::replaceSymbolicStrideSCEV(PredicatedScalarEvolution &PSE, 154 const DenseMap<Value *, const SCEV *> &PtrToStride, 155 Value *Ptr) { 156 const SCEV *OrigSCEV = PSE.getSCEV(Ptr); 157 158 // If there is an entry in the map return the SCEV of the pointer with the 159 // symbolic stride replaced by one. 160 const SCEV *StrideSCEV = PtrToStride.lookup(Ptr); 161 if (!StrideSCEV) 162 // For a non-symbolic stride, just return the original expression. 163 return OrigSCEV; 164 165 // Note: This assert is both overly strong and overly weak. The actual 166 // invariant here is that StrideSCEV should be loop invariant. 
  // such invariant strides we happen to speculate right now are unknowns
  // and thus this is a reasonable proxy of the actual invariant.
  assert(isa<SCEVUnknown>(StrideSCEV) && "shouldn't be in map");

  ScalarEvolution *SE = PSE.getSE();
  const SCEV *CT = SE->getOne(StrideSCEV->getType());
  PSE.addPredicate(*SE->getEqualPredicate(StrideSCEV, CT));
  const SCEV *Expr = PSE.getSCEV(Ptr);

  LLVM_DEBUG(dbgs() << "LAA: Replacing SCEV: " << *OrigSCEV
                    << " by: " << *Expr << "\n");
  return Expr;
}

RuntimeCheckingPtrGroup::RuntimeCheckingPtrGroup(
    unsigned Index, const RuntimePointerChecking &RtCheck)
    : High(RtCheck.Pointers[Index].End), Low(RtCheck.Pointers[Index].Start),
      AddressSpace(RtCheck.Pointers[Index]
                       .PointerValue->getType()
                       ->getPointerAddressSpace()),
      NeedsFreeze(RtCheck.Pointers[Index].NeedsFreeze) {
  Members.push_back(Index);
}

/// Returns \p A + \p B, if it is guaranteed not to unsigned wrap. Otherwise
/// return nullptr. \p A and \p B must have the same type.
static const SCEV *addSCEVNoOverflow(const SCEV *A, const SCEV *B,
                                     ScalarEvolution &SE) {
  if (!SE.willNotOverflow(Instruction::Add, /*IsSigned=*/false, A, B))
    return nullptr;
  return SE.getAddExpr(A, B);
}

/// Returns \p A * \p B, if it is guaranteed not to unsigned wrap. Otherwise
/// return nullptr. \p A and \p B must have the same type.
static const SCEV *mulSCEVOverflow(const SCEV *A, const SCEV *B,
                                   ScalarEvolution &SE) {
  if (!SE.willNotOverflow(Instruction::Mul, /*IsSigned=*/false, A, B))
    return nullptr;
  return SE.getMulExpr(A, B);
}

/// Return true, if evaluating \p AR at \p MaxBTC cannot wrap, because \p AR at
/// \p MaxBTC is guaranteed inbounds of the accessed object.
static bool evaluatePtrAddRecAtMaxBTCWillNotWrap(const SCEVAddRecExpr *AR,
                                                 const SCEV *MaxBTC,
                                                 const SCEV *EltSize,
                                                 ScalarEvolution &SE,
                                                 const DataLayout &DL) {
  auto *PointerBase = SE.getPointerBase(AR->getStart());
  auto *StartPtr = dyn_cast<SCEVUnknown>(PointerBase);
  if (!StartPtr)
    return false;
  bool CheckForNonNull, CheckForFreed;
  uint64_t DerefBytes = StartPtr->getValue()->getPointerDereferenceableBytes(
      DL, CheckForNonNull, CheckForFreed);

  if (CheckForNonNull || CheckForFreed)
    return false;

  const SCEV *Step = AR->getStepRecurrence(SE);
  bool IsKnownNonNegative = SE.isKnownNonNegative(Step);
  if (!IsKnownNonNegative && !SE.isKnownNegative(Step))
    return false;

  Type *WiderTy = SE.getWiderType(MaxBTC->getType(), Step->getType());
  Step = SE.getNoopOrSignExtend(Step, WiderTy);
  MaxBTC = SE.getNoopOrZeroExtend(MaxBTC, WiderTy);

  // For the computations below, make sure they don't unsigned wrap.
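  // If AR's start is not known to be at or past the object's base, the
  // difference taken below could be negative and its zero-extension
  // meaningless; bail out in that case.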
  if (!SE.isKnownPredicate(CmpInst::ICMP_UGE, AR->getStart(), StartPtr))
    return false;
  const SCEV *StartOffset = SE.getNoopOrZeroExtend(
      SE.getMinusSCEV(AR->getStart(), StartPtr), WiderTy);

  const SCEV *OffsetAtLastIter =
      mulSCEVOverflow(MaxBTC, SE.getAbsExpr(Step, /*IsNSW=*/false), SE);
  if (!OffsetAtLastIter)
    return false;

  const SCEV *OffsetEndBytes = addSCEVNoOverflow(
      OffsetAtLastIter, SE.getNoopOrZeroExtend(EltSize, WiderTy), SE);
  if (!OffsetEndBytes)
    return false;

  if (IsKnownNonNegative) {
    // For positive steps, check if
    //   (AR->getStart() - StartPtr) + (MaxBTC * Step) + EltSize <= DerefBytes,
    // while making sure none of the computations unsigned wrap themselves.
    const SCEV *EndBytes = addSCEVNoOverflow(StartOffset, OffsetEndBytes, SE);
    if (!EndBytes)
      return false;
    return SE.isKnownPredicate(CmpInst::ICMP_ULE, EndBytes,
                               SE.getConstant(WiderTy, DerefBytes));
  }

  // For negative steps check if
  //  * StartOffset >= (MaxBTC * Step + EltSize)
  //  * StartOffset <= DerefBytes.
  assert(SE.isKnownNegative(Step) && "must be known negative");
  return SE.isKnownPredicate(CmpInst::ICMP_SGE, StartOffset, OffsetEndBytes) &&
         SE.isKnownPredicate(CmpInst::ICMP_ULE, StartOffset,
                             SE.getConstant(WiderTy, DerefBytes));
}

std::pair<const SCEV *, const SCEV *> llvm::getStartAndEndForAccess(
    const Loop *Lp, const SCEV *PtrExpr, Type *AccessTy, const SCEV *BTC,
    const SCEV *MaxBTC, ScalarEvolution *SE,
    DenseMap<std::pair<const SCEV *, Type *>,
             std::pair<const SCEV *, const SCEV *>> *PointerBounds) {
  std::pair<const SCEV *, const SCEV *> *PtrBoundsPair;
  if (PointerBounds) {
    auto [Iter, Ins] = PointerBounds->insert(
        {{PtrExpr, AccessTy},
         {SE->getCouldNotCompute(), SE->getCouldNotCompute()}});
    if (!Ins)
      return Iter->second;
    PtrBoundsPair = &Iter->second;
  }

  const SCEV *ScStart;
  const SCEV *ScEnd;

  auto &DL = Lp->getHeader()->getDataLayout();
  Type *IdxTy = DL.getIndexType(PtrExpr->getType());
  const SCEV *EltSizeSCEV = SE->getStoreSizeOfExpr(IdxTy, AccessTy);
  if (SE->isLoopInvariant(PtrExpr, Lp)) {
    ScStart = ScEnd = PtrExpr;
  } else if (auto *AR = dyn_cast<SCEVAddRecExpr>(PtrExpr)) {
    ScStart = AR->getStart();
    if (!isa<SCEVCouldNotCompute>(BTC))
      // Evaluating AR at an exact BTC is safe: LAA separately checks that
      // accesses cannot wrap in the loop. If evaluating AR at BTC wraps, then
      // the loop either triggers UB when executing a memory access with a
      // poison pointer or the wrapping/poisoned pointer is not used.
      ScEnd = AR->evaluateAtIteration(BTC, *SE);
    else {
      // Evaluating AR at MaxBTC may wrap and create an expression that is less
      // than the start of the AddRec due to wrapping (for example consider
      // MaxBTC = -2). If that's the case, set ScEnd to -(EltSize + 1). ScEnd
      // will get incremented by EltSize before returning, so this effectively
      // sets ScEnd to the maximum unsigned value for the type. Note that LAA
      // separately checks that accesses cannot wrap, so unsigned max
      // represents an upper bound.
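      // For example, with EltSize = 4 the fallback below produces
      // ScEnd = -5; adding EltSize at the end of this function yields -1,
      // the unsigned maximum of the index type.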
      if (evaluatePtrAddRecAtMaxBTCWillNotWrap(AR, MaxBTC, EltSizeSCEV, *SE,
                                               DL)) {
        ScEnd = AR->evaluateAtIteration(MaxBTC, *SE);
      } else {
        ScEnd = SE->getAddExpr(
            SE->getNegativeSCEV(EltSizeSCEV),
            SE->getSCEV(ConstantExpr::getIntToPtr(
                ConstantInt::get(EltSizeSCEV->getType(), -1), AR->getType())));
      }
    }
    const SCEV *Step = AR->getStepRecurrence(*SE);

    // For expressions with negative step, the upper bound is ScStart and the
    // lower bound is ScEnd.
    if (const auto *CStep = dyn_cast<SCEVConstant>(Step)) {
      if (CStep->getValue()->isNegative())
        std::swap(ScStart, ScEnd);
    } else {
      // Fallback case: the step is not constant, but we can still
      // get the upper and lower bounds of the interval by using min/max
      // expressions.
      ScStart = SE->getUMinExpr(ScStart, ScEnd);
      ScEnd = SE->getUMaxExpr(AR->getStart(), ScEnd);
    }
  } else
    return {SE->getCouldNotCompute(), SE->getCouldNotCompute()};

  assert(SE->isLoopInvariant(ScStart, Lp) && "ScStart needs to be invariant");
  assert(SE->isLoopInvariant(ScEnd, Lp) && "ScEnd needs to be invariant");

  // Add the size of the pointed element to ScEnd.
  ScEnd = SE->getAddExpr(ScEnd, EltSizeSCEV);

  std::pair<const SCEV *, const SCEV *> Res = {ScStart, ScEnd};
  if (PointerBounds)
    *PtrBoundsPair = Res;
  return Res;
}

/// Calculate Start and End points of memory access using
/// getStartAndEndForAccess.
void RuntimePointerChecking::insert(Loop *Lp, Value *Ptr, const SCEV *PtrExpr,
                                    Type *AccessTy, bool WritePtr,
                                    unsigned DepSetId, unsigned ASId,
                                    PredicatedScalarEvolution &PSE,
                                    bool NeedsFreeze) {
  const SCEV *SymbolicMaxBTC = PSE.getSymbolicMaxBackedgeTakenCount();
  const SCEV *BTC = PSE.getBackedgeTakenCount();
  const auto &[ScStart, ScEnd] =
      getStartAndEndForAccess(Lp, PtrExpr, AccessTy, BTC, SymbolicMaxBTC,
                              PSE.getSE(), &DC.getPointerBounds());
  assert(!isa<SCEVCouldNotCompute>(ScStart) &&
         !isa<SCEVCouldNotCompute>(ScEnd) &&
         "must be able to compute both start and end expressions");
  Pointers.emplace_back(Ptr, ScStart, ScEnd, WritePtr, DepSetId, ASId, PtrExpr,
                        NeedsFreeze);
}

bool RuntimePointerChecking::tryToCreateDiffCheck(
    const RuntimeCheckingPtrGroup &CGI, const RuntimeCheckingPtrGroup &CGJ) {
  // If either group contains multiple different pointers, bail out.
  // TODO: Support multiple pointers by using the minimum or maximum pointer,
  // depending on src & sink.
  if (CGI.Members.size() != 1 || CGJ.Members.size() != 1)
    return false;

  const PointerInfo *Src = &Pointers[CGI.Members[0]];
  const PointerInfo *Sink = &Pointers[CGJ.Members[0]];

  // If either pointer is read and written, multiple checks may be needed. Bail
  // out.
  if (!DC.getOrderForAccess(Src->PointerValue, !Src->IsWritePtr).empty() ||
      !DC.getOrderForAccess(Sink->PointerValue, !Sink->IsWritePtr).empty())
    return false;

  ArrayRef<unsigned> AccSrc =
      DC.getOrderForAccess(Src->PointerValue, Src->IsWritePtr);
  ArrayRef<unsigned> AccSink =
      DC.getOrderForAccess(Sink->PointerValue, Sink->IsWritePtr);
  // If either pointer is accessed multiple times, there may not be a clear
  // src/sink relation. Bail out for now.
  if (AccSrc.size() != 1 || AccSink.size() != 1)
    return false;

  // If the sink is accessed before src, swap src/sink.
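  // The order indices reflect program order, so after this swap Src is always
  // the access that executes first.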
  if (AccSink[0] < AccSrc[0])
    std::swap(Src, Sink);

  const SCEVConstant *Step;
  const SCEV *SrcStart;
  const SCEV *SinkStart;
  const Loop *InnerLoop = DC.getInnermostLoop();
  if (!match(Src->Expr,
             m_scev_AffineAddRec(m_SCEV(SrcStart), m_SCEVConstant(Step),
                                 m_SpecificLoop(InnerLoop))) ||
      !match(Sink->Expr,
             m_scev_AffineAddRec(m_SCEV(SinkStart), m_scev_Specific(Step),
                                 m_SpecificLoop(InnerLoop))))
    return false;

  SmallVector<Instruction *, 4> SrcInsts =
      DC.getInstructionsForAccess(Src->PointerValue, Src->IsWritePtr);
  SmallVector<Instruction *, 4> SinkInsts =
      DC.getInstructionsForAccess(Sink->PointerValue, Sink->IsWritePtr);
  Type *SrcTy = getLoadStoreType(SrcInsts[0]);
  Type *DstTy = getLoadStoreType(SinkInsts[0]);
  if (isa<ScalableVectorType>(SrcTy) || isa<ScalableVectorType>(DstTy))
    return false;

  const DataLayout &DL = InnerLoop->getHeader()->getDataLayout();
  unsigned AllocSize =
      std::max(DL.getTypeAllocSize(SrcTy), DL.getTypeAllocSize(DstTy));

  // Only constant steps matching the AllocSize are supported at the moment.
  // This simplifies the difference computation. Can be extended in the future.
  if (Step->getAPInt().abs() != AllocSize)
    return false;

  IntegerType *IntTy =
      IntegerType::get(Src->PointerValue->getContext(),
                       DL.getPointerSizeInBits(CGI.AddressSpace));

  // When counting down, the dependence distance needs to be swapped.
  if (Step->getValue()->isNegative())
    std::swap(SinkStart, SrcStart);

  const SCEV *SinkStartInt = SE->getPtrToIntExpr(SinkStart, IntTy);
  const SCEV *SrcStartInt = SE->getPtrToIntExpr(SrcStart, IntTy);
  if (isa<SCEVCouldNotCompute>(SinkStartInt) ||
      isa<SCEVCouldNotCompute>(SrcStartInt))
    return false;

  // If the start values for both Src and Sink also vary according to an outer
  // loop, then it's probably better to avoid creating diff checks because
  // they may not be hoisted. We should instead let llvm::addRuntimeChecks
  // do the expanded full range overlap checks, which can be hoisted.
  if (HoistRuntimeChecks && InnerLoop->getParentLoop() &&
      isa<SCEVAddRecExpr>(SinkStartInt) && isa<SCEVAddRecExpr>(SrcStartInt)) {
    auto *SrcStartAR = cast<SCEVAddRecExpr>(SrcStartInt);
    auto *SinkStartAR = cast<SCEVAddRecExpr>(SinkStartInt);
    const Loop *StartARLoop = SrcStartAR->getLoop();
    if (StartARLoop == SinkStartAR->getLoop() &&
        StartARLoop == InnerLoop->getParentLoop() &&
        // If the diff check would already be loop invariant (due to the
        // recurrences being the same), then we prefer to keep the diff checks
        // because they are cheaper.
        SrcStartAR->getStepRecurrence(*SE) !=
            SinkStartAR->getStepRecurrence(*SE)) {
      LLVM_DEBUG(dbgs() << "LAA: Not creating diff runtime check, since these "
                           "cannot be hoisted out of the outer loop\n");
      return false;
    }
  }

  LLVM_DEBUG(dbgs() << "LAA: Creating diff runtime check for:\n"
                    << "SrcStart: " << *SrcStartInt << '\n'
                    << "SinkStartInt: " << *SinkStartInt << '\n');
  DiffChecks.emplace_back(SrcStartInt, SinkStartInt, AllocSize,
                          Src->NeedsFreeze || Sink->NeedsFreeze);
  return true;
}

SmallVector<RuntimePointerCheck, 4> RuntimePointerChecking::generateChecks() {
  SmallVector<RuntimePointerCheck, 4> Checks;

  for (unsigned I = 0; I < CheckingGroups.size(); ++I) {
    for (unsigned J = I + 1; J < CheckingGroups.size(); ++J) {
      const RuntimeCheckingPtrGroup &CGI = CheckingGroups[I];
      const RuntimeCheckingPtrGroup &CGJ = CheckingGroups[J];

      if (needsChecking(CGI, CGJ)) {
        CanUseDiffCheck = CanUseDiffCheck && tryToCreateDiffCheck(CGI, CGJ);
        Checks.emplace_back(&CGI, &CGJ);
      }
    }
  }
  return Checks;
}

void RuntimePointerChecking::generateChecks(
    MemoryDepChecker::DepCandidates &DepCands, bool UseDependencies) {
  assert(Checks.empty() && "Checks is not empty");
  groupChecks(DepCands, UseDependencies);
  Checks = generateChecks();
}

bool RuntimePointerChecking::needsChecking(
    const RuntimeCheckingPtrGroup &M, const RuntimeCheckingPtrGroup &N) const {
  for (const auto &I : M.Members)
    for (const auto &J : N.Members)
      if (needsChecking(I, J))
        return true;
  return false;
}

/// Compare \p I and \p J and return the minimum.
/// Return nullptr in case we couldn't find an answer.
static const SCEV *getMinFromExprs(const SCEV *I, const SCEV *J,
                                   ScalarEvolution *SE) {
  std::optional<APInt> Diff = SE->computeConstantDifference(J, I);
  if (!Diff)
    return nullptr;
  return Diff->isNegative() ? J : I;
}

bool RuntimeCheckingPtrGroup::addPointer(
    unsigned Index, const RuntimePointerChecking &RtCheck) {
  return addPointer(
      Index, RtCheck.Pointers[Index].Start, RtCheck.Pointers[Index].End,
      RtCheck.Pointers[Index].PointerValue->getType()->getPointerAddressSpace(),
      RtCheck.Pointers[Index].NeedsFreeze, *RtCheck.SE);
}

bool RuntimeCheckingPtrGroup::addPointer(unsigned Index, const SCEV *Start,
                                         const SCEV *End, unsigned AS,
                                         bool NeedsFreeze,
                                         ScalarEvolution &SE) {
  assert(AddressSpace == AS &&
         "all pointers in a checking group must be in the same address space");

  // Compare the starts and ends with the known minimum and maximum
  // of this set. We need to know how we compare against the min/max
  // of the set in order to be able to emit memchecks.
  const SCEV *Min0 = getMinFromExprs(Start, Low, &SE);
  if (!Min0)
    return false;

  const SCEV *Min1 = getMinFromExprs(End, High, &SE);
  if (!Min1)
    return false;

  // Update the low bound expression if we've found a new min value.
  if (Min0 == Start)
    Low = Start;

  // Update the high bound expression if we've found a new max value.
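  // Min1 != End means getMinFromExprs picked High as the smaller of the two,
  // i.e. End is a new maximum.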
  if (Min1 != End)
    High = End;

  Members.push_back(Index);
  this->NeedsFreeze |= NeedsFreeze;
  return true;
}

void RuntimePointerChecking::groupChecks(
    MemoryDepChecker::DepCandidates &DepCands, bool UseDependencies) {
  // We build the groups from dependency candidates equivalence classes
  // because:
  //    - We know that pointers in the same equivalence class share
  //      the same underlying object and therefore there is a chance
  //      that we can compare pointers
  //    - We wouldn't be able to merge two pointers for which we need
  //      to emit a memcheck. The classes in DepCands are already
  //      conveniently built such that no two pointers in the same
  //      class need checking against each other.

  // We use the following (greedy) algorithm to construct the groups
  // For every pointer in the equivalence class:
  //   For each existing group:
  //   - if the difference between this pointer and the min/max bounds
  //     of the group is a constant, then make the pointer part of the
  //     group and update the min/max bounds of that group as required.

  CheckingGroups.clear();

  // If we need to check two pointers to the same underlying object
  // with a non-constant difference, we shouldn't perform any pointer
  // grouping with those pointers. This is because we can easily get
  // into cases where the resulting check would return false, even when
  // the accesses are safe.
  //
  // The following example shows this:
  // for (i = 0; i < 1000; ++i)
  //   a[5000 + i * m] = a[i] + a[i + 9000]
  //
  // Here grouping gives a check of (5000, 5000 + 1000 * m) against
  // (0, 10000) which is always false. However, if m is 1, there is no
  // dependence. Not grouping the checks for a[i] and a[i + 9000] allows
  // us to perform an accurate check in this case.
  //
  // The above case requires that we have an UnknownDependence between
  // accesses to the same underlying object. This cannot happen unless
  // FoundNonConstantDistanceDependence is set, and therefore UseDependencies
  // is also false. In this case we will use the fallback path and create
  // separate checking groups for all pointers.

  // If we don't have the dependency partitions, construct a new
  // checking pointer group for each pointer. This is also required
  // for correctness, because in this case we can have checking between
  // pointers to the same underlying object.
  if (!UseDependencies) {
    for (unsigned I = 0; I < Pointers.size(); ++I)
      CheckingGroups.emplace_back(I, *this);
    return;
  }

  unsigned TotalComparisons = 0;

  DenseMap<Value *, SmallVector<unsigned>> PositionMap;
  for (unsigned Index = 0; Index < Pointers.size(); ++Index)
    PositionMap[Pointers[Index].PointerValue].push_back(Index);

  // We need to keep track of what pointers we've already seen so we
  // don't process them twice.
  SmallSet<unsigned, 2> Seen;

  // Go through all equivalence classes, get the "pointer check groups"
  // and add them to the overall solution. We use the order in which accesses
  // appear in 'Pointers' to enforce determinism.
  for (unsigned I = 0; I < Pointers.size(); ++I) {
    // We've seen this pointer before, and therefore already processed
    // its equivalence class.
    if (Seen.contains(I))
      continue;

    MemoryDepChecker::MemAccessInfo Access(Pointers[I].PointerValue,
                                           Pointers[I].IsWritePtr);

    SmallVector<RuntimeCheckingPtrGroup, 2> Groups;

    // Because DepCands is constructed by visiting accesses in the order in
    // which they appear in alias sets (which is deterministic) and the
    // iteration order within an equivalence class member is only dependent on
    // the order in which unions and insertions are performed on the
    // equivalence class, the iteration order is deterministic.
    for (auto M : DepCands.members(Access)) {
      auto PointerI = PositionMap.find(M.getPointer());
      // If we can't find the pointer in PositionMap that means we can't
      // generate a memcheck for it.
      if (PointerI == PositionMap.end())
        continue;
      for (unsigned Pointer : PointerI->second) {
        bool Merged = false;
        // Mark this pointer as seen.
        Seen.insert(Pointer);

        // Go through all the existing sets and see if we can find one
        // which can include this pointer.
        for (RuntimeCheckingPtrGroup &Group : Groups) {
          // Don't perform more than a certain amount of comparisons.
          // This should limit the cost of grouping the pointers to something
          // reasonable. If we do end up hitting this threshold, the algorithm
          // will create separate groups for all remaining pointers.
          if (TotalComparisons > MemoryCheckMergeThreshold)
            break;

          TotalComparisons++;

          if (Group.addPointer(Pointer, *this)) {
            Merged = true;
            break;
          }
        }

        if (!Merged)
          // We couldn't add this pointer to any existing set or the threshold
          // for the number of comparisons has been reached. Create a new group
          // to hold the current pointer.
          Groups.emplace_back(Pointer, *this);
      }
    }

    // We've computed the grouped checks for this partition.
    // Save the results and continue with the next one.
    llvm::append_range(CheckingGroups, Groups);
  }
}

bool RuntimePointerChecking::arePointersInSamePartition(
    const SmallVectorImpl<int> &PtrToPartition, unsigned PtrIdx1,
    unsigned PtrIdx2) {
  return (PtrToPartition[PtrIdx1] != -1 &&
          PtrToPartition[PtrIdx1] == PtrToPartition[PtrIdx2]);
}

bool RuntimePointerChecking::needsChecking(unsigned I, unsigned J) const {
  const PointerInfo &PointerI = Pointers[I];
  const PointerInfo &PointerJ = Pointers[J];

  // No need to check if two readonly pointers intersect.
  if (!PointerI.IsWritePtr && !PointerJ.IsWritePtr)
    return false;

  // Only need to check pointers between two different dependency sets.
  if (PointerI.DependencySetId == PointerJ.DependencySetId)
    return false;

  // Only need to check pointers in the same alias set.
  return PointerI.AliasSetId == PointerJ.AliasSetId;
}

/// Assign each RuntimeCheckingPtrGroup pointer an index for stable UTC output.
static DenseMap<const RuntimeCheckingPtrGroup *, unsigned>
getPtrToIdxMap(ArrayRef<RuntimeCheckingPtrGroup> CheckingGroups) {
  DenseMap<const RuntimeCheckingPtrGroup *, unsigned> PtrIndices;
  for (const auto &[Idx, CG] : enumerate(CheckingGroups))
    PtrIndices[&CG] = Idx;
  return PtrIndices;
}

void RuntimePointerChecking::printChecks(
    raw_ostream &OS, const SmallVectorImpl<RuntimePointerCheck> &Checks,
    unsigned Depth) const {
  unsigned N = 0;
  auto PtrIndices = getPtrToIdxMap(CheckingGroups);
  for (const auto &[Check1, Check2] : Checks) {
    const auto &First = Check1->Members, &Second = Check2->Members;
    OS.indent(Depth) << "Check " << N++ << ":\n";
    OS.indent(Depth + 2) << "Comparing group GRP" << PtrIndices.at(Check1)
                         << ":\n";
    for (unsigned K : First)
      OS.indent(Depth + 2) << *Pointers[K].PointerValue << "\n";
    OS.indent(Depth + 2) << "Against group GRP" << PtrIndices.at(Check2)
                         << ":\n";
    for (unsigned K : Second)
      OS.indent(Depth + 2) << *Pointers[K].PointerValue << "\n";
  }
}

void RuntimePointerChecking::print(raw_ostream &OS, unsigned Depth) const {
  OS.indent(Depth) << "Run-time memory checks:\n";
  printChecks(OS, Checks, Depth);

  OS.indent(Depth) << "Grouped accesses:\n";
  auto PtrIndices = getPtrToIdxMap(CheckingGroups);
  for (const auto &CG : CheckingGroups) {
    OS.indent(Depth + 2) << "Group GRP" << PtrIndices.at(&CG) << ":\n";
    OS.indent(Depth + 4) << "(Low: " << *CG.Low << " High: " << *CG.High
                         << ")\n";
    for (unsigned Member : CG.Members) {
      OS.indent(Depth + 6) << "Member: " << *Pointers[Member].Expr << "\n";
    }
  }
}

namespace {

/// Analyses memory accesses in a loop.
///
/// Checks whether run time pointer checks are needed and builds sets for data
/// dependence checking.
class AccessAnalysis {
public:
  /// Read or write access location.
  typedef PointerIntPair<Value *, 1, bool> MemAccessInfo;
  typedef SmallVector<MemAccessInfo, 8> MemAccessInfoList;

  AccessAnalysis(const Loop *TheLoop, AAResults *AA, const LoopInfo *LI,
                 MemoryDepChecker::DepCandidates &DA,
                 PredicatedScalarEvolution &PSE,
                 SmallPtrSetImpl<MDNode *> &LoopAliasScopes)
      : TheLoop(TheLoop), BAA(*AA), AST(BAA), LI(LI), DepCands(DA), PSE(PSE),
        LoopAliasScopes(LoopAliasScopes) {
    // We're analyzing dependences across loop iterations.
    BAA.enableCrossIterationMode();
  }

  /// Register a load and whether it is only read from.
  void addLoad(const MemoryLocation &Loc, Type *AccessTy, bool IsReadOnly) {
    Value *Ptr = const_cast<Value *>(Loc.Ptr);
    AST.add(adjustLoc(Loc));
    Accesses[MemAccessInfo(Ptr, false)].insert(AccessTy);
    if (IsReadOnly)
      ReadOnlyPtr.insert(Ptr);
  }

  /// Register a store.
  void addStore(const MemoryLocation &Loc, Type *AccessTy) {
    Value *Ptr = const_cast<Value *>(Loc.Ptr);
    AST.add(adjustLoc(Loc));
    Accesses[MemAccessInfo(Ptr, true)].insert(AccessTy);
  }

  /// Check if we can emit a run-time no-alias check for \p Access.
  ///
  /// Returns true if we can emit a run-time no alias check for \p Access.
  /// If we can check this access, this also adds it to a dependence set and
  /// adds a run-time check for it to \p RtCheck. If \p Assume is true,
  /// we will attempt to use additional run-time checks in order to get
  /// the bounds of the pointer.
  bool createCheckForAccess(RuntimePointerChecking &RtCheck,
                            MemAccessInfo Access, Type *AccessTy,
                            const DenseMap<Value *, const SCEV *> &Strides,
                            DenseMap<Value *, unsigned> &DepSetId,
                            Loop *TheLoop, unsigned &RunningDepId,
                            unsigned ASId, bool Assume);

  /// Check whether we can check the pointers at runtime for
  /// non-intersection.
  ///
  /// Returns true if we need no check or if we do and we can generate them
  /// (i.e. the pointers have computable bounds). A return value of false means
  /// we couldn't analyze and generate runtime checks for all pointers in the
  /// loop, but if \p AllowPartial is set then we will have checks for those
  /// pointers we could analyze.
  bool canCheckPtrAtRT(RuntimePointerChecking &RtCheck, Loop *TheLoop,
                       const DenseMap<Value *, const SCEV *> &Strides,
                       Value *&UncomputablePtr, bool AllowPartial);

  /// Goes over all memory accesses, checks whether an RT check is needed
  /// and builds sets of dependent accesses.
  void buildDependenceSets() { processMemAccesses(); }

  /// Initial processing of memory accesses determined that we need to
  /// perform dependency checking.
  ///
  /// Note that this can later be cleared if we retry memcheck analysis without
  /// dependency checking (i.e. FoundNonConstantDistanceDependence).
  bool isDependencyCheckNeeded() const { return !CheckDeps.empty(); }

  /// We decided that no dependence analysis would be used. Reset the state.
  void resetDepChecks(MemoryDepChecker &DepChecker) {
    CheckDeps.clear();
    DepChecker.clearDependences();
  }

  const MemAccessInfoList &getDependenciesToCheck() const { return CheckDeps; }

private:
  typedef MapVector<MemAccessInfo, SmallSetVector<Type *, 1>> PtrAccessMap;

  /// Adjust the MemoryLocation so that it represents accesses to this
  /// location across all iterations, rather than a single one.
  MemoryLocation adjustLoc(MemoryLocation Loc) const {
    // The accessed location varies within the loop, but remains within the
    // underlying object.
    Loc.Size = LocationSize::beforeOrAfterPointer();
    Loc.AATags.Scope = adjustAliasScopeList(Loc.AATags.Scope);
    Loc.AATags.NoAlias = adjustAliasScopeList(Loc.AATags.NoAlias);
    return Loc;
  }

  /// Drop alias scopes that are only valid within a single loop iteration.
  MDNode *adjustAliasScopeList(MDNode *ScopeList) const {
    if (!ScopeList)
      return nullptr;

    // For the sake of simplicity, drop the whole scope list if any scope is
    // iteration-local.
    if (any_of(ScopeList->operands(), [&](Metadata *Scope) {
          return LoopAliasScopes.contains(cast<MDNode>(Scope));
        }))
      return nullptr;

    return ScopeList;
  }

  /// Go over all memory accesses and check whether runtime pointer checks
  /// are needed and build sets of dependency check candidates.
  void processMemAccesses();

  /// Map of all accesses. Values are the types used to access memory pointed
  /// to by the pointer.
  PtrAccessMap Accesses;

  /// The loop being checked.
  const Loop *TheLoop;

  /// List of accesses that need a further dependence check.
  MemAccessInfoList CheckDeps;

  /// Set of pointers that are read only.
  SmallPtrSet<Value *, 16> ReadOnlyPtr;

  /// Batched alias analysis results.
  BatchAAResults BAA;

  /// An alias set tracker to partition the access set by underlying object and
  /// intrinsic property (such as TBAA metadata).
  AliasSetTracker AST;

  /// The LoopInfo of the loop being checked.
  const LoopInfo *LI;

  /// Sets of potentially dependent accesses - members of one set share an
  /// underlying pointer. The set "CheckDeps" identifies which sets really need
  /// a dependence check.
  MemoryDepChecker::DepCandidates &DepCands;

  /// Initial processing of memory accesses determined that we may need
  /// to add memchecks. Perform the analysis to determine the necessary checks.
  ///
  /// Note that, this is different from isDependencyCheckNeeded. When we retry
  /// memcheck analysis without dependency checking
  /// (i.e. FoundNonConstantDistanceDependence), isDependencyCheckNeeded is
  /// cleared while this remains set if we have potentially dependent accesses.
  bool IsRTCheckAnalysisNeeded = false;

  /// The SCEV predicate containing all the SCEV-related assumptions.
  PredicatedScalarEvolution &PSE;

  DenseMap<Value *, SmallVector<const Value *, 16>> UnderlyingObjects;

  /// Alias scopes that are declared inside the loop, and as such not valid
  /// across iterations.
  SmallPtrSetImpl<MDNode *> &LoopAliasScopes;
};

} // end anonymous namespace

/// Try to compute a constant stride for \p AR. Used by getPtrStride and
/// isNoWrap.
static std::optional<int64_t>
getStrideFromAddRec(const SCEVAddRecExpr *AR, const Loop *Lp, Type *AccessTy,
                    Value *Ptr, PredicatedScalarEvolution &PSE) {
  // The access function must stride over the innermost loop.
  if (Lp != AR->getLoop()) {
    LLVM_DEBUG({
      dbgs() << "LAA: Bad stride - Not striding over innermost loop ";
      if (Ptr)
        dbgs() << *Ptr << " ";

      dbgs() << "SCEV: " << *AR << "\n";
    });
    return std::nullopt;
  }

  // Check the step is constant.
  const SCEV *Step = AR->getStepRecurrence(*PSE.getSE());

  // Calculate the pointer stride and check if it is constant.
  const APInt *APStepVal;
  if (!match(Step, m_scev_APInt(APStepVal))) {
    LLVM_DEBUG({
      dbgs() << "LAA: Bad stride - Not a constant strided ";
      if (Ptr)
        dbgs() << *Ptr << " ";
      dbgs() << "SCEV: " << *AR << "\n";
    });
    return std::nullopt;
  }

  const auto &DL = Lp->getHeader()->getDataLayout();
  TypeSize AllocSize = DL.getTypeAllocSize(AccessTy);
  int64_t Size = AllocSize.getFixedValue();

  // Huge step value - give up.
  std::optional<int64_t> StepVal = APStepVal->trySExtValue();
  if (!StepVal)
    return std::nullopt;

  // Strided access.
  return *StepVal % Size ? std::nullopt : std::make_optional(*StepVal / Size);
}

/// Check whether \p AR is a non-wrapping AddRec. If \p Ptr is not nullptr, use
/// information from the IR pointer value to determine no-wrap.
static bool isNoWrap(PredicatedScalarEvolution &PSE, const SCEVAddRecExpr *AR,
                     Value *Ptr, Type *AccessTy, const Loop *L, bool Assume,
                     std::optional<int64_t> Stride = std::nullopt) {
  // FIXME: This should probably only return true for NUW.
  if (AR->getNoWrapFlags(SCEV::NoWrapMask))
    return true;

  if (Ptr && PSE.hasNoOverflow(Ptr, SCEVWrapPredicate::IncrementNUSW))
    return true;

  // An nusw getelementptr that is an AddRec cannot wrap. If it would wrap,
  // the distance between the previously accessed location and the wrapped
  // location will be larger than half the pointer index type space. In that
  // case, the GEP would be poison and any memory access dependent on it would
  // be immediate UB when executed.
  if (auto *GEP = dyn_cast_if_present<GetElementPtrInst>(Ptr);
      GEP && GEP->hasNoUnsignedSignedWrap())
    return true;

  if (!Stride)
    Stride = getStrideFromAddRec(AR, L, AccessTy, Ptr, PSE);
  if (Stride) {
    // If the null pointer is undefined, then an access sequence which would
    // otherwise access it can be assumed not to unsigned wrap. Note that this
    // assumes the object in memory is aligned to the natural alignment.
    unsigned AddrSpace = AR->getType()->getPointerAddressSpace();
    if (!NullPointerIsDefined(L->getHeader()->getParent(), AddrSpace) &&
        (Stride == 1 || Stride == -1))
      return true;
  }

  if (Ptr && Assume) {
    PSE.setNoOverflow(Ptr, SCEVWrapPredicate::IncrementNUSW);
    LLVM_DEBUG(dbgs() << "LAA: Pointer may wrap:\n"
                      << "LAA: Pointer: " << *Ptr << "\n"
                      << "LAA: SCEV: " << *AR << "\n"
                      << "LAA: Added an overflow assumption\n");
    return true;
  }

  return false;
}

static void visitPointers(Value *StartPtr, const Loop &InnermostLoop,
                          function_ref<void(Value *)> AddPointer) {
  SmallPtrSet<Value *, 8> Visited;
  SmallVector<Value *> WorkList;
  WorkList.push_back(StartPtr);

  while (!WorkList.empty()) {
    Value *Ptr = WorkList.pop_back_val();
    if (!Visited.insert(Ptr).second)
      continue;
    auto *PN = dyn_cast<PHINode>(Ptr);
    // SCEV does not look through non-header PHIs inside the loop. Such phis
    // can be analyzed by adding separate accesses for each incoming pointer
    // value.
    if (PN && InnermostLoop.contains(PN->getParent()) &&
        PN->getParent() != InnermostLoop.getHeader()) {
      llvm::append_range(WorkList, PN->incoming_values());
    } else
      AddPointer(Ptr);
  }
}

// Walk back through the IR for a pointer, looking for a select like the
// following:
//
//  %offset = select i1 %cmp, i64 %a, i64 %b
//  %addr = getelementptr double, double* %base, i64 %offset
//  %ld = load double, double* %addr, align 8
//
// We won't be able to form a single SCEVAddRecExpr from this since the
// address for each loop iteration depends on %cmp. We could potentially
// produce multiple valid SCEVAddRecExprs, though, and check all of them for
// memory safety/aliasing if needed.
//
// If we encounter some IR we don't yet handle, or something obviously fine
// like a constant, then we just add the SCEV for that term to the list passed
// in by the caller. If we have a node that may potentially yield a valid
// SCEVAddRecExpr then we decompose it into parts and build the SCEV terms
// ourselves before adding to the list.
static void findForkedSCEVs(
    ScalarEvolution *SE, const Loop *L, Value *Ptr,
    SmallVectorImpl<PointerIntPair<const SCEV *, 1, bool>> &ScevList,
    unsigned Depth) {
  // If our Value is a SCEVAddRecExpr, loop invariant, not an instruction, or
  // we've exceeded our limit on recursion, just return whatever we have
  // regardless of whether it can be used for a forked pointer or not, along
  // with an indication of whether it might be a poison or undef value.
  const SCEV *Scev = SE->getSCEV(Ptr);
  if (isa<SCEVAddRecExpr>(Scev) || L->isLoopInvariant(Ptr) ||
      !isa<Instruction>(Ptr) || Depth == 0) {
    ScevList.emplace_back(Scev, !isGuaranteedNotToBeUndefOrPoison(Ptr));
    return;
  }

  Depth--;

  auto UndefPoisonCheck = [](PointerIntPair<const SCEV *, 1, bool> S) {
    return get<1>(S);
  };

  auto GetBinOpExpr = [&SE](unsigned Opcode, const SCEV *L, const SCEV *R) {
    switch (Opcode) {
    case Instruction::Add:
      return SE->getAddExpr(L, R);
    case Instruction::Sub:
      return SE->getMinusSCEV(L, R);
    default:
      llvm_unreachable("Unexpected binary operator when walking ForkedPtrs");
    }
  };

  Instruction *I = cast<Instruction>(Ptr);
  unsigned Opcode = I->getOpcode();
  switch (Opcode) {
  case Instruction::GetElementPtr: {
    auto *GEP = cast<GetElementPtrInst>(I);
    Type *SourceTy = GEP->getSourceElementType();
    // We only handle base + single offset GEPs here for now.
    // Not dealing with preexisting gathers yet, so no vectors.
    if (I->getNumOperands() != 2 || SourceTy->isVectorTy()) {
      ScevList.emplace_back(Scev, !isGuaranteedNotToBeUndefOrPoison(GEP));
      break;
    }
    SmallVector<PointerIntPair<const SCEV *, 1, bool>, 2> BaseScevs;
    SmallVector<PointerIntPair<const SCEV *, 1, bool>, 2> OffsetScevs;
    findForkedSCEVs(SE, L, I->getOperand(0), BaseScevs, Depth);
    findForkedSCEVs(SE, L, I->getOperand(1), OffsetScevs, Depth);

    // See if we need to freeze our fork...
    bool NeedsFreeze = any_of(BaseScevs, UndefPoisonCheck) ||
                       any_of(OffsetScevs, UndefPoisonCheck);

    // Check that we only have a single fork, on either the base or the offset.
    // Copy the SCEV across for the one without a fork in order to generate
    // the full SCEV for both sides of the GEP.
    if (OffsetScevs.size() == 2 && BaseScevs.size() == 1)
      BaseScevs.push_back(BaseScevs[0]);
    else if (BaseScevs.size() == 2 && OffsetScevs.size() == 1)
      OffsetScevs.push_back(OffsetScevs[0]);
    else {
      ScevList.emplace_back(Scev, NeedsFreeze);
      break;
    }

    Type *IntPtrTy = SE->getEffectiveSCEVType(GEP->getPointerOperandType());

    // Find the size of the type being pointed to. We only have a single
    // index term (guarded above) so we don't need to index into arrays or
    // structures, just get the size of the scalar value.
    const SCEV *Size = SE->getSizeOfExpr(IntPtrTy, SourceTy);

    for (auto [B, O] : zip(BaseScevs, OffsetScevs)) {
      const SCEV *Base = get<0>(B);
      const SCEV *Offset = get<0>(O);

      // Scale up the offsets by the size of the type, then add to the bases.
      const SCEV *Scaled =
          SE->getMulExpr(Size, SE->getTruncateOrSignExtend(Offset, IntPtrTy));
      ScevList.emplace_back(SE->getAddExpr(Base, Scaled), NeedsFreeze);
    }
    break;
  }
  case Instruction::Select: {
    SmallVector<PointerIntPair<const SCEV *, 1, bool>, 2> ChildScevs;
    // A select means we've found a forked pointer, but we currently only
    // support a single select per pointer so if there's another behind this
    // then we just bail out and return the generic SCEV.
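    // Operands 1 and 2 are the select's true and false values; each must
    // yield exactly one SCEV for the fork to be usable (size() == 2 below).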
    findForkedSCEVs(SE, L, I->getOperand(1), ChildScevs, Depth);
    findForkedSCEVs(SE, L, I->getOperand(2), ChildScevs, Depth);
    if (ChildScevs.size() == 2)
      append_range(ScevList, ChildScevs);
    else
      ScevList.emplace_back(Scev, !isGuaranteedNotToBeUndefOrPoison(Ptr));
    break;
  }
  case Instruction::PHI: {
    SmallVector<PointerIntPair<const SCEV *, 1, bool>, 2> ChildScevs;
    // A phi means we've found a forked pointer, but we currently only
    // support a single phi per pointer so if there's another behind this
    // then we just bail out and return the generic SCEV.
    if (I->getNumOperands() == 2) {
      findForkedSCEVs(SE, L, I->getOperand(0), ChildScevs, Depth);
      findForkedSCEVs(SE, L, I->getOperand(1), ChildScevs, Depth);
    }
    if (ChildScevs.size() == 2)
      append_range(ScevList, ChildScevs);
    else
      ScevList.emplace_back(Scev, !isGuaranteedNotToBeUndefOrPoison(Ptr));
    break;
  }
  case Instruction::Add:
  case Instruction::Sub: {
    SmallVector<PointerIntPair<const SCEV *, 1, bool>> LScevs;
    SmallVector<PointerIntPair<const SCEV *, 1, bool>> RScevs;
    findForkedSCEVs(SE, L, I->getOperand(0), LScevs, Depth);
    findForkedSCEVs(SE, L, I->getOperand(1), RScevs, Depth);

    // See if we need to freeze our fork...
    bool NeedsFreeze =
        any_of(LScevs, UndefPoisonCheck) || any_of(RScevs, UndefPoisonCheck);

    // Check that we only have a single fork, on either the left or right side.
    // Copy the SCEV across for the one without a fork in order to generate
    // the full SCEV for both sides of the BinOp.
    if (LScevs.size() == 2 && RScevs.size() == 1)
      RScevs.push_back(RScevs[0]);
    else if (RScevs.size() == 2 && LScevs.size() == 1)
      LScevs.push_back(LScevs[0]);
    else {
      ScevList.emplace_back(Scev, NeedsFreeze);
      break;
    }

    for (auto [L, R] : zip(LScevs, RScevs))
      ScevList.emplace_back(GetBinOpExpr(Opcode, get<0>(L), get<0>(R)),
                            NeedsFreeze);
    break;
  }
  default:
    // Just return the current SCEV if we haven't handled the instruction yet.
    LLVM_DEBUG(dbgs() << "ForkedPtr unhandled instruction: " << *I << "\n");
    ScevList.emplace_back(Scev, !isGuaranteedNotToBeUndefOrPoison(Ptr));
    break;
  }
}

static SmallVector<PointerIntPair<const SCEV *, 1, bool>>
findForkedPointer(PredicatedScalarEvolution &PSE,
                  const DenseMap<Value *, const SCEV *> &StridesMap, Value *Ptr,
                  const Loop *L) {
  ScalarEvolution *SE = PSE.getSE();
  assert(SE->isSCEVable(Ptr->getType()) && "Value is not SCEVable!");
  SmallVector<PointerIntPair<const SCEV *, 1, bool>> Scevs;
  findForkedSCEVs(SE, L, Ptr, Scevs, MaxForkedSCEVDepth);

  // For now, we will only accept a forked pointer with two possible SCEVs
  // that are either SCEVAddRecExprs or loop invariant.
  if (Scevs.size() == 2 &&
      (isa<SCEVAddRecExpr>(get<0>(Scevs[0])) ||
       SE->isLoopInvariant(get<0>(Scevs[0]), L)) &&
      (isa<SCEVAddRecExpr>(get<0>(Scevs[1])) ||
       SE->isLoopInvariant(get<0>(Scevs[1]), L))) {
    LLVM_DEBUG(dbgs() << "LAA: Found forked pointer: " << *Ptr << "\n");
    LLVM_DEBUG(dbgs() << "\t(1) " << *get<0>(Scevs[0]) << "\n");
    LLVM_DEBUG(dbgs() << "\t(2) " << *get<0>(Scevs[1]) << "\n");
    return Scevs;
  }

  return {{replaceSymbolicStrideSCEV(PSE, StridesMap, Ptr), false}};
}

bool AccessAnalysis::createCheckForAccess(
    RuntimePointerChecking &RtCheck, MemAccessInfo Access, Type *AccessTy,
    const DenseMap<Value *, const SCEV *> &StridesMap,
    DenseMap<Value *, unsigned> &DepSetId, Loop *TheLoop,
    unsigned &RunningDepId, unsigned ASId, bool Assume) {
  Value *Ptr = Access.getPointer();

  SmallVector<PointerIntPair<const SCEV *, 1, bool>> TranslatedPtrs =
      findForkedPointer(PSE, StridesMap, Ptr, TheLoop);
  assert(!TranslatedPtrs.empty() && "must have some translated pointers");

  /// Check whether all pointers can participate in a runtime bounds check. They
  /// must either be invariant or AddRecs. If ShouldCheckWrap is true, they also
  /// must not wrap.
  for (auto &P : TranslatedPtrs) {
    // The bounds for a loop-invariant pointer are trivial.
    if (PSE.getSE()->isLoopInvariant(P.getPointer(), TheLoop))
      continue;

    const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(P.getPointer());
    if (!AR && Assume)
      AR = PSE.getAsAddRec(Ptr);
    if (!AR || !AR->isAffine())
      return false;

    // If there's only one option for Ptr, look it up after bounds and wrap
    // checking, because assumptions might have been added to PSE.
    if (TranslatedPtrs.size() == 1) {
      AR =
          cast<SCEVAddRecExpr>(replaceSymbolicStrideSCEV(PSE, StridesMap, Ptr));
      P.setPointer(AR);
    }

    // When we run after a failing dependency check we have to make sure
    // we don't have wrapping pointers.
    if (!isNoWrap(PSE, AR, TranslatedPtrs.size() == 1 ? Ptr : nullptr, AccessTy,
                  TheLoop, Assume)) {
      return false;
    }
  }

  for (auto [PtrExpr, NeedsFreeze] : TranslatedPtrs) {
    // The id of the dependence set.
    unsigned DepId;

    if (isDependencyCheckNeeded()) {
      Value *Leader = DepCands.getLeaderValue(Access).getPointer();
      unsigned &LeaderId = DepSetId[Leader];
      if (!LeaderId)
        LeaderId = RunningDepId++;
      DepId = LeaderId;
    } else
      // Each access has its own dependence set.
      DepId = RunningDepId++;

    bool IsWrite = Access.getInt();
    RtCheck.insert(TheLoop, Ptr, PtrExpr, AccessTy, IsWrite, DepId, ASId, PSE,
                   NeedsFreeze);
    LLVM_DEBUG(dbgs() << "LAA: Found a runtime check ptr:" << *Ptr << '\n');
  }

  return true;
}

bool AccessAnalysis::canCheckPtrAtRT(
    RuntimePointerChecking &RtCheck, Loop *TheLoop,
    const DenseMap<Value *, const SCEV *> &StridesMap, Value *&UncomputablePtr,
    bool AllowPartial) {
  // Find pointers with computable bounds. We are going to use this information
  // to place a runtime bound check.
  bool CanDoRT = true;

  bool MayNeedRTCheck = false;
  if (!IsRTCheckAnalysisNeeded)
    return true;

  bool IsDepCheckNeeded = isDependencyCheckNeeded();

  // We assign a consecutive id to accesses from different alias sets.
  // Accesses between different groups don't need to be checked.
  unsigned ASId = 0;
  for (const auto &AS : AST) {
    int NumReadPtrChecks = 0;
    int NumWritePtrChecks = 0;
    bool CanDoAliasSetRT = true;
    ++ASId;
    auto ASPointers = AS.getPointers();

    // We assign consecutive ids to accesses from different dependence sets.
    // Accesses within the same set don't need a runtime check.
    unsigned RunningDepId = 1;
    DenseMap<Value *, unsigned> DepSetId;

    SmallVector<std::pair<MemAccessInfo, Type *>, 4> Retries;

    // First, count how many write and read accesses are in the alias set. Also
    // collect MemAccessInfos for later.
    SmallVector<MemAccessInfo, 4> AccessInfos;
    for (const Value *ConstPtr : ASPointers) {
      Value *Ptr = const_cast<Value *>(ConstPtr);
      bool IsWrite = Accesses.contains(MemAccessInfo(Ptr, true));
      if (IsWrite)
        ++NumWritePtrChecks;
      else
        ++NumReadPtrChecks;
      AccessInfos.emplace_back(Ptr, IsWrite);
    }

    // We do not need runtime checks for this alias set, if there are no writes
    // or a single write and no reads.
    if (NumWritePtrChecks == 0 ||
        (NumWritePtrChecks == 1 && NumReadPtrChecks == 0)) {
      assert((ASPointers.size() <= 1 ||
              all_of(ASPointers,
                     [this](const Value *Ptr) {
                       MemAccessInfo AccessWrite(const_cast<Value *>(Ptr),
                                                 true);
                       return !DepCands.contains(AccessWrite);
                     })) &&
             "Can only skip updating CanDoRT below, if all entries in AS "
             "are reads or there is at most 1 entry");
      continue;
    }

    for (auto &Access : AccessInfos) {
      for (const auto &AccessTy : Accesses[Access]) {
        if (!createCheckForAccess(RtCheck, Access, AccessTy, StridesMap,
                                  DepSetId, TheLoop, RunningDepId, ASId,
                                  false)) {
          LLVM_DEBUG(dbgs() << "LAA: Can't find bounds for ptr:"
                            << *Access.getPointer() << '\n');
          Retries.emplace_back(Access, AccessTy);
          CanDoAliasSetRT = false;
        }
      }
    }

    // Note that this function computes CanDoRT and MayNeedRTCheck
    // independently. For example CanDoRT=false, MayNeedRTCheck=false means that
    // we have a pointer for which we couldn't find the bounds but we don't
    // actually need to emit any checks so it does not matter.
    //
    // We need runtime checks for this alias set, if there are at least 2
    // dependence sets (in which case RunningDepId > 2) or if we need to re-try
    // any bound checks (because in that case the number of dependence sets is
    // incomplete).
    bool NeedsAliasSetRTCheck = RunningDepId > 2 || !Retries.empty();

    // We need to perform run-time alias checks, but some pointers had bounds
    // that couldn't be checked.
    if (NeedsAliasSetRTCheck && !CanDoAliasSetRT) {
      // Reset the CanDoSetRt flag and retry all accesses that have failed.
      // We know that we need these checks, so we can now be more aggressive
      // and add further checks if required (overflow checks).
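      // Passing Assume=true below lets createCheckForAccess add SCEV
      // predicates (e.g. no-wrap assumptions) via PSE for accesses that
      // failed the first, assumption-free attempt.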
      CanDoAliasSetRT = true;
      for (const auto &[Access, AccessTy] : Retries) {
        if (!createCheckForAccess(RtCheck, Access, AccessTy, StridesMap,
                                  DepSetId, TheLoop, RunningDepId, ASId,
                                  /*Assume=*/true)) {
          CanDoAliasSetRT = false;
          UncomputablePtr = Access.getPointer();
          if (!AllowPartial)
            break;
        }
      }
    }

    CanDoRT &= CanDoAliasSetRT;
    MayNeedRTCheck |= NeedsAliasSetRTCheck;
    ++ASId;
  }

  // If the pointers that we would use for the bounds comparison have different
  // address spaces, assume the values aren't directly comparable, so we can't
  // use them for the runtime check. We also have to assume they could
  // overlap. In the future there should be metadata for whether address spaces
  // are disjoint.
  unsigned NumPointers = RtCheck.Pointers.size();
  for (unsigned i = 0; i < NumPointers; ++i) {
    for (unsigned j = i + 1; j < NumPointers; ++j) {
      // Only need to check pointers between two different dependency sets.
      if (RtCheck.Pointers[i].DependencySetId ==
          RtCheck.Pointers[j].DependencySetId)
        continue;
      // Only need to check pointers in the same alias set.
      if (RtCheck.Pointers[i].AliasSetId != RtCheck.Pointers[j].AliasSetId)
        continue;

      Value *PtrI = RtCheck.Pointers[i].PointerValue;
      Value *PtrJ = RtCheck.Pointers[j].PointerValue;

      unsigned ASi = PtrI->getType()->getPointerAddressSpace();
      unsigned ASj = PtrJ->getType()->getPointerAddressSpace();
      if (ASi != ASj) {
        LLVM_DEBUG(
            dbgs() << "LAA: Runtime check would require comparison between"
                      " different address spaces\n");
        return false;
      }
    }
  }

  if (MayNeedRTCheck && (CanDoRT || AllowPartial))
    RtCheck.generateChecks(DepCands, IsDepCheckNeeded);

  LLVM_DEBUG(dbgs() << "LAA: We need to do " << RtCheck.getNumberOfChecks()
                    << " pointer comparisons.\n");

  // If we can do run-time checks, but there are no checks, no runtime checks
  // are needed. This can happen when all pointers point to the same underlying
  // object for example.
  RtCheck.Need = CanDoRT ? RtCheck.getNumberOfChecks() != 0 : MayNeedRTCheck;

  bool CanDoRTIfNeeded = !RtCheck.Need || CanDoRT;
  assert(CanDoRTIfNeeded == (CanDoRT || !MayNeedRTCheck) &&
         "CanDoRTIfNeeded depends on RtCheck.Need");
  if (!CanDoRTIfNeeded && !AllowPartial)
    RtCheck.reset();
  return CanDoRTIfNeeded;
}

void AccessAnalysis::processMemAccesses() {
  // We process the set twice: first we process read-write pointers, then we
  // process read-only pointers. This allows us to skip dependence tests for
  // read-only pointers.

  LLVM_DEBUG(dbgs() << "LAA: Processing memory accesses...\n");
  LLVM_DEBUG(dbgs() << " AST: "; AST.dump());
  LLVM_DEBUG(dbgs() << "LAA: Accesses(" << Accesses.size() << "):\n");
  LLVM_DEBUG({
    for (const auto &[A, _] : Accesses)
      dbgs() << "\t" << *A.getPointer() << " ("
             << (A.getInt()
                     ? "write"
                     : (ReadOnlyPtr.contains(A.getPointer()) ? "read-only"
                                                             : "read"))
             << ")\n";
  });

  // The AliasSetTracker has nicely partitioned our pointers by metadata
  // compatibility and potential for underlying-object overlap. As a result, we
  // only need to check for potential pointer dependencies within each alias
  // set.
  for (const auto &AS : AST) {
    // Note that both the alias-set tracker and the alias sets themselves use
    // ordered collections internally and so the iteration order here is
    // deterministic.
    auto ASPointers = AS.getPointers();

    bool SetHasWrite = false;

    // Map of (pointer to underlying objects, accessed address space) to last
    // access encountered.
    typedef DenseMap<std::pair<const Value *, unsigned>, MemAccessInfo>
        UnderlyingObjToAccessMap;
    UnderlyingObjToAccessMap ObjToLastAccess;

    // Set of accesses to check after all writes have been processed.
    PtrAccessMap DeferredAccesses;

    // Iterate over each alias set twice, once to process read/write pointers,
    // and then to process read-only pointers.
    for (int SetIteration = 0; SetIteration < 2; ++SetIteration) {
      bool UseDeferred = SetIteration > 0;
      PtrAccessMap &S = UseDeferred ? DeferredAccesses : Accesses;

      for (const Value *ConstPtr : ASPointers) {
        Value *Ptr = const_cast<Value *>(ConstPtr);

        // For a single memory access in AliasSetTracker, Accesses may contain
        // both read and write, and they both need to be handled for CheckDeps.
        for (const auto &[AC, _] : S) {
          if (AC.getPointer() != Ptr)
            continue;

          bool IsWrite = AC.getInt();

          // If we're using the deferred access set, then it contains only
          // reads.
          bool IsReadOnlyPtr = ReadOnlyPtr.contains(Ptr) && !IsWrite;
          if (UseDeferred && !IsReadOnlyPtr)
            continue;
          // Otherwise, the pointer must be in the PtrAccessSet, either as a
          // read or a write.
          assert(((IsReadOnlyPtr && UseDeferred) || IsWrite ||
                  S.contains(MemAccessInfo(Ptr, false))) &&
                 "Alias-set pointer not in the access set?");

          MemAccessInfo Access(Ptr, IsWrite);
          DepCands.insert(Access);

          // Memorize read-only pointers for later processing and skip them in
          // the first round (they need to be checked after we have seen all
          // write pointers). Note: we also mark pointers that are not
          // consecutive as "read-only" pointers (so that we check
          // "a[b[i]] +="). Hence, we need the second check for "!IsWrite".
          if (!UseDeferred && IsReadOnlyPtr) {
            // We only use the pointer keys, the types vector values don't
            // matter.
            DeferredAccesses.insert({Access, {}});
            continue;
          }

          // If this is a write, check other reads and writes for conflicts.
          // If this is a read, only check other writes for conflicts (but only
          // if there is no other write to the ptr - this is an optimization to
          // catch "a[i] = a[i] + " without having to do a dependence check).
          if ((IsWrite || IsReadOnlyPtr) && SetHasWrite) {
            CheckDeps.push_back(Access);
            IsRTCheckAnalysisNeeded = true;
          }

          if (IsWrite)
            SetHasWrite = true;

          // Create sets of pointers connected by a shared alias set and
          // underlying object.
          SmallVector<const Value *, 16> &UOs = UnderlyingObjects[Ptr];
          UOs = {};
          ::getUnderlyingObjects(Ptr, UOs, LI);
          LLVM_DEBUG(dbgs()
                     << "Underlying objects for pointer " << *Ptr << "\n");
          for (const Value *UnderlyingObj : UOs) {
            // nullptr never aliases, don't join sets for pointers that have
            // "null" in their UnderlyingObjects list.
            if (isa<ConstantPointerNull>(UnderlyingObj) &&
                !NullPointerIsDefined(
                    TheLoop->getHeader()->getParent(),
                    UnderlyingObj->getType()->getPointerAddressSpace()))
              continue;

            auto [It, Inserted] = ObjToLastAccess.try_emplace(
                {UnderlyingObj,
                 cast<PointerType>(Ptr->getType())->getAddressSpace()},
                Access);
            if (!Inserted) {
              DepCands.unionSets(Access, It->second);
              It->second = Access;
            }

            LLVM_DEBUG(dbgs() << "  " << *UnderlyingObj << "\n");
          }
        }
      }
    }
  }
}

/// Check whether the access through \p Ptr has a constant stride.
std::optional<int64_t>
llvm::getPtrStride(PredicatedScalarEvolution &PSE, Type *AccessTy, Value *Ptr,
                   const Loop *Lp,
                   const DenseMap<Value *, const SCEV *> &StridesMap,
                   bool Assume, bool ShouldCheckWrap) {
  const SCEV *PtrScev = replaceSymbolicStrideSCEV(PSE, StridesMap, Ptr);
  if (PSE.getSE()->isLoopInvariant(PtrScev, Lp))
    return 0;

  assert(Ptr->getType()->isPointerTy() && "Unexpected non-ptr");
  if (isa<ScalableVectorType>(AccessTy)) {
    LLVM_DEBUG(dbgs() << "LAA: Bad stride - Scalable object: " << *AccessTy
                      << "\n");
    return std::nullopt;
  }

  const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PtrScev);
  if (Assume && !AR)
    AR = PSE.getAsAddRec(Ptr);

  if (!AR) {
    LLVM_DEBUG(dbgs() << "LAA: Bad stride - Not an AddRecExpr pointer " << *Ptr
                      << " SCEV: " << *PtrScev << "\n");
    return std::nullopt;
  }

  std::optional<int64_t> Stride =
      getStrideFromAddRec(AR, Lp, AccessTy, Ptr, PSE);
  if (!ShouldCheckWrap || !Stride)
    return Stride;

  if (isNoWrap(PSE, AR, Ptr, AccessTy, Lp, Assume, Stride))
    return Stride;

  LLVM_DEBUG(
      dbgs() << "LAA: Bad stride - Pointer may wrap in the address space "
             << *Ptr << " SCEV: " << *AR << "\n");
  return std::nullopt;
}

std::optional<int64_t> llvm::getPointersDiff(Type *ElemTyA, Value *PtrA,
                                             Type *ElemTyB, Value *PtrB,
                                             const DataLayout &DL,
                                             ScalarEvolution &SE,
                                             bool StrictCheck, bool CheckType) {
  assert(PtrA && PtrB && "Expected non-nullptr pointers.");

  // Make sure that A and B are different pointers.
  if (PtrA == PtrB)
    return 0;

  // Make sure that the element types are the same if required.
  if (CheckType && ElemTyA != ElemTyB)
    return std::nullopt;

  unsigned ASA = PtrA->getType()->getPointerAddressSpace();
  unsigned ASB = PtrB->getType()->getPointerAddressSpace();

  // Check that the address spaces match.
  if (ASA != ASB)
    return std::nullopt;
  unsigned IdxWidth = DL.getIndexSizeInBits(ASA);

  APInt OffsetA(IdxWidth, 0), OffsetB(IdxWidth, 0);
  const Value *PtrA1 = PtrA->stripAndAccumulateConstantOffsets(
      DL, OffsetA, /*AllowNonInbounds=*/true);
  const Value *PtrB1 = PtrB->stripAndAccumulateConstantOffsets(
      DL, OffsetB, /*AllowNonInbounds=*/true);

  std::optional<int64_t> Val;
  if (PtrA1 == PtrB1) {
    // Retrieve the address space again as pointer stripping now tracks through
    // `addrspacecast`.
    ASA = cast<PointerType>(PtrA1->getType())->getAddressSpace();
    ASB = cast<PointerType>(PtrB1->getType())->getAddressSpace();
    // Check that the address spaces match and that the pointers are valid.
    if (ASA != ASB)
      return std::nullopt;

    IdxWidth = DL.getIndexSizeInBits(ASA);
    OffsetA = OffsetA.sextOrTrunc(IdxWidth);
    OffsetB = OffsetB.sextOrTrunc(IdxWidth);

    OffsetB -= OffsetA;
    Val = OffsetB.trySExtValue();
  } else {
    // Otherwise compute the distance with SCEV between the base pointers.
    const SCEV *PtrSCEVA = SE.getSCEV(PtrA);
    const SCEV *PtrSCEVB = SE.getSCEV(PtrB);
    std::optional<APInt> Diff =
        SE.computeConstantDifference(PtrSCEVB, PtrSCEVA);
    if (!Diff)
      return std::nullopt;
    Val = Diff->trySExtValue();
  }

  if (!Val)
    return std::nullopt;

  int64_t Size = DL.getTypeStoreSize(ElemTyA);
  int64_t Dist = *Val / Size;

  // Ensure that the calculated distance matches the type-based one after all
  // the bitcasts have been removed from the provided pointers.
  if (!StrictCheck || Dist * Size == Val)
    return Dist;
  return std::nullopt;
}

bool llvm::sortPtrAccesses(ArrayRef<Value *> VL, Type *ElemTy,
                           const DataLayout &DL, ScalarEvolution &SE,
                           SmallVectorImpl<unsigned> &SortedIndices) {
  assert(llvm::all_of(
             VL, [](const Value *V) { return V->getType()->isPointerTy(); }) &&
         "Expected list of pointer operands.");
  // Walk over the pointers, and map each of them to an offset relative to the
  // first pointer in the array.
  Value *Ptr0 = VL[0];

  using DistOrdPair = std::pair<int64_t, unsigned>;
  auto Compare = llvm::less_first();
  std::set<DistOrdPair, decltype(Compare)> Offsets(Compare);
  Offsets.emplace(0, 0);
  bool IsConsecutive = true;
  for (auto [Idx, Ptr] : drop_begin(enumerate(VL))) {
    std::optional<int64_t> Diff =
        getPointersDiff(ElemTy, Ptr0, ElemTy, Ptr, DL, SE,
                        /*StrictCheck=*/true);
    if (!Diff)
      return false;

    // Check if the pointer with the same offset is found.
    int64_t Offset = *Diff;
    auto [It, IsInserted] = Offsets.emplace(Offset, Idx);
    if (!IsInserted)
      return false;
    // Consecutive order if the inserted element is the last one.
    IsConsecutive &= std::next(It) == Offsets.end();
  }
  SortedIndices.clear();
  if (!IsConsecutive) {
    // Fill SortedIndices array only if it is non-consecutive.
    SortedIndices.resize(VL.size());
    for (auto [Idx, Off] : enumerate(Offsets))
      SortedIndices[Idx] = Off.second;
  }
  return true;
}
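
// Usage sketch for sortPtrAccesses (hypothetical values, not from the
// source): for VL = {&A[1], &A[0], &A[2]} with i32 elements, the offsets
// relative to VL[0] are {0, -1, 1}. Since the pointers are not already in
// ascending, consecutive order, SortedIndices is filled with {1, 0, 2},
// i.e. the order &A[0], &A[1], &A[2].
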
/// Returns true if the memory operations \p A and \p B are consecutive.
bool llvm::isConsecutiveAccess(Value *A, Value *B, const DataLayout &DL,
                               ScalarEvolution &SE, bool CheckType) {
  Value *PtrA = getLoadStorePointerOperand(A);
  Value *PtrB = getLoadStorePointerOperand(B);
  if (!PtrA || !PtrB)
    return false;
  Type *ElemTyA = getLoadStoreType(A);
  Type *ElemTyB = getLoadStoreType(B);
  std::optional<int64_t> Diff =
      getPointersDiff(ElemTyA, PtrA, ElemTyB, PtrB, DL, SE,
                      /*StrictCheck=*/true, CheckType);
  return Diff == 1;
}

void MemoryDepChecker::addAccess(StoreInst *SI) {
  visitPointers(SI->getPointerOperand(), *InnermostLoop,
                [this, SI](Value *Ptr) {
                  Accesses[MemAccessInfo(Ptr, true)].push_back(AccessIdx);
                  InstMap.push_back(SI);
                  ++AccessIdx;
                });
}

void MemoryDepChecker::addAccess(LoadInst *LI) {
  visitPointers(LI->getPointerOperand(), *InnermostLoop,
                [this, LI](Value *Ptr) {
                  Accesses[MemAccessInfo(Ptr, false)].push_back(AccessIdx);
                  InstMap.push_back(LI);
                  ++AccessIdx;
                });
}

MemoryDepChecker::VectorizationSafetyStatus
MemoryDepChecker::Dependence::isSafeForVectorization(DepType Type) {
  switch (Type) {
  case NoDep:
  case Forward:
  case BackwardVectorizable:
    return VectorizationSafetyStatus::Safe;

  case Unknown:
    return VectorizationSafetyStatus::PossiblySafeWithRtChecks;
  case ForwardButPreventsForwarding:
  case Backward:
  case BackwardVectorizableButPreventsForwarding:
  case IndirectUnsafe:
    return VectorizationSafetyStatus::Unsafe;
  }
  llvm_unreachable("unexpected DepType!");
}

bool MemoryDepChecker::Dependence::isBackward() const {
  switch (Type) {
  case NoDep:
  case Forward:
  case ForwardButPreventsForwarding:
  case Unknown:
  case IndirectUnsafe:
    return false;

  case BackwardVectorizable:
  case Backward:
  case BackwardVectorizableButPreventsForwarding:
    return true;
  }
  llvm_unreachable("unexpected DepType!");
}

bool MemoryDepChecker::Dependence::isPossiblyBackward() const {
  return isBackward() || Type == Unknown || Type == IndirectUnsafe;
}

bool MemoryDepChecker::Dependence::isForward() const {
  switch (Type) {
  case Forward:
  case ForwardButPreventsForwarding:
    return true;

  case NoDep:
  case Unknown:
  case BackwardVectorizable:
  case Backward:
  case BackwardVectorizableButPreventsForwarding:
  case IndirectUnsafe:
    return false;
  }
  llvm_unreachable("unexpected DepType!");
}

bool MemoryDepChecker::couldPreventStoreLoadForward(uint64_t Distance,
                                                    uint64_t TypeByteSize,
                                                    unsigned CommonStride) {
  // If loads occur at a distance that is not a multiple of a feasible vector
  // factor, store-load forwarding does not take place.
  // Positive dependences might cause trouble because vectorizing them might
  // prevent store-load forwarding making vectorized code run a lot slower.
  //   a[i] = a[i-3] ^ a[i-8];
  //   The stores to a[i:i+1] don't align with the loads from a[i-3:i-2] and
  //   hence on your typical architecture store-load forwarding does not take
  //   place. Vectorizing in such cases does not make sense.
  // Store-load forwarding distance.

  // After this many iterations store-to-load forwarding conflicts should not
  // cause any slowdowns.
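  // Worked sketch (hypothetical numbers): with TypeByteSize = 4 and
  // Distance = 24, the threshold below is 32 and candidate VFs are 8, 16, ...
  // VF = 8 divides 24 evenly, but VF = 16 does not (24 % 16 != 0) while
  // 24 / 16 = 1 < 32, so the search below settles on a maximum safe VF of 8
  // bytes, i.e. two 4-byte elements.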
  const uint64_t NumItersForStoreLoadThroughMemory = 8 * TypeByteSize;
  // Maximum vector factor.
  uint64_t MaxVFWithoutSLForwardIssuesPowerOf2 =
      std::min(VectorizerParams::MaxVectorWidth * TypeByteSize,
               MaxStoreLoadForwardSafeDistanceInBits);

  // Compute the smallest VF at which the store and load would be misaligned.
  for (uint64_t VF = 2 * TypeByteSize;
       VF <= MaxVFWithoutSLForwardIssuesPowerOf2; VF *= 2) {
    // If the number of vector iterations between the store and the load is
    // small we could incur conflicts.
    if (Distance % VF && Distance / VF < NumItersForStoreLoadThroughMemory) {
      MaxVFWithoutSLForwardIssuesPowerOf2 = (VF >> 1);
      break;
    }
  }

  if (MaxVFWithoutSLForwardIssuesPowerOf2 < 2 * TypeByteSize) {
    LLVM_DEBUG(
        dbgs() << "LAA: Distance " << Distance
               << " that could cause a store-load forwarding conflict\n");
    return true;
  }

  if (CommonStride &&
      MaxVFWithoutSLForwardIssuesPowerOf2 <
          MaxStoreLoadForwardSafeDistanceInBits &&
      MaxVFWithoutSLForwardIssuesPowerOf2 !=
          VectorizerParams::MaxVectorWidth * TypeByteSize) {
    uint64_t MaxVF =
        bit_floor(MaxVFWithoutSLForwardIssuesPowerOf2 / CommonStride);
    uint64_t MaxVFInBits = MaxVF * TypeByteSize * 8;
    MaxStoreLoadForwardSafeDistanceInBits =
        std::min(MaxStoreLoadForwardSafeDistanceInBits, MaxVFInBits);
  }
  return false;
}

void MemoryDepChecker::mergeInStatus(VectorizationSafetyStatus S) {
  if (Status < S)
    Status = S;
}

/// Given a dependence-distance \p Dist between two memory accesses, that have
/// strides in the same direction, whose maximum absolute stride is given in
/// \p MaxStride, in a loop whose maximum backedge taken count is \p MaxBTC,
/// check if it is possible to prove statically that the dependence distance
/// is larger than the range that the accesses will travel through the
/// execution of the loop. If so, return true; false otherwise. This is useful
/// for example in loops such as the following (PR31098):
///
///   for (i = 0; i < D; ++i) {
///              = out[i];
///     out[i+D] =
///   }
static bool isSafeDependenceDistance(const DataLayout &DL, ScalarEvolution &SE,
                                     const SCEV &MaxBTC, const SCEV &Dist,
                                     uint64_t MaxStride) {

  // If we can prove that
  //     (**) |Dist| > MaxBTC * Step
  // where Step is the absolute stride of the memory accesses in bytes,
  // then there is no dependence.
  //
  // Rationale:
  // We basically want to check if the absolute distance (|Dist/Step|)
  // is >= the loop iteration count (or > MaxBTC).
  // This is equivalent to the Strong SIV Test (Practical Dependence Testing,
  // Section 4.2.1); Note, that for vectorization it is sufficient to prove
  // that the dependence distance is >= VF; This is checked elsewhere.
  // But in some cases we can prune dependence distances early, and
  // even before selecting the VF, and without a runtime test, by comparing
  // the distance against the loop iteration count. Since the vectorized code
  // will be executed only if LoopCount >= VF, proving distance >= LoopCount
  // also guarantees that distance >= VF.
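  //
  // Worked instance (a sketch with hypothetical numbers): with MaxBTC = 99
  // and MaxStride = 4 bytes, proving |Dist| > 99 * 4 = 396 shows the two
  // accesses can never touch the same address during the loop's execution,
  // regardless of the VF chosen later.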
  //
  const SCEV *Step = SE.getConstant(MaxBTC.getType(), MaxStride);
  const SCEV *Product = SE.getMulExpr(&MaxBTC, Step);

  const SCEV *CastedDist = &Dist;
  const SCEV *CastedProduct = Product;
  uint64_t DistTypeSizeBits = DL.getTypeSizeInBits(Dist.getType());
  uint64_t ProductTypeSizeBits = DL.getTypeSizeInBits(Product->getType());

  // The dependence distance can be positive/negative, so we sign extend Dist;
  // The multiplication of the absolute stride in bytes and the
  // backedgeTakenCount is non-negative, so we zero extend Product.
  if (DistTypeSizeBits > ProductTypeSizeBits)
    CastedProduct = SE.getZeroExtendExpr(Product, Dist.getType());
  else
    CastedDist = SE.getNoopOrSignExtend(&Dist, Product->getType());

  // Is Dist - (MaxBTC * Step) > 0 ?
  // (If so, then we have proven (**) because |Dist| >= Dist)
  const SCEV *Minus = SE.getMinusSCEV(CastedDist, CastedProduct);
  if (SE.isKnownPositive(Minus))
    return true;

  // Second try: Is -Dist - (MaxBTC * Step) > 0 ?
  // (If so, then we have proven (**) because |Dist| >= -1*Dist)
  const SCEV *NegDist = SE.getNegativeSCEV(CastedDist);
  Minus = SE.getMinusSCEV(NegDist, CastedProduct);
  return SE.isKnownPositive(Minus);
}

/// Check the dependence for two accesses with the same stride \p Stride.
/// \p Distance is the positive distance in bytes, and \p TypeByteSize is the
/// type size in bytes.
///
/// \returns true if they are independent.
static bool areStridedAccessesIndependent(uint64_t Distance, uint64_t Stride,
                                          uint64_t TypeByteSize) {
  assert(Stride > 1 && "The stride must be greater than 1");
  assert(TypeByteSize > 0 && "The type size in byte must be non-zero");
  assert(Distance > 0 && "The distance must be non-zero");

  // Skip if the distance is not a multiple of the type byte size.
  if (Distance % TypeByteSize)
    return false;

  // No dependence if the distance is not a multiple of the stride.
  // E.g.
  //   for (i = 0; i < 1024 ; i += 4)
  //     A[i+2] = A[i] + 1;
  //
  // Two accesses in memory (distance is 2, stride is 4):
  //     | A[0] |      |      |      | A[4] |      |      |      |
  //     |      |      | A[2] |      |      |      | A[6] |      |
  //
  // E.g.
  //   for (i = 0; i < 1024 ; i += 3)
  //     A[i+4] = A[i] + 1;
  //
  // Two accesses in memory (distance is 4, stride is 3):
  //     | A[0] |      |      | A[3] |      |      | A[6] |      |      |
  //     |      |      |      |      | A[4] |      |      | A[7] |      |
  return Distance % Stride;
}

bool MemoryDepChecker::areAccessesCompletelyBeforeOrAfter(const SCEV *Src,
                                                          Type *SrcTy,
                                                          const SCEV *Sink,
                                                          Type *SinkTy) {
  const SCEV *BTC = PSE.getBackedgeTakenCount();
  const SCEV *SymbolicMaxBTC = PSE.getSymbolicMaxBackedgeTakenCount();
  ScalarEvolution &SE = *PSE.getSE();
  const auto &[SrcStart_, SrcEnd_] = getStartAndEndForAccess(
      InnermostLoop, Src, SrcTy, BTC, SymbolicMaxBTC, &SE, &PointerBounds);
  if (isa<SCEVCouldNotCompute>(SrcStart_) || isa<SCEVCouldNotCompute>(SrcEnd_))
    return false;

  const auto &[SinkStart_, SinkEnd_] = getStartAndEndForAccess(
      InnermostLoop, Sink, SinkTy, BTC, SymbolicMaxBTC, &SE, &PointerBounds);
  if (isa<SCEVCouldNotCompute>(SinkStart_) ||
      isa<SCEVCouldNotCompute>(SinkEnd_))
    return false;

  if (!LoopGuards)
    LoopGuards.emplace(ScalarEvolution::LoopGuards::collect(InnermostLoop, SE));

  auto SrcEnd = SE.applyLoopGuards(SrcEnd_, *LoopGuards);
  auto SinkStart = SE.applyLoopGuards(SinkStart_, *LoopGuards);
  if (SE.isKnownPredicate(CmpInst::ICMP_ULE, SrcEnd, SinkStart))
    return true;

  auto SinkEnd = SE.applyLoopGuards(SinkEnd_, *LoopGuards);
  auto SrcStart = SE.applyLoopGuards(SrcStart_, *LoopGuards);
  return SE.isKnownPredicate(CmpInst::ICMP_ULE, SinkEnd, SrcStart);
}

std::variant<MemoryDepChecker::Dependence::DepType,
             MemoryDepChecker::DepDistanceStrideAndSizeInfo>
MemoryDepChecker::getDependenceDistanceStrideAndSize(
    const AccessAnalysis::MemAccessInfo &A, Instruction *AInst,
    const AccessAnalysis::MemAccessInfo &B, Instruction *BInst) {
  const auto &DL = InnermostLoop->getHeader()->getDataLayout();
  auto &SE = *PSE.getSE();
  const auto &[APtr, AIsWrite] = A;
  const auto &[BPtr, BIsWrite] = B;

  // Two reads are independent.
  if (!AIsWrite && !BIsWrite)
    return MemoryDepChecker::Dependence::NoDep;

  Type *ATy = getLoadStoreType(AInst);
  Type *BTy = getLoadStoreType(BInst);

  // We cannot check pointers in different address spaces.
  if (APtr->getType()->getPointerAddressSpace() !=
      BPtr->getType()->getPointerAddressSpace())
    return MemoryDepChecker::Dependence::Unknown;

  std::optional<int64_t> StrideAPtr =
      getPtrStride(PSE, ATy, APtr, InnermostLoop, SymbolicStrides, true, true);
  std::optional<int64_t> StrideBPtr =
      getPtrStride(PSE, BTy, BPtr, InnermostLoop, SymbolicStrides, true, true);

  const SCEV *Src = PSE.getSCEV(APtr);
  const SCEV *Sink = PSE.getSCEV(BPtr);

  // If the induction step is negative we have to invert source and sink of the
  // dependence when measuring the distance between them. We should not swap
  // AIsWrite with BIsWrite, as their uses expect them in program order.
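  // E.g. (sketch, not from the source) in
  //   for (i = N; i > 0; --i)
  //     A[i - 1] = A[i + 3];
  // both strides are -1, so Src and Sink are swapped below and the distance
  // Sink - Src is again measured in the direction the accesses move.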
  if (StrideAPtr && *StrideAPtr < 0) {
    std::swap(Src, Sink);
    std::swap(AInst, BInst);
    std::swap(ATy, BTy);
    std::swap(StrideAPtr, StrideBPtr);
  }

  const SCEV *Dist = SE.getMinusSCEV(Sink, Src);

  LLVM_DEBUG(dbgs() << "LAA: Src Scev: " << *Src << " Sink Scev: " << *Sink
                    << "\n");
  LLVM_DEBUG(dbgs() << "LAA: Distance for " << *AInst << " to " << *BInst
                    << ": " << *Dist << "\n");

  // Need accesses with constant strides and the same direction for further
  // dependence analysis. We don't want to vectorize "A[B[i]] += ..." and
  // similar code or pointer arithmetic that could wrap in the address space.

  // If either Src or Sink is not strided (i.e. not a non-wrapping AddRec) and
  // not loop-invariant (stride will be 0 in that case), we cannot analyze the
  // dependence further and also cannot generate runtime checks.
  if (!StrideAPtr || !StrideBPtr) {
    LLVM_DEBUG(dbgs() << "Pointer access with non-constant stride\n");
    return MemoryDepChecker::Dependence::IndirectUnsafe;
  }

  int64_t StrideAPtrInt = *StrideAPtr;
  int64_t StrideBPtrInt = *StrideBPtr;
  LLVM_DEBUG(dbgs() << "LAA: Src induction step: " << StrideAPtrInt
                    << " Sink induction step: " << StrideBPtrInt << "\n");
  // At least one of Src and Sink is loop invariant and the other is strided or
  // invariant. We can generate a runtime check to disambiguate the accesses.
  if (!StrideAPtrInt || !StrideBPtrInt)
    return MemoryDepChecker::Dependence::Unknown;

  // Both Src and Sink have a constant stride, check if they are in the same
  // direction.
  if ((StrideAPtrInt > 0) != (StrideBPtrInt > 0)) {
    LLVM_DEBUG(
        dbgs() << "Pointer access with strides in different directions\n");
    return MemoryDepChecker::Dependence::Unknown;
  }

  TypeSize AStoreSz = DL.getTypeStoreSize(ATy);
  TypeSize BStoreSz = DL.getTypeStoreSize(BTy);

  // If store sizes are not the same, set TypeByteSize to zero, so we can check
  // it in the caller isDependent.
  uint64_t ASz = DL.getTypeAllocSize(ATy);
  uint64_t BSz = DL.getTypeAllocSize(BTy);
  uint64_t TypeByteSize = (AStoreSz == BStoreSz) ? BSz : 0;

  uint64_t StrideAScaled = std::abs(StrideAPtrInt) * ASz;
  uint64_t StrideBScaled = std::abs(StrideBPtrInt) * BSz;

  uint64_t MaxStride = std::max(StrideAScaled, StrideBScaled);

  std::optional<uint64_t> CommonStride;
  if (StrideAScaled == StrideBScaled)
    CommonStride = StrideAScaled;

  // TODO: FoundNonConstantDistanceDependence is used as a necessary condition
  // to consider retrying with runtime checks. Historically, we did not set it
  // when (unscaled) strides were different but there is no inherent reason to.
  if (!isa<SCEVConstant>(Dist))
    FoundNonConstantDistanceDependence |= StrideAPtrInt == StrideBPtrInt;

  return DepDistanceStrideAndSizeInfo(Dist, MaxStride, CommonStride,
                                      TypeByteSize, AIsWrite, BIsWrite);
}

MemoryDepChecker::Dependence::DepType
MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
                              const MemAccessInfo &B, unsigned BIdx) {
  assert(AIdx < BIdx && "Must pass arguments in program order");

  // Check if we can prove that Sink only accesses memory after Src's end or
  // vice versa. The helper is used to perform the checks only on the exit
  // paths where it helps to improve the analysis result.
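  // For instance (hedged sketch): with
  //   for (i = 0; i < 128; ++i) { A[i] = ...; ... = A[i + 128]; }
  // the source range [A, A+128) ends where the sink range [A+128, A+256)
  // begins, so the two accesses can never overlap and the dependence can be
  // dropped as NoDep.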
  auto CheckCompletelyBeforeOrAfter = [&]() {
    auto *APtr = A.getPointer();
    auto *BPtr = B.getPointer();
    Type *ATy = getLoadStoreType(InstMap[AIdx]);
    Type *BTy = getLoadStoreType(InstMap[BIdx]);
    const SCEV *Src = PSE.getSCEV(APtr);
    const SCEV *Sink = PSE.getSCEV(BPtr);
    return areAccessesCompletelyBeforeOrAfter(Src, ATy, Sink, BTy);
  };

  // Get the dependence distance, stride, type size and which access writes for
  // the dependence between A and B.
  auto Res =
      getDependenceDistanceStrideAndSize(A, InstMap[AIdx], B, InstMap[BIdx]);
  if (std::holds_alternative<Dependence::DepType>(Res)) {
    if (std::get<Dependence::DepType>(Res) == Dependence::Unknown &&
        CheckCompletelyBeforeOrAfter())
      return Dependence::NoDep;
    return std::get<Dependence::DepType>(Res);
  }

  auto &[Dist, MaxStride, CommonStride, TypeByteSize, AIsWrite, BIsWrite] =
      std::get<DepDistanceStrideAndSizeInfo>(Res);
  bool HasSameSize = TypeByteSize > 0;

  if (isa<SCEVCouldNotCompute>(Dist)) {
    if (CheckCompletelyBeforeOrAfter())
      return Dependence::NoDep;
    LLVM_DEBUG(dbgs() << "LAA: Dependence because of uncomputable distance.\n");
    return Dependence::Unknown;
  }

  ScalarEvolution &SE = *PSE.getSE();
  auto &DL = InnermostLoop->getHeader()->getDataLayout();

  // If the distance between the accesses is larger than their maximum absolute
  // stride multiplied by the symbolic maximum backedge taken count (which is
  // an upper bound of the number of iterations), the accesses are independent,
  // i.e. they are far enough apart that they won't access the same location
  // across all loop iterations.
  if (HasSameSize &&
      isSafeDependenceDistance(
          DL, SE, *(PSE.getSymbolicMaxBackedgeTakenCount()), *Dist, MaxStride))
    return Dependence::NoDep;

  // The rest of this function relies on ConstDist being at most 64-bits, which
  // is checked earlier. Will assert if the calling code changes.
  const APInt *APDist = nullptr;
  uint64_t ConstDist =
      match(Dist, m_scev_APInt(APDist)) ? APDist->abs().getZExtValue() : 0;

  // Attempt to prove strided accesses independent.
  if (APDist) {
    // If the distance between accesses and their strides are known constants,
    // check whether the accesses interlace each other.
    if (ConstDist > 0 && CommonStride && CommonStride > 1 && HasSameSize &&
        areStridedAccessesIndependent(ConstDist, *CommonStride, TypeByteSize)) {
      LLVM_DEBUG(dbgs() << "LAA: Strided accesses are independent\n");
      return Dependence::NoDep;
    }
  } else {
    if (!LoopGuards)
      LoopGuards.emplace(
          ScalarEvolution::LoopGuards::collect(InnermostLoop, SE));
    Dist = SE.applyLoopGuards(Dist, *LoopGuards);
  }

  // Negative distances are not plausible dependencies.
  if (SE.isKnownNonPositive(Dist)) {
    if (SE.isKnownNonNegative(Dist)) {
      if (HasSameSize) {
        // Write to the same location with the same size.
        return Dependence::Forward;
      }
      LLVM_DEBUG(dbgs() << "LAA: possibly zero dependence difference but "
                           "different type sizes\n");
      return Dependence::Unknown;
    }

    bool IsTrueDataDependence = (AIsWrite && !BIsWrite);
    // Check if the first access writes to a location that is read in a later
    // iteration, where the distance between them is not a multiple of a vector
    // factor and relatively small.
    //
    // NOTE: There is no need to update MaxSafeVectorWidthInBits after call to
    // couldPreventStoreLoadForward, even if it changed MinDepDistBytes, since
    // a forward dependency will allow vectorization using any width.

    if (IsTrueDataDependence && EnableForwardingConflictDetection) {
      if (!ConstDist) {
        return CheckCompletelyBeforeOrAfter() ? Dependence::NoDep
                                              : Dependence::Unknown;
      }
      if (!HasSameSize ||
          couldPreventStoreLoadForward(ConstDist, TypeByteSize)) {
        LLVM_DEBUG(
            dbgs() << "LAA: Forward but may prevent st->ld forwarding\n");
        return Dependence::ForwardButPreventsForwarding;
      }
    }

    LLVM_DEBUG(dbgs() << "LAA: Dependence is negative\n");
    return Dependence::Forward;
  }

  int64_t MinDistance = SE.getSignedRangeMin(Dist).getSExtValue();
  // Below we only handle strictly positive distances.
  if (MinDistance <= 0) {
    return CheckCompletelyBeforeOrAfter() ? Dependence::NoDep
                                          : Dependence::Unknown;
  }

  if (!HasSameSize) {
    if (CheckCompletelyBeforeOrAfter())
      return Dependence::NoDep;
    LLVM_DEBUG(dbgs() << "LAA: ReadWrite-Write positive dependency with "
                         "different type sizes\n");
    return Dependence::Unknown;
  }
  // Bail out early if passed-in parameters make vectorization not feasible.
  unsigned ForcedFactor = (VectorizerParams::VectorizationFactor ?
                           VectorizerParams::VectorizationFactor : 1);
  unsigned ForcedUnroll = (VectorizerParams::VectorizationInterleave ?
                           VectorizerParams::VectorizationInterleave : 1);
  // The minimum number of iterations for a vectorized/unrolled version.
  unsigned MinNumIter = std::max(ForcedFactor * ForcedUnroll, 2U);

  // It's not vectorizable if the distance is smaller than the minimum distance
  // needed for a vectorized/unrolled version. Vectorizing one iteration in
  // front needs MaxStride. Vectorizing the last iteration needs TypeByteSize.
  // (There is no need to add the last gap distance.)
  //
  // E.g. Assume one char is 1 byte in memory and one int is 4 bytes.
  //      foo(int *A) {
  //        int *B = (int *)((char *)A + 14);
  //        for (i = 0 ; i < 1024 ; i += 2)
  //          B[i] = A[i] + 1;
  //      }
  //
  // Two accesses in memory (stride is 4 * 2):
  //     | A[0] |      | A[2] |      | A[4] |      | A[6] |      |
  //     | B[0] |      | B[2] |      | B[4] |      |
  //
  // The distance needed for vectorizing all iterations except the last:
  // 4 * 2 * (MinNumIter - 1). The distance needed for the last iteration: 4.
  // So the minimum distance needed is: 4 * 2 * (MinNumIter - 1) + 4.
  //
  // If MinNumIter is 2, it is vectorizable as the minimum distance needed is
  // 12, which is less than the distance (14).
  //
  // If MinNumIter is 4 (say, if a user forces the vectorization factor to be
  // 4), the minimum distance needed is 28, which is greater than the distance
  // (14). It is not safe to do vectorization.
  //
  // We use MaxStride (maximum of src and sink strides) to get a conservative
  // lower bound on the MinDistanceNeeded in case of different strides.

  // We know that Dist is positive, but it may not be constant. Use the signed
  // minimum for computations below, as this ensures we compute the closest
  // possible dependence distance.
  uint64_t MinDistanceNeeded = MaxStride * (MinNumIter - 1) + TypeByteSize;
  if (MinDistanceNeeded > static_cast<uint64_t>(MinDistance)) {
    if (!ConstDist) {
      // For non-constant distances, we checked the lower bound of the
      // dependence distance and the distance may be larger at runtime (and
      // safe for vectorization). Classify it as Unknown, so we re-try with
      // runtime checks, unless we can prove both accesses cannot overlap.
      return CheckCompletelyBeforeOrAfter() ? Dependence::NoDep
                                            : Dependence::Unknown;
    }
    LLVM_DEBUG(dbgs() << "LAA: Failure because of positive minimum distance "
                      << MinDistance << '\n');
    return Dependence::Backward;
  }

  // Unsafe if the minimum distance needed is greater than the smallest
  // dependence distance.
  if (MinDistanceNeeded > MinDepDistBytes) {
    LLVM_DEBUG(dbgs() << "LAA: Failure because it needs at least "
                      << MinDistanceNeeded << " size in bytes\n");
    return Dependence::Backward;
  }

  MinDepDistBytes =
      std::min(static_cast<uint64_t>(MinDistance), MinDepDistBytes);

  bool IsTrueDataDependence = (!AIsWrite && BIsWrite);
  if (IsTrueDataDependence && EnableForwardingConflictDetection && ConstDist &&
      couldPreventStoreLoadForward(MinDistance, TypeByteSize, *CommonStride))
    return Dependence::BackwardVectorizableButPreventsForwarding;

  uint64_t MaxVF = MinDepDistBytes / MaxStride;
  LLVM_DEBUG(dbgs() << "LAA: Positive min distance " << MinDistance
                    << " with max VF = " << MaxVF << '\n');

  uint64_t MaxVFInBits = MaxVF * TypeByteSize * 8;
  if (!ConstDist && MaxVFInBits < MaxTargetVectorWidthInBits) {
    // For non-constant distances, we checked the lower bound of the dependence
    // distance and the distance may be larger at runtime (and safe for
    // vectorization). Classify it as Unknown, so we re-try with runtime
    // checks, unless we can prove both accesses cannot overlap.
    return CheckCompletelyBeforeOrAfter() ? Dependence::NoDep
                                          : Dependence::Unknown;
  }

  if (CheckCompletelyBeforeOrAfter())
    return Dependence::NoDep;

  MaxSafeVectorWidthInBits = std::min(MaxSafeVectorWidthInBits, MaxVFInBits);
  return Dependence::BackwardVectorizable;
}

bool MemoryDepChecker::areDepsSafe(const DepCandidates &DepCands,
                                   const MemAccessInfoList &CheckDeps) {

  MinDepDistBytes = -1;
  SmallPtrSet<MemAccessInfo, 8> Visited;
  for (MemAccessInfo CurAccess : CheckDeps) {
    if (Visited.contains(CurAccess))
      continue;

    // Check accesses within this set.
    EquivalenceClasses<MemAccessInfo>::member_iterator AI =
        DepCands.findLeader(CurAccess);
    EquivalenceClasses<MemAccessInfo>::member_iterator AE =
        DepCands.member_end();

    // Check every access pair.
    while (AI != AE) {
      Visited.insert(*AI);
      bool AIIsWrite = AI->getInt();
      // Check loads only against later members of the equivalence class, but
      // check stores also against other accesses to the same pointer, i.e.
      // other stores to the same address.
      EquivalenceClasses<MemAccessInfo>::member_iterator OI =
          (AIIsWrite ? AI : std::next(AI));
      while (OI != AE) {
        // Check every accessing instruction pair in program order.
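        // Sketch of the enumeration (not from the source): for a class with
        // members {W (store), R (load)}, W starts at OI == AI, so its own
        // instructions are paired against each other (store-store to the same
        // pointer) and against R's; R starts at std::next(AI), so load-load
        // pairs are never formed.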
        auto &Acc = Accesses[*AI];
        for (std::vector<unsigned>::iterator I1 = Acc.begin(), I1E = Acc.end();
             I1 != I1E; ++I1)
          // Scan all accesses of another equivalence class, but only the next
          // accesses of the same equivalence class.
          for (std::vector<unsigned>::iterator
                   I2 = (OI == AI ? std::next(I1) : Accesses[*OI].begin()),
                   I2E = (OI == AI ? I1E : Accesses[*OI].end());
               I2 != I2E; ++I2) {
            auto A = std::make_pair(&*AI, *I1);
            auto B = std::make_pair(&*OI, *I2);

            assert(*I1 != *I2);
            if (*I1 > *I2)
              std::swap(A, B);

            Dependence::DepType Type =
                isDependent(*A.first, A.second, *B.first, B.second);
            mergeInStatus(Dependence::isSafeForVectorization(Type));

            // Gather dependences unless we accumulated MaxDependences
            // dependences. In that case return as soon as we find the first
            // unsafe dependence. This puts a limit on this quadratic
            // algorithm.
            if (RecordDependences) {
              if (Type != Dependence::NoDep)
                Dependences.emplace_back(A.second, B.second, Type);

              if (Dependences.size() >= MaxDependences) {
                RecordDependences = false;
                Dependences.clear();
                LLVM_DEBUG(dbgs()
                           << "Too many dependences, stopped recording\n");
              }
            }
            if (!RecordDependences && !isSafeForVectorization())
              return false;
          }
        ++OI;
      }
      ++AI;
    }
  }

  LLVM_DEBUG(dbgs() << "Total Dependences: " << Dependences.size() << "\n");
  return isSafeForVectorization();
}

SmallVector<Instruction *, 4>
MemoryDepChecker::getInstructionsForAccess(Value *Ptr, bool IsWrite) const {
  MemAccessInfo Access(Ptr, IsWrite);
  auto &IndexVector = Accesses.find(Access)->second;

  SmallVector<Instruction *, 4> Insts;
  transform(IndexVector,
            std::back_inserter(Insts),
            [&](unsigned Idx) { return this->InstMap[Idx]; });
  return Insts;
}

const char *MemoryDepChecker::Dependence::DepName[] = {
    "NoDep",
    "Unknown",
    "IndirectUnsafe",
    "Forward",
    "ForwardButPreventsForwarding",
    "Backward",
    "BackwardVectorizable",
    "BackwardVectorizableButPreventsForwarding"};

void MemoryDepChecker::Dependence::print(
    raw_ostream &OS, unsigned Depth,
    const SmallVectorImpl<Instruction *> &Instrs) const {
  OS.indent(Depth) << DepName[Type] << ":\n";
  OS.indent(Depth + 2) << *Instrs[Source] << " -> \n";
  OS.indent(Depth + 2) << *Instrs[Destination] << "\n";
}

bool LoopAccessInfo::canAnalyzeLoop() {
  // We need to have a loop header.
  LLVM_DEBUG(dbgs() << "\nLAA: Checking a loop in '"
                    << TheLoop->getHeader()->getParent()->getName()
                    << "' from " << TheLoop->getLocStr() << "\n");

  // We can only analyze innermost loops.
  if (!TheLoop->isInnermost()) {
    LLVM_DEBUG(dbgs() << "LAA: loop is not the innermost loop\n");
    recordAnalysis("NotInnerMostLoop") << "loop is not the innermost loop";
    return false;
  }

  // We must have a single backedge.
  if (TheLoop->getNumBackEdges() != 1) {
    LLVM_DEBUG(
        dbgs() << "LAA: loop control flow is not understood by analyzer\n");
    recordAnalysis("CFGNotUnderstood")
        << "loop control flow is not understood by analyzer";
    return false;
  }

  // ScalarEvolution needs to be able to find the symbolic max backedge taken
  // count, which is an upper bound on the number of loop iterations.
  // The loop may execute fewer iterations if it exits via an uncountable
  // exit.
  const SCEV *ExitCount = PSE->getSymbolicMaxBackedgeTakenCount();
  if (isa<SCEVCouldNotCompute>(ExitCount)) {
    recordAnalysis("CantComputeNumberOfIterations")
        << "could not determine number of loop iterations";
    LLVM_DEBUG(dbgs() << "LAA: SCEV could not compute the loop exit count.\n");
    return false;
  }

  LLVM_DEBUG(dbgs() << "LAA: Found an analyzable loop: "
                    << TheLoop->getHeader()->getName() << "\n");
  return true;
}

bool LoopAccessInfo::analyzeLoop(AAResults *AA, const LoopInfo *LI,
                                 const TargetLibraryInfo *TLI,
                                 DominatorTree *DT) {
  // Holds the Load and Store instructions.
  SmallVector<LoadInst *, 16> Loads;
  SmallVector<StoreInst *, 16> Stores;
  SmallPtrSet<MDNode *, 8> LoopAliasScopes;

  // Holds all the different accesses in the loop.
  unsigned NumReads = 0;
  unsigned NumReadWrites = 0;

  bool HasComplexMemInst = false;

  // A runtime check is only legal to insert if there are no convergent calls.
  HasConvergentOp = false;

  PtrRtChecking->Pointers.clear();
  PtrRtChecking->Need = false;

  const bool IsAnnotatedParallel = TheLoop->isAnnotatedParallel();

  const bool EnableMemAccessVersioningOfLoop =
      EnableMemAccessVersioning &&
      !TheLoop->getHeader()->getParent()->hasOptSize();

  // Traverse blocks in fixed RPOT order, regardless of their storage in the
  // loop info, as it may be arbitrary.
  LoopBlocksRPO RPOT(TheLoop);
  RPOT.perform(LI);
  for (BasicBlock *BB : RPOT) {
    // Scan the BB and collect legal loads and stores. Also detect any
    // convergent instructions.
    for (Instruction &I : *BB) {
      if (auto *Call = dyn_cast<CallBase>(&I)) {
        if (Call->isConvergent())
          HasConvergentOp = true;
      }

      // If we have found both a non-vectorizable memory instruction and a
      // convergent operation in this loop, there is no reason to continue the
      // search.
      if (HasComplexMemInst && HasConvergentOp)
        return false;

      // Avoid hitting recordAnalysis multiple times.
      if (HasComplexMemInst)
        continue;

      // Record alias scopes defined inside the loop.
      if (auto *Decl = dyn_cast<NoAliasScopeDeclInst>(&I))
        for (Metadata *Op : Decl->getScopeList()->operands())
          LoopAliasScopes.insert(cast<MDNode>(Op));

      // Many math library functions read the rounding mode. We will only
      // vectorize a loop if it contains known function calls that don't set
      // the flag. Therefore, it is safe to ignore this read from memory.
      auto *Call = dyn_cast<CallInst>(&I);
      if (Call && getVectorIntrinsicIDForCall(Call, TLI))
        continue;

      // If this is a load, save it. If this instruction can read from memory
      // but is not a load, we only allow it if it's a call to a function with
      // a vector mapping and no pointer arguments.
      if (I.mayReadFromMemory()) {
        auto hasPointerArgs = [](CallBase *CB) {
          return any_of(CB->args(), [](Value const *Arg) {
            return Arg->getType()->isPointerTy();
          });
        };

        // If the function has an explicit vectorized counterpart, and does not
        // take output/input pointers, we can safely assume that it can be
        // vectorized.
        if (Call && !Call->isNoBuiltin() && Call->getCalledFunction() &&
            !hasPointerArgs(Call) && !VFDatabase::getMappings(*Call).empty())
          continue;

        auto *Ld = dyn_cast<LoadInst>(&I);
        if (!Ld) {
          recordAnalysis("CantVectorizeInstruction", Ld)
              << "instruction cannot be vectorized";
          HasComplexMemInst = true;
          continue;
        }
        if (!Ld->isSimple() && !IsAnnotatedParallel) {
          recordAnalysis("NonSimpleLoad", Ld)
              << "read with atomic ordering or volatile read";
          LLVM_DEBUG(dbgs() << "LAA: Found a non-simple load.\n");
          HasComplexMemInst = true;
          continue;
        }
        NumLoads++;
        Loads.push_back(Ld);
        DepChecker->addAccess(Ld);
        if (EnableMemAccessVersioningOfLoop)
          collectStridedAccess(Ld);
        continue;
      }

      // Save 'store' instructions. Abort if other instructions write to
      // memory.
      if (I.mayWriteToMemory()) {
        auto *St = dyn_cast<StoreInst>(&I);
        if (!St) {
          recordAnalysis("CantVectorizeInstruction", St)
              << "instruction cannot be vectorized";
          HasComplexMemInst = true;
          continue;
        }
        if (!St->isSimple() && !IsAnnotatedParallel) {
          recordAnalysis("NonSimpleStore", St)
              << "write with atomic ordering or volatile write";
          LLVM_DEBUG(dbgs() << "LAA: Found a non-simple store.\n");
          HasComplexMemInst = true;
          continue;
        }
        NumStores++;
        Stores.push_back(St);
        DepChecker->addAccess(St);
        if (EnableMemAccessVersioningOfLoop)
          collectStridedAccess(St);
      }
    } // Next instr.
  } // Next block.

  if (HasComplexMemInst)
    return false;

  // Now we have two lists that hold the loads and the stores.
  // Next, we find the pointers that they use.

  // Check if we see any stores. If there are no stores, then we don't
  // care if the pointers are *restrict*.
  if (!Stores.size()) {
    LLVM_DEBUG(dbgs() << "LAA: Found a read-only loop!\n");
    return true;
  }

  MemoryDepChecker::DepCandidates DepCands;
  AccessAnalysis Accesses(TheLoop, AA, LI, DepCands, *PSE, LoopAliasScopes);

  // Holds the analyzed pointers. We don't want to call getUnderlyingObjects
  // multiple times on the same object. If the ptr is accessed twice, once
  // for read and once for write, it will only appear once (on the write
  // list). This is okay, since we are going to check for conflicts between
  // writes and between reads and writes, but not between reads and reads.
  SmallSet<std::pair<Value *, Type *>, 16> Seen;

  // Record uniform store addresses to identify if we have multiple stores
  // to the same address.
  SmallPtrSet<Value *, 16> UniformStores;

  for (StoreInst *ST : Stores) {
    Value *Ptr = ST->getPointerOperand();

    if (isInvariant(Ptr)) {
      // Record store instructions to loop invariant addresses.
      StoresToInvariantAddresses.push_back(ST);
      HasStoreStoreDependenceInvolvingLoopInvariantAddress |=
          !UniformStores.insert(Ptr).second;
    }

    // If we did *not* see this pointer before, insert it to the read-write
    // list. At this phase it is only a 'write' list.
    Type *AccessTy = getLoadStoreType(ST);
    if (Seen.insert({Ptr, AccessTy}).second) {
      ++NumReadWrites;

      MemoryLocation Loc = MemoryLocation::get(ST);
      // The TBAA metadata could have a control dependency on the predication
      // condition, so we cannot rely on it when determining whether or not we
      // need runtime pointer checks.
      if (blockNeedsPredication(ST->getParent(), TheLoop, DT))
        Loc.AATags.TBAA = nullptr;

      visitPointers(const_cast<Value *>(Loc.Ptr), *TheLoop,
                    [&Accesses, AccessTy, Loc](Value *Ptr) {
                      MemoryLocation NewLoc = Loc.getWithNewPtr(Ptr);
                      Accesses.addStore(NewLoc, AccessTy);
                    });
    }
  }

  if (IsAnnotatedParallel) {
    LLVM_DEBUG(
        dbgs() << "LAA: A loop annotated parallel, ignore memory dependency "
               << "checks.\n");
    return true;
  }

  for (LoadInst *LD : Loads) {
    Value *Ptr = LD->getPointerOperand();
    // If we did *not* see this pointer before, insert it to the
    // read list. If we *did* see it before, then it is already in
    // the read-write list. This allows us to vectorize expressions
    // such as A[i] += x; Because the address of A[i] is a read-write
    // pointer. This only works if the index of A[i] is consecutive.
    // If the address of i is unknown (for example A[B[i]]) then we may
    // read a few words, modify, and write a few words, and some of the
    // words may be written to the same address.
    bool IsReadOnlyPtr = false;
    Type *AccessTy = getLoadStoreType(LD);
    if (Seen.insert({Ptr, AccessTy}).second ||
        !getPtrStride(*PSE, AccessTy, Ptr, TheLoop, SymbolicStrides)) {
      ++NumReads;
      IsReadOnlyPtr = true;
    }

    // See if there is an unsafe dependency between a load to a uniform address
    // and store to the same uniform address.
    if (UniformStores.contains(Ptr)) {
      LLVM_DEBUG(dbgs() << "LAA: Found an unsafe dependency between a uniform "
                           "load and uniform store to the same address!\n");
      HasLoadStoreDependenceInvolvingLoopInvariantAddress = true;
    }

    MemoryLocation Loc = MemoryLocation::get(LD);
    // The TBAA metadata could have a control dependency on the predication
    // condition, so we cannot rely on it when determining whether or not we
    // need runtime pointer checks.
    if (blockNeedsPredication(LD->getParent(), TheLoop, DT))
      Loc.AATags.TBAA = nullptr;

    visitPointers(const_cast<Value *>(Loc.Ptr), *TheLoop,
                  [&Accesses, AccessTy, Loc, IsReadOnlyPtr](Value *Ptr) {
                    MemoryLocation NewLoc = Loc.getWithNewPtr(Ptr);
                    Accesses.addLoad(NewLoc, AccessTy, IsReadOnlyPtr);
                  });
  }

  // If we write (or read-write) to a single destination and there are no
  // other reads in this loop then it is safe to vectorize.
  if (NumReadWrites == 1 && NumReads == 0) {
    LLVM_DEBUG(dbgs() << "LAA: Found a write-only loop!\n");
    return true;
  }

  // Build dependence sets and check whether we need a runtime pointer bounds
  // check.
  Accesses.buildDependenceSets();

  // Find pointers with computable bounds. We are going to use this information
  // to place a runtime bound check.
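  // A pointer whose SCEV is not an affine AddRec, e.g. the A[B[i]] pattern
  // (an illustrative sketch), has no computable bounds; canCheckPtrAtRT then
  // reports it via UncomputablePtr so the remark below can point at it.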
  Value *UncomputablePtr = nullptr;
  HasCompletePtrRtChecking = Accesses.canCheckPtrAtRT(
      *PtrRtChecking, TheLoop, SymbolicStrides, UncomputablePtr, AllowPartial);
  if (!HasCompletePtrRtChecking) {
    const auto *I = dyn_cast_or_null<Instruction>(UncomputablePtr);
    recordAnalysis("CantIdentifyArrayBounds", I)
        << "cannot identify array bounds";
    LLVM_DEBUG(dbgs() << "LAA: We can't vectorize because we can't find "
                      << "the array bounds.\n");
    return false;
  }

  LLVM_DEBUG(dbgs() << "LAA: May be able to perform a memory runtime check if "
                       "needed.\n");

  bool DepsAreSafe = true;
  if (Accesses.isDependencyCheckNeeded()) {
    LLVM_DEBUG(dbgs() << "LAA: Checking memory dependencies\n");
    DepsAreSafe =
        DepChecker->areDepsSafe(DepCands, Accesses.getDependenciesToCheck());

    if (!DepsAreSafe && DepChecker->shouldRetryWithRuntimeCheck()) {
      LLVM_DEBUG(dbgs() << "LAA: Retrying with memory checks\n");

      // Clear the dependency checks. We assume they are not needed.
      Accesses.resetDepChecks(*DepChecker);

      PtrRtChecking->reset();
      PtrRtChecking->Need = true;

      UncomputablePtr = nullptr;
      HasCompletePtrRtChecking =
          Accesses.canCheckPtrAtRT(*PtrRtChecking, TheLoop, SymbolicStrides,
                                   UncomputablePtr, AllowPartial);

      // Check that we found the bounds for the pointer.
      if (!HasCompletePtrRtChecking) {
        auto *I = dyn_cast_or_null<Instruction>(UncomputablePtr);
        recordAnalysis("CantCheckMemDepsAtRunTime", I)
            << "cannot check memory dependencies at runtime";
        LLVM_DEBUG(dbgs() << "LAA: Can't vectorize with memory checks\n");
        return false;
      }
      DepsAreSafe = true;
    }
  }

  if (HasConvergentOp) {
    recordAnalysis("CantInsertRuntimeCheckWithConvergent")
        << "cannot add control dependency to convergent operation";
    LLVM_DEBUG(dbgs() << "LAA: We can't vectorize because a runtime check "
                         "would be needed with a convergent operation\n");
    return false;
  }

  if (DepsAreSafe) {
    LLVM_DEBUG(
        dbgs() << "LAA: No unsafe dependent memory operations in loop. We"
               << (PtrRtChecking->Need ? "" : " don't")
               << " need runtime memory checks.\n");
    return true;
  }

  emitUnsafeDependenceRemark();
  return false;
}

void LoopAccessInfo::emitUnsafeDependenceRemark() {
  const auto *Deps = getDepChecker().getDependences();
  if (!Deps)
    return;
  const auto *Found =
      llvm::find_if(*Deps, [](const MemoryDepChecker::Dependence &D) {
        return MemoryDepChecker::Dependence::isSafeForVectorization(D.Type) !=
               MemoryDepChecker::VectorizationSafetyStatus::Safe;
      });
  if (Found == Deps->end())
    return;
  MemoryDepChecker::Dependence Dep = *Found;

  LLVM_DEBUG(dbgs() << "LAA: unsafe dependent memory operations in loop\n");

  // Emit remark for first unsafe dependence.
  bool HasForcedDistribution = false;
  std::optional<const MDOperand *> Value =
      findStringMetadataForLoop(TheLoop, "llvm.loop.distribute.enable");
  if (Value) {
    const MDOperand *Op = *Value;
    assert(Op && mdconst::hasa<ConstantInt>(*Op) && "invalid metadata");
    HasForcedDistribution = mdconst::extract<ConstantInt>(*Op)->getZExtValue();
  }

  const std::string Info =
      HasForcedDistribution
          ? "unsafe dependent memory operations in loop."
          : "unsafe dependent memory operations in loop. Use "
            "#pragma clang loop distribute(enable) to allow loop distribution "
            "to attempt to isolate the offending operations into a separate "
            "loop";
  OptimizationRemarkAnalysis &R =
      recordAnalysis("UnsafeDep", Dep.getDestination(getDepChecker())) << Info;

  switch (Dep.Type) {
  case MemoryDepChecker::Dependence::NoDep:
  case MemoryDepChecker::Dependence::Forward:
  case MemoryDepChecker::Dependence::BackwardVectorizable:
    llvm_unreachable("Unexpected dependence");
  case MemoryDepChecker::Dependence::Backward:
    R << "\nBackward loop carried data dependence.";
    break;
  case MemoryDepChecker::Dependence::ForwardButPreventsForwarding:
    R << "\nForward loop carried data dependence that prevents "
         "store-to-load forwarding.";
    break;
  case MemoryDepChecker::Dependence::BackwardVectorizableButPreventsForwarding:
    R << "\nBackward loop carried data dependence that prevents "
         "store-to-load forwarding.";
    break;
  case MemoryDepChecker::Dependence::IndirectUnsafe:
    R << "\nUnsafe indirect dependence.";
    break;
  case MemoryDepChecker::Dependence::Unknown:
    R << "\nUnknown data dependence.";
    break;
  }

  if (Instruction *I = Dep.getSource(getDepChecker())) {
    DebugLoc SourceLoc = I->getDebugLoc();
    if (auto *DD = dyn_cast_or_null<Instruction>(getPointerOperand(I)))
      SourceLoc = DD->getDebugLoc();
    if (SourceLoc)
      R << " Memory location is the same as accessed at "
        << ore::NV("Location", SourceLoc);
  }
}

bool LoopAccessInfo::blockNeedsPredication(BasicBlock *BB, Loop *TheLoop,
                                           DominatorTree *DT) {
  assert(TheLoop->contains(BB) && "Unknown block used");

  // Blocks that do not dominate the latch need predication.
  const BasicBlock *Latch = TheLoop->getLoopLatch();
  return !DT->dominates(BB, Latch);
}

OptimizationRemarkAnalysis &
LoopAccessInfo::recordAnalysis(StringRef RemarkName, const Instruction *I) {
  assert(!Report && "Multiple reports generated");

  const BasicBlock *CodeRegion = TheLoop->getHeader();
  DebugLoc DL = TheLoop->getStartLoc();

  if (I) {
    CodeRegion = I->getParent();
    // If there is no debug location attached to the instruction, fall back to
    // using the loop's.
    if (I->getDebugLoc())
      DL = I->getDebugLoc();
  }

  Report = std::make_unique<OptimizationRemarkAnalysis>(DEBUG_TYPE, RemarkName,
                                                        DL, CodeRegion);
  return *Report;
}

bool LoopAccessInfo::isInvariant(Value *V) const {
  auto *SE = PSE->getSE();
  if (TheLoop->isLoopInvariant(V))
    return true;
  if (!SE->isSCEVable(V->getType()))
    return false;
  const SCEV *S = SE->getSCEV(V);
  return SE->isLoopInvariant(S, TheLoop);
}

/// If \p Ptr is a GEP with exactly one loop-variant operand, return that
/// operand. Otherwise, return \p Ptr.
static Value *getLoopVariantGEPOperand(Value *Ptr, ScalarEvolution *SE,
                                       Loop *Lp) {
  auto *GEP = dyn_cast<GetElementPtrInst>(Ptr);
  if (!GEP)
    return Ptr;

  Value *V = Ptr;
  for (const Use &U : GEP->operands()) {
    if (!SE->isLoopInvariant(SE->getSCEV(U), Lp)) {
      if (V == Ptr)
        V = U;
      else
        // There must be exactly one loop-variant operand.
        return Ptr;
    }
  }
  return V;
}

/// Get the stride of a pointer access in a loop. Looks for symbolic
/// strides "a[i*stride]".
/// Returns the symbolic stride, or null otherwise.
static const SCEV *getStrideFromPointer(Value *Ptr, ScalarEvolution *SE,
                                        Loop *Lp) {
  auto *PtrTy = dyn_cast<PointerType>(Ptr->getType());
  if (!PtrTy)
    return nullptr;

  // Try to remove a gep instruction to make the pointer (actually the index
  // at this point) easier to analyze. If OrigPtr is equal to Ptr we are
  // analyzing the pointer, otherwise we are analyzing the index.
  Value *OrigPtr = Ptr;

  Ptr = getLoopVariantGEPOperand(Ptr, SE, Lp);
  const SCEV *V = SE->getSCEV(Ptr);

  if (Ptr != OrigPtr)
    // Strip off casts.
    while (auto *C = dyn_cast<SCEVIntegralCastExpr>(V))
      V = C->getOperand();

  if (!match(V, m_scev_AffineAddRec(m_SCEV(), m_SCEV(V), m_SpecificLoop(Lp))))
    return nullptr;

  // Note that the restrictions after this loop-invariant check are only
  // profitability restrictions.
  if (!SE->isLoopInvariant(V, Lp))
    return nullptr;

  // Look for the loop invariant symbolic value.
  if (isa<SCEVUnknown>(V))
    return V;

  if (auto *C = dyn_cast<SCEVIntegralCastExpr>(V))
    if (isa<SCEVUnknown>(C->getOperand()))
      return V;

  return nullptr;
}

void LoopAccessInfo::collectStridedAccess(Value *MemAccess) {
  Value *Ptr = getLoadStorePointerOperand(MemAccess);
  if (!Ptr)
    return;

  // Note: getStrideFromPointer is a *profitability* heuristic. We
  // could broaden the scope of values returned here - to anything
  // which happens to be loop invariant and contributes to the
  // computation of an interesting IV - but we chose not to as we
  // don't have a cost model here, and broadening the scope exposes
  // far too many unprofitable cases.
  const SCEV *StrideExpr = getStrideFromPointer(Ptr, PSE->getSE(), TheLoop);
  if (!StrideExpr)
    return;

  LLVM_DEBUG(dbgs() << "LAA: Found a strided access that is a candidate for "
                       "versioning:");
  LLVM_DEBUG(dbgs() << "  Ptr: " << *Ptr << " Stride: " << *StrideExpr << "\n");

  if (!SpeculateUnitStride) {
    LLVM_DEBUG(dbgs() << "  Chose not to due to -laa-speculate-unit-stride\n");
    return;
  }

  // Avoid adding the "Stride == 1" predicate when we know that
  // Stride >= Trip-Count. Such a predicate will effectively optimize a single
  // or zero iteration loop, as Trip-Count <= Stride == 1.
  //
  // TODO: We are currently not making a very informed decision on when it is
  // beneficial to apply stride versioning. It might make more sense that the
  // users of this analysis (such as the vectorizer) will trigger it, based on
  // their specific cost considerations; For example, in cases where stride
  // versioning does not help resolving memory accesses/dependences, the
  // vectorizer should evaluate the cost of the runtime test, and the benefit
  // of various possible stride specializations, considering the alternatives
  // of using gather/scatters (if available).

  const SCEV *MaxBTC = PSE->getSymbolicMaxBackedgeTakenCount();

  // Match the types so we can compare the stride and the MaxBTC.
  // The Stride can be positive/negative, so we sign extend Stride;
  // The backedgeTakenCount is non-negative, so we zero extend MaxBTC.
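  // Numeric sketch (hypothetical): for an i32 Stride and i64 MaxBTC, Stride
  // is sign-extended to i64. If Stride == 100 and MaxBTC == 7, then
  // Stride - MaxBTC == 93 > 0 proves Stride >= TripCount (== 8), and the
  // "Stride == 1" predicate would only cover a loop that executes at most
  // once, so versioning is skipped.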
  const DataLayout &DL = TheLoop->getHeader()->getDataLayout();
  uint64_t StrideTypeSizeBits = DL.getTypeSizeInBits(StrideExpr->getType());
  uint64_t BETypeSizeBits = DL.getTypeSizeInBits(MaxBTC->getType());
  const SCEV *CastedStride = StrideExpr;
  const SCEV *CastedBECount = MaxBTC;
  ScalarEvolution *SE = PSE->getSE();
  if (BETypeSizeBits >= StrideTypeSizeBits)
    CastedStride = SE->getNoopOrSignExtend(StrideExpr, MaxBTC->getType());
  else
    CastedBECount = SE->getZeroExtendExpr(MaxBTC, StrideExpr->getType());
  const SCEV *StrideMinusBETaken =
      SE->getMinusSCEV(CastedStride, CastedBECount);
  // Since TripCount == BackedgeTakenCount + 1, checking
  // "Stride >= TripCount" is equivalent to checking
  // "Stride - MaxBTC > 0".
  if (SE->isKnownPositive(StrideMinusBETaken)) {
    LLVM_DEBUG(
        dbgs() << "LAA: Stride>=TripCount; No point in versioning as the "
                  "Stride==1 predicate will imply that the loop executes "
                  "at most once.\n");
    return;
  }
  LLVM_DEBUG(dbgs() << "LAA: Found a strided access that we can version.\n");

  // Strip back off the integer cast, and check that our result is a
  // SCEVUnknown as we expect.
  const SCEV *StrideBase = StrideExpr;
  if (const auto *C = dyn_cast<SCEVIntegralCastExpr>(StrideBase))
    StrideBase = C->getOperand();
  SymbolicStrides[Ptr] = cast<SCEVUnknown>(StrideBase);
}
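// Net effect, as a sketch (hypothetical source code, not from this file):
// for a loop accessing A[i * n] where n is read outside the loop,
// collectStridedAccess maps the access's pointer to the SCEVUnknown for n in
// SymbolicStrides, so that clients can version the loop under the predicate
// "n == 1" via replaceSymbolicStrideSCEV.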
LoopAccessInfo::LoopAccessInfo(Loop *L, ScalarEvolution *SE,
                               const TargetTransformInfo *TTI,
                               const TargetLibraryInfo *TLI, AAResults *AA,
                               DominatorTree *DT, LoopInfo *LI,
                               bool AllowPartial)
    : PSE(std::make_unique<PredicatedScalarEvolution>(*SE, *L)),
      PtrRtChecking(nullptr), TheLoop(L), AllowPartial(AllowPartial) {
  unsigned MaxTargetVectorWidthInBits = std::numeric_limits<unsigned>::max();
  if (TTI && !TTI->enableScalableVectorization())
    // Scale the vector width by 2 as a rough estimate to also consider
    // interleaving.
    MaxTargetVectorWidthInBits =
        TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector) *
        2;

  DepChecker = std::make_unique<MemoryDepChecker>(*PSE, L, SymbolicStrides,
                                                  MaxTargetVectorWidthInBits);
  PtrRtChecking = std::make_unique<RuntimePointerChecking>(*DepChecker, SE);
  if (canAnalyzeLoop())
    CanVecMem = analyzeLoop(AA, LI, TLI, DT);
}

void LoopAccessInfo::print(raw_ostream &OS, unsigned Depth) const {
  if (CanVecMem) {
    OS.indent(Depth) << "Memory dependences are safe";
    const MemoryDepChecker &DC = getDepChecker();
    if (!DC.isSafeForAnyVectorWidth())
      OS << " with a maximum safe vector width of "
         << DC.getMaxSafeVectorWidthInBits() << " bits";
    if (!DC.isSafeForAnyStoreLoadForwardDistances()) {
      uint64_t SLDist = DC.getStoreLoadForwardSafeDistanceInBits();
      OS << ", with a maximum safe store-load forward width of " << SLDist
         << " bits";
    }
    if (PtrRtChecking->Need)
      OS << " with run-time checks";
    OS << "\n";
  }

  if (HasConvergentOp)
    OS.indent(Depth) << "Has convergent operation in loop\n";

  if (Report)
    OS.indent(Depth) << "Report: " << Report->getMsg() << "\n";

  if (auto *Dependences = DepChecker->getDependences()) {
    OS.indent(Depth) << "Dependences:\n";
    for (const auto &Dep : *Dependences) {
      Dep.print(OS, Depth + 2, DepChecker->getMemoryInstructions());
      OS << "\n";
    }
  } else
    OS.indent(Depth) << "Too many dependences, not recorded\n";

  // List the pairs of accesses that need run-time checks to prove
  // independence.
  PtrRtChecking->print(OS, Depth);
  if (PtrRtChecking->Need && !HasCompletePtrRtChecking)
    OS.indent(Depth) << "Generated run-time checks are incomplete\n";
  OS << "\n";

  OS.indent(Depth)
      << "Non vectorizable stores to invariant address were "
      << (HasStoreStoreDependenceInvolvingLoopInvariantAddress ||
                  HasLoadStoreDependenceInvolvingLoopInvariantAddress
              ? ""
              : "not ")
      << "found in loop.\n";

  OS.indent(Depth) << "SCEV assumptions:\n";
  PSE->getPredicate().print(OS, Depth);

  OS << "\n";

  OS.indent(Depth) << "Expressions re-written:\n";
  PSE->print(OS, Depth);
}

const LoopAccessInfo &LoopAccessInfoManager::getInfo(Loop &L,
                                                     bool AllowPartial) {
  const auto &[It, Inserted] = LoopAccessInfoMap.try_emplace(&L);

  // We need to create the LoopAccessInfo if either we don't already have one,
  // or if it was created with a different value of AllowPartial.
  if (Inserted || It->second->hasAllowPartial() != AllowPartial)
    It->second = std::make_unique<LoopAccessInfo>(&L, &SE, TTI, TLI, &AA, &DT,
                                                  &LI, AllowPartial);

  return *It->second;
}
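// Usage sketch (hypothetical caller, not from this file): a transform pass
// would typically fetch the cached analysis via
//   const LoopAccessInfo &LAI = LAM.getInfo(*L, /*AllowPartial=*/false);
// Requesting a different AllowPartial than the cached entry was built with
// recomputes the info, as guarded above.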
void LoopAccessInfoManager::clear() {
  // Collect LoopAccessInfo entries that may keep references to IR outside the
  // analyzed loop or SCEVs that may have been modified or invalidated. At the
  // moment, that is loops requiring memory or SCEV runtime checks, as those
  // cache SCEVs, e.g. for pointer expressions. Collect the loops first and
  // erase afterwards; erasing while iterating would invalidate the map
  // iterators.
  SmallVector<Loop *> ToRemove;
  for (const auto &[L, LAI] : LoopAccessInfoMap) {
    if (LAI->getRuntimePointerChecking()->getChecks().empty() &&
        LAI->getPSE().getPredicate().isAlwaysTrue())
      continue;
    ToRemove.push_back(L);
  }
  for (Loop *L : ToRemove)
    LoopAccessInfoMap.erase(L);
}

bool LoopAccessInfoManager::invalidate(
    Function &F, const PreservedAnalyses &PA,
    FunctionAnalysisManager::Invalidator &Inv) {
  // Check whether our analysis is preserved.
  auto PAC = PA.getChecker<LoopAccessAnalysis>();
  if (!PAC.preserved() && !PAC.preservedSet<AllAnalysesOn<Function>>())
    // If not, give up now.
    return true;

  // Check whether the analyses we depend on became invalid for any reason.
  // Skip checking TargetLibraryAnalysis as it is immutable and can't become
  // invalid.
  return Inv.invalidate<AAManager>(F, PA) ||
         Inv.invalidate<ScalarEvolutionAnalysis>(F, PA) ||
         Inv.invalidate<LoopAnalysis>(F, PA) ||
         Inv.invalidate<DominatorTreeAnalysis>(F, PA);
}

LoopAccessInfoManager LoopAccessAnalysis::run(Function &F,
                                              FunctionAnalysisManager &FAM) {
  auto &SE = FAM.getResult<ScalarEvolutionAnalysis>(F);
  auto &AA = FAM.getResult<AAManager>(F);
  auto &DT = FAM.getResult<DominatorTreeAnalysis>(F);
  auto &LI = FAM.getResult<LoopAnalysis>(F);
  auto &TTI = FAM.getResult<TargetIRAnalysis>(F);
  auto &TLI = FAM.getResult<TargetLibraryAnalysis>(F);
  return LoopAccessInfoManager(SE, AA, DT, LI, &TTI, &TLI);
}

AnalysisKey LoopAccessAnalysis::Key;