//===- SROA.cpp - Scalar Replacement Of Aggregates ------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This transformation implements the well known scalar replacement of
/// aggregates transformation. It tries to identify promotable elements of an
/// aggregate alloca, and promote them to registers. It will also try to
/// convert uses of an element (or set of elements) of an alloca into a vector
/// or bitfield-style integer scalar if appropriate.
///
/// It works to do this with minimal slicing of the alloca so that regions
/// which are merely transferred in and out of external memory remain unchanged
/// and are not decomposed to scalar code.
///
/// Because this also performs alloca promotion, it can be thought of as also
/// serving the purpose of SSA formation. The algorithm iterates on the
/// function until all opportunities for promotion have been realized.
///
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Scalar/SROA.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/Bitfields.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/PointerIntPair.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallBitVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/ADT/iterator.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/DomTreeUpdater.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/Loads.h"
#include "llvm/Analysis/PtrUseVisitor.h"
#include "llvm/Config/llvm-config.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/ConstantFolder.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DIBuilder.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfo.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GetElementPtrTypeIterator.h"
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Local.h"
"llvm/Transforms/Utils/Local.h" 86 #include "llvm/Transforms/Utils/PromoteMemToReg.h" 87 #include <algorithm> 88 #include <cassert> 89 #include <cstddef> 90 #include <cstdint> 91 #include <cstring> 92 #include <iterator> 93 #include <string> 94 #include <tuple> 95 #include <utility> 96 #include <variant> 97 #include <vector> 98 99 using namespace llvm; 100 101 #define DEBUG_TYPE "sroa" 102 103 STATISTIC(NumAllocasAnalyzed, "Number of allocas analyzed for replacement"); 104 STATISTIC(NumAllocaPartitions, "Number of alloca partitions formed"); 105 STATISTIC(MaxPartitionsPerAlloca, "Maximum number of partitions per alloca"); 106 STATISTIC(NumAllocaPartitionUses, "Number of alloca partition uses rewritten"); 107 STATISTIC(MaxUsesPerAllocaPartition, "Maximum number of uses of a partition"); 108 STATISTIC(NumNewAllocas, "Number of new, smaller allocas introduced"); 109 STATISTIC(NumPromoted, "Number of allocas promoted to SSA values"); 110 STATISTIC(NumLoadsSpeculated, "Number of loads speculated to allow promotion"); 111 STATISTIC(NumLoadsPredicated, 112 "Number of loads rewritten into predicated loads to allow promotion"); 113 STATISTIC( 114 NumStoresPredicated, 115 "Number of stores rewritten into predicated loads to allow promotion"); 116 STATISTIC(NumDeleted, "Number of instructions deleted"); 117 STATISTIC(NumVectorized, "Number of vectorized aggregates"); 118 119 /// Hidden option to experiment with completely strict handling of inbounds 120 /// GEPs. 121 static cl::opt<bool> SROAStrictInbounds("sroa-strict-inbounds", cl::init(false), 122 cl::Hidden); 123 /// Disable running mem2reg during SROA in order to test or debug SROA. 124 static cl::opt<bool> SROASkipMem2Reg("sroa-skip-mem2reg", cl::init(false), 125 cl::Hidden); 126 namespace { 127 128 class AllocaSliceRewriter; 129 class AllocaSlices; 130 class Partition; 131 132 class SelectHandSpeculativity { 133 unsigned char Storage = 0; // None are speculatable by default. 134 using TrueVal = Bitfield::Element<bool, 0, 1>; // Low 0'th bit. 135 using FalseVal = Bitfield::Element<bool, 1, 1>; // Low 1'th bit. 136 public: 137 SelectHandSpeculativity() = default; 138 SelectHandSpeculativity &setAsSpeculatable(bool isTrueVal); 139 bool isSpeculatable(bool isTrueVal) const; 140 bool areAllSpeculatable() const; 141 bool areAnySpeculatable() const; 142 bool areNoneSpeculatable() const; 143 // For interop as int half of PointerIntPair. 144 explicit operator intptr_t() const { return static_cast<intptr_t>(Storage); } 145 explicit SelectHandSpeculativity(intptr_t Storage_) : Storage(Storage_) {} 146 }; 147 static_assert(sizeof(SelectHandSpeculativity) == sizeof(unsigned char)); 148 149 using PossiblySpeculatableLoad = 150 PointerIntPair<LoadInst *, 2, SelectHandSpeculativity>; 151 using UnspeculatableStore = StoreInst *; 152 using RewriteableMemOp = 153 std::variant<PossiblySpeculatableLoad, UnspeculatableStore>; 154 using RewriteableMemOps = SmallVector<RewriteableMemOp, 2>; 155 156 /// An optimization pass providing Scalar Replacement of Aggregates. 157 /// 158 /// This pass takes allocations which can be completely analyzed (that is, they 159 /// don't escape) and tries to turn them into scalar SSA values. There are 160 /// a few steps to this process. 161 /// 162 /// 1) It takes allocations of aggregates and analyzes the ways in which they 163 /// are used to try to split them into smaller allocations, ideally of 164 /// a single scalar data type. 
///    as necessary and try to isolate individual scalar accesses.
/// 2) It will transform accesses into forms which are suitable for SSA value
///    promotion. This can be replacing a memset with a scalar store of an
///    integer value, or it can involve speculating operations on a PHI or
///    select to be a PHI or select of the results.
/// 3) Finally, this will try to detect a pattern of accesses which map cleanly
///    onto insert and extract operations on a vector value, and convert them to
///    this form. By doing so, it will enable promotion of vector aggregates to
///    SSA vector values.
class SROA {
  LLVMContext *const C;
  DomTreeUpdater *const DTU;
  AssumptionCache *const AC;
  const bool PreserveCFG;

  /// Worklist of alloca instructions to simplify.
  ///
  /// Each alloca in the function is added to this. Each new alloca formed gets
  /// added to it as well to recursively simplify unless that alloca can be
  /// directly promoted. Finally, each time we rewrite a use of an alloca other
  /// than the one being actively rewritten, we add it back onto the list if not
  /// already present to ensure it is re-visited.
  SmallSetVector<AllocaInst *, 16> Worklist;

  /// A collection of instructions to delete.
  /// We try to batch deletions to simplify code and make things a bit more
  /// efficient. We also make sure there are no dangling pointers.
  SmallVector<WeakVH, 8> DeadInsts;

  /// Post-promotion worklist.
  ///
  /// Sometimes we discover an alloca which has a high probability of becoming
  /// viable for SROA after a round of promotion takes place. In those cases,
  /// the alloca is enqueued here for re-processing.
  ///
  /// Note that we have to be very careful to clear allocas out of this list in
  /// the event they are deleted.
  SmallSetVector<AllocaInst *, 16> PostPromotionWorklist;

  /// A collection of alloca instructions we can directly promote.
  std::vector<AllocaInst *> PromotableAllocas;

  /// A worklist of PHIs to speculate prior to promoting allocas.
  ///
  /// All of these PHIs have been checked for the safety of speculation and by
  /// being speculated will allow promoting allocas currently in the promotable
  /// queue.
  SmallSetVector<PHINode *, 8> SpeculatablePHIs;

  /// A worklist of select instructions to rewrite prior to promoting
  /// allocas.
  SmallMapVector<SelectInst *, RewriteableMemOps, 8> SelectsToRewrite;

  /// Select instructions that use an alloca and are subsequently loaded can be
  /// rewritten to load both input pointers and then select between the results,
  /// allowing the load of the alloca to be promoted.
  /// From this:
  ///   %P2 = select i1 %cond, ptr %Alloca, ptr %Other
  ///   %V = load <type>, ptr %P2
  /// to:
  ///   %V1 = load <type>, ptr %Alloca      -> will be mem2reg'd
  ///   %V2 = load <type>, ptr %Other
  ///   %V = select i1 %cond, <type> %V1, <type> %V2
  ///
  /// We can do this to a select if its only uses are loads
  /// and if either the operand to the select can be loaded unconditionally,
  /// or if we are allowed to perform CFG modifications.
  /// If an intervening bitcast with a single use of the load is found,
  /// allow the promotion.
  static std::optional<RewriteableMemOps>
  isSafeSelectToSpeculate(SelectInst &SI, bool PreserveCFG);

public:
  SROA(LLVMContext *C, DomTreeUpdater *DTU, AssumptionCache *AC,
       SROAOptions PreserveCFG_)
      : C(C), DTU(DTU), AC(AC),
        PreserveCFG(PreserveCFG_ == SROAOptions::PreserveCFG) {}

  /// Main run method used by both the SROAPass and by the legacy pass.
  std::pair<bool /*Changed*/, bool /*CFGChanged*/> runSROA(Function &F);

private:
  friend class AllocaSliceRewriter;

  bool presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS);
  AllocaInst *rewritePartition(AllocaInst &AI, AllocaSlices &AS, Partition &P);
  bool splitAlloca(AllocaInst &AI, AllocaSlices &AS);
  std::pair<bool /*Changed*/, bool /*CFGChanged*/> runOnAlloca(AllocaInst &AI);
  void clobberUse(Use &U);
  bool deleteDeadInstructions(SmallPtrSetImpl<AllocaInst *> &DeletedAllocas);
  bool promoteAllocas(Function &F);
};

} // end anonymous namespace

/// Calculate the fragment of a variable to use when slicing a store
/// based on the slice dimensions, existing fragment, and base storage
/// fragment.
/// Results:
///   UseFrag - Use Target as the new fragment.
///   UseNoFrag - The new slice already covers the whole variable.
///   Skip - The new alloca slice doesn't include this variable.
/// FIXME: Can we use calculateFragmentIntersect instead?
namespace {
enum FragCalcResult { UseFrag, UseNoFrag, Skip };
}
static FragCalcResult
calculateFragment(DILocalVariable *Variable,
                  uint64_t NewStorageSliceOffsetInBits,
                  uint64_t NewStorageSliceSizeInBits,
                  std::optional<DIExpression::FragmentInfo> StorageFragment,
                  std::optional<DIExpression::FragmentInfo> CurrentFragment,
                  DIExpression::FragmentInfo &Target) {
  // If the base storage describes part of the variable, apply the offset and
  // the size constraint.
  if (StorageFragment) {
    Target.SizeInBits =
        std::min(NewStorageSliceSizeInBits, StorageFragment->SizeInBits);
    Target.OffsetInBits =
        NewStorageSliceOffsetInBits + StorageFragment->OffsetInBits;
  } else {
    Target.SizeInBits = NewStorageSliceSizeInBits;
    Target.OffsetInBits = NewStorageSliceOffsetInBits;
  }

  // If this slice extracts the entirety of an independent variable from a
  // larger alloca, do not produce a fragment expression, as the variable is
  // not fragmented.
  if (!CurrentFragment) {
    if (auto Size = Variable->getSizeInBits()) {
      // Treat the current fragment as covering the whole variable.
      CurrentFragment = DIExpression::FragmentInfo(*Size, 0);
      if (Target == CurrentFragment)
        return UseNoFrag;
    }
  }

  // No additional work to do if there isn't a fragment already, or there is
  // but it already exactly describes the new assignment.
  if (!CurrentFragment || *CurrentFragment == Target)
    return UseFrag;

  // Reject the target fragment if it doesn't fit wholly within the current
  // fragment. TODO: We could instead chop up the target to fit in the case of
  // a partial overlap.
  if (Target.startInBits() < CurrentFragment->startInBits() ||
      Target.endInBits() > CurrentFragment->endInBits())
    return Skip;

  // Target fits within the current fragment, return it.
  return UseFrag;
}

static DebugVariable getAggregateVariable(DbgVariableIntrinsic *DVI) {
  return DebugVariable(DVI->getVariable(), std::nullopt,
                       DVI->getDebugLoc().getInlinedAt());
}
static DebugVariable getAggregateVariable(DPValue *DPV) {
  return DebugVariable(DPV->getVariable(), std::nullopt,
                       DPV->getDebugLoc().getInlinedAt());
}

static DPValue *createLinkedAssign(DPValue *, DIBuilder &DIB,
                                   Instruction *LinkedInstr, Value *NewValue,
                                   DILocalVariable *Variable,
                                   DIExpression *Expression, Value *Address,
                                   DIExpression *AddressExpression,
                                   const DILocation *DI) {
  (void)DIB;
  return DPValue::createLinkedDPVAssign(LinkedInstr, NewValue, Variable,
                                        Expression, Address, AddressExpression,
                                        DI);
}
static DbgAssignIntrinsic *createLinkedAssign(
    DbgAssignIntrinsic *, DIBuilder &DIB, Instruction *LinkedInstr,
    Value *NewValue, DILocalVariable *Variable, DIExpression *Expression,
    Value *Address, DIExpression *AddressExpression, const DILocation *DI) {
  return DIB.insertDbgAssign(LinkedInstr, NewValue, Variable, Expression,
                             Address, AddressExpression, DI);
}

/// Find linked dbg.assign and generate a new one with the correct
/// FragmentInfo. Link Inst to the new dbg.assign. If Value is nullptr the
/// value component is copied from the old dbg.assign to the new.
/// \param OldAlloca             Alloca for the variable before splitting.
/// \param IsSplit               True if the store (not necessarily alloca)
///                              is being split.
/// \param OldAllocaOffsetInBits Offset of the slice taken from OldAlloca.
/// \param SliceSizeInBits       New number of bits being written to.
/// \param OldInst               Instruction that is being split.
/// \param Inst                  New instruction performing this part of the
///                              split store.
/// \param Dest                  Store destination.
/// \param Value                 Stored value.
/// \param DL                    Datalayout.
static void migrateDebugInfo(AllocaInst *OldAlloca, bool IsSplit,
                             uint64_t OldAllocaOffsetInBits,
                             uint64_t SliceSizeInBits, Instruction *OldInst,
                             Instruction *Inst, Value *Dest, Value *Value,
                             const DataLayout &DL) {
  auto MarkerRange = at::getAssignmentMarkers(OldInst);
  auto DPVAssignMarkerRange = at::getDPVAssignmentMarkers(OldInst);
  // Nothing to do if OldInst has no linked dbg.assign intrinsics.
  if (MarkerRange.empty() && DPVAssignMarkerRange.empty())
    return;

  LLVM_DEBUG(dbgs() << "  migrateDebugInfo\n");
  LLVM_DEBUG(dbgs() << "    OldAlloca: " << *OldAlloca << "\n");
  LLVM_DEBUG(dbgs() << "    IsSplit: " << IsSplit << "\n");
  LLVM_DEBUG(dbgs() << "    OldAllocaOffsetInBits: " << OldAllocaOffsetInBits
                    << "\n");
  LLVM_DEBUG(dbgs() << "    SliceSizeInBits: " << SliceSizeInBits << "\n");
  LLVM_DEBUG(dbgs() << "    OldInst: " << *OldInst << "\n");
  LLVM_DEBUG(dbgs() << "    Inst: " << *Inst << "\n");
  LLVM_DEBUG(dbgs() << "    Dest: " << *Dest << "\n");
  if (Value)
    LLVM_DEBUG(dbgs() << "    Value: " << *Value << "\n");

  /// Map of aggregate variables to their fragment associated with OldAlloca.
  DenseMap<DebugVariable, std::optional<DIExpression::FragmentInfo>>
      BaseFragments;
  for (auto *DAI : at::getAssignmentMarkers(OldAlloca))
    BaseFragments[getAggregateVariable(DAI)] =
        DAI->getExpression()->getFragmentInfo();
  for (auto *DPV : at::getDPVAssignmentMarkers(OldAlloca))
    BaseFragments[getAggregateVariable(DPV)] =
        DPV->getExpression()->getFragmentInfo();

  // The new inst needs a DIAssignID unique metadata tag (if OldInst has
  // one). It shouldn't already have one: assert this assumption.
  assert(!Inst->getMetadata(LLVMContext::MD_DIAssignID));
  DIAssignID *NewID = nullptr;
  auto &Ctx = Inst->getContext();
  DIBuilder DIB(*OldInst->getModule(), /*AllowUnresolved*/ false);
  assert(OldAlloca->isStaticAlloca());

  auto MigrateDbgAssign = [&](auto DbgAssign) {
    LLVM_DEBUG(dbgs() << "      existing dbg.assign is: " << *DbgAssign
                      << "\n");
    auto *Expr = DbgAssign->getExpression();
    bool SetKillLocation = false;

    if (IsSplit) {
      std::optional<DIExpression::FragmentInfo> BaseFragment;
      {
        auto R = BaseFragments.find(getAggregateVariable(DbgAssign));
        if (R == BaseFragments.end())
          return;
        BaseFragment = R->second;
      }
      std::optional<DIExpression::FragmentInfo> CurrentFragment =
          Expr->getFragmentInfo();
      DIExpression::FragmentInfo NewFragment;
      FragCalcResult Result = calculateFragment(
          DbgAssign->getVariable(), OldAllocaOffsetInBits, SliceSizeInBits,
          BaseFragment, CurrentFragment, NewFragment);

      if (Result == Skip)
        return;
      if (Result == UseFrag && !(NewFragment == CurrentFragment)) {
        if (CurrentFragment) {
          // Rewrite NewFragment to be relative to the existing one (this is
          // what createFragmentExpression wants). CalculateFragment has
          // already resolved the size for us. FIXME: Should it return the
          // relative fragment too?
          NewFragment.OffsetInBits -= CurrentFragment->OffsetInBits;
        }
        // Add the new fragment info to the existing expression if possible.
        if (auto E = DIExpression::createFragmentExpression(
                Expr, NewFragment.OffsetInBits, NewFragment.SizeInBits)) {
          Expr = *E;
        } else {
          // Otherwise, add the new fragment info to an empty expression and
          // discard the value component of this dbg.assign as the value cannot
          // be computed with the new fragment.
          Expr = *DIExpression::createFragmentExpression(
              DIExpression::get(Expr->getContext(), std::nullopt),
              NewFragment.OffsetInBits, NewFragment.SizeInBits);
          SetKillLocation = true;
        }
      }
    }

    // If we haven't created a DIAssignID yet, do that now and attach it to
    // Inst.
    if (!NewID) {
      NewID = DIAssignID::getDistinct(Ctx);
      Inst->setMetadata(LLVMContext::MD_DIAssignID, NewID);
    }

    ::Value *NewValue = Value ? Value : DbgAssign->getValue();
    auto *NewAssign = createLinkedAssign(
        DbgAssign, DIB, Inst, NewValue, DbgAssign->getVariable(), Expr, Dest,
        DIExpression::get(Expr->getContext(), std::nullopt),
        DbgAssign->getDebugLoc());

    // If we've updated the value but the original dbg.assign has an arglist
    // then kill it now - we can't use the requested new value.
    // We can't replace the DIArgList with the new value as it'd leave
    // the DIExpression in an invalid state (DW_OP_LLVM_arg operands without
    // an arglist). And we can't keep the DIArgList in case the linked store
    // is being split - in which case the DIArgList + expression may no longer
    // be computing the correct value.
    // This should be a very rare situation as it requires the value being
    // stored to differ from the dbg.assign (i.e., the value has been
    // represented differently in the debug intrinsic for some reason).
    SetKillLocation |=
        Value && (DbgAssign->hasArgList() ||
                  !DbgAssign->getExpression()->isSingleLocationExpression());
    if (SetKillLocation)
      NewAssign->setKillLocation();

    // We could use more precision here at the cost of some additional (code)
    // complexity - if the original dbg.assign was adjacent to its store, we
    // could position this new dbg.assign adjacent to its store rather than the
    // old dbg.assign. That would result in interleaved dbg.assigns rather than
    // what we get now:
    //    split store !1
    //    split store !2
    //    dbg.assign !1
    //    dbg.assign !2
    // This (current behaviour) results in debug assignments being
    // noted as slightly offset (in code) from the store. In practice this
    // should have little effect on the debugging experience due to the fact
    // that all the split stores should get the same line number.
    NewAssign->moveBefore(DbgAssign);

    NewAssign->setDebugLoc(DbgAssign->getDebugLoc());
    LLVM_DEBUG(dbgs() << "Created new assign: " << *NewAssign << "\n");
  };

  for_each(MarkerRange, MigrateDbgAssign);
  for_each(DPVAssignMarkerRange, MigrateDbgAssign);
}

namespace {

/// A custom IRBuilder inserter which prefixes all names, but only in
/// Assert builds.
class IRBuilderPrefixedInserter final : public IRBuilderDefaultInserter {
  std::string Prefix;

  Twine getNameWithPrefix(const Twine &Name) const {
    return Name.isTriviallyEmpty() ? Name : Prefix + Name;
  }

public:
  void SetNamePrefix(const Twine &P) { Prefix = P.str(); }

  void InsertHelper(Instruction *I, const Twine &Name, BasicBlock *BB,
                    BasicBlock::iterator InsertPt) const override {
    IRBuilderDefaultInserter::InsertHelper(I, getNameWithPrefix(Name), BB,
                                           InsertPt);
  }
};

/// Provide a type for IRBuilder that drops names in release builds.
using IRBuilderTy = IRBuilder<ConstantFolder, IRBuilderPrefixedInserter>;

/// A used slice of an alloca.
///
/// This structure represents a slice of an alloca used by some instruction. It
/// stores both the begin and end offsets of this use, a pointer to the use
/// itself, and a flag indicating whether we can classify the use as splittable
/// or not when forming partitions of the alloca.
class Slice {
  /// The beginning offset of the range.
  uint64_t BeginOffset = 0;

  /// The ending offset, not included in the range.
  uint64_t EndOffset = 0;

  /// Storage for both the use of this slice and whether it can be
  /// split.
  PointerIntPair<Use *, 1, bool> UseAndIsSplittable;

public:
  Slice() = default;

  Slice(uint64_t BeginOffset, uint64_t EndOffset, Use *U, bool IsSplittable)
      : BeginOffset(BeginOffset), EndOffset(EndOffset),
        UseAndIsSplittable(U, IsSplittable) {}

  uint64_t beginOffset() const { return BeginOffset; }
  uint64_t endOffset() const { return EndOffset; }

  bool isSplittable() const { return UseAndIsSplittable.getInt(); }
  void makeUnsplittable() { UseAndIsSplittable.setInt(false); }

  Use *getUse() const { return UseAndIsSplittable.getPointer(); }

  bool isDead() const { return getUse() == nullptr; }
  void kill() { UseAndIsSplittable.setPointer(nullptr); }

  /// Support for ordering ranges.
  ///
  /// This provides an ordering over ranges such that start offsets are
  /// always increasing, and within equal start offsets, the end offsets are
  /// decreasing. Thus the spanning range comes first in a cluster with the
  /// same start position.
  bool operator<(const Slice &RHS) const {
    if (beginOffset() < RHS.beginOffset())
      return true;
    if (beginOffset() > RHS.beginOffset())
      return false;
    if (isSplittable() != RHS.isSplittable())
      return !isSplittable();
    if (endOffset() > RHS.endOffset())
      return true;
    return false;
  }

  /// Support comparison with a single offset to allow binary searches.
  friend LLVM_ATTRIBUTE_UNUSED bool operator<(const Slice &LHS,
                                              uint64_t RHSOffset) {
    return LHS.beginOffset() < RHSOffset;
  }
  friend LLVM_ATTRIBUTE_UNUSED bool operator<(uint64_t LHSOffset,
                                              const Slice &RHS) {
    return LHSOffset < RHS.beginOffset();
  }

  bool operator==(const Slice &RHS) const {
    return isSplittable() == RHS.isSplittable() &&
           beginOffset() == RHS.beginOffset() && endOffset() == RHS.endOffset();
  }
  bool operator!=(const Slice &RHS) const { return !operator==(RHS); }
};

/// Representation of the alloca slices.
///
/// This class represents the slices of an alloca which are formed by its
/// various uses. If a pointer escapes, we can't fully build a representation
/// for the slices used and we reflect that in this structure. The uses are
/// stored, sorted by increasing beginning offset and with unsplittable slices
/// starting at a particular offset before splittable slices.
class AllocaSlices {
public:
  /// Construct the slices of a particular alloca.
  AllocaSlices(const DataLayout &DL, AllocaInst &AI);

  /// Test whether a pointer to the allocation escapes our analysis.
  ///
  /// If this is true, the slices are never fully built and should be
  /// ignored.
  bool isEscaped() const { return PointerEscapingInstr; }

  /// Support for iterating over the slices.
  /// @{
  using iterator = SmallVectorImpl<Slice>::iterator;
  using range = iterator_range<iterator>;

  iterator begin() { return Slices.begin(); }
  iterator end() { return Slices.end(); }

  using const_iterator = SmallVectorImpl<Slice>::const_iterator;
  using const_range = iterator_range<const_iterator>;

  const_iterator begin() const { return Slices.begin(); }
  const_iterator end() const { return Slices.end(); }
  /// @}

  /// Erase a range of slices.
  void erase(iterator Start, iterator Stop) { Slices.erase(Start, Stop); }

  /// Insert new slices for this alloca.
  ///
  /// This moves the slices into the alloca's slices collection, and re-sorts
  /// everything so that the usual ordering properties of the alloca's slices
  /// hold.
  void insert(ArrayRef<Slice> NewSlices) {
    int OldSize = Slices.size();
    Slices.append(NewSlices.begin(), NewSlices.end());
    auto SliceI = Slices.begin() + OldSize;
    llvm::sort(SliceI, Slices.end());
    std::inplace_merge(Slices.begin(), SliceI, Slices.end());
  }

  // Forward declare the iterator and range accessor for walking the
  // partitions.
  class partition_iterator;
  iterator_range<partition_iterator> partitions();

  /// Access the dead users for this alloca.
  ArrayRef<Instruction *> getDeadUsers() const { return DeadUsers; }

  /// Access Uses that should be dropped if the alloca is promotable.
  ArrayRef<Use *> getDeadUsesIfPromotable() const {
    return DeadUseIfPromotable;
  }

  /// Access the dead operands referring to this alloca.
  ///
  /// These are operands which cannot actually be used to refer to the
  /// alloca as they are outside its range and the user doesn't correct for
  /// that. These mostly consist of PHI node inputs and the like which we just
  /// need to replace with undef.
  ArrayRef<Use *> getDeadOperands() const { return DeadOperands; }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  void print(raw_ostream &OS, const_iterator I, StringRef Indent = "  ") const;
  void printSlice(raw_ostream &OS, const_iterator I,
                  StringRef Indent = "  ") const;
  void printUse(raw_ostream &OS, const_iterator I,
                StringRef Indent = "  ") const;
  void print(raw_ostream &OS) const;
  void dump(const_iterator I) const;
  void dump() const;
#endif

private:
  template <typename DerivedT, typename RetT = void> class BuilderBase;
  class SliceBuilder;

  friend class AllocaSlices::SliceBuilder;

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  /// Handle to alloca instruction to simplify method interfaces.
  AllocaInst &AI;
#endif

  /// The instruction responsible for this alloca not having a known set
  /// of slices.
  ///
  /// When an instruction (potentially) escapes the pointer to the alloca, we
  /// store a pointer to that here and abort trying to form slices of the
  /// alloca. This will be null if the alloca slices are analyzed successfully.
  Instruction *PointerEscapingInstr;

  /// The slices of the alloca.
  ///
  /// We store a vector of the slices formed by uses of the alloca here. This
  /// vector is sorted by increasing begin offset, and then the unsplittable
  /// slices before the splittable ones. See the Slice inner class for more
  /// details.
  SmallVector<Slice, 8> Slices;

  /// Instructions which will become dead if we rewrite the alloca.
  ///
  /// Note that these are not separated by slice. This is because we expect an
  /// alloca to be completely rewritten or not rewritten at all. If rewritten,
  /// all these instructions can simply be removed and replaced with poison as
  /// they come from outside of the allocated space.
  SmallVector<Instruction *, 8> DeadUsers;

  /// Uses which will become dead if we can promote the alloca.
  SmallVector<Use *, 8> DeadUseIfPromotable;

  /// Operands which will become dead if we rewrite the alloca.
  ///
  /// These are operands that in their particular use can be replaced with
  /// poison when we rewrite the alloca. These show up in out-of-bounds inputs
  /// to PHI nodes and the like. They aren't entirely dead (there might be
  /// a GEP back into the bounds using it elsewhere), nor is the PHI, but we
  /// want to swap this particular input for poison to simplify the use lists of
  /// the alloca.
  SmallVector<Use *, 8> DeadOperands;
};

/// A partition of the slices.
///
/// An ephemeral representation for a range of slices which can be viewed as
/// a partition of the alloca. This range represents a span of the alloca's
/// memory which cannot be split, and provides access to all of the slices
/// overlapping some part of the partition.
///
/// Objects of this type are produced by traversing the alloca's slices, but
/// are only ephemeral and not persistent.
class Partition {
private:
  friend class AllocaSlices;
  friend class AllocaSlices::partition_iterator;

  using iterator = AllocaSlices::iterator;

  /// The beginning and ending offsets of the alloca for this
  /// partition.
  uint64_t BeginOffset = 0, EndOffset = 0;

  /// The start and end iterators of this partition.
  iterator SI, SJ;

  /// A collection of split slice tails overlapping the partition.
  SmallVector<Slice *, 4> SplitTails;

  /// Raw constructor builds an empty partition starting and ending at
  /// the given iterator.
  Partition(iterator SI) : SI(SI), SJ(SI) {}

public:
  /// The start offset of this partition.
  ///
  /// All of the contained slices start at or after this offset.
  uint64_t beginOffset() const { return BeginOffset; }

  /// The end offset of this partition.
  ///
  /// All of the contained slices end at or before this offset.
  uint64_t endOffset() const { return EndOffset; }

  /// The size of the partition.
  ///
  /// Note that this can never be zero.
  uint64_t size() const {
    assert(BeginOffset < EndOffset && "Partitions must span some bytes!");
    return EndOffset - BeginOffset;
  }

  /// Test whether this partition contains no slices, and merely spans
  /// a region occupied by split slices.
  bool empty() const { return SI == SJ; }

  /// \name Iterate slices that start within the partition.
  /// These may be splittable or unsplittable. They have a begin offset >= the
  /// partition begin offset.
  /// @{
  // FIXME: We should probably define a "concat_iterator" helper and use that
  // to stitch together pointee_iterators over the split tails and the
  // contiguous iterators of the partition. That would give a much nicer
  // interface here. We could then additionally expose filtered iterators for
  // split, unsplit, and unsplittable slices based on the usage patterns.
  iterator begin() const { return SI; }
  iterator end() const { return SJ; }
  /// @}

  /// Get the sequence of split slice tails.
  ///
  /// These tails are of slices which start before this partition but are
  /// split and overlap into the partition. We accumulate these while forming
  /// partitions.
  ArrayRef<Slice *> splitSliceTails() const { return SplitTails; }
};

} // end anonymous namespace

/// An iterator over partitions of the alloca's slices.
///
/// This iterator implements the core algorithm for partitioning the alloca's
/// slices. It is a forward iterator as we don't support backtracking for
/// efficiency reasons, and re-use a single storage area to maintain the
/// current set of split slices.
///
/// It is templated on the slice iterator type to use so that it can operate
/// with either const or non-const slice iterators.
class AllocaSlices::partition_iterator
    : public iterator_facade_base<partition_iterator, std::forward_iterator_tag,
                                  Partition> {
  friend class AllocaSlices;

  /// Most of the state for walking the partitions is held in a class
  /// with a nice interface for examining them.
  Partition P;

  /// We need to keep the end of the slices to know when to stop.
  AllocaSlices::iterator SE;

  /// We also need to keep track of the maximum split end offset seen.
  /// FIXME: Do we really?
  uint64_t MaxSplitSliceEndOffset = 0;

  /// Sets the partition to be empty at given iterator, and sets the
  /// end iterator.
  partition_iterator(AllocaSlices::iterator SI, AllocaSlices::iterator SE)
      : P(SI), SE(SE) {
    // If not already at the end, advance our state to form the initial
    // partition.
    if (SI != SE)
      advance();
  }

  /// Advance the iterator to the next partition.
  ///
  /// Requires that the iterator not be at the end of the slices.
  void advance() {
    assert((P.SI != SE || !P.SplitTails.empty()) &&
           "Cannot advance past the end of the slices!");

    // Clear out any split uses which have ended.
    if (!P.SplitTails.empty()) {
      if (P.EndOffset >= MaxSplitSliceEndOffset) {
        // If we've finished all splits, this is easy.
        P.SplitTails.clear();
        MaxSplitSliceEndOffset = 0;
      } else {
        // Remove the uses which have ended in the prior partition. This
        // cannot change the max split slice end because we just checked that
        // the prior partition ended prior to that max.
        llvm::erase_if(P.SplitTails,
                       [&](Slice *S) { return S->endOffset() <= P.EndOffset; });
        assert(llvm::any_of(P.SplitTails,
                            [&](Slice *S) {
                              return S->endOffset() == MaxSplitSliceEndOffset;
                            }) &&
               "Could not find the current max split slice offset!");
        assert(llvm::all_of(P.SplitTails,
                            [&](Slice *S) {
                              return S->endOffset() <= MaxSplitSliceEndOffset;
                            }) &&
               "Max split slice end offset is not actually the max!");
      }
    }

    // If P.SI is already at the end, then we've cleared the split tail and
    // now have an end iterator.
    if (P.SI == SE) {
      assert(P.SplitTails.empty() && "Failed to clear the split slices!");
      return;
    }

    // If we had a non-empty partition previously, set up the state for
    // subsequent partitions.
    if (P.SI != P.SJ) {
      // Accumulate all the splittable slices which started in the old
      // partition into the split list.
      for (Slice &S : P)
        if (S.isSplittable() && S.endOffset() > P.EndOffset) {
          P.SplitTails.push_back(&S);
          MaxSplitSliceEndOffset =
              std::max(S.endOffset(), MaxSplitSliceEndOffset);
        }

      // Start from the end of the previous partition.
      P.SI = P.SJ;

      // If P.SI is now at the end, we at most have a tail of split slices.
      if (P.SI == SE) {
        P.BeginOffset = P.EndOffset;
        P.EndOffset = MaxSplitSliceEndOffset;
        return;
      }

      // If we have split slices and the next slice is after a gap and is
      // not splittable, immediately form an empty partition for the split
      // slices up until the next slice begins.
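      // Illustrative example (hypothetical offsets): with a splittable slice
      // [0,32) and unsplittable slices [8,16) and [24,32), the partitions
      // formed are [0,8), [8,16), an empty partition [16,24) covering only
      // the split tail of [0,32), and finally [24,32).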
      if (!P.SplitTails.empty() && P.SI->beginOffset() != P.EndOffset &&
          !P.SI->isSplittable()) {
        P.BeginOffset = P.EndOffset;
        P.EndOffset = P.SI->beginOffset();
        return;
      }
    }

    // OK, we need to consume new slices. Set the end offset based on the
    // current slice, and step SJ past it. The beginning offset of the
    // partition is the beginning offset of the next slice unless we have
    // pre-existing split slices that are continuing, in which case we begin
    // at the prior end offset.
    P.BeginOffset = P.SplitTails.empty() ? P.SI->beginOffset() : P.EndOffset;
    P.EndOffset = P.SI->endOffset();
    ++P.SJ;

    // There are two strategies to form a partition based on whether the
    // partition starts with an unsplittable slice or a splittable slice.
    if (!P.SI->isSplittable()) {
      // When we're forming an unsplittable region, it must always start at
      // the first slice and will extend through its end.
      assert(P.BeginOffset == P.SI->beginOffset());

      // Form a partition including all of the overlapping slices with this
      // unsplittable slice.
      while (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset) {
        if (!P.SJ->isSplittable())
          P.EndOffset = std::max(P.EndOffset, P.SJ->endOffset());
        ++P.SJ;
      }

      // We have a partition across a set of overlapping unsplittable
      // partitions.
      return;
    }

    // If we're starting with a splittable slice, then we need to form
    // a synthetic partition spanning it and any other overlapping splittable
    // slices.
    assert(P.SI->isSplittable() && "Forming a splittable partition!");

    // Collect all of the overlapping splittable slices.
    while (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset &&
           P.SJ->isSplittable()) {
      P.EndOffset = std::max(P.EndOffset, P.SJ->endOffset());
      ++P.SJ;
    }

    // Back up P.EndOffset if we ended the span early when encountering an
    // unsplittable slice. This synthesizes the early end offset of
    // a partition spanning only splittable slices.
    if (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset) {
      assert(!P.SJ->isSplittable());
      P.EndOffset = P.SJ->beginOffset();
    }
  }

public:
  bool operator==(const partition_iterator &RHS) const {
    assert(SE == RHS.SE &&
           "End iterators don't match between compared partition iterators!");

    // The observed positions of partitions are marked by the P.SI iterator and
    // the emptiness of the split slices. The latter is only relevant when
    // P.SI == SE, as the end iterator will additionally have an empty split
    // slices list, but the prior may have the same P.SI and a tail of split
    // slices.
    if (P.SI == RHS.P.SI && P.SplitTails.empty() == RHS.P.SplitTails.empty()) {
      assert(P.SJ == RHS.P.SJ &&
             "Same set of slices formed two different sized partitions!");
      assert(P.SplitTails.size() == RHS.P.SplitTails.size() &&
             "Same slice position with differently sized non-empty split "
             "slice tails!");
      return true;
    }
    return false;
  }

  partition_iterator &operator++() {
    advance();
    return *this;
  }

  Partition &operator*() { return P; }
};

/// A forward range over the partitions of the alloca's slices.
///
/// This accesses an iterator range over the partitions of the alloca's
/// slices. It computes these partitions on the fly based on the overlapping
/// offsets of the slices and the ability to split them. It will visit "empty"
/// partitions to cover regions of the alloca only accessed via split
/// slices.
iterator_range<AllocaSlices::partition_iterator> AllocaSlices::partitions() {
  return make_range(partition_iterator(begin(), end()),
                    partition_iterator(end(), end()));
}

static Value *foldSelectInst(SelectInst &SI) {
  // If the condition being selected on is a constant or the same value is
  // being selected between, fold the select. Yes this does (rarely) happen
  // early on.
  if (ConstantInt *CI = dyn_cast<ConstantInt>(SI.getCondition()))
    return SI.getOperand(1 + CI->isZero());
  if (SI.getOperand(1) == SI.getOperand(2))
    return SI.getOperand(1);

  return nullptr;
}

/// A helper that folds a PHI node or a select.
static Value *foldPHINodeOrSelectInst(Instruction &I) {
  if (PHINode *PN = dyn_cast<PHINode>(&I)) {
    // If PN merges together the same value, return that value.
    return PN->hasConstantValue();
  }
  return foldSelectInst(cast<SelectInst>(I));
}

/// Builder for the alloca slices.
///
/// This class builds a set of alloca slices by recursively visiting the uses
/// of an alloca and making a slice for each load and store at each offset.
class AllocaSlices::SliceBuilder : public PtrUseVisitor<SliceBuilder> {
  friend class PtrUseVisitor<SliceBuilder>;
  friend class InstVisitor<SliceBuilder>;

  using Base = PtrUseVisitor<SliceBuilder>;

  const uint64_t AllocSize;
  AllocaSlices &AS;

  SmallDenseMap<Instruction *, unsigned> MemTransferSliceMap;
  SmallDenseMap<Instruction *, uint64_t> PHIOrSelectSizes;

  /// Set to de-duplicate dead instructions found in the use walk.
  SmallPtrSet<Instruction *, 4> VisitedDeadInsts;

public:
  SliceBuilder(const DataLayout &DL, AllocaInst &AI, AllocaSlices &AS)
      : PtrUseVisitor<SliceBuilder>(DL),
        AllocSize(DL.getTypeAllocSize(AI.getAllocatedType()).getFixedValue()),
        AS(AS) {}

private:
  void markAsDead(Instruction &I) {
    if (VisitedDeadInsts.insert(&I).second)
      AS.DeadUsers.push_back(&I);
  }

  void insertUse(Instruction &I, const APInt &Offset, uint64_t Size,
                 bool IsSplittable = false) {
    // Completely skip uses which have a zero size or start either before or
    // past the end of the allocation.
    if (Size == 0 || Offset.uge(AllocSize)) {
      LLVM_DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte use @"
                        << Offset
                        << " which has zero size or starts outside of the "
                        << AllocSize << " byte alloca:\n"
                        << "    alloca: " << AS.AI << "\n"
                        << "       use: " << I << "\n");
      return markAsDead(I);
    }

    uint64_t BeginOffset = Offset.getZExtValue();
    uint64_t EndOffset = BeginOffset + Size;

    // Clamp the end offset to the end of the allocation. Note that this is
    // formulated to handle even the case where "BeginOffset + Size" overflows.
    // This may appear superficially to be something we could ignore entirely,
    // but that is not so! There may be widened loads or PHI-node uses where
    // some instructions are dead but not others. We can't completely ignore
    // them, and so have to record at least the information here.
    assert(AllocSize >= BeginOffset); // Established above.
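    // For example (hypothetical numbers): an 8 byte use at offset 60 of a
    // 64 byte alloca is recorded as the clamped slice [60, 64) rather than
    // being dropped, so the in-bounds portion still participates in
    // partitioning.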
    if (Size > AllocSize - BeginOffset) {
      LLVM_DEBUG(dbgs() << "WARNING: Clamping a " << Size << " byte use @"
                        << Offset << " to remain within the " << AllocSize
                        << " byte alloca:\n"
                        << "    alloca: " << AS.AI << "\n"
                        << "       use: " << I << "\n");
      EndOffset = AllocSize;
    }

    AS.Slices.push_back(Slice(BeginOffset, EndOffset, U, IsSplittable));
  }

  void visitBitCastInst(BitCastInst &BC) {
    if (BC.use_empty())
      return markAsDead(BC);

    return Base::visitBitCastInst(BC);
  }

  void visitAddrSpaceCastInst(AddrSpaceCastInst &ASC) {
    if (ASC.use_empty())
      return markAsDead(ASC);

    return Base::visitAddrSpaceCastInst(ASC);
  }

  void visitGetElementPtrInst(GetElementPtrInst &GEPI) {
    if (GEPI.use_empty())
      return markAsDead(GEPI);

    if (SROAStrictInbounds && GEPI.isInBounds()) {
      // FIXME: This is a manually un-factored variant of the basic code inside
      // of GEPs with checking of the inbounds invariant specified in the
      // langref in a very strict sense. If we ever want to enable
      // SROAStrictInbounds, this code should be factored cleanly into
      // PtrUseVisitor, but it is easier to experiment with SROAStrictInbounds
      // by writing out the code here where we have the underlying allocation
      // size readily available.
      APInt GEPOffset = Offset;
      const DataLayout &DL = GEPI.getModule()->getDataLayout();
      for (gep_type_iterator GTI = gep_type_begin(GEPI),
                             GTE = gep_type_end(GEPI);
           GTI != GTE; ++GTI) {
        ConstantInt *OpC = dyn_cast<ConstantInt>(GTI.getOperand());
        if (!OpC)
          break;

        // Handle a struct index, which adds its field offset to the pointer.
        if (StructType *STy = GTI.getStructTypeOrNull()) {
          unsigned ElementIdx = OpC->getZExtValue();
          const StructLayout *SL = DL.getStructLayout(STy);
          GEPOffset +=
              APInt(Offset.getBitWidth(), SL->getElementOffset(ElementIdx));
        } else {
          // For array or vector indices, scale the index by the size of the
          // type.
          APInt Index = OpC->getValue().sextOrTrunc(Offset.getBitWidth());
          GEPOffset += Index * APInt(Offset.getBitWidth(),
                                     GTI.getSequentialElementStride(DL));
        }

        // If this index has computed an intermediate pointer which is not
        // inbounds, then the result of the GEP is a poison value and we can
        // delete it and all uses.
        if (GEPOffset.ugt(AllocSize))
          return markAsDead(GEPI);
      }
    }

    return Base::visitGetElementPtrInst(GEPI);
  }

  void handleLoadOrStore(Type *Ty, Instruction &I, const APInt &Offset,
                         uint64_t Size, bool IsVolatile) {
    // We allow splitting of non-volatile loads and stores where the type is an
    // integer type. These may be used to implement 'memcpy' or other "transfer
    // of bits" patterns.
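    // For instance (hypothetical types), a non-volatile i64 access may be
    // carved up across partitions, whereas a volatile access, or an integer
    // whose bit width does not match its store size (e.g. i17, which stores
    // as 24 bits), is kept as a single unsplittable slice.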
    bool IsSplittable =
        Ty->isIntegerTy() && !IsVolatile && DL.typeSizeEqualsStoreSize(Ty);

    insertUse(I, Offset, Size, IsSplittable);
  }

  void visitLoadInst(LoadInst &LI) {
    assert((!LI.isSimple() || LI.getType()->isSingleValueType()) &&
           "All simple FCA loads should have been pre-split");

    if (!IsOffsetKnown)
      return PI.setAborted(&LI);

    TypeSize Size = DL.getTypeStoreSize(LI.getType());
    if (Size.isScalable())
      return PI.setAborted(&LI);

    return handleLoadOrStore(LI.getType(), LI, Offset, Size.getFixedValue(),
                             LI.isVolatile());
  }

  void visitStoreInst(StoreInst &SI) {
    Value *ValOp = SI.getValueOperand();
    if (ValOp == *U)
      return PI.setEscapedAndAborted(&SI);
    if (!IsOffsetKnown)
      return PI.setAborted(&SI);

    TypeSize StoreSize = DL.getTypeStoreSize(ValOp->getType());
    if (StoreSize.isScalable())
      return PI.setAborted(&SI);

    uint64_t Size = StoreSize.getFixedValue();

    // If this memory access can be shown to *statically* extend outside the
    // bounds of the allocation, its behavior is undefined, so simply
    // ignore it. Note that this is more strict than the generic clamping
    // behavior of insertUse. We also try to handle cases which might run the
    // risk of overflow.
    // FIXME: We should instead consider the pointer to have escaped if this
    // function is being instrumented for addressing bugs or race conditions.
    if (Size > AllocSize || Offset.ugt(AllocSize - Size)) {
      LLVM_DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte store @"
                        << Offset << " which extends past the end of the "
                        << AllocSize << " byte alloca:\n"
                        << "    alloca: " << AS.AI << "\n"
                        << "       use: " << SI << "\n");
      return markAsDead(SI);
    }

    assert((!SI.isSimple() || ValOp->getType()->isSingleValueType()) &&
           "All simple FCA stores should have been pre-split");
    handleLoadOrStore(ValOp->getType(), SI, Offset, Size, SI.isVolatile());
  }

  void visitMemSetInst(MemSetInst &II) {
    assert(II.getRawDest() == *U && "Pointer use is not the destination?");
    ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength());
    if ((Length && Length->getValue() == 0) ||
        (IsOffsetKnown && Offset.uge(AllocSize)))
      // Zero-length mem transfer intrinsics can be ignored entirely.
      return markAsDead(II);

    if (!IsOffsetKnown)
      return PI.setAborted(&II);

    insertUse(II, Offset, Length ? Length->getLimitedValue()
                                 : AllocSize - Offset.getLimitedValue(),
              (bool)Length);
  }

  void visitMemTransferInst(MemTransferInst &II) {
    ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength());
    if (Length && Length->getValue() == 0)
      // Zero-length mem transfer intrinsics can be ignored entirely.
      return markAsDead(II);

    // Because we can visit these intrinsics twice, also check to see if the
    // first time marked this instruction as dead. If so, skip it.
    if (VisitedDeadInsts.count(&II))
      return;

    if (!IsOffsetKnown)
      return PI.setAborted(&II);

    // This side of the transfer is completely out-of-bounds, and so we can
    // nuke the entire transfer. However, we also need to nuke the other side
    // if already added to our partitions.
    // FIXME: Yet another place we really should bypass this when
    // instrumenting for ASan.
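    // Illustrative sketch (hypothetical situation): if the source side of a
    // memcpy within this alloca was visited first and recorded a slice, and
    // this destination side turns out to start at or past AllocSize, the
    // previously recorded slice must be killed in addition to marking this
    // visit as dead.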
    if (Offset.uge(AllocSize)) {
      SmallDenseMap<Instruction *, unsigned>::iterator MTPI =
          MemTransferSliceMap.find(&II);
      if (MTPI != MemTransferSliceMap.end())
        AS.Slices[MTPI->second].kill();
      return markAsDead(II);
    }

    uint64_t RawOffset = Offset.getLimitedValue();
    uint64_t Size = Length ? Length->getLimitedValue() : AllocSize - RawOffset;

    // Check for the special case where the same exact value is used for both
    // source and dest.
    if (*U == II.getRawDest() && *U == II.getRawSource()) {
      // For non-volatile transfers this is a no-op.
      if (!II.isVolatile())
        return markAsDead(II);

      return insertUse(II, Offset, Size, /*IsSplittable=*/false);
    }

    // If we have seen both source and destination for a mem transfer, then
    // they both point to the same alloca.
    bool Inserted;
    SmallDenseMap<Instruction *, unsigned>::iterator MTPI;
    std::tie(MTPI, Inserted) =
        MemTransferSliceMap.insert(std::make_pair(&II, AS.Slices.size()));
    unsigned PrevIdx = MTPI->second;
    if (!Inserted) {
      Slice &PrevP = AS.Slices[PrevIdx];

      // Check if the begin offsets match and this is a non-volatile transfer.
      // In that case, we can completely elide the transfer.
      if (!II.isVolatile() && PrevP.beginOffset() == RawOffset) {
        PrevP.kill();
        return markAsDead(II);
      }

      // Otherwise we have an offset transfer within the same alloca. We can't
      // split those.
      PrevP.makeUnsplittable();
    }

    // Insert the use now that we've fixed up the splittable nature.
    insertUse(II, Offset, Size, /*IsSplittable=*/Inserted && Length);

    // Check that we ended up with a valid index in the map.
    assert(AS.Slices[PrevIdx].getUse()->getUser() == &II &&
           "Map index doesn't point back to a slice with this user.");
  }

  // Disable SROA for any intrinsics except for lifetime invariants and
  // invariant group.
  // FIXME: What about debug intrinsics? This matches old behavior, but
  // doesn't make sense.
  void visitIntrinsicInst(IntrinsicInst &II) {
    if (II.isDroppable()) {
      AS.DeadUseIfPromotable.push_back(U);
      return;
    }

    if (!IsOffsetKnown)
      return PI.setAborted(&II);

    if (II.isLifetimeStartOrEnd()) {
      ConstantInt *Length = cast<ConstantInt>(II.getArgOperand(0));
      uint64_t Size = std::min(AllocSize - Offset.getLimitedValue(),
                               Length->getLimitedValue());
      insertUse(II, Offset, Size, true);
      return;
    }

    if (II.isLaunderOrStripInvariantGroup()) {
      insertUse(II, Offset, AllocSize, true);
      enqueueUsers(II);
      return;
    }

    Base::visitIntrinsicInst(II);
  }

  Instruction *hasUnsafePHIOrSelectUse(Instruction *Root, uint64_t &Size) {
    // We consider any PHI or select that results in a direct load or store of
    // the same offset to be a viable use for slicing purposes. These uses
    // are considered unsplittable and the size is the maximum loaded or stored
    // size.
    SmallPtrSet<Instruction *, 4> Visited;
    SmallVector<std::pair<Instruction *, Instruction *>, 4> Uses;
    Visited.insert(Root);
    Uses.push_back(std::make_pair(cast<Instruction>(*U), Root));
    const DataLayout &DL = Root->getModule()->getDataLayout();
    // If there are no loads or stores, the access is dead. We mark that as
    // a size zero access.
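    // Sketch of the walk (hypothetical IR): for
    //   %sel = select i1 %c, ptr %alloca, ptr %other
    //   %gep = getelementptr i8, ptr %sel, i64 0
    //   %v   = load i64, ptr %gep
    // the worklist visits %gep and %v, the all-zero-index GEP is allowed,
    // and Size becomes 8 (the widest load or store observed). Any other kind
    // of user, or a GEP with a non-zero index, is returned as the unsafe
    // instruction.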
    Size = 0;
    do {
      Instruction *I, *UsedI;
      std::tie(UsedI, I) = Uses.pop_back_val();

      if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
        TypeSize LoadSize = DL.getTypeStoreSize(LI->getType());
        if (LoadSize.isScalable()) {
          PI.setAborted(LI);
          return nullptr;
        }
        Size = std::max(Size, LoadSize.getFixedValue());
        continue;
      }
      if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
        Value *Op = SI->getOperand(0);
        if (Op == UsedI)
          return SI;
        TypeSize StoreSize = DL.getTypeStoreSize(Op->getType());
        if (StoreSize.isScalable()) {
          PI.setAborted(SI);
          return nullptr;
        }
        Size = std::max(Size, StoreSize.getFixedValue());
        continue;
      }

      if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(I)) {
        if (!GEP->hasAllZeroIndices())
          return GEP;
      } else if (!isa<BitCastInst>(I) && !isa<PHINode>(I) &&
                 !isa<SelectInst>(I) && !isa<AddrSpaceCastInst>(I)) {
        return I;
      }

      for (User *U : I->users())
        if (Visited.insert(cast<Instruction>(U)).second)
          Uses.push_back(std::make_pair(I, cast<Instruction>(U)));
    } while (!Uses.empty());

    return nullptr;
  }

  void visitPHINodeOrSelectInst(Instruction &I) {
    assert(isa<PHINode>(I) || isa<SelectInst>(I));
    if (I.use_empty())
      return markAsDead(I);

    // If this is a PHI node before a catchswitch, we cannot insert any non-PHI
    // instructions in this BB, which may be required during rewriting. Bail out
    // on these cases.
    if (isa<PHINode>(I) &&
        I.getParent()->getFirstInsertionPt() == I.getParent()->end())
      return PI.setAborted(&I);

    // TODO: We could use simplifyInstruction here to fold PHINodes and
    // SelectInsts. However, doing so requires changing the current
    // dead-operand-tracking mechanism. For instance, suppose neither loading
    // from %U nor %other traps. Then "load (select undef, %U, %other)" does not
    // trap either. However, if we simply replace %U with undef using the
    // current dead-operand-tracking mechanism, "load (select undef, undef,
    // %other)" may trap because the select may return the first operand
    // "undef".
    if (Value *Result = foldPHINodeOrSelectInst(I)) {
      if (Result == *U)
        // If the result of the constant fold will be the pointer, recurse
        // through the PHI/select as if we had RAUW'ed it.
        enqueueUsers(I);
      else
        // Otherwise the operand to the PHI/select is dead, and we can replace
        // it with poison.
        AS.DeadOperands.push_back(U);

      return;
    }

    if (!IsOffsetKnown)
      return PI.setAborted(&I);

    // See if we already have computed info on this node.
    uint64_t &Size = PHIOrSelectSizes[&I];
    if (!Size) {
      // This is a new PHI/Select, check for an unsafe use of it.
      if (Instruction *UnsafeI = hasUnsafePHIOrSelectUse(&I, Size))
        return PI.setAborted(UnsafeI);
    }

    // For PHI and select operands outside the alloca, we can't nuke the entire
    // phi or select -- the other side might still be relevant, so we special
    // case them here and use a separate structure to track the operands
    // themselves which should be replaced with poison.
    // FIXME: This should instead be escaped in the event we're instrumenting
    // for address sanitization.
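    // E.g. (hypothetical IR): for "phi ptr [ %alloca, %bb0 ], [ %past_end,
    // %bb1 ]" where %past_end is a GEP off this alloca that points at or
    // beyond its end, only that incoming use is queued in DeadOperands to be
    // replaced with poison; the PHI itself and its other input are left
    // alone.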
1426 if (Offset.uge(AllocSize)) { 1427 AS.DeadOperands.push_back(U); 1428 return; 1429 } 1430 1431 insertUse(I, Offset, Size); 1432 } 1433 1434 void visitPHINode(PHINode &PN) { visitPHINodeOrSelectInst(PN); } 1435 1436 void visitSelectInst(SelectInst &SI) { visitPHINodeOrSelectInst(SI); } 1437 1438 /// Disable SROA entirely if there are unhandled users of the alloca. 1439 void visitInstruction(Instruction &I) { PI.setAborted(&I); } 1440 }; 1441 1442 AllocaSlices::AllocaSlices(const DataLayout &DL, AllocaInst &AI) 1443 : 1444 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1445 AI(AI), 1446 #endif 1447 PointerEscapingInstr(nullptr) { 1448 SliceBuilder PB(DL, AI, *this); 1449 SliceBuilder::PtrInfo PtrI = PB.visitPtr(AI); 1450 if (PtrI.isEscaped() || PtrI.isAborted()) { 1451 // FIXME: We should sink the escape vs. abort info into the caller nicely, 1452 // possibly by just storing the PtrInfo in the AllocaSlices. 1453 PointerEscapingInstr = PtrI.getEscapingInst() ? PtrI.getEscapingInst() 1454 : PtrI.getAbortingInst(); 1455 assert(PointerEscapingInstr && "Did not track a bad instruction"); 1456 return; 1457 } 1458 1459 llvm::erase_if(Slices, [](const Slice &S) { return S.isDead(); }); 1460 1461 // Sort the uses. This arranges for the offsets to be in ascending order, 1462 // and the sizes to be in descending order. 1463 llvm::stable_sort(Slices); 1464 } 1465 1466 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1467 1468 void AllocaSlices::print(raw_ostream &OS, const_iterator I, 1469 StringRef Indent) const { 1470 printSlice(OS, I, Indent); 1471 OS << "\n"; 1472 printUse(OS, I, Indent); 1473 } 1474 1475 void AllocaSlices::printSlice(raw_ostream &OS, const_iterator I, 1476 StringRef Indent) const { 1477 OS << Indent << "[" << I->beginOffset() << "," << I->endOffset() << ")" 1478 << " slice #" << (I - begin()) 1479 << (I->isSplittable() ? " (splittable)" : ""); 1480 } 1481 1482 void AllocaSlices::printUse(raw_ostream &OS, const_iterator I, 1483 StringRef Indent) const { 1484 OS << Indent << " used by: " << *I->getUse()->getUser() << "\n"; 1485 } 1486 1487 void AllocaSlices::print(raw_ostream &OS) const { 1488 if (PointerEscapingInstr) { 1489 OS << "Can't analyze slices for alloca: " << AI << "\n" 1490 << " A pointer to this alloca escaped by:\n" 1491 << " " << *PointerEscapingInstr << "\n"; 1492 return; 1493 } 1494 1495 OS << "Slices of alloca: " << AI << "\n"; 1496 for (const_iterator I = begin(), E = end(); I != E; ++I) 1497 print(OS, I); 1498 } 1499 1500 LLVM_DUMP_METHOD void AllocaSlices::dump(const_iterator I) const { 1501 print(dbgs(), I); 1502 } 1503 LLVM_DUMP_METHOD void AllocaSlices::dump() const { print(dbgs()); } 1504 1505 #endif // !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1506 1507 /// Walk the range of a partitioning looking for a common type to cover this 1508 /// sequence of slices. 1509 static std::pair<Type *, IntegerType *> 1510 findCommonType(AllocaSlices::const_iterator B, AllocaSlices::const_iterator E, 1511 uint64_t EndOffset) { 1512 Type *Ty = nullptr; 1513 bool TyIsCommon = true; 1514 IntegerType *ITy = nullptr; 1515 1516 // Note that we need to look at *every* alloca slice's Use to ensure we 1517 // always get consistent results regardless of the order of slices. 
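  // A small sketch of the contract (hypothetical types): if one slice loads
  // the whole range as i64 and another loads it as double, no common type is
  // found (the first element of the returned pair is null), but ITy still
  // records the i64 so the caller can fall back to integer widening.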
1518 for (AllocaSlices::const_iterator I = B; I != E; ++I) { 1519 Use *U = I->getUse(); 1520 if (isa<IntrinsicInst>(*U->getUser())) 1521 continue; 1522 if (I->beginOffset() != B->beginOffset() || I->endOffset() != EndOffset) 1523 continue; 1524 1525 Type *UserTy = nullptr; 1526 if (LoadInst *LI = dyn_cast<LoadInst>(U->getUser())) { 1527 UserTy = LI->getType(); 1528 } else if (StoreInst *SI = dyn_cast<StoreInst>(U->getUser())) { 1529 UserTy = SI->getValueOperand()->getType(); 1530 } 1531 1532 if (IntegerType *UserITy = dyn_cast_or_null<IntegerType>(UserTy)) { 1533 // If the type is larger than the partition, skip it. We only encounter 1534 // this for split integer operations where we want to use the type of the 1535 // entity causing the split. Also skip if the type is not a byte width 1536 // multiple. 1537 if (UserITy->getBitWidth() % 8 != 0 || 1538 UserITy->getBitWidth() / 8 > (EndOffset - B->beginOffset())) 1539 continue; 1540 1541 // Track the largest bitwidth integer type used in this way in case there 1542 // is no common type. 1543 if (!ITy || ITy->getBitWidth() < UserITy->getBitWidth()) 1544 ITy = UserITy; 1545 } 1546 1547 // To avoid depending on the order of slices, Ty and TyIsCommon must not 1548 // depend on types skipped above. 1549 if (!UserTy || (Ty && Ty != UserTy)) 1550 TyIsCommon = false; // Give up on anything but an iN type. 1551 else 1552 Ty = UserTy; 1553 } 1554 1555 return {TyIsCommon ? Ty : nullptr, ITy}; 1556 } 1557 1558 /// PHI instructions that use an alloca and are subsequently loaded can be 1559 /// rewritten to load both input pointers in the pred blocks and then PHI the 1560 /// results, allowing the load of the alloca to be promoted. 1561 /// From this: 1562 /// %P2 = phi [i32* %Alloca, i32* %Other] 1563 /// %V = load i32* %P2 1564 /// to: 1565 /// %V1 = load i32* %Alloca -> will be mem2reg'd 1566 /// ... 1567 /// %V2 = load i32* %Other 1568 /// ... 1569 /// %V = phi [i32 %V1, i32 %V2] 1570 /// 1571 /// We can do this to a select if its only uses are loads and if the operands 1572 /// to the select can be loaded unconditionally. 1573 /// 1574 /// FIXME: This should be hoisted into a generic utility, likely in 1575 /// Transforms/Util/Local.h 1576 static bool isSafePHIToSpeculate(PHINode &PN) { 1577 const DataLayout &DL = PN.getModule()->getDataLayout(); 1578 1579 // For now, we can only do this promotion if the load is in the same block 1580 // as the PHI, and if there are no stores between the phi and load. 1581 // TODO: Allow recursive phi users. 1582 // TODO: Allow stores. 1583 BasicBlock *BB = PN.getParent(); 1584 Align MaxAlign; 1585 uint64_t APWidth = DL.getIndexTypeSizeInBits(PN.getType()); 1586 Type *LoadType = nullptr; 1587 for (User *U : PN.users()) { 1588 LoadInst *LI = dyn_cast<LoadInst>(U); 1589 if (!LI || !LI->isSimple()) 1590 return false; 1591 1592 // For now we only allow loads in the same block as the PHI. This is 1593 // a common case that happens when instcombine merges two loads through 1594 // a PHI. 1595 if (LI->getParent() != BB) 1596 return false; 1597 1598 if (LoadType) { 1599 if (LoadType != LI->getType()) 1600 return false; 1601 } else { 1602 LoadType = LI->getType(); 1603 } 1604 1605 // Ensure that there are no instructions between the PHI and the load that 1606 // could store. 
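    // (Scanning forward from the PHI itself is sufficient here because the
    // load was already required to be in the same block as the PHI.)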
1607 for (BasicBlock::iterator BBI(PN); &*BBI != LI; ++BBI) 1608 if (BBI->mayWriteToMemory()) 1609 return false; 1610 1611 MaxAlign = std::max(MaxAlign, LI->getAlign()); 1612 } 1613 1614 if (!LoadType) 1615 return false; 1616 1617 APInt LoadSize = 1618 APInt(APWidth, DL.getTypeStoreSize(LoadType).getFixedValue()); 1619 1620 // We can only transform this if it is safe to push the loads into the 1621 // predecessor blocks. The only thing to watch out for is that we can't put 1622 // a possibly trapping load in the predecessor if it is a critical edge. 1623 for (unsigned Idx = 0, Num = PN.getNumIncomingValues(); Idx != Num; ++Idx) { 1624 Instruction *TI = PN.getIncomingBlock(Idx)->getTerminator(); 1625 Value *InVal = PN.getIncomingValue(Idx); 1626 1627 // If the value is produced by the terminator of the predecessor (an 1628 // invoke) or it has side-effects, there is no valid place to put a load 1629 // in the predecessor. 1630 if (TI == InVal || TI->mayHaveSideEffects()) 1631 return false; 1632 1633 // If the predecessor has a single successor, then the edge isn't 1634 // critical. 1635 if (TI->getNumSuccessors() == 1) 1636 continue; 1637 1638 // If this pointer is always safe to load, or if we can prove that there 1639 // is already a load in the block, then we can move the load to the pred 1640 // block. 1641 if (isSafeToLoadUnconditionally(InVal, MaxAlign, LoadSize, DL, TI)) 1642 continue; 1643 1644 return false; 1645 } 1646 1647 return true; 1648 } 1649 1650 static void speculatePHINodeLoads(IRBuilderTy &IRB, PHINode &PN) { 1651 LLVM_DEBUG(dbgs() << " original: " << PN << "\n"); 1652 1653 LoadInst *SomeLoad = cast<LoadInst>(PN.user_back()); 1654 Type *LoadTy = SomeLoad->getType(); 1655 IRB.SetInsertPoint(&PN); 1656 PHINode *NewPN = IRB.CreatePHI(LoadTy, PN.getNumIncomingValues(), 1657 PN.getName() + ".sroa.speculated"); 1658 1659 // Get the AA tags and alignment to use from one of the loads. It does not 1660 // matter which one we get and if any differ. 1661 AAMDNodes AATags = SomeLoad->getAAMetadata(); 1662 Align Alignment = SomeLoad->getAlign(); 1663 1664 // Rewrite all loads of the PN to use the new PHI. 1665 while (!PN.use_empty()) { 1666 LoadInst *LI = cast<LoadInst>(PN.user_back()); 1667 LI->replaceAllUsesWith(NewPN); 1668 LI->eraseFromParent(); 1669 } 1670 1671 // Inject loads into all of the pred blocks. 1672 DenseMap<BasicBlock*, Value*> InjectedLoads; 1673 for (unsigned Idx = 0, Num = PN.getNumIncomingValues(); Idx != Num; ++Idx) { 1674 BasicBlock *Pred = PN.getIncomingBlock(Idx); 1675 Value *InVal = PN.getIncomingValue(Idx); 1676 1677 // A PHI node is allowed to have multiple (duplicated) entries for the same 1678 // basic block, as long as the value is the same. So if we already injected 1679 // a load in the predecessor, then we should reuse the same load for all 1680 // duplicated entries. 1681 if (Value* V = InjectedLoads.lookup(Pred)) { 1682 NewPN->addIncoming(V, Pred); 1683 continue; 1684 } 1685 1686 Instruction *TI = Pred->getTerminator(); 1687 IRB.SetInsertPoint(TI); 1688 1689 LoadInst *Load = IRB.CreateAlignedLoad( 1690 LoadTy, InVal, Alignment, 1691 (PN.getName() + ".sroa.speculate.load." 
+ Pred->getName())); 1692 ++NumLoadsSpeculated; 1693 if (AATags) 1694 Load->setAAMetadata(AATags); 1695 NewPN->addIncoming(Load, Pred); 1696 InjectedLoads[Pred] = Load; 1697 } 1698 1699 LLVM_DEBUG(dbgs() << " speculated to: " << *NewPN << "\n"); 1700 PN.eraseFromParent(); 1701 } 1702 1703 SelectHandSpeculativity & 1704 SelectHandSpeculativity::setAsSpeculatable(bool isTrueVal) { 1705 if (isTrueVal) 1706 Bitfield::set<SelectHandSpeculativity::TrueVal>(Storage, true); 1707 else 1708 Bitfield::set<SelectHandSpeculativity::FalseVal>(Storage, true); 1709 return *this; 1710 } 1711 1712 bool SelectHandSpeculativity::isSpeculatable(bool isTrueVal) const { 1713 return isTrueVal ? Bitfield::get<SelectHandSpeculativity::TrueVal>(Storage) 1714 : Bitfield::get<SelectHandSpeculativity::FalseVal>(Storage); 1715 } 1716 1717 bool SelectHandSpeculativity::areAllSpeculatable() const { 1718 return isSpeculatable(/*isTrueVal=*/true) && 1719 isSpeculatable(/*isTrueVal=*/false); 1720 } 1721 1722 bool SelectHandSpeculativity::areAnySpeculatable() const { 1723 return isSpeculatable(/*isTrueVal=*/true) || 1724 isSpeculatable(/*isTrueVal=*/false); 1725 } 1726 bool SelectHandSpeculativity::areNoneSpeculatable() const { 1727 return !areAnySpeculatable(); 1728 } 1729 1730 static SelectHandSpeculativity 1731 isSafeLoadOfSelectToSpeculate(LoadInst &LI, SelectInst &SI, bool PreserveCFG) { 1732 assert(LI.isSimple() && "Only for simple loads"); 1733 SelectHandSpeculativity Spec; 1734 1735 const DataLayout &DL = SI.getModule()->getDataLayout(); 1736 for (Value *Value : {SI.getTrueValue(), SI.getFalseValue()}) 1737 if (isSafeToLoadUnconditionally(Value, LI.getType(), LI.getAlign(), DL, 1738 &LI)) 1739 Spec.setAsSpeculatable(/*isTrueVal=*/Value == SI.getTrueValue()); 1740 else if (PreserveCFG) 1741 return Spec; 1742 1743 return Spec; 1744 } 1745 1746 std::optional<RewriteableMemOps> 1747 SROA::isSafeSelectToSpeculate(SelectInst &SI, bool PreserveCFG) { 1748 RewriteableMemOps Ops; 1749 1750 for (User *U : SI.users()) { 1751 if (auto *BC = dyn_cast<BitCastInst>(U); BC && BC->hasOneUse()) 1752 U = *BC->user_begin(); 1753 1754 if (auto *Store = dyn_cast<StoreInst>(U)) { 1755 // Note that atomic stores can be transformed; atomic semantics do not 1756 // have any meaning for a local alloca. Stores are not speculatable, 1757 // however, so if we can't turn it into a predicated store, we are done. 1758 if (Store->isVolatile() || PreserveCFG) 1759 return {}; // Give up on this `select`. 1760 Ops.emplace_back(Store); 1761 continue; 1762 } 1763 1764 auto *LI = dyn_cast<LoadInst>(U); 1765 1766 // Note that atomic loads can be transformed; 1767 // atomic semantics do not have any meaning for a local alloca. 1768 if (!LI || LI->isVolatile()) 1769 return {}; // Give up on this `select`. 1770 1771 PossiblySpeculatableLoad Load(LI); 1772 if (!LI->isSimple()) { 1773 // If the `load` is not simple, we can't speculatively execute it, 1774 // but we could handle this via a CFG modification. But can we? 1775 if (PreserveCFG) 1776 return {}; // Give up on this `select`. 1777 Ops.emplace_back(Load); 1778 continue; 1779 } 1780 1781 SelectHandSpeculativity Spec = 1782 isSafeLoadOfSelectToSpeculate(*LI, SI, PreserveCFG); 1783 if (PreserveCFG && !Spec.areAllSpeculatable()) 1784 return {}; // Give up on this `select`. 
1785 1786 Load.setInt(Spec); 1787 Ops.emplace_back(Load); 1788 } 1789 1790 return Ops; 1791 } 1792 1793 static void speculateSelectInstLoads(SelectInst &SI, LoadInst &LI, 1794 IRBuilderTy &IRB) { 1795 LLVM_DEBUG(dbgs() << " original load: " << SI << "\n"); 1796 1797 Value *TV = SI.getTrueValue(); 1798 Value *FV = SI.getFalseValue(); 1799 // Replace the given load of the select with a select of two loads. 1800 1801 assert(LI.isSimple() && "We only speculate simple loads"); 1802 1803 IRB.SetInsertPoint(&LI); 1804 1805 LoadInst *TL = 1806 IRB.CreateAlignedLoad(LI.getType(), TV, LI.getAlign(), 1807 LI.getName() + ".sroa.speculate.load.true"); 1808 LoadInst *FL = 1809 IRB.CreateAlignedLoad(LI.getType(), FV, LI.getAlign(), 1810 LI.getName() + ".sroa.speculate.load.false"); 1811 NumLoadsSpeculated += 2; 1812 1813 // Transfer alignment and AA info if present. 1814 TL->setAlignment(LI.getAlign()); 1815 FL->setAlignment(LI.getAlign()); 1816 1817 AAMDNodes Tags = LI.getAAMetadata(); 1818 if (Tags) { 1819 TL->setAAMetadata(Tags); 1820 FL->setAAMetadata(Tags); 1821 } 1822 1823 Value *V = IRB.CreateSelect(SI.getCondition(), TL, FL, 1824 LI.getName() + ".sroa.speculated"); 1825 1826 LLVM_DEBUG(dbgs() << " speculated to: " << *V << "\n"); 1827 LI.replaceAllUsesWith(V); 1828 } 1829 1830 template <typename T> 1831 static void rewriteMemOpOfSelect(SelectInst &SI, T &I, 1832 SelectHandSpeculativity Spec, 1833 DomTreeUpdater &DTU) { 1834 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && "Only for load and store!"); 1835 LLVM_DEBUG(dbgs() << " original mem op: " << I << "\n"); 1836 BasicBlock *Head = I.getParent(); 1837 Instruction *ThenTerm = nullptr; 1838 Instruction *ElseTerm = nullptr; 1839 if (Spec.areNoneSpeculatable()) 1840 SplitBlockAndInsertIfThenElse(SI.getCondition(), &I, &ThenTerm, &ElseTerm, 1841 SI.getMetadata(LLVMContext::MD_prof), &DTU); 1842 else { 1843 SplitBlockAndInsertIfThen(SI.getCondition(), &I, /*Unreachable=*/false, 1844 SI.getMetadata(LLVMContext::MD_prof), &DTU, 1845 /*LI=*/nullptr, /*ThenBlock=*/nullptr); 1846 if (Spec.isSpeculatable(/*isTrueVal=*/true)) 1847 cast<BranchInst>(Head->getTerminator())->swapSuccessors(); 1848 } 1849 auto *HeadBI = cast<BranchInst>(Head->getTerminator()); 1850 Spec = {}; // Do not use `Spec` beyond this point. 1851 BasicBlock *Tail = I.getParent(); 1852 Tail->setName(Head->getName() + ".cont"); 1853 PHINode *PN; 1854 if (isa<LoadInst>(I)) 1855 PN = PHINode::Create(I.getType(), 2, "", &I); 1856 for (BasicBlock *SuccBB : successors(Head)) { 1857 bool IsThen = SuccBB == HeadBI->getSuccessor(0); 1858 int SuccIdx = IsThen ? 0 : 1; 1859 auto *NewMemOpBB = SuccBB == Tail ? Head : SuccBB; 1860 auto &CondMemOp = cast<T>(*I.clone()); 1861 if (NewMemOpBB != Head) { 1862 NewMemOpBB->setName(Head->getName() + (IsThen ? ".then" : ".else")); 1863 if (isa<LoadInst>(I)) 1864 ++NumLoadsPredicated; 1865 else 1866 ++NumStoresPredicated; 1867 } else { 1868 CondMemOp.dropUBImplyingAttrsAndMetadata(); 1869 ++NumLoadsSpeculated; 1870 } 1871 CondMemOp.insertBefore(NewMemOpBB->getTerminator()); 1872 Value *Ptr = SI.getOperand(1 + SuccIdx); 1873 CondMemOp.setOperand(I.getPointerOperandIndex(), Ptr); 1874 if (isa<LoadInst>(I)) { 1875 CondMemOp.setName(I.getName() + (IsThen ? 
".then" : ".else") + ".val"); 1876 PN->addIncoming(&CondMemOp, NewMemOpBB); 1877 } else 1878 LLVM_DEBUG(dbgs() << " to: " << CondMemOp << "\n"); 1879 } 1880 if (isa<LoadInst>(I)) { 1881 PN->takeName(&I); 1882 LLVM_DEBUG(dbgs() << " to: " << *PN << "\n"); 1883 I.replaceAllUsesWith(PN); 1884 } 1885 } 1886 1887 static void rewriteMemOpOfSelect(SelectInst &SelInst, Instruction &I, 1888 SelectHandSpeculativity Spec, 1889 DomTreeUpdater &DTU) { 1890 if (auto *LI = dyn_cast<LoadInst>(&I)) 1891 rewriteMemOpOfSelect(SelInst, *LI, Spec, DTU); 1892 else if (auto *SI = dyn_cast<StoreInst>(&I)) 1893 rewriteMemOpOfSelect(SelInst, *SI, Spec, DTU); 1894 else 1895 llvm_unreachable_internal("Only for load and store."); 1896 } 1897 1898 static bool rewriteSelectInstMemOps(SelectInst &SI, 1899 const RewriteableMemOps &Ops, 1900 IRBuilderTy &IRB, DomTreeUpdater *DTU) { 1901 bool CFGChanged = false; 1902 LLVM_DEBUG(dbgs() << " original select: " << SI << "\n"); 1903 1904 for (const RewriteableMemOp &Op : Ops) { 1905 SelectHandSpeculativity Spec; 1906 Instruction *I; 1907 if (auto *const *US = std::get_if<UnspeculatableStore>(&Op)) { 1908 I = *US; 1909 } else { 1910 auto PSL = std::get<PossiblySpeculatableLoad>(Op); 1911 I = PSL.getPointer(); 1912 Spec = PSL.getInt(); 1913 } 1914 if (Spec.areAllSpeculatable()) { 1915 speculateSelectInstLoads(SI, cast<LoadInst>(*I), IRB); 1916 } else { 1917 assert(DTU && "Should not get here when not allowed to modify the CFG!"); 1918 rewriteMemOpOfSelect(SI, *I, Spec, *DTU); 1919 CFGChanged = true; 1920 } 1921 I->eraseFromParent(); 1922 } 1923 1924 for (User *U : make_early_inc_range(SI.users())) 1925 cast<BitCastInst>(U)->eraseFromParent(); 1926 SI.eraseFromParent(); 1927 return CFGChanged; 1928 } 1929 1930 /// Compute an adjusted pointer from Ptr by Offset bytes where the 1931 /// resulting pointer has PointerTy. 1932 static Value *getAdjustedPtr(IRBuilderTy &IRB, const DataLayout &DL, Value *Ptr, 1933 APInt Offset, Type *PointerTy, 1934 const Twine &NamePrefix) { 1935 if (Offset != 0) 1936 Ptr = IRB.CreateInBoundsPtrAdd(Ptr, IRB.getInt(Offset), 1937 NamePrefix + "sroa_idx"); 1938 return IRB.CreatePointerBitCastOrAddrSpaceCast(Ptr, PointerTy, 1939 NamePrefix + "sroa_cast"); 1940 } 1941 1942 /// Compute the adjusted alignment for a load or store from an offset. 1943 static Align getAdjustedAlignment(Instruction *I, uint64_t Offset) { 1944 return commonAlignment(getLoadStoreAlignment(I), Offset); 1945 } 1946 1947 /// Test whether we can convert a value from the old to the new type. 1948 /// 1949 /// This predicate should be used to guard calls to convertValue in order to 1950 /// ensure that we only try to convert viable values. The strategy is that we 1951 /// will peel off single element struct and array wrappings to get to an 1952 /// underlying value, and convert that value. 1953 static bool canConvertValue(const DataLayout &DL, Type *OldTy, Type *NewTy) { 1954 if (OldTy == NewTy) 1955 return true; 1956 1957 // For integer types, we can't handle any bit-width differences. This would 1958 // break both vector conversions with extension and introduce endianness 1959 // issues when in conjunction with loads and stores. 
1960 if (isa<IntegerType>(OldTy) && isa<IntegerType>(NewTy)) { 1961 assert(cast<IntegerType>(OldTy)->getBitWidth() != 1962 cast<IntegerType>(NewTy)->getBitWidth() && 1963 "We can't have the same bitwidth for different int types"); 1964 return false; 1965 } 1966 1967 if (DL.getTypeSizeInBits(NewTy).getFixedValue() != 1968 DL.getTypeSizeInBits(OldTy).getFixedValue()) 1969 return false; 1970 if (!NewTy->isSingleValueType() || !OldTy->isSingleValueType()) 1971 return false; 1972 1973 // We can convert pointers to integers and vice-versa. Same for vectors 1974 // of pointers and integers. 1975 OldTy = OldTy->getScalarType(); 1976 NewTy = NewTy->getScalarType(); 1977 if (NewTy->isPointerTy() || OldTy->isPointerTy()) { 1978 if (NewTy->isPointerTy() && OldTy->isPointerTy()) { 1979 unsigned OldAS = OldTy->getPointerAddressSpace(); 1980 unsigned NewAS = NewTy->getPointerAddressSpace(); 1981 // Convert pointers if they are pointers from the same address space or 1982 // different integral (not non-integral) address spaces with the same 1983 // pointer size. 1984 return OldAS == NewAS || 1985 (!DL.isNonIntegralAddressSpace(OldAS) && 1986 !DL.isNonIntegralAddressSpace(NewAS) && 1987 DL.getPointerSize(OldAS) == DL.getPointerSize(NewAS)); 1988 } 1989 1990 // We can convert integers to integral pointers, but not to non-integral 1991 // pointers. 1992 if (OldTy->isIntegerTy()) 1993 return !DL.isNonIntegralPointerType(NewTy); 1994 1995 // We can convert integral pointers to integers, but non-integral pointers 1996 // need to remain pointers. 1997 if (!DL.isNonIntegralPointerType(OldTy)) 1998 return NewTy->isIntegerTy(); 1999 2000 return false; 2001 } 2002 2003 if (OldTy->isTargetExtTy() || NewTy->isTargetExtTy()) 2004 return false; 2005 2006 return true; 2007 } 2008 2009 /// Generic routine to convert an SSA value to a value of a different 2010 /// type. 2011 /// 2012 /// This will try various different casting techniques, such as bitcasts, 2013 /// inttoptr, and ptrtoint casts. Use the \c canConvertValue predicate to test 2014 /// two types for viability with this routine. 2015 static Value *convertValue(const DataLayout &DL, IRBuilderTy &IRB, Value *V, 2016 Type *NewTy) { 2017 Type *OldTy = V->getType(); 2018 assert(canConvertValue(DL, OldTy, NewTy) && "Value not convertable to type"); 2019 2020 if (OldTy == NewTy) 2021 return V; 2022 2023 assert(!(isa<IntegerType>(OldTy) && isa<IntegerType>(NewTy)) && 2024 "Integer types must be the exact same to convert."); 2025 2026 // See if we need inttoptr for this type pair. May require additional bitcast. 2027 if (OldTy->isIntOrIntVectorTy() && NewTy->isPtrOrPtrVectorTy()) { 2028 // Expand <2 x i32> to i8* --> <2 x i32> to i64 to i8* 2029 // Expand i128 to <2 x i8*> --> i128 to <2 x i64> to <2 x i8*> 2030 // Expand <4 x i32> to <2 x i8*> --> <4 x i32> to <2 x i64> to <2 x i8*> 2031 // Directly handle i64 to i8* 2032 return IRB.CreateIntToPtr(IRB.CreateBitCast(V, DL.getIntPtrType(NewTy)), 2033 NewTy); 2034 } 2035 2036 // See if we need ptrtoint for this type pair. May require additional bitcast. 
2037 if (OldTy->isPtrOrPtrVectorTy() && NewTy->isIntOrIntVectorTy()) { 2038 // Expand <2 x i8*> to i128 --> <2 x i8*> to <2 x i64> to i128 2039 // Expand i8* to <2 x i32> --> i8* to i64 to <2 x i32> 2040 // Expand <2 x i8*> to <4 x i32> --> <2 x i8*> to <2 x i64> to <4 x i32> 2041 // Expand i8* to i64 --> i8* to i64 to i64 2042 return IRB.CreateBitCast(IRB.CreatePtrToInt(V, DL.getIntPtrType(OldTy)), 2043 NewTy); 2044 } 2045 2046 if (OldTy->isPtrOrPtrVectorTy() && NewTy->isPtrOrPtrVectorTy()) { 2047 unsigned OldAS = OldTy->getPointerAddressSpace(); 2048 unsigned NewAS = NewTy->getPointerAddressSpace(); 2049 // To convert pointers with different address spaces (they are already 2050 // checked convertible, i.e. they have the same pointer size), so far we 2051 // cannot use `bitcast` (which has restrict on the same address space) or 2052 // `addrspacecast` (which is not always no-op casting). Instead, use a pair 2053 // of no-op `ptrtoint`/`inttoptr` casts through an integer with the same bit 2054 // size. 2055 if (OldAS != NewAS) { 2056 assert(DL.getPointerSize(OldAS) == DL.getPointerSize(NewAS)); 2057 return IRB.CreateIntToPtr(IRB.CreatePtrToInt(V, DL.getIntPtrType(OldTy)), 2058 NewTy); 2059 } 2060 } 2061 2062 return IRB.CreateBitCast(V, NewTy); 2063 } 2064 2065 /// Test whether the given slice use can be promoted to a vector. 2066 /// 2067 /// This function is called to test each entry in a partition which is slated 2068 /// for a single slice. 2069 static bool isVectorPromotionViableForSlice(Partition &P, const Slice &S, 2070 VectorType *Ty, 2071 uint64_t ElementSize, 2072 const DataLayout &DL) { 2073 // First validate the slice offsets. 2074 uint64_t BeginOffset = 2075 std::max(S.beginOffset(), P.beginOffset()) - P.beginOffset(); 2076 uint64_t BeginIndex = BeginOffset / ElementSize; 2077 if (BeginIndex * ElementSize != BeginOffset || 2078 BeginIndex >= cast<FixedVectorType>(Ty)->getNumElements()) 2079 return false; 2080 uint64_t EndOffset = 2081 std::min(S.endOffset(), P.endOffset()) - P.beginOffset(); 2082 uint64_t EndIndex = EndOffset / ElementSize; 2083 if (EndIndex * ElementSize != EndOffset || 2084 EndIndex > cast<FixedVectorType>(Ty)->getNumElements()) 2085 return false; 2086 2087 assert(EndIndex > BeginIndex && "Empty vector!"); 2088 uint64_t NumElements = EndIndex - BeginIndex; 2089 Type *SliceTy = (NumElements == 1) 2090 ? Ty->getElementType() 2091 : FixedVectorType::get(Ty->getElementType(), NumElements); 2092 2093 Type *SplitIntTy = 2094 Type::getIntNTy(Ty->getContext(), NumElements * ElementSize * 8); 2095 2096 Use *U = S.getUse(); 2097 2098 if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(U->getUser())) { 2099 if (MI->isVolatile()) 2100 return false; 2101 if (!S.isSplittable()) 2102 return false; // Skip any unsplittable intrinsics. 2103 } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(U->getUser())) { 2104 if (!II->isLifetimeStartOrEnd() && !II->isDroppable()) 2105 return false; 2106 } else if (LoadInst *LI = dyn_cast<LoadInst>(U->getUser())) { 2107 if (LI->isVolatile()) 2108 return false; 2109 Type *LTy = LI->getType(); 2110 // Disable vector promotion when there are loads or stores of an FCA. 
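    // ("FCA" = first-class aggregate, e.g. a direct `load {i32, float}, ptr %p`
    // of the whole struct value rather than of its scalar fields.)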
2111 if (LTy->isStructTy()) 2112 return false; 2113 if (P.beginOffset() > S.beginOffset() || P.endOffset() < S.endOffset()) { 2114 assert(LTy->isIntegerTy()); 2115 LTy = SplitIntTy; 2116 } 2117 if (!canConvertValue(DL, SliceTy, LTy)) 2118 return false; 2119 } else if (StoreInst *SI = dyn_cast<StoreInst>(U->getUser())) { 2120 if (SI->isVolatile()) 2121 return false; 2122 Type *STy = SI->getValueOperand()->getType(); 2123 // Disable vector promotion when there are loads or stores of an FCA. 2124 if (STy->isStructTy()) 2125 return false; 2126 if (P.beginOffset() > S.beginOffset() || P.endOffset() < S.endOffset()) { 2127 assert(STy->isIntegerTy()); 2128 STy = SplitIntTy; 2129 } 2130 if (!canConvertValue(DL, STy, SliceTy)) 2131 return false; 2132 } else { 2133 return false; 2134 } 2135 2136 return true; 2137 } 2138 2139 /// Test whether a vector type is viable for promotion. 2140 /// 2141 /// This implements the necessary checking for \c checkVectorTypesForPromotion 2142 /// (and thus isVectorPromotionViable) over all slices of the alloca for the 2143 /// given VectorType. 2144 static bool checkVectorTypeForPromotion(Partition &P, VectorType *VTy, 2145 const DataLayout &DL) { 2146 uint64_t ElementSize = 2147 DL.getTypeSizeInBits(VTy->getElementType()).getFixedValue(); 2148 2149 // While the definition of LLVM vectors is bitpacked, we don't support sizes 2150 // that aren't byte sized. 2151 if (ElementSize % 8) 2152 return false; 2153 assert((DL.getTypeSizeInBits(VTy).getFixedValue() % 8) == 0 && 2154 "vector size not a multiple of element size?"); 2155 ElementSize /= 8; 2156 2157 for (const Slice &S : P) 2158 if (!isVectorPromotionViableForSlice(P, S, VTy, ElementSize, DL)) 2159 return false; 2160 2161 for (const Slice *S : P.splitSliceTails()) 2162 if (!isVectorPromotionViableForSlice(P, *S, VTy, ElementSize, DL)) 2163 return false; 2164 2165 return true; 2166 } 2167 2168 /// Test whether any vector type in \p CandidateTys is viable for promotion. 2169 /// 2170 /// This implements the necessary checking for \c isVectorPromotionViable over 2171 /// all slices of the alloca for the given VectorType. 2172 static VectorType * 2173 checkVectorTypesForPromotion(Partition &P, const DataLayout &DL, 2174 SmallVectorImpl<VectorType *> &CandidateTys, 2175 bool HaveCommonEltTy, Type *CommonEltTy, 2176 bool HaveVecPtrTy, bool HaveCommonVecPtrTy, 2177 VectorType *CommonVecPtrTy) { 2178 // If we didn't find a vector type, nothing to do here. 2179 if (CandidateTys.empty()) 2180 return nullptr; 2181 2182 // Pointer-ness is sticky, if we had a vector-of-pointers candidate type, 2183 // then we should choose it, not some other alternative. 2184 // But, we can't perform a no-op pointer address space change via bitcast, 2185 // so if we didn't have a common pointer element type, bail. 2186 if (HaveVecPtrTy && !HaveCommonVecPtrTy) 2187 return nullptr; 2188 2189 // Try to pick the "best" element type out of the choices. 2190 if (!HaveCommonEltTy && HaveVecPtrTy) { 2191 // If there was a pointer element type, there's really only one choice. 2192 CandidateTys.clear(); 2193 CandidateTys.push_back(CommonVecPtrTy); 2194 } else if (!HaveCommonEltTy && !HaveVecPtrTy) { 2195 // Integer-ify vector types. 2196 for (VectorType *&VTy : CandidateTys) { 2197 if (!VTy->getElementType()->isIntegerTy()) 2198 VTy = cast<VectorType>(VTy->getWithNewType(IntegerType::getIntNTy( 2199 VTy->getContext(), VTy->getScalarSizeInBits()))); 2200 } 2201 2202 // Rank the remaining candidate vector types. 
This is easy because we know 2203 // they're all integer vectors. We sort by ascending number of elements. 2204 auto RankVectorTypesComp = [&DL](VectorType *RHSTy, VectorType *LHSTy) { 2205 (void)DL; 2206 assert(DL.getTypeSizeInBits(RHSTy).getFixedValue() == 2207 DL.getTypeSizeInBits(LHSTy).getFixedValue() && 2208 "Cannot have vector types of different sizes!"); 2209 assert(RHSTy->getElementType()->isIntegerTy() && 2210 "All non-integer types eliminated!"); 2211 assert(LHSTy->getElementType()->isIntegerTy() && 2212 "All non-integer types eliminated!"); 2213 return cast<FixedVectorType>(RHSTy)->getNumElements() < 2214 cast<FixedVectorType>(LHSTy)->getNumElements(); 2215 }; 2216 auto RankVectorTypesEq = [&DL](VectorType *RHSTy, VectorType *LHSTy) { 2217 (void)DL; 2218 assert(DL.getTypeSizeInBits(RHSTy).getFixedValue() == 2219 DL.getTypeSizeInBits(LHSTy).getFixedValue() && 2220 "Cannot have vector types of different sizes!"); 2221 assert(RHSTy->getElementType()->isIntegerTy() && 2222 "All non-integer types eliminated!"); 2223 assert(LHSTy->getElementType()->isIntegerTy() && 2224 "All non-integer types eliminated!"); 2225 return cast<FixedVectorType>(RHSTy)->getNumElements() == 2226 cast<FixedVectorType>(LHSTy)->getNumElements(); 2227 }; 2228 llvm::sort(CandidateTys, RankVectorTypesComp); 2229 CandidateTys.erase(std::unique(CandidateTys.begin(), CandidateTys.end(), 2230 RankVectorTypesEq), 2231 CandidateTys.end()); 2232 } else { 2233 // The only way to have the same element type in every vector type is to 2234 // have the same vector type. Check that and remove all but one. 2235 #ifndef NDEBUG 2236 for (VectorType *VTy : CandidateTys) { 2237 assert(VTy->getElementType() == CommonEltTy && 2238 "Unaccounted for element type!"); 2239 assert(VTy == CandidateTys[0] && 2240 "Different vector types with the same element type!"); 2241 } 2242 #endif 2243 CandidateTys.resize(1); 2244 } 2245 2246 // FIXME: hack. Do we have a named constant for this? 2247 // SDAG SDNode can't have more than 65535 operands. 2248 llvm::erase_if(CandidateTys, [](VectorType *VTy) { 2249 return cast<FixedVectorType>(VTy)->getNumElements() > 2250 std::numeric_limits<unsigned short>::max(); 2251 }); 2252 2253 for (VectorType *VTy : CandidateTys) 2254 if (checkVectorTypeForPromotion(P, VTy, DL)) 2255 return VTy; 2256 2257 return nullptr; 2258 } 2259 2260 static VectorType *createAndCheckVectorTypesForPromotion( 2261 SetVector<Type *> &OtherTys, ArrayRef<VectorType *> CandidateTysCopy, 2262 function_ref<void(Type *)> CheckCandidateType, Partition &P, 2263 const DataLayout &DL, SmallVectorImpl<VectorType *> &CandidateTys, 2264 bool &HaveCommonEltTy, Type *&CommonEltTy, bool &HaveVecPtrTy, 2265 bool &HaveCommonVecPtrTy, VectorType *&CommonVecPtrTy) { 2266 [[maybe_unused]] VectorType *OriginalElt = 2267 CandidateTysCopy.size() ? CandidateTysCopy[0] : nullptr; 2268 // Consider additional vector types where the element type size is a 2269 // multiple of load/store element size. 2270 for (Type *Ty : OtherTys) { 2271 if (!VectorType::isValidElementType(Ty)) 2272 continue; 2273 unsigned TypeSize = DL.getTypeSizeInBits(Ty).getFixedValue(); 2274 // Make a copy of CandidateTys and iterate through it, because we 2275 // might append to CandidateTys in the loop. 
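    // Sketch of the intent (hypothetical types): if the copy already holds
    // <4 x i32> (128 bits) and an i64 load/store type is seen here, a new
    // <2 x i64> candidate is formed, because 64 bits is neither the vector
    // size nor the element size yet divides the vector size evenly.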
2276 for (VectorType *const VTy : CandidateTysCopy) { 2277 // The elements in the copy should remain invariant throughout the loop 2278 assert(CandidateTysCopy[0] == OriginalElt && "Different Element"); 2279 unsigned VectorSize = DL.getTypeSizeInBits(VTy).getFixedValue(); 2280 unsigned ElementSize = 2281 DL.getTypeSizeInBits(VTy->getElementType()).getFixedValue(); 2282 if (TypeSize != VectorSize && TypeSize != ElementSize && 2283 VectorSize % TypeSize == 0) { 2284 VectorType *NewVTy = VectorType::get(Ty, VectorSize / TypeSize, false); 2285 CheckCandidateType(NewVTy); 2286 } 2287 } 2288 } 2289 2290 return checkVectorTypesForPromotion(P, DL, CandidateTys, HaveCommonEltTy, 2291 CommonEltTy, HaveVecPtrTy, 2292 HaveCommonVecPtrTy, CommonVecPtrTy); 2293 } 2294 2295 /// Test whether the given alloca partitioning and range of slices can be 2296 /// promoted to a vector. 2297 /// 2298 /// This is a quick test to check whether we can rewrite a particular alloca 2299 /// partition (and its newly formed alloca) into a vector alloca with only 2300 /// whole-vector loads and stores such that it could be promoted to a vector 2301 /// SSA value. We only can ensure this for a limited set of operations, and we 2302 /// don't want to do the rewrites unless we are confident that the result will 2303 /// be promotable, so we have an early test here. 2304 static VectorType *isVectorPromotionViable(Partition &P, const DataLayout &DL) { 2305 // Collect the candidate types for vector-based promotion. Also track whether 2306 // we have different element types. 2307 SmallVector<VectorType *, 4> CandidateTys; 2308 SetVector<Type *> LoadStoreTys; 2309 SetVector<Type *> DeferredTys; 2310 Type *CommonEltTy = nullptr; 2311 VectorType *CommonVecPtrTy = nullptr; 2312 bool HaveVecPtrTy = false; 2313 bool HaveCommonEltTy = true; 2314 bool HaveCommonVecPtrTy = true; 2315 auto CheckCandidateType = [&](Type *Ty) { 2316 if (auto *VTy = dyn_cast<VectorType>(Ty)) { 2317 // Return if bitcast to vectors is different for total size in bits. 2318 if (!CandidateTys.empty()) { 2319 VectorType *V = CandidateTys[0]; 2320 if (DL.getTypeSizeInBits(VTy).getFixedValue() != 2321 DL.getTypeSizeInBits(V).getFixedValue()) { 2322 CandidateTys.clear(); 2323 return; 2324 } 2325 } 2326 CandidateTys.push_back(VTy); 2327 Type *EltTy = VTy->getElementType(); 2328 2329 if (!CommonEltTy) 2330 CommonEltTy = EltTy; 2331 else if (CommonEltTy != EltTy) 2332 HaveCommonEltTy = false; 2333 2334 if (EltTy->isPointerTy()) { 2335 HaveVecPtrTy = true; 2336 if (!CommonVecPtrTy) 2337 CommonVecPtrTy = VTy; 2338 else if (CommonVecPtrTy != VTy) 2339 HaveCommonVecPtrTy = false; 2340 } 2341 } 2342 }; 2343 2344 // Put load and store types into a set for de-duplication. 2345 for (const Slice &S : P) { 2346 Type *Ty; 2347 if (auto *LI = dyn_cast<LoadInst>(S.getUse()->getUser())) 2348 Ty = LI->getType(); 2349 else if (auto *SI = dyn_cast<StoreInst>(S.getUse()->getUser())) 2350 Ty = SI->getValueOperand()->getType(); 2351 else 2352 continue; 2353 2354 auto CandTy = Ty->getScalarType(); 2355 if (CandTy->isPointerTy() && (S.beginOffset() != P.beginOffset() || 2356 S.endOffset() != P.endOffset())) { 2357 DeferredTys.insert(Ty); 2358 continue; 2359 } 2360 2361 LoadStoreTys.insert(Ty); 2362 // Consider any loads or stores that are the exact size of the slice. 
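    // For instance, a hypothetical `store <4 x float>` spanning the whole
    // partition makes <4 x float> a candidate type here; accesses that do not
    // cover the partition exactly were only collected into the sets above.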
2363 if (S.beginOffset() == P.beginOffset() && S.endOffset() == P.endOffset()) 2364 CheckCandidateType(Ty); 2365 } 2366 2367 SmallVector<VectorType *, 4> CandidateTysCopy = CandidateTys; 2368 if (auto *VTy = createAndCheckVectorTypesForPromotion( 2369 LoadStoreTys, CandidateTysCopy, CheckCandidateType, P, DL, 2370 CandidateTys, HaveCommonEltTy, CommonEltTy, HaveVecPtrTy, 2371 HaveCommonVecPtrTy, CommonVecPtrTy)) 2372 return VTy; 2373 2374 CandidateTys.clear(); 2375 return createAndCheckVectorTypesForPromotion( 2376 DeferredTys, CandidateTysCopy, CheckCandidateType, P, DL, CandidateTys, 2377 HaveCommonEltTy, CommonEltTy, HaveVecPtrTy, HaveCommonVecPtrTy, 2378 CommonVecPtrTy); 2379 } 2380 2381 /// Test whether a slice of an alloca is valid for integer widening. 2382 /// 2383 /// This implements the necessary checking for the \c isIntegerWideningViable 2384 /// test below on a single slice of the alloca. 2385 static bool isIntegerWideningViableForSlice(const Slice &S, 2386 uint64_t AllocBeginOffset, 2387 Type *AllocaTy, 2388 const DataLayout &DL, 2389 bool &WholeAllocaOp) { 2390 uint64_t Size = DL.getTypeStoreSize(AllocaTy).getFixedValue(); 2391 2392 uint64_t RelBegin = S.beginOffset() - AllocBeginOffset; 2393 uint64_t RelEnd = S.endOffset() - AllocBeginOffset; 2394 2395 Use *U = S.getUse(); 2396 2397 // Lifetime intrinsics operate over the whole alloca whose sizes are usually 2398 // larger than other load/store slices (RelEnd > Size). But lifetime are 2399 // always promotable and should not impact other slices' promotability of the 2400 // partition. 2401 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(U->getUser())) { 2402 if (II->isLifetimeStartOrEnd() || II->isDroppable()) 2403 return true; 2404 } 2405 2406 // We can't reasonably handle cases where the load or store extends past 2407 // the end of the alloca's type and into its padding. 2408 if (RelEnd > Size) 2409 return false; 2410 2411 if (LoadInst *LI = dyn_cast<LoadInst>(U->getUser())) { 2412 if (LI->isVolatile()) 2413 return false; 2414 // We can't handle loads that extend past the allocated memory. 2415 if (DL.getTypeStoreSize(LI->getType()).getFixedValue() > Size) 2416 return false; 2417 // So far, AllocaSliceRewriter does not support widening split slice tails 2418 // in rewriteIntegerLoad. 2419 if (S.beginOffset() < AllocBeginOffset) 2420 return false; 2421 // Note that we don't count vector loads or stores as whole-alloca 2422 // operations which enable integer widening because we would prefer to use 2423 // vector widening instead. 2424 if (!isa<VectorType>(LI->getType()) && RelBegin == 0 && RelEnd == Size) 2425 WholeAllocaOp = true; 2426 if (IntegerType *ITy = dyn_cast<IntegerType>(LI->getType())) { 2427 if (ITy->getBitWidth() < DL.getTypeStoreSizeInBits(ITy).getFixedValue()) 2428 return false; 2429 } else if (RelBegin != 0 || RelEnd != Size || 2430 !canConvertValue(DL, AllocaTy, LI->getType())) { 2431 // Non-integer loads need to be convertible from the alloca type so that 2432 // they are promotable. 2433 return false; 2434 } 2435 } else if (StoreInst *SI = dyn_cast<StoreInst>(U->getUser())) { 2436 Type *ValueTy = SI->getValueOperand()->getType(); 2437 if (SI->isVolatile()) 2438 return false; 2439 // We can't handle stores that extend past the allocated memory. 2440 if (DL.getTypeStoreSize(ValueTy).getFixedValue() > Size) 2441 return false; 2442 // So far, AllocaSliceRewriter does not support widening split slice tails 2443 // in rewriteIntegerStore. 
2444 if (S.beginOffset() < AllocBeginOffset) 2445 return false; 2446 // Note that we don't count vector loads or stores as whole-alloca 2447 // operations which enable integer widening because we would prefer to use 2448 // vector widening instead. 2449 if (!isa<VectorType>(ValueTy) && RelBegin == 0 && RelEnd == Size) 2450 WholeAllocaOp = true; 2451 if (IntegerType *ITy = dyn_cast<IntegerType>(ValueTy)) { 2452 if (ITy->getBitWidth() < DL.getTypeStoreSizeInBits(ITy).getFixedValue()) 2453 return false; 2454 } else if (RelBegin != 0 || RelEnd != Size || 2455 !canConvertValue(DL, ValueTy, AllocaTy)) { 2456 // Non-integer stores need to be convertible to the alloca type so that 2457 // they are promotable. 2458 return false; 2459 } 2460 } else if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(U->getUser())) { 2461 if (MI->isVolatile() || !isa<Constant>(MI->getLength())) 2462 return false; 2463 if (!S.isSplittable()) 2464 return false; // Skip any unsplittable intrinsics. 2465 } else { 2466 return false; 2467 } 2468 2469 return true; 2470 } 2471 2472 /// Test whether the given alloca partition's integer operations can be 2473 /// widened to promotable ones. 2474 /// 2475 /// This is a quick test to check whether we can rewrite the integer loads and 2476 /// stores to a particular alloca into wider loads and stores and be able to 2477 /// promote the resulting alloca. 2478 static bool isIntegerWideningViable(Partition &P, Type *AllocaTy, 2479 const DataLayout &DL) { 2480 uint64_t SizeInBits = DL.getTypeSizeInBits(AllocaTy).getFixedValue(); 2481 // Don't create integer types larger than the maximum bitwidth. 2482 if (SizeInBits > IntegerType::MAX_INT_BITS) 2483 return false; 2484 2485 // Don't try to handle allocas with bit-padding. 2486 if (SizeInBits != DL.getTypeStoreSizeInBits(AllocaTy).getFixedValue()) 2487 return false; 2488 2489 // We need to ensure that an integer type with the appropriate bitwidth can 2490 // be converted to the alloca type, whatever that is. We don't want to force 2491 // the alloca itself to have an integer type if there is a more suitable one. 2492 Type *IntTy = Type::getIntNTy(AllocaTy->getContext(), SizeInBits); 2493 if (!canConvertValue(DL, AllocaTy, IntTy) || 2494 !canConvertValue(DL, IntTy, AllocaTy)) 2495 return false; 2496 2497 // While examining uses, we ensure that the alloca has a covering load or 2498 // store. We don't want to widen the integer operations only to fail to 2499 // promote due to some other unsplittable entry (which we may make splittable 2500 // later). However, if there are only splittable uses, go ahead and assume 2501 // that we cover the alloca. 2502 // FIXME: We shouldn't consider split slices that happen to start in the 2503 // partition here... 
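  // Sketch of the idea (hypothetical IR): for `%a = alloca i64` that is read
  // via two i32 loads of each half but written by one full-width i64 store,
  // the i64 store is the covering operation that sets WholeAllocaOp and makes
  // widening worthwhile.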
2504 bool WholeAllocaOp = P.empty() && DL.isLegalInteger(SizeInBits); 2505 2506 for (const Slice &S : P) 2507 if (!isIntegerWideningViableForSlice(S, P.beginOffset(), AllocaTy, DL, 2508 WholeAllocaOp)) 2509 return false; 2510 2511 for (const Slice *S : P.splitSliceTails()) 2512 if (!isIntegerWideningViableForSlice(*S, P.beginOffset(), AllocaTy, DL, 2513 WholeAllocaOp)) 2514 return false; 2515 2516 return WholeAllocaOp; 2517 } 2518 2519 static Value *extractInteger(const DataLayout &DL, IRBuilderTy &IRB, Value *V, 2520 IntegerType *Ty, uint64_t Offset, 2521 const Twine &Name) { 2522 LLVM_DEBUG(dbgs() << " start: " << *V << "\n"); 2523 IntegerType *IntTy = cast<IntegerType>(V->getType()); 2524 assert(DL.getTypeStoreSize(Ty).getFixedValue() + Offset <= 2525 DL.getTypeStoreSize(IntTy).getFixedValue() && 2526 "Element extends past full value"); 2527 uint64_t ShAmt = 8 * Offset; 2528 if (DL.isBigEndian()) 2529 ShAmt = 8 * (DL.getTypeStoreSize(IntTy).getFixedValue() - 2530 DL.getTypeStoreSize(Ty).getFixedValue() - Offset); 2531 if (ShAmt) { 2532 V = IRB.CreateLShr(V, ShAmt, Name + ".shift"); 2533 LLVM_DEBUG(dbgs() << " shifted: " << *V << "\n"); 2534 } 2535 assert(Ty->getBitWidth() <= IntTy->getBitWidth() && 2536 "Cannot extract to a larger integer!"); 2537 if (Ty != IntTy) { 2538 V = IRB.CreateTrunc(V, Ty, Name + ".trunc"); 2539 LLVM_DEBUG(dbgs() << " trunced: " << *V << "\n"); 2540 } 2541 return V; 2542 } 2543 2544 static Value *insertInteger(const DataLayout &DL, IRBuilderTy &IRB, Value *Old, 2545 Value *V, uint64_t Offset, const Twine &Name) { 2546 IntegerType *IntTy = cast<IntegerType>(Old->getType()); 2547 IntegerType *Ty = cast<IntegerType>(V->getType()); 2548 assert(Ty->getBitWidth() <= IntTy->getBitWidth() && 2549 "Cannot insert a larger integer!"); 2550 LLVM_DEBUG(dbgs() << " start: " << *V << "\n"); 2551 if (Ty != IntTy) { 2552 V = IRB.CreateZExt(V, IntTy, Name + ".ext"); 2553 LLVM_DEBUG(dbgs() << " extended: " << *V << "\n"); 2554 } 2555 assert(DL.getTypeStoreSize(Ty).getFixedValue() + Offset <= 2556 DL.getTypeStoreSize(IntTy).getFixedValue() && 2557 "Element store outside of alloca store"); 2558 uint64_t ShAmt = 8 * Offset; 2559 if (DL.isBigEndian()) 2560 ShAmt = 8 * (DL.getTypeStoreSize(IntTy).getFixedValue() - 2561 DL.getTypeStoreSize(Ty).getFixedValue() - Offset); 2562 if (ShAmt) { 2563 V = IRB.CreateShl(V, ShAmt, Name + ".shift"); 2564 LLVM_DEBUG(dbgs() << " shifted: " << *V << "\n"); 2565 } 2566 2567 if (ShAmt || Ty->getBitWidth() < IntTy->getBitWidth()) { 2568 APInt Mask = ~Ty->getMask().zext(IntTy->getBitWidth()).shl(ShAmt); 2569 Old = IRB.CreateAnd(Old, Mask, Name + ".mask"); 2570 LLVM_DEBUG(dbgs() << " masked: " << *Old << "\n"); 2571 V = IRB.CreateOr(Old, V, Name + ".insert"); 2572 LLVM_DEBUG(dbgs() << " inserted: " << *V << "\n"); 2573 } 2574 return V; 2575 } 2576 2577 static Value *extractVector(IRBuilderTy &IRB, Value *V, unsigned BeginIndex, 2578 unsigned EndIndex, const Twine &Name) { 2579 auto *VecTy = cast<FixedVectorType>(V->getType()); 2580 unsigned NumElements = EndIndex - BeginIndex; 2581 assert(NumElements <= VecTy->getNumElements() && "Too many elements!"); 2582 2583 if (NumElements == VecTy->getNumElements()) 2584 return V; 2585 2586 if (NumElements == 1) { 2587 V = IRB.CreateExtractElement(V, IRB.getInt32(BeginIndex), 2588 Name + ".extract"); 2589 LLVM_DEBUG(dbgs() << " extract: " << *V << "\n"); 2590 return V; 2591 } 2592 2593 auto Mask = llvm::to_vector<8>(llvm::seq<int>(BeginIndex, EndIndex)); 2594 V = IRB.CreateShuffleVector(V, Mask, Name + ".extract"); 
2595   LLVM_DEBUG(dbgs() << " shuffle: " << *V << "\n");
2596   return V;
2597 }
2598
2599 static Value *insertVector(IRBuilderTy &IRB, Value *Old, Value *V,
2600                            unsigned BeginIndex, const Twine &Name) {
2601   VectorType *VecTy = cast<VectorType>(Old->getType());
2602   assert(VecTy && "Can only insert a vector into a vector");
2603
2604   VectorType *Ty = dyn_cast<VectorType>(V->getType());
2605   if (!Ty) {
2606     // Single element to insert.
2607     V = IRB.CreateInsertElement(Old, V, IRB.getInt32(BeginIndex),
2608                                 Name + ".insert");
2609     LLVM_DEBUG(dbgs() << " insert: " << *V << "\n");
2610     return V;
2611   }
2612
2613   assert(cast<FixedVectorType>(Ty)->getNumElements() <=
2614          cast<FixedVectorType>(VecTy)->getNumElements() &&
2615          "Too many elements!");
2616   if (cast<FixedVectorType>(Ty)->getNumElements() ==
2617       cast<FixedVectorType>(VecTy)->getNumElements()) {
2618     assert(V->getType() == VecTy && "Vector type mismatch");
2619     return V;
2620   }
2621   unsigned EndIndex = BeginIndex + cast<FixedVectorType>(Ty)->getNumElements();
2622
2623   // When inserting a smaller vector into the larger to store, we first
2624   // use a shuffle vector to widen it with undef elements, and then
2625   // a second shuffle vector to select between the loaded vector and the
2626   // incoming vector.
2627   SmallVector<int, 8> Mask;
2628   Mask.reserve(cast<FixedVectorType>(VecTy)->getNumElements());
2629   for (unsigned i = 0; i != cast<FixedVectorType>(VecTy)->getNumElements(); ++i)
2630     if (i >= BeginIndex && i < EndIndex)
2631       Mask.push_back(i - BeginIndex);
2632     else
2633       Mask.push_back(-1);
2634   V = IRB.CreateShuffleVector(V, Mask, Name + ".expand");
2635   LLVM_DEBUG(dbgs() << " shuffle: " << *V << "\n");
2636
2637   SmallVector<Constant *, 8> Mask2;
2638   Mask2.reserve(cast<FixedVectorType>(VecTy)->getNumElements());
2639   for (unsigned i = 0; i != cast<FixedVectorType>(VecTy)->getNumElements(); ++i)
2640     Mask2.push_back(IRB.getInt1(i >= BeginIndex && i < EndIndex));
2641
2642   V = IRB.CreateSelect(ConstantVector::get(Mask2), V, Old, Name + "blend");
2643
2644   LLVM_DEBUG(dbgs() << " blend: " << *V << "\n");
2645   return V;
2646 }
2647
2648 namespace {
2649
2650 /// Visitor to rewrite instructions using a particular slice of an alloca
2651 /// to use a new alloca.
2652 ///
2653 /// Also implements the rewriting to vector-based accesses when the partition
2654 /// passes the isVectorPromotionViable predicate. Most of the rewriting logic
2655 /// lives here.
2656 class AllocaSliceRewriter : public InstVisitor<AllocaSliceRewriter, bool> {
2657   // Befriend the base class so it can delegate to private visit methods.
2658   friend class InstVisitor<AllocaSliceRewriter, bool>;
2659
2660   using Base = InstVisitor<AllocaSliceRewriter, bool>;
2661
2662   const DataLayout &DL;
2663   AllocaSlices &AS;
2664   SROA &Pass;
2665   AllocaInst &OldAI, &NewAI;
2666   const uint64_t NewAllocaBeginOffset, NewAllocaEndOffset;
2667   Type *NewAllocaTy;
2668
2669   // This is a convenience and flag variable that will be null unless the new
2670   // alloca's integer operations should be widened to this integer type due to
2671   // passing isIntegerWideningViable above. If it is non-null, the desired
2672   // integer type will be stored here for easy access during rewriting.
2673   IntegerType *IntTy;
2674
2675   // If we are rewriting an alloca partition which can be written as pure
2676   // vector operations, we stash extra information here.
When VecTy is 2677 // non-null, we have some strict guarantees about the rewritten alloca: 2678 // - The new alloca is exactly the size of the vector type here. 2679 // - The accesses all either map to the entire vector or to a single 2680 // element. 2681 // - The set of accessing instructions is only one of those handled above 2682 // in isVectorPromotionViable. Generally these are the same access kinds 2683 // which are promotable via mem2reg. 2684 VectorType *VecTy; 2685 Type *ElementTy; 2686 uint64_t ElementSize; 2687 2688 // The original offset of the slice currently being rewritten relative to 2689 // the original alloca. 2690 uint64_t BeginOffset = 0; 2691 uint64_t EndOffset = 0; 2692 2693 // The new offsets of the slice currently being rewritten relative to the 2694 // original alloca. 2695 uint64_t NewBeginOffset = 0, NewEndOffset = 0; 2696 2697 uint64_t SliceSize = 0; 2698 bool IsSplittable = false; 2699 bool IsSplit = false; 2700 Use *OldUse = nullptr; 2701 Instruction *OldPtr = nullptr; 2702 2703 // Track post-rewrite users which are PHI nodes and Selects. 2704 SmallSetVector<PHINode *, 8> &PHIUsers; 2705 SmallSetVector<SelectInst *, 8> &SelectUsers; 2706 2707 // Utility IR builder, whose name prefix is setup for each visited use, and 2708 // the insertion point is set to point to the user. 2709 IRBuilderTy IRB; 2710 2711 // Return the new alloca, addrspacecasted if required to avoid changing the 2712 // addrspace of a volatile access. 2713 Value *getPtrToNewAI(unsigned AddrSpace, bool IsVolatile) { 2714 if (!IsVolatile || AddrSpace == NewAI.getType()->getPointerAddressSpace()) 2715 return &NewAI; 2716 2717 Type *AccessTy = IRB.getPtrTy(AddrSpace); 2718 return IRB.CreateAddrSpaceCast(&NewAI, AccessTy); 2719 } 2720 2721 public: 2722 AllocaSliceRewriter(const DataLayout &DL, AllocaSlices &AS, SROA &Pass, 2723 AllocaInst &OldAI, AllocaInst &NewAI, 2724 uint64_t NewAllocaBeginOffset, 2725 uint64_t NewAllocaEndOffset, bool IsIntegerPromotable, 2726 VectorType *PromotableVecTy, 2727 SmallSetVector<PHINode *, 8> &PHIUsers, 2728 SmallSetVector<SelectInst *, 8> &SelectUsers) 2729 : DL(DL), AS(AS), Pass(Pass), OldAI(OldAI), NewAI(NewAI), 2730 NewAllocaBeginOffset(NewAllocaBeginOffset), 2731 NewAllocaEndOffset(NewAllocaEndOffset), 2732 NewAllocaTy(NewAI.getAllocatedType()), 2733 IntTy( 2734 IsIntegerPromotable 2735 ? Type::getIntNTy(NewAI.getContext(), 2736 DL.getTypeSizeInBits(NewAI.getAllocatedType()) 2737 .getFixedValue()) 2738 : nullptr), 2739 VecTy(PromotableVecTy), 2740 ElementTy(VecTy ? VecTy->getElementType() : nullptr), 2741 ElementSize(VecTy ? DL.getTypeSizeInBits(ElementTy).getFixedValue() / 8 2742 : 0), 2743 PHIUsers(PHIUsers), SelectUsers(SelectUsers), 2744 IRB(NewAI.getContext(), ConstantFolder()) { 2745 if (VecTy) { 2746 assert((DL.getTypeSizeInBits(ElementTy).getFixedValue() % 8) == 0 && 2747 "Only multiple-of-8 sized vector elements are viable"); 2748 ++NumVectorized; 2749 } 2750 assert((!IntTy && !VecTy) || (IntTy && !VecTy) || (!IntTy && VecTy)); 2751 } 2752 2753 bool visit(AllocaSlices::const_iterator I) { 2754 bool CanSROA = true; 2755 BeginOffset = I->beginOffset(); 2756 EndOffset = I->endOffset(); 2757 IsSplittable = I->isSplittable(); 2758 IsSplit = 2759 BeginOffset < NewAllocaBeginOffset || EndOffset > NewAllocaEndOffset; 2760 LLVM_DEBUG(dbgs() << " rewriting " << (IsSplit ? "split " : "")); 2761 LLVM_DEBUG(AS.printSlice(dbgs(), I, "")); 2762 LLVM_DEBUG(dbgs() << "\n"); 2763 2764 // Compute the intersecting offset range. 
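    // E.g. (hypothetical numbers) a slice covering [4,12) rewritten against a
    // new alloca spanning [8,16) intersects at [8,12): NewBeginOffset becomes
    // 8, NewEndOffset 12, and SliceSize 4.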
2765     assert(BeginOffset < NewAllocaEndOffset);
2766     assert(EndOffset > NewAllocaBeginOffset);
2767     NewBeginOffset = std::max(BeginOffset, NewAllocaBeginOffset);
2768     NewEndOffset = std::min(EndOffset, NewAllocaEndOffset);
2769
2770     SliceSize = NewEndOffset - NewBeginOffset;
2771     LLVM_DEBUG(dbgs() << " Begin:(" << BeginOffset << ", " << EndOffset
2772                       << ") NewBegin:(" << NewBeginOffset << ", "
2773                       << NewEndOffset << ") NewAllocaBegin:("
2774                       << NewAllocaBeginOffset << ", " << NewAllocaEndOffset
2775                       << ")\n");
2776     assert(IsSplit || NewBeginOffset == BeginOffset);
2777     OldUse = I->getUse();
2778     OldPtr = cast<Instruction>(OldUse->get());
2779
2780     Instruction *OldUserI = cast<Instruction>(OldUse->getUser());
2781     IRB.SetInsertPoint(OldUserI);
2782     IRB.SetCurrentDebugLocation(OldUserI->getDebugLoc());
2783     IRB.getInserter().SetNamePrefix(
2784         Twine(NewAI.getName()) + "." + Twine(BeginOffset) + ".");
2785
2786     CanSROA &= visit(cast<Instruction>(OldUse->getUser()));
2787     if (VecTy || IntTy)
2788       assert(CanSROA);
2789     return CanSROA;
2790   }
2791
2792 private:
2793   // Make sure the other visit overloads are visible.
2794   using Base::visit;
2795
2796   // Every instruction which can end up as a user must have a rewrite rule.
2797   bool visitInstruction(Instruction &I) {
2798     LLVM_DEBUG(dbgs() << " !!!! Cannot rewrite: " << I << "\n");
2799     llvm_unreachable("No rewrite rule for this instruction!");
2800   }
2801
2802   Value *getNewAllocaSlicePtr(IRBuilderTy &IRB, Type *PointerTy) {
2803     // Note that the offset computation can use BeginOffset or NewBeginOffset
2804     // interchangeably for unsplit slices.
2805     assert(IsSplit || BeginOffset == NewBeginOffset);
2806     uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
2807
2808 #ifndef NDEBUG
2809     StringRef OldName = OldPtr->getName();
2810     // Skip through the last '.sroa.' component of the name.
2811     size_t LastSROAPrefix = OldName.rfind(".sroa.");
2812     if (LastSROAPrefix != StringRef::npos) {
2813       OldName = OldName.substr(LastSROAPrefix + strlen(".sroa."));
2814       // Look for an SROA slice index.
2815       size_t IndexEnd = OldName.find_first_not_of("0123456789");
2816       if (IndexEnd != StringRef::npos && OldName[IndexEnd] == '.') {
2817         // Strip the index and look for the offset.
2818         OldName = OldName.substr(IndexEnd + 1);
2819         size_t OffsetEnd = OldName.find_first_not_of("0123456789");
2820         if (OffsetEnd != StringRef::npos && OldName[OffsetEnd] == '.')
2821           // Strip the offset.
2822           OldName = OldName.substr(OffsetEnd + 1);
2823       }
2824     }
2825     // Strip any SROA suffixes as well.
2826     OldName = OldName.substr(0, OldName.find(".sroa_"));
2827 #endif
2828
2829     return getAdjustedPtr(IRB, DL, &NewAI,
2830                           APInt(DL.getIndexTypeSizeInBits(PointerTy), Offset),
2831                           PointerTy,
2832 #ifndef NDEBUG
2833                           Twine(OldName) + "."
2834 #else
2835                           Twine()
2836 #endif
2837     );
2838   }
2839
2840   /// Compute suitable alignment to access this slice of the *new*
2841   /// alloca.
2842   ///
2843   /// This is simply the new alloca's alignment reduced (via commonAlignment)
2844   /// by the offset of the slice currently being rewritten within it.
2845 Align getSliceAlign() { 2846 return commonAlignment(NewAI.getAlign(), 2847 NewBeginOffset - NewAllocaBeginOffset); 2848 } 2849 2850 unsigned getIndex(uint64_t Offset) { 2851 assert(VecTy && "Can only call getIndex when rewriting a vector"); 2852 uint64_t RelOffset = Offset - NewAllocaBeginOffset; 2853 assert(RelOffset / ElementSize < UINT32_MAX && "Index out of bounds"); 2854 uint32_t Index = RelOffset / ElementSize; 2855 assert(Index * ElementSize == RelOffset); 2856 return Index; 2857 } 2858 2859 void deleteIfTriviallyDead(Value *V) { 2860 Instruction *I = cast<Instruction>(V); 2861 if (isInstructionTriviallyDead(I)) 2862 Pass.DeadInsts.push_back(I); 2863 } 2864 2865 Value *rewriteVectorizedLoadInst(LoadInst &LI) { 2866 unsigned BeginIndex = getIndex(NewBeginOffset); 2867 unsigned EndIndex = getIndex(NewEndOffset); 2868 assert(EndIndex > BeginIndex && "Empty vector!"); 2869 2870 LoadInst *Load = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI, 2871 NewAI.getAlign(), "load"); 2872 2873 Load->copyMetadata(LI, {LLVMContext::MD_mem_parallel_loop_access, 2874 LLVMContext::MD_access_group}); 2875 return extractVector(IRB, Load, BeginIndex, EndIndex, "vec"); 2876 } 2877 2878 Value *rewriteIntegerLoad(LoadInst &LI) { 2879 assert(IntTy && "We cannot insert an integer to the alloca"); 2880 assert(!LI.isVolatile()); 2881 Value *V = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI, 2882 NewAI.getAlign(), "load"); 2883 V = convertValue(DL, IRB, V, IntTy); 2884 assert(NewBeginOffset >= NewAllocaBeginOffset && "Out of bounds offset"); 2885 uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset; 2886 if (Offset > 0 || NewEndOffset < NewAllocaEndOffset) { 2887 IntegerType *ExtractTy = Type::getIntNTy(LI.getContext(), SliceSize * 8); 2888 V = extractInteger(DL, IRB, V, ExtractTy, Offset, "extract"); 2889 } 2890 // It is possible that the extracted type is not the load type. This 2891 // happens if there is a load past the end of the alloca, and as 2892 // a consequence the slice is narrower but still a candidate for integer 2893 // lowering. To handle this case, we just zero extend the extracted 2894 // integer. 2895 assert(cast<IntegerType>(LI.getType())->getBitWidth() >= SliceSize * 8 && 2896 "Can only handle an extract for an overly wide load"); 2897 if (cast<IntegerType>(LI.getType())->getBitWidth() > SliceSize * 8) 2898 V = IRB.CreateZExt(V, LI.getType()); 2899 return V; 2900 } 2901 2902 bool visitLoadInst(LoadInst &LI) { 2903 LLVM_DEBUG(dbgs() << " original: " << LI << "\n"); 2904 Value *OldOp = LI.getOperand(0); 2905 assert(OldOp == OldPtr); 2906 2907 AAMDNodes AATags = LI.getAAMetadata(); 2908 2909 unsigned AS = LI.getPointerAddressSpace(); 2910 2911 Type *TargetTy = IsSplit ? 
Type::getIntNTy(LI.getContext(), SliceSize * 8) 2912 : LI.getType(); 2913 const bool IsLoadPastEnd = 2914 DL.getTypeStoreSize(TargetTy).getFixedValue() > SliceSize; 2915 bool IsPtrAdjusted = false; 2916 Value *V; 2917 if (VecTy) { 2918 V = rewriteVectorizedLoadInst(LI); 2919 } else if (IntTy && LI.getType()->isIntegerTy()) { 2920 V = rewriteIntegerLoad(LI); 2921 } else if (NewBeginOffset == NewAllocaBeginOffset && 2922 NewEndOffset == NewAllocaEndOffset && 2923 (canConvertValue(DL, NewAllocaTy, TargetTy) || 2924 (IsLoadPastEnd && NewAllocaTy->isIntegerTy() && 2925 TargetTy->isIntegerTy() && !LI.isVolatile()))) { 2926 Value *NewPtr = 2927 getPtrToNewAI(LI.getPointerAddressSpace(), LI.isVolatile()); 2928 LoadInst *NewLI = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), NewPtr, 2929 NewAI.getAlign(), LI.isVolatile(), 2930 LI.getName()); 2931 if (LI.isVolatile()) 2932 NewLI->setAtomic(LI.getOrdering(), LI.getSyncScopeID()); 2933 if (NewLI->isAtomic()) 2934 NewLI->setAlignment(LI.getAlign()); 2935 2936 // Copy any metadata that is valid for the new load. This may require 2937 // conversion to a different kind of metadata, e.g. !nonnull might change 2938 // to !range or vice versa. 2939 copyMetadataForLoad(*NewLI, LI); 2940 2941 // Do this after copyMetadataForLoad() to preserve the TBAA shift. 2942 if (AATags) 2943 NewLI->setAAMetadata(AATags.shift(NewBeginOffset - BeginOffset)); 2944 2945 // Try to preserve nonnull metadata 2946 V = NewLI; 2947 2948 // If this is an integer load past the end of the slice (which means the 2949 // bytes outside the slice are undef or this load is dead) just forcibly 2950 // fix the integer size with correct handling of endianness. 2951 if (auto *AITy = dyn_cast<IntegerType>(NewAllocaTy)) 2952 if (auto *TITy = dyn_cast<IntegerType>(TargetTy)) 2953 if (AITy->getBitWidth() < TITy->getBitWidth()) { 2954 V = IRB.CreateZExt(V, TITy, "load.ext"); 2955 if (DL.isBigEndian()) 2956 V = IRB.CreateShl(V, TITy->getBitWidth() - AITy->getBitWidth(), 2957 "endian_shift"); 2958 } 2959 } else { 2960 Type *LTy = IRB.getPtrTy(AS); 2961 LoadInst *NewLI = 2962 IRB.CreateAlignedLoad(TargetTy, getNewAllocaSlicePtr(IRB, LTy), 2963 getSliceAlign(), LI.isVolatile(), LI.getName()); 2964 if (AATags) 2965 NewLI->setAAMetadata(AATags.shift(NewBeginOffset - BeginOffset)); 2966 if (LI.isVolatile()) 2967 NewLI->setAtomic(LI.getOrdering(), LI.getSyncScopeID()); 2968 NewLI->copyMetadata(LI, {LLVMContext::MD_mem_parallel_loop_access, 2969 LLVMContext::MD_access_group}); 2970 2971 V = NewLI; 2972 IsPtrAdjusted = true; 2973 } 2974 V = convertValue(DL, IRB, V, TargetTy); 2975 2976 if (IsSplit) { 2977 assert(!LI.isVolatile()); 2978 assert(LI.getType()->isIntegerTy() && 2979 "Only integer type loads and stores are split"); 2980 assert(SliceSize < DL.getTypeStoreSize(LI.getType()).getFixedValue() && 2981 "Split load isn't smaller than original load"); 2982 assert(DL.typeSizeEqualsStoreSize(LI.getType()) && 2983 "Non-byte-multiple bit width"); 2984 // Move the insertion point just past the load so that we can refer to it. 2985 IRB.SetInsertPoint(&*std::next(BasicBlock::iterator(&LI))); 2986 // Create a placeholder value with the same type as LI to use as the 2987 // basis for the new value. This allows us to replace the uses of LI with 2988 // the computed value, and then replace the placeholder with LI, leaving 2989 // LI only used for this computation. 
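// For instance, when an i64 load is split across two i32 partitions, each
// visit folds its narrow loaded value into the wide value via insertInteger;
// substituting LI back in for the placeholder lets the original load keep
// supplying the bytes whose slices have not been rewritten yet.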
2990 Value *Placeholder = 2991 new LoadInst(LI.getType(), PoisonValue::get(IRB.getPtrTy(AS)), "", 2992 false, Align(1)); 2993 V = insertInteger(DL, IRB, Placeholder, V, NewBeginOffset - BeginOffset, 2994 "insert"); 2995 LI.replaceAllUsesWith(V); 2996 Placeholder->replaceAllUsesWith(&LI); 2997 Placeholder->deleteValue(); 2998 } else { 2999 LI.replaceAllUsesWith(V); 3000 } 3001 3002 Pass.DeadInsts.push_back(&LI); 3003 deleteIfTriviallyDead(OldOp); 3004 LLVM_DEBUG(dbgs() << " to: " << *V << "\n"); 3005 return !LI.isVolatile() && !IsPtrAdjusted; 3006 } 3007 3008 bool rewriteVectorizedStoreInst(Value *V, StoreInst &SI, Value *OldOp, 3009 AAMDNodes AATags) { 3010 // Capture V for the purpose of debug-info accounting once it's converted 3011 // to a vector store. 3012 Value *OrigV = V; 3013 if (V->getType() != VecTy) { 3014 unsigned BeginIndex = getIndex(NewBeginOffset); 3015 unsigned EndIndex = getIndex(NewEndOffset); 3016 assert(EndIndex > BeginIndex && "Empty vector!"); 3017 unsigned NumElements = EndIndex - BeginIndex; 3018 assert(NumElements <= cast<FixedVectorType>(VecTy)->getNumElements() && 3019 "Too many elements!"); 3020 Type *SliceTy = (NumElements == 1) 3021 ? ElementTy 3022 : FixedVectorType::get(ElementTy, NumElements); 3023 if (V->getType() != SliceTy) 3024 V = convertValue(DL, IRB, V, SliceTy); 3025 3026 // Mix in the existing elements. 3027 Value *Old = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI, 3028 NewAI.getAlign(), "load"); 3029 V = insertVector(IRB, Old, V, BeginIndex, "vec"); 3030 } 3031 StoreInst *Store = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlign()); 3032 Store->copyMetadata(SI, {LLVMContext::MD_mem_parallel_loop_access, 3033 LLVMContext::MD_access_group}); 3034 if (AATags) 3035 Store->setAAMetadata(AATags.shift(NewBeginOffset - BeginOffset)); 3036 Pass.DeadInsts.push_back(&SI); 3037 3038 // NOTE: Careful to use OrigV rather than V. 
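// By this point V may have been widened into a whole-vector value; the
// debug-info update should describe only what the original store wrote.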
3039 migrateDebugInfo(&OldAI, IsSplit, NewBeginOffset * 8, SliceSize * 8, &SI, 3040 Store, Store->getPointerOperand(), OrigV, DL); 3041 LLVM_DEBUG(dbgs() << " to: " << *Store << "\n"); 3042 return true; 3043 } 3044 3045 bool rewriteIntegerStore(Value *V, StoreInst &SI, AAMDNodes AATags) { 3046 assert(IntTy && "We cannot extract an integer from the alloca"); 3047 assert(!SI.isVolatile()); 3048 if (DL.getTypeSizeInBits(V->getType()).getFixedValue() != 3049 IntTy->getBitWidth()) { 3050 Value *Old = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI, 3051 NewAI.getAlign(), "oldload"); 3052 Old = convertValue(DL, IRB, Old, IntTy); 3053 assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset"); 3054 uint64_t Offset = BeginOffset - NewAllocaBeginOffset; 3055 V = insertInteger(DL, IRB, Old, SI.getValueOperand(), Offset, "insert"); 3056 } 3057 V = convertValue(DL, IRB, V, NewAllocaTy); 3058 StoreInst *Store = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlign()); 3059 Store->copyMetadata(SI, {LLVMContext::MD_mem_parallel_loop_access, 3060 LLVMContext::MD_access_group}); 3061 if (AATags) 3062 Store->setAAMetadata(AATags.shift(NewBeginOffset - BeginOffset)); 3063 3064 migrateDebugInfo(&OldAI, IsSplit, NewBeginOffset * 8, SliceSize * 8, &SI, 3065 Store, Store->getPointerOperand(), 3066 Store->getValueOperand(), DL); 3067 3068 Pass.DeadInsts.push_back(&SI); 3069 LLVM_DEBUG(dbgs() << " to: " << *Store << "\n"); 3070 return true; 3071 } 3072 3073 bool visitStoreInst(StoreInst &SI) { 3074 LLVM_DEBUG(dbgs() << " original: " << SI << "\n"); 3075 Value *OldOp = SI.getOperand(1); 3076 assert(OldOp == OldPtr); 3077 3078 AAMDNodes AATags = SI.getAAMetadata(); 3079 Value *V = SI.getValueOperand(); 3080 3081 // Strip all inbounds GEPs and pointer casts to try to dig out any root 3082 // alloca that should be re-examined after promoting this alloca. 
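// For example, if the stored value is the address of some other alloca,
// removing this store during promotion may be exactly what makes that other
// alloca promotable, so it is queued for re-examination.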
3083 if (V->getType()->isPointerTy()) 3084 if (AllocaInst *AI = dyn_cast<AllocaInst>(V->stripInBoundsOffsets())) 3085 Pass.PostPromotionWorklist.insert(AI); 3086 3087 if (SliceSize < DL.getTypeStoreSize(V->getType()).getFixedValue()) { 3088 assert(!SI.isVolatile()); 3089 assert(V->getType()->isIntegerTy() && 3090 "Only integer type loads and stores are split"); 3091 assert(DL.typeSizeEqualsStoreSize(V->getType()) && 3092 "Non-byte-multiple bit width"); 3093 IntegerType *NarrowTy = Type::getIntNTy(SI.getContext(), SliceSize * 8); 3094 V = extractInteger(DL, IRB, V, NarrowTy, NewBeginOffset - BeginOffset, 3095 "extract"); 3096 } 3097 3098 if (VecTy) 3099 return rewriteVectorizedStoreInst(V, SI, OldOp, AATags); 3100 if (IntTy && V->getType()->isIntegerTy()) 3101 return rewriteIntegerStore(V, SI, AATags); 3102 3103 StoreInst *NewSI; 3104 if (NewBeginOffset == NewAllocaBeginOffset && 3105 NewEndOffset == NewAllocaEndOffset && 3106 canConvertValue(DL, V->getType(), NewAllocaTy)) { 3107 V = convertValue(DL, IRB, V, NewAllocaTy); 3108 Value *NewPtr = 3109 getPtrToNewAI(SI.getPointerAddressSpace(), SI.isVolatile()); 3110 3111 NewSI = 3112 IRB.CreateAlignedStore(V, NewPtr, NewAI.getAlign(), SI.isVolatile()); 3113 } else { 3114 unsigned AS = SI.getPointerAddressSpace(); 3115 Value *NewPtr = getNewAllocaSlicePtr(IRB, IRB.getPtrTy(AS)); 3116 NewSI = 3117 IRB.CreateAlignedStore(V, NewPtr, getSliceAlign(), SI.isVolatile()); 3118 } 3119 NewSI->copyMetadata(SI, {LLVMContext::MD_mem_parallel_loop_access, 3120 LLVMContext::MD_access_group}); 3121 if (AATags) 3122 NewSI->setAAMetadata(AATags.shift(NewBeginOffset - BeginOffset)); 3123 if (SI.isVolatile()) 3124 NewSI->setAtomic(SI.getOrdering(), SI.getSyncScopeID()); 3125 if (NewSI->isAtomic()) 3126 NewSI->setAlignment(SI.getAlign()); 3127 3128 migrateDebugInfo(&OldAI, IsSplit, NewBeginOffset * 8, SliceSize * 8, &SI, 3129 NewSI, NewSI->getPointerOperand(), 3130 NewSI->getValueOperand(), DL); 3131 3132 Pass.DeadInsts.push_back(&SI); 3133 deleteIfTriviallyDead(OldOp); 3134 3135 LLVM_DEBUG(dbgs() << " to: " << *NewSI << "\n"); 3136 return NewSI->getPointerOperand() == &NewAI && 3137 NewSI->getValueOperand()->getType() == NewAllocaTy && 3138 !SI.isVolatile(); 3139 } 3140 3141 /// Compute an integer value from splatting an i8 across the given 3142 /// number of bytes. 3143 /// 3144 /// Note that this routine assumes an i8 is a byte. If that isn't true, don't 3145 /// call this routine. 3146 /// FIXME: Heed the advice above. 3147 /// 3148 /// \param V The i8 value to splat. 3149 /// \param Size The number of bytes in the output (assuming i8 is one byte) 3150 Value *getIntegerSplat(Value *V, unsigned Size) { 3151 assert(Size > 0 && "Expected a positive number of bytes."); 3152 IntegerType *VTy = cast<IntegerType>(V->getType()); 3153 assert(VTy->getBitWidth() == 8 && "Expected an i8 value for the byte"); 3154 if (Size == 1) 3155 return V; 3156 3157 Type *SplatIntTy = Type::getIntNTy(VTy->getContext(), Size * 8); 3158 V = IRB.CreateMul( 3159 IRB.CreateZExt(V, SplatIntTy, "zext"), 3160 IRB.CreateUDiv(Constant::getAllOnesValue(SplatIntTy), 3161 IRB.CreateZExt(Constant::getAllOnesValue(V->getType()), 3162 SplatIntTy)), 3163 "isplat"); 3164 return V; 3165 } 3166 3167 /// Compute a vector splat for a given element value. 
3168 Value *getVectorSplat(Value *V, unsigned NumElements) { 3169 V = IRB.CreateVectorSplat(NumElements, V, "vsplat"); 3170 LLVM_DEBUG(dbgs() << " splat: " << *V << "\n"); 3171 return V; 3172 } 3173 3174 bool visitMemSetInst(MemSetInst &II) { 3175 LLVM_DEBUG(dbgs() << " original: " << II << "\n"); 3176 assert(II.getRawDest() == OldPtr); 3177 3178 AAMDNodes AATags = II.getAAMetadata(); 3179 3180 // If the memset has a variable size, it cannot be split, just adjust the 3181 // pointer to the new alloca. 3182 if (!isa<ConstantInt>(II.getLength())) { 3183 assert(!IsSplit); 3184 assert(NewBeginOffset == BeginOffset); 3185 II.setDest(getNewAllocaSlicePtr(IRB, OldPtr->getType())); 3186 II.setDestAlignment(getSliceAlign()); 3187 // In theory we should call migrateDebugInfo here. However, we do not 3188 // emit dbg.assign intrinsics for mem intrinsics storing through non- 3189 // constant geps, or storing a variable number of bytes. 3190 assert(at::getAssignmentMarkers(&II).empty() && 3191 at::getDPVAssignmentMarkers(&II).empty() && 3192 "AT: Unexpected link to non-const GEP"); 3193 deleteIfTriviallyDead(OldPtr); 3194 return false; 3195 } 3196 3197 // Record this instruction for deletion. 3198 Pass.DeadInsts.push_back(&II); 3199 3200 Type *AllocaTy = NewAI.getAllocatedType(); 3201 Type *ScalarTy = AllocaTy->getScalarType(); 3202 3203 const bool CanContinue = [&]() { 3204 if (VecTy || IntTy) 3205 return true; 3206 if (BeginOffset > NewAllocaBeginOffset || 3207 EndOffset < NewAllocaEndOffset) 3208 return false; 3209 // Length must be in range for FixedVectorType. 3210 auto *C = cast<ConstantInt>(II.getLength()); 3211 const uint64_t Len = C->getLimitedValue(); 3212 if (Len > std::numeric_limits<unsigned>::max()) 3213 return false; 3214 auto *Int8Ty = IntegerType::getInt8Ty(NewAI.getContext()); 3215 auto *SrcTy = FixedVectorType::get(Int8Ty, Len); 3216 return canConvertValue(DL, SrcTy, AllocaTy) && 3217 DL.isLegalInteger(DL.getTypeSizeInBits(ScalarTy).getFixedValue()); 3218 }(); 3219 3220 // If this doesn't map cleanly onto the alloca type, and that type isn't 3221 // a single value type, just emit a memset. 3222 if (!CanContinue) { 3223 Type *SizeTy = II.getLength()->getType(); 3224 Constant *Size = ConstantInt::get(SizeTy, NewEndOffset - NewBeginOffset); 3225 MemIntrinsic *New = cast<MemIntrinsic>(IRB.CreateMemSet( 3226 getNewAllocaSlicePtr(IRB, OldPtr->getType()), II.getValue(), Size, 3227 MaybeAlign(getSliceAlign()), II.isVolatile())); 3228 if (AATags) 3229 New->setAAMetadata(AATags.shift(NewBeginOffset - BeginOffset)); 3230 3231 migrateDebugInfo(&OldAI, IsSplit, NewBeginOffset * 8, SliceSize * 8, &II, 3232 New, New->getRawDest(), nullptr, DL); 3233 3234 LLVM_DEBUG(dbgs() << " to: " << *New << "\n"); 3235 return false; 3236 } 3237 3238 // If we can represent this as a simple value, we have to build the actual 3239 // value to store, which requires expanding the byte present in memset to 3240 // a sensible representation for the alloca type. This is essentially 3241 // splatting the byte to a sufficiently wide integer, splatting it across 3242 // any desired vector width, and bitcasting to the final type. 3243 Value *V; 3244 3245 if (VecTy) { 3246 // If this is a memset of a vectorized alloca, insert it. 
3247 assert(ElementTy == ScalarTy); 3248 3249 unsigned BeginIndex = getIndex(NewBeginOffset); 3250 unsigned EndIndex = getIndex(NewEndOffset); 3251 assert(EndIndex > BeginIndex && "Empty vector!"); 3252 unsigned NumElements = EndIndex - BeginIndex; 3253 assert(NumElements <= cast<FixedVectorType>(VecTy)->getNumElements() && 3254 "Too many elements!"); 3255 3256 Value *Splat = getIntegerSplat( 3257 II.getValue(), DL.getTypeSizeInBits(ElementTy).getFixedValue() / 8); 3258 Splat = convertValue(DL, IRB, Splat, ElementTy); 3259 if (NumElements > 1) 3260 Splat = getVectorSplat(Splat, NumElements); 3261 3262 Value *Old = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI, 3263 NewAI.getAlign(), "oldload"); 3264 V = insertVector(IRB, Old, Splat, BeginIndex, "vec"); 3265 } else if (IntTy) { 3266 // If this is a memset on an alloca where we can widen stores, insert the 3267 // set integer. 3268 assert(!II.isVolatile()); 3269 3270 uint64_t Size = NewEndOffset - NewBeginOffset; 3271 V = getIntegerSplat(II.getValue(), Size); 3272 3273 if (IntTy && (BeginOffset != NewAllocaBeginOffset || 3274 EndOffset != NewAllocaEndOffset)) { 3275 Value *Old = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI, 3276 NewAI.getAlign(), "oldload"); 3277 Old = convertValue(DL, IRB, Old, IntTy); 3278 uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset; 3279 V = insertInteger(DL, IRB, Old, V, Offset, "insert"); 3280 } else { 3281 assert(V->getType() == IntTy && 3282 "Wrong type for an alloca wide integer!"); 3283 } 3284 V = convertValue(DL, IRB, V, AllocaTy); 3285 } else { 3286 // Established these invariants above. 3287 assert(NewBeginOffset == NewAllocaBeginOffset); 3288 assert(NewEndOffset == NewAllocaEndOffset); 3289 3290 V = getIntegerSplat(II.getValue(), 3291 DL.getTypeSizeInBits(ScalarTy).getFixedValue() / 8); 3292 if (VectorType *AllocaVecTy = dyn_cast<VectorType>(AllocaTy)) 3293 V = getVectorSplat( 3294 V, cast<FixedVectorType>(AllocaVecTy)->getNumElements()); 3295 3296 V = convertValue(DL, IRB, V, AllocaTy); 3297 } 3298 3299 Value *NewPtr = getPtrToNewAI(II.getDestAddressSpace(), II.isVolatile()); 3300 StoreInst *New = 3301 IRB.CreateAlignedStore(V, NewPtr, NewAI.getAlign(), II.isVolatile()); 3302 New->copyMetadata(II, {LLVMContext::MD_mem_parallel_loop_access, 3303 LLVMContext::MD_access_group}); 3304 if (AATags) 3305 New->setAAMetadata(AATags.shift(NewBeginOffset - BeginOffset)); 3306 3307 migrateDebugInfo(&OldAI, IsSplit, NewBeginOffset * 8, SliceSize * 8, &II, 3308 New, New->getPointerOperand(), V, DL); 3309 3310 LLVM_DEBUG(dbgs() << " to: " << *New << "\n"); 3311 return !II.isVolatile(); 3312 } 3313 3314 bool visitMemTransferInst(MemTransferInst &II) { 3315 // Rewriting of memory transfer instructions can be a bit tricky. We break 3316 // them into two categories: split intrinsics and unsplit intrinsics. 3317 3318 LLVM_DEBUG(dbgs() << " original: " << II << "\n"); 3319 3320 AAMDNodes AATags = II.getAAMetadata(); 3321 3322 bool IsDest = &II.getRawDestUse() == OldUse; 3323 assert((IsDest && II.getRawDest() == OldPtr) || 3324 (!IsDest && II.getRawSource() == OldPtr)); 3325 3326 Align SliceAlign = getSliceAlign(); 3327 // For unsplit intrinsics, we simply modify the source and destination 3328 // pointers in place. This isn't just an optimization, it is a matter of 3329 // correctness. With unsplit intrinsics we may be dealing with transfers 3330 // within a single alloca before SROA ran, or with transfers that have 3331 // a variable length.
We may also be dealing with memmove instead of 3332 // memcpy, and so simply updating the pointers is the necessary for us to 3333 // update both source and dest of a single call. 3334 if (!IsSplittable) { 3335 Value *AdjustedPtr = getNewAllocaSlicePtr(IRB, OldPtr->getType()); 3336 if (IsDest) { 3337 // Update the address component of linked dbg.assigns. 3338 auto UpdateAssignAddress = [&](auto *DbgAssign) { 3339 if (llvm::is_contained(DbgAssign->location_ops(), II.getDest()) || 3340 DbgAssign->getAddress() == II.getDest()) 3341 DbgAssign->replaceVariableLocationOp(II.getDest(), AdjustedPtr); 3342 }; 3343 for_each(at::getAssignmentMarkers(&II), UpdateAssignAddress); 3344 for_each(at::getDPVAssignmentMarkers(&II), UpdateAssignAddress); 3345 II.setDest(AdjustedPtr); 3346 II.setDestAlignment(SliceAlign); 3347 } else { 3348 II.setSource(AdjustedPtr); 3349 II.setSourceAlignment(SliceAlign); 3350 } 3351 3352 LLVM_DEBUG(dbgs() << " to: " << II << "\n"); 3353 deleteIfTriviallyDead(OldPtr); 3354 return false; 3355 } 3356 // For split transfer intrinsics we have an incredibly useful assurance: 3357 // the source and destination do not reside within the same alloca, and at 3358 // least one of them does not escape. This means that we can replace 3359 // memmove with memcpy, and we don't need to worry about all manner of 3360 // downsides to splitting and transforming the operations. 3361 3362 // If this doesn't map cleanly onto the alloca type, and that type isn't 3363 // a single value type, just emit a memcpy. 3364 bool EmitMemCpy = 3365 !VecTy && !IntTy && 3366 (BeginOffset > NewAllocaBeginOffset || EndOffset < NewAllocaEndOffset || 3367 SliceSize != 3368 DL.getTypeStoreSize(NewAI.getAllocatedType()).getFixedValue() || 3369 !DL.typeSizeEqualsStoreSize(NewAI.getAllocatedType()) || 3370 !NewAI.getAllocatedType()->isSingleValueType()); 3371 3372 // If we're just going to emit a memcpy, the alloca hasn't changed, and the 3373 // size hasn't been shrunk based on analysis of the viable range, this is 3374 // a no-op. 3375 if (EmitMemCpy && &OldAI == &NewAI) { 3376 // Ensure the start lines up. 3377 assert(NewBeginOffset == BeginOffset); 3378 3379 // Rewrite the size as needed. 3380 if (NewEndOffset != EndOffset) 3381 II.setLength(ConstantInt::get(II.getLength()->getType(), 3382 NewEndOffset - NewBeginOffset)); 3383 return false; 3384 } 3385 // Record this instruction for deletion. 3386 Pass.DeadInsts.push_back(&II); 3387 3388 // Strip all inbounds GEPs and pointer casts to try to dig out any root 3389 // alloca that should be re-examined after rewriting this instruction. 3390 Value *OtherPtr = IsDest ? II.getRawSource() : II.getRawDest(); 3391 if (AllocaInst *AI = 3392 dyn_cast<AllocaInst>(OtherPtr->stripInBoundsOffsets())) { 3393 assert(AI != &OldAI && AI != &NewAI && 3394 "Splittable transfers cannot reach the same alloca on both ends."); 3395 Pass.Worklist.insert(AI); 3396 } 3397 3398 Type *OtherPtrTy = OtherPtr->getType(); 3399 unsigned OtherAS = OtherPtrTy->getPointerAddressSpace(); 3400 3401 // Compute the relative offset for the other pointer within the transfer. 3402 unsigned OffsetWidth = DL.getIndexSizeInBits(OtherAS); 3403 APInt OtherOffset(OffsetWidth, NewBeginOffset - BeginOffset); 3404 Align OtherAlign = 3405 (IsDest ? 
II.getSourceAlign() : II.getDestAlign()).valueOrOne(); 3406 OtherAlign = 3407 commonAlignment(OtherAlign, OtherOffset.zextOrTrunc(64).getZExtValue()); 3408 3409 if (EmitMemCpy) { 3410 // Compute the other pointer, folding as much as possible to produce 3411 // a single, simple GEP in most cases. 3412 OtherPtr = getAdjustedPtr(IRB, DL, OtherPtr, OtherOffset, OtherPtrTy, 3413 OtherPtr->getName() + "."); 3414 3415 Value *OurPtr = getNewAllocaSlicePtr(IRB, OldPtr->getType()); 3416 Type *SizeTy = II.getLength()->getType(); 3417 Constant *Size = ConstantInt::get(SizeTy, NewEndOffset - NewBeginOffset); 3418 3419 Value *DestPtr, *SrcPtr; 3420 MaybeAlign DestAlign, SrcAlign; 3421 // Note: IsDest is true iff we're copying into the new alloca slice 3422 if (IsDest) { 3423 DestPtr = OurPtr; 3424 DestAlign = SliceAlign; 3425 SrcPtr = OtherPtr; 3426 SrcAlign = OtherAlign; 3427 } else { 3428 DestPtr = OtherPtr; 3429 DestAlign = OtherAlign; 3430 SrcPtr = OurPtr; 3431 SrcAlign = SliceAlign; 3432 } 3433 CallInst *New = IRB.CreateMemCpy(DestPtr, DestAlign, SrcPtr, SrcAlign, 3434 Size, II.isVolatile()); 3435 if (AATags) 3436 New->setAAMetadata(AATags.shift(NewBeginOffset - BeginOffset)); 3437 3438 APInt Offset(DL.getIndexTypeSizeInBits(DestPtr->getType()), 0); 3439 if (IsDest) { 3440 migrateDebugInfo(&OldAI, IsSplit, NewBeginOffset * 8, SliceSize * 8, 3441 &II, New, DestPtr, nullptr, DL); 3442 } else if (AllocaInst *Base = dyn_cast<AllocaInst>( 3443 DestPtr->stripAndAccumulateConstantOffsets( 3444 DL, Offset, /*AllowNonInbounds*/ true))) { 3445 migrateDebugInfo(Base, IsSplit, Offset.getZExtValue() * 8, 3446 SliceSize * 8, &II, New, DestPtr, nullptr, DL); 3447 } 3448 LLVM_DEBUG(dbgs() << " to: " << *New << "\n"); 3449 return false; 3450 } 3451 3452 bool IsWholeAlloca = NewBeginOffset == NewAllocaBeginOffset && 3453 NewEndOffset == NewAllocaEndOffset; 3454 uint64_t Size = NewEndOffset - NewBeginOffset; 3455 unsigned BeginIndex = VecTy ? getIndex(NewBeginOffset) : 0; 3456 unsigned EndIndex = VecTy ? getIndex(NewEndOffset) : 0; 3457 unsigned NumElements = EndIndex - BeginIndex; 3458 IntegerType *SubIntTy = 3459 IntTy ? Type::getIntNTy(IntTy->getContext(), Size * 8) : nullptr; 3460 3461 // Reset the other pointer type to match the register type we're going to 3462 // use, but using the address space of the original other pointer. 
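// That register type is the element or sub-vector type for a partial vector
// copy, a narrower integer type for a partial integer copy, and otherwise the
// full type of the new alloca.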
3463 Type *OtherTy; 3464 if (VecTy && !IsWholeAlloca) { 3465 if (NumElements == 1) 3466 OtherTy = VecTy->getElementType(); 3467 else 3468 OtherTy = FixedVectorType::get(VecTy->getElementType(), NumElements); 3469 } else if (IntTy && !IsWholeAlloca) { 3470 OtherTy = SubIntTy; 3471 } else { 3472 OtherTy = NewAllocaTy; 3473 } 3474 3475 Value *AdjPtr = getAdjustedPtr(IRB, DL, OtherPtr, OtherOffset, OtherPtrTy, 3476 OtherPtr->getName() + "."); 3477 MaybeAlign SrcAlign = OtherAlign; 3478 MaybeAlign DstAlign = SliceAlign; 3479 if (!IsDest) 3480 std::swap(SrcAlign, DstAlign); 3481 3482 Value *SrcPtr; 3483 Value *DstPtr; 3484 3485 if (IsDest) { 3486 DstPtr = getPtrToNewAI(II.getDestAddressSpace(), II.isVolatile()); 3487 SrcPtr = AdjPtr; 3488 } else { 3489 DstPtr = AdjPtr; 3490 SrcPtr = getPtrToNewAI(II.getSourceAddressSpace(), II.isVolatile()); 3491 } 3492 3493 Value *Src; 3494 if (VecTy && !IsWholeAlloca && !IsDest) { 3495 Src = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI, 3496 NewAI.getAlign(), "load"); 3497 Src = extractVector(IRB, Src, BeginIndex, EndIndex, "vec"); 3498 } else if (IntTy && !IsWholeAlloca && !IsDest) { 3499 Src = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI, 3500 NewAI.getAlign(), "load"); 3501 Src = convertValue(DL, IRB, Src, IntTy); 3502 uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset; 3503 Src = extractInteger(DL, IRB, Src, SubIntTy, Offset, "extract"); 3504 } else { 3505 LoadInst *Load = IRB.CreateAlignedLoad(OtherTy, SrcPtr, SrcAlign, 3506 II.isVolatile(), "copyload"); 3507 Load->copyMetadata(II, {LLVMContext::MD_mem_parallel_loop_access, 3508 LLVMContext::MD_access_group}); 3509 if (AATags) 3510 Load->setAAMetadata(AATags.shift(NewBeginOffset - BeginOffset)); 3511 Src = Load; 3512 } 3513 3514 if (VecTy && !IsWholeAlloca && IsDest) { 3515 Value *Old = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI, 3516 NewAI.getAlign(), "oldload"); 3517 Src = insertVector(IRB, Old, Src, BeginIndex, "vec"); 3518 } else if (IntTy && !IsWholeAlloca && IsDest) { 3519 Value *Old = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI, 3520 NewAI.getAlign(), "oldload"); 3521 Old = convertValue(DL, IRB, Old, IntTy); 3522 uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset; 3523 Src = insertInteger(DL, IRB, Old, Src, Offset, "insert"); 3524 Src = convertValue(DL, IRB, Src, NewAllocaTy); 3525 } 3526 3527 StoreInst *Store = cast<StoreInst>( 3528 IRB.CreateAlignedStore(Src, DstPtr, DstAlign, II.isVolatile())); 3529 Store->copyMetadata(II, {LLVMContext::MD_mem_parallel_loop_access, 3530 LLVMContext::MD_access_group}); 3531 if (AATags) 3532 Store->setAAMetadata(AATags.shift(NewBeginOffset - BeginOffset)); 3533 3534 APInt Offset(DL.getIndexTypeSizeInBits(DstPtr->getType()), 0); 3535 if (IsDest) { 3536 3537 migrateDebugInfo(&OldAI, IsSplit, NewBeginOffset * 8, SliceSize * 8, &II, 3538 Store, DstPtr, Src, DL); 3539 } else if (AllocaInst *Base = dyn_cast<AllocaInst>( 3540 DstPtr->stripAndAccumulateConstantOffsets( 3541 DL, Offset, /*AllowNonInbounds*/ true))) { 3542 migrateDebugInfo(Base, IsSplit, Offset.getZExtValue() * 8, SliceSize * 8, 3543 &II, Store, DstPtr, Src, DL); 3544 } 3545 3546 LLVM_DEBUG(dbgs() << " to: " << *Store << "\n"); 3547 return !II.isVolatile(); 3548 } 3549 3550 bool visitIntrinsicInst(IntrinsicInst &II) { 3551 assert((II.isLifetimeStartOrEnd() || II.isLaunderOrStripInvariantGroup() || 3552 II.isDroppable()) && 3553 "Unexpected intrinsic!"); 3554 LLVM_DEBUG(dbgs() << " original: " << II << "\n"); 3555 3556 // Record this instruction for 
deletion. 3557 Pass.DeadInsts.push_back(&II); 3558 3559 if (II.isDroppable()) { 3560 assert(II.getIntrinsicID() == Intrinsic::assume && "Expected assume"); 3561 // TODO For now we forget assumed information, this can be improved. 3562 OldPtr->dropDroppableUsesIn(II); 3563 return true; 3564 } 3565 3566 if (II.isLaunderOrStripInvariantGroup()) 3567 return true; 3568 3569 assert(II.getArgOperand(1) == OldPtr); 3570 // Lifetime intrinsics are only promotable if they cover the whole alloca. 3571 // Therefore, we drop lifetime intrinsics which don't cover the whole 3572 // alloca. 3573 // (In theory, intrinsics which partially cover an alloca could be 3574 // promoted, but PromoteMemToReg doesn't handle that case.) 3575 // FIXME: Check whether the alloca is promotable before dropping the 3576 // lifetime intrinsics? 3577 if (NewBeginOffset != NewAllocaBeginOffset || 3578 NewEndOffset != NewAllocaEndOffset) 3579 return true; 3580 3581 ConstantInt *Size = 3582 ConstantInt::get(cast<IntegerType>(II.getArgOperand(0)->getType()), 3583 NewEndOffset - NewBeginOffset); 3584 // Lifetime intrinsics always expect an i8* so directly get such a pointer 3585 // for the new alloca slice. 3586 Type *PointerTy = IRB.getPtrTy(OldPtr->getType()->getPointerAddressSpace()); 3587 Value *Ptr = getNewAllocaSlicePtr(IRB, PointerTy); 3588 Value *New; 3589 if (II.getIntrinsicID() == Intrinsic::lifetime_start) 3590 New = IRB.CreateLifetimeStart(Ptr, Size); 3591 else 3592 New = IRB.CreateLifetimeEnd(Ptr, Size); 3593 3594 (void)New; 3595 LLVM_DEBUG(dbgs() << " to: " << *New << "\n"); 3596 3597 return true; 3598 } 3599 3600 void fixLoadStoreAlign(Instruction &Root) { 3601 // This algorithm implements the same visitor loop as 3602 // hasUnsafePHIOrSelectUse, and fixes the alignment of each load 3603 // or store found. 3604 SmallPtrSet<Instruction *, 4> Visited; 3605 SmallVector<Instruction *, 4> Uses; 3606 Visited.insert(&Root); 3607 Uses.push_back(&Root); 3608 do { 3609 Instruction *I = Uses.pop_back_val(); 3610 3611 if (LoadInst *LI = dyn_cast<LoadInst>(I)) { 3612 LI->setAlignment(std::min(LI->getAlign(), getSliceAlign())); 3613 continue; 3614 } 3615 if (StoreInst *SI = dyn_cast<StoreInst>(I)) { 3616 SI->setAlignment(std::min(SI->getAlign(), getSliceAlign())); 3617 continue; 3618 } 3619 3620 assert(isa<BitCastInst>(I) || isa<AddrSpaceCastInst>(I) || 3621 isa<PHINode>(I) || isa<SelectInst>(I) || 3622 isa<GetElementPtrInst>(I)); 3623 for (User *U : I->users()) 3624 if (Visited.insert(cast<Instruction>(U)).second) 3625 Uses.push_back(cast<Instruction>(U)); 3626 } while (!Uses.empty()); 3627 } 3628 3629 bool visitPHINode(PHINode &PN) { 3630 LLVM_DEBUG(dbgs() << " original: " << PN << "\n"); 3631 assert(BeginOffset >= NewAllocaBeginOffset && "PHIs are unsplittable"); 3632 assert(EndOffset <= NewAllocaEndOffset && "PHIs are unsplittable"); 3633 3634 // We would like to compute a new pointer in only one place, but have it be 3635 // as local as possible to the PHI. To do that, we re-use the location of 3636 // the old pointer, which necessarily must be in the right position to 3637 // dominate the PHI. 3638 IRBuilderBase::InsertPointGuard Guard(IRB); 3639 if (isa<PHINode>(OldPtr)) 3640 IRB.SetInsertPoint(OldPtr->getParent(), 3641 OldPtr->getParent()->getFirstInsertionPt()); 3642 else 3643 IRB.SetInsertPoint(OldPtr); 3644 IRB.SetCurrentDebugLocation(OldPtr->getDebugLoc()); 3645 3646 Value *NewPtr = getNewAllocaSlicePtr(IRB, OldPtr->getType()); 3647 // Replace the operands which were using the old pointer. 
3648 std::replace(PN.op_begin(), PN.op_end(), cast<Value>(OldPtr), NewPtr); 3649 3650 LLVM_DEBUG(dbgs() << " to: " << PN << "\n"); 3651 deleteIfTriviallyDead(OldPtr); 3652 3653 // Fix the alignment of any loads or stores using this PHI node. 3654 fixLoadStoreAlign(PN); 3655 3656 // PHIs can't be promoted on their own, but often can be speculated. We 3657 // check the speculation outside of the rewriter so that we see the 3658 // fully-rewritten alloca. 3659 PHIUsers.insert(&PN); 3660 return true; 3661 } 3662 3663 bool visitSelectInst(SelectInst &SI) { 3664 LLVM_DEBUG(dbgs() << " original: " << SI << "\n"); 3665 assert((SI.getTrueValue() == OldPtr || SI.getFalseValue() == OldPtr) && 3666 "Pointer isn't an operand!"); 3667 assert(BeginOffset >= NewAllocaBeginOffset && "Selects are unsplittable"); 3668 assert(EndOffset <= NewAllocaEndOffset && "Selects are unsplittable"); 3669 3670 Value *NewPtr = getNewAllocaSlicePtr(IRB, OldPtr->getType()); 3671 // Replace the operands which were using the old pointer. 3672 if (SI.getOperand(1) == OldPtr) 3673 SI.setOperand(1, NewPtr); 3674 if (SI.getOperand(2) == OldPtr) 3675 SI.setOperand(2, NewPtr); 3676 3677 LLVM_DEBUG(dbgs() << " to: " << SI << "\n"); 3678 deleteIfTriviallyDead(OldPtr); 3679 3680 // Fix the alignment of any loads or stores using this select. 3681 fixLoadStoreAlign(SI); 3682 3683 // Selects can't be promoted on their own, but often can be speculated. We 3684 // check the speculation outside of the rewriter so that we see the 3685 // fully-rewritten alloca. 3686 SelectUsers.insert(&SI); 3687 return true; 3688 } 3689 }; 3690 3691 /// Visitor to rewrite aggregate loads and stores as scalar. 3692 /// 3693 /// This pass aggressively rewrites all aggregate loads and stores on 3694 /// a particular pointer (or any pointer derived from it which we can identify) 3695 /// with scalar loads and stores. 3696 class AggLoadStoreRewriter : public InstVisitor<AggLoadStoreRewriter, bool> { 3697 // Befriend the base class so it can delegate to private visit methods. 3698 friend class InstVisitor<AggLoadStoreRewriter, bool>; 3699 3700 /// Queue of pointer uses to analyze and potentially rewrite. 3701 SmallVector<Use *, 8> Queue; 3702 3703 /// Set to prevent us from cycling with phi nodes and loops. 3704 SmallPtrSet<User *, 8> Visited; 3705 3706 /// The current pointer use being rewritten. This is used to dig up the used 3707 /// value (as opposed to the user). 3708 Use *U = nullptr; 3709 3710 /// Used to calculate offsets, and hence alignment, of subobjects. 3711 const DataLayout &DL; 3712 3713 IRBuilderTy &IRB; 3714 3715 public: 3716 AggLoadStoreRewriter(const DataLayout &DL, IRBuilderTy &IRB) 3717 : DL(DL), IRB(IRB) {} 3718 3719 /// Rewrite loads and stores through a pointer and all pointers derived from 3720 /// it. 3721 bool rewrite(Instruction &I) { 3722 LLVM_DEBUG(dbgs() << " Rewriting FCA loads and stores...\n"); 3723 enqueueUsers(I); 3724 bool Changed = false; 3725 while (!Queue.empty()) { 3726 U = Queue.pop_back_val(); 3727 Changed |= visit(cast<Instruction>(U->getUser())); 3728 } 3729 return Changed; 3730 } 3731 3732 private: 3733 /// Enqueue all the users of the given instruction for further processing. 3734 /// This uses a set to de-duplicate users. 3735 void enqueueUsers(Instruction &I) { 3736 for (Use &U : I.uses()) 3737 if (Visited.insert(U.getUser()).second) 3738 Queue.push_back(&U); 3739 } 3740 3741 // Conservative default is to not rewrite anything. 
3742 bool visitInstruction(Instruction &I) { return false; } 3743 3744 /// Generic recursive split emission class. 3745 template <typename Derived> class OpSplitter { 3746 protected: 3747 /// The builder used to form new instructions. 3748 IRBuilderTy &IRB; 3749 3750 /// The indices which to be used with insert- or extractvalue to select the 3751 /// appropriate value within the aggregate. 3752 SmallVector<unsigned, 4> Indices; 3753 3754 /// The indices to a GEP instruction which will move Ptr to the correct slot 3755 /// within the aggregate. 3756 SmallVector<Value *, 4> GEPIndices; 3757 3758 /// The base pointer of the original op, used as a base for GEPing the 3759 /// split operations. 3760 Value *Ptr; 3761 3762 /// The base pointee type being GEPed into. 3763 Type *BaseTy; 3764 3765 /// Known alignment of the base pointer. 3766 Align BaseAlign; 3767 3768 /// To calculate offset of each component so we can correctly deduce 3769 /// alignments. 3770 const DataLayout &DL; 3771 3772 /// Initialize the splitter with an insertion point, Ptr and start with a 3773 /// single zero GEP index. 3774 OpSplitter(Instruction *InsertionPoint, Value *Ptr, Type *BaseTy, 3775 Align BaseAlign, const DataLayout &DL, IRBuilderTy &IRB) 3776 : IRB(IRB), GEPIndices(1, IRB.getInt32(0)), Ptr(Ptr), BaseTy(BaseTy), 3777 BaseAlign(BaseAlign), DL(DL) { 3778 IRB.SetInsertPoint(InsertionPoint); 3779 } 3780 3781 public: 3782 /// Generic recursive split emission routine. 3783 /// 3784 /// This method recursively splits an aggregate op (load or store) into 3785 /// scalar or vector ops. It splits recursively until it hits a single value 3786 /// and emits that single value operation via the template argument. 3787 /// 3788 /// The logic of this routine relies on GEPs and insertvalue and 3789 /// extractvalue all operating with the same fundamental index list, merely 3790 /// formatted differently (GEPs need actual values). 3791 /// 3792 /// \param Ty The type being split recursively into smaller ops. 3793 /// \param Agg The aggregate value being built up or stored, depending on 3794 /// whether this is splitting a load or a store respectively. 3795 void emitSplitOps(Type *Ty, Value *&Agg, const Twine &Name) { 3796 if (Ty->isSingleValueType()) { 3797 unsigned Offset = DL.getIndexedOffsetInType(BaseTy, GEPIndices); 3798 return static_cast<Derived *>(this)->emitFunc( 3799 Ty, Agg, commonAlignment(BaseAlign, Offset), Name); 3800 } 3801 3802 if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) { 3803 unsigned OldSize = Indices.size(); 3804 (void)OldSize; 3805 for (unsigned Idx = 0, Size = ATy->getNumElements(); Idx != Size; 3806 ++Idx) { 3807 assert(Indices.size() == OldSize && "Did not return to the old size"); 3808 Indices.push_back(Idx); 3809 GEPIndices.push_back(IRB.getInt32(Idx)); 3810 emitSplitOps(ATy->getElementType(), Agg, Name + "." + Twine(Idx)); 3811 GEPIndices.pop_back(); 3812 Indices.pop_back(); 3813 } 3814 return; 3815 } 3816 3817 if (StructType *STy = dyn_cast<StructType>(Ty)) { 3818 unsigned OldSize = Indices.size(); 3819 (void)OldSize; 3820 for (unsigned Idx = 0, Size = STy->getNumElements(); Idx != Size; 3821 ++Idx) { 3822 assert(Indices.size() == OldSize && "Did not return to the old size"); 3823 Indices.push_back(Idx); 3824 GEPIndices.push_back(IRB.getInt32(Idx)); 3825 emitSplitOps(STy->getElementType(Idx), Agg, Name + "." 
+ Twine(Idx)); 3826 GEPIndices.pop_back(); 3827 Indices.pop_back(); 3828 } 3829 return; 3830 } 3831 3832 llvm_unreachable("Only arrays and structs are aggregate loadable types"); 3833 } 3834 }; 3835 3836 struct LoadOpSplitter : public OpSplitter<LoadOpSplitter> { 3837 AAMDNodes AATags; 3838 3839 LoadOpSplitter(Instruction *InsertionPoint, Value *Ptr, Type *BaseTy, 3840 AAMDNodes AATags, Align BaseAlign, const DataLayout &DL, 3841 IRBuilderTy &IRB) 3842 : OpSplitter<LoadOpSplitter>(InsertionPoint, Ptr, BaseTy, BaseAlign, DL, 3843 IRB), 3844 AATags(AATags) {} 3845 3846 /// Emit a leaf load of a single value. This is called at the leaves of the 3847 /// recursive emission to actually load values. 3848 void emitFunc(Type *Ty, Value *&Agg, Align Alignment, const Twine &Name) { 3849 assert(Ty->isSingleValueType()); 3850 // Load the single value and insert it using the indices. 3851 Value *GEP = 3852 IRB.CreateInBoundsGEP(BaseTy, Ptr, GEPIndices, Name + ".gep"); 3853 LoadInst *Load = 3854 IRB.CreateAlignedLoad(Ty, GEP, Alignment, Name + ".load"); 3855 3856 APInt Offset( 3857 DL.getIndexSizeInBits(Ptr->getType()->getPointerAddressSpace()), 0); 3858 if (AATags && 3859 GEPOperator::accumulateConstantOffset(BaseTy, GEPIndices, DL, Offset)) 3860 Load->setAAMetadata(AATags.shift(Offset.getZExtValue())); 3861 3862 Agg = IRB.CreateInsertValue(Agg, Load, Indices, Name + ".insert"); 3863 LLVM_DEBUG(dbgs() << " to: " << *Load << "\n"); 3864 } 3865 }; 3866 3867 bool visitLoadInst(LoadInst &LI) { 3868 assert(LI.getPointerOperand() == *U); 3869 if (!LI.isSimple() || LI.getType()->isSingleValueType()) 3870 return false; 3871 3872 // We have an aggregate being loaded, split it apart. 3873 LLVM_DEBUG(dbgs() << " original: " << LI << "\n"); 3874 LoadOpSplitter Splitter(&LI, *U, LI.getType(), LI.getAAMetadata(), 3875 getAdjustedAlignment(&LI, 0), DL, IRB); 3876 Value *V = PoisonValue::get(LI.getType()); 3877 Splitter.emitSplitOps(LI.getType(), V, LI.getName() + ".fca"); 3878 Visited.erase(&LI); 3879 LI.replaceAllUsesWith(V); 3880 LI.eraseFromParent(); 3881 return true; 3882 } 3883 3884 struct StoreOpSplitter : public OpSplitter<StoreOpSplitter> { 3885 StoreOpSplitter(Instruction *InsertionPoint, Value *Ptr, Type *BaseTy, 3886 AAMDNodes AATags, StoreInst *AggStore, Align BaseAlign, 3887 const DataLayout &DL, IRBuilderTy &IRB) 3888 : OpSplitter<StoreOpSplitter>(InsertionPoint, Ptr, BaseTy, BaseAlign, 3889 DL, IRB), 3890 AATags(AATags), AggStore(AggStore) {} 3891 AAMDNodes AATags; 3892 StoreInst *AggStore; 3893 /// Emit a leaf store of a single value. This is called at the leaves of the 3894 /// recursive emission to actually produce stores. 3895 void emitFunc(Type *Ty, Value *&Agg, Align Alignment, const Twine &Name) { 3896 assert(Ty->isSingleValueType()); 3897 // Extract the single value and store it using the indices. 3898 // 3899 // The gep and extractvalue values are factored out of the CreateStore 3900 // call to make the output independent of the argument evaluation order. 
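// (C++ leaves the evaluation order of call arguments unspecified, so
// materializing them as separate statements keeps the emitted instruction
// order stable across hosts and thus keeps the output deterministic.)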
3901 Value *ExtractValue = 3902 IRB.CreateExtractValue(Agg, Indices, Name + ".extract"); 3903 Value *InBoundsGEP = 3904 IRB.CreateInBoundsGEP(BaseTy, Ptr, GEPIndices, Name + ".gep"); 3905 StoreInst *Store = 3906 IRB.CreateAlignedStore(ExtractValue, InBoundsGEP, Alignment); 3907 3908 APInt Offset( 3909 DL.getIndexSizeInBits(Ptr->getType()->getPointerAddressSpace()), 0); 3910 GEPOperator::accumulateConstantOffset(BaseTy, GEPIndices, DL, Offset); 3911 if (AATags) 3912 Store->setAAMetadata(AATags.shift(Offset.getZExtValue())); 3913 3914 // migrateDebugInfo requires the base Alloca. Walk to it from this gep. 3915 // If we cannot (because there's an intervening non-const or unbounded 3916 // gep) then we wouldn't expect to see dbg.assign intrinsics linked to 3917 // this instruction. 3918 Value *Base = AggStore->getPointerOperand()->stripInBoundsOffsets(); 3919 if (auto *OldAI = dyn_cast<AllocaInst>(Base)) { 3920 uint64_t SizeInBits = 3921 DL.getTypeSizeInBits(Store->getValueOperand()->getType()); 3922 migrateDebugInfo(OldAI, /*IsSplit*/ true, Offset.getZExtValue() * 8, 3923 SizeInBits, AggStore, Store, 3924 Store->getPointerOperand(), Store->getValueOperand(), 3925 DL); 3926 } else { 3927 assert(at::getAssignmentMarkers(Store).empty() && 3928 at::getDPVAssignmentMarkers(Store).empty() && 3929 "AT: unexpected debug.assign linked to store through " 3930 "unbounded GEP"); 3931 } 3932 LLVM_DEBUG(dbgs() << " to: " << *Store << "\n"); 3933 } 3934 }; 3935 3936 bool visitStoreInst(StoreInst &SI) { 3937 if (!SI.isSimple() || SI.getPointerOperand() != *U) 3938 return false; 3939 Value *V = SI.getValueOperand(); 3940 if (V->getType()->isSingleValueType()) 3941 return false; 3942 3943 // We have an aggregate being stored, split it apart. 3944 LLVM_DEBUG(dbgs() << " original: " << SI << "\n"); 3945 StoreOpSplitter Splitter(&SI, *U, V->getType(), SI.getAAMetadata(), &SI, 3946 getAdjustedAlignment(&SI, 0), DL, IRB); 3947 Splitter.emitSplitOps(V->getType(), V, V->getName() + ".fca"); 3948 Visited.erase(&SI); 3949 // The stores replacing SI each have markers describing fragments of the 3950 // assignment so delete the assignment markers linked to SI. 
3951 at::deleteAssignmentMarkers(&SI); 3952 SI.eraseFromParent(); 3953 return true; 3954 } 3955 3956 bool visitBitCastInst(BitCastInst &BC) { 3957 enqueueUsers(BC); 3958 return false; 3959 } 3960 3961 bool visitAddrSpaceCastInst(AddrSpaceCastInst &ASC) { 3962 enqueueUsers(ASC); 3963 return false; 3964 } 3965 3966 // Fold gep (select cond, ptr1, ptr2) => select cond, gep(ptr1), gep(ptr2) 3967 bool foldGEPSelect(GetElementPtrInst &GEPI) { 3968 if (!GEPI.hasAllConstantIndices()) 3969 return false; 3970 3971 SelectInst *Sel = cast<SelectInst>(GEPI.getPointerOperand()); 3972 3973 LLVM_DEBUG(dbgs() << " Rewriting gep(select) -> select(gep):" 3974 << "\n original: " << *Sel 3975 << "\n " << GEPI); 3976 3977 IRB.SetInsertPoint(&GEPI); 3978 SmallVector<Value *, 4> Index(GEPI.indices()); 3979 bool IsInBounds = GEPI.isInBounds(); 3980 3981 Type *Ty = GEPI.getSourceElementType(); 3982 Value *True = Sel->getTrueValue(); 3983 Value *NTrue = IRB.CreateGEP(Ty, True, Index, True->getName() + ".sroa.gep", 3984 IsInBounds); 3985 3986 Value *False = Sel->getFalseValue(); 3987 3988 Value *NFalse = IRB.CreateGEP(Ty, False, Index, 3989 False->getName() + ".sroa.gep", IsInBounds); 3990 3991 Value *NSel = IRB.CreateSelect(Sel->getCondition(), NTrue, NFalse, 3992 Sel->getName() + ".sroa.sel"); 3993 Visited.erase(&GEPI); 3994 GEPI.replaceAllUsesWith(NSel); 3995 GEPI.eraseFromParent(); 3996 Instruction *NSelI = cast<Instruction>(NSel); 3997 Visited.insert(NSelI); 3998 enqueueUsers(*NSelI); 3999 4000 LLVM_DEBUG(dbgs() << "\n to: " << *NTrue 4001 << "\n " << *NFalse 4002 << "\n " << *NSel << '\n'); 4003 4004 return true; 4005 } 4006 4007 // Fold gep (phi ptr1, ptr2) => phi gep(ptr1), gep(ptr2) 4008 bool foldGEPPhi(GetElementPtrInst &GEPI) { 4009 if (!GEPI.hasAllConstantIndices()) 4010 return false; 4011 4012 PHINode *PHI = cast<PHINode>(GEPI.getPointerOperand()); 4013 if (GEPI.getParent() != PHI->getParent() || 4014 llvm::any_of(PHI->incoming_values(), [](Value *In) 4015 { Instruction *I = dyn_cast<Instruction>(In); 4016 return !I || isa<GetElementPtrInst>(I) || isa<PHINode>(I) || 4017 succ_empty(I->getParent()) || 4018 !I->getParent()->isLegalToHoistInto(); 4019 })) 4020 return false; 4021 4022 LLVM_DEBUG(dbgs() << " Rewriting gep(phi) -> phi(gep):" 4023 << "\n original: " << *PHI 4024 << "\n " << GEPI 4025 << "\n to: "); 4026 4027 SmallVector<Value *, 4> Index(GEPI.indices()); 4028 bool IsInBounds = GEPI.isInBounds(); 4029 IRB.SetInsertPoint(GEPI.getParent(), GEPI.getParent()->getFirstNonPHIIt()); 4030 PHINode *NewPN = IRB.CreatePHI(GEPI.getType(), PHI->getNumIncomingValues(), 4031 PHI->getName() + ".sroa.phi"); 4032 for (unsigned I = 0, E = PHI->getNumIncomingValues(); I != E; ++I) { 4033 BasicBlock *B = PHI->getIncomingBlock(I); 4034 Value *NewVal = nullptr; 4035 int Idx = NewPN->getBasicBlockIndex(B); 4036 if (Idx >= 0) { 4037 NewVal = NewPN->getIncomingValue(Idx); 4038 } else { 4039 Instruction *In = cast<Instruction>(PHI->getIncomingValue(I)); 4040 4041 IRB.SetInsertPoint(In->getParent(), std::next(In->getIterator())); 4042 Type *Ty = GEPI.getSourceElementType(); 4043 NewVal = IRB.CreateGEP(Ty, In, Index, In->getName() + ".sroa.gep", 4044 IsInBounds); 4045 } 4046 NewPN->addIncoming(NewVal, B); 4047 } 4048 4049 Visited.erase(&GEPI); 4050 GEPI.replaceAllUsesWith(NewPN); 4051 GEPI.eraseFromParent(); 4052 Visited.insert(NewPN); 4053 enqueueUsers(*NewPN); 4054 4055 LLVM_DEBUG(for (Value *In : NewPN->incoming_values()) 4056 dbgs() << "\n " << *In; 4057 dbgs() << "\n " << *NewPN << '\n'); 4058 4059 return true; 4060 } 
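// For illustration, with all-constant indices a GEP of a select such as
//   %sel = select i1 %c, ptr %a, ptr %b
//   %gep = getelementptr inbounds i8, ptr %sel, i64 4
// is rewritten by the helpers above into
//   %a.sroa.gep = getelementptr inbounds i8, ptr %a, i64 4
//   %b.sroa.gep = getelementptr inbounds i8, ptr %b, i64 4
//   %sel.sroa.sel = select i1 %c, ptr %a.sroa.gep, ptr %b.sroa.gep
// so each underlying pointer can be analyzed on its own; the gep-of-phi fold
// is analogous.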
4061 4062 bool visitGetElementPtrInst(GetElementPtrInst &GEPI) { 4063 if (isa<SelectInst>(GEPI.getPointerOperand()) && 4064 foldGEPSelect(GEPI)) 4065 return true; 4066 4067 if (isa<PHINode>(GEPI.getPointerOperand()) && 4068 foldGEPPhi(GEPI)) 4069 return true; 4070 4071 enqueueUsers(GEPI); 4072 return false; 4073 } 4074 4075 bool visitPHINode(PHINode &PN) { 4076 enqueueUsers(PN); 4077 return false; 4078 } 4079 4080 bool visitSelectInst(SelectInst &SI) { 4081 enqueueUsers(SI); 4082 return false; 4083 } 4084 }; 4085 4086 } // end anonymous namespace 4087 4088 /// Strip aggregate type wrapping. 4089 /// 4090 /// This removes no-op aggregate types wrapping an underlying type. It will 4091 /// strip as many layers of types as it can without changing either the type 4092 /// size or the allocated size. 4093 static Type *stripAggregateTypeWrapping(const DataLayout &DL, Type *Ty) { 4094 if (Ty->isSingleValueType()) 4095 return Ty; 4096 4097 uint64_t AllocSize = DL.getTypeAllocSize(Ty).getFixedValue(); 4098 uint64_t TypeSize = DL.getTypeSizeInBits(Ty).getFixedValue(); 4099 4100 Type *InnerTy; 4101 if (ArrayType *ArrTy = dyn_cast<ArrayType>(Ty)) { 4102 InnerTy = ArrTy->getElementType(); 4103 } else if (StructType *STy = dyn_cast<StructType>(Ty)) { 4104 const StructLayout *SL = DL.getStructLayout(STy); 4105 unsigned Index = SL->getElementContainingOffset(0); 4106 InnerTy = STy->getElementType(Index); 4107 } else { 4108 return Ty; 4109 } 4110 4111 if (AllocSize > DL.getTypeAllocSize(InnerTy).getFixedValue() || 4112 TypeSize > DL.getTypeSizeInBits(InnerTy).getFixedValue()) 4113 return Ty; 4114 4115 return stripAggregateTypeWrapping(DL, InnerTy); 4116 } 4117 4118 /// Try to find a partition of the aggregate type passed in for a given 4119 /// offset and size. 4120 /// 4121 /// This recurses through the aggregate type and tries to compute a subtype 4122 /// based on the offset and size. When the offset and size span a sub-section 4123 /// of an array, it will even compute a new array type for that sub-section, 4124 /// and the same for structs. 4125 /// 4126 /// Note that this routine is very strict and tries to find a partition of the 4127 /// type which produces the *exact* right offset and size. It is not forgiving 4128 /// when the size or offset cause either end of type-based partition to be off. 4129 /// Also, this is a best-effort routine. It is reasonable to give up and not 4130 /// return a type if necessary. 4131 static Type *getTypePartition(const DataLayout &DL, Type *Ty, uint64_t Offset, 4132 uint64_t Size) { 4133 if (Offset == 0 && DL.getTypeAllocSize(Ty).getFixedValue() == Size) 4134 return stripAggregateTypeWrapping(DL, Ty); 4135 if (Offset > DL.getTypeAllocSize(Ty).getFixedValue() || 4136 (DL.getTypeAllocSize(Ty).getFixedValue() - Offset) < Size) 4137 return nullptr; 4138 4139 if (isa<ArrayType>(Ty) || isa<VectorType>(Ty)) { 4140 Type *ElementTy; 4141 uint64_t TyNumElements; 4142 if (auto *AT = dyn_cast<ArrayType>(Ty)) { 4143 ElementTy = AT->getElementType(); 4144 TyNumElements = AT->getNumElements(); 4145 } else { 4146 // FIXME: This isn't right for vectors with non-byte-sized or 4147 // non-power-of-two sized elements. 
4148 auto *VT = cast<FixedVectorType>(Ty); 4149 ElementTy = VT->getElementType(); 4150 TyNumElements = VT->getNumElements(); 4151 } 4152 uint64_t ElementSize = DL.getTypeAllocSize(ElementTy).getFixedValue(); 4153 uint64_t NumSkippedElements = Offset / ElementSize; 4154 if (NumSkippedElements >= TyNumElements) 4155 return nullptr; 4156 Offset -= NumSkippedElements * ElementSize; 4157 4158 // First check if we need to recurse. 4159 if (Offset > 0 || Size < ElementSize) { 4160 // Bail if the partition ends in a different array element. 4161 if ((Offset + Size) > ElementSize) 4162 return nullptr; 4163 // Recurse through the element type trying to peel off offset bytes. 4164 return getTypePartition(DL, ElementTy, Offset, Size); 4165 } 4166 assert(Offset == 0); 4167 4168 if (Size == ElementSize) 4169 return stripAggregateTypeWrapping(DL, ElementTy); 4170 assert(Size > ElementSize); 4171 uint64_t NumElements = Size / ElementSize; 4172 if (NumElements * ElementSize != Size) 4173 return nullptr; 4174 return ArrayType::get(ElementTy, NumElements); 4175 } 4176 4177 StructType *STy = dyn_cast<StructType>(Ty); 4178 if (!STy) 4179 return nullptr; 4180 4181 const StructLayout *SL = DL.getStructLayout(STy); 4182 4183 if (SL->getSizeInBits().isScalable()) 4184 return nullptr; 4185 4186 if (Offset >= SL->getSizeInBytes()) 4187 return nullptr; 4188 uint64_t EndOffset = Offset + Size; 4189 if (EndOffset > SL->getSizeInBytes()) 4190 return nullptr; 4191 4192 unsigned Index = SL->getElementContainingOffset(Offset); 4193 Offset -= SL->getElementOffset(Index); 4194 4195 Type *ElementTy = STy->getElementType(Index); 4196 uint64_t ElementSize = DL.getTypeAllocSize(ElementTy).getFixedValue(); 4197 if (Offset >= ElementSize) 4198 return nullptr; // The offset points into alignment padding. 4199 4200 // See if any partition must be contained by the element. 4201 if (Offset > 0 || Size < ElementSize) { 4202 if ((Offset + Size) > ElementSize) 4203 return nullptr; 4204 return getTypePartition(DL, ElementTy, Offset, Size); 4205 } 4206 assert(Offset == 0); 4207 4208 if (Size == ElementSize) 4209 return stripAggregateTypeWrapping(DL, ElementTy); 4210 4211 StructType::element_iterator EI = STy->element_begin() + Index, 4212 EE = STy->element_end(); 4213 if (EndOffset < SL->getSizeInBytes()) { 4214 unsigned EndIndex = SL->getElementContainingOffset(EndOffset); 4215 if (Index == EndIndex) 4216 return nullptr; // Within a single element and its padding. 4217 4218 // Don't try to form "natural" types if the elements don't line up with the 4219 // expected size. 4220 // FIXME: We could potentially recurse down through the last element in the 4221 // sub-struct to find a natural end point. 4222 if (SL->getElementOffset(EndIndex) != EndOffset) 4223 return nullptr; 4224 4225 assert(Index < EndIndex); 4226 EE = STy->element_begin() + EndIndex; 4227 } 4228 4229 // Try to build up a sub-structure. 4230 StructType *SubTy = 4231 StructType::get(STy->getContext(), ArrayRef(EI, EE), STy->isPacked()); 4232 const StructLayout *SubSL = DL.getStructLayout(SubTy); 4233 if (Size != SubSL->getSizeInBytes()) 4234 return nullptr; // The sub-struct doesn't have quite the size needed. 4235 4236 return SubTy; 4237 } 4238 4239 /// Pre-split loads and stores to simplify rewriting. 4240 /// 4241 /// We want to break up the splittable load+store pairs as much as 4242 /// possible. 
This is important to do as a preprocessing step, as once we 4243 /// start rewriting the accesses to partitions of the alloca we lose the 4244 /// necessary information to correctly split apart paired loads and stores 4245 /// which both point into this alloca. The case to consider is something like 4246 /// the following: 4247 /// 4248 /// %a = alloca [12 x i8] 4249 /// %gep1 = getelementptr i8, ptr %a, i32 0 4250 /// %gep2 = getelementptr i8, ptr %a, i32 4 4251 /// %gep3 = getelementptr i8, ptr %a, i32 8 4252 /// store float 0.0, ptr %gep1 4253 /// store float 1.0, ptr %gep2 4254 /// %v = load i64, ptr %gep1 4255 /// store i64 %v, ptr %gep2 4256 /// %f1 = load float, ptr %gep2 4257 /// %f2 = load float, ptr %gep3 4258 /// 4259 /// Here we want to form 3 partitions of the alloca, each 4 bytes large, and 4260 /// promote everything so we recover the 2 SSA values that should have been 4261 /// there all along. 4262 /// 4263 /// \returns true if any changes are made. 4264 bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) { 4265 LLVM_DEBUG(dbgs() << "Pre-splitting loads and stores\n"); 4266 4267 // Track the loads and stores which are candidates for pre-splitting here, in 4268 // the order they first appear during the partition scan. These give stable 4269 // iteration order and a basis for tracking which loads and stores we 4270 // actually split. 4271 SmallVector<LoadInst *, 4> Loads; 4272 SmallVector<StoreInst *, 4> Stores; 4273 4274 // We need to accumulate the splits required of each load or store where we 4275 // can find them via a direct lookup. This is important to cross-check loads 4276 // and stores against each other. We also track the slice so that we can kill 4277 // all the slices that end up split. 4278 struct SplitOffsets { 4279 Slice *S; 4280 std::vector<uint64_t> Splits; 4281 }; 4282 SmallDenseMap<Instruction *, SplitOffsets, 8> SplitOffsetsMap; 4283 4284 // Track loads out of this alloca which cannot, for any reason, be pre-split. 4285 // This is important as we also cannot pre-split stores of those loads! 4286 // FIXME: This is all pretty gross. It means that we can be more aggressive 4287 // in pre-splitting when the load feeding the store happens to come from 4288 // a separate alloca. Put another way, the effectiveness of SROA would be 4289 // decreased by a frontend which just concatenated all of its local allocas 4290 // into one big flat alloca. But defeating such patterns is exactly the job 4291 // SROA is tasked with! Sadly, to not have this discrepancy we would have 4292 // change store pre-splitting to actually force pre-splitting of the load 4293 // that feeds it *and all stores*. That makes pre-splitting much harder, but 4294 // maybe it would make it more principled? 4295 SmallPtrSet<LoadInst *, 8> UnsplittableLoads; 4296 4297 LLVM_DEBUG(dbgs() << " Searching for candidate loads and stores\n"); 4298 for (auto &P : AS.partitions()) { 4299 for (Slice &S : P) { 4300 Instruction *I = cast<Instruction>(S.getUse()->getUser()); 4301 if (!S.isSplittable() || S.endOffset() <= P.endOffset()) { 4302 // If this is a load we have to track that it can't participate in any 4303 // pre-splitting. If this is a store of a load we have to track that 4304 // that load also can't participate in any pre-splitting. 
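// (Pre-splitting such a store would require feeding each piece from a
// matching split of that load, which we just determined we cannot form.)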
4305 if (auto *LI = dyn_cast<LoadInst>(I)) 4306 UnsplittableLoads.insert(LI); 4307 else if (auto *SI = dyn_cast<StoreInst>(I)) 4308 if (auto *LI = dyn_cast<LoadInst>(SI->getValueOperand())) 4309 UnsplittableLoads.insert(LI); 4310 continue; 4311 } 4312 assert(P.endOffset() > S.beginOffset() && 4313 "Empty or backwards partition!"); 4314 4315 // Determine if this is a pre-splittable slice. 4316 if (auto *LI = dyn_cast<LoadInst>(I)) { 4317 assert(!LI->isVolatile() && "Cannot split volatile loads!"); 4318 4319 // The load must be used exclusively to store into other pointers for 4320 // us to be able to arbitrarily pre-split it. The stores must also be 4321 // simple to avoid changing semantics. 4322 auto IsLoadSimplyStored = [](LoadInst *LI) { 4323 for (User *LU : LI->users()) { 4324 auto *SI = dyn_cast<StoreInst>(LU); 4325 if (!SI || !SI->isSimple()) 4326 return false; 4327 } 4328 return true; 4329 }; 4330 if (!IsLoadSimplyStored(LI)) { 4331 UnsplittableLoads.insert(LI); 4332 continue; 4333 } 4334 4335 Loads.push_back(LI); 4336 } else if (auto *SI = dyn_cast<StoreInst>(I)) { 4337 if (S.getUse() != &SI->getOperandUse(SI->getPointerOperandIndex())) 4338 // Skip stores *of* pointers. FIXME: This shouldn't even be possible! 4339 continue; 4340 auto *StoredLoad = dyn_cast<LoadInst>(SI->getValueOperand()); 4341 if (!StoredLoad || !StoredLoad->isSimple()) 4342 continue; 4343 assert(!SI->isVolatile() && "Cannot split volatile stores!"); 4344 4345 Stores.push_back(SI); 4346 } else { 4347 // Other uses cannot be pre-split. 4348 continue; 4349 } 4350 4351 // Record the initial split. 4352 LLVM_DEBUG(dbgs() << " Candidate: " << *I << "\n"); 4353 auto &Offsets = SplitOffsetsMap[I]; 4354 assert(Offsets.Splits.empty() && 4355 "Should not have splits the first time we see an instruction!"); 4356 Offsets.S = &S; 4357 Offsets.Splits.push_back(P.endOffset() - S.beginOffset()); 4358 } 4359 4360 // Now scan the already split slices, and add a split for any of them which 4361 // we're going to pre-split. 4362 for (Slice *S : P.splitSliceTails()) { 4363 auto SplitOffsetsMapI = 4364 SplitOffsetsMap.find(cast<Instruction>(S->getUse()->getUser())); 4365 if (SplitOffsetsMapI == SplitOffsetsMap.end()) 4366 continue; 4367 auto &Offsets = SplitOffsetsMapI->second; 4368 4369 assert(Offsets.S == S && "Found a mismatched slice!"); 4370 assert(!Offsets.Splits.empty() && 4371 "Cannot have an empty set of splits on the second partition!"); 4372 assert(Offsets.Splits.back() == 4373 P.beginOffset() - Offsets.S->beginOffset() && 4374 "Previous split does not end where this one begins!"); 4375 4376 // Record each split. The last partition's end isn't needed as the size 4377 // of the slice dictates that. 4378 if (S->endOffset() > P.endOffset()) 4379 Offsets.Splits.push_back(P.endOffset() - Offsets.S->beginOffset()); 4380 } 4381 } 4382 4383 // We may have split loads where some of their stores are split stores. For 4384 // such loads and stores, we can only pre-split them if their splits exactly 4385 // match relative to their starting offset. We have to verify this prior to 4386 // any rewriting. 4387 llvm::erase_if(Stores, [&UnsplittableLoads, &SplitOffsetsMap](StoreInst *SI) { 4388 // Lookup the load we are storing in our map of split 4389 // offsets. 4390 auto *LI = cast<LoadInst>(SI->getValueOperand()); 4391 // If it was completely unsplittable, then we're done, 4392 // and this store can't be pre-split. 
4393     if (UnsplittableLoads.count(LI))
4394       return true;
4395
4396     auto LoadOffsetsI = SplitOffsetsMap.find(LI);
4397     if (LoadOffsetsI == SplitOffsetsMap.end())
4398       return false; // Unrelated loads are definitely safe.
4399     auto &LoadOffsets = LoadOffsetsI->second;
4400
4401     // Now lookup the store's offsets.
4402     auto &StoreOffsets = SplitOffsetsMap[SI];
4403
4404     // If the relative offsets of each split in the load and
4405     // store match exactly, then we can split them and we
4406     // don't need to remove them here.
4407     if (LoadOffsets.Splits == StoreOffsets.Splits)
4408       return false;
4409
4410     LLVM_DEBUG(dbgs() << "    Mismatched splits for load and store:\n"
4411                       << "      " << *LI << "\n"
4412                       << "      " << *SI << "\n");
4413
4414     // We've found a store and load that we need to split
4415     // with mismatched relative splits. Just give up on them
4416     // and remove both instructions from our list of
4417     // candidates.
4418     UnsplittableLoads.insert(LI);
4419     return true;
4420   });
4421   // Now we have to go *back* through all the stores, because a later store may
4422   // have caused an earlier store's load to become unsplittable and if it is
4423   // unsplittable for the later store, then we can't rely on it being split in
4424   // the earlier store either.
4425   llvm::erase_if(Stores, [&UnsplittableLoads](StoreInst *SI) {
4426     auto *LI = cast<LoadInst>(SI->getValueOperand());
4427     return UnsplittableLoads.count(LI);
4428   });
4429   // Once we've established all the loads that can't be split for some reason,
4430   // filter out any that made it into our candidate list.
4431   llvm::erase_if(Loads, [&UnsplittableLoads](LoadInst *LI) {
4432     return UnsplittableLoads.count(LI);
4433   });
4434
4435   // If no loads or stores are left, there is no pre-splitting to be done for
4436   // this alloca.
4437   if (Loads.empty() && Stores.empty())
4438     return false;
4439
4440   // From here on, we can't fail and will be building new accesses, so rig up
4441   // an IR builder.
4442   IRBuilderTy IRB(&AI);
4443
4444   // Collect the new slices which we will merge into the alloca slices.
4445   SmallVector<Slice, 4> NewSlices;
4446
4447   // Track any allocas we end up splitting loads and stores for so we iterate
4448   // on them.
4449   SmallPtrSet<AllocaInst *, 4> ResplitPromotableAllocas;
4450
4451   // At this point, we have collected all of the loads and stores we can
4452   // pre-split, and the specific splits needed for them. We actually do the
4453   // splitting in a specific order so that we can handle the case where one of
4454   // the loads is the value operand to one of the stores.
4455   //
4456   // First, we rewrite all of the split loads, and just accumulate each split
4457   // load in a parallel structure. We also build the slices for them and append
4458   // them to the alloca slices.
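  // A rough sketch of what this first phase produces (hypothetical IR and
  // offsets, not taken from a test): an 8-byte load with Splits == {4} is
  // rewritten into two partial loads,
  //   %part0 = load i32, ptr %base      ; bytes [0, 4)
  //   %part1 = load i32, ptr %base.4    ; bytes [4, 8)
  // and the pair is kept so that split stores of the original value can store
  // %part0 and %part1 directly instead of re-splitting the load.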
4459 SmallDenseMap<LoadInst *, std::vector<LoadInst *>, 1> SplitLoadsMap; 4460 std::vector<LoadInst *> SplitLoads; 4461 const DataLayout &DL = AI.getModule()->getDataLayout(); 4462 for (LoadInst *LI : Loads) { 4463 SplitLoads.clear(); 4464 4465 auto &Offsets = SplitOffsetsMap[LI]; 4466 unsigned SliceSize = Offsets.S->endOffset() - Offsets.S->beginOffset(); 4467 assert(LI->getType()->getIntegerBitWidth() % 8 == 0 && 4468 "Load must have type size equal to store size"); 4469 assert(LI->getType()->getIntegerBitWidth() / 8 >= SliceSize && 4470 "Load must be >= slice size"); 4471 4472 uint64_t BaseOffset = Offsets.S->beginOffset(); 4473 assert(BaseOffset + SliceSize > BaseOffset && 4474 "Cannot represent alloca access size using 64-bit integers!"); 4475 4476 Instruction *BasePtr = cast<Instruction>(LI->getPointerOperand()); 4477 IRB.SetInsertPoint(LI); 4478 4479 LLVM_DEBUG(dbgs() << " Splitting load: " << *LI << "\n"); 4480 4481 uint64_t PartOffset = 0, PartSize = Offsets.Splits.front(); 4482 int Idx = 0, Size = Offsets.Splits.size(); 4483 for (;;) { 4484 auto *PartTy = Type::getIntNTy(LI->getContext(), PartSize * 8); 4485 auto AS = LI->getPointerAddressSpace(); 4486 auto *PartPtrTy = LI->getPointerOperandType(); 4487 LoadInst *PLoad = IRB.CreateAlignedLoad( 4488 PartTy, 4489 getAdjustedPtr(IRB, DL, BasePtr, 4490 APInt(DL.getIndexSizeInBits(AS), PartOffset), 4491 PartPtrTy, BasePtr->getName() + "."), 4492 getAdjustedAlignment(LI, PartOffset), 4493 /*IsVolatile*/ false, LI->getName()); 4494 PLoad->copyMetadata(*LI, {LLVMContext::MD_mem_parallel_loop_access, 4495 LLVMContext::MD_access_group}); 4496 4497 // Append this load onto the list of split loads so we can find it later 4498 // to rewrite the stores. 4499 SplitLoads.push_back(PLoad); 4500 4501 // Now build a new slice for the alloca. 4502 NewSlices.push_back( 4503 Slice(BaseOffset + PartOffset, BaseOffset + PartOffset + PartSize, 4504 &PLoad->getOperandUse(PLoad->getPointerOperandIndex()), 4505 /*IsSplittable*/ false)); 4506 LLVM_DEBUG(dbgs() << " new slice [" << NewSlices.back().beginOffset() 4507 << ", " << NewSlices.back().endOffset() 4508 << "): " << *PLoad << "\n"); 4509 4510 // See if we've handled all the splits. 4511 if (Idx >= Size) 4512 break; 4513 4514 // Setup the next partition. 4515 PartOffset = Offsets.Splits[Idx]; 4516 ++Idx; 4517 PartSize = (Idx < Size ? Offsets.Splits[Idx] : SliceSize) - PartOffset; 4518 } 4519 4520 // Now that we have the split loads, do the slow walk over all uses of the 4521 // load and rewrite them as split stores, or save the split loads to use 4522 // below if the store is going to be split there anyways. 4523 bool DeferredStores = false; 4524 for (User *LU : LI->users()) { 4525 StoreInst *SI = cast<StoreInst>(LU); 4526 if (!Stores.empty() && SplitOffsetsMap.count(SI)) { 4527 DeferredStores = true; 4528 LLVM_DEBUG(dbgs() << " Deferred splitting of store: " << *SI 4529 << "\n"); 4530 continue; 4531 } 4532 4533 Value *StoreBasePtr = SI->getPointerOperand(); 4534 IRB.SetInsertPoint(SI); 4535 4536 LLVM_DEBUG(dbgs() << " Splitting store of load: " << *SI << "\n"); 4537 4538 for (int Idx = 0, Size = SplitLoads.size(); Idx < Size; ++Idx) { 4539 LoadInst *PLoad = SplitLoads[Idx]; 4540 uint64_t PartOffset = Idx == 0 ? 
0 : Offsets.Splits[Idx - 1]; 4541 auto *PartPtrTy = SI->getPointerOperandType(); 4542 4543 auto AS = SI->getPointerAddressSpace(); 4544 StoreInst *PStore = IRB.CreateAlignedStore( 4545 PLoad, 4546 getAdjustedPtr(IRB, DL, StoreBasePtr, 4547 APInt(DL.getIndexSizeInBits(AS), PartOffset), 4548 PartPtrTy, StoreBasePtr->getName() + "."), 4549 getAdjustedAlignment(SI, PartOffset), 4550 /*IsVolatile*/ false); 4551 PStore->copyMetadata(*SI, {LLVMContext::MD_mem_parallel_loop_access, 4552 LLVMContext::MD_access_group, 4553 LLVMContext::MD_DIAssignID}); 4554 LLVM_DEBUG(dbgs() << " +" << PartOffset << ":" << *PStore << "\n"); 4555 } 4556 4557 // We want to immediately iterate on any allocas impacted by splitting 4558 // this store, and we have to track any promotable alloca (indicated by 4559 // a direct store) as needing to be resplit because it is no longer 4560 // promotable. 4561 if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(StoreBasePtr)) { 4562 ResplitPromotableAllocas.insert(OtherAI); 4563 Worklist.insert(OtherAI); 4564 } else if (AllocaInst *OtherAI = dyn_cast<AllocaInst>( 4565 StoreBasePtr->stripInBoundsOffsets())) { 4566 Worklist.insert(OtherAI); 4567 } 4568 4569 // Mark the original store as dead. 4570 DeadInsts.push_back(SI); 4571 } 4572 4573 // Save the split loads if there are deferred stores among the users. 4574 if (DeferredStores) 4575 SplitLoadsMap.insert(std::make_pair(LI, std::move(SplitLoads))); 4576 4577 // Mark the original load as dead and kill the original slice. 4578 DeadInsts.push_back(LI); 4579 Offsets.S->kill(); 4580 } 4581 4582 // Second, we rewrite all of the split stores. At this point, we know that 4583 // all loads from this alloca have been split already. For stores of such 4584 // loads, we can simply look up the pre-existing split loads. For stores of 4585 // other loads, we split those loads first and then write split stores of 4586 // them. 4587 for (StoreInst *SI : Stores) { 4588 auto *LI = cast<LoadInst>(SI->getValueOperand()); 4589 IntegerType *Ty = cast<IntegerType>(LI->getType()); 4590 assert(Ty->getBitWidth() % 8 == 0); 4591 uint64_t StoreSize = Ty->getBitWidth() / 8; 4592 assert(StoreSize > 0 && "Cannot have a zero-sized integer store!"); 4593 4594 auto &Offsets = SplitOffsetsMap[SI]; 4595 assert(StoreSize == Offsets.S->endOffset() - Offsets.S->beginOffset() && 4596 "Slice size should always match load size exactly!"); 4597 uint64_t BaseOffset = Offsets.S->beginOffset(); 4598 assert(BaseOffset + StoreSize > BaseOffset && 4599 "Cannot represent alloca access size using 64-bit integers!"); 4600 4601 Value *LoadBasePtr = LI->getPointerOperand(); 4602 Instruction *StoreBasePtr = cast<Instruction>(SI->getPointerOperand()); 4603 4604 LLVM_DEBUG(dbgs() << " Splitting store: " << *SI << "\n"); 4605 4606 // Check whether we have an already split load. 
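    // For illustration (hypothetical values): if this store's load was split
    // at Offsets.Splits == {4} in the first phase, SplitLoadsMap maps it to
    // the partial loads created there and the loop below stores them directly;
    // a load from some other alloca has no map entry, so fresh partial loads
    // are emitted at the matching offsets instead.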
4607 auto SplitLoadsMapI = SplitLoadsMap.find(LI); 4608 std::vector<LoadInst *> *SplitLoads = nullptr; 4609 if (SplitLoadsMapI != SplitLoadsMap.end()) { 4610 SplitLoads = &SplitLoadsMapI->second; 4611 assert(SplitLoads->size() == Offsets.Splits.size() + 1 && 4612 "Too few split loads for the number of splits in the store!"); 4613 } else { 4614 LLVM_DEBUG(dbgs() << " of load: " << *LI << "\n"); 4615 } 4616 4617 uint64_t PartOffset = 0, PartSize = Offsets.Splits.front(); 4618 int Idx = 0, Size = Offsets.Splits.size(); 4619 for (;;) { 4620 auto *PartTy = Type::getIntNTy(Ty->getContext(), PartSize * 8); 4621 auto *LoadPartPtrTy = LI->getPointerOperandType(); 4622 auto *StorePartPtrTy = SI->getPointerOperandType(); 4623 4624 // Either lookup a split load or create one. 4625 LoadInst *PLoad; 4626 if (SplitLoads) { 4627 PLoad = (*SplitLoads)[Idx]; 4628 } else { 4629 IRB.SetInsertPoint(LI); 4630 auto AS = LI->getPointerAddressSpace(); 4631 PLoad = IRB.CreateAlignedLoad( 4632 PartTy, 4633 getAdjustedPtr(IRB, DL, LoadBasePtr, 4634 APInt(DL.getIndexSizeInBits(AS), PartOffset), 4635 LoadPartPtrTy, LoadBasePtr->getName() + "."), 4636 getAdjustedAlignment(LI, PartOffset), 4637 /*IsVolatile*/ false, LI->getName()); 4638 PLoad->copyMetadata(*LI, {LLVMContext::MD_mem_parallel_loop_access, 4639 LLVMContext::MD_access_group}); 4640 } 4641 4642 // And store this partition. 4643 IRB.SetInsertPoint(SI); 4644 auto AS = SI->getPointerAddressSpace(); 4645 StoreInst *PStore = IRB.CreateAlignedStore( 4646 PLoad, 4647 getAdjustedPtr(IRB, DL, StoreBasePtr, 4648 APInt(DL.getIndexSizeInBits(AS), PartOffset), 4649 StorePartPtrTy, StoreBasePtr->getName() + "."), 4650 getAdjustedAlignment(SI, PartOffset), 4651 /*IsVolatile*/ false); 4652 PStore->copyMetadata(*SI, {LLVMContext::MD_mem_parallel_loop_access, 4653 LLVMContext::MD_access_group}); 4654 4655 // Now build a new slice for the alloca. 4656 NewSlices.push_back( 4657 Slice(BaseOffset + PartOffset, BaseOffset + PartOffset + PartSize, 4658 &PStore->getOperandUse(PStore->getPointerOperandIndex()), 4659 /*IsSplittable*/ false)); 4660 LLVM_DEBUG(dbgs() << " new slice [" << NewSlices.back().beginOffset() 4661 << ", " << NewSlices.back().endOffset() 4662 << "): " << *PStore << "\n"); 4663 if (!SplitLoads) { 4664 LLVM_DEBUG(dbgs() << " of split load: " << *PLoad << "\n"); 4665 } 4666 4667 // See if we've finished all the splits. 4668 if (Idx >= Size) 4669 break; 4670 4671 // Setup the next partition. 4672 PartOffset = Offsets.Splits[Idx]; 4673 ++Idx; 4674 PartSize = (Idx < Size ? Offsets.Splits[Idx] : StoreSize) - PartOffset; 4675 } 4676 4677 // We want to immediately iterate on any allocas impacted by splitting 4678 // this load, which is only relevant if it isn't a load of this alloca and 4679 // thus we didn't already split the loads above. We also have to keep track 4680 // of any promotable allocas we split loads on as they can no longer be 4681 // promoted. 4682 if (!SplitLoads) { 4683 if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(LoadBasePtr)) { 4684 assert(OtherAI != &AI && "We can't re-split our own alloca!"); 4685 ResplitPromotableAllocas.insert(OtherAI); 4686 Worklist.insert(OtherAI); 4687 } else if (AllocaInst *OtherAI = dyn_cast<AllocaInst>( 4688 LoadBasePtr->stripInBoundsOffsets())) { 4689 assert(OtherAI != &AI && "We can't re-split our own alloca!"); 4690 Worklist.insert(OtherAI); 4691 } 4692 } 4693 4694 // Mark the original store as dead now that we've split it up and kill its 4695 // slice. 
Note that we leave the original load in place unless this store
4696     // was its only use. It may in turn be split up if it is an alloca load
4697     // for some other alloca, but it may be a normal load. This may introduce
4698     // redundant loads, but where those can be merged the rest of the optimizer
4699     // should handle the merging, and this uncovers SSA splits which is more
4700     // important. In practice, the original loads will almost always be fully
4701     // split and removed eventually, and the splits will be merged by any
4702     // trivial CSE, including instcombine.
4703     if (LI->hasOneUse()) {
4704       assert(*LI->user_begin() == SI && "Single use isn't this store!");
4705       DeadInsts.push_back(LI);
4706     }
4707     DeadInsts.push_back(SI);
4708     Offsets.S->kill();
4709   }
4710
4711   // Remove the killed slices that have been pre-split.
4712   llvm::erase_if(AS, [](const Slice &S) { return S.isDead(); });
4713
4714   // Insert our new slices. This will sort and merge them into the sorted
4715   // sequence.
4716   AS.insert(NewSlices);
4717
4718   LLVM_DEBUG(dbgs() << "  Pre-split slices:\n");
4719 #ifndef NDEBUG
4720   for (auto I = AS.begin(), E = AS.end(); I != E; ++I)
4721     LLVM_DEBUG(AS.print(dbgs(), I, "  "));
4722 #endif
4723
4724   // Finally, don't try to promote any allocas that now require re-splitting.
4725   // They have already been added to the worklist above.
4726   llvm::erase_if(PromotableAllocas, [&](AllocaInst *AI) {
4727     return ResplitPromotableAllocas.count(AI);
4728   });
4729
4730   return true;
4731 }
4732
4733 /// Rewrite an alloca partition's users.
4734 ///
4735 /// This routine drives both of the rewriting goals of the SROA pass. It tries
4736 /// to rewrite uses of an alloca partition to be conducive to SSA value
4737 /// promotion. If the partition needs a new, more refined alloca, this will
4738 /// build that new alloca, preserving as much type information as possible, and
4739 /// rewrite the uses of the old alloca to point at the new one and have the
4740 /// appropriate new offsets. It also evaluates how successful the rewrite was
4741 /// at enabling promotion and, if it was successful, queues the alloca to be
4742 /// promoted.
4743 AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
4744                                    Partition &P) {
4745   // Try to compute a friendly type for this partition of the alloca. This
4746   // won't always succeed, in which case we fall back to a legal integer type
4747   // or an i8 array of an appropriate size.
4748   Type *SliceTy = nullptr;
4749   VectorType *SliceVecTy = nullptr;
4750   const DataLayout &DL = AI.getModule()->getDataLayout();
4751   std::pair<Type *, IntegerType *> CommonUseTy =
4752       findCommonType(P.begin(), P.end(), P.endOffset());
4753   // Do all uses operate on the same type?
4754   if (CommonUseTy.first)
4755     if (DL.getTypeAllocSize(CommonUseTy.first).getFixedValue() >= P.size()) {
4756       SliceTy = CommonUseTy.first;
4757       SliceVecTy = dyn_cast<VectorType>(SliceTy);
4758     }
4759   // If not, can we find an appropriate subtype in the original allocated type?
4760   if (!SliceTy)
4761     if (Type *TypePartitionTy = getTypePartition(DL, AI.getAllocatedType(),
4762                                                  P.beginOffset(), P.size()))
4763       SliceTy = TypePartitionTy;
4764
4765   // If still not, can we use the largest bitwidth integer type used?
4766 if (!SliceTy && CommonUseTy.second) 4767 if (DL.getTypeAllocSize(CommonUseTy.second).getFixedValue() >= P.size()) { 4768 SliceTy = CommonUseTy.second; 4769 SliceVecTy = dyn_cast<VectorType>(SliceTy); 4770 } 4771 if ((!SliceTy || (SliceTy->isArrayTy() && 4772 SliceTy->getArrayElementType()->isIntegerTy())) && 4773 DL.isLegalInteger(P.size() * 8)) { 4774 SliceTy = Type::getIntNTy(*C, P.size() * 8); 4775 } 4776 4777 // If the common use types are not viable for promotion then attempt to find 4778 // another type that is viable. 4779 if (SliceVecTy && !checkVectorTypeForPromotion(P, SliceVecTy, DL)) 4780 if (Type *TypePartitionTy = getTypePartition(DL, AI.getAllocatedType(), 4781 P.beginOffset(), P.size())) { 4782 VectorType *TypePartitionVecTy = dyn_cast<VectorType>(TypePartitionTy); 4783 if (TypePartitionVecTy && 4784 checkVectorTypeForPromotion(P, TypePartitionVecTy, DL)) 4785 SliceTy = TypePartitionTy; 4786 } 4787 4788 if (!SliceTy) 4789 SliceTy = ArrayType::get(Type::getInt8Ty(*C), P.size()); 4790 assert(DL.getTypeAllocSize(SliceTy).getFixedValue() >= P.size()); 4791 4792 bool IsIntegerPromotable = isIntegerWideningViable(P, SliceTy, DL); 4793 4794 VectorType *VecTy = 4795 IsIntegerPromotable ? nullptr : isVectorPromotionViable(P, DL); 4796 if (VecTy) 4797 SliceTy = VecTy; 4798 4799 // Check for the case where we're going to rewrite to a new alloca of the 4800 // exact same type as the original, and with the same access offsets. In that 4801 // case, re-use the existing alloca, but still run through the rewriter to 4802 // perform phi and select speculation. 4803 // P.beginOffset() can be non-zero even with the same type in a case with 4804 // out-of-bounds access (e.g. @PR35657 function in SROA/basictest.ll). 4805 AllocaInst *NewAI; 4806 if (SliceTy == AI.getAllocatedType() && P.beginOffset() == 0) { 4807 NewAI = &AI; 4808 // FIXME: We should be able to bail at this point with "nothing changed". 4809 // FIXME: We might want to defer PHI speculation until after here. 4810 // FIXME: return nullptr; 4811 } else { 4812 // Make sure the alignment is compatible with P.beginOffset(). 4813 const Align Alignment = commonAlignment(AI.getAlign(), P.beginOffset()); 4814 // If we will get at least this much alignment from the type alone, leave 4815 // the alloca's alignment unconstrained. 4816 const bool IsUnconstrained = Alignment <= DL.getABITypeAlign(SliceTy); 4817 NewAI = new AllocaInst( 4818 SliceTy, AI.getAddressSpace(), nullptr, 4819 IsUnconstrained ? DL.getPrefTypeAlign(SliceTy) : Alignment, 4820 AI.getName() + ".sroa." + Twine(P.begin() - AS.begin()), &AI); 4821 // Copy the old AI debug location over to the new one. 4822 NewAI->setDebugLoc(AI.getDebugLoc()); 4823 ++NumNewAllocas; 4824 } 4825 4826 LLVM_DEBUG(dbgs() << "Rewriting alloca partition " 4827 << "[" << P.beginOffset() << "," << P.endOffset() 4828 << ") to: " << *NewAI << "\n"); 4829 4830 // Track the high watermark on the worklist as it is only relevant for 4831 // promoted allocas. We will reset it to this point if the alloca is not in 4832 // fact scheduled for promotion. 
4833   unsigned PPWOldSize = PostPromotionWorklist.size();
4834   unsigned NumUses = 0;
4835   SmallSetVector<PHINode *, 8> PHIUsers;
4836   SmallSetVector<SelectInst *, 8> SelectUsers;
4837
4838   AllocaSliceRewriter Rewriter(DL, AS, *this, AI, *NewAI, P.beginOffset(),
4839                                P.endOffset(), IsIntegerPromotable, VecTy,
4840                                PHIUsers, SelectUsers);
4841   bool Promotable = true;
4842   for (Slice *S : P.splitSliceTails()) {
4843     Promotable &= Rewriter.visit(S);
4844     ++NumUses;
4845   }
4846   for (Slice &S : P) {
4847     Promotable &= Rewriter.visit(&S);
4848     ++NumUses;
4849   }
4850
4851   NumAllocaPartitionUses += NumUses;
4852   MaxUsesPerAllocaPartition.updateMax(NumUses);
4853
4854   // Now that we've processed all the slices in the new partition, check if any
4855   // PHIs or Selects would block promotion.
4856   for (PHINode *PHI : PHIUsers)
4857     if (!isSafePHIToSpeculate(*PHI)) {
4858       Promotable = false;
4859       PHIUsers.clear();
4860       SelectUsers.clear();
4861       break;
4862     }
4863
4864   SmallVector<std::pair<SelectInst *, RewriteableMemOps>, 2>
4865       NewSelectsToRewrite;
4866   NewSelectsToRewrite.reserve(SelectUsers.size());
4867   for (SelectInst *Sel : SelectUsers) {
4868     std::optional<RewriteableMemOps> Ops =
4869         isSafeSelectToSpeculate(*Sel, PreserveCFG);
4870     if (!Ops) {
4871       Promotable = false;
4872       PHIUsers.clear();
4873       SelectUsers.clear();
4874       NewSelectsToRewrite.clear();
4875       break;
4876     }
4877     NewSelectsToRewrite.emplace_back(std::make_pair(Sel, *Ops));
4878   }
4879
4880   if (Promotable) {
4881     for (Use *U : AS.getDeadUsesIfPromotable()) {
4882       auto *OldInst = dyn_cast<Instruction>(U->get());
4883       Value::dropDroppableUse(*U);
4884       if (OldInst)
4885         if (isInstructionTriviallyDead(OldInst))
4886           DeadInsts.push_back(OldInst);
4887     }
4888     if (PHIUsers.empty() && SelectUsers.empty()) {
4889       // Promote the alloca.
4890       PromotableAllocas.push_back(NewAI);
4891     } else {
4892       // If we have either PHIs or Selects to speculate, add them to those
4893       // worklists and re-queue the new alloca so that we promote it on the
4894       // next iteration.
4895       for (PHINode *PHIUser : PHIUsers)
4896         SpeculatablePHIs.insert(PHIUser);
4897       SelectsToRewrite.reserve(SelectsToRewrite.size() +
4898                                NewSelectsToRewrite.size());
4899       for (auto &&KV : llvm::make_range(
4900                std::make_move_iterator(NewSelectsToRewrite.begin()),
4901                std::make_move_iterator(NewSelectsToRewrite.end())))
4902         SelectsToRewrite.insert(std::move(KV));
4903       Worklist.insert(NewAI);
4904     }
4905   } else {
4906     // Drop any post-promotion work items if promotion didn't happen.
4907     while (PostPromotionWorklist.size() > PPWOldSize)
4908       PostPromotionWorklist.pop_back();
4909
4910     // We couldn't promote and we didn't create a new partition, nothing
4911     // happened.
4912     if (NewAI == &AI)
4913       return nullptr;
4914
4915     // If we can't promote the alloca, iterate on it to check for new
4916     // refinements exposed by splitting the current alloca. Don't iterate on an
4917     // alloca which didn't actually change and didn't get promoted.
4918     Worklist.insert(NewAI);
4919   }
4920
4921   return NewAI;
4922 }
4923
4924 static void insertNewDbgInst(DIBuilder &DIB, DbgDeclareInst *Orig,
4925                              AllocaInst *NewAddr, DIExpression *NewFragmentExpr,
4926                              Instruction *BeforeInst) {
4927   DIB.insertDeclare(NewAddr, Orig->getVariable(), NewFragmentExpr,
4928                     Orig->getDebugLoc(), BeforeInst);
4929 }
4930 static void insertNewDbgInst(DIBuilder &DIB, DbgAssignIntrinsic *Orig,
4931                              AllocaInst *NewAddr, DIExpression *NewFragmentExpr,
4932                              Instruction *BeforeInst) {
4933   (void)BeforeInst;
4934   if (!NewAddr->hasMetadata(LLVMContext::MD_DIAssignID)) {
4935     NewAddr->setMetadata(LLVMContext::MD_DIAssignID,
4936                          DIAssignID::getDistinct(NewAddr->getContext()));
4937   }
4938   auto *NewAssign = DIB.insertDbgAssign(
4939       NewAddr, Orig->getValue(), Orig->getVariable(), NewFragmentExpr, NewAddr,
4940       Orig->getAddressExpression(), Orig->getDebugLoc());
4941   LLVM_DEBUG(dbgs() << "Created new assign intrinsic: " << *NewAssign << "\n");
4942   (void)NewAssign;
4943 }
4944 static void insertNewDbgInst(DIBuilder &DIB, DPValue *Orig, AllocaInst *NewAddr,
4945                              DIExpression *NewFragmentExpr,
4946                              Instruction *BeforeInst) {
4947   (void)DIB;
4948   if (Orig->isDbgDeclare()) {
4949     DPValue *DPV = DPValue::createDPVDeclare(
4950         NewAddr, Orig->getVariable(), NewFragmentExpr, Orig->getDebugLoc());
4951     BeforeInst->getParent()->insertDPValueBefore(DPV,
4952                                                  BeforeInst->getIterator());
4953     return;
4954   }
4955   if (!NewAddr->hasMetadata(LLVMContext::MD_DIAssignID)) {
4956     NewAddr->setMetadata(LLVMContext::MD_DIAssignID,
4957                          DIAssignID::getDistinct(NewAddr->getContext()));
4958   }
4959   auto *NewAssign = DPValue::createLinkedDPVAssign(
4960       NewAddr, Orig->getValue(), Orig->getVariable(), NewFragmentExpr, NewAddr,
4961       Orig->getAddressExpression(), Orig->getDebugLoc());
4962   LLVM_DEBUG(dbgs() << "Created new DPVAssign: " << *NewAssign << "\n");
4963   (void)NewAssign;
4964 }
4965
4966 /// Walks the slices of an alloca and forms partitions based on them,
4967 /// rewriting each of their uses.
4968 bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) {
4969   if (AS.begin() == AS.end())
4970     return false;
4971
4972   unsigned NumPartitions = 0;
4973   bool Changed = false;
4974   const DataLayout &DL = AI.getModule()->getDataLayout();
4975
4976   // First try to pre-split loads and stores.
4977   Changed |= presplitLoadsAndStores(AI, AS);
4978
4979   // Now that we have identified any pre-splitting opportunities,
4980   // mark loads and stores unsplittable except for the following case.
4981   // We leave a slice splittable if all other slices are disjoint or fully
4982   // included in the slice, such as whole-alloca loads and stores.
4983   // If we fail to split these during pre-splitting, we want to force them
4984   // to be rewritten into a partition.
4985   bool IsSorted = true;
4986
4987   uint64_t AllocaSize =
4988       DL.getTypeAllocSize(AI.getAllocatedType()).getFixedValue();
4989   const uint64_t MaxBitVectorSize = 1024;
4990   if (AllocaSize <= MaxBitVectorSize) {
4991     // If a byte boundary is included in any load or store, a slice starting or
4992     // ending at the boundary is not splittable.
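    // Worked example (hypothetical sizes): for a 12-byte alloca containing a
    // load of bytes [0, 4), offsets 1 through 3 are cleared below. A
    // splittable whole-alloca access over [0, 12) stays splittable because its
    // endpoints 0 and 12 are never cleared, while a splittable load over
    // [2, 6) is made unsplittable because it begins at a cleared offset.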
4993 SmallBitVector SplittableOffset(AllocaSize + 1, true); 4994 for (Slice &S : AS) 4995 for (unsigned O = S.beginOffset() + 1; 4996 O < S.endOffset() && O < AllocaSize; O++) 4997 SplittableOffset.reset(O); 4998 4999 for (Slice &S : AS) { 5000 if (!S.isSplittable()) 5001 continue; 5002 5003 if ((S.beginOffset() > AllocaSize || SplittableOffset[S.beginOffset()]) && 5004 (S.endOffset() > AllocaSize || SplittableOffset[S.endOffset()])) 5005 continue; 5006 5007 if (isa<LoadInst>(S.getUse()->getUser()) || 5008 isa<StoreInst>(S.getUse()->getUser())) { 5009 S.makeUnsplittable(); 5010 IsSorted = false; 5011 } 5012 } 5013 } 5014 else { 5015 // We only allow whole-alloca splittable loads and stores 5016 // for a large alloca to avoid creating too large BitVector. 5017 for (Slice &S : AS) { 5018 if (!S.isSplittable()) 5019 continue; 5020 5021 if (S.beginOffset() == 0 && S.endOffset() >= AllocaSize) 5022 continue; 5023 5024 if (isa<LoadInst>(S.getUse()->getUser()) || 5025 isa<StoreInst>(S.getUse()->getUser())) { 5026 S.makeUnsplittable(); 5027 IsSorted = false; 5028 } 5029 } 5030 } 5031 5032 if (!IsSorted) 5033 llvm::sort(AS); 5034 5035 /// Describes the allocas introduced by rewritePartition in order to migrate 5036 /// the debug info. 5037 struct Fragment { 5038 AllocaInst *Alloca; 5039 uint64_t Offset; 5040 uint64_t Size; 5041 Fragment(AllocaInst *AI, uint64_t O, uint64_t S) 5042 : Alloca(AI), Offset(O), Size(S) {} 5043 }; 5044 SmallVector<Fragment, 4> Fragments; 5045 5046 // Rewrite each partition. 5047 for (auto &P : AS.partitions()) { 5048 if (AllocaInst *NewAI = rewritePartition(AI, AS, P)) { 5049 Changed = true; 5050 if (NewAI != &AI) { 5051 uint64_t SizeOfByte = 8; 5052 uint64_t AllocaSize = 5053 DL.getTypeSizeInBits(NewAI->getAllocatedType()).getFixedValue(); 5054 // Don't include any padding. 5055 uint64_t Size = std::min(AllocaSize, P.size() * SizeOfByte); 5056 Fragments.push_back(Fragment(NewAI, P.beginOffset() * SizeOfByte, Size)); 5057 } 5058 } 5059 ++NumPartitions; 5060 } 5061 5062 NumAllocaPartitions += NumPartitions; 5063 MaxPartitionsPerAlloca.updateMax(NumPartitions); 5064 5065 // Migrate debug information from the old alloca to the new alloca(s) 5066 // and the individual partitions. 5067 auto MigrateOne = [&](auto *DbgVariable) { 5068 auto *Expr = DbgVariable->getExpression(); 5069 DIBuilder DIB(*AI.getModule(), /*AllowUnresolved*/ false); 5070 uint64_t AllocaSize = 5071 DL.getTypeSizeInBits(AI.getAllocatedType()).getFixedValue(); 5072 for (auto Fragment : Fragments) { 5073 // Create a fragment expression describing the new partition or reuse AI's 5074 // expression if there is only one partition. 5075 auto *FragmentExpr = Expr; 5076 if (Fragment.Size < AllocaSize || Expr->isFragment()) { 5077 // If this alloca is already a scalar replacement of a larger aggregate, 5078 // Fragment.Offset describes the offset inside the scalar. 5079 auto ExprFragment = Expr->getFragmentInfo(); 5080 uint64_t Offset = ExprFragment ? ExprFragment->OffsetInBits : 0; 5081 uint64_t Start = Offset + Fragment.Offset; 5082 uint64_t Size = Fragment.Size; 5083 if (ExprFragment) { 5084 uint64_t AbsEnd = 5085 ExprFragment->OffsetInBits + ExprFragment->SizeInBits; 5086 if (Start >= AbsEnd) { 5087 // No need to describe a SROAed padding. 5088 continue; 5089 } 5090 Size = std::min(Size, AbsEnd - Start); 5091 } 5092 // The new, smaller fragment is stenciled out from the old fragment. 
5093         if (auto OrigFragment = FragmentExpr->getFragmentInfo()) {
5094           assert(Start >= OrigFragment->OffsetInBits &&
5095                  "new fragment is outside of original fragment");
5096           Start -= OrigFragment->OffsetInBits;
5097         }
5098
5099         // The alloca may be larger than the variable.
5100         auto VarSize = DbgVariable->getVariable()->getSizeInBits();
5101         if (VarSize) {
5102           if (Size > *VarSize)
5103             Size = *VarSize;
5104           if (Size == 0 || Start + Size > *VarSize)
5105             continue;
5106         }
5107
5108         // Avoid creating a fragment expression that covers the entire variable.
5109         if (!VarSize || *VarSize != Size) {
5110           if (auto E =
5111                   DIExpression::createFragmentExpression(Expr, Start, Size))
5112             FragmentExpr = *E;
5113           else
5114             continue;
5115         }
5116       }
5117
5118       // Remove any existing intrinsics on the new alloca describing
5119       // the variable fragment.
5120       auto RemoveOne = [DbgVariable](auto *OldDII) {
5121         auto SameVariableFragment = [](const auto *LHS, const auto *RHS) {
5122           return LHS->getVariable() == RHS->getVariable() &&
5123                  LHS->getDebugLoc()->getInlinedAt() ==
5124                      RHS->getDebugLoc()->getInlinedAt();
5125         };
5126         if (SameVariableFragment(OldDII, DbgVariable))
5127           OldDII->eraseFromParent();
5128       };
5129       for_each(findDbgDeclares(Fragment.Alloca), RemoveOne);
5130       for_each(findDPVDeclares(Fragment.Alloca), RemoveOne);
5131
5132       insertNewDbgInst(DIB, DbgVariable, Fragment.Alloca, FragmentExpr, &AI);
5133     }
5134   };
5135
5136   // Migrate debug information from the old alloca to the new alloca(s)
5137   // and the individual partitions.
5138   for_each(findDbgDeclares(&AI), MigrateOne);
5139   for_each(findDPVDeclares(&AI), MigrateOne);
5140   for_each(at::getAssignmentMarkers(&AI), MigrateOne);
5141   for_each(at::getDPVAssignmentMarkers(&AI), MigrateOne);
5142
5143   return Changed;
5144 }
5145
5146 /// Clobber a use with poison, deleting the used value if it becomes dead.
5147 void SROA::clobberUse(Use &U) {
5148   Value *OldV = U;
5149   // Replace the use with a poison value.
5150   U = PoisonValue::get(OldV->getType());
5151
5152   // Check for this making an instruction dead. We have to garbage collect
5153   // all the dead instructions to ensure the uses of any alloca end up being
5154   // minimal.
5155   if (Instruction *OldI = dyn_cast<Instruction>(OldV))
5156     if (isInstructionTriviallyDead(OldI)) {
5157       DeadInsts.push_back(OldI);
5158     }
5159 }
5160
5161 /// Analyze an alloca for SROA.
5162 ///
5163 /// This analyzes the alloca to ensure we can reason about it, builds
5164 /// the slices of the alloca, and then hands it off to be split and
5165 /// rewritten as needed.
5166 std::pair<bool /*Changed*/, bool /*CFGChanged*/>
5167 SROA::runOnAlloca(AllocaInst &AI) {
5168   bool Changed = false;
5169   bool CFGChanged = false;
5170
5171   LLVM_DEBUG(dbgs() << "SROA alloca: " << AI << "\n");
5172   ++NumAllocasAnalyzed;
5173
5174   // Special case dead allocas, as they're trivial.
5175   if (AI.use_empty()) {
5176     AI.eraseFromParent();
5177     Changed = true;
5178     return {Changed, CFGChanged};
5179   }
5180   const DataLayout &DL = AI.getModule()->getDataLayout();
5181
5182   // Skip alloca forms that this analysis can't handle.
5183   auto *AT = AI.getAllocatedType();
5184   TypeSize Size = DL.getTypeAllocSize(AT);
5185   if (AI.isArrayAllocation() || !AT->isSized() || Size.isScalable() ||
5186       Size.getFixedValue() == 0)
5187     return {Changed, CFGChanged};
5188
5189   // First, split any FCA loads and stores touching this alloca to promote
5190   // better splitting and promotion opportunities.
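  // Rough illustration (hypothetical IR): an aggregate access such as
  //   %agg = load { i32, float }, ptr %ai
  // is rewritten by the rewriter below into per-field loads through struct
  // GEPs that are then recombined with insertvalue, giving the slice builder
  // plain scalar accesses to reason about.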
5191 IRBuilderTy IRB(&AI); 5192 AggLoadStoreRewriter AggRewriter(DL, IRB); 5193 Changed |= AggRewriter.rewrite(AI); 5194 5195 // Build the slices using a recursive instruction-visiting builder. 5196 AllocaSlices AS(DL, AI); 5197 LLVM_DEBUG(AS.print(dbgs())); 5198 if (AS.isEscaped()) 5199 return {Changed, CFGChanged}; 5200 5201 // Delete all the dead users of this alloca before splitting and rewriting it. 5202 for (Instruction *DeadUser : AS.getDeadUsers()) { 5203 // Free up everything used by this instruction. 5204 for (Use &DeadOp : DeadUser->operands()) 5205 clobberUse(DeadOp); 5206 5207 // Now replace the uses of this instruction. 5208 DeadUser->replaceAllUsesWith(PoisonValue::get(DeadUser->getType())); 5209 5210 // And mark it for deletion. 5211 DeadInsts.push_back(DeadUser); 5212 Changed = true; 5213 } 5214 for (Use *DeadOp : AS.getDeadOperands()) { 5215 clobberUse(*DeadOp); 5216 Changed = true; 5217 } 5218 5219 // No slices to split. Leave the dead alloca for a later pass to clean up. 5220 if (AS.begin() == AS.end()) 5221 return {Changed, CFGChanged}; 5222 5223 Changed |= splitAlloca(AI, AS); 5224 5225 LLVM_DEBUG(dbgs() << " Speculating PHIs\n"); 5226 while (!SpeculatablePHIs.empty()) 5227 speculatePHINodeLoads(IRB, *SpeculatablePHIs.pop_back_val()); 5228 5229 LLVM_DEBUG(dbgs() << " Rewriting Selects\n"); 5230 auto RemainingSelectsToRewrite = SelectsToRewrite.takeVector(); 5231 while (!RemainingSelectsToRewrite.empty()) { 5232 const auto [K, V] = RemainingSelectsToRewrite.pop_back_val(); 5233 CFGChanged |= 5234 rewriteSelectInstMemOps(*K, V, IRB, PreserveCFG ? nullptr : DTU); 5235 } 5236 5237 return {Changed, CFGChanged}; 5238 } 5239 5240 /// Delete the dead instructions accumulated in this run. 5241 /// 5242 /// Recursively deletes the dead instructions we've accumulated. This is done 5243 /// at the very end to maximize locality of the recursive delete and to 5244 /// minimize the problems of invalidated instruction pointers as such pointers 5245 /// are used heavily in the intermediate stages of the algorithm. 5246 /// 5247 /// We also record the alloca instructions deleted here so that they aren't 5248 /// subsequently handed to mem2reg to promote. 5249 bool SROA::deleteDeadInstructions( 5250 SmallPtrSetImpl<AllocaInst *> &DeletedAllocas) { 5251 bool Changed = false; 5252 while (!DeadInsts.empty()) { 5253 Instruction *I = dyn_cast_or_null<Instruction>(DeadInsts.pop_back_val()); 5254 if (!I) 5255 continue; 5256 LLVM_DEBUG(dbgs() << "Deleting dead instruction: " << *I << "\n"); 5257 5258 // If the instruction is an alloca, find the possible dbg.declare connected 5259 // to it, and remove it too. We must do this before calling RAUW or we will 5260 // not be able to find it. 5261 if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) { 5262 DeletedAllocas.insert(AI); 5263 for (DbgDeclareInst *OldDII : findDbgDeclares(AI)) 5264 OldDII->eraseFromParent(); 5265 for (DPValue *OldDII : findDPVDeclares(AI)) 5266 OldDII->eraseFromParent(); 5267 } 5268 5269 at::deleteAssignmentMarkers(I); 5270 I->replaceAllUsesWith(UndefValue::get(I->getType())); 5271 5272 for (Use &Operand : I->operands()) 5273 if (Instruction *U = dyn_cast<Instruction>(Operand)) { 5274 // Zero out the operand and see if it becomes trivially dead. 5275 Operand = nullptr; 5276 if (isInstructionTriviallyDead(U)) 5277 DeadInsts.push_back(U); 5278 } 5279 5280 ++NumDeleted; 5281 I->eraseFromParent(); 5282 Changed = true; 5283 } 5284 return Changed; 5285 } 5286 5287 /// Promote the allocas, using the best available technique. 
5288 /// 5289 /// This attempts to promote whatever allocas have been identified as viable in 5290 /// the PromotableAllocas list. If that list is empty, there is nothing to do. 5291 /// This function returns whether any promotion occurred. 5292 bool SROA::promoteAllocas(Function &F) { 5293 if (PromotableAllocas.empty()) 5294 return false; 5295 5296 NumPromoted += PromotableAllocas.size(); 5297 5298 if (SROASkipMem2Reg) { 5299 LLVM_DEBUG(dbgs() << "Not promoting allocas with mem2reg!\n"); 5300 } else { 5301 LLVM_DEBUG(dbgs() << "Promoting allocas with mem2reg...\n"); 5302 PromoteMemToReg(PromotableAllocas, DTU->getDomTree(), AC); 5303 } 5304 5305 PromotableAllocas.clear(); 5306 return true; 5307 } 5308 5309 std::pair<bool /*Changed*/, bool /*CFGChanged*/> SROA::runSROA(Function &F) { 5310 LLVM_DEBUG(dbgs() << "SROA function: " << F.getName() << "\n"); 5311 5312 const DataLayout &DL = F.getParent()->getDataLayout(); 5313 BasicBlock &EntryBB = F.getEntryBlock(); 5314 for (BasicBlock::iterator I = EntryBB.begin(), E = std::prev(EntryBB.end()); 5315 I != E; ++I) { 5316 if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) { 5317 if (DL.getTypeAllocSize(AI->getAllocatedType()).isScalable() && 5318 isAllocaPromotable(AI)) 5319 PromotableAllocas.push_back(AI); 5320 else 5321 Worklist.insert(AI); 5322 } 5323 } 5324 5325 bool Changed = false; 5326 bool CFGChanged = false; 5327 // A set of deleted alloca instruction pointers which should be removed from 5328 // the list of promotable allocas. 5329 SmallPtrSet<AllocaInst *, 4> DeletedAllocas; 5330 5331 do { 5332 while (!Worklist.empty()) { 5333 auto [IterationChanged, IterationCFGChanged] = 5334 runOnAlloca(*Worklist.pop_back_val()); 5335 Changed |= IterationChanged; 5336 CFGChanged |= IterationCFGChanged; 5337 5338 Changed |= deleteDeadInstructions(DeletedAllocas); 5339 5340 // Remove the deleted allocas from various lists so that we don't try to 5341 // continue processing them. 
5342 if (!DeletedAllocas.empty()) { 5343 auto IsInSet = [&](AllocaInst *AI) { return DeletedAllocas.count(AI); }; 5344 Worklist.remove_if(IsInSet); 5345 PostPromotionWorklist.remove_if(IsInSet); 5346 llvm::erase_if(PromotableAllocas, IsInSet); 5347 DeletedAllocas.clear(); 5348 } 5349 } 5350 5351 Changed |= promoteAllocas(F); 5352 5353 Worklist = PostPromotionWorklist; 5354 PostPromotionWorklist.clear(); 5355 } while (!Worklist.empty()); 5356 5357 assert((!CFGChanged || Changed) && "Can not only modify the CFG."); 5358 assert((!CFGChanged || !PreserveCFG) && 5359 "Should not have modified the CFG when told to preserve it."); 5360 5361 if (Changed && isAssignmentTrackingEnabled(*F.getParent())) { 5362 for (auto &BB : F) { 5363 RemoveRedundantDbgInstrs(&BB); 5364 } 5365 } 5366 5367 return {Changed, CFGChanged}; 5368 } 5369 5370 PreservedAnalyses SROAPass::run(Function &F, FunctionAnalysisManager &AM) { 5371 DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(F); 5372 AssumptionCache &AC = AM.getResult<AssumptionAnalysis>(F); 5373 DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy); 5374 auto [Changed, CFGChanged] = 5375 SROA(&F.getContext(), &DTU, &AC, PreserveCFG).runSROA(F); 5376 if (!Changed) 5377 return PreservedAnalyses::all(); 5378 PreservedAnalyses PA; 5379 if (!CFGChanged) 5380 PA.preserveSet<CFGAnalyses>(); 5381 PA.preserve<DominatorTreeAnalysis>(); 5382 return PA; 5383 } 5384 5385 void SROAPass::printPipeline( 5386 raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) { 5387 static_cast<PassInfoMixin<SROAPass> *>(this)->printPipeline( 5388 OS, MapClassName2PassName); 5389 OS << (PreserveCFG == SROAOptions::PreserveCFG ? "<preserve-cfg>" 5390 : "<modify-cfg>"); 5391 } 5392 5393 SROAPass::SROAPass(SROAOptions PreserveCFG) : PreserveCFG(PreserveCFG) {} 5394 5395 namespace { 5396 5397 /// A legacy pass for the legacy pass manager that wraps the \c SROA pass. 5398 class SROALegacyPass : public FunctionPass { 5399 SROAOptions PreserveCFG; 5400 5401 public: 5402 static char ID; 5403 5404 SROALegacyPass(SROAOptions PreserveCFG = SROAOptions::PreserveCFG) 5405 : FunctionPass(ID), PreserveCFG(PreserveCFG) { 5406 initializeSROALegacyPassPass(*PassRegistry::getPassRegistry()); 5407 } 5408 5409 bool runOnFunction(Function &F) override { 5410 if (skipFunction(F)) 5411 return false; 5412 5413 DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); 5414 AssumptionCache &AC = 5415 getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); 5416 DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy); 5417 auto [Changed, _] = 5418 SROA(&F.getContext(), &DTU, &AC, PreserveCFG).runSROA(F); 5419 return Changed; 5420 } 5421 5422 void getAnalysisUsage(AnalysisUsage &AU) const override { 5423 AU.addRequired<AssumptionCacheTracker>(); 5424 AU.addRequired<DominatorTreeWrapperPass>(); 5425 AU.addPreserved<GlobalsAAWrapperPass>(); 5426 AU.addPreserved<DominatorTreeWrapperPass>(); 5427 } 5428 5429 StringRef getPassName() const override { return "SROA"; } 5430 }; 5431 5432 } // end anonymous namespace 5433 5434 char SROALegacyPass::ID = 0; 5435 5436 FunctionPass *llvm::createSROAPass(bool PreserveCFG) { 5437 return new SROALegacyPass(PreserveCFG ? 
SROAOptions::PreserveCFG 5438 : SROAOptions::ModifyCFG); 5439 } 5440 5441 INITIALIZE_PASS_BEGIN(SROALegacyPass, "sroa", 5442 "Scalar Replacement Of Aggregates", false, false) 5443 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) 5444 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) 5445 INITIALIZE_PASS_END(SROALegacyPass, "sroa", "Scalar Replacement Of Aggregates", 5446 false, false) 5447