1 //===- SROA.cpp - Scalar Replacement Of Aggregates ------------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 /// \file 9 /// This transformation implements the well known scalar replacement of 10 /// aggregates transformation. It tries to identify promotable elements of an 11 /// aggregate alloca, and promote them to registers. It will also try to 12 /// convert uses of an element (or set of elements) of an alloca into a vector 13 /// or bitfield-style integer scalar if appropriate. 14 /// 15 /// It works to do this with minimal slicing of the alloca so that regions 16 /// which are merely transferred in and out of external memory remain unchanged 17 /// and are not decomposed to scalar code. 18 /// 19 /// Because this also performs alloca promotion, it can be thought of as also 20 /// serving the purpose of SSA formation. The algorithm iterates on the 21 /// function until all opportunities for promotion have been realized. 22 /// 23 //===----------------------------------------------------------------------===// 24 25 #include "llvm/Transforms/Scalar/SROA.h" 26 #include "llvm/ADT/APInt.h" 27 #include "llvm/ADT/ArrayRef.h" 28 #include "llvm/ADT/DenseMap.h" 29 #include "llvm/ADT/PointerIntPair.h" 30 #include "llvm/ADT/STLExtras.h" 31 #include "llvm/ADT/SetVector.h" 32 #include "llvm/ADT/SmallBitVector.h" 33 #include "llvm/ADT/SmallPtrSet.h" 34 #include "llvm/ADT/SmallVector.h" 35 #include "llvm/ADT/Statistic.h" 36 #include "llvm/ADT/StringRef.h" 37 #include "llvm/ADT/Twine.h" 38 #include "llvm/ADT/iterator.h" 39 #include "llvm/ADT/iterator_range.h" 40 #include "llvm/Analysis/AssumptionCache.h" 41 #include "llvm/Analysis/DomTreeUpdater.h" 42 #include "llvm/Analysis/GlobalsModRef.h" 43 #include "llvm/Analysis/Loads.h" 44 #include "llvm/Analysis/PtrUseVisitor.h" 45 #include "llvm/Config/llvm-config.h" 46 #include "llvm/IR/BasicBlock.h" 47 #include "llvm/IR/Constant.h" 48 #include "llvm/IR/ConstantFolder.h" 49 #include "llvm/IR/Constants.h" 50 #include "llvm/IR/DIBuilder.h" 51 #include "llvm/IR/DataLayout.h" 52 #include "llvm/IR/DebugInfo.h" 53 #include "llvm/IR/DebugInfoMetadata.h" 54 #include "llvm/IR/DerivedTypes.h" 55 #include "llvm/IR/Dominators.h" 56 #include "llvm/IR/Function.h" 57 #include "llvm/IR/GetElementPtrTypeIterator.h" 58 #include "llvm/IR/GlobalAlias.h" 59 #include "llvm/IR/IRBuilder.h" 60 #include "llvm/IR/InstVisitor.h" 61 #include "llvm/IR/Instruction.h" 62 #include "llvm/IR/Instructions.h" 63 #include "llvm/IR/IntrinsicInst.h" 64 #include "llvm/IR/LLVMContext.h" 65 #include "llvm/IR/Metadata.h" 66 #include "llvm/IR/Module.h" 67 #include "llvm/IR/Operator.h" 68 #include "llvm/IR/PassManager.h" 69 #include "llvm/IR/Type.h" 70 #include "llvm/IR/Use.h" 71 #include "llvm/IR/User.h" 72 #include "llvm/IR/Value.h" 73 #include "llvm/InitializePasses.h" 74 #include "llvm/Pass.h" 75 #include "llvm/Support/Casting.h" 76 #include "llvm/Support/CommandLine.h" 77 #include "llvm/Support/Compiler.h" 78 #include "llvm/Support/Debug.h" 79 #include "llvm/Support/ErrorHandling.h" 80 #include "llvm/Support/raw_ostream.h" 81 #include "llvm/Transforms/Scalar.h" 82 #include "llvm/Transforms/Utils/BasicBlockUtils.h" 83 #include "llvm/Transforms/Utils/Local.h" 84 #include "llvm/Transforms/Utils/PromoteMemToReg.h" 85 #include 
<algorithm> 86 #include <cassert> 87 #include <cstddef> 88 #include <cstdint> 89 #include <cstring> 90 #include <iterator> 91 #include <string> 92 #include <tuple> 93 #include <utility> 94 #include <vector> 95 96 using namespace llvm; 97 using namespace llvm::sroa; 98 99 #define DEBUG_TYPE "sroa" 100 101 STATISTIC(NumAllocasAnalyzed, "Number of allocas analyzed for replacement"); 102 STATISTIC(NumAllocaPartitions, "Number of alloca partitions formed"); 103 STATISTIC(MaxPartitionsPerAlloca, "Maximum number of partitions per alloca"); 104 STATISTIC(NumAllocaPartitionUses, "Number of alloca partition uses rewritten"); 105 STATISTIC(MaxUsesPerAllocaPartition, "Maximum number of uses of a partition"); 106 STATISTIC(NumNewAllocas, "Number of new, smaller allocas introduced"); 107 STATISTIC(NumPromoted, "Number of allocas promoted to SSA values"); 108 STATISTIC(NumLoadsSpeculated, "Number of loads speculated to allow promotion"); 109 STATISTIC(NumLoadsPredicated, 110 "Number of loads rewritten into predicated loads to allow promotion"); 111 STATISTIC( 112 NumStoresPredicated, 113 "Number of stores rewritten into predicated loads to allow promotion"); 114 STATISTIC(NumDeleted, "Number of instructions deleted"); 115 STATISTIC(NumVectorized, "Number of vectorized aggregates"); 116 117 /// Hidden option to experiment with completely strict handling of inbounds 118 /// GEPs. 119 static cl::opt<bool> SROAStrictInbounds("sroa-strict-inbounds", cl::init(false), 120 cl::Hidden); 121 namespace { 122 /// Find linked dbg.assign and generate a new one with the correct 123 /// FragmentInfo. Link Inst to the new dbg.assign. If Value is nullptr the 124 /// value component is copied from the old dbg.assign to the new. 125 /// \param OldAlloca Alloca for the variable before splitting. 126 /// \param RelativeOffsetInBits Offset into \p OldAlloca relative to the 127 /// offset prior to splitting (change in offset). 128 /// \param SliceSizeInBits New number of bits being written to. 129 /// \param OldInst Instruction that is being split. 130 /// \param Inst New instruction performing this part of the 131 /// split store. 132 /// \param Dest Store destination. 133 /// \param Value Stored value. 134 /// \param DL Datalayout. 135 static void migrateDebugInfo(AllocaInst *OldAlloca, 136 uint64_t RelativeOffsetInBits, 137 uint64_t SliceSizeInBits, Instruction *OldInst, 138 Instruction *Inst, Value *Dest, Value *Value, 139 const DataLayout &DL) { 140 auto MarkerRange = at::getAssignmentMarkers(OldInst); 141 // Nothing to do if OldInst has no linked dbg.assign intrinsics. 142 if (MarkerRange.empty()) 143 return; 144 145 LLVM_DEBUG(dbgs() << " migrateDebugInfo\n"); 146 LLVM_DEBUG(dbgs() << " OldAlloca: " << *OldAlloca << "\n"); 147 LLVM_DEBUG(dbgs() << " RelativeOffset: " << RelativeOffsetInBits << "\n"); 148 LLVM_DEBUG(dbgs() << " SliceSizeInBits: " << SliceSizeInBits << "\n"); 149 LLVM_DEBUG(dbgs() << " OldInst: " << *OldInst << "\n"); 150 LLVM_DEBUG(dbgs() << " Inst: " << *Inst << "\n"); 151 LLVM_DEBUG(dbgs() << " Dest: " << *Dest << "\n"); 152 if (Value) 153 LLVM_DEBUG(dbgs() << " Value: " << *Value << "\n"); 154 155 // The new inst needs a DIAssignID unique metadata tag (if OldInst has 156 // one). It shouldn't already have one: assert this assumption. 
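  // For illustration only (a hypothetical case, not taken from a test): if a
  // store to an i64 variable is split into two i32 stores, each new store gets
  // its own distinct DIAssignID, and its migrated dbg.assign gains a
  // DW_OP_LLVM_fragment(0, 32) or DW_OP_LLVM_fragment(32, 32) expression
  // respectively.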
  assert(!Inst->getMetadata(LLVMContext::MD_DIAssignID));
  DIAssignID *NewID = nullptr;
  auto &Ctx = Inst->getContext();
  DIBuilder DIB(*OldInst->getModule(), /*AllowUnresolved*/ false);
  uint64_t AllocaSizeInBits = *OldAlloca->getAllocationSizeInBits(DL);
  assert(OldAlloca->isStaticAlloca());

  for (DbgAssignIntrinsic *DbgAssign : MarkerRange) {
    LLVM_DEBUG(dbgs() << " existing dbg.assign is: " << *DbgAssign
                      << "\n");
    auto *Expr = DbgAssign->getExpression();

    // Check if the dbg.assign already describes a fragment.
    auto GetCurrentFragSize = [AllocaSizeInBits, DbgAssign,
                               Expr]() -> uint64_t {
      if (auto FI = Expr->getFragmentInfo())
        return FI->SizeInBits;
      if (auto VarSize = DbgAssign->getVariable()->getSizeInBits())
        return *VarSize;
      // The variable type has an unspecified size. This can happen in the
      // case of DW_TAG_unspecified_type types, e.g. std::nullptr_t. Because
      // there is no fragment and we do not know the size of the variable type,
      // we'll guess by looking at the alloca.
      return AllocaSizeInBits;
    };
    uint64_t CurrentFragSize = GetCurrentFragSize();
    bool MakeNewFragment = CurrentFragSize != SliceSizeInBits;
    assert(MakeNewFragment || RelativeOffsetInBits == 0);

    assert(SliceSizeInBits <= AllocaSizeInBits);
    if (MakeNewFragment) {
      assert(RelativeOffsetInBits + SliceSizeInBits <= CurrentFragSize);
      auto E = DIExpression::createFragmentExpression(
          Expr, RelativeOffsetInBits, SliceSizeInBits);
      assert(E && "Failed to create fragment expr!");
      Expr = *E;
    }

    // If we haven't created a DIAssignID yet, do that now and attach it to
    // Inst.
    if (!NewID) {
      NewID = DIAssignID::getDistinct(Ctx);
      Inst->setMetadata(LLVMContext::MD_DIAssignID, NewID);
    }

    Value = Value ? Value : DbgAssign->getValue();
    auto *NewAssign = DIB.insertDbgAssign(
        Inst, Value, DbgAssign->getVariable(), Expr, Dest,
        DIExpression::get(Ctx, std::nullopt), DbgAssign->getDebugLoc());

    // We could use more precision here at the cost of some additional (code)
    // complexity - if the original dbg.assign was adjacent to its store, we
    // could position this new dbg.assign adjacent to its store rather than the
    // old dbg.assign. That would result in interleaved dbg.assigns rather than
    // what we get now:
    //   split store !1
    //   split store !2
    //   dbg.assign !1
    //   dbg.assign !2
    // This (current behaviour) results in debug assignments being
    // noted as slightly offset (in code) from the store. In practice this
    // should have little effect on the debugging experience due to the fact
    // that all the split stores should get the same line number.
    NewAssign->moveBefore(DbgAssign);

    NewAssign->setDebugLoc(DbgAssign->getDebugLoc());
    LLVM_DEBUG(dbgs() << "Created new assign intrinsic: " << *NewAssign
                      << "\n");
  }
}

/// A custom IRBuilder inserter which prefixes all names, but only in
/// Assert builds.
class IRBuilderPrefixedInserter final : public IRBuilderDefaultInserter {
  std::string Prefix;

  Twine getNameWithPrefix(const Twine &Name) const {
    return Name.isTriviallyEmpty() ?
Name : Prefix + Name; 234 } 235 236 public: 237 void SetNamePrefix(const Twine &P) { Prefix = P.str(); } 238 239 void InsertHelper(Instruction *I, const Twine &Name, BasicBlock *BB, 240 BasicBlock::iterator InsertPt) const override { 241 IRBuilderDefaultInserter::InsertHelper(I, getNameWithPrefix(Name), BB, 242 InsertPt); 243 } 244 }; 245 246 /// Provide a type for IRBuilder that drops names in release builds. 247 using IRBuilderTy = IRBuilder<ConstantFolder, IRBuilderPrefixedInserter>; 248 249 /// A used slice of an alloca. 250 /// 251 /// This structure represents a slice of an alloca used by some instruction. It 252 /// stores both the begin and end offsets of this use, a pointer to the use 253 /// itself, and a flag indicating whether we can classify the use as splittable 254 /// or not when forming partitions of the alloca. 255 class Slice { 256 /// The beginning offset of the range. 257 uint64_t BeginOffset = 0; 258 259 /// The ending offset, not included in the range. 260 uint64_t EndOffset = 0; 261 262 /// Storage for both the use of this slice and whether it can be 263 /// split. 264 PointerIntPair<Use *, 1, bool> UseAndIsSplittable; 265 266 public: 267 Slice() = default; 268 269 Slice(uint64_t BeginOffset, uint64_t EndOffset, Use *U, bool IsSplittable) 270 : BeginOffset(BeginOffset), EndOffset(EndOffset), 271 UseAndIsSplittable(U, IsSplittable) {} 272 273 uint64_t beginOffset() const { return BeginOffset; } 274 uint64_t endOffset() const { return EndOffset; } 275 276 bool isSplittable() const { return UseAndIsSplittable.getInt(); } 277 void makeUnsplittable() { UseAndIsSplittable.setInt(false); } 278 279 Use *getUse() const { return UseAndIsSplittable.getPointer(); } 280 281 bool isDead() const { return getUse() == nullptr; } 282 void kill() { UseAndIsSplittable.setPointer(nullptr); } 283 284 /// Support for ordering ranges. 285 /// 286 /// This provides an ordering over ranges such that start offsets are 287 /// always increasing, and within equal start offsets, the end offsets are 288 /// decreasing. Thus the spanning range comes first in a cluster with the 289 /// same start position. 290 bool operator<(const Slice &RHS) const { 291 if (beginOffset() < RHS.beginOffset()) 292 return true; 293 if (beginOffset() > RHS.beginOffset()) 294 return false; 295 if (isSplittable() != RHS.isSplittable()) 296 return !isSplittable(); 297 if (endOffset() > RHS.endOffset()) 298 return true; 299 return false; 300 } 301 302 /// Support comparison with a single offset to allow binary searches. 303 friend LLVM_ATTRIBUTE_UNUSED bool operator<(const Slice &LHS, 304 uint64_t RHSOffset) { 305 return LHS.beginOffset() < RHSOffset; 306 } 307 friend LLVM_ATTRIBUTE_UNUSED bool operator<(uint64_t LHSOffset, 308 const Slice &RHS) { 309 return LHSOffset < RHS.beginOffset(); 310 } 311 312 bool operator==(const Slice &RHS) const { 313 return isSplittable() == RHS.isSplittable() && 314 beginOffset() == RHS.beginOffset() && endOffset() == RHS.endOffset(); 315 } 316 bool operator!=(const Slice &RHS) const { return !operator==(RHS); } 317 }; 318 319 } // end anonymous namespace 320 321 /// Representation of the alloca slices. 322 /// 323 /// This class represents the slices of an alloca which are formed by its 324 /// various uses. If a pointer escapes, we can't fully build a representation 325 /// for the slices used and we reflect that in this structure. 
The uses are 326 /// stored, sorted by increasing beginning offset and with unsplittable slices 327 /// starting at a particular offset before splittable slices. 328 class llvm::sroa::AllocaSlices { 329 public: 330 /// Construct the slices of a particular alloca. 331 AllocaSlices(const DataLayout &DL, AllocaInst &AI); 332 333 /// Test whether a pointer to the allocation escapes our analysis. 334 /// 335 /// If this is true, the slices are never fully built and should be 336 /// ignored. 337 bool isEscaped() const { return PointerEscapingInstr; } 338 339 /// Support for iterating over the slices. 340 /// @{ 341 using iterator = SmallVectorImpl<Slice>::iterator; 342 using range = iterator_range<iterator>; 343 344 iterator begin() { return Slices.begin(); } 345 iterator end() { return Slices.end(); } 346 347 using const_iterator = SmallVectorImpl<Slice>::const_iterator; 348 using const_range = iterator_range<const_iterator>; 349 350 const_iterator begin() const { return Slices.begin(); } 351 const_iterator end() const { return Slices.end(); } 352 /// @} 353 354 /// Erase a range of slices. 355 void erase(iterator Start, iterator Stop) { Slices.erase(Start, Stop); } 356 357 /// Insert new slices for this alloca. 358 /// 359 /// This moves the slices into the alloca's slices collection, and re-sorts 360 /// everything so that the usual ordering properties of the alloca's slices 361 /// hold. 362 void insert(ArrayRef<Slice> NewSlices) { 363 int OldSize = Slices.size(); 364 Slices.append(NewSlices.begin(), NewSlices.end()); 365 auto SliceI = Slices.begin() + OldSize; 366 llvm::sort(SliceI, Slices.end()); 367 std::inplace_merge(Slices.begin(), SliceI, Slices.end()); 368 } 369 370 // Forward declare the iterator and range accessor for walking the 371 // partitions. 372 class partition_iterator; 373 iterator_range<partition_iterator> partitions(); 374 375 /// Access the dead users for this alloca. 376 ArrayRef<Instruction *> getDeadUsers() const { return DeadUsers; } 377 378 /// Access Uses that should be dropped if the alloca is promotable. 379 ArrayRef<Use *> getDeadUsesIfPromotable() const { 380 return DeadUseIfPromotable; 381 } 382 383 /// Access the dead operands referring to this alloca. 384 /// 385 /// These are operands which have cannot actually be used to refer to the 386 /// alloca as they are outside its range and the user doesn't correct for 387 /// that. These mostly consist of PHI node inputs and the like which we just 388 /// need to replace with undef. 389 ArrayRef<Use *> getDeadOperands() const { return DeadOperands; } 390 391 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 392 void print(raw_ostream &OS, const_iterator I, StringRef Indent = " ") const; 393 void printSlice(raw_ostream &OS, const_iterator I, 394 StringRef Indent = " ") const; 395 void printUse(raw_ostream &OS, const_iterator I, 396 StringRef Indent = " ") const; 397 void print(raw_ostream &OS) const; 398 void dump(const_iterator I) const; 399 void dump() const; 400 #endif 401 402 private: 403 template <typename DerivedT, typename RetT = void> class BuilderBase; 404 class SliceBuilder; 405 406 friend class AllocaSlices::SliceBuilder; 407 408 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 409 /// Handle to alloca instruction to simplify method interfaces. 410 AllocaInst &AI; 411 #endif 412 413 /// The instruction responsible for this alloca not having a known set 414 /// of slices. 
415 /// 416 /// When an instruction (potentially) escapes the pointer to the alloca, we 417 /// store a pointer to that here and abort trying to form slices of the 418 /// alloca. This will be null if the alloca slices are analyzed successfully. 419 Instruction *PointerEscapingInstr; 420 421 /// The slices of the alloca. 422 /// 423 /// We store a vector of the slices formed by uses of the alloca here. This 424 /// vector is sorted by increasing begin offset, and then the unsplittable 425 /// slices before the splittable ones. See the Slice inner class for more 426 /// details. 427 SmallVector<Slice, 8> Slices; 428 429 /// Instructions which will become dead if we rewrite the alloca. 430 /// 431 /// Note that these are not separated by slice. This is because we expect an 432 /// alloca to be completely rewritten or not rewritten at all. If rewritten, 433 /// all these instructions can simply be removed and replaced with poison as 434 /// they come from outside of the allocated space. 435 SmallVector<Instruction *, 8> DeadUsers; 436 437 /// Uses which will become dead if can promote the alloca. 438 SmallVector<Use *, 8> DeadUseIfPromotable; 439 440 /// Operands which will become dead if we rewrite the alloca. 441 /// 442 /// These are operands that in their particular use can be replaced with 443 /// poison when we rewrite the alloca. These show up in out-of-bounds inputs 444 /// to PHI nodes and the like. They aren't entirely dead (there might be 445 /// a GEP back into the bounds using it elsewhere) and nor is the PHI, but we 446 /// want to swap this particular input for poison to simplify the use lists of 447 /// the alloca. 448 SmallVector<Use *, 8> DeadOperands; 449 }; 450 451 /// A partition of the slices. 452 /// 453 /// An ephemeral representation for a range of slices which can be viewed as 454 /// a partition of the alloca. This range represents a span of the alloca's 455 /// memory which cannot be split, and provides access to all of the slices 456 /// overlapping some part of the partition. 457 /// 458 /// Objects of this type are produced by traversing the alloca's slices, but 459 /// are only ephemeral and not persistent. 460 class llvm::sroa::Partition { 461 private: 462 friend class AllocaSlices; 463 friend class AllocaSlices::partition_iterator; 464 465 using iterator = AllocaSlices::iterator; 466 467 /// The beginning and ending offsets of the alloca for this 468 /// partition. 469 uint64_t BeginOffset = 0, EndOffset = 0; 470 471 /// The start and end iterators of this partition. 472 iterator SI, SJ; 473 474 /// A collection of split slice tails overlapping the partition. 475 SmallVector<Slice *, 4> SplitTails; 476 477 /// Raw constructor builds an empty partition starting and ending at 478 /// the given iterator. 479 Partition(iterator SI) : SI(SI), SJ(SI) {} 480 481 public: 482 /// The start offset of this partition. 483 /// 484 /// All of the contained slices start at or after this offset. 485 uint64_t beginOffset() const { return BeginOffset; } 486 487 /// The end offset of this partition. 488 /// 489 /// All of the contained slices end at or before this offset. 490 uint64_t endOffset() const { return EndOffset; } 491 492 /// The size of the partition. 493 /// 494 /// Note that this can never be zero. 495 uint64_t size() const { 496 assert(BeginOffset < EndOffset && "Partitions must span some bytes!"); 497 return EndOffset - BeginOffset; 498 } 499 500 /// Test whether this partition contains no slices, and merely spans 501 /// a region occupied by split slices. 
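  /// Such an "empty" partition arises, for example, over a byte range that is
  /// covered only by the tails of splittable slices (say, from a large memcpy)
  /// while no new slice begins inside it.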
502 bool empty() const { return SI == SJ; } 503 504 /// \name Iterate slices that start within the partition. 505 /// These may be splittable or unsplittable. They have a begin offset >= the 506 /// partition begin offset. 507 /// @{ 508 // FIXME: We should probably define a "concat_iterator" helper and use that 509 // to stitch together pointee_iterators over the split tails and the 510 // contiguous iterators of the partition. That would give a much nicer 511 // interface here. We could then additionally expose filtered iterators for 512 // split, unsplit, and unsplittable splices based on the usage patterns. 513 iterator begin() const { return SI; } 514 iterator end() const { return SJ; } 515 /// @} 516 517 /// Get the sequence of split slice tails. 518 /// 519 /// These tails are of slices which start before this partition but are 520 /// split and overlap into the partition. We accumulate these while forming 521 /// partitions. 522 ArrayRef<Slice *> splitSliceTails() const { return SplitTails; } 523 }; 524 525 /// An iterator over partitions of the alloca's slices. 526 /// 527 /// This iterator implements the core algorithm for partitioning the alloca's 528 /// slices. It is a forward iterator as we don't support backtracking for 529 /// efficiency reasons, and re-use a single storage area to maintain the 530 /// current set of split slices. 531 /// 532 /// It is templated on the slice iterator type to use so that it can operate 533 /// with either const or non-const slice iterators. 534 class AllocaSlices::partition_iterator 535 : public iterator_facade_base<partition_iterator, std::forward_iterator_tag, 536 Partition> { 537 friend class AllocaSlices; 538 539 /// Most of the state for walking the partitions is held in a class 540 /// with a nice interface for examining them. 541 Partition P; 542 543 /// We need to keep the end of the slices to know when to stop. 544 AllocaSlices::iterator SE; 545 546 /// We also need to keep track of the maximum split end offset seen. 547 /// FIXME: Do we really? 548 uint64_t MaxSplitSliceEndOffset = 0; 549 550 /// Sets the partition to be empty at given iterator, and sets the 551 /// end iterator. 552 partition_iterator(AllocaSlices::iterator SI, AllocaSlices::iterator SE) 553 : P(SI), SE(SE) { 554 // If not already at the end, advance our state to form the initial 555 // partition. 556 if (SI != SE) 557 advance(); 558 } 559 560 /// Advance the iterator to the next partition. 561 /// 562 /// Requires that the iterator not be at the end of the slices. 563 void advance() { 564 assert((P.SI != SE || !P.SplitTails.empty()) && 565 "Cannot advance past the end of the slices!"); 566 567 // Clear out any split uses which have ended. 568 if (!P.SplitTails.empty()) { 569 if (P.EndOffset >= MaxSplitSliceEndOffset) { 570 // If we've finished all splits, this is easy. 571 P.SplitTails.clear(); 572 MaxSplitSliceEndOffset = 0; 573 } else { 574 // Remove the uses which have ended in the prior partition. This 575 // cannot change the max split slice end because we just checked that 576 // the prior partition ended prior to that max. 
        llvm::erase_if(P.SplitTails,
                       [&](Slice *S) { return S->endOffset() <= P.EndOffset; });
        assert(llvm::any_of(P.SplitTails,
                            [&](Slice *S) {
                              return S->endOffset() == MaxSplitSliceEndOffset;
                            }) &&
               "Could not find the current max split slice offset!");
        assert(llvm::all_of(P.SplitTails,
                            [&](Slice *S) {
                              return S->endOffset() <= MaxSplitSliceEndOffset;
                            }) &&
               "Max split slice end offset is not actually the max!");
      }
    }

    // If P.SI is already at the end, then we've cleared the split tail and
    // now have an end iterator.
    if (P.SI == SE) {
      assert(P.SplitTails.empty() && "Failed to clear the split slices!");
      return;
    }

    // If we had a non-empty partition previously, set up the state for
    // subsequent partitions.
    if (P.SI != P.SJ) {
      // Accumulate all the splittable slices which started in the old
      // partition into the split list.
      for (Slice &S : P)
        if (S.isSplittable() && S.endOffset() > P.EndOffset) {
          P.SplitTails.push_back(&S);
          MaxSplitSliceEndOffset =
              std::max(S.endOffset(), MaxSplitSliceEndOffset);
        }

      // Start from the end of the previous partition.
      P.SI = P.SJ;

      // If P.SI is now at the end, we at most have a tail of split slices.
      if (P.SI == SE) {
        P.BeginOffset = P.EndOffset;
        P.EndOffset = MaxSplitSliceEndOffset;
        return;
      }

      // If we have split slices and the next slice is after a gap and is
      // not splittable, immediately form an empty partition for the split
      // slices up until the next slice begins.
      if (!P.SplitTails.empty() && P.SI->beginOffset() != P.EndOffset &&
          !P.SI->isSplittable()) {
        P.BeginOffset = P.EndOffset;
        P.EndOffset = P.SI->beginOffset();
        return;
      }
    }

    // OK, we need to consume new slices. Set the end offset based on the
    // current slice, and step SJ past it. The beginning offset of the
    // partition is the beginning offset of the next slice unless we have
    // pre-existing split slices that are continuing, in which case we begin
    // at the prior end offset.
    P.BeginOffset = P.SplitTails.empty() ? P.SI->beginOffset() : P.EndOffset;
    P.EndOffset = P.SI->endOffset();
    ++P.SJ;

    // There are two strategies to form a partition based on whether the
    // partition starts with an unsplittable slice or a splittable slice.
    if (!P.SI->isSplittable()) {
      // When we're forming an unsplittable region, it must always start at
      // the first slice and will extend through its end.
      assert(P.BeginOffset == P.SI->beginOffset());

      // Form a partition including all of the overlapping slices with this
      // unsplittable slice.
      while (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset) {
        if (!P.SJ->isSplittable())
          P.EndOffset = std::max(P.EndOffset, P.SJ->endOffset());
        ++P.SJ;
      }

      // We have a partition across a set of overlapping unsplittable
      // slices.
      return;
    }

    // If we're starting with a splittable slice, then we need to form
    // a synthetic partition spanning it and any other overlapping splittable
    // slices.
    assert(P.SI->isSplittable() && "Forming a splittable partition!");

    // Collect all of the overlapping splittable slices.
    while (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset &&
           P.SJ->isSplittable()) {
      P.EndOffset = std::max(P.EndOffset, P.SJ->endOffset());
      ++P.SJ;
    }

    // Back up P.EndOffset if we ended the span early when encountering an
    // unsplittable slice. This synthesizes the early end offset of
    // a partition spanning only splittable slices.
    if (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset) {
      assert(!P.SJ->isSplittable());
      P.EndOffset = P.SJ->beginOffset();
    }
  }

public:
  bool operator==(const partition_iterator &RHS) const {
    assert(SE == RHS.SE &&
           "End iterators don't match between compared partition iterators!");

    // The observed positions of partitions are marked by the P.SI iterator and
    // the emptiness of the split slices. The latter is only relevant when
    // P.SI == SE, as the end iterator will additionally have an empty split
    // slices list, but the prior may have the same P.SI and a tail of split
    // slices.
    if (P.SI == RHS.P.SI && P.SplitTails.empty() == RHS.P.SplitTails.empty()) {
      assert(P.SJ == RHS.P.SJ &&
             "Same set of slices formed two different sized partitions!");
      assert(P.SplitTails.size() == RHS.P.SplitTails.size() &&
             "Same slice position with differently sized non-empty split "
             "slice tails!");
      return true;
    }
    return false;
  }

  partition_iterator &operator++() {
    advance();
    return *this;
  }

  Partition &operator*() { return P; }
};

/// A forward range over the partitions of the alloca's slices.
///
/// This accesses an iterator range over the partitions of the alloca's
/// slices. It computes these partitions on the fly based on the overlapping
/// offsets of the slices and the ability to split them. It will visit "empty"
/// partitions to cover regions of the alloca only accessed via split
/// slices.
iterator_range<AllocaSlices::partition_iterator> AllocaSlices::partitions() {
  return make_range(partition_iterator(begin(), end()),
                    partition_iterator(end(), end()));
}

static Value *foldSelectInst(SelectInst &SI) {
  // If the condition being selected on is a constant or the same value is
  // being selected between, fold the select. Yes this does (rarely) happen
  // early on.
  if (ConstantInt *CI = dyn_cast<ConstantInt>(SI.getCondition()))
    return SI.getOperand(1 + CI->isZero());
  if (SI.getOperand(1) == SI.getOperand(2))
    return SI.getOperand(1);

  return nullptr;
}

/// A helper that folds a PHI node or a select.
static Value *foldPHINodeOrSelectInst(Instruction &I) {
  if (PHINode *PN = dyn_cast<PHINode>(&I)) {
    // If PN merges together the same value, return that value.
    return PN->hasConstantValue();
  }
  return foldSelectInst(cast<SelectInst>(I));
}

/// Builder for the alloca slices.
///
/// This class builds a set of alloca slices by recursively visiting the uses
/// of an alloca and making a slice for each load and store at each offset.
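///
/// For illustration only, given roughly the following IR (offsets assume a
/// typical 64-bit data layout):
///
///   %a = alloca { i32, i32 }
///   store i32 0, ptr %a                                    ; slice [0,4)
///   %f1 = getelementptr { i32, i32 }, ptr %a, i32 0, i32 1
///   %v = load i32, ptr %f1                                 ; slice [4,8)
///
/// the builder records two non-overlapping slices, both splittable since they
/// are simple integer accesses.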
748 class AllocaSlices::SliceBuilder : public PtrUseVisitor<SliceBuilder> { 749 friend class PtrUseVisitor<SliceBuilder>; 750 friend class InstVisitor<SliceBuilder>; 751 752 using Base = PtrUseVisitor<SliceBuilder>; 753 754 const uint64_t AllocSize; 755 AllocaSlices &AS; 756 757 SmallDenseMap<Instruction *, unsigned> MemTransferSliceMap; 758 SmallDenseMap<Instruction *, uint64_t> PHIOrSelectSizes; 759 760 /// Set to de-duplicate dead instructions found in the use walk. 761 SmallPtrSet<Instruction *, 4> VisitedDeadInsts; 762 763 public: 764 SliceBuilder(const DataLayout &DL, AllocaInst &AI, AllocaSlices &AS) 765 : PtrUseVisitor<SliceBuilder>(DL), 766 AllocSize(DL.getTypeAllocSize(AI.getAllocatedType()).getFixedValue()), 767 AS(AS) {} 768 769 private: 770 void markAsDead(Instruction &I) { 771 if (VisitedDeadInsts.insert(&I).second) 772 AS.DeadUsers.push_back(&I); 773 } 774 775 void insertUse(Instruction &I, const APInt &Offset, uint64_t Size, 776 bool IsSplittable = false) { 777 // Completely skip uses which have a zero size or start either before or 778 // past the end of the allocation. 779 if (Size == 0 || Offset.uge(AllocSize)) { 780 LLVM_DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte use @" 781 << Offset 782 << " which has zero size or starts outside of the " 783 << AllocSize << " byte alloca:\n" 784 << " alloca: " << AS.AI << "\n" 785 << " use: " << I << "\n"); 786 return markAsDead(I); 787 } 788 789 uint64_t BeginOffset = Offset.getZExtValue(); 790 uint64_t EndOffset = BeginOffset + Size; 791 792 // Clamp the end offset to the end of the allocation. Note that this is 793 // formulated to handle even the case where "BeginOffset + Size" overflows. 794 // This may appear superficially to be something we could ignore entirely, 795 // but that is not so! There may be widened loads or PHI-node uses where 796 // some instructions are dead but not others. We can't completely ignore 797 // them, and so have to record at least the information here. 798 assert(AllocSize >= BeginOffset); // Established above. 799 if (Size > AllocSize - BeginOffset) { 800 LLVM_DEBUG(dbgs() << "WARNING: Clamping a " << Size << " byte use @" 801 << Offset << " to remain within the " << AllocSize 802 << " byte alloca:\n" 803 << " alloca: " << AS.AI << "\n" 804 << " use: " << I << "\n"); 805 EndOffset = AllocSize; 806 } 807 808 AS.Slices.push_back(Slice(BeginOffset, EndOffset, U, IsSplittable)); 809 } 810 811 void visitBitCastInst(BitCastInst &BC) { 812 if (BC.use_empty()) 813 return markAsDead(BC); 814 815 return Base::visitBitCastInst(BC); 816 } 817 818 void visitAddrSpaceCastInst(AddrSpaceCastInst &ASC) { 819 if (ASC.use_empty()) 820 return markAsDead(ASC); 821 822 return Base::visitAddrSpaceCastInst(ASC); 823 } 824 825 void visitGetElementPtrInst(GetElementPtrInst &GEPI) { 826 if (GEPI.use_empty()) 827 return markAsDead(GEPI); 828 829 if (SROAStrictInbounds && GEPI.isInBounds()) { 830 // FIXME: This is a manually un-factored variant of the basic code inside 831 // of GEPs with checking of the inbounds invariant specified in the 832 // langref in a very strict sense. If we ever want to enable 833 // SROAStrictInbounds, this code should be factored cleanly into 834 // PtrUseVisitor, but it is easier to experiment with SROAStrictInbounds 835 // by writing out the code here where we have the underlying allocation 836 // size readily available. 
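      // Accumulate each constant index on top of the known base offset,
      // checking after every step that the intermediate result does not step
      // outside the allocation.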
837 APInt GEPOffset = Offset; 838 const DataLayout &DL = GEPI.getModule()->getDataLayout(); 839 for (gep_type_iterator GTI = gep_type_begin(GEPI), 840 GTE = gep_type_end(GEPI); 841 GTI != GTE; ++GTI) { 842 ConstantInt *OpC = dyn_cast<ConstantInt>(GTI.getOperand()); 843 if (!OpC) 844 break; 845 846 // Handle a struct index, which adds its field offset to the pointer. 847 if (StructType *STy = GTI.getStructTypeOrNull()) { 848 unsigned ElementIdx = OpC->getZExtValue(); 849 const StructLayout *SL = DL.getStructLayout(STy); 850 GEPOffset += 851 APInt(Offset.getBitWidth(), SL->getElementOffset(ElementIdx)); 852 } else { 853 // For array or vector indices, scale the index by the size of the 854 // type. 855 APInt Index = OpC->getValue().sextOrTrunc(Offset.getBitWidth()); 856 GEPOffset += 857 Index * 858 APInt(Offset.getBitWidth(), 859 DL.getTypeAllocSize(GTI.getIndexedType()).getFixedValue()); 860 } 861 862 // If this index has computed an intermediate pointer which is not 863 // inbounds, then the result of the GEP is a poison value and we can 864 // delete it and all uses. 865 if (GEPOffset.ugt(AllocSize)) 866 return markAsDead(GEPI); 867 } 868 } 869 870 return Base::visitGetElementPtrInst(GEPI); 871 } 872 873 void handleLoadOrStore(Type *Ty, Instruction &I, const APInt &Offset, 874 uint64_t Size, bool IsVolatile) { 875 // We allow splitting of non-volatile loads and stores where the type is an 876 // integer type. These may be used to implement 'memcpy' or other "transfer 877 // of bits" patterns. 878 bool IsSplittable = 879 Ty->isIntegerTy() && !IsVolatile && DL.typeSizeEqualsStoreSize(Ty); 880 881 insertUse(I, Offset, Size, IsSplittable); 882 } 883 884 void visitLoadInst(LoadInst &LI) { 885 assert((!LI.isSimple() || LI.getType()->isSingleValueType()) && 886 "All simple FCA loads should have been pre-split"); 887 888 if (!IsOffsetKnown) 889 return PI.setAborted(&LI); 890 891 if (isa<ScalableVectorType>(LI.getType())) 892 return PI.setAborted(&LI); 893 894 uint64_t Size = DL.getTypeStoreSize(LI.getType()).getFixedValue(); 895 return handleLoadOrStore(LI.getType(), LI, Offset, Size, LI.isVolatile()); 896 } 897 898 void visitStoreInst(StoreInst &SI) { 899 Value *ValOp = SI.getValueOperand(); 900 if (ValOp == *U) 901 return PI.setEscapedAndAborted(&SI); 902 if (!IsOffsetKnown) 903 return PI.setAborted(&SI); 904 905 if (isa<ScalableVectorType>(ValOp->getType())) 906 return PI.setAborted(&SI); 907 908 uint64_t Size = DL.getTypeStoreSize(ValOp->getType()).getFixedValue(); 909 910 // If this memory access can be shown to *statically* extend outside the 911 // bounds of the allocation, it's behavior is undefined, so simply 912 // ignore it. Note that this is more strict than the generic clamping 913 // behavior of insertUse. We also try to handle cases which might run the 914 // risk of overflow. 915 // FIXME: We should instead consider the pointer to have escaped if this 916 // function is being instrumented for addressing bugs or race conditions. 
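    // Note that the bounds check below is written to avoid overflow:
    // "AllocSize - Size" is only evaluated once Size <= AllocSize is known.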
917 if (Size > AllocSize || Offset.ugt(AllocSize - Size)) { 918 LLVM_DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte store @" 919 << Offset << " which extends past the end of the " 920 << AllocSize << " byte alloca:\n" 921 << " alloca: " << AS.AI << "\n" 922 << " use: " << SI << "\n"); 923 return markAsDead(SI); 924 } 925 926 assert((!SI.isSimple() || ValOp->getType()->isSingleValueType()) && 927 "All simple FCA stores should have been pre-split"); 928 handleLoadOrStore(ValOp->getType(), SI, Offset, Size, SI.isVolatile()); 929 } 930 931 void visitMemSetInst(MemSetInst &II) { 932 assert(II.getRawDest() == *U && "Pointer use is not the destination?"); 933 ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength()); 934 if ((Length && Length->getValue() == 0) || 935 (IsOffsetKnown && Offset.uge(AllocSize))) 936 // Zero-length mem transfer intrinsics can be ignored entirely. 937 return markAsDead(II); 938 939 if (!IsOffsetKnown) 940 return PI.setAborted(&II); 941 942 insertUse(II, Offset, Length ? Length->getLimitedValue() 943 : AllocSize - Offset.getLimitedValue(), 944 (bool)Length); 945 } 946 947 void visitMemTransferInst(MemTransferInst &II) { 948 ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength()); 949 if (Length && Length->getValue() == 0) 950 // Zero-length mem transfer intrinsics can be ignored entirely. 951 return markAsDead(II); 952 953 // Because we can visit these intrinsics twice, also check to see if the 954 // first time marked this instruction as dead. If so, skip it. 955 if (VisitedDeadInsts.count(&II)) 956 return; 957 958 if (!IsOffsetKnown) 959 return PI.setAborted(&II); 960 961 // This side of the transfer is completely out-of-bounds, and so we can 962 // nuke the entire transfer. However, we also need to nuke the other side 963 // if already added to our partitions. 964 // FIXME: Yet another place we really should bypass this when 965 // instrumenting for ASan. 966 if (Offset.uge(AllocSize)) { 967 SmallDenseMap<Instruction *, unsigned>::iterator MTPI = 968 MemTransferSliceMap.find(&II); 969 if (MTPI != MemTransferSliceMap.end()) 970 AS.Slices[MTPI->second].kill(); 971 return markAsDead(II); 972 } 973 974 uint64_t RawOffset = Offset.getLimitedValue(); 975 uint64_t Size = Length ? Length->getLimitedValue() : AllocSize - RawOffset; 976 977 // Check for the special case where the same exact value is used for both 978 // source and dest. 979 if (*U == II.getRawDest() && *U == II.getRawSource()) { 980 // For non-volatile transfers this is a no-op. 981 if (!II.isVolatile()) 982 return markAsDead(II); 983 984 return insertUse(II, Offset, Size, /*IsSplittable=*/false); 985 } 986 987 // If we have seen both source and destination for a mem transfer, then 988 // they both point to the same alloca. 989 bool Inserted; 990 SmallDenseMap<Instruction *, unsigned>::iterator MTPI; 991 std::tie(MTPI, Inserted) = 992 MemTransferSliceMap.insert(std::make_pair(&II, AS.Slices.size())); 993 unsigned PrevIdx = MTPI->second; 994 if (!Inserted) { 995 Slice &PrevP = AS.Slices[PrevIdx]; 996 997 // Check if the begin offsets match and this is a non-volatile transfer. 998 // In that case, we can completely elide the transfer. 999 if (!II.isVolatile() && PrevP.beginOffset() == RawOffset) { 1000 PrevP.kill(); 1001 return markAsDead(II); 1002 } 1003 1004 // Otherwise we have an offset transfer within the same alloca. We can't 1005 // split those. 1006 PrevP.makeUnsplittable(); 1007 } 1008 1009 // Insert the use now that we've fixed up the splittable nature. 
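    // The new slice stays splittable only when this is the first time we have
    // seen the transfer and its length is a known constant; on a repeat visit
    // (both source and destination lie within this alloca) the earlier slice
    // has already been killed or made unsplittable above.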
1010 insertUse(II, Offset, Size, /*IsSplittable=*/Inserted && Length); 1011 1012 // Check that we ended up with a valid index in the map. 1013 assert(AS.Slices[PrevIdx].getUse()->getUser() == &II && 1014 "Map index doesn't point back to a slice with this user."); 1015 } 1016 1017 // Disable SRoA for any intrinsics except for lifetime invariants and 1018 // invariant group. 1019 // FIXME: What about debug intrinsics? This matches old behavior, but 1020 // doesn't make sense. 1021 void visitIntrinsicInst(IntrinsicInst &II) { 1022 if (II.isDroppable()) { 1023 AS.DeadUseIfPromotable.push_back(U); 1024 return; 1025 } 1026 1027 if (!IsOffsetKnown) 1028 return PI.setAborted(&II); 1029 1030 if (II.isLifetimeStartOrEnd()) { 1031 ConstantInt *Length = cast<ConstantInt>(II.getArgOperand(0)); 1032 uint64_t Size = std::min(AllocSize - Offset.getLimitedValue(), 1033 Length->getLimitedValue()); 1034 insertUse(II, Offset, Size, true); 1035 return; 1036 } 1037 1038 if (II.isLaunderOrStripInvariantGroup()) { 1039 enqueueUsers(II); 1040 return; 1041 } 1042 1043 Base::visitIntrinsicInst(II); 1044 } 1045 1046 Instruction *hasUnsafePHIOrSelectUse(Instruction *Root, uint64_t &Size) { 1047 // We consider any PHI or select that results in a direct load or store of 1048 // the same offset to be a viable use for slicing purposes. These uses 1049 // are considered unsplittable and the size is the maximum loaded or stored 1050 // size. 1051 SmallPtrSet<Instruction *, 4> Visited; 1052 SmallVector<std::pair<Instruction *, Instruction *>, 4> Uses; 1053 Visited.insert(Root); 1054 Uses.push_back(std::make_pair(cast<Instruction>(*U), Root)); 1055 const DataLayout &DL = Root->getModule()->getDataLayout(); 1056 // If there are no loads or stores, the access is dead. We mark that as 1057 // a size zero access. 1058 Size = 0; 1059 do { 1060 Instruction *I, *UsedI; 1061 std::tie(UsedI, I) = Uses.pop_back_val(); 1062 1063 if (LoadInst *LI = dyn_cast<LoadInst>(I)) { 1064 Size = 1065 std::max(Size, DL.getTypeStoreSize(LI->getType()).getFixedValue()); 1066 continue; 1067 } 1068 if (StoreInst *SI = dyn_cast<StoreInst>(I)) { 1069 Value *Op = SI->getOperand(0); 1070 if (Op == UsedI) 1071 return SI; 1072 Size = 1073 std::max(Size, DL.getTypeStoreSize(Op->getType()).getFixedValue()); 1074 continue; 1075 } 1076 1077 if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(I)) { 1078 if (!GEP->hasAllZeroIndices()) 1079 return GEP; 1080 } else if (!isa<BitCastInst>(I) && !isa<PHINode>(I) && 1081 !isa<SelectInst>(I) && !isa<AddrSpaceCastInst>(I)) { 1082 return I; 1083 } 1084 1085 for (User *U : I->users()) 1086 if (Visited.insert(cast<Instruction>(U)).second) 1087 Uses.push_back(std::make_pair(I, cast<Instruction>(U))); 1088 } while (!Uses.empty()); 1089 1090 return nullptr; 1091 } 1092 1093 void visitPHINodeOrSelectInst(Instruction &I) { 1094 assert(isa<PHINode>(I) || isa<SelectInst>(I)); 1095 if (I.use_empty()) 1096 return markAsDead(I); 1097 1098 // If this is a PHI node before a catchswitch, we cannot insert any non-PHI 1099 // instructions in this BB, which may be required during rewriting. Bail out 1100 // on these cases. 1101 if (isa<PHINode>(I) && 1102 I.getParent()->getFirstInsertionPt() == I.getParent()->end()) 1103 return PI.setAborted(&I); 1104 1105 // TODO: We could use simplifyInstruction here to fold PHINodes and 1106 // SelectInsts. However, doing so requires to change the current 1107 // dead-operand-tracking mechanism. For instance, suppose neither loading 1108 // from %U nor %other traps. 
Then "load (select undef, %U, %other)" does not 1109 // trap either. However, if we simply replace %U with undef using the 1110 // current dead-operand-tracking mechanism, "load (select undef, undef, 1111 // %other)" may trap because the select may return the first operand 1112 // "undef". 1113 if (Value *Result = foldPHINodeOrSelectInst(I)) { 1114 if (Result == *U) 1115 // If the result of the constant fold will be the pointer, recurse 1116 // through the PHI/select as if we had RAUW'ed it. 1117 enqueueUsers(I); 1118 else 1119 // Otherwise the operand to the PHI/select is dead, and we can replace 1120 // it with poison. 1121 AS.DeadOperands.push_back(U); 1122 1123 return; 1124 } 1125 1126 if (!IsOffsetKnown) 1127 return PI.setAborted(&I); 1128 1129 // See if we already have computed info on this node. 1130 uint64_t &Size = PHIOrSelectSizes[&I]; 1131 if (!Size) { 1132 // This is a new PHI/Select, check for an unsafe use of it. 1133 if (Instruction *UnsafeI = hasUnsafePHIOrSelectUse(&I, Size)) 1134 return PI.setAborted(UnsafeI); 1135 } 1136 1137 // For PHI and select operands outside the alloca, we can't nuke the entire 1138 // phi or select -- the other side might still be relevant, so we special 1139 // case them here and use a separate structure to track the operands 1140 // themselves which should be replaced with poison. 1141 // FIXME: This should instead be escaped in the event we're instrumenting 1142 // for address sanitization. 1143 if (Offset.uge(AllocSize)) { 1144 AS.DeadOperands.push_back(U); 1145 return; 1146 } 1147 1148 insertUse(I, Offset, Size); 1149 } 1150 1151 void visitPHINode(PHINode &PN) { visitPHINodeOrSelectInst(PN); } 1152 1153 void visitSelectInst(SelectInst &SI) { visitPHINodeOrSelectInst(SI); } 1154 1155 /// Disable SROA entirely if there are unhandled users of the alloca. 1156 void visitInstruction(Instruction &I) { PI.setAborted(&I); } 1157 }; 1158 1159 AllocaSlices::AllocaSlices(const DataLayout &DL, AllocaInst &AI) 1160 : 1161 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1162 AI(AI), 1163 #endif 1164 PointerEscapingInstr(nullptr) { 1165 SliceBuilder PB(DL, AI, *this); 1166 SliceBuilder::PtrInfo PtrI = PB.visitPtr(AI); 1167 if (PtrI.isEscaped() || PtrI.isAborted()) { 1168 // FIXME: We should sink the escape vs. abort info into the caller nicely, 1169 // possibly by just storing the PtrInfo in the AllocaSlices. 1170 PointerEscapingInstr = PtrI.getEscapingInst() ? PtrI.getEscapingInst() 1171 : PtrI.getAbortingInst(); 1172 assert(PointerEscapingInstr && "Did not track a bad instruction"); 1173 return; 1174 } 1175 1176 llvm::erase_if(Slices, [](const Slice &S) { return S.isDead(); }); 1177 1178 // Sort the uses. This arranges for the offsets to be in ascending order, 1179 // and the sizes to be in descending order. 1180 llvm::stable_sort(Slices); 1181 } 1182 1183 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1184 1185 void AllocaSlices::print(raw_ostream &OS, const_iterator I, 1186 StringRef Indent) const { 1187 printSlice(OS, I, Indent); 1188 OS << "\n"; 1189 printUse(OS, I, Indent); 1190 } 1191 1192 void AllocaSlices::printSlice(raw_ostream &OS, const_iterator I, 1193 StringRef Indent) const { 1194 OS << Indent << "[" << I->beginOffset() << "," << I->endOffset() << ")" 1195 << " slice #" << (I - begin()) 1196 << (I->isSplittable() ? 
" (splittable)" : ""); 1197 } 1198 1199 void AllocaSlices::printUse(raw_ostream &OS, const_iterator I, 1200 StringRef Indent) const { 1201 OS << Indent << " used by: " << *I->getUse()->getUser() << "\n"; 1202 } 1203 1204 void AllocaSlices::print(raw_ostream &OS) const { 1205 if (PointerEscapingInstr) { 1206 OS << "Can't analyze slices for alloca: " << AI << "\n" 1207 << " A pointer to this alloca escaped by:\n" 1208 << " " << *PointerEscapingInstr << "\n"; 1209 return; 1210 } 1211 1212 OS << "Slices of alloca: " << AI << "\n"; 1213 for (const_iterator I = begin(), E = end(); I != E; ++I) 1214 print(OS, I); 1215 } 1216 1217 LLVM_DUMP_METHOD void AllocaSlices::dump(const_iterator I) const { 1218 print(dbgs(), I); 1219 } 1220 LLVM_DUMP_METHOD void AllocaSlices::dump() const { print(dbgs()); } 1221 1222 #endif // !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) 1223 1224 /// Walk the range of a partitioning looking for a common type to cover this 1225 /// sequence of slices. 1226 static std::pair<Type *, IntegerType *> 1227 findCommonType(AllocaSlices::const_iterator B, AllocaSlices::const_iterator E, 1228 uint64_t EndOffset) { 1229 Type *Ty = nullptr; 1230 bool TyIsCommon = true; 1231 IntegerType *ITy = nullptr; 1232 1233 // Note that we need to look at *every* alloca slice's Use to ensure we 1234 // always get consistent results regardless of the order of slices. 1235 for (AllocaSlices::const_iterator I = B; I != E; ++I) { 1236 Use *U = I->getUse(); 1237 if (isa<IntrinsicInst>(*U->getUser())) 1238 continue; 1239 if (I->beginOffset() != B->beginOffset() || I->endOffset() != EndOffset) 1240 continue; 1241 1242 Type *UserTy = nullptr; 1243 if (LoadInst *LI = dyn_cast<LoadInst>(U->getUser())) { 1244 UserTy = LI->getType(); 1245 } else if (StoreInst *SI = dyn_cast<StoreInst>(U->getUser())) { 1246 UserTy = SI->getValueOperand()->getType(); 1247 } 1248 1249 if (IntegerType *UserITy = dyn_cast_or_null<IntegerType>(UserTy)) { 1250 // If the type is larger than the partition, skip it. We only encounter 1251 // this for split integer operations where we want to use the type of the 1252 // entity causing the split. Also skip if the type is not a byte width 1253 // multiple. 1254 if (UserITy->getBitWidth() % 8 != 0 || 1255 UserITy->getBitWidth() / 8 > (EndOffset - B->beginOffset())) 1256 continue; 1257 1258 // Track the largest bitwidth integer type used in this way in case there 1259 // is no common type. 1260 if (!ITy || ITy->getBitWidth() < UserITy->getBitWidth()) 1261 ITy = UserITy; 1262 } 1263 1264 // To avoid depending on the order of slices, Ty and TyIsCommon must not 1265 // depend on types skipped above. 1266 if (!UserTy || (Ty && Ty != UserTy)) 1267 TyIsCommon = false; // Give up on anything but an iN type. 1268 else 1269 Ty = UserTy; 1270 } 1271 1272 return {TyIsCommon ? Ty : nullptr, ITy}; 1273 } 1274 1275 /// PHI instructions that use an alloca and are subsequently loaded can be 1276 /// rewritten to load both input pointers in the pred blocks and then PHI the 1277 /// results, allowing the load of the alloca to be promoted. 1278 /// From this: 1279 /// %P2 = phi [i32* %Alloca, i32* %Other] 1280 /// %V = load i32* %P2 1281 /// to: 1282 /// %V1 = load i32* %Alloca -> will be mem2reg'd 1283 /// ... 1284 /// %V2 = load i32* %Other 1285 /// ... 1286 /// %V = phi [i32 %V1, i32 %V2] 1287 /// 1288 /// We can do this to a select if its only uses are loads and if the operands 1289 /// to the select can be loaded unconditionally. 
1290 /// 1291 /// FIXME: This should be hoisted into a generic utility, likely in 1292 /// Transforms/Util/Local.h 1293 static bool isSafePHIToSpeculate(PHINode &PN) { 1294 const DataLayout &DL = PN.getModule()->getDataLayout(); 1295 1296 // For now, we can only do this promotion if the load is in the same block 1297 // as the PHI, and if there are no stores between the phi and load. 1298 // TODO: Allow recursive phi users. 1299 // TODO: Allow stores. 1300 BasicBlock *BB = PN.getParent(); 1301 Align MaxAlign; 1302 uint64_t APWidth = DL.getIndexTypeSizeInBits(PN.getType()); 1303 Type *LoadType = nullptr; 1304 for (User *U : PN.users()) { 1305 LoadInst *LI = dyn_cast<LoadInst>(U); 1306 if (!LI || !LI->isSimple()) 1307 return false; 1308 1309 // For now we only allow loads in the same block as the PHI. This is 1310 // a common case that happens when instcombine merges two loads through 1311 // a PHI. 1312 if (LI->getParent() != BB) 1313 return false; 1314 1315 if (LoadType) { 1316 if (LoadType != LI->getType()) 1317 return false; 1318 } else { 1319 LoadType = LI->getType(); 1320 } 1321 1322 // Ensure that there are no instructions between the PHI and the load that 1323 // could store. 1324 for (BasicBlock::iterator BBI(PN); &*BBI != LI; ++BBI) 1325 if (BBI->mayWriteToMemory()) 1326 return false; 1327 1328 MaxAlign = std::max(MaxAlign, LI->getAlign()); 1329 } 1330 1331 if (!LoadType) 1332 return false; 1333 1334 APInt LoadSize = 1335 APInt(APWidth, DL.getTypeStoreSize(LoadType).getFixedValue()); 1336 1337 // We can only transform this if it is safe to push the loads into the 1338 // predecessor blocks. The only thing to watch out for is that we can't put 1339 // a possibly trapping load in the predecessor if it is a critical edge. 1340 for (unsigned Idx = 0, Num = PN.getNumIncomingValues(); Idx != Num; ++Idx) { 1341 Instruction *TI = PN.getIncomingBlock(Idx)->getTerminator(); 1342 Value *InVal = PN.getIncomingValue(Idx); 1343 1344 // If the value is produced by the terminator of the predecessor (an 1345 // invoke) or it has side-effects, there is no valid place to put a load 1346 // in the predecessor. 1347 if (TI == InVal || TI->mayHaveSideEffects()) 1348 return false; 1349 1350 // If the predecessor has a single successor, then the edge isn't 1351 // critical. 1352 if (TI->getNumSuccessors() == 1) 1353 continue; 1354 1355 // If this pointer is always safe to load, or if we can prove that there 1356 // is already a load in the block, then we can move the load to the pred 1357 // block. 1358 if (isSafeToLoadUnconditionally(InVal, MaxAlign, LoadSize, DL, TI)) 1359 continue; 1360 1361 return false; 1362 } 1363 1364 return true; 1365 } 1366 1367 static void speculatePHINodeLoads(IRBuilderTy &IRB, PHINode &PN) { 1368 LLVM_DEBUG(dbgs() << " original: " << PN << "\n"); 1369 1370 LoadInst *SomeLoad = cast<LoadInst>(PN.user_back()); 1371 Type *LoadTy = SomeLoad->getType(); 1372 IRB.SetInsertPoint(&PN); 1373 PHINode *NewPN = IRB.CreatePHI(LoadTy, PN.getNumIncomingValues(), 1374 PN.getName() + ".sroa.speculated"); 1375 1376 // Get the AA tags and alignment to use from one of the loads. It does not 1377 // matter which one we get and if any differ. 1378 AAMDNodes AATags = SomeLoad->getAAMetadata(); 1379 Align Alignment = SomeLoad->getAlign(); 1380 1381 // Rewrite all loads of the PN to use the new PHI. 
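  // isSafePHIToSpeculate already established that every user is a simple load
  // of LoadTy in this block, so all of them can be replaced by the new PHI.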
1382 while (!PN.use_empty()) { 1383 LoadInst *LI = cast<LoadInst>(PN.user_back()); 1384 LI->replaceAllUsesWith(NewPN); 1385 LI->eraseFromParent(); 1386 } 1387 1388 // Inject loads into all of the pred blocks. 1389 DenseMap<BasicBlock*, Value*> InjectedLoads; 1390 for (unsigned Idx = 0, Num = PN.getNumIncomingValues(); Idx != Num; ++Idx) { 1391 BasicBlock *Pred = PN.getIncomingBlock(Idx); 1392 Value *InVal = PN.getIncomingValue(Idx); 1393 1394 // A PHI node is allowed to have multiple (duplicated) entries for the same 1395 // basic block, as long as the value is the same. So if we already injected 1396 // a load in the predecessor, then we should reuse the same load for all 1397 // duplicated entries. 1398 if (Value* V = InjectedLoads.lookup(Pred)) { 1399 NewPN->addIncoming(V, Pred); 1400 continue; 1401 } 1402 1403 Instruction *TI = Pred->getTerminator(); 1404 IRB.SetInsertPoint(TI); 1405 1406 LoadInst *Load = IRB.CreateAlignedLoad( 1407 LoadTy, InVal, Alignment, 1408 (PN.getName() + ".sroa.speculate.load." + Pred->getName())); 1409 ++NumLoadsSpeculated; 1410 if (AATags) 1411 Load->setAAMetadata(AATags); 1412 NewPN->addIncoming(Load, Pred); 1413 InjectedLoads[Pred] = Load; 1414 } 1415 1416 LLVM_DEBUG(dbgs() << " speculated to: " << *NewPN << "\n"); 1417 PN.eraseFromParent(); 1418 } 1419 1420 sroa::SelectHandSpeculativity & 1421 sroa::SelectHandSpeculativity::setAsSpeculatable(bool isTrueVal) { 1422 if (isTrueVal) 1423 Bitfield::set<sroa::SelectHandSpeculativity::TrueVal>(Storage, true); 1424 else 1425 Bitfield::set<sroa::SelectHandSpeculativity::FalseVal>(Storage, true); 1426 return *this; 1427 } 1428 1429 bool sroa::SelectHandSpeculativity::isSpeculatable(bool isTrueVal) const { 1430 return isTrueVal 1431 ? Bitfield::get<sroa::SelectHandSpeculativity::TrueVal>(Storage) 1432 : Bitfield::get<sroa::SelectHandSpeculativity::FalseVal>(Storage); 1433 } 1434 1435 bool sroa::SelectHandSpeculativity::areAllSpeculatable() const { 1436 return isSpeculatable(/*isTrueVal=*/true) && 1437 isSpeculatable(/*isTrueVal=*/false); 1438 } 1439 1440 bool sroa::SelectHandSpeculativity::areAnySpeculatable() const { 1441 return isSpeculatable(/*isTrueVal=*/true) || 1442 isSpeculatable(/*isTrueVal=*/false); 1443 } 1444 bool sroa::SelectHandSpeculativity::areNoneSpeculatable() const { 1445 return !areAnySpeculatable(); 1446 } 1447 1448 static sroa::SelectHandSpeculativity 1449 isSafeLoadOfSelectToSpeculate(LoadInst &LI, SelectInst &SI, bool PreserveCFG) { 1450 assert(LI.isSimple() && "Only for simple loads"); 1451 sroa::SelectHandSpeculativity Spec; 1452 1453 const DataLayout &DL = SI.getModule()->getDataLayout(); 1454 for (Value *Value : {SI.getTrueValue(), SI.getFalseValue()}) 1455 if (isSafeToLoadUnconditionally(Value, LI.getType(), LI.getAlign(), DL, 1456 &LI)) 1457 Spec.setAsSpeculatable(/*isTrueVal=*/Value == SI.getTrueValue()); 1458 else if (PreserveCFG) 1459 return Spec; 1460 1461 return Spec; 1462 } 1463 1464 std::optional<sroa::RewriteableMemOps> 1465 SROAPass::isSafeSelectToSpeculate(SelectInst &SI, bool PreserveCFG) { 1466 RewriteableMemOps Ops; 1467 1468 for (User *U : SI.users()) { 1469 if (auto *BC = dyn_cast<BitCastInst>(U); BC && BC->hasOneUse()) 1470 U = *BC->user_begin(); 1471 1472 if (auto *Store = dyn_cast<StoreInst>(U)) { 1473 // Note that atomic stores can be transformed; atomic semantics do not 1474 // have any meaning for a local alloca. Stores are not speculatable, 1475 // however, so if we can't turn it into a predicated store, we are done. 
1476 if (Store->isVolatile() || PreserveCFG) 1477 return {}; // Give up on this `select`. 1478 Ops.emplace_back(Store); 1479 continue; 1480 } 1481 1482 auto *LI = dyn_cast<LoadInst>(U); 1483 1484 // Note that atomic loads can be transformed; 1485 // atomic semantics do not have any meaning for a local alloca. 1486 if (!LI || LI->isVolatile()) 1487 return {}; // Give up on this `select`. 1488 1489 PossiblySpeculatableLoad Load(LI); 1490 if (!LI->isSimple()) { 1491 // If the `load` is not simple, we can't speculatively execute it, 1492 // but we could handle this via a CFG modification. But can we? 1493 if (PreserveCFG) 1494 return {}; // Give up on this `select`. 1495 Ops.emplace_back(Load); 1496 continue; 1497 } 1498 1499 sroa::SelectHandSpeculativity Spec = 1500 isSafeLoadOfSelectToSpeculate(*LI, SI, PreserveCFG); 1501 if (PreserveCFG && !Spec.areAllSpeculatable()) 1502 return {}; // Give up on this `select`. 1503 1504 Load.setInt(Spec); 1505 Ops.emplace_back(Load); 1506 } 1507 1508 return Ops; 1509 } 1510 1511 static void speculateSelectInstLoads(SelectInst &SI, LoadInst &LI, 1512 IRBuilderTy &IRB) { 1513 LLVM_DEBUG(dbgs() << " original load: " << SI << "\n"); 1514 1515 Value *TV = SI.getTrueValue(); 1516 Value *FV = SI.getFalseValue(); 1517 // Replace the given load of the select with a select of two loads. 1518 1519 assert(LI.isSimple() && "We only speculate simple loads"); 1520 1521 IRB.SetInsertPoint(&LI); 1522 1523 if (auto *TypedPtrTy = LI.getPointerOperandType(); 1524 !TypedPtrTy->isOpaquePointerTy() && SI.getType() != TypedPtrTy) { 1525 TV = IRB.CreateBitOrPointerCast(TV, TypedPtrTy, ""); 1526 FV = IRB.CreateBitOrPointerCast(FV, TypedPtrTy, ""); 1527 } 1528 1529 LoadInst *TL = 1530 IRB.CreateAlignedLoad(LI.getType(), TV, LI.getAlign(), 1531 LI.getName() + ".sroa.speculate.load.true"); 1532 LoadInst *FL = 1533 IRB.CreateAlignedLoad(LI.getType(), FV, LI.getAlign(), 1534 LI.getName() + ".sroa.speculate.load.false"); 1535 NumLoadsSpeculated += 2; 1536 1537 // Transfer alignment and AA info if present. 1538 TL->setAlignment(LI.getAlign()); 1539 FL->setAlignment(LI.getAlign()); 1540 1541 AAMDNodes Tags = LI.getAAMetadata(); 1542 if (Tags) { 1543 TL->setAAMetadata(Tags); 1544 FL->setAAMetadata(Tags); 1545 } 1546 1547 Value *V = IRB.CreateSelect(SI.getCondition(), TL, FL, 1548 LI.getName() + ".sroa.speculated"); 1549 1550 LLVM_DEBUG(dbgs() << " speculated to: " << *V << "\n"); 1551 LI.replaceAllUsesWith(V); 1552 } 1553 1554 template <typename T> 1555 static void rewriteMemOpOfSelect(SelectInst &SI, T &I, 1556 sroa::SelectHandSpeculativity Spec, 1557 DomTreeUpdater &DTU) { 1558 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && "Only for load and store!"); 1559 LLVM_DEBUG(dbgs() << " original mem op: " << I << "\n"); 1560 BasicBlock *Head = I.getParent(); 1561 Instruction *ThenTerm = nullptr; 1562 Instruction *ElseTerm = nullptr; 1563 if (Spec.areNoneSpeculatable()) 1564 SplitBlockAndInsertIfThenElse(SI.getCondition(), &I, &ThenTerm, &ElseTerm, 1565 SI.getMetadata(LLVMContext::MD_prof), &DTU); 1566 else { 1567 SplitBlockAndInsertIfThen(SI.getCondition(), &I, /*Unreachable=*/false, 1568 SI.getMetadata(LLVMContext::MD_prof), &DTU, 1569 /*LI=*/nullptr, /*ThenBlock=*/nullptr); 1570 if (Spec.isSpeculatable(/*isTrueVal=*/true)) 1571 cast<BranchInst>(Head->getTerminator())->swapSuccessors(); 1572 } 1573 auto *HeadBI = cast<BranchInst>(Head->getTerminator()); 1574 Spec = {}; // Do not use `Spec` beyond this point. 
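// Hedged sketch of the result (hypothetical IR and block names): for a
// non-speculatable load of `%p = select i1 %c, ptr %a, ptr %b` the code
// below roughly produces
//   head:       br i1 %c, label %head.then, label %head.else
//   head.then:  %v.then.val = load i32, ptr %a ; br label %head.cont
//   head.else:  %v.else.val = load i32, ptr %b ; br label %head.cont
//   head.cont:  %v = phi i32 [ %v.then.val, %head.then ],
//                            [ %v.else.val, %head.else ]
// When exactly one hand is speculatable, only a single arm is split off and
// the other clone stays (speculated) in the head block.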
1575 BasicBlock *Tail = I.getParent(); 1576 Tail->setName(Head->getName() + ".cont"); 1577 PHINode *PN; 1578 if (isa<LoadInst>(I)) 1579 PN = PHINode::Create(I.getType(), 2, "", &I); 1580 for (BasicBlock *SuccBB : successors(Head)) { 1581 bool IsThen = SuccBB == HeadBI->getSuccessor(0); 1582 int SuccIdx = IsThen ? 0 : 1; 1583 auto *NewMemOpBB = SuccBB == Tail ? Head : SuccBB; 1584 auto &CondMemOp = cast<T>(*I.clone()); 1585 if (NewMemOpBB != Head) { 1586 NewMemOpBB->setName(Head->getName() + (IsThen ? ".then" : ".else")); 1587 if (isa<LoadInst>(I)) 1588 ++NumLoadsPredicated; 1589 else 1590 ++NumStoresPredicated; 1591 } else { 1592 CondMemOp.dropUndefImplyingAttrsAndUnknownMetadata(); 1593 ++NumLoadsSpeculated; 1594 } 1595 CondMemOp.insertBefore(NewMemOpBB->getTerminator()); 1596 Value *Ptr = SI.getOperand(1 + SuccIdx); 1597 if (auto *PtrTy = Ptr->getType(); 1598 !PtrTy->isOpaquePointerTy() && 1599 PtrTy != CondMemOp.getPointerOperandType()) 1600 Ptr = BitCastInst::CreatePointerBitCastOrAddrSpaceCast( 1601 Ptr, CondMemOp.getPointerOperandType(), "", &CondMemOp); 1602 CondMemOp.setOperand(I.getPointerOperandIndex(), Ptr); 1603 if (isa<LoadInst>(I)) { 1604 CondMemOp.setName(I.getName() + (IsThen ? ".then" : ".else") + ".val"); 1605 PN->addIncoming(&CondMemOp, NewMemOpBB); 1606 } else 1607 LLVM_DEBUG(dbgs() << " to: " << CondMemOp << "\n"); 1608 } 1609 if (isa<LoadInst>(I)) { 1610 PN->takeName(&I); 1611 LLVM_DEBUG(dbgs() << " to: " << *PN << "\n"); 1612 I.replaceAllUsesWith(PN); 1613 } 1614 } 1615 1616 static void rewriteMemOpOfSelect(SelectInst &SelInst, Instruction &I, 1617 sroa::SelectHandSpeculativity Spec, 1618 DomTreeUpdater &DTU) { 1619 if (auto *LI = dyn_cast<LoadInst>(&I)) 1620 rewriteMemOpOfSelect(SelInst, *LI, Spec, DTU); 1621 else if (auto *SI = dyn_cast<StoreInst>(&I)) 1622 rewriteMemOpOfSelect(SelInst, *SI, Spec, DTU); 1623 else 1624 llvm_unreachable_internal("Only for load and store."); 1625 } 1626 1627 static bool rewriteSelectInstMemOps(SelectInst &SI, 1628 const sroa::RewriteableMemOps &Ops, 1629 IRBuilderTy &IRB, DomTreeUpdater *DTU) { 1630 bool CFGChanged = false; 1631 LLVM_DEBUG(dbgs() << " original select: " << SI << "\n"); 1632 1633 for (const RewriteableMemOp &Op : Ops) { 1634 sroa::SelectHandSpeculativity Spec; 1635 Instruction *I; 1636 if (auto *const *US = std::get_if<UnspeculatableStore>(&Op)) { 1637 I = *US; 1638 } else { 1639 auto PSL = std::get<PossiblySpeculatableLoad>(Op); 1640 I = PSL.getPointer(); 1641 Spec = PSL.getInt(); 1642 } 1643 if (Spec.areAllSpeculatable()) { 1644 speculateSelectInstLoads(SI, cast<LoadInst>(*I), IRB); 1645 } else { 1646 assert(DTU && "Should not get here when not allowed to modify the CFG!"); 1647 rewriteMemOpOfSelect(SI, *I, Spec, *DTU); 1648 CFGChanged = true; 1649 } 1650 I->eraseFromParent(); 1651 } 1652 1653 for (User *U : make_early_inc_range(SI.users())) 1654 cast<BitCastInst>(U)->eraseFromParent(); 1655 SI.eraseFromParent(); 1656 return CFGChanged; 1657 } 1658 1659 /// Build a GEP out of a base pointer and indices. 1660 /// 1661 /// This will return the BasePtr if that is valid, or build a new GEP 1662 /// instruction using the IRBuilder if GEP-ing is needed. 1663 static Value *buildGEP(IRBuilderTy &IRB, Value *BasePtr, 1664 SmallVectorImpl<Value *> &Indices, 1665 const Twine &NamePrefix) { 1666 if (Indices.empty()) 1667 return BasePtr; 1668 1669 // A single zero index is a no-op, so check for this and avoid building a GEP 1670 // in that case. 
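// For example (illustrative): a request for `getelementptr i32, i32* %base,
// i64 0` is a no-op, so BasePtr itself is returned instead of a new GEP.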
1671 if (Indices.size() == 1 && cast<ConstantInt>(Indices.back())->isZero()) 1672 return BasePtr; 1673 1674 // buildGEP() is only called for non-opaque pointers. 1675 return IRB.CreateInBoundsGEP( 1676 BasePtr->getType()->getNonOpaquePointerElementType(), BasePtr, Indices, 1677 NamePrefix + "sroa_idx"); 1678 } 1679 1680 /// Get a natural GEP off of the BasePtr walking through Ty toward 1681 /// TargetTy without changing the offset of the pointer. 1682 /// 1683 /// This routine assumes we've already established a properly offset GEP with 1684 /// Indices, and arrived at the Ty type. The goal is to continue to GEP with 1685 /// zero-indices down through type layers until we find one the same as 1686 /// TargetTy. If we can't find one with the same type, we at least try to use 1687 /// one with the same size. If none of that works, we just produce the GEP as 1688 /// indicated by Indices to have the correct offset. 1689 static Value *getNaturalGEPWithType(IRBuilderTy &IRB, const DataLayout &DL, 1690 Value *BasePtr, Type *Ty, Type *TargetTy, 1691 SmallVectorImpl<Value *> &Indices, 1692 const Twine &NamePrefix) { 1693 if (Ty == TargetTy) 1694 return buildGEP(IRB, BasePtr, Indices, NamePrefix); 1695 1696 // Offset size to use for the indices. 1697 unsigned OffsetSize = DL.getIndexTypeSizeInBits(BasePtr->getType()); 1698 1699 // See if we can descend into a struct and locate a field with the correct 1700 // type. 1701 unsigned NumLayers = 0; 1702 Type *ElementTy = Ty; 1703 do { 1704 if (ElementTy->isPointerTy()) 1705 break; 1706 1707 if (ArrayType *ArrayTy = dyn_cast<ArrayType>(ElementTy)) { 1708 ElementTy = ArrayTy->getElementType(); 1709 Indices.push_back(IRB.getIntN(OffsetSize, 0)); 1710 } else if (VectorType *VectorTy = dyn_cast<VectorType>(ElementTy)) { 1711 ElementTy = VectorTy->getElementType(); 1712 Indices.push_back(IRB.getInt32(0)); 1713 } else if (StructType *STy = dyn_cast<StructType>(ElementTy)) { 1714 if (STy->element_begin() == STy->element_end()) 1715 break; // Nothing left to descend into. 1716 ElementTy = *STy->element_begin(); 1717 Indices.push_back(IRB.getInt32(0)); 1718 } else { 1719 break; 1720 } 1721 ++NumLayers; 1722 } while (ElementTy != TargetTy); 1723 if (ElementTy != TargetTy) 1724 Indices.erase(Indices.end() - NumLayers, Indices.end()); 1725 1726 return buildGEP(IRB, BasePtr, Indices, NamePrefix); 1727 } 1728 1729 /// Get a natural GEP from a base pointer to a particular offset and 1730 /// resulting in a particular type. 1731 /// 1732 /// The goal is to produce a "natural" looking GEP that works with the existing 1733 /// composite types to arrive at the appropriate offset and element type for 1734 /// a pointer. TargetTy is the element type the returned GEP should point-to if 1735 /// possible. We recurse by decreasing Offset, adding the appropriate index to 1736 /// Indices, and setting Ty to the result subtype. 1737 /// 1738 /// If no natural GEP can be constructed, this function returns null. 1739 static Value *getNaturalGEPWithOffset(IRBuilderTy &IRB, const DataLayout &DL, 1740 Value *Ptr, APInt Offset, Type *TargetTy, 1741 SmallVectorImpl<Value *> &Indices, 1742 const Twine &NamePrefix) { 1743 PointerType *Ty = cast<PointerType>(Ptr->getType()); 1744 1745 // Don't consider any GEPs through an i8* as natural unless the TargetTy is 1746 // an i8. 
1747 if (Ty == IRB.getInt8PtrTy(Ty->getAddressSpace()) && TargetTy->isIntegerTy(8))
1748 return nullptr;
1749
1750 Type *ElementTy = Ty->getNonOpaquePointerElementType();
1751 if (!ElementTy->isSized())
1752 return nullptr; // We can't GEP through an unsized element.
1753
1754 SmallVector<APInt> IntIndices = DL.getGEPIndicesForOffset(ElementTy, Offset);
1755 if (Offset != 0)
1756 return nullptr;
1757
1758 for (const APInt &Index : IntIndices)
1759 Indices.push_back(IRB.getInt(Index));
1760 return getNaturalGEPWithType(IRB, DL, Ptr, ElementTy, TargetTy, Indices,
1761 NamePrefix);
1762 }
1763
1764 /// Compute an adjusted pointer from Ptr by Offset bytes where the
1765 /// resulting pointer has PointerTy.
1766 ///
1767 /// This tries very hard to compute a "natural" GEP which arrives at the offset
1768 /// and produces the pointer type desired. Where it cannot, it will try to use
1769 /// the natural GEP to arrive at the offset and bitcast to the type. Where that
1770 /// fails, it will try to use an existing i8* and GEP to the byte offset and
1771 /// bitcast to the type.
1772 ///
1773 /// The strategy for finding the more natural GEPs is to peel off layers of the
1774 /// pointer, walking back through bit casts and GEPs, searching for a base
1775 /// pointer from which we can compute a natural GEP with the desired
1776 /// properties. The algorithm tries to fold as many constant indices into
1777 /// a single GEP as possible, thus making each GEP more independent of the
1778 /// surrounding code.
1779 static Value *getAdjustedPtr(IRBuilderTy &IRB, const DataLayout &DL, Value *Ptr,
1780 APInt Offset, Type *PointerTy,
1781 const Twine &NamePrefix) {
1782 // Create i8 GEP for opaque pointers.
1783 if (Ptr->getType()->isOpaquePointerTy()) {
1784 if (Offset != 0)
1785 Ptr = IRB.CreateInBoundsGEP(IRB.getInt8Ty(), Ptr, IRB.getInt(Offset),
1786 NamePrefix + "sroa_idx");
1787 return IRB.CreatePointerBitCastOrAddrSpaceCast(Ptr, PointerTy,
1788 NamePrefix + "sroa_cast");
1789 }
1790
1791 // Even though we don't look through PHI nodes, we could be called on an
1792 // instruction in an unreachable block, which may be on a cycle.
1793 SmallPtrSet<Value *, 4> Visited;
1794 Visited.insert(Ptr);
1795 SmallVector<Value *, 4> Indices;
1796
1797 // We may end up computing an offset pointer that has the wrong type. If we
1798 // are never able to compute one directly that has the correct type, we'll
1799 // fall back to it, so keep it and the base it was computed from around here.
1800 Value *OffsetPtr = nullptr;
1801 Value *OffsetBasePtr;
1802
1803 // Remember any i8 pointer we come across to re-use if we need to do a raw
1804 // byte offset.
1805 Value *Int8Ptr = nullptr;
1806 APInt Int8PtrOffset(Offset.getBitWidth(), 0);
1807
1808 PointerType *TargetPtrTy = cast<PointerType>(PointerTy);
1809 Type *TargetTy = TargetPtrTy->getNonOpaquePointerElementType();
1810
1811 // As an `addrspacecast` may be involved, `Ptr` (the storage pointer) may
1812 // have a different address space from the expected `PointerTy` (the pointer
1813 // to be used). Adjust the pointer type based on the original storage pointer.
1814 auto AS = cast<PointerType>(Ptr->getType())->getAddressSpace();
1815 PointerTy = TargetTy->getPointerTo(AS);
1816
1817 do {
1818 // First fold any existing GEPs into the offset.
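// Illustrative example (hypothetical IR): if Ptr is
//   %p1 = getelementptr inbounds i8, i8* %base, i64 8
// and Offset currently holds 4, the loop below folds the GEP away and the
// walk continues from %base with Offset = 12.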
1819 while (GEPOperator *GEP = dyn_cast<GEPOperator>(Ptr)) { 1820 APInt GEPOffset(Offset.getBitWidth(), 0); 1821 if (!GEP->accumulateConstantOffset(DL, GEPOffset)) 1822 break; 1823 Offset += GEPOffset; 1824 Ptr = GEP->getPointerOperand(); 1825 if (!Visited.insert(Ptr).second) 1826 break; 1827 } 1828 1829 // See if we can perform a natural GEP here. 1830 Indices.clear(); 1831 if (Value *P = getNaturalGEPWithOffset(IRB, DL, Ptr, Offset, TargetTy, 1832 Indices, NamePrefix)) { 1833 // If we have a new natural pointer at the offset, clear out any old 1834 // offset pointer we computed. Unless it is the base pointer or 1835 // a non-instruction, we built a GEP we don't need. Zap it. 1836 if (OffsetPtr && OffsetPtr != OffsetBasePtr) 1837 if (Instruction *I = dyn_cast<Instruction>(OffsetPtr)) { 1838 assert(I->use_empty() && "Built a GEP with uses some how!"); 1839 I->eraseFromParent(); 1840 } 1841 OffsetPtr = P; 1842 OffsetBasePtr = Ptr; 1843 // If we also found a pointer of the right type, we're done. 1844 if (P->getType() == PointerTy) 1845 break; 1846 } 1847 1848 // Stash this pointer if we've found an i8*. 1849 if (Ptr->getType()->isIntegerTy(8)) { 1850 Int8Ptr = Ptr; 1851 Int8PtrOffset = Offset; 1852 } 1853 1854 // Peel off a layer of the pointer and update the offset appropriately. 1855 if (Operator::getOpcode(Ptr) == Instruction::BitCast) { 1856 Ptr = cast<Operator>(Ptr)->getOperand(0); 1857 } else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(Ptr)) { 1858 if (GA->isInterposable()) 1859 break; 1860 Ptr = GA->getAliasee(); 1861 } else { 1862 break; 1863 } 1864 assert(Ptr->getType()->isPointerTy() && "Unexpected operand type!"); 1865 } while (Visited.insert(Ptr).second); 1866 1867 if (!OffsetPtr) { 1868 if (!Int8Ptr) { 1869 Int8Ptr = IRB.CreateBitCast( 1870 Ptr, IRB.getInt8PtrTy(PointerTy->getPointerAddressSpace()), 1871 NamePrefix + "sroa_raw_cast"); 1872 Int8PtrOffset = Offset; 1873 } 1874 1875 OffsetPtr = Int8PtrOffset == 0 1876 ? Int8Ptr 1877 : IRB.CreateInBoundsGEP(IRB.getInt8Ty(), Int8Ptr, 1878 IRB.getInt(Int8PtrOffset), 1879 NamePrefix + "sroa_raw_idx"); 1880 } 1881 Ptr = OffsetPtr; 1882 1883 // On the off chance we were targeting i8*, guard the bitcast here. 1884 if (cast<PointerType>(Ptr->getType()) != TargetPtrTy) { 1885 Ptr = IRB.CreatePointerBitCastOrAddrSpaceCast(Ptr, 1886 TargetPtrTy, 1887 NamePrefix + "sroa_cast"); 1888 } 1889 1890 return Ptr; 1891 } 1892 1893 /// Compute the adjusted alignment for a load or store from an offset. 1894 static Align getAdjustedAlignment(Instruction *I, uint64_t Offset) { 1895 return commonAlignment(getLoadStoreAlignment(I), Offset); 1896 } 1897 1898 /// Test whether we can convert a value from the old to the new type. 1899 /// 1900 /// This predicate should be used to guard calls to convertValue in order to 1901 /// ensure that we only try to convert viable values. The strategy is that we 1902 /// will peel off single element struct and array wrappings to get to an 1903 /// underlying value, and convert that value. 1904 static bool canConvertValue(const DataLayout &DL, Type *OldTy, Type *NewTy) { 1905 if (OldTy == NewTy) 1906 return true; 1907 1908 // For integer types, we can't handle any bit-width differences. This would 1909 // break both vector conversions with extension and introduce endianness 1910 // issues when in conjunction with loads and stores. 
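// To make the predicate concrete, a few hedged examples (assuming 64-bit
// pointers in an integral address space; not an exhaustive list):
//   i64       <-> double : convertible (plain bitcast)
//   i64       <-> i8*    : convertible (ptrtoint / inttoptr)
//   <2 x i32> <-> i64    : convertible (same total size, single value types)
//   i32       <-> i64    : NOT convertible (width mismatch, rejected below)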
1911 if (isa<IntegerType>(OldTy) && isa<IntegerType>(NewTy)) { 1912 assert(cast<IntegerType>(OldTy)->getBitWidth() != 1913 cast<IntegerType>(NewTy)->getBitWidth() && 1914 "We can't have the same bitwidth for different int types"); 1915 return false; 1916 } 1917 1918 if (DL.getTypeSizeInBits(NewTy).getFixedValue() != 1919 DL.getTypeSizeInBits(OldTy).getFixedValue()) 1920 return false; 1921 if (!NewTy->isSingleValueType() || !OldTy->isSingleValueType()) 1922 return false; 1923 1924 // We can convert pointers to integers and vice-versa. Same for vectors 1925 // of pointers and integers. 1926 OldTy = OldTy->getScalarType(); 1927 NewTy = NewTy->getScalarType(); 1928 if (NewTy->isPointerTy() || OldTy->isPointerTy()) { 1929 if (NewTy->isPointerTy() && OldTy->isPointerTy()) { 1930 unsigned OldAS = OldTy->getPointerAddressSpace(); 1931 unsigned NewAS = NewTy->getPointerAddressSpace(); 1932 // Convert pointers if they are pointers from the same address space or 1933 // different integral (not non-integral) address spaces with the same 1934 // pointer size. 1935 return OldAS == NewAS || 1936 (!DL.isNonIntegralAddressSpace(OldAS) && 1937 !DL.isNonIntegralAddressSpace(NewAS) && 1938 DL.getPointerSize(OldAS) == DL.getPointerSize(NewAS)); 1939 } 1940 1941 // We can convert integers to integral pointers, but not to non-integral 1942 // pointers. 1943 if (OldTy->isIntegerTy()) 1944 return !DL.isNonIntegralPointerType(NewTy); 1945 1946 // We can convert integral pointers to integers, but non-integral pointers 1947 // need to remain pointers. 1948 if (!DL.isNonIntegralPointerType(OldTy)) 1949 return NewTy->isIntegerTy(); 1950 1951 return false; 1952 } 1953 1954 if (OldTy->isTargetExtTy() || NewTy->isTargetExtTy()) 1955 return false; 1956 1957 return true; 1958 } 1959 1960 /// Generic routine to convert an SSA value to a value of a different 1961 /// type. 1962 /// 1963 /// This will try various different casting techniques, such as bitcasts, 1964 /// inttoptr, and ptrtoint casts. Use the \c canConvertValue predicate to test 1965 /// two types for viability with this routine. 1966 static Value *convertValue(const DataLayout &DL, IRBuilderTy &IRB, Value *V, 1967 Type *NewTy) { 1968 Type *OldTy = V->getType(); 1969 assert(canConvertValue(DL, OldTy, NewTy) && "Value not convertable to type"); 1970 1971 if (OldTy == NewTy) 1972 return V; 1973 1974 assert(!(isa<IntegerType>(OldTy) && isa<IntegerType>(NewTy)) && 1975 "Integer types must be the exact same to convert."); 1976 1977 // See if we need inttoptr for this type pair. May require additional bitcast. 1978 if (OldTy->isIntOrIntVectorTy() && NewTy->isPtrOrPtrVectorTy()) { 1979 // Expand <2 x i32> to i8* --> <2 x i32> to i64 to i8* 1980 // Expand i128 to <2 x i8*> --> i128 to <2 x i64> to <2 x i8*> 1981 // Expand <4 x i32> to <2 x i8*> --> <4 x i32> to <2 x i64> to <2 x i8*> 1982 // Directly handle i64 to i8* 1983 return IRB.CreateIntToPtr(IRB.CreateBitCast(V, DL.getIntPtrType(NewTy)), 1984 NewTy); 1985 } 1986 1987 // See if we need ptrtoint for this type pair. May require additional bitcast. 
1988 if (OldTy->isPtrOrPtrVectorTy() && NewTy->isIntOrIntVectorTy()) { 1989 // Expand <2 x i8*> to i128 --> <2 x i8*> to <2 x i64> to i128 1990 // Expand i8* to <2 x i32> --> i8* to i64 to <2 x i32> 1991 // Expand <2 x i8*> to <4 x i32> --> <2 x i8*> to <2 x i64> to <4 x i32> 1992 // Expand i8* to i64 --> i8* to i64 to i64 1993 return IRB.CreateBitCast(IRB.CreatePtrToInt(V, DL.getIntPtrType(OldTy)), 1994 NewTy); 1995 } 1996 1997 if (OldTy->isPtrOrPtrVectorTy() && NewTy->isPtrOrPtrVectorTy()) { 1998 unsigned OldAS = OldTy->getPointerAddressSpace(); 1999 unsigned NewAS = NewTy->getPointerAddressSpace(); 2000 // To convert pointers with different address spaces (they are already 2001 // checked convertible, i.e. they have the same pointer size), so far we 2002 // cannot use `bitcast` (which has restrict on the same address space) or 2003 // `addrspacecast` (which is not always no-op casting). Instead, use a pair 2004 // of no-op `ptrtoint`/`inttoptr` casts through an integer with the same bit 2005 // size. 2006 if (OldAS != NewAS) { 2007 assert(DL.getPointerSize(OldAS) == DL.getPointerSize(NewAS)); 2008 return IRB.CreateIntToPtr(IRB.CreatePtrToInt(V, DL.getIntPtrType(OldTy)), 2009 NewTy); 2010 } 2011 } 2012 2013 return IRB.CreateBitCast(V, NewTy); 2014 } 2015 2016 /// Test whether the given slice use can be promoted to a vector. 2017 /// 2018 /// This function is called to test each entry in a partition which is slated 2019 /// for a single slice. 2020 static bool isVectorPromotionViableForSlice(Partition &P, const Slice &S, 2021 VectorType *Ty, 2022 uint64_t ElementSize, 2023 const DataLayout &DL) { 2024 // First validate the slice offsets. 2025 uint64_t BeginOffset = 2026 std::max(S.beginOffset(), P.beginOffset()) - P.beginOffset(); 2027 uint64_t BeginIndex = BeginOffset / ElementSize; 2028 if (BeginIndex * ElementSize != BeginOffset || 2029 BeginIndex >= cast<FixedVectorType>(Ty)->getNumElements()) 2030 return false; 2031 uint64_t EndOffset = 2032 std::min(S.endOffset(), P.endOffset()) - P.beginOffset(); 2033 uint64_t EndIndex = EndOffset / ElementSize; 2034 if (EndIndex * ElementSize != EndOffset || 2035 EndIndex > cast<FixedVectorType>(Ty)->getNumElements()) 2036 return false; 2037 2038 assert(EndIndex > BeginIndex && "Empty vector!"); 2039 uint64_t NumElements = EndIndex - BeginIndex; 2040 Type *SliceTy = (NumElements == 1) 2041 ? Ty->getElementType() 2042 : FixedVectorType::get(Ty->getElementType(), NumElements); 2043 2044 Type *SplitIntTy = 2045 Type::getIntNTy(Ty->getContext(), NumElements * ElementSize * 8); 2046 2047 Use *U = S.getUse(); 2048 2049 if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(U->getUser())) { 2050 if (MI->isVolatile()) 2051 return false; 2052 if (!S.isSplittable()) 2053 return false; // Skip any unsplittable intrinsics. 2054 } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(U->getUser())) { 2055 if (!II->isLifetimeStartOrEnd() && !II->isDroppable()) 2056 return false; 2057 } else if (LoadInst *LI = dyn_cast<LoadInst>(U->getUser())) { 2058 if (LI->isVolatile()) 2059 return false; 2060 Type *LTy = LI->getType(); 2061 // Disable vector promotion when there are loads or stores of an FCA. 
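// e.g. (illustrative) a first-class aggregate access such as
//   %v = load { i32, float }, { i32, float }* %p
// is rejected here rather than being shoehorned into the candidate vector.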
2062 if (LTy->isStructTy()) 2063 return false; 2064 if (P.beginOffset() > S.beginOffset() || P.endOffset() < S.endOffset()) { 2065 assert(LTy->isIntegerTy()); 2066 LTy = SplitIntTy; 2067 } 2068 if (!canConvertValue(DL, SliceTy, LTy)) 2069 return false; 2070 } else if (StoreInst *SI = dyn_cast<StoreInst>(U->getUser())) { 2071 if (SI->isVolatile()) 2072 return false; 2073 Type *STy = SI->getValueOperand()->getType(); 2074 // Disable vector promotion when there are loads or stores of an FCA. 2075 if (STy->isStructTy()) 2076 return false; 2077 if (P.beginOffset() > S.beginOffset() || P.endOffset() < S.endOffset()) { 2078 assert(STy->isIntegerTy()); 2079 STy = SplitIntTy; 2080 } 2081 if (!canConvertValue(DL, STy, SliceTy)) 2082 return false; 2083 } else { 2084 return false; 2085 } 2086 2087 return true; 2088 } 2089 2090 /// Test whether a vector type is viable for promotion. 2091 /// 2092 /// This implements the necessary checking for \c isVectorPromotionViable over 2093 /// all slices of the alloca for the given VectorType. 2094 static bool checkVectorTypeForPromotion(Partition &P, VectorType *VTy, 2095 const DataLayout &DL) { 2096 uint64_t ElementSize = 2097 DL.getTypeSizeInBits(VTy->getElementType()).getFixedValue(); 2098 2099 // While the definition of LLVM vectors is bitpacked, we don't support sizes 2100 // that aren't byte sized. 2101 if (ElementSize % 8) 2102 return false; 2103 assert((DL.getTypeSizeInBits(VTy).getFixedValue() % 8) == 0 && 2104 "vector size not a multiple of element size?"); 2105 ElementSize /= 8; 2106 2107 for (const Slice &S : P) 2108 if (!isVectorPromotionViableForSlice(P, S, VTy, ElementSize, DL)) 2109 return false; 2110 2111 for (const Slice *S : P.splitSliceTails()) 2112 if (!isVectorPromotionViableForSlice(P, *S, VTy, ElementSize, DL)) 2113 return false; 2114 2115 return true; 2116 } 2117 2118 /// Test whether the given alloca partitioning and range of slices can be 2119 /// promoted to a vector. 2120 /// 2121 /// This is a quick test to check whether we can rewrite a particular alloca 2122 /// partition (and its newly formed alloca) into a vector alloca with only 2123 /// whole-vector loads and stores such that it could be promoted to a vector 2124 /// SSA value. We only can ensure this for a limited set of operations, and we 2125 /// don't want to do the rewrites unless we are confident that the result will 2126 /// be promotable, so we have an early test here. 2127 static VectorType *isVectorPromotionViable(Partition &P, const DataLayout &DL) { 2128 // Collect the candidate types for vector-based promotion. Also track whether 2129 // we have different element types. 2130 SmallVector<VectorType *, 4> CandidateTys; 2131 Type *CommonEltTy = nullptr; 2132 VectorType *CommonVecPtrTy = nullptr; 2133 bool HaveVecPtrTy = false; 2134 bool HaveCommonEltTy = true; 2135 bool HaveCommonVecPtrTy = true; 2136 auto CheckCandidateType = [&](Type *Ty) { 2137 if (auto *VTy = dyn_cast<VectorType>(Ty)) { 2138 // Return if bitcast to vectors is different for total size in bits. 
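// Illustrative example: if the first candidate recorded was <4 x i32>
// (128 bits) and this use accesses a <2 x i32> (64 bits), no single vector
// type can describe both whole-partition accesses, so the candidates
// gathered so far are discarded below.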
2139 if (!CandidateTys.empty()) { 2140 VectorType *V = CandidateTys[0]; 2141 if (DL.getTypeSizeInBits(VTy).getFixedValue() != 2142 DL.getTypeSizeInBits(V).getFixedValue()) { 2143 CandidateTys.clear(); 2144 return; 2145 } 2146 } 2147 CandidateTys.push_back(VTy); 2148 Type *EltTy = VTy->getElementType(); 2149 2150 if (!CommonEltTy) 2151 CommonEltTy = EltTy; 2152 else if (CommonEltTy != EltTy) 2153 HaveCommonEltTy = false; 2154 2155 if (EltTy->isPointerTy()) { 2156 HaveVecPtrTy = true; 2157 if (!CommonVecPtrTy) 2158 CommonVecPtrTy = VTy; 2159 else if (CommonVecPtrTy != VTy) 2160 HaveCommonVecPtrTy = false; 2161 } 2162 } 2163 }; 2164 // Consider any loads or stores that are the exact size of the slice. 2165 for (const Slice &S : P) 2166 if (S.beginOffset() == P.beginOffset() && 2167 S.endOffset() == P.endOffset()) { 2168 if (auto *LI = dyn_cast<LoadInst>(S.getUse()->getUser())) 2169 CheckCandidateType(LI->getType()); 2170 else if (auto *SI = dyn_cast<StoreInst>(S.getUse()->getUser())) 2171 CheckCandidateType(SI->getValueOperand()->getType()); 2172 } 2173 2174 // If we didn't find a vector type, nothing to do here. 2175 if (CandidateTys.empty()) 2176 return nullptr; 2177 2178 // Pointer-ness is sticky, if we had a vector-of-pointers candidate type, 2179 // then we should choose it, not some other alternative. 2180 // But, we can't perform a no-op pointer address space change via bitcast, 2181 // so if we didn't have a common pointer element type, bail. 2182 if (HaveVecPtrTy && !HaveCommonVecPtrTy) 2183 return nullptr; 2184 2185 // Try to pick the "best" element type out of the choices. 2186 if (!HaveCommonEltTy && HaveVecPtrTy) { 2187 // If there was a pointer element type, there's really only one choice. 2188 CandidateTys.clear(); 2189 CandidateTys.push_back(CommonVecPtrTy); 2190 } else if (!HaveCommonEltTy && !HaveVecPtrTy) { 2191 // Integer-ify vector types. 2192 for (VectorType *&VTy : CandidateTys) { 2193 if (!VTy->getElementType()->isIntegerTy()) 2194 VTy = cast<VectorType>(VTy->getWithNewType(IntegerType::getIntNTy( 2195 VTy->getContext(), VTy->getScalarSizeInBits()))); 2196 } 2197 2198 // Rank the remaining candidate vector types. This is easy because we know 2199 // they're all integer vectors. We sort by ascending number of elements. 2200 auto RankVectorTypes = [&DL](VectorType *RHSTy, VectorType *LHSTy) { 2201 (void)DL; 2202 assert(DL.getTypeSizeInBits(RHSTy).getFixedValue() == 2203 DL.getTypeSizeInBits(LHSTy).getFixedValue() && 2204 "Cannot have vector types of different sizes!"); 2205 assert(RHSTy->getElementType()->isIntegerTy() && 2206 "All non-integer types eliminated!"); 2207 assert(LHSTy->getElementType()->isIntegerTy() && 2208 "All non-integer types eliminated!"); 2209 return cast<FixedVectorType>(RHSTy)->getNumElements() < 2210 cast<FixedVectorType>(LHSTy)->getNumElements(); 2211 }; 2212 llvm::sort(CandidateTys, RankVectorTypes); 2213 CandidateTys.erase( 2214 std::unique(CandidateTys.begin(), CandidateTys.end(), RankVectorTypes), 2215 CandidateTys.end()); 2216 } else { 2217 // The only way to have the same element type in every vector type is to 2218 // have the same vector type. Check that and remove all but one. 2219 #ifndef NDEBUG 2220 for (VectorType *VTy : CandidateTys) { 2221 assert(VTy->getElementType() == CommonEltTy && 2222 "Unaccounted for element type!"); 2223 assert(VTy == CandidateTys[0] && 2224 "Different vector types with the same element type!"); 2225 } 2226 #endif 2227 CandidateTys.resize(1); 2228 } 2229 2230 // FIXME: hack. 
Do we have a named constant for this? 2231 // SDAG SDNode can't have more than 65535 operands. 2232 llvm::erase_if(CandidateTys, [](VectorType *VTy) { 2233 return cast<FixedVectorType>(VTy)->getNumElements() > 2234 std::numeric_limits<unsigned short>::max(); 2235 }); 2236 2237 for (VectorType *VTy : CandidateTys) 2238 if (checkVectorTypeForPromotion(P, VTy, DL)) 2239 return VTy; 2240 2241 return nullptr; 2242 } 2243 2244 /// Test whether a slice of an alloca is valid for integer widening. 2245 /// 2246 /// This implements the necessary checking for the \c isIntegerWideningViable 2247 /// test below on a single slice of the alloca. 2248 static bool isIntegerWideningViableForSlice(const Slice &S, 2249 uint64_t AllocBeginOffset, 2250 Type *AllocaTy, 2251 const DataLayout &DL, 2252 bool &WholeAllocaOp) { 2253 uint64_t Size = DL.getTypeStoreSize(AllocaTy).getFixedValue(); 2254 2255 uint64_t RelBegin = S.beginOffset() - AllocBeginOffset; 2256 uint64_t RelEnd = S.endOffset() - AllocBeginOffset; 2257 2258 Use *U = S.getUse(); 2259 2260 // Lifetime intrinsics operate over the whole alloca whose sizes are usually 2261 // larger than other load/store slices (RelEnd > Size). But lifetime are 2262 // always promotable and should not impact other slices' promotability of the 2263 // partition. 2264 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(U->getUser())) { 2265 if (II->isLifetimeStartOrEnd() || II->isDroppable()) 2266 return true; 2267 } 2268 2269 // We can't reasonably handle cases where the load or store extends past 2270 // the end of the alloca's type and into its padding. 2271 if (RelEnd > Size) 2272 return false; 2273 2274 if (LoadInst *LI = dyn_cast<LoadInst>(U->getUser())) { 2275 if (LI->isVolatile()) 2276 return false; 2277 // We can't handle loads that extend past the allocated memory. 2278 if (DL.getTypeStoreSize(LI->getType()).getFixedValue() > Size) 2279 return false; 2280 // So far, AllocaSliceRewriter does not support widening split slice tails 2281 // in rewriteIntegerLoad. 2282 if (S.beginOffset() < AllocBeginOffset) 2283 return false; 2284 // Note that we don't count vector loads or stores as whole-alloca 2285 // operations which enable integer widening because we would prefer to use 2286 // vector widening instead. 2287 if (!isa<VectorType>(LI->getType()) && RelBegin == 0 && RelEnd == Size) 2288 WholeAllocaOp = true; 2289 if (IntegerType *ITy = dyn_cast<IntegerType>(LI->getType())) { 2290 if (ITy->getBitWidth() < DL.getTypeStoreSizeInBits(ITy).getFixedValue()) 2291 return false; 2292 } else if (RelBegin != 0 || RelEnd != Size || 2293 !canConvertValue(DL, AllocaTy, LI->getType())) { 2294 // Non-integer loads need to be convertible from the alloca type so that 2295 // they are promotable. 2296 return false; 2297 } 2298 } else if (StoreInst *SI = dyn_cast<StoreInst>(U->getUser())) { 2299 Type *ValueTy = SI->getValueOperand()->getType(); 2300 if (SI->isVolatile()) 2301 return false; 2302 // We can't handle stores that extend past the allocated memory. 2303 if (DL.getTypeStoreSize(ValueTy).getFixedValue() > Size) 2304 return false; 2305 // So far, AllocaSliceRewriter does not support widening split slice tails 2306 // in rewriteIntegerStore. 2307 if (S.beginOffset() < AllocBeginOffset) 2308 return false; 2309 // Note that we don't count vector loads or stores as whole-alloca 2310 // operations which enable integer widening because we would prefer to use 2311 // vector widening instead. 
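// For example (illustrative), on an 8-byte partition a covering
// `store i64 %v, i64* %p` counts as a whole-alloca operation, while an
// equally wide `store <2 x i32> %v, <2 x i32>* %p` deliberately does not.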
2312 if (!isa<VectorType>(ValueTy) && RelBegin == 0 && RelEnd == Size) 2313 WholeAllocaOp = true; 2314 if (IntegerType *ITy = dyn_cast<IntegerType>(ValueTy)) { 2315 if (ITy->getBitWidth() < DL.getTypeStoreSizeInBits(ITy).getFixedValue()) 2316 return false; 2317 } else if (RelBegin != 0 || RelEnd != Size || 2318 !canConvertValue(DL, ValueTy, AllocaTy)) { 2319 // Non-integer stores need to be convertible to the alloca type so that 2320 // they are promotable. 2321 return false; 2322 } 2323 } else if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(U->getUser())) { 2324 if (MI->isVolatile() || !isa<Constant>(MI->getLength())) 2325 return false; 2326 if (!S.isSplittable()) 2327 return false; // Skip any unsplittable intrinsics. 2328 } else { 2329 return false; 2330 } 2331 2332 return true; 2333 } 2334 2335 /// Test whether the given alloca partition's integer operations can be 2336 /// widened to promotable ones. 2337 /// 2338 /// This is a quick test to check whether we can rewrite the integer loads and 2339 /// stores to a particular alloca into wider loads and stores and be able to 2340 /// promote the resulting alloca. 2341 static bool isIntegerWideningViable(Partition &P, Type *AllocaTy, 2342 const DataLayout &DL) { 2343 uint64_t SizeInBits = DL.getTypeSizeInBits(AllocaTy).getFixedValue(); 2344 // Don't create integer types larger than the maximum bitwidth. 2345 if (SizeInBits > IntegerType::MAX_INT_BITS) 2346 return false; 2347 2348 // Don't try to handle allocas with bit-padding. 2349 if (SizeInBits != DL.getTypeStoreSizeInBits(AllocaTy).getFixedValue()) 2350 return false; 2351 2352 // We need to ensure that an integer type with the appropriate bitwidth can 2353 // be converted to the alloca type, whatever that is. We don't want to force 2354 // the alloca itself to have an integer type if there is a more suitable one. 2355 Type *IntTy = Type::getIntNTy(AllocaTy->getContext(), SizeInBits); 2356 if (!canConvertValue(DL, AllocaTy, IntTy) || 2357 !canConvertValue(DL, IntTy, AllocaTy)) 2358 return false; 2359 2360 // While examining uses, we ensure that the alloca has a covering load or 2361 // store. We don't want to widen the integer operations only to fail to 2362 // promote due to some other unsplittable entry (which we may make splittable 2363 // later). However, if there are only splittable uses, go ahead and assume 2364 // that we cover the alloca. 2365 // FIXME: We shouldn't consider split slices that happen to start in the 2366 // partition here... 
2367 bool WholeAllocaOp = P.empty() && DL.isLegalInteger(SizeInBits); 2368 2369 for (const Slice &S : P) 2370 if (!isIntegerWideningViableForSlice(S, P.beginOffset(), AllocaTy, DL, 2371 WholeAllocaOp)) 2372 return false; 2373 2374 for (const Slice *S : P.splitSliceTails()) 2375 if (!isIntegerWideningViableForSlice(*S, P.beginOffset(), AllocaTy, DL, 2376 WholeAllocaOp)) 2377 return false; 2378 2379 return WholeAllocaOp; 2380 } 2381 2382 static Value *extractInteger(const DataLayout &DL, IRBuilderTy &IRB, Value *V, 2383 IntegerType *Ty, uint64_t Offset, 2384 const Twine &Name) { 2385 LLVM_DEBUG(dbgs() << " start: " << *V << "\n"); 2386 IntegerType *IntTy = cast<IntegerType>(V->getType()); 2387 assert(DL.getTypeStoreSize(Ty).getFixedValue() + Offset <= 2388 DL.getTypeStoreSize(IntTy).getFixedValue() && 2389 "Element extends past full value"); 2390 uint64_t ShAmt = 8 * Offset; 2391 if (DL.isBigEndian()) 2392 ShAmt = 8 * (DL.getTypeStoreSize(IntTy).getFixedValue() - 2393 DL.getTypeStoreSize(Ty).getFixedValue() - Offset); 2394 if (ShAmt) { 2395 V = IRB.CreateLShr(V, ShAmt, Name + ".shift"); 2396 LLVM_DEBUG(dbgs() << " shifted: " << *V << "\n"); 2397 } 2398 assert(Ty->getBitWidth() <= IntTy->getBitWidth() && 2399 "Cannot extract to a larger integer!"); 2400 if (Ty != IntTy) { 2401 V = IRB.CreateTrunc(V, Ty, Name + ".trunc"); 2402 LLVM_DEBUG(dbgs() << " trunced: " << *V << "\n"); 2403 } 2404 return V; 2405 } 2406 2407 static Value *insertInteger(const DataLayout &DL, IRBuilderTy &IRB, Value *Old, 2408 Value *V, uint64_t Offset, const Twine &Name) { 2409 IntegerType *IntTy = cast<IntegerType>(Old->getType()); 2410 IntegerType *Ty = cast<IntegerType>(V->getType()); 2411 assert(Ty->getBitWidth() <= IntTy->getBitWidth() && 2412 "Cannot insert a larger integer!"); 2413 LLVM_DEBUG(dbgs() << " start: " << *V << "\n"); 2414 if (Ty != IntTy) { 2415 V = IRB.CreateZExt(V, IntTy, Name + ".ext"); 2416 LLVM_DEBUG(dbgs() << " extended: " << *V << "\n"); 2417 } 2418 assert(DL.getTypeStoreSize(Ty).getFixedValue() + Offset <= 2419 DL.getTypeStoreSize(IntTy).getFixedValue() && 2420 "Element store outside of alloca store"); 2421 uint64_t ShAmt = 8 * Offset; 2422 if (DL.isBigEndian()) 2423 ShAmt = 8 * (DL.getTypeStoreSize(IntTy).getFixedValue() - 2424 DL.getTypeStoreSize(Ty).getFixedValue() - Offset); 2425 if (ShAmt) { 2426 V = IRB.CreateShl(V, ShAmt, Name + ".shift"); 2427 LLVM_DEBUG(dbgs() << " shifted: " << *V << "\n"); 2428 } 2429 2430 if (ShAmt || Ty->getBitWidth() < IntTy->getBitWidth()) { 2431 APInt Mask = ~Ty->getMask().zext(IntTy->getBitWidth()).shl(ShAmt); 2432 Old = IRB.CreateAnd(Old, Mask, Name + ".mask"); 2433 LLVM_DEBUG(dbgs() << " masked: " << *Old << "\n"); 2434 V = IRB.CreateOr(Old, V, Name + ".insert"); 2435 LLVM_DEBUG(dbgs() << " inserted: " << *V << "\n"); 2436 } 2437 return V; 2438 } 2439 2440 static Value *extractVector(IRBuilderTy &IRB, Value *V, unsigned BeginIndex, 2441 unsigned EndIndex, const Twine &Name) { 2442 auto *VecTy = cast<FixedVectorType>(V->getType()); 2443 unsigned NumElements = EndIndex - BeginIndex; 2444 assert(NumElements <= VecTy->getNumElements() && "Too many elements!"); 2445 2446 if (NumElements == VecTy->getNumElements()) 2447 return V; 2448 2449 if (NumElements == 1) { 2450 V = IRB.CreateExtractElement(V, IRB.getInt32(BeginIndex), 2451 Name + ".extract"); 2452 LLVM_DEBUG(dbgs() << " extract: " << *V << "\n"); 2453 return V; 2454 } 2455 2456 auto Mask = llvm::to_vector<8>(llvm::seq<int>(BeginIndex, EndIndex)); 2457 V = IRB.CreateShuffleVector(V, Mask, Name + ".extract"); 
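// Illustrative example (hypothetical names): extracting elements [1, 3) of a
// <4 x float> value uses the mask <1, 2>, i.e. roughly
//   %x.extract = shufflevector <4 x float> %v, <4 x float> poison,
//                              <2 x i32> <i32 1, i32 2>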
2458 LLVM_DEBUG(dbgs() << " shuffle: " << *V << "\n"); 2459 return V; 2460 } 2461 2462 static Value *insertVector(IRBuilderTy &IRB, Value *Old, Value *V, 2463 unsigned BeginIndex, const Twine &Name) { 2464 VectorType *VecTy = cast<VectorType>(Old->getType()); 2465 assert(VecTy && "Can only insert a vector into a vector"); 2466 2467 VectorType *Ty = dyn_cast<VectorType>(V->getType()); 2468 if (!Ty) { 2469 // Single element to insert. 2470 V = IRB.CreateInsertElement(Old, V, IRB.getInt32(BeginIndex), 2471 Name + ".insert"); 2472 LLVM_DEBUG(dbgs() << " insert: " << *V << "\n"); 2473 return V; 2474 } 2475 2476 assert(cast<FixedVectorType>(Ty)->getNumElements() <= 2477 cast<FixedVectorType>(VecTy)->getNumElements() && 2478 "Too many elements!"); 2479 if (cast<FixedVectorType>(Ty)->getNumElements() == 2480 cast<FixedVectorType>(VecTy)->getNumElements()) { 2481 assert(V->getType() == VecTy && "Vector type mismatch"); 2482 return V; 2483 } 2484 unsigned EndIndex = BeginIndex + cast<FixedVectorType>(Ty)->getNumElements(); 2485 2486 // When inserting a smaller vector into the larger to store, we first 2487 // use a shuffle vector to widen it with undef elements, and then 2488 // a second shuffle vector to select between the loaded vector and the 2489 // incoming vector. 2490 SmallVector<int, 8> Mask; 2491 Mask.reserve(cast<FixedVectorType>(VecTy)->getNumElements()); 2492 for (unsigned i = 0; i != cast<FixedVectorType>(VecTy)->getNumElements(); ++i) 2493 if (i >= BeginIndex && i < EndIndex) 2494 Mask.push_back(i - BeginIndex); 2495 else 2496 Mask.push_back(-1); 2497 V = IRB.CreateShuffleVector(V, Mask, Name + ".expand"); 2498 LLVM_DEBUG(dbgs() << " shuffle: " << *V << "\n"); 2499 2500 SmallVector<Constant *, 8> Mask2; 2501 Mask2.reserve(cast<FixedVectorType>(VecTy)->getNumElements()); 2502 for (unsigned i = 0; i != cast<FixedVectorType>(VecTy)->getNumElements(); ++i) 2503 Mask2.push_back(IRB.getInt1(i >= BeginIndex && i < EndIndex)); 2504 2505 V = IRB.CreateSelect(ConstantVector::get(Mask2), V, Old, Name + "blend"); 2506 2507 LLVM_DEBUG(dbgs() << " blend: " << *V << "\n"); 2508 return V; 2509 } 2510 2511 /// Visitor to rewrite instructions using p particular slice of an alloca 2512 /// to use a new alloca. 2513 /// 2514 /// Also implements the rewriting to vector-based accesses when the partition 2515 /// passes the isVectorPromotionViable predicate. Most of the rewriting logic 2516 /// lives here. 2517 class llvm::sroa::AllocaSliceRewriter 2518 : public InstVisitor<AllocaSliceRewriter, bool> { 2519 // Befriend the base class so it can delegate to private visit methods. 2520 friend class InstVisitor<AllocaSliceRewriter, bool>; 2521 2522 using Base = InstVisitor<AllocaSliceRewriter, bool>; 2523 2524 const DataLayout &DL; 2525 AllocaSlices &AS; 2526 SROAPass &Pass; 2527 AllocaInst &OldAI, &NewAI; 2528 const uint64_t NewAllocaBeginOffset, NewAllocaEndOffset; 2529 Type *NewAllocaTy; 2530 2531 // This is a convenience and flag variable that will be null unless the new 2532 // alloca's integer operations should be widened to this integer type due to 2533 // passing isIntegerWideningViable above. If it is non-null, the desired 2534 // integer type will be stored here for easy access during rewriting. 2535 IntegerType *IntTy; 2536 2537 // If we are rewriting an alloca partition which can be written as pure 2538 // vector operations, we stash extra information here. 
When VecTy is 2539 // non-null, we have some strict guarantees about the rewritten alloca: 2540 // - The new alloca is exactly the size of the vector type here. 2541 // - The accesses all either map to the entire vector or to a single 2542 // element. 2543 // - The set of accessing instructions is only one of those handled above 2544 // in isVectorPromotionViable. Generally these are the same access kinds 2545 // which are promotable via mem2reg. 2546 VectorType *VecTy; 2547 Type *ElementTy; 2548 uint64_t ElementSize; 2549 2550 // The original offset of the slice currently being rewritten relative to 2551 // the original alloca. 2552 uint64_t BeginOffset = 0; 2553 uint64_t EndOffset = 0; 2554 2555 // The new offsets of the slice currently being rewritten relative to the 2556 // original alloca. 2557 uint64_t NewBeginOffset = 0, NewEndOffset = 0; 2558 2559 uint64_t RelativeOffset = 0; 2560 uint64_t SliceSize = 0; 2561 bool IsSplittable = false; 2562 bool IsSplit = false; 2563 Use *OldUse = nullptr; 2564 Instruction *OldPtr = nullptr; 2565 2566 // Track post-rewrite users which are PHI nodes and Selects. 2567 SmallSetVector<PHINode *, 8> &PHIUsers; 2568 SmallSetVector<SelectInst *, 8> &SelectUsers; 2569 2570 // Utility IR builder, whose name prefix is setup for each visited use, and 2571 // the insertion point is set to point to the user. 2572 IRBuilderTy IRB; 2573 2574 // Return the new alloca, addrspacecasted if required to avoid changing the 2575 // addrspace of a volatile access. 2576 Value *getPtrToNewAI(unsigned AddrSpace, bool IsVolatile) { 2577 if (!IsVolatile || AddrSpace == NewAI.getType()->getPointerAddressSpace()) 2578 return &NewAI; 2579 2580 Type *AccessTy = NewAI.getAllocatedType()->getPointerTo(AddrSpace); 2581 return IRB.CreateAddrSpaceCast(&NewAI, AccessTy); 2582 } 2583 2584 public: 2585 AllocaSliceRewriter(const DataLayout &DL, AllocaSlices &AS, SROAPass &Pass, 2586 AllocaInst &OldAI, AllocaInst &NewAI, 2587 uint64_t NewAllocaBeginOffset, 2588 uint64_t NewAllocaEndOffset, bool IsIntegerPromotable, 2589 VectorType *PromotableVecTy, 2590 SmallSetVector<PHINode *, 8> &PHIUsers, 2591 SmallSetVector<SelectInst *, 8> &SelectUsers) 2592 : DL(DL), AS(AS), Pass(Pass), OldAI(OldAI), NewAI(NewAI), 2593 NewAllocaBeginOffset(NewAllocaBeginOffset), 2594 NewAllocaEndOffset(NewAllocaEndOffset), 2595 NewAllocaTy(NewAI.getAllocatedType()), 2596 IntTy( 2597 IsIntegerPromotable 2598 ? Type::getIntNTy(NewAI.getContext(), 2599 DL.getTypeSizeInBits(NewAI.getAllocatedType()) 2600 .getFixedValue()) 2601 : nullptr), 2602 VecTy(PromotableVecTy), 2603 ElementTy(VecTy ? VecTy->getElementType() : nullptr), 2604 ElementSize(VecTy ? DL.getTypeSizeInBits(ElementTy).getFixedValue() / 8 2605 : 0), 2606 PHIUsers(PHIUsers), SelectUsers(SelectUsers), 2607 IRB(NewAI.getContext(), ConstantFolder()) { 2608 if (VecTy) { 2609 assert((DL.getTypeSizeInBits(ElementTy).getFixedValue() % 8) == 0 && 2610 "Only multiple-of-8 sized vector elements are viable"); 2611 ++NumVectorized; 2612 } 2613 assert((!IntTy && !VecTy) || (IntTy && !VecTy) || (!IntTy && VecTy)); 2614 } 2615 2616 bool visit(AllocaSlices::const_iterator I) { 2617 bool CanSROA = true; 2618 BeginOffset = I->beginOffset(); 2619 EndOffset = I->endOffset(); 2620 IsSplittable = I->isSplittable(); 2621 IsSplit = 2622 BeginOffset < NewAllocaBeginOffset || EndOffset > NewAllocaEndOffset; 2623 LLVM_DEBUG(dbgs() << " rewriting " << (IsSplit ? 
"split " : "")); 2624 LLVM_DEBUG(AS.printSlice(dbgs(), I, "")); 2625 LLVM_DEBUG(dbgs() << "\n"); 2626 2627 // Compute the intersecting offset range. 2628 assert(BeginOffset < NewAllocaEndOffset); 2629 assert(EndOffset > NewAllocaBeginOffset); 2630 NewBeginOffset = std::max(BeginOffset, NewAllocaBeginOffset); 2631 NewEndOffset = std::min(EndOffset, NewAllocaEndOffset); 2632 2633 RelativeOffset = NewBeginOffset - BeginOffset; 2634 SliceSize = NewEndOffset - NewBeginOffset; 2635 LLVM_DEBUG(dbgs() << " Begin:(" << BeginOffset << ", " << EndOffset 2636 << ") NewBegin:(" << NewBeginOffset << ", " 2637 << NewEndOffset << ") NewAllocaBegin:(" 2638 << NewAllocaBeginOffset << ", " << NewAllocaEndOffset 2639 << ")\n"); 2640 assert(IsSplit || RelativeOffset == 0); 2641 OldUse = I->getUse(); 2642 OldPtr = cast<Instruction>(OldUse->get()); 2643 2644 Instruction *OldUserI = cast<Instruction>(OldUse->getUser()); 2645 IRB.SetInsertPoint(OldUserI); 2646 IRB.SetCurrentDebugLocation(OldUserI->getDebugLoc()); 2647 IRB.getInserter().SetNamePrefix( 2648 Twine(NewAI.getName()) + "." + Twine(BeginOffset) + "."); 2649 2650 CanSROA &= visit(cast<Instruction>(OldUse->getUser())); 2651 if (VecTy || IntTy) 2652 assert(CanSROA); 2653 return CanSROA; 2654 } 2655 2656 private: 2657 // Make sure the other visit overloads are visible. 2658 using Base::visit; 2659 2660 // Every instruction which can end up as a user must have a rewrite rule. 2661 bool visitInstruction(Instruction &I) { 2662 LLVM_DEBUG(dbgs() << " !!!! Cannot rewrite: " << I << "\n"); 2663 llvm_unreachable("No rewrite rule for this instruction!"); 2664 } 2665 2666 Value *getNewAllocaSlicePtr(IRBuilderTy &IRB, Type *PointerTy) { 2667 // Note that the offset computation can use BeginOffset or NewBeginOffset 2668 // interchangeably for unsplit slices. 2669 assert(IsSplit || BeginOffset == NewBeginOffset); 2670 uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset; 2671 2672 #ifndef NDEBUG 2673 StringRef OldName = OldPtr->getName(); 2674 // Skip through the last '.sroa.' component of the name. 2675 size_t LastSROAPrefix = OldName.rfind(".sroa."); 2676 if (LastSROAPrefix != StringRef::npos) { 2677 OldName = OldName.substr(LastSROAPrefix + strlen(".sroa.")); 2678 // Look for an SROA slice index. 2679 size_t IndexEnd = OldName.find_first_not_of("0123456789"); 2680 if (IndexEnd != StringRef::npos && OldName[IndexEnd] == '.') { 2681 // Strip the index and look for the offset. 2682 OldName = OldName.substr(IndexEnd + 1); 2683 size_t OffsetEnd = OldName.find_first_not_of("0123456789"); 2684 if (OffsetEnd != StringRef::npos && OldName[OffsetEnd] == '.') 2685 // Strip the offset. 2686 OldName = OldName.substr(OffsetEnd + 1); 2687 } 2688 } 2689 // Strip any SROA suffixes as well. 2690 OldName = OldName.substr(0, OldName.find(".sroa_")); 2691 #endif 2692 2693 return getAdjustedPtr(IRB, DL, &NewAI, 2694 APInt(DL.getIndexTypeSizeInBits(PointerTy), Offset), 2695 PointerTy, 2696 #ifndef NDEBUG 2697 Twine(OldName) + "." 2698 #else 2699 Twine() 2700 #endif 2701 ); 2702 } 2703 2704 /// Compute suitable alignment to access this slice of the *new* 2705 /// alloca. 2706 /// 2707 /// You can optionally pass a type to this routine and if that type's ABI 2708 /// alignment is itself suitable, this will return zero. 
2709 Align getSliceAlign() { 2710 return commonAlignment(NewAI.getAlign(), 2711 NewBeginOffset - NewAllocaBeginOffset); 2712 } 2713 2714 unsigned getIndex(uint64_t Offset) { 2715 assert(VecTy && "Can only call getIndex when rewriting a vector"); 2716 uint64_t RelOffset = Offset - NewAllocaBeginOffset; 2717 assert(RelOffset / ElementSize < UINT32_MAX && "Index out of bounds"); 2718 uint32_t Index = RelOffset / ElementSize; 2719 assert(Index * ElementSize == RelOffset); 2720 return Index; 2721 } 2722 2723 void deleteIfTriviallyDead(Value *V) { 2724 Instruction *I = cast<Instruction>(V); 2725 if (isInstructionTriviallyDead(I)) 2726 Pass.DeadInsts.push_back(I); 2727 } 2728 2729 Value *rewriteVectorizedLoadInst(LoadInst &LI) { 2730 unsigned BeginIndex = getIndex(NewBeginOffset); 2731 unsigned EndIndex = getIndex(NewEndOffset); 2732 assert(EndIndex > BeginIndex && "Empty vector!"); 2733 2734 LoadInst *Load = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI, 2735 NewAI.getAlign(), "load"); 2736 2737 Load->copyMetadata(LI, {LLVMContext::MD_mem_parallel_loop_access, 2738 LLVMContext::MD_access_group}); 2739 return extractVector(IRB, Load, BeginIndex, EndIndex, "vec"); 2740 } 2741 2742 Value *rewriteIntegerLoad(LoadInst &LI) { 2743 assert(IntTy && "We cannot insert an integer to the alloca"); 2744 assert(!LI.isVolatile()); 2745 Value *V = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI, 2746 NewAI.getAlign(), "load"); 2747 V = convertValue(DL, IRB, V, IntTy); 2748 assert(NewBeginOffset >= NewAllocaBeginOffset && "Out of bounds offset"); 2749 uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset; 2750 if (Offset > 0 || NewEndOffset < NewAllocaEndOffset) { 2751 IntegerType *ExtractTy = Type::getIntNTy(LI.getContext(), SliceSize * 8); 2752 V = extractInteger(DL, IRB, V, ExtractTy, Offset, "extract"); 2753 } 2754 // It is possible that the extracted type is not the load type. This 2755 // happens if there is a load past the end of the alloca, and as 2756 // a consequence the slice is narrower but still a candidate for integer 2757 // lowering. To handle this case, we just zero extend the extracted 2758 // integer. 2759 assert(cast<IntegerType>(LI.getType())->getBitWidth() >= SliceSize * 8 && 2760 "Can only handle an extract for an overly wide load"); 2761 if (cast<IntegerType>(LI.getType())->getBitWidth() > SliceSize * 8) 2762 V = IRB.CreateZExt(V, LI.getType()); 2763 return V; 2764 } 2765 2766 bool visitLoadInst(LoadInst &LI) { 2767 LLVM_DEBUG(dbgs() << " original: " << LI << "\n"); 2768 Value *OldOp = LI.getOperand(0); 2769 assert(OldOp == OldPtr); 2770 2771 AAMDNodes AATags = LI.getAAMetadata(); 2772 2773 unsigned AS = LI.getPointerAddressSpace(); 2774 2775 Type *TargetTy = IsSplit ? 
Type::getIntNTy(LI.getContext(), SliceSize * 8) 2776 : LI.getType(); 2777 const bool IsLoadPastEnd = 2778 DL.getTypeStoreSize(TargetTy).getFixedValue() > SliceSize; 2779 bool IsPtrAdjusted = false; 2780 Value *V; 2781 if (VecTy) { 2782 V = rewriteVectorizedLoadInst(LI); 2783 } else if (IntTy && LI.getType()->isIntegerTy()) { 2784 V = rewriteIntegerLoad(LI); 2785 } else if (NewBeginOffset == NewAllocaBeginOffset && 2786 NewEndOffset == NewAllocaEndOffset && 2787 (canConvertValue(DL, NewAllocaTy, TargetTy) || 2788 (IsLoadPastEnd && NewAllocaTy->isIntegerTy() && 2789 TargetTy->isIntegerTy()))) { 2790 Value *NewPtr = 2791 getPtrToNewAI(LI.getPointerAddressSpace(), LI.isVolatile()); 2792 LoadInst *NewLI = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), NewPtr, 2793 NewAI.getAlign(), LI.isVolatile(), 2794 LI.getName()); 2795 if (LI.isVolatile()) 2796 NewLI->setAtomic(LI.getOrdering(), LI.getSyncScopeID()); 2797 if (NewLI->isAtomic()) 2798 NewLI->setAlignment(LI.getAlign()); 2799 2800 // Copy any metadata that is valid for the new load. This may require 2801 // conversion to a different kind of metadata, e.g. !nonnull might change 2802 // to !range or vice versa. 2803 copyMetadataForLoad(*NewLI, LI); 2804 2805 // Do this after copyMetadataForLoad() to preserve the TBAA shift. 2806 if (AATags) 2807 NewLI->setAAMetadata(AATags.shift(NewBeginOffset - BeginOffset)); 2808 2809 // Try to preserve nonnull metadata 2810 V = NewLI; 2811 2812 // If this is an integer load past the end of the slice (which means the 2813 // bytes outside the slice are undef or this load is dead) just forcibly 2814 // fix the integer size with correct handling of endianness. 2815 if (auto *AITy = dyn_cast<IntegerType>(NewAllocaTy)) 2816 if (auto *TITy = dyn_cast<IntegerType>(TargetTy)) 2817 if (AITy->getBitWidth() < TITy->getBitWidth()) { 2818 V = IRB.CreateZExt(V, TITy, "load.ext"); 2819 if (DL.isBigEndian()) 2820 V = IRB.CreateShl(V, TITy->getBitWidth() - AITy->getBitWidth(), 2821 "endian_shift"); 2822 } 2823 } else { 2824 Type *LTy = TargetTy->getPointerTo(AS); 2825 LoadInst *NewLI = 2826 IRB.CreateAlignedLoad(TargetTy, getNewAllocaSlicePtr(IRB, LTy), 2827 getSliceAlign(), LI.isVolatile(), LI.getName()); 2828 if (AATags) 2829 NewLI->setAAMetadata(AATags.shift(NewBeginOffset - BeginOffset)); 2830 if (LI.isVolatile()) 2831 NewLI->setAtomic(LI.getOrdering(), LI.getSyncScopeID()); 2832 NewLI->copyMetadata(LI, {LLVMContext::MD_mem_parallel_loop_access, 2833 LLVMContext::MD_access_group}); 2834 2835 V = NewLI; 2836 IsPtrAdjusted = true; 2837 } 2838 V = convertValue(DL, IRB, V, TargetTy); 2839 2840 if (IsSplit) { 2841 assert(!LI.isVolatile()); 2842 assert(LI.getType()->isIntegerTy() && 2843 "Only integer type loads and stores are split"); 2844 assert(SliceSize < DL.getTypeStoreSize(LI.getType()).getFixedValue() && 2845 "Split load isn't smaller than original load"); 2846 assert(DL.typeSizeEqualsStoreSize(LI.getType()) && 2847 "Non-byte-multiple bit width"); 2848 // Move the insertion point just past the load so that we can refer to it. 2849 IRB.SetInsertPoint(&*std::next(BasicBlock::iterator(&LI))); 2850 // Create a placeholder value with the same type as LI to use as the 2851 // basis for the new value. This allows us to replace the uses of LI with 2852 // the computed value, and then replace the placeholder with LI, leaving 2853 // LI only used for this computation. 
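// Roughly (illustrative): the narrow slice value is zero-extended, shifted
// into position, and merged (mask-and-or) into a detached placeholder of
// LI's type; the placeholder is then swapped for LI itself, so the merged
// value reads its remaining bytes from the original load without LI ever
// using its own replacement.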
2854 Value *Placeholder = new LoadInst( 2855 LI.getType(), PoisonValue::get(LI.getType()->getPointerTo(AS)), "", 2856 false, Align(1)); 2857 V = insertInteger(DL, IRB, Placeholder, V, NewBeginOffset - BeginOffset, 2858 "insert"); 2859 LI.replaceAllUsesWith(V); 2860 Placeholder->replaceAllUsesWith(&LI); 2861 Placeholder->deleteValue(); 2862 } else { 2863 LI.replaceAllUsesWith(V); 2864 } 2865 2866 Pass.DeadInsts.push_back(&LI); 2867 deleteIfTriviallyDead(OldOp); 2868 LLVM_DEBUG(dbgs() << " to: " << *V << "\n"); 2869 return !LI.isVolatile() && !IsPtrAdjusted; 2870 } 2871 2872 bool rewriteVectorizedStoreInst(Value *V, StoreInst &SI, Value *OldOp, 2873 AAMDNodes AATags) { 2874 // Capture V for the purpose of debug-info accounting once it's converted 2875 // to a vector store. 2876 Value *OrigV = V; 2877 if (V->getType() != VecTy) { 2878 unsigned BeginIndex = getIndex(NewBeginOffset); 2879 unsigned EndIndex = getIndex(NewEndOffset); 2880 assert(EndIndex > BeginIndex && "Empty vector!"); 2881 unsigned NumElements = EndIndex - BeginIndex; 2882 assert(NumElements <= cast<FixedVectorType>(VecTy)->getNumElements() && 2883 "Too many elements!"); 2884 Type *SliceTy = (NumElements == 1) 2885 ? ElementTy 2886 : FixedVectorType::get(ElementTy, NumElements); 2887 if (V->getType() != SliceTy) 2888 V = convertValue(DL, IRB, V, SliceTy); 2889 2890 // Mix in the existing elements. 2891 Value *Old = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI, 2892 NewAI.getAlign(), "load"); 2893 V = insertVector(IRB, Old, V, BeginIndex, "vec"); 2894 } 2895 StoreInst *Store = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlign()); 2896 Store->copyMetadata(SI, {LLVMContext::MD_mem_parallel_loop_access, 2897 LLVMContext::MD_access_group}); 2898 if (AATags) 2899 Store->setAAMetadata(AATags.shift(NewBeginOffset - BeginOffset)); 2900 Pass.DeadInsts.push_back(&SI); 2901 2902 // NOTE: Careful to use OrigV rather than V. 
2903 migrateDebugInfo(&OldAI, RelativeOffset * 8, SliceSize * 8, &SI, Store, 2904 Store->getPointerOperand(), OrigV, DL); 2905 LLVM_DEBUG(dbgs() << " to: " << *Store << "\n"); 2906 return true; 2907 } 2908 2909 bool rewriteIntegerStore(Value *V, StoreInst &SI, AAMDNodes AATags) { 2910 assert(IntTy && "We cannot extract an integer from the alloca"); 2911 assert(!SI.isVolatile()); 2912 if (DL.getTypeSizeInBits(V->getType()).getFixedValue() != 2913 IntTy->getBitWidth()) { 2914 Value *Old = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI, 2915 NewAI.getAlign(), "oldload"); 2916 Old = convertValue(DL, IRB, Old, IntTy); 2917 assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset"); 2918 uint64_t Offset = BeginOffset - NewAllocaBeginOffset; 2919 V = insertInteger(DL, IRB, Old, SI.getValueOperand(), Offset, "insert"); 2920 } 2921 V = convertValue(DL, IRB, V, NewAllocaTy); 2922 StoreInst *Store = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlign()); 2923 Store->copyMetadata(SI, {LLVMContext::MD_mem_parallel_loop_access, 2924 LLVMContext::MD_access_group}); 2925 if (AATags) 2926 Store->setAAMetadata(AATags.shift(NewBeginOffset - BeginOffset)); 2927 2928 migrateDebugInfo(&OldAI, RelativeOffset * 8, SliceSize * 8, &SI, Store, 2929 Store->getPointerOperand(), Store->getValueOperand(), DL); 2930 2931 Pass.DeadInsts.push_back(&SI); 2932 LLVM_DEBUG(dbgs() << " to: " << *Store << "\n"); 2933 return true; 2934 } 2935 2936 bool visitStoreInst(StoreInst &SI) { 2937 LLVM_DEBUG(dbgs() << " original: " << SI << "\n"); 2938 Value *OldOp = SI.getOperand(1); 2939 assert(OldOp == OldPtr); 2940 2941 AAMDNodes AATags = SI.getAAMetadata(); 2942 Value *V = SI.getValueOperand(); 2943 2944 // Strip all inbounds GEPs and pointer casts to try to dig out any root 2945 // alloca that should be re-examined after promoting this alloca. 2946 if (V->getType()->isPointerTy()) 2947 if (AllocaInst *AI = dyn_cast<AllocaInst>(V->stripInBoundsOffsets())) 2948 Pass.PostPromotionWorklist.insert(AI); 2949 2950 if (SliceSize < DL.getTypeStoreSize(V->getType()).getFixedValue()) { 2951 assert(!SI.isVolatile()); 2952 assert(V->getType()->isIntegerTy() && 2953 "Only integer type loads and stores are split"); 2954 assert(DL.typeSizeEqualsStoreSize(V->getType()) && 2955 "Non-byte-multiple bit width"); 2956 IntegerType *NarrowTy = Type::getIntNTy(SI.getContext(), SliceSize * 8); 2957 V = extractInteger(DL, IRB, V, NarrowTy, NewBeginOffset - BeginOffset, 2958 "extract"); 2959 } 2960 2961 if (VecTy) 2962 return rewriteVectorizedStoreInst(V, SI, OldOp, AATags); 2963 if (IntTy && V->getType()->isIntegerTy()) 2964 return rewriteIntegerStore(V, SI, AATags); 2965 2966 const bool IsStorePastEnd = 2967 DL.getTypeStoreSize(V->getType()).getFixedValue() > SliceSize; 2968 StoreInst *NewSI; 2969 if (NewBeginOffset == NewAllocaBeginOffset && 2970 NewEndOffset == NewAllocaEndOffset && 2971 (canConvertValue(DL, V->getType(), NewAllocaTy) || 2972 (IsStorePastEnd && NewAllocaTy->isIntegerTy() && 2973 V->getType()->isIntegerTy()))) { 2974 // If this is an integer store past the end of slice (and thus the bytes 2975 // past that point are irrelevant or this is unreachable), truncate the 2976 // value prior to storing. 
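      // For example, storing an i64 over a 4-byte slice keeps only the four
      // bytes that land inside the slice: on a little-endian target those are
      // the low 32 bits, so a plain truncation suffices, while on a big-endian
      // target they are the high 32 bits, so we shift them down first.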
2977 if (auto *VITy = dyn_cast<IntegerType>(V->getType())) 2978 if (auto *AITy = dyn_cast<IntegerType>(NewAllocaTy)) 2979 if (VITy->getBitWidth() > AITy->getBitWidth()) { 2980 if (DL.isBigEndian()) 2981 V = IRB.CreateLShr(V, VITy->getBitWidth() - AITy->getBitWidth(), 2982 "endian_shift"); 2983 V = IRB.CreateTrunc(V, AITy, "load.trunc"); 2984 } 2985 2986 V = convertValue(DL, IRB, V, NewAllocaTy); 2987 Value *NewPtr = 2988 getPtrToNewAI(SI.getPointerAddressSpace(), SI.isVolatile()); 2989 2990 NewSI = 2991 IRB.CreateAlignedStore(V, NewPtr, NewAI.getAlign(), SI.isVolatile()); 2992 } else { 2993 unsigned AS = SI.getPointerAddressSpace(); 2994 Value *NewPtr = getNewAllocaSlicePtr(IRB, V->getType()->getPointerTo(AS)); 2995 NewSI = 2996 IRB.CreateAlignedStore(V, NewPtr, getSliceAlign(), SI.isVolatile()); 2997 } 2998 NewSI->copyMetadata(SI, {LLVMContext::MD_mem_parallel_loop_access, 2999 LLVMContext::MD_access_group}); 3000 if (AATags) 3001 NewSI->setAAMetadata(AATags.shift(NewBeginOffset - BeginOffset)); 3002 if (SI.isVolatile()) 3003 NewSI->setAtomic(SI.getOrdering(), SI.getSyncScopeID()); 3004 if (NewSI->isAtomic()) 3005 NewSI->setAlignment(SI.getAlign()); 3006 3007 migrateDebugInfo(&OldAI, RelativeOffset * 8, SliceSize * 8, &SI, NewSI, 3008 NewSI->getPointerOperand(), NewSI->getValueOperand(), DL); 3009 3010 Pass.DeadInsts.push_back(&SI); 3011 deleteIfTriviallyDead(OldOp); 3012 3013 LLVM_DEBUG(dbgs() << " to: " << *NewSI << "\n"); 3014 return NewSI->getPointerOperand() == &NewAI && 3015 NewSI->getValueOperand()->getType() == NewAllocaTy && 3016 !SI.isVolatile(); 3017 } 3018 3019 /// Compute an integer value from splatting an i8 across the given 3020 /// number of bytes. 3021 /// 3022 /// Note that this routine assumes an i8 is a byte. If that isn't true, don't 3023 /// call this routine. 3024 /// FIXME: Heed the advice above. 3025 /// 3026 /// \param V The i8 value to splat. 3027 /// \param Size The number of bytes in the output (assuming i8 is one byte) 3028 Value *getIntegerSplat(Value *V, unsigned Size) { 3029 assert(Size > 0 && "Expected a positive number of bytes."); 3030 IntegerType *VTy = cast<IntegerType>(V->getType()); 3031 assert(VTy->getBitWidth() == 8 && "Expected an i8 value for the byte"); 3032 if (Size == 1) 3033 return V; 3034 3035 Type *SplatIntTy = Type::getIntNTy(VTy->getContext(), Size * 8); 3036 V = IRB.CreateMul( 3037 IRB.CreateZExt(V, SplatIntTy, "zext"), 3038 IRB.CreateUDiv(Constant::getAllOnesValue(SplatIntTy), 3039 IRB.CreateZExt(Constant::getAllOnesValue(V->getType()), 3040 SplatIntTy)), 3041 "isplat"); 3042 return V; 3043 } 3044 3045 /// Compute a vector splat for a given element value. 3046 Value *getVectorSplat(Value *V, unsigned NumElements) { 3047 V = IRB.CreateVectorSplat(NumElements, V, "vsplat"); 3048 LLVM_DEBUG(dbgs() << " splat: " << *V << "\n"); 3049 return V; 3050 } 3051 3052 bool visitMemSetInst(MemSetInst &II) { 3053 LLVM_DEBUG(dbgs() << " original: " << II << "\n"); 3054 assert(II.getRawDest() == OldPtr); 3055 3056 AAMDNodes AATags = II.getAAMetadata(); 3057 3058 // If the memset has a variable size, it cannot be split, just adjust the 3059 // pointer to the new alloca. 3060 if (!isa<ConstantInt>(II.getLength())) { 3061 assert(!IsSplit); 3062 assert(NewBeginOffset == BeginOffset); 3063 II.setDest(getNewAllocaSlicePtr(IRB, OldPtr->getType())); 3064 II.setDestAlignment(getSliceAlign()); 3065 // In theory we should call migrateDebugInfo here. 
However, we do not 3066 // emit dbg.assign intrinsics for mem intrinsics storing through non- 3067 // constant geps, or storing a variable number of bytes. 3068 assert(at::getAssignmentMarkers(&II).empty() && 3069 "AT: Unexpected link to non-const GEP"); 3070 deleteIfTriviallyDead(OldPtr); 3071 return false; 3072 } 3073 3074 // Record this instruction for deletion. 3075 Pass.DeadInsts.push_back(&II); 3076 3077 Type *AllocaTy = NewAI.getAllocatedType(); 3078 Type *ScalarTy = AllocaTy->getScalarType(); 3079 3080 const bool CanContinue = [&]() { 3081 if (VecTy || IntTy) 3082 return true; 3083 if (BeginOffset > NewAllocaBeginOffset || 3084 EndOffset < NewAllocaEndOffset) 3085 return false; 3086 // Length must be in range for FixedVectorType. 3087 auto *C = cast<ConstantInt>(II.getLength()); 3088 const uint64_t Len = C->getLimitedValue(); 3089 if (Len > std::numeric_limits<unsigned>::max()) 3090 return false; 3091 auto *Int8Ty = IntegerType::getInt8Ty(NewAI.getContext()); 3092 auto *SrcTy = FixedVectorType::get(Int8Ty, Len); 3093 return canConvertValue(DL, SrcTy, AllocaTy) && 3094 DL.isLegalInteger(DL.getTypeSizeInBits(ScalarTy).getFixedValue()); 3095 }(); 3096 3097 // If this doesn't map cleanly onto the alloca type, and that type isn't 3098 // a single value type, just emit a memset. 3099 if (!CanContinue) { 3100 Type *SizeTy = II.getLength()->getType(); 3101 Constant *Size = ConstantInt::get(SizeTy, NewEndOffset - NewBeginOffset); 3102 MemIntrinsic *New = cast<MemIntrinsic>(IRB.CreateMemSet( 3103 getNewAllocaSlicePtr(IRB, OldPtr->getType()), II.getValue(), Size, 3104 MaybeAlign(getSliceAlign()), II.isVolatile())); 3105 if (AATags) 3106 New->setAAMetadata(AATags.shift(NewBeginOffset - BeginOffset)); 3107 3108 migrateDebugInfo(&OldAI, RelativeOffset * 8, SliceSize * 8, &II, New, 3109 New->getRawDest(), nullptr, DL); 3110 3111 LLVM_DEBUG(dbgs() << " to: " << *New << "\n"); 3112 return false; 3113 } 3114 3115 // If we can represent this as a simple value, we have to build the actual 3116 // value to store, which requires expanding the byte present in memset to 3117 // a sensible representation for the alloca type. This is essentially 3118 // splatting the byte to a sufficiently wide integer, splatting it across 3119 // any desired vector width, and bitcasting to the final type. 3120 Value *V; 3121 3122 if (VecTy) { 3123 // If this is a memset of a vectorized alloca, insert it. 3124 assert(ElementTy == ScalarTy); 3125 3126 unsigned BeginIndex = getIndex(NewBeginOffset); 3127 unsigned EndIndex = getIndex(NewEndOffset); 3128 assert(EndIndex > BeginIndex && "Empty vector!"); 3129 unsigned NumElements = EndIndex - BeginIndex; 3130 assert(NumElements <= cast<FixedVectorType>(VecTy)->getNumElements() && 3131 "Too many elements!"); 3132 3133 Value *Splat = getIntegerSplat( 3134 II.getValue(), DL.getTypeSizeInBits(ElementTy).getFixedValue() / 8); 3135 Splat = convertValue(DL, IRB, Splat, ElementTy); 3136 if (NumElements > 1) 3137 Splat = getVectorSplat(Splat, NumElements); 3138 3139 Value *Old = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI, 3140 NewAI.getAlign(), "oldload"); 3141 V = insertVector(IRB, Old, Splat, BeginIndex, "vec"); 3142 } else if (IntTy) { 3143 // If this is a memset on an alloca where we can widen stores, insert the 3144 // set integer. 
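      // For instance, widening a 4-byte memset of the byte 0xAB over an
      // i32-sized region produces the splat value 0xABABABAB; if the slice
      // does not cover the whole alloca-wide integer, that splat is then
      // merged in at the slice's offset below.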
      assert(!II.isVolatile());

      uint64_t Size = NewEndOffset - NewBeginOffset;
      V = getIntegerSplat(II.getValue(), Size);

      if (IntTy && (BeginOffset != NewAllocaBeginOffset ||
                    EndOffset != NewAllocaEndOffset)) {
        Value *Old = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
                                           NewAI.getAlign(), "oldload");
        Old = convertValue(DL, IRB, Old, IntTy);
        uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
        V = insertInteger(DL, IRB, Old, V, Offset, "insert");
      } else {
        assert(V->getType() == IntTy &&
               "Wrong type for an alloca wide integer!");
      }
      V = convertValue(DL, IRB, V, AllocaTy);
    } else {
      // Established these invariants above.
      assert(NewBeginOffset == NewAllocaBeginOffset);
      assert(NewEndOffset == NewAllocaEndOffset);

      V = getIntegerSplat(II.getValue(),
                          DL.getTypeSizeInBits(ScalarTy).getFixedValue() / 8);
      if (VectorType *AllocaVecTy = dyn_cast<VectorType>(AllocaTy))
        V = getVectorSplat(
            V, cast<FixedVectorType>(AllocaVecTy)->getNumElements());

      V = convertValue(DL, IRB, V, AllocaTy);
    }

    Value *NewPtr = getPtrToNewAI(II.getDestAddressSpace(), II.isVolatile());
    StoreInst *New =
        IRB.CreateAlignedStore(V, NewPtr, NewAI.getAlign(), II.isVolatile());
    New->copyMetadata(II, {LLVMContext::MD_mem_parallel_loop_access,
                           LLVMContext::MD_access_group});
    if (AATags)
      New->setAAMetadata(AATags.shift(NewBeginOffset - BeginOffset));

    migrateDebugInfo(&OldAI, RelativeOffset * 8, SliceSize * 8, &II, New,
                     New->getPointerOperand(), V, DL);

    LLVM_DEBUG(dbgs() << " to: " << *New << "\n");
    return !II.isVolatile();
  }

  bool visitMemTransferInst(MemTransferInst &II) {
    // Rewriting of memory transfer instructions can be a bit tricky. We break
    // them into two categories: split intrinsics and unsplit intrinsics.

    LLVM_DEBUG(dbgs() << " original: " << II << "\n");

    AAMDNodes AATags = II.getAAMetadata();

    bool IsDest = &II.getRawDestUse() == OldUse;
    assert((IsDest && II.getRawDest() == OldPtr) ||
           (!IsDest && II.getRawSource() == OldPtr));

    Align SliceAlign = getSliceAlign();
    // For unsplit intrinsics, we simply modify the source and destination
    // pointers in place. This isn't just an optimization, it is a matter of
    // correctness. With unsplit intrinsics we may be dealing with transfers
    // within a single alloca before SROA ran, or with transfers that have
    // a variable length. We may also be dealing with memmove instead of
    // memcpy, and so simply updating the pointers is necessary for us to
    // update both source and dest of a single call.
    if (!IsSplittable) {
      Value *AdjustedPtr = getNewAllocaSlicePtr(IRB, OldPtr->getType());
      if (IsDest) {
        // Update the address component of linked dbg.assigns.
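        // Otherwise the dbg.assign would keep referring to the old pointer,
        // which is about to become dead, and would no longer describe this
        // memory.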
3215 for (auto *DAI : at::getAssignmentMarkers(&II)) { 3216 if (any_of(DAI->location_ops(), 3217 [&](Value *V) { return V == II.getDest(); }) || 3218 DAI->getAddress() == II.getDest()) 3219 DAI->replaceVariableLocationOp(II.getDest(), AdjustedPtr); 3220 } 3221 II.setDest(AdjustedPtr); 3222 II.setDestAlignment(SliceAlign); 3223 } else { 3224 II.setSource(AdjustedPtr); 3225 II.setSourceAlignment(SliceAlign); 3226 } 3227 3228 LLVM_DEBUG(dbgs() << " to: " << II << "\n"); 3229 deleteIfTriviallyDead(OldPtr); 3230 return false; 3231 } 3232 // For split transfer intrinsics we have an incredibly useful assurance: 3233 // the source and destination do not reside within the same alloca, and at 3234 // least one of them does not escape. This means that we can replace 3235 // memmove with memcpy, and we don't need to worry about all manner of 3236 // downsides to splitting and transforming the operations. 3237 3238 // If this doesn't map cleanly onto the alloca type, and that type isn't 3239 // a single value type, just emit a memcpy. 3240 bool EmitMemCpy = 3241 !VecTy && !IntTy && 3242 (BeginOffset > NewAllocaBeginOffset || EndOffset < NewAllocaEndOffset || 3243 SliceSize != 3244 DL.getTypeStoreSize(NewAI.getAllocatedType()).getFixedValue() || 3245 !NewAI.getAllocatedType()->isSingleValueType()); 3246 3247 // If we're just going to emit a memcpy, the alloca hasn't changed, and the 3248 // size hasn't been shrunk based on analysis of the viable range, this is 3249 // a no-op. 3250 if (EmitMemCpy && &OldAI == &NewAI) { 3251 // Ensure the start lines up. 3252 assert(NewBeginOffset == BeginOffset); 3253 3254 // Rewrite the size as needed. 3255 if (NewEndOffset != EndOffset) 3256 II.setLength(ConstantInt::get(II.getLength()->getType(), 3257 NewEndOffset - NewBeginOffset)); 3258 return false; 3259 } 3260 // Record this instruction for deletion. 3261 Pass.DeadInsts.push_back(&II); 3262 3263 // Strip all inbounds GEPs and pointer casts to try to dig out any root 3264 // alloca that should be re-examined after rewriting this instruction. 3265 Value *OtherPtr = IsDest ? II.getRawSource() : II.getRawDest(); 3266 if (AllocaInst *AI = 3267 dyn_cast<AllocaInst>(OtherPtr->stripInBoundsOffsets())) { 3268 assert(AI != &OldAI && AI != &NewAI && 3269 "Splittable transfers cannot reach the same alloca on both ends."); 3270 Pass.Worklist.insert(AI); 3271 } 3272 3273 Type *OtherPtrTy = OtherPtr->getType(); 3274 unsigned OtherAS = OtherPtrTy->getPointerAddressSpace(); 3275 3276 // Compute the relative offset for the other pointer within the transfer. 3277 unsigned OffsetWidth = DL.getIndexSizeInBits(OtherAS); 3278 APInt OtherOffset(OffsetWidth, NewBeginOffset - BeginOffset); 3279 Align OtherAlign = 3280 (IsDest ? II.getSourceAlign() : II.getDestAlign()).valueOrOne(); 3281 OtherAlign = 3282 commonAlignment(OtherAlign, OtherOffset.zextOrTrunc(64).getZExtValue()); 3283 3284 if (EmitMemCpy) { 3285 // Compute the other pointer, folding as much as possible to produce 3286 // a single, simple GEP in most cases. 
3287 OtherPtr = getAdjustedPtr(IRB, DL, OtherPtr, OtherOffset, OtherPtrTy, 3288 OtherPtr->getName() + "."); 3289 3290 Value *OurPtr = getNewAllocaSlicePtr(IRB, OldPtr->getType()); 3291 Type *SizeTy = II.getLength()->getType(); 3292 Constant *Size = ConstantInt::get(SizeTy, NewEndOffset - NewBeginOffset); 3293 3294 Value *DestPtr, *SrcPtr; 3295 MaybeAlign DestAlign, SrcAlign; 3296 // Note: IsDest is true iff we're copying into the new alloca slice 3297 if (IsDest) { 3298 DestPtr = OurPtr; 3299 DestAlign = SliceAlign; 3300 SrcPtr = OtherPtr; 3301 SrcAlign = OtherAlign; 3302 } else { 3303 DestPtr = OtherPtr; 3304 DestAlign = OtherAlign; 3305 SrcPtr = OurPtr; 3306 SrcAlign = SliceAlign; 3307 } 3308 CallInst *New = IRB.CreateMemCpy(DestPtr, DestAlign, SrcPtr, SrcAlign, 3309 Size, II.isVolatile()); 3310 if (AATags) 3311 New->setAAMetadata(AATags.shift(NewBeginOffset - BeginOffset)); 3312 3313 migrateDebugInfo(&OldAI, RelativeOffset * 8, SliceSize * 8, &II, New, 3314 DestPtr, nullptr, DL); 3315 LLVM_DEBUG(dbgs() << " to: " << *New << "\n"); 3316 return false; 3317 } 3318 3319 bool IsWholeAlloca = NewBeginOffset == NewAllocaBeginOffset && 3320 NewEndOffset == NewAllocaEndOffset; 3321 uint64_t Size = NewEndOffset - NewBeginOffset; 3322 unsigned BeginIndex = VecTy ? getIndex(NewBeginOffset) : 0; 3323 unsigned EndIndex = VecTy ? getIndex(NewEndOffset) : 0; 3324 unsigned NumElements = EndIndex - BeginIndex; 3325 IntegerType *SubIntTy = 3326 IntTy ? Type::getIntNTy(IntTy->getContext(), Size * 8) : nullptr; 3327 3328 // Reset the other pointer type to match the register type we're going to 3329 // use, but using the address space of the original other pointer. 3330 Type *OtherTy; 3331 if (VecTy && !IsWholeAlloca) { 3332 if (NumElements == 1) 3333 OtherTy = VecTy->getElementType(); 3334 else 3335 OtherTy = FixedVectorType::get(VecTy->getElementType(), NumElements); 3336 } else if (IntTy && !IsWholeAlloca) { 3337 OtherTy = SubIntTy; 3338 } else { 3339 OtherTy = NewAllocaTy; 3340 } 3341 OtherPtrTy = OtherTy->getPointerTo(OtherAS); 3342 3343 Value *AdjPtr = getAdjustedPtr(IRB, DL, OtherPtr, OtherOffset, OtherPtrTy, 3344 OtherPtr->getName() + "."); 3345 MaybeAlign SrcAlign = OtherAlign; 3346 MaybeAlign DstAlign = SliceAlign; 3347 if (!IsDest) 3348 std::swap(SrcAlign, DstAlign); 3349 3350 Value *SrcPtr; 3351 Value *DstPtr; 3352 3353 if (IsDest) { 3354 DstPtr = getPtrToNewAI(II.getDestAddressSpace(), II.isVolatile()); 3355 SrcPtr = AdjPtr; 3356 } else { 3357 DstPtr = AdjPtr; 3358 SrcPtr = getPtrToNewAI(II.getSourceAddressSpace(), II.isVolatile()); 3359 } 3360 3361 Value *Src; 3362 if (VecTy && !IsWholeAlloca && !IsDest) { 3363 Src = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI, 3364 NewAI.getAlign(), "load"); 3365 Src = extractVector(IRB, Src, BeginIndex, EndIndex, "vec"); 3366 } else if (IntTy && !IsWholeAlloca && !IsDest) { 3367 Src = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI, 3368 NewAI.getAlign(), "load"); 3369 Src = convertValue(DL, IRB, Src, IntTy); 3370 uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset; 3371 Src = extractInteger(DL, IRB, Src, SubIntTy, Offset, "extract"); 3372 } else { 3373 LoadInst *Load = IRB.CreateAlignedLoad(OtherTy, SrcPtr, SrcAlign, 3374 II.isVolatile(), "copyload"); 3375 Load->copyMetadata(II, {LLVMContext::MD_mem_parallel_loop_access, 3376 LLVMContext::MD_access_group}); 3377 if (AATags) 3378 Load->setAAMetadata(AATags.shift(NewBeginOffset - BeginOffset)); 3379 Src = Load; 3380 } 3381 3382 if (VecTy && !IsWholeAlloca && IsDest) { 3383 Value *Old = 
IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI, 3384 NewAI.getAlign(), "oldload"); 3385 Src = insertVector(IRB, Old, Src, BeginIndex, "vec"); 3386 } else if (IntTy && !IsWholeAlloca && IsDest) { 3387 Value *Old = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI, 3388 NewAI.getAlign(), "oldload"); 3389 Old = convertValue(DL, IRB, Old, IntTy); 3390 uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset; 3391 Src = insertInteger(DL, IRB, Old, Src, Offset, "insert"); 3392 Src = convertValue(DL, IRB, Src, NewAllocaTy); 3393 } 3394 3395 StoreInst *Store = cast<StoreInst>( 3396 IRB.CreateAlignedStore(Src, DstPtr, DstAlign, II.isVolatile())); 3397 Store->copyMetadata(II, {LLVMContext::MD_mem_parallel_loop_access, 3398 LLVMContext::MD_access_group}); 3399 if (AATags) 3400 Store->setAAMetadata(AATags.shift(NewBeginOffset - BeginOffset)); 3401 3402 migrateDebugInfo(&OldAI, RelativeOffset * 8, SliceSize * 8, &II, Store, 3403 DstPtr, Src, DL); 3404 LLVM_DEBUG(dbgs() << " to: " << *Store << "\n"); 3405 return !II.isVolatile(); 3406 } 3407 3408 bool visitIntrinsicInst(IntrinsicInst &II) { 3409 assert((II.isLifetimeStartOrEnd() || II.isDroppable()) && 3410 "Unexpected intrinsic!"); 3411 LLVM_DEBUG(dbgs() << " original: " << II << "\n"); 3412 3413 // Record this instruction for deletion. 3414 Pass.DeadInsts.push_back(&II); 3415 3416 if (II.isDroppable()) { 3417 assert(II.getIntrinsicID() == Intrinsic::assume && "Expected assume"); 3418 // TODO For now we forget assumed information, this can be improved. 3419 OldPtr->dropDroppableUsesIn(II); 3420 return true; 3421 } 3422 3423 assert(II.getArgOperand(1) == OldPtr); 3424 // Lifetime intrinsics are only promotable if they cover the whole alloca. 3425 // Therefore, we drop lifetime intrinsics which don't cover the whole 3426 // alloca. 3427 // (In theory, intrinsics which partially cover an alloca could be 3428 // promoted, but PromoteMemToReg doesn't handle that case.) 3429 // FIXME: Check whether the alloca is promotable before dropping the 3430 // lifetime intrinsics? 3431 if (NewBeginOffset != NewAllocaBeginOffset || 3432 NewEndOffset != NewAllocaEndOffset) 3433 return true; 3434 3435 ConstantInt *Size = 3436 ConstantInt::get(cast<IntegerType>(II.getArgOperand(0)->getType()), 3437 NewEndOffset - NewBeginOffset); 3438 // Lifetime intrinsics always expect an i8* so directly get such a pointer 3439 // for the new alloca slice. 3440 Type *PointerTy = IRB.getInt8PtrTy(OldPtr->getType()->getPointerAddressSpace()); 3441 Value *Ptr = getNewAllocaSlicePtr(IRB, PointerTy); 3442 Value *New; 3443 if (II.getIntrinsicID() == Intrinsic::lifetime_start) 3444 New = IRB.CreateLifetimeStart(Ptr, Size); 3445 else 3446 New = IRB.CreateLifetimeEnd(Ptr, Size); 3447 3448 (void)New; 3449 LLVM_DEBUG(dbgs() << " to: " << *New << "\n"); 3450 3451 return true; 3452 } 3453 3454 void fixLoadStoreAlign(Instruction &Root) { 3455 // This algorithm implements the same visitor loop as 3456 // hasUnsafePHIOrSelectUse, and fixes the alignment of each load 3457 // or store found. 
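    // The new slice may be less aligned than the original alloca, so the
    // loads and stores reached through it must not claim more alignment than
    // the slice actually provides; hence the std::min clamping below.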
3458 SmallPtrSet<Instruction *, 4> Visited; 3459 SmallVector<Instruction *, 4> Uses; 3460 Visited.insert(&Root); 3461 Uses.push_back(&Root); 3462 do { 3463 Instruction *I = Uses.pop_back_val(); 3464 3465 if (LoadInst *LI = dyn_cast<LoadInst>(I)) { 3466 LI->setAlignment(std::min(LI->getAlign(), getSliceAlign())); 3467 continue; 3468 } 3469 if (StoreInst *SI = dyn_cast<StoreInst>(I)) { 3470 SI->setAlignment(std::min(SI->getAlign(), getSliceAlign())); 3471 continue; 3472 } 3473 3474 assert(isa<BitCastInst>(I) || isa<AddrSpaceCastInst>(I) || 3475 isa<PHINode>(I) || isa<SelectInst>(I) || 3476 isa<GetElementPtrInst>(I)); 3477 for (User *U : I->users()) 3478 if (Visited.insert(cast<Instruction>(U)).second) 3479 Uses.push_back(cast<Instruction>(U)); 3480 } while (!Uses.empty()); 3481 } 3482 3483 bool visitPHINode(PHINode &PN) { 3484 LLVM_DEBUG(dbgs() << " original: " << PN << "\n"); 3485 assert(BeginOffset >= NewAllocaBeginOffset && "PHIs are unsplittable"); 3486 assert(EndOffset <= NewAllocaEndOffset && "PHIs are unsplittable"); 3487 3488 // We would like to compute a new pointer in only one place, but have it be 3489 // as local as possible to the PHI. To do that, we re-use the location of 3490 // the old pointer, which necessarily must be in the right position to 3491 // dominate the PHI. 3492 IRBuilderBase::InsertPointGuard Guard(IRB); 3493 if (isa<PHINode>(OldPtr)) 3494 IRB.SetInsertPoint(&*OldPtr->getParent()->getFirstInsertionPt()); 3495 else 3496 IRB.SetInsertPoint(OldPtr); 3497 IRB.SetCurrentDebugLocation(OldPtr->getDebugLoc()); 3498 3499 Value *NewPtr = getNewAllocaSlicePtr(IRB, OldPtr->getType()); 3500 // Replace the operands which were using the old pointer. 3501 std::replace(PN.op_begin(), PN.op_end(), cast<Value>(OldPtr), NewPtr); 3502 3503 LLVM_DEBUG(dbgs() << " to: " << PN << "\n"); 3504 deleteIfTriviallyDead(OldPtr); 3505 3506 // Fix the alignment of any loads or stores using this PHI node. 3507 fixLoadStoreAlign(PN); 3508 3509 // PHIs can't be promoted on their own, but often can be speculated. We 3510 // check the speculation outside of the rewriter so that we see the 3511 // fully-rewritten alloca. 3512 PHIUsers.insert(&PN); 3513 return true; 3514 } 3515 3516 bool visitSelectInst(SelectInst &SI) { 3517 LLVM_DEBUG(dbgs() << " original: " << SI << "\n"); 3518 assert((SI.getTrueValue() == OldPtr || SI.getFalseValue() == OldPtr) && 3519 "Pointer isn't an operand!"); 3520 assert(BeginOffset >= NewAllocaBeginOffset && "Selects are unsplittable"); 3521 assert(EndOffset <= NewAllocaEndOffset && "Selects are unsplittable"); 3522 3523 Value *NewPtr = getNewAllocaSlicePtr(IRB, OldPtr->getType()); 3524 // Replace the operands which were using the old pointer. 3525 if (SI.getOperand(1) == OldPtr) 3526 SI.setOperand(1, NewPtr); 3527 if (SI.getOperand(2) == OldPtr) 3528 SI.setOperand(2, NewPtr); 3529 3530 LLVM_DEBUG(dbgs() << " to: " << SI << "\n"); 3531 deleteIfTriviallyDead(OldPtr); 3532 3533 // Fix the alignment of any loads or stores using this select. 3534 fixLoadStoreAlign(SI); 3535 3536 // Selects can't be promoted on their own, but often can be speculated. We 3537 // check the speculation outside of the rewriter so that we see the 3538 // fully-rewritten alloca. 3539 SelectUsers.insert(&SI); 3540 return true; 3541 } 3542 }; 3543 3544 namespace { 3545 3546 /// Visitor to rewrite aggregate loads and stores as scalar. 
3547 /// 3548 /// This pass aggressively rewrites all aggregate loads and stores on 3549 /// a particular pointer (or any pointer derived from it which we can identify) 3550 /// with scalar loads and stores. 3551 class AggLoadStoreRewriter : public InstVisitor<AggLoadStoreRewriter, bool> { 3552 // Befriend the base class so it can delegate to private visit methods. 3553 friend class InstVisitor<AggLoadStoreRewriter, bool>; 3554 3555 /// Queue of pointer uses to analyze and potentially rewrite. 3556 SmallVector<Use *, 8> Queue; 3557 3558 /// Set to prevent us from cycling with phi nodes and loops. 3559 SmallPtrSet<User *, 8> Visited; 3560 3561 /// The current pointer use being rewritten. This is used to dig up the used 3562 /// value (as opposed to the user). 3563 Use *U = nullptr; 3564 3565 /// Used to calculate offsets, and hence alignment, of subobjects. 3566 const DataLayout &DL; 3567 3568 IRBuilderTy &IRB; 3569 3570 public: 3571 AggLoadStoreRewriter(const DataLayout &DL, IRBuilderTy &IRB) 3572 : DL(DL), IRB(IRB) {} 3573 3574 /// Rewrite loads and stores through a pointer and all pointers derived from 3575 /// it. 3576 bool rewrite(Instruction &I) { 3577 LLVM_DEBUG(dbgs() << " Rewriting FCA loads and stores...\n"); 3578 enqueueUsers(I); 3579 bool Changed = false; 3580 while (!Queue.empty()) { 3581 U = Queue.pop_back_val(); 3582 Changed |= visit(cast<Instruction>(U->getUser())); 3583 } 3584 return Changed; 3585 } 3586 3587 private: 3588 /// Enqueue all the users of the given instruction for further processing. 3589 /// This uses a set to de-duplicate users. 3590 void enqueueUsers(Instruction &I) { 3591 for (Use &U : I.uses()) 3592 if (Visited.insert(U.getUser()).second) 3593 Queue.push_back(&U); 3594 } 3595 3596 // Conservative default is to not rewrite anything. 3597 bool visitInstruction(Instruction &I) { return false; } 3598 3599 /// Generic recursive split emission class. 3600 template <typename Derived> class OpSplitter { 3601 protected: 3602 /// The builder used to form new instructions. 3603 IRBuilderTy &IRB; 3604 3605 /// The indices which to be used with insert- or extractvalue to select the 3606 /// appropriate value within the aggregate. 3607 SmallVector<unsigned, 4> Indices; 3608 3609 /// The indices to a GEP instruction which will move Ptr to the correct slot 3610 /// within the aggregate. 3611 SmallVector<Value *, 4> GEPIndices; 3612 3613 /// The base pointer of the original op, used as a base for GEPing the 3614 /// split operations. 3615 Value *Ptr; 3616 3617 /// The base pointee type being GEPed into. 3618 Type *BaseTy; 3619 3620 /// Known alignment of the base pointer. 3621 Align BaseAlign; 3622 3623 /// To calculate offset of each component so we can correctly deduce 3624 /// alignments. 3625 const DataLayout &DL; 3626 3627 /// Initialize the splitter with an insertion point, Ptr and start with a 3628 /// single zero GEP index. 3629 OpSplitter(Instruction *InsertionPoint, Value *Ptr, Type *BaseTy, 3630 Align BaseAlign, const DataLayout &DL, IRBuilderTy &IRB) 3631 : IRB(IRB), GEPIndices(1, IRB.getInt32(0)), Ptr(Ptr), BaseTy(BaseTy), 3632 BaseAlign(BaseAlign), DL(DL) { 3633 IRB.SetInsertPoint(InsertionPoint); 3634 } 3635 3636 public: 3637 /// Generic recursive split emission routine. 3638 /// 3639 /// This method recursively splits an aggregate op (load or store) into 3640 /// scalar or vector ops. It splits recursively until it hits a single value 3641 /// and emits that single value operation via the template argument. 
3642 /// 3643 /// The logic of this routine relies on GEPs and insertvalue and 3644 /// extractvalue all operating with the same fundamental index list, merely 3645 /// formatted differently (GEPs need actual values). 3646 /// 3647 /// \param Ty The type being split recursively into smaller ops. 3648 /// \param Agg The aggregate value being built up or stored, depending on 3649 /// whether this is splitting a load or a store respectively. 3650 void emitSplitOps(Type *Ty, Value *&Agg, const Twine &Name) { 3651 if (Ty->isSingleValueType()) { 3652 unsigned Offset = DL.getIndexedOffsetInType(BaseTy, GEPIndices); 3653 return static_cast<Derived *>(this)->emitFunc( 3654 Ty, Agg, commonAlignment(BaseAlign, Offset), Name); 3655 } 3656 3657 if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) { 3658 unsigned OldSize = Indices.size(); 3659 (void)OldSize; 3660 for (unsigned Idx = 0, Size = ATy->getNumElements(); Idx != Size; 3661 ++Idx) { 3662 assert(Indices.size() == OldSize && "Did not return to the old size"); 3663 Indices.push_back(Idx); 3664 GEPIndices.push_back(IRB.getInt32(Idx)); 3665 emitSplitOps(ATy->getElementType(), Agg, Name + "." + Twine(Idx)); 3666 GEPIndices.pop_back(); 3667 Indices.pop_back(); 3668 } 3669 return; 3670 } 3671 3672 if (StructType *STy = dyn_cast<StructType>(Ty)) { 3673 unsigned OldSize = Indices.size(); 3674 (void)OldSize; 3675 for (unsigned Idx = 0, Size = STy->getNumElements(); Idx != Size; 3676 ++Idx) { 3677 assert(Indices.size() == OldSize && "Did not return to the old size"); 3678 Indices.push_back(Idx); 3679 GEPIndices.push_back(IRB.getInt32(Idx)); 3680 emitSplitOps(STy->getElementType(Idx), Agg, Name + "." + Twine(Idx)); 3681 GEPIndices.pop_back(); 3682 Indices.pop_back(); 3683 } 3684 return; 3685 } 3686 3687 llvm_unreachable("Only arrays and structs are aggregate loadable types"); 3688 } 3689 }; 3690 3691 struct LoadOpSplitter : public OpSplitter<LoadOpSplitter> { 3692 AAMDNodes AATags; 3693 3694 LoadOpSplitter(Instruction *InsertionPoint, Value *Ptr, Type *BaseTy, 3695 AAMDNodes AATags, Align BaseAlign, const DataLayout &DL, 3696 IRBuilderTy &IRB) 3697 : OpSplitter<LoadOpSplitter>(InsertionPoint, Ptr, BaseTy, BaseAlign, DL, 3698 IRB), 3699 AATags(AATags) {} 3700 3701 /// Emit a leaf load of a single value. This is called at the leaves of the 3702 /// recursive emission to actually load values. 3703 void emitFunc(Type *Ty, Value *&Agg, Align Alignment, const Twine &Name) { 3704 assert(Ty->isSingleValueType()); 3705 // Load the single value and insert it using the indices. 3706 Value *GEP = 3707 IRB.CreateInBoundsGEP(BaseTy, Ptr, GEPIndices, Name + ".gep"); 3708 LoadInst *Load = 3709 IRB.CreateAlignedLoad(Ty, GEP, Alignment, Name + ".load"); 3710 3711 APInt Offset( 3712 DL.getIndexSizeInBits(Ptr->getType()->getPointerAddressSpace()), 0); 3713 if (AATags && 3714 GEPOperator::accumulateConstantOffset(BaseTy, GEPIndices, DL, Offset)) 3715 Load->setAAMetadata(AATags.shift(Offset.getZExtValue())); 3716 3717 Agg = IRB.CreateInsertValue(Agg, Load, Indices, Name + ".insert"); 3718 LLVM_DEBUG(dbgs() << " to: " << *Load << "\n"); 3719 } 3720 }; 3721 3722 bool visitLoadInst(LoadInst &LI) { 3723 assert(LI.getPointerOperand() == *U); 3724 if (!LI.isSimple() || LI.getType()->isSingleValueType()) 3725 return false; 3726 3727 // We have an aggregate being loaded, split it apart. 
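    // For example, assuming %v has type { i32, float } and is loaded from a
    // pointer %p, the load is rewritten roughly as:
    //   %v.fca.0.gep = getelementptr inbounds { i32, float }, ptr %p, i32 0, i32 0
    //   %v.fca.0.load = load i32, ptr %v.fca.0.gep
    //   %v.fca.0.insert = insertvalue { i32, float } poison, i32 %v.fca.0.load, 0
    // and likewise for element 1, after which %v is replaced by the rebuilt
    // aggregate.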
3728 LLVM_DEBUG(dbgs() << " original: " << LI << "\n"); 3729 LoadOpSplitter Splitter(&LI, *U, LI.getType(), LI.getAAMetadata(), 3730 getAdjustedAlignment(&LI, 0), DL, IRB); 3731 Value *V = PoisonValue::get(LI.getType()); 3732 Splitter.emitSplitOps(LI.getType(), V, LI.getName() + ".fca"); 3733 Visited.erase(&LI); 3734 LI.replaceAllUsesWith(V); 3735 LI.eraseFromParent(); 3736 return true; 3737 } 3738 3739 struct StoreOpSplitter : public OpSplitter<StoreOpSplitter> { 3740 StoreOpSplitter(Instruction *InsertionPoint, Value *Ptr, Type *BaseTy, 3741 AAMDNodes AATags, StoreInst *AggStore, Align BaseAlign, 3742 const DataLayout &DL, IRBuilderTy &IRB) 3743 : OpSplitter<StoreOpSplitter>(InsertionPoint, Ptr, BaseTy, BaseAlign, 3744 DL, IRB), 3745 AATags(AATags), AggStore(AggStore) {} 3746 AAMDNodes AATags; 3747 StoreInst *AggStore; 3748 /// Emit a leaf store of a single value. This is called at the leaves of the 3749 /// recursive emission to actually produce stores. 3750 void emitFunc(Type *Ty, Value *&Agg, Align Alignment, const Twine &Name) { 3751 assert(Ty->isSingleValueType()); 3752 // Extract the single value and store it using the indices. 3753 // 3754 // The gep and extractvalue values are factored out of the CreateStore 3755 // call to make the output independent of the argument evaluation order. 3756 Value *ExtractValue = 3757 IRB.CreateExtractValue(Agg, Indices, Name + ".extract"); 3758 Value *InBoundsGEP = 3759 IRB.CreateInBoundsGEP(BaseTy, Ptr, GEPIndices, Name + ".gep"); 3760 StoreInst *Store = 3761 IRB.CreateAlignedStore(ExtractValue, InBoundsGEP, Alignment); 3762 3763 APInt Offset( 3764 DL.getIndexSizeInBits(Ptr->getType()->getPointerAddressSpace()), 0); 3765 if (AATags && 3766 GEPOperator::accumulateConstantOffset(BaseTy, GEPIndices, DL, Offset)) 3767 Store->setAAMetadata(AATags.shift(Offset.getZExtValue())); 3768 3769 // migrateDebugInfo requires the base Alloca. Walk to it from this gep. 3770 // If we cannot (because there's an intervening non-const or unbounded 3771 // gep) then we wouldn't expect to see dbg.assign intrinsics linked to 3772 // this instruction. 3773 APInt OffsetInBytes(DL.getTypeSizeInBits(Ptr->getType()), false); 3774 Value *Base = InBoundsGEP->stripAndAccumulateInBoundsConstantOffsets( 3775 DL, OffsetInBytes); 3776 if (auto *OldAI = dyn_cast<AllocaInst>(Base)) { 3777 uint64_t SizeInBits = 3778 DL.getTypeSizeInBits(Store->getValueOperand()->getType()); 3779 migrateDebugInfo(OldAI, OffsetInBytes.getZExtValue() * 8, SizeInBits, 3780 AggStore, Store, Store->getPointerOperand(), 3781 Store->getValueOperand(), DL); 3782 } else { 3783 assert(at::getAssignmentMarkers(Store).empty() && 3784 "AT: unexpected debug.assign linked to store through " 3785 "unbounded GEP"); 3786 } 3787 LLVM_DEBUG(dbgs() << " to: " << *Store << "\n"); 3788 } 3789 }; 3790 3791 bool visitStoreInst(StoreInst &SI) { 3792 if (!SI.isSimple() || SI.getPointerOperand() != *U) 3793 return false; 3794 Value *V = SI.getValueOperand(); 3795 if (V->getType()->isSingleValueType()) 3796 return false; 3797 3798 // We have an aggregate being stored, split it apart. 
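    // For example, assuming a stored value %v of type { i32, float } and a
    // destination pointer %p, each leaf is extracted and stored separately,
    // roughly:
    //   %v.fca.0.extract = extractvalue { i32, float } %v, 0
    //   %v.fca.0.gep = getelementptr inbounds { i32, float }, ptr %p, i32 0, i32 0
    //   store i32 %v.fca.0.extract, ptr %v.fca.0.gep
    // and likewise for element 1.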
3799 LLVM_DEBUG(dbgs() << " original: " << SI << "\n"); 3800 StoreOpSplitter Splitter(&SI, *U, V->getType(), SI.getAAMetadata(), &SI, 3801 getAdjustedAlignment(&SI, 0), DL, IRB); 3802 Splitter.emitSplitOps(V->getType(), V, V->getName() + ".fca"); 3803 Visited.erase(&SI); 3804 SI.eraseFromParent(); 3805 return true; 3806 } 3807 3808 bool visitBitCastInst(BitCastInst &BC) { 3809 enqueueUsers(BC); 3810 return false; 3811 } 3812 3813 bool visitAddrSpaceCastInst(AddrSpaceCastInst &ASC) { 3814 enqueueUsers(ASC); 3815 return false; 3816 } 3817 3818 // Fold gep (select cond, ptr1, ptr2) => select cond, gep(ptr1), gep(ptr2) 3819 bool foldGEPSelect(GetElementPtrInst &GEPI) { 3820 if (!GEPI.hasAllConstantIndices()) 3821 return false; 3822 3823 SelectInst *Sel = cast<SelectInst>(GEPI.getPointerOperand()); 3824 3825 LLVM_DEBUG(dbgs() << " Rewriting gep(select) -> select(gep):" 3826 << "\n original: " << *Sel 3827 << "\n " << GEPI); 3828 3829 IRB.SetInsertPoint(&GEPI); 3830 SmallVector<Value *, 4> Index(GEPI.indices()); 3831 bool IsInBounds = GEPI.isInBounds(); 3832 3833 Type *Ty = GEPI.getSourceElementType(); 3834 Value *True = Sel->getTrueValue(); 3835 Value *NTrue = IRB.CreateGEP(Ty, True, Index, True->getName() + ".sroa.gep", 3836 IsInBounds); 3837 3838 Value *False = Sel->getFalseValue(); 3839 3840 Value *NFalse = IRB.CreateGEP(Ty, False, Index, 3841 False->getName() + ".sroa.gep", IsInBounds); 3842 3843 Value *NSel = IRB.CreateSelect(Sel->getCondition(), NTrue, NFalse, 3844 Sel->getName() + ".sroa.sel"); 3845 Visited.erase(&GEPI); 3846 GEPI.replaceAllUsesWith(NSel); 3847 GEPI.eraseFromParent(); 3848 Instruction *NSelI = cast<Instruction>(NSel); 3849 Visited.insert(NSelI); 3850 enqueueUsers(*NSelI); 3851 3852 LLVM_DEBUG(dbgs() << "\n to: " << *NTrue 3853 << "\n " << *NFalse 3854 << "\n " << *NSel << '\n'); 3855 3856 return true; 3857 } 3858 3859 // Fold gep (phi ptr1, ptr2) => phi gep(ptr1), gep(ptr2) 3860 bool foldGEPPhi(GetElementPtrInst &GEPI) { 3861 if (!GEPI.hasAllConstantIndices()) 3862 return false; 3863 3864 PHINode *PHI = cast<PHINode>(GEPI.getPointerOperand()); 3865 if (GEPI.getParent() != PHI->getParent() || 3866 llvm::any_of(PHI->incoming_values(), [](Value *In) 3867 { Instruction *I = dyn_cast<Instruction>(In); 3868 return !I || isa<GetElementPtrInst>(I) || isa<PHINode>(I) || 3869 succ_empty(I->getParent()) || 3870 !I->getParent()->isLegalToHoistInto(); 3871 })) 3872 return false; 3873 3874 LLVM_DEBUG(dbgs() << " Rewriting gep(phi) -> phi(gep):" 3875 << "\n original: " << *PHI 3876 << "\n " << GEPI 3877 << "\n to: "); 3878 3879 SmallVector<Value *, 4> Index(GEPI.indices()); 3880 bool IsInBounds = GEPI.isInBounds(); 3881 IRB.SetInsertPoint(GEPI.getParent()->getFirstNonPHI()); 3882 PHINode *NewPN = IRB.CreatePHI(GEPI.getType(), PHI->getNumIncomingValues(), 3883 PHI->getName() + ".sroa.phi"); 3884 for (unsigned I = 0, E = PHI->getNumIncomingValues(); I != E; ++I) { 3885 BasicBlock *B = PHI->getIncomingBlock(I); 3886 Value *NewVal = nullptr; 3887 int Idx = NewPN->getBasicBlockIndex(B); 3888 if (Idx >= 0) { 3889 NewVal = NewPN->getIncomingValue(Idx); 3890 } else { 3891 Instruction *In = cast<Instruction>(PHI->getIncomingValue(I)); 3892 3893 IRB.SetInsertPoint(In->getParent(), std::next(In->getIterator())); 3894 Type *Ty = GEPI.getSourceElementType(); 3895 NewVal = IRB.CreateGEP(Ty, In, Index, In->getName() + ".sroa.gep", 3896 IsInBounds); 3897 } 3898 NewPN->addIncoming(NewVal, B); 3899 } 3900 3901 Visited.erase(&GEPI); 3902 GEPI.replaceAllUsesWith(NewPN); 3903 GEPI.eraseFromParent(); 3904 
Visited.insert(NewPN); 3905 enqueueUsers(*NewPN); 3906 3907 LLVM_DEBUG(for (Value *In : NewPN->incoming_values()) 3908 dbgs() << "\n " << *In; 3909 dbgs() << "\n " << *NewPN << '\n'); 3910 3911 return true; 3912 } 3913 3914 bool visitGetElementPtrInst(GetElementPtrInst &GEPI) { 3915 if (isa<SelectInst>(GEPI.getPointerOperand()) && 3916 foldGEPSelect(GEPI)) 3917 return true; 3918 3919 if (isa<PHINode>(GEPI.getPointerOperand()) && 3920 foldGEPPhi(GEPI)) 3921 return true; 3922 3923 enqueueUsers(GEPI); 3924 return false; 3925 } 3926 3927 bool visitPHINode(PHINode &PN) { 3928 enqueueUsers(PN); 3929 return false; 3930 } 3931 3932 bool visitSelectInst(SelectInst &SI) { 3933 enqueueUsers(SI); 3934 return false; 3935 } 3936 }; 3937 3938 } // end anonymous namespace 3939 3940 /// Strip aggregate type wrapping. 3941 /// 3942 /// This removes no-op aggregate types wrapping an underlying type. It will 3943 /// strip as many layers of types as it can without changing either the type 3944 /// size or the allocated size. 3945 static Type *stripAggregateTypeWrapping(const DataLayout &DL, Type *Ty) { 3946 if (Ty->isSingleValueType()) 3947 return Ty; 3948 3949 uint64_t AllocSize = DL.getTypeAllocSize(Ty).getFixedValue(); 3950 uint64_t TypeSize = DL.getTypeSizeInBits(Ty).getFixedValue(); 3951 3952 Type *InnerTy; 3953 if (ArrayType *ArrTy = dyn_cast<ArrayType>(Ty)) { 3954 InnerTy = ArrTy->getElementType(); 3955 } else if (StructType *STy = dyn_cast<StructType>(Ty)) { 3956 const StructLayout *SL = DL.getStructLayout(STy); 3957 unsigned Index = SL->getElementContainingOffset(0); 3958 InnerTy = STy->getElementType(Index); 3959 } else { 3960 return Ty; 3961 } 3962 3963 if (AllocSize > DL.getTypeAllocSize(InnerTy).getFixedValue() || 3964 TypeSize > DL.getTypeSizeInBits(InnerTy).getFixedValue()) 3965 return Ty; 3966 3967 return stripAggregateTypeWrapping(DL, InnerTy); 3968 } 3969 3970 /// Try to find a partition of the aggregate type passed in for a given 3971 /// offset and size. 3972 /// 3973 /// This recurses through the aggregate type and tries to compute a subtype 3974 /// based on the offset and size. When the offset and size span a sub-section 3975 /// of an array, it will even compute a new array type for that sub-section, 3976 /// and the same for structs. 3977 /// 3978 /// Note that this routine is very strict and tries to find a partition of the 3979 /// type which produces the *exact* right offset and size. It is not forgiving 3980 /// when the size or offset cause either end of type-based partition to be off. 3981 /// Also, this is a best-effort routine. It is reasonable to give up and not 3982 /// return a type if necessary. 3983 static Type *getTypePartition(const DataLayout &DL, Type *Ty, uint64_t Offset, 3984 uint64_t Size) { 3985 if (Offset == 0 && DL.getTypeAllocSize(Ty).getFixedValue() == Size) 3986 return stripAggregateTypeWrapping(DL, Ty); 3987 if (Offset > DL.getTypeAllocSize(Ty).getFixedValue() || 3988 (DL.getTypeAllocSize(Ty).getFixedValue() - Offset) < Size) 3989 return nullptr; 3990 3991 if (isa<ArrayType>(Ty) || isa<VectorType>(Ty)) { 3992 Type *ElementTy; 3993 uint64_t TyNumElements; 3994 if (auto *AT = dyn_cast<ArrayType>(Ty)) { 3995 ElementTy = AT->getElementType(); 3996 TyNumElements = AT->getNumElements(); 3997 } else { 3998 // FIXME: This isn't right for vectors with non-byte-sized or 3999 // non-power-of-two sized elements. 
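      // For instance, a <4 x float> is handled like a [4 x float] here: with
      // 4-byte elements, an access at Offset 8 of Size 4 skips two elements
      // and resolves to the element type float.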
4000 auto *VT = cast<FixedVectorType>(Ty); 4001 ElementTy = VT->getElementType(); 4002 TyNumElements = VT->getNumElements(); 4003 } 4004 uint64_t ElementSize = DL.getTypeAllocSize(ElementTy).getFixedValue(); 4005 uint64_t NumSkippedElements = Offset / ElementSize; 4006 if (NumSkippedElements >= TyNumElements) 4007 return nullptr; 4008 Offset -= NumSkippedElements * ElementSize; 4009 4010 // First check if we need to recurse. 4011 if (Offset > 0 || Size < ElementSize) { 4012 // Bail if the partition ends in a different array element. 4013 if ((Offset + Size) > ElementSize) 4014 return nullptr; 4015 // Recurse through the element type trying to peel off offset bytes. 4016 return getTypePartition(DL, ElementTy, Offset, Size); 4017 } 4018 assert(Offset == 0); 4019 4020 if (Size == ElementSize) 4021 return stripAggregateTypeWrapping(DL, ElementTy); 4022 assert(Size > ElementSize); 4023 uint64_t NumElements = Size / ElementSize; 4024 if (NumElements * ElementSize != Size) 4025 return nullptr; 4026 return ArrayType::get(ElementTy, NumElements); 4027 } 4028 4029 StructType *STy = dyn_cast<StructType>(Ty); 4030 if (!STy) 4031 return nullptr; 4032 4033 const StructLayout *SL = DL.getStructLayout(STy); 4034 if (Offset >= SL->getSizeInBytes()) 4035 return nullptr; 4036 uint64_t EndOffset = Offset + Size; 4037 if (EndOffset > SL->getSizeInBytes()) 4038 return nullptr; 4039 4040 unsigned Index = SL->getElementContainingOffset(Offset); 4041 Offset -= SL->getElementOffset(Index); 4042 4043 Type *ElementTy = STy->getElementType(Index); 4044 uint64_t ElementSize = DL.getTypeAllocSize(ElementTy).getFixedValue(); 4045 if (Offset >= ElementSize) 4046 return nullptr; // The offset points into alignment padding. 4047 4048 // See if any partition must be contained by the element. 4049 if (Offset > 0 || Size < ElementSize) { 4050 if ((Offset + Size) > ElementSize) 4051 return nullptr; 4052 return getTypePartition(DL, ElementTy, Offset, Size); 4053 } 4054 assert(Offset == 0); 4055 4056 if (Size == ElementSize) 4057 return stripAggregateTypeWrapping(DL, ElementTy); 4058 4059 StructType::element_iterator EI = STy->element_begin() + Index, 4060 EE = STy->element_end(); 4061 if (EndOffset < SL->getSizeInBytes()) { 4062 unsigned EndIndex = SL->getElementContainingOffset(EndOffset); 4063 if (Index == EndIndex) 4064 return nullptr; // Within a single element and its padding. 4065 4066 // Don't try to form "natural" types if the elements don't line up with the 4067 // expected size. 4068 // FIXME: We could potentially recurse down through the last element in the 4069 // sub-struct to find a natural end point. 4070 if (SL->getElementOffset(EndIndex) != EndOffset) 4071 return nullptr; 4072 4073 assert(Index < EndIndex); 4074 EE = STy->element_begin() + EndIndex; 4075 } 4076 4077 // Try to build up a sub-structure. 4078 StructType *SubTy = 4079 StructType::get(STy->getContext(), ArrayRef(EI, EE), STy->isPacked()); 4080 const StructLayout *SubSL = DL.getStructLayout(SubTy); 4081 if (Size != SubSL->getSizeInBytes()) 4082 return nullptr; // The sub-struct doesn't have quite the size needed. 4083 4084 return SubTy; 4085 } 4086 4087 /// Pre-split loads and stores to simplify rewriting. 4088 /// 4089 /// We want to break up the splittable load+store pairs as much as 4090 /// possible. 
/// This is important to do as a preprocessing step, as once we
/// start rewriting the accesses to partitions of the alloca we lose the
/// necessary information to correctly split apart paired loads and stores
/// which both point into this alloca. The case to consider is something like
/// the following:
///
///   %a = alloca [12 x i8]
///   %gep1 = getelementptr i8, ptr %a, i32 0
///   %gep2 = getelementptr i8, ptr %a, i32 4
///   %gep3 = getelementptr i8, ptr %a, i32 8
///   store float 0.0, ptr %gep1
///   store float 1.0, ptr %gep2
///   %v = load i64, ptr %gep1
///   store i64 %v, ptr %gep2
///   %f1 = load float, ptr %gep2
///   %f2 = load float, ptr %gep3
///
/// Here we want to form 3 partitions of the alloca, each 4 bytes large, and
/// promote everything so we recover the 2 SSA values that should have been
/// there all along.
///
/// \returns true if any changes are made.
bool SROAPass::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
  LLVM_DEBUG(dbgs() << "Pre-splitting loads and stores\n");

  // Track the loads and stores which are candidates for pre-splitting here, in
  // the order they first appear during the partition scan. These give stable
  // iteration order and a basis for tracking which loads and stores we
  // actually split.
  SmallVector<LoadInst *, 4> Loads;
  SmallVector<StoreInst *, 4> Stores;

  // We need to accumulate the splits required of each load or store where we
  // can find them via a direct lookup. This is important to cross-check loads
  // and stores against each other. We also track the slice so that we can kill
  // all the slices that end up split.
  struct SplitOffsets {
    Slice *S;
    std::vector<uint64_t> Splits;
  };
  SmallDenseMap<Instruction *, SplitOffsets, 8> SplitOffsetsMap;

  // Track loads out of this alloca which cannot, for any reason, be pre-split.
  // This is important as we also cannot pre-split stores of those loads!
  // FIXME: This is all pretty gross. It means that we can be more aggressive
  // in pre-splitting when the load feeding the store happens to come from
  // a separate alloca. Put another way, the effectiveness of SROA would be
  // decreased by a frontend which just concatenated all of its local allocas
  // into one big flat alloca. But defeating such patterns is exactly the job
  // SROA is tasked with! Sadly, to not have this discrepancy we would have to
  // change store pre-splitting to actually force pre-splitting of the load
  // that feeds it *and all stores*. That makes pre-splitting much harder, but
  // maybe it would make it more principled?
  SmallPtrSet<LoadInst *, 8> UnsplittableLoads;

  LLVM_DEBUG(dbgs() << " Searching for candidate loads and stores\n");
  for (auto &P : AS.partitions()) {
    for (Slice &S : P) {
      Instruction *I = cast<Instruction>(S.getUse()->getUser());
      if (!S.isSplittable() || S.endOffset() <= P.endOffset()) {
        // If this is a load we have to track that it can't participate in any
        // pre-splitting. If this is a store of a load we have to track that
        // that load also can't participate in any pre-splitting.
4153 if (auto *LI = dyn_cast<LoadInst>(I)) 4154 UnsplittableLoads.insert(LI); 4155 else if (auto *SI = dyn_cast<StoreInst>(I)) 4156 if (auto *LI = dyn_cast<LoadInst>(SI->getValueOperand())) 4157 UnsplittableLoads.insert(LI); 4158 continue; 4159 } 4160 assert(P.endOffset() > S.beginOffset() && 4161 "Empty or backwards partition!"); 4162 4163 // Determine if this is a pre-splittable slice. 4164 if (auto *LI = dyn_cast<LoadInst>(I)) { 4165 assert(!LI->isVolatile() && "Cannot split volatile loads!"); 4166 4167 // The load must be used exclusively to store into other pointers for 4168 // us to be able to arbitrarily pre-split it. The stores must also be 4169 // simple to avoid changing semantics. 4170 auto IsLoadSimplyStored = [](LoadInst *LI) { 4171 for (User *LU : LI->users()) { 4172 auto *SI = dyn_cast<StoreInst>(LU); 4173 if (!SI || !SI->isSimple()) 4174 return false; 4175 } 4176 return true; 4177 }; 4178 if (!IsLoadSimplyStored(LI)) { 4179 UnsplittableLoads.insert(LI); 4180 continue; 4181 } 4182 4183 Loads.push_back(LI); 4184 } else if (auto *SI = dyn_cast<StoreInst>(I)) { 4185 if (S.getUse() != &SI->getOperandUse(SI->getPointerOperandIndex())) 4186 // Skip stores *of* pointers. FIXME: This shouldn't even be possible! 4187 continue; 4188 auto *StoredLoad = dyn_cast<LoadInst>(SI->getValueOperand()); 4189 if (!StoredLoad || !StoredLoad->isSimple()) 4190 continue; 4191 assert(!SI->isVolatile() && "Cannot split volatile stores!"); 4192 4193 Stores.push_back(SI); 4194 } else { 4195 // Other uses cannot be pre-split. 4196 continue; 4197 } 4198 4199 // Record the initial split. 4200 LLVM_DEBUG(dbgs() << " Candidate: " << *I << "\n"); 4201 auto &Offsets = SplitOffsetsMap[I]; 4202 assert(Offsets.Splits.empty() && 4203 "Should not have splits the first time we see an instruction!"); 4204 Offsets.S = &S; 4205 Offsets.Splits.push_back(P.endOffset() - S.beginOffset()); 4206 } 4207 4208 // Now scan the already split slices, and add a split for any of them which 4209 // we're going to pre-split. 4210 for (Slice *S : P.splitSliceTails()) { 4211 auto SplitOffsetsMapI = 4212 SplitOffsetsMap.find(cast<Instruction>(S->getUse()->getUser())); 4213 if (SplitOffsetsMapI == SplitOffsetsMap.end()) 4214 continue; 4215 auto &Offsets = SplitOffsetsMapI->second; 4216 4217 assert(Offsets.S == S && "Found a mismatched slice!"); 4218 assert(!Offsets.Splits.empty() && 4219 "Cannot have an empty set of splits on the second partition!"); 4220 assert(Offsets.Splits.back() == 4221 P.beginOffset() - Offsets.S->beginOffset() && 4222 "Previous split does not end where this one begins!"); 4223 4224 // Record each split. The last partition's end isn't needed as the size 4225 // of the slice dictates that. 4226 if (S->endOffset() > P.endOffset()) 4227 Offsets.Splits.push_back(P.endOffset() - Offsets.S->beginOffset()); 4228 } 4229 } 4230 4231 // We may have split loads where some of their stores are split stores. For 4232 // such loads and stores, we can only pre-split them if their splits exactly 4233 // match relative to their starting offset. We have to verify this prior to 4234 // any rewriting. 4235 llvm::erase_if(Stores, [&UnsplittableLoads, &SplitOffsetsMap](StoreInst *SI) { 4236 // Lookup the load we are storing in our map of split 4237 // offsets. 4238 auto *LI = cast<LoadInst>(SI->getValueOperand()); 4239 // If it was completely unsplittable, then we're done, 4240 // and this store can't be pre-split. 
    if (UnsplittableLoads.count(LI))
      return true;

    auto LoadOffsetsI = SplitOffsetsMap.find(LI);
    if (LoadOffsetsI == SplitOffsetsMap.end())
      return false; // Unrelated loads are definitely safe.
    auto &LoadOffsets = LoadOffsetsI->second;

    // Now lookup the store's offsets.
    auto &StoreOffsets = SplitOffsetsMap[SI];

    // If the relative offsets of each split in the load and
    // store match exactly, then we can split them and we
    // don't need to remove them here.
    if (LoadOffsets.Splits == StoreOffsets.Splits)
      return false;

    LLVM_DEBUG(dbgs() << " Mismatched splits for load and store:\n"
                      << " " << *LI << "\n"
                      << " " << *SI << "\n");

    // We've found a store and load that we need to split
    // with mismatched relative splits. Just give up on them
    // and remove both instructions from our list of
    // candidates.
    UnsplittableLoads.insert(LI);
    return true;
  });
  // Now we have to go *back* through all the stores, because a later store may
  // have caused an earlier store's load to become unsplittable and if it is
  // unsplittable for the later store, then we can't rely on it being split in
  // the earlier store either.
  llvm::erase_if(Stores, [&UnsplittableLoads](StoreInst *SI) {
    auto *LI = cast<LoadInst>(SI->getValueOperand());
    return UnsplittableLoads.count(LI);
  });
  // Once we've established all the loads that can't be split for some reason,
  // filter out any that made it into our list.
  llvm::erase_if(Loads, [&UnsplittableLoads](LoadInst *LI) {
    return UnsplittableLoads.count(LI);
  });

  // If no loads or stores are left, there is no pre-splitting to be done for
  // this alloca.
  if (Loads.empty() && Stores.empty())
    return false;

  // From here on, we can't fail and will be building new accesses, so rig up
  // an IR builder.
  IRBuilderTy IRB(&AI);

  // Collect the new slices which we will merge into the alloca slices.
  SmallVector<Slice, 4> NewSlices;

  // Track any allocas we end up splitting loads and stores for so we iterate
  // on them.
  SmallPtrSet<AllocaInst *, 4> ResplitPromotableAllocas;

  // At this point, we have collected all of the loads and stores we can
  // pre-split, and the specific splits needed for them. We actually do the
  // splitting in a specific order to handle the case when one of the loads is
  // the value operand of one of the stores.
  //
  // First, we rewrite all of the split loads, and just accumulate each split
  // load in a parallel structure. We also build the slices for them and append
  // them to the alloca slices.
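  // Roughly, a load such as
  //   %v = load i64, ptr %gep1
  // that must be split at byte 4 is rewritten as two i32 loads through
  // pointers adjusted to offsets 0 and 4, and each new partial load gets its
  // own unsplittable slice appended to the alloca slices.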
4307 SmallDenseMap<LoadInst *, std::vector<LoadInst *>, 1> SplitLoadsMap; 4308 std::vector<LoadInst *> SplitLoads; 4309 const DataLayout &DL = AI.getModule()->getDataLayout(); 4310 for (LoadInst *LI : Loads) { 4311 SplitLoads.clear(); 4312 4313 auto &Offsets = SplitOffsetsMap[LI]; 4314 unsigned SliceSize = Offsets.S->endOffset() - Offsets.S->beginOffset(); 4315 assert(LI->getType()->getIntegerBitWidth() % 8 == 0 && 4316 "Load must have type size equal to store size"); 4317 assert(LI->getType()->getIntegerBitWidth() / 8 >= SliceSize && 4318 "Load must be >= slice size"); 4319 4320 uint64_t BaseOffset = Offsets.S->beginOffset(); 4321 assert(BaseOffset + SliceSize > BaseOffset && 4322 "Cannot represent alloca access size using 64-bit integers!"); 4323 4324 Instruction *BasePtr = cast<Instruction>(LI->getPointerOperand()); 4325 IRB.SetInsertPoint(LI); 4326 4327 LLVM_DEBUG(dbgs() << " Splitting load: " << *LI << "\n"); 4328 4329 uint64_t PartOffset = 0, PartSize = Offsets.Splits.front(); 4330 int Idx = 0, Size = Offsets.Splits.size(); 4331 for (;;) { 4332 auto *PartTy = Type::getIntNTy(LI->getContext(), PartSize * 8); 4333 auto AS = LI->getPointerAddressSpace(); 4334 auto *PartPtrTy = PartTy->getPointerTo(AS); 4335 LoadInst *PLoad = IRB.CreateAlignedLoad( 4336 PartTy, 4337 getAdjustedPtr(IRB, DL, BasePtr, 4338 APInt(DL.getIndexSizeInBits(AS), PartOffset), 4339 PartPtrTy, BasePtr->getName() + "."), 4340 getAdjustedAlignment(LI, PartOffset), 4341 /*IsVolatile*/ false, LI->getName()); 4342 PLoad->copyMetadata(*LI, {LLVMContext::MD_mem_parallel_loop_access, 4343 LLVMContext::MD_access_group}); 4344 4345 // Append this load onto the list of split loads so we can find it later 4346 // to rewrite the stores. 4347 SplitLoads.push_back(PLoad); 4348 4349 // Now build a new slice for the alloca. 4350 NewSlices.push_back( 4351 Slice(BaseOffset + PartOffset, BaseOffset + PartOffset + PartSize, 4352 &PLoad->getOperandUse(PLoad->getPointerOperandIndex()), 4353 /*IsSplittable*/ false)); 4354 LLVM_DEBUG(dbgs() << " new slice [" << NewSlices.back().beginOffset() 4355 << ", " << NewSlices.back().endOffset() 4356 << "): " << *PLoad << "\n"); 4357 4358 // See if we've handled all the splits. 4359 if (Idx >= Size) 4360 break; 4361 4362 // Setup the next partition. 4363 PartOffset = Offsets.Splits[Idx]; 4364 ++Idx; 4365 PartSize = (Idx < Size ? Offsets.Splits[Idx] : SliceSize) - PartOffset; 4366 } 4367 4368 // Now that we have the split loads, do the slow walk over all uses of the 4369 // load and rewrite them as split stores, or save the split loads to use 4370 // below if the store is going to be split there anyways. 4371 bool DeferredStores = false; 4372 for (User *LU : LI->users()) { 4373 StoreInst *SI = cast<StoreInst>(LU); 4374 if (!Stores.empty() && SplitOffsetsMap.count(SI)) { 4375 DeferredStores = true; 4376 LLVM_DEBUG(dbgs() << " Deferred splitting of store: " << *SI 4377 << "\n"); 4378 continue; 4379 } 4380 4381 Value *StoreBasePtr = SI->getPointerOperand(); 4382 IRB.SetInsertPoint(SI); 4383 4384 LLVM_DEBUG(dbgs() << " Splitting store of load: " << *SI << "\n"); 4385 4386 for (int Idx = 0, Size = SplitLoads.size(); Idx < Size; ++Idx) { 4387 LoadInst *PLoad = SplitLoads[Idx]; 4388 uint64_t PartOffset = Idx == 0 ? 
0 : Offsets.Splits[Idx - 1]; 4389 auto *PartPtrTy = 4390 PLoad->getType()->getPointerTo(SI->getPointerAddressSpace()); 4391 4392 auto AS = SI->getPointerAddressSpace(); 4393 StoreInst *PStore = IRB.CreateAlignedStore( 4394 PLoad, 4395 getAdjustedPtr(IRB, DL, StoreBasePtr, 4396 APInt(DL.getIndexSizeInBits(AS), PartOffset), 4397 PartPtrTy, StoreBasePtr->getName() + "."), 4398 getAdjustedAlignment(SI, PartOffset), 4399 /*IsVolatile*/ false); 4400 PStore->copyMetadata(*SI, {LLVMContext::MD_mem_parallel_loop_access, 4401 LLVMContext::MD_access_group, 4402 LLVMContext::MD_DIAssignID}); 4403 LLVM_DEBUG(dbgs() << " +" << PartOffset << ":" << *PStore << "\n"); 4404 } 4405 4406 // We want to immediately iterate on any allocas impacted by splitting 4407 // this store, and we have to track any promotable alloca (indicated by 4408 // a direct store) as needing to be resplit because it is no longer 4409 // promotable. 4410 if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(StoreBasePtr)) { 4411 ResplitPromotableAllocas.insert(OtherAI); 4412 Worklist.insert(OtherAI); 4413 } else if (AllocaInst *OtherAI = dyn_cast<AllocaInst>( 4414 StoreBasePtr->stripInBoundsOffsets())) { 4415 Worklist.insert(OtherAI); 4416 } 4417 4418 // Mark the original store as dead. 4419 DeadInsts.push_back(SI); 4420 } 4421 4422 // Save the split loads if there are deferred stores among the users. 4423 if (DeferredStores) 4424 SplitLoadsMap.insert(std::make_pair(LI, std::move(SplitLoads))); 4425 4426 // Mark the original load as dead and kill the original slice. 4427 DeadInsts.push_back(LI); 4428 Offsets.S->kill(); 4429 } 4430 4431 // Second, we rewrite all of the split stores. At this point, we know that 4432 // all loads from this alloca have been split already. For stores of such 4433 // loads, we can simply look up the pre-existing split loads. For stores of 4434 // other loads, we split those loads first and then write split stores of 4435 // them. 4436 for (StoreInst *SI : Stores) { 4437 auto *LI = cast<LoadInst>(SI->getValueOperand()); 4438 IntegerType *Ty = cast<IntegerType>(LI->getType()); 4439 assert(Ty->getBitWidth() % 8 == 0); 4440 uint64_t StoreSize = Ty->getBitWidth() / 8; 4441 assert(StoreSize > 0 && "Cannot have a zero-sized integer store!"); 4442 4443 auto &Offsets = SplitOffsetsMap[SI]; 4444 assert(StoreSize == Offsets.S->endOffset() - Offsets.S->beginOffset() && 4445 "Slice size should always match load size exactly!"); 4446 uint64_t BaseOffset = Offsets.S->beginOffset(); 4447 assert(BaseOffset + StoreSize > BaseOffset && 4448 "Cannot represent alloca access size using 64-bit integers!"); 4449 4450 Value *LoadBasePtr = LI->getPointerOperand(); 4451 Instruction *StoreBasePtr = cast<Instruction>(SI->getPointerOperand()); 4452 4453 LLVM_DEBUG(dbgs() << " Splitting store: " << *SI << "\n"); 4454 4455 // Check whether we have an already split load. 
4456 auto SplitLoadsMapI = SplitLoadsMap.find(LI); 4457 std::vector<LoadInst *> *SplitLoads = nullptr; 4458 if (SplitLoadsMapI != SplitLoadsMap.end()) { 4459 SplitLoads = &SplitLoadsMapI->second; 4460 assert(SplitLoads->size() == Offsets.Splits.size() + 1 && 4461 "Too few split loads for the number of splits in the store!"); 4462 } else { 4463 LLVM_DEBUG(dbgs() << " of load: " << *LI << "\n"); 4464 } 4465 4466 uint64_t PartOffset = 0, PartSize = Offsets.Splits.front(); 4467 int Idx = 0, Size = Offsets.Splits.size(); 4468 for (;;) { 4469 auto *PartTy = Type::getIntNTy(Ty->getContext(), PartSize * 8); 4470 auto *LoadPartPtrTy = PartTy->getPointerTo(LI->getPointerAddressSpace()); 4471 auto *StorePartPtrTy = PartTy->getPointerTo(SI->getPointerAddressSpace()); 4472 4473 // Either lookup a split load or create one. 4474 LoadInst *PLoad; 4475 if (SplitLoads) { 4476 PLoad = (*SplitLoads)[Idx]; 4477 } else { 4478 IRB.SetInsertPoint(LI); 4479 auto AS = LI->getPointerAddressSpace(); 4480 PLoad = IRB.CreateAlignedLoad( 4481 PartTy, 4482 getAdjustedPtr(IRB, DL, LoadBasePtr, 4483 APInt(DL.getIndexSizeInBits(AS), PartOffset), 4484 LoadPartPtrTy, LoadBasePtr->getName() + "."), 4485 getAdjustedAlignment(LI, PartOffset), 4486 /*IsVolatile*/ false, LI->getName()); 4487 PLoad->copyMetadata(*LI, {LLVMContext::MD_mem_parallel_loop_access, 4488 LLVMContext::MD_access_group}); 4489 } 4490 4491 // And store this partition. 4492 IRB.SetInsertPoint(SI); 4493 auto AS = SI->getPointerAddressSpace(); 4494 StoreInst *PStore = IRB.CreateAlignedStore( 4495 PLoad, 4496 getAdjustedPtr(IRB, DL, StoreBasePtr, 4497 APInt(DL.getIndexSizeInBits(AS), PartOffset), 4498 StorePartPtrTy, StoreBasePtr->getName() + "."), 4499 getAdjustedAlignment(SI, PartOffset), 4500 /*IsVolatile*/ false); 4501 PStore->copyMetadata(*SI, {LLVMContext::MD_mem_parallel_loop_access, 4502 LLVMContext::MD_access_group}); 4503 4504 // Now build a new slice for the alloca. 4505 NewSlices.push_back( 4506 Slice(BaseOffset + PartOffset, BaseOffset + PartOffset + PartSize, 4507 &PStore->getOperandUse(PStore->getPointerOperandIndex()), 4508 /*IsSplittable*/ false)); 4509 LLVM_DEBUG(dbgs() << " new slice [" << NewSlices.back().beginOffset() 4510 << ", " << NewSlices.back().endOffset() 4511 << "): " << *PStore << "\n"); 4512 if (!SplitLoads) { 4513 LLVM_DEBUG(dbgs() << " of split load: " << *PLoad << "\n"); 4514 } 4515 4516 // See if we've finished all the splits. 4517 if (Idx >= Size) 4518 break; 4519 4520 // Setup the next partition. 4521 PartOffset = Offsets.Splits[Idx]; 4522 ++Idx; 4523 PartSize = (Idx < Size ? Offsets.Splits[Idx] : StoreSize) - PartOffset; 4524 } 4525 4526 // We want to immediately iterate on any allocas impacted by splitting 4527 // this load, which is only relevant if it isn't a load of this alloca and 4528 // thus we didn't already split the loads above. We also have to keep track 4529 // of any promotable allocas we split loads on as they can no longer be 4530 // promoted. 4531 if (!SplitLoads) { 4532 if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(LoadBasePtr)) { 4533 assert(OtherAI != &AI && "We can't re-split our own alloca!"); 4534 ResplitPromotableAllocas.insert(OtherAI); 4535 Worklist.insert(OtherAI); 4536 } else if (AllocaInst *OtherAI = dyn_cast<AllocaInst>( 4537 LoadBasePtr->stripInBoundsOffsets())) { 4538 assert(OtherAI != &AI && "We can't re-split our own alloca!"); 4539 Worklist.insert(OtherAI); 4540 } 4541 } 4542 4543 // Mark the original store as dead now that we've split it up and kill its 4544 // slice. 
    // Note that we leave the original load in place unless this store
    // was its only use. It may in turn be split up if it is an alloca load
    // for some other alloca, but it may be a normal load. This may introduce
    // redundant loads, but where those can be merged the rest of the optimizer
    // should handle the merging, and this uncovers SSA splits, which is more
    // important. In practice, the original loads will almost always be fully
    // split and removed eventually, and the splits will be merged by any
    // trivial CSE, including instcombine.
    if (LI->hasOneUse()) {
      assert(*LI->user_begin() == SI && "Single use isn't this store!");
      DeadInsts.push_back(LI);
    }
    DeadInsts.push_back(SI);
    Offsets.S->kill();
  }

  // Remove the killed slices that have been pre-split.
  llvm::erase_if(AS, [](const Slice &S) { return S.isDead(); });

  // Insert our new slices. This will sort and merge them into the sorted
  // sequence.
  AS.insert(NewSlices);

  LLVM_DEBUG(dbgs() << "  Pre-split slices:\n");
#ifndef NDEBUG
  for (auto I = AS.begin(), E = AS.end(); I != E; ++I)
    LLVM_DEBUG(AS.print(dbgs(), I, "    "));
#endif

  // Finally, don't try to promote any allocas that now require re-splitting.
  // They have already been added to the worklist above.
  llvm::erase_if(PromotableAllocas, [&](AllocaInst *AI) {
    return ResplitPromotableAllocas.count(AI);
  });

  return true;
}

/// Rewrite an alloca partition's users.
///
/// This routine drives both of the rewriting goals of the SROA pass. It tries
/// to rewrite uses of an alloca partition to be conducive to SSA value
/// promotion. If the partition needs a new, more refined alloca, this will
/// build that new alloca, preserving as much type information as possible, and
/// rewrite the uses of the old alloca to point at the new one and have the
/// appropriate new offsets. It also evaluates how successful the rewrite was
/// at enabling promotion and, if it was successful, queues the alloca to be
/// promoted.
AllocaInst *SROAPass::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
                                       Partition &P) {
  // Try to compute a friendly type for this partition of the alloca. This
  // won't always succeed, in which case we fall back to a legal integer type
  // or an i8 array of an appropriate size.
  Type *SliceTy = nullptr;
  VectorType *SliceVecTy = nullptr;
  const DataLayout &DL = AI.getModule()->getDataLayout();
  std::pair<Type *, IntegerType *> CommonUseTy =
      findCommonType(P.begin(), P.end(), P.endOffset());
  // Do all uses operate on the same type?
  if (CommonUseTy.first)
    if (DL.getTypeAllocSize(CommonUseTy.first).getFixedValue() >= P.size()) {
      SliceTy = CommonUseTy.first;
      SliceVecTy = dyn_cast<VectorType>(SliceTy);
    }
  // If not, can we find an appropriate subtype in the original allocated type?
  if (!SliceTy)
    if (Type *TypePartitionTy = getTypePartition(DL, AI.getAllocatedType(),
                                                 P.beginOffset(), P.size()))
      SliceTy = TypePartitionTy;

  // If still not, can we use the largest bitwidth integer type used?
4615 if (!SliceTy && CommonUseTy.second) 4616 if (DL.getTypeAllocSize(CommonUseTy.second).getFixedValue() >= P.size()) { 4617 SliceTy = CommonUseTy.second; 4618 SliceVecTy = dyn_cast<VectorType>(SliceTy); 4619 } 4620 if ((!SliceTy || (SliceTy->isArrayTy() && 4621 SliceTy->getArrayElementType()->isIntegerTy())) && 4622 DL.isLegalInteger(P.size() * 8)) { 4623 SliceTy = Type::getIntNTy(*C, P.size() * 8); 4624 } 4625 4626 // If the common use types are not viable for promotion then attempt to find 4627 // another type that is viable. 4628 if (SliceVecTy && !checkVectorTypeForPromotion(P, SliceVecTy, DL)) 4629 if (Type *TypePartitionTy = getTypePartition(DL, AI.getAllocatedType(), 4630 P.beginOffset(), P.size())) { 4631 VectorType *TypePartitionVecTy = dyn_cast<VectorType>(TypePartitionTy); 4632 if (TypePartitionVecTy && 4633 checkVectorTypeForPromotion(P, TypePartitionVecTy, DL)) 4634 SliceTy = TypePartitionTy; 4635 } 4636 4637 if (!SliceTy) 4638 SliceTy = ArrayType::get(Type::getInt8Ty(*C), P.size()); 4639 assert(DL.getTypeAllocSize(SliceTy).getFixedValue() >= P.size()); 4640 4641 bool IsIntegerPromotable = isIntegerWideningViable(P, SliceTy, DL); 4642 4643 VectorType *VecTy = 4644 IsIntegerPromotable ? nullptr : isVectorPromotionViable(P, DL); 4645 if (VecTy) 4646 SliceTy = VecTy; 4647 4648 // Check for the case where we're going to rewrite to a new alloca of the 4649 // exact same type as the original, and with the same access offsets. In that 4650 // case, re-use the existing alloca, but still run through the rewriter to 4651 // perform phi and select speculation. 4652 // P.beginOffset() can be non-zero even with the same type in a case with 4653 // out-of-bounds access (e.g. @PR35657 function in SROA/basictest.ll). 4654 AllocaInst *NewAI; 4655 if (SliceTy == AI.getAllocatedType() && P.beginOffset() == 0) { 4656 NewAI = &AI; 4657 // FIXME: We should be able to bail at this point with "nothing changed". 4658 // FIXME: We might want to defer PHI speculation until after here. 4659 // FIXME: return nullptr; 4660 } else { 4661 // Make sure the alignment is compatible with P.beginOffset(). 4662 const Align Alignment = commonAlignment(AI.getAlign(), P.beginOffset()); 4663 // If we will get at least this much alignment from the type alone, leave 4664 // the alloca's alignment unconstrained. 4665 const bool IsUnconstrained = Alignment <= DL.getABITypeAlign(SliceTy); 4666 NewAI = new AllocaInst( 4667 SliceTy, AI.getAddressSpace(), nullptr, 4668 IsUnconstrained ? DL.getPrefTypeAlign(SliceTy) : Alignment, 4669 AI.getName() + ".sroa." + Twine(P.begin() - AS.begin()), &AI); 4670 // Copy the old AI debug location over to the new one. 4671 NewAI->setDebugLoc(AI.getDebugLoc()); 4672 ++NumNewAllocas; 4673 } 4674 4675 LLVM_DEBUG(dbgs() << "Rewriting alloca partition " 4676 << "[" << P.beginOffset() << "," << P.endOffset() 4677 << ") to: " << *NewAI << "\n"); 4678 4679 // Track the high watermark on the worklist as it is only relevant for 4680 // promoted allocas. We will reset it to this point if the alloca is not in 4681 // fact scheduled for promotion. 
  unsigned PPWOldSize = PostPromotionWorklist.size();
  unsigned NumUses = 0;
  SmallSetVector<PHINode *, 8> PHIUsers;
  SmallSetVector<SelectInst *, 8> SelectUsers;

  AllocaSliceRewriter Rewriter(DL, AS, *this, AI, *NewAI, P.beginOffset(),
                               P.endOffset(), IsIntegerPromotable, VecTy,
                               PHIUsers, SelectUsers);
  bool Promotable = true;
  for (Slice *S : P.splitSliceTails()) {
    Promotable &= Rewriter.visit(S);
    ++NumUses;
  }
  for (Slice &S : P) {
    Promotable &= Rewriter.visit(&S);
    ++NumUses;
  }

  NumAllocaPartitionUses += NumUses;
  MaxUsesPerAllocaPartition.updateMax(NumUses);

  // Now that we've processed all the slices in the new partition, check if any
  // PHIs or Selects would block promotion.
  for (PHINode *PHI : PHIUsers)
    if (!isSafePHIToSpeculate(*PHI)) {
      Promotable = false;
      PHIUsers.clear();
      SelectUsers.clear();
      break;
    }

  SmallVector<std::pair<SelectInst *, RewriteableMemOps>, 2>
      NewSelectsToRewrite;
  NewSelectsToRewrite.reserve(SelectUsers.size());
  for (SelectInst *Sel : SelectUsers) {
    std::optional<RewriteableMemOps> Ops =
        isSafeSelectToSpeculate(*Sel, PreserveCFG);
    if (!Ops) {
      Promotable = false;
      PHIUsers.clear();
      SelectUsers.clear();
      NewSelectsToRewrite.clear();
      break;
    }
    NewSelectsToRewrite.emplace_back(std::make_pair(Sel, *Ops));
  }

  if (Promotable) {
    for (Use *U : AS.getDeadUsesIfPromotable()) {
      auto *OldInst = dyn_cast<Instruction>(U->get());
      Value::dropDroppableUse(*U);
      if (OldInst)
        if (isInstructionTriviallyDead(OldInst))
          DeadInsts.push_back(OldInst);
    }
    if (PHIUsers.empty() && SelectUsers.empty()) {
      // Promote the alloca.
      PromotableAllocas.push_back(NewAI);
    } else {
      // If we have either PHIs or Selects to speculate, add them to those
      // worklists and re-queue the new alloca so that we promote it on the
      // next iteration.
      for (PHINode *PHIUser : PHIUsers)
        SpeculatablePHIs.insert(PHIUser);
      SelectsToRewrite.reserve(SelectsToRewrite.size() +
                               NewSelectsToRewrite.size());
      for (auto &&KV : llvm::make_range(
               std::make_move_iterator(NewSelectsToRewrite.begin()),
               std::make_move_iterator(NewSelectsToRewrite.end())))
        SelectsToRewrite.insert(std::move(KV));
      Worklist.insert(NewAI);
    }
  } else {
    // Drop any post-promotion work items if promotion didn't happen.
    while (PostPromotionWorklist.size() > PPWOldSize)
      PostPromotionWorklist.pop_back();

    // We couldn't promote and we didn't create a new partition, so nothing
    // happened.
    if (NewAI == &AI)
      return nullptr;

    // If we can't promote the alloca, iterate on it to check for new
    // refinements exposed by splitting the current alloca. Don't iterate on an
    // alloca which didn't actually change and didn't get promoted.
    Worklist.insert(NewAI);
  }

  return NewAI;
}

/// Walks the slices of an alloca and forms partitions based on them,
/// rewriting each of their uses.
bool SROAPass::splitAlloca(AllocaInst &AI, AllocaSlices &AS) {
  if (AS.begin() == AS.end())
    return false;

  unsigned NumPartitions = 0;
  bool Changed = false;
  const DataLayout &DL = AI.getModule()->getDataLayout();

  // First try to pre-split loads and stores.
4784 Changed |= presplitLoadsAndStores(AI, AS); 4785 4786 // Now that we have identified any pre-splitting opportunities, 4787 // mark loads and stores unsplittable except for the following case. 4788 // We leave a slice splittable if all other slices are disjoint or fully 4789 // included in the slice, such as whole-alloca loads and stores. 4790 // If we fail to split these during pre-splitting, we want to force them 4791 // to be rewritten into a partition. 4792 bool IsSorted = true; 4793 4794 uint64_t AllocaSize = 4795 DL.getTypeAllocSize(AI.getAllocatedType()).getFixedValue(); 4796 const uint64_t MaxBitVectorSize = 1024; 4797 if (AllocaSize <= MaxBitVectorSize) { 4798 // If a byte boundary is included in any load or store, a slice starting or 4799 // ending at the boundary is not splittable. 4800 SmallBitVector SplittableOffset(AllocaSize + 1, true); 4801 for (Slice &S : AS) 4802 for (unsigned O = S.beginOffset() + 1; 4803 O < S.endOffset() && O < AllocaSize; O++) 4804 SplittableOffset.reset(O); 4805 4806 for (Slice &S : AS) { 4807 if (!S.isSplittable()) 4808 continue; 4809 4810 if ((S.beginOffset() > AllocaSize || SplittableOffset[S.beginOffset()]) && 4811 (S.endOffset() > AllocaSize || SplittableOffset[S.endOffset()])) 4812 continue; 4813 4814 if (isa<LoadInst>(S.getUse()->getUser()) || 4815 isa<StoreInst>(S.getUse()->getUser())) { 4816 S.makeUnsplittable(); 4817 IsSorted = false; 4818 } 4819 } 4820 } 4821 else { 4822 // We only allow whole-alloca splittable loads and stores 4823 // for a large alloca to avoid creating too large BitVector. 4824 for (Slice &S : AS) { 4825 if (!S.isSplittable()) 4826 continue; 4827 4828 if (S.beginOffset() == 0 && S.endOffset() >= AllocaSize) 4829 continue; 4830 4831 if (isa<LoadInst>(S.getUse()->getUser()) || 4832 isa<StoreInst>(S.getUse()->getUser())) { 4833 S.makeUnsplittable(); 4834 IsSorted = false; 4835 } 4836 } 4837 } 4838 4839 if (!IsSorted) 4840 llvm::sort(AS); 4841 4842 /// Describes the allocas introduced by rewritePartition in order to migrate 4843 /// the debug info. 4844 struct Fragment { 4845 AllocaInst *Alloca; 4846 uint64_t Offset; 4847 uint64_t Size; 4848 Fragment(AllocaInst *AI, uint64_t O, uint64_t S) 4849 : Alloca(AI), Offset(O), Size(S) {} 4850 }; 4851 SmallVector<Fragment, 4> Fragments; 4852 4853 // Rewrite each partition. 4854 for (auto &P : AS.partitions()) { 4855 if (AllocaInst *NewAI = rewritePartition(AI, AS, P)) { 4856 Changed = true; 4857 if (NewAI != &AI) { 4858 uint64_t SizeOfByte = 8; 4859 uint64_t AllocaSize = 4860 DL.getTypeSizeInBits(NewAI->getAllocatedType()).getFixedValue(); 4861 // Don't include any padding. 4862 uint64_t Size = std::min(AllocaSize, P.size() * SizeOfByte); 4863 Fragments.push_back(Fragment(NewAI, P.beginOffset() * SizeOfByte, Size)); 4864 } 4865 } 4866 ++NumPartitions; 4867 } 4868 4869 NumAllocaPartitions += NumPartitions; 4870 MaxPartitionsPerAlloca.updateMax(NumPartitions); 4871 4872 // Migrate debug information from the old alloca to the new alloca(s) 4873 // and the individual partitions. 
4874 TinyPtrVector<DbgVariableIntrinsic *> DbgDeclares = FindDbgAddrUses(&AI); 4875 for (auto *DbgAssign : at::getAssignmentMarkers(&AI)) 4876 DbgDeclares.push_back(DbgAssign); 4877 for (DbgVariableIntrinsic *DbgDeclare : DbgDeclares) { 4878 auto *Expr = DbgDeclare->getExpression(); 4879 DIBuilder DIB(*AI.getModule(), /*AllowUnresolved*/ false); 4880 uint64_t AllocaSize = 4881 DL.getTypeSizeInBits(AI.getAllocatedType()).getFixedValue(); 4882 for (auto Fragment : Fragments) { 4883 // Create a fragment expression describing the new partition or reuse AI's 4884 // expression if there is only one partition. 4885 auto *FragmentExpr = Expr; 4886 if (Fragment.Size < AllocaSize || Expr->isFragment()) { 4887 // If this alloca is already a scalar replacement of a larger aggregate, 4888 // Fragment.Offset describes the offset inside the scalar. 4889 auto ExprFragment = Expr->getFragmentInfo(); 4890 uint64_t Offset = ExprFragment ? ExprFragment->OffsetInBits : 0; 4891 uint64_t Start = Offset + Fragment.Offset; 4892 uint64_t Size = Fragment.Size; 4893 if (ExprFragment) { 4894 uint64_t AbsEnd = 4895 ExprFragment->OffsetInBits + ExprFragment->SizeInBits; 4896 if (Start >= AbsEnd) { 4897 // No need to describe a SROAed padding. 4898 continue; 4899 } 4900 Size = std::min(Size, AbsEnd - Start); 4901 } 4902 // The new, smaller fragment is stenciled out from the old fragment. 4903 if (auto OrigFragment = FragmentExpr->getFragmentInfo()) { 4904 assert(Start >= OrigFragment->OffsetInBits && 4905 "new fragment is outside of original fragment"); 4906 Start -= OrigFragment->OffsetInBits; 4907 } 4908 4909 // The alloca may be larger than the variable. 4910 auto VarSize = DbgDeclare->getVariable()->getSizeInBits(); 4911 if (VarSize) { 4912 if (Size > *VarSize) 4913 Size = *VarSize; 4914 if (Size == 0 || Start + Size > *VarSize) 4915 continue; 4916 } 4917 4918 // Avoid creating a fragment expression that covers the entire variable. 4919 if (!VarSize || *VarSize != Size) { 4920 if (auto E = 4921 DIExpression::createFragmentExpression(Expr, Start, Size)) 4922 FragmentExpr = *E; 4923 else 4924 continue; 4925 } 4926 } 4927 4928 // Remove any existing intrinsics on the new alloca describing 4929 // the variable fragment. 4930 for (DbgVariableIntrinsic *OldDII : FindDbgAddrUses(Fragment.Alloca)) { 4931 auto SameVariableFragment = [](const DbgVariableIntrinsic *LHS, 4932 const DbgVariableIntrinsic *RHS) { 4933 return LHS->getVariable() == RHS->getVariable() && 4934 LHS->getDebugLoc()->getInlinedAt() == 4935 RHS->getDebugLoc()->getInlinedAt(); 4936 }; 4937 if (SameVariableFragment(OldDII, DbgDeclare)) 4938 OldDII->eraseFromParent(); 4939 } 4940 4941 if (auto *DbgAssign = dyn_cast<DbgAssignIntrinsic>(DbgDeclare)) { 4942 if (!Fragment.Alloca->hasMetadata(LLVMContext::MD_DIAssignID)) { 4943 Fragment.Alloca->setMetadata( 4944 LLVMContext::MD_DIAssignID, 4945 DIAssignID::getDistinct(AI.getContext())); 4946 } 4947 auto *NewAssign = DIB.insertDbgAssign( 4948 Fragment.Alloca, DbgAssign->getValue(), DbgAssign->getVariable(), 4949 FragmentExpr, Fragment.Alloca, DbgAssign->getAddressExpression(), 4950 DbgAssign->getDebugLoc()); 4951 NewAssign->setDebugLoc(DbgAssign->getDebugLoc()); 4952 LLVM_DEBUG(dbgs() << "Created new assign intrinsic: " << *NewAssign 4953 << "\n"); 4954 } else { 4955 DIB.insertDeclare(Fragment.Alloca, DbgDeclare->getVariable(), 4956 FragmentExpr, DbgDeclare->getDebugLoc(), &AI); 4957 } 4958 } 4959 } 4960 return Changed; 4961 } 4962 4963 /// Clobber a use with poison, deleting the used value if it becomes dead. 
4964 void SROAPass::clobberUse(Use &U) { 4965 Value *OldV = U; 4966 // Replace the use with an poison value. 4967 U = PoisonValue::get(OldV->getType()); 4968 4969 // Check for this making an instruction dead. We have to garbage collect 4970 // all the dead instructions to ensure the uses of any alloca end up being 4971 // minimal. 4972 if (Instruction *OldI = dyn_cast<Instruction>(OldV)) 4973 if (isInstructionTriviallyDead(OldI)) { 4974 DeadInsts.push_back(OldI); 4975 } 4976 } 4977 4978 /// Analyze an alloca for SROA. 4979 /// 4980 /// This analyzes the alloca to ensure we can reason about it, builds 4981 /// the slices of the alloca, and then hands it off to be split and 4982 /// rewritten as needed. 4983 std::pair<bool /*Changed*/, bool /*CFGChanged*/> 4984 SROAPass::runOnAlloca(AllocaInst &AI) { 4985 bool Changed = false; 4986 bool CFGChanged = false; 4987 4988 LLVM_DEBUG(dbgs() << "SROA alloca: " << AI << "\n"); 4989 ++NumAllocasAnalyzed; 4990 4991 // Special case dead allocas, as they're trivial. 4992 if (AI.use_empty()) { 4993 AI.eraseFromParent(); 4994 Changed = true; 4995 return {Changed, CFGChanged}; 4996 } 4997 const DataLayout &DL = AI.getModule()->getDataLayout(); 4998 4999 // Skip alloca forms that this analysis can't handle. 5000 auto *AT = AI.getAllocatedType(); 5001 if (AI.isArrayAllocation() || !AT->isSized() || isa<ScalableVectorType>(AT) || 5002 DL.getTypeAllocSize(AT).getFixedValue() == 0) 5003 return {Changed, CFGChanged}; 5004 5005 // First, split any FCA loads and stores touching this alloca to promote 5006 // better splitting and promotion opportunities. 5007 IRBuilderTy IRB(&AI); 5008 AggLoadStoreRewriter AggRewriter(DL, IRB); 5009 Changed |= AggRewriter.rewrite(AI); 5010 5011 // Build the slices using a recursive instruction-visiting builder. 5012 AllocaSlices AS(DL, AI); 5013 LLVM_DEBUG(AS.print(dbgs())); 5014 if (AS.isEscaped()) 5015 return {Changed, CFGChanged}; 5016 5017 // Delete all the dead users of this alloca before splitting and rewriting it. 5018 for (Instruction *DeadUser : AS.getDeadUsers()) { 5019 // Free up everything used by this instruction. 5020 for (Use &DeadOp : DeadUser->operands()) 5021 clobberUse(DeadOp); 5022 5023 // Now replace the uses of this instruction. 5024 DeadUser->replaceAllUsesWith(PoisonValue::get(DeadUser->getType())); 5025 5026 // And mark it for deletion. 5027 DeadInsts.push_back(DeadUser); 5028 Changed = true; 5029 } 5030 for (Use *DeadOp : AS.getDeadOperands()) { 5031 clobberUse(*DeadOp); 5032 Changed = true; 5033 } 5034 5035 // No slices to split. Leave the dead alloca for a later pass to clean up. 5036 if (AS.begin() == AS.end()) 5037 return {Changed, CFGChanged}; 5038 5039 Changed |= splitAlloca(AI, AS); 5040 5041 LLVM_DEBUG(dbgs() << " Speculating PHIs\n"); 5042 while (!SpeculatablePHIs.empty()) 5043 speculatePHINodeLoads(IRB, *SpeculatablePHIs.pop_back_val()); 5044 5045 LLVM_DEBUG(dbgs() << " Rewriting Selects\n"); 5046 auto RemainingSelectsToRewrite = SelectsToRewrite.takeVector(); 5047 while (!RemainingSelectsToRewrite.empty()) { 5048 const auto [K, V] = RemainingSelectsToRewrite.pop_back_val(); 5049 CFGChanged |= 5050 rewriteSelectInstMemOps(*K, V, IRB, PreserveCFG ? nullptr : DTU); 5051 } 5052 5053 return {Changed, CFGChanged}; 5054 } 5055 5056 /// Delete the dead instructions accumulated in this run. 5057 /// 5058 /// Recursively deletes the dead instructions we've accumulated. 
This is done 5059 /// at the very end to maximize locality of the recursive delete and to 5060 /// minimize the problems of invalidated instruction pointers as such pointers 5061 /// are used heavily in the intermediate stages of the algorithm. 5062 /// 5063 /// We also record the alloca instructions deleted here so that they aren't 5064 /// subsequently handed to mem2reg to promote. 5065 bool SROAPass::deleteDeadInstructions( 5066 SmallPtrSetImpl<AllocaInst *> &DeletedAllocas) { 5067 bool Changed = false; 5068 while (!DeadInsts.empty()) { 5069 Instruction *I = dyn_cast_or_null<Instruction>(DeadInsts.pop_back_val()); 5070 if (!I) 5071 continue; 5072 LLVM_DEBUG(dbgs() << "Deleting dead instruction: " << *I << "\n"); 5073 5074 // If the instruction is an alloca, find the possible dbg.declare connected 5075 // to it, and remove it too. We must do this before calling RAUW or we will 5076 // not be able to find it. 5077 if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) { 5078 DeletedAllocas.insert(AI); 5079 for (DbgVariableIntrinsic *OldDII : FindDbgAddrUses(AI)) 5080 OldDII->eraseFromParent(); 5081 } 5082 5083 at::deleteAssignmentMarkers(I); 5084 I->replaceAllUsesWith(UndefValue::get(I->getType())); 5085 5086 for (Use &Operand : I->operands()) 5087 if (Instruction *U = dyn_cast<Instruction>(Operand)) { 5088 // Zero out the operand and see if it becomes trivially dead. 5089 Operand = nullptr; 5090 if (isInstructionTriviallyDead(U)) 5091 DeadInsts.push_back(U); 5092 } 5093 5094 ++NumDeleted; 5095 I->eraseFromParent(); 5096 Changed = true; 5097 } 5098 return Changed; 5099 } 5100 5101 /// Promote the allocas, using the best available technique. 5102 /// 5103 /// This attempts to promote whatever allocas have been identified as viable in 5104 /// the PromotableAllocas list. If that list is empty, there is nothing to do. 5105 /// This function returns whether any promotion occurred. 5106 bool SROAPass::promoteAllocas(Function &F) { 5107 if (PromotableAllocas.empty()) 5108 return false; 5109 5110 NumPromoted += PromotableAllocas.size(); 5111 5112 LLVM_DEBUG(dbgs() << "Promoting allocas with mem2reg...\n"); 5113 PromoteMemToReg(PromotableAllocas, DTU->getDomTree(), AC); 5114 PromotableAllocas.clear(); 5115 return true; 5116 } 5117 5118 PreservedAnalyses SROAPass::runImpl(Function &F, DomTreeUpdater &RunDTU, 5119 AssumptionCache &RunAC) { 5120 LLVM_DEBUG(dbgs() << "SROA function: " << F.getName() << "\n"); 5121 C = &F.getContext(); 5122 DTU = &RunDTU; 5123 AC = &RunAC; 5124 5125 BasicBlock &EntryBB = F.getEntryBlock(); 5126 for (BasicBlock::iterator I = EntryBB.begin(), E = std::prev(EntryBB.end()); 5127 I != E; ++I) { 5128 if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) { 5129 if (isa<ScalableVectorType>(AI->getAllocatedType())) { 5130 if (isAllocaPromotable(AI)) 5131 PromotableAllocas.push_back(AI); 5132 } else { 5133 Worklist.insert(AI); 5134 } 5135 } 5136 } 5137 5138 bool Changed = false; 5139 bool CFGChanged = false; 5140 // A set of deleted alloca instruction pointers which should be removed from 5141 // the list of promotable allocas. 5142 SmallPtrSet<AllocaInst *, 4> DeletedAllocas; 5143 5144 do { 5145 while (!Worklist.empty()) { 5146 auto [IterationChanged, IterationCFGChanged] = 5147 runOnAlloca(*Worklist.pop_back_val()); 5148 Changed |= IterationChanged; 5149 CFGChanged |= IterationCFGChanged; 5150 5151 Changed |= deleteDeadInstructions(DeletedAllocas); 5152 5153 // Remove the deleted allocas from various lists so that we don't try to 5154 // continue processing them. 
5155 if (!DeletedAllocas.empty()) { 5156 auto IsInSet = [&](AllocaInst *AI) { return DeletedAllocas.count(AI); }; 5157 Worklist.remove_if(IsInSet); 5158 PostPromotionWorklist.remove_if(IsInSet); 5159 llvm::erase_if(PromotableAllocas, IsInSet); 5160 DeletedAllocas.clear(); 5161 } 5162 } 5163 5164 Changed |= promoteAllocas(F); 5165 5166 Worklist = PostPromotionWorklist; 5167 PostPromotionWorklist.clear(); 5168 } while (!Worklist.empty()); 5169 5170 assert((!CFGChanged || Changed) && "Can not only modify the CFG."); 5171 assert((!CFGChanged || !PreserveCFG) && 5172 "Should not have modified the CFG when told to preserve it."); 5173 5174 if (!Changed) 5175 return PreservedAnalyses::all(); 5176 5177 PreservedAnalyses PA; 5178 if (!CFGChanged) 5179 PA.preserveSet<CFGAnalyses>(); 5180 PA.preserve<DominatorTreeAnalysis>(); 5181 return PA; 5182 } 5183 5184 PreservedAnalyses SROAPass::runImpl(Function &F, DominatorTree &RunDT, 5185 AssumptionCache &RunAC) { 5186 DomTreeUpdater DTU(RunDT, DomTreeUpdater::UpdateStrategy::Lazy); 5187 return runImpl(F, DTU, RunAC); 5188 } 5189 5190 PreservedAnalyses SROAPass::run(Function &F, FunctionAnalysisManager &AM) { 5191 return runImpl(F, AM.getResult<DominatorTreeAnalysis>(F), 5192 AM.getResult<AssumptionAnalysis>(F)); 5193 } 5194 5195 void SROAPass::printPipeline( 5196 raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) { 5197 static_cast<PassInfoMixin<SROAPass> *>(this)->printPipeline( 5198 OS, MapClassName2PassName); 5199 OS << (PreserveCFG ? "<preserve-cfg>" : "<modify-cfg>"); 5200 } 5201 5202 SROAPass::SROAPass(SROAOptions PreserveCFG_) 5203 : PreserveCFG(PreserveCFG_ == SROAOptions::PreserveCFG) {} 5204 5205 /// A legacy pass for the legacy pass manager that wraps the \c SROA pass. 5206 /// 5207 /// This is in the llvm namespace purely to allow it to be a friend of the \c 5208 /// SROA pass. 5209 class llvm::sroa::SROALegacyPass : public FunctionPass { 5210 /// The SROA implementation. 5211 SROAPass Impl; 5212 5213 public: 5214 static char ID; 5215 5216 SROALegacyPass(SROAOptions PreserveCFG = SROAOptions::PreserveCFG) 5217 : FunctionPass(ID), Impl(PreserveCFG) { 5218 initializeSROALegacyPassPass(*PassRegistry::getPassRegistry()); 5219 } 5220 5221 bool runOnFunction(Function &F) override { 5222 if (skipFunction(F)) 5223 return false; 5224 5225 auto PA = Impl.runImpl( 5226 F, getAnalysis<DominatorTreeWrapperPass>().getDomTree(), 5227 getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F)); 5228 return !PA.areAllPreserved(); 5229 } 5230 5231 void getAnalysisUsage(AnalysisUsage &AU) const override { 5232 AU.addRequired<AssumptionCacheTracker>(); 5233 AU.addRequired<DominatorTreeWrapperPass>(); 5234 AU.addPreserved<GlobalsAAWrapperPass>(); 5235 AU.addPreserved<DominatorTreeWrapperPass>(); 5236 } 5237 5238 StringRef getPassName() const override { return "SROA"; } 5239 }; 5240 5241 char SROALegacyPass::ID = 0; 5242 5243 FunctionPass *llvm::createSROAPass(bool PreserveCFG) { 5244 return new SROALegacyPass(PreserveCFG ? SROAOptions::PreserveCFG 5245 : SROAOptions::ModifyCFG); 5246 } 5247 5248 INITIALIZE_PASS_BEGIN(SROALegacyPass, "sroa", 5249 "Scalar Replacement Of Aggregates", false, false) 5250 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) 5251 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) 5252 INITIALIZE_PASS_END(SROALegacyPass, "sroa", "Scalar Replacement Of Aggregates", 5253 false, false) 5254
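//===----------------------------------------------------------------------===//
// Usage sketch (illustrative only, not part of the pass implementation): the
// PreserveCFG/ModifyCFG variants printed by printPipeline above can be
// requested from opt with the new pass manager, for example
//   opt -passes='sroa<preserve-cfg>' -S in.ll -o out.ll
//   opt -passes='sroa<modify-cfg>'   -S in.ll -o out.ll
// or programmatically by adding the pass to a FunctionPassManager:
//   llvm::FunctionPassManager FPM;
//   FPM.addPass(llvm::SROAPass(llvm::SROAOptions::PreserveCFG));
//===----------------------------------------------------------------------===//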