//===- SROA.cpp - Scalar Replacement Of Aggregates ------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This transformation implements the well-known scalar replacement of
/// aggregates transformation. It tries to identify promotable elements of an
/// aggregate alloca, and promote them to registers. It will also try to
/// convert uses of an element (or set of elements) of an alloca into a vector
/// or bitfield-style integer scalar if appropriate.
///
/// It works to do this with minimal slicing of the alloca so that regions
/// which are merely transferred in and out of external memory remain unchanged
/// and are not decomposed to scalar code.
///
/// Because this also performs alloca promotion, it can be thought of as also
/// serving the purpose of SSA formation. The algorithm iterates on the
/// function until all opportunities for promotion have been realized.
///
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Scalar/SROA.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/PointerIntPair.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallBitVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/ADT/iterator.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/DomTreeUpdater.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/Loads.h"
#include "llvm/Analysis/PtrUseVisitor.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Config/llvm-config.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/ConstantFolder.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DIBuilder.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfo.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/PromoteMemToReg.h"
#include "llvm/Transforms/Utils/SSAUpdater.h"
#include <algorithm>
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <iterator>
#include <string>
#include <tuple>
#include <utility>
#include <variant>
#include <vector>

using namespace llvm;

#define DEBUG_TYPE "sroa"

STATISTIC(NumAllocasAnalyzed, "Number of allocas analyzed for replacement");
STATISTIC(NumAllocaPartitions, "Number of alloca partitions formed");
STATISTIC(MaxPartitionsPerAlloca, "Maximum number of partitions per alloca");
STATISTIC(NumAllocaPartitionUses, "Number of alloca partition uses rewritten");
STATISTIC(MaxUsesPerAllocaPartition, "Maximum number of uses of a partition");
STATISTIC(NumNewAllocas, "Number of new, smaller allocas introduced");
STATISTIC(NumPromoted, "Number of allocas promoted to SSA values");
STATISTIC(NumLoadsSpeculated, "Number of loads speculated to allow promotion");
STATISTIC(NumLoadsPredicated,
          "Number of loads rewritten into predicated loads to allow promotion");
STATISTIC(
    NumStoresPredicated,
    "Number of stores rewritten into predicated stores to allow promotion");
STATISTIC(NumDeleted, "Number of instructions deleted");
STATISTIC(NumVectorized, "Number of vectorized aggregates");

/// Disable running mem2reg during SROA in order to test or debug SROA.
static cl::opt<bool> SROASkipMem2Reg("sroa-skip-mem2reg", cl::init(false),
                                     cl::Hidden);
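// A usage sketch (hedged; assumes a standard `opt` invocation, where "sroa"
// is the new-PM pass name and hidden cl::opt flags are accepted on the
// command line):
//   opt -passes=sroa -sroa-skip-mem2reg -S input.ll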
namespace {

class AllocaSliceRewriter;
class AllocaSlices;
class Partition;

class SelectHandSpeculativity {
  unsigned char Storage = 0; // None are speculatable by default.
  using TrueVal = Bitfield::Element<bool, 0, 1>;  // Bit 0 (lowest).
  using FalseVal = Bitfield::Element<bool, 1, 1>; // Bit 1.
public:
  SelectHandSpeculativity() = default;
  SelectHandSpeculativity &setAsSpeculatable(bool isTrueVal);
  bool isSpeculatable(bool isTrueVal) const;
  bool areAllSpeculatable() const;
  bool areAnySpeculatable() const;
  bool areNoneSpeculatable() const;
  // For interop as the int half of a PointerIntPair.
  explicit operator intptr_t() const { return static_cast<intptr_t>(Storage); }
  explicit SelectHandSpeculativity(intptr_t Storage_) : Storage(Storage_) {}
};
static_assert(sizeof(SelectHandSpeculativity) == sizeof(unsigned char));

using PossiblySpeculatableLoad =
    PointerIntPair<LoadInst *, 2, SelectHandSpeculativity>;
using UnspeculatableStore = StoreInst *;
using RewriteableMemOp =
    std::variant<PossiblySpeculatableLoad, UnspeculatableStore>;
using RewriteableMemOps = SmallVector<RewriteableMemOp, 2>;

/// An optimization pass providing Scalar Replacement of Aggregates.
///
/// This pass takes allocations which can be completely analyzed (that is, they
/// don't escape) and tries to turn them into scalar SSA values. There are
/// a few steps to this process.
///
/// 1) It takes allocations of aggregates and analyzes the ways in which they
///    are used to try to split them into smaller allocations, ideally of
///    a single scalar data type. It will split up memcpy and memset accesses
///    as necessary and try to isolate individual scalar accesses.
/// 2) It will transform accesses into forms which are suitable for SSA value
///    promotion. This can be replacing a memset with a scalar store of an
///    integer value, or it can involve speculating operations on a PHI or
///    select to be a PHI or select of the results.
/// 3) Finally, this will try to detect a pattern of accesses which map cleanly
///    onto insert and extract operations on a vector value, and convert them to
///    this form. By doing so, it will enable promotion of vector aggregates to
///    SSA vector values.
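///
/// A small illustrative sketch (not drawn from a test case): given
///
///   %p = alloca { i32, i32 }
///   %f1 = getelementptr { i32, i32 }, ptr %p, i32 0, i32 1
///   store i32 7, ptr %f1
///   %v = load i32, ptr %f1
///
/// step (1) splits %p into two i32 allocas, after which promotion replaces
/// %v with the constant 7 and the memory operations disappear entirely.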
class SROA {
  LLVMContext *const C;
  DomTreeUpdater *const DTU;
  AssumptionCache *const AC;
  const bool PreserveCFG;

  /// Worklist of alloca instructions to simplify.
  ///
  /// Each alloca in the function is added to this. Each new alloca formed gets
  /// added to it as well to recursively simplify unless that alloca can be
  /// directly promoted. Finally, each time we rewrite a use of an alloca other
  /// than the one being actively rewritten, we add it back onto the list if
  /// not already present to ensure it is re-visited.
  SmallSetVector<AllocaInst *, 16> Worklist;

  /// A collection of instructions to delete.
  /// We try to batch deletions to simplify code and make things a bit more
  /// efficient. We also make sure there are no dangling pointers.
  SmallVector<WeakVH, 8> DeadInsts;

  /// Post-promotion worklist.
  ///
  /// Sometimes we discover an alloca which has a high probability of becoming
  /// viable for SROA after a round of promotion takes place. In those cases,
  /// the alloca is enqueued here for re-processing.
  ///
  /// Note that we have to be very careful to clear allocas out of this list in
  /// the event they are deleted.
  SmallSetVector<AllocaInst *, 16> PostPromotionWorklist;

  /// A collection of alloca instructions we can directly promote.
  SetVector<AllocaInst *, SmallVector<AllocaInst *>,
            SmallPtrSet<AllocaInst *, 16>, 16>
      PromotableAllocas;

  /// A worklist of PHIs to speculate prior to promoting allocas.
  ///
  /// All of these PHIs have been checked for the safety of speculation and by
  /// being speculated will allow promoting allocas currently in the promotable
  /// queue.
  SmallSetVector<PHINode *, 8> SpeculatablePHIs;

  /// A worklist of select instructions to rewrite prior to promoting
  /// allocas.
  SmallMapVector<SelectInst *, RewriteableMemOps, 8> SelectsToRewrite;

  /// Select instructions that use an alloca and are subsequently loaded can be
  /// rewritten to load both input pointers and then select between the
  /// results, allowing the load of the alloca to be promoted.
  /// From this:
  ///   %P2 = select i1 %cond, ptr %Alloca, ptr %Other
  ///   %V = load <type>, ptr %P2
  /// to:
  ///   %V1 = load <type>, ptr %Alloca      -> will be mem2reg'd
  ///   %V2 = load <type>, ptr %Other
  ///   %V = select i1 %cond, <type> %V1, <type> %V2
  ///
  /// We can do this to a select if its only uses are loads, and if either
  /// the operands of the select can be loaded unconditionally, or we are
  /// allowed to perform CFG modifications.
  /// If an intervening bitcast with a single use of the load is found, the
  /// promotion is still allowed.
  static std::optional<RewriteableMemOps>
  isSafeSelectToSpeculate(SelectInst &SI, bool PreserveCFG);

public:
  SROA(LLVMContext *C, DomTreeUpdater *DTU, AssumptionCache *AC,
       SROAOptions PreserveCFG_)
      : C(C), DTU(DTU), AC(AC),
        PreserveCFG(PreserveCFG_ == SROAOptions::PreserveCFG) {}

  /// Main run method used by both the SROAPass and by the legacy pass.
  std::pair<bool /*Changed*/, bool /*CFGChanged*/> runSROA(Function &F);

private:
  friend class AllocaSliceRewriter;

  bool presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS);
  AllocaInst *rewritePartition(AllocaInst &AI, AllocaSlices &AS, Partition &P);
  bool splitAlloca(AllocaInst &AI, AllocaSlices &AS);
  bool propagateStoredValuesToLoads(AllocaInst &AI, AllocaSlices &AS);
  std::pair<bool /*Changed*/, bool /*CFGChanged*/> runOnAlloca(AllocaInst &AI);
  void clobberUse(Use &U);
  bool deleteDeadInstructions(SmallPtrSetImpl<AllocaInst *> &DeletedAllocas);
  bool promoteAllocas();
};

} // end anonymous namespace

/// Calculate the fragment of a variable to use when slicing a store
/// based on the slice dimensions, existing fragment, and base storage
/// fragment.
/// Results:
/// UseFrag - Use Target as the new fragment.
/// UseNoFrag - The new slice already covers the whole variable.
/// Skip - The new alloca slice doesn't include this variable.
/// FIXME: Can we use calculateFragmentIntersect instead?
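///
/// An illustrative worked example (values chosen for exposition): slicing
/// bits [32, 64) out of storage whose base fragment covers bits [0, 64) of a
/// 64-bit variable yields Target = {OffsetInBits: 32, SizeInBits: 32}. That
/// neither covers the whole variable nor falls outside the current fragment,
/// so the result is UseFrag.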
namespace {
enum FragCalcResult { UseFrag, UseNoFrag, Skip };
}
static FragCalcResult
calculateFragment(DILocalVariable *Variable,
                  uint64_t NewStorageSliceOffsetInBits,
                  uint64_t NewStorageSliceSizeInBits,
                  std::optional<DIExpression::FragmentInfo> StorageFragment,
                  std::optional<DIExpression::FragmentInfo> CurrentFragment,
                  DIExpression::FragmentInfo &Target) {
  // If the base storage describes part of the variable, apply the offset and
  // the size constraint.
  if (StorageFragment) {
    Target.SizeInBits =
        std::min(NewStorageSliceSizeInBits, StorageFragment->SizeInBits);
    Target.OffsetInBits =
        NewStorageSliceOffsetInBits + StorageFragment->OffsetInBits;
  } else {
    Target.SizeInBits = NewStorageSliceSizeInBits;
    Target.OffsetInBits = NewStorageSliceOffsetInBits;
  }

  // If this slice extracts the entirety of an independent variable from a
  // larger alloca, do not produce a fragment expression, as the variable is
  // not fragmented.
  if (!CurrentFragment) {
    if (auto Size = Variable->getSizeInBits()) {
      // Treat the current fragment as covering the whole variable.
      CurrentFragment = DIExpression::FragmentInfo(*Size, 0);
      if (Target == CurrentFragment)
        return UseNoFrag;
    }
  }

  // No additional work to do if there isn't a fragment already, or there is
  // but it already exactly describes the new assignment.
  if (!CurrentFragment || *CurrentFragment == Target)
    return UseFrag;

  // Reject the target fragment if it doesn't fit wholly within the current
  // fragment. TODO: We could instead chop up the target to fit in the case of
  // a partial overlap.
  if (Target.startInBits() < CurrentFragment->startInBits() ||
      Target.endInBits() > CurrentFragment->endInBits())
    return Skip;

  // Target fits within the current fragment, return it.
  return UseFrag;
}

static DebugVariable getAggregateVariable(DbgVariableIntrinsic *DVI) {
  return DebugVariable(DVI->getVariable(), std::nullopt,
                       DVI->getDebugLoc().getInlinedAt());
}
static DebugVariable getAggregateVariable(DbgVariableRecord *DVR) {
  return DebugVariable(DVR->getVariable(), std::nullopt,
                       DVR->getDebugLoc().getInlinedAt());
}

/// Helpers for handling new and old debug info modes in migrateDebugInfo.
/// These overloads unwrap a DbgInstPtr {Instruction* | DbgRecord*} union based
/// on the \p Unused parameter type.
DbgVariableRecord *UnwrapDbgInstPtr(DbgInstPtr P, DbgVariableRecord *Unused) {
  (void)Unused;
  return static_cast<DbgVariableRecord *>(cast<DbgRecord *>(P));
}
DbgAssignIntrinsic *UnwrapDbgInstPtr(DbgInstPtr P, DbgAssignIntrinsic *Unused) {
  (void)Unused;
  return static_cast<DbgAssignIntrinsic *>(cast<Instruction *>(P));
}

/// Find linked dbg.assign and generate a new one with the correct
/// FragmentInfo. Link Inst to the new dbg.assign.  If Value is nullptr the
/// value component is copied from the old dbg.assign to the new.
/// \param OldAlloca             Alloca for the variable before splitting.
/// \param IsSplit               True if the store (not necessarily alloca)
///                              is being split.
/// \param OldAllocaOffsetInBits Offset of the slice taken from OldAlloca.
/// \param SliceSizeInBits       New number of bits being written to.
/// \param OldInst               Instruction that is being split.
/// \param Inst                  New instruction performing this part of the
///                              split store.
/// \param Dest                  Store destination.
/// \param Value                 Stored value.
/// \param DL                    Datalayout.
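///
/// For instance (a hedged sketch, assuming a 128-bit variable wholly backed
/// by OldAlloca): if a 16-byte store carrying a dbg.assign is split into two
/// 8-byte stores, the call for the second store with
/// OldAllocaOffsetInBits = 64 and SliceSizeInBits = 64 produces a dbg.assign
/// whose expression carries a DW_OP_LLVM_fragment covering bits [64, 128).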
static void migrateDebugInfo(AllocaInst *OldAlloca, bool IsSplit,
                             uint64_t OldAllocaOffsetInBits,
                             uint64_t SliceSizeInBits, Instruction *OldInst,
                             Instruction *Inst, Value *Dest, Value *Value,
                             const DataLayout &DL) {
  auto MarkerRange = at::getAssignmentMarkers(OldInst);
  auto DVRAssignMarkerRange = at::getDVRAssignmentMarkers(OldInst);
  // Nothing to do if OldInst has no linked dbg.assign intrinsics.
  if (MarkerRange.empty() && DVRAssignMarkerRange.empty())
    return;

  LLVM_DEBUG(dbgs() << "  migrateDebugInfo\n");
  LLVM_DEBUG(dbgs() << "    OldAlloca: " << *OldAlloca << "\n");
  LLVM_DEBUG(dbgs() << "    IsSplit: " << IsSplit << "\n");
  LLVM_DEBUG(dbgs() << "    OldAllocaOffsetInBits: " << OldAllocaOffsetInBits
                    << "\n");
  LLVM_DEBUG(dbgs() << "    SliceSizeInBits: " << SliceSizeInBits << "\n");
  LLVM_DEBUG(dbgs() << "    OldInst: " << *OldInst << "\n");
  LLVM_DEBUG(dbgs() << "    Inst: " << *Inst << "\n");
  LLVM_DEBUG(dbgs() << "    Dest: " << *Dest << "\n");
  if (Value)
    LLVM_DEBUG(dbgs() << "    Value: " << *Value << "\n");

  /// Map of aggregate variables to their fragment associated with OldAlloca.
  DenseMap<DebugVariable, std::optional<DIExpression::FragmentInfo>>
      BaseFragments;
  for (auto *DAI : at::getAssignmentMarkers(OldAlloca))
    BaseFragments[getAggregateVariable(DAI)] =
        DAI->getExpression()->getFragmentInfo();
  for (auto *DVR : at::getDVRAssignmentMarkers(OldAlloca))
    BaseFragments[getAggregateVariable(DVR)] =
        DVR->getExpression()->getFragmentInfo();

  // The new inst needs a DIAssignID unique metadata tag (if OldInst has
  // one). It shouldn't already have one: assert this assumption.
  assert(!Inst->getMetadata(LLVMContext::MD_DIAssignID));
  DIAssignID *NewID = nullptr;
  auto &Ctx = Inst->getContext();
  DIBuilder DIB(*OldInst->getModule(), /*AllowUnresolved*/ false);
  assert(OldAlloca->isStaticAlloca());

  auto MigrateDbgAssign = [&](auto *DbgAssign) {
    LLVM_DEBUG(dbgs() << "      existing dbg.assign is: " << *DbgAssign
                      << "\n");
    auto *Expr = DbgAssign->getExpression();
    bool SetKillLocation = false;

    if (IsSplit) {
      std::optional<DIExpression::FragmentInfo> BaseFragment;
      {
        auto R = BaseFragments.find(getAggregateVariable(DbgAssign));
        if (R == BaseFragments.end())
          return;
        BaseFragment = R->second;
      }
      std::optional<DIExpression::FragmentInfo> CurrentFragment =
          Expr->getFragmentInfo();
      DIExpression::FragmentInfo NewFragment;
      FragCalcResult Result = calculateFragment(
          DbgAssign->getVariable(), OldAllocaOffsetInBits, SliceSizeInBits,
          BaseFragment, CurrentFragment, NewFragment);

      if (Result == Skip)
        return;
      if (Result == UseFrag && !(NewFragment == CurrentFragment)) {
        if (CurrentFragment) {
          // Rewrite NewFragment to be relative to the existing one (this is
          // what createFragmentExpression wants).  CalculateFragment has
          // already resolved the size for us. FIXME: Should it return the
          // relative fragment too?
          NewFragment.OffsetInBits -= CurrentFragment->OffsetInBits;
        }
        // Add the new fragment info to the existing expression if possible.
        if (auto E = DIExpression::createFragmentExpression(
                Expr, NewFragment.OffsetInBits, NewFragment.SizeInBits)) {
          Expr = *E;
        } else {
          // Otherwise, add the new fragment info to an empty expression and
          // discard the value component of this dbg.assign as the value cannot
          // be computed with the new fragment.
          Expr = *DIExpression::createFragmentExpression(
              DIExpression::get(Expr->getContext(), {}),
              NewFragment.OffsetInBits, NewFragment.SizeInBits);
          SetKillLocation = true;
        }
      }
    }

    // If we haven't created a DIAssignID ID do that now and attach it to Inst.
    if (!NewID) {
      NewID = DIAssignID::getDistinct(Ctx);
      Inst->setMetadata(LLVMContext::MD_DIAssignID, NewID);
    }

    ::Value *NewValue = Value ? Value : DbgAssign->getValue();
    auto *NewAssign = UnwrapDbgInstPtr(
        DIB.insertDbgAssign(Inst, NewValue, DbgAssign->getVariable(), Expr,
                            Dest, DIExpression::get(Expr->getContext(), {}),
                            DbgAssign->getDebugLoc()),
        DbgAssign);

    // If we've updated the value but the original dbg.assign has an arglist
    // then kill it now - we can't use the requested new value.
    // We can't replace the DIArgList with the new value as it'd leave
    // the DIExpression in an invalid state (DW_OP_LLVM_arg operands without
    // an arglist). And we can't keep the DIArgList in case the linked store
    // is being split - in which case the DIArgList + expression may no longer
    // be computing the correct value.
    // This should be a very rare situation as it requires the value being
    // stored to differ from the dbg.assign (i.e., the value has been
    // represented differently in the debug intrinsic for some reason).
    SetKillLocation |=
        Value && (DbgAssign->hasArgList() ||
                  !DbgAssign->getExpression()->isSingleLocationExpression());
    if (SetKillLocation)
      NewAssign->setKillLocation();

    // We could use more precision here at the cost of some additional (code)
    // complexity - if the original dbg.assign was adjacent to its store, we
    // could position this new dbg.assign adjacent to its store rather than the
    // old dbg.assign. That would result in interleaved dbg.assigns rather than
    // what we get now:
    //    split store !1
    //    split store !2
    //    dbg.assign !1
    //    dbg.assign !2
    // This (current behaviour) results in debug assignments being noted as
    // slightly offset (in code) from the store. In practice this should have
    // little effect on the debugging experience due to the fact that all the
    // split stores should get the same line number.
    NewAssign->moveBefore(DbgAssign->getIterator());

    NewAssign->setDebugLoc(DbgAssign->getDebugLoc());
    LLVM_DEBUG(dbgs() << "Created new assign: " << *NewAssign << "\n");
  };

  for_each(MarkerRange, MigrateDbgAssign);
  for_each(DVRAssignMarkerRange, MigrateDbgAssign);
}

namespace {

/// A custom IRBuilder inserter which prefixes all names, but only in
/// Assert builds.
class IRBuilderPrefixedInserter final : public IRBuilderDefaultInserter {
  std::string Prefix;

  Twine getNameWithPrefix(const Twine &Name) const {
    return Name.isTriviallyEmpty() ? Name : Prefix + Name;
  }

public:
  void SetNamePrefix(const Twine &P) { Prefix = P.str(); }

  void InsertHelper(Instruction *I, const Twine &Name,
                    BasicBlock::iterator InsertPt) const override {
    IRBuilderDefaultInserter::InsertHelper(I, getNameWithPrefix(Name),
                                           InsertPt);
  }
};

/// Provide a type for IRBuilder that drops names in release builds.
using IRBuilderTy = IRBuilder<ConstantFolder, IRBuilderPrefixedInserter>;

/// A used slice of an alloca.
///
/// This structure represents a slice of an alloca used by some instruction. It
/// stores both the begin and end offsets of this use, a pointer to the use
/// itself, and a flag indicating whether we can classify the use as splittable
/// or not when forming partitions of the alloca.
class Slice {
  /// The beginning offset of the range.
  uint64_t BeginOffset = 0;

  /// The ending offset, not included in the range.
  uint64_t EndOffset = 0;

  /// Storage for both the use of this slice and whether it can be
  /// split.
  PointerIntPair<Use *, 1, bool> UseAndIsSplittable;

public:
  Slice() = default;

  Slice(uint64_t BeginOffset, uint64_t EndOffset, Use *U, bool IsSplittable)
      : BeginOffset(BeginOffset), EndOffset(EndOffset),
        UseAndIsSplittable(U, IsSplittable) {}

  uint64_t beginOffset() const { return BeginOffset; }
  uint64_t endOffset() const { return EndOffset; }

  bool isSplittable() const { return UseAndIsSplittable.getInt(); }
  void makeUnsplittable() { UseAndIsSplittable.setInt(false); }

  Use *getUse() const { return UseAndIsSplittable.getPointer(); }

  bool isDead() const { return getUse() == nullptr; }
  void kill() { UseAndIsSplittable.setPointer(nullptr); }

  /// Support for ordering ranges.
  ///
  /// This provides an ordering over ranges such that start offsets are
  /// always increasing. Within equal start offsets, unsplittable slices sort
  /// before splittable ones, and within each of those groups the end offsets
  /// are decreasing. Thus the spanning range comes first in a cluster with
  /// the same start position.
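  ///
  /// For example (illustrative), slices with (begin, end, splittable) of
  /// (0, 16, false), (0, 16, true), (0, 8, true), and (4, 8, false) already
  /// appear in this order.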
  bool operator<(const Slice &RHS) const {
    if (beginOffset() < RHS.beginOffset())
      return true;
    if (beginOffset() > RHS.beginOffset())
      return false;
    if (isSplittable() != RHS.isSplittable())
      return !isSplittable();
    if (endOffset() > RHS.endOffset())
      return true;
    return false;
  }

  /// Support comparison with a single offset to allow binary searches.
  friend LLVM_ATTRIBUTE_UNUSED bool operator<(const Slice &LHS,
                                              uint64_t RHSOffset) {
    return LHS.beginOffset() < RHSOffset;
  }
  friend LLVM_ATTRIBUTE_UNUSED bool operator<(uint64_t LHSOffset,
                                              const Slice &RHS) {
    return LHSOffset < RHS.beginOffset();
  }

  bool operator==(const Slice &RHS) const {
    return isSplittable() == RHS.isSplittable() &&
           beginOffset() == RHS.beginOffset() && endOffset() == RHS.endOffset();
  }
  bool operator!=(const Slice &RHS) const { return !operator==(RHS); }
};

/// Representation of the alloca slices.
///
/// This class represents the slices of an alloca which are formed by its
/// various uses. If a pointer escapes, we can't fully build a representation
/// for the slices used and we reflect that in this structure. The uses are
/// stored, sorted by increasing beginning offset and with unsplittable slices
/// starting at a particular offset before splittable slices.
class AllocaSlices {
public:
  /// Construct the slices of a particular alloca.
  AllocaSlices(const DataLayout &DL, AllocaInst &AI);

  /// Test whether a pointer to the allocation escapes our analysis.
  ///
  /// If this is true, the slices are never fully built and should be
  /// ignored.
  bool isEscaped() const { return PointerEscapingInstr; }
  bool isEscapedReadOnly() const { return PointerEscapingInstrReadOnly; }

  /// Support for iterating over the slices.
  /// @{
  using iterator = SmallVectorImpl<Slice>::iterator;
  using range = iterator_range<iterator>;

  iterator begin() { return Slices.begin(); }
  iterator end() { return Slices.end(); }

  using const_iterator = SmallVectorImpl<Slice>::const_iterator;
  using const_range = iterator_range<const_iterator>;

  const_iterator begin() const { return Slices.begin(); }
  const_iterator end() const { return Slices.end(); }
  /// @}

  /// Erase a range of slices.
  void erase(iterator Start, iterator Stop) { Slices.erase(Start, Stop); }

  /// Insert new slices for this alloca.
  ///
  /// This moves the slices into the alloca's slices collection, and re-sorts
  /// everything so that the usual ordering properties of the alloca's slices
  /// hold.
  void insert(ArrayRef<Slice> NewSlices) {
    int OldSize = Slices.size();
    Slices.append(NewSlices.begin(), NewSlices.end());
    auto SliceI = Slices.begin() + OldSize;
    std::stable_sort(SliceI, Slices.end());
    std::inplace_merge(Slices.begin(), SliceI, Slices.end());
  }

  // Forward declare the iterator and range accessor for walking the
  // partitions.
  class partition_iterator;
  iterator_range<partition_iterator> partitions();

  /// Access the dead users for this alloca.
  ArrayRef<Instruction *> getDeadUsers() const { return DeadUsers; }

  /// Access Uses that should be dropped if the alloca is promotable.
  ArrayRef<Use *> getDeadUsesIfPromotable() const {
    return DeadUseIfPromotable;
  }

  /// Access the dead operands referring to this alloca.
  ///
  /// These are operands which cannot actually be used to refer to the
  /// alloca as they are outside its range and the user doesn't correct for
  /// that. These mostly consist of PHI node inputs and the like which we just
  /// need to replace with undef.
  ArrayRef<Use *> getDeadOperands() const { return DeadOperands; }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  void print(raw_ostream &OS, const_iterator I, StringRef Indent = "  ") const;
  void printSlice(raw_ostream &OS, const_iterator I,
                  StringRef Indent = "  ") const;
  void printUse(raw_ostream &OS, const_iterator I,
                StringRef Indent = "  ") const;
  void print(raw_ostream &OS) const;
  void dump(const_iterator I) const;
  void dump() const;
#endif

private:
  template <typename DerivedT, typename RetT = void> class BuilderBase;
  class SliceBuilder;

  friend class AllocaSlices::SliceBuilder;

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  /// Handle to alloca instruction to simplify method interfaces.
  AllocaInst &AI;
#endif

  /// The instruction responsible for this alloca not having a known set
  /// of slices.
  ///
  /// When an instruction (potentially) escapes the pointer to the alloca, we
  /// store a pointer to that here and abort trying to form slices of the
  /// alloca. This will be null if the alloca slices are analyzed successfully.
  Instruction *PointerEscapingInstr;
  Instruction *PointerEscapingInstrReadOnly;

  /// The slices of the alloca.
  ///
  /// We store a vector of the slices formed by uses of the alloca here. This
  /// vector is sorted by increasing begin offset, and then the unsplittable
  /// slices before the splittable ones. See the Slice inner class for more
  /// details.
  SmallVector<Slice, 8> Slices;

  /// Instructions which will become dead if we rewrite the alloca.
  ///
  /// Note that these are not separated by slice. This is because we expect an
  /// alloca to be completely rewritten or not rewritten at all. If rewritten,
  /// all these instructions can simply be removed and replaced with poison as
  /// they come from outside of the allocated space.
  SmallVector<Instruction *, 8> DeadUsers;

  /// Uses which will become dead if we can promote the alloca.
  SmallVector<Use *, 8> DeadUseIfPromotable;

  /// Operands which will become dead if we rewrite the alloca.
  ///
  /// These are operands that in their particular use can be replaced with
  /// poison when we rewrite the alloca. These show up in out-of-bounds inputs
  /// to PHI nodes and the like. They aren't entirely dead (there might be
  /// a GEP back into the bounds using it elsewhere), nor is the PHI, but we
  /// want to swap this particular input for poison to simplify the use lists
  /// of the alloca.
  SmallVector<Use *, 8> DeadOperands;
};

/// A partition of the slices.
///
/// An ephemeral representation for a range of slices which can be viewed as
/// a partition of the alloca. This range represents a span of the alloca's
/// memory which cannot be split, and provides access to all of the slices
/// overlapping some part of the partition.
///
/// Objects of this type are produced by traversing the alloca's slices, but
/// are only ephemeral and not persistent.
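///
/// An illustrative sketch: given slices [0, 8) (unsplittable) and [4, 12)
/// (splittable), traversal yields a partition covering [0, 8) that contains
/// both slices, then an empty partition covering [8, 12) whose split slice
/// tails carry the remainder of [4, 12).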
class Partition {
private:
  friend class AllocaSlices;
  friend class AllocaSlices::partition_iterator;

  using iterator = AllocaSlices::iterator;

  /// The beginning and ending offsets of the alloca for this
  /// partition.
  uint64_t BeginOffset = 0, EndOffset = 0;

  /// The start and end iterators of this partition.
  iterator SI, SJ;

  /// A collection of split slice tails overlapping the partition.
  SmallVector<Slice *, 4> SplitTails;

  /// Raw constructor builds an empty partition starting and ending at
  /// the given iterator.
  Partition(iterator SI) : SI(SI), SJ(SI) {}

public:
  /// The start offset of this partition.
  ///
  /// All of the contained slices start at or after this offset.
  uint64_t beginOffset() const { return BeginOffset; }

  /// The end offset of this partition.
  ///
  /// All of the contained slices end at or before this offset.
  uint64_t endOffset() const { return EndOffset; }

  /// The size of the partition.
  ///
  /// Note that this can never be zero.
  uint64_t size() const {
    assert(BeginOffset < EndOffset && "Partitions must span some bytes!");
    return EndOffset - BeginOffset;
  }

  /// Test whether this partition contains no slices, and merely spans
  /// a region occupied by split slices.
  bool empty() const { return SI == SJ; }

  /// \name Iterate slices that start within the partition.
  /// These may be splittable or unsplittable. They have a begin offset >= the
  /// partition begin offset.
  /// @{
  // FIXME: We should probably define a "concat_iterator" helper and use that
  // to stitch together pointee_iterators over the split tails and the
  // contiguous iterators of the partition. That would give a much nicer
  // interface here. We could then additionally expose filtered iterators for
  // split, unsplit, and unsplittable slices based on the usage patterns.
  iterator begin() const { return SI; }
  iterator end() const { return SJ; }
  /// @}

  /// Get the sequence of split slice tails.
  ///
  /// These tails are of slices which start before this partition but are
  /// split and overlap into the partition. We accumulate these while forming
  /// partitions.
  ArrayRef<Slice *> splitSliceTails() const { return SplitTails; }
};

} // end anonymous namespace

/// An iterator over partitions of the alloca's slices.
///
/// This iterator implements the core algorithm for partitioning the alloca's
/// slices. It is a forward iterator as we don't support backtracking for
/// efficiency reasons, and re-use a single storage area to maintain the
/// current set of split slices.
///
/// It operates directly on AllocaSlices::iterator, walking the slices of the
/// alloca in their sorted order.
class AllocaSlices::partition_iterator
    : public iterator_facade_base<partition_iterator, std::forward_iterator_tag,
                                  Partition> {
  friend class AllocaSlices;

  /// Most of the state for walking the partitions is held in a class
  /// with a nice interface for examining them.
  Partition P;

  /// We need to keep the end of the slices to know when to stop.
  AllocaSlices::iterator SE;

  /// We also need to keep track of the maximum split end offset seen.
  /// FIXME: Do we really?
  uint64_t MaxSplitSliceEndOffset = 0;

  /// Sets the partition to be empty at given iterator, and sets the
  /// end iterator.
  partition_iterator(AllocaSlices::iterator SI, AllocaSlices::iterator SE)
      : P(SI), SE(SE) {
    // If not already at the end, advance our state to form the initial
    // partition.
    if (SI != SE)
      advance();
  }

  /// Advance the iterator to the next partition.
  ///
  /// Requires that the iterator not be at the end of the slices.
  void advance() {
    assert((P.SI != SE || !P.SplitTails.empty()) &&
           "Cannot advance past the end of the slices!");

    // Clear out any split uses which have ended.
    if (!P.SplitTails.empty()) {
      if (P.EndOffset >= MaxSplitSliceEndOffset) {
        // If we've finished all splits, this is easy.
        P.SplitTails.clear();
        MaxSplitSliceEndOffset = 0;
      } else {
        // Remove the uses which have ended in the prior partition. This
        // cannot change the max split slice end because we just checked that
        // the prior partition ended prior to that max.
        llvm::erase_if(P.SplitTails,
                       [&](Slice *S) { return S->endOffset() <= P.EndOffset; });
        assert(llvm::any_of(P.SplitTails,
                            [&](Slice *S) {
                              return S->endOffset() == MaxSplitSliceEndOffset;
                            }) &&
               "Could not find the current max split slice offset!");
        assert(llvm::all_of(P.SplitTails,
                            [&](Slice *S) {
                              return S->endOffset() <= MaxSplitSliceEndOffset;
                            }) &&
               "Max split slice end offset is not actually the max!");
      }
    }

    // If P.SI is already at the end, then we've cleared the split tail and
    // now have an end iterator.
    if (P.SI == SE) {
      assert(P.SplitTails.empty() && "Failed to clear the split slices!");
      return;
    }

    // If we had a non-empty partition previously, set up the state for
    // subsequent partitions.
    if (P.SI != P.SJ) {
      // Accumulate all the splittable slices which started in the old
      // partition into the split list.
      for (Slice &S : P)
        if (S.isSplittable() && S.endOffset() > P.EndOffset) {
          P.SplitTails.push_back(&S);
          MaxSplitSliceEndOffset =
              std::max(S.endOffset(), MaxSplitSliceEndOffset);
        }

      // Start from the end of the previous partition.
      P.SI = P.SJ;

      // If P.SI is now at the end, we at most have a tail of split slices.
      if (P.SI == SE) {
        P.BeginOffset = P.EndOffset;
        P.EndOffset = MaxSplitSliceEndOffset;
        return;
      }

      // If we have split slices and the next slice is after a gap and is not
      // splittable, immediately form an empty partition for the split slices
      // up until the next slice begins.
      if (!P.SplitTails.empty() && P.SI->beginOffset() != P.EndOffset &&
          !P.SI->isSplittable()) {
        P.BeginOffset = P.EndOffset;
        P.EndOffset = P.SI->beginOffset();
        return;
      }
    }

    // OK, we need to consume new slices. Set the end offset based on the
    // current slice, and step SJ past it. The beginning offset of the
    // partition is the beginning offset of the next slice unless we have
    // pre-existing split slices that are continuing, in which case we begin
    // at the prior end offset.
    P.BeginOffset = P.SplitTails.empty() ? P.SI->beginOffset() : P.EndOffset;
    P.EndOffset = P.SI->endOffset();
    ++P.SJ;

    // There are two strategies to form a partition based on whether the
    // partition starts with an unsplittable slice or a splittable slice.
    if (!P.SI->isSplittable()) {
      // When we're forming an unsplittable region, it must always start at
      // the first slice and will extend through its end.
      assert(P.BeginOffset == P.SI->beginOffset());

      // Form a partition including all of the overlapping slices with this
      // unsplittable slice.
      while (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset) {
        if (!P.SJ->isSplittable())
          P.EndOffset = std::max(P.EndOffset, P.SJ->endOffset());
        ++P.SJ;
      }

      // We have a partition across a set of overlapping unsplittable
      // partitions.
      return;
    }

    // If we're starting with a splittable slice, then we need to form
    // a synthetic partition spanning it and any other overlapping splittable
    // slices.
    assert(P.SI->isSplittable() && "Forming a splittable partition!");

    // Collect all of the overlapping splittable slices.
    while (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset &&
           P.SJ->isSplittable()) {
      P.EndOffset = std::max(P.EndOffset, P.SJ->endOffset());
      ++P.SJ;
    }

    // Back up P.EndOffset if we ended the span early when encountering an
    // unsplittable slice. This synthesizes the early end offset of
    // a partition spanning only splittable slices.
    if (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset) {
      assert(!P.SJ->isSplittable());
      P.EndOffset = P.SJ->beginOffset();
    }
  }

public:
  bool operator==(const partition_iterator &RHS) const {
    assert(SE == RHS.SE &&
           "End iterators don't match between compared partition iterators!");

    // The observed positions of partitions are marked by the P.SI iterator and
    // the emptiness of the split slices. The latter is only relevant when
    // P.SI == SE, as the end iterator will additionally have an empty split
    // slices list, but the prior may have the same P.SI and a tail of split
    // slices.
    if (P.SI == RHS.P.SI && P.SplitTails.empty() == RHS.P.SplitTails.empty()) {
      assert(P.SJ == RHS.P.SJ &&
             "Same set of slices formed two different sized partitions!");
      assert(P.SplitTails.size() == RHS.P.SplitTails.size() &&
             "Same slice position with differently sized non-empty split "
             "slice tails!");
      return true;
    }
    return false;
  }

  partition_iterator &operator++() {
    advance();
    return *this;
  }

  Partition &operator*() { return P; }
};

/// A forward range over the partitions of the alloca's slices.
///
/// This accesses an iterator range over the partitions of the alloca's
/// slices. It computes these partitions on the fly based on the overlapping
/// offsets of the slices and the ability to split them. It will visit "empty"
/// partitions to cover regions of the alloca only accessed via split
/// slices.
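///
/// A typical traversal (sketch):
///
///   for (Partition &P : AS.partitions()) { /* rewrite the partition */ }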
iterator_range<AllocaSlices::partition_iterator> AllocaSlices::partitions() {
  return make_range(partition_iterator(begin(), end()),
                    partition_iterator(end(), end()));
}

static Value *foldSelectInst(SelectInst &SI) {
  // If the condition being selected on is a constant or the same value is
  // being selected between, fold the select. Yes, this does (rarely) happen
  // early on.
  if (ConstantInt *CI = dyn_cast<ConstantInt>(SI.getCondition()))
    return SI.getOperand(1 + CI->isZero());
  if (SI.getOperand(1) == SI.getOperand(2))
    return SI.getOperand(1);

  return nullptr;
}

/// A helper that folds a PHI node or a select.
static Value *foldPHINodeOrSelectInst(Instruction &I) {
  if (PHINode *PN = dyn_cast<PHINode>(&I)) {
    // If PN merges together the same value, return that value.
    return PN->hasConstantValue();
  }
  return foldSelectInst(cast<SelectInst>(I));
}

/// Builder for the alloca slices.
///
/// This class builds a set of alloca slices by recursively visiting the uses
/// of an alloca and making a slice for each load and store at each offset.
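///
/// For instance (illustrative): visiting `%a = alloca i64` that has an i32
/// load at offset 0 and an i32 store at offset 4 produces the slices [0, 4)
/// and [4, 8).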
class AllocaSlices::SliceBuilder : public PtrUseVisitor<SliceBuilder> {
  friend class PtrUseVisitor<SliceBuilder>;
  friend class InstVisitor<SliceBuilder>;

  using Base = PtrUseVisitor<SliceBuilder>;

  const uint64_t AllocSize;
  AllocaSlices &AS;

  SmallDenseMap<Instruction *, unsigned> MemTransferSliceMap;
  SmallDenseMap<Instruction *, uint64_t> PHIOrSelectSizes;

  /// Set to de-duplicate dead instructions found in the use walk.
  SmallPtrSet<Instruction *, 4> VisitedDeadInsts;

public:
  SliceBuilder(const DataLayout &DL, AllocaInst &AI, AllocaSlices &AS)
      : PtrUseVisitor<SliceBuilder>(DL),
        AllocSize(DL.getTypeAllocSize(AI.getAllocatedType()).getFixedValue()),
        AS(AS) {}

private:
  void markAsDead(Instruction &I) {
    if (VisitedDeadInsts.insert(&I).second)
      AS.DeadUsers.push_back(&I);
  }

  void insertUse(Instruction &I, const APInt &Offset, uint64_t Size,
                 bool IsSplittable = false) {
    // Completely skip uses which have a zero size or start either before or
    // past the end of the allocation.
    if (Size == 0 || Offset.uge(AllocSize)) {
      LLVM_DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte use @"
                        << Offset
                        << " which has zero size or starts outside of the "
                        << AllocSize << " byte alloca:\n"
                        << "    alloca: " << AS.AI << "\n"
                        << "       use: " << I << "\n");
      return markAsDead(I);
    }

    uint64_t BeginOffset = Offset.getZExtValue();
    uint64_t EndOffset = BeginOffset + Size;

    // Clamp the end offset to the end of the allocation. Note that this is
    // formulated to handle even the case where "BeginOffset + Size" overflows.
    // This may appear superficially to be something we could ignore entirely,
    // but that is not so! There may be widened loads or PHI-node uses where
    // some instructions are dead but not others. We can't completely ignore
    // them, and so have to record at least the information here.
    assert(AllocSize >= BeginOffset); // Established above.
    if (Size > AllocSize - BeginOffset) {
      LLVM_DEBUG(dbgs() << "WARNING: Clamping a " << Size << " byte use @"
                        << Offset << " to remain within the " << AllocSize
                        << " byte alloca:\n"
                        << "    alloca: " << AS.AI << "\n"
                        << "       use: " << I << "\n");
      EndOffset = AllocSize;
    }

    AS.Slices.push_back(Slice(BeginOffset, EndOffset, U, IsSplittable));
  }

  void visitBitCastInst(BitCastInst &BC) {
    if (BC.use_empty())
      return markAsDead(BC);

    return Base::visitBitCastInst(BC);
  }

  void visitAddrSpaceCastInst(AddrSpaceCastInst &ASC) {
    if (ASC.use_empty())
      return markAsDead(ASC);

    return Base::visitAddrSpaceCastInst(ASC);
  }

  void visitGetElementPtrInst(GetElementPtrInst &GEPI) {
    if (GEPI.use_empty())
      return markAsDead(GEPI);

    return Base::visitGetElementPtrInst(GEPI);
  }

  void handleLoadOrStore(Type *Ty, Instruction &I, const APInt &Offset,
                         uint64_t Size, bool IsVolatile) {
    // We allow splitting of non-volatile loads and stores where the type is an
    // integer type. These may be used to implement 'memcpy' or other "transfer
    // of bits" patterns.
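    // For example (an illustrative sketch): a non-volatile i64 load that
    // merely shuttles 8 bytes between allocas is marked splittable here, so
    // later presplitting can break it into smaller integer loads if it
    // straddles multiple partitions.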
    bool IsSplittable =
        Ty->isIntegerTy() && !IsVolatile && DL.typeSizeEqualsStoreSize(Ty);

    insertUse(I, Offset, Size, IsSplittable);
  }

  void visitLoadInst(LoadInst &LI) {
    assert((!LI.isSimple() || LI.getType()->isSingleValueType()) &&
           "All simple FCA loads should have been pre-split");

    // If there is a load with an unknown offset, we can still perform store
    // to load forwarding for other known-offset loads.
    if (!IsOffsetKnown)
      return PI.setEscapedReadOnly(&LI);

    TypeSize Size = DL.getTypeStoreSize(LI.getType());
    if (Size.isScalable()) {
      unsigned VScale = LI.getFunction()->getVScaleValue();
      if (!VScale)
        return PI.setAborted(&LI);

      Size = TypeSize::getFixed(Size.getKnownMinValue() * VScale);
    }

    return handleLoadOrStore(LI.getType(), LI, Offset, Size.getFixedValue(),
                             LI.isVolatile());
  }

  void visitStoreInst(StoreInst &SI) {
    Value *ValOp = SI.getValueOperand();
    if (ValOp == *U)
      return PI.setEscapedAndAborted(&SI);
    if (!IsOffsetKnown)
      return PI.setAborted(&SI);

    TypeSize StoreSize = DL.getTypeStoreSize(ValOp->getType());
    if (StoreSize.isScalable()) {
      unsigned VScale = SI.getFunction()->getVScaleValue();
      if (!VScale)
        return PI.setAborted(&SI);

      StoreSize = TypeSize::getFixed(StoreSize.getKnownMinValue() * VScale);
    }

    uint64_t Size = StoreSize.getFixedValue();

    // If this memory access can be shown to *statically* extend outside the
    // bounds of the allocation, its behavior is undefined, so simply
    // ignore it. Note that this is more strict than the generic clamping
    // behavior of insertUse. We also try to handle cases which might run the
    // risk of overflow.
    // FIXME: We should instead consider the pointer to have escaped if this
    // function is being instrumented for addressing bugs or race conditions.
    if (Size > AllocSize || Offset.ugt(AllocSize - Size)) {
      LLVM_DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte store @"
                        << Offset << " which extends past the end of the "
                        << AllocSize << " byte alloca:\n"
                        << "    alloca: " << AS.AI << "\n"
                        << "       use: " << SI << "\n");
      return markAsDead(SI);
    }

    assert((!SI.isSimple() || ValOp->getType()->isSingleValueType()) &&
           "All simple FCA stores should have been pre-split");
    handleLoadOrStore(ValOp->getType(), SI, Offset, Size, SI.isVolatile());
  }

visitMemSetInst(MemSetInst & II)1174   void visitMemSetInst(MemSetInst &II) {
1175     assert(II.getRawDest() == *U && "Pointer use is not the destination?");
1176     ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength());
1177     if ((Length && Length->getValue() == 0) ||
1178         (IsOffsetKnown && Offset.uge(AllocSize)))
1179       // Zero-length mem transfer intrinsics can be ignored entirely.
1180       return markAsDead(II);
1181 
1182     if (!IsOffsetKnown)
1183       return PI.setAborted(&II);
1184 
1185     insertUse(II, Offset,
1186               Length ? Length->getLimitedValue()
1187                      : AllocSize - Offset.getLimitedValue(),
1188               (bool)Length);
1189   }
1190 
visitMemTransferInst(MemTransferInst & II)1191   void visitMemTransferInst(MemTransferInst &II) {
1192     ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength());
1193     if (Length && Length->getValue() == 0)
1194       // Zero-length mem transfer intrinsics can be ignored entirely.
1195       return markAsDead(II);
1196 
1197     // Because we can visit these intrinsics twice, also check to see if the
1198     // first time marked this instruction as dead. If so, skip it.
1199     if (VisitedDeadInsts.count(&II))
1200       return;
1201 
1202     if (!IsOffsetKnown)
1203       return PI.setAborted(&II);
1204 
1205     // This side of the transfer is completely out-of-bounds, and so we can
1206     // nuke the entire transfer. However, we also need to nuke the other side
1207     // if already added to our partitions.
1208     // FIXME: Yet another place we really should bypass this when
1209     // instrumenting for ASan.
1210     if (Offset.uge(AllocSize)) {
1211       SmallDenseMap<Instruction *, unsigned>::iterator MTPI =
1212           MemTransferSliceMap.find(&II);
1213       if (MTPI != MemTransferSliceMap.end())
1214         AS.Slices[MTPI->second].kill();
1215       return markAsDead(II);
1216     }
1217 
1218     uint64_t RawOffset = Offset.getLimitedValue();
1219     uint64_t Size = Length ? Length->getLimitedValue() : AllocSize - RawOffset;
1220 
1221     // Check for the special case where the same exact value is used for both
1222     // source and dest.
1223     if (*U == II.getRawDest() && *U == II.getRawSource()) {
1224       // For non-volatile transfers this is a no-op.
1225       if (!II.isVolatile())
1226         return markAsDead(II);
1227 
1228       return insertUse(II, Offset, Size, /*IsSplittable=*/false);
1229     }
1230 
1231     // If we have seen both source and destination for a mem transfer, then
1232     // they both point to the same alloca.
1233     bool Inserted;
1234     SmallDenseMap<Instruction *, unsigned>::iterator MTPI;
1235     std::tie(MTPI, Inserted) =
1236         MemTransferSliceMap.insert(std::make_pair(&II, AS.Slices.size()));
1237     unsigned PrevIdx = MTPI->second;
1238     if (!Inserted) {
1239       Slice &PrevP = AS.Slices[PrevIdx];
1240 
1241       // Check if the begin offsets match and this is a non-volatile transfer.
1242       // In that case, we can completely elide the transfer.
1243       if (!II.isVolatile() && PrevP.beginOffset() == RawOffset) {
1244         PrevP.kill();
1245         return markAsDead(II);
1246       }
1247 
1248       // Otherwise we have an offset transfer within the same alloca. We can't
1249       // split those.
1250       PrevP.makeUnsplittable();
1251     }
1252 
1253     // Insert the use now that we've fixed up the splittable nature.
1254     insertUse(II, Offset, Size, /*IsSplittable=*/Inserted && Length);
1255 
1256     // Check that we ended up with a valid index in the map.
1257     assert(AS.Slices[PrevIdx].getUse()->getUser() == &II &&
1258            "Map index doesn't point back to a slice with this user.");
1259   }
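  // Illustrative sketch (hypothetical IR, not part of the original source):
  // for a non-volatile self-copy such as
  //   call void @llvm.memcpy.p0.p0.i64(ptr %a, ptr %a, i64 8, i1 false)
  // the source and destination visits resolve to the same offset, so the
  // transfer is elided entirely via markAsDead and no slice survives.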
1260 
1261   // Disable SROA for any intrinsics except for lifetime intrinsics.
1262   // FIXME: What about debug intrinsics? This matches old behavior, but
1263   // doesn't make sense.
1264   void visitIntrinsicInst(IntrinsicInst &II) {
1265     if (II.isDroppable()) {
1266       AS.DeadUseIfPromotable.push_back(U);
1267       return;
1268     }
1269 
1270     if (!IsOffsetKnown)
1271       return PI.setAborted(&II);
1272 
1273     if (II.isLifetimeStartOrEnd()) {
1274       ConstantInt *Length = cast<ConstantInt>(II.getArgOperand(0));
1275       uint64_t Size = std::min(AllocSize - Offset.getLimitedValue(),
1276                                Length->getLimitedValue());
1277       insertUse(II, Offset, Size, true);
1278       return;
1279     }
1280 
1281     Base::visitIntrinsicInst(II);
1282   }
1283 
1284   Instruction *hasUnsafePHIOrSelectUse(Instruction *Root, uint64_t &Size) {
1285     // We consider any PHI or select that results in a direct load or store of
1286     // the same offset to be a viable use for slicing purposes. These uses
1287     // are considered unsplittable and the size is the maximum loaded or stored
1288     // size.
1289     SmallPtrSet<Instruction *, 4> Visited;
1290     SmallVector<std::pair<Instruction *, Instruction *>, 4> Uses;
1291     Visited.insert(Root);
1292     Uses.push_back(std::make_pair(cast<Instruction>(*U), Root));
1293     const DataLayout &DL = Root->getDataLayout();
1294     // If there are no loads or stores, the access is dead. We mark that as
1295     // a size zero access.
1296     Size = 0;
1297     do {
1298       Instruction *I, *UsedI;
1299       std::tie(UsedI, I) = Uses.pop_back_val();
1300 
1301       if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
1302         TypeSize LoadSize = DL.getTypeStoreSize(LI->getType());
1303         if (LoadSize.isScalable()) {
1304           PI.setAborted(LI);
1305           return nullptr;
1306         }
1307         Size = std::max(Size, LoadSize.getFixedValue());
1308         continue;
1309       }
1310       if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
1311         Value *Op = SI->getOperand(0);
1312         if (Op == UsedI)
1313           return SI;
1314         TypeSize StoreSize = DL.getTypeStoreSize(Op->getType());
1315         if (StoreSize.isScalable()) {
1316           PI.setAborted(SI);
1317           return nullptr;
1318         }
1319         Size = std::max(Size, StoreSize.getFixedValue());
1320         continue;
1321       }
1322 
1323       if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(I)) {
1324         if (!GEP->hasAllZeroIndices())
1325           return GEP;
1326       } else if (!isa<BitCastInst>(I) && !isa<PHINode>(I) &&
1327                  !isa<SelectInst>(I) && !isa<AddrSpaceCastInst>(I)) {
1328         return I;
1329       }
1330 
1331       for (User *U : I->users())
1332         if (Visited.insert(cast<Instruction>(U)).second)
1333           Uses.push_back(std::make_pair(I, cast<Instruction>(U)));
1334     } while (!Uses.empty());
1335 
1336     return nullptr;
1337   }
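  // Illustrative sketch (hypothetical IR, not part of the original source):
  // with
  //   %p = phi ptr [ %a, %bb1 ], [ %b, %bb2 ]
  //   store ptr %p, ptr %q
  // the PHI's own value is what gets stored (Op == UsedI above), so the store
  // is returned as the unsafe user; a plain "load i32, ptr %p" would instead
  // simply raise Size to 4.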
1338 
1339   void visitPHINodeOrSelectInst(Instruction &I) {
1340     assert(isa<PHINode>(I) || isa<SelectInst>(I));
1341     if (I.use_empty())
1342       return markAsDead(I);
1343 
1344     // If this is a PHI node before a catchswitch, we cannot insert the non-PHI
1345     // instructions that rewriting may require into this BB. Bail out on these
1346     // cases.
1347     if (isa<PHINode>(I) &&
1348         I.getParent()->getFirstInsertionPt() == I.getParent()->end())
1349       return PI.setAborted(&I);
1350 
1351     // TODO: We could use simplifyInstruction here to fold PHINodes and
1352     // SelectInsts. However, doing so requires changing the current
1353     // dead-operand-tracking mechanism. For instance, suppose neither loading
1354     // from %U nor %other traps. Then "load (select undef, %U, %other)" does not
1355     // trap either.  However, if we simply replace %U with undef using the
1356     // current dead-operand-tracking mechanism, "load (select undef, undef,
1357     // %other)" may trap because the select may return the first operand
1358     // "undef".
1359     if (Value *Result = foldPHINodeOrSelectInst(I)) {
1360       if (Result == *U)
1361         // If the result of the constant fold will be the pointer, recurse
1362         // through the PHI/select as if we had RAUW'ed it.
1363         enqueueUsers(I);
1364       else
1365         // Otherwise the operand to the PHI/select is dead, and we can replace
1366         // it with poison.
1367         AS.DeadOperands.push_back(U);
1368 
1369       return;
1370     }
1371 
1372     if (!IsOffsetKnown)
1373       return PI.setAborted(&I);
1374 
1375     // See if we already have computed info on this node.
1376     uint64_t &Size = PHIOrSelectSizes[&I];
1377     if (!Size) {
1378       // This is a new PHI/Select, check for an unsafe use of it.
1379       if (Instruction *UnsafeI = hasUnsafePHIOrSelectUse(&I, Size))
1380         return PI.setAborted(UnsafeI);
1381     }
1382 
1383     // For PHI and select operands outside the alloca, we can't nuke the entire
1384     // phi or select -- the other side might still be relevant, so we special
1385     // case them here and use a separate structure to track the operands
1386     // themselves which should be replaced with poison.
1387     // FIXME: This should instead be escaped in the event we're instrumenting
1388     // for address sanitization.
1389     if (Offset.uge(AllocSize)) {
1390       AS.DeadOperands.push_back(U);
1391       return;
1392     }
1393 
1394     insertUse(I, Offset, Size);
1395   }
1396 
1397   void visitPHINode(PHINode &PN) { visitPHINodeOrSelectInst(PN); }
1398 
1399   void visitSelectInst(SelectInst &SI) { visitPHINodeOrSelectInst(SI); }
1400 
1401   /// Disable SROA entirely if there are unhandled users of the alloca.
1402   void visitInstruction(Instruction &I) { PI.setAborted(&I); }
1403 
1404   void visitCallBase(CallBase &CB) {
1405     // If the call only reads through the operand and captures at most its
1406     // address (not its full provenance), mark the pointer as EscapedReadOnly.
1407     if (CB.isDataOperand(U) &&
1408         !capturesFullProvenance(CB.getCaptureInfo(U->getOperandNo())) &&
1409         CB.onlyReadsMemory(U->getOperandNo())) {
1410       PI.setEscapedReadOnly(&CB);
1411       return;
1412     }
1413 
1414     Base::visitCallBase(CB);
1415   }
1416 };
1417 
1418 AllocaSlices::AllocaSlices(const DataLayout &DL, AllocaInst &AI)
1419     :
1420 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1421       AI(AI),
1422 #endif
1423       PointerEscapingInstr(nullptr), PointerEscapingInstrReadOnly(nullptr) {
1424   SliceBuilder PB(DL, AI, *this);
1425   SliceBuilder::PtrInfo PtrI = PB.visitPtr(AI);
1426   if (PtrI.isEscaped() || PtrI.isAborted()) {
1427     // FIXME: We should sink the escape vs. abort info into the caller nicely,
1428     // possibly by just storing the PtrInfo in the AllocaSlices.
1429     PointerEscapingInstr = PtrI.getEscapingInst() ? PtrI.getEscapingInst()
1430                                                   : PtrI.getAbortingInst();
1431     assert(PointerEscapingInstr && "Did not track a bad instruction");
1432     return;
1433   }
1434   PointerEscapingInstrReadOnly = PtrI.getEscapedReadOnlyInst();
1435 
1436   llvm::erase_if(Slices, [](const Slice &S) { return S.isDead(); });
1437 
1438   // Sort the slices. This arranges for the begin offsets to be in ascending
1439   // order, and the sizes to be in descending order.
1440   llvm::stable_sort(Slices);
1441 }
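// Illustrative note (not from the original source): slices beginning at the
// same offset cluster with the spanning slice first, e.g. {[0,4), [4,8),
// [0,8)} sorts to [0,8), [0,4), [4,8).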
1442 
1443 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1444 
1445 void AllocaSlices::print(raw_ostream &OS, const_iterator I,
1446                          StringRef Indent) const {
1447   printSlice(OS, I, Indent);
1448   OS << "\n";
1449   printUse(OS, I, Indent);
1450 }
1451 
1452 void AllocaSlices::printSlice(raw_ostream &OS, const_iterator I,
1453                               StringRef Indent) const {
1454   OS << Indent << "[" << I->beginOffset() << "," << I->endOffset() << ")"
1455      << " slice #" << (I - begin())
1456      << (I->isSplittable() ? " (splittable)" : "");
1457 }
1458 
1459 void AllocaSlices::printUse(raw_ostream &OS, const_iterator I,
1460                             StringRef Indent) const {
1461   OS << Indent << "  used by: " << *I->getUse()->getUser() << "\n";
1462 }
1463 
1464 void AllocaSlices::print(raw_ostream &OS) const {
1465   if (PointerEscapingInstr) {
1466     OS << "Can't analyze slices for alloca: " << AI << "\n"
1467        << "  A pointer to this alloca escaped by:\n"
1468        << "  " << *PointerEscapingInstr << "\n";
1469     return;
1470   }
1471 
1472   if (PointerEscapingInstrReadOnly)
1473     OS << "Escapes into ReadOnly: " << *PointerEscapingInstrReadOnly << "\n";
1474 
1475   OS << "Slices of alloca: " << AI << "\n";
1476   for (const_iterator I = begin(), E = end(); I != E; ++I)
1477     print(OS, I);
1478 }
1479 
1480 LLVM_DUMP_METHOD void AllocaSlices::dump(const_iterator I) const {
1481   print(dbgs(), I);
1482 }
1483 LLVM_DUMP_METHOD void AllocaSlices::dump() const { print(dbgs()); }
1484 
1485 #endif // !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1486 
1487 /// Walk the range of a partitioning looking for a common type to cover this
1488 /// sequence of slices.
1489 static std::pair<Type *, IntegerType *>
1490 findCommonType(AllocaSlices::const_iterator B, AllocaSlices::const_iterator E,
1491                uint64_t EndOffset) {
1492   Type *Ty = nullptr;
1493   bool TyIsCommon = true;
1494   IntegerType *ITy = nullptr;
1495 
1496   // Note that we need to look at *every* alloca slice's Use to ensure we
1497   // always get consistent results regardless of the order of slices.
1498   for (AllocaSlices::const_iterator I = B; I != E; ++I) {
1499     Use *U = I->getUse();
1500     if (isa<IntrinsicInst>(*U->getUser()))
1501       continue;
1502     if (I->beginOffset() != B->beginOffset() || I->endOffset() != EndOffset)
1503       continue;
1504 
1505     Type *UserTy = nullptr;
1506     if (LoadInst *LI = dyn_cast<LoadInst>(U->getUser())) {
1507       UserTy = LI->getType();
1508     } else if (StoreInst *SI = dyn_cast<StoreInst>(U->getUser())) {
1509       UserTy = SI->getValueOperand()->getType();
1510     }
1511 
1512     if (IntegerType *UserITy = dyn_cast_or_null<IntegerType>(UserTy)) {
1513       // If the type is larger than the partition, skip it. We only encounter
1514       // this for split integer operations where we want to use the type of the
1515       // entity causing the split. Also skip if the type is not a byte width
1516       // multiple.
1517       if (UserITy->getBitWidth() % 8 != 0 ||
1518           UserITy->getBitWidth() / 8 > (EndOffset - B->beginOffset()))
1519         continue;
1520 
1521       // Track the largest bitwidth integer type used in this way in case there
1522       // is no common type.
1523       if (!ITy || ITy->getBitWidth() < UserITy->getBitWidth())
1524         ITy = UserITy;
1525     }
1526 
1527     // To avoid depending on the order of slices, Ty and TyIsCommon must not
1528     // depend on types skipped above.
1529     if (!UserTy || (Ty && Ty != UserTy))
1530       TyIsCommon = false; // Give up on anything but an iN type.
1531     else
1532       Ty = UserTy;
1533   }
1534 
1535   return {TyIsCommon ? Ty : nullptr, ITy};
1536 }
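// Illustrative sketch (hypothetical types, not from the original source): if
// one slice loads the whole partition as float and another stores it as i32,
// no common type exists, so the result is {nullptr, i32} and the caller can
// still fall back on the i32 integer type.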
1537 
1538 /// PHI instructions that use an alloca and are subsequently loaded can be
1539 /// rewritten to load both input pointers in the pred blocks and then PHI the
1540 /// results, allowing the load of the alloca to be promoted.
1541 /// From this:
1542 ///   %P2 = phi [i32* %Alloca, i32* %Other]
1543 ///   %V = load i32* %P2
1544 /// to:
1545 ///   %V1 = load i32* %Alloca      -> will be mem2reg'd
1546 ///   ...
1547 ///   %V2 = load i32* %Other
1548 ///   ...
1549 ///   %V = phi [i32 %V1, i32 %V2]
1550 ///
1551 /// We can do this to a select if its only uses are loads and if the operands
1552 /// to the select can be loaded unconditionally.
1553 ///
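/// For a select the rewrite is analogous (an illustrative sketch, not from
/// the original comment):
///   %P2 = select i1 %cond, i32* %Alloca, i32* %Other
///   %V = load i32* %P2
/// becomes:
///   %V1 = load i32* %Alloca      -> will be mem2reg'd
///   %V2 = load i32* %Other
///   %V = select i1 %cond, i32 %V1, i32 %V2
///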
1554 /// FIXME: This should be hoisted into a generic utility, likely in
1555 /// Transforms/Util/Local.h
1556 static bool isSafePHIToSpeculate(PHINode &PN) {
1557   const DataLayout &DL = PN.getDataLayout();
1558 
1559   // For now, we can only do this promotion if the load is in the same block
1560   // as the PHI, and if there are no stores between the phi and load.
1561   // TODO: Allow recursive phi users.
1562   // TODO: Allow stores.
1563   BasicBlock *BB = PN.getParent();
1564   Align MaxAlign;
1565   uint64_t APWidth = DL.getIndexTypeSizeInBits(PN.getType());
1566   Type *LoadType = nullptr;
1567   for (User *U : PN.users()) {
1568     LoadInst *LI = dyn_cast<LoadInst>(U);
1569     if (!LI || !LI->isSimple())
1570       return false;
1571 
1572     // For now we only allow loads in the same block as the PHI.  This is
1573     // a common case that happens when instcombine merges two loads through
1574     // a PHI.
1575     if (LI->getParent() != BB)
1576       return false;
1577 
1578     if (LoadType) {
1579       if (LoadType != LI->getType())
1580         return false;
1581     } else {
1582       LoadType = LI->getType();
1583     }
1584 
1585     // Ensure that there are no instructions between the PHI and the load that
1586     // could store.
1587     for (BasicBlock::iterator BBI(PN); &*BBI != LI; ++BBI)
1588       if (BBI->mayWriteToMemory())
1589         return false;
1590 
1591     MaxAlign = std::max(MaxAlign, LI->getAlign());
1592   }
1593 
1594   if (!LoadType)
1595     return false;
1596 
1597   APInt LoadSize =
1598       APInt(APWidth, DL.getTypeStoreSize(LoadType).getFixedValue());
1599 
1600   // We can only transform this if it is safe to push the loads into the
1601   // predecessor blocks. The only thing to watch out for is that we can't put
1602   // a possibly trapping load in the predecessor if it is a critical edge.
1603   for (unsigned Idx = 0, Num = PN.getNumIncomingValues(); Idx != Num; ++Idx) {
1604     Instruction *TI = PN.getIncomingBlock(Idx)->getTerminator();
1605     Value *InVal = PN.getIncomingValue(Idx);
1606 
1607     // If the value is produced by the terminator of the predecessor (an
1608     // invoke) or it has side-effects, there is no valid place to put a load
1609     // in the predecessor.
1610     if (TI == InVal || TI->mayHaveSideEffects())
1611       return false;
1612 
1613     // If the predecessor has a single successor, then the edge isn't
1614     // critical.
1615     if (TI->getNumSuccessors() == 1)
1616       continue;
1617 
1618     // If this pointer is always safe to load, or if we can prove that there
1619     // is already a load in the block, then we can move the load to the pred
1620     // block.
1621     if (isSafeToLoadUnconditionally(InVal, MaxAlign, LoadSize, DL, TI))
1622       continue;
1623 
1624     return false;
1625   }
1626 
1627   return true;
1628 }
1629 
1630 static void speculatePHINodeLoads(IRBuilderTy &IRB, PHINode &PN) {
1631   LLVM_DEBUG(dbgs() << "    original: " << PN << "\n");
1632 
1633   LoadInst *SomeLoad = cast<LoadInst>(PN.user_back());
1634   Type *LoadTy = SomeLoad->getType();
1635   IRB.SetInsertPoint(&PN);
1636   PHINode *NewPN = IRB.CreatePHI(LoadTy, PN.getNumIncomingValues(),
1637                                  PN.getName() + ".sroa.speculated");
1638 
1639   // Get the AA tags and alignment to use from one of the loads. It does not
1640   // matter which load we take them from, nor whether any of them differ.
1641   AAMDNodes AATags = SomeLoad->getAAMetadata();
1642   Align Alignment = SomeLoad->getAlign();
1643 
1644   // Rewrite all loads of the PN to use the new PHI.
1645   while (!PN.use_empty()) {
1646     LoadInst *LI = cast<LoadInst>(PN.user_back());
1647     LI->replaceAllUsesWith(NewPN);
1648     LI->eraseFromParent();
1649   }
1650 
1651   // Inject loads into all of the pred blocks.
1652   DenseMap<BasicBlock *, Value *> InjectedLoads;
1653   for (unsigned Idx = 0, Num = PN.getNumIncomingValues(); Idx != Num; ++Idx) {
1654     BasicBlock *Pred = PN.getIncomingBlock(Idx);
1655     Value *InVal = PN.getIncomingValue(Idx);
1656 
1657     // A PHI node is allowed to have multiple (duplicated) entries for the same
1658     // basic block, as long as the value is the same. So if we already injected
1659     // a load in the predecessor, then we should reuse the same load for all
1660     // duplicated entries.
1661     if (Value *V = InjectedLoads.lookup(Pred)) {
1662       NewPN->addIncoming(V, Pred);
1663       continue;
1664     }
1665 
1666     Instruction *TI = Pred->getTerminator();
1667     IRB.SetInsertPoint(TI);
1668 
1669     LoadInst *Load = IRB.CreateAlignedLoad(
1670         LoadTy, InVal, Alignment,
1671         (PN.getName() + ".sroa.speculate.load." + Pred->getName()));
1672     ++NumLoadsSpeculated;
1673     if (AATags)
1674       Load->setAAMetadata(AATags);
1675     NewPN->addIncoming(Load, Pred);
1676     InjectedLoads[Pred] = Load;
1677   }
1678 
1679   LLVM_DEBUG(dbgs() << "          speculated to: " << *NewPN << "\n");
1680   PN.eraseFromParent();
1681 }
1682 
1683 SelectHandSpeculativity &
1684 SelectHandSpeculativity::setAsSpeculatable(bool isTrueVal) {
1685   if (isTrueVal)
1686     Bitfield::set<SelectHandSpeculativity::TrueVal>(Storage, true);
1687   else
1688     Bitfield::set<SelectHandSpeculativity::FalseVal>(Storage, true);
1689   return *this;
1690 }
1691 
1692 bool SelectHandSpeculativity::isSpeculatable(bool isTrueVal) const {
1693   return isTrueVal ? Bitfield::get<SelectHandSpeculativity::TrueVal>(Storage)
1694                    : Bitfield::get<SelectHandSpeculativity::FalseVal>(Storage);
1695 }
1696 
1697 bool SelectHandSpeculativity::areAllSpeculatable() const {
1698   return isSpeculatable(/*isTrueVal=*/true) &&
1699          isSpeculatable(/*isTrueVal=*/false);
1700 }
1701 
1702 bool SelectHandSpeculativity::areAnySpeculatable() const {
1703   return isSpeculatable(/*isTrueVal=*/true) ||
1704          isSpeculatable(/*isTrueVal=*/false);
1705 }
1706 bool SelectHandSpeculativity::areNoneSpeculatable() const {
1707   return !areAnySpeculatable();
1708 }
1709 
1710 static SelectHandSpeculativity
1711 isSafeLoadOfSelectToSpeculate(LoadInst &LI, SelectInst &SI, bool PreserveCFG) {
1712   assert(LI.isSimple() && "Only for simple loads");
1713   SelectHandSpeculativity Spec;
1714 
1715   const DataLayout &DL = SI.getDataLayout();
1716   for (Value *Value : {SI.getTrueValue(), SI.getFalseValue()})
1717     if (isSafeToLoadUnconditionally(Value, LI.getType(), LI.getAlign(), DL,
1718                                     &LI))
1719       Spec.setAsSpeculatable(/*isTrueVal=*/Value == SI.getTrueValue());
1720     else if (PreserveCFG)
1721       return Spec;
1722 
1723   return Spec;
1724 }
1725 
1726 std::optional<RewriteableMemOps>
1727 SROA::isSafeSelectToSpeculate(SelectInst &SI, bool PreserveCFG) {
1728   RewriteableMemOps Ops;
1729 
1730   for (User *U : SI.users()) {
1731     if (auto *BC = dyn_cast<BitCastInst>(U); BC && BC->hasOneUse())
1732       U = *BC->user_begin();
1733 
1734     if (auto *Store = dyn_cast<StoreInst>(U)) {
1735       // Note that atomic stores can be transformed; atomic semantics do not
1736       // have any meaning for a local alloca. Stores are not speculatable,
1737       // however, so if we can't turn it into a predicated store, we are done.
1738       if (Store->isVolatile() || PreserveCFG)
1739         return {}; // Give up on this `select`.
1740       Ops.emplace_back(Store);
1741       continue;
1742     }
1743 
1744     auto *LI = dyn_cast<LoadInst>(U);
1745 
1746     // Note that atomic loads can be transformed;
1747     // atomic semantics do not have any meaning for a local alloca.
1748     if (!LI || LI->isVolatile())
1749       return {}; // Give up on this `select`.
1750 
1751     PossiblySpeculatableLoad Load(LI);
1752     if (!LI->isSimple()) {
1753       // If the `load` is not simple, we can't speculatively execute it,
1754       // but we could handle this via a CFG modification. But can we?
1755       if (PreserveCFG)
1756         return {}; // Give up on this `select`.
1757       Ops.emplace_back(Load);
1758       continue;
1759     }
1760 
1761     SelectHandSpeculativity Spec =
1762         isSafeLoadOfSelectToSpeculate(*LI, SI, PreserveCFG);
1763     if (PreserveCFG && !Spec.areAllSpeculatable())
1764       return {}; // Give up on this `select`.
1765 
1766     Load.setInt(Spec);
1767     Ops.emplace_back(Load);
1768   }
1769 
1770   return Ops;
1771 }
1772 
1773 static void speculateSelectInstLoads(SelectInst &SI, LoadInst &LI,
1774                                      IRBuilderTy &IRB) {
1775   LLVM_DEBUG(dbgs() << "    original load: " << SI << "\n");
1776 
1777   Value *TV = SI.getTrueValue();
1778   Value *FV = SI.getFalseValue();
1779   // Replace the given load of the select with a select of two loads.
1780 
1781   assert(LI.isSimple() && "We only speculate simple loads");
1782 
1783   IRB.SetInsertPoint(&LI);
1784 
1785   LoadInst *TL =
1786       IRB.CreateAlignedLoad(LI.getType(), TV, LI.getAlign(),
1787                             LI.getName() + ".sroa.speculate.load.true");
1788   LoadInst *FL =
1789       IRB.CreateAlignedLoad(LI.getType(), FV, LI.getAlign(),
1790                             LI.getName() + ".sroa.speculate.load.false");
1791   NumLoadsSpeculated += 2;
1792 
1793   // Transfer alignment and AA info if present.
1794   TL->setAlignment(LI.getAlign());
1795   FL->setAlignment(LI.getAlign());
1796 
1797   AAMDNodes Tags = LI.getAAMetadata();
1798   if (Tags) {
1799     TL->setAAMetadata(Tags);
1800     FL->setAAMetadata(Tags);
1801   }
1802 
1803   Value *V = IRB.CreateSelect(SI.getCondition(), TL, FL,
1804                               LI.getName() + ".sroa.speculated");
1805 
1806   LLVM_DEBUG(dbgs() << "          speculated to: " << *V << "\n");
1807   LI.replaceAllUsesWith(V);
1808 }
1809 
1810 template <typename T>
1811 static void rewriteMemOpOfSelect(SelectInst &SI, T &I,
1812                                  SelectHandSpeculativity Spec,
1813                                  DomTreeUpdater &DTU) {
1814   assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && "Only for load and store!");
1815   LLVM_DEBUG(dbgs() << "    original mem op: " << I << "\n");
1816   BasicBlock *Head = I.getParent();
1817   Instruction *ThenTerm = nullptr;
1818   Instruction *ElseTerm = nullptr;
1819   if (Spec.areNoneSpeculatable())
1820     SplitBlockAndInsertIfThenElse(SI.getCondition(), &I, &ThenTerm, &ElseTerm,
1821                                   SI.getMetadata(LLVMContext::MD_prof), &DTU);
1822   else {
1823     SplitBlockAndInsertIfThen(SI.getCondition(), &I, /*Unreachable=*/false,
1824                               SI.getMetadata(LLVMContext::MD_prof), &DTU,
1825                               /*LI=*/nullptr, /*ThenBlock=*/nullptr);
1826     if (Spec.isSpeculatable(/*isTrueVal=*/true))
1827       cast<BranchInst>(Head->getTerminator())->swapSuccessors();
1828   }
1829   auto *HeadBI = cast<BranchInst>(Head->getTerminator());
1830   Spec = {}; // Do not use `Spec` beyond this point.
1831   BasicBlock *Tail = I.getParent();
1832   Tail->setName(Head->getName() + ".cont");
1833   PHINode *PN;
1834   if (isa<LoadInst>(I))
1835     PN = PHINode::Create(I.getType(), 2, "", I.getIterator());
1836   for (BasicBlock *SuccBB : successors(Head)) {
1837     bool IsThen = SuccBB == HeadBI->getSuccessor(0);
1838     int SuccIdx = IsThen ? 0 : 1;
1839     auto *NewMemOpBB = SuccBB == Tail ? Head : SuccBB;
1840     auto &CondMemOp = cast<T>(*I.clone());
1841     if (NewMemOpBB != Head) {
1842       NewMemOpBB->setName(Head->getName() + (IsThen ? ".then" : ".else"));
1843       if (isa<LoadInst>(I))
1844         ++NumLoadsPredicated;
1845       else
1846         ++NumStoresPredicated;
1847     } else {
1848       CondMemOp.dropUBImplyingAttrsAndMetadata();
1849       ++NumLoadsSpeculated;
1850     }
1851     CondMemOp.insertBefore(NewMemOpBB->getTerminator()->getIterator());
1852     Value *Ptr = SI.getOperand(1 + SuccIdx);
1853     CondMemOp.setOperand(I.getPointerOperandIndex(), Ptr);
1854     if (isa<LoadInst>(I)) {
1855       CondMemOp.setName(I.getName() + (IsThen ? ".then" : ".else") + ".val");
1856       PN->addIncoming(&CondMemOp, NewMemOpBB);
1857     } else
1858       LLVM_DEBUG(dbgs() << "                 to: " << CondMemOp << "\n");
1859   }
1860   if (isa<LoadInst>(I)) {
1861     PN->takeName(&I);
1862     LLVM_DEBUG(dbgs() << "          to: " << *PN << "\n");
1863     I.replaceAllUsesWith(PN);
1864   }
1865 }
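// Illustrative sketch (hypothetical IR, not from the original source):
// predicating "%v = load i32, ptr %sel" when neither hand of the select is
// speculatable produces roughly:
//   head:       br i1 %cond, label %head.then, label %head.else
//   head.then:  %v.then.val = load i32, ptr %tv
//               br label %head.cont
//   head.else:  %v.else.val = load i32, ptr %fv
//               br label %head.cont
//   head.cont:  %v = phi i32 [ %v.then.val, %head.then ],
//                            [ %v.else.val, %head.else ]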
1866 
1867 static void rewriteMemOpOfSelect(SelectInst &SelInst, Instruction &I,
1868                                  SelectHandSpeculativity Spec,
1869                                  DomTreeUpdater &DTU) {
1870   if (auto *LI = dyn_cast<LoadInst>(&I))
1871     rewriteMemOpOfSelect(SelInst, *LI, Spec, DTU);
1872   else if (auto *SI = dyn_cast<StoreInst>(&I))
1873     rewriteMemOpOfSelect(SelInst, *SI, Spec, DTU);
1874   else
1875     llvm_unreachable_internal("Only for load and store.");
1876 }
1877 
1878 static bool rewriteSelectInstMemOps(SelectInst &SI,
1879                                     const RewriteableMemOps &Ops,
1880                                     IRBuilderTy &IRB, DomTreeUpdater *DTU) {
1881   bool CFGChanged = false;
1882   LLVM_DEBUG(dbgs() << "    original select: " << SI << "\n");
1883 
1884   for (const RewriteableMemOp &Op : Ops) {
1885     SelectHandSpeculativity Spec;
1886     Instruction *I;
1887     if (auto *const *US = std::get_if<UnspeculatableStore>(&Op)) {
1888       I = *US;
1889     } else {
1890       auto PSL = std::get<PossiblySpeculatableLoad>(Op);
1891       I = PSL.getPointer();
1892       Spec = PSL.getInt();
1893     }
1894     if (Spec.areAllSpeculatable()) {
1895       speculateSelectInstLoads(SI, cast<LoadInst>(*I), IRB);
1896     } else {
1897       assert(DTU && "Should not get here when not allowed to modify the CFG!");
1898       rewriteMemOpOfSelect(SI, *I, Spec, *DTU);
1899       CFGChanged = true;
1900     }
1901     I->eraseFromParent();
1902   }
1903 
1904   for (User *U : make_early_inc_range(SI.users()))
1905     cast<BitCastInst>(U)->eraseFromParent();
1906   SI.eraseFromParent();
1907   return CFGChanged;
1908 }
1909 
1910 /// Compute an adjusted pointer from Ptr by Offset bytes where the
1911 /// resulting pointer has PointerTy.
1912 static Value *getAdjustedPtr(IRBuilderTy &IRB, const DataLayout &DL, Value *Ptr,
1913                              APInt Offset, Type *PointerTy,
1914                              const Twine &NamePrefix) {
1915   if (Offset != 0)
1916     Ptr = IRB.CreateInBoundsPtrAdd(Ptr, IRB.getInt(Offset),
1917                                    NamePrefix + "sroa_idx");
1918   return IRB.CreatePointerBitCastOrAddrSpaceCast(Ptr, PointerTy,
1919                                                  NamePrefix + "sroa_cast");
1920 }
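// Illustrative sketch (hypothetical names, not from the original source):
// with Offset == 8 this emits
//   %prefix.sroa_idx = getelementptr inbounds i8, ptr %Ptr, i64 8
// followed, when PointerTy differs, by a "%prefix.sroa_cast" pointer cast.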
1921 
1922 /// Compute the adjusted alignment for a load or store from an offset.
1923 static Align getAdjustedAlignment(Instruction *I, uint64_t Offset) {
1924   return commonAlignment(getLoadStoreAlignment(I), Offset);
1925 }
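// Illustrative note: commonAlignment(Align(16), 4) == Align(4), i.e. a load
// that was 16-byte aligned can only be assumed 4-byte aligned when rewritten
// at byte offset 4 within its partition.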
1926 
1927 /// Test whether we can convert a value from the old to the new type.
1928 ///
1929 /// This predicate should be used to guard calls to convertValue in order to
1930 /// ensure that we only try to convert viable values. The strategy is that we
1931 /// will peel off single element struct and array wrappings to get to an
1932 /// underlying value, and convert that value.
1933 static bool canConvertValue(const DataLayout &DL, Type *OldTy, Type *NewTy,
1934                             unsigned VScale = 0) {
1935   if (OldTy == NewTy)
1936     return true;
1937 
1938   // For integer types, we can't handle any bit-width differences. This would
1939   // break both vector conversions with extension and introduce endianness
1940   // issues when used in conjunction with loads and stores.
1941   if (isa<IntegerType>(OldTy) && isa<IntegerType>(NewTy)) {
1942     assert(cast<IntegerType>(OldTy)->getBitWidth() !=
1943                cast<IntegerType>(NewTy)->getBitWidth() &&
1944            "We can't have the same bitwidth for different int types");
1945     return false;
1946   }
1947 
1948   TypeSize NewSize = DL.getTypeSizeInBits(NewTy);
1949   TypeSize OldSize = DL.getTypeSizeInBits(OldTy);
1950 
1951   if ((isa<ScalableVectorType>(NewTy) && isa<FixedVectorType>(OldTy)) ||
1952       (isa<ScalableVectorType>(OldTy) && isa<FixedVectorType>(NewTy))) {
1953     // Conversion is only possible when the size of scalable vectors is known.
1954     if (!VScale)
1955       return false;
1956 
1957     // For ptr-to-int and int-to-ptr casts, the pointer side is resolved within
1958     // a single domain (either fixed or scalable). Any additional conversion
1959     // between fixed and scalable types is handled through integer types.
1960     auto OldVTy = OldTy->isPtrOrPtrVectorTy() ? DL.getIntPtrType(OldTy) : OldTy;
1961     auto NewVTy = NewTy->isPtrOrPtrVectorTy() ? DL.getIntPtrType(NewTy) : NewTy;
1962 
1963     if (isa<ScalableVectorType>(NewTy)) {
1964       if (!VectorType::getWithSizeAndScalar(cast<VectorType>(NewVTy), OldVTy))
1965         return false;
1966 
1967       NewSize = TypeSize::getFixed(NewSize.getKnownMinValue() * VScale);
1968     } else {
1969       if (!VectorType::getWithSizeAndScalar(cast<VectorType>(OldVTy), NewVTy))
1970         return false;
1971 
1972       OldSize = TypeSize::getFixed(OldSize.getKnownMinValue() * VScale);
1973     }
1974   }
1975 
1976   if (NewSize != OldSize)
1977     return false;
1978   if (!NewTy->isSingleValueType() || !OldTy->isSingleValueType())
1979     return false;
1980 
1981   // We can convert pointers to integers and vice-versa. Same for vectors
1982   // of pointers and integers.
1983   OldTy = OldTy->getScalarType();
1984   NewTy = NewTy->getScalarType();
1985   if (NewTy->isPointerTy() || OldTy->isPointerTy()) {
1986     if (NewTy->isPointerTy() && OldTy->isPointerTy()) {
1987       unsigned OldAS = OldTy->getPointerAddressSpace();
1988       unsigned NewAS = NewTy->getPointerAddressSpace();
1989       // Convert pointers if they are pointers from the same address space or
1990       // different integral (not non-integral) address spaces with the same
1991       // pointer size.
1992       return OldAS == NewAS ||
1993              (!DL.isNonIntegralAddressSpace(OldAS) &&
1994               !DL.isNonIntegralAddressSpace(NewAS) &&
1995               DL.getPointerSize(OldAS) == DL.getPointerSize(NewAS));
1996     }
1997 
1998     // We can convert integers to integral pointers, but not to non-integral
1999     // pointers.
2000     if (OldTy->isIntegerTy())
2001       return !DL.isNonIntegralPointerType(NewTy);
2002 
2003     // We can convert integral pointers to integers, but non-integral pointers
2004     // need to remain pointers.
2005     if (!DL.isNonIntegralPointerType(OldTy))
2006       return NewTy->isIntegerTy();
2007 
2008     return false;
2009   }
2010 
2011   if (OldTy->isTargetExtTy() || NewTy->isTargetExtTy())
2012     return false;
2013 
2014   return true;
2015 }
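// Illustrative note (hypothetical type pairs, not from the original source):
// i64 <-> double, <2 x i32> <-> i64, and i64 <-> ptr (in an integral address
// space) are all convertible because the bit sizes match; i32 -> i64 is not
// (different widths), nor is ptr -> double (pointers only convert to integers
// or other pointers).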
2016 
2017 /// Generic routine to convert an SSA value to a value of a different
2018 /// type.
2019 ///
2020 /// This will try various different casting techniques, such as bitcasts,
2021 /// inttoptr, and ptrtoint casts. Use the \c canConvertValue predicate to test
2022 /// two types for viability with this routine.
2023 static Value *convertValue(const DataLayout &DL, IRBuilderTy &IRB, Value *V,
2024                            Type *NewTy) {
2025   Type *OldTy = V->getType();
2026 
2027 #ifndef NDEBUG
2028   BasicBlock *BB = IRB.GetInsertBlock();
2029   assert(BB && BB->getParent() && "VScale unknown!");
2030   unsigned VScale = BB->getParent()->getVScaleValue();
2031   assert(canConvertValue(DL, OldTy, NewTy, VScale) &&
2032          "Value not convertable to type");
2033 #endif
2034 
2035   if (OldTy == NewTy)
2036     return V;
2037 
2038   assert(!(isa<IntegerType>(OldTy) && isa<IntegerType>(NewTy)) &&
2039          "Integer types must be the exact same to convert.");
2040 
2041   // A variant of bitcast that supports a mixture of fixed and scalable types
2042   // that are known to have the same size.
2043   auto CreateBitCastLike = [&IRB](Value *In, Type *Ty) -> Value * {
2044     Type *InTy = In->getType();
2045     if (InTy == Ty)
2046       return In;
2047 
2048     if (isa<FixedVectorType>(InTy) && isa<ScalableVectorType>(Ty)) {
2049       // For vscale_range(2) expand <4 x i32> to <vscale x 4 x i16> -->
2050       //   <4 x i32> to <vscale x 2 x i32> to <vscale x 4 x i16>
2051       auto *VTy = VectorType::getWithSizeAndScalar(cast<VectorType>(Ty), InTy);
2052       return IRB.CreateBitCast(IRB.CreateInsertVector(VTy,
2053                                                       PoisonValue::get(VTy), In,
2054                                                       IRB.getInt64(0)),
2055                                Ty);
2056     }
2057 
2058     if (isa<ScalableVectorType>(InTy) && isa<FixedVectorType>(Ty)) {
2059       // For vscale_range(2) expand <vscale x 4 x i16> to <4 x i32> -->
2060       //   <vscale x 4 x i16> to <vscale x 2 x i32> to <4 x i32>
2061       auto *VTy = VectorType::getWithSizeAndScalar(cast<VectorType>(InTy), Ty);
2062       return IRB.CreateExtractVector(Ty, IRB.CreateBitCast(In, VTy),
2063                                      IRB.getInt64(0));
2064     }
2065 
2066     return IRB.CreateBitCast(In, Ty);
2067   };
2068 
2069   // See if we need inttoptr for this type pair. May require additional bitcast.
2070   if (OldTy->isIntOrIntVectorTy() && NewTy->isPtrOrPtrVectorTy()) {
2071     // Expand <2 x i32> to i8* --> <2 x i32> to i64 to i8*
2072     // Expand i128 to <2 x i8*> --> i128 to <2 x i64> to <2 x i8*>
2073     // Expand <4 x i32> to <2 x i8*> --> <4 x i32> to <2 x i64> to <2 x i8*>
2074     // Directly handle i64 to i8*
2075     return IRB.CreateIntToPtr(CreateBitCastLike(V, DL.getIntPtrType(NewTy)),
2076                               NewTy);
2077   }
2078 
2079   // See if we need ptrtoint for this type pair. May require additional bitcast.
2080   if (OldTy->isPtrOrPtrVectorTy() && NewTy->isIntOrIntVectorTy()) {
2081     // Expand <2 x i8*> to i128 --> <2 x i8*> to <2 x i64> to i128
2082     // Expand i8* to <2 x i32> --> i8* to i64 to <2 x i32>
2083     // Expand <2 x i8*> to <4 x i32> --> <2 x i8*> to <2 x i64> to <4 x i32>
2084     // Expand i8* to i64 --> i8* to i64 to i64
2085     return CreateBitCastLike(IRB.CreatePtrToInt(V, DL.getIntPtrType(OldTy)),
2086                              NewTy);
2087   }
2088 
2089   if (OldTy->isPtrOrPtrVectorTy() && NewTy->isPtrOrPtrVectorTy()) {
2090     unsigned OldAS = OldTy->getPointerAddressSpace();
2091     unsigned NewAS = NewTy->getPointerAddressSpace();
2092     // To convert pointers between different address spaces (they have already
2093     // been checked to be convertible, i.e. they have the same pointer size),
2094     // we cannot use `bitcast` (which requires the same address space) or
2095     // `addrspacecast` (which is not always a no-op cast). Instead, use a pair
2096     // of no-op `ptrtoint`/`inttoptr` casts through an integer with the same bit
2097     // size.
2098     if (OldAS != NewAS) {
2099       assert(DL.getPointerSize(OldAS) == DL.getPointerSize(NewAS));
2100       return IRB.CreateIntToPtr(
2101           CreateBitCastLike(IRB.CreatePtrToInt(V, DL.getIntPtrType(OldTy)),
2102                             DL.getIntPtrType(NewTy)),
2103           NewTy);
2104     }
2105   }
2106 
2107   return CreateBitCastLike(V, NewTy);
2108 }
2109 
2110 /// Test whether the given slice use can be promoted to a vector.
2111 ///
2112 /// This function is called to test each entry in a partition which is slated
2113 /// for a single slice.
2114 static bool isVectorPromotionViableForSlice(Partition &P, const Slice &S,
2115                                             VectorType *Ty,
2116                                             uint64_t ElementSize,
2117                                             const DataLayout &DL,
2118                                             unsigned VScale) {
2119   // First validate the slice offsets.
2120   uint64_t BeginOffset =
2121       std::max(S.beginOffset(), P.beginOffset()) - P.beginOffset();
2122   uint64_t BeginIndex = BeginOffset / ElementSize;
2123   if (BeginIndex * ElementSize != BeginOffset ||
2124       BeginIndex >= cast<FixedVectorType>(Ty)->getNumElements())
2125     return false;
2126   uint64_t EndOffset = std::min(S.endOffset(), P.endOffset()) - P.beginOffset();
2127   uint64_t EndIndex = EndOffset / ElementSize;
2128   if (EndIndex * ElementSize != EndOffset ||
2129       EndIndex > cast<FixedVectorType>(Ty)->getNumElements())
2130     return false;
2131 
2132   assert(EndIndex > BeginIndex && "Empty vector!");
2133   uint64_t NumElements = EndIndex - BeginIndex;
2134   Type *SliceTy = (NumElements == 1)
2135                       ? Ty->getElementType()
2136                       : FixedVectorType::get(Ty->getElementType(), NumElements);
2137 
2138   Type *SplitIntTy =
2139       Type::getIntNTy(Ty->getContext(), NumElements * ElementSize * 8);
2140 
2141   Use *U = S.getUse();
2142 
2143   if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(U->getUser())) {
2144     if (MI->isVolatile())
2145       return false;
2146     if (!S.isSplittable())
2147       return false; // Skip any unsplittable intrinsics.
2148   } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(U->getUser())) {
2149     if (!II->isLifetimeStartOrEnd() && !II->isDroppable())
2150       return false;
2151   } else if (LoadInst *LI = dyn_cast<LoadInst>(U->getUser())) {
2152     if (LI->isVolatile())
2153       return false;
2154     Type *LTy = LI->getType();
2155     // Disable vector promotion when there are loads or stores of an FCA.
2156     if (LTy->isStructTy())
2157       return false;
2158     if (P.beginOffset() > S.beginOffset() || P.endOffset() < S.endOffset()) {
2159       assert(LTy->isIntegerTy());
2160       LTy = SplitIntTy;
2161     }
2162     if (!canConvertValue(DL, SliceTy, LTy, VScale))
2163       return false;
2164   } else if (StoreInst *SI = dyn_cast<StoreInst>(U->getUser())) {
2165     if (SI->isVolatile())
2166       return false;
2167     Type *STy = SI->getValueOperand()->getType();
2168     // Disable vector promotion when there are loads or stores of an FCA.
2169     if (STy->isStructTy())
2170       return false;
2171     if (P.beginOffset() > S.beginOffset() || P.endOffset() < S.endOffset()) {
2172       assert(STy->isIntegerTy());
2173       STy = SplitIntTy;
2174     }
2175     if (!canConvertValue(DL, STy, SliceTy, VScale))
2176       return false;
2177   } else {
2178     return false;
2179   }
2180 
2181   return true;
2182 }
2183 
2184 /// Test whether a vector type is viable for promotion.
2185 ///
2186 /// This implements the necessary checking for \c checkVectorTypesForPromotion
2187 /// (and thus isVectorPromotionViable) over all slices of the alloca for the
2188 /// given VectorType.
2189 static bool checkVectorTypeForPromotion(Partition &P, VectorType *VTy,
2190                                         const DataLayout &DL, unsigned VScale) {
2191   uint64_t ElementSize =
2192       DL.getTypeSizeInBits(VTy->getElementType()).getFixedValue();
2193 
2194   // While the definition of LLVM vectors is bitpacked, we don't support
2195   // element sizes that aren't byte sized.
2196   if (ElementSize % 8)
2197     return false;
2198   assert((DL.getTypeSizeInBits(VTy).getFixedValue() % 8) == 0 &&
2199          "vector size not a multiple of element size?");
2200   ElementSize /= 8;
2201 
2202   for (const Slice &S : P)
2203     if (!isVectorPromotionViableForSlice(P, S, VTy, ElementSize, DL, VScale))
2204       return false;
2205 
2206   for (const Slice *S : P.splitSliceTails())
2207     if (!isVectorPromotionViableForSlice(P, *S, VTy, ElementSize, DL, VScale))
2208       return false;
2209 
2210   return true;
2211 }
2212 
2213 /// Test whether any vector type in \p CandidateTys is viable for promotion.
2214 ///
2215 /// This implements the necessary checking for \c isVectorPromotionViable over
2216 /// all slices of the alloca for the given VectorType.
2217 static VectorType *
2218 checkVectorTypesForPromotion(Partition &P, const DataLayout &DL,
2219                              SmallVectorImpl<VectorType *> &CandidateTys,
2220                              bool HaveCommonEltTy, Type *CommonEltTy,
2221                              bool HaveVecPtrTy, bool HaveCommonVecPtrTy,
2222                              VectorType *CommonVecPtrTy, unsigned VScale) {
2223   // If we didn't find a vector type, nothing to do here.
2224   if (CandidateTys.empty())
2225     return nullptr;
2226 
2227   // Pointer-ness is sticky: if we had a vector-of-pointers candidate type,
2228   // then we should choose it, not some other alternative.
2229   // But, we can't perform a no-op pointer address space change via bitcast,
2230   // so if we didn't have a common pointer element type, bail.
2231   if (HaveVecPtrTy && !HaveCommonVecPtrTy)
2232     return nullptr;
2233 
2234   // Try to pick the "best" element type out of the choices.
2235   if (!HaveCommonEltTy && HaveVecPtrTy) {
2236     // If there was a pointer element type, there's really only one choice.
2237     CandidateTys.clear();
2238     CandidateTys.push_back(CommonVecPtrTy);
2239   } else if (!HaveCommonEltTy && !HaveVecPtrTy) {
2240     // Integer-ify vector types.
2241     for (VectorType *&VTy : CandidateTys) {
2242       if (!VTy->getElementType()->isIntegerTy())
2243         VTy = cast<VectorType>(VTy->getWithNewType(IntegerType::getIntNTy(
2244             VTy->getContext(), VTy->getScalarSizeInBits())));
2245     }
2246 
2247     // Rank the remaining candidate vector types. This is easy because we know
2248     // they're all integer vectors. We sort by ascending number of elements.
2249     auto RankVectorTypesComp = [&DL](VectorType *RHSTy, VectorType *LHSTy) {
2250       (void)DL;
2251       assert(DL.getTypeSizeInBits(RHSTy).getFixedValue() ==
2252                  DL.getTypeSizeInBits(LHSTy).getFixedValue() &&
2253              "Cannot have vector types of different sizes!");
2254       assert(RHSTy->getElementType()->isIntegerTy() &&
2255              "All non-integer types eliminated!");
2256       assert(LHSTy->getElementType()->isIntegerTy() &&
2257              "All non-integer types eliminated!");
2258       return cast<FixedVectorType>(RHSTy)->getNumElements() <
2259              cast<FixedVectorType>(LHSTy)->getNumElements();
2260     };
2261     auto RankVectorTypesEq = [&DL](VectorType *RHSTy, VectorType *LHSTy) {
2262       (void)DL;
2263       assert(DL.getTypeSizeInBits(RHSTy).getFixedValue() ==
2264                  DL.getTypeSizeInBits(LHSTy).getFixedValue() &&
2265              "Cannot have vector types of different sizes!");
2266       assert(RHSTy->getElementType()->isIntegerTy() &&
2267              "All non-integer types eliminated!");
2268       assert(LHSTy->getElementType()->isIntegerTy() &&
2269              "All non-integer types eliminated!");
2270       return cast<FixedVectorType>(RHSTy)->getNumElements() ==
2271              cast<FixedVectorType>(LHSTy)->getNumElements();
2272     };
2273     llvm::sort(CandidateTys, RankVectorTypesComp);
2274     CandidateTys.erase(llvm::unique(CandidateTys, RankVectorTypesEq),
2275                        CandidateTys.end());
2276   } else {
2277 // The only way to have the same element type in every vector type is to
2278 // have the same vector type. Check that and remove all but one.
2279 #ifndef NDEBUG
2280     for (VectorType *VTy : CandidateTys) {
2281       assert(VTy->getElementType() == CommonEltTy &&
2282              "Unaccounted for element type!");
2283       assert(VTy == CandidateTys[0] &&
2284              "Different vector types with the same element type!");
2285     }
2286 #endif
2287     CandidateTys.resize(1);
2288   }
2289 
2290   // FIXME: hack. Do we have a named constant for this?
2291   // SDAG SDNode can't have more than 65535 operands.
2292   llvm::erase_if(CandidateTys, [](VectorType *VTy) {
2293     return cast<FixedVectorType>(VTy)->getNumElements() >
2294            std::numeric_limits<unsigned short>::max();
2295   });
2296 
2297   for (VectorType *VTy : CandidateTys)
2298     if (checkVectorTypeForPromotion(P, VTy, DL, VScale))
2299       return VTy;
2300 
2301   return nullptr;
2302 }
2303 
2304 static VectorType *createAndCheckVectorTypesForPromotion(
2305     SetVector<Type *> &OtherTys, ArrayRef<VectorType *> CandidateTysCopy,
2306     function_ref<void(Type *)> CheckCandidateType, Partition &P,
2307     const DataLayout &DL, SmallVectorImpl<VectorType *> &CandidateTys,
2308     bool &HaveCommonEltTy, Type *&CommonEltTy, bool &HaveVecPtrTy,
2309     bool &HaveCommonVecPtrTy, VectorType *&CommonVecPtrTy, unsigned VScale) {
2310   [[maybe_unused]] VectorType *OriginalElt =
2311       CandidateTysCopy.size() ? CandidateTysCopy[0] : nullptr;
2312   // Consider additional vector types where the element type size is a
2313   // multiple of load/store element size.
2314   for (Type *Ty : OtherTys) {
2315     if (!VectorType::isValidElementType(Ty))
2316       continue;
2317     unsigned TypeSize = DL.getTypeSizeInBits(Ty).getFixedValue();
2318     // Make a copy of CandidateTys and iterate through it, because we
2319     // might append to CandidateTys in the loop.
2320     for (VectorType *const VTy : CandidateTysCopy) {
2321       // The elements in the copy should remain invariant throughout the loop
2322       assert(CandidateTysCopy[0] == OriginalElt && "Different Element");
2323       unsigned VectorSize = DL.getTypeSizeInBits(VTy).getFixedValue();
2324       unsigned ElementSize =
2325           DL.getTypeSizeInBits(VTy->getElementType()).getFixedValue();
2326       if (TypeSize != VectorSize && TypeSize != ElementSize &&
2327           VectorSize % TypeSize == 0) {
2328         VectorType *NewVTy = VectorType::get(Ty, VectorSize / TypeSize, false);
2329         CheckCandidateType(NewVTy);
2330       }
2331     }
2332   }
2333 
2334   return checkVectorTypesForPromotion(
2335       P, DL, CandidateTys, HaveCommonEltTy, CommonEltTy, HaveVecPtrTy,
2336       HaveCommonVecPtrTy, CommonVecPtrTy, VScale);
2337 }
2338 
2339 /// Test whether the given alloca partitioning and range of slices can be
2340 /// promoted to a vector.
2341 ///
2342 /// This is a quick test to check whether we can rewrite a particular alloca
2343 /// partition (and its newly formed alloca) into a vector alloca with only
2344 /// whole-vector loads and stores such that it could be promoted to a vector
2345 /// SSA value. We only can ensure this for a limited set of operations, and we
2346 /// don't want to do the rewrites unless we are confident that the result will
2347 /// be promotable, so we have an early test here.
2348 static VectorType *isVectorPromotionViable(Partition &P, const DataLayout &DL,
2349                                            unsigned VScale) {
2350   // Collect the candidate types for vector-based promotion. Also track whether
2351   // we have different element types.
2352   SmallVector<VectorType *, 4> CandidateTys;
2353   SetVector<Type *> LoadStoreTys;
2354   SetVector<Type *> DeferredTys;
2355   Type *CommonEltTy = nullptr;
2356   VectorType *CommonVecPtrTy = nullptr;
2357   bool HaveVecPtrTy = false;
2358   bool HaveCommonEltTy = true;
2359   bool HaveCommonVecPtrTy = true;
2360   auto CheckCandidateType = [&](Type *Ty) {
2361     if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) {
2362       // If this type's total size in bits differs, discard all candidates.
2363       if (!CandidateTys.empty()) {
2364         VectorType *V = CandidateTys[0];
2365         if (DL.getTypeSizeInBits(VTy).getFixedValue() !=
2366             DL.getTypeSizeInBits(V).getFixedValue()) {
2367           CandidateTys.clear();
2368           return;
2369         }
2370       }
2371       CandidateTys.push_back(VTy);
2372       Type *EltTy = VTy->getElementType();
2373 
2374       if (!CommonEltTy)
2375         CommonEltTy = EltTy;
2376       else if (CommonEltTy != EltTy)
2377         HaveCommonEltTy = false;
2378 
2379       if (EltTy->isPointerTy()) {
2380         HaveVecPtrTy = true;
2381         if (!CommonVecPtrTy)
2382           CommonVecPtrTy = VTy;
2383         else if (CommonVecPtrTy != VTy)
2384           HaveCommonVecPtrTy = false;
2385       }
2386     }
2387   };
2388 
2389   // Put load and store types into a set for de-duplication.
2390   for (const Slice &S : P) {
2391     Type *Ty;
2392     if (auto *LI = dyn_cast<LoadInst>(S.getUse()->getUser()))
2393       Ty = LI->getType();
2394     else if (auto *SI = dyn_cast<StoreInst>(S.getUse()->getUser()))
2395       Ty = SI->getValueOperand()->getType();
2396     else
2397       continue;
2398 
2399     auto CandTy = Ty->getScalarType();
2400     if (CandTy->isPointerTy() && (S.beginOffset() != P.beginOffset() ||
2401                                   S.endOffset() != P.endOffset())) {
2402       DeferredTys.insert(Ty);
2403       continue;
2404     }
2405 
2406     LoadStoreTys.insert(Ty);
2407     // Consider any loads or stores that are the exact size of the slice.
2408     if (S.beginOffset() == P.beginOffset() && S.endOffset() == P.endOffset())
2409       CheckCandidateType(Ty);
2410   }
2411 
2412   SmallVector<VectorType *, 4> CandidateTysCopy = CandidateTys;
2413   if (auto *VTy = createAndCheckVectorTypesForPromotion(
2414           LoadStoreTys, CandidateTysCopy, CheckCandidateType, P, DL,
2415           CandidateTys, HaveCommonEltTy, CommonEltTy, HaveVecPtrTy,
2416           HaveCommonVecPtrTy, CommonVecPtrTy, VScale))
2417     return VTy;
2418 
2419   CandidateTys.clear();
2420   return createAndCheckVectorTypesForPromotion(
2421       DeferredTys, CandidateTysCopy, CheckCandidateType, P, DL, CandidateTys,
2422       HaveCommonEltTy, CommonEltTy, HaveVecPtrTy, HaveCommonVecPtrTy,
2423       CommonVecPtrTy, VScale);
2424 }
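// Illustrative sketch (hypothetical IR, not from the original source): for
//   %a = alloca <4 x float>
//   store <4 x float> %v, ptr %a
//   %f = load float, ptr %a.elt1   ; %a.elt1 = gep to the second element
// the whole-partition store makes <4 x float> a candidate type, and the float
// load maps onto a single vector element, so promotion is viable.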
2425 
2426 /// Test whether a slice of an alloca is valid for integer widening.
2427 ///
2428 /// This implements the necessary checking for the \c isIntegerWideningViable
2429 /// test below on a single slice of the alloca.
2430 static bool isIntegerWideningViableForSlice(const Slice &S,
2431                                             uint64_t AllocBeginOffset,
2432                                             Type *AllocaTy,
2433                                             const DataLayout &DL,
2434                                             bool &WholeAllocaOp) {
2435   uint64_t Size = DL.getTypeStoreSize(AllocaTy).getFixedValue();
2436 
2437   uint64_t RelBegin = S.beginOffset() - AllocBeginOffset;
2438   uint64_t RelEnd = S.endOffset() - AllocBeginOffset;
2439 
2440   Use *U = S.getUse();
2441 
2442   // Lifetime intrinsics operate over the whole alloca, whose size is usually
2443   // larger than other load/store slices (RelEnd > Size). But lifetime
2444   // intrinsics are always promotable and should not impact the promotability
2445   // of the partition's other slices.
2446   if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(U->getUser())) {
2447     if (II->isLifetimeStartOrEnd() || II->isDroppable())
2448       return true;
2449   }
2450 
2451   // We can't reasonably handle cases where the load or store extends past
2452   // the end of the alloca's type and into its padding.
2453   if (RelEnd > Size)
2454     return false;
2455 
2456   if (LoadInst *LI = dyn_cast<LoadInst>(U->getUser())) {
2457     if (LI->isVolatile())
2458       return false;
2459     // We can't handle loads that extend past the allocated memory.
2460     TypeSize LoadSize = DL.getTypeStoreSize(LI->getType());
2461     if (!LoadSize.isFixed() || LoadSize.getFixedValue() > Size)
2462       return false;
2463     // So far, AllocaSliceRewriter does not support widening split slice tails
2464     // in rewriteIntegerLoad.
2465     if (S.beginOffset() < AllocBeginOffset)
2466       return false;
2467     // Note that we don't count vector loads or stores as whole-alloca
2468     // operations which enable integer widening because we would prefer to use
2469     // vector widening instead.
2470     if (!isa<VectorType>(LI->getType()) && RelBegin == 0 && RelEnd == Size)
2471       WholeAllocaOp = true;
2472     if (IntegerType *ITy = dyn_cast<IntegerType>(LI->getType())) {
2473       if (ITy->getBitWidth() < DL.getTypeStoreSizeInBits(ITy).getFixedValue())
2474         return false;
2475     } else if (RelBegin != 0 || RelEnd != Size ||
2476                !canConvertValue(DL, AllocaTy, LI->getType())) {
2477       // Non-integer loads need to be convertible from the alloca type so that
2478       // they are promotable.
2479       return false;
2480     }
2481   } else if (StoreInst *SI = dyn_cast<StoreInst>(U->getUser())) {
2482     Type *ValueTy = SI->getValueOperand()->getType();
2483     if (SI->isVolatile())
2484       return false;
2485     // We can't handle stores that extend past the allocated memory.
2486     TypeSize StoreSize = DL.getTypeStoreSize(ValueTy);
2487     if (!StoreSize.isFixed() || StoreSize.getFixedValue() > Size)
2488       return false;
2489     // So far, AllocaSliceRewriter does not support widening split slice tails
2490     // in rewriteIntegerStore.
2491     if (S.beginOffset() < AllocBeginOffset)
2492       return false;
2493     // Note that we don't count vector loads or stores as whole-alloca
2494     // operations which enable integer widening because we would prefer to use
2495     // vector widening instead.
2496     if (!isa<VectorType>(ValueTy) && RelBegin == 0 && RelEnd == Size)
2497       WholeAllocaOp = true;
2498     if (IntegerType *ITy = dyn_cast<IntegerType>(ValueTy)) {
2499       if (ITy->getBitWidth() < DL.getTypeStoreSizeInBits(ITy).getFixedValue())
2500         return false;
2501     } else if (RelBegin != 0 || RelEnd != Size ||
2502                !canConvertValue(DL, ValueTy, AllocaTy)) {
2503       // Non-integer stores need to be convertible to the alloca type so that
2504       // they are promotable.
2505       return false;
2506     }
2507   } else if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(U->getUser())) {
2508     if (MI->isVolatile() || !isa<Constant>(MI->getLength()))
2509       return false;
2510     if (!S.isSplittable())
2511       return false; // Skip any unsplittable intrinsics.
2512   } else {
2513     return false;
2514   }
2515 
2516   return true;
2517 }
2518 
2519 /// Test whether the given alloca partition's integer operations can be
2520 /// widened to promotable ones.
2521 ///
2522 /// This is a quick test to check whether we can rewrite the integer loads and
2523 /// stores to a particular alloca into wider loads and stores and be able to
2524 /// promote the resulting alloca.
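///
/// As a sketch of what widening enables (the types here are illustrative):
/// two adjacent i16 loads that together cover an { i16, i16 } alloca can be
/// rewritten as one i32 load plus shift/trunc extractions, leaving an alloca
/// that mem2reg can promote.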
2525 static bool isIntegerWideningViable(Partition &P, Type *AllocaTy,
2526                                     const DataLayout &DL) {
2527   uint64_t SizeInBits = DL.getTypeSizeInBits(AllocaTy).getFixedValue();
2528   // Don't create integer types larger than the maximum bitwidth.
2529   if (SizeInBits > IntegerType::MAX_INT_BITS)
2530     return false;
2531 
2532   // Don't try to handle allocas with bit-padding.
2533   if (SizeInBits != DL.getTypeStoreSizeInBits(AllocaTy).getFixedValue())
2534     return false;
2535 
2536   // We need to ensure that an integer type with the appropriate bitwidth can
2537   // be converted to the alloca type, whatever that is. We don't want to force
2538   // the alloca itself to have an integer type if there is a more suitable one.
2539   Type *IntTy = Type::getIntNTy(AllocaTy->getContext(), SizeInBits);
2540   if (!canConvertValue(DL, AllocaTy, IntTy) ||
2541       !canConvertValue(DL, IntTy, AllocaTy))
2542     return false;
2543 
2544   // While examining uses, we ensure that the alloca has a covering load or
2545   // store. We don't want to widen the integer operations only to fail to
2546   // promote due to some other unsplittable entry (which we may make splittable
2547   // later). However, if there are only splittable uses, go ahead and assume
2548   // that we cover the alloca.
2549   // FIXME: We shouldn't consider split slices that happen to start in the
2550   // partition here...
2551   bool WholeAllocaOp = P.empty() && DL.isLegalInteger(SizeInBits);
2552 
2553   for (const Slice &S : P)
2554     if (!isIntegerWideningViableForSlice(S, P.beginOffset(), AllocaTy, DL,
2555                                          WholeAllocaOp))
2556       return false;
2557 
2558   for (const Slice *S : P.splitSliceTails())
2559     if (!isIntegerWideningViableForSlice(*S, P.beginOffset(), AllocaTy, DL,
2560                                          WholeAllocaOp))
2561       return false;
2562 
2563   return WholeAllocaOp;
2564 }
2565 
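/// Extract an integer of type \p Ty from byte \p Offset of the wider integer
/// \p V, honoring the target's endianness.
///
/// Worked example (illustrative): extracting an i16 at byte offset 2 from an
/// i64 uses ShAmt = 8 * 2 = 16 on a little-endian target and
/// ShAmt = 8 * (8 - 2 - 2) = 32 on a big-endian one, followed by a trunc to
/// i16.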
2566 static Value *extractInteger(const DataLayout &DL, IRBuilderTy &IRB, Value *V,
2567                              IntegerType *Ty, uint64_t Offset,
2568                              const Twine &Name) {
2569   LLVM_DEBUG(dbgs() << "       start: " << *V << "\n");
2570   IntegerType *IntTy = cast<IntegerType>(V->getType());
2571   assert(DL.getTypeStoreSize(Ty).getFixedValue() + Offset <=
2572              DL.getTypeStoreSize(IntTy).getFixedValue() &&
2573          "Element extends past full value");
2574   uint64_t ShAmt = 8 * Offset;
2575   if (DL.isBigEndian())
2576     ShAmt = 8 * (DL.getTypeStoreSize(IntTy).getFixedValue() -
2577                  DL.getTypeStoreSize(Ty).getFixedValue() - Offset);
2578   if (ShAmt) {
2579     V = IRB.CreateLShr(V, ShAmt, Name + ".shift");
2580     LLVM_DEBUG(dbgs() << "     shifted: " << *V << "\n");
2581   }
2582   assert(Ty->getBitWidth() <= IntTy->getBitWidth() &&
2583          "Cannot extract to a larger integer!");
2584   if (Ty != IntTy) {
2585     V = IRB.CreateTrunc(V, Ty, Name + ".trunc");
2586     LLVM_DEBUG(dbgs() << "     trunced: " << *V << "\n");
2587   }
2588   return V;
2589 }
2590 
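/// Insert the integer \p V into byte \p Offset of the wider integer \p Old,
/// the inverse of extractInteger above.
///
/// Worked example (illustrative): inserting an i16 at byte offset 2 of an i64
/// on a little-endian target computes
///   (Old & ~(0xffff << 16)) | (zext(V) << 16).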
2591 static Value *insertInteger(const DataLayout &DL, IRBuilderTy &IRB, Value *Old,
2592                             Value *V, uint64_t Offset, const Twine &Name) {
2593   IntegerType *IntTy = cast<IntegerType>(Old->getType());
2594   IntegerType *Ty = cast<IntegerType>(V->getType());
2595   assert(Ty->getBitWidth() <= IntTy->getBitWidth() &&
2596          "Cannot insert a larger integer!");
2597   LLVM_DEBUG(dbgs() << "       start: " << *V << "\n");
2598   if (Ty != IntTy) {
2599     V = IRB.CreateZExt(V, IntTy, Name + ".ext");
2600     LLVM_DEBUG(dbgs() << "    extended: " << *V << "\n");
2601   }
2602   assert(DL.getTypeStoreSize(Ty).getFixedValue() + Offset <=
2603              DL.getTypeStoreSize(IntTy).getFixedValue() &&
2604          "Element store outside of alloca store");
2605   uint64_t ShAmt = 8 * Offset;
2606   if (DL.isBigEndian())
2607     ShAmt = 8 * (DL.getTypeStoreSize(IntTy).getFixedValue() -
2608                  DL.getTypeStoreSize(Ty).getFixedValue() - Offset);
2609   if (ShAmt) {
2610     V = IRB.CreateShl(V, ShAmt, Name + ".shift");
2611     LLVM_DEBUG(dbgs() << "     shifted: " << *V << "\n");
2612   }
2613 
2614   if (ShAmt || Ty->getBitWidth() < IntTy->getBitWidth()) {
2615     APInt Mask = ~Ty->getMask().zext(IntTy->getBitWidth()).shl(ShAmt);
2616     Old = IRB.CreateAnd(Old, Mask, Name + ".mask");
2617     LLVM_DEBUG(dbgs() << "      masked: " << *Old << "\n");
2618     V = IRB.CreateOr(Old, V, Name + ".insert");
2619     LLVM_DEBUG(dbgs() << "    inserted: " << *V << "\n");
2620   }
2621   return V;
2622 }
2623 
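/// Extract the element range [BeginIndex, EndIndex) from vector \p V.
///
/// For example (illustrative), extracting elements [1, 3) from a <4 x i32>
/// emits a shufflevector with mask <1, 2>, yielding a <2 x i32>.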
2624 static Value *extractVector(IRBuilderTy &IRB, Value *V, unsigned BeginIndex,
2625                             unsigned EndIndex, const Twine &Name) {
2626   auto *VecTy = cast<FixedVectorType>(V->getType());
2627   unsigned NumElements = EndIndex - BeginIndex;
2628   assert(NumElements <= VecTy->getNumElements() && "Too many elements!");
2629 
2630   if (NumElements == VecTy->getNumElements())
2631     return V;
2632 
2633   if (NumElements == 1) {
2634     V = IRB.CreateExtractElement(V, IRB.getInt32(BeginIndex),
2635                                  Name + ".extract");
2636     LLVM_DEBUG(dbgs() << "     extract: " << *V << "\n");
2637     return V;
2638   }
2639 
2640   auto Mask = llvm::to_vector<8>(llvm::seq<int>(BeginIndex, EndIndex));
2641   V = IRB.CreateShuffleVector(V, Mask, Name + ".extract");
2642   LLVM_DEBUG(dbgs() << "     shuffle: " << *V << "\n");
2643   return V;
2644 }
2645 
2646 static Value *insertVector(IRBuilderTy &IRB, Value *Old, Value *V,
2647                            unsigned BeginIndex, const Twine &Name) {
2648   VectorType *VecTy = cast<VectorType>(Old->getType());
2649   assert(VecTy && "Can only insert a vector into a vector");
2650 
2651   VectorType *Ty = dyn_cast<VectorType>(V->getType());
2652   if (!Ty) {
2653     // Single element to insert.
2654     V = IRB.CreateInsertElement(Old, V, IRB.getInt32(BeginIndex),
2655                                 Name + ".insert");
2656     LLVM_DEBUG(dbgs() << "     insert: " << *V << "\n");
2657     return V;
2658   }
2659 
2660   assert(cast<FixedVectorType>(Ty)->getNumElements() <=
2661              cast<FixedVectorType>(VecTy)->getNumElements() &&
2662          "Too many elements!");
2663   if (cast<FixedVectorType>(Ty)->getNumElements() ==
2664       cast<FixedVectorType>(VecTy)->getNumElements()) {
2665     assert(V->getType() == VecTy && "Vector type mismatch");
2666     return V;
2667   }
2668   unsigned EndIndex = BeginIndex + cast<FixedVectorType>(Ty)->getNumElements();
2669 
2670   // When inserting a smaller vector into the larger one for the store, we
2671   // first use a shuffle vector to widen it with poison elements, and then
2672   // a select with a constant mask to choose between the loaded vector and
2673   // the incoming vector.
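  // For example (illustrative), inserting a <2 x i32> at BeginIndex 1 of a
  // <4 x i32> widens the source with mask <-1, 0, 1, -1>, then selects
  // lanes 1 and 2 from the widened value and the remaining lanes from Old.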
2674   SmallVector<int, 8> Mask;
2675   Mask.reserve(cast<FixedVectorType>(VecTy)->getNumElements());
2676   for (unsigned i = 0; i != cast<FixedVectorType>(VecTy)->getNumElements(); ++i)
2677     if (i >= BeginIndex && i < EndIndex)
2678       Mask.push_back(i - BeginIndex);
2679     else
2680       Mask.push_back(-1);
2681   V = IRB.CreateShuffleVector(V, Mask, Name + ".expand");
2682   LLVM_DEBUG(dbgs() << "    shuffle: " << *V << "\n");
2683 
2684   SmallVector<Constant *, 8> Mask2;
2685   Mask2.reserve(cast<FixedVectorType>(VecTy)->getNumElements());
2686   for (unsigned i = 0; i != cast<FixedVectorType>(VecTy)->getNumElements(); ++i)
2687     Mask2.push_back(IRB.getInt1(i >= BeginIndex && i < EndIndex));
2688 
2689   V = IRB.CreateSelect(ConstantVector::get(Mask2), V, Old, Name + "blend");
2690 
2691   LLVM_DEBUG(dbgs() << "    blend: " << *V << "\n");
2692   return V;
2693 }
2694 
2695 namespace {
2696 
2697 /// Visitor to rewrite instructions using a particular slice of an alloca
2698 /// to use a new alloca.
2699 ///
2700 /// Also implements the rewriting to vector-based accesses when the partition
2701 /// passes the isVectorPromotionViable predicate. Most of the rewriting logic
2702 /// lives here.
2703 class AllocaSliceRewriter : public InstVisitor<AllocaSliceRewriter, bool> {
2704   // Befriend the base class so it can delegate to private visit methods.
2705   friend class InstVisitor<AllocaSliceRewriter, bool>;
2706 
2707   using Base = InstVisitor<AllocaSliceRewriter, bool>;
2708 
2709   const DataLayout &DL;
2710   AllocaSlices &AS;
2711   SROA &Pass;
2712   AllocaInst &OldAI, &NewAI;
2713   const uint64_t NewAllocaBeginOffset, NewAllocaEndOffset;
2714   Type *NewAllocaTy;
2715 
2716   // This is both a convenience and a flag: it is null unless the new
2717   // alloca's integer operations should be widened to this integer type
2718   // because the partition passed isIntegerWideningViable above. When
2719   // non-null, it holds the desired integer type for use during rewriting.
2720   IntegerType *IntTy;
2721 
2722   // If we are rewriting an alloca partition which can be written as pure
2723   // vector operations, we stash extra information here. When VecTy is
2724   // non-null, we have some strict guarantees about the rewritten alloca:
2725   //   - The new alloca is exactly the size of the vector type here.
2726   //   - The accesses all either map to the entire vector or to a single
2727   //     element.
2728   //   - The set of accessing instructions is only one of those handled above
2729   //     in isVectorPromotionViable. Generally these are the same access kinds
2730   //     which are promotable via mem2reg.
2731   VectorType *VecTy;
2732   Type *ElementTy;
2733   uint64_t ElementSize;
2734 
2735   // The original offset of the slice currently being rewritten relative to
2736   // the original alloca.
2737   uint64_t BeginOffset = 0;
2738   uint64_t EndOffset = 0;
2739 
2740   // The new offsets of the slice currently being rewritten relative to the
2741   // original alloca.
2742   uint64_t NewBeginOffset = 0, NewEndOffset = 0;
2743 
2744   uint64_t SliceSize = 0;
2745   bool IsSplittable = false;
2746   bool IsSplit = false;
2747   Use *OldUse = nullptr;
2748   Instruction *OldPtr = nullptr;
2749 
2750   // Track post-rewrite users which are PHI nodes and Selects.
2751   SmallSetVector<PHINode *, 8> &PHIUsers;
2752   SmallSetVector<SelectInst *, 8> &SelectUsers;
2753 
2754   // Utility IR builder whose name prefix is set up for each visited use and
2755   // whose insertion point is set to the user.
2756   IRBuilderTy IRB;
2757 
2758   // Return the new alloca, addrspacecasted if required to avoid changing the
2759   // addrspace of a volatile access.
2760   Value *getPtrToNewAI(unsigned AddrSpace, bool IsVolatile) {
2761     if (!IsVolatile || AddrSpace == NewAI.getType()->getPointerAddressSpace())
2762       return &NewAI;
2763 
2764     Type *AccessTy = IRB.getPtrTy(AddrSpace);
2765     return IRB.CreateAddrSpaceCast(&NewAI, AccessTy);
2766   }
2767 
2768 public:
2769   AllocaSliceRewriter(const DataLayout &DL, AllocaSlices &AS, SROA &Pass,
2770                       AllocaInst &OldAI, AllocaInst &NewAI,
2771                       uint64_t NewAllocaBeginOffset,
2772                       uint64_t NewAllocaEndOffset, bool IsIntegerPromotable,
2773                       VectorType *PromotableVecTy,
2774                       SmallSetVector<PHINode *, 8> &PHIUsers,
2775                       SmallSetVector<SelectInst *, 8> &SelectUsers)
2776       : DL(DL), AS(AS), Pass(Pass), OldAI(OldAI), NewAI(NewAI),
2777         NewAllocaBeginOffset(NewAllocaBeginOffset),
2778         NewAllocaEndOffset(NewAllocaEndOffset),
2779         NewAllocaTy(NewAI.getAllocatedType()),
2780         IntTy(
2781             IsIntegerPromotable
2782                 ? Type::getIntNTy(NewAI.getContext(),
2783                                   DL.getTypeSizeInBits(NewAI.getAllocatedType())
2784                                       .getFixedValue())
2785                 : nullptr),
2786         VecTy(PromotableVecTy),
2787         ElementTy(VecTy ? VecTy->getElementType() : nullptr),
2788         ElementSize(VecTy ? DL.getTypeSizeInBits(ElementTy).getFixedValue() / 8
2789                           : 0),
2790         PHIUsers(PHIUsers), SelectUsers(SelectUsers),
2791         IRB(NewAI.getContext(), ConstantFolder()) {
2792     if (VecTy) {
2793       assert((DL.getTypeSizeInBits(ElementTy).getFixedValue() % 8) == 0 &&
2794              "Only multiple-of-8 sized vector elements are viable");
2795       ++NumVectorized;
2796     }
2797     assert((!IntTy && !VecTy) || (IntTy && !VecTy) || (!IntTy && VecTy));
2798   }
2799 
2800   bool visit(AllocaSlices::const_iterator I) {
2801     bool CanSROA = true;
2802     BeginOffset = I->beginOffset();
2803     EndOffset = I->endOffset();
2804     IsSplittable = I->isSplittable();
2805     IsSplit =
2806         BeginOffset < NewAllocaBeginOffset || EndOffset > NewAllocaEndOffset;
2807     LLVM_DEBUG(dbgs() << "  rewriting " << (IsSplit ? "split " : ""));
2808     LLVM_DEBUG(AS.printSlice(dbgs(), I, ""));
2809     LLVM_DEBUG(dbgs() << "\n");
2810 
2811     // Compute the intersecting offset range.
2812     assert(BeginOffset < NewAllocaEndOffset);
2813     assert(EndOffset > NewAllocaBeginOffset);
2814     NewBeginOffset = std::max(BeginOffset, NewAllocaBeginOffset);
2815     NewEndOffset = std::min(EndOffset, NewAllocaEndOffset);
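    // For example (assumed offsets): a slice covering [4, 16), rewritten
    // against a new alloca covering [8, 24), intersects at [8, 16) and
    // yields a SliceSize of 8 below.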
2816 
2817     SliceSize = NewEndOffset - NewBeginOffset;
2818     LLVM_DEBUG(dbgs() << "   Begin:(" << BeginOffset << ", " << EndOffset
2819                       << ") NewBegin:(" << NewBeginOffset << ", "
2820                       << NewEndOffset << ") NewAllocaBegin:("
2821                       << NewAllocaBeginOffset << ", " << NewAllocaEndOffset
2822                       << ")\n");
2823     assert(IsSplit || NewBeginOffset == BeginOffset);
2824     OldUse = I->getUse();
2825     OldPtr = cast<Instruction>(OldUse->get());
2826 
2827     Instruction *OldUserI = cast<Instruction>(OldUse->getUser());
2828     IRB.SetInsertPoint(OldUserI);
2829     IRB.SetCurrentDebugLocation(OldUserI->getDebugLoc());
2830     IRB.getInserter().SetNamePrefix(Twine(NewAI.getName()) + "." +
2831                                     Twine(BeginOffset) + ".");
2832 
2833     CanSROA &= visit(cast<Instruction>(OldUse->getUser()));
2834     if (VecTy || IntTy)
2835       assert(CanSROA);
2836     return CanSROA;
2837   }
2838 
2839 private:
2840   // Make sure the other visit overloads are visible.
2841   using Base::visit;
2842 
2843   // Every instruction which can end up as a user must have a rewrite rule.
2844   bool visitInstruction(Instruction &I) {
2845     LLVM_DEBUG(dbgs() << "    !!!! Cannot rewrite: " << I << "\n");
2846     llvm_unreachable("No rewrite rule for this instruction!");
2847   }
2848 
2849   Value *getNewAllocaSlicePtr(IRBuilderTy &IRB, Type *PointerTy) {
2850     // Note that the offset computation can use BeginOffset or NewBeginOffset
2851     // interchangeably for unsplit slices.
2852     assert(IsSplit || BeginOffset == NewBeginOffset);
2853     uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
2854 
2855 #ifndef NDEBUG
2856     StringRef OldName = OldPtr->getName();
2857     // Skip through the last '.sroa.' component of the name.
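    // For example (an assumed name), "x.sroa.3.16.copyload" reduces to
    // "copyload": the ".sroa." prefix, the slice index "3." and the byte
    // offset "16." are all stripped.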
2858     size_t LastSROAPrefix = OldName.rfind(".sroa.");
2859     if (LastSROAPrefix != StringRef::npos) {
2860       OldName = OldName.substr(LastSROAPrefix + strlen(".sroa."));
2861       // Look for an SROA slice index.
2862       size_t IndexEnd = OldName.find_first_not_of("0123456789");
2863       if (IndexEnd != StringRef::npos && OldName[IndexEnd] == '.') {
2864         // Strip the index and look for the offset.
2865         OldName = OldName.substr(IndexEnd + 1);
2866         size_t OffsetEnd = OldName.find_first_not_of("0123456789");
2867         if (OffsetEnd != StringRef::npos && OldName[OffsetEnd] == '.')
2868           // Strip the offset.
2869           OldName = OldName.substr(OffsetEnd + 1);
2870       }
2871     }
2872     // Strip any SROA suffixes as well.
2873     OldName = OldName.substr(0, OldName.find(".sroa_"));
2874 #endif
2875 
2876     return getAdjustedPtr(IRB, DL, &NewAI,
2877                           APInt(DL.getIndexTypeSizeInBits(PointerTy), Offset),
2878                           PointerTy,
2879 #ifndef NDEBUG
2880                           Twine(OldName) + "."
2881 #else
2882                           Twine()
2883 #endif
2884     );
2885   }
2886 
2887   /// Compute a suitable alignment for accessing this slice of the *new*
2888   /// alloca.
2889   ///
2890   /// The result is the common alignment of the new alloca's alignment and
2891   /// the slice's byte offset within it.
2892   Align getSliceAlign() {
2893     return commonAlignment(NewAI.getAlign(),
2894                            NewBeginOffset - NewAllocaBeginOffset);
2895   }
2896 
2897   unsigned getIndex(uint64_t Offset) {
2898     assert(VecTy && "Can only call getIndex when rewriting a vector");
2899     uint64_t RelOffset = Offset - NewAllocaBeginOffset;
2900     assert(RelOffset / ElementSize < UINT32_MAX && "Index out of bounds");
2901     uint32_t Index = RelOffset / ElementSize;
2902     assert(Index * ElementSize == RelOffset);
2903     return Index;
2904   }
2905 
2906   void deleteIfTriviallyDead(Value *V) {
2907     Instruction *I = cast<Instruction>(V);
2908     if (isInstructionTriviallyDead(I))
2909       Pass.DeadInsts.push_back(I);
2910   }
2911 
2912   Value *rewriteVectorizedLoadInst(LoadInst &LI) {
2913     unsigned BeginIndex = getIndex(NewBeginOffset);
2914     unsigned EndIndex = getIndex(NewEndOffset);
2915     assert(EndIndex > BeginIndex && "Empty vector!");
2916 
2917     LoadInst *Load = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
2918                                            NewAI.getAlign(), "load");
2919 
2920     Load->copyMetadata(LI, {LLVMContext::MD_mem_parallel_loop_access,
2921                             LLVMContext::MD_access_group});
2922     return extractVector(IRB, Load, BeginIndex, EndIndex, "vec");
2923   }
2924 
2925   Value *rewriteIntegerLoad(LoadInst &LI) {
2926     assert(IntTy && "We cannot insert an integer to the alloca");
2927     assert(!LI.isVolatile());
2928     Value *V = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
2929                                      NewAI.getAlign(), "load");
2930     V = convertValue(DL, IRB, V, IntTy);
2931     assert(NewBeginOffset >= NewAllocaBeginOffset && "Out of bounds offset");
2932     uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
2933     if (Offset > 0 || NewEndOffset < NewAllocaEndOffset) {
2934       IntegerType *ExtractTy = Type::getIntNTy(LI.getContext(), SliceSize * 8);
2935       V = extractInteger(DL, IRB, V, ExtractTy, Offset, "extract");
2936     }
2937     // It is possible that the extracted type is not the load type. This
2938     // happens if there is a load past the end of the alloca, and as
2939     // a consequence the slice is narrower but still a candidate for integer
2940     // lowering. To handle this case, we just zero extend the extracted
2941     // integer.
2942     assert(cast<IntegerType>(LI.getType())->getBitWidth() >= SliceSize * 8 &&
2943            "Can only handle an extract for an overly wide load");
2944     if (cast<IntegerType>(LI.getType())->getBitWidth() > SliceSize * 8)
2945       V = IRB.CreateZExt(V, LI.getType());
2946     return V;
2947   }
2948 
2949   bool visitLoadInst(LoadInst &LI) {
2950     LLVM_DEBUG(dbgs() << "    original: " << LI << "\n");
2951     Value *OldOp = LI.getOperand(0);
2952     assert(OldOp == OldPtr);
2953 
2954     AAMDNodes AATags = LI.getAAMetadata();
2955 
2956     unsigned AS = LI.getPointerAddressSpace();
2957 
2958     Type *TargetTy = IsSplit ? Type::getIntNTy(LI.getContext(), SliceSize * 8)
2959                              : LI.getType();
2960     bool IsPtrAdjusted = false;
2961     Value *V;
2962     if (VecTy) {
2963       V = rewriteVectorizedLoadInst(LI);
2964     } else if (IntTy && LI.getType()->isIntegerTy()) {
2965       V = rewriteIntegerLoad(LI);
2966     } else if (NewBeginOffset == NewAllocaBeginOffset &&
2967                NewEndOffset == NewAllocaEndOffset &&
2968                (canConvertValue(DL, NewAllocaTy, TargetTy) ||
2969                 (NewAllocaTy->isIntegerTy() && TargetTy->isIntegerTy() &&
2970                  DL.getTypeStoreSize(TargetTy).getFixedValue() > SliceSize &&
2971                  !LI.isVolatile()))) {
2972       Value *NewPtr =
2973           getPtrToNewAI(LI.getPointerAddressSpace(), LI.isVolatile());
2974       LoadInst *NewLI = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), NewPtr,
2975                                               NewAI.getAlign(), LI.isVolatile(),
2976                                               LI.getName());
2977       if (LI.isVolatile())
2978         NewLI->setAtomic(LI.getOrdering(), LI.getSyncScopeID());
2979       if (NewLI->isAtomic())
2980         NewLI->setAlignment(LI.getAlign());
2981 
2982       // Copy any metadata that is valid for the new load. This may require
2983       // conversion to a different kind of metadata, e.g. !nonnull might change
2984       // to !range or vice versa.
2985       copyMetadataForLoad(*NewLI, LI);
2986 
2987       // Do this after copyMetadataForLoad() to preserve the TBAA shift.
2988       if (AATags)
2989         NewLI->setAAMetadata(AATags.adjustForAccess(
2990             NewBeginOffset - BeginOffset, NewLI->getType(), DL));
2991 
2992       // Try to preserve nonnull metadata
2993       V = NewLI;
2994 
2995       // If this is an integer load past the end of the slice (which means the
2996       // bytes outside the slice are undef or this load is dead) just forcibly
2997       // fix the integer size with correct handling of endianness.
2998       if (auto *AITy = dyn_cast<IntegerType>(NewAllocaTy))
2999         if (auto *TITy = dyn_cast<IntegerType>(TargetTy))
3000           if (AITy->getBitWidth() < TITy->getBitWidth()) {
3001             V = IRB.CreateZExt(V, TITy, "load.ext");
3002             if (DL.isBigEndian())
3003               V = IRB.CreateShl(V, TITy->getBitWidth() - AITy->getBitWidth(),
3004                                 "endian_shift");
3005           }
3006     } else {
3007       Type *LTy = IRB.getPtrTy(AS);
3008       LoadInst *NewLI =
3009           IRB.CreateAlignedLoad(TargetTy, getNewAllocaSlicePtr(IRB, LTy),
3010                                 getSliceAlign(), LI.isVolatile(), LI.getName());
3011 
3012       if (AATags)
3013         NewLI->setAAMetadata(AATags.adjustForAccess(
3014             NewBeginOffset - BeginOffset, NewLI->getType(), DL));
3015 
3016       if (LI.isVolatile())
3017         NewLI->setAtomic(LI.getOrdering(), LI.getSyncScopeID());
3018       NewLI->copyMetadata(LI, {LLVMContext::MD_mem_parallel_loop_access,
3019                                LLVMContext::MD_access_group});
3020 
3021       V = NewLI;
3022       IsPtrAdjusted = true;
3023     }
3024     V = convertValue(DL, IRB, V, TargetTy);
3025 
3026     if (IsSplit) {
3027       assert(!LI.isVolatile());
3028       assert(LI.getType()->isIntegerTy() &&
3029              "Only integer type loads and stores are split");
3030       assert(SliceSize < DL.getTypeStoreSize(LI.getType()).getFixedValue() &&
3031              "Split load isn't smaller than original load");
3032       assert(DL.typeSizeEqualsStoreSize(LI.getType()) &&
3033              "Non-byte-multiple bit width");
3034       // Move the insertion point just past the load so that we can refer to it.
3035       BasicBlock::iterator LIIt = std::next(LI.getIterator());
3036       // Ensure the insertion point comes before any debug-info immediately
3037       // after the load, so that variable values referring to the load are
3038       // dominated by it.
3039       LIIt.setHeadBit(true);
3040       IRB.SetInsertPoint(LI.getParent(), LIIt);
3041       // Create a placeholder value with the same type as LI to use as the
3042       // basis for the new value. This allows us to replace the uses of LI with
3043       // the computed value, and then replace the placeholder with LI, leaving
3044       // LI only used for this computation.
3045       Value *Placeholder =
3046           new LoadInst(LI.getType(), PoisonValue::get(IRB.getPtrTy(AS)), "",
3047                        false, Align(1));
3048       V = insertInteger(DL, IRB, Placeholder, V, NewBeginOffset - BeginOffset,
3049                         "insert");
3050       LI.replaceAllUsesWith(V);
3051       Placeholder->replaceAllUsesWith(&LI);
3052       Placeholder->deleteValue();
3053     } else {
3054       LI.replaceAllUsesWith(V);
3055     }
3056 
3057     Pass.DeadInsts.push_back(&LI);
3058     deleteIfTriviallyDead(OldOp);
3059     LLVM_DEBUG(dbgs() << "          to: " << *V << "\n");
3060     return !LI.isVolatile() && !IsPtrAdjusted;
3061   }
3062 
3063   bool rewriteVectorizedStoreInst(Value *V, StoreInst &SI, Value *OldOp,
3064                                   AAMDNodes AATags) {
3065     // Capture V for the purpose of debug-info accounting once it's converted
3066     // to a vector store.
3067     Value *OrigV = V;
3068     if (V->getType() != VecTy) {
3069       unsigned BeginIndex = getIndex(NewBeginOffset);
3070       unsigned EndIndex = getIndex(NewEndOffset);
3071       assert(EndIndex > BeginIndex && "Empty vector!");
3072       unsigned NumElements = EndIndex - BeginIndex;
3073       assert(NumElements <= cast<FixedVectorType>(VecTy)->getNumElements() &&
3074              "Too many elements!");
3075       Type *SliceTy = (NumElements == 1)
3076                           ? ElementTy
3077                           : FixedVectorType::get(ElementTy, NumElements);
3078       if (V->getType() != SliceTy)
3079         V = convertValue(DL, IRB, V, SliceTy);
3080 
3081       // Mix in the existing elements.
3082       Value *Old = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
3083                                          NewAI.getAlign(), "load");
3084       V = insertVector(IRB, Old, V, BeginIndex, "vec");
3085     }
3086     StoreInst *Store = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlign());
3087     Store->copyMetadata(SI, {LLVMContext::MD_mem_parallel_loop_access,
3088                              LLVMContext::MD_access_group});
3089     if (AATags)
3090       Store->setAAMetadata(AATags.adjustForAccess(NewBeginOffset - BeginOffset,
3091                                                   V->getType(), DL));
3092     Pass.DeadInsts.push_back(&SI);
3093 
3094     // NOTE: Careful to use OrigV rather than V.
3095     migrateDebugInfo(&OldAI, IsSplit, NewBeginOffset * 8, SliceSize * 8, &SI,
3096                      Store, Store->getPointerOperand(), OrigV, DL);
3097     LLVM_DEBUG(dbgs() << "          to: " << *Store << "\n");
3098     return true;
3099   }
3100 
3101   bool rewriteIntegerStore(Value *V, StoreInst &SI, AAMDNodes AATags) {
3102     assert(IntTy && "We cannot extract an integer from the alloca");
3103     assert(!SI.isVolatile());
3104     if (DL.getTypeSizeInBits(V->getType()).getFixedValue() !=
3105         IntTy->getBitWidth()) {
3106       Value *Old = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
3107                                          NewAI.getAlign(), "oldload");
3108       Old = convertValue(DL, IRB, Old, IntTy);
3109       assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset");
3110       uint64_t Offset = BeginOffset - NewAllocaBeginOffset;
3111       V = insertInteger(DL, IRB, Old, SI.getValueOperand(), Offset, "insert");
3112     }
3113     V = convertValue(DL, IRB, V, NewAllocaTy);
3114     StoreInst *Store = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlign());
3115     Store->copyMetadata(SI, {LLVMContext::MD_mem_parallel_loop_access,
3116                              LLVMContext::MD_access_group});
3117     if (AATags)
3118       Store->setAAMetadata(AATags.adjustForAccess(NewBeginOffset - BeginOffset,
3119                                                   V->getType(), DL));
3120 
3121     migrateDebugInfo(&OldAI, IsSplit, NewBeginOffset * 8, SliceSize * 8, &SI,
3122                      Store, Store->getPointerOperand(),
3123                      Store->getValueOperand(), DL);
3124 
3125     Pass.DeadInsts.push_back(&SI);
3126     LLVM_DEBUG(dbgs() << "          to: " << *Store << "\n");
3127     return true;
3128   }
3129 
3130   bool visitStoreInst(StoreInst &SI) {
3131     LLVM_DEBUG(dbgs() << "    original: " << SI << "\n");
3132     Value *OldOp = SI.getOperand(1);
3133     assert(OldOp == OldPtr);
3134 
3135     AAMDNodes AATags = SI.getAAMetadata();
3136     Value *V = SI.getValueOperand();
3137 
3138     // Strip all inbounds GEPs and pointer casts to try to dig out any root
3139     // alloca that should be re-examined after promoting this alloca.
3140     if (V->getType()->isPointerTy())
3141       if (AllocaInst *AI = dyn_cast<AllocaInst>(V->stripInBoundsOffsets()))
3142         Pass.PostPromotionWorklist.insert(AI);
3143 
3144     TypeSize StoreSize = DL.getTypeStoreSize(V->getType());
3145     if (StoreSize.isFixed() && SliceSize < StoreSize.getFixedValue()) {
3146       assert(!SI.isVolatile());
3147       assert(V->getType()->isIntegerTy() &&
3148              "Only integer type loads and stores are split");
3149       assert(DL.typeSizeEqualsStoreSize(V->getType()) &&
3150              "Non-byte-multiple bit width");
3151       IntegerType *NarrowTy = Type::getIntNTy(SI.getContext(), SliceSize * 8);
3152       V = extractInteger(DL, IRB, V, NarrowTy, NewBeginOffset - BeginOffset,
3153                          "extract");
3154     }
3155 
3156     if (VecTy)
3157       return rewriteVectorizedStoreInst(V, SI, OldOp, AATags);
3158     if (IntTy && V->getType()->isIntegerTy())
3159       return rewriteIntegerStore(V, SI, AATags);
3160 
3161     StoreInst *NewSI;
3162     if (NewBeginOffset == NewAllocaBeginOffset &&
3163         NewEndOffset == NewAllocaEndOffset &&
3164         canConvertValue(DL, V->getType(), NewAllocaTy)) {
3165       V = convertValue(DL, IRB, V, NewAllocaTy);
3166       Value *NewPtr =
3167           getPtrToNewAI(SI.getPointerAddressSpace(), SI.isVolatile());
3168 
3169       NewSI =
3170           IRB.CreateAlignedStore(V, NewPtr, NewAI.getAlign(), SI.isVolatile());
3171     } else {
3172       unsigned AS = SI.getPointerAddressSpace();
3173       Value *NewPtr = getNewAllocaSlicePtr(IRB, IRB.getPtrTy(AS));
3174       NewSI =
3175           IRB.CreateAlignedStore(V, NewPtr, getSliceAlign(), SI.isVolatile());
3176     }
3177     NewSI->copyMetadata(SI, {LLVMContext::MD_mem_parallel_loop_access,
3178                              LLVMContext::MD_access_group});
3179     if (AATags)
3180       NewSI->setAAMetadata(AATags.adjustForAccess(NewBeginOffset - BeginOffset,
3181                                                   V->getType(), DL));
3182     if (SI.isVolatile())
3183       NewSI->setAtomic(SI.getOrdering(), SI.getSyncScopeID());
3184     if (NewSI->isAtomic())
3185       NewSI->setAlignment(SI.getAlign());
3186 
3187     migrateDebugInfo(&OldAI, IsSplit, NewBeginOffset * 8, SliceSize * 8, &SI,
3188                      NewSI, NewSI->getPointerOperand(),
3189                      NewSI->getValueOperand(), DL);
3190 
3191     Pass.DeadInsts.push_back(&SI);
3192     deleteIfTriviallyDead(OldOp);
3193 
3194     LLVM_DEBUG(dbgs() << "          to: " << *NewSI << "\n");
3195     return NewSI->getPointerOperand() == &NewAI &&
3196            NewSI->getValueOperand()->getType() == NewAllocaTy &&
3197            !SI.isVolatile();
3198   }
3199 
3200   /// Compute an integer value from splatting an i8 across the given
3201   /// number of bytes.
3202   ///
3203   /// Note that this routine assumes an i8 is a byte. If that isn't true, don't
3204   /// call this routine.
3205   /// FIXME: Heed the advice above.
3206   ///
3207   /// \param V The i8 value to splat.
3208   /// \param Size The number of bytes in the output (assuming i8 is one byte)
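  ///
  /// Worked example (illustrative): splatting the byte 0xAB across 4 bytes
  /// computes 0xAB * (0xFFFFFFFF udiv 0xFF) == 0xAB * 0x01010101
  /// == 0xABABABAB, entirely in i32 arithmetic.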
3209   Value *getIntegerSplat(Value *V, unsigned Size) {
3210     assert(Size > 0 && "Expected a positive number of bytes.");
3211     IntegerType *VTy = cast<IntegerType>(V->getType());
3212     assert(VTy->getBitWidth() == 8 && "Expected an i8 value for the byte");
3213     if (Size == 1)
3214       return V;
3215 
3216     Type *SplatIntTy = Type::getIntNTy(VTy->getContext(), Size * 8);
3217     V = IRB.CreateMul(
3218         IRB.CreateZExt(V, SplatIntTy, "zext"),
3219         IRB.CreateUDiv(Constant::getAllOnesValue(SplatIntTy),
3220                        IRB.CreateZExt(Constant::getAllOnesValue(V->getType()),
3221                                       SplatIntTy)),
3222         "isplat");
3223     return V;
3224   }
3225 
3226   /// Compute a vector splat for a given element value.
3227   Value *getVectorSplat(Value *V, unsigned NumElements) {
3228     V = IRB.CreateVectorSplat(NumElements, V, "vsplat");
3229     LLVM_DEBUG(dbgs() << "       splat: " << *V << "\n");
3230     return V;
3231   }
3232 
3233   bool visitMemSetInst(MemSetInst &II) {
3234     LLVM_DEBUG(dbgs() << "    original: " << II << "\n");
3235     assert(II.getRawDest() == OldPtr);
3236 
3237     AAMDNodes AATags = II.getAAMetadata();
3238 
3239     // If the memset has a variable size, it cannot be split, just adjust the
3240     // pointer to the new alloca.
3241     if (!isa<ConstantInt>(II.getLength())) {
3242       assert(!IsSplit);
3243       assert(NewBeginOffset == BeginOffset);
3244       II.setDest(getNewAllocaSlicePtr(IRB, OldPtr->getType()));
3245       II.setDestAlignment(getSliceAlign());
3246       // In theory we should call migrateDebugInfo here. However, we do not
3247       // emit dbg.assign intrinsics for mem intrinsics storing through non-
3248       // constant geps, or storing a variable number of bytes.
3249       assert(at::getAssignmentMarkers(&II).empty() &&
3250              at::getDVRAssignmentMarkers(&II).empty() &&
3251              "AT: Unexpected link to non-const GEP");
3252       deleteIfTriviallyDead(OldPtr);
3253       return false;
3254     }
3255 
3256     // Record this instruction for deletion.
3257     Pass.DeadInsts.push_back(&II);
3258 
3259     Type *AllocaTy = NewAI.getAllocatedType();
3260     Type *ScalarTy = AllocaTy->getScalarType();
3261 
3262     const bool CanContinue = [&]() {
3263       if (VecTy || IntTy)
3264         return true;
3265       if (BeginOffset > NewAllocaBeginOffset || EndOffset < NewAllocaEndOffset)
3266         return false;
3267       // Length must be in range for FixedVectorType.
3268       auto *C = cast<ConstantInt>(II.getLength());
3269       const uint64_t Len = C->getLimitedValue();
3270       if (Len > std::numeric_limits<unsigned>::max())
3271         return false;
3272       auto *Int8Ty = IntegerType::getInt8Ty(NewAI.getContext());
3273       auto *SrcTy = FixedVectorType::get(Int8Ty, Len);
3274       return canConvertValue(DL, SrcTy, AllocaTy) &&
3275              DL.isLegalInteger(DL.getTypeSizeInBits(ScalarTy).getFixedValue());
3276     }();
3277 
3278     // If this doesn't map cleanly onto the alloca type, and that type isn't
3279     // a single value type, just emit a memset.
3280     if (!CanContinue) {
3281       Type *SizeTy = II.getLength()->getType();
3282       unsigned Sz = NewEndOffset - NewBeginOffset;
3283       Constant *Size = ConstantInt::get(SizeTy, Sz);
3284       MemIntrinsic *New = cast<MemIntrinsic>(IRB.CreateMemSet(
3285           getNewAllocaSlicePtr(IRB, OldPtr->getType()), II.getValue(), Size,
3286           MaybeAlign(getSliceAlign()), II.isVolatile()));
3287       if (AATags)
3288         New->setAAMetadata(
3289             AATags.adjustForAccess(NewBeginOffset - BeginOffset, Sz));
3290 
3291       migrateDebugInfo(&OldAI, IsSplit, NewBeginOffset * 8, SliceSize * 8, &II,
3292                        New, New->getRawDest(), nullptr, DL);
3293 
3294       LLVM_DEBUG(dbgs() << "          to: " << *New << "\n");
3295       return false;
3296     }
3297 
3298     // If we can represent this as a simple value, we have to build the actual
3299     // value to store, which requires expanding the byte present in memset to
3300     // a sensible representation for the alloca type. This is essentially
3301     // splatting the byte to a sufficiently wide integer, splatting it across
3302     // any desired vector width, and bitcasting to the final type.
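    // For instance (illustrative types), a memset of the byte 0x01 over a
    // <4 x float> alloca becomes the i32 splat 0x01010101, a vector splat
    // to <4 x i32>, and a bitcast to <4 x float>.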
3303     Value *V;
3304 
3305     if (VecTy) {
3306       // If this is a memset of a vectorized alloca, insert it.
3307       assert(ElementTy == ScalarTy);
3308 
3309       unsigned BeginIndex = getIndex(NewBeginOffset);
3310       unsigned EndIndex = getIndex(NewEndOffset);
3311       assert(EndIndex > BeginIndex && "Empty vector!");
3312       unsigned NumElements = EndIndex - BeginIndex;
3313       assert(NumElements <= cast<FixedVectorType>(VecTy)->getNumElements() &&
3314              "Too many elements!");
3315 
3316       Value *Splat = getIntegerSplat(
3317           II.getValue(), DL.getTypeSizeInBits(ElementTy).getFixedValue() / 8);
3318       Splat = convertValue(DL, IRB, Splat, ElementTy);
3319       if (NumElements > 1)
3320         Splat = getVectorSplat(Splat, NumElements);
3321 
3322       Value *Old = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
3323                                          NewAI.getAlign(), "oldload");
3324       V = insertVector(IRB, Old, Splat, BeginIndex, "vec");
3325     } else if (IntTy) {
3326       // If this is a memset on an alloca where we can widen stores, insert the
3327       // set integer.
3328       assert(!II.isVolatile());
3329 
3330       uint64_t Size = NewEndOffset - NewBeginOffset;
3331       V = getIntegerSplat(II.getValue(), Size);
3332 
3333       if (IntTy && (BeginOffset != NewAllocaBeginOffset ||
3334                     EndOffset != NewAllocaEndOffset)) {
3335         Value *Old = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
3336                                            NewAI.getAlign(), "oldload");
3337         Old = convertValue(DL, IRB, Old, IntTy);
3338         uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
3339         V = insertInteger(DL, IRB, Old, V, Offset, "insert");
3340       } else {
3341         assert(V->getType() == IntTy &&
3342                "Wrong type for an alloca wide integer!");
3343       }
3344       V = convertValue(DL, IRB, V, AllocaTy);
3345     } else {
3346       // Established these invariants above.
3347       assert(NewBeginOffset == NewAllocaBeginOffset);
3348       assert(NewEndOffset == NewAllocaEndOffset);
3349 
3350       V = getIntegerSplat(II.getValue(),
3351                           DL.getTypeSizeInBits(ScalarTy).getFixedValue() / 8);
3352       if (VectorType *AllocaVecTy = dyn_cast<VectorType>(AllocaTy))
3353         V = getVectorSplat(
3354             V, cast<FixedVectorType>(AllocaVecTy)->getNumElements());
3355 
3356       V = convertValue(DL, IRB, V, AllocaTy);
3357     }
3358 
3359     Value *NewPtr = getPtrToNewAI(II.getDestAddressSpace(), II.isVolatile());
3360     StoreInst *New =
3361         IRB.CreateAlignedStore(V, NewPtr, NewAI.getAlign(), II.isVolatile());
3362     New->copyMetadata(II, {LLVMContext::MD_mem_parallel_loop_access,
3363                            LLVMContext::MD_access_group});
3364     if (AATags)
3365       New->setAAMetadata(AATags.adjustForAccess(NewBeginOffset - BeginOffset,
3366                                                 V->getType(), DL));
3367 
3368     migrateDebugInfo(&OldAI, IsSplit, NewBeginOffset * 8, SliceSize * 8, &II,
3369                      New, New->getPointerOperand(), V, DL);
3370 
3371     LLVM_DEBUG(dbgs() << "          to: " << *New << "\n");
3372     return !II.isVolatile();
3373   }
3374 
3375   bool visitMemTransferInst(MemTransferInst &II) {
3376     // Rewriting of memory transfer instructions can be a bit tricky. We break
3377     // them into two categories: split intrinsics and unsplit intrinsics.
3378 
3379     LLVM_DEBUG(dbgs() << "    original: " << II << "\n");
3380 
3381     AAMDNodes AATags = II.getAAMetadata();
3382 
3383     bool IsDest = &II.getRawDestUse() == OldUse;
3384     assert((IsDest && II.getRawDest() == OldPtr) ||
3385            (!IsDest && II.getRawSource() == OldPtr));
3386 
3387     Align SliceAlign = getSliceAlign();
3388     // For unsplit intrinsics, we simply modify the source and destination
3389     // pointers in place. This isn't just an optimization, it is a matter of
3390     // correctness. With unsplit intrinsics we may be dealing with transfers
3391     // within a single alloca before SROA ran, or with transfers that have
3392     // a variable length. We may also be dealing with memmove instead of
3393     // memcpy, and so simply updating the pointers is all that is necessary to
3394     // update both source and dest of a single call.
3395     if (!IsSplittable) {
3396       Value *AdjustedPtr = getNewAllocaSlicePtr(IRB, OldPtr->getType());
3397       if (IsDest) {
3398         // Update the address component of linked dbg.assigns.
3399         auto UpdateAssignAddress = [&](auto *DbgAssign) {
3400           if (llvm::is_contained(DbgAssign->location_ops(), II.getDest()) ||
3401               DbgAssign->getAddress() == II.getDest())
3402             DbgAssign->replaceVariableLocationOp(II.getDest(), AdjustedPtr);
3403         };
3404         for_each(at::getAssignmentMarkers(&II), UpdateAssignAddress);
3405         for_each(at::getDVRAssignmentMarkers(&II), UpdateAssignAddress);
3406         II.setDest(AdjustedPtr);
3407         II.setDestAlignment(SliceAlign);
3408       } else {
3409         II.setSource(AdjustedPtr);
3410         II.setSourceAlignment(SliceAlign);
3411       }
3412 
3413       LLVM_DEBUG(dbgs() << "          to: " << II << "\n");
3414       deleteIfTriviallyDead(OldPtr);
3415       return false;
3416     }
3417     // For split transfer intrinsics we have an incredibly useful assurance:
3418     // the source and destination do not reside within the same alloca, and at
3419     // least one of them does not escape. This means that we can replace
3420     // memmove with memcpy, and we don't need to worry about all manner of
3421     // downsides to splitting and transforming the operations.
3422 
3423     // If this doesn't map cleanly onto the alloca type, and that type isn't
3424     // a single value type, just emit a memcpy.
3425     bool EmitMemCpy =
3426         !VecTy && !IntTy &&
3427         (BeginOffset > NewAllocaBeginOffset || EndOffset < NewAllocaEndOffset ||
3428          SliceSize !=
3429              DL.getTypeStoreSize(NewAI.getAllocatedType()).getFixedValue() ||
3430          !DL.typeSizeEqualsStoreSize(NewAI.getAllocatedType()) ||
3431          !NewAI.getAllocatedType()->isSingleValueType());
3432 
3433     // If we're just going to emit a memcpy, the alloca hasn't changed, and the
3434     // size hasn't been shrunk based on analysis of the viable range, this is
3435     // a no-op.
3436     if (EmitMemCpy && &OldAI == &NewAI) {
3437       // Ensure the start lines up.
3438       assert(NewBeginOffset == BeginOffset);
3439 
3440       // Rewrite the size as needed.
3441       if (NewEndOffset != EndOffset)
3442         II.setLength(ConstantInt::get(II.getLength()->getType(),
3443                                       NewEndOffset - NewBeginOffset));
3444       return false;
3445     }
3446     // Record this instruction for deletion.
3447     Pass.DeadInsts.push_back(&II);
3448 
3449     // Strip all inbounds GEPs and pointer casts to try to dig out any root
3450     // alloca that should be re-examined after rewriting this instruction.
3451     Value *OtherPtr = IsDest ? II.getRawSource() : II.getRawDest();
3452     if (AllocaInst *AI =
3453             dyn_cast<AllocaInst>(OtherPtr->stripInBoundsOffsets())) {
3454       assert(AI != &OldAI && AI != &NewAI &&
3455              "Splittable transfers cannot reach the same alloca on both ends.");
3456       Pass.Worklist.insert(AI);
3457     }
3458 
3459     Type *OtherPtrTy = OtherPtr->getType();
3460     unsigned OtherAS = OtherPtrTy->getPointerAddressSpace();
3461 
3462     // Compute the relative offset for the other pointer within the transfer.
3463     unsigned OffsetWidth = DL.getIndexSizeInBits(OtherAS);
3464     APInt OtherOffset(OffsetWidth, NewBeginOffset - BeginOffset);
3465     Align OtherAlign =
3466         (IsDest ? II.getSourceAlign() : II.getDestAlign()).valueOrOne();
3467     OtherAlign =
3468         commonAlignment(OtherAlign, OtherOffset.zextOrTrunc(64).getZExtValue());
3469 
3470     if (EmitMemCpy) {
3471       // Compute the other pointer, folding as much as possible to produce
3472       // a single, simple GEP in most cases.
3473       OtherPtr = getAdjustedPtr(IRB, DL, OtherPtr, OtherOffset, OtherPtrTy,
3474                                 OtherPtr->getName() + ".");
3475 
3476       Value *OurPtr = getNewAllocaSlicePtr(IRB, OldPtr->getType());
3477       Type *SizeTy = II.getLength()->getType();
3478       Constant *Size = ConstantInt::get(SizeTy, NewEndOffset - NewBeginOffset);
3479 
3480       Value *DestPtr, *SrcPtr;
3481       MaybeAlign DestAlign, SrcAlign;
3482       // Note: IsDest is true iff we're copying into the new alloca slice
3483       if (IsDest) {
3484         DestPtr = OurPtr;
3485         DestAlign = SliceAlign;
3486         SrcPtr = OtherPtr;
3487         SrcAlign = OtherAlign;
3488       } else {
3489         DestPtr = OtherPtr;
3490         DestAlign = OtherAlign;
3491         SrcPtr = OurPtr;
3492         SrcAlign = SliceAlign;
3493       }
3494       CallInst *New = IRB.CreateMemCpy(DestPtr, DestAlign, SrcPtr, SrcAlign,
3495                                        Size, II.isVolatile());
3496       if (AATags)
3497         New->setAAMetadata(AATags.shift(NewBeginOffset - BeginOffset));
3498 
3499       APInt Offset(DL.getIndexTypeSizeInBits(DestPtr->getType()), 0);
3500       if (IsDest) {
3501         migrateDebugInfo(&OldAI, IsSplit, NewBeginOffset * 8, SliceSize * 8,
3502                          &II, New, DestPtr, nullptr, DL);
3503       } else if (AllocaInst *Base = dyn_cast<AllocaInst>(
3504                      DestPtr->stripAndAccumulateConstantOffsets(
3505                          DL, Offset, /*AllowNonInbounds*/ true))) {
3506         migrateDebugInfo(Base, IsSplit, Offset.getZExtValue() * 8,
3507                          SliceSize * 8, &II, New, DestPtr, nullptr, DL);
3508       }
3509       LLVM_DEBUG(dbgs() << "          to: " << *New << "\n");
3510       return false;
3511     }
3512 
3513     bool IsWholeAlloca = NewBeginOffset == NewAllocaBeginOffset &&
3514                          NewEndOffset == NewAllocaEndOffset;
3515     uint64_t Size = NewEndOffset - NewBeginOffset;
3516     unsigned BeginIndex = VecTy ? getIndex(NewBeginOffset) : 0;
3517     unsigned EndIndex = VecTy ? getIndex(NewEndOffset) : 0;
3518     unsigned NumElements = EndIndex - BeginIndex;
3519     IntegerType *SubIntTy =
3520         IntTy ? Type::getIntNTy(IntTy->getContext(), Size * 8) : nullptr;
3521 
3522     // Reset the other pointer type to match the register type we're going to
3523     // use, but using the address space of the original other pointer.
3524     Type *OtherTy;
3525     if (VecTy && !IsWholeAlloca) {
3526       if (NumElements == 1)
3527         OtherTy = VecTy->getElementType();
3528       else
3529         OtherTy = FixedVectorType::get(VecTy->getElementType(), NumElements);
3530     } else if (IntTy && !IsWholeAlloca) {
3531       OtherTy = SubIntTy;
3532     } else {
3533       OtherTy = NewAllocaTy;
3534     }
3535 
3536     Value *AdjPtr = getAdjustedPtr(IRB, DL, OtherPtr, OtherOffset, OtherPtrTy,
3537                                    OtherPtr->getName() + ".");
3538     MaybeAlign SrcAlign = OtherAlign;
3539     MaybeAlign DstAlign = SliceAlign;
3540     if (!IsDest)
3541       std::swap(SrcAlign, DstAlign);
3542 
3543     Value *SrcPtr;
3544     Value *DstPtr;
3545 
3546     if (IsDest) {
3547       DstPtr = getPtrToNewAI(II.getDestAddressSpace(), II.isVolatile());
3548       SrcPtr = AdjPtr;
3549     } else {
3550       DstPtr = AdjPtr;
3551       SrcPtr = getPtrToNewAI(II.getSourceAddressSpace(), II.isVolatile());
3552     }
3553 
3554     Value *Src;
3555     if (VecTy && !IsWholeAlloca && !IsDest) {
3556       Src = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
3557                                   NewAI.getAlign(), "load");
3558       Src = extractVector(IRB, Src, BeginIndex, EndIndex, "vec");
3559     } else if (IntTy && !IsWholeAlloca && !IsDest) {
3560       Src = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
3561                                   NewAI.getAlign(), "load");
3562       Src = convertValue(DL, IRB, Src, IntTy);
3563       uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
3564       Src = extractInteger(DL, IRB, Src, SubIntTy, Offset, "extract");
3565     } else {
3566       LoadInst *Load = IRB.CreateAlignedLoad(OtherTy, SrcPtr, SrcAlign,
3567                                              II.isVolatile(), "copyload");
3568       Load->copyMetadata(II, {LLVMContext::MD_mem_parallel_loop_access,
3569                               LLVMContext::MD_access_group});
3570       if (AATags)
3571         Load->setAAMetadata(AATags.adjustForAccess(NewBeginOffset - BeginOffset,
3572                                                    Load->getType(), DL));
3573       Src = Load;
3574     }
3575 
3576     if (VecTy && !IsWholeAlloca && IsDest) {
3577       Value *Old = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
3578                                          NewAI.getAlign(), "oldload");
3579       Src = insertVector(IRB, Old, Src, BeginIndex, "vec");
3580     } else if (IntTy && !IsWholeAlloca && IsDest) {
3581       Value *Old = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
3582                                          NewAI.getAlign(), "oldload");
3583       Old = convertValue(DL, IRB, Old, IntTy);
3584       uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
3585       Src = insertInteger(DL, IRB, Old, Src, Offset, "insert");
3586       Src = convertValue(DL, IRB, Src, NewAllocaTy);
3587     }
3588 
3589     StoreInst *Store = cast<StoreInst>(
3590         IRB.CreateAlignedStore(Src, DstPtr, DstAlign, II.isVolatile()));
3591     Store->copyMetadata(II, {LLVMContext::MD_mem_parallel_loop_access,
3592                              LLVMContext::MD_access_group});
3593     if (AATags)
3594       Store->setAAMetadata(AATags.adjustForAccess(NewBeginOffset - BeginOffset,
3595                                                   Src->getType(), DL));
3596 
3597     APInt Offset(DL.getIndexTypeSizeInBits(DstPtr->getType()), 0);
3598     if (IsDest) {
3599 
3600       migrateDebugInfo(&OldAI, IsSplit, NewBeginOffset * 8, SliceSize * 8, &II,
3601                        Store, DstPtr, Src, DL);
3602     } else if (AllocaInst *Base = dyn_cast<AllocaInst>(
3603                    DstPtr->stripAndAccumulateConstantOffsets(
3604                        DL, Offset, /*AllowNonInbounds*/ true))) {
3605       migrateDebugInfo(Base, IsSplit, Offset.getZExtValue() * 8, SliceSize * 8,
3606                        &II, Store, DstPtr, Src, DL);
3607     }
3608 
3609     LLVM_DEBUG(dbgs() << "          to: " << *Store << "\n");
3610     return !II.isVolatile();
3611   }
3612 
3613   bool visitIntrinsicInst(IntrinsicInst &II) {
3614     assert((II.isLifetimeStartOrEnd() || II.isDroppable()) &&
3615            "Unexpected intrinsic!");
3616     LLVM_DEBUG(dbgs() << "    original: " << II << "\n");
3617 
3618     // Record this instruction for deletion.
3619     Pass.DeadInsts.push_back(&II);
3620 
3621     if (II.isDroppable()) {
3622       assert(II.getIntrinsicID() == Intrinsic::assume && "Expected assume");
3623       // TODO For now we forget assumed information, this can be improved.
3624       OldPtr->dropDroppableUsesIn(II);
3625       return true;
3626     }
3627 
3628     assert(II.getArgOperand(1) == OldPtr);
3629     // Lifetime intrinsics are only promotable if they cover the whole alloca.
3630     // Therefore, we drop lifetime intrinsics which don't cover the whole
3631     // alloca.
3632     // (In theory, intrinsics which partially cover an alloca could be
3633     // promoted, but PromoteMemToReg doesn't handle that case.)
3634     // FIXME: Check whether the alloca is promotable before dropping the
3635     // lifetime intrinsics?
3636     if (NewBeginOffset != NewAllocaBeginOffset ||
3637         NewEndOffset != NewAllocaEndOffset)
3638       return true;
3639 
3640     ConstantInt *Size =
3641         ConstantInt::get(cast<IntegerType>(II.getArgOperand(0)->getType()),
3642                          NewEndOffset - NewBeginOffset);
3643     // Lifetime intrinsics expect a pointer argument in the alloca's address
3644     // space, so directly get such a pointer for the new alloca slice.
3645     Type *PointerTy = IRB.getPtrTy(OldPtr->getType()->getPointerAddressSpace());
3646     Value *Ptr = getNewAllocaSlicePtr(IRB, PointerTy);
3647     Value *New;
3648     if (II.getIntrinsicID() == Intrinsic::lifetime_start)
3649       New = IRB.CreateLifetimeStart(Ptr, Size);
3650     else
3651       New = IRB.CreateLifetimeEnd(Ptr, Size);
3652 
3653     (void)New;
3654     LLVM_DEBUG(dbgs() << "          to: " << *New << "\n");
3655 
3656     return true;
3657   }
3658 
3659   void fixLoadStoreAlign(Instruction &Root) {
3660     // This algorithm implements the same visitor loop as
3661     // hasUnsafePHIOrSelectUse, and fixes the alignment of each load
3662     // or store found.
3663     SmallPtrSet<Instruction *, 4> Visited;
3664     SmallVector<Instruction *, 4> Uses;
3665     Visited.insert(&Root);
3666     Uses.push_back(&Root);
3667     do {
3668       Instruction *I = Uses.pop_back_val();
3669 
3670       if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
3671         LI->setAlignment(std::min(LI->getAlign(), getSliceAlign()));
3672         continue;
3673       }
3674       if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
3675         SI->setAlignment(std::min(SI->getAlign(), getSliceAlign()));
3676         continue;
3677       }
3678 
3679       assert(isa<BitCastInst>(I) || isa<AddrSpaceCastInst>(I) ||
3680              isa<PHINode>(I) || isa<SelectInst>(I) ||
3681              isa<GetElementPtrInst>(I));
3682       for (User *U : I->users())
3683         if (Visited.insert(cast<Instruction>(U)).second)
3684           Uses.push_back(cast<Instruction>(U));
3685     } while (!Uses.empty());
3686   }
3687 
3688   bool visitPHINode(PHINode &PN) {
3689     LLVM_DEBUG(dbgs() << "    original: " << PN << "\n");
3690     assert(BeginOffset >= NewAllocaBeginOffset && "PHIs are unsplittable");
3691     assert(EndOffset <= NewAllocaEndOffset && "PHIs are unsplittable");
3692 
3693     // We would like to compute a new pointer in only one place, but have it be
3694     // as local as possible to the PHI. To do that, we re-use the location of
3695     // the old pointer, which necessarily must be in the right position to
3696     // dominate the PHI.
3697     IRBuilderBase::InsertPointGuard Guard(IRB);
3698     if (isa<PHINode>(OldPtr))
3699       IRB.SetInsertPoint(OldPtr->getParent(),
3700                          OldPtr->getParent()->getFirstInsertionPt());
3701     else
3702       IRB.SetInsertPoint(OldPtr);
3703     IRB.SetCurrentDebugLocation(OldPtr->getDebugLoc());
3704 
3705     Value *NewPtr = getNewAllocaSlicePtr(IRB, OldPtr->getType());
3706     // Replace the operands which were using the old pointer.
3707     std::replace(PN.op_begin(), PN.op_end(), cast<Value>(OldPtr), NewPtr);
3708 
3709     LLVM_DEBUG(dbgs() << "          to: " << PN << "\n");
3710     deleteIfTriviallyDead(OldPtr);
3711 
3712     // Fix the alignment of any loads or stores using this PHI node.
3713     fixLoadStoreAlign(PN);
3714 
3715     // PHIs can't be promoted on their own, but often can be speculated. We
3716     // check the speculation outside of the rewriter so that we see the
3717     // fully-rewritten alloca.
3718     PHIUsers.insert(&PN);
3719     return true;
3720   }
3721 
3722   bool visitSelectInst(SelectInst &SI) {
3723     LLVM_DEBUG(dbgs() << "    original: " << SI << "\n");
3724     assert((SI.getTrueValue() == OldPtr || SI.getFalseValue() == OldPtr) &&
3725            "Pointer isn't an operand!");
3726     assert(BeginOffset >= NewAllocaBeginOffset && "Selects are unsplittable");
3727     assert(EndOffset <= NewAllocaEndOffset && "Selects are unsplittable");
3728 
3729     Value *NewPtr = getNewAllocaSlicePtr(IRB, OldPtr->getType());
3730     // Replace the operands which were using the old pointer.
3731     if (SI.getOperand(1) == OldPtr)
3732       SI.setOperand(1, NewPtr);
3733     if (SI.getOperand(2) == OldPtr)
3734       SI.setOperand(2, NewPtr);
3735 
3736     LLVM_DEBUG(dbgs() << "          to: " << SI << "\n");
3737     deleteIfTriviallyDead(OldPtr);
3738 
3739     // Fix the alignment of any loads or stores using this select.
3740     fixLoadStoreAlign(SI);
3741 
3742     // Selects can't be promoted on their own, but often can be speculated. We
3743     // check the speculation outside of the rewriter so that we see the
3744     // fully-rewritten alloca.
3745     SelectUsers.insert(&SI);
3746     return true;
3747   }
3748 };
3749 
3750 /// Visitor to rewrite aggregate loads and stores as scalar.
3751 ///
3752 /// This pass aggressively rewrites all aggregate loads and stores on
3753 /// a particular pointer (or any pointer derived from it which we can identify)
3754 /// with scalar loads and stores.
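///
/// As a rough sketch of the rewrite (the value names follow the ".fca"
/// naming scheme used below), an aggregate load such as:
///
///   %v = load { i32, float }, ptr %p
///
/// becomes per-element GEPs, scalar loads, and insertvalues:
///
///   %v.fca.0.gep = getelementptr inbounds { i32, float }, ptr %p, i32 0, i32 0
///   %v.fca.0.load = load i32, ptr %v.fca.0.gep
///   %v.fca.0.insert = insertvalue { i32, float } poison, i32 %v.fca.0.load, 0
///   %v.fca.1.gep = getelementptr inbounds { i32, float }, ptr %p, i32 0, i32 1
///   %v.fca.1.load = load float, ptr %v.fca.1.gep
///   %v.fca.1.insert = insertvalue { i32, float } %v.fca.0.insert,
///                                                float %v.fca.1.load, 1
///
/// Aggregate stores are rewritten symmetrically using extractvalue followed
/// by scalar stores.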
3755 class AggLoadStoreRewriter : public InstVisitor<AggLoadStoreRewriter, bool> {
3756   // Befriend the base class so it can delegate to private visit methods.
3757   friend class InstVisitor<AggLoadStoreRewriter, bool>;
3758 
3759   /// Queue of pointer uses to analyze and potentially rewrite.
3760   SmallVector<Use *, 8> Queue;
3761 
3762   /// Set to prevent us from cycling with phi nodes and loops.
3763   SmallPtrSet<User *, 8> Visited;
3764 
3765   /// The current pointer use being rewritten. This is used to dig up the used
3766   /// value (as opposed to the user).
3767   Use *U = nullptr;
3768 
3769   /// Used to calculate offsets, and hence alignment, of subobjects.
3770   const DataLayout &DL;
3771 
3772   IRBuilderTy &IRB;
3773 
3774 public:
3775   AggLoadStoreRewriter(const DataLayout &DL, IRBuilderTy &IRB)
3776       : DL(DL), IRB(IRB) {}
3777 
3778   /// Rewrite loads and stores through a pointer and all pointers derived from
3779   /// it.
3780   bool rewrite(Instruction &I) {
3781     LLVM_DEBUG(dbgs() << "  Rewriting FCA loads and stores...\n");
3782     enqueueUsers(I);
3783     bool Changed = false;
3784     while (!Queue.empty()) {
3785       U = Queue.pop_back_val();
3786       Changed |= visit(cast<Instruction>(U->getUser()));
3787     }
3788     return Changed;
3789   }
3790 
3791 private:
3792   /// Enqueue all the users of the given instruction for further processing.
3793   /// This uses a set to de-duplicate users.
3794   void enqueueUsers(Instruction &I) {
3795     for (Use &U : I.uses())
3796       if (Visited.insert(U.getUser()).second)
3797         Queue.push_back(&U);
3798   }
3799 
3800   // Conservative default is to not rewrite anything.
3801   bool visitInstruction(Instruction &I) { return false; }
3802 
3803   /// Generic recursive split emission class.
3804   template <typename Derived> class OpSplitter {
3805   protected:
3806     /// The builder used to form new instructions.
3807     IRBuilderTy &IRB;
3808 
3809     /// The indices to be used with insertvalue or extractvalue to select
3810     /// the appropriate value within the aggregate.
3811     SmallVector<unsigned, 4> Indices;
3812 
3813     /// The indices to a GEP instruction which will move Ptr to the correct slot
3814     /// within the aggregate.
3815     SmallVector<Value *, 4> GEPIndices;
3816 
3817     /// The base pointer of the original op, used as a base for GEPing the
3818     /// split operations.
3819     Value *Ptr;
3820 
3821     /// The base pointee type being GEPed into.
3822     Type *BaseTy;
3823 
3824     /// Known alignment of the base pointer.
3825     Align BaseAlign;
3826 
3827     /// Used to calculate the offset of each component so we can correctly
3828     /// deduce alignments.
3829     const DataLayout &DL;
3830 
3831     /// Initialize the splitter with an insertion point, Ptr and start with a
3832     /// single zero GEP index.
3833     OpSplitter(Instruction *InsertionPoint, Value *Ptr, Type *BaseTy,
3834                Align BaseAlign, const DataLayout &DL, IRBuilderTy &IRB)
3835         : IRB(IRB), GEPIndices(1, IRB.getInt32(0)), Ptr(Ptr), BaseTy(BaseTy),
3836           BaseAlign(BaseAlign), DL(DL) {
3837       IRB.SetInsertPoint(InsertionPoint);
3838     }
3839 
3840   public:
3841     /// Generic recursive split emission routine.
3842     ///
3843     /// This method recursively splits an aggregate op (load or store) into
3844     /// scalar or vector ops. It splits recursively until it hits a single value
3845     /// and emits that single value operation via the template argument.
3846     ///
3847     /// The logic of this routine relies on GEPs and insertvalue and
3848     /// extractvalue all operating with the same fundamental index list, merely
3849     /// formatted differently (GEPs need actual values).
3850     ///
3851     /// \param Ty  The type being split recursively into smaller ops.
3852     /// \param Agg The aggregate value being built up or stored, depending on
3853     /// whether this is splitting a load or a store respectively.
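    ///
    /// For example, splitting an op on { i32, [2 x float] } visits leaves
    /// with Indices {0}, {1, 0}, and {1, 1}; the corresponding GEPIndices
    /// are (0, 0), (0, 1, 0), and (0, 1, 1), i.e. the same lists prefixed
    /// with the single zero index seeded by the constructor.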
3854     void emitSplitOps(Type *Ty, Value *&Agg, const Twine &Name) {
3855       if (Ty->isSingleValueType()) {
3856         unsigned Offset = DL.getIndexedOffsetInType(BaseTy, GEPIndices);
3857         return static_cast<Derived *>(this)->emitFunc(
3858             Ty, Agg, commonAlignment(BaseAlign, Offset), Name);
3859       }
3860 
3861       if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
3862         unsigned OldSize = Indices.size();
3863         (void)OldSize;
3864         for (unsigned Idx = 0, Size = ATy->getNumElements(); Idx != Size;
3865              ++Idx) {
3866           assert(Indices.size() == OldSize && "Did not return to the old size");
3867           Indices.push_back(Idx);
3868           GEPIndices.push_back(IRB.getInt32(Idx));
3869           emitSplitOps(ATy->getElementType(), Agg, Name + "." + Twine(Idx));
3870           GEPIndices.pop_back();
3871           Indices.pop_back();
3872         }
3873         return;
3874       }
3875 
3876       if (StructType *STy = dyn_cast<StructType>(Ty)) {
3877         unsigned OldSize = Indices.size();
3878         (void)OldSize;
3879         for (unsigned Idx = 0, Size = STy->getNumElements(); Idx != Size;
3880              ++Idx) {
3881           assert(Indices.size() == OldSize && "Did not return to the old size");
3882           Indices.push_back(Idx);
3883           GEPIndices.push_back(IRB.getInt32(Idx));
3884           emitSplitOps(STy->getElementType(Idx), Agg, Name + "." + Twine(Idx));
3885           GEPIndices.pop_back();
3886           Indices.pop_back();
3887         }
3888         return;
3889       }
3890 
3891       llvm_unreachable("Only arrays and structs are aggregate loadable types");
3892     }
3893   };
3894 
3895   struct LoadOpSplitter : public OpSplitter<LoadOpSplitter> {
3896     AAMDNodes AATags;
3897     // A vector to hold the split components that we want to emit
3898     // separate fake uses for.
3899     SmallVector<Value *, 4> Components;
3900     // A vector to hold all the fake uses of the struct that we are splitting.
3901     // Usually there should only be one, but we are handling the general case.
3902     SmallVector<Instruction *, 1> FakeUses;
3903 
3904     LoadOpSplitter(Instruction *InsertionPoint, Value *Ptr, Type *BaseTy,
3905                    AAMDNodes AATags, Align BaseAlign, const DataLayout &DL,
3906                    IRBuilderTy &IRB)
3907         : OpSplitter<LoadOpSplitter>(InsertionPoint, Ptr, BaseTy, BaseAlign, DL,
3908                                      IRB),
3909           AATags(AATags) {}
3910 
3911     /// Emit a leaf load of a single value. This is called at the leaves of the
3912     /// recursive emission to actually load values.
3913     void emitFunc(Type *Ty, Value *&Agg, Align Alignment, const Twine &Name) {
3914       assert(Ty->isSingleValueType());
3915       // Load the single value and insert it using the indices.
3916       Value *GEP =
3917           IRB.CreateInBoundsGEP(BaseTy, Ptr, GEPIndices, Name + ".gep");
3918       LoadInst *Load =
3919           IRB.CreateAlignedLoad(Ty, GEP, Alignment, Name + ".load");
3920 
3921       APInt Offset(
3922           DL.getIndexSizeInBits(Ptr->getType()->getPointerAddressSpace()), 0);
3923       if (AATags &&
3924           GEPOperator::accumulateConstantOffset(BaseTy, GEPIndices, DL, Offset))
3925         Load->setAAMetadata(
3926             AATags.adjustForAccess(Offset.getZExtValue(), Load->getType(), DL));
3927       // Record the load so we can generate a fake use for this aggregate
3928       // component.
3929       Components.push_back(Load);
3930 
3931       Agg = IRB.CreateInsertValue(Agg, Load, Indices, Name + ".insert");
3932       LLVM_DEBUG(dbgs() << "          to: " << *Load << "\n");
3933     }
3934 
3935     // Stash the fake uses that use the value generated by this instruction.
3936     void recordFakeUses(LoadInst &LI) {
3937       for (Use &U : LI.uses())
3938         if (auto *II = dyn_cast<IntrinsicInst>(U.getUser()))
3939           if (II->getIntrinsicID() == Intrinsic::fake_use)
3940             FakeUses.push_back(II);
3941     }
3942 
3943     // Replace all fake uses of the aggregate with a series of fake uses, one
3944     // for each split component.
3945     void emitFakeUses() {
3946       for (Instruction *I : FakeUses) {
3947         IRB.SetInsertPoint(I);
3948         for (auto *V : Components)
3949           IRB.CreateIntrinsic(Intrinsic::fake_use, {V});
3950         I->eraseFromParent();
3951       }
3952     }
3953   };
3954 
3955   bool visitLoadInst(LoadInst &LI) {
3956     assert(LI.getPointerOperand() == *U);
3957     if (!LI.isSimple() || LI.getType()->isSingleValueType())
3958       return false;
3959 
3960     // We have an aggregate being loaded, split it apart.
3961     LLVM_DEBUG(dbgs() << "    original: " << LI << "\n");
3962     LoadOpSplitter Splitter(&LI, *U, LI.getType(), LI.getAAMetadata(),
3963                             getAdjustedAlignment(&LI, 0), DL, IRB);
3964     Splitter.recordFakeUses(LI);
3965     Value *V = PoisonValue::get(LI.getType());
3966     Splitter.emitSplitOps(LI.getType(), V, LI.getName() + ".fca");
3967     Splitter.emitFakeUses();
3968     Visited.erase(&LI);
3969     LI.replaceAllUsesWith(V);
3970     LI.eraseFromParent();
3971     return true;
3972   }
3973 
3974   struct StoreOpSplitter : public OpSplitter<StoreOpSplitter> {
3975     StoreOpSplitter(Instruction *InsertionPoint, Value *Ptr, Type *BaseTy,
3976                     AAMDNodes AATags, StoreInst *AggStore, Align BaseAlign,
3977                     const DataLayout &DL, IRBuilderTy &IRB)
3978         : OpSplitter<StoreOpSplitter>(InsertionPoint, Ptr, BaseTy, BaseAlign,
3979                                       DL, IRB),
3980           AATags(AATags), AggStore(AggStore) {}
3981     AAMDNodes AATags;
3982     StoreInst *AggStore;
3983     /// Emit a leaf store of a single value. This is called at the leaves of the
3984     /// recursive emission to actually produce stores.
3985     void emitFunc(Type *Ty, Value *&Agg, Align Alignment, const Twine &Name) {
3986       assert(Ty->isSingleValueType());
3987       // Extract the single value and store it using the indices.
3988       //
3989       // The gep and extractvalue values are factored out of the CreateStore
3990       // call to make the output independent of the argument evaluation order.
3991       Value *ExtractValue =
3992           IRB.CreateExtractValue(Agg, Indices, Name + ".extract");
3993       Value *InBoundsGEP =
3994           IRB.CreateInBoundsGEP(BaseTy, Ptr, GEPIndices, Name + ".gep");
3995       StoreInst *Store =
3996           IRB.CreateAlignedStore(ExtractValue, InBoundsGEP, Alignment);
3997 
3998       APInt Offset(
3999           DL.getIndexSizeInBits(Ptr->getType()->getPointerAddressSpace()), 0);
4000       GEPOperator::accumulateConstantOffset(BaseTy, GEPIndices, DL, Offset);
4001       if (AATags) {
4002         Store->setAAMetadata(AATags.adjustForAccess(
4003             Offset.getZExtValue(), ExtractValue->getType(), DL));
4004       }
4005 
4006       // migrateDebugInfo requires the base Alloca. Walk to it from this gep.
4007       // If we cannot (because there's an intervening non-const or unbounded
4008       // gep) then we wouldn't expect to see dbg.assign intrinsics linked to
4009       // this instruction.
4010       Value *Base = AggStore->getPointerOperand()->stripInBoundsOffsets();
4011       if (auto *OldAI = dyn_cast<AllocaInst>(Base)) {
4012         uint64_t SizeInBits =
4013             DL.getTypeSizeInBits(Store->getValueOperand()->getType());
4014         migrateDebugInfo(OldAI, /*IsSplit*/ true, Offset.getZExtValue() * 8,
4015                          SizeInBits, AggStore, Store,
4016                          Store->getPointerOperand(), Store->getValueOperand(),
4017                          DL);
4018       } else {
4019         assert(at::getAssignmentMarkers(Store).empty() &&
4020                at::getDVRAssignmentMarkers(Store).empty() &&
4021                "AT: unexpected debug.assign linked to store through "
4022                "unbounded GEP");
4023       }
4024       LLVM_DEBUG(dbgs() << "          to: " << *Store << "\n");
4025     }
4026   };
4027 
4028   bool visitStoreInst(StoreInst &SI) {
4029     if (!SI.isSimple() || SI.getPointerOperand() != *U)
4030       return false;
4031     Value *V = SI.getValueOperand();
4032     if (V->getType()->isSingleValueType())
4033       return false;
4034 
4035     // We have an aggregate being stored, split it apart.
4036     LLVM_DEBUG(dbgs() << "    original: " << SI << "\n");
4037     StoreOpSplitter Splitter(&SI, *U, V->getType(), SI.getAAMetadata(), &SI,
4038                              getAdjustedAlignment(&SI, 0), DL, IRB);
4039     Splitter.emitSplitOps(V->getType(), V, V->getName() + ".fca");
4040     Visited.erase(&SI);
4041     // The stores replacing SI each have markers describing fragments of the
4042     // assignment so delete the assignment markers linked to SI.
4043     at::deleteAssignmentMarkers(&SI);
4044     SI.eraseFromParent();
4045     return true;
4046   }
4047 
4048   bool visitBitCastInst(BitCastInst &BC) {
4049     enqueueUsers(BC);
4050     return false;
4051   }
4052 
4053   bool visitAddrSpaceCastInst(AddrSpaceCastInst &ASC) {
4054     enqueueUsers(ASC);
4055     return false;
4056   }
4057 
4058   // Unfold gep (select cond, ptr1, ptr2), idx
4059   //   => select cond, gep(ptr1, idx), gep(ptr2, idx)
4060   // and  gep ptr, (select cond, idx1, idx2)
4061   //   => select cond, gep(ptr, idx1), gep(ptr, idx2)
4062   // We also allow for i1 zext indices, which are equivalent to selects.
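  //
  // For example (a sketch; the names match the ".sroa.gep" and ".sroa.sel"
  // suffixes appended below):
  //   %sel = select i1 %cond, ptr %a, ptr %b
  //   %gep = getelementptr i32, ptr %sel, i64 1
  // becomes:
  //   %a.sroa.gep = getelementptr i32, ptr %a, i64 1
  //   %b.sroa.gep = getelementptr i32, ptr %b, i64 1
  //   %sel.sroa.sel = select i1 %cond, ptr %a.sroa.gep, ptr %b.sroa.gep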
4063   bool unfoldGEPSelect(GetElementPtrInst &GEPI) {
4064     // Check whether the GEP has exactly one select operand and all indices
4065     // will become constant after the transform.
4066     Instruction *Sel = dyn_cast<SelectInst>(GEPI.getPointerOperand());
4067     for (Value *Op : GEPI.indices()) {
4068       if (auto *SI = dyn_cast<SelectInst>(Op)) {
4069         if (Sel)
4070           return false;
4071 
4072         Sel = SI;
4073         if (!isa<ConstantInt>(SI->getTrueValue()) ||
4074             !isa<ConstantInt>(SI->getFalseValue()))
4075           return false;
4076         continue;
4077       }
4078       if (auto *ZI = dyn_cast<ZExtInst>(Op)) {
4079         if (Sel)
4080           return false;
4081         Sel = ZI;
4082         if (!ZI->getSrcTy()->isIntegerTy(1))
4083           return false;
4084         continue;
4085       }
4086 
4087       if (!isa<ConstantInt>(Op))
4088         return false;
4089     }
4090 
4091     if (!Sel)
4092       return false;
4093 
4094     LLVM_DEBUG(dbgs() << "  Rewriting gep(select) -> select(gep):\n";
4095                dbgs() << "    original: " << *Sel << "\n";
4096                dbgs() << "              " << GEPI << "\n";);
4097 
4098     auto GetNewOps = [&](Value *SelOp) {
4099       SmallVector<Value *> NewOps;
4100       for (Value *Op : GEPI.operands())
4101         if (Op == Sel)
4102           NewOps.push_back(SelOp);
4103         else
4104           NewOps.push_back(Op);
4105       return NewOps;
4106     };
4107 
4108     Value *Cond, *True, *False;
4109     if (auto *SI = dyn_cast<SelectInst>(Sel)) {
4110       Cond = SI->getCondition();
4111       True = SI->getTrueValue();
4112       False = SI->getFalseValue();
4113     } else {
4114       Cond = Sel->getOperand(0);
4115       True = ConstantInt::get(Sel->getType(), 1);
4116       False = ConstantInt::get(Sel->getType(), 0);
4117     }
4118     SmallVector<Value *> TrueOps = GetNewOps(True);
4119     SmallVector<Value *> FalseOps = GetNewOps(False);
4120 
4121     IRB.SetInsertPoint(&GEPI);
4122     GEPNoWrapFlags NW = GEPI.getNoWrapFlags();
4123 
4124     Type *Ty = GEPI.getSourceElementType();
4125     Value *NTrue = IRB.CreateGEP(Ty, TrueOps[0], ArrayRef(TrueOps).drop_front(),
4126                                  True->getName() + ".sroa.gep", NW);
4127 
4128     Value *NFalse =
4129         IRB.CreateGEP(Ty, FalseOps[0], ArrayRef(FalseOps).drop_front(),
4130                       False->getName() + ".sroa.gep", NW);
4131 
4132     Value *NSel =
4133         IRB.CreateSelect(Cond, NTrue, NFalse, Sel->getName() + ".sroa.sel");
4134     Visited.erase(&GEPI);
4135     GEPI.replaceAllUsesWith(NSel);
4136     GEPI.eraseFromParent();
4137     Instruction *NSelI = cast<Instruction>(NSel);
4138     Visited.insert(NSelI);
4139     enqueueUsers(*NSelI);
4140 
4141     LLVM_DEBUG(dbgs() << "          to: " << *NTrue << "\n";
4142                dbgs() << "              " << *NFalse << "\n";
4143                dbgs() << "              " << *NSel << "\n";);
4144 
4145     return true;
4146   }
4147 
4148   // Unfold gep (phi ptr1, ptr2), idx
4149   //   => phi ((gep ptr1, idx), (gep ptr2, idx))
4150   // and  gep ptr, (phi idx1, idx2)
4151   //   => phi ((gep ptr, idx1), (gep ptr, idx2))
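  //
  // For example (a sketch; the names match the ".sroa" suffixes appended
  // below), given:
  //   %phi = phi ptr [ %a, %bb1 ], [ %b, %bb2 ]
  //   %gep = getelementptr i32, ptr %phi, i64 1
  // we emit one GEP per incoming pointer at the end of the entry block and
  // merge them with a new phi:
  //   %phi.sroa.gep = getelementptr i32, ptr %a, i64 1
  //   %phi.sroa.gep1 = getelementptr i32, ptr %b, i64 1
  //   %phi.sroa.phi = phi ptr [ %phi.sroa.gep, %bb1 ], [ %phi.sroa.gep1, %bb2 ]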
4152   bool unfoldGEPPhi(GetElementPtrInst &GEPI) {
4153     // To prevent infinitely expanding recursive phis, bail if the GEP pointer
4154     // operand (looking through the phi if it is the phi we want to unfold) is
4155     // an instruction besides a static alloca.
4156     PHINode *Phi = dyn_cast<PHINode>(GEPI.getPointerOperand());
4157     auto IsInvalidPointerOperand = [](Value *V) {
4158       if (!isa<Instruction>(V))
4159         return false;
4160       if (auto *AI = dyn_cast<AllocaInst>(V))
4161         return !AI->isStaticAlloca();
4162       return true;
4163     };
4164     if (Phi) {
4165       if (any_of(Phi->operands(), IsInvalidPointerOperand))
4166         return false;
4167     } else {
4168       if (IsInvalidPointerOperand(GEPI.getPointerOperand()))
4169         return false;
4170     }
4171     // Check whether the GEP has exactly one phi operand (including the pointer
4172     // operand) and all indices will become constant after the transform.
4173     for (Value *Op : GEPI.indices()) {
4174       if (auto *PN = dyn_cast<PHINode>(Op)) {
4175         if (Phi)
4176           return false;
4177 
4178         Phi = PN;
4179         if (!all_of(Phi->incoming_values(),
4180                     [](Value *V) { return isa<ConstantInt>(V); }))
4181           return false;
4182         continue;
4183       }
4184 
4185       if (!isa<ConstantInt>(Op))
4186         return false;
4187     }
4188 
4189     if (!Phi)
4190       return false;
4191 
4192     LLVM_DEBUG(dbgs() << "  Rewriting gep(phi) -> phi(gep):\n";
4193                dbgs() << "    original: " << *Phi << "\n";
4194                dbgs() << "              " << GEPI << "\n";);
4195 
4196     auto GetNewOps = [&](Value *PhiOp) {
4197       SmallVector<Value *> NewOps;
4198       for (Value *Op : GEPI.operands())
4199         if (Op == Phi)
4200           NewOps.push_back(PhiOp);
4201         else
4202           NewOps.push_back(Op);
4203       return NewOps;
4204     };
4205 
4206     IRB.SetInsertPoint(Phi);
4207     PHINode *NewPhi = IRB.CreatePHI(GEPI.getType(), Phi->getNumIncomingValues(),
4208                                     Phi->getName() + ".sroa.phi");
4209 
4210     Type *SourceTy = GEPI.getSourceElementType();
4211     // We only handle arguments, constants, and static allocas here, so we can
4212     // insert GEPs at the end of the entry block.
4213     IRB.SetInsertPoint(GEPI.getFunction()->getEntryBlock().getTerminator());
4214     for (unsigned I = 0, E = Phi->getNumIncomingValues(); I != E; ++I) {
4215       Value *Op = Phi->getIncomingValue(I);
4216       BasicBlock *BB = Phi->getIncomingBlock(I);
4217       Value *NewGEP;
4218       if (int NI = NewPhi->getBasicBlockIndex(BB); NI >= 0) {
4219         NewGEP = NewPhi->getIncomingValue(NI);
4220       } else {
4221         SmallVector<Value *> NewOps = GetNewOps(Op);
4222         NewGEP =
4223             IRB.CreateGEP(SourceTy, NewOps[0], ArrayRef(NewOps).drop_front(),
4224                           Phi->getName() + ".sroa.gep", GEPI.getNoWrapFlags());
4225       }
4226       NewPhi->addIncoming(NewGEP, BB);
4227     }
4228 
4229     Visited.erase(&GEPI);
4230     GEPI.replaceAllUsesWith(NewPhi);
4231     GEPI.eraseFromParent();
4232     Visited.insert(NewPhi);
4233     enqueueUsers(*NewPhi);
4234 
4235     LLVM_DEBUG(dbgs() << "          to: ";
4236                for (Value *In
4237                     : NewPhi->incoming_values()) dbgs()
4238                << "\n              " << *In;
4239                dbgs() << "\n              " << *NewPhi << '\n');
4240 
4241     return true;
4242   }
4243 
4244   bool visitGetElementPtrInst(GetElementPtrInst &GEPI) {
4245     if (unfoldGEPSelect(GEPI))
4246       return true;
4247 
4248     if (unfoldGEPPhi(GEPI))
4249       return true;
4250 
4251     enqueueUsers(GEPI);
4252     return false;
4253   }
4254 
4255   bool visitPHINode(PHINode &PN) {
4256     enqueueUsers(PN);
4257     return false;
4258   }
4259 
4260   bool visitSelectInst(SelectInst &SI) {
4261     enqueueUsers(SI);
4262     return false;
4263   }
4264 };
4265 
4266 } // end anonymous namespace
4267 
4268 /// Strip aggregate type wrapping.
4269 ///
4270 /// This removes no-op aggregate types wrapping an underlying type. It will
4271 /// strip as many layers of types as it can without changing either the type
4272 /// size or the allocated size.
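///
/// For example, { { i32 } } and [1 x i32] both strip down to i32, while
/// { i32, i32 } is returned unchanged because its first element does not
/// span the whole aggregate.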
4273 static Type *stripAggregateTypeWrapping(const DataLayout &DL, Type *Ty) {
4274   if (Ty->isSingleValueType())
4275     return Ty;
4276 
4277   uint64_t AllocSize = DL.getTypeAllocSize(Ty).getFixedValue();
4278   uint64_t TypeSize = DL.getTypeSizeInBits(Ty).getFixedValue();
4279 
4280   Type *InnerTy;
4281   if (ArrayType *ArrTy = dyn_cast<ArrayType>(Ty)) {
4282     InnerTy = ArrTy->getElementType();
4283   } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
4284     const StructLayout *SL = DL.getStructLayout(STy);
4285     unsigned Index = SL->getElementContainingOffset(0);
4286     InnerTy = STy->getElementType(Index);
4287   } else {
4288     return Ty;
4289   }
4290 
4291   if (AllocSize > DL.getTypeAllocSize(InnerTy).getFixedValue() ||
4292       TypeSize > DL.getTypeSizeInBits(InnerTy).getFixedValue())
4293     return Ty;
4294 
4295   return stripAggregateTypeWrapping(DL, InnerTy);
4296 }
4297 
4298 /// Try to find a partition of the aggregate type passed in for a given
4299 /// offset and size.
4300 ///
4301 /// This recurses through the aggregate type and tries to compute a subtype
4302 /// based on the offset and size. When the offset and size span a sub-section
4303 /// of an array, it will even compute a new array type for that sub-section,
4304 /// and the same for structs.
4305 ///
4306 /// Note that this routine is very strict and tries to find a partition of the
4307 /// type which produces the *exact* right offset and size. It is not forgiving
4308 /// when the size or offset causes either end of the type-based partition to be off.
4309 /// Also, this is a best-effort routine. It is reasonable to give up and not
4310 /// return a type if necessary.
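///
/// For example (assuming a typical data layout), within
/// { i32, { float, float } } the range at offset 4 with size 8 yields
/// { float, float }, the range at offset 4 with size 4 recurses down to
/// float, and the range at offset 2 with size 4 yields nullptr because it
/// straddles an element boundary.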
4311 static Type *getTypePartition(const DataLayout &DL, Type *Ty, uint64_t Offset,
4312                               uint64_t Size) {
4313   if (Offset == 0 && DL.getTypeAllocSize(Ty).getFixedValue() == Size)
4314     return stripAggregateTypeWrapping(DL, Ty);
4315   if (Offset > DL.getTypeAllocSize(Ty).getFixedValue() ||
4316       (DL.getTypeAllocSize(Ty).getFixedValue() - Offset) < Size)
4317     return nullptr;
4318 
4319   if (isa<ArrayType>(Ty) || isa<VectorType>(Ty)) {
4320     Type *ElementTy;
4321     uint64_t TyNumElements;
4322     if (auto *AT = dyn_cast<ArrayType>(Ty)) {
4323       ElementTy = AT->getElementType();
4324       TyNumElements = AT->getNumElements();
4325     } else {
4326       // FIXME: This isn't right for vectors with non-byte-sized or
4327       // non-power-of-two sized elements.
4328       auto *VT = cast<FixedVectorType>(Ty);
4329       ElementTy = VT->getElementType();
4330       TyNumElements = VT->getNumElements();
4331     }
4332     uint64_t ElementSize = DL.getTypeAllocSize(ElementTy).getFixedValue();
4333     uint64_t NumSkippedElements = Offset / ElementSize;
4334     if (NumSkippedElements >= TyNumElements)
4335       return nullptr;
4336     Offset -= NumSkippedElements * ElementSize;
4337 
4338     // First check if we need to recurse.
4339     if (Offset > 0 || Size < ElementSize) {
4340       // Bail if the partition ends in a different array element.
4341       if ((Offset + Size) > ElementSize)
4342         return nullptr;
4343       // Recurse through the element type trying to peel off offset bytes.
4344       return getTypePartition(DL, ElementTy, Offset, Size);
4345     }
4346     assert(Offset == 0);
4347 
4348     if (Size == ElementSize)
4349       return stripAggregateTypeWrapping(DL, ElementTy);
4350     assert(Size > ElementSize);
4351     uint64_t NumElements = Size / ElementSize;
4352     if (NumElements * ElementSize != Size)
4353       return nullptr;
4354     return ArrayType::get(ElementTy, NumElements);
4355   }
4356 
4357   StructType *STy = dyn_cast<StructType>(Ty);
4358   if (!STy)
4359     return nullptr;
4360 
4361   const StructLayout *SL = DL.getStructLayout(STy);
4362 
4363   if (SL->getSizeInBits().isScalable())
4364     return nullptr;
4365 
4366   if (Offset >= SL->getSizeInBytes())
4367     return nullptr;
4368   uint64_t EndOffset = Offset + Size;
4369   if (EndOffset > SL->getSizeInBytes())
4370     return nullptr;
4371 
4372   unsigned Index = SL->getElementContainingOffset(Offset);
4373   Offset -= SL->getElementOffset(Index);
4374 
4375   Type *ElementTy = STy->getElementType(Index);
4376   uint64_t ElementSize = DL.getTypeAllocSize(ElementTy).getFixedValue();
4377   if (Offset >= ElementSize)
4378     return nullptr; // The offset points into alignment padding.
4379 
4380   // See if any partition must be contained by the element.
4381   if (Offset > 0 || Size < ElementSize) {
4382     if ((Offset + Size) > ElementSize)
4383       return nullptr;
4384     return getTypePartition(DL, ElementTy, Offset, Size);
4385   }
4386   assert(Offset == 0);
4387 
4388   if (Size == ElementSize)
4389     return stripAggregateTypeWrapping(DL, ElementTy);
4390 
4391   StructType::element_iterator EI = STy->element_begin() + Index,
4392                                EE = STy->element_end();
4393   if (EndOffset < SL->getSizeInBytes()) {
4394     unsigned EndIndex = SL->getElementContainingOffset(EndOffset);
4395     if (Index == EndIndex)
4396       return nullptr; // Within a single element and its padding.
4397 
4398     // Don't try to form "natural" types if the elements don't line up with the
4399     // expected size.
4400     // FIXME: We could potentially recurse down through the last element in the
4401     // sub-struct to find a natural end point.
4402     if (SL->getElementOffset(EndIndex) != EndOffset)
4403       return nullptr;
4404 
4405     assert(Index < EndIndex);
4406     EE = STy->element_begin() + EndIndex;
4407   }
4408 
4409   // Try to build up a sub-structure.
4410   StructType *SubTy =
4411       StructType::get(STy->getContext(), ArrayRef(EI, EE), STy->isPacked());
4412   const StructLayout *SubSL = DL.getStructLayout(SubTy);
4413   if (Size != SubSL->getSizeInBytes())
4414     return nullptr; // The sub-struct doesn't have quite the size needed.
4415 
4416   return SubTy;
4417 }
4418 
4419 /// Pre-split loads and stores to simplify rewriting.
4420 ///
4421 /// We want to break up the splittable load+store pairs as much as
4422 /// possible. This is important to do as a preprocessing step, as once we
4423 /// start rewriting the accesses to partitions of the alloca we lose the
4424 /// necessary information to correctly split apart paired loads and stores
4425 /// which both point into this alloca. The case to consider is something like
4426 /// the following:
4427 ///
4428 ///   %a = alloca [12 x i8]
4429 ///   %gep1 = getelementptr i8, ptr %a, i32 0
4430 ///   %gep2 = getelementptr i8, ptr %a, i32 4
4431 ///   %gep3 = getelementptr i8, ptr %a, i32 8
4432 ///   store float 0.0, ptr %gep1
4433 ///   store float 1.0, ptr %gep2
4434 ///   %v = load i64, ptr %gep1
4435 ///   store i64 %v, ptr %gep2
4436 ///   %f1 = load float, ptr %gep2
4437 ///   %f2 = load float, ptr %gep3
4438 ///
4439 /// Here we want to form 3 partitions of the alloca, each 4 bytes large, and
4440 /// promote everything so we recover the 2 SSA values that should have been
4441 /// there all along.
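///
/// After pre-splitting, the paired i64 load and store are each split at the
/// 4-byte partition boundary into two i32 halves, roughly (a sketch with
/// illustrative value names):
///
///   %v.0 = load i32, ptr %gep1
///   %v.4 = load i32, ptr %gep2
///   store i32 %v.0, ptr %gep2
///   store i32 %v.4, ptr %gep3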
4442 ///
4443 /// \returns true if any changes are made.
4444 bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
4445   LLVM_DEBUG(dbgs() << "Pre-splitting loads and stores\n");
4446 
4447   // Track the loads and stores which are candidates for pre-splitting here, in
4448   // the order they first appear during the partition scan. These give stable
4449   // iteration order and a basis for tracking which loads and stores we
4450   // actually split.
4451   SmallVector<LoadInst *, 4> Loads;
4452   SmallVector<StoreInst *, 4> Stores;
4453 
4454   // We need to accumulate the splits required of each load or store where we
4455   // can find them via a direct lookup. This is important to cross-check loads
4456   // and stores against each other. We also track the slice so that we can kill
4457   // all the slices that end up split.
4458   struct SplitOffsets {
4459     Slice *S;
4460     std::vector<uint64_t> Splits;
4461   };
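  // For example, a splittable slice spanning [0, 12) that must be split at
  // partition boundaries 4 and 8 records Splits == {4, 8}; the end of the
  // final part is implied by the slice's own end offset.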
4462   SmallDenseMap<Instruction *, SplitOffsets, 8> SplitOffsetsMap;
4463 
4464   // Track loads out of this alloca which cannot, for any reason, be pre-split.
4465   // This is important as we also cannot pre-split stores of those loads!
4466   // FIXME: This is all pretty gross. It means that we can be more aggressive
4467   // in pre-splitting when the load feeding the store happens to come from
4468   // a separate alloca. Put another way, the effectiveness of SROA would be
4469   // decreased by a frontend which just concatenated all of its local allocas
4470   // into one big flat alloca. But defeating such patterns is exactly the job
4471   // SROA is tasked with! Sadly, to not have this discrepancy we would have
4472   // to change store pre-splitting to actually force pre-splitting of the load
4473   // that feeds it *and all stores*. That makes pre-splitting much harder, but
4474   // maybe it would make it more principled?
4475   SmallPtrSet<LoadInst *, 8> UnsplittableLoads;
4476 
4477   LLVM_DEBUG(dbgs() << "  Searching for candidate loads and stores\n");
4478   for (auto &P : AS.partitions()) {
4479     for (Slice &S : P) {
4480       Instruction *I = cast<Instruction>(S.getUse()->getUser());
4481       if (!S.isSplittable() || S.endOffset() <= P.endOffset()) {
4482         // If this is a load we have to track that it can't participate in any
4483         // pre-splitting. If this is a store of a load we have to track that
4484         // that load also can't participate in any pre-splitting.
4485         if (auto *LI = dyn_cast<LoadInst>(I))
4486           UnsplittableLoads.insert(LI);
4487         else if (auto *SI = dyn_cast<StoreInst>(I))
4488           if (auto *LI = dyn_cast<LoadInst>(SI->getValueOperand()))
4489             UnsplittableLoads.insert(LI);
4490         continue;
4491       }
4492       assert(P.endOffset() > S.beginOffset() &&
4493              "Empty or backwards partition!");
4494 
4495       // Determine if this is a pre-splittable slice.
4496       if (auto *LI = dyn_cast<LoadInst>(I)) {
4497         assert(!LI->isVolatile() && "Cannot split volatile loads!");
4498 
4499         // The load must be used exclusively to store into other pointers for
4500         // us to be able to arbitrarily pre-split it. The stores must also be
4501         // simple to avoid changing semantics.
4502         auto IsLoadSimplyStored = [](LoadInst *LI) {
4503           for (User *LU : LI->users()) {
4504             auto *SI = dyn_cast<StoreInst>(LU);
4505             if (!SI || !SI->isSimple())
4506               return false;
4507           }
4508           return true;
4509         };
4510         if (!IsLoadSimplyStored(LI)) {
4511           UnsplittableLoads.insert(LI);
4512           continue;
4513         }
4514 
4515         Loads.push_back(LI);
4516       } else if (auto *SI = dyn_cast<StoreInst>(I)) {
4517         if (S.getUse() != &SI->getOperandUse(SI->getPointerOperandIndex()))
4518           // Skip stores *of* pointers. FIXME: This shouldn't even be possible!
4519           continue;
4520         auto *StoredLoad = dyn_cast<LoadInst>(SI->getValueOperand());
4521         if (!StoredLoad || !StoredLoad->isSimple())
4522           continue;
4523         assert(!SI->isVolatile() && "Cannot split volatile stores!");
4524 
4525         Stores.push_back(SI);
4526       } else {
4527         // Other uses cannot be pre-split.
4528         continue;
4529       }
4530 
4531       // Record the initial split.
4532       LLVM_DEBUG(dbgs() << "    Candidate: " << *I << "\n");
4533       auto &Offsets = SplitOffsetsMap[I];
4534       assert(Offsets.Splits.empty() &&
4535              "Should not have splits the first time we see an instruction!");
4536       Offsets.S = &S;
4537       Offsets.Splits.push_back(P.endOffset() - S.beginOffset());
4538     }
4539 
4540     // Now scan the already split slices, and add a split for any of them which
4541     // we're going to pre-split.
4542     for (Slice *S : P.splitSliceTails()) {
4543       auto SplitOffsetsMapI =
4544           SplitOffsetsMap.find(cast<Instruction>(S->getUse()->getUser()));
4545       if (SplitOffsetsMapI == SplitOffsetsMap.end())
4546         continue;
4547       auto &Offsets = SplitOffsetsMapI->second;
4548 
4549       assert(Offsets.S == S && "Found a mismatched slice!");
4550       assert(!Offsets.Splits.empty() &&
4551              "Cannot have an empty set of splits on the second partition!");
4552       assert(Offsets.Splits.back() ==
4553                  P.beginOffset() - Offsets.S->beginOffset() &&
4554              "Previous split does not end where this one begins!");
4555 
4556       // Record each split. The last partition's end isn't needed as the size
4557       // of the slice dictates that.
4558       if (S->endOffset() > P.endOffset())
4559         Offsets.Splits.push_back(P.endOffset() - Offsets.S->beginOffset());
4560     }
4561   }
4562 
4563   // We may have split loads where some of their stores are split stores. For
4564   // such loads and stores, we can only pre-split them if their splits exactly
4565   // match relative to their starting offset. We have to verify this prior to
4566   // any rewriting.
4567   llvm::erase_if(Stores, [&UnsplittableLoads, &SplitOffsetsMap](StoreInst *SI) {
4568     // Lookup the load we are storing in our map of split
4569     // offsets.
4570     auto *LI = cast<LoadInst>(SI->getValueOperand());
4571     // If it was completely unsplittable, then we're done,
4572     // and this store can't be pre-split.
4573     if (UnsplittableLoads.count(LI))
4574       return true;
4575 
4576     auto LoadOffsetsI = SplitOffsetsMap.find(LI);
4577     if (LoadOffsetsI == SplitOffsetsMap.end())
4578       return false; // Unrelated loads are definitely safe.
4579     auto &LoadOffsets = LoadOffsetsI->second;
4580 
4581     // Now lookup the store's offsets.
4582     auto &StoreOffsets = SplitOffsetsMap[SI];
4583 
4584     // If the relative offsets of each split in the load and
4585     // store match exactly, then we can split them and we
4586     // don't need to remove them here.
4587     if (LoadOffsets.Splits == StoreOffsets.Splits)
4588       return false;
4589 
4590     LLVM_DEBUG(dbgs() << "    Mismatched splits for load and store:\n"
4591                       << "      " << *LI << "\n"
4592                       << "      " << *SI << "\n");
4593 
4594     // We've found a store and load that we need to split
4595     // with mismatched relative splits. Just give up on them
4596     // and remove both instructions from our list of
4597     // candidates.
4598     UnsplittableLoads.insert(LI);
4599     return true;
4600   });
4601   // Now we have to go *back* through all the stores, because a later store may
4602   // have caused an earlier store's load to become unsplittable, and if it is
4603   // unsplittable for the later store, then we can't rely on it being split in
4604   // the earlier store either.
4605   llvm::erase_if(Stores, [&UnsplittableLoads](StoreInst *SI) {
4606     auto *LI = cast<LoadInst>(SI->getValueOperand());
4607     return UnsplittableLoads.count(LI);
4608   });
4609   // Once we've established all the loads that can't be split for some reason,
4610   // filter any that made it into our list out.
4611   llvm::erase_if(Loads, [&UnsplittableLoads](LoadInst *LI) {
4612     return UnsplittableLoads.count(LI);
4613   });
4614 
4615   // If no loads or stores are left, there is no pre-splitting to be done for
4616   // this alloca.
4617   if (Loads.empty() && Stores.empty())
4618     return false;
4619 
4620   // From here on, we can't fail and will be building new accesses, so rig up
4621   // an IR builder.
4622   IRBuilderTy IRB(&AI);
4623 
4624   // Collect the new slices which we will merge into the alloca slices.
4625   SmallVector<Slice, 4> NewSlices;
4626 
4627   // Track any allocas we end up splitting loads and stores for so we iterate
4628   // on them.
4629   SmallPtrSet<AllocaInst *, 4> ResplitPromotableAllocas;
4630 
4631   // At this point, we have collected all of the loads and stores we can
4632   // pre-split, and the specific splits needed for them. We actually do the
4633   // splitting in a specific order to handle the case where one of the loads
4634   // is the value operand to one of the stores.
4635   //
4636   // First, we rewrite all of the split loads, and just accumulate each split
4637   // load in a parallel structure. We also build the slices for them and append
4638   // them to the alloca slices.
4639   SmallDenseMap<LoadInst *, std::vector<LoadInst *>, 1> SplitLoadsMap;
4640   std::vector<LoadInst *> SplitLoads;
4641   const DataLayout &DL = AI.getDataLayout();
4642   for (LoadInst *LI : Loads) {
4643     SplitLoads.clear();
4644 
4645     auto &Offsets = SplitOffsetsMap[LI];
4646     unsigned SliceSize = Offsets.S->endOffset() - Offsets.S->beginOffset();
4647     assert(LI->getType()->getIntegerBitWidth() % 8 == 0 &&
4648            "Load must have type size equal to store size");
4649     assert(LI->getType()->getIntegerBitWidth() / 8 >= SliceSize &&
4650            "Load must be >= slice size");
4651 
4652     uint64_t BaseOffset = Offsets.S->beginOffset();
4653     assert(BaseOffset + SliceSize > BaseOffset &&
4654            "Cannot represent alloca access size using 64-bit integers!");
4655 
4656     Instruction *BasePtr = cast<Instruction>(LI->getPointerOperand());
4657     IRB.SetInsertPoint(LI);
4658 
4659     LLVM_DEBUG(dbgs() << "  Splitting load: " << *LI << "\n");
4660 
4661     uint64_t PartOffset = 0, PartSize = Offsets.Splits.front();
4662     int Idx = 0, Size = Offsets.Splits.size();
4663     for (;;) {
4664       auto *PartTy = Type::getIntNTy(LI->getContext(), PartSize * 8);
4665       auto AS = LI->getPointerAddressSpace();
4666       auto *PartPtrTy = LI->getPointerOperandType();
4667       LoadInst *PLoad = IRB.CreateAlignedLoad(
4668           PartTy,
4669           getAdjustedPtr(IRB, DL, BasePtr,
4670                          APInt(DL.getIndexSizeInBits(AS), PartOffset),
4671                          PartPtrTy, BasePtr->getName() + "."),
4672           getAdjustedAlignment(LI, PartOffset),
4673           /*IsVolatile*/ false, LI->getName());
4674       PLoad->copyMetadata(*LI, {LLVMContext::MD_mem_parallel_loop_access,
4675                                 LLVMContext::MD_access_group});
4676 
4677       // Append this load onto the list of split loads so we can find it later
4678       // to rewrite the stores.
4679       SplitLoads.push_back(PLoad);
4680 
4681       // Now build a new slice for the alloca.
4682       NewSlices.push_back(
4683           Slice(BaseOffset + PartOffset, BaseOffset + PartOffset + PartSize,
4684                 &PLoad->getOperandUse(PLoad->getPointerOperandIndex()),
4685                 /*IsSplittable*/ false));
4686       LLVM_DEBUG(dbgs() << "    new slice [" << NewSlices.back().beginOffset()
4687                         << ", " << NewSlices.back().endOffset()
4688                         << "): " << *PLoad << "\n");
4689 
4690       // See if we've handled all the splits.
4691       if (Idx >= Size)
4692         break;
4693 
4694       // Setup the next partition.
4695       PartOffset = Offsets.Splits[Idx];
4696       ++Idx;
4697       PartSize = (Idx < Size ? Offsets.Splits[Idx] : SliceSize) - PartOffset;
4698     }
4699 
4700     // Now that we have the split loads, do the slow walk over all uses of the
4701     // load and rewrite them as split stores, or save the split loads to use
4702     // below if the store is going to be split there anyways.
4703     bool DeferredStores = false;
4704     for (User *LU : LI->users()) {
4705       StoreInst *SI = cast<StoreInst>(LU);
4706       if (!Stores.empty() && SplitOffsetsMap.count(SI)) {
4707         DeferredStores = true;
4708         LLVM_DEBUG(dbgs() << "    Deferred splitting of store: " << *SI
4709                           << "\n");
4710         continue;
4711       }
4712 
4713       Value *StoreBasePtr = SI->getPointerOperand();
4714       IRB.SetInsertPoint(SI);
4715       AAMDNodes AATags = SI->getAAMetadata();
4716 
4717       LLVM_DEBUG(dbgs() << "    Splitting store of load: " << *SI << "\n");
4718 
4719       for (int Idx = 0, Size = SplitLoads.size(); Idx < Size; ++Idx) {
4720         LoadInst *PLoad = SplitLoads[Idx];
4721         uint64_t PartOffset = Idx == 0 ? 0 : Offsets.Splits[Idx - 1];
4722         auto *PartPtrTy = SI->getPointerOperandType();
4723 
4724         auto AS = SI->getPointerAddressSpace();
4725         StoreInst *PStore = IRB.CreateAlignedStore(
4726             PLoad,
4727             getAdjustedPtr(IRB, DL, StoreBasePtr,
4728                            APInt(DL.getIndexSizeInBits(AS), PartOffset),
4729                            PartPtrTy, StoreBasePtr->getName() + "."),
4730             getAdjustedAlignment(SI, PartOffset),
4731             /*IsVolatile*/ false);
4732         PStore->copyMetadata(*SI, {LLVMContext::MD_mem_parallel_loop_access,
4733                                    LLVMContext::MD_access_group,
4734                                    LLVMContext::MD_DIAssignID});
4735 
4736         if (AATags)
4737           PStore->setAAMetadata(
4738               AATags.adjustForAccess(PartOffset, PLoad->getType(), DL));
4739         LLVM_DEBUG(dbgs() << "      +" << PartOffset << ":" << *PStore << "\n");
4740       }
4741 
4742       // We want to immediately iterate on any allocas impacted by splitting
4743       // this store, and we have to track any promotable alloca (indicated by
4744       // a direct store) as needing to be resplit because it is no longer
4745       // promotable.
4746       if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(StoreBasePtr)) {
4747         ResplitPromotableAllocas.insert(OtherAI);
4748         Worklist.insert(OtherAI);
4749       } else if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(
4750                      StoreBasePtr->stripInBoundsOffsets())) {
4751         Worklist.insert(OtherAI);
4752       }
4753 
4754       // Mark the original store as dead.
4755       DeadInsts.push_back(SI);
4756     }
4757 
4758     // Save the split loads if there are deferred stores among the users.
4759     if (DeferredStores)
4760       SplitLoadsMap.insert(std::make_pair(LI, std::move(SplitLoads)));
4761 
4762     // Mark the original load as dead and kill the original slice.
4763     DeadInsts.push_back(LI);
4764     Offsets.S->kill();
4765   }
4766 
4767   // Second, we rewrite all of the split stores. At this point, we know that
4768   // all loads from this alloca have been split already. For stores of such
4769   // loads, we can simply look up the pre-existing split loads. For stores of
4770   // other loads, we split those loads first and then write split stores of
4771   // them.
4772   for (StoreInst *SI : Stores) {
4773     auto *LI = cast<LoadInst>(SI->getValueOperand());
4774     IntegerType *Ty = cast<IntegerType>(LI->getType());
4775     assert(Ty->getBitWidth() % 8 == 0);
4776     uint64_t StoreSize = Ty->getBitWidth() / 8;
4777     assert(StoreSize > 0 && "Cannot have a zero-sized integer store!");
4778 
4779     auto &Offsets = SplitOffsetsMap[SI];
4780     assert(StoreSize == Offsets.S->endOffset() - Offsets.S->beginOffset() &&
4781            "Slice size should always match load size exactly!");
4782     uint64_t BaseOffset = Offsets.S->beginOffset();
4783     assert(BaseOffset + StoreSize > BaseOffset &&
4784            "Cannot represent alloca access size using 64-bit integers!");
4785 
4786     Value *LoadBasePtr = LI->getPointerOperand();
4787     Instruction *StoreBasePtr = cast<Instruction>(SI->getPointerOperand());
4788 
4789     LLVM_DEBUG(dbgs() << "  Splitting store: " << *SI << "\n");
4790 
4791     // Check whether we have an already split load.
4792     auto SplitLoadsMapI = SplitLoadsMap.find(LI);
4793     std::vector<LoadInst *> *SplitLoads = nullptr;
4794     if (SplitLoadsMapI != SplitLoadsMap.end()) {
4795       SplitLoads = &SplitLoadsMapI->second;
4796       assert(SplitLoads->size() == Offsets.Splits.size() + 1 &&
4797              "Too few split loads for the number of splits in the store!");
4798     } else {
4799       LLVM_DEBUG(dbgs() << "          of load: " << *LI << "\n");
4800     }
4801 
4802     uint64_t PartOffset = 0, PartSize = Offsets.Splits.front();
4803     int Idx = 0, Size = Offsets.Splits.size();
4804     for (;;) {
4805       auto *PartTy = Type::getIntNTy(Ty->getContext(), PartSize * 8);
4806       auto *LoadPartPtrTy = LI->getPointerOperandType();
4807       auto *StorePartPtrTy = SI->getPointerOperandType();
4808 
4809       // Either lookup a split load or create one.
4810       LoadInst *PLoad;
4811       if (SplitLoads) {
4812         PLoad = (*SplitLoads)[Idx];
4813       } else {
4814         IRB.SetInsertPoint(LI);
4815         auto AS = LI->getPointerAddressSpace();
4816         PLoad = IRB.CreateAlignedLoad(
4817             PartTy,
4818             getAdjustedPtr(IRB, DL, LoadBasePtr,
4819                            APInt(DL.getIndexSizeInBits(AS), PartOffset),
4820                            LoadPartPtrTy, LoadBasePtr->getName() + "."),
4821             getAdjustedAlignment(LI, PartOffset),
4822             /*IsVolatile*/ false, LI->getName());
4823         PLoad->copyMetadata(*LI, {LLVMContext::MD_mem_parallel_loop_access,
4824                                   LLVMContext::MD_access_group});
4825       }
4826 
4827       // And store this partition.
4828       IRB.SetInsertPoint(SI);
4829       auto AS = SI->getPointerAddressSpace();
4830       StoreInst *PStore = IRB.CreateAlignedStore(
4831           PLoad,
4832           getAdjustedPtr(IRB, DL, StoreBasePtr,
4833                          APInt(DL.getIndexSizeInBits(AS), PartOffset),
4834                          StorePartPtrTy, StoreBasePtr->getName() + "."),
4835           getAdjustedAlignment(SI, PartOffset),
4836           /*IsVolatile*/ false);
4837       PStore->copyMetadata(*SI, {LLVMContext::MD_mem_parallel_loop_access,
4838                                  LLVMContext::MD_access_group});
4839 
4840       // Now build a new slice for the alloca.
4841       NewSlices.push_back(
4842           Slice(BaseOffset + PartOffset, BaseOffset + PartOffset + PartSize,
4843                 &PStore->getOperandUse(PStore->getPointerOperandIndex()),
4844                 /*IsSplittable*/ false));
4845       LLVM_DEBUG(dbgs() << "    new slice [" << NewSlices.back().beginOffset()
4846                         << ", " << NewSlices.back().endOffset()
4847                         << "): " << *PStore << "\n");
4848       if (!SplitLoads) {
4849         LLVM_DEBUG(dbgs() << "      of split load: " << *PLoad << "\n");
4850       }
4851 
4852       // See if we've finished all the splits.
4853       if (Idx >= Size)
4854         break;
4855 
4856       // Setup the next partition.
4857       PartOffset = Offsets.Splits[Idx];
4858       ++Idx;
4859       PartSize = (Idx < Size ? Offsets.Splits[Idx] : StoreSize) - PartOffset;
4860     }
4861 
4862     // We want to immediately iterate on any allocas impacted by splitting
4863     // this load, which is only relevant if it isn't a load of this alloca and
4864     // thus we didn't already split the loads above. We also have to keep track
4865     // of any promotable allocas we split loads on as they can no longer be
4866     // promoted.
4867     if (!SplitLoads) {
4868       if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(LoadBasePtr)) {
4869         assert(OtherAI != &AI && "We can't re-split our own alloca!");
4870         ResplitPromotableAllocas.insert(OtherAI);
4871         Worklist.insert(OtherAI);
4872       } else if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(
4873                      LoadBasePtr->stripInBoundsOffsets())) {
4874         assert(OtherAI != &AI && "We can't re-split our own alloca!");
4875         Worklist.insert(OtherAI);
4876       }
4877     }
4878 
4879     // Mark the original store as dead now that we've split it up and kill its
4880     // slice. Note that we leave the original load in place unless this store
4881     // was its only use. It may in turn be split up if it is an alloca load
4882     // for some other alloca, but it may be a normal load. This may introduce
4883     // redundant loads, but where those can be merged the rest of the optimizer
4884     // should handle the merging, and this uncovers SSA splits which is more
4885     // important. In practice, the original loads will almost always be fully
4886     // split and removed eventually, and the splits will be merged by any
4887     // trivial CSE, including instcombine.
4888     if (LI->hasOneUse()) {
4889       assert(*LI->user_begin() == SI && "Single use isn't this store!");
4890       DeadInsts.push_back(LI);
4891     }
4892     DeadInsts.push_back(SI);
4893     Offsets.S->kill();
4894   }
4895 
4896   // Remove the killed slices that have been pre-split.
4897   llvm::erase_if(AS, [](const Slice &S) { return S.isDead(); });
4898 
4899   // Insert our new slices. This will sort and merge them into the sorted
4900   // sequence.
4901   AS.insert(NewSlices);
4902 
4903   LLVM_DEBUG(dbgs() << "  Pre-split slices:\n");
4904 #ifndef NDEBUG
4905   for (auto I = AS.begin(), E = AS.end(); I != E; ++I)
4906     LLVM_DEBUG(AS.print(dbgs(), I, "    "));
4907 #endif
4908 
4909   // Finally, don't try to promote any allocas that now require re-splitting.
4910   // They have already been added to the worklist above.
4911   PromotableAllocas.set_subtract(ResplitPromotableAllocas);
4912 
4913   return true;
4914 }
4915 
4916 /// Rewrite an alloca partition's users.
4917 ///
4918 /// This routine drives both of the rewriting goals of the SROA pass. It tries
4919 /// to rewrite uses of an alloca partition to be conducive for SSA value
4920 /// promotion. If the partition needs a new, more refined alloca, this will
4921 /// build that new alloca, preserving as much type information as possible, and
4922 /// rewrite the uses of the old alloca to point at the new one and have the
4923 /// appropriate new offsets. It also evaluates how successful the rewrite was
4924 /// at enabling promotion and if it was successful queues the alloca to be
4925 /// promoted.
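///
/// For illustration (a hypothetical sketch; names and types are invented):
/// given
///
///   %a = alloca { i32, float }
///   store i32 %x, ptr %a
///   %f = getelementptr inbounds { i32, float }, ptr %a, i32 0, i32 1
///   store float %y, ptr %f
///
/// the disjoint partitions [0,4) and [4,8) are rewritten onto fresh allocas
/// such as "%a.sroa.0" (i32) and "%a.sroa.1" (float), each of which can then
/// be promoted to an SSA register.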
4926 AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
4927                                    Partition &P) {
4928   // Try to compute a friendly type for this partition of the alloca. This
4929   // won't always succeed, in which case we fall back to a legal integer type
4930   // or an i8 array of an appropriate size.
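  // E.g. (illustrative): a 4-byte partition whose uses all load/store float
  // gets SliceTy = float; mixed uses whose widest integer type is i32 can
  // fall back to i32; and a 3-byte partition with no viable type becomes
  // [3 x i8].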
4931   Type *SliceTy = nullptr;
4932   VectorType *SliceVecTy = nullptr;
4933   const DataLayout &DL = AI.getDataLayout();
4934   unsigned VScale = AI.getFunction()->getVScaleValue();
4935 
4936   std::pair<Type *, IntegerType *> CommonUseTy =
4937       findCommonType(P.begin(), P.end(), P.endOffset());
4938   // Do all uses operate on the same type?
4939   if (CommonUseTy.first) {
4940     TypeSize CommonUseSize = DL.getTypeAllocSize(CommonUseTy.first);
4941     if (CommonUseSize.isFixed() && CommonUseSize.getFixedValue() >= P.size()) {
4942       SliceTy = CommonUseTy.first;
4943       SliceVecTy = dyn_cast<VectorType>(SliceTy);
4944     }
4945   }
4946   // If not, can we find an appropriate subtype in the original allocated type?
4947   if (!SliceTy)
4948     if (Type *TypePartitionTy = getTypePartition(DL, AI.getAllocatedType(),
4949                                                  P.beginOffset(), P.size()))
4950       SliceTy = TypePartitionTy;
4951 
4952   // If still not, can we use the largest bitwidth integer type used?
4953   if (!SliceTy && CommonUseTy.second)
4954     if (DL.getTypeAllocSize(CommonUseTy.second).getFixedValue() >= P.size()) {
4955       SliceTy = CommonUseTy.second;
4956       SliceVecTy = dyn_cast<VectorType>(SliceTy);
4957     }
4958   if ((!SliceTy || (SliceTy->isArrayTy() &&
4959                     SliceTy->getArrayElementType()->isIntegerTy())) &&
4960       DL.isLegalInteger(P.size() * 8)) {
4961     SliceTy = Type::getIntNTy(*C, P.size() * 8);
4962   }
4963 
4964   // If the common use types are not viable for promotion then attempt to find
4965   // another type that is viable.
4966   if (SliceVecTy && !checkVectorTypeForPromotion(P, SliceVecTy, DL, VScale))
4967     if (Type *TypePartitionTy = getTypePartition(DL, AI.getAllocatedType(),
4968                                                  P.beginOffset(), P.size())) {
4969       VectorType *TypePartitionVecTy = dyn_cast<VectorType>(TypePartitionTy);
4970       if (TypePartitionVecTy &&
4971           checkVectorTypeForPromotion(P, TypePartitionVecTy, DL, VScale))
4972         SliceTy = TypePartitionTy;
4973     }
4974 
4975   if (!SliceTy)
4976     SliceTy = ArrayType::get(Type::getInt8Ty(*C), P.size());
4977   assert(DL.getTypeAllocSize(SliceTy).getFixedValue() >= P.size());
4978 
4979   bool IsIntegerPromotable = isIntegerWideningViable(P, SliceTy, DL);
4980 
4981   VectorType *VecTy =
4982       IsIntegerPromotable ? nullptr : isVectorPromotionViable(P, DL, VScale);
4983   if (VecTy)
4984     SliceTy = VecTy;
4985 
4986   // Check for the case where we're going to rewrite to a new alloca of the
4987   // exact same type as the original, and with the same access offsets. In that
4988   // case, re-use the existing alloca, but still run through the rewriter to
4989   // perform phi and select speculation.
4990   // P.beginOffset() can be non-zero even with the same type in a case with
4991   // out-of-bounds access (e.g. @PR35657 function in SROA/basictest.ll).
4992   AllocaInst *NewAI;
4993   if (SliceTy == AI.getAllocatedType() && P.beginOffset() == 0) {
4994     NewAI = &AI;
4995     // FIXME: We should be able to bail at this point with "nothing changed".
4996     // FIXME: We might want to defer PHI speculation until after here.
4997     // FIXME: return nullptr;
4998   } else {
4999     // Make sure the alignment is compatible with P.beginOffset().
5000     const Align Alignment = commonAlignment(AI.getAlign(), P.beginOffset());
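    // E.g. (illustrative): with AI.getAlign() == 16 and P.beginOffset() == 4,
    // this yields Align(4), the largest alignment the slice's address is
    // still guaranteed to have.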
5001     // If we will get at least this much alignment from the type alone, leave
5002     // the alloca's alignment unconstrained.
5003     const bool IsUnconstrained = Alignment <= DL.getABITypeAlign(SliceTy);
5004     NewAI = new AllocaInst(
5005         SliceTy, AI.getAddressSpace(), nullptr,
5006         IsUnconstrained ? DL.getPrefTypeAlign(SliceTy) : Alignment,
5007         AI.getName() + ".sroa." + Twine(P.begin() - AS.begin()),
5008         AI.getIterator());
5009     // Copy the old AI debug location over to the new one.
5010     NewAI->setDebugLoc(AI.getDebugLoc());
5011     ++NumNewAllocas;
5012   }
5013 
5014   LLVM_DEBUG(dbgs() << "Rewriting alloca partition " << "[" << P.beginOffset()
5015                     << "," << P.endOffset() << ") to: " << *NewAI << "\n");
5016 
5017   // Track the high watermark on the worklist as it is only relevant for
5018   // promoted allocas. We will reset it to this point if the alloca is not in
5019   // fact scheduled for promotion.
5020   unsigned PPWOldSize = PostPromotionWorklist.size();
5021   unsigned NumUses = 0;
5022   SmallSetVector<PHINode *, 8> PHIUsers;
5023   SmallSetVector<SelectInst *, 8> SelectUsers;
5024 
5025   AllocaSliceRewriter Rewriter(DL, AS, *this, AI, *NewAI, P.beginOffset(),
5026                                P.endOffset(), IsIntegerPromotable, VecTy,
5027                                PHIUsers, SelectUsers);
5028   bool Promotable = true;
5029   for (Slice *S : P.splitSliceTails()) {
5030     Promotable &= Rewriter.visit(S);
5031     ++NumUses;
5032   }
5033   for (Slice &S : P) {
5034     Promotable &= Rewriter.visit(&S);
5035     ++NumUses;
5036   }
5037 
5038   NumAllocaPartitionUses += NumUses;
5039   MaxUsesPerAllocaPartition.updateMax(NumUses);
5040 
5041   // Now that we've processed all the slices in the new partition, check if any
5042   // PHIs or Selects would block promotion.
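  // E.g. (illustrative): a "load i32, ptr %p" where "%p = phi ptr
  // [ %a, %bb1 ], [ %b, %bb2 ]" only permits promotion if the load can be
  // speculated into both predecessors and the phi-of-pointers replaced by a
  // phi of the loaded values.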
5043   for (PHINode *PHI : PHIUsers)
5044     if (!isSafePHIToSpeculate(*PHI)) {
5045       Promotable = false;
5046       PHIUsers.clear();
5047       SelectUsers.clear();
5048       break;
5049     }
5050 
5051   SmallVector<std::pair<SelectInst *, RewriteableMemOps>, 2>
5052       NewSelectsToRewrite;
5053   NewSelectsToRewrite.reserve(SelectUsers.size());
5054   for (SelectInst *Sel : SelectUsers) {
5055     std::optional<RewriteableMemOps> Ops =
5056         isSafeSelectToSpeculate(*Sel, PreserveCFG);
5057     if (!Ops) {
5058       Promotable = false;
5059       PHIUsers.clear();
5060       SelectUsers.clear();
5061       NewSelectsToRewrite.clear();
5062       break;
5063     }
5064     NewSelectsToRewrite.emplace_back(std::make_pair(Sel, *Ops));
5065   }
5066 
5067   if (Promotable) {
5068     for (Use *U : AS.getDeadUsesIfPromotable()) {
5069       auto *OldInst = dyn_cast<Instruction>(U->get());
5070       Value::dropDroppableUse(*U);
5071       if (OldInst)
5072         if (isInstructionTriviallyDead(OldInst))
5073           DeadInsts.push_back(OldInst);
5074     }
5075     if (PHIUsers.empty() && SelectUsers.empty()) {
5076       // Promote the alloca.
5077       PromotableAllocas.insert(NewAI);
5078     } else {
5079       // If we have either PHIs or Selects to speculate, add them to those
5080       // worklists and re-queue the new alloca so that we promote it on the
5081       // next iteration.
5082       SpeculatablePHIs.insert_range(PHIUsers);
5083       SelectsToRewrite.reserve(SelectsToRewrite.size() +
5084                                NewSelectsToRewrite.size());
5085       for (auto &&KV : llvm::make_range(
5086                std::make_move_iterator(NewSelectsToRewrite.begin()),
5087                std::make_move_iterator(NewSelectsToRewrite.end())))
5088         SelectsToRewrite.insert(std::move(KV));
5089       Worklist.insert(NewAI);
5090     }
5091   } else {
5092     // Drop any post-promotion work items if promotion didn't happen.
5093     while (PostPromotionWorklist.size() > PPWOldSize)
5094       PostPromotionWorklist.pop_back();
5095 
5096     // We couldn't promote and we didn't create a new partition, so nothing
5097     // happened.
5098     if (NewAI == &AI)
5099       return nullptr;
5100 
5101     // If we can't promote the alloca, iterate on it to check for new
5102     // refinements exposed by splitting the current alloca. Don't iterate on an
5103     // alloca which didn't actually change and didn't get promoted.
5104     Worklist.insert(NewAI);
5105   }
5106 
5107   return NewAI;
5108 }
5109 
5110 // There isn't a shared interface to get the "address" parts out of a
5111 // dbg.declare and dbg.assign, so provide some wrappers here for
5112 // both debug intrinsics and records.
5113 const Value *getAddress(const DbgVariableIntrinsic *DVI) {
5114   if (const auto *DAI = dyn_cast<DbgAssignIntrinsic>(DVI))
5115     return DAI->getAddress();
5116   return cast<DbgDeclareInst>(DVI)->getAddress();
5117 }
5118 
5119 const Value *getAddress(const DbgVariableRecord *DVR) {
5120   return DVR->getAddress();
5121 }
5122 
5123 bool isKillAddress(const DbgVariableIntrinsic *DVI) {
5124   if (const auto *DAI = dyn_cast<DbgAssignIntrinsic>(DVI))
5125     return DAI->isKillAddress();
5126   return cast<DbgDeclareInst>(DVI)->isKillLocation();
5127 }
5128 
5129 bool isKillAddress(const DbgVariableRecord *DVR) {
5130   if (DVR->getType() == DbgVariableRecord::LocationType::Assign)
5131     return DVR->isKillAddress();
5132   return DVR->isKillLocation();
5133 }
5134 
5135 const DIExpression *getAddressExpression(const DbgVariableIntrinsic *DVI) {
5136   if (const auto *DAI = dyn_cast<DbgAssignIntrinsic>(DVI))
5137     return DAI->getAddressExpression();
5138   return cast<DbgDeclareInst>(DVI)->getExpression();
5139 }
5140 
5141 const DIExpression *getAddressExpression(const DbgVariableRecord *DVR) {
5142   if (DVR->getType() == DbgVariableRecord::LocationType::Assign)
5143     return DVR->getAddressExpression();
5144   return DVR->getExpression();
5145 }
5146 
5147 /// Create or replace an existing fragment in a DIExpression with \p Frag.
5148 /// If the expression already contains a DW_OP_LLVM_extract_bits_[sz]ext
5149 /// operation, add \p BitExtractOffset to the offset part.
5150 ///
5151 /// Returns the new expression, or nullptr if this fails (see details below).
5152 ///
5153 /// This function is similar to DIExpression::createFragmentExpression except
5154 /// for 3 important distinctions:
5155 ///   1. The new fragment isn't relative to an existing fragment.
5156 ///   2. It assumes the computed location is a memory location. This means we
5157 ///      don't need to perform checks that creating the fragment preserves the
5158 ///      expression semantics.
5159 ///   3. Existing extract_bits are modified independently of fragment changes
5160 ///      using \p BitExtractOffset. A change to the fragment offset or size
5161 ///      may affect a bit extract. But a bit extract offset can change
5162 ///      independently of the fragment dimensions.
5163 ///
5164 /// Returns the new expression, or nullptr if one couldn't be created.
5165 /// Ideally this is only used to signal that a bit-extract has become
5166 /// zero-sized (and thus the new debug record has no size and can be
5167 /// dropped), however, it fails for other reasons too - see the FIXME below.
5168 ///
5169 /// FIXME: To keep the change that introduces this function NFC it bails
5170 /// in some situations unnecessarily, e.g. when fragment and bit extract
5171 /// sizes differ.
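///
/// For illustration (offsets here are hypothetical):
///   Expr = !DIExpression(DW_OP_LLVM_extract_bits_zext, 16, 8),
///   Frag = {OffsetInBits: 0, SizeInBits: 8}, BitExtractOffset = -16
/// yields
///   !DIExpression(DW_OP_LLVM_extract_bits_zext, 0, 8)
/// and no DW_OP_LLVM_fragment is appended because a bit extract is present.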
5172 static DIExpression *createOrReplaceFragment(const DIExpression *Expr,
5173                                              DIExpression::FragmentInfo Frag,
5174                                              int64_t BitExtractOffset) {
5175   SmallVector<uint64_t, 8> Ops;
5176   bool HasFragment = false;
5177   bool HasBitExtract = false;
5178 
5179   for (auto &Op : Expr->expr_ops()) {
5180     if (Op.getOp() == dwarf::DW_OP_LLVM_fragment) {
5181       HasFragment = true;
5182       continue;
5183     }
5184     if (Op.getOp() == dwarf::DW_OP_LLVM_extract_bits_zext ||
5185         Op.getOp() == dwarf::DW_OP_LLVM_extract_bits_sext) {
5186       HasBitExtract = true;
5187       int64_t ExtractOffsetInBits = Op.getArg(0);
5188       int64_t ExtractSizeInBits = Op.getArg(1);
5189 
5190       // DIExpression::createFragmentExpression doesn't know how to handle
5191       // a fragment that is smaller than the extract. Copy the behaviour
5192       // (bail) to avoid non-NFC changes.
5193       // FIXME: Don't do this.
5194       if (Frag.SizeInBits < uint64_t(ExtractSizeInBits))
5195         return nullptr;
5196 
5197       assert(BitExtractOffset <= 0);
5198       int64_t AdjustedOffset = ExtractOffsetInBits + BitExtractOffset;
5199 
5200       // DIExpression::createFragmentExpression doesn't know what to do
5201       // if the new extract starts "outside" the existing one. Copy the
5202       // behaviour (bail) to avoid non-NFC changes.
5203       // FIXME: Don't do this.
5204       if (AdjustedOffset < 0)
5205         return nullptr;
5206 
5207       Ops.push_back(Op.getOp());
5208       Ops.push_back(std::max<int64_t>(0, AdjustedOffset));
5209       Ops.push_back(ExtractSizeInBits);
5210       continue;
5211     }
5212     Op.appendToVector(Ops);
5213   }
5214 
5215   // Unsupported by createFragmentExpression, so don't support it here yet to
5216   // preserve NFC-ness.
5217   if (HasFragment && HasBitExtract)
5218     return nullptr;
5219 
5220   if (!HasBitExtract) {
5221     Ops.push_back(dwarf::DW_OP_LLVM_fragment);
5222     Ops.push_back(Frag.OffsetInBits);
5223     Ops.push_back(Frag.SizeInBits);
5224   }
5225   return DIExpression::get(Expr->getContext(), Ops);
5226 }
5227 
5228 /// Insert a new dbg.declare.
5229 /// \p Orig Original to copy debug loc and variable from.
5230 /// \p NewAddr Location's new base address.
5231 /// \p NewAddrExpr New expression to apply to address.
5232 /// \p BeforeInst Insert position.
5233 /// \p NewFragment New fragment (absolute, non-relative).
5234 /// \p BitExtractAdjustment Offset to apply to any extract_bits op.
5235 static void
5236 insertNewDbgInst(DIBuilder &DIB, DbgDeclareInst *Orig, AllocaInst *NewAddr,
5237                  DIExpression *NewAddrExpr, Instruction *BeforeInst,
5238                  std::optional<DIExpression::FragmentInfo> NewFragment,
5239                  int64_t BitExtractAdjustment) {
5240   if (NewFragment)
5241     NewAddrExpr = createOrReplaceFragment(NewAddrExpr, *NewFragment,
5242                                           BitExtractAdjustment);
5243   if (!NewAddrExpr)
5244     return;
5245 
5246   DIB.insertDeclare(NewAddr, Orig->getVariable(), NewAddrExpr,
5247                     Orig->getDebugLoc(), BeforeInst->getIterator());
5248 }
5249 
5250 /// Insert a new dbg.assign.
5251 /// \p Orig Original to copy debug loc, variable, value and value expression
5252 ///    from.
5253 /// \p NewAddr Location's new base address.
5254 /// \p NewAddrExpr New expression to apply to address.
5255 /// \p BeforeInst Insert position.
5256 /// \p NewFragment New fragment (absolute, non-relative).
5257 /// \p BitExtractAdjustment Offset to apply to any extract_bits op.
5258 static void
5259 insertNewDbgInst(DIBuilder &DIB, DbgAssignIntrinsic *Orig, AllocaInst *NewAddr,
5260                  DIExpression *NewAddrExpr, Instruction *BeforeInst,
5261                  std::optional<DIExpression::FragmentInfo> NewFragment,
5262                  int64_t BitExtractAdjustment) {
5263   // DIBuilder::insertDbgAssign will insert the #dbg_assign after NewAddr.
5264   (void)BeforeInst;
5265 
5266   // A dbg.assign puts fragment info in the value expression only. The address
5267   // expression has already been built: NewAddrExpr.
5268   DIExpression *NewFragmentExpr = Orig->getExpression();
5269   if (NewFragment)
5270     NewFragmentExpr = createOrReplaceFragment(NewFragmentExpr, *NewFragment,
5271                                               BitExtractAdjustment);
5272   if (!NewFragmentExpr)
5273     return;
5274 
5275   // Apply a DIAssignID to the store if it doesn't already have it.
5276   if (!NewAddr->hasMetadata(LLVMContext::MD_DIAssignID)) {
5277     NewAddr->setMetadata(LLVMContext::MD_DIAssignID,
5278                          DIAssignID::getDistinct(NewAddr->getContext()));
5279   }
5280 
5281   Instruction *NewAssign = cast<Instruction *>(DIB.insertDbgAssign(
5282       NewAddr, Orig->getValue(), Orig->getVariable(), NewFragmentExpr, NewAddr,
5283       NewAddrExpr, Orig->getDebugLoc()));
5284   LLVM_DEBUG(dbgs() << "Created new assign intrinsic: " << *NewAssign << "\n");
5285   (void)NewAssign;
5286 }
5287 
5288 /// Insert a new DbgRecord.
5289 /// \p Orig Original to copy record type, debug loc and variable from, and
5290 ///    additionally value and value expression for dbg_assign records.
5291 /// \p NewAddr Location's new base address.
5292 /// \p NewAddrExpr New expression to apply to address.
5293 /// \p BeforeInst Insert position.
5294 /// \p NewFragment New fragment (absolute, non-relative).
5295 /// \p BitExtractAdjustment Offset to apply to any extract_bits op.
5296 static void
5297 insertNewDbgInst(DIBuilder &DIB, DbgVariableRecord *Orig, AllocaInst *NewAddr,
5298                  DIExpression *NewAddrExpr, Instruction *BeforeInst,
5299                  std::optional<DIExpression::FragmentInfo> NewFragment,
5300                  int64_t BitExtractAdjustment) {
5301   (void)DIB;
5302 
5303   // A dbg_assign puts fragment info in the value expression only. The address
5304   // expression has already been built: NewAddrExpr. A dbg_declare puts the
5305   // new fragment info into NewAddrExpr (as it only has one expression).
5306   DIExpression *NewFragmentExpr =
5307       Orig->isDbgAssign() ? Orig->getExpression() : NewAddrExpr;
5308   if (NewFragment)
5309     NewFragmentExpr = createOrReplaceFragment(NewFragmentExpr, *NewFragment,
5310                                               BitExtractAdjustment);
5311   if (!NewFragmentExpr)
5312     return;
5313 
5314   if (Orig->isDbgDeclare()) {
5315     DbgVariableRecord *DVR = DbgVariableRecord::createDVRDeclare(
5316         NewAddr, Orig->getVariable(), NewFragmentExpr, Orig->getDebugLoc());
5317     BeforeInst->getParent()->insertDbgRecordBefore(DVR,
5318                                                    BeforeInst->getIterator());
5319     return;
5320   }
5321 
5322   if (Orig->isDbgValue()) {
5323     DbgVariableRecord *DVR = DbgVariableRecord::createDbgVariableRecord(
5324         NewAddr, Orig->getVariable(), NewFragmentExpr, Orig->getDebugLoc());
5325     // Drop debug information if the expression doesn't start with a
5326     // DW_OP_deref. This is because without a DW_OP_deref, the #dbg_value
5327     // describes the address of the alloca rather than the value inside it.
5328     if (!NewFragmentExpr->startsWithDeref())
5329       DVR->setKillAddress();
5330     BeforeInst->getParent()->insertDbgRecordBefore(DVR,
5331                                                    BeforeInst->getIterator());
5332     return;
5333   }
5334 
5335   // Apply a DIAssignID to the store if it doesn't already have it.
5336   if (!NewAddr->hasMetadata(LLVMContext::MD_DIAssignID)) {
5337     NewAddr->setMetadata(LLVMContext::MD_DIAssignID,
5338                          DIAssignID::getDistinct(NewAddr->getContext()));
5339   }
5340 
5341   DbgVariableRecord *NewAssign = DbgVariableRecord::createLinkedDVRAssign(
5342       NewAddr, Orig->getValue(), Orig->getVariable(), NewFragmentExpr, NewAddr,
5343       NewAddrExpr, Orig->getDebugLoc());
5344   LLVM_DEBUG(dbgs() << "Created new DVRAssign: " << *NewAssign << "\n");
5345   (void)NewAssign;
5346 }
5347 
5348 /// Walks the slices of an alloca and forms partitions based on them,
5349 /// rewriting each of their uses.
5350 bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) {
5351   if (AS.begin() == AS.end())
5352     return false;
5353 
5354   unsigned NumPartitions = 0;
5355   bool Changed = false;
5356   const DataLayout &DL = AI.getModule()->getDataLayout();
5357 
5358   // First try to pre-split loads and stores.
5359   Changed |= presplitLoadsAndStores(AI, AS);
5360 
5361   // Now that we have identified any pre-splitting opportunities,
5362   // mark loads and stores unsplittable except for the following case.
5363   // We leave a slice splittable if all other slices are disjoint or fully
5364   // included in the slice, such as whole-alloca loads and stores.
5365   // If we fail to split these during pre-splitting, we want to force them
5366   // to be rewritten into a partition.
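  // E.g. (illustrative): for an 8-byte alloca with slices [0,8), [0,4) and
  // [4,8), offset 4 is interior to the whole-alloca slice, so the [0,4) and
  // [4,8) loads/stores are made unsplittable below while the [0,8) slice
  // stays splittable.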
5367   bool IsSorted = true;
5368 
5369   uint64_t AllocaSize =
5370       DL.getTypeAllocSize(AI.getAllocatedType()).getFixedValue();
5371   const uint64_t MaxBitVectorSize = 1024;
5372   if (AllocaSize <= MaxBitVectorSize) {
5373     // If a byte boundary is included in any load or store, a slice starting or
5374     // ending at the boundary is not splittable.
5375     SmallBitVector SplittableOffset(AllocaSize + 1, true);
5376     for (Slice &S : AS)
5377       for (unsigned O = S.beginOffset() + 1;
5378            O < S.endOffset() && O < AllocaSize; O++)
5379         SplittableOffset.reset(O);
5380 
5381     for (Slice &S : AS) {
5382       if (!S.isSplittable())
5383         continue;
5384 
5385       if ((S.beginOffset() > AllocaSize || SplittableOffset[S.beginOffset()]) &&
5386           (S.endOffset() > AllocaSize || SplittableOffset[S.endOffset()]))
5387         continue;
5388 
5389       if (isa<LoadInst>(S.getUse()->getUser()) ||
5390           isa<StoreInst>(S.getUse()->getUser())) {
5391         S.makeUnsplittable();
5392         IsSorted = false;
5393       }
5394     }
5395   } else {
5396     // We only allow whole-alloca splittable loads and stores
5397     // for a large alloca to avoid creating an overly large BitVector.
5398     for (Slice &S : AS) {
5399       if (!S.isSplittable())
5400         continue;
5401 
5402       if (S.beginOffset() == 0 && S.endOffset() >= AllocaSize)
5403         continue;
5404 
5405       if (isa<LoadInst>(S.getUse()->getUser()) ||
5406           isa<StoreInst>(S.getUse()->getUser())) {
5407         S.makeUnsplittable();
5408         IsSorted = false;
5409       }
5410     }
5411   }
5412 
5413   if (!IsSorted)
5414     llvm::stable_sort(AS);
5415 
5416   /// Describes the allocas introduced by rewritePartition in order to migrate
5417   /// the debug info.
5418   struct Fragment {
5419     AllocaInst *Alloca;
5420     uint64_t Offset;
5421     uint64_t Size;
5422     Fragment(AllocaInst *AI, uint64_t O, uint64_t S)
5423         : Alloca(AI), Offset(O), Size(S) {}
5424   };
5425   SmallVector<Fragment, 4> Fragments;
5426 
5427   // Rewrite each partition.
5428   for (auto &P : AS.partitions()) {
5429     if (AllocaInst *NewAI = rewritePartition(AI, AS, P)) {
5430       Changed = true;
5431       if (NewAI != &AI) {
5432         uint64_t SizeOfByte = 8;
5433         uint64_t AllocaSize =
5434             DL.getTypeSizeInBits(NewAI->getAllocatedType()).getFixedValue();
5435         // Don't include any padding.
5436         uint64_t Size = std::min(AllocaSize, P.size() * SizeOfByte);
5437         Fragments.push_back(
5438             Fragment(NewAI, P.beginOffset() * SizeOfByte, Size));
5439       }
5440     }
5441     ++NumPartitions;
5442   }
5443 
5444   NumAllocaPartitions += NumPartitions;
5445   MaxPartitionsPerAlloca.updateMax(NumPartitions);
5446 
5447   // Migrate debug information from the old alloca to the new alloca(s)
5448   // and the individual partitions.
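  // E.g. (illustrative): when a 16-byte alloca for variable "v" is split into
  // two 8-byte slices, a #dbg_declare covering the whole variable becomes two
  // records whose expressions carry DW_OP_LLVM_fragment 0,64 and
  // DW_OP_LLVM_fragment 64,64 respectively.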
5449   auto MigrateOne = [&](auto *DbgVariable) {
5450     // Can't overlap with undef memory.
5451     if (isKillAddress(DbgVariable))
5452       return;
5453 
5454     const Value *DbgPtr = getAddress(DbgVariable);
5455     DIExpression::FragmentInfo VarFrag =
5456         DbgVariable->getFragmentOrEntireVariable();
5457     // Get the address expression constant offset if one exists and the ops
5458     // that come after it.
5459     int64_t CurrentExprOffsetInBytes = 0;
5460     SmallVector<uint64_t> PostOffsetOps;
5461     if (!getAddressExpression(DbgVariable)
5462              ->extractLeadingOffset(CurrentExprOffsetInBytes, PostOffsetOps))
5463       return; // Couldn't interpret this DIExpression - drop the var.
5464 
5465     // Offset defined by a DW_OP_LLVM_extract_bits_[sz]ext.
5466     int64_t ExtractOffsetInBits = 0;
5467     for (auto Op : getAddressExpression(DbgVariable)->expr_ops()) {
5468       if (Op.getOp() == dwarf::DW_OP_LLVM_extract_bits_zext ||
5469           Op.getOp() == dwarf::DW_OP_LLVM_extract_bits_sext) {
5470         ExtractOffsetInBits = Op.getArg(0);
5471         break;
5472       }
5473     }
5474 
5475     DIBuilder DIB(*AI.getModule(), /*AllowUnresolved*/ false);
5476     for (auto Fragment : Fragments) {
5477       int64_t OffsetFromLocationInBits;
5478       std::optional<DIExpression::FragmentInfo> NewDbgFragment;
5479       // Find the variable fragment that the new alloca slice covers.
5480       // Drop debug info for this variable fragment if we can't compute an
5481       // intersection between it and the alloca slice.
5482       if (!DIExpression::calculateFragmentIntersect(
5483               DL, &AI, Fragment.Offset, Fragment.Size, DbgPtr,
5484               CurrentExprOffsetInBytes * 8, ExtractOffsetInBits, VarFrag,
5485               NewDbgFragment, OffsetFromLocationInBits))
5486         continue; // Do not migrate this fragment to this slice.
5487 
5488       // A zero-sized fragment indicates there's no intersection between the
5489       // variable fragment and the alloca slice. Skip this slice for this
5490       // variable fragment.
5491       if (NewDbgFragment && !NewDbgFragment->SizeInBits)
5492         continue; // Do not migrate this fragment to this slice.
5493 
5494       // No fragment indicates DbgVariable's variable or fragment exactly
5495       // overlaps the slice; copy its fragment (or nullopt if there isn't one).
5496       if (!NewDbgFragment)
5497         NewDbgFragment = DbgVariable->getFragment();
5498 
5499       // Reduce the new expression offset by the bit-extract offset since
5500       // we'll be keeping that.
5501       int64_t OffsetFromNewAllocaInBits =
5502           OffsetFromLocationInBits - ExtractOffsetInBits;
5503       // We need to adjust an existing bit extract if the offset expression
5504       // can't eat the slack (i.e., if the new offset would be negative).
5505       int64_t BitExtractOffset =
5506           std::min<int64_t>(0, OffsetFromNewAllocaInBits);
5507       // The magnitude of a negative value indicates how many bits into the
5508       // existing variable fragment the memory region begins. The new variable
5509       // fragment already excludes those bits - the new DbgPtr offset only
5510       // needs to be applied if it's positive.
5511       OffsetFromNewAllocaInBits =
5512           std::max(int64_t(0), OffsetFromNewAllocaInBits);
5513 
5514       // Rebuild the expression:
5515       //    {Offset(OffsetFromNewAllocaInBits), PostOffsetOps, NewDbgFragment}
5516       // Add NewDbgFragment later, because dbg.assigns don't want it in the
5517       // address expression but the value expression instead.
5518       DIExpression *NewExpr = DIExpression::get(AI.getContext(), PostOffsetOps);
5519       if (OffsetFromNewAllocaInBits > 0) {
5520         int64_t OffsetInBytes = (OffsetFromNewAllocaInBits + 7) / 8;
5521         NewExpr = DIExpression::prepend(NewExpr, /*flags=*/0, OffsetInBytes);
5522       }
5523 
5524       // Remove any existing intrinsics on the new alloca describing
5525       // the variable fragment.
5526       auto RemoveOne = [DbgVariable](auto *OldDII) {
5527         auto SameVariableFragment = [](const auto *LHS, const auto *RHS) {
5528           return LHS->getVariable() == RHS->getVariable() &&
5529                  LHS->getDebugLoc()->getInlinedAt() ==
5530                      RHS->getDebugLoc()->getInlinedAt();
5531         };
5532         if (SameVariableFragment(OldDII, DbgVariable))
5533           OldDII->eraseFromParent();
5534       };
5535       for_each(findDbgDeclares(Fragment.Alloca), RemoveOne);
5536       for_each(findDVRDeclares(Fragment.Alloca), RemoveOne);
5537       for_each(findDVRValues(Fragment.Alloca), RemoveOne);
5538       insertNewDbgInst(DIB, DbgVariable, Fragment.Alloca, NewExpr, &AI,
5539                        NewDbgFragment, BitExtractOffset);
5540     }
5541   };
5542 
5543   // Migrate debug information from the old alloca to the new alloca(s)
5544   // and the individual partitions.
5545   for_each(findDbgDeclares(&AI), MigrateOne);
5546   for_each(findDVRDeclares(&AI), MigrateOne);
5547   for_each(findDVRValues(&AI), MigrateOne);
5548   for_each(at::getAssignmentMarkers(&AI), MigrateOne);
5549   for_each(at::getDVRAssignmentMarkers(&AI), MigrateOne);
5550 
5551   return Changed;
5552 }
5553 
5554 /// Clobber a use with poison, deleting the used value if it becomes dead.
5555 void SROA::clobberUse(Use &U) {
5556   Value *OldV = U;
5557   // Replace the use with a poison value.
5558   U = PoisonValue::get(OldV->getType());
5559 
5560   // Check for this making an instruction dead. We have to garbage collect
5561   // all the dead instructions to ensure the uses of any alloca end up being
5562   // minimal.
5563   if (Instruction *OldI = dyn_cast<Instruction>(OldV))
5564     if (isInstructionTriviallyDead(OldI)) {
5565       DeadInsts.push_back(OldI);
5566     }
5567 }
5568 
5569 /// A basic LoadAndStorePromoter that does not remove store nodes.
5570 class BasicLoadAndStorePromoter : public LoadAndStorePromoter {
5571 public:
5572   BasicLoadAndStorePromoter(ArrayRef<const Instruction *> Insts, SSAUpdater &S,
5573                             Type *ZeroType)
5574       : LoadAndStorePromoter(Insts, S), ZeroType(ZeroType) {}
5575   bool shouldDelete(Instruction *I) const override {
5576     return !isa<StoreInst>(I) && !isa<AllocaInst>(I);
5577   }
5578 
5579   Value *getValueToUseForAlloca(Instruction *I) const override {
5580     return UndefValue::get(ZeroType);
5581   }
5582 
5583 private:
5584   Type *ZeroType;
5585 };
5586 
5587 bool SROA::propagateStoredValuesToLoads(AllocaInst &AI, AllocaSlices &AS) {
5588   // Look through each "partition", looking for slices with the same start/end
5589   // that do not overlap with any before them. The slices are sorted by
5590   // increasing beginOffset. We don't use AS.partitions(), as it will use a more
5591   // sophisticated algorithm that takes splittable slices into account.
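  // E.g. (illustrative): if every slice is a simple i64 load or store
  // covering exactly bytes [0,8), the stored values can be forwarded to the
  // loads via SSAUpdater even though the alloca itself escapes read-only.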
5592   LLVM_DEBUG(dbgs() << "Attempting to propagate values on " << AI << "\n");
5593   bool AllSameAndValid = true;
5594   Type *PartitionType = nullptr;
5595   SmallVector<Instruction *> Insts;
5596   uint64_t BeginOffset = 0;
5597   uint64_t EndOffset = 0;
5598 
5599   auto Flush = [&]() {
5600     if (AllSameAndValid && !Insts.empty()) {
5601       LLVM_DEBUG(dbgs() << "Propagate values on slice [" << BeginOffset << ", "
5602                         << EndOffset << ")\n");
5603       SmallVector<PHINode *, 4> NewPHIs;
5604       SSAUpdater SSA(&NewPHIs);
5605       Insts.push_back(&AI);
5606       BasicLoadAndStorePromoter Promoter(Insts, SSA, PartitionType);
5607       Promoter.run(Insts);
5608     }
5609     AllSameAndValid = true;
5610     PartitionType = nullptr;
5611     Insts.clear();
5612   };
5613 
5614   for (Slice &S : AS) {
5615     auto *User = cast<Instruction>(S.getUse()->getUser());
5616     if (isAssumeLikeIntrinsic(User)) {
5617       LLVM_DEBUG({
5618         dbgs() << "Ignoring slice: ";
5619         AS.print(dbgs(), &S);
5620       });
5621       continue;
5622     }
5623     if (S.beginOffset() >= EndOffset) {
5624       Flush();
5625       BeginOffset = S.beginOffset();
5626       EndOffset = S.endOffset();
5627     } else if (S.beginOffset() != BeginOffset || S.endOffset() != EndOffset) {
5628       if (AllSameAndValid) {
5629         LLVM_DEBUG({
5630           dbgs() << "Slice does not match range [" << BeginOffset << ", "
5631                  << EndOffset << ")";
5632           AS.print(dbgs(), &S);
5633         });
5634         AllSameAndValid = false;
5635       }
5636       EndOffset = std::max(EndOffset, S.endOffset());
5637       continue;
5638     }
5639 
5640     if (auto *LI = dyn_cast<LoadInst>(User)) {
5641       Type *UserTy = LI->getType();
5642       // LoadAndStorePromoter requires all the types to be the same.
5643       if (!LI->isSimple() || (PartitionType && UserTy != PartitionType))
5644         AllSameAndValid = false;
5645       PartitionType = UserTy;
5646       Insts.push_back(User);
5647     } else if (auto *SI = dyn_cast<StoreInst>(User)) {
5648       Type *UserTy = SI->getValueOperand()->getType();
5649       if (!SI->isSimple() || (PartitionType && UserTy != PartitionType))
5650         AllSameAndValid = false;
5651       PartitionType = UserTy;
5652       Insts.push_back(User);
5653     } else {
5654       AllSameAndValid = false;
5655     }
5656   }
5657 
5658   Flush();
5659   return true;
5660 }
5661 
5662 /// Analyze an alloca for SROA.
5663 ///
5664 /// This analyzes the alloca to ensure we can reason about it, builds
5665 /// the slices of the alloca, and then hands it off to be split and
5666 /// rewritten as needed.
5667 std::pair<bool /*Changed*/, bool /*CFGChanged*/>
5668 SROA::runOnAlloca(AllocaInst &AI) {
5669   bool Changed = false;
5670   bool CFGChanged = false;
5671 
5672   LLVM_DEBUG(dbgs() << "SROA alloca: " << AI << "\n");
5673   ++NumAllocasAnalyzed;
5674 
5675   // Special case dead allocas, as they're trivial.
5676   if (AI.use_empty()) {
5677     AI.eraseFromParent();
5678     Changed = true;
5679     return {Changed, CFGChanged};
5680   }
5681   const DataLayout &DL = AI.getDataLayout();
5682 
5683   // Skip alloca forms that this analysis can't handle.
5684   auto *AT = AI.getAllocatedType();
5685   TypeSize Size = DL.getTypeAllocSize(AT);
5686   if (AI.isArrayAllocation() || !AT->isSized() || Size.isScalable() ||
5687       Size.getFixedValue() == 0)
5688     return {Changed, CFGChanged};
5689 
5690   // First, split any FCA loads and stores touching this alloca to promote
5691   // better splitting and promotion opportunities.
5692   IRBuilderTy IRB(&AI);
5693   AggLoadStoreRewriter AggRewriter(DL, IRB);
5694   Changed |= AggRewriter.rewrite(AI);
5695 
5696   // Build the slices using a recursive instruction-visiting builder.
5697   AllocaSlices AS(DL, AI);
5698   LLVM_DEBUG(AS.print(dbgs()));
5699   if (AS.isEscaped())
5700     return {Changed, CFGChanged};
5701 
5702   if (AS.isEscapedReadOnly()) {
5703     Changed |= propagateStoredValuesToLoads(AI, AS);
5704     return {Changed, CFGChanged};
5705   }
5706 
5707   // Delete all the dead users of this alloca before splitting and rewriting it.
5708   for (Instruction *DeadUser : AS.getDeadUsers()) {
5709     // Free up everything used by this instruction.
5710     for (Use &DeadOp : DeadUser->operands())
5711       clobberUse(DeadOp);
5712 
5713     // Now replace the uses of this instruction.
5714     DeadUser->replaceAllUsesWith(PoisonValue::get(DeadUser->getType()));
5715 
5716     // And mark it for deletion.
5717     DeadInsts.push_back(DeadUser);
5718     Changed = true;
5719   }
5720   for (Use *DeadOp : AS.getDeadOperands()) {
5721     clobberUse(*DeadOp);
5722     Changed = true;
5723   }
5724 
5725   // No slices to split. Leave the dead alloca for a later pass to clean up.
5726   if (AS.begin() == AS.end())
5727     return {Changed, CFGChanged};
5728 
5729   Changed |= splitAlloca(AI, AS);
5730 
5731   LLVM_DEBUG(dbgs() << "  Speculating PHIs\n");
5732   while (!SpeculatablePHIs.empty())
5733     speculatePHINodeLoads(IRB, *SpeculatablePHIs.pop_back_val());
5734 
5735   LLVM_DEBUG(dbgs() << "  Rewriting Selects\n");
5736   auto RemainingSelectsToRewrite = SelectsToRewrite.takeVector();
5737   while (!RemainingSelectsToRewrite.empty()) {
5738     const auto [K, V] = RemainingSelectsToRewrite.pop_back_val();
5739     CFGChanged |=
5740         rewriteSelectInstMemOps(*K, V, IRB, PreserveCFG ? nullptr : DTU);
5741   }
5742 
5743   return {Changed, CFGChanged};
5744 }
5745 
5746 /// Delete the dead instructions accumulated in this run.
5747 ///
5748 /// Recursively deletes the dead instructions we've accumulated. This is done
5749 /// at the very end to maximize locality of the recursive delete and to
5750 /// minimize the problems of invalidated instruction pointers as such pointers
5751 /// are used heavily in the intermediate stages of the algorithm.
5752 ///
5753 /// We also record the alloca instructions deleted here so that they aren't
5754 /// subsequently handed to mem2reg to promote.
5755 bool SROA::deleteDeadInstructions(
5756     SmallPtrSetImpl<AllocaInst *> &DeletedAllocas) {
5757   bool Changed = false;
5758   while (!DeadInsts.empty()) {
5759     Instruction *I = dyn_cast_or_null<Instruction>(DeadInsts.pop_back_val());
5760     if (!I)
5761       continue;
5762     LLVM_DEBUG(dbgs() << "Deleting dead instruction: " << *I << "\n");
5763 
5764     // If the instruction is an alloca, find the possible dbg.declare connected
5765     // to it, and remove it too. We must do this before calling RAUW or we will
5766     // not be able to find it.
5767     if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) {
5768       DeletedAllocas.insert(AI);
5769       for (DbgDeclareInst *OldDII : findDbgDeclares(AI))
5770         OldDII->eraseFromParent();
5771       for (DbgVariableRecord *OldDII : findDVRDeclares(AI))
5772         OldDII->eraseFromParent();
5773     }
5774 
5775     at::deleteAssignmentMarkers(I);
5776     I->replaceAllUsesWith(UndefValue::get(I->getType()));
5777 
5778     for (Use &Operand : I->operands())
5779       if (Instruction *U = dyn_cast<Instruction>(Operand)) {
5780         // Zero out the operand and see if it becomes trivially dead.
5781         Operand = nullptr;
5782         if (isInstructionTriviallyDead(U))
5783           DeadInsts.push_back(U);
5784       }
5785 
5786     ++NumDeleted;
5787     I->eraseFromParent();
5788     Changed = true;
5789   }
5790   return Changed;
5791 }
5792 /// Promote the allocas, using the best available technique.
5793 ///
5794 /// This attempts to promote whatever allocas have been identified as viable in
5795 /// the PromotableAllocas list. If that list is empty, there is nothing to do.
5796 /// This function returns whether any promotion occurred.
5797 bool SROA::promoteAllocas() {
5798   if (PromotableAllocas.empty())
5799     return false;
5800 
5801   if (SROASkipMem2Reg) {
5802     LLVM_DEBUG(dbgs() << "Not promoting allocas with mem2reg!\n");
5803   } else {
5804     LLVM_DEBUG(dbgs() << "Promoting allocas with mem2reg...\n");
5805     NumPromoted += PromotableAllocas.size();
5806     PromoteMemToReg(PromotableAllocas.getArrayRef(), DTU->getDomTree(), AC);
5807   }
5808 
5809   PromotableAllocas.clear();
5810   return true;
5811 }
5812 
5813 std::pair<bool /*Changed*/, bool /*CFGChanged*/> SROA::runSROA(Function &F) {
5814   LLVM_DEBUG(dbgs() << "SROA function: " << F.getName() << "\n");
5815 
5816   const DataLayout &DL = F.getDataLayout();
5817   BasicBlock &EntryBB = F.getEntryBlock();
5818   for (BasicBlock::iterator I = EntryBB.begin(), E = std::prev(EntryBB.end());
5819        I != E; ++I) {
5820     if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) {
5821       if (DL.getTypeAllocSize(AI->getAllocatedType()).isScalable() &&
5822           isAllocaPromotable(AI))
5823         PromotableAllocas.insert(AI);
5824       else
5825         Worklist.insert(AI);
5826     }
5827   }
5828 
5829   bool Changed = false;
5830   bool CFGChanged = false;
5831   // A set of deleted alloca instruction pointers which should be removed from
5832   // the list of promotable allocas.
5833   SmallPtrSet<AllocaInst *, 4> DeletedAllocas;
5834 
5835   do {
5836     while (!Worklist.empty()) {
5837       auto [IterationChanged, IterationCFGChanged] =
5838           runOnAlloca(*Worklist.pop_back_val());
5839       Changed |= IterationChanged;
5840       CFGChanged |= IterationCFGChanged;
5841 
5842       Changed |= deleteDeadInstructions(DeletedAllocas);
5843 
5844       // Remove the deleted allocas from various lists so that we don't try to
5845       // continue processing them.
5846       if (!DeletedAllocas.empty()) {
5847         Worklist.set_subtract(DeletedAllocas);
5848         PostPromotionWorklist.set_subtract(DeletedAllocas);
5849         PromotableAllocas.set_subtract(DeletedAllocas);
5850         DeletedAllocas.clear();
5851       }
5852     }
5853 
5854     Changed |= promoteAllocas();
5855 
5856     Worklist = PostPromotionWorklist;
5857     PostPromotionWorklist.clear();
5858   } while (!Worklist.empty());
5859 
5860   assert((!CFGChanged || Changed) && "Can not only modify the CFG.");
5861   assert((!CFGChanged || !PreserveCFG) &&
5862          "Should not have modified the CFG when told to preserve it.");
5863 
5864   if (Changed && isAssignmentTrackingEnabled(*F.getParent())) {
5865     for (auto &BB : F) {
5866       RemoveRedundantDbgInstrs(&BB);
5867     }
5868   }
5869 
5870   return {Changed, CFGChanged};
5871 }
5872 
5873 PreservedAnalyses SROAPass::run(Function &F, FunctionAnalysisManager &AM) {
5874   DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(F);
5875   AssumptionCache &AC = AM.getResult<AssumptionAnalysis>(F);
5876   DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy);
5877   auto [Changed, CFGChanged] =
5878       SROA(&F.getContext(), &DTU, &AC, PreserveCFG).runSROA(F);
5879   if (!Changed)
5880     return PreservedAnalyses::all();
5881   PreservedAnalyses PA;
5882   if (!CFGChanged)
5883     PA.preserveSet<CFGAnalyses>();
5884   PA.preserve<DominatorTreeAnalysis>();
5885   return PA;
5886 }
5887 
5888 void SROAPass::printPipeline(
5889     raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
5890   static_cast<PassInfoMixin<SROAPass> *>(this)->printPipeline(
5891       OS, MapClassName2PassName);
5892   OS << (PreserveCFG == SROAOptions::PreserveCFG ? "<preserve-cfg>"
5893                                                  : "<modify-cfg>");
5894 }
5895 
5896 SROAPass::SROAPass(SROAOptions PreserveCFG) : PreserveCFG(PreserveCFG) {}
5897 
5898 namespace {
5899 
5900 /// A legacy pass for the legacy pass manager that wraps the \c SROA pass.
5901 class SROALegacyPass : public FunctionPass {
5902   SROAOptions PreserveCFG;
5903 
5904 public:
5905   static char ID;
5906 
5907   SROALegacyPass(SROAOptions PreserveCFG = SROAOptions::PreserveCFG)
5908       : FunctionPass(ID), PreserveCFG(PreserveCFG) {
5909     initializeSROALegacyPassPass(*PassRegistry::getPassRegistry());
5910   }
5911 
5912   bool runOnFunction(Function &F) override {
5913     if (skipFunction(F))
5914       return false;
5915 
5916     DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
5917     AssumptionCache &AC =
5918         getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
5919     DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy);
5920     auto [Changed, _] =
5921         SROA(&F.getContext(), &DTU, &AC, PreserveCFG).runSROA(F);
5922     return Changed;
5923   }
5924 
5925   void getAnalysisUsage(AnalysisUsage &AU) const override {
5926     AU.addRequired<AssumptionCacheTracker>();
5927     AU.addRequired<DominatorTreeWrapperPass>();
5928     AU.addPreserved<GlobalsAAWrapperPass>();
5929     AU.addPreserved<DominatorTreeWrapperPass>();
5930   }
5931 
5932   StringRef getPassName() const override { return "SROA"; }
5933 };
5934 
5935 } // end anonymous namespace
5936 
5937 char SROALegacyPass::ID = 0;
5938 
5939 FunctionPass *llvm::createSROAPass(bool PreserveCFG) {
5940   return new SROALegacyPass(PreserveCFG ? SROAOptions::PreserveCFG
5941                                         : SROAOptions::ModifyCFG);
5942 }
5943 
5944 INITIALIZE_PASS_BEGIN(SROALegacyPass, "sroa",
5945                       "Scalar Replacement Of Aggregates", false, false)
5946 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
5947 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
5948 INITIALIZE_PASS_END(SROALegacyPass, "sroa", "Scalar Replacement Of Aggregates",
5949                     false, false)
5950