xref: /freebsd/contrib/llvm-project/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp (revision a4e5e0106ac7145f56eb39a691e302cabb4635be)
1 //===- InstCombineLoadStoreAlloca.cpp -------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements the visit functions for load, store and alloca.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "InstCombineInternal.h"
14 #include "llvm/ADT/MapVector.h"
15 #include "llvm/ADT/SmallString.h"
16 #include "llvm/ADT/Statistic.h"
17 #include "llvm/Analysis/AliasAnalysis.h"
18 #include "llvm/Analysis/Loads.h"
19 #include "llvm/IR/DataLayout.h"
20 #include "llvm/IR/DebugInfoMetadata.h"
21 #include "llvm/IR/IntrinsicInst.h"
22 #include "llvm/IR/LLVMContext.h"
23 #include "llvm/IR/PatternMatch.h"
24 #include "llvm/Transforms/InstCombine/InstCombiner.h"
25 #include "llvm/Transforms/Utils/Local.h"
26 using namespace llvm;
27 using namespace PatternMatch;
28 
29 #define DEBUG_TYPE "instcombine"
30 
31 STATISTIC(NumDeadStore, "Number of dead stores eliminated");
32 STATISTIC(NumGlobalCopies, "Number of allocas copied from constant global");
33 
34 static cl::opt<unsigned> MaxCopiedFromConstantUsers(
35     "instcombine-max-copied-from-constant-users", cl::init(300),
36     cl::desc("Maximum users to visit in copy from constant transform"),
37     cl::Hidden);
38 
39 /// isOnlyCopiedFromConstantMemory - Recursively walk the uses of a (derived)
40 /// pointer to an alloca.  Ignore any reads of the pointer, return false if we
41 /// see any stores or other unknown uses.  If we see pointer arithmetic, keep
42 /// track of whether it moves the pointer (with IsOffset) but otherwise traverse
43 /// the uses.  If we see a memcpy/memmove that targets an unoffseted pointer to
44 /// the alloca, and if the source pointer is a pointer to a constant memory
45 /// location, we can optimize this.
46 static bool
47 isOnlyCopiedFromConstantMemory(AAResults *AA, AllocaInst *V,
48                                MemTransferInst *&TheCopy,
49                                SmallVectorImpl<Instruction *> &ToDelete) {
50   // We track lifetime intrinsics as we encounter them.  If we decide to go
51   // ahead and replace the value with the memory location, this lets the caller
52   // quickly eliminate the markers.
53 
54   using ValueAndIsOffset = PointerIntPair<Value *, 1, bool>;
55   SmallVector<ValueAndIsOffset, 32> Worklist;
56   SmallPtrSet<ValueAndIsOffset, 32> Visited;
57   Worklist.emplace_back(V, false);
58   while (!Worklist.empty()) {
59     ValueAndIsOffset Elem = Worklist.pop_back_val();
60     if (!Visited.insert(Elem).second)
61       continue;
62     if (Visited.size() > MaxCopiedFromConstantUsers)
63       return false;
64 
65     const auto [Value, IsOffset] = Elem;
66     for (auto &U : Value->uses()) {
67       auto *I = cast<Instruction>(U.getUser());
68 
69       if (auto *LI = dyn_cast<LoadInst>(I)) {
70         // Ignore non-volatile loads, they are always ok.
71         if (!LI->isSimple()) return false;
72         continue;
73       }
74 
75       if (isa<PHINode, SelectInst>(I)) {
76         // We set IsOffset=true, to forbid the memcpy from occurring after the
77         // phi: If one of the phi operands is not based on the alloca, we
78         // would incorrectly omit a write.
79         Worklist.emplace_back(I, true);
80         continue;
81       }
82       if (isa<BitCastInst, AddrSpaceCastInst>(I)) {
83         // If uses of the bitcast are ok, we are ok.
84         Worklist.emplace_back(I, IsOffset);
85         continue;
86       }
87       if (auto *GEP = dyn_cast<GetElementPtrInst>(I)) {
88         // If the GEP has all zero indices, it doesn't offset the pointer. If it
89         // doesn't, it does.
90         Worklist.emplace_back(I, IsOffset || !GEP->hasAllZeroIndices());
91         continue;
92       }
93 
94       if (auto *Call = dyn_cast<CallBase>(I)) {
95         // If this is the function being called then we treat it like a load and
96         // ignore it.
97         if (Call->isCallee(&U))
98           continue;
99 
100         unsigned DataOpNo = Call->getDataOperandNo(&U);
101         bool IsArgOperand = Call->isArgOperand(&U);
102 
103         // Inalloca arguments are clobbered by the call.
104         if (IsArgOperand && Call->isInAllocaArgument(DataOpNo))
105           return false;
106 
107         // If this call site doesn't modify the memory, then we know it is just
108         // a load (but one that potentially returns the value itself), so we can
109         // ignore it if we know that the value isn't captured.
110         bool NoCapture = Call->doesNotCapture(DataOpNo);
111         if ((Call->onlyReadsMemory() && (Call->use_empty() || NoCapture)) ||
112             (Call->onlyReadsMemory(DataOpNo) && NoCapture))
113           continue;
114 
115         // If this is being passed as a byval argument, the caller is making a
116         // copy, so it is only a read of the alloca.
117         if (IsArgOperand && Call->isByValArgument(DataOpNo))
118           continue;
119       }
120 
121       // Lifetime intrinsics can be handled by the caller.
122       if (I->isLifetimeStartOrEnd()) {
123         assert(I->use_empty() && "Lifetime markers have no result to use!");
124         ToDelete.push_back(I);
125         continue;
126       }
127 
128       // If this is isn't our memcpy/memmove, reject it as something we can't
129       // handle.
130       MemTransferInst *MI = dyn_cast<MemTransferInst>(I);
131       if (!MI)
132         return false;
133 
134       // If the transfer is volatile, reject it.
135       if (MI->isVolatile())
136         return false;
137 
138       // If the transfer is using the alloca as a source of the transfer, then
139       // ignore it since it is a load (unless the transfer is volatile).
140       if (U.getOperandNo() == 1)
141         continue;
142 
143       // If we already have seen a copy, reject the second one.
144       if (TheCopy) return false;
145 
146       // If the pointer has been offset from the start of the alloca, we can't
147       // safely handle this.
148       if (IsOffset) return false;
149 
150       // If the memintrinsic isn't using the alloca as the dest, reject it.
151       if (U.getOperandNo() != 0) return false;
152 
153       // If the source of the memcpy/move is not constant, reject it.
154       if (isModSet(AA->getModRefInfoMask(MI->getSource())))
155         return false;
156 
157       // Otherwise, the transform is safe.  Remember the copy instruction.
158       TheCopy = MI;
159     }
160   }
161   return true;
162 }
163 
164 /// isOnlyCopiedFromConstantMemory - Return true if the specified alloca is only
165 /// modified by a copy from a constant memory location. If we can prove this, we
166 /// can replace any uses of the alloca with uses of the memory location
167 /// directly.
168 static MemTransferInst *
169 isOnlyCopiedFromConstantMemory(AAResults *AA,
170                                AllocaInst *AI,
171                                SmallVectorImpl<Instruction *> &ToDelete) {
172   MemTransferInst *TheCopy = nullptr;
173   if (isOnlyCopiedFromConstantMemory(AA, AI, TheCopy, ToDelete))
174     return TheCopy;
175   return nullptr;
176 }
177 
178 /// Returns true if V is dereferenceable for size of alloca.
179 static bool isDereferenceableForAllocaSize(const Value *V, const AllocaInst *AI,
180                                            const DataLayout &DL) {
181   if (AI->isArrayAllocation())
182     return false;
183   uint64_t AllocaSize = DL.getTypeStoreSize(AI->getAllocatedType());
184   if (!AllocaSize)
185     return false;
186   return isDereferenceableAndAlignedPointer(V, AI->getAlign(),
187                                             APInt(64, AllocaSize), DL);
188 }
189 
190 static Instruction *simplifyAllocaArraySize(InstCombinerImpl &IC,
191                                             AllocaInst &AI, DominatorTree &DT) {
192   // Check for array size of 1 (scalar allocation).
193   if (!AI.isArrayAllocation()) {
194     // i32 1 is the canonical array size for scalar allocations.
195     if (AI.getArraySize()->getType()->isIntegerTy(32))
196       return nullptr;
197 
198     // Canonicalize it.
199     return IC.replaceOperand(AI, 0, IC.Builder.getInt32(1));
200   }
201 
202   // Convert: alloca Ty, C - where C is a constant != 1 into: alloca [C x Ty], 1
203   if (const ConstantInt *C = dyn_cast<ConstantInt>(AI.getArraySize())) {
204     if (C->getValue().getActiveBits() <= 64) {
205       Type *NewTy = ArrayType::get(AI.getAllocatedType(), C->getZExtValue());
206       AllocaInst *New = IC.Builder.CreateAlloca(NewTy, AI.getAddressSpace(),
207                                                 nullptr, AI.getName());
208       New->setAlignment(AI.getAlign());
209 
210       replaceAllDbgUsesWith(AI, *New, *New, DT);
211 
212       // Scan to the end of the allocation instructions, to skip over a block of
213       // allocas if possible...also skip interleaved debug info
214       //
215       BasicBlock::iterator It(New);
216       while (isa<AllocaInst>(*It) || isa<DbgInfoIntrinsic>(*It))
217         ++It;
218 
219       // Now that I is pointing to the first non-allocation-inst in the block,
220       // insert our getelementptr instruction...
221       //
222       Type *IdxTy = IC.getDataLayout().getIndexType(AI.getType());
223       Value *NullIdx = Constant::getNullValue(IdxTy);
224       Value *Idx[2] = {NullIdx, NullIdx};
225       Instruction *GEP = GetElementPtrInst::CreateInBounds(
226           NewTy, New, Idx, New->getName() + ".sub");
227       IC.InsertNewInstBefore(GEP, *It);
228 
229       // Now make everything use the getelementptr instead of the original
230       // allocation.
231       return IC.replaceInstUsesWith(AI, GEP);
232     }
233   }
234 
235   if (isa<UndefValue>(AI.getArraySize()))
236     return IC.replaceInstUsesWith(AI, Constant::getNullValue(AI.getType()));
237 
238   // Ensure that the alloca array size argument has type equal to the offset
239   // size of the alloca() pointer, which, in the tyical case, is intptr_t,
240   // so that any casting is exposed early.
241   Type *PtrIdxTy = IC.getDataLayout().getIndexType(AI.getType());
242   if (AI.getArraySize()->getType() != PtrIdxTy) {
243     Value *V = IC.Builder.CreateIntCast(AI.getArraySize(), PtrIdxTy, false);
244     return IC.replaceOperand(AI, 0, V);
245   }
246 
247   return nullptr;
248 }
249 
250 namespace {
251 // If I and V are pointers in different address space, it is not allowed to
252 // use replaceAllUsesWith since I and V have different types. A
253 // non-target-specific transformation should not use addrspacecast on V since
254 // the two address space may be disjoint depending on target.
255 //
256 // This class chases down uses of the old pointer until reaching the load
257 // instructions, then replaces the old pointer in the load instructions with
258 // the new pointer. If during the chasing it sees bitcast or GEP, it will
259 // create new bitcast or GEP with the new pointer and use them in the load
260 // instruction.
261 class PointerReplacer {
262 public:
263   PointerReplacer(InstCombinerImpl &IC, Instruction &Root, unsigned SrcAS)
264       : IC(IC), Root(Root), FromAS(SrcAS) {}
265 
266   bool collectUsers();
267   void replacePointer(Value *V);
268 
269 private:
270   bool collectUsersRecursive(Instruction &I);
271   void replace(Instruction *I);
272   Value *getReplacement(Value *I);
273   bool isAvailable(Instruction *I) const {
274     return I == &Root || Worklist.contains(I);
275   }
276 
277   bool isEqualOrValidAddrSpaceCast(const Instruction *I,
278                                    unsigned FromAS) const {
279     const auto *ASC = dyn_cast<AddrSpaceCastInst>(I);
280     if (!ASC)
281       return false;
282     unsigned ToAS = ASC->getDestAddressSpace();
283     return (FromAS == ToAS) || IC.isValidAddrSpaceCast(FromAS, ToAS);
284   }
285 
286   SmallPtrSet<Instruction *, 32> ValuesToRevisit;
287   SmallSetVector<Instruction *, 4> Worklist;
288   MapVector<Value *, Value *> WorkMap;
289   InstCombinerImpl &IC;
290   Instruction &Root;
291   unsigned FromAS;
292 };
293 } // end anonymous namespace
294 
295 bool PointerReplacer::collectUsers() {
296   if (!collectUsersRecursive(Root))
297     return false;
298 
299   // Ensure that all outstanding (indirect) users of I
300   // are inserted into the Worklist. Return false
301   // otherwise.
302   for (auto *Inst : ValuesToRevisit)
303     if (!Worklist.contains(Inst))
304       return false;
305   return true;
306 }
307 
308 bool PointerReplacer::collectUsersRecursive(Instruction &I) {
309   for (auto *U : I.users()) {
310     auto *Inst = cast<Instruction>(&*U);
311     if (auto *Load = dyn_cast<LoadInst>(Inst)) {
312       if (Load->isVolatile())
313         return false;
314       Worklist.insert(Load);
315     } else if (auto *PHI = dyn_cast<PHINode>(Inst)) {
316       // All incoming values must be instructions for replacability
317       if (any_of(PHI->incoming_values(),
318                  [](Value *V) { return !isa<Instruction>(V); }))
319         return false;
320 
321       // If at least one incoming value of the PHI is not in Worklist,
322       // store the PHI for revisiting and skip this iteration of the
323       // loop.
324       if (any_of(PHI->incoming_values(), [this](Value *V) {
325             return !isAvailable(cast<Instruction>(V));
326           })) {
327         ValuesToRevisit.insert(Inst);
328         continue;
329       }
330 
331       Worklist.insert(PHI);
332       if (!collectUsersRecursive(*PHI))
333         return false;
334     } else if (auto *SI = dyn_cast<SelectInst>(Inst)) {
335       if (!isa<Instruction>(SI->getTrueValue()) ||
336           !isa<Instruction>(SI->getFalseValue()))
337         return false;
338 
339       if (!isAvailable(cast<Instruction>(SI->getTrueValue())) ||
340           !isAvailable(cast<Instruction>(SI->getFalseValue()))) {
341         ValuesToRevisit.insert(Inst);
342         continue;
343       }
344       Worklist.insert(SI);
345       if (!collectUsersRecursive(*SI))
346         return false;
347     } else if (isa<GetElementPtrInst, BitCastInst>(Inst)) {
348       Worklist.insert(Inst);
349       if (!collectUsersRecursive(*Inst))
350         return false;
351     } else if (auto *MI = dyn_cast<MemTransferInst>(Inst)) {
352       if (MI->isVolatile())
353         return false;
354       Worklist.insert(Inst);
355     } else if (isEqualOrValidAddrSpaceCast(Inst, FromAS)) {
356       Worklist.insert(Inst);
357     } else if (Inst->isLifetimeStartOrEnd()) {
358       continue;
359     } else {
360       LLVM_DEBUG(dbgs() << "Cannot handle pointer user: " << *U << '\n');
361       return false;
362     }
363   }
364 
365   return true;
366 }
367 
368 Value *PointerReplacer::getReplacement(Value *V) { return WorkMap.lookup(V); }
369 
370 void PointerReplacer::replace(Instruction *I) {
371   if (getReplacement(I))
372     return;
373 
374   if (auto *LT = dyn_cast<LoadInst>(I)) {
375     auto *V = getReplacement(LT->getPointerOperand());
376     assert(V && "Operand not replaced");
377     auto *NewI = new LoadInst(LT->getType(), V, "", LT->isVolatile(),
378                               LT->getAlign(), LT->getOrdering(),
379                               LT->getSyncScopeID());
380     NewI->takeName(LT);
381     copyMetadataForLoad(*NewI, *LT);
382 
383     IC.InsertNewInstWith(NewI, *LT);
384     IC.replaceInstUsesWith(*LT, NewI);
385     WorkMap[LT] = NewI;
386   } else if (auto *PHI = dyn_cast<PHINode>(I)) {
387     Type *NewTy = getReplacement(PHI->getIncomingValue(0))->getType();
388     auto *NewPHI = PHINode::Create(NewTy, PHI->getNumIncomingValues(),
389                                    PHI->getName(), PHI);
390     for (unsigned int I = 0; I < PHI->getNumIncomingValues(); ++I)
391       NewPHI->addIncoming(getReplacement(PHI->getIncomingValue(I)),
392                           PHI->getIncomingBlock(I));
393     WorkMap[PHI] = NewPHI;
394   } else if (auto *GEP = dyn_cast<GetElementPtrInst>(I)) {
395     auto *V = getReplacement(GEP->getPointerOperand());
396     assert(V && "Operand not replaced");
397     SmallVector<Value *, 8> Indices;
398     Indices.append(GEP->idx_begin(), GEP->idx_end());
399     auto *NewI =
400         GetElementPtrInst::Create(GEP->getSourceElementType(), V, Indices);
401     IC.InsertNewInstWith(NewI, *GEP);
402     NewI->takeName(GEP);
403     WorkMap[GEP] = NewI;
404   } else if (auto *BC = dyn_cast<BitCastInst>(I)) {
405     auto *V = getReplacement(BC->getOperand(0));
406     assert(V && "Operand not replaced");
407     auto *NewT = PointerType::get(BC->getType()->getContext(),
408                                   V->getType()->getPointerAddressSpace());
409     auto *NewI = new BitCastInst(V, NewT);
410     IC.InsertNewInstWith(NewI, *BC);
411     NewI->takeName(BC);
412     WorkMap[BC] = NewI;
413   } else if (auto *SI = dyn_cast<SelectInst>(I)) {
414     auto *NewSI = SelectInst::Create(
415         SI->getCondition(), getReplacement(SI->getTrueValue()),
416         getReplacement(SI->getFalseValue()), SI->getName(), nullptr, SI);
417     IC.InsertNewInstWith(NewSI, *SI);
418     NewSI->takeName(SI);
419     WorkMap[SI] = NewSI;
420   } else if (auto *MemCpy = dyn_cast<MemTransferInst>(I)) {
421     auto *SrcV = getReplacement(MemCpy->getRawSource());
422     // The pointer may appear in the destination of a copy, but we don't want to
423     // replace it.
424     if (!SrcV) {
425       assert(getReplacement(MemCpy->getRawDest()) &&
426              "destination not in replace list");
427       return;
428     }
429 
430     IC.Builder.SetInsertPoint(MemCpy);
431     auto *NewI = IC.Builder.CreateMemTransferInst(
432         MemCpy->getIntrinsicID(), MemCpy->getRawDest(), MemCpy->getDestAlign(),
433         SrcV, MemCpy->getSourceAlign(), MemCpy->getLength(),
434         MemCpy->isVolatile());
435     AAMDNodes AAMD = MemCpy->getAAMetadata();
436     if (AAMD)
437       NewI->setAAMetadata(AAMD);
438 
439     IC.eraseInstFromFunction(*MemCpy);
440     WorkMap[MemCpy] = NewI;
441   } else if (auto *ASC = dyn_cast<AddrSpaceCastInst>(I)) {
442     auto *V = getReplacement(ASC->getPointerOperand());
443     assert(V && "Operand not replaced");
444     assert(isEqualOrValidAddrSpaceCast(
445                ASC, V->getType()->getPointerAddressSpace()) &&
446            "Invalid address space cast!");
447     auto *NewV = V;
448     if (V->getType()->getPointerAddressSpace() !=
449         ASC->getType()->getPointerAddressSpace()) {
450       auto *NewI = new AddrSpaceCastInst(V, ASC->getType(), "");
451       NewI->takeName(ASC);
452       IC.InsertNewInstWith(NewI, *ASC);
453       NewV = NewI;
454     }
455     IC.replaceInstUsesWith(*ASC, NewV);
456     IC.eraseInstFromFunction(*ASC);
457   } else {
458     llvm_unreachable("should never reach here");
459   }
460 }
461 
462 void PointerReplacer::replacePointer(Value *V) {
463 #ifndef NDEBUG
464   auto *PT = cast<PointerType>(Root.getType());
465   auto *NT = cast<PointerType>(V->getType());
466   assert(PT != NT && "Invalid usage");
467 #endif
468   WorkMap[&Root] = V;
469 
470   for (Instruction *Workitem : Worklist)
471     replace(Workitem);
472 }
473 
474 Instruction *InstCombinerImpl::visitAllocaInst(AllocaInst &AI) {
475   if (auto *I = simplifyAllocaArraySize(*this, AI, DT))
476     return I;
477 
478   if (AI.getAllocatedType()->isSized()) {
479     // Move all alloca's of zero byte objects to the entry block and merge them
480     // together.  Note that we only do this for alloca's, because malloc should
481     // allocate and return a unique pointer, even for a zero byte allocation.
482     if (DL.getTypeAllocSize(AI.getAllocatedType()).getKnownMinValue() == 0) {
483       // For a zero sized alloca there is no point in doing an array allocation.
484       // This is helpful if the array size is a complicated expression not used
485       // elsewhere.
486       if (AI.isArrayAllocation())
487         return replaceOperand(AI, 0,
488             ConstantInt::get(AI.getArraySize()->getType(), 1));
489 
490       // Get the first instruction in the entry block.
491       BasicBlock &EntryBlock = AI.getParent()->getParent()->getEntryBlock();
492       Instruction *FirstInst = EntryBlock.getFirstNonPHIOrDbg();
493       if (FirstInst != &AI) {
494         // If the entry block doesn't start with a zero-size alloca then move
495         // this one to the start of the entry block.  There is no problem with
496         // dominance as the array size was forced to a constant earlier already.
497         AllocaInst *EntryAI = dyn_cast<AllocaInst>(FirstInst);
498         if (!EntryAI || !EntryAI->getAllocatedType()->isSized() ||
499             DL.getTypeAllocSize(EntryAI->getAllocatedType())
500                     .getKnownMinValue() != 0) {
501           AI.moveBefore(FirstInst);
502           return &AI;
503         }
504 
505         // Replace this zero-sized alloca with the one at the start of the entry
506         // block after ensuring that the address will be aligned enough for both
507         // types.
508         const Align MaxAlign = std::max(EntryAI->getAlign(), AI.getAlign());
509         EntryAI->setAlignment(MaxAlign);
510         if (AI.getType() != EntryAI->getType())
511           return new BitCastInst(EntryAI, AI.getType());
512         return replaceInstUsesWith(AI, EntryAI);
513       }
514     }
515   }
516 
517   // Check to see if this allocation is only modified by a memcpy/memmove from
518   // a memory location whose alignment is equal to or exceeds that of the
519   // allocation. If this is the case, we can change all users to use the
520   // constant memory location instead.  This is commonly produced by the CFE by
521   // constructs like "void foo() { int A[] = {1,2,3,4,5,6,7,8,9...}; }" if 'A'
522   // is only subsequently read.
523   SmallVector<Instruction *, 4> ToDelete;
524   if (MemTransferInst *Copy = isOnlyCopiedFromConstantMemory(AA, &AI, ToDelete)) {
525     Value *TheSrc = Copy->getSource();
526     Align AllocaAlign = AI.getAlign();
527     Align SourceAlign = getOrEnforceKnownAlignment(
528       TheSrc, AllocaAlign, DL, &AI, &AC, &DT);
529     if (AllocaAlign <= SourceAlign &&
530         isDereferenceableForAllocaSize(TheSrc, &AI, DL) &&
531         !isa<Instruction>(TheSrc)) {
532       // FIXME: Can we sink instructions without violating dominance when TheSrc
533       // is an instruction instead of a constant or argument?
534       LLVM_DEBUG(dbgs() << "Found alloca equal to global: " << AI << '\n');
535       LLVM_DEBUG(dbgs() << "  memcpy = " << *Copy << '\n');
536       unsigned SrcAddrSpace = TheSrc->getType()->getPointerAddressSpace();
537       auto *DestTy = PointerType::get(AI.getAllocatedType(), SrcAddrSpace);
538       if (AI.getAddressSpace() == SrcAddrSpace) {
539         for (Instruction *Delete : ToDelete)
540           eraseInstFromFunction(*Delete);
541 
542         Value *Cast = Builder.CreateBitCast(TheSrc, DestTy);
543         Instruction *NewI = replaceInstUsesWith(AI, Cast);
544         eraseInstFromFunction(*Copy);
545         ++NumGlobalCopies;
546         return NewI;
547       }
548 
549       PointerReplacer PtrReplacer(*this, AI, SrcAddrSpace);
550       if (PtrReplacer.collectUsers()) {
551         for (Instruction *Delete : ToDelete)
552           eraseInstFromFunction(*Delete);
553 
554         Value *Cast = Builder.CreateBitCast(TheSrc, DestTy);
555         PtrReplacer.replacePointer(Cast);
556         ++NumGlobalCopies;
557       }
558     }
559   }
560 
561   // At last, use the generic allocation site handler to aggressively remove
562   // unused allocas.
563   return visitAllocSite(AI);
564 }
565 
566 // Are we allowed to form a atomic load or store of this type?
567 static bool isSupportedAtomicType(Type *Ty) {
568   return Ty->isIntOrPtrTy() || Ty->isFloatingPointTy();
569 }
570 
571 /// Helper to combine a load to a new type.
572 ///
573 /// This just does the work of combining a load to a new type. It handles
574 /// metadata, etc., and returns the new instruction. The \c NewTy should be the
575 /// loaded *value* type. This will convert it to a pointer, cast the operand to
576 /// that pointer type, load it, etc.
577 ///
578 /// Note that this will create all of the instructions with whatever insert
579 /// point the \c InstCombinerImpl currently is using.
580 LoadInst *InstCombinerImpl::combineLoadToNewType(LoadInst &LI, Type *NewTy,
581                                                  const Twine &Suffix) {
582   assert((!LI.isAtomic() || isSupportedAtomicType(NewTy)) &&
583          "can't fold an atomic load to requested type");
584 
585   Value *Ptr = LI.getPointerOperand();
586   unsigned AS = LI.getPointerAddressSpace();
587   Type *NewPtrTy = NewTy->getPointerTo(AS);
588   Value *NewPtr = nullptr;
589   if (!(match(Ptr, m_BitCast(m_Value(NewPtr))) &&
590         NewPtr->getType() == NewPtrTy))
591     NewPtr = Builder.CreateBitCast(Ptr, NewPtrTy);
592 
593   LoadInst *NewLoad = Builder.CreateAlignedLoad(
594       NewTy, NewPtr, LI.getAlign(), LI.isVolatile(), LI.getName() + Suffix);
595   NewLoad->setAtomic(LI.getOrdering(), LI.getSyncScopeID());
596   copyMetadataForLoad(*NewLoad, LI);
597   return NewLoad;
598 }
599 
600 /// Combine a store to a new type.
601 ///
602 /// Returns the newly created store instruction.
603 static StoreInst *combineStoreToNewValue(InstCombinerImpl &IC, StoreInst &SI,
604                                          Value *V) {
605   assert((!SI.isAtomic() || isSupportedAtomicType(V->getType())) &&
606          "can't fold an atomic store of requested type");
607 
608   Value *Ptr = SI.getPointerOperand();
609   unsigned AS = SI.getPointerAddressSpace();
610   SmallVector<std::pair<unsigned, MDNode *>, 8> MD;
611   SI.getAllMetadata(MD);
612 
613   StoreInst *NewStore = IC.Builder.CreateAlignedStore(
614       V, IC.Builder.CreateBitCast(Ptr, V->getType()->getPointerTo(AS)),
615       SI.getAlign(), SI.isVolatile());
616   NewStore->setAtomic(SI.getOrdering(), SI.getSyncScopeID());
617   for (const auto &MDPair : MD) {
618     unsigned ID = MDPair.first;
619     MDNode *N = MDPair.second;
620     // Note, essentially every kind of metadata should be preserved here! This
621     // routine is supposed to clone a store instruction changing *only its
622     // type*. The only metadata it makes sense to drop is metadata which is
623     // invalidated when the pointer type changes. This should essentially
624     // never be the case in LLVM, but we explicitly switch over only known
625     // metadata to be conservatively correct. If you are adding metadata to
626     // LLVM which pertains to stores, you almost certainly want to add it
627     // here.
628     switch (ID) {
629     case LLVMContext::MD_dbg:
630     case LLVMContext::MD_DIAssignID:
631     case LLVMContext::MD_tbaa:
632     case LLVMContext::MD_prof:
633     case LLVMContext::MD_fpmath:
634     case LLVMContext::MD_tbaa_struct:
635     case LLVMContext::MD_alias_scope:
636     case LLVMContext::MD_noalias:
637     case LLVMContext::MD_nontemporal:
638     case LLVMContext::MD_mem_parallel_loop_access:
639     case LLVMContext::MD_access_group:
640       // All of these directly apply.
641       NewStore->setMetadata(ID, N);
642       break;
643     case LLVMContext::MD_invariant_load:
644     case LLVMContext::MD_nonnull:
645     case LLVMContext::MD_noundef:
646     case LLVMContext::MD_range:
647     case LLVMContext::MD_align:
648     case LLVMContext::MD_dereferenceable:
649     case LLVMContext::MD_dereferenceable_or_null:
650       // These don't apply for stores.
651       break;
652     }
653   }
654 
655   return NewStore;
656 }
657 
658 /// Returns true if instruction represent minmax pattern like:
659 ///   select ((cmp load V1, load V2), V1, V2).
660 static bool isMinMaxWithLoads(Value *V, Type *&LoadTy) {
661   assert(V->getType()->isPointerTy() && "Expected pointer type.");
662   // Ignore possible ty* to ixx* bitcast.
663   V = InstCombiner::peekThroughBitcast(V);
664   // Check that select is select ((cmp load V1, load V2), V1, V2) - minmax
665   // pattern.
666   CmpInst::Predicate Pred;
667   Instruction *L1;
668   Instruction *L2;
669   Value *LHS;
670   Value *RHS;
671   if (!match(V, m_Select(m_Cmp(Pred, m_Instruction(L1), m_Instruction(L2)),
672                          m_Value(LHS), m_Value(RHS))))
673     return false;
674   LoadTy = L1->getType();
675   return (match(L1, m_Load(m_Specific(LHS))) &&
676           match(L2, m_Load(m_Specific(RHS)))) ||
677          (match(L1, m_Load(m_Specific(RHS))) &&
678           match(L2, m_Load(m_Specific(LHS))));
679 }
680 
681 /// Combine loads to match the type of their uses' value after looking
682 /// through intervening bitcasts.
683 ///
684 /// The core idea here is that if the result of a load is used in an operation,
685 /// we should load the type most conducive to that operation. For example, when
686 /// loading an integer and converting that immediately to a pointer, we should
687 /// instead directly load a pointer.
688 ///
689 /// However, this routine must never change the width of a load or the number of
690 /// loads as that would introduce a semantic change. This combine is expected to
691 /// be a semantic no-op which just allows loads to more closely model the types
692 /// of their consuming operations.
693 ///
694 /// Currently, we also refuse to change the precise type used for an atomic load
695 /// or a volatile load. This is debatable, and might be reasonable to change
696 /// later. However, it is risky in case some backend or other part of LLVM is
697 /// relying on the exact type loaded to select appropriate atomic operations.
698 static Instruction *combineLoadToOperationType(InstCombinerImpl &IC,
699                                                LoadInst &Load) {
700   // FIXME: We could probably with some care handle both volatile and ordered
701   // atomic loads here but it isn't clear that this is important.
702   if (!Load.isUnordered())
703     return nullptr;
704 
705   if (Load.use_empty())
706     return nullptr;
707 
708   // swifterror values can't be bitcasted.
709   if (Load.getPointerOperand()->isSwiftError())
710     return nullptr;
711 
712   // Fold away bit casts of the loaded value by loading the desired type.
713   // Note that we should not do this for pointer<->integer casts,
714   // because that would result in type punning.
715   if (Load.hasOneUse()) {
716     // Don't transform when the type is x86_amx, it makes the pass that lower
717     // x86_amx type happy.
718     Type *LoadTy = Load.getType();
719     if (auto *BC = dyn_cast<BitCastInst>(Load.user_back())) {
720       assert(!LoadTy->isX86_AMXTy() && "Load from x86_amx* should not happen!");
721       if (BC->getType()->isX86_AMXTy())
722         return nullptr;
723     }
724 
725     if (auto *CastUser = dyn_cast<CastInst>(Load.user_back())) {
726       Type *DestTy = CastUser->getDestTy();
727       if (CastUser->isNoopCast(IC.getDataLayout()) &&
728           LoadTy->isPtrOrPtrVectorTy() == DestTy->isPtrOrPtrVectorTy() &&
729           (!Load.isAtomic() || isSupportedAtomicType(DestTy))) {
730         LoadInst *NewLoad = IC.combineLoadToNewType(Load, DestTy);
731         CastUser->replaceAllUsesWith(NewLoad);
732         IC.eraseInstFromFunction(*CastUser);
733         return &Load;
734       }
735     }
736   }
737 
738   // FIXME: We should also canonicalize loads of vectors when their elements are
739   // cast to other types.
740   return nullptr;
741 }
742 
743 static Instruction *unpackLoadToAggregate(InstCombinerImpl &IC, LoadInst &LI) {
744   // FIXME: We could probably with some care handle both volatile and atomic
745   // stores here but it isn't clear that this is important.
746   if (!LI.isSimple())
747     return nullptr;
748 
749   Type *T = LI.getType();
750   if (!T->isAggregateType())
751     return nullptr;
752 
753   StringRef Name = LI.getName();
754 
755   if (auto *ST = dyn_cast<StructType>(T)) {
756     // If the struct only have one element, we unpack.
757     auto NumElements = ST->getNumElements();
758     if (NumElements == 1) {
759       LoadInst *NewLoad = IC.combineLoadToNewType(LI, ST->getTypeAtIndex(0U),
760                                                   ".unpack");
761       NewLoad->setAAMetadata(LI.getAAMetadata());
762       return IC.replaceInstUsesWith(LI, IC.Builder.CreateInsertValue(
763         PoisonValue::get(T), NewLoad, 0, Name));
764     }
765 
766     // We don't want to break loads with padding here as we'd loose
767     // the knowledge that padding exists for the rest of the pipeline.
768     const DataLayout &DL = IC.getDataLayout();
769     auto *SL = DL.getStructLayout(ST);
770 
771     // Don't unpack for structure with scalable vector.
772     if (SL->getSizeInBits().isScalable())
773       return nullptr;
774 
775     if (SL->hasPadding())
776       return nullptr;
777 
778     const auto Align = LI.getAlign();
779     auto *Addr = LI.getPointerOperand();
780     auto *IdxType = Type::getInt32Ty(T->getContext());
781     auto *Zero = ConstantInt::get(IdxType, 0);
782 
783     Value *V = PoisonValue::get(T);
784     for (unsigned i = 0; i < NumElements; i++) {
785       Value *Indices[2] = {
786         Zero,
787         ConstantInt::get(IdxType, i),
788       };
789       auto *Ptr = IC.Builder.CreateInBoundsGEP(ST, Addr, ArrayRef(Indices),
790                                                Name + ".elt");
791       auto *L = IC.Builder.CreateAlignedLoad(
792           ST->getElementType(i), Ptr,
793           commonAlignment(Align, SL->getElementOffset(i)), Name + ".unpack");
794       // Propagate AA metadata. It'll still be valid on the narrowed load.
795       L->setAAMetadata(LI.getAAMetadata());
796       V = IC.Builder.CreateInsertValue(V, L, i);
797     }
798 
799     V->setName(Name);
800     return IC.replaceInstUsesWith(LI, V);
801   }
802 
803   if (auto *AT = dyn_cast<ArrayType>(T)) {
804     auto *ET = AT->getElementType();
805     auto NumElements = AT->getNumElements();
806     if (NumElements == 1) {
807       LoadInst *NewLoad = IC.combineLoadToNewType(LI, ET, ".unpack");
808       NewLoad->setAAMetadata(LI.getAAMetadata());
809       return IC.replaceInstUsesWith(LI, IC.Builder.CreateInsertValue(
810         PoisonValue::get(T), NewLoad, 0, Name));
811     }
812 
813     // Bail out if the array is too large. Ideally we would like to optimize
814     // arrays of arbitrary size but this has a terrible impact on compile time.
815     // The threshold here is chosen arbitrarily, maybe needs a little bit of
816     // tuning.
817     if (NumElements > IC.MaxArraySizeForCombine)
818       return nullptr;
819 
820     const DataLayout &DL = IC.getDataLayout();
821     auto EltSize = DL.getTypeAllocSize(ET);
822     const auto Align = LI.getAlign();
823 
824     auto *Addr = LI.getPointerOperand();
825     auto *IdxType = Type::getInt64Ty(T->getContext());
826     auto *Zero = ConstantInt::get(IdxType, 0);
827 
828     Value *V = PoisonValue::get(T);
829     uint64_t Offset = 0;
830     for (uint64_t i = 0; i < NumElements; i++) {
831       Value *Indices[2] = {
832         Zero,
833         ConstantInt::get(IdxType, i),
834       };
835       auto *Ptr = IC.Builder.CreateInBoundsGEP(AT, Addr, ArrayRef(Indices),
836                                                Name + ".elt");
837       auto *L = IC.Builder.CreateAlignedLoad(AT->getElementType(), Ptr,
838                                              commonAlignment(Align, Offset),
839                                              Name + ".unpack");
840       L->setAAMetadata(LI.getAAMetadata());
841       V = IC.Builder.CreateInsertValue(V, L, i);
842       Offset += EltSize;
843     }
844 
845     V->setName(Name);
846     return IC.replaceInstUsesWith(LI, V);
847   }
848 
849   return nullptr;
850 }
851 
852 // If we can determine that all possible objects pointed to by the provided
853 // pointer value are, not only dereferenceable, but also definitively less than
854 // or equal to the provided maximum size, then return true. Otherwise, return
855 // false (constant global values and allocas fall into this category).
856 //
857 // FIXME: This should probably live in ValueTracking (or similar).
858 static bool isObjectSizeLessThanOrEq(Value *V, uint64_t MaxSize,
859                                      const DataLayout &DL) {
860   SmallPtrSet<Value *, 4> Visited;
861   SmallVector<Value *, 4> Worklist(1, V);
862 
863   do {
864     Value *P = Worklist.pop_back_val();
865     P = P->stripPointerCasts();
866 
867     if (!Visited.insert(P).second)
868       continue;
869 
870     if (SelectInst *SI = dyn_cast<SelectInst>(P)) {
871       Worklist.push_back(SI->getTrueValue());
872       Worklist.push_back(SI->getFalseValue());
873       continue;
874     }
875 
876     if (PHINode *PN = dyn_cast<PHINode>(P)) {
877       append_range(Worklist, PN->incoming_values());
878       continue;
879     }
880 
881     if (GlobalAlias *GA = dyn_cast<GlobalAlias>(P)) {
882       if (GA->isInterposable())
883         return false;
884       Worklist.push_back(GA->getAliasee());
885       continue;
886     }
887 
888     // If we know how big this object is, and it is less than MaxSize, continue
889     // searching. Otherwise, return false.
890     if (AllocaInst *AI = dyn_cast<AllocaInst>(P)) {
891       if (!AI->getAllocatedType()->isSized())
892         return false;
893 
894       ConstantInt *CS = dyn_cast<ConstantInt>(AI->getArraySize());
895       if (!CS)
896         return false;
897 
898       TypeSize TS = DL.getTypeAllocSize(AI->getAllocatedType());
899       if (TS.isScalable())
900         return false;
901       // Make sure that, even if the multiplication below would wrap as an
902       // uint64_t, we still do the right thing.
903       if ((CS->getValue().zext(128) * APInt(128, TS.getFixedValue()))
904               .ugt(MaxSize))
905         return false;
906       continue;
907     }
908 
909     if (GlobalVariable *GV = dyn_cast<GlobalVariable>(P)) {
910       if (!GV->hasDefinitiveInitializer() || !GV->isConstant())
911         return false;
912 
913       uint64_t InitSize = DL.getTypeAllocSize(GV->getValueType());
914       if (InitSize > MaxSize)
915         return false;
916       continue;
917     }
918 
919     return false;
920   } while (!Worklist.empty());
921 
922   return true;
923 }
924 
925 // If we're indexing into an object of a known size, and the outer index is
926 // not a constant, but having any value but zero would lead to undefined
927 // behavior, replace it with zero.
928 //
929 // For example, if we have:
930 // @f.a = private unnamed_addr constant [1 x i32] [i32 12], align 4
931 // ...
932 // %arrayidx = getelementptr inbounds [1 x i32]* @f.a, i64 0, i64 %x
933 // ... = load i32* %arrayidx, align 4
934 // Then we know that we can replace %x in the GEP with i64 0.
935 //
936 // FIXME: We could fold any GEP index to zero that would cause UB if it were
937 // not zero. Currently, we only handle the first such index. Also, we could
938 // also search through non-zero constant indices if we kept track of the
939 // offsets those indices implied.
940 static bool canReplaceGEPIdxWithZero(InstCombinerImpl &IC,
941                                      GetElementPtrInst *GEPI, Instruction *MemI,
942                                      unsigned &Idx) {
943   if (GEPI->getNumOperands() < 2)
944     return false;
945 
946   // Find the first non-zero index of a GEP. If all indices are zero, return
947   // one past the last index.
948   auto FirstNZIdx = [](const GetElementPtrInst *GEPI) {
949     unsigned I = 1;
950     for (unsigned IE = GEPI->getNumOperands(); I != IE; ++I) {
951       Value *V = GEPI->getOperand(I);
952       if (const ConstantInt *CI = dyn_cast<ConstantInt>(V))
953         if (CI->isZero())
954           continue;
955 
956       break;
957     }
958 
959     return I;
960   };
961 
962   // Skip through initial 'zero' indices, and find the corresponding pointer
963   // type. See if the next index is not a constant.
964   Idx = FirstNZIdx(GEPI);
965   if (Idx == GEPI->getNumOperands())
966     return false;
967   if (isa<Constant>(GEPI->getOperand(Idx)))
968     return false;
969 
970   SmallVector<Value *, 4> Ops(GEPI->idx_begin(), GEPI->idx_begin() + Idx);
971   Type *SourceElementType = GEPI->getSourceElementType();
972   // Size information about scalable vectors is not available, so we cannot
973   // deduce whether indexing at n is undefined behaviour or not. Bail out.
974   if (isa<ScalableVectorType>(SourceElementType))
975     return false;
976 
977   Type *AllocTy = GetElementPtrInst::getIndexedType(SourceElementType, Ops);
978   if (!AllocTy || !AllocTy->isSized())
979     return false;
980   const DataLayout &DL = IC.getDataLayout();
981   uint64_t TyAllocSize = DL.getTypeAllocSize(AllocTy).getFixedValue();
982 
983   // If there are more indices after the one we might replace with a zero, make
984   // sure they're all non-negative. If any of them are negative, the overall
985   // address being computed might be before the base address determined by the
986   // first non-zero index.
987   auto IsAllNonNegative = [&]() {
988     for (unsigned i = Idx+1, e = GEPI->getNumOperands(); i != e; ++i) {
989       KnownBits Known = IC.computeKnownBits(GEPI->getOperand(i), 0, MemI);
990       if (Known.isNonNegative())
991         continue;
992       return false;
993     }
994 
995     return true;
996   };
997 
998   // FIXME: If the GEP is not inbounds, and there are extra indices after the
999   // one we'll replace, those could cause the address computation to wrap
1000   // (rendering the IsAllNonNegative() check below insufficient). We can do
1001   // better, ignoring zero indices (and other indices we can prove small
1002   // enough not to wrap).
1003   if (Idx+1 != GEPI->getNumOperands() && !GEPI->isInBounds())
1004     return false;
1005 
1006   // Note that isObjectSizeLessThanOrEq will return true only if the pointer is
1007   // also known to be dereferenceable.
1008   return isObjectSizeLessThanOrEq(GEPI->getOperand(0), TyAllocSize, DL) &&
1009          IsAllNonNegative();
1010 }
1011 
1012 // If we're indexing into an object with a variable index for the memory
1013 // access, but the object has only one element, we can assume that the index
1014 // will always be zero. If we replace the GEP, return it.
1015 static Instruction *replaceGEPIdxWithZero(InstCombinerImpl &IC, Value *Ptr,
1016                                           Instruction &MemI) {
1017   if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(Ptr)) {
1018     unsigned Idx;
1019     if (canReplaceGEPIdxWithZero(IC, GEPI, &MemI, Idx)) {
1020       Instruction *NewGEPI = GEPI->clone();
1021       NewGEPI->setOperand(Idx,
1022         ConstantInt::get(GEPI->getOperand(Idx)->getType(), 0));
1023       IC.InsertNewInstBefore(NewGEPI, *GEPI);
1024       return NewGEPI;
1025     }
1026   }
1027 
1028   return nullptr;
1029 }
1030 
1031 static bool canSimplifyNullStoreOrGEP(StoreInst &SI) {
1032   if (NullPointerIsDefined(SI.getFunction(), SI.getPointerAddressSpace()))
1033     return false;
1034 
1035   auto *Ptr = SI.getPointerOperand();
1036   if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(Ptr))
1037     Ptr = GEPI->getOperand(0);
1038   return (isa<ConstantPointerNull>(Ptr) &&
1039           !NullPointerIsDefined(SI.getFunction(), SI.getPointerAddressSpace()));
1040 }
1041 
1042 static bool canSimplifyNullLoadOrGEP(LoadInst &LI, Value *Op) {
1043   if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(Op)) {
1044     const Value *GEPI0 = GEPI->getOperand(0);
1045     if (isa<ConstantPointerNull>(GEPI0) &&
1046         !NullPointerIsDefined(LI.getFunction(), GEPI->getPointerAddressSpace()))
1047       return true;
1048   }
1049   if (isa<UndefValue>(Op) ||
1050       (isa<ConstantPointerNull>(Op) &&
1051        !NullPointerIsDefined(LI.getFunction(), LI.getPointerAddressSpace())))
1052     return true;
1053   return false;
1054 }
1055 
1056 Instruction *InstCombinerImpl::visitLoadInst(LoadInst &LI) {
1057   Value *Op = LI.getOperand(0);
1058   if (Value *Res = simplifyLoadInst(&LI, Op, SQ.getWithInstruction(&LI)))
1059     return replaceInstUsesWith(LI, Res);
1060 
1061   // Try to canonicalize the loaded type.
1062   if (Instruction *Res = combineLoadToOperationType(*this, LI))
1063     return Res;
1064 
1065   // Attempt to improve the alignment.
1066   Align KnownAlign = getOrEnforceKnownAlignment(
1067       Op, DL.getPrefTypeAlign(LI.getType()), DL, &LI, &AC, &DT);
1068   if (KnownAlign > LI.getAlign())
1069     LI.setAlignment(KnownAlign);
1070 
1071   // Replace GEP indices if possible.
1072   if (Instruction *NewGEPI = replaceGEPIdxWithZero(*this, Op, LI))
1073     return replaceOperand(LI, 0, NewGEPI);
1074 
1075   if (Instruction *Res = unpackLoadToAggregate(*this, LI))
1076     return Res;
1077 
1078   // Do really simple store-to-load forwarding and load CSE, to catch cases
1079   // where there are several consecutive memory accesses to the same location,
1080   // separated by a few arithmetic operations.
1081   bool IsLoadCSE = false;
1082   if (Value *AvailableVal = FindAvailableLoadedValue(&LI, *AA, &IsLoadCSE)) {
1083     if (IsLoadCSE)
1084       combineMetadataForCSE(cast<LoadInst>(AvailableVal), &LI, false);
1085 
1086     return replaceInstUsesWith(
1087         LI, Builder.CreateBitOrPointerCast(AvailableVal, LI.getType(),
1088                                            LI.getName() + ".cast"));
1089   }
1090 
1091   // None of the following transforms are legal for volatile/ordered atomic
1092   // loads.  Most of them do apply for unordered atomics.
1093   if (!LI.isUnordered()) return nullptr;
1094 
1095   // load(gep null, ...) -> unreachable
1096   // load null/undef -> unreachable
1097   // TODO: Consider a target hook for valid address spaces for this xforms.
1098   if (canSimplifyNullLoadOrGEP(LI, Op)) {
1099     CreateNonTerminatorUnreachable(&LI);
1100     return replaceInstUsesWith(LI, PoisonValue::get(LI.getType()));
1101   }
1102 
1103   if (Op->hasOneUse()) {
1104     // Change select and PHI nodes to select values instead of addresses: this
1105     // helps alias analysis out a lot, allows many others simplifications, and
1106     // exposes redundancy in the code.
1107     //
1108     // Note that we cannot do the transformation unless we know that the
1109     // introduced loads cannot trap!  Something like this is valid as long as
1110     // the condition is always false: load (select bool %C, int* null, int* %G),
1111     // but it would not be valid if we transformed it to load from null
1112     // unconditionally.
1113     //
1114     if (SelectInst *SI = dyn_cast<SelectInst>(Op)) {
1115       // load (select (Cond, &V1, &V2))  --> select(Cond, load &V1, load &V2).
1116       Align Alignment = LI.getAlign();
1117       if (isSafeToLoadUnconditionally(SI->getOperand(1), LI.getType(),
1118                                       Alignment, DL, SI) &&
1119           isSafeToLoadUnconditionally(SI->getOperand(2), LI.getType(),
1120                                       Alignment, DL, SI)) {
1121         LoadInst *V1 =
1122             Builder.CreateLoad(LI.getType(), SI->getOperand(1),
1123                                SI->getOperand(1)->getName() + ".val");
1124         LoadInst *V2 =
1125             Builder.CreateLoad(LI.getType(), SI->getOperand(2),
1126                                SI->getOperand(2)->getName() + ".val");
1127         assert(LI.isUnordered() && "implied by above");
1128         V1->setAlignment(Alignment);
1129         V1->setAtomic(LI.getOrdering(), LI.getSyncScopeID());
1130         V2->setAlignment(Alignment);
1131         V2->setAtomic(LI.getOrdering(), LI.getSyncScopeID());
1132         return SelectInst::Create(SI->getCondition(), V1, V2);
1133       }
1134 
1135       // load (select (cond, null, P)) -> load P
1136       if (isa<ConstantPointerNull>(SI->getOperand(1)) &&
1137           !NullPointerIsDefined(SI->getFunction(),
1138                                 LI.getPointerAddressSpace()))
1139         return replaceOperand(LI, 0, SI->getOperand(2));
1140 
1141       // load (select (cond, P, null)) -> load P
1142       if (isa<ConstantPointerNull>(SI->getOperand(2)) &&
1143           !NullPointerIsDefined(SI->getFunction(),
1144                                 LI.getPointerAddressSpace()))
1145         return replaceOperand(LI, 0, SI->getOperand(1));
1146     }
1147   }
1148   return nullptr;
1149 }
1150 
1151 /// Look for extractelement/insertvalue sequence that acts like a bitcast.
1152 ///
1153 /// \returns underlying value that was "cast", or nullptr otherwise.
1154 ///
1155 /// For example, if we have:
1156 ///
1157 ///     %E0 = extractelement <2 x double> %U, i32 0
1158 ///     %V0 = insertvalue [2 x double] undef, double %E0, 0
1159 ///     %E1 = extractelement <2 x double> %U, i32 1
1160 ///     %V1 = insertvalue [2 x double] %V0, double %E1, 1
1161 ///
1162 /// and the layout of a <2 x double> is isomorphic to a [2 x double],
1163 /// then %V1 can be safely approximated by a conceptual "bitcast" of %U.
1164 /// Note that %U may contain non-undef values where %V1 has undef.
1165 static Value *likeBitCastFromVector(InstCombinerImpl &IC, Value *V) {
1166   Value *U = nullptr;
1167   while (auto *IV = dyn_cast<InsertValueInst>(V)) {
1168     auto *E = dyn_cast<ExtractElementInst>(IV->getInsertedValueOperand());
1169     if (!E)
1170       return nullptr;
1171     auto *W = E->getVectorOperand();
1172     if (!U)
1173       U = W;
1174     else if (U != W)
1175       return nullptr;
1176     auto *CI = dyn_cast<ConstantInt>(E->getIndexOperand());
1177     if (!CI || IV->getNumIndices() != 1 || CI->getZExtValue() != *IV->idx_begin())
1178       return nullptr;
1179     V = IV->getAggregateOperand();
1180   }
1181   if (!match(V, m_Undef()) || !U)
1182     return nullptr;
1183 
1184   auto *UT = cast<VectorType>(U->getType());
1185   auto *VT = V->getType();
1186   // Check that types UT and VT are bitwise isomorphic.
1187   const auto &DL = IC.getDataLayout();
1188   if (DL.getTypeStoreSizeInBits(UT) != DL.getTypeStoreSizeInBits(VT)) {
1189     return nullptr;
1190   }
1191   if (auto *AT = dyn_cast<ArrayType>(VT)) {
1192     if (AT->getNumElements() != cast<FixedVectorType>(UT)->getNumElements())
1193       return nullptr;
1194   } else {
1195     auto *ST = cast<StructType>(VT);
1196     if (ST->getNumElements() != cast<FixedVectorType>(UT)->getNumElements())
1197       return nullptr;
1198     for (const auto *EltT : ST->elements()) {
1199       if (EltT != UT->getElementType())
1200         return nullptr;
1201     }
1202   }
1203   return U;
1204 }
1205 
1206 /// Combine stores to match the type of value being stored.
1207 ///
1208 /// The core idea here is that the memory does not have any intrinsic type and
1209 /// where we can we should match the type of a store to the type of value being
1210 /// stored.
1211 ///
1212 /// However, this routine must never change the width of a store or the number of
1213 /// stores as that would introduce a semantic change. This combine is expected to
1214 /// be a semantic no-op which just allows stores to more closely model the types
1215 /// of their incoming values.
1216 ///
1217 /// Currently, we also refuse to change the precise type used for an atomic or
1218 /// volatile store. This is debatable, and might be reasonable to change later.
1219 /// However, it is risky in case some backend or other part of LLVM is relying
1220 /// on the exact type stored to select appropriate atomic operations.
1221 ///
1222 /// \returns true if the store was successfully combined away. This indicates
1223 /// the caller must erase the store instruction. We have to let the caller erase
1224 /// the store instruction as otherwise there is no way to signal whether it was
1225 /// combined or not: IC.EraseInstFromFunction returns a null pointer.
1226 static bool combineStoreToValueType(InstCombinerImpl &IC, StoreInst &SI) {
1227   // FIXME: We could probably with some care handle both volatile and ordered
1228   // atomic stores here but it isn't clear that this is important.
1229   if (!SI.isUnordered())
1230     return false;
1231 
1232   // swifterror values can't be bitcasted.
1233   if (SI.getPointerOperand()->isSwiftError())
1234     return false;
1235 
1236   Value *V = SI.getValueOperand();
1237 
1238   // Fold away bit casts of the stored value by storing the original type.
1239   if (auto *BC = dyn_cast<BitCastInst>(V)) {
1240     assert(!BC->getType()->isX86_AMXTy() &&
1241            "store to x86_amx* should not happen!");
1242     V = BC->getOperand(0);
1243     // Don't transform when the type is x86_amx, it makes the pass that lower
1244     // x86_amx type happy.
1245     if (V->getType()->isX86_AMXTy())
1246       return false;
1247     if (!SI.isAtomic() || isSupportedAtomicType(V->getType())) {
1248       combineStoreToNewValue(IC, SI, V);
1249       return true;
1250     }
1251   }
1252 
1253   if (Value *U = likeBitCastFromVector(IC, V))
1254     if (!SI.isAtomic() || isSupportedAtomicType(U->getType())) {
1255       combineStoreToNewValue(IC, SI, U);
1256       return true;
1257     }
1258 
1259   // FIXME: We should also canonicalize stores of vectors when their elements
1260   // are cast to other types.
1261   return false;
1262 }
1263 
1264 static bool unpackStoreToAggregate(InstCombinerImpl &IC, StoreInst &SI) {
1265   // FIXME: We could probably with some care handle both volatile and atomic
1266   // stores here but it isn't clear that this is important.
1267   if (!SI.isSimple())
1268     return false;
1269 
1270   Value *V = SI.getValueOperand();
1271   Type *T = V->getType();
1272 
1273   if (!T->isAggregateType())
1274     return false;
1275 
1276   if (auto *ST = dyn_cast<StructType>(T)) {
1277     // If the struct only have one element, we unpack.
1278     unsigned Count = ST->getNumElements();
1279     if (Count == 1) {
1280       V = IC.Builder.CreateExtractValue(V, 0);
1281       combineStoreToNewValue(IC, SI, V);
1282       return true;
1283     }
1284 
1285     // We don't want to break loads with padding here as we'd loose
1286     // the knowledge that padding exists for the rest of the pipeline.
1287     const DataLayout &DL = IC.getDataLayout();
1288     auto *SL = DL.getStructLayout(ST);
1289 
1290     // Don't unpack for structure with scalable vector.
1291     if (SL->getSizeInBits().isScalable())
1292       return false;
1293 
1294     if (SL->hasPadding())
1295       return false;
1296 
1297     const auto Align = SI.getAlign();
1298 
1299     SmallString<16> EltName = V->getName();
1300     EltName += ".elt";
1301     auto *Addr = SI.getPointerOperand();
1302     SmallString<16> AddrName = Addr->getName();
1303     AddrName += ".repack";
1304 
1305     auto *IdxType = Type::getInt32Ty(ST->getContext());
1306     auto *Zero = ConstantInt::get(IdxType, 0);
1307     for (unsigned i = 0; i < Count; i++) {
1308       Value *Indices[2] = {
1309         Zero,
1310         ConstantInt::get(IdxType, i),
1311       };
1312       auto *Ptr =
1313           IC.Builder.CreateInBoundsGEP(ST, Addr, ArrayRef(Indices), AddrName);
1314       auto *Val = IC.Builder.CreateExtractValue(V, i, EltName);
1315       auto EltAlign = commonAlignment(Align, SL->getElementOffset(i));
1316       llvm::Instruction *NS = IC.Builder.CreateAlignedStore(Val, Ptr, EltAlign);
1317       NS->setAAMetadata(SI.getAAMetadata());
1318     }
1319 
1320     return true;
1321   }
1322 
1323   if (auto *AT = dyn_cast<ArrayType>(T)) {
1324     // If the array only have one element, we unpack.
1325     auto NumElements = AT->getNumElements();
1326     if (NumElements == 1) {
1327       V = IC.Builder.CreateExtractValue(V, 0);
1328       combineStoreToNewValue(IC, SI, V);
1329       return true;
1330     }
1331 
1332     // Bail out if the array is too large. Ideally we would like to optimize
1333     // arrays of arbitrary size but this has a terrible impact on compile time.
1334     // The threshold here is chosen arbitrarily, maybe needs a little bit of
1335     // tuning.
1336     if (NumElements > IC.MaxArraySizeForCombine)
1337       return false;
1338 
1339     const DataLayout &DL = IC.getDataLayout();
1340     auto EltSize = DL.getTypeAllocSize(AT->getElementType());
1341     const auto Align = SI.getAlign();
1342 
1343     SmallString<16> EltName = V->getName();
1344     EltName += ".elt";
1345     auto *Addr = SI.getPointerOperand();
1346     SmallString<16> AddrName = Addr->getName();
1347     AddrName += ".repack";
1348 
1349     auto *IdxType = Type::getInt64Ty(T->getContext());
1350     auto *Zero = ConstantInt::get(IdxType, 0);
1351 
1352     uint64_t Offset = 0;
1353     for (uint64_t i = 0; i < NumElements; i++) {
1354       Value *Indices[2] = {
1355         Zero,
1356         ConstantInt::get(IdxType, i),
1357       };
1358       auto *Ptr =
1359           IC.Builder.CreateInBoundsGEP(AT, Addr, ArrayRef(Indices), AddrName);
1360       auto *Val = IC.Builder.CreateExtractValue(V, i, EltName);
1361       auto EltAlign = commonAlignment(Align, Offset);
1362       Instruction *NS = IC.Builder.CreateAlignedStore(Val, Ptr, EltAlign);
1363       NS->setAAMetadata(SI.getAAMetadata());
1364       Offset += EltSize;
1365     }
1366 
1367     return true;
1368   }
1369 
1370   return false;
1371 }
1372 
1373 /// equivalentAddressValues - Test if A and B will obviously have the same
1374 /// value. This includes recognizing that %t0 and %t1 will have the same
1375 /// value in code like this:
1376 ///   %t0 = getelementptr \@a, 0, 3
1377 ///   store i32 0, i32* %t0
1378 ///   %t1 = getelementptr \@a, 0, 3
1379 ///   %t2 = load i32* %t1
1380 ///
1381 static bool equivalentAddressValues(Value *A, Value *B) {
1382   // Test if the values are trivially equivalent.
1383   if (A == B) return true;
1384 
1385   // Test if the values come form identical arithmetic instructions.
1386   // This uses isIdenticalToWhenDefined instead of isIdenticalTo because
1387   // its only used to compare two uses within the same basic block, which
1388   // means that they'll always either have the same value or one of them
1389   // will have an undefined value.
1390   if (isa<BinaryOperator>(A) ||
1391       isa<CastInst>(A) ||
1392       isa<PHINode>(A) ||
1393       isa<GetElementPtrInst>(A))
1394     if (Instruction *BI = dyn_cast<Instruction>(B))
1395       if (cast<Instruction>(A)->isIdenticalToWhenDefined(BI))
1396         return true;
1397 
1398   // Otherwise they may not be equivalent.
1399   return false;
1400 }
1401 
1402 /// Converts store (bitcast (load (bitcast (select ...)))) to
1403 /// store (load (select ...)), where select is minmax:
1404 /// select ((cmp load V1, load V2), V1, V2).
1405 static bool removeBitcastsFromLoadStoreOnMinMax(InstCombinerImpl &IC,
1406                                                 StoreInst &SI) {
1407   // bitcast?
1408   if (!match(SI.getPointerOperand(), m_BitCast(m_Value())))
1409     return false;
1410   // load? integer?
1411   Value *LoadAddr;
1412   if (!match(SI.getValueOperand(), m_Load(m_BitCast(m_Value(LoadAddr)))))
1413     return false;
1414   auto *LI = cast<LoadInst>(SI.getValueOperand());
1415   if (!LI->getType()->isIntegerTy())
1416     return false;
1417   Type *CmpLoadTy;
1418   if (!isMinMaxWithLoads(LoadAddr, CmpLoadTy))
1419     return false;
1420 
1421   // Make sure the type would actually change.
1422   // This condition can be hit with chains of bitcasts.
1423   if (LI->getType() == CmpLoadTy)
1424     return false;
1425 
1426   // Make sure we're not changing the size of the load/store.
1427   const auto &DL = IC.getDataLayout();
1428   if (DL.getTypeStoreSizeInBits(LI->getType()) !=
1429       DL.getTypeStoreSizeInBits(CmpLoadTy))
1430     return false;
1431 
1432   if (!all_of(LI->users(), [LI, LoadAddr](User *U) {
1433         auto *SI = dyn_cast<StoreInst>(U);
1434         return SI && SI->getPointerOperand() != LI &&
1435                InstCombiner::peekThroughBitcast(SI->getPointerOperand()) !=
1436                    LoadAddr &&
1437                !SI->getPointerOperand()->isSwiftError();
1438       }))
1439     return false;
1440 
1441   IC.Builder.SetInsertPoint(LI);
1442   LoadInst *NewLI = IC.combineLoadToNewType(*LI, CmpLoadTy);
1443   // Replace all the stores with stores of the newly loaded value.
1444   for (auto *UI : LI->users()) {
1445     auto *USI = cast<StoreInst>(UI);
1446     IC.Builder.SetInsertPoint(USI);
1447     combineStoreToNewValue(IC, *USI, NewLI);
1448   }
1449   IC.replaceInstUsesWith(*LI, PoisonValue::get(LI->getType()));
1450   IC.eraseInstFromFunction(*LI);
1451   return true;
1452 }
1453 
1454 Instruction *InstCombinerImpl::visitStoreInst(StoreInst &SI) {
1455   Value *Val = SI.getOperand(0);
1456   Value *Ptr = SI.getOperand(1);
1457 
1458   // Try to canonicalize the stored type.
1459   if (combineStoreToValueType(*this, SI))
1460     return eraseInstFromFunction(SI);
1461 
1462   // Attempt to improve the alignment.
1463   const Align KnownAlign = getOrEnforceKnownAlignment(
1464       Ptr, DL.getPrefTypeAlign(Val->getType()), DL, &SI, &AC, &DT);
1465   if (KnownAlign > SI.getAlign())
1466     SI.setAlignment(KnownAlign);
1467 
1468   // Try to canonicalize the stored type.
1469   if (unpackStoreToAggregate(*this, SI))
1470     return eraseInstFromFunction(SI);
1471 
1472   if (removeBitcastsFromLoadStoreOnMinMax(*this, SI))
1473     return eraseInstFromFunction(SI);
1474 
1475   // Replace GEP indices if possible.
1476   if (Instruction *NewGEPI = replaceGEPIdxWithZero(*this, Ptr, SI))
1477     return replaceOperand(SI, 1, NewGEPI);
1478 
1479   // Don't hack volatile/ordered stores.
1480   // FIXME: Some bits are legal for ordered atomic stores; needs refactoring.
1481   if (!SI.isUnordered()) return nullptr;
1482 
1483   // If the RHS is an alloca with a single use, zapify the store, making the
1484   // alloca dead.
1485   if (Ptr->hasOneUse()) {
1486     if (isa<AllocaInst>(Ptr))
1487       return eraseInstFromFunction(SI);
1488     if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr)) {
1489       if (isa<AllocaInst>(GEP->getOperand(0))) {
1490         if (GEP->getOperand(0)->hasOneUse())
1491           return eraseInstFromFunction(SI);
1492       }
1493     }
1494   }
1495 
1496   // If we have a store to a location which is known constant, we can conclude
1497   // that the store must be storing the constant value (else the memory
1498   // wouldn't be constant), and this must be a noop.
1499   if (!isModSet(AA->getModRefInfoMask(Ptr)))
1500     return eraseInstFromFunction(SI);
1501 
1502   // Do really simple DSE, to catch cases where there are several consecutive
1503   // stores to the same location, separated by a few arithmetic operations. This
1504   // situation often occurs with bitfield accesses.
1505   BasicBlock::iterator BBI(SI);
1506   for (unsigned ScanInsts = 6; BBI != SI.getParent()->begin() && ScanInsts;
1507        --ScanInsts) {
1508     --BBI;
1509     // Don't count debug info directives, lest they affect codegen,
1510     // and we skip pointer-to-pointer bitcasts, which are NOPs.
1511     if (BBI->isDebugOrPseudoInst() ||
1512         (isa<BitCastInst>(BBI) && BBI->getType()->isPointerTy())) {
1513       ScanInsts++;
1514       continue;
1515     }
1516 
1517     if (StoreInst *PrevSI = dyn_cast<StoreInst>(BBI)) {
1518       // Prev store isn't volatile, and stores to the same location?
1519       if (PrevSI->isUnordered() &&
1520           equivalentAddressValues(PrevSI->getOperand(1), SI.getOperand(1)) &&
1521           PrevSI->getValueOperand()->getType() ==
1522               SI.getValueOperand()->getType()) {
1523         ++NumDeadStore;
1524         // Manually add back the original store to the worklist now, so it will
1525         // be processed after the operands of the removed store, as this may
1526         // expose additional DSE opportunities.
1527         Worklist.push(&SI);
1528         eraseInstFromFunction(*PrevSI);
1529         return nullptr;
1530       }
1531       break;
1532     }
1533 
1534     // If this is a load, we have to stop.  However, if the loaded value is from
1535     // the pointer we're loading and is producing the pointer we're storing,
1536     // then *this* store is dead (X = load P; store X -> P).
1537     if (LoadInst *LI = dyn_cast<LoadInst>(BBI)) {
1538       if (LI == Val && equivalentAddressValues(LI->getOperand(0), Ptr)) {
1539         assert(SI.isUnordered() && "can't eliminate ordering operation");
1540         return eraseInstFromFunction(SI);
1541       }
1542 
1543       // Otherwise, this is a load from some other location.  Stores before it
1544       // may not be dead.
1545       break;
1546     }
1547 
1548     // Don't skip over loads, throws or things that can modify memory.
1549     if (BBI->mayWriteToMemory() || BBI->mayReadFromMemory() || BBI->mayThrow())
1550       break;
1551   }
1552 
1553   // store X, null    -> turns into 'unreachable' in SimplifyCFG
1554   // store X, GEP(null, Y) -> turns into 'unreachable' in SimplifyCFG
1555   if (canSimplifyNullStoreOrGEP(SI)) {
1556     if (!isa<PoisonValue>(Val))
1557       return replaceOperand(SI, 0, PoisonValue::get(Val->getType()));
1558     return nullptr;  // Do not modify these!
1559   }
1560 
1561   // This is a non-terminator unreachable marker. Don't remove it.
1562   if (isa<UndefValue>(Ptr)) {
1563     // Remove all instructions after the marker and guaranteed-to-transfer
1564     // instructions before the marker.
1565     if (handleUnreachableFrom(SI.getNextNode()) ||
1566         removeInstructionsBeforeUnreachable(SI))
1567       return &SI;
1568     return nullptr;
1569   }
1570 
1571   // store undef, Ptr -> noop
1572   // FIXME: This is technically incorrect because it might overwrite a poison
1573   // value. Change to PoisonValue once #52930 is resolved.
1574   if (isa<UndefValue>(Val))
1575     return eraseInstFromFunction(SI);
1576 
1577   return nullptr;
1578 }
1579 
1580 /// Try to transform:
1581 ///   if () { *P = v1; } else { *P = v2 }
1582 /// or:
1583 ///   *P = v1; if () { *P = v2; }
1584 /// into a phi node with a store in the successor.
1585 bool InstCombinerImpl::mergeStoreIntoSuccessor(StoreInst &SI) {
1586   if (!SI.isUnordered())
1587     return false; // This code has not been audited for volatile/ordered case.
1588 
1589   // Check if the successor block has exactly 2 incoming edges.
1590   BasicBlock *StoreBB = SI.getParent();
1591   BasicBlock *DestBB = StoreBB->getTerminator()->getSuccessor(0);
1592   if (!DestBB->hasNPredecessors(2))
1593     return false;
1594 
1595   // Capture the other block (the block that doesn't contain our store).
1596   pred_iterator PredIter = pred_begin(DestBB);
1597   if (*PredIter == StoreBB)
1598     ++PredIter;
1599   BasicBlock *OtherBB = *PredIter;
1600 
1601   // Bail out if all of the relevant blocks aren't distinct. This can happen,
1602   // for example, if SI is in an infinite loop.
1603   if (StoreBB == DestBB || OtherBB == DestBB)
1604     return false;
1605 
1606   // Verify that the other block ends in a branch and is not otherwise empty.
1607   BasicBlock::iterator BBI(OtherBB->getTerminator());
1608   BranchInst *OtherBr = dyn_cast<BranchInst>(BBI);
1609   if (!OtherBr || BBI == OtherBB->begin())
1610     return false;
1611 
1612   auto OtherStoreIsMergeable = [&](StoreInst *OtherStore) -> bool {
1613     if (!OtherStore ||
1614         OtherStore->getPointerOperand() != SI.getPointerOperand())
1615       return false;
1616 
1617     auto *SIVTy = SI.getValueOperand()->getType();
1618     auto *OSVTy = OtherStore->getValueOperand()->getType();
1619     return CastInst::isBitOrNoopPointerCastable(OSVTy, SIVTy, DL) &&
1620            SI.hasSameSpecialState(OtherStore);
1621   };
1622 
1623   // If the other block ends in an unconditional branch, check for the 'if then
1624   // else' case. There is an instruction before the branch.
1625   StoreInst *OtherStore = nullptr;
1626   if (OtherBr->isUnconditional()) {
1627     --BBI;
1628     // Skip over debugging info and pseudo probes.
1629     while (BBI->isDebugOrPseudoInst() ||
1630            (isa<BitCastInst>(BBI) && BBI->getType()->isPointerTy())) {
1631       if (BBI==OtherBB->begin())
1632         return false;
1633       --BBI;
1634     }
1635     // If this isn't a store, isn't a store to the same location, or is not the
1636     // right kind of store, bail out.
1637     OtherStore = dyn_cast<StoreInst>(BBI);
1638     if (!OtherStoreIsMergeable(OtherStore))
1639       return false;
1640   } else {
1641     // Otherwise, the other block ended with a conditional branch. If one of the
1642     // destinations is StoreBB, then we have the if/then case.
1643     if (OtherBr->getSuccessor(0) != StoreBB &&
1644         OtherBr->getSuccessor(1) != StoreBB)
1645       return false;
1646 
1647     // Okay, we know that OtherBr now goes to Dest and StoreBB, so this is an
1648     // if/then triangle. See if there is a store to the same ptr as SI that
1649     // lives in OtherBB.
1650     for (;; --BBI) {
1651       // Check to see if we find the matching store.
1652       OtherStore = dyn_cast<StoreInst>(BBI);
1653       if (OtherStoreIsMergeable(OtherStore))
1654         break;
1655 
1656       // If we find something that may be using or overwriting the stored
1657       // value, or if we run out of instructions, we can't do the transform.
1658       if (BBI->mayReadFromMemory() || BBI->mayThrow() ||
1659           BBI->mayWriteToMemory() || BBI == OtherBB->begin())
1660         return false;
1661     }
1662 
1663     // In order to eliminate the store in OtherBr, we have to make sure nothing
1664     // reads or overwrites the stored value in StoreBB.
1665     for (BasicBlock::iterator I = StoreBB->begin(); &*I != &SI; ++I) {
1666       // FIXME: This should really be AA driven.
1667       if (I->mayReadFromMemory() || I->mayThrow() || I->mayWriteToMemory())
1668         return false;
1669     }
1670   }
1671 
1672   // Insert a PHI node now if we need it.
1673   Value *MergedVal = OtherStore->getValueOperand();
1674   // The debug locations of the original instructions might differ. Merge them.
1675   DebugLoc MergedLoc = DILocation::getMergedLocation(SI.getDebugLoc(),
1676                                                      OtherStore->getDebugLoc());
1677   if (MergedVal != SI.getValueOperand()) {
1678     PHINode *PN =
1679         PHINode::Create(SI.getValueOperand()->getType(), 2, "storemerge");
1680     PN->addIncoming(SI.getValueOperand(), SI.getParent());
1681     Builder.SetInsertPoint(OtherStore);
1682     PN->addIncoming(Builder.CreateBitOrPointerCast(MergedVal, PN->getType()),
1683                     OtherBB);
1684     MergedVal = InsertNewInstBefore(PN, DestBB->front());
1685     PN->setDebugLoc(MergedLoc);
1686   }
1687 
1688   // Advance to a place where it is safe to insert the new store and insert it.
1689   BBI = DestBB->getFirstInsertionPt();
1690   StoreInst *NewSI =
1691       new StoreInst(MergedVal, SI.getOperand(1), SI.isVolatile(), SI.getAlign(),
1692                     SI.getOrdering(), SI.getSyncScopeID());
1693   InsertNewInstBefore(NewSI, *BBI);
1694   NewSI->setDebugLoc(MergedLoc);
1695   NewSI->mergeDIAssignID({&SI, OtherStore});
1696 
1697   // If the two stores had AA tags, merge them.
1698   AAMDNodes AATags = SI.getAAMetadata();
1699   if (AATags)
1700     NewSI->setAAMetadata(AATags.merge(OtherStore->getAAMetadata()));
1701 
1702   // Nuke the old stores.
1703   eraseInstFromFunction(SI);
1704   eraseInstFromFunction(*OtherStore);
1705   return true;
1706 }
1707