1 //===- CoroFrame.cpp - Builds and manipulates coroutine frame -------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 // This file contains classes used to discover whether, for a particular value,
9 // there is a path from its definition to a use that crosses a suspend block.
10 //
11 // Using the information discovered we form a Coroutine Frame structure to
12 // contain those values. All uses of those values are replaced with appropriate
13 // GEP + load from the coroutine frame. At the point of the definition we spill
14 // the value into the coroutine frame.
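//
// As an illustrative sketch (the value names and the field index N below are
// made up, not taken from any particular test), a value that is live across a
// suspend point ends up being rewritten roughly like this:
//
//   %x = call i32 @foo()                                  ; definition
//   %x.spill.addr = getelementptr %f.Frame, ptr %hdl, i32 0, i32 N
//   store i32 %x, ptr %x.spill.addr                       ; spill at definition
//   ; ... suspend point ...
//   %x.reload = load i32, ptr %x.spill.addr               ; reload before use
//   call void @bar(i32 %x.reload)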
15 //===----------------------------------------------------------------------===//
16 
17 #include "CoroInternal.h"
18 #include "llvm/ADT/BitVector.h"
19 #include "llvm/ADT/PostOrderIterator.h"
20 #include "llvm/ADT/ScopeExit.h"
21 #include "llvm/ADT/SmallString.h"
22 #include "llvm/Analysis/CFG.h"
23 #include "llvm/Analysis/PtrUseVisitor.h"
24 #include "llvm/Analysis/StackLifetime.h"
25 #include "llvm/Config/llvm-config.h"
26 #include "llvm/IR/CFG.h"
27 #include "llvm/IR/DIBuilder.h"
28 #include "llvm/IR/DebugInfo.h"
29 #include "llvm/IR/Dominators.h"
30 #include "llvm/IR/IRBuilder.h"
31 #include "llvm/IR/InstIterator.h"
32 #include "llvm/IR/IntrinsicInst.h"
33 #include "llvm/Support/Debug.h"
34 #include "llvm/Support/MathExtras.h"
35 #include "llvm/Support/OptimizedStructLayout.h"
36 #include "llvm/Support/circular_raw_ostream.h"
37 #include "llvm/Support/raw_ostream.h"
38 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
39 #include "llvm/Transforms/Utils/Local.h"
40 #include "llvm/Transforms/Utils/PromoteMemToReg.h"
41 #include <algorithm>
42 #include <deque>
43 #include <optional>
44 
45 using namespace llvm;
46 
47 extern cl::opt<bool> UseNewDbgInfoFormat;
48 
49 // The "coro-suspend-crossing" flag is very noisy. There is another debug type,
50 // "coro-frame", which results in leaner debug spew.
51 #define DEBUG_TYPE "coro-suspend-crossing"
52 
53 enum { SmallVectorThreshold = 32 };
54 
55 // Provides a two-way mapping between blocks and their indices.
56 namespace {
57 class BlockToIndexMapping {
58   SmallVector<BasicBlock *, SmallVectorThreshold> V;
59 
60 public:
61   size_t size() const { return V.size(); }
62 
63   BlockToIndexMapping(Function &F) {
64     for (BasicBlock &BB : F)
65       V.push_back(&BB);
66     llvm::sort(V);
67   }
68 
69   size_t blockToIndex(BasicBlock const *BB) const {
70     auto *I = llvm::lower_bound(V, BB);
71     assert(I != V.end() && *I == BB && "BasicBlockNumbering: Unknown block");
72     return I - V.begin();
73   }
74 
75   BasicBlock *indexToBlock(unsigned Index) const { return V[Index]; }
76 };
77 } // end anonymous namespace
78 
79 // The SuspendCrossingInfo maintains data that allows us to answer the
80 // question: given two BasicBlocks A and B, is there a path from A to B that
81 // passes through a suspend point?
82 //
83 // For every basic block 'i' it maintains a BlockData that consists of:
84 //   Consumes:  a bit vector which contains a set of indices of blocks that can
85 //              reach block 'i'. A block can trivially reach itself.
86 //   Kills: a bit vector which contains a set of indices of blocks that can
87 //          reach block 'i' via a path that crosses a suspend point and does
88 //          not otherwise repeat 'i' (a path to 'i' with no cycle through 'i').
89 //   Suspend: a boolean indicating whether block 'i' contains a suspend point.
90 //   End: a boolean indicating whether block 'i' contains a coro.end intrinsic.
91 //   KillLoop: There is a path from 'i' to 'i' not otherwise repeating 'i' that
92 //             crosses a suspend point.
93 //
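// As a small illustrative example (block names are made up), consider the
// straight-line CFG  entry -> susp -> use,  where 'susp' contains a suspend
// point. Then, roughly:
//   Consumes(use) = {entry, susp, use}
//   Kills(use)    = {entry, susp}
// so a value defined in 'entry' and used in 'use' crosses a suspend point and
// has to be spilled into the coroutine frame.
//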
94 namespace {
95 class SuspendCrossingInfo {
96   BlockToIndexMapping Mapping;
97 
98   struct BlockData {
99     BitVector Consumes;
100     BitVector Kills;
101     bool Suspend = false;
102     bool End = false;
103     bool KillLoop = false;
104     bool Changed = false;
105   };
106   SmallVector<BlockData, SmallVectorThreshold> Block;
107 
108   iterator_range<pred_iterator> predecessors(BlockData const &BD) const {
109     BasicBlock *BB = Mapping.indexToBlock(&BD - &Block[0]);
110     return llvm::predecessors(BB);
111   }
112 
113   BlockData &getBlockData(BasicBlock *BB) {
114     return Block[Mapping.blockToIndex(BB)];
115   }
116 
117   /// Compute the BlockData for the current function in one iteration.
118   /// Initialize - Whether this is the first iteration; if so, we can optimize
119   /// the initial case a little bit by switching the loop manually.
120   /// Returns whether the BlockData changed in this iteration.
121   template <bool Initialize = false>
122   bool computeBlockData(const ReversePostOrderTraversal<Function *> &RPOT);
123 
124 public:
125 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
126   void dump() const;
127   void dump(StringRef Label, BitVector const &BV) const;
128 #endif
129 
130   SuspendCrossingInfo(Function &F, coro::Shape &Shape);
131 
132   /// Returns true if there is a path from \p From to \p To crossing a suspend
133   /// point without crossing \p From a 2nd time.
134   bool hasPathCrossingSuspendPoint(BasicBlock *From, BasicBlock *To) const {
135     size_t const FromIndex = Mapping.blockToIndex(From);
136     size_t const ToIndex = Mapping.blockToIndex(To);
137     bool const Result = Block[ToIndex].Kills[FromIndex];
138     LLVM_DEBUG(dbgs() << From->getName() << " => " << To->getName()
139                       << " answer is " << Result << "\n");
140     return Result;
141   }
142 
143   /// Returns true if there is a path from \p From to \p To crossing a suspend
144   /// point without crossing \p From a 2nd time. If \p From is the same as \p To
145   /// this will also check if there is a looping path crossing a suspend point.
146   bool hasPathOrLoopCrossingSuspendPoint(BasicBlock *From,
147                                          BasicBlock *To) const {
148     size_t const FromIndex = Mapping.blockToIndex(From);
149     size_t const ToIndex = Mapping.blockToIndex(To);
150     bool Result = Block[ToIndex].Kills[FromIndex] ||
151                   (From == To && Block[ToIndex].KillLoop);
152     LLVM_DEBUG(dbgs() << From->getName() << " => " << To->getName()
153                       << " answer is " << Result << " (path or loop)\n");
154     return Result;
155   }
156 
157   bool isDefinitionAcrossSuspend(BasicBlock *DefBB, User *U) const {
158     auto *I = cast<Instruction>(U);
159 
160     // We rewrote PHINodes, so that only the ones with exactly one incoming
161     // value need to be analyzed.
162     if (auto *PN = dyn_cast<PHINode>(I))
163       if (PN->getNumIncomingValues() > 1)
164         return false;
165 
166     BasicBlock *UseBB = I->getParent();
167 
168     // As a special case, treat uses by an llvm.coro.suspend.retcon or an
169     // llvm.coro.suspend.async as if they were uses in the suspend's single
170     // predecessor: the uses conceptually occur before the suspend.
171     if (isa<CoroSuspendRetconInst>(I) || isa<CoroSuspendAsyncInst>(I)) {
172       UseBB = UseBB->getSinglePredecessor();
173       assert(UseBB && "should have split coro.suspend into its own block");
174     }
175 
176     return hasPathCrossingSuspendPoint(DefBB, UseBB);
177   }
178 
179   bool isDefinitionAcrossSuspend(Argument &A, User *U) const {
180     return isDefinitionAcrossSuspend(&A.getParent()->getEntryBlock(), U);
181   }
182 
183   bool isDefinitionAcrossSuspend(Instruction &I, User *U) const {
184     auto *DefBB = I.getParent();
185 
186     // As a special case, treat values produced by an llvm.coro.suspend.*
187     // as if they were defined in the single successor: the uses
188     // conceptually occur after the suspend.
189     if (isa<AnyCoroSuspendInst>(I)) {
190       DefBB = DefBB->getSingleSuccessor();
191       assert(DefBB && "should have split coro.suspend into its own block");
192     }
193 
194     return isDefinitionAcrossSuspend(DefBB, U);
195   }
196 
197   bool isDefinitionAcrossSuspend(Value &V, User *U) const {
198     if (auto *Arg = dyn_cast<Argument>(&V))
199       return isDefinitionAcrossSuspend(*Arg, U);
200     if (auto *Inst = dyn_cast<Instruction>(&V))
201       return isDefinitionAcrossSuspend(*Inst, U);
202 
203     llvm_unreachable(
204         "Coroutine could only collect Argument and Instruction now.");
205   }
206 };
207 } // end anonymous namespace
208 
209 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
210 LLVM_DUMP_METHOD void SuspendCrossingInfo::dump(StringRef Label,
211                                                 BitVector const &BV) const {
212   dbgs() << Label << ":";
213   for (size_t I = 0, N = BV.size(); I < N; ++I)
214     if (BV[I])
215       dbgs() << " " << Mapping.indexToBlock(I)->getName();
216   dbgs() << "\n";
217 }
218 
219 LLVM_DUMP_METHOD void SuspendCrossingInfo::dump() const {
220   for (size_t I = 0, N = Block.size(); I < N; ++I) {
221     BasicBlock *const B = Mapping.indexToBlock(I);
222     dbgs() << B->getName() << ":\n";
223     dump("   Consumes", Block[I].Consumes);
224     dump("      Kills", Block[I].Kills);
225   }
226   dbgs() << "\n";
227 }
228 #endif
229 
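// A sketch of the per-block update performed below (informal set notation;
// every block initially consumes itself):
//   Consumes(B) |= Consumes(P)        for every predecessor P of B
//   Kills(B)    |= Kills(P)           for every predecessor P of B
//   Kills(B)    |= Consumes(P)        if P is a suspend block
//   Kills(B)    |= Consumes(B)        if B is a suspend block
//   Kills(B)     = {}                 if B is a coro.end block
//   Kills(B)    -= {B}                otherwise (recording KillLoop if it was set)
// The update is iterated over the RPO until no block changes.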
230 template <bool Initialize>
231 bool SuspendCrossingInfo::computeBlockData(
232     const ReversePostOrderTraversal<Function *> &RPOT) {
233   bool Changed = false;
234 
235   for (const BasicBlock *BB : RPOT) {
236     auto BBNo = Mapping.blockToIndex(BB);
237     auto &B = Block[BBNo];
238 
239     // We don't need to check the predecessors during initialization.
240     if constexpr (!Initialize)
241       // If none of the predecessors of the current block changed,
242       // the BlockData for the current block cannot change either.
243       if (all_of(predecessors(B), [this](BasicBlock *BB) {
244             return !Block[Mapping.blockToIndex(BB)].Changed;
245           })) {
246         B.Changed = false;
247         continue;
248       }
249 
250     // Save the Consumes and Kills bitsets so that it is easy to see
251     // if anything changed after propagation.
252     auto SavedConsumes = B.Consumes;
253     auto SavedKills = B.Kills;
254 
255     for (BasicBlock *PI : predecessors(B)) {
256       auto PrevNo = Mapping.blockToIndex(PI);
257       auto &P = Block[PrevNo];
258 
259       // Propagate Kills and Consumes from predecessors into B.
260       B.Consumes |= P.Consumes;
261       B.Kills |= P.Kills;
262 
263       // If block P is a suspend block, it should propagate kills into block
264       // B for every block P consumes.
265       if (P.Suspend)
266         B.Kills |= P.Consumes;
267     }
268 
269     if (B.Suspend) {
270       // If block B is a suspend block, it should kill all of the blocks it
271       // consumes.
272       B.Kills |= B.Consumes;
273     } else if (B.End) {
274       // If block B is an end block, it should not propagate kills as the
275       // blocks following coro.end() are reached during initial invocation
276       // of the coroutine while all the data are still available on the
277       // stack or in the registers.
278       B.Kills.reset();
279     } else {
280       // This is reached when block B is neither a suspend block nor a coro.end
281       // block; we need to make sure that it is not in its own kill set.
282       B.KillLoop |= B.Kills[BBNo];
283       B.Kills.reset(BBNo);
284     }
285 
286     if constexpr (!Initialize) {
287       B.Changed = (B.Kills != SavedKills) || (B.Consumes != SavedConsumes);
288       Changed |= B.Changed;
289     }
290   }
291 
292   return Changed;
293 }
294 
295 SuspendCrossingInfo::SuspendCrossingInfo(Function &F, coro::Shape &Shape)
296     : Mapping(F) {
297   const size_t N = Mapping.size();
298   Block.resize(N);
299 
300   // Initialize every block so that it consumes itself
301   for (size_t I = 0; I < N; ++I) {
302     auto &B = Block[I];
303     B.Consumes.resize(N);
304     B.Kills.resize(N);
305     B.Consumes.set(I);
306     B.Changed = true;
307   }
308 
309   // Mark all CoroEnd Blocks. We do not propagate Kills beyond coro.ends as
310   // the code beyond coro.end is reachable during initial invocation of the
311   // coroutine.
312   for (auto *CE : Shape.CoroEnds)
313     getBlockData(CE->getParent()).End = true;
314 
315   // Mark all suspend blocks and indicate that they kill everything they
316 // consume. Note that crossing coro.save also requires a spill, as any code
317   // between coro.save and coro.suspend may resume the coroutine and all of the
318   // state needs to be saved by that time.
319   auto markSuspendBlock = [&](IntrinsicInst *BarrierInst) {
320     BasicBlock *SuspendBlock = BarrierInst->getParent();
321     auto &B = getBlockData(SuspendBlock);
322     B.Suspend = true;
323     B.Kills |= B.Consumes;
324   };
325   for (auto *CSI : Shape.CoroSuspends) {
326     markSuspendBlock(CSI);
327     if (auto *Save = CSI->getCoroSave())
328       markSuspendBlock(Save);
329   }
330 
331 // RPO traversal is considered to be faster for forward-edge
332 // dataflow analysis.
333   ReversePostOrderTraversal<Function *> RPOT(&F);
334   computeBlockData</*Initialize=*/true>(RPOT);
335   while (computeBlockData</*Initialize*/ false>(RPOT))
336     ;
337 
338   LLVM_DEBUG(dump());
339 }
340 
341 namespace {
342 
343 // RematGraph is used to construct a DAG for rematerializable instructions.
344 // When the constructor is invoked with a candidate instruction (which is
345 // materializable), it builds a DAG of materializable instructions reachable
346 // from that point.
347 // Typically, for each instruction identified as re-materializable across a
348 // suspend point, a RematGraph will be created.
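// As an illustrative sketch (value names are made up): if
//   %d = add i32 %a, %b
//   %e = mul i32 %d, %c
// are both materializable and %e is used across a suspend point, the graph
// rooted at %e gets the edge %e -> %d, and %d in turn gets edges to any of its
// operands that are themselves materializable instructions whose definitions
// cross the suspend point.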
349 struct RematGraph {
350   // Each RematNode in the graph contains edges to the instructions providing
351   // operands of the current node.
352   struct RematNode {
353     Instruction *Node;
354     SmallVector<RematNode *> Operands;
355     RematNode() = default;
356     RematNode(Instruction *V) : Node(V) {}
357   };
358 
359   RematNode *EntryNode;
360   using RematNodeMap =
361       SmallMapVector<Instruction *, std::unique_ptr<RematNode>, 8>;
362   RematNodeMap Remats;
363   const std::function<bool(Instruction &)> &MaterializableCallback;
364   SuspendCrossingInfo &Checker;
365 
366   RematGraph(const std::function<bool(Instruction &)> &MaterializableCallback,
367              Instruction *I, SuspendCrossingInfo &Checker)
368       : MaterializableCallback(MaterializableCallback), Checker(Checker) {
369     std::unique_ptr<RematNode> FirstNode = std::make_unique<RematNode>(I);
370     EntryNode = FirstNode.get();
371     std::deque<std::unique_ptr<RematNode>> WorkList;
372     addNode(std::move(FirstNode), WorkList, cast<User>(I));
373     while (WorkList.size()) {
374       std::unique_ptr<RematNode> N = std::move(WorkList.front());
375       WorkList.pop_front();
376       addNode(std::move(N), WorkList, cast<User>(I));
377     }
378   }
379 
380   void addNode(std::unique_ptr<RematNode> NUPtr,
381                std::deque<std::unique_ptr<RematNode>> &WorkList,
382                User *FirstUse) {
383     RematNode *N = NUPtr.get();
384     if (Remats.count(N->Node))
385       return;
386 
387     // We haven't seen this node yet - add it to the list
388     Remats[N->Node] = std::move(NUPtr);
389     for (auto &Def : N->Node->operands()) {
390       Instruction *D = dyn_cast<Instruction>(Def.get());
391       if (!D || !MaterializableCallback(*D) ||
392           !Checker.isDefinitionAcrossSuspend(*D, FirstUse))
393         continue;
394 
395       if (Remats.count(D)) {
396         // Already have this in the graph
397         N->Operands.push_back(Remats[D].get());
398         continue;
399       }
400 
401       bool NoMatch = true;
402       for (auto &I : WorkList) {
403         if (I->Node == D) {
404           NoMatch = false;
405           N->Operands.push_back(I.get());
406           break;
407         }
408       }
409       if (NoMatch) {
410         // Create a new node
411         std::unique_ptr<RematNode> ChildNode = std::make_unique<RematNode>(D);
412         N->Operands.push_back(ChildNode.get());
413         WorkList.push_back(std::move(ChildNode));
414       }
415     }
416   }
417 
418 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
419   void dump() const {
420     dbgs() << "Entry (";
421     if (EntryNode->Node->getParent()->hasName())
422       dbgs() << EntryNode->Node->getParent()->getName();
423     else
424       EntryNode->Node->getParent()->printAsOperand(dbgs(), false);
425     dbgs() << ") : " << *EntryNode->Node << "\n";
426     for (auto &E : Remats) {
427       dbgs() << *(E.first) << "\n";
428       for (RematNode *U : E.second->Operands)
429         dbgs() << "  " << *U->Node << "\n";
430     }
431   }
432 #endif
433 };
434 } // end anonymous namespace
435 
436 namespace llvm {
437 
438 template <> struct GraphTraits<RematGraph *> {
439   using NodeRef = RematGraph::RematNode *;
440   using ChildIteratorType = RematGraph::RematNode **;
441 
442   static NodeRef getEntryNode(RematGraph *G) { return G->EntryNode; }
443   static ChildIteratorType child_begin(NodeRef N) {
444     return N->Operands.begin();
445   }
446   static ChildIteratorType child_end(NodeRef N) { return N->Operands.end(); }
447 };
448 
449 } // end namespace llvm
450 
451 #undef DEBUG_TYPE // "coro-suspend-crossing"
452 #define DEBUG_TYPE "coro-frame"
453 
454 namespace {
455 class FrameTypeBuilder;
456 // Mapping from the to-be-spilled value to all the users that need reload.
457 using SpillInfo = SmallMapVector<Value *, SmallVector<Instruction *, 2>, 8>;
458 struct AllocaInfo {
459   AllocaInst *Alloca;
460   DenseMap<Instruction *, std::optional<APInt>> Aliases;
461   bool MayWriteBeforeCoroBegin;
462   AllocaInfo(AllocaInst *Alloca,
463              DenseMap<Instruction *, std::optional<APInt>> Aliases,
464              bool MayWriteBeforeCoroBegin)
465       : Alloca(Alloca), Aliases(std::move(Aliases)),
466         MayWriteBeforeCoroBegin(MayWriteBeforeCoroBegin) {}
467 };
468 struct FrameDataInfo {
469   // All the values (that are not allocas) that need to be spilled to the
470   // frame.
471   SpillInfo Spills;
472   // Allocas contains all values defined as allocas that need to live in the
473   // frame.
474   SmallVector<AllocaInfo, 8> Allocas;
475 
476   SmallVector<Value *, 8> getAllDefs() const {
477     SmallVector<Value *, 8> Defs;
478     for (const auto &P : Spills)
479       Defs.push_back(P.first);
480     for (const auto &A : Allocas)
481       Defs.push_back(A.Alloca);
482     return Defs;
483   }
484 
485   uint32_t getFieldIndex(Value *V) const {
486     auto Itr = FieldIndexMap.find(V);
487     assert(Itr != FieldIndexMap.end() &&
488            "Value does not have a frame field index");
489     return Itr->second;
490   }
491 
492   void setFieldIndex(Value *V, uint32_t Index) {
493     assert((LayoutIndexUpdateStarted || FieldIndexMap.count(V) == 0) &&
494            "Cannot set the index for the same field twice.");
495     FieldIndexMap[V] = Index;
496   }
497 
498   Align getAlign(Value *V) const {
499     auto Iter = FieldAlignMap.find(V);
500     assert(Iter != FieldAlignMap.end());
501     return Iter->second;
502   }
503 
504   void setAlign(Value *V, Align AL) {
505     assert(FieldAlignMap.count(V) == 0);
506     FieldAlignMap.insert({V, AL});
507   }
508 
509   uint64_t getDynamicAlign(Value *V) const {
510     auto Iter = FieldDynamicAlignMap.find(V);
511     assert(Iter != FieldDynamicAlignMap.end());
512     return Iter->second;
513   }
514 
515   void setDynamicAlign(Value *V, uint64_t Align) {
516     assert(FieldDynamicAlignMap.count(V) == 0);
517     FieldDynamicAlignMap.insert({V, Align});
518   }
519 
520   uint64_t getOffset(Value *V) const {
521     auto Iter = FieldOffsetMap.find(V);
522     assert(Iter != FieldOffsetMap.end());
523     return Iter->second;
524   }
525 
526   void setOffset(Value *V, uint64_t Offset) {
527     assert(FieldOffsetMap.count(V) == 0);
528     FieldOffsetMap.insert({V, Offset});
529   }
530 
531   // Remap the index of every field in the frame, using the final layout index.
532   void updateLayoutIndex(FrameTypeBuilder &B);
533 
534 private:
535   // LayoutIndexUpdateStarted is used to avoid updating the index of any field
536   // twice by mistake.
537   bool LayoutIndexUpdateStarted = false;
538   // Map from values to their slot indexes on the frame. They will be first set
539   // with their original insertion field index. After the frame is built, their
540   // indexes will be updated into the final layout index.
541   DenseMap<Value *, uint32_t> FieldIndexMap;
542   // Map from values to their alignment on the frame. They are set after
543   // the frame is built.
544   DenseMap<Value *, Align> FieldAlignMap;
545   DenseMap<Value *, uint64_t> FieldDynamicAlignMap;
546   // Map from values to their offset on the frame. They are set after
547   // the frame is built.
548   DenseMap<Value *, uint64_t> FieldOffsetMap;
549 };
550 } // namespace
551 
552 #ifndef NDEBUG
553 static void dumpSpills(StringRef Title, const SpillInfo &Spills) {
554   dbgs() << "------------- " << Title << "--------------\n";
555   for (const auto &E : Spills) {
556     E.first->dump();
557     dbgs() << "   user: ";
558     for (auto *I : E.second)
559       I->dump();
560   }
561 }
562 static void dumpRemats(
563     StringRef Title,
564     const SmallMapVector<Instruction *, std::unique_ptr<RematGraph>, 8> &RM) {
565   dbgs() << "------------- " << Title << "--------------\n";
566   for (const auto &E : RM) {
567     E.second->dump();
568     dbgs() << "--\n";
569   }
570 }
571 
572 static void dumpAllocas(const SmallVectorImpl<AllocaInfo> &Allocas) {
573   dbgs() << "------------- Allocas --------------\n";
574   for (const auto &A : Allocas) {
575     A.Alloca->dump();
576   }
577 }
578 #endif
579 
580 namespace {
581 using FieldIDType = size_t;
582 // We cannot rely solely on the natural alignment of a type when building a
583 // coroutine frame: if the alignment specified on the Alloca instruction
584 // differs from the natural alignment of the alloca type, we will need to
585 // insert padding.
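//
// For example (a made-up case), an
//   alloca i32, align 16
// must occupy a frame field aligned to 16 even though the natural alignment of
// i32 is only 4, which may require padding bytes in front of that field in the
// frame struct.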
586 class FrameTypeBuilder {
587 private:
588   struct Field {
589     uint64_t Size;
590     uint64_t Offset;
591     Type *Ty;
592     FieldIDType LayoutFieldIndex;
593     Align Alignment;
594     Align TyAlignment;
595     uint64_t DynamicAlignBuffer;
596   };
597 
598   const DataLayout &DL;
599   LLVMContext &Context;
600   uint64_t StructSize = 0;
601   Align StructAlign;
602   bool IsFinished = false;
603 
604   std::optional<Align> MaxFrameAlignment;
605 
606   SmallVector<Field, 8> Fields;
607   DenseMap<Value*, unsigned> FieldIndexByKey;
608 
609 public:
610   FrameTypeBuilder(LLVMContext &Context, const DataLayout &DL,
611                    std::optional<Align> MaxFrameAlignment)
612       : DL(DL), Context(Context), MaxFrameAlignment(MaxFrameAlignment) {}
613 
614   /// Add a field to this structure for the storage of an `alloca`
615   /// instruction.
616   [[nodiscard]] FieldIDType addFieldForAlloca(AllocaInst *AI,
617                                               bool IsHeader = false) {
618     Type *Ty = AI->getAllocatedType();
619 
620     // Make an array type if this is a static array allocation.
621     if (AI->isArrayAllocation()) {
622       if (auto *CI = dyn_cast<ConstantInt>(AI->getArraySize()))
623         Ty = ArrayType::get(Ty, CI->getValue().getZExtValue());
624       else
625         report_fatal_error("Coroutines cannot handle non static allocas yet");
626     }
627 
628     return addField(Ty, AI->getAlign(), IsHeader);
629   }
630 
631   /// We want to put allocas whose lifetime ranges do not overlap
632   /// into one slot of the coroutine frame.
633   /// Consider the example at: https://bugs.llvm.org/show_bug.cgi?id=45566
634   ///
635   ///     cppcoro::task<void> alternative_paths(bool cond) {
636   ///         if (cond) {
637   ///             big_structure a;
638   ///             process(a);
639   ///             co_await something();
640   ///         } else {
641   ///             big_structure b;
642   ///             process2(b);
643   ///             co_await something();
644   ///         }
645   ///     }
646   ///
647   /// We want to put variable a and variable b in the same slot to
648   /// reduce the size of the coroutine frame.
649   ///
650   /// This function uses the StackLifetime algorithm to partition the
651   /// AllocaInsts in Spills into non-overlapping sets, so that allocas in the
652   /// same non-overlapping set can share a slot in the coroutine frame. It then
653   /// adds a field for each non-overlapping set, using the largest type in the
654   /// set as the field type.
655   ///
656   /// Side effect: because we sort the allocas, the order of allocas in the
657   /// frame may differ from their order in the source code.
658   void addFieldForAllocas(const Function &F, FrameDataInfo &FrameData,
659                           coro::Shape &Shape);
660 
661   /// Add a field to this structure.
662   [[nodiscard]] FieldIDType addField(Type *Ty, MaybeAlign MaybeFieldAlignment,
663                                      bool IsHeader = false,
664                                      bool IsSpillOfValue = false) {
665     assert(!IsFinished && "adding fields to a finished builder");
666     assert(Ty && "must provide a type for a field");
667 
668     // The field size is always the alloc size of the type.
669     uint64_t FieldSize = DL.getTypeAllocSize(Ty);
670 
671     // For an alloca with size=0, we don't need to add a field; it
672     // can just point to any index in the frame. Use index 0.
673     if (FieldSize == 0) {
674       return 0;
675     }
676 
677     // The field alignment might not be the type alignment, but we need
678     // to remember the type alignment anyway to build the type.
679     // If we are spilling values we don't need to worry about ABI alignment
680     // concerns.
681     Align ABIAlign = DL.getABITypeAlign(Ty);
682     Align TyAlignment = ABIAlign;
683     if (IsSpillOfValue && MaxFrameAlignment && *MaxFrameAlignment < ABIAlign)
684       TyAlignment = *MaxFrameAlignment;
685     Align FieldAlignment = MaybeFieldAlignment.value_or(TyAlignment);
686 
687     // The field alignment could be bigger than the max frame alignment; in that case
688     // we request additional storage to be able to dynamically align the
689     // pointer.
690     uint64_t DynamicAlignBuffer = 0;
691     if (MaxFrameAlignment && (FieldAlignment > *MaxFrameAlignment)) {
692       DynamicAlignBuffer =
693           offsetToAlignment(MaxFrameAlignment->value(), FieldAlignment);
694       FieldAlignment = *MaxFrameAlignment;
695       FieldSize = FieldSize + DynamicAlignBuffer;
696     }
697 
698     // Lay out header fields immediately.
699     uint64_t Offset;
700     if (IsHeader) {
701       Offset = alignTo(StructSize, FieldAlignment);
702       StructSize = Offset + FieldSize;
703 
704       // Everything else has a flexible offset.
705     } else {
706       Offset = OptimizedStructLayoutField::FlexibleOffset;
707     }
708 
709     Fields.push_back({FieldSize, Offset, Ty, 0, FieldAlignment, TyAlignment,
710                       DynamicAlignBuffer});
711     return Fields.size() - 1;
712   }
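  // Illustrative example of the dynamic-alignment path above (the numbers are
  // made up): with MaxFrameAlignment = 16, a spilled field requesting align 32
  // gets DynamicAlignBuffer = offsetToAlignment(16, Align(32)) = 16 extra
  // bytes, its recorded field alignment is capped to 16, and the field's
  // address is expected to be re-aligned to 32 at runtime using those extra
  // bytes.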
713 
714   /// Finish the layout and set the body on the given type.
715   void finish(StructType *Ty);
716 
717   uint64_t getStructSize() const {
718     assert(IsFinished && "not yet finished!");
719     return StructSize;
720   }
721 
722   Align getStructAlign() const {
723     assert(IsFinished && "not yet finished!");
724     return StructAlign;
725   }
726 
727   FieldIDType getLayoutFieldIndex(FieldIDType Id) const {
728     assert(IsFinished && "not yet finished!");
729     return Fields[Id].LayoutFieldIndex;
730   }
731 
732   Field getLayoutField(FieldIDType Id) const {
733     assert(IsFinished && "not yet finished!");
734     return Fields[Id];
735   }
736 };
737 } // namespace
738 
739 void FrameDataInfo::updateLayoutIndex(FrameTypeBuilder &B) {
740   auto Updater = [&](Value *I) {
741     auto Field = B.getLayoutField(getFieldIndex(I));
742     setFieldIndex(I, Field.LayoutFieldIndex);
743     setAlign(I, Field.Alignment);
744     uint64_t dynamicAlign =
745         Field.DynamicAlignBuffer
746             ? Field.DynamicAlignBuffer + Field.Alignment.value()
747             : 0;
748     setDynamicAlign(I, dynamicAlign);
749     setOffset(I, Field.Offset);
750   };
751   LayoutIndexUpdateStarted = true;
752   for (auto &S : Spills)
753     Updater(S.first);
754   for (const auto &A : Allocas)
755     Updater(A.Alloca);
756   LayoutIndexUpdateStarted = false;
757 }
758 
759 void FrameTypeBuilder::addFieldForAllocas(const Function &F,
760                                           FrameDataInfo &FrameData,
761                                           coro::Shape &Shape) {
762   using AllocaSetType = SmallVector<AllocaInst *, 4>;
763   SmallVector<AllocaSetType, 4> NonOverlapedAllocas;
764 
765   // We need to add fields for the allocas at the end of this function.
766   auto AddFieldForAllocasAtExit = make_scope_exit([&]() {
767     for (auto AllocaList : NonOverlapedAllocas) {
768       auto *LargestAI = *AllocaList.begin();
769       FieldIDType Id = addFieldForAlloca(LargestAI);
770       for (auto *Alloca : AllocaList)
771         FrameData.setFieldIndex(Alloca, Id);
772     }
773   });
774 
775   if (!Shape.OptimizeFrame) {
776     for (const auto &A : FrameData.Allocas) {
777       AllocaInst *Alloca = A.Alloca;
778       NonOverlapedAllocas.emplace_back(AllocaSetType(1, Alloca));
779     }
780     return;
781   }
782 
783   // Because there are paths from the lifetime.start to coro.end
784   // for each alloca, the live ranges of all the allocas overlap
785   // in the blocks that contain coro.end and in their successor blocks.
786   // So we choose to skip those blocks when we calculate the live range
787   // for each alloca. This should be reasonable since there shouldn't be uses
788   // in those blocks and the coroutine frame shouldn't be used outside the
789   // coroutine body.
790   //
791   // Note that the user of coro.suspend may not be a SwitchInst. However, that
792   // case seems too complex to handle, and it is harmless to skip such
793   // patterns since doing so merely prevents the allocas from sharing the
794   // same slot.
795   DenseMap<SwitchInst *, BasicBlock *> DefaultSuspendDest;
796   for (auto *CoroSuspendInst : Shape.CoroSuspends) {
797     for (auto *U : CoroSuspendInst->users()) {
798       if (auto *ConstSWI = dyn_cast<SwitchInst>(U)) {
799         auto *SWI = const_cast<SwitchInst *>(ConstSWI);
800         DefaultSuspendDest[SWI] = SWI->getDefaultDest();
801         SWI->setDefaultDest(SWI->getSuccessor(1));
802       }
803     }
804   }
805 
806   auto ExtractAllocas = [&]() {
807     AllocaSetType Allocas;
808     Allocas.reserve(FrameData.Allocas.size());
809     for (const auto &A : FrameData.Allocas)
810       Allocas.push_back(A.Alloca);
811     return Allocas;
812   };
813   StackLifetime StackLifetimeAnalyzer(F, ExtractAllocas(),
814                                       StackLifetime::LivenessType::May);
815   StackLifetimeAnalyzer.run();
816   auto IsAllocaInferenre = [&](const AllocaInst *AI1, const AllocaInst *AI2) {
817     return StackLifetimeAnalyzer.getLiveRange(AI1).overlaps(
818         StackLifetimeAnalyzer.getLiveRange(AI2));
819   };
820   auto GetAllocaSize = [&](const AllocaInfo &A) {
821     std::optional<TypeSize> RetSize = A.Alloca->getAllocationSize(DL);
822     assert(RetSize && "Variable Length Arrays (VLA) are not supported.\n");
823     assert(!RetSize->isScalable() && "Scalable vectors are not yet supported");
824     return RetSize->getFixedValue();
825   };
826   // Put larger allocas in front, so that larger allocas have higher
827   // priority to merge, which can potentially save more space. This also keeps
828   // each AllocaSet ordered, so we can easily get the largest Alloca in an
829   // AllocaSet.
830   sort(FrameData.Allocas, [&](const auto &Iter1, const auto &Iter2) {
831     return GetAllocaSize(Iter1) > GetAllocaSize(Iter2);
832   });
833   for (const auto &A : FrameData.Allocas) {
834     AllocaInst *Alloca = A.Alloca;
835     bool Merged = false;
836     // Try to find an existing NonOverlappedAllocaSet that the Alloca does not
837     // interfere with. If we find one, insert the alloca into that
838     // NonOverlappedAllocaSet.
839     for (auto &AllocaSet : NonOverlapedAllocas) {
840       assert(!AllocaSet.empty() && "Processing Alloca Set is not empty.\n");
841       bool NoInference = none_of(AllocaSet, [&](auto Iter) {
842         return IsAllocaInferenre(Alloca, Iter);
843       });
844       // If the alignment of A is a multiple of the alignment of B, the address
845       // of A satisfies the alignment requirement of B.
846       //
847       // There may be other, more fine-grained strategies for handling the
848       // alignment information during the merging process, but they seem hard to
849       // implement and of little benefit.
850       bool Alignable = [&]() -> bool {
851         auto *LargestAlloca = *AllocaSet.begin();
852         return LargestAlloca->getAlign().value() % Alloca->getAlign().value() ==
853                0;
854       }();
855       bool CouldMerge = NoInference && Alignable;
856       if (!CouldMerge)
857         continue;
858       AllocaSet.push_back(Alloca);
859       Merged = true;
860       break;
861     }
862     if (!Merged) {
863       NonOverlapedAllocas.emplace_back(AllocaSetType(1, Alloca));
864     }
865   }
866   // Restore the default target destination that we preserved for each switch
867   // statement.
868   for (auto SwitchAndDefaultDest : DefaultSuspendDest) {
869     SwitchInst *SWI = SwitchAndDefaultDest.first;
870     BasicBlock *DestBB = SwitchAndDefaultDest.second;
871     SWI->setDefaultDest(DestBB);
872   }
873   // This debug output tells us which allocas were merged into one slot.
874   LLVM_DEBUG(for (auto &AllocaSet
875                   : NonOverlapedAllocas) {
876     if (AllocaSet.size() > 1) {
877       dbgs() << "In Function:" << F.getName() << "\n";
878       dbgs() << "Find Union Set "
879              << "\n";
880       dbgs() << "\tAllocas are \n";
881       for (auto Alloca : AllocaSet)
882         dbgs() << "\t\t" << *Alloca << "\n";
883     }
884   });
885 }
886 
887 void FrameTypeBuilder::finish(StructType *Ty) {
888   assert(!IsFinished && "already finished!");
889 
890   // Prepare the optimal-layout field array.
891   // The Id in the layout field is a pointer to our Field for it.
892   SmallVector<OptimizedStructLayoutField, 8> LayoutFields;
893   LayoutFields.reserve(Fields.size());
894   for (auto &Field : Fields) {
895     LayoutFields.emplace_back(&Field, Field.Size, Field.Alignment,
896                               Field.Offset);
897   }
898 
899   // Perform layout.
900   auto SizeAndAlign = performOptimizedStructLayout(LayoutFields);
901   StructSize = SizeAndAlign.first;
902   StructAlign = SizeAndAlign.second;
903 
904   auto getField = [](const OptimizedStructLayoutField &LayoutField) -> Field & {
905     return *static_cast<Field *>(const_cast<void*>(LayoutField.Id));
906   };
907 
908   // We need to produce a packed struct type if there's a field whose
909   // assigned offset isn't a multiple of its natural type alignment.
910   bool Packed = [&] {
911     for (auto &LayoutField : LayoutFields) {
912       auto &F = getField(LayoutField);
913       if (!isAligned(F.TyAlignment, LayoutField.Offset))
914         return true;
915     }
916     return false;
917   }();
918 
919   // Build the struct body.
920   SmallVector<Type*, 16> FieldTypes;
921   FieldTypes.reserve(LayoutFields.size() * 3 / 2);
922   uint64_t LastOffset = 0;
923   for (auto &LayoutField : LayoutFields) {
924     auto &F = getField(LayoutField);
925 
926     auto Offset = LayoutField.Offset;
927 
928     // Add a padding field if there's a padding gap and we're either
929     // building a packed struct or the padding gap is more than we'd
930     // get from aligning to the field type's natural alignment.
931     assert(Offset >= LastOffset);
932     if (Offset != LastOffset) {
933       if (Packed || alignTo(LastOffset, F.TyAlignment) != Offset)
934         FieldTypes.push_back(ArrayType::get(Type::getInt8Ty(Context),
935                                             Offset - LastOffset));
936     }
937 
938     F.Offset = Offset;
939     F.LayoutFieldIndex = FieldTypes.size();
940 
941     FieldTypes.push_back(F.Ty);
942     if (F.DynamicAlignBuffer) {
943       FieldTypes.push_back(
944           ArrayType::get(Type::getInt8Ty(Context), F.DynamicAlignBuffer));
945     }
946     LastOffset = Offset + F.Size;
947   }
948 
949   Ty->setBody(FieldTypes, Packed);
950 
951 #ifndef NDEBUG
952   // Check that the IR layout matches the offsets we expect.
953   auto Layout = DL.getStructLayout(Ty);
954   for (auto &F : Fields) {
955     assert(Ty->getElementType(F.LayoutFieldIndex) == F.Ty);
956     assert(Layout->getElementOffset(F.LayoutFieldIndex) == F.Offset);
957   }
958 #endif
959 
960   IsFinished = true;
961 }
962 
963 static void cacheDIVar(FrameDataInfo &FrameData,
964                        DenseMap<Value *, DILocalVariable *> &DIVarCache) {
965   for (auto *V : FrameData.getAllDefs()) {
966     if (DIVarCache.contains(V))
967       continue;
968 
969     auto CacheIt = [&DIVarCache, V](const auto &Container) {
970       auto *I = llvm::find_if(Container, [](auto *DDI) {
971         return DDI->getExpression()->getNumElements() == 0;
972       });
973       if (I != Container.end())
974         DIVarCache.insert({V, (*I)->getVariable()});
975     };
976     CacheIt(findDbgDeclares(V));
977     CacheIt(findDVRDeclares(V));
978   }
979 }
980 
981 /// Create a name for the given Type. It uses MDString to store the newly
982 /// created string to avoid memory leaks.
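/// For example (illustrative inputs): i32 becomes "__int_32", double becomes
/// "__double_", a struct named "std.coroutine_handle" becomes
/// "std_coroutine_handle", and anything unrecognized becomes "UnknownType".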
983 static StringRef solveTypeName(Type *Ty) {
984   if (Ty->isIntegerTy()) {
985     // The longest name in common may be '__int_128', which has 9 characters.
986     SmallString<16> Buffer;
987     raw_svector_ostream OS(Buffer);
988     OS << "__int_" << cast<IntegerType>(Ty)->getBitWidth();
989     auto *MDName = MDString::get(Ty->getContext(), OS.str());
990     return MDName->getString();
991   }
992 
993   if (Ty->isFloatingPointTy()) {
994     if (Ty->isFloatTy())
995       return "__float_";
996     if (Ty->isDoubleTy())
997       return "__double_";
998     return "__floating_type_";
999   }
1000 
1001   if (Ty->isPointerTy())
1002     return "PointerType";
1003 
1004   if (Ty->isStructTy()) {
1005     if (!cast<StructType>(Ty)->hasName())
1006       return "__LiteralStructType_";
1007 
1008     auto Name = Ty->getStructName();
1009 
1010     SmallString<16> Buffer(Name);
1011     for (auto &Iter : Buffer)
1012       if (Iter == '.' || Iter == ':')
1013         Iter = '_';
1014     auto *MDName = MDString::get(Ty->getContext(), Buffer.str());
1015     return MDName->getString();
1016   }
1017 
1018   return "UnknownType";
1019 }
1020 
1021 static DIType *solveDIType(DIBuilder &Builder, Type *Ty,
1022                            const DataLayout &Layout, DIScope *Scope,
1023                            unsigned LineNum,
1024                            DenseMap<Type *, DIType *> &DITypeCache) {
1025   if (DIType *DT = DITypeCache.lookup(Ty))
1026     return DT;
1027 
1028   StringRef Name = solveTypeName(Ty);
1029 
1030   DIType *RetType = nullptr;
1031 
1032   if (Ty->isIntegerTy()) {
1033     auto BitWidth = cast<IntegerType>(Ty)->getBitWidth();
1034     RetType = Builder.createBasicType(Name, BitWidth, dwarf::DW_ATE_signed,
1035                                       llvm::DINode::FlagArtificial);
1036   } else if (Ty->isFloatingPointTy()) {
1037     RetType = Builder.createBasicType(Name, Layout.getTypeSizeInBits(Ty),
1038                                       dwarf::DW_ATE_float,
1039                                       llvm::DINode::FlagArtificial);
1040   } else if (Ty->isPointerTy()) {
1041     // Construct a PointerType that points to null (aka void *) instead of
1042     // exploring the pointee type, to avoid an infinite search problem. For
1043     // example, we would be in trouble if we traversed recursively:
1044     //
1045     //  struct Node {
1046     //      Node* ptr;
1047     //  };
1048     RetType =
1049         Builder.createPointerType(nullptr, Layout.getTypeSizeInBits(Ty),
1050                                   Layout.getABITypeAlign(Ty).value() * CHAR_BIT,
1051                                   /*DWARFAddressSpace=*/std::nullopt, Name);
1052   } else if (Ty->isStructTy()) {
1053     auto *DIStruct = Builder.createStructType(
1054         Scope, Name, Scope->getFile(), LineNum, Layout.getTypeSizeInBits(Ty),
1055         Layout.getPrefTypeAlign(Ty).value() * CHAR_BIT,
1056         llvm::DINode::FlagArtificial, nullptr, llvm::DINodeArray());
1057 
1058     auto *StructTy = cast<StructType>(Ty);
1059     SmallVector<Metadata *, 16> Elements;
1060     for (unsigned I = 0; I < StructTy->getNumElements(); I++) {
1061       DIType *DITy = solveDIType(Builder, StructTy->getElementType(I), Layout,
1062                                  Scope, LineNum, DITypeCache);
1063       assert(DITy);
1064       Elements.push_back(Builder.createMemberType(
1065           Scope, DITy->getName(), Scope->getFile(), LineNum,
1066           DITy->getSizeInBits(), DITy->getAlignInBits(),
1067           Layout.getStructLayout(StructTy)->getElementOffsetInBits(I),
1068           llvm::DINode::FlagArtificial, DITy));
1069     }
1070 
1071     Builder.replaceArrays(DIStruct, Builder.getOrCreateArray(Elements));
1072 
1073     RetType = DIStruct;
1074   } else {
1075     LLVM_DEBUG(dbgs() << "Unresolved Type: " << *Ty << "\n");
1076     TypeSize Size = Layout.getTypeSizeInBits(Ty);
1077     auto *CharSizeType = Builder.createBasicType(
1078         Name, 8, dwarf::DW_ATE_unsigned_char, llvm::DINode::FlagArtificial);
1079 
1080     if (Size <= 8)
1081       RetType = CharSizeType;
1082     else {
1083       if (Size % 8 != 0)
1084         Size = TypeSize::getFixed(Size + 8 - (Size % 8));
1085 
1086       RetType = Builder.createArrayType(
1087           Size, Layout.getPrefTypeAlign(Ty).value(), CharSizeType,
1088           Builder.getOrCreateArray(Builder.getOrCreateSubrange(0, Size / 8)));
1089     }
1090   }
1091 
1092   DITypeCache.insert({Ty, RetType});
1093   return RetType;
1094 }
1095 
1096 /// Build artificial debug info for C++ coroutine frames to allow users to
1097 /// inspect the contents of the frame directly.
1098 ///
1099 /// Create debug information for the coroutine frame with the debug name
1100 /// "__coro_frame". The debug information for the fields of the coroutine
1101 /// frame is constructed in the following way:
1102 /// 1. For every value in the frame, we search its dbg.declare uses to find
1103 ///    the corresponding debug variable for the value. If we can find the
1104 ///    debug variable, we get full and accurate debug information.
1105 /// 2. If we can't get debug information in step 1, we can only try to build
1106 ///    the DIType from the Type. We do this in solveDIType. We only handle
1107 ///    integer, floating-point, pointer and struct types for now.
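///
/// As an illustrative sketch (member names other than the fixed __resume_fn,
/// __destroy_fn and __coro_index ones are made up), for a coroutine `f` the
/// generated composite type looks roughly like:
///
///   struct f.coro_frame_ty {      // artificial DW_TAG_structure_type
///     void *__resume_fn;
///     void *__destroy_fn;
///     char  __coro_index;
///     ...                         // one member per frame field, named after
///                                 // its dbg.declare variable when available,
///                                 // otherwise after its solved type
///   };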
1108 static void buildFrameDebugInfo(Function &F, coro::Shape &Shape,
1109                                 FrameDataInfo &FrameData) {
1110   DISubprogram *DIS = F.getSubprogram();
1111   // If there is no DISubprogram for F, it implies the function is not compiled
1112   // with debug info, so we don't need to generate debug info for the frame
1113   // either.
1114   if (!DIS || !DIS->getUnit() ||
1115       !dwarf::isCPlusPlus(
1116           (dwarf::SourceLanguage)DIS->getUnit()->getSourceLanguage()))
1117     return;
1118 
1119   assert(Shape.ABI == coro::ABI::Switch &&
1120          "We could only build debug infomation for C++ coroutine now.\n");
1121 
1122   DIBuilder DBuilder(*F.getParent(), /*AllowUnresolved*/ false);
1123 
1124   AllocaInst *PromiseAlloca = Shape.getPromiseAlloca();
1125   assert(PromiseAlloca &&
1126          "Coroutine with switch ABI should own Promise alloca");
1127 
1128   TinyPtrVector<DbgDeclareInst *> DIs = findDbgDeclares(PromiseAlloca);
1129   TinyPtrVector<DbgVariableRecord *> DVRs = findDVRDeclares(PromiseAlloca);
1130 
1131   DILocalVariable *PromiseDIVariable = nullptr;
1132   DILocation *DILoc = nullptr;
1133   if (!DIs.empty()) {
1134     DbgDeclareInst *PromiseDDI = DIs.front();
1135     PromiseDIVariable = PromiseDDI->getVariable();
1136     DILoc = PromiseDDI->getDebugLoc().get();
1137   } else if (!DVRs.empty()) {
1138     DbgVariableRecord *PromiseDVR = DVRs.front();
1139     PromiseDIVariable = PromiseDVR->getVariable();
1140     DILoc = PromiseDVR->getDebugLoc().get();
1141   } else {
1142     return;
1143   }
1144 
1145   DILocalScope *PromiseDIScope = PromiseDIVariable->getScope();
1146   DIFile *DFile = PromiseDIScope->getFile();
1147   unsigned LineNum = PromiseDIVariable->getLine();
1148 
1149   DICompositeType *FrameDITy = DBuilder.createStructType(
1150       DIS->getUnit(), Twine(F.getName() + ".coro_frame_ty").str(),
1151       DFile, LineNum, Shape.FrameSize * 8,
1152       Shape.FrameAlign.value() * 8, llvm::DINode::FlagArtificial, nullptr,
1153       llvm::DINodeArray());
1154   StructType *FrameTy = Shape.FrameTy;
1155   SmallVector<Metadata *, 16> Elements;
1156   DataLayout Layout = F.getDataLayout();
1157 
1158   DenseMap<Value *, DILocalVariable *> DIVarCache;
1159   cacheDIVar(FrameData, DIVarCache);
1160 
1161   unsigned ResumeIndex = coro::Shape::SwitchFieldIndex::Resume;
1162   unsigned DestroyIndex = coro::Shape::SwitchFieldIndex::Destroy;
1163   unsigned IndexIndex = Shape.SwitchLowering.IndexField;
1164 
1165   DenseMap<unsigned, StringRef> NameCache;
1166   NameCache.insert({ResumeIndex, "__resume_fn"});
1167   NameCache.insert({DestroyIndex, "__destroy_fn"});
1168   NameCache.insert({IndexIndex, "__coro_index"});
1169 
1170   Type *ResumeFnTy = FrameTy->getElementType(ResumeIndex),
1171        *DestroyFnTy = FrameTy->getElementType(DestroyIndex),
1172        *IndexTy = FrameTy->getElementType(IndexIndex);
1173 
1174   DenseMap<unsigned, DIType *> TyCache;
1175   TyCache.insert(
1176       {ResumeIndex, DBuilder.createPointerType(
1177                         nullptr, Layout.getTypeSizeInBits(ResumeFnTy))});
1178   TyCache.insert(
1179       {DestroyIndex, DBuilder.createPointerType(
1180                          nullptr, Layout.getTypeSizeInBits(DestroyFnTy))});
1181 
1182   /// FIXME: If we fill the field `SizeInBits` with the actual size of
1183   /// __coro_index in bits, then __coro_index wouldn't show in the debugger.
1184   TyCache.insert({IndexIndex, DBuilder.createBasicType(
1185                                   "__coro_index",
1186                                   (Layout.getTypeSizeInBits(IndexTy) < 8)
1187                                       ? 8
1188                                       : Layout.getTypeSizeInBits(IndexTy),
1189                                   dwarf::DW_ATE_unsigned_char)});
1190 
1191   for (auto *V : FrameData.getAllDefs()) {
1192     if (!DIVarCache.contains(V))
1193       continue;
1194 
1195     auto Index = FrameData.getFieldIndex(V);
1196 
1197     NameCache.insert({Index, DIVarCache[V]->getName()});
1198     TyCache.insert({Index, DIVarCache[V]->getType()});
1199   }
1200 
1201   // Cache from index to (Align, Offset) pair
1202   DenseMap<unsigned, std::pair<unsigned, unsigned>> OffsetCache;
1203   // The Align and Offset of Resume function and Destroy function are fixed.
1204   OffsetCache.insert({ResumeIndex, {8, 0}});
1205   OffsetCache.insert({DestroyIndex, {8, 8}});
1206   OffsetCache.insert(
1207       {IndexIndex,
1208        {Shape.SwitchLowering.IndexAlign, Shape.SwitchLowering.IndexOffset}});
1209 
1210   for (auto *V : FrameData.getAllDefs()) {
1211     auto Index = FrameData.getFieldIndex(V);
1212 
1213     OffsetCache.insert(
1214         {Index, {FrameData.getAlign(V).value(), FrameData.getOffset(V)}});
1215   }
1216 
1217   DenseMap<Type *, DIType *> DITypeCache;
1218   // This counter is used to avoid identical type names. E.g., there could be
1219   // many i32 and i64 types in one coroutine, and we would use i32_0 and
1220   // i32_1 to distinguish them, since it makes no sense for the names of the
1221   // fields to conflict with each other.
1222   unsigned UnknownTypeNum = 0;
1223   for (unsigned Index = 0; Index < FrameTy->getNumElements(); Index++) {
1224     if (!OffsetCache.contains(Index))
1225       continue;
1226 
1227     std::string Name;
1228     uint64_t SizeInBits;
1229     uint32_t AlignInBits;
1230     uint64_t OffsetInBits;
1231     DIType *DITy = nullptr;
1232 
1233     Type *Ty = FrameTy->getElementType(Index);
1234     assert(Ty->isSized() && "We can't handle type which is not sized.\n");
1235     SizeInBits = Layout.getTypeSizeInBits(Ty).getFixedValue();
1236     AlignInBits = OffsetCache[Index].first * 8;
1237     OffsetInBits = OffsetCache[Index].second * 8;
1238 
1239     if (NameCache.contains(Index)) {
1240       Name = NameCache[Index].str();
1241       DITy = TyCache[Index];
1242     } else {
1243       DITy = solveDIType(DBuilder, Ty, Layout, FrameDITy, LineNum, DITypeCache);
1244       assert(DITy && "SolveDIType shouldn't return nullptr.\n");
1245       Name = DITy->getName().str();
1246       Name += "_" + std::to_string(UnknownTypeNum);
1247       UnknownTypeNum++;
1248     }
1249 
1250     Elements.push_back(DBuilder.createMemberType(
1251         FrameDITy, Name, DFile, LineNum, SizeInBits, AlignInBits, OffsetInBits,
1252         llvm::DINode::FlagArtificial, DITy));
1253   }
1254 
1255   DBuilder.replaceArrays(FrameDITy, DBuilder.getOrCreateArray(Elements));
1256 
1257   auto *FrameDIVar = DBuilder.createAutoVariable(PromiseDIScope, "__coro_frame",
1258                                                  DFile, LineNum, FrameDITy,
1259                                                  true, DINode::FlagArtificial);
1260   assert(FrameDIVar->isValidLocationForIntrinsic(DILoc));
1261 
1262   // The subprogram has a RetainedNodes field which records the debug
1263   // variables it contains. So we need to add __coro_frame to its
1264   // RetainedNodes.
1265   //
1266   // If we don't add __coro_frame to the RetainedNodes, the user may get
1267   // `no symbol __coro_frame in context` rather than the more precise
1268   // `__coro_frame is optimized out`.
1269   if (auto *SubProgram = dyn_cast<DISubprogram>(PromiseDIScope)) {
1270     auto RetainedNodes = SubProgram->getRetainedNodes();
1271     SmallVector<Metadata *, 32> RetainedNodesVec(RetainedNodes.begin(),
1272                                                  RetainedNodes.end());
1273     RetainedNodesVec.push_back(FrameDIVar);
1274     SubProgram->replaceOperandWith(
1275         7, (MDTuple::get(F.getContext(), RetainedNodesVec)));
1276   }
1277 
1278   if (UseNewDbgInfoFormat) {
1279     DbgVariableRecord *NewDVR =
1280         new DbgVariableRecord(ValueAsMetadata::get(Shape.FramePtr), FrameDIVar,
1281                               DBuilder.createExpression(), DILoc,
1282                               DbgVariableRecord::LocationType::Declare);
1283     BasicBlock::iterator It = Shape.getInsertPtAfterFramePtr();
1284     It->getParent()->insertDbgRecordBefore(NewDVR, It);
1285   } else {
1286     DBuilder.insertDeclare(Shape.FramePtr, FrameDIVar,
1287                            DBuilder.createExpression(), DILoc,
1288                            &*Shape.getInsertPtAfterFramePtr());
1289   }
1290 }
1291 
1292 // Build a struct that will keep state for an active coroutine.
1293 //   struct f.frame {
1294 //     ResumeFnTy ResumeFnAddr;
1295 //     ResumeFnTy DestroyFnAddr;
1296 //     ... promise (if present) ...
1297 //     int ResumeIndex;
1298 //     ... spills ...
1299 //   };
1300 static StructType *buildFrameType(Function &F, coro::Shape &Shape,
1301                                   FrameDataInfo &FrameData) {
1302   LLVMContext &C = F.getContext();
1303   const DataLayout &DL = F.getDataLayout();
1304   StructType *FrameTy = [&] {
1305     SmallString<32> Name(F.getName());
1306     Name.append(".Frame");
1307     return StructType::create(C, Name);
1308   }();
1309 
1310   // We will use this value to cap the alignment of spilled values.
1311   std::optional<Align> MaxFrameAlignment;
1312   if (Shape.ABI == coro::ABI::Async)
1313     MaxFrameAlignment = Shape.AsyncLowering.getContextAlignment();
1314   FrameTypeBuilder B(C, DL, MaxFrameAlignment);
1315 
1316   AllocaInst *PromiseAlloca = Shape.getPromiseAlloca();
1317   std::optional<FieldIDType> SwitchIndexFieldId;
1318 
1319   if (Shape.ABI == coro::ABI::Switch) {
1320     auto *FnPtrTy = PointerType::getUnqual(C);
1321 
1322     // Add header fields for the resume and destroy functions.
1323     // We can rely on these being perfectly packed.
1324     (void)B.addField(FnPtrTy, std::nullopt, /*header*/ true);
1325     (void)B.addField(FnPtrTy, std::nullopt, /*header*/ true);
1326 
1327     // PromiseAlloca field needs to be explicitly added here because it's
1328     // a header field with a fixed offset based on its alignment. Hence it
1329     // needs special handling and cannot be added to FrameData.Allocas.
1330     if (PromiseAlloca)
1331       FrameData.setFieldIndex(
1332           PromiseAlloca, B.addFieldForAlloca(PromiseAlloca, /*header*/ true));
1333 
1334     // Add a field to store the suspend index.  This doesn't need to
1335     // be in the header.
1336     unsigned IndexBits = std::max(1U, Log2_64_Ceil(Shape.CoroSuspends.size()));
1337     Type *IndexType = Type::getIntNTy(C, IndexBits);
1338 
1339     SwitchIndexFieldId = B.addField(IndexType, std::nullopt);
1340   } else {
1341     assert(PromiseAlloca == nullptr && "lowering doesn't support promises");
1342   }
1343 
1344   // Because multiple allocas may share the same field slot,
1345   // we add allocas to fields here.
1346   B.addFieldForAllocas(F, FrameData, Shape);
1347   // Add PromiseAlloca to Allocas list so that
1348   // 1. updateLayoutIndex could update its index after
1349   // `performOptimizedStructLayout`
1350   // 2. it is processed in insertSpills.
1351   if (Shape.ABI == coro::ABI::Switch && PromiseAlloca)
1352     // We assume that the promise alloca won't be modified before
1353     // CoroBegin and that no alias will be created before CoroBegin.
1354     FrameData.Allocas.emplace_back(
1355         PromiseAlloca, DenseMap<Instruction *, std::optional<APInt>>{}, false);
1356   // Create an entry for every spilled value.
1357   for (auto &S : FrameData.Spills) {
1358     Type *FieldType = S.first->getType();
1359     // For byval arguments, we need to store the pointed-to value in the frame,
1360     // instead of the pointer itself.
1361     if (const Argument *A = dyn_cast<Argument>(S.first))
1362       if (A->hasByValAttr())
1363         FieldType = A->getParamByValType();
1364     FieldIDType Id = B.addField(FieldType, std::nullopt, false /*header*/,
1365                                 true /*IsSpillOfValue*/);
1366     FrameData.setFieldIndex(S.first, Id);
1367   }
1368 
1369   B.finish(FrameTy);
1370   FrameData.updateLayoutIndex(B);
1371   Shape.FrameAlign = B.getStructAlign();
1372   Shape.FrameSize = B.getStructSize();
1373 
1374   switch (Shape.ABI) {
1375   case coro::ABI::Switch: {
1376     // In the switch ABI, remember the switch-index field.
1377     auto IndexField = B.getLayoutField(*SwitchIndexFieldId);
1378     Shape.SwitchLowering.IndexField = IndexField.LayoutFieldIndex;
1379     Shape.SwitchLowering.IndexAlign = IndexField.Alignment.value();
1380     Shape.SwitchLowering.IndexOffset = IndexField.Offset;
1381 
1382     // Also round the frame size up to a multiple of its alignment, as is
1383     // generally expected in C/C++.
1384     Shape.FrameSize = alignTo(Shape.FrameSize, Shape.FrameAlign);
1385     break;
1386   }
1387 
1388   // In the retcon ABI, remember whether the frame is inline in the storage.
1389   case coro::ABI::Retcon:
1390   case coro::ABI::RetconOnce: {
1391     auto Id = Shape.getRetconCoroId();
1392     Shape.RetconLowering.IsFrameInlineInStorage
1393       = (B.getStructSize() <= Id->getStorageSize() &&
1394          B.getStructAlign() <= Id->getStorageAlignment());
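    // For example (illustrative numbers): a 24-byte frame with 8-byte
    // alignment fits inline in a 32-byte, 8-byte-aligned buffer provided by
    // the coro.id, so no separate allocation is needed.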
1395     break;
1396   }
1397   case coro::ABI::Async: {
1398     Shape.AsyncLowering.FrameOffset =
1399         alignTo(Shape.AsyncLowering.ContextHeaderSize, Shape.FrameAlign);
1400     // Also make the final context size a multiple of the context alignment to
1401     // make allocation easier for allocators.
1402     Shape.AsyncLowering.ContextSize =
1403         alignTo(Shape.AsyncLowering.FrameOffset + Shape.FrameSize,
1404                 Shape.AsyncLowering.getContextAlignment());
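    // For example (illustrative numbers): with a 24-byte context header, a
    // frame alignment of 16 gives FrameOffset = 32; a 40-byte frame and a
    // context alignment of 16 then give ContextSize = alignTo(72, 16) = 80.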
1405     if (Shape.AsyncLowering.getContextAlignment() < Shape.FrameAlign) {
1406       report_fatal_error(
1407           "The alignment requirment of frame variables cannot be higher than "
1408           "the alignment of the async function context");
1409     }
1410     break;
1411   }
1412   }
1413 
1414   return FrameTy;
1415 }
1416 
1417 // We use a pointer use visitor to track how an alloca is being used.
1418 // The goal is to be able to answer the following three questions:
1419 // 1. Should this alloca be allocated on the frame instead?
1420 // 2. Could the content of the alloca be modified prior to CoroBegin, which
1421 // would require copying the data from the alloca to the frame after CoroBegin?
1422 // 3. Is there any alias created for this alloca prior to CoroBegin but used
1423 // after CoroBegin? In that case, we will need to recreate the alias after
1424 // CoroBegin based off the frame. To answer question 1, we track two things:
1425 //   a. The list of all BasicBlocks that use this alloca or any alias of the
1426 //   alloca. In the end, we check whether any two of those blocks have a path
1427 //   between them that crosses a suspension point. If so, this alloca must be
1428 //   put on the frame. b. Whether the alloca or any alias of the alloca escapes
1429 //   at some point, either by storing the address somewhere or by passing the
1430 //   address to a function call that might capture it. If it ever escapes, this
1431 //   alloca must conservatively be put on the frame.
1432 // To answer question 2, we track the variable MayWriteBeforeCoroBegin.
1433 // Whenever a potential write happens, either through a store instruction, a
1434 // function call or any of the memory intrinsics, we check whether this
1435 // instruction is prior to CoroBegin. To answer question 3, we track the offsets
1436 // of all aliases created for the alloca prior to CoroBegin but used after
1437 // CoroBegin. std::optional is used to represent the case when the offset is
1438 // unknown (e.g. when a PHINode takes in different offset values). We cannot
1439 // handle unknown offsets and will assert. This is the one potential issue
1440 // left open. An ideal solution would likely require a significant
1441 // redesign.
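//
// A minimal illustration (simplified IR, not taken from a real test):
//   %x = alloca i32
//   store i32 0, ptr %x                          ; use before the suspend
//   %s = call i8 @llvm.coro.suspend(token none, i1 false)
//   ...
//   %v = load i32, ptr %x                        ; use after the suspend
// The uses of %x straddle a suspend point, so question 1 is answered "yes"
// and %x must be placed on the coroutine frame.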
1442 namespace {
1443 struct AllocaUseVisitor : PtrUseVisitor<AllocaUseVisitor> {
1444   using Base = PtrUseVisitor<AllocaUseVisitor>;
1445   AllocaUseVisitor(const DataLayout &DL, const DominatorTree &DT,
1446                    const coro::Shape &CoroShape,
1447                    const SuspendCrossingInfo &Checker,
1448                    bool ShouldUseLifetimeStartInfo)
1449       : PtrUseVisitor(DL), DT(DT), CoroShape(CoroShape), Checker(Checker),
1450         ShouldUseLifetimeStartInfo(ShouldUseLifetimeStartInfo) {
1451     for (AnyCoroSuspendInst *SuspendInst : CoroShape.CoroSuspends)
1452       CoroSuspendBBs.insert(SuspendInst->getParent());
1453   }
1454 
1455   void visit(Instruction &I) {
1456     Users.insert(&I);
1457     Base::visit(I);
1458     // If the pointer is escaped prior to CoroBegin, we have to assume it would
1459     // be written into before CoroBegin as well.
1460     if (PI.isEscaped() &&
1461         !DT.dominates(CoroShape.CoroBegin, PI.getEscapingInst())) {
1462       MayWriteBeforeCoroBegin = true;
1463     }
1464   }
1465   // We need to provide this overload as PtrUseVisitor uses a pointer based
1466   // visiting function.
1467   void visit(Instruction *I) { return visit(*I); }
1468 
1469   void visitPHINode(PHINode &I) {
1470     enqueueUsers(I);
1471     handleAlias(I);
1472   }
1473 
1474   void visitSelectInst(SelectInst &I) {
1475     enqueueUsers(I);
1476     handleAlias(I);
1477   }
1478 
1479   void visitStoreInst(StoreInst &SI) {
1480     // Regardless of whether the alias of the alloca is the value operand or
1481     // the pointer operand, we need to assume the alloca is being written to.
1482     handleMayWrite(SI);
1483 
1484     if (SI.getValueOperand() != U->get())
1485       return;
1486 
1487     // We are storing the pointer into a memory location, potentially escaping.
1488     // As an optimization, we try to detect simple cases where it doesn't
1489     // actually escape, for example:
1490     //   %ptr = alloca ..
1491     //   %addr = alloca ..
1492     //   store %ptr, %addr
1493     //   %x = load %addr
1494     //   ..
1495     // If %addr is only used by loading from it, we can simply treat %x as
1496     // another alias of %ptr rather than considering %ptr escaped.
1497     auto IsSimpleStoreThenLoad = [&]() {
1498       auto *AI = dyn_cast<AllocaInst>(SI.getPointerOperand());
1499       // If the memory location we are storing to is not an alloca, it
1500       // could be an alias of some other memory locations, which is difficult
1501       // to analyze.
1502       if (!AI)
1503         return false;
1504       // StoreAliases contains aliases of the memory location stored into.
1505       SmallVector<Instruction *, 4> StoreAliases = {AI};
1506       while (!StoreAliases.empty()) {
1507         Instruction *I = StoreAliases.pop_back_val();
1508         for (User *U : I->users()) {
1509           // If we are loading from the memory location, we are creating an
1510           // alias of the original pointer.
1511           if (auto *LI = dyn_cast<LoadInst>(U)) {
1512             enqueueUsers(*LI);
1513             handleAlias(*LI);
1514             continue;
1515           }
1516           // If we are overwriting the memory location, the pointer certainly
1517           // won't escape.
1518           if (auto *S = dyn_cast<StoreInst>(U))
1519             if (S->getPointerOperand() == I)
1520               continue;
1521           if (auto *II = dyn_cast<IntrinsicInst>(U))
1522             if (II->isLifetimeStartOrEnd())
1523               continue;
1524           // BitCastInst creates aliases of the memory location being stored
1525           // into.
1526           if (auto *BI = dyn_cast<BitCastInst>(U)) {
1527             StoreAliases.push_back(BI);
1528             continue;
1529           }
1530           return false;
1531         }
1532       }
1533 
1534       return true;
1535     };
1536 
1537     if (!IsSimpleStoreThenLoad())
1538       PI.setEscaped(&SI);
1539   }
1540 
1541   // All mem intrinsics modify the data.
1542   void visitMemIntrinsic(MemIntrinsic &MI) { handleMayWrite(MI); }
1543 
1544   void visitBitCastInst(BitCastInst &BC) {
1545     Base::visitBitCastInst(BC);
1546     handleAlias(BC);
1547   }
1548 
1549   void visitAddrSpaceCastInst(AddrSpaceCastInst &ASC) {
1550     Base::visitAddrSpaceCastInst(ASC);
1551     handleAlias(ASC);
1552   }
1553 
1554   void visitGetElementPtrInst(GetElementPtrInst &GEPI) {
1555     // The base visitor will adjust Offset accordingly.
1556     Base::visitGetElementPtrInst(GEPI);
1557     handleAlias(GEPI);
1558   }
1559 
1560   void visitIntrinsicInst(IntrinsicInst &II) {
1561     // When we find that the lifetime markers refer to a
1562     // subrange of the original alloca, ignore the lifetime
1563     // markers to avoid misleading the analysis.
1564     if (!IsOffsetKnown || !Offset.isZero())
1565       return Base::visitIntrinsicInst(II);
1566     switch (II.getIntrinsicID()) {
1567     default:
1568       return Base::visitIntrinsicInst(II);
1569     case Intrinsic::lifetime_start:
1570       LifetimeStarts.insert(&II);
1571       LifetimeStartBBs.push_back(II.getParent());
1572       break;
1573     case Intrinsic::lifetime_end:
1574       LifetimeEndBBs.insert(II.getParent());
1575       break;
1576     }
1577   }
1578 
1579   void visitCallBase(CallBase &CB) {
1580     for (unsigned Op = 0, OpCount = CB.arg_size(); Op < OpCount; ++Op)
1581       if (U->get() == CB.getArgOperand(Op) && !CB.doesNotCapture(Op))
1582         PI.setEscaped(&CB);
1583     handleMayWrite(CB);
1584   }
1585 
1586   bool getShouldLiveOnFrame() const {
1587     if (!ShouldLiveOnFrame)
1588       ShouldLiveOnFrame = computeShouldLiveOnFrame();
1589     return *ShouldLiveOnFrame;
1590   }
1591 
1592   bool getMayWriteBeforeCoroBegin() const { return MayWriteBeforeCoroBegin; }
1593 
1594   DenseMap<Instruction *, std::optional<APInt>> getAliasesCopy() const {
1595     assert(getShouldLiveOnFrame() && "This method should only be called if the "
1596                                      "alloca needs to live on the frame.");
1597     for (const auto &P : AliasOffetMap)
1598       if (!P.second)
1599         report_fatal_error("Unable to handle an alias with unknown offset "
1600                            "created before CoroBegin.");
1601     return AliasOffetMap;
1602   }
1603 
1604 private:
1605   const DominatorTree &DT;
1606   const coro::Shape &CoroShape;
1607   const SuspendCrossingInfo &Checker;
1608   // All aliases of the original AllocaInst, created before CoroBegin and used
1609   // after CoroBegin. Each entry contains the instruction and the offset into
1610   // the original Alloca. They need to be recreated after CoroBegin off the frame.
1611   DenseMap<Instruction *, std::optional<APInt>> AliasOffetMap{};
1612   SmallPtrSet<Instruction *, 4> Users{};
1613   SmallPtrSet<IntrinsicInst *, 2> LifetimeStarts{};
1614   SmallVector<BasicBlock *> LifetimeStartBBs{};
1615   SmallPtrSet<BasicBlock *, 2> LifetimeEndBBs{};
1616   SmallPtrSet<const BasicBlock *, 2> CoroSuspendBBs{};
1617   bool MayWriteBeforeCoroBegin{false};
1618   bool ShouldUseLifetimeStartInfo{true};
1619 
1620   mutable std::optional<bool> ShouldLiveOnFrame{};
1621 
1622   bool computeShouldLiveOnFrame() const {
1623     // If lifetime information is available, we check it first since it's
1624     // more precise. We look at every pair of lifetime.start intrinsic and
1625     // every basic block that uses the pointer to see if they cross suspension
1626     // points. The uses cover both direct uses as well as indirect uses.
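    //
    // A sketch of the situation being tested (illustrative IR):
    //   call void @llvm.lifetime.start.p0(i64 4, ptr %x)
    //   <suspend point>
    //   call void @llvm.lifetime.end.p0(i64 4, ptr %x)
    // A suspend is reachable from the lifetime.start before any lifetime.end,
    // so the alloca is live across the suspend and must go on the frame.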
1627     if (ShouldUseLifetimeStartInfo && !LifetimeStarts.empty()) {
1628       // If there is no explicit lifetime.end, then assume the address can
1629       // cross suspension points.
1630       if (LifetimeEndBBs.empty())
1631         return true;
1632 
1633       // If there is a path from a lifetime.start to a suspend without a
1634       // corresponding lifetime.end, then the alloca's lifetime persists
1635       // beyond that suspension point and the alloca must go on the frame.
1636       llvm::SmallVector<BasicBlock *> Worklist(LifetimeStartBBs);
1637       if (isManyPotentiallyReachableFromMany(Worklist, CoroSuspendBBs,
1638                                              &LifetimeEndBBs, &DT))
1639         return true;
1640 
1641       // Addresses are guaranteed to be identical after every lifetime.start so
1642       // we cannot use the local stack if the address escaped and there is a
1643       // suspend point between lifetime markers. This should also cover the
1644       // case of a single lifetime.start intrinsic in a loop with a suspend point.
1645       if (PI.isEscaped()) {
1646         for (auto *A : LifetimeStarts) {
1647           for (auto *B : LifetimeStarts) {
1648             if (Checker.hasPathOrLoopCrossingSuspendPoint(A->getParent(),
1649                                                           B->getParent()))
1650               return true;
1651           }
1652         }
1653       }
1654       return false;
1655     }
1656     // FIXME: Ideally the isEscaped check should come at the beginning.
1657     // However, there are a few loose ends that need to be fixed first before
1658     // we can do that. We need to make sure we are not over-conservative, so
1659     // that the data accessed in between await_suspend and symmetric transfer
1660     // is always put on the stack, and also that data accessed after coro.end
1661     // is always put on the stack (especially the return object). To fix that,
1662     // we need to:
1663     //  1) Potentially treat sret as nocapture in calls
1664     //  2) Specially handle the return object and put it on the stack
1665     //  3) Utilize the lifetime.end intrinsic
1666     if (PI.isEscaped())
1667       return true;
1668 
1669     for (auto *U1 : Users)
1670       for (auto *U2 : Users)
1671         if (Checker.isDefinitionAcrossSuspend(*U1, U2))
1672           return true;
1673 
1674     return false;
1675   }
1676 
1677   void handleMayWrite(const Instruction &I) {
1678     if (!DT.dominates(CoroShape.CoroBegin, &I))
1679       MayWriteBeforeCoroBegin = true;
1680   }
1681 
1682   bool usedAfterCoroBegin(Instruction &I) {
1683     for (auto &U : I.uses())
1684       if (DT.dominates(CoroShape.CoroBegin, U))
1685         return true;
1686     return false;
1687   }
1688 
1689   void handleAlias(Instruction &I) {
1690     // We track all aliases created prior to CoroBegin but used after.
1691     // These aliases may need to be recreated after CoroBegin if the alloca
1692     // needs to live on the frame.
1693     if (DT.dominates(CoroShape.CoroBegin, &I) || !usedAfterCoroBegin(I))
1694       return;
1695 
1696     if (!IsOffsetKnown) {
1697       AliasOffetMap[&I].reset();
1698     } else {
1699       auto Itr = AliasOffetMap.find(&I);
1700       if (Itr == AliasOffetMap.end()) {
1701         AliasOffetMap[&I] = Offset;
1702       } else if (Itr->second && *Itr->second != Offset) {
1703         // If we have seen two different possible values for this alias, we set
1704         // it to empty.
1705         AliasOffetMap[&I].reset();
1706       }
1707     }
1708   }
1709 };
1710 } // namespace
1711 
1712 // We need to make room to insert a spill after the initial PHIs, but before
1713 // the catchswitch instruction. Placing the spill there would violate the rule
1714 // that a catchswitch, like all other EH pads, must be the first non-PHI in a block.
1715 //
1716 // Split away catchswitch into a separate block and insert in its place:
1717 //
1718 //   cleanuppad <InsertPt> cleanupret.
1719 //
1720 // cleanupret instruction will act as an insert point for the spill.
1721 static Instruction *splitBeforeCatchSwitch(CatchSwitchInst *CatchSwitch) {
1722   BasicBlock *CurrentBlock = CatchSwitch->getParent();
1723   BasicBlock *NewBlock = CurrentBlock->splitBasicBlock(CatchSwitch);
1724   CurrentBlock->getTerminator()->eraseFromParent();
1725 
1726   auto *CleanupPad =
1727       CleanupPadInst::Create(CatchSwitch->getParentPad(), {}, "", CurrentBlock);
1728   auto *CleanupRet =
1729       CleanupReturnInst::Create(CleanupPad, NewBlock, CurrentBlock);
1730   return CleanupRet;
1731 }
1732 
1733 // Replace all alloca and SSA values that are accessed across suspend points
1734 // with GetElementPointer from coroutine frame + loads and stores. Create an
1735 // AllocaSpillBB that will become the new entry block for the resume parts of
1736 // the coroutine:
1737 //
1738 //    %hdl = coro.begin(...)
1739 //    whatever
1740 //
1741 // becomes:
1742 //
1743 //    %hdl = coro.begin(...)
1744 //    br label %AllocaSpillBB
1745 //
1746 //  AllocaSpillBB:
1747 //    ; geps corresponding to allocas that were moved to coroutine frame
1748 //    br label %PostSpill
1749 //
1750 //  PostSpill:
1751 //    whatever
1752 //
1753 //
1754 static void insertSpills(const FrameDataInfo &FrameData, coro::Shape &Shape) {
1755   auto *CB = Shape.CoroBegin;
1756   LLVMContext &C = CB->getContext();
1757   Function *F = CB->getFunction();
1758   IRBuilder<> Builder(C);
1759   StructType *FrameTy = Shape.FrameTy;
1760   Value *FramePtr = Shape.FramePtr;
1761   DominatorTree DT(*F);
1762   SmallDenseMap<Argument *, AllocaInst *, 4> ArgToAllocaMap;
1763 
1764   // Create a GEP with the given index into the coroutine frame for the original
1765   // value Orig. Appends an extra 0 index for array-allocas, preserving the
1766   // original type.
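  // For example (illustrative), `%a = alloca i64, i32 4` assigned to frame
  // field 3 yields
  //   getelementptr inbounds %f.Frame, ptr %FramePtr, i32 0, i32 3, i32 0
  // i.e. a pointer to the first element of the array field, which is what the
  // original alloca's users expect.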
1767   auto GetFramePointer = [&](Value *Orig) -> Value * {
1768     FieldIDType Index = FrameData.getFieldIndex(Orig);
1769     SmallVector<Value *, 3> Indices = {
1770         ConstantInt::get(Type::getInt32Ty(C), 0),
1771         ConstantInt::get(Type::getInt32Ty(C), Index),
1772     };
1773 
1774     if (auto *AI = dyn_cast<AllocaInst>(Orig)) {
1775       if (auto *CI = dyn_cast<ConstantInt>(AI->getArraySize())) {
1776         auto Count = CI->getValue().getZExtValue();
1777         if (Count > 1) {
1778           Indices.push_back(ConstantInt::get(Type::getInt32Ty(C), 0));
1779         }
1780       } else {
1781         report_fatal_error("Coroutines cannot handle non static allocas yet");
1782       }
1783     }
1784 
1785     auto GEP = cast<GetElementPtrInst>(
1786         Builder.CreateInBoundsGEP(FrameTy, FramePtr, Indices));
1787     if (auto *AI = dyn_cast<AllocaInst>(Orig)) {
1788       if (FrameData.getDynamicAlign(Orig) != 0) {
1789         assert(FrameData.getDynamicAlign(Orig) == AI->getAlign().value());
1790         auto *M = AI->getModule();
1791         auto *IntPtrTy = M->getDataLayout().getIntPtrType(AI->getType());
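        // Round the raw field address up to the alloca's alignment:
        //   aligned = (addr + align - 1) & ~(align - 1)
        // e.g. with align = 16, an address ending in 0x09 is rounded up to 0x10.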
1792         auto *PtrValue = Builder.CreatePtrToInt(GEP, IntPtrTy);
1793         auto *AlignMask =
1794             ConstantInt::get(IntPtrTy, AI->getAlign().value() - 1);
1795         PtrValue = Builder.CreateAdd(PtrValue, AlignMask);
1796         PtrValue = Builder.CreateAnd(PtrValue, Builder.CreateNot(AlignMask));
1797         return Builder.CreateIntToPtr(PtrValue, AI->getType());
1798       }
1799       // If the type of the GEP is not equal to the type of the AllocaInst, it
1800       // implies that the AllocaInst may share its frame slot with another
1801       // AllocaInst. So we cast the GEP to the AllocaInst's type here to reuse
1802       // the frame storage.
1803       //
1804       // Note: If we change the strategy for dealing with alignment, we need to
1805       // refine this casting.
1806       if (GEP->getType() != Orig->getType())
1807         return Builder.CreateAddrSpaceCast(GEP, Orig->getType(),
1808                                            Orig->getName() + Twine(".cast"));
1809     }
1810     return GEP;
1811   };
1812 
1813   for (auto const &E : FrameData.Spills) {
1814     Value *Def = E.first;
1815     auto SpillAlignment = Align(FrameData.getAlign(Def));
1816     // Create a store instruction storing the value into the
1817     // coroutine frame.
1818     BasicBlock::iterator InsertPt;
1819     Type *ByValTy = nullptr;
1820     if (auto *Arg = dyn_cast<Argument>(Def)) {
1821       // For arguments, we will place the store instruction right after
1822       // the coroutine frame pointer instruction, i.e. coro.begin.
1823       InsertPt = Shape.getInsertPtAfterFramePtr();
1824 
1825       // If we're spilling an Argument, make sure we clear 'nocapture'
1826       // from the coroutine function.
1827       Arg->getParent()->removeParamAttr(Arg->getArgNo(), Attribute::NoCapture);
1828 
1829       if (Arg->hasByValAttr())
1830         ByValTy = Arg->getParamByValType();
1831     } else if (auto *CSI = dyn_cast<AnyCoroSuspendInst>(Def)) {
1832       // Don't spill immediately after a suspend; splitting assumes
1833       // that the suspend will be followed by a branch.
1834       InsertPt = CSI->getParent()->getSingleSuccessor()->getFirstNonPHIIt();
1835     } else {
1836       auto *I = cast<Instruction>(Def);
1837       if (!DT.dominates(CB, I)) {
1838         // If it is not dominated by CoroBegin, then the spill should be
1839         // inserted immediately after the coroutine frame pointer is computed.
1840         InsertPt = Shape.getInsertPtAfterFramePtr();
1841       } else if (auto *II = dyn_cast<InvokeInst>(I)) {
1842         // If we are spilling the result of the invoke instruction, split
1843         // the normal edge and insert the spill in the new block.
1844         auto *NewBB = SplitEdge(II->getParent(), II->getNormalDest());
1845         InsertPt = NewBB->getTerminator()->getIterator();
1846       } else if (isa<PHINode>(I)) {
1847         // Skip PHINodes and EH pad instructions.
1848         BasicBlock *DefBlock = I->getParent();
1849         if (auto *CSI = dyn_cast<CatchSwitchInst>(DefBlock->getTerminator()))
1850           InsertPt = splitBeforeCatchSwitch(CSI)->getIterator();
1851         else
1852           InsertPt = DefBlock->getFirstInsertionPt();
1853       } else {
1854         assert(!I->isTerminator() && "unexpected terminator");
1855         // For all other values, the spill is placed immediately after
1856         // the definition.
1857         InsertPt = I->getNextNode()->getIterator();
1858       }
1859     }
1860 
1861     auto Index = FrameData.getFieldIndex(Def);
1862     Builder.SetInsertPoint(InsertPt->getParent(), InsertPt);
1863     auto *G = Builder.CreateConstInBoundsGEP2_32(
1864         FrameTy, FramePtr, 0, Index, Def->getName() + Twine(".spill.addr"));
1865     if (ByValTy) {
1866       // For byval arguments, we need to store the pointed-to value in the frame,
1867       // instead of the pointer itself.
1868       auto *Value = Builder.CreateLoad(ByValTy, Def);
1869       Builder.CreateAlignedStore(Value, G, SpillAlignment);
1870     } else {
1871       Builder.CreateAlignedStore(Def, G, SpillAlignment);
1872     }
1873 
1874     BasicBlock *CurrentBlock = nullptr;
1875     Value *CurrentReload = nullptr;
1876     for (auto *U : E.second) {
1877       // If we have not seen the use block, create a load instruction to reload
1878       // the spilled value from the coroutine frame. For byval arguments, the
1879       // frame GEP itself serves as the reload.
1880       if (CurrentBlock != U->getParent()) {
1881         CurrentBlock = U->getParent();
1882         Builder.SetInsertPoint(CurrentBlock,
1883                                CurrentBlock->getFirstInsertionPt());
1884 
1885         auto *GEP = GetFramePointer(E.first);
1886         GEP->setName(E.first->getName() + Twine(".reload.addr"));
1887         if (ByValTy)
1888           CurrentReload = GEP;
1889         else
1890           CurrentReload = Builder.CreateAlignedLoad(
1891               FrameTy->getElementType(FrameData.getFieldIndex(E.first)), GEP,
1892               SpillAlignment, E.first->getName() + Twine(".reload"));
1893 
1894         TinyPtrVector<DbgDeclareInst *> DIs = findDbgDeclares(Def);
1895         TinyPtrVector<DbgVariableRecord *> DVRs = findDVRDeclares(Def);
1896         // Try our best to find a dbg.declare. If the spill is a temp, there may
1897         // not be a direct dbg.declare. Walk up the load chain to find one from
1898         // an alias.
1899         if (F->getSubprogram()) {
1900           auto *CurDef = Def;
1901           while (DIs.empty() && DVRs.empty() && isa<LoadInst>(CurDef)) {
1902             auto *LdInst = cast<LoadInst>(CurDef);
1903             // Only consider same-type pointer-to-pointer loads.
1904             if (LdInst->getPointerOperandType() != LdInst->getType())
1905               break;
1906             CurDef = LdInst->getPointerOperand();
1907             if (!isa<AllocaInst, LoadInst>(CurDef))
1908               break;
1909             DIs = findDbgDeclares(CurDef);
1910             DVRs = findDVRDeclares(CurDef);
1911           }
1912         }
1913 
1914         auto SalvageOne = [&](auto *DDI) {
1915           bool AllowUnresolved = false;
1916           // This dbg.declare is preserved for all coro-split function
1917           // fragments. It will be unreachable in the main function, and
1918           // processed by coro::salvageDebugInfo() in CoroCloner.
1919           if (UseNewDbgInfoFormat) {
1920             DbgVariableRecord *NewDVR = new DbgVariableRecord(
1921                 ValueAsMetadata::get(CurrentReload), DDI->getVariable(),
1922                 DDI->getExpression(), DDI->getDebugLoc(),
1923                 DbgVariableRecord::LocationType::Declare);
1924             Builder.GetInsertPoint()->getParent()->insertDbgRecordBefore(
1925                 NewDVR, Builder.GetInsertPoint());
1926           } else {
1927             DIBuilder(*CurrentBlock->getParent()->getParent(), AllowUnresolved)
1928                 .insertDeclare(CurrentReload, DDI->getVariable(),
1929                                DDI->getExpression(), DDI->getDebugLoc(),
1930                                &*Builder.GetInsertPoint());
1931           }
1932           // This dbg.declare is for the main function entry point.  It
1933           // will be deleted in all coro-split functions.
1934           coro::salvageDebugInfo(ArgToAllocaMap, *DDI, Shape.OptimizeFrame,
1935                                  false /*UseEntryValue*/);
1936         };
1937         for_each(DIs, SalvageOne);
1938         for_each(DVRs, SalvageOne);
1939       }
1940 
1941       // If we have a single edge PHINode, remove it and replace it with a
1942       // reload from the coroutine frame. (We already took care of multi edge
1943       // PHINodes by rewriting them in the rewritePHIs function).
1944       if (auto *PN = dyn_cast<PHINode>(U)) {
1945         assert(PN->getNumIncomingValues() == 1 &&
1946                "unexpected number of incoming "
1947                "values in the PHINode");
1948         PN->replaceAllUsesWith(CurrentReload);
1949         PN->eraseFromParent();
1950         continue;
1951       }
1952 
1953       // Replace all uses of Def in the current instruction with the
1954       // reload.
1955       U->replaceUsesOfWith(Def, CurrentReload);
1956       // Instructions are added to Def's user list if the attached
1957       // debug records use Def. Update those now.
1958       for (DbgVariableRecord &DVR : filterDbgVars(U->getDbgRecordRange()))
1959         DVR.replaceVariableLocationOp(Def, CurrentReload, true);
1960     }
1961   }
1962 
1963   BasicBlock *FramePtrBB = Shape.getInsertPtAfterFramePtr()->getParent();
1964 
1965   auto SpillBlock = FramePtrBB->splitBasicBlock(
1966       Shape.getInsertPtAfterFramePtr(), "AllocaSpillBB");
1967   SpillBlock->splitBasicBlock(&SpillBlock->front(), "PostSpill");
1968   Shape.AllocaSpillBlock = SpillBlock;
1969 
1970   // retcon and retcon.once lowering assumes all uses have been sunk.
1971   if (Shape.ABI == coro::ABI::Retcon || Shape.ABI == coro::ABI::RetconOnce ||
1972       Shape.ABI == coro::ABI::Async) {
1973     // If we found any allocas, replace all of their remaining uses with GEPs.
1974     Builder.SetInsertPoint(SpillBlock, SpillBlock->begin());
1975     for (const auto &P : FrameData.Allocas) {
1976       AllocaInst *Alloca = P.Alloca;
1977       auto *G = GetFramePointer(Alloca);
1978 
1979       // We are not using ReplaceInstWithInst(P.first, cast<Instruction>(G))
1980       // here, as we are changing the location of the instruction.
1981       G->takeName(Alloca);
1982       Alloca->replaceAllUsesWith(G);
1983       Alloca->eraseFromParent();
1984     }
1985     return;
1986   }
1987 
1988   // If we found any allocas, replace all of their remaining uses with GEP
1989   // instructions. To maintain debuggability, we replace the uses of allocas in
1990   // dbg.declares and dbg.values with the reload from the frame.
1991   // Note: We cannot replace the alloca with GEP instructions indiscriminately,
1992   // as some of the uses may not be dominated by CoroBegin.
1993   Builder.SetInsertPoint(Shape.AllocaSpillBlock,
1994                          Shape.AllocaSpillBlock->begin());
1995   SmallVector<Instruction *, 4> UsersToUpdate;
1996   for (const auto &A : FrameData.Allocas) {
1997     AllocaInst *Alloca = A.Alloca;
1998     UsersToUpdate.clear();
1999     for (User *U : Alloca->users()) {
2000       auto *I = cast<Instruction>(U);
2001       if (DT.dominates(CB, I))
2002         UsersToUpdate.push_back(I);
2003     }
2004     if (UsersToUpdate.empty())
2005       continue;
2006     auto *G = GetFramePointer(Alloca);
2007     G->setName(Alloca->getName() + Twine(".reload.addr"));
2008 
2009     SmallVector<DbgVariableIntrinsic *, 4> DIs;
2010     SmallVector<DbgVariableRecord *> DbgVariableRecords;
2011     findDbgUsers(DIs, Alloca, &DbgVariableRecords);
2012     for (auto *DVI : DIs)
2013       DVI->replaceUsesOfWith(Alloca, G);
2014     for (auto *DVR : DbgVariableRecords)
2015       DVR->replaceVariableLocationOp(Alloca, G);
2016 
2017     for (Instruction *I : UsersToUpdate) {
2018       // It is meaningless to retain lifetime intrinsics that refer to members
2019       // of the coroutine frame, and such meaningless lifetime intrinsics may
2020       // block further optimizations.
2021       if (I->isLifetimeStartOrEnd()) {
2022         I->eraseFromParent();
2023         continue;
2024       }
2025 
2026       I->replaceUsesOfWith(Alloca, G);
2027     }
2028   }
2029   Builder.SetInsertPoint(&*Shape.getInsertPtAfterFramePtr());
2030   for (const auto &A : FrameData.Allocas) {
2031     AllocaInst *Alloca = A.Alloca;
2032     if (A.MayWriteBeforeCoroBegin) {
2033       // isEscaped really means potentially modified before CoroBegin.
2034       if (Alloca->isArrayAllocation())
2035         report_fatal_error(
2036             "Coroutines cannot handle copying of array allocas yet");
2037 
2038       auto *G = GetFramePointer(Alloca);
2039       auto *Value = Builder.CreateLoad(Alloca->getAllocatedType(), Alloca);
2040       Builder.CreateStore(Value, G);
2041     }
2042     // For each alias of Alloca created before CoroBegin but used after
2043     // CoroBegin, we recreate it after CoroBegin by applying the offset
2044     // to the pointer in the frame.
2045     for (const auto &Alias : A.Aliases) {
2046       auto *FramePtr = GetFramePointer(Alloca);
2047       auto &Value = *Alias.second;
2048       auto ITy = IntegerType::get(C, Value.getBitWidth());
2049       auto *AliasPtr =
2050           Builder.CreatePtrAdd(FramePtr, ConstantInt::get(ITy, Value));
2051       Alias.first->replaceUsesWithIf(
2052           AliasPtr, [&](Use &U) { return DT.dominates(CB, U); });
2053     }
2054   }
2055 
2056   // PromiseAlloca is not collected in FrameData.Allocas. So the code above
2057   // doesn't handle the case where the PromiseAlloca may be written before
2058   // CoroBegin, which may be problematic in edge cases. See
2059   // https://github.com/llvm/llvm-project/issues/57861 for an example.
2060   if (Shape.ABI == coro::ABI::Switch && Shape.SwitchLowering.PromiseAlloca) {
2061     AllocaInst *PA = Shape.SwitchLowering.PromiseAlloca;
2062     // Check whether there is any memory access to the promise alloca before CoroBegin.
2063     bool HasAccessingPromiseBeforeCB = llvm::any_of(PA->uses(), [&](Use &U) {
2064       auto *Inst = dyn_cast<Instruction>(U.getUser());
2065       if (!Inst || DT.dominates(CB, Inst))
2066         return false;
2067 
2068       if (auto *CI = dyn_cast<CallInst>(Inst)) {
2069         // It is fine if the call doesn't write to the Promise.
2070         // This is possible for @llvm.coro.id intrinsics, which
2071         // take the promise as the second argument purely as a
2072         // marker.
2073         if (CI->onlyReadsMemory() ||
2074             CI->onlyReadsMemory(CI->getArgOperandNo(&U)))
2075           return false;
2076         return true;
2077       }
2078 
2079       return isa<StoreInst>(Inst) ||
2080              // It may take too much time to track the uses.
2081              // Be conservative and assume the use may escape.
2082              isa<GetElementPtrInst>(Inst) ||
2083              // Before opaque pointers were enabled, there would always
2084              // be a bitcast for the promise alloca. Now that opaque
2085              // pointers are enabled by default, this should be
2086              // fine.
2087              isa<BitCastInst>(Inst);
2088     });
2089     if (HasAccessingPromiseBeforeCB) {
2090       Builder.SetInsertPoint(&*Shape.getInsertPtAfterFramePtr());
2091       auto *G = GetFramePointer(PA);
2092       auto *Value = Builder.CreateLoad(PA->getAllocatedType(), PA);
2093       Builder.CreateStore(Value, G);
2094     }
2095   }
2096 }
2097 
2098 // Moves the values in the PHIs in SuccBB that correspond to PredBB into a new
2099 // PHI in InsertedBB.
2100 static void movePHIValuesToInsertedBlock(BasicBlock *SuccBB,
2101                                          BasicBlock *InsertedBB,
2102                                          BasicBlock *PredBB,
2103                                          PHINode *UntilPHI = nullptr) {
2104   auto *PN = cast<PHINode>(&SuccBB->front());
2105   do {
2106     int Index = PN->getBasicBlockIndex(InsertedBB);
2107     Value *V = PN->getIncomingValue(Index);
2108     PHINode *InputV = PHINode::Create(
2109         V->getType(), 1, V->getName() + Twine(".") + SuccBB->getName());
2110     InputV->insertBefore(InsertedBB->begin());
2111     InputV->addIncoming(V, PredBB);
2112     PN->setIncomingValue(Index, InputV);
2113     PN = dyn_cast<PHINode>(PN->getNextNode());
2114   } while (PN != UntilPHI);
2115 }
2116 
2117 // Rewrites the PHI Nodes in a cleanuppad.
2118 static void rewritePHIsForCleanupPad(BasicBlock *CleanupPadBB,
2119                                      CleanupPadInst *CleanupPad) {
2120   // For every incoming edge to a CleanupPad we will create a new block holding
2121   // all incoming values in single-value PHI nodes. We will then create another
2122   // block to act as a dispatcher (as all unwind edges for related EH blocks
2123   // must be the same).
2124   //
2125   // cleanuppad:
2126   //    %2 = phi i32[%0, %catchswitch], [%1, %catch.1]
2127   //    %3 = cleanuppad within none []
2128   //
2129   // It will create:
2130   //
2131   // cleanuppad.corodispatch
2132   //    %2 = phi i8[0, %catchswitch], [1, %catch.1]
2133   //    %3 = cleanuppad within none []
2134   //    switch i8 %2, label %unreachable
2135   //            [i8 0, label %cleanuppad.from.catchswitch
2136   //             i8 1, label %cleanuppad.from.catch.1]
2137   // cleanuppad.from.catchswitch:
2138   //    %4 = phi i32 [%0, %catchswitch]
2139   //    br %label cleanuppad
2140   // cleanuppad.from.catch.1:
2141   //    %6 = phi i32 [%1, %catch.1]
2142   //    br %label cleanuppad
2143   // cleanuppad:
2144   //    %8 = phi i32 [%4, %cleanuppad.from.catchswitch],
2145   //                 [%6, %cleanuppad.from.catch.1]
2146 
2147   // Unreachable BB, in case switching on an invalid value in the dispatcher.
2148   auto *UnreachBB = BasicBlock::Create(
2149       CleanupPadBB->getContext(), "unreachable", CleanupPadBB->getParent());
2150   IRBuilder<> Builder(UnreachBB);
2151   Builder.CreateUnreachable();
2152 
2153   // Create a new cleanuppad which will be the dispatcher.
2154   auto *NewCleanupPadBB =
2155       BasicBlock::Create(CleanupPadBB->getContext(),
2156                          CleanupPadBB->getName() + Twine(".corodispatch"),
2157                          CleanupPadBB->getParent(), CleanupPadBB);
2158   Builder.SetInsertPoint(NewCleanupPadBB);
2159   auto *SwitchType = Builder.getInt8Ty();
2160   auto *SetDispatchValuePN =
2161       Builder.CreatePHI(SwitchType, pred_size(CleanupPadBB));
2162   CleanupPad->removeFromParent();
2163   CleanupPad->insertAfter(SetDispatchValuePN);
2164   auto *SwitchOnDispatch = Builder.CreateSwitch(SetDispatchValuePN, UnreachBB,
2165                                                 pred_size(CleanupPadBB));
2166 
2167   int SwitchIndex = 0;
2168   SmallVector<BasicBlock *, 8> Preds(predecessors(CleanupPadBB));
2169   for (BasicBlock *Pred : Preds) {
2170     // Create a new cleanuppad and move the PHI values to there.
2171     auto *CaseBB = BasicBlock::Create(CleanupPadBB->getContext(),
2172                                       CleanupPadBB->getName() +
2173                                           Twine(".from.") + Pred->getName(),
2174                                       CleanupPadBB->getParent(), CleanupPadBB);
2175     updatePhiNodes(CleanupPadBB, Pred, CaseBB);
2176     CaseBB->setName(CleanupPadBB->getName() + Twine(".from.") +
2177                     Pred->getName());
2178     Builder.SetInsertPoint(CaseBB);
2179     Builder.CreateBr(CleanupPadBB);
2180     movePHIValuesToInsertedBlock(CleanupPadBB, CaseBB, NewCleanupPadBB);
2181 
2182     // Update this Pred to the new unwind point.
2183     setUnwindEdgeTo(Pred->getTerminator(), NewCleanupPadBB);
2184 
2185     // Setup the switch in the dispatcher.
2186     auto *SwitchConstant = ConstantInt::get(SwitchType, SwitchIndex);
2187     SetDispatchValuePN->addIncoming(SwitchConstant, Pred);
2188     SwitchOnDispatch->addCase(SwitchConstant, CaseBB);
2189     SwitchIndex++;
2190   }
2191 }
2192 
2193 static void cleanupSinglePredPHIs(Function &F) {
2194   SmallVector<PHINode *, 32> Worklist;
2195   for (auto &BB : F) {
2196     for (auto &Phi : BB.phis()) {
2197       if (Phi.getNumIncomingValues() == 1) {
2198         Worklist.push_back(&Phi);
2199       } else
2200         break;
2201     }
2202   }
2203   while (!Worklist.empty()) {
2204     auto *Phi = Worklist.pop_back_val();
2205     auto *OriginalValue = Phi->getIncomingValue(0);
2206     Phi->replaceAllUsesWith(OriginalValue);
2207   }
2208 }
2209 
2210 static void rewritePHIs(BasicBlock &BB) {
2211   // For every incoming edge we will create a block holding all
2212   // incoming values in single-value PHI nodes.
2213   //
2214   // loop:
2215   //    %n.val = phi i32[%n, %entry], [%inc, %loop]
2216   //
2217   // It will create:
2218   //
2219   // loop.from.entry:
2220   //    %n.loop.pre = phi i32 [%n, %entry]
2221   //    br %label loop
2222   // loop.from.loop:
2223   //    %inc.loop.pre = phi i32 [%inc, %loop]
2224   //    br %label loop
2225   //
2226   // After this rewrite, further analysis will ignore any phi nodes with more
2227   // than one incoming edge.
2228 
2229   // TODO: Simplify PHINodes in the basic block to remove duplicate
2230   // predecessors.
2231 
2232   // Special case for CleanupPad: all EH blocks must have the same unwind edge
2233   // so we need to create an additional "dispatcher" block.
2234   if (auto *CleanupPad =
2235           dyn_cast_or_null<CleanupPadInst>(BB.getFirstNonPHI())) {
2236     SmallVector<BasicBlock *, 8> Preds(predecessors(&BB));
2237     for (BasicBlock *Pred : Preds) {
2238       if (CatchSwitchInst *CS =
2239               dyn_cast<CatchSwitchInst>(Pred->getTerminator())) {
2240         // CleanupPad with a CatchSwitch predecessor: therefore this is an
2241         // unwind destination that needs to be handled specially.
2242         assert(CS->getUnwindDest() == &BB);
2243         (void)CS;
2244         rewritePHIsForCleanupPad(&BB, CleanupPad);
2245         return;
2246       }
2247     }
2248   }
2249 
2250   LandingPadInst *LandingPad = nullptr;
2251   PHINode *ReplPHI = nullptr;
2252   if ((LandingPad = dyn_cast_or_null<LandingPadInst>(BB.getFirstNonPHI()))) {
2253     // ehAwareSplitEdge will clone the LandingPad in all the edge blocks.
2254     // We replace the original landing pad with a PHINode that will collect the
2255     // results from all of them.
2256     ReplPHI = PHINode::Create(LandingPad->getType(), 1, "");
2257     ReplPHI->insertBefore(LandingPad->getIterator());
2258     ReplPHI->takeName(LandingPad);
2259     LandingPad->replaceAllUsesWith(ReplPHI);
2260     // We will erase the original landing pad at the end of this function after
2261     // ehAwareSplitEdge cloned it in the transition blocks.
2262   }
2263 
2264   SmallVector<BasicBlock *, 8> Preds(predecessors(&BB));
2265   for (BasicBlock *Pred : Preds) {
2266     auto *IncomingBB = ehAwareSplitEdge(Pred, &BB, LandingPad, ReplPHI);
2267     IncomingBB->setName(BB.getName() + Twine(".from.") + Pred->getName());
2268 
2269     // Stop the moving of values at ReplPHI, as this is either null or the PHI
2270     // that replaced the landing pad.
2271     movePHIValuesToInsertedBlock(&BB, IncomingBB, Pred, ReplPHI);
2272   }
2273 
2274   if (LandingPad) {
2275     // Calls to the ehAwareSplitEdge function cloned the original landing pad;
2276     // we no longer need it.
2277     LandingPad->eraseFromParent();
2278   }
2279 }
2280 
2281 static void rewritePHIs(Function &F) {
2282   SmallVector<BasicBlock *, 8> WorkList;
2283 
2284   for (BasicBlock &BB : F)
2285     if (auto *PN = dyn_cast<PHINode>(&BB.front()))
2286       if (PN->getNumIncomingValues() > 1)
2287         WorkList.push_back(&BB);
2288 
2289   for (BasicBlock *BB : WorkList)
2290     rewritePHIs(*BB);
2291 }
2292 
2293 /// Default materializable callback.
2294 // Check for instructions that we can recreate on resume as opposed to spilling
2295 // the result into the coroutine frame.
2296 bool coro::defaultMaterializable(Instruction &V) {
2297   return (isa<CastInst>(&V) || isa<GetElementPtrInst>(&V) ||
2298           isa<BinaryOperator>(&V) || isa<CmpInst>(&V) || isa<SelectInst>(&V));
2299 }
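// For example (illustrative), in
//   %p = getelementptr inbounds i32, ptr %base, i64 4
//   <suspend point>
//   %v = load i32, ptr %p
// the GEP %p can be recreated after the suspend instead of being spilled,
// provided %base itself is available there (spilled or rematerialized).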
2300 
2301 // Check for structural coroutine intrinsics that should not be spilled into
2302 // the coroutine frame.
2303 static bool isCoroutineStructureIntrinsic(Instruction &I) {
2304   return isa<CoroIdInst>(&I) || isa<CoroSaveInst>(&I) ||
2305          isa<CoroSuspendInst>(&I);
2306 }
2307 
2308 // For each instruction identified as materializable across the suspend point,
2309 // and its associated DAG of other rematerializable instructions,
2310 // recreate the DAG of instructions after the suspend point.
2311 static void rewriteMaterializableInstructions(
2312     const SmallMapVector<Instruction *, std::unique_ptr<RematGraph>, 8>
2313         &AllRemats) {
2314   // This has to be done in 2 phases:
2315   // 1. Do the remats and record the required defs to be replaced in the
2316   //    original use instructions.
2317   // 2. Once all the remats are complete, replace the uses in the final
2318   //    instructions with the new defs.
2319   typedef struct {
2320     Instruction *Use;
2321     Instruction *Def;
2322     Instruction *Remat;
2323   } ProcessNode;
2324 
2325   SmallVector<ProcessNode> FinalInstructionsToProcess;
2326 
2327   for (const auto &E : AllRemats) {
2328     Instruction *Use = E.first;
2329     Instruction *CurrentMaterialization = nullptr;
2330     RematGraph *RG = E.second.get();
2331     ReversePostOrderTraversal<RematGraph *> RPOT(RG);
2332     SmallVector<Instruction *> InstructionsToProcess;
2333 
2334     // If the target use is actually a suspend instruction then we have to
2335     // insert the remats into the end of the predecessor (there should only be
2336     // one). This is so that suspend blocks always have the suspend instruction
2337     // as the first instruction.
2338     auto InsertPoint = &*Use->getParent()->getFirstInsertionPt();
2339     if (isa<AnyCoroSuspendInst>(Use)) {
2340       BasicBlock *SuspendPredecessorBlock =
2341           Use->getParent()->getSinglePredecessor();
2342       assert(SuspendPredecessorBlock && "malformed coro suspend instruction");
2343       InsertPoint = SuspendPredecessorBlock->getTerminator();
2344     }
2345 
2346     // Note: skip the first instruction as this is the actual use that we're
2347     // rematerializing everything for.
2348     auto I = RPOT.begin();
2349     ++I;
2350     for (; I != RPOT.end(); ++I) {
2351       Instruction *D = (*I)->Node;
2352       CurrentMaterialization = D->clone();
2353       CurrentMaterialization->setName(D->getName());
2354       CurrentMaterialization->insertBefore(InsertPoint);
2355       InsertPoint = CurrentMaterialization;
2356 
2357       // Replace all uses of D in the instructions being added as part of this
2358       // rematerialization group.
2359       for (auto &I : InstructionsToProcess)
2360         I->replaceUsesOfWith(D, CurrentMaterialization);
2361 
2362       // Don't replace the final use at this point as this can cause problems
2363       // for other materializations. Instead, for any final use that uses a
2364       // def that's being rematerialized, record the replacement values.
2365       for (unsigned i = 0, E = Use->getNumOperands(); i != E; ++i)
2366         if (Use->getOperand(i) == D) // Is this operand pointing to oldval?
2367           FinalInstructionsToProcess.push_back(
2368               {Use, D, CurrentMaterialization});
2369 
2370       InstructionsToProcess.push_back(CurrentMaterialization);
2371     }
2372   }
2373 
2374   // Finally, replace the uses with the defines that we've just rematerialized
2375   for (auto &R : FinalInstructionsToProcess) {
2376     if (auto *PN = dyn_cast<PHINode>(R.Use)) {
2377       assert(PN->getNumIncomingValues() == 1 && "unexpected number of incoming "
2378                                                 "values in the PHINode");
2379       PN->replaceAllUsesWith(R.Remat);
2380       PN->eraseFromParent();
2381       continue;
2382     }
2383     R.Use->replaceUsesOfWith(R.Def, R.Remat);
2384   }
2385 }
2386 
2387 // Splits the block at a particular instruction unless it is the first
2388 // instruction in the block with a single predecessor.
2389 static BasicBlock *splitBlockIfNotFirst(Instruction *I, const Twine &Name) {
2390   auto *BB = I->getParent();
2391   if (&BB->front() == I) {
2392     if (BB->getSinglePredecessor()) {
2393       BB->setName(Name);
2394       return BB;
2395     }
2396   }
2397   return BB->splitBasicBlock(I, Name);
2398 }
2399 
2400 // Split above and below a particular instruction so that it
2401 // will end up alone in its own block.
2402 static void splitAround(Instruction *I, const Twine &Name) {
2403   splitBlockIfNotFirst(I, Name);
2404   splitBlockIfNotFirst(I->getNextNode(), "After" + Name);
2405 }
2406 
2407 static bool isSuspendBlock(BasicBlock *BB) {
2408   return isa<AnyCoroSuspendInst>(BB->front());
2409 }
2410 
2411 typedef SmallPtrSet<BasicBlock*, 8> VisitedBlocksSet;
2412 
2413 /// Does control flow starting at the given block ever reach a suspend
2414 /// instruction before reaching a block in VisitedOrFreeBBs?
2415 static bool isSuspendReachableFrom(BasicBlock *From,
2416                                    VisitedBlocksSet &VisitedOrFreeBBs) {
2417   // Eagerly try to add this block to the visited set.  If it's already
2418   // there, stop recursing; this path doesn't reach a suspend before
2419   // either looping or reaching a freeing block.
2420   if (!VisitedOrFreeBBs.insert(From).second)
2421     return false;
2422 
2423   // We assume that we'll already have split suspends into their own blocks.
2424   if (isSuspendBlock(From))
2425     return true;
2426 
2427   // Recurse on the successors.
2428   for (auto *Succ : successors(From)) {
2429     if (isSuspendReachableFrom(Succ, VisitedOrFreeBBs))
2430       return true;
2431   }
2432 
2433   return false;
2434 }
2435 
2436 /// Is the given alloca "local", i.e. bounded in lifetime to not cross a
2437 /// suspend point?
2438 static bool isLocalAlloca(CoroAllocaAllocInst *AI) {
2439   // Seed the visited set with all the basic blocks containing a free
2440   // so that we won't search past them.
2441   VisitedBlocksSet VisitedOrFreeBBs;
2442   for (auto *User : AI->users()) {
2443     if (auto FI = dyn_cast<CoroAllocaFreeInst>(User))
2444       VisitedOrFreeBBs.insert(FI->getParent());
2445   }
2446 
2447   return !isSuspendReachableFrom(AI->getParent(), VisitedOrFreeBBs);
2448 }
2449 
2450 /// After we split the coroutine, will the given basic block be along
2451 /// an obvious exit path for the resumption function?
2452 static bool willLeaveFunctionImmediatelyAfter(BasicBlock *BB,
2453                                               unsigned depth = 3) {
2454   // If we've bottomed out our depth count, stop searching and assume
2455   // that the path might loop back.
2456   if (depth == 0) return false;
2457 
2458   // If this is a suspend block, we're about to exit the resumption function.
2459   if (isSuspendBlock(BB)) return true;
2460 
2461   // Recurse into the successors.
2462   for (auto *Succ : successors(BB)) {
2463     if (!willLeaveFunctionImmediatelyAfter(Succ, depth - 1))
2464       return false;
2465   }
2466 
2467   // If none of the successors leads back in a loop, we're on an exit/abort.
2468   return true;
2469 }
2470 
2471 static bool localAllocaNeedsStackSave(CoroAllocaAllocInst *AI) {
2472   // Look for a free that isn't sufficiently obviously followed by
2473   // either a suspend or a termination, i.e. something that will leave
2474   // the coro resumption frame.
2475   for (auto *U : AI->users()) {
2476     auto FI = dyn_cast<CoroAllocaFreeInst>(U);
2477     if (!FI) continue;
2478 
2479     if (!willLeaveFunctionImmediatelyAfter(FI->getParent()))
2480       return true;
2481   }
2482 
2483   // If we never found one, we don't need a stack save.
2484   return false;
2485 }
2486 
2487 /// Turn each of the given local allocas into a normal (dynamic) alloca
2488 /// instruction.
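/// As a rough sketch (intrinsic signatures elided): a coro.alloca.alloc whose
/// frees are all reached without crossing a suspend point becomes a plain
/// dynamic `alloca i8, <size>`; its coro.alloca.get uses are replaced by that
/// alloca and its coro.alloca.free uses become stackrestores when a stacksave
/// was emitted.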
2489 static void lowerLocalAllocas(ArrayRef<CoroAllocaAllocInst*> LocalAllocas,
2490                               SmallVectorImpl<Instruction*> &DeadInsts) {
2491   for (auto *AI : LocalAllocas) {
2492     IRBuilder<> Builder(AI);
2493 
2494     // Save the stack depth.  Try to avoid doing this if the stackrestore
2495     // is going to immediately precede a return or something.
2496     Value *StackSave = nullptr;
2497     if (localAllocaNeedsStackSave(AI))
2498       StackSave = Builder.CreateStackSave();
2499 
2500     // Allocate memory.
2501     auto Alloca = Builder.CreateAlloca(Builder.getInt8Ty(), AI->getSize());
2502     Alloca->setAlignment(AI->getAlignment());
2503 
2504     for (auto *U : AI->users()) {
2505       // Replace gets with the allocation.
2506       if (isa<CoroAllocaGetInst>(U)) {
2507         U->replaceAllUsesWith(Alloca);
2508 
2509       // Replace frees with stackrestores.  This is safe because
2510       // alloca.alloc is required to obey a stack discipline, although we
2511       // don't enforce that structurally.
2512       } else {
2513         auto FI = cast<CoroAllocaFreeInst>(U);
2514         if (StackSave) {
2515           Builder.SetInsertPoint(FI);
2516           Builder.CreateStackRestore(StackSave);
2517         }
2518       }
2519       DeadInsts.push_back(cast<Instruction>(U));
2520     }
2521 
2522     DeadInsts.push_back(AI);
2523   }
2524 }
2525 
2526 /// Turn the given coro.alloca.alloc call into a dynamic allocation.
2527 /// This happens during the all-instructions iteration, so it must not
2528 /// delete the call.
2529 static Instruction *lowerNonLocalAlloca(CoroAllocaAllocInst *AI,
2530                                         coro::Shape &Shape,
2531                                    SmallVectorImpl<Instruction*> &DeadInsts) {
2532   IRBuilder<> Builder(AI);
2533   auto Alloc = Shape.emitAlloc(Builder, AI->getSize(), nullptr);
2534 
2535   for (User *U : AI->users()) {
2536     if (isa<CoroAllocaGetInst>(U)) {
2537       U->replaceAllUsesWith(Alloc);
2538     } else {
2539       auto FI = cast<CoroAllocaFreeInst>(U);
2540       Builder.SetInsertPoint(FI);
2541       Shape.emitDealloc(Builder, Alloc, nullptr);
2542     }
2543     DeadInsts.push_back(cast<Instruction>(U));
2544   }
2545 
2546   // Push this on last so that it gets deleted after all the others.
2547   DeadInsts.push_back(AI);
2548 
2549   // Return the new allocation value so that we can check for needed spills.
2550   return cast<Instruction>(Alloc);
2551 }
2552 
2553 /// Get the current swifterror value.
2554 static Value *emitGetSwiftErrorValue(IRBuilder<> &Builder, Type *ValueTy,
2555                                      coro::Shape &Shape) {
2556   // Make a fake function pointer as a sort of intrinsic.
2557   auto FnTy = FunctionType::get(ValueTy, {}, false);
2558   auto Fn = ConstantPointerNull::get(Builder.getPtrTy());
2559 
2560   auto Call = Builder.CreateCall(FnTy, Fn, {});
2561   Shape.SwiftErrorOps.push_back(Call);
2562 
2563   return Call;
2564 }
2565 
2566 /// Set the given value as the current swifterror value.
2567 ///
2568 /// Returns a slot that can be used as a swifterror slot.
2569 static Value *emitSetSwiftErrorValue(IRBuilder<> &Builder, Value *V,
2570                                      coro::Shape &Shape) {
2571   // Make a fake function pointer as a sort of intrinsic.
2572   auto FnTy = FunctionType::get(Builder.getPtrTy(),
2573                                 {V->getType()}, false);
2574   auto Fn = ConstantPointerNull::get(Builder.getPtrTy());
2575 
2576   auto Call = Builder.CreateCall(FnTy, Fn, { V });
2577   Shape.SwiftErrorOps.push_back(Call);
2578 
2579   return Call;
2580 }
2581 
2582 /// Set the swifterror value from the given alloca before a call,
2583 /// then put it back in the alloca afterwards.
2584 ///
2585 /// Returns an address that will stand in for the swifterror slot
2586 /// until splitting.
2587 static Value *emitSetAndGetSwiftErrorValueAround(Instruction *Call,
2588                                                  AllocaInst *Alloca,
2589                                                  coro::Shape &Shape) {
2590   auto ValueTy = Alloca->getAllocatedType();
2591   IRBuilder<> Builder(Call);
2592 
2593   // Load the current value from the alloca and set it as the
2594   // swifterror value.
2595   auto ValueBeforeCall = Builder.CreateLoad(ValueTy, Alloca);
2596   auto Addr = emitSetSwiftErrorValue(Builder, ValueBeforeCall, Shape);
2597 
2598   // Move to after the call.  Since swifterror only has a guaranteed
2599   // value on normal exits, we can ignore implicit and explicit unwind
2600   // edges.
2601   if (isa<CallInst>(Call)) {
2602     Builder.SetInsertPoint(Call->getNextNode());
2603   } else {
2604     auto Invoke = cast<InvokeInst>(Call);
2605     Builder.SetInsertPoint(Invoke->getNormalDest()->getFirstNonPHIOrDbg());
2606   }
2607 
2608   // Get the current swifterror value and store it to the alloca.
2609   auto ValueAfterCall = emitGetSwiftErrorValue(Builder, ValueTy, Shape);
2610   Builder.CreateStore(ValueAfterCall, Alloca);
2611 
2612   return Addr;
2613 }
2614 
2615 /// Eliminate a formerly-swifterror alloca by inserting the get/set
2616 /// intrinsics and attempting to MemToReg the alloca away.
2617 static void eliminateSwiftErrorAlloca(Function &F, AllocaInst *Alloca,
2618                                       coro::Shape &Shape) {
2619   for (Use &Use : llvm::make_early_inc_range(Alloca->uses())) {
2620     // swifterror values can only be used in very specific ways.
2621     // We take advantage of that here.
2622     auto User = Use.getUser();
2623     if (isa<LoadInst>(User) || isa<StoreInst>(User))
2624       continue;
2625 
2626     assert(isa<CallInst>(User) || isa<InvokeInst>(User));
2627     auto Call = cast<Instruction>(User);
2628 
2629     auto Addr = emitSetAndGetSwiftErrorValueAround(Call, Alloca, Shape);
2630 
2631     // Use the returned slot address as the call argument.
2632     Use.set(Addr);
2633   }
2634 
2635   // All the uses should be loads and stores now.
2636   assert(isAllocaPromotable(Alloca));
2637 }
2638 
2639 /// "Eliminate" a swifterror argument by reducing it to the alloca case
2640 /// and then loading and storing in the prologue and epilogue.
2641 ///
2642 /// The argument keeps the swifterror flag.
2643 static void eliminateSwiftErrorArgument(Function &F, Argument &Arg,
2644                                         coro::Shape &Shape,
2645                              SmallVectorImpl<AllocaInst*> &AllocasToPromote) {
2646   IRBuilder<> Builder(F.getEntryBlock().getFirstNonPHIOrDbg());
2647 
2648   auto ArgTy = cast<PointerType>(Arg.getType());
2649   auto ValueTy = PointerType::getUnqual(F.getContext());
2650 
2651   // Reduce to the alloca case:
2652 
2653   // Create an alloca and replace all uses of the arg with it.
2654   auto Alloca = Builder.CreateAlloca(ValueTy, ArgTy->getAddressSpace());
2655   Arg.replaceAllUsesWith(Alloca);
2656 
2657   // Set an initial value in the alloca.  swifterror is always null on entry.
2658   auto InitialValue = Constant::getNullValue(ValueTy);
2659   Builder.CreateStore(InitialValue, Alloca);
2660 
2661   // Find all the suspends in the function and save and restore around them.
2662   for (auto *Suspend : Shape.CoroSuspends) {
2663     (void) emitSetAndGetSwiftErrorValueAround(Suspend, Alloca, Shape);
2664   }
2665 
2666   // Find all the coro.ends in the function and restore the error value.
2667   for (auto *End : Shape.CoroEnds) {
2668     Builder.SetInsertPoint(End);
2669     auto FinalValue = Builder.CreateLoad(ValueTy, Alloca);
2670     (void) emitSetSwiftErrorValue(Builder, FinalValue, Shape);
2671   }
2672 
2673   // Now we can use the alloca logic.
2674   AllocasToPromote.push_back(Alloca);
2675   eliminateSwiftErrorAlloca(F, Alloca, Shape);
2676 }
2677 
2678 /// Eliminate all problematic uses of swifterror arguments and allocas
2679 /// from the function.  We'll fix them up later when splitting the function.
2680 static void eliminateSwiftError(Function &F, coro::Shape &Shape) {
2681   SmallVector<AllocaInst*, 4> AllocasToPromote;
2682 
2683   // Look for a swifterror argument.
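  // A function may carry at most one swifterror argument, so we can stop at
  // the first match.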
2684   for (auto &Arg : F.args()) {
2685     if (!Arg.hasSwiftErrorAttr()) continue;
2686 
2687     eliminateSwiftErrorArgument(F, Arg, Shape, AllocasToPromote);
2688     break;
2689   }
2690 
2691   // Look for swifterror allocas.
2692   for (auto &Inst : F.getEntryBlock()) {
2693     auto Alloca = dyn_cast<AllocaInst>(&Inst);
2694     if (!Alloca || !Alloca->isSwiftError()) continue;
2695 
2696     // Clear the swifterror flag.
2697     Alloca->setSwiftError(false);
2698 
2699     AllocasToPromote.push_back(Alloca);
2700     eliminateSwiftErrorAlloca(F, Alloca, Shape);
2701   }
2702 
2703   // If we have any allocas to promote, compute a dominator tree and
2704   // promote them en masse.
2705   if (!AllocasToPromote.empty()) {
2706     DominatorTree DT(F);
2707     PromoteMemToReg(AllocasToPromote, DT);
2708   }
2709 }
2710 
2711 /// The retcon and retcon.once conventions assume that all spill uses can be
2712 /// sunk after the coro.begin intrinsic.
2713 static void sinkSpillUsesAfterCoroBegin(Function &F,
2714                                         const FrameDataInfo &FrameData,
2715                                         CoroBeginInst *CoroBegin) {
2716   DominatorTree Dom(F);
2717 
2718   SmallSetVector<Instruction *, 32> ToMove;
2719   SmallVector<Instruction *, 32> Worklist;
2720 
2721   // Collect all users that precede coro.begin.
2722   for (auto *Def : FrameData.getAllDefs()) {
2723     for (User *U : Def->users()) {
2724       auto Inst = cast<Instruction>(U);
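      // Skip users that live in a different block from coro.begin or that
      // coro.begin already dominates; only instructions that precede
      // coro.begin within its own block are collected for sinking.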
2725       if (Inst->getParent() != CoroBegin->getParent() ||
2726           Dom.dominates(CoroBegin, Inst))
2727         continue;
2728       if (ToMove.insert(Inst))
2729         Worklist.push_back(Inst);
2730     }
2731   }
2732   // Recursively collect users before coro.begin.
2733   while (!Worklist.empty()) {
2734     auto *Def = Worklist.pop_back_val();
2735     for (User *U : Def->users()) {
2736       auto Inst = cast<Instruction>(U);
2737       if (Dom.dominates(CoroBegin, Inst))
2738         continue;
2739       if (ToMove.insert(Inst))
2740         Worklist.push_back(Inst);
2741     }
2742   }
2743 
2744   // Sort by dominance.
2745   SmallVector<Instruction *, 64> InsertionList(ToMove.begin(), ToMove.end());
2746   llvm::sort(InsertionList, [&Dom](Instruction *A, Instruction *B) -> bool {
2747     // If A dominates B, it should precede (<) B.
2748     return Dom.dominates(A, B);
2749   });
2750 
2751   Instruction *InsertPt = CoroBegin->getNextNode();
2752   for (Instruction *Inst : InsertionList)
2753     Inst->moveBefore(InsertPt);
2754 }
2755 
2756 /// For each local variable whose uses all lie within a single suspended
2757 /// region, sink its lifetime.start marker into the block immediately after
2758 /// the suspend block. Doing so minimizes the lifetime of each variable and
2759 /// hence the amount of data we end up putting on the frame.
2760 static void sinkLifetimeStartMarkers(Function &F, coro::Shape &Shape,
2761                                      SuspendCrossingInfo &Checker,
2762                                      const DominatorTree &DT) {
2763   if (F.hasOptNone())
2764     return;
2765 
2766   // Collect all possible basic blocks which may dominate all uses of allocas.
2767   SmallPtrSet<BasicBlock *, 4> DomSet;
2768   DomSet.insert(&F.getEntryBlock());
2769   for (auto *CSI : Shape.CoroSuspends) {
2770     BasicBlock *SuspendBlock = CSI->getParent();
2771     assert(isSuspendBlock(SuspendBlock) && SuspendBlock->getSingleSuccessor() &&
2772            "should have split coro.suspend into its own block");
2773     DomSet.insert(SuspendBlock->getSingleSuccessor());
2774   }
2775 
2776   for (Instruction &I : instructions(F)) {
2777     AllocaInst* AI = dyn_cast<AllocaInst>(&I);
2778     if (!AI)
2779       continue;
2780 
2781     for (BasicBlock *DomBB : DomSet) {
2782       bool Valid = true;
2783       SmallVector<Instruction *, 1> Lifetimes;
2784 
2785       auto isLifetimeStart = [](Instruction* I) {
2786         if (auto* II = dyn_cast<IntrinsicInst>(I))
2787           return II->getIntrinsicID() == Intrinsic::lifetime_start;
2788         return false;
2789       };
2790 
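      // Records the lifetime.start marker if U is one itself, or if U is a
      // single-use cast/zero-GEP of AI whose only user is a lifetime.start.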
2791       auto collectLifetimeStart = [&](Instruction *U, AllocaInst *AI) {
2792         if (isLifetimeStart(U)) {
2793           Lifetimes.push_back(U);
2794           return true;
2795         }
2796         if (!U->hasOneUse() || U->stripPointerCasts() != AI)
2797           return false;
2798         if (isLifetimeStart(U->user_back())) {
2799           Lifetimes.push_back(U->user_back());
2800           return true;
2801         }
2802         return false;
2803       };
2804 
2805       for (User *U : AI->users()) {
2806         Instruction *UI = cast<Instruction>(U);
2807         // For every user other than a lifetime.start marker: if it is
2808         // dominated by one of the candidate blocks and its use does not
2809         // cross a suspend point, then there is no need to spill the
2810         // instruction.
2811         if (!DT.dominates(DomBB, UI->getParent()) ||
2812             Checker.isDefinitionAcrossSuspend(DomBB, UI)) {
2813           // Skip lifetime.start, GEP and bitcast used by lifetime.start
2814           // markers.
2815           if (collectLifetimeStart(UI, AI))
2816             continue;
2817           Valid = false;
2818           break;
2819         }
2820       }
2821       // Sink the lifetime.start markers into the dominating block when all
2822       // other uses of the alloca are confined to the region it dominates.
2823       if (Valid && Lifetimes.size() != 0) {
2824         auto *NewLifetime = Lifetimes[0]->clone();
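        // Operand 1 of lifetime.start is the pointer operand; retarget the
        // cloned marker at the alloca itself.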
2825         NewLifetime->replaceUsesOfWith(NewLifetime->getOperand(1), AI);
2826         NewLifetime->insertBefore(DomBB->getTerminator());
2827 
2828         // All the original lifetime.start markers are no longer necessary.
2829         for (Instruction *S : Lifetimes)
2830           S->eraseFromParent();
2831 
2832         break;
2833       }
2834     }
2835   }
2836 }
2837 
2838 static void collectFrameAlloca(AllocaInst *AI, coro::Shape &Shape,
2839                                const SuspendCrossingInfo &Checker,
2840                                SmallVectorImpl<AllocaInfo> &Allocas,
2841                                const DominatorTree &DT) {
2842   if (Shape.CoroSuspends.empty())
2843     return;
2844 
2845   // The PromiseAlloca will be specially handled since it needs to be in a
2846   // fixed position in the frame.
2847   if (AI == Shape.SwitchLowering.PromiseAlloca)
2848     return;
2849 
2850   // The __coro_gro alloca should outlive the promise, so make sure we
2851   // keep it outside the frame.
2852   if (AI->hasMetadata(LLVMContext::MD_coro_outside_frame))
2853     return;
2854 
2855   // The code that uses the lifetime.start intrinsic does not work for
2856   // functions with loops that never exit. Disable it on ABIs we know to
2857   // generate such code.
2858   bool ShouldUseLifetimeStartInfo =
2859       (Shape.ABI != coro::ABI::Async && Shape.ABI != coro::ABI::Retcon &&
2860        Shape.ABI != coro::ABI::RetconOnce);
2861   AllocaUseVisitor Visitor{AI->getDataLayout(), DT, Shape, Checker,
2862                            ShouldUseLifetimeStartInfo};
2863   Visitor.visitPtr(*AI);
2864   if (!Visitor.getShouldLiveOnFrame())
2865     return;
2866   Allocas.emplace_back(AI, Visitor.getAliasesCopy(),
2867                        Visitor.getMayWriteBeforeCoroBegin());
2868 }
2869 
2870 static std::optional<std::pair<Value &, DIExpression &>>
2871 salvageDebugInfoImpl(SmallDenseMap<Argument *, AllocaInst *, 4> &ArgToAllocaMap,
2872                      bool OptimizeFrame, bool UseEntryValue, Function *F,
2873                      Value *Storage, DIExpression *Expr,
2874                      bool SkipOutermostLoad) {
2875   IRBuilder<> Builder(F->getContext());
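  // Position the builder at the start of the entry block, past any leading
  // intrinsics, so that the helper alloca created below is placed there.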
2876   auto InsertPt = F->getEntryBlock().getFirstInsertionPt();
2877   while (isa<IntrinsicInst>(InsertPt))
2878     ++InsertPt;
2879   Builder.SetInsertPoint(&F->getEntryBlock(), InsertPt);
2880 
2881   while (auto *Inst = dyn_cast_or_null<Instruction>(Storage)) {
2882     if (auto *LdInst = dyn_cast<LoadInst>(Inst)) {
2883       Storage = LdInst->getPointerOperand();
2884       // FIXME: This is a heuristic that works around the fact that
2885       // LLVM IR debug intrinsics cannot yet distinguish between
2886       // memory and value locations: Because a dbg.declare(alloca) is
2887       // implicitly a memory location no DW_OP_deref operation for the
2888       // last direct load from an alloca is necessary.  This condition
2889       // effectively drops the *last* DW_OP_deref in the expression.
2890       if (!SkipOutermostLoad)
2891         Expr = DIExpression::prepend(Expr, DIExpression::DerefBefore);
2892     } else if (auto *StInst = dyn_cast<StoreInst>(Inst)) {
2893       Storage = StInst->getValueOperand();
2894     } else {
2895       SmallVector<uint64_t, 16> Ops;
2896       SmallVector<Value *, 0> AdditionalValues;
2897       Value *Op = llvm::salvageDebugInfoImpl(
2898           *Inst, Expr ? Expr->getNumLocationOperands() : 0, Ops,
2899           AdditionalValues);
2900       if (!Op || !AdditionalValues.empty()) {
2901         // If salvaging failed or salvaging produced more than one location
2902         // operand, give up.
2903         break;
2904       }
2905       Storage = Op;
2906       Expr = DIExpression::appendOpsToArg(Expr, Ops, 0, /*StackValue*/ false);
2907     }
2908     SkipOutermostLoad = false;
2909   }
2910   if (!Storage)
2911     return std::nullopt;
2912 
2913   auto *StorageAsArg = dyn_cast<Argument>(Storage);
2914   const bool IsSwiftAsyncArg =
2915       StorageAsArg && StorageAsArg->hasAttribute(Attribute::SwiftAsync);
2916 
2917   // Swift async arguments are described by an entry value of the ABI-defined
2918   // register containing the coroutine context.
2919   // Entry values in variadic expressions are not supported.
2920   if (IsSwiftAsyncArg && UseEntryValue && !Expr->isEntryValue() &&
2921       Expr->isSingleLocationExpression())
2922     Expr = DIExpression::prepend(Expr, DIExpression::EntryValue);
2923 
2924   // If the coroutine frame is an Argument, store it in an alloca to improve
2925   // its availability (e.g. registers may be clobbered).
2926   // Avoid this if optimizations are enabled (they would remove the alloca) or
2927   // if the value is guaranteed to be available through other means (e.g. swift
2928   // ABI guarantees).
2929   if (StorageAsArg && !OptimizeFrame && !IsSwiftAsyncArg) {
2930     auto &Cached = ArgToAllocaMap[StorageAsArg];
2931     if (!Cached) {
2932       Cached = Builder.CreateAlloca(Storage->getType(), 0, nullptr,
2933                                     Storage->getName() + ".debug");
2934       Builder.CreateStore(Storage, Cached);
2935     }
2936     Storage = Cached;
2937     // FIXME: LLVM lacks nuanced semantics to differentiate between
2938     // memory and direct locations at the IR level. The backend will
2939     // turn a dbg.declare(alloca, ..., DIExpression()) into a memory
2940     // location. Thus, if there are deref and offset operations in the
2941     // expression, we need to add a DW_OP_deref at the *start* of the
2942     // expression to first load the contents of the alloca before
2943     // adjusting it with the expression.
2944     Expr = DIExpression::prepend(Expr, DIExpression::DerefBefore);
2945   }
2946 
2947   return {{*Storage, *Expr}};
2948 }
2949 
2950 void coro::salvageDebugInfo(
2951     SmallDenseMap<Argument *, AllocaInst *, 4> &ArgToAllocaMap,
2952     DbgVariableIntrinsic &DVI, bool OptimizeFrame, bool UseEntryValue) {
2953 
2954   Function *F = DVI.getFunction();
2955   // Follow the pointer arithmetic all the way to the incoming
2956   // function argument and convert it into a DIExpression.
2957   bool SkipOutermostLoad = !isa<DbgValueInst>(DVI);
2958   Value *OriginalStorage = DVI.getVariableLocationOp(0);
2959 
2960   auto SalvagedInfo = ::salvageDebugInfoImpl(
2961       ArgToAllocaMap, OptimizeFrame, UseEntryValue, F, OriginalStorage,
2962       DVI.getExpression(), SkipOutermostLoad);
2963   if (!SalvagedInfo)
2964     return;
2965 
2966   Value *Storage = &SalvagedInfo->first;
2967   DIExpression *Expr = &SalvagedInfo->second;
2968 
2969   DVI.replaceVariableLocationOp(OriginalStorage, Storage);
2970   DVI.setExpression(Expr);
2971   // We only hoist dbg.declare today, since it doesn't make sense to hoist
2972   // dbg.value: it does not have the same function-wide guarantees that
2973   // dbg.declare does.
2974   if (isa<DbgDeclareInst>(DVI)) {
2975     std::optional<BasicBlock::iterator> InsertPt;
2976     if (auto *I = dyn_cast<Instruction>(Storage)) {
2977       InsertPt = I->getInsertionPointAfterDef();
2978       // Update DILocation only if variable was not inlined.
2979       DebugLoc ILoc = I->getDebugLoc();
2980       DebugLoc DVILoc = DVI.getDebugLoc();
2981       if (ILoc && DVILoc &&
2982           DVILoc->getScope()->getSubprogram() ==
2983               ILoc->getScope()->getSubprogram())
2984         DVI.setDebugLoc(I->getDebugLoc());
2985     } else if (isa<Argument>(Storage))
2986       InsertPt = F->getEntryBlock().begin();
2987     if (InsertPt)
2988       DVI.moveBefore(*(*InsertPt)->getParent(), *InsertPt);
2989   }
2990 }
2991 
2992 void coro::salvageDebugInfo(
2993     SmallDenseMap<Argument *, AllocaInst *, 4> &ArgToAllocaMap,
2994     DbgVariableRecord &DVR, bool OptimizeFrame, bool UseEntryValue) {
2995 
2996   Function *F = DVR.getFunction();
2997   // Follow the pointer arithmetic all the way to the incoming
2998   // function argument and convert it into a DIExpression.
2999   bool SkipOutermostLoad = DVR.isDbgDeclare();
3000   Value *OriginalStorage = DVR.getVariableLocationOp(0);
3001 
3002   auto SalvagedInfo = ::salvageDebugInfoImpl(
3003       ArgToAllocaMap, OptimizeFrame, UseEntryValue, F, OriginalStorage,
3004       DVR.getExpression(), SkipOutermostLoad);
3005   if (!SalvagedInfo)
3006     return;
3007 
3008   Value *Storage = &SalvagedInfo->first;
3009   DIExpression *Expr = &SalvagedInfo->second;
3010 
3011   DVR.replaceVariableLocationOp(OriginalStorage, Storage);
3012   DVR.setExpression(Expr);
3013   // We only hoist dbg.declare today, since it doesn't make sense to hoist
3014   // dbg.value: it does not have the same function-wide guarantees that
3015   // dbg.declare does.
3016   if (DVR.getType() == DbgVariableRecord::LocationType::Declare) {
3017     std::optional<BasicBlock::iterator> InsertPt;
3018     if (auto *I = dyn_cast<Instruction>(Storage)) {
3019       InsertPt = I->getInsertionPointAfterDef();
3020       // Update DILocation only if variable was not inlined.
3021       DebugLoc ILoc = I->getDebugLoc();
3022       DebugLoc DVRLoc = DVR.getDebugLoc();
3023       if (ILoc && DVRLoc &&
3024           DVRLoc->getScope()->getSubprogram() ==
3025               ILoc->getScope()->getSubprogram())
3026         DVR.setDebugLoc(ILoc);
3027     } else if (isa<Argument>(Storage))
3028       InsertPt = F->getEntryBlock().begin();
3029     if (InsertPt) {
3030       DVR.removeFromParent();
3031       (*InsertPt)->getParent()->insertDbgRecordBefore(&DVR, *InsertPt);
3032     }
3033   }
3034 }
3035 
3036 static void doRematerializations(
3037     Function &F, SuspendCrossingInfo &Checker,
3038     const std::function<bool(Instruction &)> &MaterializableCallback) {
3039   if (F.hasOptNone())
3040     return;
3041 
3042   SpillInfo Spills;
3043 
3044   // See if there are materializable instructions whose uses cross suspend
3045   // points. We record these as the starting points from which we also
3046   // identify materializable defs of the uses in these operations.
3047   for (Instruction &I : instructions(F)) {
3048     if (!MaterializableCallback(I))
3049       continue;
3050     for (User *U : I.users())
3051       if (Checker.isDefinitionAcrossSuspend(I, U))
3052         Spills[&I].push_back(cast<Instruction>(U));
3053   }
3054 
3055   // Process each of the identified rematerializable instructions
3056   // and add predecessor instructions that can also be rematerialized.
3057   // This is actually a graph of instructions since we could potentially
3058   // have multiple uses of a def in the set of predecessor instructions.
3059   // The approach here is to maintain a graph of instructions for each
3060   // bottom-level instruction - a unique set of instructions (nodes)
3061   // and edges between them. We then walk the graph in reverse post-order
3062   // to insert the instructions past the suspend point, while ensuring
3063   // that the ordering is correct. We also rely on CSE to remove duplicate
3064   // defs for remats of different instructions that have a def in common
3065   // (rather than maintaining more complex graphs for each suspend point).
3066 
3067   // We can do this by adding new nodes to the list for each suspend
3068   // point, then using standard GraphTraits to give a reverse post-order
3069   // traversal when we insert the nodes after the suspend.
3070   SmallMapVector<Instruction *, std::unique_ptr<RematGraph>, 8> AllRemats;
3071   for (auto &E : Spills) {
3072     for (Instruction *U : E.second) {
3073       // Don't process a user twice (this can happen if the instruction uses
3074       // more than one rematerializable def)
3075       if (AllRemats.count(U))
3076         continue;
3077 
3078       // The constructor builds the whole RematGraph for the given use.
3079       auto RematUPtr =
3080           std::make_unique<RematGraph>(MaterializableCallback, U, Checker);
3081 
3082       LLVM_DEBUG(dbgs() << "***** Next remat group *****\n";
3083                  ReversePostOrderTraversal<RematGraph *> RPOT(RematUPtr.get());
3084                  for (auto I = RPOT.begin(); I != RPOT.end();
3085                       ++I) { (*I)->Node->dump(); } dbgs()
3086                  << "\n";);
3087 
3088       AllRemats[U] = std::move(RematUPtr);
3089     }
3090   }
3091 
3092   // Rewrite materializable instructions to be materialized at the use
3093   // point.
3094   LLVM_DEBUG(dumpRemats("Materializations", AllRemats));
3095   rewriteMaterializableInstructions(AllRemats);
3096 }
3097 
3098 void coro::buildCoroutineFrame(
3099     Function &F, Shape &Shape, TargetTransformInfo &TTI,
3100     const std::function<bool(Instruction &)> &MaterializableCallback) {
3101   // Don't eliminate swifterror in async functions that won't be split.
3102   if (Shape.ABI != coro::ABI::Async || !Shape.CoroSuspends.empty())
3103     eliminateSwiftError(F, Shape);
3104 
3105   if (Shape.ABI == coro::ABI::Switch &&
3106       Shape.SwitchLowering.PromiseAlloca) {
3107     Shape.getSwitchCoroId()->clearPromise();
3108   }
3109 
3110   // Make sure that all coro.save, coro.suspend and the fallthrough coro.end
3111   // intrinsics are in their own blocks to simplify the logic of building up
3112   // SuspendCrossing data.
3113   for (auto *CSI : Shape.CoroSuspends) {
3114     if (auto *Save = CSI->getCoroSave())
3115       splitAround(Save, "CoroSave");
3116     splitAround(CSI, "CoroSuspend");
3117   }
3118 
3119   // Put CoroEnds into their own blocks.
3120   for (AnyCoroEndInst *CE : Shape.CoroEnds) {
3121     splitAround(CE, "CoroEnd");
3122 
3123     // Emit the musttail call in a new block before the CoroEnd.
3124     // We do this here so that the right suspend-crossing info is computed for
3125     // the uses of the musttail call. (Arguments to the coro.end
3126     // instruction would otherwise be ignored.)
3127     if (auto *AsyncEnd = dyn_cast<CoroAsyncEndInst>(CE)) {
3128       auto *MustTailCallFn = AsyncEnd->getMustTailCallFunction();
3129       if (!MustTailCallFn)
3130         continue;
3131       IRBuilder<> Builder(AsyncEnd);
3132       SmallVector<Value *, 8> Args(AsyncEnd->args());
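      // The first three coro.end.async operands are bookkeeping (coroutine
      // handle, unwind flag, and the must-tail callee); only the remaining
      // operands are forwarded as arguments to the must-tail call.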
3133       auto Arguments = ArrayRef<Value *>(Args).drop_front(3);
3134       auto *Call = createMustTailCall(AsyncEnd->getDebugLoc(), MustTailCallFn,
3135                                       TTI, Arguments, Builder);
3136       splitAround(Call, "MustTailCall.Before.CoroEnd");
3137     }
3138   }
3139 
3140   // Later code makes structural assumptions about single-predecessor PHIs,
3141   // e.g. that they are not live across a suspend point.
3142   cleanupSinglePredPHIs(F);
3143 
3144   // Transform multi-edge PHI nodes so that no value feeding into a PHI ever
3145   // has its definition separated from the PHI by a suspend point.
3146   rewritePHIs(F);
3147 
3148   // Build suspend crossing info.
3149   SuspendCrossingInfo Checker(F, Shape);
3150 
3151   doRematerializations(F, Checker, MaterializableCallback);
3152 
3153   const DominatorTree DT(F);
3154   FrameDataInfo FrameData;
3155   SmallVector<CoroAllocaAllocInst*, 4> LocalAllocas;
3156   SmallVector<Instruction*, 4> DeadInstructions;
3157   if (Shape.ABI != coro::ABI::Async && Shape.ABI != coro::ABI::Retcon &&
3158       Shape.ABI != coro::ABI::RetconOnce)
3159     sinkLifetimeStartMarkers(F, Shape, Checker, DT);
3160 
3161   // Collect the spills for arguments and other non-materializable values.
3162   for (Argument &A : F.args())
3163     for (User *U : A.users())
3164       if (Checker.isDefinitionAcrossSuspend(A, U))
3165         FrameData.Spills[&A].push_back(cast<Instruction>(U));
3166 
3167   for (Instruction &I : instructions(F)) {
3168     // Values returned from coroutine structure intrinsics should not be part
3169     // of the Coroutine Frame.
3170     if (isCoroutineStructureIntrinsic(I) || &I == Shape.CoroBegin)
3171       continue;
3172 
3173     // Handle alloca.alloc specially here.
3174     if (auto AI = dyn_cast<CoroAllocaAllocInst>(&I)) {
3175       // Check whether the alloca's lifetime is bounded by suspend points.
3176       if (isLocalAlloca(AI)) {
3177         LocalAllocas.push_back(AI);
3178         continue;
3179       }
3180 
3181       // If not, do a quick rewrite of the alloca and then add spills of
3182       // the rewritten value.  The rewrite doesn't invalidate anything in
3183       // Spills because the other alloca intrinsics have no other operands
3184       // besides AI, and it doesn't invalidate the iteration because we delay
3185       // erasing AI.
3186       auto Alloc = lowerNonLocalAlloca(AI, Shape, DeadInstructions);
3187 
3188       for (User *U : Alloc->users()) {
3189         if (Checker.isDefinitionAcrossSuspend(*Alloc, U))
3190           FrameData.Spills[Alloc].push_back(cast<Instruction>(U));
3191       }
3192       continue;
3193     }
3194 
3195     // Ignore alloca.get; we process this as part of coro.alloca.alloc.
3196     if (isa<CoroAllocaGetInst>(I))
3197       continue;
3198 
3199     if (auto *AI = dyn_cast<AllocaInst>(&I)) {
3200       collectFrameAlloca(AI, Shape, Checker, FrameData.Allocas, DT);
3201       continue;
3202     }
3203 
3204     for (User *U : I.users())
3205       if (Checker.isDefinitionAcrossSuspend(I, U)) {
3206         // We cannot spill a token.
3207         if (I.getType()->isTokenTy())
3208           report_fatal_error(
3209               "token definition is separated from the use by a suspend point");
3210         FrameData.Spills[&I].push_back(cast<Instruction>(U));
3211       }
3212   }
3213 
3214   LLVM_DEBUG(dumpAllocas(FrameData.Allocas));
3215 
3216   // We don't want the layout of the coroutine frame to be affected
3217   // by debug information, so we only salvage DbgValueInsts whose
3218   // values are already in the frame.
3219   // The dbg.values for allocas are handled specially.
3220   for (auto &Iter : FrameData.Spills) {
3221     auto *V = Iter.first;
3222     SmallVector<DbgValueInst *, 16> DVIs;
3223     SmallVector<DbgVariableRecord *, 16> DVRs;
3224     findDbgValues(DVIs, V, &DVRs);
3225     for (DbgValueInst *DVI : DVIs)
3226       if (Checker.isDefinitionAcrossSuspend(*V, DVI))
3227         FrameData.Spills[V].push_back(DVI);
3228     // Add the instructions which carry debug info that is in the frame.
3229     for (DbgVariableRecord *DVR : DVRs)
3230       if (Checker.isDefinitionAcrossSuspend(*V, DVR->Marker->MarkedInstr))
3231         FrameData.Spills[V].push_back(DVR->Marker->MarkedInstr);
3232   }
3233 
3234   LLVM_DEBUG(dumpSpills("Spills", FrameData.Spills));
3235   if (Shape.ABI == coro::ABI::Retcon || Shape.ABI == coro::ABI::RetconOnce ||
3236       Shape.ABI == coro::ABI::Async)
3237     sinkSpillUsesAfterCoroBegin(F, FrameData, Shape.CoroBegin);
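  // Lay out the coroutine frame type from the collected spills and allocas;
  // the coro.begin result stands in as the frame pointer from here on.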
3238   Shape.FrameTy = buildFrameType(F, Shape, FrameData);
3239   Shape.FramePtr = Shape.CoroBegin;
3240   // For now, this works for C++ programs only.
3241   buildFrameDebugInfo(F, Shape, FrameData);
3242   insertSpills(FrameData, Shape);
3243   lowerLocalAllocas(LocalAllocas, DeadInstructions);
3244 
3245   for (auto *I : DeadInstructions)
3246     I->eraseFromParent();
3247 }
3248