xref: /freebsd/contrib/llvm-project/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h (revision 0fca6ea1d4eea4c934cfff25ac9ee8ad6fe95583)
1 //===- IR/OpenMPIRBuilder.h - OpenMP encoding builder for LLVM IR - C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file defines the OpenMPIRBuilder class and helpers used as a convenient
10 // way to create LLVM instructions for OpenMP directives.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #ifndef LLVM_FRONTEND_OPENMP_OMPIRBUILDER_H
15 #define LLVM_FRONTEND_OPENMP_OMPIRBUILDER_H
16 
17 #include "llvm/Analysis/MemorySSAUpdater.h"
18 #include "llvm/Frontend/OpenMP/OMPConstants.h"
19 #include "llvm/Frontend/OpenMP/OMPGridValues.h"
20 #include "llvm/IR/DebugLoc.h"
21 #include "llvm/IR/IRBuilder.h"
22 #include "llvm/IR/Module.h"
23 #include "llvm/Support/Allocator.h"
24 #include "llvm/TargetParser/Triple.h"
25 #include <forward_list>
26 #include <map>
27 #include <optional>
28 
29 namespace llvm {
30 class CanonicalLoopInfo;
31 struct TargetRegionEntryInfo;
32 class OffloadEntriesInfoManager;
33 class OpenMPIRBuilder;
34 
35 /// Move the instruction after an InsertPoint to the beginning of another
36 /// BasicBlock.
37 ///
38 /// The instructions after \p IP are moved to the beginning of \p New which must
39 /// not have any PHINodes. If \p CreateBranch is true, a branch instruction to
40 /// \p New will be added such that there is no semantic change. Otherwise, the
41 /// \p IP insert block remains degenerate and it is up to the caller to insert a
42 /// terminator.
43 void spliceBB(IRBuilderBase::InsertPoint IP, BasicBlock *New,
44               bool CreateBranch);
45 
46 /// Splice a BasicBlock at an IRBuilder's current insertion point. Its new
47 /// insert location will stick to after the instruction before the insertion
48 /// point (instead of moving with the instruction the InsertPoint stores
49 /// internally).
50 void spliceBB(IRBuilder<> &Builder, BasicBlock *New, bool CreateBranch);
51 
52 /// Split a BasicBlock at an InsertPoint, even if the block is degenerate
53 /// (missing the terminator).
54 ///
55 /// llvm::SplitBasicBlock and BasicBlock::splitBasicBlock require a well-formed
56 /// BasicBlock. \p Name is used for the new successor block. If \p CreateBranch
57 /// is true, a branch to the new successor will new created such that
58 /// semantically there is no change; otherwise the block of the insertion point
59 /// remains degenerate and it is the caller's responsibility to insert a
60 /// terminator. Returns the new successor block.
61 BasicBlock *splitBB(IRBuilderBase::InsertPoint IP, bool CreateBranch,
62                     llvm::Twine Name = {});
63 
64 /// Split a BasicBlock at \p Builder's insertion point, even if the block is
65 /// degenerate (missing the terminator).  Its new insert location will stick to
66 /// after the instruction before the insertion point (instead of moving with the
67 /// instruction the InsertPoint stores internally).
68 BasicBlock *splitBB(IRBuilderBase &Builder, bool CreateBranch,
69                     llvm::Twine Name = {});
70 
71 /// Split a BasicBlock at \p Builder's insertion point, even if the block is
72 /// degenerate (missing the terminator).  Its new insert location will stick to
73 /// after the instruction before the insertion point (instead of moving with the
74 /// instruction the InsertPoint stores internally).
75 BasicBlock *splitBB(IRBuilder<> &Builder, bool CreateBranch, llvm::Twine Name);
76 
77 /// Like splitBB, but reuses the current block's name for the new name.
78 BasicBlock *splitBBWithSuffix(IRBuilderBase &Builder, bool CreateBranch,
79                               llvm::Twine Suffix = ".split");
80 
81 /// Captures attributes that affect generating LLVM-IR using the
82 /// OpenMPIRBuilder and related classes. Note that not all attributes are
83 /// required for all classes or functions. In some use cases the configuration
84 /// is not necessary at all, because because the only functions that are called
85 /// are ones that are not dependent on the configuration.
86 class OpenMPIRBuilderConfig {
87 public:
88   /// Flag to define whether to generate code for the role of the OpenMP host
89   /// (if set to false) or device (if set to true) in an offloading context. It
90   /// is set when the -fopenmp-is-target-device compiler frontend option is
91   /// specified.
92   std::optional<bool> IsTargetDevice;
93 
94   /// Flag for specifying if the compilation is done for an accelerator. It is
95   /// set according to the architecture of the target triple and currently only
96   /// true when targeting AMDGPU or NVPTX. Today, these targets can only perform
97   /// the role of an OpenMP target device, so `IsTargetDevice` must also be true
98   /// if `IsGPU` is true. This restriction might be lifted if an accelerator-
99   /// like target with the ability to work as the OpenMP host is added, or if
100   /// the capabilities of the currently supported GPU architectures are
101   /// expanded.
102   std::optional<bool> IsGPU;
103 
104   /// Flag for specifying if LLVMUsed information should be emitted.
105   std::optional<bool> EmitLLVMUsedMetaInfo;
106 
107   /// Flag for specifying if offloading is mandatory.
108   std::optional<bool> OpenMPOffloadMandatory;
109 
110   /// First separator used between the initial two parts of a name.
111   std::optional<StringRef> FirstSeparator;
112   /// Separator used between all of the rest consecutive parts of s name
113   std::optional<StringRef> Separator;
114 
115   // Grid Value for the GPU target
116   std::optional<omp::GV> GridValue;
117 
118   OpenMPIRBuilderConfig();
119   OpenMPIRBuilderConfig(bool IsTargetDevice, bool IsGPU,
120                         bool OpenMPOffloadMandatory,
121                         bool HasRequiresReverseOffload,
122                         bool HasRequiresUnifiedAddress,
123                         bool HasRequiresUnifiedSharedMemory,
124                         bool HasRequiresDynamicAllocators);
125 
126   // Getters functions that assert if the required values are not present.
isTargetDevice()127   bool isTargetDevice() const {
128     assert(IsTargetDevice.has_value() && "IsTargetDevice is not set");
129     return *IsTargetDevice;
130   }
131 
isGPU()132   bool isGPU() const {
133     assert(IsGPU.has_value() && "IsGPU is not set");
134     return *IsGPU;
135   }
136 
openMPOffloadMandatory()137   bool openMPOffloadMandatory() const {
138     assert(OpenMPOffloadMandatory.has_value() &&
139            "OpenMPOffloadMandatory is not set");
140     return *OpenMPOffloadMandatory;
141   }
142 
getGridValue()143   omp::GV getGridValue() const {
144     assert(GridValue.has_value() && "GridValue is not set");
145     return *GridValue;
146   }
147 
hasRequiresFlags()148   bool hasRequiresFlags() const { return RequiresFlags; }
149   bool hasRequiresReverseOffload() const;
150   bool hasRequiresUnifiedAddress() const;
151   bool hasRequiresUnifiedSharedMemory() const;
152   bool hasRequiresDynamicAllocators() const;
153 
154   /// Returns requires directive clauses as flags compatible with those expected
155   /// by libomptarget.
156   int64_t getRequiresFlags() const;
157 
158   // Returns the FirstSeparator if set, otherwise use the default separator
159   // depending on isGPU
firstSeparator()160   StringRef firstSeparator() const {
161     if (FirstSeparator.has_value())
162       return *FirstSeparator;
163     if (isGPU())
164       return "_";
165     return ".";
166   }
167 
168   // Returns the Separator if set, otherwise use the default separator depending
169   // on isGPU
separator()170   StringRef separator() const {
171     if (Separator.has_value())
172       return *Separator;
173     if (isGPU())
174       return "$";
175     return ".";
176   }
177 
setIsTargetDevice(bool Value)178   void setIsTargetDevice(bool Value) { IsTargetDevice = Value; }
setIsGPU(bool Value)179   void setIsGPU(bool Value) { IsGPU = Value; }
180   void setEmitLLVMUsed(bool Value = true) { EmitLLVMUsedMetaInfo = Value; }
setOpenMPOffloadMandatory(bool Value)181   void setOpenMPOffloadMandatory(bool Value) { OpenMPOffloadMandatory = Value; }
setFirstSeparator(StringRef FS)182   void setFirstSeparator(StringRef FS) { FirstSeparator = FS; }
setSeparator(StringRef S)183   void setSeparator(StringRef S) { Separator = S; }
setGridValue(omp::GV G)184   void setGridValue(omp::GV G) { GridValue = G; }
185 
186   void setHasRequiresReverseOffload(bool Value);
187   void setHasRequiresUnifiedAddress(bool Value);
188   void setHasRequiresUnifiedSharedMemory(bool Value);
189   void setHasRequiresDynamicAllocators(bool Value);
190 
191 private:
192   /// Flags for specifying which requires directive clauses are present.
193   int64_t RequiresFlags;
194 };
195 
196 /// Data structure to contain the information needed to uniquely identify
197 /// a target entry.
198 struct TargetRegionEntryInfo {
199   std::string ParentName;
200   unsigned DeviceID;
201   unsigned FileID;
202   unsigned Line;
203   unsigned Count;
204 
TargetRegionEntryInfoTargetRegionEntryInfo205   TargetRegionEntryInfo() : DeviceID(0), FileID(0), Line(0), Count(0) {}
206   TargetRegionEntryInfo(StringRef ParentName, unsigned DeviceID,
207                         unsigned FileID, unsigned Line, unsigned Count = 0)
ParentNameTargetRegionEntryInfo208       : ParentName(ParentName), DeviceID(DeviceID), FileID(FileID), Line(Line),
209         Count(Count) {}
210 
211   static void getTargetRegionEntryFnName(SmallVectorImpl<char> &Name,
212                                          StringRef ParentName,
213                                          unsigned DeviceID, unsigned FileID,
214                                          unsigned Line, unsigned Count);
215 
216   bool operator<(const TargetRegionEntryInfo &RHS) const {
217     return std::make_tuple(ParentName, DeviceID, FileID, Line, Count) <
218            std::make_tuple(RHS.ParentName, RHS.DeviceID, RHS.FileID, RHS.Line,
219                            RHS.Count);
220   }
221 };
222 
223 /// Class that manages information about offload code regions and data
224 class OffloadEntriesInfoManager {
225   /// Number of entries registered so far.
226   OpenMPIRBuilder *OMPBuilder;
227   unsigned OffloadingEntriesNum = 0;
228 
229 public:
230   /// Base class of the entries info.
231   class OffloadEntryInfo {
232   public:
233     /// Kind of a given entry.
234     enum OffloadingEntryInfoKinds : unsigned {
235       /// Entry is a target region.
236       OffloadingEntryInfoTargetRegion = 0,
237       /// Entry is a declare target variable.
238       OffloadingEntryInfoDeviceGlobalVar = 1,
239       /// Invalid entry info.
240       OffloadingEntryInfoInvalid = ~0u
241     };
242 
243   protected:
244     OffloadEntryInfo() = delete;
OffloadEntryInfo(OffloadingEntryInfoKinds Kind)245     explicit OffloadEntryInfo(OffloadingEntryInfoKinds Kind) : Kind(Kind) {}
OffloadEntryInfo(OffloadingEntryInfoKinds Kind,unsigned Order,uint32_t Flags)246     explicit OffloadEntryInfo(OffloadingEntryInfoKinds Kind, unsigned Order,
247                               uint32_t Flags)
248         : Flags(Flags), Order(Order), Kind(Kind) {}
249     ~OffloadEntryInfo() = default;
250 
251   public:
isValid()252     bool isValid() const { return Order != ~0u; }
getOrder()253     unsigned getOrder() const { return Order; }
getKind()254     OffloadingEntryInfoKinds getKind() const { return Kind; }
getFlags()255     uint32_t getFlags() const { return Flags; }
setFlags(uint32_t NewFlags)256     void setFlags(uint32_t NewFlags) { Flags = NewFlags; }
getAddress()257     Constant *getAddress() const { return cast_or_null<Constant>(Addr); }
setAddress(Constant * V)258     void setAddress(Constant *V) {
259       assert(!Addr.pointsToAliveValue() && "Address has been set before!");
260       Addr = V;
261     }
classof(const OffloadEntryInfo * Info)262     static bool classof(const OffloadEntryInfo *Info) { return true; }
263 
264   private:
265     /// Address of the entity that has to be mapped for offloading.
266     WeakTrackingVH Addr;
267 
268     /// Flags associated with the device global.
269     uint32_t Flags = 0u;
270 
271     /// Order this entry was emitted.
272     unsigned Order = ~0u;
273 
274     OffloadingEntryInfoKinds Kind = OffloadingEntryInfoInvalid;
275   };
276 
277   /// Return true if a there are no entries defined.
278   bool empty() const;
279   /// Return number of entries defined so far.
size()280   unsigned size() const { return OffloadingEntriesNum; }
281 
OffloadEntriesInfoManager(OpenMPIRBuilder * builder)282   OffloadEntriesInfoManager(OpenMPIRBuilder *builder) : OMPBuilder(builder) {}
283 
284   //
285   // Target region entries related.
286   //
287 
288   /// Kind of the target registry entry.
289   enum OMPTargetRegionEntryKind : uint32_t {
290     /// Mark the entry as target region.
291     OMPTargetRegionEntryTargetRegion = 0x0,
292   };
293 
294   /// Target region entries info.
295   class OffloadEntryInfoTargetRegion final : public OffloadEntryInfo {
296     /// Address that can be used as the ID of the entry.
297     Constant *ID = nullptr;
298 
299   public:
OffloadEntryInfoTargetRegion()300     OffloadEntryInfoTargetRegion()
301         : OffloadEntryInfo(OffloadingEntryInfoTargetRegion) {}
OffloadEntryInfoTargetRegion(unsigned Order,Constant * Addr,Constant * ID,OMPTargetRegionEntryKind Flags)302     explicit OffloadEntryInfoTargetRegion(unsigned Order, Constant *Addr,
303                                           Constant *ID,
304                                           OMPTargetRegionEntryKind Flags)
305         : OffloadEntryInfo(OffloadingEntryInfoTargetRegion, Order, Flags),
306           ID(ID) {
307       setAddress(Addr);
308     }
309 
getID()310     Constant *getID() const { return ID; }
setID(Constant * V)311     void setID(Constant *V) {
312       assert(!ID && "ID has been set before!");
313       ID = V;
314     }
classof(const OffloadEntryInfo * Info)315     static bool classof(const OffloadEntryInfo *Info) {
316       return Info->getKind() == OffloadingEntryInfoTargetRegion;
317     }
318   };
319 
320   /// Initialize target region entry.
321   /// This is ONLY needed for DEVICE compilation.
322   void initializeTargetRegionEntryInfo(const TargetRegionEntryInfo &EntryInfo,
323                                        unsigned Order);
324   /// Register target region entry.
325   void registerTargetRegionEntryInfo(TargetRegionEntryInfo EntryInfo,
326                                      Constant *Addr, Constant *ID,
327                                      OMPTargetRegionEntryKind Flags);
328   /// Return true if a target region entry with the provided information
329   /// exists.
330   bool hasTargetRegionEntryInfo(TargetRegionEntryInfo EntryInfo,
331                                 bool IgnoreAddressId = false) const;
332 
333   // Return the Name based on \a EntryInfo using the next available Count.
334   void getTargetRegionEntryFnName(SmallVectorImpl<char> &Name,
335                                   const TargetRegionEntryInfo &EntryInfo);
336 
337   /// brief Applies action \a Action on all registered entries.
338   typedef function_ref<void(const TargetRegionEntryInfo &EntryInfo,
339                             const OffloadEntryInfoTargetRegion &)>
340       OffloadTargetRegionEntryInfoActTy;
341   void
342   actOnTargetRegionEntriesInfo(const OffloadTargetRegionEntryInfoActTy &Action);
343 
344   //
345   // Device global variable entries related.
346   //
347 
348   /// Kind of the global variable entry..
349   enum OMPTargetGlobalVarEntryKind : uint32_t {
350     /// Mark the entry as a to declare target.
351     OMPTargetGlobalVarEntryTo = 0x0,
352     /// Mark the entry as a to declare target link.
353     OMPTargetGlobalVarEntryLink = 0x1,
354     /// Mark the entry as a declare target enter.
355     OMPTargetGlobalVarEntryEnter = 0x2,
356     /// Mark the entry as having no declare target entry kind.
357     OMPTargetGlobalVarEntryNone = 0x3,
358     /// Mark the entry as a declare target indirect global.
359     OMPTargetGlobalVarEntryIndirect = 0x8,
360     /// Mark the entry as a register requires global.
361     OMPTargetGlobalRegisterRequires = 0x10,
362   };
363 
364   /// Kind of device clause for declare target variables
365   /// and functions
366   /// NOTE: Currently not used as a part of a variable entry
367   /// used for Flang and Clang to interface with the variable
368   /// related registration functions
369   enum OMPTargetDeviceClauseKind : uint32_t {
370     /// The target is marked for all devices
371     OMPTargetDeviceClauseAny = 0x0,
372     /// The target is marked for non-host devices
373     OMPTargetDeviceClauseNoHost = 0x1,
374     /// The target is marked for host devices
375     OMPTargetDeviceClauseHost = 0x2,
376     /// The target is marked as having no clause
377     OMPTargetDeviceClauseNone = 0x3
378   };
379 
380   /// Device global variable entries info.
381   class OffloadEntryInfoDeviceGlobalVar final : public OffloadEntryInfo {
382     /// Type of the global variable.
383     int64_t VarSize;
384     GlobalValue::LinkageTypes Linkage;
385     const std::string VarName;
386 
387   public:
OffloadEntryInfoDeviceGlobalVar()388     OffloadEntryInfoDeviceGlobalVar()
389         : OffloadEntryInfo(OffloadingEntryInfoDeviceGlobalVar) {}
OffloadEntryInfoDeviceGlobalVar(unsigned Order,OMPTargetGlobalVarEntryKind Flags)390     explicit OffloadEntryInfoDeviceGlobalVar(unsigned Order,
391                                              OMPTargetGlobalVarEntryKind Flags)
392         : OffloadEntryInfo(OffloadingEntryInfoDeviceGlobalVar, Order, Flags) {}
OffloadEntryInfoDeviceGlobalVar(unsigned Order,Constant * Addr,int64_t VarSize,OMPTargetGlobalVarEntryKind Flags,GlobalValue::LinkageTypes Linkage,const std::string & VarName)393     explicit OffloadEntryInfoDeviceGlobalVar(unsigned Order, Constant *Addr,
394                                              int64_t VarSize,
395                                              OMPTargetGlobalVarEntryKind Flags,
396                                              GlobalValue::LinkageTypes Linkage,
397                                              const std::string &VarName)
398         : OffloadEntryInfo(OffloadingEntryInfoDeviceGlobalVar, Order, Flags),
399           VarSize(VarSize), Linkage(Linkage), VarName(VarName) {
400       setAddress(Addr);
401     }
402 
getVarSize()403     int64_t getVarSize() const { return VarSize; }
getVarName()404     StringRef getVarName() const { return VarName; }
setVarSize(int64_t Size)405     void setVarSize(int64_t Size) { VarSize = Size; }
getLinkage()406     GlobalValue::LinkageTypes getLinkage() const { return Linkage; }
setLinkage(GlobalValue::LinkageTypes LT)407     void setLinkage(GlobalValue::LinkageTypes LT) { Linkage = LT; }
classof(const OffloadEntryInfo * Info)408     static bool classof(const OffloadEntryInfo *Info) {
409       return Info->getKind() == OffloadingEntryInfoDeviceGlobalVar;
410     }
411   };
412 
413   /// Initialize device global variable entry.
414   /// This is ONLY used for DEVICE compilation.
415   void initializeDeviceGlobalVarEntryInfo(StringRef Name,
416                                           OMPTargetGlobalVarEntryKind Flags,
417                                           unsigned Order);
418 
419   /// Register device global variable entry.
420   void registerDeviceGlobalVarEntryInfo(StringRef VarName, Constant *Addr,
421                                         int64_t VarSize,
422                                         OMPTargetGlobalVarEntryKind Flags,
423                                         GlobalValue::LinkageTypes Linkage);
424   /// Checks if the variable with the given name has been registered already.
hasDeviceGlobalVarEntryInfo(StringRef VarName)425   bool hasDeviceGlobalVarEntryInfo(StringRef VarName) const {
426     return OffloadEntriesDeviceGlobalVar.count(VarName) > 0;
427   }
428   /// Applies action \a Action on all registered entries.
429   typedef function_ref<void(StringRef, const OffloadEntryInfoDeviceGlobalVar &)>
430       OffloadDeviceGlobalVarEntryInfoActTy;
431   void actOnDeviceGlobalVarEntriesInfo(
432       const OffloadDeviceGlobalVarEntryInfoActTy &Action);
433 
434 private:
435   /// Return the count of entries at a particular source location.
436   unsigned
437   getTargetRegionEntryInfoCount(const TargetRegionEntryInfo &EntryInfo) const;
438 
439   /// Update the count of entries at a particular source location.
440   void
441   incrementTargetRegionEntryInfoCount(const TargetRegionEntryInfo &EntryInfo);
442 
443   static TargetRegionEntryInfo
getTargetRegionEntryCountKey(const TargetRegionEntryInfo & EntryInfo)444   getTargetRegionEntryCountKey(const TargetRegionEntryInfo &EntryInfo) {
445     return TargetRegionEntryInfo(EntryInfo.ParentName, EntryInfo.DeviceID,
446                                  EntryInfo.FileID, EntryInfo.Line, 0);
447   }
448 
449   // Count of entries at a location.
450   std::map<TargetRegionEntryInfo, unsigned> OffloadEntriesTargetRegionCount;
451 
452   // Storage for target region entries kind.
453   typedef std::map<TargetRegionEntryInfo, OffloadEntryInfoTargetRegion>
454       OffloadEntriesTargetRegionTy;
455   OffloadEntriesTargetRegionTy OffloadEntriesTargetRegion;
456   /// Storage for device global variable entries kind. The storage is to be
457   /// indexed by mangled name.
458   typedef StringMap<OffloadEntryInfoDeviceGlobalVar>
459       OffloadEntriesDeviceGlobalVarTy;
460   OffloadEntriesDeviceGlobalVarTy OffloadEntriesDeviceGlobalVar;
461 };
462 
463 /// An interface to create LLVM-IR for OpenMP directives.
464 ///
465 /// Each OpenMP directive has a corresponding public generator method.
466 class OpenMPIRBuilder {
467 public:
468   /// Create a new OpenMPIRBuilder operating on the given module \p M. This will
469   /// not have an effect on \p M (see initialize)
OpenMPIRBuilder(Module & M)470   OpenMPIRBuilder(Module &M)
471       : M(M), Builder(M.getContext()), OffloadInfoManager(this),
472         T(Triple(M.getTargetTriple())) {}
473   ~OpenMPIRBuilder();
474 
475   /// Initialize the internal state, this will put structures types and
476   /// potentially other helpers into the underlying module. Must be called
477   /// before any other method and only once! This internal state includes types
478   /// used in the OpenMPIRBuilder generated from OMPKinds.def.
479   void initialize();
480 
setConfig(OpenMPIRBuilderConfig C)481   void setConfig(OpenMPIRBuilderConfig C) { Config = C; }
482 
483   /// Finalize the underlying module, e.g., by outlining regions.
484   /// \param Fn                    The function to be finalized. If not used,
485   ///                              all functions are finalized.
486   void finalize(Function *Fn = nullptr);
487 
488   /// Add attributes known for \p FnID to \p Fn.
489   void addAttributes(omp::RuntimeFunction FnID, Function &Fn);
490 
491   /// Type used throughout for insertion points.
492   using InsertPointTy = IRBuilder<>::InsertPoint;
493 
494   /// Get the create a name using the platform specific separators.
495   /// \param Parts parts of the final name that needs separation
496   /// The created name has a first separator between the first and second part
497   /// and a second separator between all other parts.
498   /// E.g. with FirstSeparator "$" and Separator "." and
499   /// parts: "p1", "p2", "p3", "p4"
500   /// The resulting name is "p1$p2.p3.p4"
501   /// The separators are retrieved from the OpenMPIRBuilderConfig.
502   std::string createPlatformSpecificName(ArrayRef<StringRef> Parts) const;
503 
504   /// Callback type for variable finalization (think destructors).
505   ///
506   /// \param CodeGenIP is the insertion point at which the finalization code
507   ///                  should be placed.
508   ///
509   /// A finalize callback knows about all objects that need finalization, e.g.
510   /// destruction, when the scope of the currently generated construct is left
511   /// at the time, and location, the callback is invoked.
512   using FinalizeCallbackTy = std::function<void(InsertPointTy CodeGenIP)>;
513 
514   struct FinalizationInfo {
515     /// The finalization callback provided by the last in-flight invocation of
516     /// createXXXX for the directive of kind DK.
517     FinalizeCallbackTy FiniCB;
518 
519     /// The directive kind of the innermost directive that has an associated
520     /// region which might require finalization when it is left.
521     omp::Directive DK;
522 
523     /// Flag to indicate if the directive is cancellable.
524     bool IsCancellable;
525   };
526 
527   /// Push a finalization callback on the finalization stack.
528   ///
529   /// NOTE: Temporary solution until Clang CG is gone.
pushFinalizationCB(const FinalizationInfo & FI)530   void pushFinalizationCB(const FinalizationInfo &FI) {
531     FinalizationStack.push_back(FI);
532   }
533 
534   /// Pop the last finalization callback from the finalization stack.
535   ///
536   /// NOTE: Temporary solution until Clang CG is gone.
popFinalizationCB()537   void popFinalizationCB() { FinalizationStack.pop_back(); }
538 
539   /// Callback type for body (=inner region) code generation
540   ///
541   /// The callback takes code locations as arguments, each describing a
542   /// location where additional instructions can be inserted.
543   ///
544   /// The CodeGenIP may be in the middle of a basic block or point to the end of
545   /// it. The basic block may have a terminator or be degenerate. The callback
546   /// function may just insert instructions at that position, but also split the
547   /// block (without the Before argument of BasicBlock::splitBasicBlock such
548   /// that the identify of the split predecessor block is preserved) and insert
549   /// additional control flow, including branches that do not lead back to what
550   /// follows the CodeGenIP. Note that since the callback is allowed to split
551   /// the block, callers must assume that InsertPoints to positions in the
552   /// BasicBlock after CodeGenIP including CodeGenIP itself are invalidated. If
553   /// such InsertPoints need to be preserved, it can split the block itself
554   /// before calling the callback.
555   ///
556   /// AllocaIP and CodeGenIP must not point to the same position.
557   ///
558   /// \param AllocaIP is the insertion point at which new alloca instructions
559   ///                 should be placed. The BasicBlock it is pointing to must
560   ///                 not be split.
561   /// \param CodeGenIP is the insertion point at which the body code should be
562   ///                  placed.
563   using BodyGenCallbackTy =
564       function_ref<void(InsertPointTy AllocaIP, InsertPointTy CodeGenIP)>;
565 
566   // This is created primarily for sections construct as llvm::function_ref
567   // (BodyGenCallbackTy) is not storable (as described in the comments of
568   // function_ref class - function_ref contains non-ownable reference
569   // to the callable.
570   using StorableBodyGenCallbackTy =
571       std::function<void(InsertPointTy AllocaIP, InsertPointTy CodeGenIP)>;
572 
573   /// Callback type for loop body code generation.
574   ///
575   /// \param CodeGenIP is the insertion point where the loop's body code must be
576   ///                  placed. This will be a dedicated BasicBlock with a
577   ///                  conditional branch from the loop condition check and
578   ///                  terminated with an unconditional branch to the loop
579   ///                  latch.
580   /// \param IndVar    is the induction variable usable at the insertion point.
581   using LoopBodyGenCallbackTy =
582       function_ref<void(InsertPointTy CodeGenIP, Value *IndVar)>;
583 
584   /// Callback type for variable privatization (think copy & default
585   /// constructor).
586   ///
587   /// \param AllocaIP is the insertion point at which new alloca instructions
588   ///                 should be placed.
589   /// \param CodeGenIP is the insertion point at which the privatization code
590   ///                  should be placed.
591   /// \param Original The value being copied/created, should not be used in the
592   ///                 generated IR.
593   /// \param Inner The equivalent of \p Original that should be used in the
594   ///              generated IR; this is equal to \p Original if the value is
595   ///              a pointer and can thus be passed directly, otherwise it is
596   ///              an equivalent but different value.
597   /// \param ReplVal The replacement value, thus a copy or new created version
598   ///                of \p Inner.
599   ///
600   /// \returns The new insertion point where code generation continues and
601   ///          \p ReplVal the replacement value.
602   using PrivatizeCallbackTy = function_ref<InsertPointTy(
603       InsertPointTy AllocaIP, InsertPointTy CodeGenIP, Value &Original,
604       Value &Inner, Value *&ReplVal)>;
605 
606   /// Description of a LLVM-IR insertion point (IP) and a debug/source location
607   /// (filename, line, column, ...).
608   struct LocationDescription {
LocationDescriptionLocationDescription609     LocationDescription(const IRBuilderBase &IRB)
610         : IP(IRB.saveIP()), DL(IRB.getCurrentDebugLocation()) {}
LocationDescriptionLocationDescription611     LocationDescription(const InsertPointTy &IP) : IP(IP) {}
LocationDescriptionLocationDescription612     LocationDescription(const InsertPointTy &IP, const DebugLoc &DL)
613         : IP(IP), DL(DL) {}
614     InsertPointTy IP;
615     DebugLoc DL;
616   };
617 
618   /// Emitter methods for OpenMP directives.
619   ///
620   ///{
621 
622   /// Generator for '#omp barrier'
623   ///
624   /// \param Loc The location where the barrier directive was encountered.
625   /// \param Kind The kind of directive that caused the barrier.
626   /// \param ForceSimpleCall Flag to force a simple (=non-cancellation) barrier.
627   /// \param CheckCancelFlag Flag to indicate a cancel barrier return value
628   ///                        should be checked and acted upon.
629   /// \param ThreadID Optional parameter to pass in any existing ThreadID value.
630   ///
631   /// \returns The insertion point after the barrier.
632   InsertPointTy createBarrier(const LocationDescription &Loc,
633                               omp::Directive Kind, bool ForceSimpleCall = false,
634                               bool CheckCancelFlag = true);
635 
636   /// Generator for '#omp cancel'
637   ///
638   /// \param Loc The location where the directive was encountered.
639   /// \param IfCondition The evaluated 'if' clause expression, if any.
640   /// \param CanceledDirective The kind of directive that is cancled.
641   ///
642   /// \returns The insertion point after the barrier.
643   InsertPointTy createCancel(const LocationDescription &Loc, Value *IfCondition,
644                              omp::Directive CanceledDirective);
645 
646   /// Generator for '#omp parallel'
647   ///
648   /// \param Loc The insert and source location description.
649   /// \param AllocaIP The insertion points to be used for alloca instructions.
650   /// \param BodyGenCB Callback that will generate the region code.
651   /// \param PrivCB Callback to copy a given variable (think copy constructor).
652   /// \param FiniCB Callback to finalize variable copies.
653   /// \param IfCondition The evaluated 'if' clause expression, if any.
654   /// \param NumThreads The evaluated 'num_threads' clause expression, if any.
655   /// \param ProcBind The value of the 'proc_bind' clause (see ProcBindKind).
656   /// \param IsCancellable Flag to indicate a cancellable parallel region.
657   ///
658   /// \returns The insertion position *after* the parallel.
659   IRBuilder<>::InsertPoint
660   createParallel(const LocationDescription &Loc, InsertPointTy AllocaIP,
661                  BodyGenCallbackTy BodyGenCB, PrivatizeCallbackTy PrivCB,
662                  FinalizeCallbackTy FiniCB, Value *IfCondition,
663                  Value *NumThreads, omp::ProcBindKind ProcBind,
664                  bool IsCancellable);
665 
666   /// Generator for the control flow structure of an OpenMP canonical loop.
667   ///
668   /// This generator operates on the logical iteration space of the loop, i.e.
669   /// the caller only has to provide a loop trip count of the loop as defined by
670   /// base language semantics. The trip count is interpreted as an unsigned
671   /// integer. The induction variable passed to \p BodyGenCB will be of the same
672   /// type and run from 0 to \p TripCount - 1. It is up to the callback to
673   /// convert the logical iteration variable to the loop counter variable in the
674   /// loop body.
675   ///
676   /// \param Loc       The insert and source location description. The insert
677   ///                  location can be between two instructions or the end of a
678   ///                  degenerate block (e.g. a BB under construction).
679   /// \param BodyGenCB Callback that will generate the loop body code.
680   /// \param TripCount Number of iterations the loop body is executed.
681   /// \param Name      Base name used to derive BB and instruction names.
682   ///
683   /// \returns An object representing the created control flow structure which
684   ///          can be used for loop-associated directives.
685   CanonicalLoopInfo *createCanonicalLoop(const LocationDescription &Loc,
686                                          LoopBodyGenCallbackTy BodyGenCB,
687                                          Value *TripCount,
688                                          const Twine &Name = "loop");
689 
690   /// Generator for the control flow structure of an OpenMP canonical loop.
691   ///
692   /// Instead of a logical iteration space, this allows specifying user-defined
693   /// loop counter values using increment, upper- and lower bounds. To
694   /// disambiguate the terminology when counting downwards, instead of lower
695   /// bounds we use \p Start for the loop counter value in the first body
696   /// iteration.
697   ///
698   /// Consider the following limitations:
699   ///
700   ///  * A loop counter space over all integer values of its bit-width cannot be
701   ///    represented. E.g using uint8_t, its loop trip count of 256 cannot be
702   ///    stored into an 8 bit integer):
703   ///
704   ///      DO I = 0, 255, 1
705   ///
706   ///  * Unsigned wrapping is only supported when wrapping only "once"; E.g.
707   ///    effectively counting downwards:
708   ///
709   ///      for (uint8_t i = 100u; i > 0; i += 127u)
710   ///
711   ///
712   /// TODO: May need to add additional parameters to represent:
713   ///
714   ///  * Allow representing downcounting with unsigned integers.
715   ///
716   ///  * Sign of the step and the comparison operator might disagree:
717   ///
718   ///      for (int i = 0; i < 42; i -= 1u)
719   ///
720   //
721   /// \param Loc       The insert and source location description.
722   /// \param BodyGenCB Callback that will generate the loop body code.
  /// \param Start     Value of the loop counter for the first iteration.
724   /// \param Stop      Loop counter values past this will stop the loop.
725   /// \param Step      Loop counter increment after each iteration; negative
726   ///                  means counting down.
727   /// \param IsSigned  Whether Start, Stop and Step are signed integers.
728   /// \param InclusiveStop Whether \p Stop itself is a valid value for the loop
729   ///                      counter.
730   /// \param ComputeIP Insertion point for instructions computing the trip
731   ///                  count. Can be used to ensure the trip count is available
732   ///                  at the outermost loop of a loop nest. If not set,
733   ///                  defaults to the preheader of the generated loop.
734   /// \param Name      Base name used to derive BB and instruction names.
735   ///
736   /// \returns An object representing the created control flow structure which
737   ///          can be used for loop-associated directives.
738   CanonicalLoopInfo *createCanonicalLoop(const LocationDescription &Loc,
739                                          LoopBodyGenCallbackTy BodyGenCB,
740                                          Value *Start, Value *Stop, Value *Step,
741                                          bool IsSigned, bool InclusiveStop,
742                                          InsertPointTy ComputeIP = {},
743                                          const Twine &Name = "loop");
744 
745   /// Collapse a loop nest into a single loop.
746   ///
747   /// Merges loops of a loop nest into a single CanonicalLoopNest representation
748   /// that has the same number of innermost loop iterations as the origin loop
749   /// nest. The induction variables of the input loops are derived from the
750   /// collapsed loop's induction variable. This is intended to be used to
751   /// implement OpenMP's collapse clause. Before applying a directive,
752   /// collapseLoops normalizes a loop nest to contain only a single loop and the
753   /// directive's implementation does not need to handle multiple loops itself.
754   /// This does not remove the need to handle all loop nest handling by
755   /// directives, such as the ordered(<n>) clause or the simd schedule-clause
756   /// modifier of the worksharing-loop directive.
757   ///
758   /// Example:
759   /// \code
760   ///   for (int i = 0; i < 7; ++i) // Canonical loop "i"
761   ///     for (int j = 0; j < 9; ++j) // Canonical loop "j"
762   ///       body(i, j);
763   /// \endcode
764   ///
765   /// After collapsing with Loops={i,j}, the loop is changed to
766   /// \code
767   ///   for (int ij = 0; ij < 63; ++ij) {
768   ///     int i = ij / 9;
769   ///     int j = ij % 9;
770   ///     body(i, j);
771   ///   }
772   /// \endcode
773   ///
774   /// In the current implementation, the following limitations apply:
775   ///
776   ///  * All input loops have an induction variable of the same type.
777   ///
778   ///  * The collapsed loop will have the same trip count integer type as the
779   ///    input loops. Therefore it is possible that the collapsed loop cannot
780   ///    represent all iterations of the input loops. For instance, assuming a
781   ///    32 bit integer type, and two input loops both iterating 2^16 times, the
  ///    theoretical trip count of the collapsed loop would be 2^32 iterations,
  ///    which cannot be represented in a 32-bit integer. Behavior is undefined
784   ///    in this case.
785   ///
786   ///  * The trip counts of every input loop must be available at \p ComputeIP.
787   ///    Non-rectangular loops are not yet supported.
788   ///
789   ///  * At each nest level, code between a surrounding loop and its nested loop
790   ///    is hoisted into the loop body, and such code will be executed more
791   ///    often than before collapsing (or not at all if any inner loop iteration
792   ///    has a trip count of 0). This is permitted by the OpenMP specification.
793   ///
794   /// \param DL        Debug location for instructions added for collapsing,
795   ///                  such as instructions to compute/derive the input loop's
796   ///                  induction variables.
797   /// \param Loops     Loops in the loop nest to collapse. Loops are specified
798   ///                  from outermost-to-innermost and every control flow of a
799   ///                  loop's body must pass through its directly nested loop.
800   /// \param ComputeIP Where additional instruction that compute the collapsed
801   ///                  trip count. If not set, defaults to before the generated
802   ///                  loop.
803   ///
804   /// \returns The CanonicalLoopInfo object representing the collapsed loop.
805   CanonicalLoopInfo *collapseLoops(DebugLoc DL,
806                                    ArrayRef<CanonicalLoopInfo *> Loops,
807                                    InsertPointTy ComputeIP);
808 
809   /// Get the default alignment value for given target
810   ///
811   /// \param TargetTriple   Target triple
812   /// \param Features       StringMap which describes extra CPU features
813   static unsigned getOpenMPDefaultSimdAlign(const Triple &TargetTriple,
814                                             const StringMap<bool> &Features);
815 
816   /// Retrieve (or create if non-existent) the address of a declare
817   /// target variable, used in conjunction with registerTargetGlobalVariable
818   /// to create declare target global variables.
819   ///
820   /// \param CaptureClause - enumerator corresponding to the OpenMP capture
821   /// clause used in conjunction with the variable being registered (link,
822   /// to, enter).
823   /// \param DeviceClause - enumerator corresponding to the OpenMP capture
824   /// clause used in conjunction with the variable being registered (nohost,
825   /// host, any)
826   /// \param IsDeclaration - boolean stating if the variable being registered
827   /// is a declaration-only and not a definition
828   /// \param IsExternallyVisible - boolean stating if the variable is externally
829   /// visible
830   /// \param EntryInfo - Unique entry information for the value generated
831   /// using getTargetEntryUniqueInfo, used to name generated pointer references
832   /// to the declare target variable
833   /// \param MangledName - the mangled name of the variable being registered
834   /// \param GeneratedRefs - references generated by invocations of
835   /// registerTargetGlobalVariable invoked from getAddrOfDeclareTargetVar,
  /// these are required by Clang for bookkeeping.
837   /// \param OpenMPSIMD - if OpenMP SIMD mode is currently enabled
838   /// \param TargetTriple - The OpenMP device target triple we are compiling
839   /// for
840   /// \param LlvmPtrTy - The type of the variable we are generating or
841   /// retrieving an address for
842   /// \param GlobalInitializer - a lambda function which creates a constant
843   /// used for initializing a pointer reference to the variable in certain
844   /// cases. If a nullptr is passed, it will default to utilising the original
845   /// variable to initialize the pointer reference.
846   /// \param VariableLinkage - a lambda function which returns the variables
847   /// linkage type, if unspecified and a nullptr is given, it will instead
848   /// utilise the linkage stored on the existing global variable in the
849   /// LLVMModule.
850   Constant *getAddrOfDeclareTargetVar(
851       OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind CaptureClause,
852       OffloadEntriesInfoManager::OMPTargetDeviceClauseKind DeviceClause,
853       bool IsDeclaration, bool IsExternallyVisible,
854       TargetRegionEntryInfo EntryInfo, StringRef MangledName,
855       std::vector<GlobalVariable *> &GeneratedRefs, bool OpenMPSIMD,
856       std::vector<Triple> TargetTriple, Type *LlvmPtrTy,
857       std::function<Constant *()> GlobalInitializer,
858       std::function<GlobalValue::LinkageTypes()> VariableLinkage);
859 
860   /// Registers a target variable for device or host.
861   ///
862   /// \param CaptureClause - enumerator corresponding to the OpenMP capture
863   /// clause used in conjunction with the variable being registered (link,
864   /// to, enter).
865   /// \param DeviceClause - enumerator corresponding to the OpenMP capture
866   /// clause used in conjunction with the variable being registered (nohost,
867   /// host, any)
868   /// \param IsDeclaration - boolean stating if the variable being registered
869   /// is a declaration-only and not a definition
870   /// \param IsExternallyVisible - boolean stating if the variable is externally
871   /// visible
872   /// \param EntryInfo - Unique entry information for the value generated
873   /// using getTargetEntryUniqueInfo, used to name generated pointer references
874   /// to the declare target variable
875   /// \param MangledName - the mangled name of the variable being registered
876   /// \param GeneratedRefs - references generated by invocations of
  /// registerTargetGlobalVariable; these are required by Clang for
  /// bookkeeping.
879   /// \param OpenMPSIMD - if OpenMP SIMD mode is currently enabled
880   /// \param TargetTriple - The OpenMP device target triple we are compiling
881   /// for
882   /// \param GlobalInitializer - a lambda function which creates a constant
883   /// used for initializing a pointer reference to the variable in certain
884   /// cases. If a nullptr is passed, it will default to utilising the original
885   /// variable to initialize the pointer reference.
886   /// \param VariableLinkage - a lambda function which returns the variables
887   /// linkage type, if unspecified and a nullptr is given, it will instead
888   /// utilise the linkage stored on the existing global variable in the
889   /// LLVMModule.
890   /// \param LlvmPtrTy - The type of the variable we are generating or
891   /// retrieving an address for
892   /// \param Addr - the original llvm value (addr) of the variable to be
893   /// registered
894   void registerTargetGlobalVariable(
895       OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind CaptureClause,
896       OffloadEntriesInfoManager::OMPTargetDeviceClauseKind DeviceClause,
897       bool IsDeclaration, bool IsExternallyVisible,
898       TargetRegionEntryInfo EntryInfo, StringRef MangledName,
899       std::vector<GlobalVariable *> &GeneratedRefs, bool OpenMPSIMD,
900       std::vector<Triple> TargetTriple,
901       std::function<Constant *()> GlobalInitializer,
902       std::function<GlobalValue::LinkageTypes()> VariableLinkage,
903       Type *LlvmPtrTy, Constant *Addr);
904 
905   /// Get the offset of the OMP_MAP_MEMBER_OF field.
906   unsigned getFlagMemberOffset();
907 
908   /// Get OMP_MAP_MEMBER_OF flag with extra bits reserved based on
909   /// the position given.
910   /// \param Position - A value indicating the position of the parent
911   /// of the member in the kernel argument structure, often retrieved
912   /// by the parents position in the combined information vectors used
913   /// to generate the structure itself. Multiple children (member's of)
914   /// with the same parent will use the same returned member flag.
915   omp::OpenMPOffloadMappingFlags getMemberOfFlag(unsigned Position);
916 
917   /// Given an initial flag set, this function modifies it to contain
918   /// the passed in MemberOfFlag generated from the getMemberOfFlag
919   /// function. The results are dependent on the existing flag bits
920   /// set in the original flag set.
921   /// \param Flags - The original set of flags to be modified with the
922   /// passed in MemberOfFlag.
923   /// \param MemberOfFlag - A modified OMP_MAP_MEMBER_OF flag, adjusted
924   /// slightly based on the getMemberOfFlag which adjusts the flag bits
925   /// based on the members position in its parent.
926   void setCorrectMemberOfFlag(omp::OpenMPOffloadMappingFlags &Flags,
927                               omp::OpenMPOffloadMappingFlags MemberOfFlag);
928 
929 private:
930   /// Modifies the canonical loop to be a statically-scheduled workshare loop
931   /// which is executed on the device
932   ///
933   /// This takes a \p CLI representing a canonical loop, such as the one
934   /// created by \see createCanonicalLoop and emits additional instructions to
935   /// turn it into a workshare loop. In particular, it calls to an OpenMP
936   /// runtime function in the preheader to call OpenMP device rtl function
  /// which handles worksharing of loop body iterations.
938   ///
939   /// \param DL       Debug location for instructions added for the
940   ///                 workshare-loop construct itself.
941   /// \param CLI      A descriptor of the canonical loop to workshare.
942   /// \param AllocaIP An insertion point for Alloca instructions usable in the
943   ///                 preheader of the loop.
944   /// \param LoopType Information about type of loop worksharing.
945   ///                 It corresponds to type of loop workshare OpenMP pragma.
946   ///
947   /// \returns Point where to insert code after the workshare construct.
948   InsertPointTy applyWorkshareLoopTarget(DebugLoc DL, CanonicalLoopInfo *CLI,
949                                          InsertPointTy AllocaIP,
950                                          omp::WorksharingLoopType LoopType);
951 
952   /// Modifies the canonical loop to be a statically-scheduled workshare loop.
953   ///
954   /// This takes a \p LoopInfo representing a canonical loop, such as the one
955   /// created by \p createCanonicalLoop and emits additional instructions to
956   /// turn it into a workshare loop. In particular, it calls to an OpenMP
957   /// runtime function in the preheader to obtain the loop bounds to be used in
958   /// the current thread, updates the relevant instructions in the canonical
959   /// loop and calls to an OpenMP runtime finalization function after the loop.
960   ///
961   /// \param DL       Debug location for instructions added for the
962   ///                 workshare-loop construct itself.
963   /// \param CLI      A descriptor of the canonical loop to workshare.
964   /// \param AllocaIP An insertion point for Alloca instructions usable in the
965   ///                 preheader of the loop.
966   /// \param NeedsBarrier Indicates whether a barrier must be inserted after
967   ///                     the loop.
968   ///
969   /// \returns Point where to insert code after the workshare construct.
970   InsertPointTy applyStaticWorkshareLoop(DebugLoc DL, CanonicalLoopInfo *CLI,
971                                          InsertPointTy AllocaIP,
972                                          bool NeedsBarrier);
973 
974   /// Modifies the canonical loop a statically-scheduled workshare loop with a
975   /// user-specified chunk size.
976   ///
977   /// \param DL           Debug location for instructions added for the
978   ///                     workshare-loop construct itself.
979   /// \param CLI          A descriptor of the canonical loop to workshare.
980   /// \param AllocaIP     An insertion point for Alloca instructions usable in
981   ///                     the preheader of the loop.
982   /// \param NeedsBarrier Indicates whether a barrier must be inserted after the
983   ///                     loop.
984   /// \param ChunkSize    The user-specified chunk size.
985   ///
986   /// \returns Point where to insert code after the workshare construct.
987   InsertPointTy applyStaticChunkedWorkshareLoop(DebugLoc DL,
988                                                 CanonicalLoopInfo *CLI,
989                                                 InsertPointTy AllocaIP,
990                                                 bool NeedsBarrier,
991                                                 Value *ChunkSize);
992 
993   /// Modifies the canonical loop to be a dynamically-scheduled workshare loop.
994   ///
995   /// This takes a \p LoopInfo representing a canonical loop, such as the one
996   /// created by \p createCanonicalLoop and emits additional instructions to
997   /// turn it into a workshare loop. In particular, it calls to an OpenMP
998   /// runtime function in the preheader to obtain, and then in each iteration
999   /// to update the loop counter.
1000   ///
1001   /// \param DL       Debug location for instructions added for the
1002   ///                 workshare-loop construct itself.
1003   /// \param CLI      A descriptor of the canonical loop to workshare.
1004   /// \param AllocaIP An insertion point for Alloca instructions usable in the
1005   ///                 preheader of the loop.
1006   /// \param SchedType Type of scheduling to be passed to the init function.
  /// \param NeedsBarrier Indicates whether a barrier must be inserted after
1008   ///                     the loop.
1009   /// \param Chunk    The size of loop chunk considered as a unit when
1010   ///                 scheduling. If \p nullptr, defaults to 1.
1011   ///
1012   /// \returns Point where to insert code after the workshare construct.
1013   InsertPointTy applyDynamicWorkshareLoop(DebugLoc DL, CanonicalLoopInfo *CLI,
1014                                           InsertPointTy AllocaIP,
1015                                           omp::OMPScheduleType SchedType,
1016                                           bool NeedsBarrier,
1017                                           Value *Chunk = nullptr);
1018 
1019   /// Create alternative version of the loop to support if clause
1020   ///
1021   /// OpenMP if clause can require to generate second loop. This loop
1022   /// will be executed when if clause condition is not met. createIfVersion
  /// adds a branch instruction to the copied loop if \p IfCond is not met.
1024   ///
1025   /// \param Loop       Original loop which should be versioned.
1026   /// \param IfCond     Value which corresponds to if clause condition
1027   /// \param VMap       Value to value map to define relation between
1028   ///                   original and copied loop values and loop blocks.
1029   /// \param NamePrefix Optional name prefix for if.then if.else blocks.
1030   void createIfVersion(CanonicalLoopInfo *Loop, Value *IfCond,
1031                        ValueToValueMapTy &VMap, const Twine &NamePrefix = "");
1032 
1033 public:
1034   /// Modifies the canonical loop to be a workshare loop.
1035   ///
1036   /// This takes a \p LoopInfo representing a canonical loop, such as the one
1037   /// created by \p createCanonicalLoop and emits additional instructions to
1038   /// turn it into a workshare loop. In particular, it calls to an OpenMP
1039   /// runtime function in the preheader to obtain the loop bounds to be used in
1040   /// the current thread, updates the relevant instructions in the canonical
1041   /// loop and calls to an OpenMP runtime finalization function after the loop.
1042   ///
1043   /// The concrete transformation is done by applyStaticWorkshareLoop,
1044   /// applyStaticChunkedWorkshareLoop, or applyDynamicWorkshareLoop, depending
1045   /// on the value of \p SchedKind and \p ChunkSize.
1046   ///
1047   /// \param DL       Debug location for instructions added for the
1048   ///                 workshare-loop construct itself.
1049   /// \param CLI      A descriptor of the canonical loop to workshare.
1050   /// \param AllocaIP An insertion point for Alloca instructions usable in the
1051   ///                 preheader of the loop.
  /// \param NeedsBarrier Indicates whether a barrier must be inserted after
1053   ///                     the loop.
1054   /// \param SchedKind Scheduling algorithm to use.
1055   /// \param ChunkSize The chunk size for the inner loop.
1056   /// \param HasSimdModifier Whether the simd modifier is present in the
1057   ///                        schedule clause.
1058   /// \param HasMonotonicModifier Whether the monotonic modifier is present in
1059   ///                             the schedule clause.
1060   /// \param HasNonmonotonicModifier Whether the nonmonotonic modifier is
1061   ///                                present in the schedule clause.
1062   /// \param HasOrderedClause Whether the (parameterless) ordered clause is
1063   ///                         present.
1064   /// \param LoopType Information about type of loop worksharing.
1065   ///                 It corresponds to type of loop workshare OpenMP pragma.
1066   ///
1067   /// \returns Point where to insert code after the workshare construct.
1068   InsertPointTy applyWorkshareLoop(
1069       DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
1070       bool NeedsBarrier,
1071       llvm::omp::ScheduleKind SchedKind = llvm::omp::OMP_SCHEDULE_Default,
1072       Value *ChunkSize = nullptr, bool HasSimdModifier = false,
1073       bool HasMonotonicModifier = false, bool HasNonmonotonicModifier = false,
1074       bool HasOrderedClause = false,
1075       omp::WorksharingLoopType LoopType =
1076           omp::WorksharingLoopType::ForStaticLoop);
1077 
1078   /// Tile a loop nest.
1079   ///
1080   /// Tiles the loops of \p Loops by the tile sizes in \p TileSizes. Loops in
  /// \p Loops must be perfectly nested, from outermost to innermost loop
1082   /// (i.e. Loops.front() is the outermost loop). The trip count llvm::Value
1083   /// of every loop and every tile sizes must be usable in the outermost
1084   /// loop's preheader. This implies that the loop nest is rectangular.
1085   ///
1086   /// Example:
1087   /// \code
1088   ///   for (int i = 0; i < 15; ++i) // Canonical loop "i"
1089   ///     for (int j = 0; j < 14; ++j) // Canonical loop "j"
1090   ///         body(i, j);
1091   /// \endcode
1092   ///
1093   /// After tiling with Loops={i,j} and TileSizes={5,7}, the loop is changed to
1094   /// \code
1095   ///   for (int i1 = 0; i1 < 3; ++i1)
1096   ///     for (int j1 = 0; j1 < 2; ++j1)
1097   ///       for (int i2 = 0; i2 < 5; ++i2)
1098   ///         for (int j2 = 0; j2 < 7; ++j2)
  ///           body(i1*5+i2, j1*7+j2);
1100   /// \endcode
1101   ///
1102   /// The returned vector are the loops {i1,j1,i2,j2}. The loops i1 and j1 are
  /// referred to as the floor, and the loops i2 and j2 are the tiles. Tiling also
1104   /// handles non-constant trip counts, non-constant tile sizes and trip counts
1105   /// that are not multiples of the tile size. In the latter case the tile loop
1106   /// of the last floor-loop iteration will have fewer iterations than specified
1107   /// as its tile size.
1108   ///
1109   ///
1110   /// @param DL        Debug location for instructions added by tiling, for
1111   ///                  instance the floor- and tile trip count computation.
1112   /// @param Loops     Loops to tile. The CanonicalLoopInfo objects are
  ///                  invalidated by this method, i.e. should not be used after
1114   ///                  tiling.
1115   /// @param TileSizes For each loop in \p Loops, the tile size for that
  ///                  dimension.
1117   ///
1118   /// \returns A list of generated loops. Contains twice as many loops as the
1119   ///          input loop nest; the first half are the floor loops and the
1120   ///          second half are the tile loops.
1121   std::vector<CanonicalLoopInfo *>
1122   tileLoops(DebugLoc DL, ArrayRef<CanonicalLoopInfo *> Loops,
1123             ArrayRef<Value *> TileSizes);
1124 
1125   /// Fully unroll a loop.
1126   ///
1127   /// Instead of unrolling the loop immediately (and duplicating its body
1128   /// instructions), it is deferred to LLVM's LoopUnrollPass by adding loop
1129   /// metadata.
1130   ///
1131   /// \param DL   Debug location for instructions added by unrolling.
1132   /// \param Loop The loop to unroll. The loop will be invalidated.
1133   void unrollLoopFull(DebugLoc DL, CanonicalLoopInfo *Loop);
1134 
1135   /// Fully or partially unroll a loop. How the loop is unrolled is determined
1136   /// using LLVM's LoopUnrollPass.
1137   ///
1138   /// \param DL   Debug location for instructions added by unrolling.
1139   /// \param Loop The loop to unroll. The loop will be invalidated.
1140   void unrollLoopHeuristic(DebugLoc DL, CanonicalLoopInfo *Loop);
1141 
1142   /// Partially unroll a loop.
1143   ///
1144   /// The CanonicalLoopInfo of the unrolled loop for use with chained
1145   /// loop-associated directive can be requested using \p UnrolledCLI. Not
1146   /// needing the CanonicalLoopInfo allows more efficient code generation by
1147   /// deferring the actual unrolling to the LoopUnrollPass using loop metadata.
1148   /// A loop-associated directive applied to the unrolled loop needs to know the
1149   /// new trip count which means that if using a heuristically determined unroll
1150   /// factor (\p Factor == 0), that factor must be computed immediately. We are
1151   /// using the same logic as the LoopUnrollPass to derived the unroll factor,
1152   /// but which assumes that some canonicalization has taken place (e.g.
1153   /// Mem2Reg, LICM, GVN, Inlining, etc.). That is, the heuristic will perform
1154   /// better when the unrolled loop's CanonicalLoopInfo is not needed.
1155   ///
1156   /// \param DL          Debug location for instructions added by unrolling.
1157   /// \param Loop        The loop to unroll. The loop will be invalidated.
1158   /// \param Factor      The factor to unroll the loop by. A factor of 0
1159   ///                    indicates that a heuristic should be used to determine
1160   ///                    the unroll-factor.
1161   /// \param UnrolledCLI If non-null, receives the CanonicalLoopInfo of the
1162   ///                    partially unrolled loop. Otherwise, uses loop metadata
1163   ///                    to defer unrolling to the LoopUnrollPass.
1164   void unrollLoopPartial(DebugLoc DL, CanonicalLoopInfo *Loop, int32_t Factor,
1165                          CanonicalLoopInfo **UnrolledCLI);
1166 
  /// Add metadata to simd-ize a loop. If IfCond is not nullptr, the loop
  /// is cloned. The metadata which prevents vectorization is added to
  /// the cloned loop. The cloned loop is executed when IfCond evaluates
  /// to false.
1171   ///
1172   /// \param Loop        The loop to simd-ize.
  /// \param AlignedVars The map which contains pairs of the pointer
1174   ///                    and its corresponding alignment.
1175   /// \param IfCond      The value which corresponds to the if clause
1176   ///                    condition.
1177   /// \param Order       The enum to map order clause.
1178   /// \param Simdlen     The Simdlen length to apply to the simd loop.
1179   /// \param Safelen     The Safelen length to apply to the simd loop.
1180   void applySimd(CanonicalLoopInfo *Loop,
1181                  MapVector<Value *, Value *> AlignedVars, Value *IfCond,
1182                  omp::OrderKind Order, ConstantInt *Simdlen,
1183                  ConstantInt *Safelen);
1184 
1185   /// Generator for '#omp flush'
1186   ///
1187   /// \param Loc The location where the flush directive was encountered
1188   void createFlush(const LocationDescription &Loc);
1189 
1190   /// Generator for '#omp taskwait'
1191   ///
1192   /// \param Loc The location where the taskwait directive was encountered.
1193   void createTaskwait(const LocationDescription &Loc);
1194 
1195   /// Generator for '#omp taskyield'
1196   ///
1197   /// \param Loc The location where the taskyield directive was encountered.
1198   void createTaskyield(const LocationDescription &Loc);
1199 
1200   /// A struct to pack the relevant information for an OpenMP depend clause.
1201   struct DependData {
1202     omp::RTLDependenceKindTy DepKind = omp::RTLDependenceKindTy::DepUnknown;
1203     Type *DepValueType;
1204     Value *DepVal;
1205     explicit DependData() = default;
DependDataDependData1206     DependData(omp::RTLDependenceKindTy DepKind, Type *DepValueType,
1207                Value *DepVal)
1208         : DepKind(DepKind), DepValueType(DepValueType), DepVal(DepVal) {}
1209   };
1210 
  /// Generator for `#omp task`
  ///
  /// \param Loc The location where the task construct was encountered.
  /// \param AllocaIP The insertion point to be used for alloca instructions.
  /// \param BodyGenCB Callback that will generate the region code.
  /// \param Tied True if the task is tied, false if the task is untied.
  /// \param Final i1 value which is `true` if the task is final, `false` if the
  ///              task is not final.
  /// \param IfCondition i1 value. If it evaluates to `false`, an undeferred
  ///                    task is generated, and the encountering thread must
  ///                    suspend the current task region, for which execution
  ///                    cannot be resumed until execution of the structured
  ///                    block that is associated with the generated task is
  ///                    completed.
  /// \param Dependencies The dependences of this task, one DependData entry
  ///                     per item of the task's `depend` clauses; empty if the
  ///                     task has no dependences.
  InsertPointTy createTask(const LocationDescription &Loc,
                           InsertPointTy AllocaIP, BodyGenCallbackTy BodyGenCB,
                           bool Tied = true, Value *Final = nullptr,
                           Value *IfCondition = nullptr,
                           SmallVector<DependData> Dependencies = {});
1230 
1231   /// Generator for the taskgroup construct
1232   ///
1233   /// \param Loc The location where the taskgroup construct was encountered.
1234   /// \param AllocaIP The insertion point to be used for alloca instructions.
1235   /// \param BodyGenCB Callback that will generate the region code.
1236   InsertPointTy createTaskgroup(const LocationDescription &Loc,
1237                                 InsertPointTy AllocaIP,
1238                                 BodyGenCallbackTy BodyGenCB);
1239 
1240   using FileIdentifierInfoCallbackTy =
1241       std::function<std::tuple<std::string, uint64_t>()>;
1242 
1243   /// Creates a unique info for a target entry when provided a filename and
1244   /// line number from.
1245   ///
1246   /// \param CallBack A callback function which should return filename the entry
1247   /// resides in as well as the line number for the target entry
1248   /// \param ParentName The name of the parent the target entry resides in, if
1249   /// any.
1250   static TargetRegionEntryInfo
1251   getTargetEntryUniqueInfo(FileIdentifierInfoCallbackTy CallBack,
1252                            StringRef ParentName = "");
1253 
  /// Enum class for the ReductionGen CallBack type to be used.
1255   enum class ReductionGenCBKind { Clang, MLIR };
1256 
1257   /// ReductionGen CallBack for Clang
1258   ///
1259   /// \param CodeGenIP InsertPoint for CodeGen.
1260   /// \param Index Index of the ReductionInfo to generate code for.
  /// \param LHS Optionally used by Clang to return the LHS pointer it used for
  /// codegen, used for fixup later.
  /// \param RHS Optionally used by Clang to return the RHS pointer it used for
  /// codegen, used for fixup later.
1265   /// \param CurFn Optionally used by Clang to pass in the Current Function as
1266   /// Clang context may be old.
1267   using ReductionGenClangCBTy =
1268       std::function<InsertPointTy(InsertPointTy CodeGenIP, unsigned Index,
1269                                   Value **LHS, Value **RHS, Function *CurFn)>;
1270 
1271   /// ReductionGen CallBack for MLIR
1272   ///
1273   /// \param CodeGenIP InsertPoint for CodeGen.
1274   /// \param LHS Pass in the LHS Value to be used for CodeGen.
1275   /// \param RHS Pass in the RHS Value to be used for CodeGen.
1276   using ReductionGenCBTy = std::function<InsertPointTy(
1277       InsertPointTy CodeGenIP, Value *LHS, Value *RHS, Value *&Res)>;
1278 
1279   /// Functions used to generate atomic reductions. Such functions take two
1280   /// Values representing pointers to LHS and RHS of the reduction, as well as
1281   /// the element type of these pointers. They are expected to atomically
1282   /// update the LHS to the reduced value.
1283   using ReductionGenAtomicCBTy =
1284       std::function<InsertPointTy(InsertPointTy, Type *, Value *, Value *)>;
1285 
1286   /// Enum class for reduction evaluation types scalar, complex and aggregate.
1287   enum class EvalKind { Scalar, Complex, Aggregate };
1288 
  /// Information about an OpenMP reduction.
  struct ReductionInfo {
    /// Construct a fully-populated description of one reduction variable.
    ReductionInfo(Type *ElementType, Value *Variable, Value *PrivateVariable,
                  EvalKind EvaluationKind, ReductionGenCBTy ReductionGen,
                  ReductionGenClangCBTy ReductionGenClang,
                  ReductionGenAtomicCBTy AtomicReductionGen)
        : ElementType(ElementType), Variable(Variable),
          PrivateVariable(PrivateVariable), EvaluationKind(EvaluationKind),
          ReductionGen(ReductionGen), ReductionGenClang(ReductionGenClang),
          AtomicReductionGen(AtomicReductionGen) {}
    /// Construct a partial description that records only the thread-private
    /// copy: every other field is null/empty and the evaluation kind defaults
    /// to scalar.
    ReductionInfo(Value *PrivateVariable)
        : ElementType(nullptr), Variable(nullptr),
          PrivateVariable(PrivateVariable), EvaluationKind(EvalKind::Scalar),
          ReductionGen(), ReductionGenClang(), AtomicReductionGen() {}

    /// Reduction element type, must match pointee type of variable.
    Type *ElementType;

    /// Reduction variable of pointer type.
    Value *Variable;

    /// Thread-private partial reduction variable.
    Value *PrivateVariable;

    /// Reduction evaluation kind - scalar, complex or aggregate.
    EvalKind EvaluationKind;

    /// Callback for generating the reduction body. The IR produced by this will
    /// be used to combine two values in a thread-safe context, e.g., under
    /// lock or within the same thread, and therefore need not be atomic.
    ReductionGenCBTy ReductionGen;

    /// Clang callback for generating the reduction body. The IR produced by
    /// this will be used to combine two values in a thread-safe context, e.g.,
    /// under lock or within the same thread, and therefore need not be atomic.
    ReductionGenClangCBTy ReductionGenClang;

    /// Callback for generating the atomic reduction body, may be null. The IR
    /// produced by this will be used to atomically combine two values during
    /// reduction. If null, the implementation will use the non-atomic version
    /// along with the appropriate synchronization mechanisms.
    ReductionGenAtomicCBTy AtomicReductionGen;
  };
1332 
  /// Strategy used when copying a Reduce list in the GPU reduction helpers.
  enum class CopyAction : unsigned {
    /// RemoteLaneToThread: Copy over a Reduce list from a remote lane in
    /// the warp using shuffle instructions.
    RemoteLaneToThread,
    /// ThreadCopy: Make a copy of a Reduce list on the thread's stack.
    ThreadCopy,
  };
1340 
  /// Optional parameters consumed by the reduction list-copy helpers
  /// (see emitReductionListCopy); any field may be left null.
  struct CopyOptionsTy {
    /// Offset to the remote lane to shuffle data from.
    Value *RemoteLaneOffset = nullptr;
    /// Index into the global scratchpad array — presumably selects the team's
    /// slot; TODO confirm against the emitters that consume it.
    Value *ScratchpadIndex = nullptr;
    /// Width of a scratchpad row — assumed from the name; TODO confirm.
    Value *ScratchpadWidth = nullptr;
  };
1346 
1347   /// Supporting functions for Reductions CodeGen.
1348 private:
1349   /// Emit the llvm.used metadata.
1350   void emitUsed(StringRef Name, std::vector<llvm::WeakTrackingVH> &List);
1351 
1352   /// Get the id of the current thread on the GPU.
1353   Value *getGPUThreadID();
1354 
1355   /// Get the GPU warp size.
1356   Value *getGPUWarpSize();
1357 
1358   /// Get the id of the warp in the block.
1359   /// We assume that the warp size is 32, which is always the case
1360   /// on the NVPTX device, to generate more efficient code.
1361   Value *getNVPTXWarpID();
1362 
1363   /// Get the id of the current lane in the Warp.
1364   /// We assume that the warp size is 32, which is always the case
1365   /// on the NVPTX device, to generate more efficient code.
1366   Value *getNVPTXLaneID();
1367 
1368   /// Cast value to the specified type.
1369   Value *castValueToType(InsertPointTy AllocaIP, Value *From, Type *ToType);
1370 
1371   /// This function creates calls to one of two shuffle functions to copy
1372   /// variables between lanes in a warp.
1373   Value *createRuntimeShuffleFunction(InsertPointTy AllocaIP, Value *Element,
1374                                       Type *ElementType, Value *Offset);
1375 
1376   /// Function to shuffle over the value from the remote lane.
1377   void shuffleAndStore(InsertPointTy AllocaIP, Value *SrcAddr, Value *DstAddr,
1378                        Type *ElementType, Value *Offset,
1379                        Type *ReductionArrayTy);
1380 
1381   /// Emit instructions to copy a Reduce list, which contains partially
1382   /// aggregated values, in the specified direction.
1383   void emitReductionListCopy(
1384       InsertPointTy AllocaIP, CopyAction Action, Type *ReductionArrayTy,
1385       ArrayRef<ReductionInfo> ReductionInfos, Value *SrcBase, Value *DestBase,
1386       CopyOptionsTy CopyOptions = {nullptr, nullptr, nullptr});
1387 
1388   /// Emit a helper that reduces data across two OpenMP threads (lanes)
1389   /// in the same warp.  It uses shuffle instructions to copy over data from
1390   /// a remote lane's stack.  The reduction algorithm performed is specified
1391   /// by the fourth parameter.
1392   ///
1393   /// Algorithm Versions.
1394   /// Full Warp Reduce (argument value 0):
1395   ///   This algorithm assumes that all 32 lanes are active and gathers
1396   ///   data from these 32 lanes, producing a single resultant value.
1397   /// Contiguous Partial Warp Reduce (argument value 1):
1398   ///   This algorithm assumes that only a *contiguous* subset of lanes
1399   ///   are active.  This happens for the last warp in a parallel region
1400   ///   when the user specified num_threads is not an integer multiple of
1401   ///   32.  This contiguous subset always starts with the zeroth lane.
1402   /// Partial Warp Reduce (argument value 2):
1403   ///   This algorithm gathers data from any number of lanes at any position.
1404   /// All reduced values are stored in the lowest possible lane.  The set
1405   /// of problems every algorithm addresses is a super set of those
1406   /// addressable by algorithms with a lower version number.  Overhead
1407   /// increases as algorithm version increases.
1408   ///
1409   /// Terminology
1410   /// Reduce element:
1411   ///   Reduce element refers to the individual data field with primitive
1412   ///   data types to be combined and reduced across threads.
1413   /// Reduce list:
1414   ///   Reduce list refers to a collection of local, thread-private
1415   ///   reduce elements.
1416   /// Remote Reduce list:
1417   ///   Remote Reduce list refers to a collection of remote (relative to
1418   ///   the current thread) reduce elements.
1419   ///
1420   /// We distinguish between three states of threads that are important to
1421   /// the implementation of this function.
1422   /// Alive threads:
1423   ///   Threads in a warp executing the SIMT instruction, as distinguished from
1424   ///   threads that are inactive due to divergent control flow.
1425   /// Active threads:
1426   ///   The minimal set of threads that has to be alive upon entry to this
1427   ///   function.  The computation is correct iff active threads are alive.
1428   ///   Some threads are alive but they are not active because they do not
1429   ///   contribute to the computation in any useful manner.  Turning them off
1430   ///   may introduce control flow overheads without any tangible benefits.
1431   /// Effective threads:
1432   ///   In order to comply with the argument requirements of the shuffle
1433   ///   function, we must keep all lanes holding data alive.  But at most
1434   ///   half of them perform value aggregation; we refer to this half of
1435   ///   threads as effective. The other half is simply handing off their
1436   ///   data.
1437   ///
1438   /// Procedure
1439   /// Value shuffle:
1440   ///   In this step active threads transfer data from higher lane positions
1441   ///   in the warp to lower lane positions, creating Remote Reduce list.
1442   /// Value aggregation:
1443   ///   In this step, effective threads combine their thread local Reduce list
1444   ///   with Remote Reduce list and store the result in the thread local
1445   ///   Reduce list.
1446   /// Value copy:
1447   ///   In this step, we deal with the assumption made by algorithm 2
1448   ///   (i.e. contiguity assumption).  When we have an odd number of lanes
1449   ///   active, say 2k+1, only k threads will be effective and therefore k
1450   ///   new values will be produced.  However, the Reduce list owned by the
1451   ///   (2k+1)th thread is ignored in the value aggregation.  Therefore
1452   ///   we copy the Reduce list from the (2k+1)th lane to (k+1)th lane so
1453   ///   that the contiguity assumption still holds.
1454   ///
1455   /// \param ReductionInfos Array type containing the ReductionOps.
1456   /// \param ReduceFn The reduction function.
1457   /// \param FuncAttrs Optional param to specify any function attributes that
1458   ///                  need to be copied to the new function.
1459   ///
1460   /// \return The ShuffleAndReduce function.
1461   Function *emitShuffleAndReduceFunction(
1462       ArrayRef<OpenMPIRBuilder::ReductionInfo> ReductionInfos,
1463       Function *ReduceFn, AttributeList FuncAttrs);
1464 
1465   /// This function emits a helper that gathers Reduce lists from the first
1466   /// lane of every active warp to lanes in the first warp.
1467   ///
1468   /// void inter_warp_copy_func(void* reduce_data, num_warps)
1469   ///   shared smem[warp_size];
1470   ///   For all data entries D in reduce_data:
1471   ///     sync
1472   ///     If (I am the first lane in each warp)
1473   ///       Copy my local D to smem[warp_id]
1474   ///     sync
1475   ///     if (I am the first warp)
1476   ///       Copy smem[thread_id] to my local D
1477   ///
1478   /// \param Loc The insert and source location description.
1479   /// \param ReductionInfos Array type containing the ReductionOps.
1480   /// \param FuncAttrs Optional param to specify any function attributes that
1481   ///                  need to be copied to the new function.
1482   ///
1483   /// \return The InterWarpCopy function.
1484   Function *emitInterWarpCopyFunction(const LocationDescription &Loc,
1485                                       ArrayRef<ReductionInfo> ReductionInfos,
1486                                       AttributeList FuncAttrs);
1487 
1488   /// This function emits a helper that copies all the reduction variables from
1489   /// the team into the provided global buffer for the reduction variables.
1490   ///
1491   /// void list_to_global_copy_func(void *buffer, int Idx, void *reduce_data)
1492   ///   For all data entries D in reduce_data:
1493   ///     Copy local D to buffer.D[Idx]
1494   ///
1495   /// \param ReductionInfos Array type containing the ReductionOps.
1496   /// \param ReductionsBufferTy The StructTy for the reductions buffer.
1497   /// \param FuncAttrs Optional param to specify any function attributes that
1498   ///                  need to be copied to the new function.
1499   ///
1500   /// \return The ListToGlobalCopy function.
1501   Function *emitListToGlobalCopyFunction(ArrayRef<ReductionInfo> ReductionInfos,
1502                                          Type *ReductionsBufferTy,
1503                                          AttributeList FuncAttrs);
1504 
1505   /// This function emits a helper that copies all the reduction variables from
1506   /// the team into the provided global buffer for the reduction variables.
1507   ///
1508   /// void list_to_global_copy_func(void *buffer, int Idx, void *reduce_data)
1509   ///   For all data entries D in reduce_data:
1510   ///     Copy buffer.D[Idx] to local D;
1511   ///
1512   /// \param ReductionInfos Array type containing the ReductionOps.
1513   /// \param ReductionsBufferTy The StructTy for the reductions buffer.
1514   /// \param FuncAttrs Optional param to specify any function attributes that
1515   ///                  need to be copied to the new function.
1516   ///
1517   /// \return The GlobalToList function.
1518   Function *emitGlobalToListCopyFunction(ArrayRef<ReductionInfo> ReductionInfos,
1519                                          Type *ReductionsBufferTy,
1520                                          AttributeList FuncAttrs);
1521 
1522   /// This function emits a helper that reduces all the reduction variables from
1523   /// the team into the provided global buffer for the reduction variables.
1524   ///
1525   /// void list_to_global_reduce_func(void *buffer, int Idx, void *reduce_data)
1526   ///  void *GlobPtrs[];
1527   ///  GlobPtrs[0] = (void*)&buffer.D0[Idx];
1528   ///  ...
1529   ///  GlobPtrs[N] = (void*)&buffer.DN[Idx];
1530   ///  reduce_function(GlobPtrs, reduce_data);
1531   ///
1532   /// \param ReductionInfos Array type containing the ReductionOps.
1533   /// \param ReduceFn The reduction function.
1534   /// \param ReductionsBufferTy The StructTy for the reductions buffer.
1535   /// \param FuncAttrs Optional param to specify any function attributes that
1536   ///                  need to be copied to the new function.
1537   ///
1538   /// \return The ListToGlobalReduce function.
1539   Function *
1540   emitListToGlobalReduceFunction(ArrayRef<ReductionInfo> ReductionInfos,
1541                                  Function *ReduceFn, Type *ReductionsBufferTy,
1542                                  AttributeList FuncAttrs);
1543 
1544   /// This function emits a helper that reduces all the reduction variables from
1545   /// the team into the provided global buffer for the reduction variables.
1546   ///
1547   /// void global_to_list_reduce_func(void *buffer, int Idx, void *reduce_data)
1548   ///  void *GlobPtrs[];
1549   ///  GlobPtrs[0] = (void*)&buffer.D0[Idx];
1550   ///  ...
1551   ///  GlobPtrs[N] = (void*)&buffer.DN[Idx];
1552   ///  reduce_function(reduce_data, GlobPtrs);
1553   ///
1554   /// \param ReductionInfos Array type containing the ReductionOps.
1555   /// \param ReduceFn The reduction function.
1556   /// \param ReductionsBufferTy The StructTy for the reductions buffer.
1557   /// \param FuncAttrs Optional param to specify any function attributes that
1558   ///                  need to be copied to the new function.
1559   ///
1560   /// \return The GlobalToListReduce function.
1561   Function *
1562   emitGlobalToListReduceFunction(ArrayRef<ReductionInfo> ReductionInfos,
1563                                  Function *ReduceFn, Type *ReductionsBufferTy,
1564                                  AttributeList FuncAttrs);
1565 
1566   /// Get the function name of a reduction function.
1567   std::string getReductionFuncName(StringRef Name) const;
1568 
1569   /// Emits reduction function.
1570   /// \param ReducerName Name of the function calling the reduction.
1571   /// \param ReductionInfos Array type containing the ReductionOps.
1572   /// \param ReductionGenCBKind Optional param to specify Clang or MLIR
1573   ///                           CodeGenCB kind.
1574   /// \param FuncAttrs Optional param to specify any function attributes that
1575   ///                  need to be copied to the new function.
1576   ///
1577   /// \return The reduction function.
1578   Function *createReductionFunction(
1579       StringRef ReducerName, ArrayRef<ReductionInfo> ReductionInfos,
1580       ReductionGenCBKind ReductionGenCBKind = ReductionGenCBKind::MLIR,
1581       AttributeList FuncAttrs = {});
1582 
1583 public:
1584   ///
1585   /// Design of OpenMP reductions on the GPU
1586   ///
1587   /// Consider a typical OpenMP program with one or more reduction
1588   /// clauses:
1589   ///
1590   /// float foo;
1591   /// double bar;
1592   /// #pragma omp target teams distribute parallel for \
1593   ///             reduction(+:foo) reduction(*:bar)
1594   /// for (int i = 0; i < N; i++) {
1595   ///   foo += A[i]; bar *= B[i];
1596   /// }
1597   ///
1598   /// where 'foo' and 'bar' are reduced across all OpenMP threads in
1599   /// all teams.  In our OpenMP implementation on the NVPTX device an
1600   /// OpenMP team is mapped to a CUDA threadblock and OpenMP threads
1601   /// within a team are mapped to CUDA threads within a threadblock.
1602   /// Our goal is to efficiently aggregate values across all OpenMP
1603   /// threads such that:
1604   ///
1605   ///   - the compiler and runtime are logically concise, and
1606   ///   - the reduction is performed efficiently in a hierarchical
1607   ///     manner as follows: within OpenMP threads in the same warp,
1608   ///     across warps in a threadblock, and finally across teams on
1609   ///     the NVPTX device.
1610   ///
1611   /// Introduction to Decoupling
1612   ///
1613   /// We would like to decouple the compiler and the runtime so that the
1614   /// latter is ignorant of the reduction variables (number, data types)
1615   /// and the reduction operators.  This allows a simpler interface
1616   /// and implementation while still attaining good performance.
1617   ///
1618   /// Pseudocode for the aforementioned OpenMP program generated by the
1619   /// compiler is as follows:
1620   ///
1621   /// 1. Create private copies of reduction variables on each OpenMP
1622   ///    thread: 'foo_private', 'bar_private'
1623   /// 2. Each OpenMP thread reduces the chunk of 'A' and 'B' assigned
1624   ///    to it and writes the result in 'foo_private' and 'bar_private'
1625   ///    respectively.
1626   /// 3. Call the OpenMP runtime on the GPU to reduce within a team
1627   ///    and store the result on the team master:
1628   ///
1629   ///     __kmpc_nvptx_parallel_reduce_nowait_v2(...,
1630   ///        reduceData, shuffleReduceFn, interWarpCpyFn)
1631   ///
1632   ///     where:
1633   ///       struct ReduceData {
1634   ///         double *foo;
1635   ///         double *bar;
1636   ///       } reduceData
1637   ///       reduceData.foo = &foo_private
1638   ///       reduceData.bar = &bar_private
1639   ///
1640   ///     'shuffleReduceFn' and 'interWarpCpyFn' are pointers to two
1641   ///     auxiliary functions generated by the compiler that operate on
1642   ///     variables of type 'ReduceData'.  They aid the runtime perform
1643   ///     algorithmic steps in a data agnostic manner.
1644   ///
1645   ///     'shuffleReduceFn' is a pointer to a function that reduces data
1646   ///     of type 'ReduceData' across two OpenMP threads (lanes) in the
1647   ///     same warp.  It takes the following arguments as input:
1648   ///
1649   ///     a. variable of type 'ReduceData' on the calling lane,
1650   ///     b. its lane_id,
1651   ///     c. an offset relative to the current lane_id to generate a
1652   ///        remote_lane_id.  The remote lane contains the second
1653   ///        variable of type 'ReduceData' that is to be reduced.
1654   ///     d. an algorithm version parameter determining which reduction
1655   ///        algorithm to use.
1656   ///
1657   ///     'shuffleReduceFn' retrieves data from the remote lane using
1658   ///     efficient GPU shuffle intrinsics and reduces, using the
1659   ///     algorithm specified by the 4th parameter, the two operands
1660   ///     element-wise.  The result is written to the first operand.
1661   ///
1662   ///     Different reduction algorithms are implemented in different
1663   ///     runtime functions, all calling 'shuffleReduceFn' to perform
1664   ///     the essential reduction step.  Therefore, based on the 4th
1665   ///     parameter, this function behaves slightly differently to
1666   ///     cooperate with the runtime to ensure correctness under
1667   ///     different circumstances.
1668   ///
1669   ///     'InterWarpCpyFn' is a pointer to a function that transfers
1670   ///     reduced variables across warps.  It tunnels, through CUDA
1671   ///     shared memory, the thread-private data of type 'ReduceData'
1672   ///     from lane 0 of each warp to a lane in the first warp.
1673   /// 4. Call the OpenMP runtime on the GPU to reduce across teams.
1674   ///    The last team writes the global reduced value to memory.
1675   ///
1676   ///     ret = __kmpc_nvptx_teams_reduce_nowait(...,
1677   ///             reduceData, shuffleReduceFn, interWarpCpyFn,
1678   ///             scratchpadCopyFn, loadAndReduceFn)
1679   ///
1680   ///     'scratchpadCopyFn' is a helper that stores reduced
1681   ///     data from the team master to a scratchpad array in
1682   ///     global memory.
1683   ///
1684   ///     'loadAndReduceFn' is a helper that loads data from
1685   ///     the scratchpad array and reduces it with the input
1686   ///     operand.
1687   ///
1688   ///     These compiler generated functions hide address
1689   ///     calculation and alignment information from the runtime.
1690   /// 5. if ret == 1:
1691   ///     The team master of the last team stores the reduced
1692   ///     result to the globals in memory.
1693   ///     foo += reduceData.foo; bar *= reduceData.bar
1694   ///
1695   ///
1696   /// Warp Reduction Algorithms
1697   ///
1698   /// On the warp level, we have three algorithms implemented in the
1699   /// OpenMP runtime depending on the number of active lanes:
1700   ///
1701   /// Full Warp Reduction
1702   ///
1703   /// The reduce algorithm within a warp where all lanes are active
1704   /// is implemented in the runtime as follows:
1705   ///
1706   /// full_warp_reduce(void *reduce_data,
1707   ///                  kmp_ShuffleReductFctPtr ShuffleReduceFn) {
1708   ///   for (int offset = WARPSIZE/2; offset > 0; offset /= 2)
1709   ///     ShuffleReduceFn(reduce_data, 0, offset, 0);
1710   /// }
1711   ///
1712   /// The algorithm completes in log(2, WARPSIZE) steps.
1713   ///
1714   /// 'ShuffleReduceFn' is used here with lane_id set to 0 because it is
1715   /// not used therefore we save instructions by not retrieving lane_id
1716   /// from the corresponding special registers.  The 4th parameter, which
1717   /// represents the version of the algorithm being used, is set to 0 to
1718   /// signify full warp reduction.
1719   ///
1720   /// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
1721   ///
1722   /// #reduce_elem refers to an element in the local lane's data structure
1723   /// #remote_elem is retrieved from a remote lane
1724   /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
1725   /// reduce_elem = reduce_elem REDUCE_OP remote_elem;
1726   ///
1727   /// Contiguous Partial Warp Reduction
1728   ///
1729   /// This reduce algorithm is used within a warp where only the first
1730   /// 'n' (n <= WARPSIZE) lanes are active.  It is typically used when the
1731   /// number of OpenMP threads in a parallel region is not a multiple of
1732   /// WARPSIZE.  The algorithm is implemented in the runtime as follows:
1733   ///
1734   /// void
1735   /// contiguous_partial_reduce(void *reduce_data,
1736   ///                           kmp_ShuffleReductFctPtr ShuffleReduceFn,
1737   ///                           int size, int lane_id) {
1738   ///   int curr_size;
1739   ///   int offset;
1740   ///   curr_size = size;
  ///   offset = curr_size/2;
1742   ///   while (offset>0) {
1743   ///     ShuffleReduceFn(reduce_data, lane_id, offset, 1);
1744   ///     curr_size = (curr_size+1)/2;
1745   ///     offset = curr_size/2;
1746   ///   }
1747   /// }
1748   ///
1749   /// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
1750   ///
1751   /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
1752   /// if (lane_id < offset)
1753   ///     reduce_elem = reduce_elem REDUCE_OP remote_elem
1754   /// else
1755   ///     reduce_elem = remote_elem
1756   ///
1757   /// This algorithm assumes that the data to be reduced are located in a
1758   /// contiguous subset of lanes starting from the first.  When there is
1759   /// an odd number of active lanes, the data in the last lane is not
  /// aggregated with any other lane's data but is instead copied over.
1761   ///
1762   /// Dispersed Partial Warp Reduction
1763   ///
1764   /// This algorithm is used within a warp when any discontiguous subset of
1765   /// lanes are active.  It is used to implement the reduction operation
1766   /// across lanes in an OpenMP simd region or in a nested parallel region.
1767   ///
1768   /// void
1769   /// dispersed_partial_reduce(void *reduce_data,
1770   ///                          kmp_ShuffleReductFctPtr ShuffleReduceFn) {
1771   ///   int size, remote_id;
1772   ///   int logical_lane_id = number_of_active_lanes_before_me() * 2;
1773   ///   do {
1774   ///       remote_id = next_active_lane_id_right_after_me();
  ///       # the above function returns 0 if no active lane
1776   ///       # is present right after the current lane.
1777   ///       size = number_of_active_lanes_in_this_warp();
1778   ///       logical_lane_id /= 2;
1779   ///       ShuffleReduceFn(reduce_data, logical_lane_id,
1780   ///                       remote_id-1-threadIdx.x, 2);
1781   ///   } while (logical_lane_id % 2 == 0 && size > 1);
1782   /// }
1783   ///
1784   /// There is no assumption made about the initial state of the reduction.
1785   /// Any number of lanes (>=1) could be active at any position.  The reduction
1786   /// result is returned in the first active lane.
1787   ///
1788   /// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
1789   ///
1790   /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
1791   /// if (lane_id % 2 == 0 && offset > 0)
1792   ///     reduce_elem = reduce_elem REDUCE_OP remote_elem
1793   /// else
1794   ///     reduce_elem = remote_elem
1795   ///
1796   ///
1797   /// Intra-Team Reduction
1798   ///
1799   /// This function, as implemented in the runtime call
1800   /// '__kmpc_nvptx_parallel_reduce_nowait_v2', aggregates data across OpenMP
1801   /// threads in a team.  It first reduces within a warp using the
1802   /// aforementioned algorithms.  We then proceed to gather all such
1803   /// reduced values at the first warp.
1804   ///
1805   /// The runtime makes use of the function 'InterWarpCpyFn', which copies
1806   /// data from each of the "warp master" (zeroth lane of each warp, where
1807   /// warp-reduced data is held) to the zeroth warp.  This step reduces (in
1808   /// a mathematical sense) the problem of reduction across warp masters in
1809   /// a block to the problem of warp reduction.
1810   ///
1811   ///
1812   /// Inter-Team Reduction
1813   ///
1814   /// Once a team has reduced its data to a single value, it is stored in
1815   /// a global scratchpad array.  Since each team has a distinct slot, this
1816   /// can be done without locking.
1817   ///
1818   /// The last team to write to the scratchpad array proceeds to reduce the
1819   /// scratchpad array.  One or more workers in the last team use the helper
1820   /// 'loadAndReduceDataFn' to load and reduce values from the array, i.e.,
1821   /// the k'th worker reduces every k'th element.
1822   ///
1823   /// Finally, a call is made to '__kmpc_nvptx_parallel_reduce_nowait_v2' to
1824   /// reduce across workers and compute a globally reduced value.
1825   ///
  /// \param Loc                The location where the reduction was
  ///                           encountered. Must be within the associated
  ///                           directive and after the last local access to the
  ///                           reduction variables.
  /// \param AllocaIP           An insertion point suitable for allocas usable
  ///                           in reductions.
  /// \param CodeGenIP          An insertion point suitable for code generation.
  /// \param ReductionInfos     A list of info on each reduction variable.
  /// \param IsNoWait           Optional flag set if the reduction is marked as
  ///                           nowait.
  /// \param IsTeamsReduction   Optional flag set if it is a teams
  ///                           reduction.
  /// \param HasDistribute      Optional flag set if it is a
  ///                           distribute reduction.
  /// \param ReductionGenCBKind Optional param selecting whether the Clang or
  ///                           MLIR reduction-body callback is used
  ///                           (defaults to MLIR).
  /// \param GridValue          Optional GPU grid value.
  /// \param ReductionBufNum    Optional OpenMPCUDAReductionBufNumValue to be
  ///                           used for teams reduction.
  /// \param SrcLocInfo         Source location information global.
  InsertPointTy createReductionsGPU(
      const LocationDescription &Loc, InsertPointTy AllocaIP,
      InsertPointTy CodeGenIP, ArrayRef<ReductionInfo> ReductionInfos,
      bool IsNoWait = false, bool IsTeamsReduction = false,
      bool HasDistribute = false,
      ReductionGenCBKind ReductionGenCBKind = ReductionGenCBKind::MLIR,
      std::optional<omp::GV> GridValue = {}, unsigned ReductionBufNum = 1024,
      Value *SrcLocInfo = nullptr);
1853 
1854   // TODO: provide atomic and non-atomic reduction generators for reduction
1855   // operators defined by the OpenMP specification.
1856 
  /// Generator for '#omp reduction'.
  ///
  /// Emits the IR instructing the runtime to perform the specific kind of
  /// reductions. Expects reduction variables to have been privatized and
  /// initialized to reduction-neutral values separately. Emits the calls to
  /// runtime functions as well as the reduction function and the basic blocks
  /// performing the reduction atomically and non-atomically.
  ///
  /// The code emitted for the following:
  ///
  /// \code
  ///   type var_1;
  ///   type var_2;
  ///   #pragma omp <directive> reduction(reduction-op:var_1,var_2)
  ///   /* body */;
  /// \endcode
  ///
  /// corresponds to the following sketch.
  ///
  /// \code
  /// void _outlined_par() {
  ///   // N is the number of different reductions.
  ///   void *red_array[] = {privatized_var_1, privatized_var_2, ...};
  ///   switch(__kmpc_reduce(..., N, /*size of data in red array*/, red_array,
  ///                        _omp_reduction_func,
  ///                        _gomp_critical_user.reduction.var)) {
  ///   case 1: {
  ///     var_1 = var_1 <reduction-op> privatized_var_1;
  ///     var_2 = var_2 <reduction-op> privatized_var_2;
  ///     // ...
  ///     __kmpc_end_reduce(...);
  ///     break;
  ///   }
  ///   case 2: {
  ///     _Atomic<ReductionOp>(var_1, privatized_var_1);
  ///     _Atomic<ReductionOp>(var_2, privatized_var_2);
  ///     // ...
  ///     break;
  ///   }
  ///   default: break;
  ///   }
  /// }
  ///
  /// void _omp_reduction_func(void **lhs, void **rhs) {
  ///   *(type *)lhs[0] = *(type *)lhs[0] <reduction-op> *(type *)rhs[0];
  ///   *(type *)lhs[1] = *(type *)lhs[1] <reduction-op> *(type *)rhs[1];
  ///   // ...
  /// }
  /// \endcode
  ///
  /// \param Loc            The location where the reduction was
  ///                       encountered. Must be within the associated
  ///                       directive and after the last local access to the
  ///                       reduction variables.
  /// \param AllocaIP       An insertion point suitable for allocas usable
  ///                       in reductions.
  /// \param ReductionInfos A list of info on each reduction variable.
  /// \param IsByRef        For each reduction variable, a flag set if the
  ///                       reduction uses a reference rather than a direct
  ///                       value.
  /// \param IsNoWait       A flag set if the reduction is marked as nowait.
  InsertPointTy createReductions(const LocationDescription &Loc,
                                 InsertPointTy AllocaIP,
                                 ArrayRef<ReductionInfo> ReductionInfos,
                                 ArrayRef<bool> IsByRef, bool IsNoWait = false);
1921 
1922   ///}
1923 
1924   /// Return the insertion point used by the underlying IRBuilder.
getInsertionPoint()1925   InsertPointTy getInsertionPoint() { return Builder.saveIP(); }
1926 
1927   /// Update the internal location to \p Loc.
updateToLocation(const LocationDescription & Loc)1928   bool updateToLocation(const LocationDescription &Loc) {
1929     Builder.restoreIP(Loc.IP);
1930     Builder.SetCurrentDebugLocation(Loc.DL);
1931     return Loc.IP.getBlock() != nullptr;
1932   }
1933 
  /// Return the function declaration for the runtime function with \p FnID.
  FunctionCallee getOrCreateRuntimeFunction(Module &M,
                                            omp::RuntimeFunction FnID);

  /// Return the runtime function with \p FnID as a Function *.
  Function *getOrCreateRuntimeFunctionPtr(omp::RuntimeFunction FnID);

  /// Return the (LLVM-IR) string describing the source location \p LocStr.
  Constant *getOrCreateSrcLocStr(StringRef LocStr, uint32_t &SrcLocStrSize);

  /// Return the (LLVM-IR) string describing the default source location.
  Constant *getOrCreateDefaultSrcLocStr(uint32_t &SrcLocStrSize);

  /// Return the (LLVM-IR) string describing the source location identified by
  /// the arguments.
  Constant *getOrCreateSrcLocStr(StringRef FunctionName, StringRef FileName,
                                 unsigned Line, unsigned Column,
                                 uint32_t &SrcLocStrSize);

  /// Return the (LLVM-IR) string describing the DebugLoc \p DL. Use \p F as
  /// fallback if \p DL does not specify the function name.
  Constant *getOrCreateSrcLocStr(DebugLoc DL, uint32_t &SrcLocStrSize,
                                 Function *F = nullptr);

  /// Return the (LLVM-IR) string describing the source location \p Loc.
  Constant *getOrCreateSrcLocStr(const LocationDescription &Loc,
                                 uint32_t &SrcLocStrSize);

  /// Return an ident_t* encoding the source location \p SrcLocStr and \p Flags.
  /// TODO: Create a enum class for the Reserve2Flags
  Constant *getOrCreateIdent(Constant *SrcLocStr, uint32_t SrcLocStrSize,
                             omp::IdentFlag Flags = omp::IdentFlag(0),
                             unsigned Reserve2Flags = 0);

  /// Create a hidden global flag \p Name in the module with initial value \p
  /// Value.
  GlobalValue *createGlobalFlag(unsigned Value, StringRef Name);

  /// Generate control flow and cleanup for cancellation.
  ///
  /// \param CancelFlag Flag indicating if the cancellation is performed.
  /// \param CanceledDirective The kind of directive that is cancelled.
  /// \param ExitCB Extra code to be generated in the exit block.
  void emitCancelationCheckImpl(Value *CancelFlag,
                                omp::Directive CanceledDirective,
                                FinalizeCallbackTy ExitCB = {});

  /// Generate a target region entry call.
  ///
  /// \param Loc The location at which the request originated and is fulfilled.
  /// \param AllocaIP The insertion point to be used for alloca instructions.
  /// \param Return Return value of the created function returned by reference.
  /// \param DeviceID Identifier for the device via the 'device' clause.
  /// \param NumTeams Number of teams for the region via the 'num_teams' clause
  ///                 or 0 if unspecified and -1 if there is no 'teams' clause.
  /// \param NumThreads Number of threads via the 'thread_limit' clause.
  /// \param HostPtr Pointer to the host-side pointer of the target kernel.
  /// \param KernelArgs Array of arguments to the kernel.
  InsertPointTy emitTargetKernel(const LocationDescription &Loc,
                                 InsertPointTy AllocaIP, Value *&Return,
                                 Value *Ident, Value *DeviceID, Value *NumTeams,
                                 Value *NumThreads, Value *HostPtr,
                                 ArrayRef<Value *> KernelArgs);

  /// Generate a flush runtime call.
  ///
  /// \param Loc The location at which the request originated and is fulfilled.
  void emitFlush(const LocationDescription &Loc);

  /// The finalization stack made up of finalize callbacks currently in-flight,
  /// wrapped into FinalizationInfo objects that reference also the finalization
  /// target block and the kind of cancellable directive.
  SmallVector<FinalizationInfo, 8> FinalizationStack;
2006 
2007   /// Return true if the last entry in the finalization stack is of kind \p DK
2008   /// and cancellable.
isLastFinalizationInfoCancellable(omp::Directive DK)2009   bool isLastFinalizationInfoCancellable(omp::Directive DK) {
2010     return !FinalizationStack.empty() &&
2011            FinalizationStack.back().IsCancellable &&
2012            FinalizationStack.back().DK == DK;
2013   }
2014 
  /// Generate a taskwait runtime call.
  ///
  /// \param Loc The location at which the request originated and is fulfilled.
  void emitTaskwaitImpl(const LocationDescription &Loc);

  /// Generate a taskyield runtime call.
  ///
  /// \param Loc The location at which the request originated and is fulfilled.
  void emitTaskyieldImpl(const LocationDescription &Loc);

  /// Return the current thread ID.
  ///
  /// \param Ident The ident (ident_t*) describing the query origin.
  Value *getOrCreateThreadID(Value *Ident);

  /// The OpenMPIRBuilder Configuration.
  OpenMPIRBuilderConfig Config;

  /// The underlying LLVM-IR module.
  Module &M;

  /// The LLVM-IR Builder used to create IR.
  IRBuilder<> Builder;

  /// Map to remember source location strings.
  StringMap<Constant *> SrcLocStrMap;

  /// Map to remember existing ident_t*.
  DenseMap<std::pair<Constant *, uint64_t>, Constant *> IdentMap;

  /// Info manager to keep track of target regions.
  OffloadEntriesInfoManager OffloadInfoManager;

  /// The target triple of the underlying module.
  const Triple T;
2050 
  /// Helper that contains information about regions we need to outline
  /// during finalization.
  struct OutlineInfo {
    using PostOutlineCBTy = std::function<void(Function &)>;
    /// Callback invoked on the outlined function once outlining is done.
    PostOutlineCBTy PostOutlineCB;
    /// First and last basic block of the region to outline, plus the block
    /// holding the allocas of the enclosing function.
    BasicBlock *EntryBB, *ExitBB, *OuterAllocaBB;
    /// Values to exclude from the outlined function's aggregate argument;
    /// they are passed individually instead.
    SmallVector<Value *, 2> ExcludeArgsFromAggregate;

    /// Collect all blocks in between EntryBB and ExitBB in both the given
    /// vector and set.
    void collectBlocks(SmallPtrSetImpl<BasicBlock *> &BlockSet,
                       SmallVectorImpl<BasicBlock *> &BlockVector);

    /// Return the function that contains the region to be outlined.
    Function *getFunction() const { return EntryBB->getParent(); }
  };
2067 
  /// Collection of regions that need to be outlined during finalization.
  SmallVector<OutlineInfo, 16> OutlineInfos;

  /// A collection of candidate target functions whose constant allocas will
  /// attempt to be raised on a call of finalize after all currently enqueued
  /// outline infos have been processed.
  SmallVector<llvm::Function *, 16> ConstantAllocaRaiseCandidates;

  /// Collection of owned canonical loop objects that eventually need to be
  /// free'd.
  std::forward_list<CanonicalLoopInfo> LoopInfos;
2079 
2080   /// Add a new region that will be outlined later.
addOutlineInfo(OutlineInfo && OI)2081   void addOutlineInfo(OutlineInfo &&OI) { OutlineInfos.emplace_back(OI); }
2082 
  /// An ordered map of auto-generated variables to their unique names.
  /// It stores variables with the following names: 1) ".gomp_critical_user_" +
  /// <critical_section_name> + ".var" for "omp critical" directives; 2)
  /// <mangled_name_for_global_var> + ".cache." for cache for threadprivate
  /// variables.
  StringMap<GlobalVariable *, BumpPtrAllocator> InternalVars;

  /// Computes the size of type in bytes.
  Value *getSizeInBytes(Value *BasePtr);

  // Emit a branch from the current block to the Target block only if
  // the current block has a terminator.
  void emitBranch(BasicBlock *Target);

  // If BB has no use then delete it and return. Else place BB after the current
  // block, if possible, or else at the end of the function. Also add a branch
  // from current block to BB if current block does not have a terminator.
  void emitBlock(BasicBlock *BB, Function *CurFn, bool IsFinished = false);
2101 
  /// Emits code for OpenMP 'if' clause using specified \a BodyGenCallbackTy
  /// Here is the logic:
  /// if (Cond) {
  ///   ThenGen();
  /// } else {
  ///   ElseGen();
  /// }
  void emitIfClause(Value *Cond, BodyGenCallbackTy ThenGen,
                    BodyGenCallbackTy ElseGen, InsertPointTy AllocaIP = {});

  /// Create the global variable holding the offload mappings information.
  GlobalVariable *createOffloadMaptypes(SmallVectorImpl<uint64_t> &Mappings,
                                        std::string VarName);

  /// Create the global variable holding the offload names information.
  GlobalVariable *
  createOffloadMapnames(SmallVectorImpl<llvm::Constant *> &Names,
                        std::string VarName);

  /// Allocas for the array arguments passed to mapper runtime calls (see
  /// createMapperAllocas / emitMapperCall below).
  struct MapperAllocas {
    AllocaInst *ArgsBase = nullptr;
    AllocaInst *Args = nullptr;
    AllocaInst *ArgSizes = nullptr;
  };

  /// Create the allocas instruction used in call to mapper functions.
  void createMapperAllocas(const LocationDescription &Loc,
                           InsertPointTy AllocaIP, unsigned NumOperands,
                           struct MapperAllocas &MapperAllocas);

  /// Create the call for the target mapper function.
  /// \param Loc The source location description.
  /// \param MapperFunc Function to be called.
  /// \param SrcLocInfo Source location information global.
  /// \param MaptypesArg The argument types.
  /// \param MapnamesArg The argument names.
  /// \param MapperAllocas The AllocaInst used for the call.
  /// \param DeviceID Device ID for the call.
  /// \param NumOperands Number of operands in the call.
  void emitMapperCall(const LocationDescription &Loc, Function *MapperFunc,
                      Value *SrcLocInfo, Value *MaptypesArg, Value *MapnamesArg,
                      struct MapperAllocas &MapperAllocas, int64_t DeviceID,
                      unsigned NumOperands);
2145 
  /// Container for the arguments used to pass data to the runtime library.
  struct TargetDataRTArgs {
    /// The array of base pointer passed to the runtime library.
    Value *BasePointersArray = nullptr;
    /// The array of section pointers passed to the runtime library.
    Value *PointersArray = nullptr;
    /// The array of sizes passed to the runtime library.
    Value *SizesArray = nullptr;
    /// The array of map types passed to the runtime library for the beginning
    /// of the region or for the entire region if there are no separate map
    /// types for the region end.
    Value *MapTypesArray = nullptr;
    /// The array of map types passed to the runtime library for the end of the
    /// region, or nullptr if there are no separate map types for the region
    /// end.
    Value *MapTypesArrayEnd = nullptr;
    /// The array of user-defined mappers passed to the runtime library.
    Value *MappersArray = nullptr;
    /// The array of original declaration names of mapped pointers sent to the
    /// runtime library for debugging.
    Value *MapNamesArray = nullptr;

    /// Construct with all arrays unset (nullptr).
    explicit TargetDataRTArgs() {}
    /// Construct from already-emitted argument arrays.
    explicit TargetDataRTArgs(Value *BasePointersArray, Value *PointersArray,
                              Value *SizesArray, Value *MapTypesArray,
                              Value *MapTypesArrayEnd, Value *MappersArray,
                              Value *MapNamesArray)
        : BasePointersArray(BasePointersArray), PointersArray(PointersArray),
          SizesArray(SizesArray), MapTypesArray(MapTypesArray),
          MapTypesArrayEnd(MapTypesArrayEnd), MappersArray(MappersArray),
          MapNamesArray(MapNamesArray) {}
  };
2178 
  /// Data structure that contains the needed information to construct the
  /// kernel args vector.
  struct TargetKernelArgs {
    /// Number of arguments passed to the runtime library.
    unsigned NumTargetItems;
    /// Arguments passed to the runtime library.
    TargetDataRTArgs RTArgs;
    /// The number of iterations.
    Value *NumIterations;
    /// The number of teams.
    Value *NumTeams;
    /// The number of threads.
    Value *NumThreads;
    /// The size of the dynamic shared memory.
    Value *DynCGGroupMem;
    /// True if the kernel has 'no wait' clause.
    bool HasNoWait;

    /// Constructor for TargetKernelArgs.
    TargetKernelArgs(unsigned NumTargetItems, TargetDataRTArgs RTArgs,
                     Value *NumIterations, Value *NumTeams, Value *NumThreads,
                     Value *DynCGGroupMem, bool HasNoWait)
        : NumTargetItems(NumTargetItems), RTArgs(RTArgs),
          NumIterations(NumIterations), NumTeams(NumTeams),
          NumThreads(NumThreads), DynCGGroupMem(DynCGGroupMem),
          HasNoWait(HasNoWait) {}
  };
2206 
  /// Create the kernel args vector used by emitTargetKernel. This function
  /// creates various constant values that are used in the resulting args
  /// vector.
  static void getKernelArgsVector(TargetKernelArgs &KernelArgs,
                                  IRBuilderBase &Builder,
                                  SmallVector<Value *> &ArgsVector);
2213 
2214   /// Struct that keeps the information that should be kept throughout
2215   /// a 'target data' region.
2216   class TargetDataInfo {
2217     /// Set to true if device pointer information have to be obtained.
2218     bool RequiresDevicePointerInfo = false;
2219     /// Set to true if Clang emits separate runtime calls for the beginning and
2220     /// end of the region.  These calls might have separate map type arrays.
2221     bool SeparateBeginEndCalls = false;
2222 
2223   public:
2224     TargetDataRTArgs RTArgs;
2225 
2226     SmallMapVector<const Value *, std::pair<Value *, Value *>, 4>
2227         DevicePtrInfoMap;
2228 
2229     /// Indicate whether any user-defined mapper exists.
2230     bool HasMapper = false;
2231     /// The total number of pointers passed to the runtime library.
2232     unsigned NumberOfPtrs = 0u;
2233 
TargetDataInfo()2234     explicit TargetDataInfo() {}
TargetDataInfo(bool RequiresDevicePointerInfo,bool SeparateBeginEndCalls)2235     explicit TargetDataInfo(bool RequiresDevicePointerInfo,
2236                             bool SeparateBeginEndCalls)
2237         : RequiresDevicePointerInfo(RequiresDevicePointerInfo),
2238           SeparateBeginEndCalls(SeparateBeginEndCalls) {}
2239     /// Clear information about the data arrays.
clearArrayInfo()2240     void clearArrayInfo() {
2241       RTArgs = TargetDataRTArgs();
2242       HasMapper = false;
2243       NumberOfPtrs = 0u;
2244     }
2245     /// Return true if the current target data information has valid arrays.
isValid()2246     bool isValid() {
2247       return RTArgs.BasePointersArray && RTArgs.PointersArray &&
2248              RTArgs.SizesArray && RTArgs.MapTypesArray &&
2249              (!HasMapper || RTArgs.MappersArray) && NumberOfPtrs;
2250     }
requiresDevicePointerInfo()2251     bool requiresDevicePointerInfo() { return RequiresDevicePointerInfo; }
separateBeginEndCalls()2252     bool separateBeginEndCalls() { return SeparateBeginEndCalls; }
2253   };
2254 
  /// Kind of device information attached to a mapped value.
  enum class DeviceInfoTy { None, Pointer, Address };
  using MapValuesArrayTy = SmallVector<Value *, 4>;
  using MapDeviceInfoArrayTy = SmallVector<DeviceInfoTy, 4>;
  using MapFlagsArrayTy = SmallVector<omp::OpenMPOffloadMappingFlags, 4>;
  using MapNamesArrayTy = SmallVector<Constant *, 4>;
  using MapDimArrayTy = SmallVector<uint64_t, 4>;
  using MapNonContiguousArrayTy = SmallVector<MapValuesArrayTy, 4>;
2262 
  /// This structure contains combined information generated for mappable
  /// clauses, including base pointers, pointers, sizes, map types, user-defined
  /// mappers, and non-contiguous information.
  struct MapInfosTy {
    /// Per-dimension descriptors for non-contiguous mappings.
    struct StructNonContiguousInfo {
      bool IsNonContiguous = false;
      MapDimArrayTy Dims;
      MapNonContiguousArrayTy Offsets;
      MapNonContiguousArrayTy Counts;
      MapNonContiguousArrayTy Strides;
    };
    MapValuesArrayTy BasePointers;
    MapValuesArrayTy Pointers;
    MapDeviceInfoArrayTy DevicePointers;
    MapValuesArrayTy Sizes;
    MapFlagsArrayTy Types;
    MapNamesArrayTy Names;
    StructNonContiguousInfo NonContigInfo;

    /// Append arrays in \a CurInfo. Every parallel array (including the
    /// non-contiguous descriptors) is extended so entries stay index-aligned.
    void append(MapInfosTy &CurInfo) {
      BasePointers.append(CurInfo.BasePointers.begin(),
                          CurInfo.BasePointers.end());
      Pointers.append(CurInfo.Pointers.begin(), CurInfo.Pointers.end());
      DevicePointers.append(CurInfo.DevicePointers.begin(),
                            CurInfo.DevicePointers.end());
      Sizes.append(CurInfo.Sizes.begin(), CurInfo.Sizes.end());
      Types.append(CurInfo.Types.begin(), CurInfo.Types.end());
      Names.append(CurInfo.Names.begin(), CurInfo.Names.end());
      NonContigInfo.Dims.append(CurInfo.NonContigInfo.Dims.begin(),
                                CurInfo.NonContigInfo.Dims.end());
      NonContigInfo.Offsets.append(CurInfo.NonContigInfo.Offsets.begin(),
                                   CurInfo.NonContigInfo.Offsets.end());
      NonContigInfo.Counts.append(CurInfo.NonContigInfo.Counts.begin(),
                                  CurInfo.NonContigInfo.Counts.end());
      NonContigInfo.Strides.append(CurInfo.NonContigInfo.Strides.begin(),
                                   CurInfo.NonContigInfo.Strides.end());
    }
  };
2302 
  /// Callback function type for functions emitting the host fallback code that
  /// is executed when the kernel launch fails. It takes an insertion point as
  /// parameter where the code should be emitted. It returns an insertion point
  /// that points right after the emitted code.
  using EmitFallbackCallbackTy = function_ref<InsertPointTy(InsertPointTy)>;

  /// Generate a target region entry call and host fallback call.
  ///
  /// \param Loc The location at which the request originated and is fulfilled.
  /// \param OutlinedFn The outlined kernel function.
  /// \param OutlinedFnID The outlined function ID.
  /// \param EmitTargetCallFallbackCB Call back function to generate host
  ///        fallback code.
  /// \param Args Data structure holding information about the kernel arguments.
  /// \param DeviceID Identifier for the device via the 'device' clause.
  /// \param RTLoc Source location identifier.
  /// \param AllocaIP The insertion point to be used for alloca instructions.
  InsertPointTy emitKernelLaunch(
      const LocationDescription &Loc, Function *OutlinedFn, Value *OutlinedFnID,
      EmitFallbackCallbackTy EmitTargetCallFallbackCB, TargetKernelArgs &Args,
      Value *DeviceID, Value *RTLoc, InsertPointTy AllocaIP);

  /// Generate a target-task for the target construct
  ///
  /// \param OutlinedFn The outlined device/target kernel function.
  /// \param OutlinedFnID The outlined function ID.
  /// \param EmitTargetCallFallbackCB Call back function to generate host
  ///        fallback code.
  /// \param Args Data structure holding information about the kernel arguments.
  /// \param DeviceID Identifier for the device via the 'device' clause.
  /// \param RTLoc Source location identifier.
  /// \param AllocaIP The insertion point to be used for alloca instructions.
  /// \param Dependencies Vector of DependData objects holding information of
  ///        dependencies as specified by the 'depend' clause.
  /// \param HasNoWait True if the target construct had 'nowait' on it, false
  ///        otherwise.
  InsertPointTy emitTargetTask(
      Function *OutlinedFn, Value *OutlinedFnID,
      EmitFallbackCallbackTy EmitTargetCallFallbackCB, TargetKernelArgs &Args,
      Value *DeviceID, Value *RTLoc, InsertPointTy AllocaIP,
      SmallVector<OpenMPIRBuilder::DependData> &Dependencies, bool HasNoWait);

  /// Emit the arguments to be passed to the runtime library based on the
  /// arrays of base pointers, pointers, sizes, map types, and mappers.  If
  /// ForEndCall, emit map types to be passed for the end of the region instead
  /// of the beginning.
  void emitOffloadingArraysArgument(IRBuilderBase &Builder,
                                    OpenMPIRBuilder::TargetDataRTArgs &RTArgs,
                                    OpenMPIRBuilder::TargetDataInfo &Info,
                                    bool EmitDebug = false,
                                    bool ForEndCall = false);

  /// Emit an array of struct descriptors to be assigned to the offload args.
  void emitNonContiguousDescriptor(InsertPointTy AllocaIP,
                                   InsertPointTy CodeGenIP,
                                   MapInfosTy &CombinedInfo,
                                   TargetDataInfo &Info);

  /// Emit the arrays used to pass the captures and map information to the
  /// offloading runtime library. If there is no map or capture information,
  /// return nullptr by reference.
  void emitOffloadingArrays(
      InsertPointTy AllocaIP, InsertPointTy CodeGenIP, MapInfosTy &CombinedInfo,
      TargetDataInfo &Info, bool IsNonContiguous = false,
      function_ref<void(unsigned int, Value *)> DeviceAddrCB = nullptr,
      function_ref<Value *(unsigned int)> CustomMapperCB = nullptr);

  /// Creates offloading entry for the provided entry ID \a ID, address \a
  /// Addr, size \a Size, and flags \a Flags.
  void createOffloadEntry(Constant *ID, Constant *Addr, uint64_t Size,
                          int32_t Flags, GlobalValue::LinkageTypes,
                          StringRef Name = "");

  /// The kind of errors that can occur when emitting the offload entries and
  /// metadata.
  enum EmitMetadataErrorKind {
    EMIT_MD_TARGET_REGION_ERROR,
    EMIT_MD_DECLARE_TARGET_ERROR,
    EMIT_MD_GLOBAL_VAR_LINK_ERROR
  };

  /// Callback function type used to report errors while emitting the offload
  /// entries and metadata.
  using EmitMetadataErrorReportFunctionTy =
      std::function<void(EmitMetadataErrorKind, TargetRegionEntryInfo)>;

  // Emit the offloading entries and metadata so that the device codegen side
  // can easily figure out what to emit. The produced metadata looks like
  // this:
  //
  // !omp_offload.info = !{!1, ...}
  //
  // We only generate metadata for function that contain target regions.
  void createOffloadEntriesAndInfoMetadata(
      EmitMetadataErrorReportFunctionTy &ErrorReportFunction);
2397 
public:
  /// Generator for __kmpc_copyprivate
  ///
  /// \param Loc The source location description.
  /// \param BufSize Number of elements in the buffer.
  /// \param CpyBuf List of pointers to data to be copied.
  /// \param CpyFn function to call for copying data.
  /// \param DidIt flag variable; 1 for 'single' thread, 0 otherwise.
  ///
  /// \return The insertion position *after* the CopyPrivate call.
  InsertPointTy createCopyPrivate(const LocationDescription &Loc,
                                  llvm::Value *BufSize, llvm::Value *CpyBuf,
                                  llvm::Value *CpyFn, llvm::Value *DidIt);

  /// Generator for '#omp single'
  ///
  /// \param Loc The source location description.
  /// \param BodyGenCB Callback that will generate the region code.
  /// \param FiniCB Callback to finalize variable copies.
  /// \param IsNowait If false, a barrier is emitted.
  /// \param CPVars copyprivate variables.
  /// \param CPFuncs copy functions to use for each copyprivate variable.
  ///
  /// \returns The insertion position *after* the single call.
  InsertPointTy createSingle(const LocationDescription &Loc,
                             BodyGenCallbackTy BodyGenCB,
                             FinalizeCallbackTy FiniCB, bool IsNowait,
                             ArrayRef<llvm::Value *> CPVars = {},
                             ArrayRef<llvm::Function *> CPFuncs = {});

  /// Generator for '#omp master'
  ///
  /// \param Loc The insert and source location description.
  /// \param BodyGenCB Callback that will generate the region code.
  /// \param FiniCB Callback to finalize variable copies.
  ///
  /// \returns The insertion position *after* the master.
  InsertPointTy createMaster(const LocationDescription &Loc,
                             BodyGenCallbackTy BodyGenCB,
                             FinalizeCallbackTy FiniCB);

  /// Generator for '#omp masked'
  ///
  /// \param Loc The insert and source location description.
  /// \param BodyGenCB Callback that will generate the region code.
  /// \param FiniCB Callback to finalize variable copies.
  ///
  /// \returns The insertion position *after* the masked.
  InsertPointTy createMasked(const LocationDescription &Loc,
                             BodyGenCallbackTy BodyGenCB,
                             FinalizeCallbackTy FiniCB, Value *Filter);

  /// Generator for '#omp critical'
  ///
  /// \param Loc The insert and source location description.
  /// \param BodyGenCB Callback that will generate the region body code.
  /// \param FiniCB Callback to finalize variable copies.
  /// \param CriticalName name of the lock used by the critical directive
  /// \param HintInst Hint Instruction for hint clause associated with critical
  ///
  /// \returns The insertion position *after* the critical.
  InsertPointTy createCritical(const LocationDescription &Loc,
                               BodyGenCallbackTy BodyGenCB,
                               FinalizeCallbackTy FiniCB,
                               StringRef CriticalName, Value *HintInst);

  /// Generator for '#omp ordered depend (source | sink)'
  ///
  /// \param Loc The insert and source location description.
  /// \param AllocaIP The insertion point to be used for alloca instructions.
  /// \param NumLoops The number of loops in depend clause.
  /// \param StoreValues The value will be stored in vector address.
  /// \param Name The name of alloca instruction.
  /// \param IsDependSource If true, depend source; otherwise, depend sink.
  ///
  /// \return The insertion position *after* the ordered.
  InsertPointTy createOrderedDepend(const LocationDescription &Loc,
                                    InsertPointTy AllocaIP, unsigned NumLoops,
                                    ArrayRef<llvm::Value *> StoreValues,
                                    const Twine &Name, bool IsDependSource);

  /// Generator for '#omp ordered [threads | simd]'
  ///
  /// \param Loc The insert and source location description.
  /// \param BodyGenCB Callback that will generate the region code.
  /// \param FiniCB Callback to finalize variable copies.
  /// \param IsThreads If true, with threads clause or without clause;
  /// otherwise, with simd clause;
  ///
  /// \returns The insertion position *after* the ordered.
  InsertPointTy createOrderedThreadsSimd(const LocationDescription &Loc,
                                         BodyGenCallbackTy BodyGenCB,
                                         FinalizeCallbackTy FiniCB,
                                         bool IsThreads);
2493 
2494   /// Generator for '#omp sections'
2495   ///
2496   /// \param Loc The insert and source location description.
2497   /// \param AllocaIP The insertion points to be used for alloca instructions.
2498   /// \param SectionCBs Callbacks that will generate body of each section.
2499   /// \param PrivCB Callback to copy a given variable (think copy constructor).
2500   /// \param FiniCB Callback to finalize variable copies.
2501   /// \param IsCancellable Flag to indicate a cancellable parallel region.
2502   /// \param IsNowait If true, barrier - to ensure all sections are executed
2503   /// before moving forward will not be generated.
2504   /// \returns The insertion position *after* the sections.
2505   InsertPointTy createSections(const LocationDescription &Loc,
2506                                InsertPointTy AllocaIP,
2507                                ArrayRef<StorableBodyGenCallbackTy> SectionCBs,
2508                                PrivatizeCallbackTy PrivCB,
2509                                FinalizeCallbackTy FiniCB, bool IsCancellable,
2510                                bool IsNowait);
2511 
2512   /// Generator for '#omp section'
2513   ///
2514   /// \param Loc The insert and source location description.
2515   /// \param BodyGenCB Callback that will generate the region body code.
2516   /// \param FiniCB Callback to finalize variable copies.
2517   /// \returns The insertion position *after* the section.
2518   InsertPointTy createSection(const LocationDescription &Loc,
2519                               BodyGenCallbackTy BodyGenCB,
2520                               FinalizeCallbackTy FiniCB);
2521 
2522   /// Generator for `#omp teams`
2523   ///
2524   /// \param Loc The location where the teams construct was encountered.
2525   /// \param BodyGenCB Callback that will generate the region code.
2526   /// \param NumTeamsLower Lower bound on number of teams. If this is nullptr,
2527   ///        it is as if lower bound is specified as equal to upperbound. If
2528   ///        this is non-null, then upperbound must also be non-null.
2529   /// \param NumTeamsUpper Upper bound on the number of teams.
2530   /// \param ThreadLimit on the number of threads that may participate in a
2531   ///        contention group created by each team.
2532   /// \param IfExpr is the integer argument value of the if condition on the
2533   ///        teams clause.
2534   InsertPointTy
2535   createTeams(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB,
2536               Value *NumTeamsLower = nullptr, Value *NumTeamsUpper = nullptr,
2537               Value *ThreadLimit = nullptr, Value *IfExpr = nullptr);
2538 
2539   /// Generate conditional branch and relevant BasicBlocks through which private
2540   /// threads copy the 'copyin' variables from Master copy to threadprivate
2541   /// copies.
2542   ///
2543   /// \param IP insertion block for copyin conditional
2544   /// \param MasterAddr a pointer to the master variable
2545   /// \param PrivateAddr a pointer to the threadprivate variable
2546   /// \param IntPtrTy Pointer size type
2547   /// \param BranchtoEnd Create a branch between the copyin.not.master blocks
2548   ///        and the copy.in.end block
2549   ///
2550   /// \returns The insertion point where the copying operation is to be emitted.
2551   InsertPointTy createCopyinClauseBlocks(InsertPointTy IP, Value *MasterAddr,
2552                                          Value *PrivateAddr,
2553                                          llvm::IntegerType *IntPtrTy,
2554                                          bool BranchtoEnd = true);
2555 
2556   /// Create a runtime call for kmpc_Alloc
2557   ///
2558   /// \param Loc The insert and source location description.
2559   /// \param Size Size of allocated memory space
2560   /// \param Allocator Allocator information instruction
2561   /// \param Name Name of call Instruction for OMP_alloc
2562   ///
2563   /// \returns CallInst to the OMP_Alloc call
2564   CallInst *createOMPAlloc(const LocationDescription &Loc, Value *Size,
2565                            Value *Allocator, std::string Name = "");
2566 
2567   /// Create a runtime call for kmpc_free
2568   ///
2569   /// \param Loc The insert and source location description.
2570   /// \param Addr Address of memory space to be freed
2571   /// \param Allocator Allocator information instruction
2572   /// \param Name Name of call Instruction for OMP_Free
2573   ///
2574   /// \returns CallInst to the OMP_Free call
2575   CallInst *createOMPFree(const LocationDescription &Loc, Value *Addr,
2576                           Value *Allocator, std::string Name = "");
2577 
2578   /// Create a runtime call for kmpc_threadprivate_cached
2579   ///
2580   /// \param Loc The insert and source location description.
2581   /// \param Pointer pointer to data to be cached
2582   /// \param Size size of data to be cached
2583   /// \param Name Name of call Instruction for callinst
2584   ///
2585   /// \returns CallInst to the thread private cache call.
2586   CallInst *createCachedThreadPrivate(const LocationDescription &Loc,
2587                                       llvm::Value *Pointer,
2588                                       llvm::ConstantInt *Size,
2589                                       const llvm::Twine &Name = Twine(""));
2590 
2591   /// Create a runtime call for __tgt_interop_init
2592   ///
2593   /// \param Loc The insert and source location description.
2594   /// \param InteropVar variable to be allocated
2595   /// \param InteropType type of interop operation
2596   /// \param Device device to which offloading will occur
2597   /// \param NumDependences number of dependence variables
2598   /// \param DependenceAddress pointer to dependence variables
2599   /// \param HaveNowaitClause does nowait clause exist
2600   ///
2601   /// \returns CallInst to the __tgt_interop_init call
2602   CallInst *createOMPInteropInit(const LocationDescription &Loc,
2603                                  Value *InteropVar,
2604                                  omp::OMPInteropType InteropType, Value *Device,
2605                                  Value *NumDependences,
2606                                  Value *DependenceAddress,
2607                                  bool HaveNowaitClause);
2608 
2609   /// Create a runtime call for __tgt_interop_destroy
2610   ///
2611   /// \param Loc The insert and source location description.
2612   /// \param InteropVar variable to be allocated
2613   /// \param Device device to which offloading will occur
2614   /// \param NumDependences number of dependence variables
2615   /// \param DependenceAddress pointer to dependence variables
2616   /// \param HaveNowaitClause does nowait clause exist
2617   ///
2618   /// \returns CallInst to the __tgt_interop_destroy call
2619   CallInst *createOMPInteropDestroy(const LocationDescription &Loc,
2620                                     Value *InteropVar, Value *Device,
2621                                     Value *NumDependences,
2622                                     Value *DependenceAddress,
2623                                     bool HaveNowaitClause);
2624 
2625   /// Create a runtime call for __tgt_interop_use
2626   ///
2627   /// \param Loc The insert and source location description.
2628   /// \param InteropVar variable to be allocated
2629   /// \param Device device to which offloading will occur
2630   /// \param NumDependences number of dependence variables
2631   /// \param DependenceAddress pointer to dependence variables
2632   /// \param HaveNowaitClause does nowait clause exist
2633   ///
2634   /// \returns CallInst to the __tgt_interop_use call
2635   CallInst *createOMPInteropUse(const LocationDescription &Loc,
2636                                 Value *InteropVar, Value *Device,
2637                                 Value *NumDependences, Value *DependenceAddress,
2638                                 bool HaveNowaitClause);
2639 
2640   /// The `omp target` interface
2641   ///
2642   /// For more information about the usage of this interface,
2643   /// \see openmp/libomptarget/deviceRTLs/common/include/target.h
2644   ///
2645   ///{
2646 
2647   /// Create a runtime call for kmpc_target_init
2648   ///
2649   /// \param Loc The insert and source location description.
2650   /// \param IsSPMD Flag to indicate if the kernel is an SPMD kernel or not.
2651   /// \param MinThreadsVal Minimal number of threads, or 0.
2652   /// \param MaxThreadsVal Maximal number of threads, or 0.
2653   /// \param MinTeamsVal Minimal number of teams, or 0.
2654   /// \param MaxTeamsVal Maximal number of teams, or 0.
2655   InsertPointTy createTargetInit(const LocationDescription &Loc, bool IsSPMD,
2656                                  int32_t MinThreadsVal = 0,
2657                                  int32_t MaxThreadsVal = 0,
2658                                  int32_t MinTeamsVal = 0,
2659                                  int32_t MaxTeamsVal = 0);
2660 
2661   /// Create a runtime call for kmpc_target_deinit
2662   ///
2663   /// \param Loc The insert and source location description.
2664   /// \param TeamsReductionDataSize The maximal size of all the reduction data
2665   ///        for teams reduction.
2666   /// \param TeamsReductionBufferLength The number of elements (each of up to
2667   ///        \p TeamsReductionDataSize size), in the teams reduction buffer.
2668   void createTargetDeinit(const LocationDescription &Loc,
2669                           int32_t TeamsReductionDataSize = 0,
2670                           int32_t TeamsReductionBufferLength = 1024);
2671 
2672   ///}
2673 
2674   /// Helpers to read/write kernel annotations from the IR.
2675   ///
2676   ///{
2677 
2678   /// Read/write a bounds on threads for \p Kernel. Read will return 0 if none
2679   /// is set.
2680   static std::pair<int32_t, int32_t>
2681   readThreadBoundsForKernel(const Triple &T, Function &Kernel);
2682   static void writeThreadBoundsForKernel(const Triple &T, Function &Kernel,
2683                                          int32_t LB, int32_t UB);
2684 
2685   /// Read/write a bounds on teams for \p Kernel. Read will return 0 if none
2686   /// is set.
2687   static std::pair<int32_t, int32_t> readTeamBoundsForKernel(const Triple &T,
2688                                                              Function &Kernel);
2689   static void writeTeamsForKernel(const Triple &T, Function &Kernel, int32_t LB,
2690                                   int32_t UB);
2691   ///}
2692 
2693 private:
2694   // Sets the function attributes expected for the outlined function
2695   void setOutlinedTargetRegionFunctionAttributes(Function *OutlinedFn);
2696 
2697   // Creates the function ID/Address for the given outlined function.
2698   // In the case of an embedded device function the address of the function is
2699   // used, in the case of a non-offload function a constant is created.
2700   Constant *createOutlinedFunctionID(Function *OutlinedFn,
2701                                      StringRef EntryFnIDName);
2702 
2703   // Creates the region entry address for the outlined function
2704   Constant *createTargetRegionEntryAddr(Function *OutlinedFunction,
2705                                         StringRef EntryFnName);
2706 
2707 public:
2708   /// Functions used to generate a function with the given name.
2709   using FunctionGenCallback = std::function<Function *(StringRef FunctionName)>;
2710 
2711   /// Create a unique name for the entry function using the source location
2712   /// information of the current target region. The name will be something like:
2713   ///
2714   /// __omp_offloading_DD_FFFF_PP_lBB[_CC]
2715   ///
2716   /// where DD_FFFF is an ID unique to the file (device and file IDs), PP is the
2717   /// mangled name of the function that encloses the target region and BB is the
2718   /// line number of the target region. CC is a count added when more than one
2719   /// region is located at the same location.
2720   ///
2721   /// If this target outline function is not an offload entry, we don't need to
2722   /// register it. This may happen if it is guarded by an if clause that is
2723   /// false at compile time, or no target archs have been specified.
2724   ///
2725   /// The created target region ID is used by the runtime library to identify
2726   /// the current target region, so it only has to be unique and not
2727   /// necessarily point to anything. It could be the pointer to the outlined
2728   /// function that implements the target region, but we aren't using that so
2729   /// that the compiler doesn't need to keep that, and could therefore inline
2730   /// the host function if proven worthwhile during optimization. On the other
2731   /// hand, if emitting code for the device, the ID has to be the function
2732   /// address so that it can be retrieved from the offloading entry and launched
2733   /// by the runtime library. We also mark the outlined function to have
2734   /// external linkage in case we are emitting code for the device, because
2735   /// these functions will be entry points to the device.
2736   ///
2737   /// \param EntryInfo The entry information about the function
2738   /// \param GenerateFunctionCallback The callback function to generate the code
2739   /// \param IsOffloadEntry Whether the function should be registered as an
2740   ///        offload entry (see the discussion above)
2741   /// \param OutlinedFn Returns the generated outlined function
2742   void emitTargetRegionFunction(TargetRegionEntryInfo &EntryInfo,
2743                                 FunctionGenCallback &GenerateFunctionCallback,
2744                                 bool IsOffloadEntry, Function *&OutlinedFn,
2745                                 Constant *&OutlinedFnID);
2746 
2747   /// Registers the given function and sets up the attributes of the function
2748   /// Returns the FunctionID.
2749   ///
2750   /// \param EntryInfo The entry information about the function
2751   /// \param OutlinedFunction Pointer to the outlined function
2752   /// \param EntryFnName Name of the outlined function
2753   /// \param EntryFnIDName Name of the ID to be created
2754   /// \return The constant created to serve as the function ID
2755   Constant *registerTargetRegionFunction(TargetRegionEntryInfo &EntryInfo,
2756                                          Function *OutlinedFunction,
2757                                          StringRef EntryFnName,
2758                                          StringRef EntryFnIDName);
2759 
2760   /// Type of BodyGen to use for region codegen
2761   ///
2762   /// Priv: If device pointer privatization is required, emit the body of the
2763   /// region here. It will have to be duplicated: with and without
2764   /// privatization.
2765   /// DupNoPriv: If we need device pointer privatization, we need
2766   /// to emit the body of the region with no privatization in the 'else' branch
2767   /// of the conditional.
2768   /// NoPriv: If we don't require privatization of device
2769   /// pointers, we emit the body in between the runtime calls. This avoids
2770   /// duplicating the body code.
2771   enum BodyGenTy { Priv, DupNoPriv, NoPriv };
2772 
2773   /// Callback type for creating the map infos for the kernel parameters.
2774   /// \param CodeGenIP is the insertion point where code should be generated,
2775   ///        if any.
2776   using GenMapInfoCallbackTy =
2777       function_ref<MapInfosTy &(InsertPointTy CodeGenIP)>;
2778 
2779   /// Generator for '#omp target data'
2780   ///
2781   /// \param Loc The location where the target data construct was encountered.
2782   /// \param AllocaIP The insertion points to be used for alloca instructions.
2783   /// \param CodeGenIP The insertion point at which the target directive code
2784   /// should be placed.
2785   /// \param MapperFunc Optional runtime function used for the mapper calls
2786   /// emitted for this directive (presumably the begin/end mapper entry point
2787   /// -- confirm against callers).
2788   /// \param DeviceID Stores the DeviceID from the device clause.
2789   /// \param IfCond Value which corresponds to the if clause condition.
2790   /// \param Info Stores all information related to the Target Data directive.
2791   /// \param GenMapInfoCB Callback that populates the MapInfos and returns it.
2792   /// \param BodyGenCB Optional Callback to generate the region code.
2793   /// \param DeviceAddrCB Optional callback to generate code related to
2794   /// use_device_ptr and use_device_addr.
2795   /// \param CustomMapperCB Optional callback to generate code related to
2796   /// custom mappers.
2797   OpenMPIRBuilder::InsertPointTy createTargetData(
2798       const LocationDescription &Loc, InsertPointTy AllocaIP,
2799       InsertPointTy CodeGenIP, Value *DeviceID, Value *IfCond,
2800       TargetDataInfo &Info, GenMapInfoCallbackTy GenMapInfoCB,
2801       omp::RuntimeFunction *MapperFunc = nullptr,
2802       function_ref<InsertPointTy(InsertPointTy CodeGenIP,
2803                                  BodyGenTy BodyGenType)>
2804           BodyGenCB = nullptr,
2805       function_ref<void(unsigned int, Value *)> DeviceAddrCB = nullptr,
2806       function_ref<Value *(unsigned int)> CustomMapperCB = nullptr,
2807       Value *SrcLocInfo = nullptr);
2807 
2808   using TargetBodyGenCallbackTy = function_ref<InsertPointTy(
2809       InsertPointTy AllocaIP, InsertPointTy CodeGenIP)>;
2810 
2811   using TargetGenArgAccessorsCallbackTy = function_ref<InsertPointTy(
2812       Argument &Arg, Value *Input, Value *&RetVal, InsertPointTy AllocaIP,
2813       InsertPointTy CodeGenIP)>;
2814 
2815   /// Generator for '#omp target'
2816   ///
2817   /// \param Loc where the target construct was encountered.
2818   /// \param CodeGenIP The insertion point where the call to the outlined
2819   /// function should be emitted.
2820   /// \param EntryInfo The entry information about the function.
2821   /// \param NumTeams Number of teams specified in the num_teams clause.
2822   /// \param NumThreads Number of threads specified in the thread_limit clause.
2823   /// \param Inputs The input values to the region that will be passed
2824   /// as arguments to the outlined function.
2825   /// \param BodyGenCB Callback that will generate the region code.
2826   /// \param ArgAccessorFuncCB Callback that will generate accessor
2827   /// instructions for passed in target arguments where necessary
2828   /// \param Dependencies A vector of DependData objects that carry
2829   /// dependency information as passed in the depend clause
2830   InsertPointTy createTarget(const LocationDescription &Loc,
2831                              OpenMPIRBuilder::InsertPointTy AllocaIP,
2832                              OpenMPIRBuilder::InsertPointTy CodeGenIP,
2833                              TargetRegionEntryInfo &EntryInfo, int32_t NumTeams,
2834                              int32_t NumThreads,
2835                              SmallVectorImpl<Value *> &Inputs,
2836                              GenMapInfoCallbackTy GenMapInfoCB,
2837                              TargetBodyGenCallbackTy BodyGenCB,
2838                              TargetGenArgAccessorsCallbackTy ArgAccessorFuncCB,
2839                              SmallVector<DependData> Dependencies = {});
2840 
2841   /// Returns __kmpc_for_static_init_* runtime function for the specified
2842   /// size \a IVSize and sign \a IVSigned. Will create a distribute call
2843   /// __kmpc_distribute_static_init* if \a IsGPUDistribute is set.
2844   FunctionCallee createForStaticInitFunction(unsigned IVSize, bool IVSigned,
2845                                              bool IsGPUDistribute);
2846 
2847   /// Returns __kmpc_dispatch_init_* runtime function for the specified
2848   /// size \a IVSize and sign \a IVSigned.
2849   FunctionCallee createDispatchInitFunction(unsigned IVSize, bool IVSigned);
2850 
2851   /// Returns __kmpc_dispatch_next_* runtime function for the specified
2852   /// size \a IVSize and sign \a IVSigned.
2853   FunctionCallee createDispatchNextFunction(unsigned IVSize, bool IVSigned);
2854 
2855   /// Returns __kmpc_dispatch_fini_* runtime function for the specified
2856   /// size \a IVSize and sign \a IVSigned.
2857   FunctionCallee createDispatchFiniFunction(unsigned IVSize, bool IVSigned);
2858 
2859   /// Returns __kmpc_dispatch_deinit runtime function.
2860   FunctionCallee createDispatchDeinitFunction();
2861 
2862   /// Declarations for LLVM-IR types (simple, array, function and structure) are
2863   /// generated below. Their names are defined and used in OpenMPKinds.def. Here
2864   /// we provide the declarations, the initializeTypes function will provide the
2865   /// values.
2866   ///
2867   ///{
2868 #define OMP_TYPE(VarName, InitValue) Type *VarName = nullptr;
2869 #define OMP_ARRAY_TYPE(VarName, ElemTy, ArraySize)                             \
2870   ArrayType *VarName##Ty = nullptr;                                            \
2871   PointerType *VarName##PtrTy = nullptr;
2872 #define OMP_FUNCTION_TYPE(VarName, IsVarArg, ReturnType, ...)                  \
2873   FunctionType *VarName = nullptr;                                             \
2874   PointerType *VarName##Ptr = nullptr;
2875 #define OMP_STRUCT_TYPE(VarName, StrName, ...)                                 \
2876   StructType *VarName = nullptr;                                               \
2877   PointerType *VarName##Ptr = nullptr;
2878 #include "llvm/Frontend/OpenMP/OMPKinds.def"
2879 
2880   ///}
2881 
2882 private:
2883   /// Create all simple and struct types exposed by the runtime and remember
2884   /// the llvm::PointerTypes of them for easy access later.
2885   void initializeTypes(Module &M);
2886 
2887   /// Common interface for generating entry calls for OMP Directives.
2888   /// if the directive has a region/body, It will set the insertion
2889   /// point to the body
2890   ///
2891   /// \param OMPD Directive to generate entry blocks for
2892   /// \param EntryCall Call to the entry OMP Runtime Function
2893   /// \param ExitBB block where the region ends.
2894   /// \param Conditional indicate if the entry call result will be used
2895   ///        to evaluate a conditional of whether a thread will execute
2896   ///        body code or not.
2897   ///
2898   /// \return The insertion position in exit block
2899   InsertPointTy emitCommonDirectiveEntry(omp::Directive OMPD, Value *EntryCall,
2900                                          BasicBlock *ExitBB,
2901                                          bool Conditional = false);
2902 
2903   /// Common interface to finalize the region
2904   ///
2905   /// \param OMPD Directive to generate exiting code for
2906   /// \param FinIP Insertion point for emitting Finalization code and exit call
2907   /// \param ExitCall Call to the ending OMP Runtime Function
2908   /// \param HasFinalize indicate if the directive will require finalization
2909   ///         and has a finalization callback in the stack that
2910   ///        should be called.
2911   ///
2912   /// \return The insertion position in exit block
2913   InsertPointTy emitCommonDirectiveExit(omp::Directive OMPD,
2914                                         InsertPointTy FinIP,
2915                                         Instruction *ExitCall,
2916                                         bool HasFinalize = true);
2917 
2918   /// Common Interface to generate OMP inlined regions
2919   ///
2920   /// \param OMPD Directive to generate inlined region for
2921   /// \param EntryCall Call to the entry OMP Runtime Function
2922   /// \param ExitCall Call to the ending OMP Runtime Function
2923   /// \param BodyGenCB Body code generation callback.
2924   /// \param FiniCB Finalization Callback. Will be called when finalizing region
2925   /// \param Conditional indicate if the entry call result will be used
2926   ///        to evaluate a conditional of whether a thread will execute
2927   ///        body code or not.
2928   /// \param HasFinalize indicate if the directive will require finalization
2929   ///        and has a finalization callback in the stack that
2930   ///        should be called.
2931   /// \param IsCancellable if HasFinalize is set to true, indicate if the
2932   ///        the directive should be cancellable.
2933   /// \return The insertion point after the region
2934 
2935   InsertPointTy
2936   EmitOMPInlinedRegion(omp::Directive OMPD, Instruction *EntryCall,
2937                        Instruction *ExitCall, BodyGenCallbackTy BodyGenCB,
2938                        FinalizeCallbackTy FiniCB, bool Conditional = false,
2939                        bool HasFinalize = true, bool IsCancellable = false);
2940 
2941   /// Get the platform-specific name separator.
2942   /// \param Parts different parts of the final name that needs separation
2943   /// \param FirstSeparator First separator used between the initial two
2944   ///        parts of the name.
2945   /// \param Separator separator used between all of the rest consecutive
2946   ///        parts of the name
2947   static std::string getNameWithSeparators(ArrayRef<StringRef> Parts,
2948                                            StringRef FirstSeparator,
2949                                            StringRef Separator);
2950 
2951   /// Returns corresponding lock object for the specified critical region
2952   /// name. If the lock object does not exist it is created, otherwise the
2953   /// reference to the existing copy is returned.
2954   /// \param CriticalName Name of the critical region.
2955   ///
2956   Value *getOMPCriticalRegionLock(StringRef CriticalName);
2957 
2958   /// Callback type for Atomic Expression update
2959   /// ex:
2960   /// \code{.cpp}
2961   /// unsigned x = 0;
2962   /// #pragma omp atomic update
2963   /// x = Expr(x_old);  //Expr() is any legal operation
2964   /// \endcode
2965   ///
2966   /// \param XOld the value of the atomic memory address to use for update
2967   /// \param IRB reference to the IRBuilder to use
2968   ///
2969   /// \returns Value to update X to.
2970   using AtomicUpdateCallbackTy =
2971       const function_ref<Value *(Value *XOld, IRBuilder<> &IRB)>;
2972 
2973 private:
2974   enum AtomicKind { Read, Write, Update, Capture, Compare };
2975 
2976   /// Determine whether to emit flush or not
2977   ///
2978   /// \param Loc    The insert and source location description.
2979   /// \param AO     The required atomic ordering
2980   /// \param AK     The OpenMP atomic operation kind used.
2981   ///
2982   /// \returns whether a flush was emitted or not
2983   bool checkAndEmitFlushAfterAtomic(const LocationDescription &Loc,
2984                                     AtomicOrdering AO, AtomicKind AK);
2986   /// Emit atomic update for constructs: X = X BinOp Expr ,or X = Expr BinOp X
2987   /// For complex Operations: X = UpdateOp(X) => CmpExch X, old_X, UpdateOp(X)
2988   /// Only Scalar data types.
2989   ///
2990   /// \param AllocaIP	  The insertion point to be used for alloca
2991   ///                   instructions.
2992   /// \param X			    The target atomic pointer to be updated
2993   /// \param XElemTy    The element type of the atomic pointer.
2994   /// \param Expr		    The value to update X with.
2995   /// \param AO			    Atomic ordering of the generated atomic
2996   ///                   instructions.
2997   /// \param RMWOp		  The binary operation used for update. If
2998   ///                   operation is not supported by atomicRMW,
2999   ///                   or belong to {FADD, FSUB, BAD_BINOP}.
3000   ///                   Then a `cmpExch` based	atomic will be generated.
3001   /// \param UpdateOp 	Code generator for complex expressions that cannot be
3002   ///                   expressed through atomicrmw instruction.
3003   /// \param VolatileX	     true if \a X volatile?
3004   /// \param IsXBinopExpr true if \a X is Left H.S. in Right H.S. part of the
3005   ///                     update expression, false otherwise.
3006   ///                     (e.g. true for X = X BinOp Expr)
3007   ///
3008   /// \returns A pair of the old value of X before the update, and the value
3009   ///          used for the update.
3010   std::pair<Value *, Value *>
3011   emitAtomicUpdate(InsertPointTy AllocaIP, Value *X, Type *XElemTy, Value *Expr,
3012                    AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp,
3013                    AtomicUpdateCallbackTy &UpdateOp, bool VolatileX,
3014                    bool IsXBinopExpr);
3015 
3016   /// Emit the binary op. described by \p RMWOp, using \p Src1 and \p Src2 .
3017   ///
3018   /// \Return The instruction
3019   Value *emitRMWOpAsInstruction(Value *Src1, Value *Src2,
3020                                 AtomicRMWInst::BinOp RMWOp);
3021 
3022 public:
3023   /// A struct to pack relevant information while generating atomic Ops
3024   struct AtomicOpValue {
3025     Value *Var = nullptr;    // Pointer to the variable operated on atomically
3026     Type *ElemTy = nullptr;  // Element type of the pointed-to value
3027     bool IsSigned = false;   // Signedness of the value (integer atomics)
3028     bool IsVolatile = false; // Whether accesses through Var are volatile
3029   };
3030 
3031   /// Emit atomic Read for : V = X --- Only Scalar data types.
3032   ///
3033   /// \param Loc    The insert and source location description.
3034   /// \param X			The target pointer to be atomically read
3035   /// \param V			Memory address where to store atomically read
3036   /// 					    value
3037   /// \param AO			Atomic ordering of the generated atomic
3038   /// 					    instructions.
3039   ///
3040   /// \return Insertion point after generated atomic read IR.
3041   InsertPointTy createAtomicRead(const LocationDescription &Loc,
3042                                  AtomicOpValue &X, AtomicOpValue &V,
3043                                  AtomicOrdering AO);
3044 
  /// Emit atomic write for: X = Expr --- Only Scalar data types.
  ///
  /// \param Loc  The insert and source location description.
  /// \param X    The target pointer to be atomically written to.
  /// \param Expr The value to store.
  /// \param AO   Atomic ordering of the generated atomic instructions.
  ///
  /// \return Insertion point after generated atomic write IR.
  InsertPointTy createAtomicWrite(const LocationDescription &Loc,
                                  AtomicOpValue &X, Value *Expr,
                                  AtomicOrdering AO);
3057 
  /// Emit atomic update for constructs: X = X BinOp Expr, or X = Expr BinOp X.
  /// For complex Operations: X = UpdateOp(X) => CmpExch X, old_X, UpdateOp(X)
  /// Only Scalar data types.
  ///
  /// \param Loc      The insert and source location description.
  /// \param AllocaIP The insertion point to be used for alloca instructions.
  /// \param X        The target atomic pointer to be updated.
  /// \param Expr     The value to update X with.
  /// \param AO       Atomic ordering of the generated atomic instructions.
  /// \param RMWOp    The binary operation used for the update. If the
  ///                 operation is not supported by atomicRMW, or belongs to
  ///                 {FADD, FSUB, BAD_BINOP}, then a `cmpExch` based atomic
  ///                 will be generated.
  /// \param UpdateOp Code generator for complex expressions that cannot be
  ///                 expressed through the atomicrmw instruction.
  /// \param IsXBinopExpr true if \p X is Left H.S. in Right H.S. part of the
  ///                     update expression, false otherwise.
  ///                     (e.g. true for X = X BinOp Expr)
  ///
  /// \return Insertion point after generated atomic update IR.
  InsertPointTy createAtomicUpdate(const LocationDescription &Loc,
                                   InsertPointTy AllocaIP, AtomicOpValue &X,
                                   Value *Expr, AtomicOrdering AO,
                                   AtomicRMWInst::BinOp RMWOp,
                                   AtomicUpdateCallbackTy &UpdateOp,
                                   bool IsXBinopExpr);
3084 
  /// Emit atomic update for constructs: --- Only Scalar data types
  /// V = X; X = X BinOp Expr ,
  /// X = X BinOp Expr; V = X,
  /// V = X; X = Expr BinOp X,
  /// X = Expr BinOp X; V = X,
  /// V = X; X = UpdateOp(X),
  /// X = UpdateOp(X); V = X,
  ///
  /// \param Loc        The insert and source location description.
  /// \param AllocaIP   The insertion point to be used for alloca instructions.
  /// \param X          The target atomic pointer to be updated.
  /// \param V          Memory address where to store the captured value.
  /// \param Expr       The value to update X with.
  /// \param AO         Atomic ordering of the generated atomic instructions.
  /// \param RMWOp      The binary operation used for the update. If the
  ///                   operation is not supported by atomicRMW, or belongs to
  ///                   {FADD, FSUB, BAD_BINOP}, then a cmpExch based atomic
  ///                   will be generated.
  /// \param UpdateOp   Code generator for complex expressions that cannot be
  ///                   expressed through the atomicrmw instruction.
  /// \param UpdateExpr true if X is an in place update of the form
  ///                   X = X BinOp Expr or X = Expr BinOp X.
  /// \param IsXBinopExpr true if X is Left H.S. in Right H.S. part of the
  ///                     update expression, false otherwise.
  ///                     (e.g. true for X = X BinOp Expr)
  /// \param IsPostfixUpdate true if the original value of 'x' must be stored
  ///                        in 'v', not an updated one.
  ///
  /// \return Insertion point after generated atomic capture IR.
  InsertPointTy
  createAtomicCapture(const LocationDescription &Loc, InsertPointTy AllocaIP,
                      AtomicOpValue &X, AtomicOpValue &V, Value *Expr,
                      AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp,
                      AtomicUpdateCallbackTy &UpdateOp, bool UpdateExpr,
                      bool IsPostfixUpdate, bool IsXBinopExpr);
3120 
  /// Emit atomic compare for constructs: --- Only scalar data types
  /// cond-expr-stmt:
  /// x = x ordop expr ? expr : x;
  /// x = expr ordop x ? expr : x;
  /// x = x == e ? d : x;
  /// x = e == x ? d : x; (this one is not in the spec)
  /// cond-update-stmt:
  /// if (x ordop expr) { x = expr; }
  /// if (expr ordop x) { x = expr; }
  /// if (x == e) { x = d; }
  /// if (e == x) { x = d; } (this one is not in the spec)
  /// conditional-update-capture-atomic:
  /// v = x; cond-update-stmt; (IsPostfixUpdate=true, IsFailOnly=false)
  /// cond-update-stmt; v = x; (IsPostfixUpdate=false, IsFailOnly=false)
  /// if (x == e) { x = d; } else { v = x; } (IsPostfixUpdate=false,
  ///                                         IsFailOnly=true)
  /// r = x == e; if (r) { x = d; } (IsPostfixUpdate=false, IsFailOnly=false)
  /// r = x == e; if (r) { x = d; } else { v = x; } (IsPostfixUpdate=false,
  ///                                                IsFailOnly=true)
  ///
  /// \param Loc          The insert and source location description.
  /// \param X            The target atomic pointer to be updated.
  /// \param V            Memory address where to store the captured value (for
  ///                     compare capture only).
  /// \param R            Memory address where to store the comparison result
  ///                     (for compare capture with '==' only).
  /// \param E            The expected value ('e') for forms that use an
  ///                     equality comparison or an expression ('expr') for
  ///                     forms that use 'ordop' (logically an atomic maximum or
  ///                     minimum).
  /// \param D            The desired value for forms that use an equality
  ///                     comparison. For forms that use 'ordop', it should be
  ///                     \p nullptr.
  /// \param AO           Atomic ordering of the generated atomic instructions.
  /// \param Op           Atomic compare operation. It can only be ==, <, or >.
  /// \param IsXBinopExpr True if the conditional statement is in the form where
  ///                     x is on LHS. It only matters for < or >.
  /// \param IsPostfixUpdate  True if the original value of 'x' must be stored
  ///                         in 'v', not an updated one (for compare capture
  ///                         only).
  /// \param IsFailOnly   True if the original value of 'x' is stored to 'v'
  ///                     only when the comparison fails. This is only valid for
  ///                     the case the comparison is '=='.
  ///
  /// \return Insertion point after generated atomic capture IR.
  InsertPointTy
  createAtomicCompare(const LocationDescription &Loc, AtomicOpValue &X,
                      AtomicOpValue &V, AtomicOpValue &R, Value *E, Value *D,
                      AtomicOrdering AO, omp::OMPAtomicCompareOp Op,
                      bool IsXBinopExpr, bool IsPostfixUpdate, bool IsFailOnly);
  /// Overload of createAtomicCompare that additionally takes \p Failure, the
  /// atomic ordering to use on the failure path of the generated
  /// compare-exchange (presumably the cmpxchg failure ordering — confirm
  /// against the implementation).
  InsertPointTy createAtomicCompare(const LocationDescription &Loc,
                                    AtomicOpValue &X, AtomicOpValue &V,
                                    AtomicOpValue &R, Value *E, Value *D,
                                    AtomicOrdering AO,
                                    omp::OMPAtomicCompareOp Op,
                                    bool IsXBinopExpr, bool IsPostfixUpdate,
                                    bool IsFailOnly, AtomicOrdering Failure);
3178 
  /// Create the control flow structure of a canonical OpenMP loop.
  ///
  /// The emitted loop will be disconnected, i.e. no edge to the loop's
  /// preheader and no terminator in the AfterBB. The OpenMPIRBuilder's
  /// IRBuilder location is not preserved.
  ///
  /// \param DL        DebugLoc used for the instructions in the skeleton.
  /// \param TripCount Value to be used for the trip count.
  /// \param F         Function in which to insert the BasicBlocks.
  /// \param PreInsertBefore  Where to insert BBs that execute before the body,
  ///                         typically the body itself.
  /// \param PostInsertBefore Where to insert BBs that execute after the body.
  /// \param Name      Base name used to derive BB and instruction names.
  ///
  /// \returns The CanonicalLoopInfo that represents the emitted loop.
  CanonicalLoopInfo *createLoopSkeleton(DebugLoc DL, Value *TripCount,
                                        Function *F,
                                        BasicBlock *PreInsertBefore,
                                        BasicBlock *PostInsertBefore,
                                        const Twine &Name = {});
  /// Name of the metadata node used for OMP offload info ("omp_offload.info").
  const std::string ompOffloadInfoName = "omp_offload.info";
3202 
  /// Loads all the offload entries information from the host IR
  /// metadata. This function is only meant to be used with device code
  /// generation.
  ///
  /// \param M Module to load the metadata info from. The module passed may be
  ///          loaded from a bitcode file, i.e. different from the
  ///          OpenMPIRBuilder::M module.
  void loadOffloadInfoMetadata(Module &M);
3210 
  /// Loads all the offload entries information from the host IR
  /// metadata read from the file passed in as the HostFilePath argument. This
  /// function is only meant to be used with device code generation.
  ///
  /// \param HostFilePath The path to the host IR file, used to load in offload
  ///                     metadata for the device, allowing host and device to
  ///                     maintain the same metadata mapping.
  void loadOffloadInfoMetadata(StringRef HostFilePath);
3219 
  /// Gets (if a variable with the given name already exists) or creates an
  /// internal global variable with the specified Name. The created variable
  /// has CommonLinkage by default and is initialized by a null value.
  ///
  /// \param Ty   Type of the global variable. If it already exists, the type
  ///             must be the same.
  /// \param Name Name of the variable.
  /// \param AddressSpace Address space the global variable is placed in.
  GlobalVariable *getOrCreateInternalVariable(Type *Ty, const StringRef &Name,
                                              unsigned AddressSpace = 0);
3228 };
3229 
/// Class to represent the control flow structure of an OpenMP canonical loop.
///
/// The control-flow structure is standardized for easy consumption by
/// directives associated with loops. For instance, the worksharing-loop
/// construct may change this control flow such that each loop iteration is
/// executed on only one thread. The constraints of a canonical loop in brief
/// are:
///
///  * The number of loop iterations must have been computed before entering the
///    loop.
///
///  * Has an (unsigned) logical induction variable that starts at zero and
///    increments by one.
///
///  * The loop's CFG itself has no side-effects. The OpenMP specification
///    itself allows side-effects, but the order in which they happen, including
///    how often or whether at all, is unspecified. We expect that the frontend
///    will emit those side-effect instructions somewhere (e.g. before the loop)
///    such that the CanonicalLoopInfo itself can be side-effect free.
///
/// Keep in mind that CanonicalLoopInfo is meant to only describe a repeated
/// execution of a loop body that satisfies these constraints. It does NOT
/// represent arbitrary SESE regions that happen to contain a loop. Do not use
/// CanonicalLoopInfo for such purposes.
///
/// The control flow can be described as follows:
///
///     Preheader
///        |
///  /-> Header
///  |     |
///  |    Cond---\
///  |     |     |
///  |    Body   |
///  |    | |    |
///  |   <...>   |
///  |    | |    |
///   \--Latch   |
///              |
///             Exit
///              |
///            After
///
/// The loop is thought to start at PreheaderIP (at the Preheader's terminator,
/// including) and end at AfterIP (at the After's first instruction, excluding).
/// That is, instructions in the Preheader and After blocks (except the
/// Preheader's terminator) are out of CanonicalLoopInfo's control and may have
/// side-effects. Typically, the Preheader is used to compute the loop's trip
/// count. The instructions from BodyIP (at the Body block's first instruction,
/// excluding) until the Latch are also considered outside CanonicalLoopInfo's
/// control and thus can have side-effects. The body block is the single entry
/// point into the loop body, which may contain arbitrary control flow as long
/// as all control paths eventually branch to the Latch block.
///
/// TODO: Consider adding another standardized BasicBlock between Body CFG and
/// Latch to guarantee that there is only a single edge to the latch. It would
/// make loop transformations easier to not needing to consider multiple
/// predecessors of the latch (See redirectAllPredecessorsTo) and would give us
/// an equivalent to PreheaderIP, AfterIP and BodyIP for inserting code that
/// executes after each body iteration.
///
/// There must be no loop-carried dependencies through llvm::Values. This is
/// equivalent to that the Latch has no PHINode and the Header's only PHINode is
/// for the induction variable.
///
/// All code in Header, Cond, Latch and Exit (plus the terminator of the
/// Preheader) are CanonicalLoopInfo's responsibility and their build-up checked
/// by assertOK(). They are expected to not be modified unless explicitly
/// modifying the CanonicalLoopInfo through a method that applies an OpenMP
/// loop-associated construct such as applyWorkshareLoop, tileLoops, unrollLoop,
/// etc. These methods usually invalidate the CanonicalLoopInfo and re-use its
/// basic blocks. After invalidation, the CanonicalLoopInfo must not be used
/// anymore as its underlying control flow may not exist anymore.
/// Loop-transformation methods such as tileLoops, collapseLoops and unrollLoop
/// may also return a new CanonicalLoopInfo that can be passed to other
/// loop-associated construct implementing methods. These loop-transforming
/// methods may either create a new CanonicalLoopInfo usually using
/// createLoopSkeleton and invalidate the input CanonicalLoopInfo, or reuse and
/// modify one of the input CanonicalLoopInfo and return it as representing the
/// modified loop. What is done is an implementation detail of the
/// transformation-implementing method and callers should always assume that the
/// CanonicalLoopInfo passed to it is invalidated and a new object is returned.
/// Returned CanonicalLoopInfo have the same structure and guarantees as the one
/// created by createCanonicalLoop, such that transforming methods do not have
/// to special case where the CanonicalLoopInfo originated from.
///
/// Generally, methods consuming CanonicalLoopInfo do not need an
/// OpenMPIRBuilder::InsertPointTy as argument, but use the locations of the
/// CanonicalLoopInfo to insert new or modify existing instructions. Unless
/// documented otherwise, methods consuming CanonicalLoopInfo do not invalidate
/// any InsertPoint that is outside CanonicalLoopInfo's control. Specifically,
/// any InsertPoint in the Preheader, After or Body can still be used after
/// calling such a method.
///
/// TODO: Provide mechanisms for exception handling and cancellation points.
///
/// Defined outside OpenMPIRBuilder because nested classes cannot be
/// forward-declared, e.g. to avoid having to include the entire OMPIRBuilder.h.
class CanonicalLoopInfo {
  friend class OpenMPIRBuilder;

private:
  /// Entry block of each iteration; holds only the induction-variable PHI.
  BasicBlock *Header = nullptr;
  /// Block deciding whether another iteration runs; branches to Body or Exit.
  BasicBlock *Cond = nullptr;
  /// Block incrementing the induction variable; branches back to Header.
  BasicBlock *Latch = nullptr;
  /// Block reached when no more iterations are executed.
  BasicBlock *Exit = nullptr;

  /// Add the control blocks of this loop to \p BBs.
  ///
  /// This does not include any block from the body, including the one returned
  /// by getBody().
  ///
  /// FIXME: This currently includes the Preheader and After blocks even though
  /// their content is (mostly) not under CanonicalLoopInfo's control.
  /// Re-evaluate whether this makes sense.
  void collectControlBlocks(SmallVectorImpl<BasicBlock *> &BBs);

  /// Sets the number of loop iterations to the given value. This value must be
  /// valid in the condition block (i.e., defined in the preheader) and is
  /// interpreted as an unsigned integer.
  void setTripCount(Value *TripCount);

  /// Replace all uses of the canonical induction variable in the loop body with
  /// a new one.
  ///
  /// The intended use case is to update the induction variable for an updated
  /// iteration space such that it can stay normalized in the 0...tripcount-1
  /// range.
  ///
  /// The \p Updater is called with the (presumably updated) current normalized
  /// induction variable and is expected to return the value that uses of the
  /// pre-updated induction values should use instead, typically dependent on
  /// the new induction variable. This is a lambda (instead of e.g. just passing
  /// the new value) to be able to distinguish the uses of the pre-updated
  /// induction variable and uses of the induction variable to compute the
  /// updated induction variable value.
  void mapIndVar(llvm::function_ref<Value *(Instruction *)> Updater);

public:
  /// Returns whether this object currently represents the IR of a loop. If
  /// returning false, it may have been consumed by a loop transformation or
  /// never been initialized. Do not use in this case.
  bool isValid() const { return Header; }

  /// The preheader ensures that there is only a single edge entering the loop.
  /// Code that must be executed before any loop iteration can be emitted here,
  /// such as computing the loop trip count and begin lifetime markers. Code in
  /// the preheader is not considered part of the canonical loop.
  BasicBlock *getPreheader() const;

  /// The header is the entry for each iteration. In the canonical control flow,
  /// it only contains the PHINode for the induction variable.
  BasicBlock *getHeader() const {
    assert(isValid() && "Requires a valid canonical loop");
    return Header;
  }

  /// The condition block computes whether there is another loop iteration. If
  /// yes, branches to the body; otherwise to the exit block.
  BasicBlock *getCond() const {
    assert(isValid() && "Requires a valid canonical loop");
    return Cond;
  }

  /// The body block is the single entry for a loop iteration and not controlled
  /// by CanonicalLoopInfo. It can contain arbitrary control flow but must
  /// eventually branch to the \p Latch block.
  BasicBlock *getBody() const {
    assert(isValid() && "Requires a valid canonical loop");
    // Body is the taken successor of Cond's conditional branch.
    return cast<BranchInst>(Cond->getTerminator())->getSuccessor(0);
  }

  /// Reaching the latch indicates the end of the loop body code. In the
  /// canonical control flow, it only contains the increment of the induction
  /// variable.
  BasicBlock *getLatch() const {
    assert(isValid() && "Requires a valid canonical loop");
    return Latch;
  }

  /// Reaching the exit indicates no more iterations are being executed.
  BasicBlock *getExit() const {
    assert(isValid() && "Requires a valid canonical loop");
    return Exit;
  }

  /// The after block is intended for clean-up code such as lifetime end
  /// markers. It is separate from the exit block to ensure, analogous to the
  /// preheader, it having just a single entry edge and being free from PHI
  /// nodes should there be multiple loop exits (such as from break
  /// statements/cancellations).
  BasicBlock *getAfter() const {
    assert(isValid() && "Requires a valid canonical loop");
    return Exit->getSingleSuccessor();
  }

  /// Returns the llvm::Value containing the number of loop iterations. It must
  /// be valid in the preheader and always interpreted as an unsigned integer of
  /// any bit-width.
  Value *getTripCount() const {
    assert(isValid() && "Requires a valid canonical loop");
    // By construction, Cond starts with a compare of IV against TripCount;
    // the trip count is its second operand.
    Instruction *CmpI = &Cond->front();
    assert(isa<CmpInst>(CmpI) && "First inst must compare IV with TripCount");
    return CmpI->getOperand(1);
  }

  /// Returns the instruction representing the current logical induction
  /// variable. Always unsigned, always starting at 0 with an increment of one.
  Instruction *getIndVar() const {
    assert(isValid() && "Requires a valid canonical loop");
    // By construction, the IV PHI is the first instruction of the header.
    Instruction *IndVarPHI = &Header->front();
    assert(isa<PHINode>(IndVarPHI) && "First inst must be the IV PHI");
    return IndVarPHI;
  }

  /// Return the type of the induction variable (and the trip count).
  Type *getIndVarType() const {
    assert(isValid() && "Requires a valid canonical loop");
    return getIndVar()->getType();
  }

  /// Return the insertion point for user code before the loop.
  OpenMPIRBuilder::InsertPointTy getPreheaderIP() const {
    assert(isValid() && "Requires a valid canonical loop");
    // Insert just before the preheader's terminator.
    BasicBlock *Preheader = getPreheader();
    return {Preheader, std::prev(Preheader->end())};
  };

  /// Return the insertion point for user code in the body.
  OpenMPIRBuilder::InsertPointTy getBodyIP() const {
    assert(isValid() && "Requires a valid canonical loop");
    BasicBlock *Body = getBody();
    return {Body, Body->begin()};
  };

  /// Return the insertion point for user code after the loop.
  OpenMPIRBuilder::InsertPointTy getAfterIP() const {
    assert(isValid() && "Requires a valid canonical loop");
    BasicBlock *After = getAfter();
    return {After, After->begin()};
  };

  /// Return the function containing this canonical loop.
  Function *getFunction() const {
    assert(isValid() && "Requires a valid canonical loop");
    return Header->getParent();
  }

  /// Consistency self-check.
  void assertOK() const;

  /// Invalidate this loop. That is, the underlying IR does not fulfill the
  /// requirements of an OpenMP canonical loop anymore.
  void invalidate();
};
3484 
3485 } // end namespace llvm
3486 
3487 #endif // LLVM_FRONTEND_OPENMP_OMPIRBUILDER_H
3488